# Programmer: Sriram Pemmaraju # Date: March 4, 2015 # Parses a line and extracts words from the line into a list. Any word that is # extracted needs to be at least 4 letters long. Also, all extracted words are # converted into all-lower-case before being inserted into the list. def parse(s): listOfWords = [] # maintains the list of words in strings s currentWord = "" wordBeingProcessed = False i = 0 # serves as an index into the string s while i < len(s): # if the current character is a lower case letter if (s[i] >= "a" and s[i] <= "z"): wordBeingProcessed = True currentWord = currentWord + s[i] # if the current character is an upper case character # do the same as above, except convert character into corresponding # lower case character using the ord() and chr() functions elif (s[i] >= "A" and s[i] <= "Z"): wordBeingProcessed = True currentWord = currentWord + chr(ord("a") + ord(s[i]) - ord("A")) # else if the current character is a non-letter # immediately following a word elif wordBeingProcessed: if len(currentWord) >= 4: listOfWords.append(currentWord) wordBeingProcessed = False currentWord = "" i = i + 1 if wordBeingProcessed and len(currentWord) >= 4: listOfWords.append(currentWord) return listOfWords # Searches for a word in wordList and returns the index of the first occurrence # of word in wordList if found; otherwise returns -1. def getIndex(wordList, word): # These are indices that keep track of the left # and right boundaries of the list that needs exploring left = 0 right = len(wordList)-1 # while left <= right, there is more to search while left <= right: middle = (left + right) // 2 # Found k if word == wordList[middle]: return middle # If k is less than L[middle] search the # left half if word < wordList[middle]: right = middle - 1 # Otherwise search the right half elif word > wordList[middle]: left = middle + 1 return left # Takes a filename as parameter and parses the file, extracts words from the file # and constructs two lists: one containing the words in the file and the other # containing corresponding word-frequencies. def computeWordFrequencies(filename): f = open(filename, "r") line = f.readline() masterList = [] # for maintaining the list of words frequencies = [] # for maintaining frequencies of these words while line: wordsInLine = parse(line) # parse the current line to extract words index = 0 while index < len(wordsInLine): # if word is already in masterList, increase its frequecy loc = getIndex(masterList, wordsInLine[index]) if loc < len(masterList) and masterList[loc] == wordsInLine[index]: frequencies[loc] = frequencies[loc] + 1 else: # this is the first occurrence of the word, so just append # and set frequency to be 1 masterList.insert(loc, wordsInLine[index]) frequencies.insert(loc, 1) index = index + 1 line = f.readline() f.close() return [masterList, frequencies] def swap(L, i, j): temp = L[i] L[i] = L[j] L[j] = temp # Finds and returns the index of the word with maximum frequency in # the range L[lowerBound..len(L)-1] def maxIndex(L, lowerBound): maxElement = L[lowerBound] indexOfMax = lowerBound index = lowerBound + 1 while index < len(L): if L[index] > maxElement: maxElement = L[index] indexOfMax = index index = index + 1 return indexOfMax # Uses a truncated version of selection sort to bring the most frequent # k words to the front of the list. def mostFrequentWords(wordList, frequencyList, k): n = len(wordList) index = 0 while index < k: # Finds the index of a most frequent word in the range [index..n-1] m = maxIndex(frequencyList, index) # Bring this most frequent word to the "front" by swapping wordList[m] and # wordList[index] and also corresponding frequencies swap(wordList,index, m) swap(frequencyList,index, m) index = index + 1 return wordList[:k] # Merges two word and frequency lists def merge(wL1, f1, wL2, f2): index = 0 # Walk through the words in wL2 while index < len(wL2): loc = getIndex(wL1, wL2[index]) if loc < len(wL1) and wL1[loc] == wL2[index]: # the word wL2[index] is not in wL1 f1[loc] = f1[loc] + f2[index] else: wL1.insert(loc, wL2[index]) f1.insert(loc, f2[index]) index = index + 1 return [wL1, f1] # the merged lists #main program # First deal with War and Peace [wordList, frequencies] = computeWordFrequencies("war.txt") f20 = mostFrequentWords(wordList, frequencies, 20) print("Tolstoy") print(f20) # And then with Jekyl, Hyde, and the treasure island [wordList1, frequencies1] = computeWordFrequencies("hyde.txt") [wordList2, frequencies2] = computeWordFrequencies("treasure.txt") [wordList, frequencies] = merge(wordList1, frequencies1, wordList2, frequencies2) f20 = mostFrequentWords(wordList, frequencies, 20) print("R.L.Stevenson") print(f20)