# Parses a line and extracts words from the line into a list. Any word that is # extracted needs to be at least 4 letters long. Also, all extracted words are # converted into all-lower-case before being inserted into the list. def parse(s): s = s + " " listOfWords = [] # maintains the list of words in strings s currentWord = "" wordBeingProcessed = False i = 0 # serves as an index into the string s while i < len(s): # if the current character is a lower case letter if (s[i] >= "a" and s[i] <= "z"): wordBeingProcessed = True currentWord = currentWord + s[i] # if the current character is an upper case character # do the same as above, except convert character into corresponding # lower case character using the ord() and chr() functions elif (s[i] >= "A" and s[i] <= "Z"): wordBeingProcessed = True currentWord = currentWord + chr(ord("a") + ord(s[i]) - ord("A")) # else if the current character is a non-letter # immediately following a word elif wordBeingProcessed: if len(currentWord) >= 4: listOfWords.append(currentWord) wordBeingProcessed = False currentWord = "" i = i + 1 return listOfWords # Searches for a word in wordList and returns the index of the first occurrence # of word in wordList if found; otherwise returns -1. def getIndex(wordList, word): index = 0 while index < len(wordList): if wordList[index] == word: return index index = index + 1 return -1 # Takes a filename as parameter and parses the file, extracts words from the file # and constructs two lists: one containing the words in the file and the other # containing corresponding word-frequencies. def computeFrequencies(filename): f = open(filename, "r") line = f.readline() masterList = [] # for maintaining the list of words frequencies = [] # for maintaining frequencies of these words while line: wordsInLine = parse(line) # parse the current line to extract words index = 0 while index < len(wordsInLine): # if word is already in masterList, increase its frequecy loc = getIndex(masterList, wordsInLine[index]) if loc >= 0: frequencies[loc] = frequencies[loc] + 1 else: # this is the first occurrence of the word, so just append # and set frequency to be 1 masterList.append(wordsInLine[index]) frequencies.append(1) index = index + 1 line = f.readline() f.close() return [masterList, frequencies] def swap(L, i, j): temp = L[i] L[i] = L[j] L[j] = temp # Finds and returns the index of the word with maximum frequency in # the range L[lowerBound..len(L)-1] def maxIndex(L, lowerBound): maxElement = L[lowerBound] indexOfMax = lowerBound index = lowerBound + 1 while index < len(L): if L[index] > maxElement: maxElement = L[index] indexOfMax = index index = index + 1 return indexOfMax # Uses a truncated version of selection sort to bring the most frequent # k words to the front of the list. def mostFrequentWords(wordList, frequencyList, k): wLCopy = wordList[:] fLCopy = frequencyList[:] n = len(wLCopy) index = 0 while index < k: # Finds the index of a most frequent word in the range [index..n-1] m = maxIndex(fLCopy, index) # Bring this most frequent word to the "front" by swapping wordList[m] and # wordList[index] and also corresponding frequencies swap(wLCopy, index, m) swap(fLCopy, index, m) index = index + 1 return wLCopy[:k] # Prints the words in wordList in positions 0, 1,..., upperBound-1 def printPrefix(wordList, upperBound): index = 0 while index < upperBound: print wordList[index] index = index + 1 #main program # First deal with War and Peace #[wordList, frequencies] = computeFrequencies("war.txt") #mostFrequentWords(wordList, frequencies, 20) #print "Tolstoy" #printPrefix(wordList, 20) # And then with Jekyl and Hyde #[wordList, frequencies] = computeFrequencies("hyde.txt") #mostFrequentWords(wordList, frequencies, 20) #print "R.L.Stevenson" #printPrefix(wordList, 20)