# Programmer: Sriram Pemmaraju
# Date: March 4, 2015


def parse(s):
    """Extract the words of at least 4 letters from the string s.

    A word is a maximal run of ASCII letters (a-z, A-Z); every other
    character (digits, punctuation, whitespace, non-ASCII letters) acts
    as a separator. Each extracted word is converted to lower case.

    Parameters:
        s: the string (typically one line of text) to scan.

    Returns:
        A list of the lower-cased words of length >= 4, in the order in
        which they occur in s. Repeated words appear repeatedly.
    """
    listOfWords = []    # the words found so far in s
    currentLetters = []  # letters of the word currently being read

    for ch in s:
        if "a" <= ch <= "z":
            currentLetters.append(ch)
        elif "A" <= ch <= "Z":
            # Only ASCII upper-case letters reach this branch, so lower()
            # maps them exactly onto a-z.
            currentLetters.append(ch.lower())
        elif currentLetters:
            # A non-letter immediately following a word ends that word.
            if len(currentLetters) >= 4:
                listOfWords.append("".join(currentLetters))
            currentLetters = []

    # The string may end in the middle of a word; do not lose that word.
    if len(currentLetters) >= 4:
        listOfWords.append("".join(currentLetters))

    return listOfWords


def computeWordFrequencies(filename):
    """Count how often each word occurs in the file named filename.

    The file is read line by line and each line is split into words by
    parse(), so only lower-cased words of at least 4 letters are counted.

    Parameters:
        filename: path of the text file to read.

    Returns:
        A dictionary mapping each extracted word to the number of times
        it occurs in the file.
    """
    masterDict = {}  # maps each word to its frequency

    # The with-statement closes the file even if reading or parsing raises.
    with open(filename, "r") as f:
        for line in f:
            for word in parse(line):
                masterDict[word] = masterDict.get(word, 0) + 1

    return masterDict