# Programmer: Sriram Pemmaraju
# Date: March 4, 2015

# Parses a line and extracts words from the line into a list. Any word that is
# extracted needs to be at least 4 letters long. Also, all extracted words are
# converted into all-lower-case before being inserted into the list.
def parse(s, minLength=4):
    """Extract the words of s, lower-cased, in order of appearance.

    A word is a maximal run of consecutive ASCII letters (a-z, A-Z). Any
    other character -- digits, punctuation, whitespace, and also non-ASCII
    letters such as accented characters -- ends the current word.

    Parameters:
        s: the string (typically one line of text) to parse.
        minLength: smallest number of letters a word must have in order to
            be kept. Defaults to 4.

    Returns:
        A list of the qualifying words, converted to lower case, in the
        order they occur in s. Repeated words appear once per occurrence.
    """
    import re  # local import: this file has no top-level import section

    # The character class is spelled out (instead of \w or str.isalpha) so
    # that only the 52 ASCII letters count as word characters. Lower-casing
    # is applied after extraction, so it only ever sees ASCII letters.
    return [
        word.lower()
        for word in re.findall(r"[A-Za-z]+", s)
        if len(word) >= minLength
    ]


# Takes a filename as parameter, parses the file line by line, extracts the
# words from it and constructs a dictionary that maps each extracted word to
# the number of times it occurs in the file (its frequency).
def computeWordFrequencies(filename):
    """Count how often each word occurs in the file called filename.

    The file is read one line at a time. Each line is handed to parse()
    and every word that parse() returns is tallied.

    Parameters:
        filename: path of the text file to read. It is opened in text mode
            for reading.

    Returns:
        A dictionary mapping each word returned by parse() to the number
        of times it was returned over the whole file. The dictionary is
        empty if no words were extracted.

    Raises:
        OSError: propagated from open() if the file cannot be opened.
    """
    masterDict = {}  # maps each word seen so far to its frequency

    # The with-statement guarantees the file is closed even if reading or
    # parsing a line raises an exception part-way through.
    with open(filename, "r") as f:
        for line in f:
            for word in parse(line):
                # get() supplies 0 the first time a word is encountered.
                masterDict[word] = masterDict.get(word, 0) + 1

    return masterDict
