# Programmer: Sriram Pemmaraju
# Date: March 4, 2015

# Parses a line and extracts words from the line into a list. Any word that is
# extracted needs to be at least 4 letters long. Also, all extracted words are
# converted into all-lower-case before being inserted into the list.
def parse(s):
    listOfWords = [] # maintains the list of words in strings s
    currentWord = ""
    
    wordBeingProcessed = False
    
    i = 0 # serves as an index into the string s
    while i < len(s):
        # if the current character is a lower case letter
        if (s[i] >= "a" and s[i] <= "z"): 
            wordBeingProcessed = True
            currentWord = currentWord + s[i]
        # if the current character is an upper case character
        # do the same as above, except convert character into corresponding
        # lower case character using the ord() and chr() functions
        elif (s[i] >= "A" and s[i] <= "Z"):
            wordBeingProcessed = True
            currentWord = currentWord + chr(ord("a") + ord(s[i]) - ord("A"))
        # else if the current character is a non-letter
        # immediately following a word
        elif wordBeingProcessed:
            if len(currentWord) >= 4:
                listOfWords.append(currentWord)
            wordBeingProcessed = False
            currentWord = ""
        i = i + 1

    if wordBeingProcessed and len(currentWord) >= 4:
        listOfWords.append(currentWord)
            
    return listOfWords

# Searches for a word in wordList and returns the index of the first occurrence
# of word in wordList if found; otherwise returns -1.
def getIndex(wordList, word):
    # These are indices that keep track of the left
    # and right boundaries of the list that needs exploring
    left = 0
    right = len(wordList)-1
    
    # while left <= right, there is more to search
    while left <= right:
        middle = (left + right) // 2
        
        # Found k
        if word == wordList[middle]:
            return middle
        
        # If k is less than L[middle] search the 
        # left half
        if word < wordList[middle]:
            right = middle - 1
            
        # Otherwise search the right half
        elif word > wordList[middle]:
            left = middle + 1
        
    return left

# Takes a filename as parameter and parses the file, extracts words from the file
# and constructs two lists: one containing the words in the file and the other
# containing corresponding word-frequencies.
def computeWordFrequencies(filename):
    f = open(filename, "r")
    line = f.readline()
    masterList = [] # for maintaining the list of words
    frequencies = [] # for maintaining frequencies of these words
    
    while line:
        wordsInLine = parse(line) # parse the current line to extract words
 
        index = 0
        while index < len(wordsInLine):
            # if word is already in masterList, increase its frequecy
            loc = getIndex(masterList, wordsInLine[index])
            if loc < len(masterList) and masterList[loc] == wordsInLine[index]:
                frequencies[loc] = frequencies[loc] + 1
            else: # this is the first occurrence of the word, so just append
                  # and set frequency to be 1
                masterList.insert(loc, wordsInLine[index])
                frequencies.insert(loc, 1)
            index = index + 1
            
        line = f.readline()

    f.close()            
    return [masterList, frequencies]

def swap(L, i, j):
    temp = L[i]
    L[i] = L[j]
    L[j] = temp

# Finds and returns the index of the word with maximum frequency in 
# the range L[lowerBound..len(L)-1]
def maxIndex(L, lowerBound):
    maxElement = L[lowerBound]
    indexOfMax = lowerBound
    index = lowerBound + 1

    while index < len(L):
        if L[index] > maxElement:
            maxElement = L[index]
            indexOfMax = index
            
        index = index + 1

    return indexOfMax

# Uses a truncated version of selection sort to bring the most frequent
# k words to the front of the list. 
def mostFrequentWords(wordList, frequencyList, k):
    n = len(wordList)
    index = 0

    while index < k:
        # Finds the index of a most frequent word in the range [index..n-1]
        m = maxIndex(frequencyList, index)

        # Bring this most frequent word to the "front" by swapping wordList[m] and
        # wordList[index] and also corresponding frequencies
        swap(wordList,index, m)
        swap(frequencyList,index, m)

        index = index + 1

    return wordList[:k]

# Merges two word and frequency lists
def merge(wL1, f1, wL2, f2):
    index = 0
    # Walk through the words in wL2 
    while index < len(wL2):
        loc = getIndex(wL1, wL2[index])
        if loc < len(wL1) and wL1[loc] == wL2[index]: # the word wL2[index] is not in wL1
            f1[loc] = f1[loc] + f2[index]
        else:
            wL1.insert(loc, wL2[index])
            f1.insert(loc, f2[index])
        index = index + 1
        
    return [wL1, f1] # the merged lists
            
#main program
# First deal with War and Peace
[wordList, frequencies] = computeWordFrequencies("war.txt")
f20 = mostFrequentWords(wordList, frequencies, 20)
print("Tolstoy")
print(f20)

# And then with Jekyl, Hyde, and the treasure island
[wordList1, frequencies1] = computeWordFrequencies("hyde.txt")
[wordList2, frequencies2] = computeWordFrequencies("treasure.txt")
[wordList, frequencies] = merge(wordList1, frequencies1, wordList2, frequencies2)
f20 = mostFrequentWords(wordList, frequencies, 20)
print("R.L.Stevenson")
print(f20)
