# Programmer: Sriram Pemmaraju
# Date: March 26th, 2014
# Modified on March 24th, 2015

# This program aims to do simple text analysis to figure out the principal 
# characters of a novel. Since character names are proper nouns, starting with
# upper case letters, the idea is to look for words starting with upper case
# letters that do not appear at the beginning of sentences. So the program also
# attempts to partition the text into sentences, assuming that ".", "!", and "?"
# are all the possible sentence delimiters. Then we count the frequency of the
# proper nouns and report the most frequent of these. It is likely that the
# principal characters of the novel can be found among this list.

# Takes a string as parameter and "splits" it into "sentences." 
# We assume that ".", "!", and "?" are sentence delimiters
def parseSentences(bigString):
    return bigString.replace("!", ".").replace("?", ".").split(".")

# Replaces all non-letters in a given string s by space
def replaceNonLetters(s):
    # Make a list of all non-letters. Note the use of the list comprehension here
    nonLetters = [x for x in s if not x.isalpha()]

    # Replaces each nonletter character in s by space
    for char in nonLetters:
        s = s.replace(char, " ")

    return s
    
# Takes a list of sentences and parses each sentence in this list into a list of words.
# So the result is a list of lists, e.g., [["This", "is", "ok"], ["This", "is", "not"]].
# We use the same definition of a word as before. It is a contiguous sequence of letters.
def parseWords(sentenceList):

    # Once non-letters have been replaced by spaces then a simple split() using
    # blank as the delimiter will help us get all the words. Note that this
    # constructs a nested list of words for each sentence.
    return [replaceNonLetters(x).split() for x in sentenceList]

# Searches for a word in wordList and returns the index of the first occurrence
# of word in wordList if found; otherwise returns -1.
def getIndex(wordList, word):
    index = 0
    while index < len(wordList):
        if wordList[index] == word:
            return index
        index = index + 1
    return -1

# Used to compute frequencies of words in a given word list, that may contain
# duplicates.
def computeFrequencies(wordList):
    masterList = [] # for maintaining the list of words
    frequencies = [] # for maintaining frequencies of these words
    
    for word in wordList:
        # if word is already in masterList, increase its frequecy
        loc = getIndex(masterList, word)
        if loc >= 0:
            frequencies[loc] = frequencies[loc] + 1
        else: # this is the first occurrence of the word, so just append
              # and set frequency to be 1
            masterList.append(word)
            frequencies.append(1)
            
    return [masterList, frequencies]

# main program
f = open("illiad.txt", "r")
bigString = f.read()
sentenceList = parseSentences(bigString)
nestedWordList = parseWords(sentenceList)

# This block of code walks through the list of words, ignores
# the first word in each sentence and of the remaining words, picks
# ones that start with an upper case and have length at least 4.
nestedWordList = [x[1:] for x in nestedWordList]
wordList = [y for x in nestedWordList for y in x]
characterNames = [x for x in wordList if x[0].isupper() and len(x) > 3]
            
[masterList, frequencies] = computeFrequencies(characterNames)

# zips the frequencies and words together and sorts the zipped list in descending
# order of frequencies.
combinedList = [[frequencies[i], masterList[i]] for i in range(len(masterList))]
combinedList.sort(reverse = True)

# Prints the 30 most frequent character names
print(combinedList[:30])
