# Programmer: Sriram Pemmaraju # Date: March 26th, 2014 # Modified on March 24th, 2015 # This program aims to do simple text analysis to figure out the principal # characters of a novel. Since character names are proper nouns, starting with # upper case letters, the idea is to look for words starting with upper case # letters that do not appear at the beginning of sentences. So the program also # attempts to partition the text into sentences, assuming that ".", "!", and "?" # are all the possible sentence delimiters. Then we count the frequency of the # proper nouns and report the most frequent of these. It is likely that the # principal characters of the novel can be found among this list. # Takes a string as parameter and "splits" it into "sentences." # We assume that ".", "!", and "?" are sentence delimiters def parseSentences(bigString): return bigString.replace("!", ".").replace("?", ".").split(".") # Replaces all non-letters in a given string s by space def replaceNonLetters(s): # Make a list of all non-letters. Note the use of the list comprehension here nonLetters = [x for x in s if not x.isalpha()] # Replaces each nonletter character in s by space for char in nonLetters: s = s.replace(char, " ") return s # Takes a list of sentences and parses each sentence in this list into a list of words. # So the result is a list of lists, e.g., [["This", "is", "ok"], ["This", "is", "not"]]. # We use the same definition of a word as before. It is a contiguous sequence of letters. def parseWords(sentenceList): # Once non-letters have been replaced by spaces then a simple split() using # blank as the delimiter will help us get all the words. Note that this # constructs a nested list of words for each sentence. return [replaceNonLetters(x).split() for x in sentenceList] # Searches for a word in wordList and returns the index of the first occurrence # of word in wordList if found; otherwise returns -1. def getIndex(wordList, word): index = 0 while index < len(wordList): if wordList[index] == word: return index index = index + 1 return -1 # Used to compute frequencies of words in a given word list, that may contain # duplicates. def computeFrequencies(wordList): masterList = [] # for maintaining the list of words frequencies = [] # for maintaining frequencies of these words for word in wordList: # if word is already in masterList, increase its frequecy loc = getIndex(masterList, word) if loc >= 0: frequencies[loc] = frequencies[loc] + 1 else: # this is the first occurrence of the word, so just append # and set frequency to be 1 masterList.append(word) frequencies.append(1) return [masterList, frequencies] # main program f = open("illiad.txt", "r") bigString = f.read() sentenceList = parseSentences(bigString) nestedWordList = parseWords(sentenceList) # This block of code walks through the list of words, ignores # the first word in each sentence and of the remaining words, picks # ones that start with an upper case and have length at least 4. nestedWordList = [x[1:] for x in nestedWordList] wordList = [y for x in nestedWordList for y in x] characterNames = [x for x in wordList if x[0].isupper() and len(x) > 3] [masterList, frequencies] = computeFrequencies(characterNames) # zips the frequencies and words together and sorts the zipped list in descending # order of frequencies. combinedList = [[frequencies[i], masterList[i]] for i in range(len(masterList))] combinedList.sort(reverse = True) # Prints the 30 most frequent character names print(combinedList[:30])