import random
import re
import sys
from collections import Counter
from operator import itemgetter

import numpy

# Written to run unchanged on Python 2.7 and Python 3: print() is only ever
# called with one pre-formatted string and no Python-2-only builtins are used.

N_states = 15     # Number of hidden states in the HMM
iterations = 100  # Number of training iterations
bottom = 2        # Words occurring this many times or fewer are merged

# Sub-patterns, tried in this order: ordinary words, single punctuation
# characters, times, punctuated numbers and unpunctuated integers.
TOKEN_PATTERN = (
    r"[a-zA-Z']+|"
    + r'["' + r":;,!.?()$*`'\[\]<>&/-]|"
    + r"[0-9]+:[0-9]+|"
    + r"[0-9]+[0-9.,]*[0-9]+|"
    + r"[0-9]+"
)


def tokenize(text):
    """Split raw text into a list of word, punctuation and number tokens.

    Before matching, words hyphenated across a line break are rejoined,
    runs of dashes are shortened to '--' and runs of periods to '...'.
    Characters matched by no sub-pattern of TOKEN_PATTERN (for example
    white space) are dropped.
    """
    text = re.sub('-\n', '', text)
    text = re.sub('--+', '--', text)
    # The replacement is a plain string: a template such as '\.' would put
    # literal backslashes into the text.
    text = re.sub(r'\.\.\.+', '...', text)
    return re.findall(TOKEN_PATTERN, text)


def rank_words(tokens, rare_max=2):
    """Rank the distinct tokens by frequency and assign integer codes.

    Args:
        tokens: sequence of token strings.
        rare_max: tokens occurring this many times or fewer share one code.

    Returns:
        A tuple (word_list, merge, codes) where
        word_list is a list of (word, count) tuples sorted by decreasing
            count, with entry "merge" replaced by the placeholder
            ('****', rare_max) that stands for all rare words (the
            placeholder is appended when there are no rare words);
        merge is the rank at which the rare tail starts, which is also the
            code shared by all rare words;
        codes maps every distinct token to min(rank, merge).
    """
    counts = Counter(tokens)
    # reverse=True keeps the sort stable, so ties stay in counting order.
    word_list = sorted(counts.items(), key=itemgetter(1), reverse=True)

    merge = len(word_list)  # Used when no word is rare
    for rank, (_, count) in enumerate(word_list):
        if count <= rare_max:
            merge = rank
            break

    # Build the codes before the placeholder overwrites an entry, so that
    # the word sitting at rank "merge" is also mapped to "merge".
    codes = {word: min(rank, merge)
             for rank, (word, _) in enumerate(word_list)}

    placeholder = ('****', rare_max)
    if merge < len(word_list):
        word_list[merge] = placeholder
    else:
        word_list.append(placeholder)
    return word_list, merge, codes


def randomP(A):
    """Fill allocated array A with random normalized probability.

    A is modified in place (so rows of a matrix can be passed) and is also
    returned.  Values come from random.random(), one call per element in
    index order.
    """
    total = 0
    for i in range(len(A)):
        x = random.random()
        # Accumulate sequentially rather than calling A.sum() so that the
        # normalizing constant does not depend on numpy's summation order.
        total += x
        A[i] = x
    A /= total
    return A


def top_word_indices(states, codes, state, merge, how_many=10):
    """Return ranks of the words seen most often while in "state".

    Args:
        states: decoded state sequence, one entry per token.
        codes: integer word codes, aligned with "states".
        state: the state of interest.
        merge: code of the merged rare words; those tokens are not counted.
        how_many: maximum number of ranks to return.

    Returns:
        Up to "how_many" word ranks (all < merge) in order of decreasing
        count; ties are broken in favor of the lower rank.
    """
    counts = [0] * merge
    for s, code in zip(states, codes):
        if s == state and code != merge:
            counts[int(code)] += 1
    ranked = sorted(range(merge), key=lambda n: -counts[n])
    return ranked[:how_many]


def main(argv=None):
    """Train an HMM on the words of a text and print a LaTeX table.

    Usage: po_speech.py text_file [random_seed]

    Writes to standard output a comment line with vocabulary statistics and
    then, for each state, one table row holding the most frequent words
    that Viterbi decoding assigned to that state.
    """
    # Imported here rather than at file level so that the text helpers above
    # can be imported without the project's Scalar module being installed.
    import Scalar

    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: po_speech.py text_file [random_seed]')
    text = argv[1]
    try:
        random.seed(int(argv[2]))
    except (IndexError, ValueError):
        random.seed(3)  # Default seed when none (or a non-integer) is given

    # Read the text
    with open(text, 'r') as text_file:
        tokens = tokenize(text_file.read())

    word_list, merge, codes = rank_words(tokens, bottom)
    print('%%bottom=%d merge=%d len(word_list)=%d' % (
        bottom, merge, len(codes)))

    # Map the sequence of words to a sequence of integers
    y = numpy.array([codes[word] for word in tokens], numpy.int32)

    # Make an HMM.  The order of the randomP calls fixes the initial model
    # for a given seed, so it must not be changed.
    Card_Y = merge + 1
    P_S0 = randomP(numpy.zeros(N_states))
    P_S0_ergodic = randomP(numpy.zeros(N_states))
    P_ScS = numpy.zeros((N_states, N_states))
    P_YcS = numpy.zeros((N_states, Card_Y))
    for AA in (P_ScS, P_YcS):
        for A in AA:
            randomP(A)
    model = Scalar.HMM(P_S0, P_S0_ergodic, P_ScS, P_YcS)

    # Train the model
    sys.stderr.write(""" Begin training in po_speech.py. Takes 26 minutes on a 1 GHZ 64bit Athlon. """ + '\n')
    model.train(y, iterations, display=False)  # Return value is not used

    # Do Viterbi decoding
    ss = model.decode(y)

    # Print the most frequent 10 words associated with each state (fewer if
    # the text has fewer than 10 frequent words)
    for s_n in range(N_states):
        cells = ['\\rule{0pt}{2.0ex} %d' % (s_n + 1)]
        for rank in top_word_indices(ss, y, s_n, merge):
            cells.append('&%s' % word_list[rank][0])
        cells.append('\\\\')
        print(' '.join(cells))


if __name__ == '__main__':
    main()

#Local Variables:
#mode:python
#End: