src/dmv.py.~21~

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38 #
  39 # 2008-06-03
  40 # - fixed a number of little bugs in initialization, where certain
  41 #   rules were simply not created, or created "backwards"
  42 # - inner_dmv() should Work now...
  43
  44
  45 # import numpy # numpy provides Fast Arrays, for future optimization
  46 import pprint
  47 import io
  48
  49 # todo: tweak this
  50 HARMONIC_C = 0.0
  51 FNONSTOP_MIN = 10
  52 FSTOP_MIN = 5
  53
  54 # non-tweakable/constant "lookup" globals
  55 BARS = [0,1,2]
  56 RBAR = 1
  57 LRBAR = 2
  58 NOBAR = 0
  59 ROOT = (LRBAR, -1)
  60 STOP = (NOBAR, -2)
  61 # (is this the best way to represent ROOT and STOP?)
  62
  63 # todo: use these instead for attachment constants. Requires making
  64 # the last two indices of chart[] one single pair, boring retyping and
  65 # testing, worth it?
  66 LRATT = (True,   True)
  67 LATT =  (True,  False)
  68 RATT =  (False,  True)
  69 NOATT = (False, False)
  70
  71 if __name__ == "__main__":
  72     print "DMV module tests:"
  73
  74 class DMV_Grammar(io.Grammar):
  75     '''The DMV-PCFG.
  76
  77     Public members:
  78     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  79     These are changed in the Maximation step, then used to set the
  80     new probabilities of each DMV_Rule.
  81
  82     Todo: make p_terminals private? (But it has to be changable in
  83     maximation step due to the short-cutting rules... could of course
  84     make a DMV_Grammar function to update the short-cut rules...)
  85
  86     __p_rules is private, but we can still say stuff like:
  87     for r in g.all_rules():
  88         r.probN = newProbN
  89
  90     What other representations do we need? (P_STOP formula uses
  91     deps_D(h,l/r) at least)'''
  92     def __str__(self):
  93         str = ""
  94         for r in self.all_rules():
  95              str += "%s\n" % r
  96         return str
  97
  98     def rules(self, b_h):
  99         bars = b_h[0]
 100         head = b_h[1]
 101         return [r for r in self.all_rules() if r.head() == head and r.bars() == bars]
 102
 103     def sent_rules(self, b_h, sent_nums):
 104         bars = b_h[0]
 105         head = b_h[1]
 106         sent_nums += [STOP[1]] # (but not ROOT though)
 107         return [r for r in self.all_rules()
 108                 if r.head() == head and r.bars() == bars
 109                 and r.L()[1] in sent_nums and r.R()[1] in sent_nums]
 110
 111     def heads(self):
 112         '''Not sure yet what is needed here, or where this is needed'''
 113         return numtag
 114
 115     def deps_L(self, head):
 116         # todo test, probably this list comprehension doesn't work
 117         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 118
 119     def deps_R(self, head):
 120         # todo test, probably this list comprehension doesn't work
 121         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 122
 123     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 124         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 125         self.p_STOP = p_STOP
 126         self.p_CHOOSE = p_CHOOSE
 127         self.p_ROOT = p_ROOT
 128
 129
 130 class DMV_Rule(io.CNF_Rule):
 131     '''A single CNF rule in the PCFG, of the form
 132     LHS -> L R
 133     where LHS = (bars, head)
 134
 135     Public members:
 136     probN, probA
 137
 138     Private members:
 139     __L, __R, __LHS
 140
 141     Different rule-types have different probabilities associated with
 142     them:
 143
 144     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 145     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 146      h_ ->  h  STOP     P( STOP|h,R,    adj)
 147      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 148      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 149      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 150      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 151      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 152     '''
 153     def p(self, LRattach, RLattach, *arg):
 154         '''Returns the correct probability, adjacent or non-adjacent,
 155         depending on whether or not there is a some lower attachment
 156         either on the right side of the left child, or the left side
 157         of the right child. '''
 158         if (not LRattach) and (not RLattach):
 159             return self.probA
 160         else:
 161             return self.probN
 162
 163     def bars(self):
 164         return self.LHS()[0]
 165
 166     def head(self):
 167         return self.LHS()[1]
 168
 169     def __init__(self, LHS, L, R, probN, probA):
 170         for b_h in [LHS, L, R]:
 171             if b_h[0] not in BARS:
 172                 raise ValueError("bars must be in %s; was given: %s" % (BARS, b_h[0]))
 173         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 174         self.probA = probA # adjacent
 175         self.probN = probN # non_adj
 176
 177     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 178     def bar_str(cls, b_h):
 179         str = " %d  " % b_h[1]
 180         if(b_h[0] == RBAR):
 181             str = " %d_ " % b_h[1]
 182         if(b_h[0] == LRBAR):
 183             str = "_%d_ " % b_h[1]
 184         if(b_h == ROOT):
 185             str = 'ROOT'
 186         if(b_h == STOP):
 187             str = 'STOP'
 188         return str
 189
 190     def __str__(self):
 191         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS()),
 192                                                   self.bar_str(self.L()),
 193                                                   self.bar_str(self.R()),
 194                                                   self.probN,
 195                                                   self.probA)
 196
 197
 198
 199
 200
 201
 202
 203 ###################################
 204 # dmv-specific version of inner() #
 205 ###################################
 206 def rewrite_adj(bars, Lattach, Rattach):
 207     # todo: make prettier? Although since we call this so many times,
 208     # having it spelled out here is probably faster
 209     if bars == NOBAR and not Lattach and Rattach:
 210         return ( (Lattach, False, False, False),
 211                  (Lattach, False, False,  True),
 212                  (Lattach, False, True,  False),
 213                  (Lattach, False, True,   True),
 214                  (Lattach,  True, False, False),
 215                  (Lattach,  True, False,  True),
 216                  (Lattach,  True, True,  False),
 217                  (Lattach,  True, True,   True), )
 218     elif bars == RBAR and Lattach:
 219         # Rattach may be either true or false here!
 220         return ( (False, False, False, Rattach),
 221                  (False, False, True,  Rattach),
 222                  (False, True,  False, Rattach),
 223                  (False, True,  True,  Rattach),
 224                  (True,  False, False, Rattach),
 225                  (True,  False, True,  Rattach),
 226                  (True,  True,  False, Rattach),
 227                  (True,  True,  True,  Rattach) )
 228     else:
 229         # NOBAR rewrite rules cannot have Lattach below, and must
 230         # have/add Rattach. RBAR rewrite rules must add Lattach, but
 231         # don't care about Rattach. Returning () should ensure we
 232         # don't add any probability to such "false" situations
 233         return ()
 234
 235 def inner_dmv(s, t, LHS, g, sent, chart):
 236     ''' A rewrite of inner in io.py, to take adjacency into accord.
 237
 238     The chart is now 4 times bigger, since there are different values
 239     for with or without L/R attachments:
 240     chart[(s,t,LHS, Lattach, Rattach)]
 241
 242     If Rattach==True then the rule has a right-attachment or there is
 243     one lower in the tree (meaning we're no longer
 244     adjacent). Adjacency depends on whether there is an attachment
 245     lower in the tree, cf. DMV_Rule.p(LRattach, RLattach).
 246
 247     Todo: make this work, then, if possible, refactor (move
 248     dmv-specific stuff back into dmv, so this is "general" again)
 249
 250     Or, if that doesn't work, we might as well make it a method of
 251     DMV_Grammar =P
 252     '''
 253
 254     def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
 255         if io.DEBUG:
 256             attach = {
 257                 (True, True): "left and right attachments below",
 258                 (True, False): "left attachment(s) below",
 259                 (False, True): "right attachment(s) below",
 260                 (False, False): "no attachments below" }
 261             info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
 262             print "%sTrying from  %s_%d  to  %s_%d  with %s, %s:" % info
 263
 264     def O(s):
 265         return sent[s]
 266
 267     sent_nums = [g.tagnum(tag) for tag in sent]
 268
 269     def e(s,t,LHS, Lattach, Rattach, n_t):
 270         def tab():
 271             return "\t"*n_t
 272
 273         if (s, t, LHS, Lattach, Rattach) in chart:
 274             return chart[(s, t, LHS, Lattach, Rattach)]
 275         else:
 276             debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
 277             if s == t:
 278                 if Lattach or Rattach:
 279                     # terminals are always F,F for attachment
 280                     io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
 281                     return 0.0
 282                 elif (LHS, O(s)) in g.p_terminals:
 283                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
 284                 else:
 285                     # todo: assuming this is how to deal with lacking
 286                     # rules, since we add prob.s, and 0 is identity
 287                     prob = 0.0
 288                     io.debug( "%sLACKING TERMINAL:" % tab())
 289                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 290                 io.debug( "%s= %.1f (terminal: %s -> %s)" % (tab(),prob,
 291                                                              DMV_Rule.bar_str(LHS),
 292                                                              O(s)) )
 293                 return prob
 294             else:
 295                 if (s,t,LHS,Lattach, Rattach) not in chart:
 296                     chart[(s,t,LHS,Lattach,Rattach)] = 0.0
 297                 for rule in g.sent_rules(LHS, sent_nums): # summing over j,k in a[LHS,j,k]
 298                     io.debug( "%ssumming rule %s" % (tab(),rule) )
 299                     L = rule.L()
 300                     R = rule.R()
 301                     # if it's a STOP rule, rewrite for the same range:
 302                     if (L == STOP) or (R == STOP):
 303                         if L == STOP:
 304                             p = rule.p(Lattach, False) # todo check
 305                             pLR = e(s, t, R, Lattach, Rattach, n_t+1)
 306                         elif R == STOP:
 307                             p = rule.p(False, Rattach) # todo check
 308                             pLR = e(s, t, L, Lattach, Rattach, n_t+1)
 309                         chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
 310
 311                     # not a STOP, an attachment rewrite:
 312                     else:
 313                         for r in range(s, t):
 314                             if L[1] in sent_nums[s:r+1] and R[1] in sent_nums[r+1:t+1]:
 315                                 # LL etc are boolean attachment values
 316                                 for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
 317                                     p = rule.p(LR, RL) # probN or probA
 318                                     pL = e(s,   r, L, LL, LR, n_t+1)
 319                                     pR = e(r+1, t, R, RL, RR, n_t+1)
 320                                     chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
 321
 322                 return chart[(s, t, LHS,Lattach,Rattach)]
 323     # end of e-function
 324
 325     inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
 326     if io.DEBUG:
 327         print "---CHART:---"
 328         for k,v in chart.iteritems():
 329             print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(k[2]),
 330                                                                    O(k[0]), k[0],
 331                                                                    O(k[1]), k[1],
 332                                                                    k[3], k[4], v)
 333         print "---CHART:end---"
 334     return [inner_prob, chart]
 335
 336
 337
 338 if __name__ == "__main__":                      # Non, Adj
 339     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 340     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 341     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 342     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 343     b2  = {}
 344     b2[(NOBAR, 0), 'h'] = 1.0
 345     b2[(RBAR, 0), 'h'] = h_S.probA
 346     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 347
 348     g2 = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 349
 350     io.DEBUG = 0
 351     test1 = inner_dmv(0, 1, (LRBAR,0), g2, 'h h'.split(), {})
 352     print "Should be 0.183: %.3f" % test1[0]
 353
 354     #ZZZ current tests
 355
 356 ##############################
 357 # DMV-probabilities, todo:   #
 358 ##############################
 359
 360 def P_STOP(STOP, h, dir, adj, corpus):
 361     '''corpus is a list of sentences s.
 362
 363 This is based on the formula where STOP is True... not sure how we
 364 calculate if STOP is False.
 365
 366
 367 I thought about instead having this:
 368
 369 for rule in g.p_rules:
 370     rule.num = 0
 371     rule.den = 0
 372 for sent in corpus:
 373     for rule in g.p_rules:
 374        for s:
 375            for t:
 376                set num and den using inner
 377 for rule in g.p_rules
 378     rule.prob = rule.num / rule.den
 379
 380 ..the way I'm assuming we do it in the commented out io-function in
 381 io.py. Having sentences as the outer loop at least we can easily just
 382 go through the heads that are actually in the sentence... BUT, this
 383 means having to go through p_rules 3 times, not sure what is slower.
 384
 385 oh, and:
 386 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 387 =D
 388
 389 '''
 390     P_STOP_num = 0
 391     P_STOP_den = 0
 392     for sent in corpus:
 393         # here we should somehow make each word in the sentence
 394         # unique, decorate them with subscripts or something. We have
 395         # to run through the sentence as many times as h appears
 396         # there. This also means changing inner(), I suspect.  Have to
 397         # make sure we separate reading of inner_prob from changing of
 398         # inner_prob...
 399         for s in range(loc(h)): # i<loc(h), where h is in the sentence.
 400             for t in range(i, len(sent)):
 401                 P_STOP_num += inner(s, t, h-r, g, sent, chart)
 402                 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
 403     return P_STOP_num / P_STOP_den # possibly other way round? todo
 404
 405
 406 def P_CHOOSE():
 407     return "todo"
 408
 409 def DMV(sent, g):
 410     '''Here it seems like they store rule information on a per-head (per
 411     direction) basis, in deps_D(h, dir) which gives us a list. '''
 412     def P_h(h):
 413         P_h = 1 # ?
 414         for dir in ['l', 'r']:
 415             for a in deps(h, dir):
 416                 # D(a)??
 417                 P_h *= \
 418                     P_STOP (0, h, dir, adj) * \
 419                     P_CHOOSE (a, h, dir) * \
 420                     P_h(D(a)) * \
 421                 P_STOP (STOP | h, dir, adj)
 422         return P_h
 423     return P_h(root(sent))
 424
 425
 426
 427 ##############################
 428 # Initialization, todo       #
 429 ##############################
 430 def taglist(corpus):
 431     '''sents is of this form:
 432 [['tag', ...], ['tag2', ...], ...]
 433
 434 Return a list of the tags. (Has to be ordered for enumerating to be
 435 consistent.)
 436
 437 Fortunately only has to run once.
 438 '''
 439     tagset = set()
 440     for sent in corpus:
 441         for tag in sent:
 442             tagset.add(tag)
 443     if 'ROOT' in tagset:
 444         raise ValueError("it seems we must have a new ROOT symbol")
 445     return list(tagset)
 446
 447
 448
 449
 450
 451 def init_zeros(tags):
 452     '''Return a frequency dictionary with DMV-relevant keys set to 0 or
 453     {}.
 454
 455     Todo: tweak (especially for f_STOP).'''
 456     f = {}
 457     for tag in tags:
 458         f['ROOT', tag] = 0
 459         f['sum', 'ROOT'] = 0
 460         for dir_adj in ['LN','LA','RN','RA']:
 461             f[tag, 'STOP', dir_adj] = FSTOP_MIN
 462             f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
 463         f[tag, 'R'] = {}
 464         f[tag, 'L'] = {}
 465         f[tag, 'sum', 'R'] = 0.0
 466         f[tag, 'sum', 'L'] = 0.0
 467     return f
 468
 469 def init_freq(corpus, tags):
 470     '''Returns f, a dictionary with these types of keys:
 471     - ('ROOT', tag) is basically just the frequency of tag
 472     - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
 473       etc. for 'RN', 'LA', 'LN', '-STOP'.
 474     - (tag, 'L') is a dictionary of arg:f, where head could take arg
 475       to direction 'L' (etc. for 'R') and f is "harmonically" divided
 476       by distance, used for finding P_CHOOSE
 477
 478       Does this stuff:
 479       1. counts word frequencies for f_ROOT
 480       2. adds to certain f_STOP counters if a word is found first,
 481          last, first or second, or last or second to last in the sentence
 482          (Left Adjacent, Left Non-Adjacent, etc)
 483       3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
 484          distance between arg and head)
 485     '''
 486     f = init_zeros(tags)
 487
 488     for sent in corpus: # sent is ['VBD', 'NN', ...]
 489         n = len(sent)
 490         # NOTE: head in DMV_Rule is a number, while this is the string
 491         for i_h, head in enumerate(sent):
 492             # todo grok: how is this different from just using straight head
 493             # frequency counts, for the ROOT probabilities?
 494             f['ROOT', head] += 1
 495             f['sum', 'ROOT'] += 1
 496
 497             # True = 1, False = 0. todo: make prettier
 498             f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
 499             f[head, '-STOP', 'LN'] += (not i_h == 1) # not second
 500             f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
 501             f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
 502             f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
 503             f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
 504             f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
 505             f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
 506
 507             # this is where we make the "harmonic" distribution. quite.
 508             for i_a, arg in enumerate(sent):
 509                 if i_h != i_a:
 510                     harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
 511                     if i_h > i_a:
 512                         dir = 'L'
 513                     else:
 514                         dir = 'R'
 515                     if arg not in f[head, dir]:
 516                         f[head, dir][arg] = 0.0
 517                     f[head, dir][arg] += harmony
 518                     f[head, 'sum', dir] += harmony
 519                 # todo, optimization: possible to do both directions
 520                 # at once here, and later on rule out the ones we've
 521                 # done? does it actually speed things up?
 522
 523     return f
 524
 525 def init_normalize(f, tags, tagnum, numtag):
 526     '''Use frequencies (and sums) in f to return create p_STOP and
 527     p_CHOOSE; at the same time adding the context-free rules to the
 528     grammar using these probabilities.
 529
 530     Return a usable grammar.'''
 531     p_rules = []
 532     p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
 533     for n_h, head in enumerate(tags):
 534         p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
 535         p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
 536                                  p_ROOT[n_h],
 537                                  p_ROOT[n_h]))
 538
 539         # p_STOP = STOP / (STOP + NOT_STOP)
 540         for dir in ['L','R']:
 541             for adj in ['N','A']:
 542                 p_STOP[n_h, dir+adj] = \
 543                     float(f[head, 'STOP', dir+adj]) / \
 544                     (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
 545         # make rule using the previously found probN and probA:
 546         p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
 547                                  p_STOP[n_h, 'RN'],
 548                                  p_STOP[n_h, 'RA']) )
 549         p_rules.append( DMV_Rule((LRBAR, n_h), STOP, (RBAR, n_h),
 550                                  p_STOP[n_h, 'LN'],
 551                                  p_STOP[n_h, 'LA']) )
 552
 553         # inner() shouldn't have to deal with those long non-branching stops:
 554         p_terminals[(NOBAR, n_h), head] = 1.0
 555         p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
 556         p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
 557
 558         for dir in ['L', 'R']:
 559             for arg, val in f[head, dir].iteritems():
 560                 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
 561
 562     # after the head tag-loop, add every head-argument rule:
 563     for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
 564         if dir == 'L': # arg is to the left of head
 565             p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
 566                                      p_C*(1-p_STOP[n_h, dir+'N']),
 567                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 568         if dir == 'R':
 569             p_rules.append( DMV_Rule((NOBAR,n_h), (NOBAR,n_h), (LRBAR,n_a),
 570                                      p_C*(1-p_STOP[n_h, dir+'N']),
 571                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 572
 573     return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum)
 574
 575
 576 def initialize(corpus):
 577     '''Return an initialized DMV_Grammar
 578     corpus is a list of lists of tags.'''
 579     tags = taglist(corpus)
 580     tagnum, numtag = {}, {}
 581     for num, tag in enumerate(tags):
 582         tagnum[tag] = num
 583         numtag[num] = tag
 584     # f: frequency counts used in initialization, mostly distances
 585     f = init_freq(corpus, tags)
 586     g = init_normalize(f, tags, tagnum, numtag)
 587     return g
 588
 589
 590 if __name__ == "__main__":
 591 #     print "--------initialization------------"
 592 #     print initialize([['foo', 'two','foo','foo'],
 593 #                       ['zero', 'one','two','three']])
 594     pass
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607 # todo: some testing on the Brown corpus:
 608
 609 if __name__ == "__main__":
 610     # first five sentences of the Brown corpus:
 611     g_brown = initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 612     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 613
 614     io.DEBUG = 0
 615     test_brown = inner_dmv(0,3, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
 616     if io.DEBUG:
 617         for r in  g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
 618             L = r.L()
 619             R = r.R()
 620             if L[1] in [36,40,-2] and R[1] in [36,40,-2]:
 621                 print r
 622     print "Brown-test gives: %.8f" % test_brown[0]
 623
 624
 625
 626     # this will give the tag sequences of all the 6218 Brown corpus
 627     # sentences of length < 7:
 628     # [[tag for (w, tag) in sent]
 629     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 630
 631
 632 def tagset_brown():
 633     "472 tags, takes a while to extract with tagset(), hardcoded here."
 634     return set(['BEDZ-NC', 'NP$', 'AT-TL', 'CS', 'NP+HVZ', 'IN-TL-HL', 'NR-HL', 'CC-TL-HL', 'NNS$-HL', 'JJS-HL', 'JJ-HL', 'WRB-TL', 'JJT-TL', 'WRB', 'DOD*', 'BER*-NC', ')-HL', 'NPS$-HL', 'RB-HL', 'FW-PPSS', 'NP+HVZ-NC', 'NNS$', '--', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'PPSS+MD', 'NPS', 'RBR+CS', 'DTI', 'NPS-TL', 'BEM', 'FW-AT+NP-TL', 'EX+BEZ', 'BEG', 'BED', 'BEZ', 'DTX', 'DOD*-TL', 'FW-VB-NC', 'DTS', 'DTS+BEZ', 'QL-HL', 'NP$-TL', 'WRB+DOD*', 'JJR+CS', 'NN+MD', 'NN-TL-HL', 'HVD-HL', 'NP+BEZ-NC', 'VBN+TO', '*-TL', 'WDT-HL', 'MD', 'NN-HL', 'FW-BE', 'DT$', 'PN-TL', 'DT-HL', 'FW-NR-TL', 'VBG', 'VBD', 'VBN', 'DOD', 'FW-VBG-TL', 'DOZ', 'ABN-TL', 'VB+JJ-NC', 'VBZ', 'RB+CS', 'FW-PN', 'CS-NC', 'VBG-NC', 'BER-HL', 'MD*', '``', 'WPS-TL', 'OD-TL', 'PPSS-HL', 'PPS+MD', 'DO*', 'DO-HL', 'HVG-HL', 'WRB-HL', 'JJT', 'JJS', 'JJR', 'HV+TO', 'WQL', 'DOD-NC', 'CC-HL', 'FW-PPSS+HV', 'FW-NP-TL', 'MD+TO', 'VB+IN', 'JJT-NC', 'WDT+BEZ-TL', '---HL', 'PN$', 'VB+PPO', 'BE-TL', 'VBG-TL', 'NP$-HL', 'VBZ-TL', 'UH', 'FW-WPO', 'AP+AP-NC', 'FW-IN', 'NRS-TL', 'ABL', 'ABN', 'TO-TL', 'ABX', '*-HL', 'FW-WPS', 'VB-NC', 'HVD*', 'PPS+HVD', 'FW-IN+AT', 'FW-NP', 'QLP', 'FW-NR', 'FW-NN', 'PPS+HVZ', 'NNS-NC', 'DT+BEZ-NC', 'PPO', 'PPO-NC', 'EX-HL', 'AP$', 'OD-NC', 'RP', 'WPS+BEZ', 'NN+BEZ', '.-TL', ',', 'FW-DT+BEZ', 'RB', 'FW-PP$-NC', 'RN', 'JJ$-TL', 'MD-NC', 'VBD-NC', 'PPSS+BER-N', 'RB+BEZ-NC', 'WPS-HL', 'VBN-NC', 'BEZ-HL', 'PPL-NC', 'BER-TL', 'PP$$', 'NNS+MD', 'PPS-NC', 'FW-UH-NC', 'PPS+BEZ-NC', 'PPSS+BER-TL', 'NR-NC', 'FW-JJ', 'PPS+BEZ-HL', 'NPS$', 'RB-TL', 'VB-TL', 'BEM*', 'MD*-HL', 'FW-CC', 'NP+MD', 'EX+HVZ', 'FW-CD', 'EX+HVD', 'IN-HL', 'FW-CS', 'JJR-HL', 'FW-IN+NP-TL', 'JJ-TL-HL', 'FW-UH', 'EX', 'FW-NNS-NC', 'FW-JJ-NC', 'VBZ-HL', 'VB+RP', 'BEZ-NC', 'PPSS+HV-TL', 'HV*', 'IN', 'PP$-NC', 'NP-NC', 'BEN', 'PP$-TL', 'FW-*-TL', 'FW-OD-TL', 'WPS', 'WPO', 'MD+PPSS', 'WDT+BER', 'WDT+BEZ', 'CD-HL', 'WDT+BEZ-NC', 'WP$', 'DO+PPSS', 'HV-HL', 'DT-NC', 'PN-NC', 'FW-VBZ', 'HVD', 'HVG', 'NN+BEZ-TL', 'HVZ', 'FW-VBD', 'FW-VBG', 'NNS$-TL', 'JJ-TL', 'FW-VBN', 'MD-TL', 'WDT+DOD', 'HV-TL', 'NN-TL', 'PPSS', 'NR$', 'BER', 'FW-VB', 'DT', 'PN+BEZ', 'VBG-HL', 'FW-PPL+VBZ', 'FW-NPS-TL', 'RB$', 'FW-IN+NN', 'FW-CC-TL', 'RBT', 'RBR', 'PPS-TL', 'PPSS+HV', 'JJS-TL', 'NPS-HL', 'WPS+BEZ-TL', 'NNS-TL-HL', 'VBN-TL-NC', 'QL-TL', 'NN+NN-NC', 'JJR-TL', 'NN$-TL', 'FW-QL', 'IN-TL', 'BED-NC', 'NRS', '.-HL', 'QL', 'PP$-HL', 'WRB+BER', 'JJ', 'WRB+BEZ', 'NNS$-TL-HL', 'PPSS+BEZ', '(', 'PPSS+BER', 'DT+MD', 'DOZ-TL', 'PPSS+BEM', 'FW-PP$', 'RB+BEZ-HL', 'FW-RB+CC', 'FW-PPS', 'VBG+TO', 'DO*-HL', 'NR+MD', 'PPLS', 'IN+IN', 'BEZ*', 'FW-PPL', 'FW-PPO', 'NNS-HL', 'NIL', 'HVN', 'PPSS+BER-NC', 'AP-TL', 'FW-DT', '(-HL', 'DTI-TL', 'JJ+JJ-NC', 'FW-RB', 'FW-VBD-TL', 'BER-NC', 'NNS$-NC', 'JJ-NC', 'NPS$-TL', 'VB+VB-NC', 'PN', 'VB+TO', 'AT-TL-HL', 'BEM-NC', 'PPL-TL', 'ABN-HL', 'RB-NC', 'DO-NC', 'BE-HL', 'WRB+IN', 'FW-UH-TL', 'PPO-HL', 'FW-CD-TL', 'TO-HL', 'PPS+BEZ', 'CD$', 'DO', 'EX+MD', 'HVZ-TL', 'TO-NC', 'IN-NC', '.', 'WRB+DO', 'CD-NC', 'FW-PPO+IN', 'FW-NN$-TL', 'WDT+BEZ-HL', 'RP-HL', 'CC', 'NN+HVZ-TL', 'FW-NNS-TL', 'DT+BEZ', 'WPS+HVZ', 'BEDZ*', 'NP-TL', ':-TL', 'NN-NC', 'WPO-TL', 'QL-NC', 'FW-AT+NN-TL', 'WDT+HVZ', '.-NC', 'FW-DTS', 'NP-HL', ':-HL', 'RBR-NC', 'OD-HL', 'BEDZ-HL', 'VBD-TL', 'NPS-NC', ')', 'TO+VB', 'FW-IN+NN-TL', 'PPL', 'PPS', 'PPSS+VB', 'DT-TL', 'RP-NC', 'VB', 'FW-VB-TL', 'PP$', 'VBD-HL', 'DTI-HL', 'NN-TL-NC', 'PPL-HL', 'DOZ*', 'NR-TL', 'WRB+MD', 'PN+HVZ', 'FW-IN-TL', 'PN+HVD', 'BEN-TL', 'BE', 'WDT', 'WPS+HVD', 'DO-TL', 'FW-NN-NC', 'WRB+BEZ-TL', 'UH-TL', 'JJR-NC', 'NNS', 'PPSS-NC', 'WPS+BEZ-NC', ',-TL', 'NN$', 'VBN-TL-HL', 'WDT-NC', 'OD', 'FW-OD-NC', 'DOZ*-TL', 'PPSS+HVD', 'CS-TL', 'WRB+DOZ', 'CC-NC', 'HV', 'NN$-HL', 'FW-WDT', 'WRB+DOD', 'NN+HVZ', 'AT-NC', 'NNS-TL', 'FW-BEZ', 'CS-HL', 'WPO-NC', 'FW-BER', 'NNS-TL-NC', 'BEZ-TL', 'FW-IN+AT-T', 'ABN-NC', 'NR-TL-HL', 'BEDZ', 'NP+BEZ', 'FW-AT-TL', 'BER*', 'WPS+MD', 'MD-HL', 'BED*', 'HV-NC', 'WPS-NC', 'VBN-HL', 'FW-TO+VB', 'PPSS+MD-NC', 'HVZ*', 'PPS-HL', 'WRB-NC', 'VBN-TL', 'CD-TL-HL', ',-NC', 'RP-TL', 'AP-HL', 'FW-HV', 'WQL-TL', 'FW-AT', 'NN', 'NR$-TL', 'VBZ-NC', '*', 'PPSS-TL', 'JJT-HL', 'FW-NNS', 'NP', 'UH-HL', 'NR', ':', 'FW-NN$', 'RP+IN', ',-HL', 'JJ-TL-NC', 'AP-NC', '*-NC', 'VB-HL', 'HVZ-NC', 'DTS-HL', 'FW-JJT', 'FW-JJR', 'FW-JJ-TL', 'FW-*', 'RB+BEZ', "''", 'VB+AT', 'PN-HL', 'PPO-TL', 'CD-TL', 'UH-NC', 'FW-NN-TL-NC', 'EX-NC', 'PPSS+BEZ*', 'TO', 'WDT+DO+PPS', 'IN+PPO', 'AP', 'AT', 'DOZ-HL', 'FW-RB-TL', 'CD', 'NN+IN', 'FW-AT-HL', 'PN+MD', "'", 'FW-PP$-TL', 'FW-NPS', 'WDT+BER+PP', 'NN+HVD-TL', 'MD+HV', 'AT-HL', 'FW-IN+AT-TL'])
 635
 636
 637
 638