src/dmv.py

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38 #
  39 # 2008-06-03
  40 # - fixed a number of little bugs in initialization, where certain
  41 #   rules were simply not created, or created "backwards"
  42 # - inner_dmv() should Work now...
  43 #
  44 # 2008-06-04
  45 # - moved initialization to harmonic.py
  46
  47
  48 # import numpy # numpy provides Fast Arrays, for future optimization
  49 import pprint
  50 import io
  51 import harmonic
  52
  53 # non-tweakable/constant "lookup" globals
  54 BARS = [0,1,2]
  55 RBAR = 1
  56 LRBAR = 2
  57 NOBAR = 0
  58 ROOT = (LRBAR, -1)
  59 STOP = (NOBAR, -2)
  60
  61 if __name__ == "__main__":
  62     print "DMV module tests:"
  63
  64
  65 def node(bars, head):
  66     '''Useless function, but just here as documentation. Nodes make up
  67     LHS, R and L in each DMV_Rule'''
  68     return (bars, head)
  69
  70 def bars(node):
  71     return node[0]
  72
  73 def head(node):
  74     return node[1]
  75
  76
  77 class DMV_Grammar(io.Grammar):
  78     '''The DMV-PCFG.
  79
  80     Public members:
  81     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  82     These are changed in the Maximation step, then used to set the
  83     new probabilities of each DMV_Rule.
  84
  85     Todo: make p_terminals private? (But it has to be changable in
  86     maximation step due to the short-cutting rules... could of course
  87     make a DMV_Grammar function to update the short-cut rules...)
  88
  89     __p_rules is private, but we can still say stuff like:
  90     for r in g.all_rules():
  91         r.probN = newProbN
  92
  93     What other representations do we need? (P_STOP formula uses
  94     deps_D(h,l/r) at least)'''
  95     def __str__(self):
  96         str = ""
  97         for r in self.all_rules():
  98              str += "%s\n" % r.__str__(self.numtag)
  99         return str
 100
 101     def h_rules(self, h):
 102         return [r for r in self.all_rules() if r.head() == h]
 103
 104     def rules(self, LHS):
 105         return [r for r in self.all_rules() if r.LHS() == LHS]
 106
 107     def sent_rules(self, LHS, sent_nums):
 108         "Used in inner_dmv."
 109         # We don't want to rule out STOPs!
 110         sent_nums.append( head(STOP) )
 111         return [r for r in self.all_rules() if r.LHS() == LHS
 112                 and head(r.L()) in sent_nums and head(r.R()) in sent_nums]
 113
 114     def heads(self):
 115         '''Not sure yet what is needed here, or where this is needed'''
 116         return numtag
 117
 118     def deps_L(self, head):
 119         # todo test, probably this list comprehension doesn't work
 120         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 121
 122     def deps_R(self, head):
 123         # todo test, probably this list comprehension doesn't work
 124         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 125
 126     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 127         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 128         self.p_STOP = p_STOP
 129         self.p_CHOOSE = p_CHOOSE
 130         self.p_ROOT = p_ROOT
 131
 132
 133 class DMV_Rule(io.CNF_Rule):
 134     '''A single CNF rule in the PCFG, of the form
 135     LHS -> L R
 136     where LHS, L and R are 'nodes', eg. of the form (bars, head).
 137
 138     Public members:
 139     probN, probA
 140
 141     Private members:
 142     __L, __R, __LHS
 143
 144     Different rule-types have different probabilities associated with
 145     them:
 146
 147     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 148     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 149      h_ ->  h  STOP     P( STOP|h,R,    adj)
 150      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 151      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 152      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 153      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 154      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 155     '''
 156     def p(self, adj, *arg):
 157         if adj:
 158             return self.probA
 159         else:
 160             return self.probN
 161
 162     def p_STOP(self, s, t, loc_h):
 163         '''Returns the correct probability, adjacent if we're rewriting from
 164         the (either left or right) end of the fragment. '''
 165         if self.L() == STOP:
 166             return self.p(s == loc_h)
 167         elif self.R() == STOP:
 168             if not loc_h == s:
 169                 io.debug( "(%s given loc_h:%d but s:%d. Todo: optimize away!)"
 170                           % (self, loc_h, s) )
 171                 return 0.0
 172             else:
 173                 return self.p(t == loc_h)
 174
 175     def p_ATTACH(self, r, loc_h, s=None):
 176         '''Returns the correct probability, adjacent if we haven't attached
 177         anything before.'''
 178         if self.LHS() == self.L():
 179             if not loc_L == s:
 180                 io.debug( "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)"
 181                           % (self, loc_L, s) )
 182                 return 0.0
 183             else:
 184                 return self.p(r == loc_h)
 185         elif self.LHS() == self.R():
 186             return self.p(r+1 == loc_h)
 187
 188     def bars(self):
 189         return bars(self.LHS())
 190
 191     def head(self):
 192         return head(self.LHS())
 193
 194     def __init__(self, LHS, L, R, probN, probA):
 195         for b_h in [LHS, L, R]:
 196             if bars(b_h) not in BARS:
 197                 raise ValueError("bars must be in %s; was given: %s"
 198                                  % (BARS, bars(b_h)))
 199         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 200         self.probA = probA # adjacent
 201         self.probN = probN # non_adj
 202
 203     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 204     def bar_str(cls, b_h, tag=lambda x:x):
 205         if(b_h == ROOT):
 206             return 'ROOT'
 207         elif(b_h == STOP):
 208             return 'STOP'
 209         elif(bars(b_h) == RBAR):
 210             return " %s_ " % tag(head(b_h))
 211         elif(bars(b_h) == LRBAR):
 212             return "_%s_ " % tag(head(b_h))
 213         else:
 214             return " %s  " % tag(head(b_h))
 215
 216
 217     def __str__(self, tag=lambda x:x):
 218         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 219                                                   self.bar_str(self.L(), tag),
 220                                                   self.bar_str(self.R(), tag),
 221                                                   self.probN,
 222                                                   self.probA)
 223
 224
 225
 226
 227
 228
 229
 230 ###################################
 231 # dmv-specific version of inner() #
 232 ###################################
 233 def locs(h, sent, s=0, t=None):
 234     '''Return the locations of h in sent, or some fragment of sent (in the
 235     latter case we make sure to offset the locations correctly so that
 236     for any x in the returned list, sent[x]==h).'''
 237     if t == None:
 238         t = len(sent)
 239     return [i+s for i,w in enumerate(sent[s:t]) if w == h]
 240
 241
 242 def inner_dmv(s, t, LHS, loc_h, g, sent, chart):
 243     ''' A rewrite of inner in io.py, to take adjacency into accord.
 244
 245     The chart is now of this form:
 246     chart[(s,t,LHS, loc_h)]
 247
 248     loc_h gives adjacency (along with r and location of other child
 249     for attachment rules), and is needed in P_STOP reestimation.
 250
 251     Todo: if possible, refactor (move dmv-specific stuff back into
 252     dmv, so this is "general" enough to be in io.py)
 253     '''
 254
 255     def O(s):
 256         return sent[s]
 257
 258     sent_nums = [g.tagnum(tag) for tag in sent]
 259
 260     def e(s,t,LHS, loc_h, n_t):
 261         def tab():
 262             "Tabs for debug output"
 263             return "\t"*n_t
 264
 265         if (s, t, LHS, loc_h) in chart:
 266             io.debug("%s*= %.4f in chart: s:%d t:%d LHS:%s loc:%d"
 267                      %(tab(),chart[(s, t, LHS, loc_h)], s, t,
 268                        DMV_Rule.bar_str(LHS), loc_h))
 269             return chart[(s, t, LHS, loc_h)]
 270         else:
 271             if s == t:
 272                 if not loc_h == s:
 273                     # terminals are always F,F for attachment
 274                     io.debug("%s*= 0.0 (wrong loc_h)" % tab())
 275                     return 0.0
 276                 elif (LHS, O(s)) in g.p_terminals:
 277                     prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
 278                 else:
 279                     # todo: assuming this is how to deal w/lacking
 280                     # rules, since we add prob.s, and 0 is identity
 281                     prob = 0.0
 282                     io.debug( "%sLACKING TERMINAL:" % tab())
 283                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 284                 io.debug( "%s*= %.4f (terminal: %s -> %s_%d)"
 285                           % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h) )
 286                 return prob
 287             else:
 288                 p = 0.0 # "sum over j,k in a[LHS,j,k]"
 289                 for rule in g.sent_rules(LHS, sent_nums):
 290                     io.debug( "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h) )
 291                     L = rule.L()
 292                     R = rule.R()
 293                     # if it's a STOP rule, rewrite for the same range:
 294                     if (L == STOP) or (R == STOP):
 295                         if L == STOP:
 296                             pLR = e(s, t, R, loc_h, n_t+1)
 297                         elif R == STOP:
 298                             pLR = e(s, t, L, loc_h, n_t+1)
 299                         p += rule.p_STOP(s, t, loc_h) * pLR
 300                         io.debug( "%sp= %.4f (STOP)" % (tab(), p) )
 301
 302                     else: # not a STOP, an attachment rewrite:
 303                         for r in range(s, t):
 304                             p_h = rule.p_ATTACH(r, loc_h, s=s)
 305                             if rule.LHS() == L:
 306                                 locs_L = [loc_h]
 307                                 locs_R = locs(head(R), sent_nums, r+1, t+1)
 308                             elif rule.LHS() == R:
 309                                 locs_L = locs(head(L), sent_nums,  s,  r+1)
 310                                 locs_R = [loc_h]
 311                             # see http://tinyurl.com/4ffhhw
 312                             p += sum([e(s, r, L, loc_L, n_t+1) *
 313                                       p_h *
 314                                       e(r+1, t, R, loc_R, n_t+1)
 315                                       for loc_L in locs_L
 316                                       for loc_R in locs_R])
 317                             io.debug( "%sp= %.4f (ATTACH)" % (tab(), p) )
 318                 chart[(s, t, LHS, loc_h)] = p
 319                 return p
 320     # end of e-function
 321
 322     inner_prob = e(s,t,LHS,loc_h, 0)
 323     if io.DEBUG:
 324         print "---CHART:---"
 325         for (s,t,LHS,loc_h),v in chart.iteritems():
 326             print "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.3f" % (DMV_Rule.bar_str(LHS,g.numtag),
 327                                                                    O(s), s, O(s), t, loc_h, v)
 328         print "---CHART:end---"
 329     return [inner_prob, chart]
 330 # end of inner_dmv(s, t, LHS, loc_h, g, sent, chart)
 331
 332 def inner_sent_dmv(sent, g, chart):
 333     '''Possibly there's a more efficient way? Although, non-sentence heads
 334     _will_ be ruled out by inner_dmv though.'''
 335     for loc_h,h_tag in enumerate(sent):
 336         inner_dmv(0, len(sent), ROOT, loc_h, g, chart)
 337
 338 if __name__ == "__main__":                      # Non, Adj
 339     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 340     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 341     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 342     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 343     b2  = {}
 344     b2[(NOBAR, 0), 'h'] = 1.0
 345     b2[(RBAR, 0), 'h'] = h_S.probA
 346     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 347
 348     g_dup = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 349
 350     io.DEBUG = 0
 351     test0 = inner_dmv(0, 1, (LRBAR,0), 0, g_dup, 'h h'.split(), {})
 352     if not  "0.120"=="%.3f" % test0[0]:
 353         print "Should be 0.120: %.3f" % test0[0]
 354
 355     test1 = inner_dmv(0, 1, (LRBAR,0), 1, g_dup, 'h h'.split(), {})
 356     if not  "0.063"=="%.3f" % test1[0]:
 357         print "Should be 0.063: %.3f" % test1[0]
 358
 359     test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 360     if not  "0.0462"=="%.4f" % test3[0]:
 361         print "Should be 0.0462: %.4f" % test3[0]
 362
 363
 364
 365
 366
 367
 368 ##############################
 369 # DMV-probabilities, todo:   #
 370 ##############################
 371
 372
 373 def P_CHOOSE():
 374     return "todo"
 375
def DMV(sent, g):
    '''Sketch (pseudo-code, not runnable yet) of the probability of a
    dependency structure for sent.

    Here it seems like they store rule information on a per-head (per
    direction) basis, in deps_D(h, dir) which gives us a list.

    NOTE(review): deps, D, adj and root are not defined anywhere in the
    visible part of this file, and g is never used.'''
    def P_h(h):
        # NOTE(review): the local P_h below shadows this function, so the
        # recursive call P_h(D(a)) would call a number.
        # NOTE(review): the P_STOP calls pass fewer arguments than the
        # module-level P_STOP takes, and "STOP | h" applies | to the STOP
        # tuple -- presumably notation for P(STOP | h, dir, adj).
        P_h = 1 # ?
        for dir in ['l', 'r']:
            for a in deps(h, dir):
                # D(a)??
                P_h *= \
                    P_STOP (0, h, dir, adj) * \
                    P_CHOOSE (a, h, dir) * \
                    P_h(D(a)) * \
                P_STOP (STOP | h, dir, adj)
        return P_h
    return P_h(root(sent))
 391
 392
 393 def P_STOP(STOP, h, dir, adj, g, corpus):
 394     '''corpus is a list of sentences s.
 395
 396 This is based on the formula where STOP is True... not sure how we
 397 calculate if STOP is False.
 398
 399 I thought about instead having this:
 400
 401 for rule in g.p_rules:
 402     rule.num = 0
 403     rule.den = 0
 404 for sent in corpus:
 405     for rule in g.p_rules:
 406        for s:
 407            for t:
 408                set num and den using inner
 409 for rule in g.p_rules
 410     rule.prob = rule.num / rule.den
 411
 412 ..the way I'm assuming we do it in the commented out io-function in
 413 io.py. Having sentences as the outer loop at least we can easily just
 414 go through the heads that are actually in the sentence... BUT, this
 415 means having to go through p_rules 3 times, not sure what is slower.
 416
 417 Also, now inner_dmv makes sure it only goes through heads that are
 418 actually in the sentence, so that argument falls.
 419
 420 oh, and:
 421 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 422 =D
 423
 424 '''
 425     P_STOP_num = 0
 426     P_STOP_den = 0
 427     h_tag = g.numtag(h)
 428     for sent in corpus:
 429         # have to go through _all_ places where h appears in the
 430         # sentence...how? how to make sure it _works_?
 431         chart = {}
 432         inner_sent_dmv(sent, g, chart) #todo current
 433         if h_tag in sent:
 434             locs_h = locs(h_tag, sent)
 435
 436             io.debug( "locs_h:%s, sent:%s"%(locs_h,sent) )
 437             for loc_h in locs_h:
 438                 for s in range(loc_h): # s<loc(h), range gives strictly less
 439                     for t in range(loc_h, len(sent)):
 440                         P_STOP_num += chart[(s, t, (LRBAR,h), loc_h)]
 441                         P_STOP_den += chart[(s, t, (RBAR,h), loc_h)]
 442
 443     io.debug( "num/den: %s / %s = %s"%(P_STOP_num, P_STOP_den,P_STOP_num / P_STOP_den))
 444     if P_STOP_den > 0.0:
 445         return P_STOP_num / P_STOP_den # upside down in article
 446     else:
 447         return 0.0
 448
 449
 450
 451 def testreestimation():
 452     testcorpus = [s.split() for s in ['det nn vbd c vbd','det nn vbd c nn vbd pp',
 453                                       'det nn vbd',      'det vbd nn c vbd pp',
 454                                       'det nn vbd',      'det vbd c nn vbd pp',
 455                                       'det nn vbd',      'det nn vbd nn c vbd pp',
 456                                       'det nn vbd',      'det nn vbd c det vbd pp',
 457                                       'det nn vbd',      'det nn vbd c vbd det det det pp',
 458                                       'det nn vbd',      'det nn vbd c vbd pp',
 459                                       'det nn vbd',      'det nn vbd c vbd det pp',
 460                                       'det nn vbd',      'det nn vbd c vbd pp',
 461                                       'det nn vbd pp',   'det nn vbd det', ]]
 462     g = harmonic.initialize(testcorpus)
 463
 464     h_tag = 'nn'
 465     h = g.tagnum(h_tag)
 466     print "This will take some time. todo: figure out why it doesn't work"
 467     for r in g.h_rules(h):
 468         if r.L()==STOP:
 469             print r
 470 #             print "off-set the rule, see what happens:"
 471 #             r.probN = 0.7
 472 #             print r
 473             for i in range(3):
 474                 pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus)
 475                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 476
 477                 for r in g.h_rules(h):
 478                     if r.L()==STOP:
 479                         print r
 480                         r.probN = pstophln
 481                         print r
 482     return "todo"
 483
 484 def testreestimation_h():
 485     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 486     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 487     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 488     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 489     b2  = {}
 490     b2[(NOBAR, 0), 'h'] = 1.0
 491     b2[(RBAR, 0), 'h'] = h_S.probA
 492     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 493
 494     g_dup = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 495
 496     #    test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 497     h_tag = 'h'
 498     h = 0
 499     print "todo: figure out why it doesn't work"
 500     for r in g_dup.h_rules(h):
 501         if r.L()==STOP:
 502             print r
 503 #             print "off-set the rule, see what happens:"
 504 #             r.probN = 0.7
 505 #             print r
 506             for i in range(3):
 507                 pstophln = P_STOP(True, h, 'L', 'N', g_dup, ['h h h'.split()])
 508                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 509
 510                 for r in g_dup.h_rules(h):
 511                     if r.L()==STOP:
 512                         print r
 513                         r.probN = pstophln
 514                         print r
 515     return "todo"
 516
 517 if __name__ == "__main__":
 518     io.DEBUG = 0
 519     import timeit
 520     timeit.Timer("dmv.testreestimation_h()",'''import dmv
 521 reload(dmv)''').timeit(1)
 522     pass
 523
 524
 525
 526 # todo: some more testing on the Brown corpus:
 527 #     # first five sentences of the Brown corpus:
 528 #     g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 529 #     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 530
 531 #     io.DEBUG = 0
 532 #     test_brown = inner_dmv(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
 533 #     if io.DEBUG:
 534 #         for r in  g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
 535 #             L = r.L()
 536 #             R = r.R()
 537 #             if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
 538 #                 print r
 539 #     print "Brown-test gives: %.8f" % test_brown[0]
 540
 541
 542
 543     # this will give the tag sequences of all the 6218 Brown corpus
 544     # sentences of length < 7:
 545     # [[tag for (w, tag) in sent]
 546     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 547
 548
 549
 550
 551
 552