src/dmv.py

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38 #
  39 # 2008-06-03
  40 # - fixed a number of little bugs in initialization, where certain
  41 #   rules were simply not created, or created "backwards"
  42 # - inner_dmv() should Work now...
  43 #
  44 # 2008-06-04
  45 # - moved initialization to harmonic.py
  46
  47
  48 # import numpy # numpy provides Fast Arrays, for future optimization
  49 import pprint
  50 import io
  51 import harmonic
  52
  53 # non-tweakable/constant "lookup" globals
  54 BARS = [0,1,2]
  55 RBAR = 1
  56 LRBAR = 2
  57 NOBAR = 0
  58 ROOT = (LRBAR, -1)
  59 STOP = (NOBAR, -2)
  60
  61 if __name__ == "__main__":
  62     print "DMV module tests:"
  63
  64
  65 def node(bars, head):
  66     '''Useless function, but just here as documentation. Nodes make up
  67     LHS, R and L in each DMV_Rule'''
  68     return (bars, head)
  69
  70 def bars(node):
  71     return node[0]
  72
  73 def head(node):
  74     return node[1]
  75
  76
  77 class DMV_Grammar(io.Grammar):
  78     '''The DMV-PCFG.
  79
  80     Public members:
  81     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  82     These are changed in the Maximation step, then used to set the
  83     new probabilities of each DMV_Rule.
  84
  85     Todo: make p_terminals private? (But it has to be changable in
  86     maximation step due to the short-cutting rules... could of course
  87     make a DMV_Grammar function to update the short-cut rules...)
  88
  89     __p_rules is private, but we can still say stuff like:
  90     for r in g.all_rules():
  91         r.probN = newProbN
  92
  93     What other representations do we need? (P_STOP formula uses
  94     deps_D(h,l/r) at least)'''
  95     def __str__(self):
  96         str = ""
  97         for r in self.all_rules():
  98              str += "%s\n" % r.__str__(self.numtag)
  99         return str
 100
 101     def h_rules(self, h):
 102         return [r for r in self.all_rules() if r.head() == h]
 103
 104     def rules(self, LHS):
 105         return [r for r in self.all_rules() if r.LHS() == LHS]
 106
 107     def sent_rules(self, LHS, sent_nums):
 108         "Used in inner_dmv."
 109         # We don't want to rule out STOPs!
 110         sent_nums.append( head(STOP) )
 111         return [r for r in self.all_rules() if r.LHS() == LHS
 112                 and head(r.L()) in sent_nums and head(r.R()) in sent_nums]
 113
 114     def heads(self):
 115         '''Not sure yet what is needed here, or where this is needed'''
 116         return numtag
 117
 118     def deps_L(self, head):
 119         # todo test, probably this list comprehension doesn't work
 120         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 121
 122     def deps_R(self, head):
 123         # todo test, probably this list comprehension doesn't work
 124         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 125
 126     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 127         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 128         self.p_STOP = p_STOP
 129         self.p_CHOOSE = p_CHOOSE
 130         self.p_ROOT = p_ROOT
 131
 132
 133 class DMV_Rule(io.CNF_Rule):
 134     '''A single CNF rule in the PCFG, of the form
 135     LHS -> L R
 136     where LHS, L and R are 'nodes', eg. of the form (bars, head).
 137
 138     Public members:
 139     probN, probA
 140
 141     Private members:
 142     __L, __R, __LHS
 143
 144     Different rule-types have different probabilities associated with
 145     them:
 146
 147     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 148     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 149      h_ ->  h  STOP     P( STOP|h,R,    adj)
 150      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 151      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 152      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 153      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 154      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 155     '''
 156     def p(self, adj, *arg):
 157         if adj:
 158             return self.probA
 159         else:
 160             return self.probN
 161
 162     def p_STOP(self, s, t, loc_h):
 163         '''Returns the correct probability, adjacent if we're rewriting from
 164         the (either left or right) end of the fragment. '''
 165         if self.L() == STOP:
 166             return self.p(s == loc_h)
 167         elif self.R() == STOP:
 168             if not loc_h == s:
 169                 io.debug( "(%s given loc_h:%d but s:%d. Todo: optimize away!)"
 170                           % (self, loc_h, s) )
 171                 return 0.0
 172             else:
 173                 return self.p(t == loc_h)
 174
 175     def p_ATTACH(self, r, loc_h, s=None):
 176         '''Returns the correct probability, adjacent if we haven't attached
 177         anything before.'''
 178         if self.LHS() == self.L():
 179             if not loc_h == s:
 180                 io.debug( "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)"
 181                           % (self, loc_h, s) )
 182                 return 0.0
 183             else:
 184                 return self.p(r == loc_h)
 185         elif self.LHS() == self.R():
 186             return self.p(r+1 == loc_h)
 187
 188     def bars(self):
 189         return bars(self.LHS())
 190
 191     def head(self):
 192         return head(self.LHS())
 193
 194     def __init__(self, LHS, L, R, probN, probA):
 195         for b_h in [LHS, L, R]:
 196             if bars(b_h) not in BARS:
 197                 raise ValueError("bars must be in %s; was given: %s"
 198                                  % (BARS, bars(b_h)))
 199         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 200         self.probA = probA # adjacent
 201         self.probN = probN # non_adj
 202
 203     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 204     def bar_str(cls, b_h, tag=lambda x:x):
 205         if(b_h == ROOT):
 206             return 'ROOT'
 207         elif(b_h == STOP):
 208             return 'STOP'
 209         elif(bars(b_h) == RBAR):
 210             return " %s_ " % tag(head(b_h))
 211         elif(bars(b_h) == LRBAR):
 212             return "_%s_ " % tag(head(b_h))
 213         else:
 214             return " %s  " % tag(head(b_h))
 215
 216
 217     def __str__(self, tag=lambda x:x):
 218         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 219                                                   self.bar_str(self.L(), tag),
 220                                                   self.bar_str(self.R(), tag),
 221                                                   self.probN,
 222                                                   self.probA)
 223
 224
 225
 226
 227
 228
 229
 230 ###################################
 231 # dmv-specific version of inner() #
 232 ###################################
 233 def locs(h, sent, s=0, t=None):
 234     '''Return the locations of h in sent, or some fragment of sent (in the
 235     latter case we make sure to offset the locations correctly so that
 236     for any x in the returned list, sent[x]==h).'''
 237     if t == None:
 238         t = len(sent)
 239     return [i+s for i,w in enumerate(sent[s:t]) if w == h]
 240
 241
 242 def inner_dmv(s, t, LHS, loc_h, g, sent, chart):
 243     ''' A rewrite of inner in io.py, to take adjacency into accord.
 244
 245     The chart is now of this form:
 246     chart[(s,t,LHS, loc_h)]
 247
 248     loc_h gives adjacency (along with r and location of other child
 249     for attachment rules), and is needed in P_STOP reestimation.
 250
 251     Todo: if possible, refactor (move dmv-specific stuff back into
 252     dmv, so this is "general" enough to be in io.py)
 253     '''
 254
 255     def O(s):
 256         return sent[s]
 257
 258     sent_nums = [g.tagnum(tag) for tag in sent]
 259
 260     def e(s,t,LHS, loc_h, n_t):
 261         def tab():
 262             "Tabs for debug output"
 263             return "\t"*n_t
 264
 265         if (s, t, LHS, loc_h) in chart:
 266             io.debug("%s*= %.4f in chart: s:%d t:%d LHS:%s loc:%d"
 267                      %(tab(),chart[(s, t, LHS, loc_h)], s, t,
 268                        DMV_Rule.bar_str(LHS), loc_h))
 269             return chart[(s, t, LHS, loc_h)]
 270         else:
 271             if s == t:
 272                 if not loc_h == s:
 273                     # terminals are always F,F for attachment
 274                     io.debug("%s*= 0.0 (wrong loc_h)" % tab())
 275                     return 0.0
 276                 elif (LHS, O(s)) in g.p_terminals:
 277                     prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
 278                 else:
 279                     # todo: assuming this is how to deal w/lacking
 280                     # rules, since we add prob.s, and 0 is identity
 281                     prob = 0.0
 282                     io.debug( "%sLACKING TERMINAL:" % tab())
 283                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 284                 io.debug( "%s*= %.4f (terminal: %s -> %s_%d)"
 285                           % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h) )
 286                 return prob
 287             else:
 288                 p = 0.0 # "sum over j,k in a[LHS,j,k]"
 289                 for rule in g.sent_rules(LHS, sent_nums):
 290                     io.debug( "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h) )
 291                     L = rule.L()
 292                     R = rule.R()
 293                     # if it's a STOP rule, rewrite for the same range:
 294                     if (L == STOP) or (R == STOP):
 295                         if L == STOP:
 296                             pLR = e(s, t, R, loc_h, n_t+1)
 297                         elif R == STOP:
 298                             pLR = e(s, t, L, loc_h, n_t+1)
 299                         p += rule.p_STOP(s, t, loc_h) * pLR
 300                         io.debug( "%sp= %.4f (STOP)" % (tab(), p) )
 301
 302                     else: # not a STOP, an attachment rewrite:
 303                         for r in range(s, t):
 304                             p_h = rule.p_ATTACH(r, loc_h, s=s)
 305                             if rule.LHS() == L:
 306                                 locs_L = [loc_h]
 307                                 locs_R = locs(head(R), sent_nums, r+1, t+1)
 308                             elif rule.LHS() == R:
 309                                 locs_L = locs(head(L), sent_nums,  s,  r+1)
 310                                 locs_R = [loc_h]
 311                             # see http://tinyurl.com/4ffhhw
 312                             p += sum([e(s, r, L, loc_L, n_t+1) *
 313                                       p_h *
 314                                       e(r+1, t, R, loc_R, n_t+1)
 315                                       for loc_L in locs_L
 316                                       for loc_R in locs_R])
 317                             io.debug( "%sp= %.4f (ATTACH)" % (tab(), p) )
 318                 chart[(s, t, LHS, loc_h)] = p
 319                 return p
 320     # end of e-function
 321
 322     inner_prob = e(s,t,LHS,loc_h, 0)
 323     if 1 in io.DEBUG:
 324         print "---CHART:---"
 325         for (s,t,LHS,loc_h),v in chart.iteritems():
 326             print "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f" % (DMV_Rule.bar_str(LHS,g.numtag),
 327                                                                    O(s), s, O(s), t, loc_h, v)
 328         print "---CHART:end---"
 329     return inner_prob
 330 # end of inner_dmv(s, t, LHS, loc_h, g, sent, chart)
 331
 332 def inner_sent_dmv(sent, g, chart):
 333     '''Possibly there's a more efficient way? Although, non-sentence heads
 334     _will_ be ruled out by inner_dmv though.'''
 335     p = 0.0
 336     for loc_h,h_tag in enumerate(sent):
 337         p += inner_dmv(0, len(sent)-1, ROOT, loc_h, g, sent, chart)
 338     return p
 339
 340 if __name__ == "__main__":                      # Non, Adj
 341     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 342     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 343     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 344     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 345     b2  = {}
 346     b2[(NOBAR, 0), 'h'] = 1.0
 347     b2[(RBAR, 0), 'h'] = h_S.probA
 348     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 349
 350     g_dup = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 351
 352     io.DEBUG = []
 353     test0 = inner_dmv(0, 1, (LRBAR,0), 0, g_dup, 'h h'.split(), {})
 354     if not  "0.120"=="%.3f" % test0:
 355         print "Should be 0.120: %.3f" % test0
 356
 357     test1 = inner_dmv(0, 1, (LRBAR,0), 1, g_dup, 'h h'.split(), {})
 358     if not  "0.063"=="%.3f" % test1:
 359         print "Should be 0.063: %.3f" % test1
 360     io.DEBUG = [1]
 361     test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 362     if not  "0.0498"=="%.4f" % test3:
 363         print "Should be 0.0498: %.4f" % test3
 364
 365
 366
 367
 368
 369
 370 ##############################
 371 # DMV-probabilities, todo:   #
 372 ##############################
 373
 374
 375 def P_CHOOSE():
 376     return "todo"
 377
 378 def DMV(sent, g):
 379     '''Here it seems like they store rule information on a per-head (per
 380     direction) basis, in deps_D(h, dir) which gives us a list. '''
 381     def P_h(h):
 382         P_h = 1 # ?
 383         for dir in ['l', 'r']:
 384             for a in deps(h, dir):
 385                 # D(a)??
 386                 P_h *= \
 387                     P_STOP (0, h, dir, adj) * \
 388                     P_CHOOSE (a, h, dir) * \
 389                     P_h(D(a)) * \
 390                 P_STOP (STOP | h, dir, adj)
 391         return P_h
 392     return P_h(root(sent))
 393
 394
 395 def P_STOP(STOP, h, dir, adj, g, corpus):
 396     '''corpus is a list of sentences s.
 397
 398 This is based on the formula where STOP is True... not sure how we
 399 calculate if STOP is False.
 400
 401 I thought about instead having this:
 402
 403 for rule in g.p_rules:
 404     rule.num = 0
 405     rule.den = 0
 406 for sent in corpus:
 407     for rule in g.p_rules:
 408        for s:
 409            for t:
 410                set num and den using inner
 411 for rule in g.p_rules
 412     rule.prob = rule.num / rule.den
 413
 414 ..the way I'm assuming we do it in the commented out io-function in
 415 io.py. Having sentences as the outer loop at least we can easily just
 416 go through the heads that are actually in the sentence... BUT, this
 417 means having to go through p_rules 3 times, not sure what is slower.
 418
 419 Also, now inner_dmv makes sure it only goes through heads that are
 420 actually in the sentence, so that argument falls.
 421
 422 oh, and:
 423 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 424 =D
 425
 426 '''
 427     P_STOP_num = 0
 428     P_STOP_den = 0
 429     h_tag = g.numtag(h)
 430     for sent in corpus:
 431         # have to go through _all_ places where h appears in the
 432         # sentence...how? how to make sure it _works_?
 433         chart = {}
 434         inner_sent_dmv(sent, g, chart) #todo, current. Later on,
 435                                        #generalize to all heads, since
 436                                        #we now have the chart of all
 437                                        #of them anyway
 438         locs_h = locs(h_tag, sent)
 439         io.debug( "locs_h:%s, sent:%s"%(locs_h,sent) , 2)
 440         for loc_h in locs_h:
 441             for s in range(loc_h): # s<loc(h), range gives strictly less
 442                 for t in range(loc_h, len(sent)):
 443                     io.debug( "s:%s t:%s loc:%d"%(s,t,loc_h)  , 2)
 444                     if (s, t, (LRBAR,h), loc_h) in chart:
 445                         io.debug( "num+=%s"%chart[(s, t, (LRBAR,h), loc_h)]  , 2)
 446                         P_STOP_num += chart[(s, t, (LRBAR,h), loc_h)]
 447                     if (s, t, (RBAR,h), loc_h) in chart:
 448                         io.debug( "den+=%s"%chart[(s, t, (LRBAR,h), loc_h)]  , 2)
 449                         P_STOP_den += chart[(s, t, (RBAR,h), loc_h)]
 450         # todo: use sum([chart[(s, t...)] etc? but can we then
 451         # keep den and num separate?
 452
 453     io.debug( "num/den: %s / %s"%(P_STOP_num, P_STOP_den) , 2)
 454     if P_STOP_den > 0.0:
 455         io.debug( "num/den: %s / %s = %s"%(P_STOP_num, P_STOP_den,P_STOP_num / P_STOP_den) , 2)
 456         return P_STOP_num / P_STOP_den # upside down in article
 457     else:
 458         return 0.0
 459
 460
 461
 462 def testreestimation():
 463     testcorpus = [s.split() for s in ['det nn vbd c vbd','det nn vbd c nn vbd pp',
 464                                       'det nn vbd',      'det vbd nn c vbd pp',
 465                                       'det nn vbd',      'det vbd c nn vbd pp',
 466                                       'det nn vbd',      'det nn vbd nn c vbd pp',
 467                                       'det nn vbd',      'det nn vbd c det vbd pp',
 468                                       'det nn vbd',      'det nn vbd c vbd det det det pp',
 469                                       'det nn vbd',      'det nn vbd c vbd pp',
 470                                       'det nn vbd',      'det nn vbd c vbd det pp',
 471                                       'det nn vbd',      'det nn vbd c vbd pp',
 472                                       'det nn vbd pp',   'det nn vbd det', ]]
 473     g = harmonic.initialize(testcorpus)
 474
 475     h_tag = 'nn'
 476     h = g.tagnum(h_tag)
 477     print "This will take some time. todo: figure out why it doesn't work"
 478     for r in g.h_rules(h):
 479         if r.L()==STOP:
 480             print r
 481 #             print "off-set the rule, see what happens:"
 482 #             r.probN = 0.7
 483 #             print r
 484             for i in range(3):
 485                 pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus)
 486                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 487
 488                 for r in g.h_rules(h):
 489                     if r.L()==STOP:
 490                         print r
 491                         r.probN = pstophln
 492                         print r
 493     return "todo"
 494
 495
 496
 497 def testreestimation_h():
 498     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 499     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 500     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 501     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 502     rh  = DMV_Rule(   ROOT,   STOP,    (LRBAR,0), 1.0, 1.0) # ROOT
 503     b2  = {}
 504     b2[(NOBAR, 0), 'h'] = 1.0
 505     b2[(RBAR, 0), 'h'] = h_S.probA
 506     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 507
 508     g_dup = DMV_Grammar([ rh, _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 509
 510     #    test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 511     h_tag = 'h'
 512     h = 0
 513     print "todo: figure out why it doesn't work"
 514     for r in g_dup.h_rules(h):
 515         if r.L()==STOP:
 516             print r
 517 #             print "off-set the rule, see what happens:"
 518 #             r.probN = 0.7
 519 #             print r
 520             for i in range(3):
 521                 pstophln = P_STOP(True, h, 'L', 'N', g_dup, ['h h h'.split()])
 522                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 523
 524                 for r in g_dup.h_rules(h):
 525                     if r.L()==STOP:
 526                         print r
 527                         r.probN = pstophln
 528                         print r
 529     return "todo"
 530
 531 if __name__ == "__main__":
 532     io.DEBUG = []
 533 #     import timeit
 534 #     timeit.Timer("dmv.testreestimation_h()",'''import dmv
 535 # reload(dmv)''').timeit(1)
 536     pass
 537
 538
 539
 540 # todo: some more testing on the Brown corpus:
 541 #     # first five sentences of the Brown corpus:
 542 #     g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 543 #     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 544
 545 #     io.DEBUG = []
 546 #     test_brown = inner_dmv(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
 547 #     if 1 in io.DEBUG:
 548 #         for r in  g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
 549 #             L = r.L()
 550 #             R = r.R()
 551 #             if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
 552 #                 print r
 553 #     print "Brown-test gives: %.8f" % test_brown
 554
 555
 556
 557     # this will give the tag sequences of all the 6218 Brown corpus
 558     # sentences of length < 7:
 559     # [[tag for (w, tag) in sent]
 560     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 561
 562
 563
 564
 565
 566