src/dmv.py

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38 #
  39 # 2008-06-03
  40 # - fixed a number of little bugs in initialization, where certain
  41 #   rules were simply not created, or created "backwards"
  42 # - inner_dmv() should Work now...
  43 #
  44 # 2008-06-04
  45 # - moved initialization to harmonic.py
  46
  47
  48 # import numpy # numpy provides Fast Arrays, for future optimization
  49 import pprint
  50 import io
  51 import harmonic
  52
  53 # non-tweakable/constant "lookup" globals
  54 BARS = [0,1,2]
  55 RBAR = 1
  56 LRBAR = 2
  57 NOBAR = 0
  58 ROOT = (LRBAR, -1)
  59 STOP = (NOBAR, -2)
  60
  61 if __name__ == "__main__":
  62     print "DMV module tests:"
  63
  64
  65 def node(bars, head):
  66     '''Useless function, but just here as documentation. Nodes make up
  67     LHS, R and L in each DMV_Rule'''
  68     return (bars, head)
  69
  70 def bars(node):
  71     return node[0]
  72
  73 def head(node):
  74     return node[1]
  75
  76
  77 class DMV_Grammar(io.Grammar):
  78     '''The DMV-PCFG.
  79
  80     Public members:
  81     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  82     These are changed in the Maximation step, then used to set the
  83     new probabilities of each DMV_Rule.
  84
  85     Todo: make p_terminals private? (But it has to be changable in
  86     maximation step due to the short-cutting rules... could of course
  87     make a DMV_Grammar function to update the short-cut rules...)
  88
  89     __p_rules is private, but we can still say stuff like:
  90     for r in g.all_rules():
  91         r.probN = newProbN
  92
  93     What other representations do we need? (P_STOP formula uses
  94     deps_D(h,l/r) at least)'''
  95     def __str__(self):
  96         str = ""
  97         for r in self.all_rules():
  98              str += "%s\n" % r.__str__(self.numtag)
  99         return str
 100
 101     def h_rules(self, h):
 102         return [r for r in self.all_rules() if r.head() == h]
 103
 104     def rules(self, LHS):
 105         return [r for r in self.all_rules() if r.LHS() == LHS]
 106
 107     def sent_rules(self, LHS, sent_nums):
 108         "Used in inner_dmv."
 109         # We don't want to rule out STOPs!
 110         sent_nums.append( head(STOP) )
 111         return [r for r in self.all_rules() if r.LHS() == LHS
 112                 and head(r.L()) in sent_nums and head(r.R()) in sent_nums]
 113
 114     def heads(self):
 115         '''Not sure yet what is needed here, or where this is needed'''
 116         return numtag
 117
 118     def deps_L(self, head):
 119         # todo test, probably this list comprehension doesn't work
 120         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 121
 122     def deps_R(self, head):
 123         # todo test, probably this list comprehension doesn't work
 124         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 125
 126     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 127         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 128         self.p_STOP = p_STOP
 129         self.p_CHOOSE = p_CHOOSE
 130         self.p_ROOT = p_ROOT
 131
 132
 133 class DMV_Rule(io.CNF_Rule):
 134     '''A single CNF rule in the PCFG, of the form
 135     LHS -> L R
 136     where LHS, L and R are 'nodes', eg. of the form (bars, head).
 137
 138     Public members:
 139     probN, probA
 140
 141     Private members:
 142     __L, __R, __LHS
 143
 144     Different rule-types have different probabilities associated with
 145     them:
 146
 147     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 148     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 149      h_ ->  h  STOP     P( STOP|h,R,    adj)
 150      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 151      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 152      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 153      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 154      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 155     '''
 156     def p(self, adj, *arg):
 157         if adj:
 158             return self.probA
 159         else:
 160             return self.probN
 161
 162     def p_STOP(self, s, t, loc_h):
 163         '''Returns the correct probability, adjacent if we're rewriting from
 164         the (either left or right) end of the fragment. '''
 165         if self.L() == STOP:
 166             return self.p(s == loc_h)
 167         elif self.R() == STOP:
 168             if not loc_h == s:
 169                 io.debug( "(%s given loc_h:%d but s:%d. Todo: optimize away!)"
 170                           % (self, loc_h, s) )
 171                 return 0.0
 172             else:
 173                 return self.p(t == loc_h)
 174
 175     def p_ATTACH(self, r, loc_h, s=None):
 176         '''Returns the correct probability, adjacent if we haven't attached
 177         anything before.'''
 178         if self.LHS() == self.L():
 179             if not loc_h == s:
 180                 io.debug( "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)"
 181                           % (self, loc_h, s) )
 182                 return 0.0
 183             else:
 184                 return self.p(r == loc_h)
 185         elif self.LHS() == self.R():
 186             return self.p(r+1 == loc_h)
 187
 188     def bars(self):
 189         return bars(self.LHS())
 190
 191     def head(self):
 192         return head(self.LHS())
 193
 194     def __init__(self, LHS, L, R, probN, probA):
 195         for b_h in [LHS, L, R]:
 196             if bars(b_h) not in BARS:
 197                 raise ValueError("bars must be in %s; was given: %s"
 198                                  % (BARS, bars(b_h)))
 199         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 200         self.probA = probA # adjacent
 201         self.probN = probN # non_adj
 202
 203     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 204     def bar_str(cls, b_h, tag=lambda x:x):
 205         if(b_h == ROOT):
 206             return 'ROOT'
 207         elif(b_h == STOP):
 208             return 'STOP'
 209         elif(bars(b_h) == RBAR):
 210             return " %s_ " % tag(head(b_h))
 211         elif(bars(b_h) == LRBAR):
 212             return "_%s_ " % tag(head(b_h))
 213         else:
 214             return " %s  " % tag(head(b_h))
 215
 216
 217     def __str__(self, tag=lambda x:x):
 218         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 219                                                   self.bar_str(self.L(), tag),
 220                                                   self.bar_str(self.R(), tag),
 221                                                   self.probN,
 222                                                   self.probA)
 223
 224
 225
 226
 227
 228
 229
 230 ###################################
 231 # dmv-specific version of inner() #
 232 ###################################
 233 def locs(h, sent, s=0, t=None, remove=None):
 234     '''Return the locations of h in sent, or some fragment of sent (in the
 235     latter case we make sure to offset the locations correctly so that
 236     for any x in the returned list, sent[x]==h).'''
 237     if t == None:
 238         t = len(sent)
 239     return [i+s for i,w in enumerate(sent[s:t])
 240             if w == h and not (i+s) == remove]
 241
 242
 243 def inner_dmv(s, t, LHS, loc_h, g, sent, chart):
 244     ''' A rewrite of inner in io.py, to take adjacency into accord.
 245
 246     The chart is now of this form:
 247     chart[(s,t,LHS, loc_h)]
 248
 249     loc_h gives adjacency (along with r and location of other child
 250     for attachment rules), and is needed in P_STOP reestimation.
 251
 252     Todo: if possible, refactor (move dmv-specific stuff back into
 253     dmv, so this is "general" enough to be in io.py)
 254     '''
 255
 256     def O(s):
 257         return sent[s]
 258
 259     sent_nums = [g.tagnum(tag) for tag in sent]
 260
 261     def e(s,t,LHS, loc_h, n_t):
 262         def tab():
 263             "Tabs for debug output"
 264             return "\t"*n_t
 265
 266         if (s, t, LHS, loc_h) in chart:
 267             io.debug("%s*= %.4f in chart: s:%d t:%d LHS:%s loc:%d"
 268                      %(tab(),chart[(s, t, LHS, loc_h)], s, t,
 269                        DMV_Rule.bar_str(LHS), loc_h))
 270             return chart[(s, t, LHS, loc_h)]
 271         else:
 272             if s == t:
 273                 if not loc_h == s:
 274                     # terminals are always F,F for attachment
 275                     io.debug("%s*= 0.0 (wrong loc_h)" % tab())
 276                     return 0.0
 277                 elif (LHS, O(s)) in g.p_terminals:
 278                     prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
 279                 else:
 280                     # todo: assuming this is how to deal w/lacking
 281                     # rules, since we add prob.s, and 0 is identity
 282                     prob = 0.0
 283                     io.debug( "%sLACKING TERMINAL:" % tab())
 284                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 285                 io.debug( "%s*= %.4f (terminal: %s -> %s_%d)"
 286                           % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h) )
 287                 return prob
 288             else:
 289                 p = 0.0 # "sum over j,k in a[LHS,j,k]"
 290                 for rule in g.sent_rules(LHS, sent_nums):
 291                     io.debug( "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h) )
 292                     L = rule.L()
 293                     R = rule.R()
 294                     # if it's a STOP rule, rewrite for the same range:
 295                     if (L == STOP) or (R == STOP):
 296                         if L == STOP:
 297                             pLR = e(s, t, R, loc_h, n_t+1)
 298                         elif R == STOP:
 299                             pLR = e(s, t, L, loc_h, n_t+1)
 300                         p += rule.p_STOP(s, t, loc_h) * pLR
 301                         io.debug( "%sp= %.4f (STOP)" % (tab(), p) )
 302
 303                     else: # not a STOP, an attachment rewrite:
 304                         for r in range(s, t):
 305                             # if loc_h == t, no need to try right-attachments,
 306                             # if loc_h == s, no need to try left-attachments... todo
 307                             p_h = rule.p_ATTACH(r, loc_h, s=s)
 308                             if rule.LHS() == L:
 309                                 locs_L = [loc_h]
 310                                 locs_R = locs(head(R), sent_nums, r+1, t+1, loc_h)
 311                             elif rule.LHS() == R:
 312                                 locs_L = locs(head(L), sent_nums,  s,  r+1, loc_h)
 313                                 locs_R = [loc_h]
 314                             # see http://tinyurl.com/4ffhhw
 315                             p += sum([e(s, r, L, loc_L, n_t+1) *
 316                                       p_h *
 317                                       e(r+1, t, R, loc_R, n_t+1)
 318                                       for loc_L in locs_L
 319                                       for loc_R in locs_R])
 320                             io.debug( "%sp= %.4f (ATTACH)" % (tab(), p) )
 321                 chart[(s, t, LHS, loc_h)] = p
 322                 return p
 323     # end of e-function
 324
 325     inner_prob = e(s,t,LHS,loc_h, 0)
 326     if 1 in io.DEBUG:
 327         print "---CHART:---"
 328         for (s,t,LHS,loc_h),v in chart.iteritems():
 329             print "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f" % (DMV_Rule.bar_str(LHS,g.numtag),
 330                                                                    O(s), s, O(s), t, loc_h, v)
 331         print "---CHART:end---"
 332     return inner_prob
 333 # end of inner_dmv(s, t, LHS, loc_h, g, sent, chart)
 334
 335 def inner_sent_dmv(sent, g, chart):
 336     '''Possibly there's a more efficient way? Although, non-sentence heads
 337     _will_ be ruled out by inner_dmv though.'''
 338     p = 0.0
 339     for loc_h,h_tag in enumerate(sent):
 340         p += inner_dmv(0, len(sent)-1, ROOT, loc_h, g, sent, chart)
 341     return p
 342
 343 if __name__ == "__main__":                      # Non, Adj
 344     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 345     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 346     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 347     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 348     b2  = {}
 349     b2[(NOBAR, 0), 'h'] = 1.0
 350     b2[(RBAR, 0), 'h'] = h_S.probA
 351     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 352
 353     g_dup = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 354
 355     io.DEBUG = []
 356     test0 = inner_dmv(0, 1, (LRBAR,0), 0, g_dup, 'h h'.split(), {})
 357     if not  "0.120"=="%.3f" % test0:
 358         print "Should be 0.120: %.3f" % test0
 359
 360     test1 = inner_dmv(0, 1, (LRBAR,0), 1, g_dup, 'h h'.split(), {})
 361     if not  "0.063"=="%.3f" % test1:
 362         print "Should be 0.063: %.3f" % test1
 363     io.DEBUG = [1]
 364     test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 365     if not  "0.0498"=="%.4f" % test3:
 366         print "Should be 0.0498: %.4f" % test3
 367
 368
 369
 370
 371
 372
 373 ##############################
 374 # DMV-probabilities, todo:   #
 375 ##############################
 376
 377
 378 def P_CHOOSE():
 379     return "todo"
 380
def DMV(sent, g):
    '''Here it seems like they store rule information on a per-head (per
    direction) basis, in deps_D(h, dir) which gives us a list.

    NOTE(review): this is an unfinished sketch, not working code. The
    names deps, D, adj and root are not defined in the visible part of
    this file, P_STOP and P_CHOOSE are called with argument lists that
    do not match their definitions in this file, and g is never used.
    Calling DMV as it stands fails with a NameError on root.'''
    def P_h(h):
        # NOTE(review): this local shadows the function P_h itself, so the
        # recursive call P_h(D(a)) below would be calling a number.
        P_h = 1 # ?
        for dir in ['l', 'r']:
            for a in deps(h, dir):
                # D(a)??
                # NOTE(review): in the last factor, `STOP | h` is presumably
                # meant as the notation "STOP given h", not as a bitwise-or.
                P_h *= \
                    P_STOP (0, h, dir, adj) * \
                    P_CHOOSE (a, h, dir) * \
                    P_h(D(a)) * \
                P_STOP (STOP | h, dir, adj)
        return P_h
    return P_h(root(sent))
 396
 397
 398 def P_STOP(STOP, h, dir, adj, g, corpus):
 399     '''corpus is a list of sentences s.
 400
 401 This is based on the formula where STOP is True... not sure how we
 402 calculate if STOP is False.
 403
 404 I thought about instead having this:
 405
 406 for rule in g.p_rules:
 407     rule.num = 0
 408     rule.den = 0
 409 for sent in corpus:
 410     for rule in g.p_rules:
 411        for s:
 412            for t:
 413                set num and den using inner
 414 for rule in g.p_rules
 415     rule.prob = rule.num / rule.den
 416
 417 ..the way I'm assuming we do it in the commented out io-function in
 418 io.py. Having sentences as the outer loop at least we can easily just
 419 go through the heads that are actually in the sentence... BUT, this
 420 means having to go through p_rules 3 times, not sure what is slower.
 421
 422 Also, now inner_dmv makes sure it only goes through heads that are
 423 actually in the sentence, so that argument falls.
 424
 425 oh, and:
 426 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 427 =D
 428
 429 '''
 430     P_STOP_num = 0
 431     P_STOP_den = 0
 432     h_tag = g.numtag(h)
 433     for sent in corpus:
 434         # have to go through _all_ places where h appears in the
 435         # sentence...how? how to make sure it _works_?
 436         chart = {}
 437         locs_h = locs(h_tag, sent)
 438         io.debug( "locs_h:%s, sent:%s"%(locs_h,sent) , 2)
 439         for loc_h in locs_h:
 440             inner_dmv(0, len(sent)-1, ROOT, loc_h, g, sent, chart)
 441             for s in range(loc_h): # s<loc(h), range gives strictly less
 442                 for t in range(loc_h, len(sent)):
 443                     io.debug( "s:%s t:%s loc:%d"%(s,t,loc_h)  , 2)
 444                     if (s, t, (LRBAR,h), loc_h) in chart:
 445                         io.debug( "num+=%s"%chart[(s, t, (LRBAR,h), loc_h)]  , 2)
 446                         P_STOP_num += chart[(s, t, (LRBAR,h), loc_h)]
 447                     if (s, t, (RBAR,h), loc_h) in chart:
 448                         io.debug( "den+=%s"%chart[(s, t, (RBAR,h), loc_h)]  , 2)
 449                         P_STOP_den += chart[(s, t, (RBAR,h), loc_h)]
 450         # todo: use sum([chart[(s, t...)] etc? but can we then
 451         # keep den and num separate?
 452
 453     io.debug( "num/den: %s / %s"%(P_STOP_num, P_STOP_den) , 2)
 454     if P_STOP_den > 0.0:
 455         io.debug( "num/den: %s / %s = %s"%(P_STOP_num, P_STOP_den,P_STOP_num / P_STOP_den) , 2)
 456         return P_STOP_num / P_STOP_den # upside down in article
 457     else:
 458         return 0.0
 459
 460
 461
 462 def testreestimation():
 463     testcorpus = [s.split() for s in ['det vbd nn c vbd','det nn vbd c nn vbd pp',
 464                                       'det vbd nn',      'det vbd nn c vbd pp',
 465                                       'det vbd nn',      'det vbd c nn vbd pp',
 466                                       'det vbd nn',      'det nn vbd nn c vbd pp',
 467                                       'det vbd nn',      'det nn vbd c det vbd pp',
 468                                       'det vbd nn',      'det nn vbd c vbd det det det pp',
 469                                       'det nn vbd',      'det nn vbd c vbd pp',
 470                                       'det nn vbd',      'det nn vbd c vbd det pp',
 471                                       'det nn vbd',      'det nn vbd c vbd pp',
 472                                       'det nn vbd pp',   'det nn vbd det', ]]
 473     g = harmonic.initialize(testcorpus)
 474
 475     h_tag = 'nn'
 476     h = g.tagnum(h_tag)
 477     print '''This will take some time. todo: figure out why it doesn't do
 478 anything if nn is always second word.'''
 479     for r in g.h_rules(h):
 480         if r.L()==STOP:
 481             print r
 482 #             print "off-set the rule, see what happens:"
 483 #             r.probN = 0.7
 484 #             print r
 485             for i in range(3):
 486                 pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus)
 487                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 488
 489                 for r in g.h_rules(h):
 490                     if r.L()==STOP:
 491                         print r
 492                         r.probN = pstophln
 493                         print r
 494     return "todo"
 495
 496
 497
 498 def testreestimation_h():
 499     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 500     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 501     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 502     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 503     rh  = DMV_Rule(   ROOT,   STOP,    (LRBAR,0), 1.0, 1.0) # ROOT
 504     b2  = {}
 505     b2[(NOBAR, 0), 'h'] = 1.0
 506     b2[(RBAR, 0), 'h'] = h_S.probA
 507     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 508
 509     g_dup = DMV_Grammar([ rh, _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 510
 511     #    test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
 512     h_tag = 'h'
 513     h = 0
 514     print "todo: figure out why it doesn't work"
 515     for r in g_dup.h_rules(h):
 516         if r.L()==STOP:
 517             print r
 518 #             print "off-set the rule, see what happens:"
 519 #             r.probN = 0.7
 520 #             print r
 521             for i in range(3):
 522                 pstophln = P_STOP(True, h, 'L', 'N', g_dup, ['h h h'.split()])
 523                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 524
 525                 for r in g_dup.h_rules(h):
 526                     if r.L()==STOP:
 527                         print r
 528                         r.probN = pstophln
 529                         print r
 530     return "todo"
 531
 532 if __name__ == "__main__":
 533     io.DEBUG = []
 534     import timeit
 535     timeit.Timer("dmv.testreestimation()",'''import dmv
 536 reload(dmv)''').timeit(1)
 537     pass
 538
 539
 540
 541 # todo: some more testing on the Brown corpus:
#     # first ten sentences of the Brown corpus:
 543 #     g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 544 #     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 545
 546 #     io.DEBUG = []
 547 #     test_brown = inner_dmv(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
 548 #     if 1 in io.DEBUG:
 549 #         for r in  g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
 550 #             L = r.L()
 551 #             R = r.R()
 552 #             if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
 553 #                 print r
 554 #     print "Brown-test gives: %.8f" % test_brown
 555
 556
 557
 558     # this will give the tag sequences of all the 6218 Brown corpus
 559     # sentences of length < 7:
 560     # [[tag for (w, tag) in sent]
 561     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 562
 563
 564
 565
 566
 567