src/dmv.py

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38 #
  39 # 2008-06-03
  40 # - fixed a number of little bugs in initialization, where certain
  41 #   rules were simply not created, or created "backwards"
  42 # - inner_dmv() should Work now...
  43 #
  44 # 2008-06-04
  45 # - moved initialization to harmonic.py
  46
  47
  48 # import numpy # numpy provides Fast Arrays, for future optimization
  49 import pprint
  50 import io
  51 import harmonic
  52
  53 # non-tweakable/constant "lookup" globals
  54 BARS = [0,1,2]
  55 RBAR = 1
  56 LRBAR = 2
  57 NOBAR = 0
  58 ROOT = (LRBAR, -1)
  59 STOP = (NOBAR, -2)
  60
  61 if __name__ == "__main__":
  62     print "DMV module tests:"
  63
  64
  65 def node(bars, head):
  66     '''Useless function, but just here as documentation. Nodes make up
  67     LHS, R and L in each DMV_Rule'''
  68     return (bars, head)
  69
  70 def bars(node):
  71     return node[0]
  72
  73 def head(node):
  74     return node[1]
  75
  76
  77 class DMV_Grammar(io.Grammar):
  78     '''The DMV-PCFG.
  79
  80     Public members:
  81     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  82     These are changed in the Maximation step, then used to set the
  83     new probabilities of each DMV_Rule.
  84
  85     Todo: make p_terminals private? (But it has to be changable in
  86     maximation step due to the short-cutting rules... could of course
  87     make a DMV_Grammar function to update the short-cut rules...)
  88
  89     __p_rules is private, but we can still say stuff like:
  90     for r in g.all_rules():
  91         r.probN = newProbN
  92
  93     What other representations do we need? (P_STOP formula uses
  94     deps_D(h,l/r) at least)'''
  95     def __str__(self):
  96         str = ""
  97         for r in self.all_rules():
  98              str += "%s\n" % r.__str__(self.numtag)
  99         return str
 100
 101     def h_rules(self, h):
 102         return [r for r in self.all_rules() if r.head() == h]
 103
 104     def rules(self, LHS):
 105         return [r for r in self.all_rules() if r.LHS() == LHS]
 106
 107     def sent_rules(self, LHS, sent_nums):
 108         "Used in inner_dmv."
 109         # We don't want to rule out STOPs!
 110         sent_nums.append( head(STOP) )
 111         return [r for r in self.all_rules() if r.LHS() == LHS
 112                 and head(r.L()) in sent_nums and head(r.R()) in sent_nums]
 113
 114     def heads(self):
 115         '''Not sure yet what is needed here, or where this is needed'''
 116         return numtag
 117
 118     def deps_L(self, head):
 119         # todo test, probably this list comprehension doesn't work
 120         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 121
 122     def deps_R(self, head):
 123         # todo test, probably this list comprehension doesn't work
 124         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 125
 126     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 127         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 128         self.p_STOP = p_STOP
 129         self.p_CHOOSE = p_CHOOSE
 130         self.p_ROOT = p_ROOT
 131
 132
 133 class DMV_Rule(io.CNF_Rule):
 134     '''A single CNF rule in the PCFG, of the form
 135     LHS -> L R
 136     where LHS, L and R are 'nodes', eg. of the form (bars, head).
 137
 138     Public members:
 139     probN, probA
 140
 141     Private members:
 142     __L, __R, __LHS
 143
 144     Different rule-types have different probabilities associated with
 145     them:
 146
 147     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 148     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 149      h_ ->  h  STOP     P( STOP|h,R,    adj)
 150      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 151      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 152      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 153      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 154      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 155     '''
 156     def p_old(self, LRattach, RLattach, *arg):
 157         '''Returns the correct probability, adjacent or non-adjacent,
 158         depending on whether or not there is a some lower attachment
 159         either on the right side of the left child, or the left side
 160         of the right child. '''
 161         if (not LRattach) and (not RLattach):
 162             return self.probA
 163         else:
 164             return self.probN
 165
 166     def p(self, s, r, t, loc_h, *arg):
 167         '''Returns the correct probability, adjacent or non-adjacent,
 168         depending on whether or not there is a some lower attachment
 169         either on the right side of the left child, or the left side
 170         of the right child. Uses s, r or t to infer this.'''
 171         if self.L() == STOP:
 172             adj =  s  == loc_h
 173         elif self.R() == STOP:
 174             adj =  t  == loc_h
 175             if not loc_h == s:
 176                 io.debug( "(%s given loc_h:%d, s:%d, todo: optimize away!)"
 177                           % (self,loc_h,s) )
 178                 return 0.0
 179         elif self.LHS() == self.L(): # right attachment
 180             adj =  r  == loc_h
 181             if not loc_h == s:
 182                 io.debug( "(%s given loc_h:%d, s:%d, todo: optimize away!)"
 183                           % (self,loc_h,s) )
 184                 return 0.0
 185         elif self.LHS() == self.R(): # left attachment
 186             adj = r+1 == loc_h
 187         if adj:
 188             return self.probA
 189         else:
 190             return self.probN
 191
 192     def bars(self):
 193         return bars(self.LHS())
 194
 195     def head(self):
 196         return head(self.LHS())
 197
 198     def __init__(self, LHS, L, R, probN, probA):
 199         for b_h in [LHS, L, R]:
 200             if bars(b_h) not in BARS:
 201                 raise ValueError("bars must be in %s; was given: %s"
 202                                  % (BARS, bars(b_h)))
 203         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 204         self.probA = probA # adjacent
 205         self.probN = probN # non_adj
 206
 207     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 208     def bar_str(cls, b_h, tag=lambda x:x):
 209         if(b_h == ROOT):
 210             return 'ROOT'
 211         elif(b_h == STOP):
 212             return 'STOP'
 213         elif(bars(b_h) == RBAR):
 214             return " %s_ " % tag(head(b_h))
 215         elif(bars(b_h) == LRBAR):
 216             return "_%s_ " % tag(head(b_h))
 217         else:
 218             return " %s  " % tag(head(b_h))
 219
 220
 221     def __str__(self, tag=lambda x:x):
 222         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 223                                                   self.bar_str(self.L(), tag),
 224                                                   self.bar_str(self.R(), tag),
 225                                                   self.probN,
 226                                                   self.probA)
 227
 228
 229
 230
 231
 232
 233
 234 ###################################
 235 # dmv-specific version of inner() #
 236 ###################################
 237 def rewrite_adj(bars, Lattach, Rattach):
 238     '''Returns a list of possible adjacencies for the left and right
 239     children of an attachment rule. Each possible adjacency is a list
 240     of booleans of the form (LL, LR, RL, RR).
 241
 242     Todo: make prettier? Although since we call this so many times,
 243     having it spelled out here is probably faster'''
 244     if bars == NOBAR and not Lattach and Rattach:
 245         return ( (Lattach, False, False, False),
 246                  (Lattach, False, False,  True),
 247                  (Lattach, False, True,  False),
 248                  (Lattach, False, True,   True),
 249                  (Lattach,  True, False, False),
 250                  (Lattach,  True, False,  True),
 251                  (Lattach,  True, True,  False),
 252                  (Lattach,  True, True,   True), )
 253     elif bars == RBAR and Lattach:
 254         # Rattach may be either true or false here!
 255         return ( (False, False, False, Rattach),
 256                  (False, False, True,  Rattach),
 257                  (False, True,  False, Rattach),
 258                  (False, True,  True,  Rattach),
 259                  (True,  False, False, Rattach),
 260                  (True,  False, True,  Rattach),
 261                  (True,  True,  False, Rattach),
 262                  (True,  True,  True,  Rattach) )
 263     else:
 264         # NOBAR rewrite rules cannot have Lattach below, and must
 265         # have/add Rattach. RBAR rewrite rules must add Lattach, but
 266         # don't care about Rattach. Returning () should ensure we
 267         # don't add any probability to such "false" situations
 268         return ()
 269
 270 def inner_dmv_old(s, t, LHS, g, sent, chart):
 271     ''' A rewrite of inner in io.py, to take adjacency into accord.
 272
 273     The chart is now 4 times bigger, since there are different values
 274     for with or without L/R attachments:
 275     chart[(s,t,LHS, Lattach, Rattach)]
 276
 277     If Rattach==True then the rule has a right-attachment or there is
 278     one lower in the tree (meaning we're no longer
 279     adjacent). Adjacency depends on whether there is an attachment
 280     lower in the tree, cf. DMV_Rule.p(LRattach, RLattach).
 281
 282     Todo: if possible, refactor (move dmv-specific stuff back into
 283     dmv, so this is "general" enough to be in io.py)
 284     '''
 285
 286     def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
 287         if io.DEBUG:
 288             attach = {
 289                 (True, True): "left and right attachments below",
 290                 (True, False): "left attachment(s) below",
 291                 (False, True): "right attachment(s) below",
 292                 (False, False): "no attachments below" }
 293             info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
 294             print "%sTrying from  %s_%d  to  %s_%d  with %s, %s:" % info
 295
 296     def O(s):
 297         return sent[s]
 298
 299     sent_nums = [g.tagnum(tag) for tag in sent]
 300
 301     def e(s,t,LHS, Lattach, Rattach, n_t):
 302         def tab():
 303             "Tabs for debug output"
 304             return "\t"*n_t
 305
 306         if (s, t, LHS, Lattach, Rattach) in chart:
 307             return chart[(s, t, LHS, Lattach, Rattach)]
 308         else:
 309             debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
 310             if s == t:
 311                 if Lattach or Rattach:
 312                     # terminals are always F,F for attachment
 313                     io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
 314                     return 0.0
 315                 elif (LHS, O(s)) in g.p_terminals:
 316                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
 317                 else:
 318                     # todo: assuming this is how to deal with lacking
 319                     # rules, since we add prob.s, and 0 is identity
 320                     prob = 0.0
 321                     io.debug( "%sLACKING TERMINAL:" % tab())
 322                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 323                 io.debug( "%s= %.1f (terminal: %s -> %s)" % (tab(),prob,
 324                                                              DMV_Rule.bar_str(LHS),
 325                                                              O(s)) )
 326                 return prob
 327             else:
 328                 if (s,t,LHS,Lattach, Rattach) not in chart:
 329                     chart[(s,t,LHS,Lattach,Rattach)] = 0.0
 330                 for rule in g.sent_rules(LHS, sent_nums): # summing over j,k in a[LHS,j,k]
 331                     io.debug( "%ssumming rule %s" % (tab(),rule) )
 332                     L = rule.L()
 333                     R = rule.R()
 334                     # if it's a STOP rule, rewrite for the same range:
 335                     if (L == STOP) or (R == STOP):
 336                         if L == STOP:
 337                             p = rule.p_old(Lattach, False) # todo check
 338                             pLR = e(s, t, R, Lattach, Rattach, n_t+1)
 339                         elif R == STOP:
 340                             p = rule.p_old(False, Rattach) # todo check
 341                             pLR = e(s, t, L, Lattach, Rattach, n_t+1)
 342                         chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
 343
 344                     # not a STOP, an attachment rewrite:
 345                     else:
 346                         for r in range(s, t):
 347                             if head(L) in sent_nums[s:r+1] and head(R) in sent_nums[r+1:t+1]:
 348                                 # LL etc are boolean attachment values
 349                                 for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
 350                                     p = rule.p_old(LR, RL) # probN or probA
 351                                     pL = e(s,   r, L, LL, LR, n_t+1)
 352                                     pR = e(r+1, t, R, RL, RR, n_t+1)
 353                                     chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
 354
 355                 return chart[(s, t, LHS,Lattach,Rattach)]
 356     # end of e-function
 357
 358     inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
 359     if io.DEBUG:
 360         print "---CHART:---"
 361         for (s,t,LHS,L,R),v in chart.iteritems():
 362             print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(LHS,g.numtag),
 363                                                                    O(s), s,
 364                                                                    O(s), t,
 365                                                                    L, R, v)
 366         print "---CHART:end---"
 367     return [inner_prob, chart]
 368 # end of inner_dmv_old (Lattach,Rattach)
 369
 370
 371
 372
 373 def locs(h, sent, s=0, t=None):
 374     '''Return the locations of h in sent, or some fragment of sent (in the
 375     latter case we make sure to offset the locations correctly so that
 376     for any x in the returned list, sent[x]==h).'''
 377     if t == None:
 378         t = len(sent)
 379     return [i+s for i,w in enumerate(sent[s:t]) if w == h]
 380
 381
 382 def inner_dmv(s, t, LHS, loc_h, g, sent, chart):
 383     ''' A rewrite of inner in io.py, to take adjacency into accord.
 384
 385     The chart now has loc_h (instead of Lattach, Rattach):
 386     chart[(s,t,LHS, loc_h)]
 387
 388     loc_h gives adjacency (along with r for attachment rules), and is
 389     needed in P_STOP reestimation.
 390
 391     Todo: if possible, refactor (move dmv-specific stuff back into
 392     dmv, so this is "general" enough to be in io.py)
 393     '''
 394
 395     def O(s):
 396         return sent[s]
 397
 398     sent_nums = [g.tagnum(tag) for tag in sent]
 399
 400     def e(s,t,LHS, loc_h, n_t):
 401         def tab():
 402             "Tabs for debug output"
 403             return "\t"*n_t
 404
 405         if (s, t, LHS, loc_h) in chart:
 406             return chart[(s, t, LHS, loc_h)]
 407         else:
 408             if s == t:
 409                 if not loc_h == s:
 410                     # terminals are always F,F for attachment
 411                     io.debug("%s*= 0.0 (wrong loc_h)" % tab())
 412                     return 0.0
 413                 elif (LHS, O(s)) in g.p_terminals:
 414                     prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
 415                 else:
 416                     # todo: assuming this is how to deal w/lacking
 417                     # rules, since we add prob.s, and 0 is identity
 418                     prob = 0.0
 419                     io.debug( "%sLACKING TERMINAL:" % tab())
 420                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 421                 io.debug( "%s*= %.1f (terminal: %s -> %s_%d)" % (tab(),prob,
 422                                                                  DMV_Rule.bar_str(LHS),
 423                                                                  O(s), loc_h) )
 424                 return prob
 425             else:
 426                 p = 0.0 # "sum over j,k in a[LHS,j,k]"
 427                 for rule in g.sent_rules(LHS, sent_nums):
 428                     io.debug( "%ssumming rule %s" % (tab(),rule) )
 429                     L = rule.L()
 430                     R = rule.R()
 431                     # if it's a STOP rule, rewrite for the same range:
 432                     if (L == STOP) or (R == STOP):
 433                         if L == STOP:
 434                             p_h = rule.p(s,s,t,loc_h)
 435                             pLR = e(s, t, R, loc_h, n_t+1)
 436                         elif R == STOP:
 437                             p_h = rule.p(s,s,t,loc_h)
 438                             pLR = e(s, t, L, loc_h, n_t+1)
 439                         p += p_h * pLR
 440                         io.debug( "%s= %.3f" % (tab(), p) )
 441
 442                     else: # not a STOP, an attachment rewrite:
 443                         for r in range(s, t):
 444                             p_h = rule.p(s,r,t,loc_h) # probN or probA
 445                             if rule.LHS() == L:
 446                                 locs_L = [loc_h]
 447                                 locs_R = locs(head(R), sent_nums, r+1, t+1)
 448                             elif rule.LHS() == R:
 449                                 locs_L = locs(head(L), sent_nums,  s,  r+1)
 450                                 locs_R = [loc_h]
 451                             # see http://tinyurl.com/4ffhhw
 452                             p += sum([e(s, r, L, loc_L, n_t+1) * p_h *
 453                                       e(r+1, t, R, loc_R, n_t+1)
 454                                       for loc_L in locs_L
 455                                       for loc_R in locs_R])
 456                             io.debug( "%s+= %.3f" % (tab(), p) )
 457                 chart[(s, t, LHS, loc_h)] = p
 458                 return p
 459     # end of e-function
 460
 461     inner_prob = e(s,t,LHS,loc_h, 0)
 462     if io.DEBUG:
 463         print "---CHART:---"
 464         for (s,t,LHS,loc_h),v in chart.iteritems():
 465             print "\t%s -> %s_%d ... %s_%d (loc_h:%s):\t%.3f" % (DMV_Rule.bar_str(LHS,g.numtag),
 466                                                                    O(s), s, O(s), t, loc_h, v)
 467         print "---CHART:end---"
 468     return [inner_prob, chart]
 469 # end of inner_dmv (loc_h)
 470
 471 if __name__ == "__main__":                      # Non, Adj
 472     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 473     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 474     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 475     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 476     b2  = {}
 477     b2[(NOBAR, 0), 'h'] = 1.0
 478     b2[(RBAR, 0), 'h'] = h_S.probA
 479     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 480
 481     g_dup = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 482
 483     io.DEBUG = 0
 484     test0 = inner_dmv(0, 1, (LRBAR,0), 0, g_dup, 'h h'.split(), {})
 485     if not  "0.120"=="%.3f" % test0[0]:
 486         print "Should be 0.120: %.3f" % test0[0]
 487
 488     test1 = inner_dmv(0, 1, (LRBAR,0), 1, g_dup, 'h h'.split(), {})
 489     if not  "0.063"=="%.3f" % test1[0]:
 490         print "Should be 0.063: %.3f" % test1[0]
 491
 492
 493 ##############################
 494 # DMV-probabilities, todo:   #
 495 ##############################
 496
 497
 498 def P_CHOOSE():
 499     return "todo"
 500
def DMV(sent, g):
    '''Sketch (pseudo-code, not runnable as it stands) of the DMV
    probability of a sentence: the product, over the dependents of each
    head in both directions, of the STOP and CHOOSE probabilities,
    recursing into each dependent.

    Here it seems like they store rule information on a per-head (per
    direction) basis, in deps_D(h, dir) which gives us a list.

    NOTE(review): deps, D, adj and root are not defined in the part of
    the file visible here; P_STOP and P_CHOOSE are called with other
    argument lists than their definitions in this file take; and g is
    never used.'''
    def P_h(h):
        # NOTE: this local shadows the enclosing function P_h, so the
        # recursive call P_h(D(a)) below would try to call a number.
        P_h = 1 # ?
        for dir in ['l', 'r']:
            for a in deps(h, dir):
                # D(a)??
                # "STOP | h" is notation for P(STOP | h, dir, adj); as
                # Python it would be a bitwise or.
                P_h *= \
                    P_STOP (0, h, dir, adj) * \
                    P_CHOOSE (a, h, dir) * \
                    P_h(D(a)) * \
                P_STOP (STOP | h, dir, adj)
        return P_h
    return P_h(root(sent))
 516
 517
 518 def P_STOP(STOP, h, dir, adj, g, corpus):
 519     '''corpus is a list of sentences s.
 520
 521 This is based on the formula where STOP is True... not sure how we
 522 calculate if STOP is False.
 523
 524
 525 I thought about instead having this:
 526
 527 for rule in g.p_rules:
 528     rule.num = 0
 529     rule.den = 0
 530 for sent in corpus:
 531     for rule in g.p_rules:
 532        for s:
 533            for t:
 534                set num and den using inner
 535 for rule in g.p_rules
 536     rule.prob = rule.num / rule.den
 537
 538 ..the way I'm assuming we do it in the commented out io-function in
 539 io.py. Having sentences as the outer loop at least we can easily just
 540 go through the heads that are actually in the sentence... BUT, this
 541 means having to go through p_rules 3 times, not sure what is slower.
 542
 543 Also, now inner_dmv makes sure it only goes through heads that are
 544 actually in the sentence, so that argument falls.
 545
 546 oh, and:
 547 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 548 =D
 549
 550 '''
 551     P_STOP_num = 0
 552     P_STOP_den = 0
 553     h_tag = g.numtag(h)
 554     for sent in corpus:
 555         # have to go through _all_ places where h appears in the
 556         # sentence...how? how to make sure it _works_?
 557         chart = {} # cuts time from 17s to 7s !
 558         if h_tag in sent:
 559             locs_h = locs(h_tag, sent)
 560             io.debug( "locs_h:%s, sent:%s"%(locs_h,sent))
 561             for loc_h in locs_h:
 562                 for s in range(loc_h): # s<loc(h), range gives strictly less
 563                     for t in range(loc_h, len(sent)): # should not be range(s,..), right? todo
 564                         P_STOP_num += inner_dmv(s, t, (LRBAR,h), loc_h, g, sent, chart)[0]
 565                         P_STOP_den += inner_dmv(s, t, (RBAR,h), loc_h, g, sent, chart)[0]
 566
 567     io.debug( "num/den: %s / %s"%(P_STOP_num, P_STOP_den))
 568     if P_STOP_den > 0.0:
 569         return P_STOP_num / P_STOP_den # upside down in article
 570     else:
 571         return 0.0
 572
 573
 574
 575 def testreestimation():
 576     testcorpus = [s.split() for s in ['det nn vbd c vbd','det nn vbd c nn vbd pp',
 577                                       'det nn vbd',      'det vbd nn c vbd pp',
 578                                       'det nn vbd',      'det vbd c nn vbd pp',
 579                                       'det nn vbd',      'det nn vbd c vbd pp',
 580                                       'det nn vbd',      'det nn vbd c det vbd pp',
 581                                       'det nn vbd',      'det nn vbd c vbd det det det pp',
 582                                       'det nn vbd',      'det nn vbd c vbd pp',
 583                                       'det nn vbd',      'det nn vbd c vbd det pp',
 584                                       'det nn vbd',      'det nn vbd c vbd pp',
 585                                       'det nn vbd pp',   'det nn vbd det', ]]
 586     g = harmonic.initialize(testcorpus)
 587
 588     h_tag = 'nn'
 589     h = g.tagnum(h_tag)
 590     print "This will take some time. todo: figure out why it doesn't work"
 591     for r in g.h_rules(h):
 592         if r.L()==STOP:
 593             print r
 594             print "off-set the rule, see what happens:"
 595             r.probN = 0.7
 596             print r
 597             for i in range(10):
 598                 pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus)
 599                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
 600
 601                 for r in g.h_rules(h):
 602                     if r.L()==STOP:
 603                         print r
 604                         r.probN = pstophln
 605                         print r
 606     return "todo"
 607
 608 if __name__ == "__main__":
 609     io.DEBUG = 0
 610     import timeit
 611     timeit.Timer("dmv.testreestimation()","import dmv").timeit(1)
 612 #    pass
 613
 614
 615
 616 # todo: some more testing on the Brown corpus:
#     # first ten sentences of the Brown corpus:
 618 #     g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 619 #     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 620
 621 #     io.DEBUG = 0
 622 #     test_brown = inner_dmv(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
 623 #     if io.DEBUG:
 624 #         for r in  g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
 625 #             L = r.L()
 626 #             R = r.R()
 627 #             if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
 628 #                 print r
 629 #     print "Brown-test gives: %.8f" % test_brown[0]
 630
 631
 632
 633     # this will give the tag sequences of all the 6218 Brown corpus
 634     # sentences of length < 7:
 635     # [[tag for (w, tag) in sent]
 636     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 637
 638
 639
 640
 641
 642