src/dmv.py

   1 #### changes by KBU:
   2 # 2008-06-12
   3 # - outer() seems to be working, wrote c(s,t,LHS,loc_h,...) too now.
   4 #
   5 # 2008-06-11
   6 # - moved prune() to junk.py, now using outer() instead. outer() is
   7 #   written, but needs testing.
   8 #
   9 # 2008-06-09
  10 # - prune() finished, seems to be working.
  11 # - started on implementing the other reestimation formulas, in
  12 #   reestimate()
  13 #
  14 # 2008-06-04
  15 # - moved initialization to harmonic.py
  16 #
  17 # 2008-06-03
  18 # - fixed a number of little bugs in initialization, where certain
  19 #   rules were simply not created, or created "backwards"
  20 # - dmv.inner() should Work now...
  21 #
  22 # 2008-06-01
  23 # - finished typing in dmv.inner(), still have to test and debug
  24 #   it. The ichart is now four times as big since for any rule we may
  25 #   have attachments to either the left or the right below, which
  26 #   upper rules depend on, for selecting probN or probA
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-05-29
  34 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  35 #   and also adds the relevant probabilities to p_rules in a grammar.
  36 #   Still, each individual rule has to store both adjacent and non_adj
  37 #   probabilities, and inner() should be able to send some parameter
  38 #   which lets the rule choose... hopefully... Is this possible to do
  39 #   top-down even? when the sentence could be all the same words?
  40 #   todo: extensive testing of identical words in sentences!
  41 # - frequencies (only used in initialization) are stored as strings,
  42 #   but in the rules and p_STOP etc, there are only numbers.
  43 #
  44 # 2008-05-28
  45 # - more work on initialization (init_freq and init_normalize),
  46 #   getting closer to probabilities now.
  47 #
  48 # 2008-05-27
  49 # - started on initialization. So far, I have frequencies for
  50 #   everything, very harmonic. Still need to make these into 1-summing
  51 #   probabilities
  52 #
  53 # 2008-05-24
  54 # - prettier printout for DMV_Rule
  55 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
  56 #   form (seals, head).
  57 # - Started on P_STOP, a bit less pseudo now..
  58
  59
  60
  61 #import numpy # numpy provides Fast Arrays, for future optimization
  62 import io
  63
# non-tweakable/constant "lookup" globals
# Seal states: the first element of every (seals, head) node.
# DMV_Rule.bar_str prints RGO_L nodes as " h_ ", SEAL nodes as "_h_" and
# nodes in any other state as " h  ".
GO_R = 0  # was: NOBAR
RGO_L = 1 # was: RBAR
SEAL = 2  # was: LRBAR

# probably need these for combined model, see thesis-appendix:
GO_L = 3
LGO_R = 4
# The only seal values DMV_Rule.__init__ accepts (anything else raises ValueError).
SEALS = [GO_R, RGO_L, SEAL, GO_L, LGO_R]

# Special nodes; note that their head numbers are negative.
# NOTE(review): presumably so they can never collide with real tag numbers -- confirm.
ROOT = (SEAL, -1)
STOP = (GO_R, -2)

# Banner printed only when this file is run as a script.
if __name__ == "__main__":
    print "DMV module tests:"
  79
  80
  81 def node(seals, head):
  82     '''Useless function, but just here as documentation. Nodes make up
  83     LHS, R and L in each DMV_Rule'''
  84     return (seals, head)
  85
  86 def seals(node):
  87     return node[0]
  88
  89 def head(node):
  90     return node[1]
  91
  92
  93 class DMV_Grammar(io.Grammar):
  94     '''The DMV-PCFG.
  95
  96     Public members:
  97     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  98     These are changed in the Maximation step, then used to set the
  99     new probabilities of each DMV_Rule.
 100
 101     Todo: make p_terminals private? (But it has to be changable in
 102     maximation step due to the short-cutting rules... could of course
 103     make a DMV_Grammar function to update the short-cut rules...)
 104
 105     __p_rules is private, but we can still say stuff like:
 106     for r in g.all_rules():
 107         r.probN = newProbN
 108
 109     What other representations do we need? (P_STOP formula uses
 110     deps_D(h,l/r) at least)'''
 111     def __str__(self):
 112         str = ""
 113         for r in self.all_rules():
 114              str += "%s\n" % r.__str__(self.numtag)
 115         return str
 116
 117     def h_rules(self, h):
 118         return [r for r in self.all_rules() if r.head() == h]
 119
 120     def mothersL(self, Node, sent_nums):
 121         return [r for r in self.all_rules() if r.L() == Node]
 122
 123     def mothersR(self, Node, sent_nums):
 124         return [r for r in self.all_rules() if r.R() == Node]
 125
 126     def rules(self, LHS):
 127         return [r for r in self.all_rules() if r.LHS() == LHS]
 128
 129     def sent_rules(self, LHS, sent_nums):
 130         '''Used in dmv.inner. Todo: this takes a _lot_ of time, it
 131         seems. Could use some more space and cache some of this
 132         somehow perhaps?'''
 133         # We don't want to rule out STOPs!
 134         nums = sent_nums + [ head(STOP) ]
 135         return [r for r in self.all_rules() if r.LHS() == LHS
 136                 and head(r.L()) in nums and head(r.R()) in nums]
 137
 138     def deps_L(self, head): # todo: do I use this at all?
 139         # todo test, probably this list comprehension doesn't work
 140         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 141
 142     def deps_R(self, head):
 143         # todo test, probably this list comprehension doesn't work
 144         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 145
 146     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
 147         io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
 148         self.p_STOP = p_STOP
 149         self.p_CHOOSE = p_CHOOSE
 150         self.p_ROOT = p_ROOT
 151         self.head_nums = [k for k in numtag.iterkeys()]
 152
 153
 154 class DMV_Rule(io.CNF_Rule):
 155     '''A single CNF rule in the PCFG, of the form
 156     LHS -> L R
 157     where LHS, L and R are 'nodes', eg. of the form (seals, head).
 158
 159     Public members:
 160     probN, probA
 161
 162     Private members:
 163     __L, __R, __LHS
 164
 165     Different rule-types have different probabilities associated with
 166     them:
 167
 168     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 169     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 170      h_ ->  h  STOP     P( STOP|h,R,    adj)
 171      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 172      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 173      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 174      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 175      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 176     '''
 177     def p(self, adj, *arg):
 178         if adj:
 179             return self.probA
 180         else:
 181             return self.probN
 182
 183     def p_STOP(self, s, t, loc_h):
 184         '''Returns the correct probability, adjacent if we're rewriting from
 185         the (either left or right) end of the fragment. '''
 186         if self.L() == STOP:
 187             return self.p(s == loc_h)
 188         elif self.R() == STOP:
 189             if not loc_h == s:
 190                 if 'TODO' in io.DEBUG:
 191                     print "(%s given loc_h:%d but s:%d. Todo: optimize away!)" % (self, loc_h, s)
 192                 return 0.0
 193             else:
 194                 return self.p(t == loc_h)
 195
 196     def p_ATTACH(self, r, loc_h, s=None):
 197         '''Returns the correct probability, adjacent if we haven't attached
 198         anything before.'''
 199         if self.LHS() == self.L():
 200             if not loc_h == s:
 201                 if 'TODO' in io.DEBUG:
 202                     print "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)" % (self, loc_h, s)
 203                 return 0.0
 204             else:
 205                 return self.p(r == loc_h)
 206         elif self.LHS() == self.R():
 207             return self.p(r+1 == loc_h)
 208
 209     def seals(self):
 210         return seals(self.LHS())
 211
 212     def head(self):
 213         return head(self.LHS())
 214
 215     def __init__(self, LHS, L, R, probN, probA):
 216         for b_h in [LHS, L, R]:
 217             if seals(b_h) not in SEALS:
 218                 raise ValueError("seals must be in %s; was given: %s"
 219                                  % (SEALS, seals(b_h)))
 220         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 221         self.probA = probA # adjacent
 222         self.probN = probN # non_adj
 223
 224     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 225     def bar_str(cls, b_h, tag=lambda x:x):
 226         if(b_h == ROOT):
 227             return 'ROOT'
 228         elif(b_h == STOP):
 229             return 'STOP'
 230         elif(seals(b_h) == RGO_L):
 231             return " %s_ " % tag(head(b_h))
 232         elif(seals(b_h) == SEAL):
 233             return "_%s_ " % tag(head(b_h))
 234         else:
 235             return " %s  " % tag(head(b_h))
 236
 237
 238     def __str__(self, tag=lambda x:x):
 239         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 240                                                   self.bar_str(self.L(), tag),
 241                                                   self.bar_str(self.R(), tag),
 242                                                   self.probN,
 243                                                   self.probA)
 244
 245
 246
 247
 248
 249
 250
 251 ###################################
 252 # dmv-specific version of inner() #
 253 ###################################
 254 def locs(h, sent, s=0, t=None, remove=None):
 255     '''Return the locations of h in sent, or some fragment of sent (in the
 256     latter case we make sure to offset the locations correctly so that
 257     for any x in the returned list, sent[x]==h).
 258
 259     t is inclusive, to match the way indices work with inner()
 260     (although python list-splicing has "exclusive" end indices)'''
 261     if t == None:
 262         t = len(sent)-1
 263     return [i+s for i,w in enumerate(sent[s:t+1])
 264             if w == h and not (i+s) == remove]
 265
 266
def inner(s, t, LHS, loc_h, g, sent, ichart):
    ''' A rewrite of io.inner(), to take adjacency into account.

    Returns the inner probability of the node LHS over sent[s..t] (t
    inclusive), with the head of LHS located at position loc_h.

    Arguments:
    s, t   -- first and last (inclusive) index of the span
    LHS    -- the (seals, head) node to rewrite
    loc_h  -- position in sent of the head of LHS
    g      -- the grammar; g.sent_nums(), g.sent_rules() and
              g.p_terminals are used here
    sent   -- the sentence (indexed by position)
    ichart -- dict used as memo table. It is filled in as a side effect
              (terminal lookups are not stored), and its 'tree' entry is
              set to an empty dict on every call.

    The ichart is now of this form:
    ichart[s,t,LHS, loc_h]

    loc_h gives adjacency (along with r and location of other child
    for attachment rules), and is needed in P_STOP reestimation.

    Todo: if possible, refactor (move dmv-specific stuff back into
    dmv, so this is "general" enough to be in io.py)
    '''

    def O(s):
        # the observed word at position s
        return sent[s]

    sent_nums = g.sent_nums(sent)
    # Maps a chart key to the set of child keys that contributed a
    # non-zero probability. It is only collected locally: below,
    # ichart['tree'] is set to a fresh empty dict, not to this one.
    tree = {}

    def e(s,t,LHS, loc_h, n_t):
        # n_t is the recursion depth, used only to indent debug output
        def tab():
            "Tabs for debug output"
            return "\t"*n_t

        if (s, t, LHS, loc_h) in ichart:
            # already computed, reuse it
            if 'INNER' in io.DEBUG:
                print "%s*= %.4f in ichart: s:%d t:%d LHS:%s loc:%d" % (tab(),ichart[s, t, LHS, loc_h], s, t,
                                                                       DMV_Rule.bar_str(LHS), loc_h)
            return ichart[s, t, LHS, loc_h]
        else:
            # base case: an unsealed node over a single word
            if s == t and seals(LHS) == GO_R:
                if not loc_h == s:
                    if 'INNER' in io.DEBUG:
                        print "%s*= 0.0 (wrong loc_h)" % tab()
                    return 0.0
                elif (LHS, O(s)) in g.p_terminals:
                    prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
                else:
                    # todo: assuming this is how to deal w/lacking
                    # rules, since we add prob.s, and 0 is identity
                    prob = 0.0
                    if 'INNER' in io.DEBUG:
                        print "%sLACKING TERMINAL:" % tab()
                # todo: add to ichart perhaps? Although, it _is_ simple lookup..
                if 'INNER' in io.DEBUG:
                    print "%s*= %.4f (terminal: %s -> %s_%d)" % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h)
                return prob
            else:
                p = 0.0 # "sum over j,k in a[LHS,j,k]"
                for rule in g.sent_rules(LHS, sent_nums):
                    if 'INNER' in io.DEBUG:
                        print "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h)
                    L = rule.L()
                    R = rule.R()
                    if (s,t,LHS,loc_h) not in tree:
                        tree[s,t,LHS,loc_h] = set()
                    # skip rules where the head would have to sit in the
                    # left child although it is at the right edge of the
                    # span, or the other way around
                    if loc_h == t and rule.LHS() == L:
                        continue # todo: speed-test
                    if loc_h == s and rule.LHS() == R:
                        continue
                    # if it's a STOP rule, rewrite for the same range:
                    if (L == STOP) or (R == STOP):
                        if L == STOP:
                            pLR = e(s, t, R, loc_h, n_t+1)
                            if pLR > 0.0:
                                tree[s,t,LHS,loc_h].add((s,t,R,loc_h))
                        elif R == STOP:
                            pLR = e(s, t, L, loc_h, n_t+1)
                            if pLR > 0.0:
                                tree[s,t,LHS,loc_h].add((s,t,L,loc_h))
                        p += rule.p_STOP(s, t, loc_h) * pLR
                        if 'INNER' in io.DEBUG:
                            print "%sp= %.4f (STOP)" % (tab(), p)

                    elif t > s: # not a STOP, attachment rewrite:
                        rp_ATTACH = rule.p_ATTACH # todo: profile/speedtest
                        # r is the last index of the left child; the right
                        # child covers r+1..t
                        for r in xrange(s, t):
                            p_h = rp_ATTACH(r, loc_h, s=s)
                            # the child that equals LHS keeps the head at
                            # loc_h; the other child may have its head at any
                            # matching position in its span, except loc_h
                            if rule.LHS() == L:
                                locs_L = [loc_h]
                                locs_R = locs(head(R), sent_nums, r+1, t, loc_h)
                            elif rule.LHS() == R:
                                locs_L = locs(head(L), sent_nums,  s,  r, loc_h)
                                locs_R = [loc_h]
                            for loc_L in locs_L:
                                pL = e(s, r, L, loc_L, n_t+1)
                                if pL > 0.0:
                                    for loc_R in locs_R:
                                        pR = e(r+1, t, R, loc_R, n_t+1)
                                        if pR > 0.0: # and pL > 0.0
                                            tree[s,t,LHS,loc_h].add(( s ,r,L,loc_L))
                                            tree[s,t,LHS,loc_h].add((r+1,t,R,loc_R))
                                        p += pL * p_h * pR
                            if 'INNER' in io.DEBUG:
                                print "%sp= %.4f (ATTACH)" % (tab(), p)
                # memoise the sum over all rules for this chart cell
                ichart[s, t, LHS, loc_h] = p
                return p
    # end of e-function

    inner_prob = e(s,t,LHS,loc_h, 0)
    # NOTE(review): stores an empty dict, not the tree collected above --
    # confirm that this is intended.
    ichart['tree'] = {}
    if 'INNER' in io.DEBUG:
        print debug_ichart(g,sent,ichart)
    return inner_prob
# end of dmv.inner(s, t, LHS, loc_h, g, sent, ichart)
 372
 373
 374 def debug_ichart(g,sent,ichart):
 375     str = "---ICHART:---\n"
 376     for (s,t,LHS,loc_h),v in ichart.iteritems():
 377         if type(v) == dict: # skip 'tree'
 378             continue
 379         str += "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f\n" % (DMV_Rule.bar_str(LHS,g.numtag),
 380                                                               sent[s], s, sent[s], t, loc_h, v)
 381     str += "---ICHART:end---\n"
 382     return str
 383
 384
 385 def inner_sent(g, sent, ichart):
 386     return sum([inner(0, len(sent)-1, ROOT, loc_h, g, sent, ichart)
 387                 for loc_h in xrange(len(sent))])
 388
 389
 390 def c(s,t,LHS,loc_h,g,sent,ichart,ochart):
 391     # assuming P_sent = P(D(ROOT)) = inner(sent). todo: check K&M about this
 392     p_sent = inner_sent(g, sent, ichart)
 393     p_in = inner(s,t,LHS,loc_h,g,sent,ichart)
 394     p_out = outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
 395     if p_sent > 0.0:
 396         return p_in * p_out / p_sent
 397     else:
 398         return p_sent
 399
 400 ###################################
 401 # dmv-specific version of outer() #
 402 ###################################
 403 def outer(s,t,Node,loc_N, g, sent, ichart, ochart):
 404     ''' http://www.student.uib.no/~kun041/dmvccm/DMVCCM.html#outer
 405     '''
 406     def e(s,t,LHS,loc_h):
 407         # or we could just look it up in ichart, assuming ichart to be done
 408         return inner(s, t, LHS, loc_h, g, sent, ichart)
 409
 410     T = len(sent)-1
 411     sent_nums = g.sent_nums(sent)
 412
 413     def f(s,t,Node,loc_N):
 414         if (s,t,Node) in ochart:
 415             return ochart[(s, t, Node,loc_N)]
 416         if Node == ROOT:
 417             if s == 0 and t == T:
 418                 return 1.0
 419             else: # ROOT may only be used on full sentence
 420                 return 0.0 # but we may have non-ROOTs over full sentence too
 421         p = 0.0
 422
 423         for mom in g.mothersL(Node, sent_nums): # mom.L() == Node
 424             R = mom.R()
 425             mLHS = mom.LHS()
 426             if R == STOP:
 427                 p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
 428             else:
 429                 if seals(mLHS) == RGO_L: # left attachment, head(mLHS) == head(L)
 430                     for r in xrange(t+1,T+1): # t+1 to lasT
 431                         for loc_m in locs(head(mLHS),sent_nums,t+1,r):
 432                             p_m = mom.p(t+1 == loc_m)
 433                             p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_m)
 434                 else: # right attachment, head(mLHS) == head(Node)
 435                     loc_m = loc_N
 436                     p_m = mom.p( t  == loc_m)
 437                     for r in xrange(t+1,T+1): # t+1 to lasT
 438                         for loc_R in locs(head(mLHS),sent_nums,t+1,r):
 439                             p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_R)
 440
 441         for mom in g.mothersR(Node, sent_nums):
 442             L = mom.L()
 443             mLHS = mom.LHS()
 444             if L == STOP:
 445                 p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
 446             else:
 447                 if seals(mLHS) == RGO_L: # left attachment, head(mLHS) == head(Node)
 448                     loc_m = loc_N
 449                     p_m = mom.p( s  == loc_m)
 450                     for r in xrange(0,s): # first to s-1
 451                         for loc_L in locs(head(L),sent_nums,r,s-1):
 452                             p += e(r,s-1,L, loc_L) * p_m * f(r,t,mLHS,loc_m)
 453                 else: # right attachment, head(mLHS) == head(R)
 454                     for r in xrange(0,s): # first to s-1
 455                         for loc_m in locs(head(mLHS),sent_nums,r,s-1):
 456                             p_m = mom.p(s-1 == loc_m)
 457                             p += e(r,s-1,L, loc_m) * p_m * f(r,t,mLHS,loc_m)
 458         ochart[s,t,Node,loc_N] = p
 459         return p
 460
 461
 462     return f(s,t,Node,loc_N)
 463 # end outer(s,t,Node,loc_N, g,sent, ichart,ochart)
 464
 465
 466
 467 ##############################
 468 #      reestimation, todo:   #
 469 ##############################
 470 def reestimate_zeros(h_nums):
 471     # todo: p_ROOT, p_CHOOSE, p_terminals
 472     f = {}
 473     for h in h_nums:
 474         f[('LNSTOP','num',h)] = 0.0
 475         f[('LNSTOP','den',h)] = 0.0
 476         f[('LASTOP','num',h)] = 0.0
 477         f[('LASTOP','den',h)] = 0.0
 478         f[('RNSTOP','num',h)] = 0.0
 479         f[('RNSTOP','den',h)] = 0.0
 480         f[('RASTOP','num',h)] = 0.0
 481         f[('RASTOP','den',h)] = 0.0
 482     return f
 483
 484 def reestimate(g, corpus):
 485     '''current todo.
 486     P_STOP(-STOP|...) = 1 - P_STOP(STOP|...) '''
 487     f = reestimate_zeros(g.head_nums)
 488     ichart = {}
 489     ochart = {}
 490     def c_g(s,t,LHS,loc_h,sent):
 491         return c(s,t,LHS,loc_h,g,sent,ichart,ochart)
 492     for sent in corpus:
 493         if 'reest' in io.DEBUG:
 494             print sent
 495         sent_nums = g.sent_nums(sent)
 496         ichart = {}
 497         ochart = {}
 498         for loc_h,h in enumerate(sent_nums):
 499             for t in xrange(loc_h, len(sent)):
 500                 for s in xrange(loc_h): # s<loc(h), range gives strictly less
 501                     # left non-adjacent stop
 502                     f[('LNSTOP','num',h)] += c_g(s, t, (SEAL, h), loc_h,sent)
 503                     f[('LNSTOP','den',h)] += c_g(s, t, (RGO_L,h), loc_h,sent)
 504                 # left adjacent stop
 505                 f[('LASTOP','num',h)] += c_g(loc_h, t, (SEAL, h), loc_h,sent)
 506                 f[('LASTOP','den',h)] += c_g(loc_h, t, (RGO_L,h), loc_h,sent)
 507             for t in xrange(loc_h+1, len(sent)):
 508                 # right non-adjacent stop
 509                 f[('RNSTOP','num',h)] += c_g(loc_h, t, (RGO_L,h), loc_h,sent)
 510                 f[('RNSTOP','den',h)] += c_g(loc_h, t, (GO_R, h), loc_h,sent)
 511             f[('RASTOP','num',h)] += c_g(loc_h, loc_h, (RGO_L,h), loc_h,sent)
 512             f[('RASTOP','den',h)] += c_g(loc_h, loc_h, (GO_R, h), loc_h,sent)
 513
 514         # todo: use sum([ichart[s, t...] etc? but can we then
 515         # keep den and num separate within _one_ sum()-call? use map?
 516
 517     # we want to go through only non-ROOT left-STOPs..
 518     for r in g.all_rules():
 519         if r.L() == STOP and not r.LHS() == ROOT:
 520             h = r.head()
 521             if 'reest' in io.DEBUG:
 522                 old_probN = r.probN
 523                 old_probA = r.probA
 524             if f[('LNSTOP','den',h)] > 0.0:
 525                 r.probN = f[('LNSTOP','num',h)] / f[('LNSTOP','den',h)]
 526             else:
 527                 r.probN = 0.0 # or..remove rule? todo
 528             if f[('LASTOP','den',h)] > 0.0:
 529                 r.probA = f[('LASTOP','num',h)] / f[('LASTOP','den',h)]
 530             else:
 531                 r.probA = 0.0 # or..remove rule? todo
 532             if 'reest' in io.DEBUG:
 533                 print "p(STOP|%d=%s,L,N): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
 534                                                          f[('LNSTOP','num',h)],
 535                                                          f[('LNSTOP','den',h)],
 536                                                                    r.probN,
 537                                                                    old_probN)
 538                 print "p(STOP|%d=%s,L,A): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
 539                                                          f[('LASTOP','num',h)],
 540                                                          f[('LASTOP','den',h)],
 541                                                                    r.probA,
 542                                                                    old_probA)
 543         if r.R() == STOP and not r.LHS() == ROOT:
 544             h = r.head()
 545             if 'reest' in io.DEBUG:
 546                 old_probN = r.probN
 547                 old_probA = r.probA
 548             if f[('RNSTOP','den',h)] > 0.0:
 549                 r.probN = f[('RNSTOP','num',h)] / f[('RNSTOP','den',h)]
 550             else:
 551                 r.probN = 0.0 # or..remove rule? todo
 552             if f[('RASTOP','den',h)] > 0.0:
 553                 r.probA = f[('RASTOP','num',h)] / f[('RASTOP','den',h)]
 554             else:
 555                 r.probA = 0.0 # or..remove rule? todo
 556             if 'reest' in io.DEBUG:
 557                 print "p(STOP|%d=%s,R,N): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
 558                                                          f[('RNSTOP','num',h)],
 559                                                          f[('RNSTOP','den',h)],
 560                                                                    r.probN,
 561                                                                    old_probN)
 562                 print "p(STOP|%d=%s,R,A): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
 563                                                          f[('RASTOP','num',h)],
 564                                                          f[('RASTOP','den',h)],
 565                                                                    r.probA,
 566                                                                    old_probA)
 567
 568
 569
 570
 571
 572
 573
 574
 575 ##############################
 576 #     testing functions:     #
 577 ##############################
 578
 579 testcorpus = [s.split() for s in ['det nn vbd c vbd','vbd nn c vbd',
 580                                   'det nn vbd',      'det nn vbd c pp',
 581                                   'det nn vbd',      'det vbd vbd c pp',
 582                                   'det nn vbd',      'det nn vbd c vbd',
 583                                   'det nn vbd',      'det nn vbd c vbd',
 584                                   'det nn vbd',      'det nn vbd c vbd',
 585                                   'det nn vbd',      'det nn vbd c pp',
 586                                   'det nn vbd pp',   'det nn vbd', ]]
 587 def testgrammar():
 588     import harmonic
 589     reload(harmonic)
 590     return harmonic.initialize(testcorpus)
 591
 592 def testreestimation():
 593     io.DEBUG.add('reest')
 594     g = testgrammar()
 595     reestimate(g, testcorpus)
 596
 597
 598
 599 def testgrammar_a():                            # Non, Adj
 600     _h_ = DMV_Rule((SEAL,0), STOP,    ( RGO_L,0), 0.9, 0.9) # LSTOP
 601     h_S = DMV_Rule(( RGO_L,0),(GO_R,0),  STOP,    0.4, 0.3) # RSTOP
 602     h_A = DMV_Rule(( RGO_L,0),(SEAL,0),( RGO_L,0),0.2, 0.1) # Lattach
 603     h_Aa= DMV_Rule(( RGO_L,0),(SEAL,1),( RGO_L,0),0.4, 0.6) # Lattach to a
 604     h   = DMV_Rule((GO_R,0),(GO_R,0),(SEAL,0),    1.0, 1.0) # Rattach
 605     ha  = DMV_Rule((GO_R,0),(GO_R,0),(SEAL,1),    1.0, 1.0) # Rattach to a
 606     rh  = DMV_Rule(   ROOT,   STOP,    (SEAL,0),  1.0, 1.0) # ROOT
 607
 608     _a_ = DMV_Rule((SEAL,1), STOP,    ( RGO_L,1), 1.0, 1.0) # LSTOP
 609     a_S = DMV_Rule(( RGO_L,1),(GO_R,1),  STOP,    0.4, 0.3) # RSTOP
 610     a_A = DMV_Rule(( RGO_L,1),(SEAL,1),( RGO_L,1),0.4, 0.6) # Lattach
 611     a_Ah= DMV_Rule(( RGO_L,1),(SEAL,0),( RGO_L,1),0.2, 0.1) # Lattach to h
 612     a   = DMV_Rule((GO_R,1),(GO_R,1),(SEAL,1),    1.0, 1.0) # Rattach
 613     ah  = DMV_Rule((GO_R,1),(GO_R,1),(SEAL,0),    1.0, 1.0) # Rattach to h
 614     ra  = DMV_Rule(   ROOT,   STOP,    (SEAL,1),  0.1, 0.1) # ROOT
 615
 616     b2  = {}
 617     b2[(GO_R, 0), 'h'] = 1.0
 618     b2[(GO_R, 1), 'a'] = 1.0
 619
 620     return DMV_Grammar([ h_Aa, ha, a_Ah, ah, ra, _a_, a_S, a_A, a, rh, _h_, h_S, h_A, h ],b2,0,0,0, {0:'h',1:'a'}, {'h':0,'a':1})
 621
 622
 623
 624 def testgrammar_h():                            # Non, Adj
 625     _h_ = DMV_Rule((SEAL,0), STOP,    ( RGO_L,0), 1.0, 1.0) # LSTOP
 626     h_S = DMV_Rule(( RGO_L,0),(GO_R,0),  STOP,    0.4, 0.3) # RSTOP
 627     h_A = DMV_Rule(( RGO_L,0),(SEAL,0),( RGO_L,0), 0.6, 0.7) # Lattach
 628     h   = DMV_Rule((GO_R,0),(GO_R,0),(SEAL,0), 1.0, 1.0) # Rattach
 629     rh  = DMV_Rule(   ROOT,   STOP,    (SEAL,0), 1.0, 1.0) # ROOT
 630     b2  = {}
 631     b2[(GO_R, 0), 'h'] = 1.0
 632
 633     return DMV_Grammar([ rh, _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
 634
 635
 636 def testreestimation_h():
 637     io.DEBUG.add('reest')
 638     g = testgrammar_h()
 639     reestimate(g,['h h h'.split()])
 640
 641 def regression_tests():
 642     g_dup = testgrammar_h()
 643
 644     test0 = inner(0, 1, (SEAL,0), 0, g_dup, 'h h'.split(), {})
 645     if not  "0.120"=="%.3f" % test0:
 646         print "Should be 0.120: %.3f" % test0
 647
 648     test1 = inner(0, 1, (SEAL,0), 1, g_dup, 'h h'.split(), {})
 649     if not  "0.063"=="%.3f" % test1:
 650         print "Should be 0.063: %.3f" % test1
 651
 652     test3 = inner(0, 2, (SEAL,0), 2, g_dup, 'h h h'.split(), {})
 653     if not  "0.0498"=="%.4f" % test3:
 654         print "Should be 0.0498: %.4f" % test3
 655
 656     test4 = outer(1,2,(1,0),2,testgrammar_h(),'h h h'.split(),{},{})
 657     if not "0.58" == "%.2f" % test4:
 658         print "Should be 0.58: %.2f" % test4
 659
 660 if __name__ == "__main__":
 661     import timeit
 662 #     import profile
 663 #     profile.run('testreestimation()')
 664 #     print timeit.Timer("dmv.testreestimation()",'''import dmv
 665 # reload(dmv)''').timeit(1)
 666 #    testreestimation_h()
 667     io.DEBUG.clear()
 668     regression_tests()
 669     print "outer(0,0,(1,0),0,testgrammar_a(),'h a'.split(),{},{}):"
 670     print outer(0,0,(1,0),0,testgrammar_a(),'h a'.split(),{},{})
 671     print "outer(0,0,(0,0),0,testgrammar_a(),'h a'.split(),{},{}):"
 672     print outer(0,0,(0,0),0,testgrammar_a(),'h a'.split(),{},{})
 673
 674     io.DEBUG.clear()
 675     #print "testreestimation():"
 676     # testreestimation()