pstop seems done, todo: pchoose reestimation
[dmvccm.git] / src / dmv.py
blob15ea7d4e3b54a15a33a1e016c84009f0ccc969bc
1 #### changes by KBU:
2 # 2008-06-12
3 # - outer() seems to be working, wrote c(s,t,LHS,loc_h,...) too now.
4 #
5 # 2008-06-11
6 # - moved prune() to junk.py, now using outer() instead. outer() is
7 # written, but needs testing.
9 # 2008-06-09
10 # - prune() finished, seems to be working.
11 # - started on implementing the other reestimation formulas, in
12 # reestimate()
14 # 2008-06-04
15 # - moved initialization to harmonic.py
17 # 2008-06-03
18 # - fixed a number of little bugs in initialization, where certain
19 # rules were simply not created, or created "backwards"
20 # - dmv.inner() should Work now...
22 # 2008-06-01
23 # - finished typing in dmv.inner(), still have to test and debug
24 # it. The ichart is now four times as big since for any rule we may
25 # have attachments to either the left or the right below, which
26 # upper rules depend on, for selecting probN or probA
28 # 2008-05-30
29 # - copied inner() into this file, to make the very dmv-specific
30 # adjacency stuff work (have to factor that out later on, when it
31 # works).
33 # 2008-05-29
34 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
35 # and also adds the relevant probabilities to p_rules in a grammar.
36 # Still, each individual rule has to store both adjacent and non_adj
37 # probabilities, and inner() should be able to send some parameter
38 # which lets the rule choose... hopefully... Is this possible to do
39 # top-down even? when the sentence could be all the same words?
40 # todo: extensive testing of identical words in sentences!
41 # - frequencies (only used in initialization) are stored as strings,
42 # but in the rules and p_STOP etc, there are only numbers.
44 # 2008-05-28
45 # - more work on initialization (init_freq and init_normalize),
46 # getting closer to probabilities now.
48 # 2008-05-27
49 # - started on initialization. So far, I have frequencies for
50 # everything, very harmonic. Still need to make these into 1-summing
51 # probabilities
53 # 2008-05-24
54 # - prettier printout for DMV_Rule
55 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
56 # form (seals, head).
57 # - Started on P_STOP, a bit less pseudo now..
61 #import numpy # numpy provides Fast Arrays, for future optimization
62 import io
# non-tweakable/constant "lookup" globals
# Seal values form the first component of every node (seals, head).
# DMV_Rule.bar_str renders RGO_L as " h_ ", SEAL as "_h_ " and every
# other seal value as " h ".
GO_R = 0 # was: NOBAR
RGO_L = 1 # was: RBAR
SEAL = 2 # was: LRBAR
# probably need these for combined model, see thesis-appendix:
GO_L = 3
LGO_R = 4
# Every legal seal value; DMV_Rule.__init__ validates its nodes against this.
SEALS = [GO_R, RGO_L, SEAL, GO_L, LGO_R]

# Special nodes. Their "heads" are negative numbers.
# NOTE(review): presumably real tag numbers are never negative, so these
# cannot collide with them -- confirm against the tag numbering code.
ROOT = (SEAL, -1)
STOP = (GO_R, -2)

if __name__ == "__main__":
    # Banner printed before the module self-tests at the end of the file.
    print "DMV module tests:"
def node(seals, head):
    '''Build a node: the (seals, head) pair that serves as LHS, L and R
    in every DMV_Rule. Kept mostly as documentation of that layout.'''
    pair = (seals, head)
    return pair
def seals(node):
    '''Return the seals component (first element) of a node.'''
    sealing = node[0]
    return sealing
def head(node):
    '''Return the head component (second element) of a node.'''
    head_num = node[1]
    return head_num
class DMV_Grammar(io.Grammar):
    '''The DMV-PCFG.

    Public members:
    p_STOP, p_ROOT, p_CHOOSE, p_terminals
    These are changed in the Maximization step, then used to set the
    new probabilities of each DMV_Rule.

    Todo: make p_terminals private? (But it has to be changeable in
    maximization step due to the short-cutting rules... could of course
    make a DMV_Grammar function to update the short-cut rules...)

    __p_rules is private, but we can still say stuff like:
    for r in g.all_rules():
        r.probN = newProbN

    What other representations do we need? (P_STOP formula uses
    deps_D(h,l/r) at least)'''
    def __str__(self):
        '''One line per rule, each rendered through self.numtag.'''
        # join() instead of repeated +=, and no local named `str`
        # shadowing the builtin.
        return "".join(["%s\n" % r.__str__(self.numtag)
                        for r in self.all_rules()])

    def h_rules(self, h):
        '''All rules whose LHS has head h.'''
        return [r for r in self.all_rules() if r.head() == h]

    def mothersL(self, Node, sent_nums):
        '''All rules having Node as left child (sent_nums is unused).'''
        return [r for r in self.all_rules() if r.L() == Node]

    def mothersR(self, Node, sent_nums):
        '''All rules having Node as right child (sent_nums is unused).'''
        return [r for r in self.all_rules() if r.R() == Node]

    def rules(self, LHS):
        '''All rules rewriting LHS.'''
        return [r for r in self.all_rules() if r.LHS() == LHS]

    def sent_rules(self, LHS, sent_nums):
        '''Used in dmv.inner. Todo: this takes a _lot_ of time, it
        seems. Could use some more space and cache some of this
        somehow perhaps?

        Returns the rules rewriting LHS whose two children both have a
        head that occurs in sent_nums (or is the STOP pseudo-head).'''
        # We don't want to rule out STOPs!
        nums = sent_nums + [ head(STOP) ]
        return [r for r in self.all_rules() if r.LHS() == LHS
                and head(r.L()) in nums and head(r.R()) in nums]

    def deps_L(self, head): # todo: do I use this at all?
        '''Left children of every rule whose LHS has the given head.'''
        # The old comprehension tested `a == r.L()` with `a` never bound,
        # which raised NameError as soon as a rule matched the head.
        return [r.L() for r in self.all_rules() if r.head() == head]

    def deps_R(self, head):
        '''Right children of every rule whose LHS has the given head.'''
        # Same unbound-name fix as in deps_L.
        return [r.R() for r in self.all_rules() if r.head() == head]

    def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
        '''Store the DMV parameter tables next to what io.Grammar keeps.

        numtag is iterated for its keys, which become self.head_nums.'''
        io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
        self.p_STOP = p_STOP
        self.p_CHOOSE = p_CHOOSE
        self.p_ROOT = p_ROOT
        # Iterating a dict yields its keys, same as the old iterkeys() loop.
        self.head_nums = list(numtag)
class DMV_Rule(io.CNF_Rule):
    '''A single CNF rule in the PCFG, of the form
    LHS -> L R
    where LHS, L and R are 'nodes', eg. of the form (seals, head).

    Public members:
    probN, probA

    Private members:
    __L, __R, __LHS

    Different rule-types have different probabilities associated with
    them:

    _h_ -> STOP h_   P( STOP|h,L, adj)
    _h_ -> STOP h_   P( STOP|h,L,non_adj)
    h_ -> h STOP     P( STOP|h,R, adj)
    h_ -> h STOP     P( STOP|h,R,non_adj)
    h_ -> _a_ h_     P(-STOP|h,L, adj) * P(a|h,L)
    h_ -> _a_ h_     P(-STOP|h,L,non_adj) * P(a|h,L)
    h -> h _a_       P(-STOP|h,R, adj) * P(a|h,R)
    h -> h _a_       P(-STOP|h,R,non_adj) * P(a|h,R)
    '''
    def p(self, adj, *arg):
        '''Adjacent probability if adj is true, else the non-adjacent
        one. Any further positional arguments are ignored.'''
        return self.probA if adj else self.probN

    def p_STOP(self, s, t, loc_h):
        '''Probability of this STOP rule over the fragment s..t whose
        head sits at loc_h; the adjacent value is used when the head is
        at the end of the fragment that the STOP is generated on.

        Returns None when neither child is STOP.'''
        if self.L() == STOP:
            return self.p(s == loc_h)
        if self.R() == STOP:
            if loc_h != s:
                # Right STOP is only scored when the head is leftmost.
                if 'TODO' in io.DEBUG:
                    print("(%s given loc_h:%d but s:%d. Todo: optimize away!)" % (self, loc_h, s))
                return 0.0
            return self.p(t == loc_h)

    def p_ATTACH(self, r, loc_h, s=None):
        '''Probability of this attachment rule when the fragment is
        split after position r and the head sits at loc_h; the adjacent
        value is used when nothing lies between head and split.

        Returns None when LHS equals neither child.'''
        parent = self.LHS()
        if parent == self.L():
            # Head is the left child; it is only scored when leftmost.
            if loc_h != s:
                if 'TODO' in io.DEBUG:
                    print("(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)" % (self, loc_h, s))
                return 0.0
            return self.p(r == loc_h)
        if parent == self.R():
            return self.p(r+1 == loc_h)

    def seals(self):
        '''Seals component of this rule's LHS.'''
        return seals(self.LHS())

    def head(self):
        '''Head component of this rule's LHS.'''
        return head(self.LHS())

    def __init__(self, LHS, L, R, probN, probA):
        '''Check that all three nodes carry a known seal value, then
        store the rule with its non-adjacent and adjacent probability.

        Raises ValueError for a seal value outside SEALS.'''
        for b_h in (LHS, L, R):
            given = seals(b_h)
            if given not in SEALS:
                raise ValueError("seals must be in %s; was given: %s"
                                 % (SEALS, given))
        io.CNF_Rule.__init__(self, LHS, L, R, probN)
        self.probA = probA # adjacent
        self.probN = probN # non_adj

    @classmethod # so we can call DMV_Rule.bar_str(b_h)
    def bar_str(cls, b_h, tag=lambda x:x):
        '''Printable form of node b_h; tag maps the head to its label.'''
        if b_h == ROOT:
            return 'ROOT'
        if b_h == STOP:
            return 'STOP'
        sealing = seals(b_h)
        if sealing == RGO_L:
            template = " %s_ "
        elif sealing == SEAL:
            template = "_%s_ "
        else:
            template = " %s "
        return template % tag(head(b_h))

    def __str__(self, tag=lambda x:x):
        '''Printable rule followed by its two probabilities.'''
        lhs = self.bar_str(self.LHS(), tag)
        left = self.bar_str(self.L(), tag)
        right = self.bar_str(self.R(), tag)
        return "%s-->%s %s\t[N %.2f] [A %.2f]" % (lhs, left, right,
                                                  self.probN, self.probA)
251 ###################################
252 # dmv-specific version of inner() #
253 ###################################
def locs(h, sent, s=0, t=None, remove=None):
    '''Return the locations of h in sent, or some fragment of sent (in the
    latter case we make sure to offset the locations correctly so that
    for any x in the returned list, sent[x]==h).

    t is inclusive, to match the way indices work with inner()
    (although python list-splicing has "exclusive" end indices).

    h      -- the item to look for
    sent   -- the sequence to search
    s      -- first index of the fragment (default 0)
    t      -- last index of the fragment, inclusive (default: end of sent)
    remove -- a single location to leave out of the result, if given'''
    # Identity test: t may legitimately be 0, only None means "not given".
    if t is None:
        t = len(sent) - 1
    return [i + s for i, w in enumerate(sent[s:t + 1])
            if w == h and (i + s) != remove]
def inner(s, t, LHS, loc_h, g, sent, ichart):
    ''' A rewrite of io.inner(), to take adjacency into account.

    The ichart is now of this form:
    ichart[s,t,LHS, loc_h]

    loc_h gives adjacency (along with r and location of other child
    for attachment rules), and is needed in P_STOP reestimation.

    Todo: if possible, refactor (move dmv-specific stuff back into
    dmv, so this is "general" enough to be in io.py)

    Arguments, as they are used below:
    s, t   -- inclusive start and end index into sent
    LHS    -- the node (seals, head) being rewritten
    loc_h  -- index in sent at which the head of LHS is located
    g      -- grammar; sent_nums(), sent_rules() and p_terminals are used
    sent   -- the sentence; sent[i] is looked up in g.p_terminals
    ichart -- dict used as memo; it is filled in place, and its 'tree'
              entry is overwritten on every call

    Returns the inner probability of LHS over s..t with head at loc_h.

    NOTE(review): the nesting of this body was reconstructed from a
    copy that had lost its indentation -- compare with the repository.
    '''
    def O(s):
        # the terminal at position s
        return sent[s]

    sent_nums = g.sent_nums(sent)
    # Maps a chart key to the set of child chart keys that contributed a
    # non-zero probability to it.
    tree = {}

    def e(s,t,LHS, loc_h, n_t):
        # n_t is the recursion depth, only used to indent debug output
        def tab():
            "Tabs for debug output"
            return "\t"*n_t

        if (s, t, LHS, loc_h) in ichart:
            # memo hit
            if 'INNER' in io.DEBUG:
                print "%s*= %.4f in ichart: s:%d t:%d LHS:%s loc:%d" % (tab(),ichart[s, t, LHS, loc_h], s, t,
                                                                        DMV_Rule.bar_str(LHS), loc_h)
            return ichart[s, t, LHS, loc_h]
        else:
            if s == t and seals(LHS) == GO_R:
                # One-word span with an unsealed node: a terminal lookup,
                # which is not stored in ichart.
                if not loc_h == s:
                    if 'INNER' in io.DEBUG:
                        print "%s*= 0.0 (wrong loc_h)" % tab()
                    return 0.0
                elif (LHS, O(s)) in g.p_terminals:
                    prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
                else:
                    # todo: assuming this is how to deal w/lacking
                    # rules, since we add prob.s, and 0 is identity
                    prob = 0.0
                    if 'INNER' in io.DEBUG:
                        print "%sLACKING TERMINAL:" % tab()
                # todo: add to ichart perhaps? Although, it _is_ simple lookup..
                if 'INNER' in io.DEBUG:
                    print "%s*= %.4f (terminal: %s -> %s_%d)" % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h)
                return prob
            else:
                p = 0.0 # "sum over j,k in a[LHS,j,k]"
                for rule in g.sent_rules(LHS, sent_nums):
                    if 'INNER' in io.DEBUG:
                        print "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h)
                    L = rule.L()
                    R = rule.R()
                    if (s,t,LHS,loc_h) not in tree:
                        tree[s,t,LHS,loc_h] = set()
                    # Skip rules whose head child would have to start or
                    # end at loc_h with no room left for the other child.
                    if loc_h == t and rule.LHS() == L:
                        continue # todo: speed-test
                    if loc_h == s and rule.LHS() == R:
                        continue
                    # if it's a STOP rule, rewrite for the same range:
                    if (L == STOP) or (R == STOP):
                        if L == STOP:
                            pLR = e(s, t, R, loc_h, n_t+1)
                            if pLR > 0.0:
                                tree[s,t,LHS,loc_h].add((s,t,R,loc_h))
                        elif R == STOP:
                            pLR = e(s, t, L, loc_h, n_t+1)
                            if pLR > 0.0:
                                tree[s,t,LHS,loc_h].add((s,t,L,loc_h))
                        p += rule.p_STOP(s, t, loc_h) * pLR
                        if 'INNER' in io.DEBUG:
                            print "%sp= %.4f (STOP)" % (tab(), p)

                    elif t > s: # not a STOP, attachment rewrite:
                        rp_ATTACH = rule.p_ATTACH # todo: profile/speedtest
                        # r is the last index of the left child
                        for r in xrange(s, t):
                            p_h = rp_ATTACH(r, loc_h, s=s)
                            # The child sharing the rule's LHS keeps the
                            # head at loc_h; the other child may have its
                            # head anywhere in its span except at loc_h.
                            if rule.LHS() == L:
                                locs_L = [loc_h]
                                locs_R = locs(head(R), sent_nums, r+1, t, loc_h)
                            elif rule.LHS() == R:
                                locs_L = locs(head(L), sent_nums, s, r, loc_h)
                                locs_R = [loc_h]
                            for loc_L in locs_L:
                                pL = e(s, r, L, loc_L, n_t+1)
                                if pL > 0.0:
                                    for loc_R in locs_R:
                                        pR = e(r+1, t, R, loc_R, n_t+1)
                                        if pR > 0.0: # and pL > 0.0
                                            tree[s,t,LHS,loc_h].add(( s ,r,L,loc_L))
                                            tree[s,t,LHS,loc_h].add((r+1,t,R,loc_R))
                                            p += pL * p_h * pR
                                            if 'INNER' in io.DEBUG:
                                                print "%sp= %.4f (ATTACH)" % (tab(), p)
                ichart[s, t, LHS, loc_h] = p
                return p
    # end of e-function

    inner_prob = e(s,t,LHS,loc_h, 0)
    # NOTE(review): this stores a fresh empty dict, not the `tree`
    # collected above, so the collected tree is dropped -- intended?
    ichart['tree'] = {}
    if 'INNER' in io.DEBUG:
        print debug_ichart(g,sent,ichart)
    return inner_prob
# end of dmv.inner(s, t, LHS, loc_h, g, sent, ichart)
def debug_ichart(g,sent,ichart):
    '''Return a printable dump of ichart, one line per chart entry.

    g      -- grammar; g.numtag is handed to DMV_Rule.bar_str as tag
    sent   -- the sentence the chart was built for
    ichart -- dict keyed by (s, t, LHS, loc_h); entries whose value is
              a dict (the 'tree' entry) are left out'''
    lines = ["---ICHART:---\n"]
    for key, v in ichart.items():
        # Check the value before unpacking the key, so that skipping
        # 'tree' does not depend on that string having four characters.
        if isinstance(v, dict): # skip 'tree'
            continue
        s, t, LHS, loc_h = key
        # The span runs from sent[s] to sent[t]; the right end used to be
        # printed as sent[s] as well.
        lines.append("%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f\n"
                     % (DMV_Rule.bar_str(LHS,g.numtag),
                        sent[s], s, sent[t], t, loc_h, v))
    lines.append("---ICHART:end---\n")
    return "".join(lines)
def inner_sent(g, sent, ichart):
    '''Inner probability of the whole sentence: ROOT over the full span,
    summed over every possible head location.'''
    last = len(sent) - 1
    total = 0
    for loc_h in range(len(sent)):
        total += inner(0, last, ROOT, loc_h, g, sent, ichart)
    return total
def c(s,t,LHS,loc_h,g,sent,ichart,ochart):
    '''inner * outer for LHS over s..t with head at loc_h, divided by
    the sentence probability; the sentence probability itself is
    returned when it is not positive.'''
    # assuming P_sent = P(D(ROOT)) = inner(sent). todo: check K&M about this
    p_sent = inner_sent(g, sent, ichart)
    p_in = inner(s,t,LHS,loc_h,g,sent,ichart)
    p_out = outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
    if not p_sent > 0.0:
        return p_sent
    return p_in * p_out / p_sent
400 ###################################
401 # dmv-specific version of outer() #
402 ###################################
def outer(s,t,Node,loc_N, g, sent, ichart, ochart):
    ''' http://www.student.uib.no/~kun041/dmvccm/DMVCCM.html#outer

    Outer probability of Node over s..t (inclusive) with its head at
    loc_N.

    g      -- grammar; sent_nums(), mothersL() and mothersR() are used
    sent   -- the sentence
    ichart -- memo dict handed on to inner()
    ochart -- memo dict for this function, keyed (s, t, Node, loc_N),
              filled in place
    '''
    def e(s,t,LHS,loc_h):
        # or we could just look it up in ichart, assuming ichart to be done
        return inner(s, t, LHS, loc_h, g, sent, ichart)

    T = len(sent)-1
    sent_nums = g.sent_nums(sent)

    def f(s,t,Node,loc_N):
        # Look up with the same 4-tuple key that results are stored
        # under. The old test used (s,t,Node), which never matched, so
        # nothing was ever reused.
        key = (s, t, Node, loc_N)
        if key in ochart:
            return ochart[key]
        if Node == ROOT:
            if s == 0 and t == T:
                return 1.0
            else: # ROOT may only be used on full sentence
                return 0.0 # but we may have non-ROOTs over full sentence too
        p = 0.0

        for mom in g.mothersL(Node, sent_nums): # mom.L() == Node
            R = mom.R()
            mLHS = mom.LHS()
            if R == STOP:
                p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
            else:
                if seals(mLHS) == RGO_L: # left attachment, head(mLHS) == head(R)
                    for r in xrange(t+1,T+1): # t+1 to lasT
                        for loc_m in locs(head(mLHS),sent_nums,t+1,r):
                            p_m = mom.p(t+1 == loc_m)
                            p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_m)
                else: # right attachment, head(mLHS) == head(Node)
                    loc_m = loc_N
                    p_m = mom.p( t == loc_m)
                    for r in xrange(t+1,T+1): # t+1 to lasT
                        # loc_R is where the head of the sibling R sits,
                        # so search for head(R). This used to search for
                        # head(mLHS), the mother's own head.
                        for loc_R in locs(head(R),sent_nums,t+1,r):
                            p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_R)

        for mom in g.mothersR(Node, sent_nums): # mom.R() == Node
            L = mom.L()
            mLHS = mom.LHS()
            if L == STOP:
                p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
            else:
                if seals(mLHS) == RGO_L: # left attachment, head(mLHS) == head(Node)
                    loc_m = loc_N
                    p_m = mom.p( s == loc_m)
                    for r in xrange(0,s): # first to s-1
                        for loc_L in locs(head(L),sent_nums,r,s-1):
                            p += e(r,s-1,L, loc_L) * p_m * f(r,t,mLHS,loc_m)
                else: # right attachment, head(mLHS) == head(L)
                    for r in xrange(0,s): # first to s-1
                        for loc_m in locs(head(mLHS),sent_nums,r,s-1):
                            p_m = mom.p(s-1 == loc_m)
                            p += e(r,s-1,L, loc_m) * p_m * f(r,t,mLHS,loc_m)
        ochart[key] = p
        return p

    return f(s,t,Node,loc_N)
# end outer(s,t,Node,loc_N, g,sent, ichart,ochart)
467 ##############################
468 # reestimation, todo: #
469 ##############################
def reestimate_zeros(h_nums):
    '''Return a dict with a zeroed numerator and denominator for each
    of the four STOP counts of every head in h_nums.

    Keys have the form (kind, 'num' or 'den', h), with kind one of
    'LNSTOP', 'LASTOP', 'RNSTOP', 'RASTOP'.'''
    # todo: p_ROOT, p_CHOOSE, p_terminals
    kinds = ('LNSTOP', 'LASTOP', 'RNSTOP', 'RASTOP')
    f = {}
    for h in h_nums:
        for kind in kinds:
            for part in ('num', 'den'):
                f[(kind, part, h)] = 0.0
    return f
def reestimate(g, corpus):
    '''current todo.
    P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)

    Reestimate, in place, probN and probA of every non-ROOT STOP rule
    in g from counts c(...) summed over all sentences of corpus. Other
    rules are not touched here. Returns nothing.

    NOTE(review): the nesting of this body was reconstructed from a
    copy that had lost its indentation -- compare with the repository.
    '''
    # numerator/denominator accumulators, keyed (kind, 'num'|'den', h)
    f = reestimate_zeros(g.head_nums)
    ichart = {}
    ochart = {}
    def c_g(s,t,LHS,loc_h,sent):
        # reads whatever ichart/ochart are bound to when it is called,
        # i.e. the per-sentence charts created in the loop below
        return c(s,t,LHS,loc_h,g,sent,ichart,ochart)
    for sent in corpus:
        if 'reest' in io.DEBUG:
            print sent
        sent_nums = g.sent_nums(sent)
        # fresh charts for every sentence
        ichart = {}
        ochart = {}
        for loc_h,h in enumerate(sent_nums):
            for t in xrange(loc_h, len(sent)):
                for s in xrange(loc_h): # s<loc(h), range gives strictly less
                    # left non-adjacent stop
                    f[('LNSTOP','num',h)] += c_g(s, t, (SEAL, h), loc_h,sent)
                    f[('LNSTOP','den',h)] += c_g(s, t, (RGO_L,h), loc_h,sent)
                # left adjacent stop
                f[('LASTOP','num',h)] += c_g(loc_h, t, (SEAL, h), loc_h,sent)
                f[('LASTOP','den',h)] += c_g(loc_h, t, (RGO_L,h), loc_h,sent)
            for t in xrange(loc_h+1, len(sent)):
                # right non-adjacent stop
                f[('RNSTOP','num',h)] += c_g(loc_h, t, (RGO_L,h), loc_h,sent)
                f[('RNSTOP','den',h)] += c_g(loc_h, t, (GO_R, h), loc_h,sent)
            # right adjacent stop
            f[('RASTOP','num',h)] += c_g(loc_h, loc_h, (RGO_L,h), loc_h,sent)
            f[('RASTOP','den',h)] += c_g(loc_h, loc_h, (GO_R, h), loc_h,sent)

    # todo: use sum([ichart[s, t...] etc? but can we then
    # keep den and num separate within _one_ sum()-call? use map?

    # we want to go through only non-ROOT left-STOPs..
    for r in g.all_rules():
        if r.L() == STOP and not r.LHS() == ROOT:
            h = r.head()
            if 'reest' in io.DEBUG:
                old_probN = r.probN
                old_probA = r.probA
            # a zero denominator yields probability 0.0 instead of dividing
            if f[('LNSTOP','den',h)] > 0.0:
                r.probN = f[('LNSTOP','num',h)] / f[('LNSTOP','den',h)]
            else:
                r.probN = 0.0 # or..remove rule? todo
            if f[('LASTOP','den',h)] > 0.0:
                r.probA = f[('LASTOP','num',h)] / f[('LASTOP','den',h)]
            else:
                r.probA = 0.0 # or..remove rule? todo
            if 'reest' in io.DEBUG:
                print "p(STOP|%d=%s,L,N): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
                                                                          f[('LNSTOP','num',h)],
                                                                          f[('LNSTOP','den',h)],
                                                                          r.probN,
                                                                          old_probN)
                print "p(STOP|%d=%s,L,A): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
                                                                          f[('LASTOP','num',h)],
                                                                          f[('LASTOP','den',h)],
                                                                          r.probA,
                                                                          old_probA)
        # ..and the same for non-ROOT right-STOPs
        if r.R() == STOP and not r.LHS() == ROOT:
            h = r.head()
            if 'reest' in io.DEBUG:
                old_probN = r.probN
                old_probA = r.probA
            if f[('RNSTOP','den',h)] > 0.0:
                r.probN = f[('RNSTOP','num',h)] / f[('RNSTOP','den',h)]
            else:
                r.probN = 0.0 # or..remove rule? todo
            if f[('RASTOP','den',h)] > 0.0:
                r.probA = f[('RASTOP','num',h)] / f[('RASTOP','den',h)]
            else:
                r.probA = 0.0 # or..remove rule? todo
            if 'reest' in io.DEBUG:
                print "p(STOP|%d=%s,R,N): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
                                                                          f[('RNSTOP','num',h)],
                                                                          f[('RNSTOP','den',h)],
                                                                          r.probN,
                                                                          old_probN)
                print "p(STOP|%d=%s,R,A): %.4f / %.4f = %.4f (was: %.4f)"%(h,g.numtag(h),
                                                                          f[('RASTOP','num',h)],
                                                                          f[('RASTOP','den',h)],
                                                                          r.probA,
                                                                          old_probA)
575 ##############################
576 # testing functions: #
577 ##############################
# Toy corpus for the test functions below: sixteen short sentences,
# each split into a list of tag strings.
testcorpus = [s.split() for s in ['det nn vbd c vbd','vbd nn c vbd',
                                  'det nn vbd', 'det nn vbd c pp',
                                  'det nn vbd', 'det vbd vbd c pp',
                                  'det nn vbd', 'det nn vbd c vbd',
                                  'det nn vbd', 'det nn vbd c vbd',
                                  'det nn vbd', 'det nn vbd c vbd',
                                  'det nn vbd', 'det nn vbd c pp',
                                  'det nn vbd pp', 'det nn vbd', ]]
def testgrammar():
    '''Return whatever harmonic.initialize makes of testcorpus; the
    harmonic module is reloaded first.'''
    import harmonic
    reload(harmonic)
    grammar = harmonic.initialize(testcorpus)
    return grammar
def testreestimation():
    '''Run reestimate() once on the test grammar and testcorpus, with
    'reest' debug output switched on.'''
    io.DEBUG.add('reest')
    reestimate(testgrammar(), testcorpus)
def testgrammar_a():
    '''Hand-written two-tag grammar (0 is 'h', 1 is 'a') for testing.'''
    H, A = 0, 1
    # Same rules, in the order they are handed to DMV_Grammar.
    #                   LHS          L           R         Non, Adj
    rule_list = [
        DMV_Rule((RGO_L,H), (SEAL,A),  (RGO_L,H), 0.4, 0.6), # Lattach to a
        DMV_Rule((GO_R,H),  (GO_R,H),  (SEAL,A),  1.0, 1.0), # Rattach to a
        DMV_Rule((RGO_L,A), (SEAL,H),  (RGO_L,A), 0.2, 0.1), # Lattach to h
        DMV_Rule((GO_R,A),  (GO_R,A),  (SEAL,H),  1.0, 1.0), # Rattach to h
        DMV_Rule(ROOT,      STOP,      (SEAL,A),  0.1, 0.1), # ROOT
        DMV_Rule((SEAL,A),  STOP,      (RGO_L,A), 1.0, 1.0), # LSTOP
        DMV_Rule((RGO_L,A), (GO_R,A),  STOP,      0.4, 0.3), # RSTOP
        DMV_Rule((RGO_L,A), (SEAL,A),  (RGO_L,A), 0.4, 0.6), # Lattach
        DMV_Rule((GO_R,A),  (GO_R,A),  (SEAL,A),  1.0, 1.0), # Rattach
        DMV_Rule(ROOT,      STOP,      (SEAL,H),  1.0, 1.0), # ROOT
        DMV_Rule((SEAL,H),  STOP,      (RGO_L,H), 0.9, 0.9), # LSTOP
        DMV_Rule((RGO_L,H), (GO_R,H),  STOP,      0.4, 0.3), # RSTOP
        DMV_Rule((RGO_L,H), (SEAL,H),  (RGO_L,H), 0.2, 0.1), # Lattach
        DMV_Rule((GO_R,H),  (GO_R,H),  (SEAL,H),  1.0, 1.0), # Rattach
    ]

    terminals = {((GO_R, H), 'h'): 1.0,
                 ((GO_R, A), 'a'): 1.0}

    # p_STOP, p_CHOOSE and p_ROOT are just given as 0 here
    return DMV_Grammar(rule_list, terminals, 0, 0, 0,
                       {0:'h',1:'a'}, {'h':0,'a':1})
def testgrammar_h():
    '''Hand-written one-tag grammar (0 is 'h') for testing.'''
    H = 0
    #                   LHS          L          R         Non, Adj
    rule_list = [
        DMV_Rule(ROOT,      STOP,     (SEAL,H),  1.0, 1.0), # ROOT
        DMV_Rule((SEAL,H),  STOP,     (RGO_L,H), 1.0, 1.0), # LSTOP
        DMV_Rule((RGO_L,H), (GO_R,H), STOP,      0.4, 0.3), # RSTOP
        DMV_Rule((RGO_L,H), (SEAL,H), (RGO_L,H), 0.6, 0.7), # Lattach
        DMV_Rule((GO_R,H),  (GO_R,H), (SEAL,H),  1.0, 1.0), # Rattach
    ]
    terminals = {((GO_R, H), 'h'): 1.0}

    # p_STOP, p_CHOOSE and p_ROOT are just given as 0 here
    return DMV_Grammar(rule_list, terminals, 0, 0, 0, {0:'h'}, {'h':0})
def testreestimation_h():
    '''Run reestimate() on the one-tag grammar and the single sentence
    "h h h", with 'reest' debug output switched on.'''
    io.DEBUG.add('reest')
    one_sentence_corpus = ['h h h'.split()]
    reestimate(testgrammar_h(), one_sentence_corpus)
def regression_tests():
    '''Compare a few inner/outer values for the one-tag grammar with
    known-good roundings; print a line for each one that differs.'''
    def complain_unless(expected, fmt, value):
        # expected is the rounded value as text, fmt the rounding used
        shown = fmt % value
        if expected != shown:
            print("Should be %s: %s" % (expected, shown))

    g_dup = testgrammar_h()

    complain_unless("0.120", "%.3f",
                    inner(0, 1, (SEAL,0), 0, g_dup, 'h h'.split(), {}))

    complain_unless("0.063", "%.3f",
                    inner(0, 1, (SEAL,0), 1, g_dup, 'h h'.split(), {}))

    complain_unless("0.0498", "%.4f",
                    inner(0, 2, (SEAL,0), 2, g_dup, 'h h h'.split(), {}))

    # the outer test gets a grammar of its own
    complain_unless("0.58", "%.2f",
                    outer(1,2,(1,0),2,testgrammar_h(),'h h h'.split(),{},{}))
if __name__ == "__main__":
    import timeit
    # import profile
    # profile.run('testreestimation()')
    # print timeit.Timer("dmv.testreestimation()",'''import dmv
    # reload(dmv)''').timeit(1)
    # testreestimation_h()
    io.DEBUG.clear()
    regression_tests()

    # Two sample outer probabilities for the two-tag grammar; each call
    # gets a fresh grammar and fresh charts. (RGO_L,0) is (1,0) and
    # (GO_R,0) is (0,0), as named in the printed labels.
    print("outer(0,0,(1,0),0,testgrammar_a(),'h a'.split(),{},{}):")
    print(outer(0, 0, (RGO_L,0), 0, testgrammar_a(), 'h a'.split(), {}, {}))
    print("outer(0,0,(0,0),0,testgrammar_a(),'h a'.split(),{},{}):")
    print(outer(0, 0, (GO_R,0), 0, testgrammar_a(), 'h a'.split(), {}, {}))

    io.DEBUG.clear()
    #print "testreestimation():"
    # testreestimation()