1 # before_betweens_dmv.py
3 # dmv reestimation and inside-outside probabilities using loc_h, but
4 # at-word sentence locations
6 #import numpy # numpy provides Fast Arrays, for future optimization
8 from common_dmv
import *
if __name__ == "__main__":
    # Announce the start of this module's self-tests when run as a script.
    print("before_betweens_dmv module tests:")
13 class DMV_Grammar(io
.Grammar
):
17 p_STOP, p_ROOT, p_CHOOSE, p_terminals
18 These are changed in the Maximation step, then used to set the
19 new probabilities of each DMV_Rule.
21 Todo: make p_terminals private? (But it has to be changable in
22 maximation step due to the short-cutting rules... could of course
23 make a DMV_Grammar function to update the short-cut rules...)
25 __p_rules is private, but we can still say stuff like:
26 for r in g.all_rules():
29 What other representations do we need? (P_STOP formula uses
30 deps_D(h,l/r) at least)'''
33 for r
in self
.all_rules():
34 str += "%s\n" % r
.__str
__(self
.numtag
)
38 return [r
for r
in self
.all_rules() if r
.POS() == h
]
def mothersL(self, Node, sent_nums, loc_N):
    '''Collect the rules that have Node as their left child.

    A rule r from self.all_rules() is kept when r.L() == Node and its
    right child is either STOP or has a POS found among the entries of
    sent_nums that come after index loc_N.

    Returns the kept rules as a list, in all_rules() order.
    '''
    # todo: speed-test with and without sent_nums/loc_N cut-off
    right_context = sent_nums[loc_N + 1:]
    mothers = []
    for rule in self.all_rules():
        if not rule.L() == Node:
            continue
        right = rule.R()
        if POS(right) in right_context or right == STOP:
            mothers.append(rule)
    return mothers
def mothersR(self, Node, sent_nums, loc_N):
    '''Collect the rules that have Node as their right child.

    A rule r from self.all_rules() is kept when r.R() == Node and its
    left child is either STOP or has a POS found among the entries of
    sent_nums that come before index loc_N.

    Returns the kept rules as a list, in all_rules() order.
    '''
    left_context = sent_nums[:loc_N]
    mothers = []
    for rule in self.all_rules():
        if not rule.R() == Node:
            continue
        left = rule.L()
        if POS(left) in left_context or left == STOP:
            mothers.append(rule)
    return mothers
50 return [r
for r
in self
.all_rules() if r
.LHS() == LHS
]
def sent_rules(self, LHS, sent_nums):
    '''Return the rules rewriting LHS whose children fit the sentence.

    A rule is kept when its LHS() equals LHS and the POS of both its
    left and its right child is found in sent_nums; POS(STOP) is always
    admitted as well.

    Used in dmv.inner. Todo: this scans all rules on every call and
    seems to take a lot of time; caching could help.
    '''
    # We don't want to rule out STOPs!
    allowed = sent_nums + [POS(STOP)]
    selected = []
    for rule in self.all_rules():
        if not rule.LHS() == LHS:
            continue
        if POS(rule.L()) in allowed and POS(rule.R()) in allowed:
            selected.append(rule)
    return selected
def deps_L(self, head): # todo: do I use this at all?
    '''Return the left child, r.L(), of every rule whose POS() equals head.

    The result is a list in all_rules() order; it is empty when no rule
    has the given head.

    NOTE(review): STOP children are not filtered out here -- confirm
    whether callers expect attachment rules only.
    '''
    # The comprehension must yield rule.L() directly: filtering on an
    # otherwise unbound name (`a == r.L()`) raises NameError as soon as
    # any rule matches the head.
    return [rule.L() for rule in self.all_rules() if rule.POS() == head]
def deps_R(self, head):
    '''Return the right child, r.R(), of every rule whose POS() equals head.

    The result is a list in all_rules() order; it is empty when no rule
    has the given head.

    NOTE(review): STOP children are not filtered out here -- confirm
    whether callers expect attachment rules only.
    '''
    # The comprehension must yield rule.R() directly: filtering on an
    # otherwise unbound name (`a == r.R()`) raises NameError as soon as
    # any rule matches the head.
    return [rule.R() for rule in self.all_rules() if rule.POS() == head]
69 def __init__(self
, numtag
, tagnum
, p_rules
, p_terminals
, p_STOP
, p_CHOOSE
, p_ROOT
):
70 io
.Grammar
.__init
__(self
, numtag
, tagnum
, p_rules
, p_terminals
)
72 self
.p_CHOOSE
= p_CHOOSE
74 self
.head_nums
= [k
for k
in numtag
.iterkeys()]
77 class DMV_Rule(io
.CNF_Rule
):
78 '''A single CNF rule in the PCFG, of the form
80 where LHS, L and R are 'nodes', eg. of the form (seals, head).
88 Different rule-types have different probabilities associated with
91 _h_ -> STOP h_ P( STOP|h,L, adj)
92 _h_ -> STOP h_ P( STOP|h,L,non_adj)
93 h_ -> h STOP P( STOP|h,R, adj)
94 h_ -> h STOP P( STOP|h,R,non_adj)
95 h_ -> _a_ h_ P(-STOP|h,L, adj) * P(a|h,L)
96 h_ -> _a_ h_ P(-STOP|h,L,non_adj) * P(a|h,L)
97 h -> h _a_ P(-STOP|h,R, adj) * P(a|h,R)
98 h -> h _a_ P(-STOP|h,R,non_adj) * P(a|h,R)
100 def p(self
, adj
, *arg
):
def adj(middle, loc_h):
    '''Tell whether middle coincides with one of the two positions in loc_h.

    middle is eg. k when rewriting for i<k<j (inside probabilities).
    loc_h is indexed at [0] and [1]; the second entry is only looked at
    when the first one does not match.
    '''
    if middle == loc_h[0]:
        return True
    return middle == loc_h[1]
110 def p_STOP(self
, s
, t
, loc_h
):
111 '''Returns the correct probability, adjacent if we're rewriting from
112 the (either left or right) end of the fragment.
115 return self
.p(s
== loc_h
)
116 elif self
.R() == STOP
:
119 print "(%s given loc_h:%d but s:%d. Todo: optimize away!)" % (self
, loc_h
, s
)
122 return self
.p(t
== loc_h
)
124 def p_ATTACH(self
, r
, loc_h
, s
=None):
125 '''Returns the correct probability, adjacent if we haven't attached
127 (This is actually p_choose*(1-p_stop).)'''
128 if self
.LHS() == self
.L():
129 if s
and not loc_h
== s
:
131 print "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)" % (self
, loc_h
, s
)
134 return self
.p(r
== loc_h
)
135 elif self
.LHS() == self
.R():
136 return self
.p(r
+1 == loc_h
)
139 return seals(self
.LHS())
142 return POS(self
.LHS())
def __init__(self, LHS, L, R, probN, probA):
    '''Set up a rule LHS -> L R carrying two probabilities.

    probN is stored as the non-adjacent probability (and is also the
    probability handed to io.CNF_Rule.__init__); probA is stored as the
    adjacent one.

    Raises ValueError when seals() of LHS, L or R is not a member of
    SEALS.
    '''
    for node in (LHS, L, R):
        node_seals = seals(node)
        if node_seals in SEALS:
            continue
        raise ValueError("seals must be in %s; was given: %s"
                         % (SEALS, node_seals))
    io.CNF_Rule.__init__(self, LHS, L, R, probN)
    self.probN = probN # non_adj
    self.probA = probA # adjacent
153 @classmethod # so we can call DMV_Rule.bar_str(b_h)
154 def bar_str(cls
, b_h
, tag
=lambda x
:x
):
159 elif(seals(b_h
) == RGOL
):
160 return " %s_ " % tag(POS(b_h
))
161 elif(seals(b_h
) == SEAL
):
162 return "_%s_ " % tag(POS(b_h
))
164 return " %s " % tag(POS(b_h
))
167 def __str__(self
, tag
=lambda x
:x
):
168 return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self
.bar_str(self
.LHS(), tag
),
169 self
.bar_str(self
.L(), tag
),
170 self
.bar_str(self
.R(), tag
),
180 ###################################
181 # dmv-specific version of inner() #
182 ###################################
183 def locs(h
, sent
, s
=0, t
=None, remove
=None):
184 '''Return the locations of h in sent, or some fragment of sent (in the
185 latter case we make sure to offset the locations correctly so that
186 for any x in the returned list, sent[x]==h).
188 t is inclusive, to match the way indices work with inner()
189 (although python list-splicing has "exclusive" end indices)'''
192 return [i
+s
for i
,w
in enumerate(sent
[s
:t
+1])
193 if w
== h
and not (i
+s
) == remove
]
196 def inner(s
, t
, LHS
, loc_h
, g
, sent
, ichart
={}):
197 ''' A rewrite of io.inner(), to take adjacency into accord.
199 The ichart is now of this form:
200 ichart[s,t,LHS, loc_h]
202 loc_h gives adjacency (along with r and location of other child
203 for attachment rules), and is needed in P_STOP reestimation.
205 Todo: if possible, refactor (move dmv-specific stuff back into
206 dmv, so this is "general" enough to be in io.py)
212 sent_nums
= g
.sent_nums(sent
)
214 def e(s
,t
,LHS
, loc_h
, n_t
):
216 "Tabs for debug output"
219 if (s
, t
, LHS
, loc_h
) in ichart
:
221 print "%s*= %.4f in ichart: s:%d t:%d LHS:%s loc:%d" % (tab(),ichart
[s
, t
, LHS
, loc_h
], s
, t
,
222 DMV_Rule
.bar_str(LHS
), loc_h
)
223 return ichart
[s
, t
, LHS
, loc_h
]
225 if s
== t
and seals(LHS
) == GOR
:
228 print "%s*= 0.0 (wrong loc_h)" % tab()
230 elif (LHS
, O(s
)) in g
.p_terminals
:
231 prob
= g
.p_terminals
[LHS
, O(s
)] # "b[LHS, O(s)]" in Lari&Young
233 # todo: assuming this is how to deal w/lacking
234 # rules, since we add prob.s, and 0 is identity
237 print "%sLACKING TERMINAL:" % tab()
238 # todo: add to ichart perhaps? Although, it _is_ simple lookup..
240 print "%s*= %.4f (terminal: %s -> %s_%d)" % (tab(),prob
, DMV_Rule
.bar_str(LHS
), O(s
), loc_h
)
243 p
= 0.0 # "sum over j,k in a[LHS,j,k]"
244 for rule
in g
.sent_rules(LHS
, sent_nums
):
246 print "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule
,s
,t
,loc_h
)
249 if loc_h
== t
and LHS
== L
:
250 continue # todo: speed-test
251 if loc_h
== s
and LHS
== R
:
253 # if it's a STOP rule, rewrite for the same xrange:
254 if (L
== STOP
) or (R
== STOP
):
256 pLR
= e(s
, t
, R
, loc_h
, n_t
+1)
258 pLR
= e(s
, t
, L
, loc_h
, n_t
+1)
259 p
+= rule
.p_STOP(s
, t
, loc_h
) * pLR
261 print "%sp= %.4f (STOP)" % (tab(), p
)
263 elif t
> s
: # not a STOP, attachment rewrite:
264 rp_ATTACH
= rule
.p_ATTACH
# todo: profile/speedtest
265 for r
in xrange(s
, t
):
266 p_h
= rp_ATTACH(r
, loc_h
, s
=s
)
269 locs_R
= locs(POS(R
), sent_nums
, r
+1, t
, loc_h
)
271 locs_L
= locs(POS(L
), sent_nums
, s
, r
, loc_h
)
274 pL
= e(s
, r
, L
, loc_L
, n_t
+1)
277 pR
= e(r
+1, t
, R
, loc_R
, n_t
+1)
280 print "%sp= %.4f (ATTACH)" % (tab(), p
)
281 ichart
[s
, t
, LHS
, loc_h
] = p
285 inner_prob
= e(s
,t
,LHS
,loc_h
, 0)
287 print debug_ichart(g
,sent
,ichart
)
289 # end of dmv.inner(s, t, LHS, loc_h, g, sent, ichart={})
292 def debug_ichart(g
,sent
,ichart
):
293 str = "---ICHART:---\n"
294 for (s
,t
,LHS
,loc_h
),v
in ichart
.iteritems():
295 if type(v
) == dict: # skip 'tree'
297 str += "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f\n" % (DMV_Rule
.bar_str(LHS
,g
.numtag
),
298 sent
[s
], s
, sent
[s
], t
, loc_h
, v
)
299 str += "---ICHART:end---\n"
def inner_sent(g, sent, ichart=None):
    '''Return the inside probability of the whole sentence.

    Sums inner(0, len(sent)-1, ROOT, loc_h, ...) over every location
    loc_h in sent.

    g      -- the grammar handed on to inner()
    sent   -- the sentence; only its length is used here directly
    ichart -- inside chart that inner() reads and fills; pass a dict to
              reuse or inspect it.  When omitted, a fresh chart is used
              for this call.
    '''
    # A `{}` default would be one dict shared by every call.  Chart keys
    # are (s, t, LHS, loc_h) and identify neither the sentence nor the
    # grammar, so a shared default could return stale values computed
    # for a different sentence.
    if ichart is None:
        ichart = {}
    last = len(sent) - 1
    return sum(inner(0, last, ROOT, loc_h, g, sent, ichart)
               for loc_h in xrange(len(sent)))
308 ###################################
309 # dmv-specific version of outer() #
310 ###################################
311 def outer(s
,t
,Node
,loc_N
, g
, sent
, ichart
={}, ochart
={}):
312 ''' http://www.student.uib.no/~kun041/dmvccm/DMVCCM.html#outer
314 def e(s
,t
,LHS
,loc_h
):
315 # or we could just look it up in ichart, assuming ichart to be done
316 return inner(s
, t
, LHS
, loc_h
, g
, sent
, ichart
)
319 sent_nums
= g
.sent_nums(sent
)
321 def f(s
,t
,Node
,loc_N
):
322 if (s
,t
,Node
,loc_N
) in ochart
:
323 return ochart
[(s
, t
, Node
,loc_N
)]
325 if s
== 0 and t
== T
:
327 else: # ROOT may only be used on full sentence
328 return 0.0 # but we may have non-ROOTs over full sentence too
331 for mom
in g
.mothersL(Node
, sent_nums
, loc_N
): # mom.L() == Node
335 p
+= f(s
,t
,mLHS
,loc_N
) * mom
.p_STOP(s
,t
,loc_N
) # == loc_m
337 if seals(mLHS
) == RGOL
: # left attachment, POS(mLHS) == POS(R)
338 for r
in xrange(t
+1,T
+1): # t+1 to lasT
339 for loc_m
in locs(POS(mLHS
),sent_nums
,t
+1,r
):
340 p_m
= mom
.p(t
+1 == loc_m
)
341 p
+= f(s
,r
,mLHS
,loc_m
) * p_m
* e(t
+1,r
,R
,loc_m
)
342 elif seals(mLHS
) == GOR
: # right attachment, POS(mLHS) == POS(Node)
344 p_m
= mom
.p( t
== loc_m
)
345 for r
in xrange(t
+1,T
+1): # t+1 to lasT
346 for loc_R
in locs(POS(R
),sent_nums
,t
+1,r
):
347 p
+= f(s
,r
,mLHS
,loc_m
) * p_m
* e(t
+1,r
,R
,loc_R
)
349 for mom
in g
.mothersR(Node
, sent_nums
, loc_N
): # mom.R() == Node
353 p
+= f(s
,t
,mLHS
,loc_N
) * mom
.p_STOP(s
,t
,loc_N
) # == loc_m
355 if seals(mLHS
) == RGOL
: # left attachment, POS(mLHS) == POS(Node)
357 p_m
= mom
.p( s
== loc_m
)
358 for r
in xrange(0,s
): # first to s-1
359 for loc_L
in locs(POS(L
),sent_nums
,r
,s
-1):
360 p
+= e(r
,s
-1,L
, loc_L
) * p_m
* f(r
,t
,mLHS
,loc_m
)
361 elif seals(mLHS
) == GOR
: # right attachment, POS(mLHS) == POS(L)
362 for r
in xrange(0,s
): # first to s-1
363 for loc_m
in locs(POS(mLHS
),sent_nums
,r
,s
-1):
364 p_m
= mom
.p(s
-1 == loc_m
)
365 p
+= e(r
,s
-1,L
, loc_m
) * p_m
* f(r
,t
,mLHS
,loc_m
)
366 ochart
[s
,t
,Node
,loc_N
] = p
370 return f(s
,t
,Node
,loc_N
)
371 # end outer(s,t,Node,loc_N, g,sent, ichart,ochart)
375 ##############################
376 # reestimation, todo: #
377 ##############################
378 ## using local version instead
379 # def c(s,t,LHS,loc_h,g,sent,ichart={},ochart={}):
380 # # assuming P_sent = P(D(ROOT)) = inner(sent). todo: check K&M about this
381 # p_sent = inner_sent(g, sent, ichart)
382 # p_in = inner(s,t,LHS,loc_h,g,sent,ichart)
383 # p_out = outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
385 # return p_in * p_out / p_sent
389 def reest_zeros(h_nums
):
390 # todo: p_ROOT? ... p_terminals?
393 for stop
in ['LNSTOP','LASTOP','RNSTOP','RASTOP']:
394 for nd
in ['num','den']:
396 for choice
in ['RCHOOSE', 'LCHOOSE']:
397 f
[choice
,'den',h
] = 0.0
400 def reest_freq(g
, corpus
):
401 ''' P_STOP(-STOP|...) = 1 - P_STOP(STOP|...) '''
402 f
= reest_zeros(g
.head_nums
)
406 p_sent
= None # 50 % speed increase on storing this locally
407 def c_g(s
,t
,LHS
,loc_h
,sent
): # altogether 2x faster than the global c()
408 if (s
,t
,LHS
,loc_h
) in ichart
:
409 p_in
= ichart
[s
,t
,LHS
,loc_h
]
411 p_in
= inner(s
,t
,LHS
,loc_h
,g
,sent
,ichart
)
412 if (s
,t
,LHS
,loc_h
) in ochart
:
413 p_out
= ochart
[s
,t
,LHS
,loc_h
]
415 p_out
= outer(s
,t
,LHS
,loc_h
,g
,sent
,ichart
,ochart
)
418 return p_in
* p_out
/ p_sent
422 def w_g(s
,t
,a
,loc_a
,LHS
,loc_h
,sent
):
423 "Todo: should sum through all r in between s and t in sent(_nums)"
427 return e_L
* e_R
* f_g(s
,t
,(GOR
, h
), loc_h
, sent
) * p_g(r
,(GOR
, h
), (GOR
, h
), (SEAL
, a
), loc_h
, sent_nums
)
429 return e_L
* e_R
* f_g(s
,t
,(RGOL
, h
), loc_h
, sent
) * p_g(r
,(RGOL
, h
),(SEAL
, a
),(RGOL
, h
),loc_h
,sent_nums
)
431 def f_g(s
,t
,LHS
,loc_h
,sent
): # todo: test with choose rules
432 if (s
,t
,LHS
,loc_h
) in ochart
:
433 return ochart
[s
,t
,LHS
,loc_h
]
435 return outer(s
,t
,LHS
,loc_h
,g
,sent
,ichart
,ochart
)
437 def e_g(s
,t
,LHS
,loc_h
,sent
): # todo: test with choose rules
438 if (s
,t
,LHS
,loc_h
) in ichart
:
439 return ichart
[s
,t
,LHS
,loc_h
]
441 return inner(s
,t
,LHS
,loc_h
,g
,sent
,ichart
)
443 def p_g(r
,LHS
,L
,R
,loc_h
,sent
):
444 rules
= [rule
for rule
in g
.sent_rules(LHS
, sent
)
445 if rule
.L() == L
and rule
.R() == R
]
448 raise Exception("Several rules matching a[i,j,k]")
449 return rule
.p_ATTACH(r
,loc_h
)
456 p_sent
= inner_sent(g
, sent
, ichart
)
458 sent_nums
= g
.sent_nums(sent
)
459 # todo: use sum([ichart[s, t...] etc? but can we then
460 # keep den and num separate within _one_ sum()-call?
461 for loc_h
,h
in enumerate(sent_nums
):
462 for t
in xrange(loc_h
, len(sent
)):
463 for s
in xrange(loc_h
): # s<loc(h), xrange gives strictly less
464 # left non-adjacent stop:
465 f
['LNSTOP','num',h
] += c_g(s
, t
, (SEAL
, h
), loc_h
,sent
)
466 f
['LNSTOP','den',h
] += c_g(s
, t
, (RGOL
,h
), loc_h
,sent
)
467 # left adjacent stop:
468 f
['LASTOP','num',h
] += c_g(loc_h
, t
, (SEAL
, h
), loc_h
,sent
)
469 f
['LASTOP','den',h
] += c_g(loc_h
, t
, (RGOL
,h
), loc_h
,sent
)
470 for t
in xrange(loc_h
+1, len(sent
)):
471 # right non-adjacent stop:
472 f
['RNSTOP','num',h
] += c_g(loc_h
, t
, (RGOL
,h
), loc_h
,sent
)
473 f
['RNSTOP','den',h
] += c_g(loc_h
, t
, (GOR
, h
), loc_h
,sent
)
474 # right adjacent stop:
475 f
['RASTOP','num',h
] += c_g(loc_h
, loc_h
, (RGOL
,h
), loc_h
,sent
)
476 f
['RASTOP','den',h
] += c_g(loc_h
, loc_h
, (GOR
, h
), loc_h
,sent
)
478 # right attachment: TODO: try with p*e*e*f instead of c, for numerator
479 if 'reest_attach' in DEBUG
:
480 print "Rattach %s: for t in %s"%(g
.numtag(h
),sent
[loc_h
+1:len(sent
)])
481 for t
in xrange(loc_h
+1, len(sent
)):
482 cM
= c_g(loc_h
,t
,(GOR
, h
), loc_h
, sent
) # v_q in L&Y
483 f
['RCHOOSE','den',h
] += cM
484 if 'reest_attach' in DEBUG
:
485 print "\tc_g( %d , %d, %s, %s, sent)=%.4f"%(loc_h
,t
,g
.numtag(h
),loc_h
,cM
)
486 args
= {} # for summing w_q's in L&Y, without 1/P_q
487 for r
in xrange(loc_h
+1, t
+1): # loc_h < r <= t
488 e_L
= e_g(loc_h
, r
-1, (GOR
, h
), loc_h
, sent
)
489 if 'reest_attach' in DEBUG
:
490 print "\t\te_g( %d , %d, %s, %d, sent)=%.4f"%(loc_h
,r
-1,g
.numtag(h
),loc_h
,e_L
)
491 for i
,a
in enumerate(sent_nums
[r
:t
+1]):
493 e_R
= e_g(r
, t
, (SEAL
, a
), loc_a
, sent
)
496 args
[a
] += e_L
* e_R
* f_g(loc_h
,t
,(GOR
, h
), loc_h
, sent
) * p_g(r
,(GOR
, h
), (GOR
, h
), (SEAL
, a
), loc_h
, sent_nums
)
497 for a
,sum_a
in args
.iteritems():
498 f
['RCHOOSE','num',h
,a
] = sum_a
/ p_sent
502 if 'reest_attach' in DEBUG
:
503 print "Lattach %s: for s in %s"%(g
.numtag(h
),sent
[0:loc_h
])
504 for s
in xrange(0, loc_h
):
505 if 'reest_attach' in DEBUG
:
506 print "\tfor t in %s"%sent
[loc_h
:len(sent
)]
507 for t
in xrange(loc_h
, len(sent
)):
508 c_M
= c_g(s
,t
,(RGOL
, h
), loc_h
, sent
) # v_q in L&Y
509 f
['LCHOOSE','den',h
] += c_M
510 if 'reest_attach' in DEBUG
:
511 print "\t\tc_g( %d , %d, %s_, %s, sent)=%.4f"%(s
,t
,g
.numtag(h
),loc_h
,c_M
)
512 if 'reest_attach' in DEBUG
:
513 print "\t\tfor r in %s"%(sent
[s
:loc_h
])
514 args
= {} # for summing w_q's in L&Y, without 1/P_q
515 for r
in xrange(s
, loc_h
): # s <= r < loc_h <= t
516 e_R
= e_g(r
+1, t
, (RGOL
, h
), loc_h
, sent
)
517 if 'reest_attach' in DEBUG
:
518 print "\t\te_g( %d , %d, %s_, %d, sent)=%.4f"%(r
+1,t
,g
.numtag(h
),loc_h
,e_R
)
519 for i
,a
in enumerate(sent_nums
[s
:r
+1]):
521 e_L
= e_g( s
, r
, (SEAL
, a
), loc_a
, sent
)
524 args
[a
] += e_L
* e_R
* f_g(s
,t
,(RGOL
, h
), loc_h
, sent
) * p_g(r
,(RGOL
, h
),(SEAL
, a
),(RGOL
, h
),loc_h
,sent_nums
)
525 for a
,sum_a
in args
.iteritems():
526 f
['LCHOOSE', 'num',h
,a
] = sum_a
/ p_sent
529 def reestimate(g
, corpus
):
531 f
= reest_freq(g
, corpus
)
532 # we want to go through only non-ROOT left-STOPs..
533 for r
in g
.all_rules():
538 def reest_rule(r
,f
, g
): # g just for numtag / debug output, remove eventually?
539 "remove 0-prob rules? todo"
542 return None # not sure what todo yet here
543 if r
.L() == STOP
or POS(r
.R()) == h
:
545 elif r
.R() == STOP
or POS(r
.L()) == h
:
548 raise Exception("Odd rule in reestimation.")
550 p_stopN
= f
[dir+'NSTOP','den',h
]
552 p_stopN
= f
[dir+'NSTOP','num',h
] / p_stopN
554 p_stopA
= f
[dir+'ASTOP','den',h
]
556 p_stopA
= f
[dir+'ASTOP','num',h
] / p_stopA
558 if r
.L() == STOP
or r
.R() == STOP
: # stop rules
560 print "p(STOP|%d=%s,%s,N): %.4f (was: %.4f)"%(h
,g
.numtag(h
),dir, p_stopN
, r
.probN
)
561 print "p(STOP|%d=%s,%s,A): %.4f (was: %.4f)"%(h
,g
.numtag(h
),dir, p_stopA
, r
.probA
)
565 else: # attachment rules
566 pchoose
= f
[dir+'CHOOSE','den',h
]
568 if POS(r
.R()) == h
: # left attachment
570 elif POS(r
.L()) == h
: # right attachment
572 pchoose
= f
[dir+'CHOOSE','num',h
,a
] / pchoose
573 r
.probN
= (1-p_stopN
) * pchoose
574 r
.probA
= (1-p_stopA
) * pchoose
576 print "p(%d=%s|%d=%s,%s): %.4f,\tprobN: %.4f, probA: %.4f"%(a
,g
.numtag(a
),h
,g
.numtag(h
),dir, pchoose
,r
.probN
,r
.probA
)
584 ##############################
585 # testing functions: #
586 ##############################
# Toy sentences for the test functions, written as space-separated tags.
_TEST_SENTENCES = (
    'det nn vbd c vbd',
    'vbd nn c vbd',
    'det nn vbd',
    'det nn vbd c pp',
    'det nn vbd',
    'det vbd vbd c pp',
    'det nn vbd',
    'det nn vbd c vbd',
    'det nn vbd',
    'det nn vbd c vbd',
    'det nn vbd',
    'det nn vbd c vbd',
    'det nn vbd',
    'det nn vbd c pp',
    'det nn vbd pp',
    'det nn vbd',
)

# The corpus proper: every sentence split into its list of tag strings.
testcorpus = [s.split() for s in _TEST_SENTENCES]
598 import before_betweens_harmonic
599 reload(before_betweens_harmonic
)
600 return before_betweens_harmonic
.initialize(testcorpus
)
602 def testreestimation():
604 f
= reestimate(g
, testcorpus
)
605 f_stops
= {('LNSTOP', 'den', 3): 12.212773236178391, ('RASTOP', 'den', 2): 4.0, ('RNSTOP', 'num', 4): 2.5553487221351365, ('LNSTOP', 'den', 2): 1.274904052793207, ('LASTOP', 'num', 1): 14.999999999999995, ('RASTOP', 'den', 3): 15.0, ('LASTOP', 'num', 4): 16.65701084787457, ('LASTOP', 'num', 0): 4.1600647714443468, ('LNSTOP', 'den', 4): 6.0170669155897105, ('LASTOP', 'num', 3): 2.7872267638216113, ('LASTOP', 'num', 2): 2.9723139990470515, ('LASTOP', 'den', 2): 4.0, ('RNSTOP', 'den', 3): 12.945787931730905, ('LASTOP', 'den', 3): 14.999999999999996, ('RNSTOP', 'den', 2): 0.0, ('LASTOP', 'den', 0): 8.0, ('RASTOP', 'num', 4): 19.44465127786486, ('RNSTOP', 'den', 1): 3.1966410324085777, ('LASTOP', 'den', 1): 14.999999999999995, ('RASTOP', 'num', 3): 4.1061665495365558, ('RNSTOP', 'den', 0): 4.8282499043902476, ('LNSTOP', 'num', 4): 5.3429891521254289, ('RASTOP', 'num', 2): 4.0, ('LASTOP', 'den', 4): 22.0, ('RASTOP', 'num', 1): 12.400273895299103, ('LNSTOP', 'num', 2): 1.0276860009529487, ('RASTOP', 'num', 0): 3.1717500956097533, ('LNSTOP', 'num', 3): 12.212773236178391, ('RASTOP', 'den', 4): 22.0, ('RNSTOP', 'den', 4): 2.8705211946979836, ('LNSTOP', 'num', 0): 3.8399352285556518, ('LNSTOP', 'num', 1): 0.0, ('RNSTOP', 'num', 0): 4.8282499043902476, ('RNSTOP', 'num', 1): 2.5997261047008959, ('LNSTOP', 'den', 1): 0.0, ('RASTOP', 'den', 0): 8.0, ('RNSTOP', 'num', 2): 0.0, ('LNSTOP', 'den', 0): 4.6540557322109795, ('RASTOP', 'den', 1): 15.0, ('RNSTOP', 'num', 3): 10.893833450463443}
606 for k
,v
in f_stops
.iteritems():
608 print '''Regression!(?) Something changed in the P_STOP reestimation,
609 expected f[%s]=%.4f, but %s not in f'''%(k
,v
,k
)
611 elif not "%.10f"%f[k
] == "%.10f"%v
:
612 print '''Regression!(?) Something changed in the P_STOP reestimation,
613 expected f[%s]=%.4f, got f[%s]=%.4f.'''%(k
,v
,k
,f
[k
])
617 def testgrammar_a(): # Non, Adj
618 _h_
= DMV_Rule((SEAL
,0), STOP
, ( RGOL
,0), 1.0, 1.0) # LSTOP
619 h_S
= DMV_Rule(( RGOL
,0),(GOR
,0), STOP
, 0.4, 0.3) # RSTOP
620 h_A
= DMV_Rule(( RGOL
,0),(SEAL
,0),( RGOL
,0),0.2, 0.1) # Lattach
621 h_Aa
= DMV_Rule(( RGOL
,0),(SEAL
,1),( RGOL
,0),0.4, 0.6) # Lattach to a
622 h
= DMV_Rule((GOR
,0),(GOR
,0),(SEAL
,0), 1.0, 1.0) # Rattach
623 ha
= DMV_Rule((GOR
,0),(GOR
,0),(SEAL
,1), 1.0, 1.0) # Rattach to a
624 rh
= DMV_Rule( ROOT
, STOP
, (SEAL
,0), 0.9, 0.9) # ROOT
626 _a_
= DMV_Rule((SEAL
,1), STOP
, ( RGOL
,1), 1.0, 1.0) # LSTOP
627 a_S
= DMV_Rule(( RGOL
,1),(GOR
,1), STOP
, 0.4, 0.3) # RSTOP
628 a_A
= DMV_Rule(( RGOL
,1),(SEAL
,1),( RGOL
,1),0.4, 0.6) # Lattach
629 a_Ah
= DMV_Rule(( RGOL
,1),(SEAL
,0),( RGOL
,1),0.2, 0.1) # Lattach to h
630 a
= DMV_Rule((GOR
,1),(GOR
,1),(SEAL
,1), 1.0, 1.0) # Rattach
631 ah
= DMV_Rule((GOR
,1),(GOR
,1),(SEAL
,0), 1.0, 1.0) # Rattach to h
632 ra
= DMV_Rule( ROOT
, STOP
, (SEAL
,1), 0.1, 0.1) # ROOT
635 b2
[(GOR
, 0), 'h'] = 1.0
636 b2
[(GOR
, 1), 'a'] = 1.0
638 return DMV_Grammar({0:'h',1:'a'}, {'h':0,'a':1}, [ h_Aa
, ha
, a_Ah
, ah
, ra
, _a_
, a_S
, a_A
, a
, rh
, _h_
, h_S
, h_A
, h
],b2
,0,0,0)
def oa(s, t, LHS, loc_h):
    '''Shortcut: outer() on the sentence "h a" with testgrammar_a().

    Returns outer(s, t, LHS, loc_h, ...) computed with fresh, empty
    inside and outside charts.
    '''
    # Pass explicit charts instead of relying on outer()'s default
    # dicts: those defaults are shared by every default-argument call,
    # and chart keys do not identify the grammar or the sentence.
    return outer(s, t, LHS, loc_h, testgrammar_a(), 'h a'.split(), {}, {})
def ia(s, t, LHS, loc_h):
    '''Shortcut: inner() on the sentence "h a" with testgrammar_a().

    Returns inner(s, t, LHS, loc_h, ...) computed with a fresh, empty
    inside chart.
    '''
    # Pass an explicit chart instead of relying on inner()'s default
    # dict: that default is shared by every default-argument call, and
    # chart keys do not identify the grammar or the sentence.
    return inner(s, t, LHS, loc_h, testgrammar_a(), 'h a'.split(), {})
def ca(s, t, LHS, loc_h):
    '''Shortcut: p_in * p_out / p_sent on "h a" with testgrammar_a().

    p_in is inner(s, t, LHS, loc_h, ...), p_out is outer(s, t, LHS,
    loc_h, ...) and p_sent is inner_sent(...), all computed on the same
    grammar instance with charts private to this call.
    '''
    # The module-level c() exists in this file only as commented-out
    # code, so calling it would raise NameError (unless the star import
    # supplies one -- not visible here).  Its formula is applied
    # directly instead.
    g = testgrammar_a()
    sent = 'h a'.split()
    ichart, ochart = {}, {}
    p_sent = inner_sent(g, sent, ichart)
    p_in = inner(s, t, LHS, loc_h, g, sent, ichart)
    p_out = outer(s, t, LHS, loc_h, g, sent, ichart, ochart)
    return p_in * p_out / p_sent
646 def testgrammar_h(): # Non, Adj
647 _h_
= DMV_Rule((SEAL
,0), STOP
, ( RGOL
,0), 1.0, 1.0) # LSTOP
648 h_S
= DMV_Rule(( RGOL
,0),(GOR
,0), STOP
, 0.4, 0.3) # RSTOP
649 h_A
= DMV_Rule(( RGOL
,0),(SEAL
,0),( RGOL
,0), 0.6, 0.7) # Lattach
650 h
= DMV_Rule((GOR
,0),(GOR
,0),(SEAL
,0), 1.0, 1.0) # Rattach
651 rh
= DMV_Rule( ROOT
, STOP
, (SEAL
,0), 1.0, 1.0) # ROOT
653 b2
[(GOR
, 0), 'h'] = 1.0
655 return DMV_Grammar({0:'h'}, {'h':0}, [ rh
, _h_
, h_S
, h_A
, h
],b2
,0,0,0)
658 def testreestimation_h():
660 reestimate(g
,['h h h'.split()])
663 def regression_tests():
664 def test(wanted
, got
):
665 if not wanted
== got
:
666 print "Regression! Should be %s: %s" % (wanted
, got
)
668 g_dup
= testgrammar_h()
671 "%.3f" % inner(0, 1, (SEAL
,0), 0, g_dup
, 'h h'.split(), {}))
674 "%.3f" % inner(0, 1, (SEAL
,0), 1, g_dup
, 'h h'.split(), {}))
677 "%.4f" % inner(0, 2, (SEAL
,0), 2, g_dup
, 'h h h'.split(), {}))
680 "%.2f" % outer(1,2,(1,0),2,testgrammar_h(),'h h h'.split(),{},{}))
683 "%.4f" % outer(0,0,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
685 "%.4f" % outer(0,1,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
687 "%.4f" % outer(0,2,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
690 if __name__
== "__main__":
692 if __name__
== "__main__":
698 inners
= [(sent
, inner_sent(g
, sent
, {})) for sent
in testcorpus
]