3 # - prettier printout for DMV_Rule
4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
6 # - Started on P_STOP, a bit less pseudo now..
9 # - started on initialization. So far, I have frequencies for
10 # everything, very harmonic. Still need to make these into 1-summing
14 # - more work on initialization (init_freq and init_normalize),
15 # getting closer to probabilities now.
18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
19 # and also adds the relevant probabilities to p_rules in a grammar.
20 # Still, each individual rule has to store both adjacent and non_adj
21 # probabilities, and inner() should be able to send some parameter
22 # which lets the rule choose... hopefully... Is this possible to do
23 # top-down even? when the sentence could be all the same words?
24 # todo: extensive testing of identical words in sentences!
25 # - frequencies (only used in initialization) are stored as strings,
26 # but in the rules and p_STOP etc, there are only numbers.
29 # - copied inner() into this file, to make the very dmv-specific
30 # adjacency stuff work (have to factor that out later on, when it
34 # - finished typing in inner_dmv(), still have to test and debug
35 # it. The chart is now four times as big since for any rule we may
36 # have attachments to either the left or the right below, which
37 # upper rules depend on, for selecting probN or probA
40 # - fixed a number of little bugs in initialization, where certain
41 # rules were simply not created, or created "backwards"
42 # - inner_dmv() should Work now...
45 # - moved initialization to harmonic.py
48 # import numpy # numpy provides Fast Arrays, for future optimization
53 # non-tweakable/constant "lookup" globals
61 if __name__
== "__main__":
62 print "DMV module tests:"
66 '''Useless function, but just here as documentation. Nodes make up
67 LHS, R and L in each DMV_Rule'''
77 class DMV_Grammar(io
.Grammar
):
81 p_STOP, p_ROOT, p_CHOOSE, p_terminals
82 These are changed in the Maximation step, then used to set the
83 new probabilities of each DMV_Rule.
85 Todo: make p_terminals private? (But it has to be changable in
86 maximation step due to the short-cutting rules... could of course
87 make a DMV_Grammar function to update the short-cut rules...)
89 __p_rules is private, but we can still say stuff like:
90 for r in g.all_rules():
93 What other representations do we need? (P_STOP formula uses
94 deps_D(h,l/r) at least)'''
97 for r
in self
.all_rules():
98 str += "%s\n" % r
.__str
__(self
.numtag
)
101 def h_rules(self
, h
):
102 return [r
for r
in self
.all_rules() if r
.head() == h
]
104 def rules(self
, LHS
):
105 return [r
for r
in self
.all_rules() if r
.LHS() == LHS
]
107 def sent_rules(self
, LHS
, sent_nums
):
109 # We don't want to rule out STOPs!
110 sent_nums
.append( head(STOP
) )
111 return [r
for r
in self
.all_rules() if r
.LHS() == LHS
112 and head(r
.L()) in sent_nums
and head(r
.R()) in sent_nums
]
115 '''Not sure yet what is needed here, or where this is needed'''
118 def deps_L(self
, head
):
119 # todo test, probably this list comprehension doesn't work
120 return [a
for r
in self
.all_rules() if r
.head() == head
and a
== r
.L()]
122 def deps_R(self
, head
):
123 # todo test, probably this list comprehension doesn't work
124 return [a
for r
in self
.all_rules() if r
.head() == head
and a
== r
.R()]
126 def __init__(self
, p_rules
, p_terminals
, p_STOP
, p_CHOOSE
, p_ROOT
, numtag
, tagnum
):
127 io
.Grammar
.__init
__(self
, p_rules
, p_terminals
, numtag
, tagnum
)
129 self
.p_CHOOSE
= p_CHOOSE
133 class DMV_Rule(io
.CNF_Rule
):
134 '''A single CNF rule in the PCFG, of the form
136 where LHS, L and R are 'nodes', eg. of the form (bars, head).
144 Different rule-types have different probabilities associated with
147 _h_ -> STOP h_ P( STOP|h,L, adj)
148 _h_ -> STOP h_ P( STOP|h,L,non_adj)
149 h_ -> h STOP P( STOP|h,R, adj)
150 h_ -> h STOP P( STOP|h,R,non_adj)
151 h_ -> _a_ h_ P(-STOP|h,L, adj) * P(a|h,L)
152 h_ -> _a_ h_ P(-STOP|h,L,non_adj) * P(a|h,L)
153 h -> h _a_ P(-STOP|h,R, adj) * P(a|h,R)
154 h -> h _a_ P(-STOP|h,R,non_adj) * P(a|h,R)
156 def p(self
, adj
, *arg
):
162 def p_STOP(self
, s
, t
, loc_h
):
163 '''Returns the correct probability, adjacent if we're rewriting from
164 the (either left or right) end of the fragment. '''
166 return self
.p(s
== loc_h
)
167 elif self
.R() == STOP
:
169 io
.debug( "(%s given loc_h:%d but s:%d. Todo: optimize away!)"
173 return self
.p(t
== loc_h
)
175 def p_ATTACH(self
, r
, loc_h
, s
=None):
176 '''Returns the correct probability, adjacent if we haven't attached
178 if self
.LHS() == self
.L():
180 io
.debug( "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)"
184 return self
.p(r
== loc_h
)
185 elif self
.LHS() == self
.R():
186 return self
.p(r
+1 == loc_h
)
189 return bars(self
.LHS())
192 return head(self
.LHS())
194 def __init__(self
, LHS
, L
, R
, probN
, probA
):
195 for b_h
in [LHS
, L
, R
]:
196 if bars(b_h
) not in BARS
:
197 raise ValueError("bars must be in %s; was given: %s"
199 io
.CNF_Rule
.__init
__(self
, LHS
, L
, R
, probN
)
200 self
.probA
= probA
# adjacent
201 self
.probN
= probN
# non_adj
203 @classmethod # so we can call DMV_Rule.bar_str(b_h)
204 def bar_str(cls
, b_h
, tag
=lambda x
:x
):
209 elif(bars(b_h
) == RBAR
):
210 return " %s_ " % tag(head(b_h
))
211 elif(bars(b_h
) == LRBAR
):
212 return "_%s_ " % tag(head(b_h
))
214 return " %s " % tag(head(b_h
))
217 def __str__(self
, tag
=lambda x
:x
):
218 return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self
.bar_str(self
.LHS(), tag
),
219 self
.bar_str(self
.L(), tag
),
220 self
.bar_str(self
.R(), tag
),
230 ###################################
231 # dmv-specific version of inner() #
232 ###################################
233 def locs(h
, sent
, s
=0, t
=None):
234 '''Return the locations of h in sent, or some fragment of sent (in the
235 latter case we make sure to offset the locations correctly so that
236 for any x in the returned list, sent[x]==h).'''
239 return [i
+s
for i
,w
in enumerate(sent
[s
:t
]) if w
== h
]
242 def inner_dmv(s
, t
, LHS
, loc_h
, g
, sent
, chart
):
243 ''' A rewrite of inner in io.py, to take adjacency into accord.
245 The chart is now of this form:
246 chart[(s,t,LHS, loc_h)]
248 loc_h gives adjacency (along with r and location of other child
249 for attachment rules), and is needed in P_STOP reestimation.
251 Todo: if possible, refactor (move dmv-specific stuff back into
252 dmv, so this is "general" enough to be in io.py)
258 sent_nums
= [g
.tagnum(tag
) for tag
in sent
]
260 def e(s
,t
,LHS
, loc_h
, n_t
):
262 "Tabs for debug output"
265 if (s
, t
, LHS
, loc_h
) in chart
:
266 io
.debug("%s*= %.4f in chart: s:%d t:%d LHS:%s loc:%d"
267 %(tab(),chart
[(s
, t
, LHS
, loc_h
)], s
, t
,
268 DMV_Rule
.bar_str(LHS
), loc_h
))
269 return chart
[(s
, t
, LHS
, loc_h
)]
273 # terminals are always F,F for attachment
274 io
.debug("%s*= 0.0 (wrong loc_h)" % tab())
276 elif (LHS
, O(s
)) in g
.p_terminals
:
277 prob
= g
.p_terminals
[LHS
, O(s
)] # "b[LHS, O(s)]" in Lari&Young
279 # todo: assuming this is how to deal w/lacking
280 # rules, since we add prob.s, and 0 is identity
282 io
.debug( "%sLACKING TERMINAL:" % tab())
283 # todo: add to chart perhaps? Although, it _is_ simple lookup..
284 io
.debug( "%s*= %.4f (terminal: %s -> %s_%d)"
285 % (tab(),prob
, DMV_Rule
.bar_str(LHS
), O(s
), loc_h
) )
288 p
= 0.0 # "sum over j,k in a[LHS,j,k]"
289 for rule
in g
.sent_rules(LHS
, sent_nums
):
290 io
.debug( "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule
,s
,t
,loc_h
) )
293 # if it's a STOP rule, rewrite for the same range:
294 if (L
== STOP
) or (R
== STOP
):
296 pLR
= e(s
, t
, R
, loc_h
, n_t
+1)
298 pLR
= e(s
, t
, L
, loc_h
, n_t
+1)
299 p
+= rule
.p_STOP(s
, t
, loc_h
) * pLR
300 io
.debug( "%sp= %.4f (STOP)" % (tab(), p
) )
302 else: # not a STOP, an attachment rewrite:
303 for r
in range(s
, t
):
304 p_h
= rule
.p_ATTACH(r
, loc_h
, s
=s
)
307 locs_R
= locs(head(R
), sent_nums
, r
+1, t
+1)
308 elif rule
.LHS() == R
:
309 locs_L
= locs(head(L
), sent_nums
, s
, r
+1)
311 # see http://tinyurl.com/4ffhhw
312 p
+= sum([e(s
, r
, L
, loc_L
, n_t
+1) *
314 e(r
+1, t
, R
, loc_R
, n_t
+1)
316 for loc_R
in locs_R
])
317 io
.debug( "%sp= %.4f (ATTACH)" % (tab(), p
) )
318 chart
[(s
, t
, LHS
, loc_h
)] = p
322 inner_prob
= e(s
,t
,LHS
,loc_h
, 0)
325 for (s
,t
,LHS
,loc_h
),v
in chart
.iteritems():
326 print "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f" % (DMV_Rule
.bar_str(LHS
,g
.numtag
),
327 O(s
), s
, O(s
), t
, loc_h
, v
)
328 print "---CHART:end---"
330 # end of inner_dmv(s, t, LHS, loc_h, g, sent, chart)
332 def inner_sent_dmv(sent
, g
, chart
):
333 '''Possibly there's a more efficient way? Although, non-sentence heads
334 _will_ be ruled out by inner_dmv though.'''
336 for loc_h
,h_tag
in enumerate(sent
):
337 p
+= inner_dmv(0, len(sent
)-1, ROOT
, loc_h
, g
, sent
, chart
)
340 if __name__
== "__main__": # Non, Adj
341 _h_
= DMV_Rule((LRBAR
,0), STOP
, ( RBAR
,0), 1.0, 1.0) # LSTOP
342 h_S
= DMV_Rule(( RBAR
,0),(NOBAR
,0), STOP
, 0.4, 0.3) # RSTOP
343 h_A
= DMV_Rule(( RBAR
,0),(LRBAR
,0),( RBAR
,0), 0.6, 0.7) # Lattach
344 h
= DMV_Rule((NOBAR
,0),(NOBAR
,0),(LRBAR
,0), 1.0, 1.0) # Rattach
346 b2
[(NOBAR
, 0), 'h'] = 1.0
347 b2
[(RBAR
, 0), 'h'] = h_S
.probA
348 b2
[(LRBAR
, 0), 'h'] = h_S
.probA
* _h_
.probA
350 g_dup
= DMV_Grammar([ _h_
, h_S
, h_A
, h
],b2
,0,0,0, {0:'h'}, {'h':0})
353 test0
= inner_dmv(0, 1, (LRBAR
,0), 0, g_dup
, 'h h'.split(), {})
354 if not "0.120"=="%.3f" % test0
:
355 print "Should be 0.120: %.3f" % test0
357 test1
= inner_dmv(0, 1, (LRBAR
,0), 1, g_dup
, 'h h'.split(), {})
358 if not "0.063"=="%.3f" % test1
:
359 print "Should be 0.063: %.3f" % test1
361 test3
= inner_dmv(0, 2, (LRBAR
,0), 2, g_dup
, 'h h h'.split(), {})
362 if not "0.0498"=="%.4f" % test3
:
363 print "Should be 0.0498: %.4f" % test3
370 ##############################
371 # DMV-probabilities, todo: #
372 ##############################
379 '''Here it seems like they store rule information on a per-head (per
380 direction) basis, in deps_D(h, dir) which gives us a list. '''
383 for dir in ['l', 'r']:
384 for a
in deps(h
, dir):
387 P_STOP (0, h
, dir, adj
) * \
388 P_CHOOSE (a
, h
, dir) * \
390 P_STOP (STOP | h
, dir, adj
)
392 return P_h(root(sent
))
395 def P_STOP(STOP
, h
, dir, adj
, g
, corpus
):
396 '''corpus is a list of sentences s.
398 This is based on the formula where STOP is True... not sure how we
399 calculate if STOP is False.
401 I thought about instead having this:
403 for rule in g.p_rules:
407 for rule in g.p_rules:
410 set num and den using inner
411 for rule in g.p_rules
412 rule.prob = rule.num / rule.den
414 ..the way I'm assuming we do it in the commented out io-function in
415 io.py. Having sentences as the outer loop at least we can easily just
416 go through the heads that are actually in the sentence... BUT, this
417 means having to go through p_rules 3 times, not sure what is slower.
419 Also, now inner_dmv makes sure it only goes through heads that are
420 actually in the sentence, so that argument falls.
423 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
431 # have to go through _all_ places where h appears in the
432 # sentence...how? how to make sure it _works_?
434 inner_sent_dmv(sent
, g
, chart
) #todo, current. Later on,
435 #generalize to all heads, since
436 #we now have the chart of all
438 locs_h
= locs(h_tag
, sent
)
439 io
.debug( "locs_h:%s, sent:%s"%(locs_h
,sent
) , 2)
441 for s
in range(loc_h
): # s<loc(h), range gives strictly less
442 for t
in range(loc_h
, len(sent
)):
443 io
.debug( "s:%s t:%s loc:%d"%(s
,t
,loc_h
) , 2)
444 if (s
, t
, (LRBAR
,h
), loc_h
) in chart
:
445 io
.debug( "num+=%s"%chart
[(s
, t
, (LRBAR
,h
), loc_h
)] , 2)
446 P_STOP_num
+= chart
[(s
, t
, (LRBAR
,h
), loc_h
)]
447 if (s
, t
, (RBAR
,h
), loc_h
) in chart
:
448 io
.debug( "den+=%s"%chart
[(s
, t
, (LRBAR
,h
), loc_h
)] , 2)
449 P_STOP_den
+= chart
[(s
, t
, (RBAR
,h
), loc_h
)]
450 # todo: use sum([chart[(s, t...)] etc? but can we then
451 # keep den and num separate?
453 io
.debug( "num/den: %s / %s"%(P_STOP_num
, P_STOP_den
) , 2)
455 io
.debug( "num/den: %s / %s = %s"%(P_STOP_num
, P_STOP_den
,P_STOP_num
/ P_STOP_den
) , 2)
456 return P_STOP_num
/ P_STOP_den
# upside down in article
462 def testreestimation():
463 testcorpus
= [s
.split() for s
in ['det nn vbd c vbd','det nn vbd c nn vbd pp',
464 'det nn vbd', 'det vbd nn c vbd pp',
465 'det nn vbd', 'det vbd c nn vbd pp',
466 'det nn vbd', 'det nn vbd nn c vbd pp',
467 'det nn vbd', 'det nn vbd c det vbd pp',
468 'det nn vbd', 'det nn vbd c vbd det det det pp',
469 'det nn vbd', 'det nn vbd c vbd pp',
470 'det nn vbd', 'det nn vbd c vbd det pp',
471 'det nn vbd', 'det nn vbd c vbd pp',
472 'det nn vbd pp', 'det nn vbd det', ]]
473 g
= harmonic
.initialize(testcorpus
)
477 print "This will take some time. todo: figure out why it doesn't work"
478 for r
in g
.h_rules(h
):
481 # print "off-set the rule, see what happens:"
485 pstophln
= P_STOP(True, h
, 'L', 'N', g
, testcorpus
)
486 print "p(STOP|%s,L,N):%s"%(h_tag
,pstophln
)
488 for r
in g
.h_rules(h
):
497 def testreestimation_h():
498 _h_
= DMV_Rule((LRBAR
,0), STOP
, ( RBAR
,0), 1.0, 1.0) # LSTOP
499 h_S
= DMV_Rule(( RBAR
,0),(NOBAR
,0), STOP
, 0.4, 0.3) # RSTOP
500 h_A
= DMV_Rule(( RBAR
,0),(LRBAR
,0),( RBAR
,0), 0.6, 0.7) # Lattach
501 h
= DMV_Rule((NOBAR
,0),(NOBAR
,0),(LRBAR
,0), 1.0, 1.0) # Rattach
502 rh
= DMV_Rule( ROOT
, STOP
, (LRBAR
,0), 1.0, 1.0) # ROOT
504 b2
[(NOBAR
, 0), 'h'] = 1.0
505 b2
[(RBAR
, 0), 'h'] = h_S
.probA
506 b2
[(LRBAR
, 0), 'h'] = h_S
.probA
* _h_
.probA
508 g_dup
= DMV_Grammar([ rh
, _h_
, h_S
, h_A
, h
],b2
,0,0,0, {0:'h'}, {'h':0})
510 # test3 = inner_dmv(0, 2, (LRBAR,0), 2, g_dup, 'h h h'.split(), {})
513 print "todo: figure out why it doesn't work"
514 for r
in g_dup
.h_rules(h
):
517 # print "off-set the rule, see what happens:"
521 pstophln
= P_STOP(True, h
, 'L', 'N', g_dup
, ['h h h'.split()])
522 print "p(STOP|%s,L,N):%s"%(h_tag
,pstophln
)
524 for r
in g_dup
.h_rules(h
):
531 if __name__
== "__main__":
534 # timeit.Timer("dmv.testreestimation_h()",'''import dmv
535 # reload(dmv)''').timeit(1)
540 # todo: some more testing on the Brown corpus:
541 # # first five sentences of the Brown corpus:
542 # g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
543 # # 36:'AT' in g_brown.numtag, 40:'NP-TL'
546 # test_brown = inner_dmv(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
548 # for r in g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
551 # if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
553 # print "Brown-test gives: %.8f" % test_brown
557 # this will give the tag sequences of all the 6218 Brown corpus
558 # sentences of length < 7:
559 # [[tag for (w, tag) in sent]
560 # for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]