2 # todo: some more testing on the Brown corpus:
3 # # first five sentences of the Brown corpus:
4 # g_brown = harmonic.initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
5 # # 36:'AT' in g_brown.numtag, 40:'NP-TL'
8 # test_brown = inner(0,2, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
10 # for r in g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
13 # if head(L) in [36,40,-2] and head(R) in [36,40,-2]:
15 # print "Brown-test gives: %.8f" % test_brown
19 # this will give the tag sequences of all the 6218 Brown corpus
20 # sentences of length < 7:
21 # [[tag for (w, tag) in sent]
22 # for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
26 ##############################
28 ##############################
def prune2(s, t, LHS, loc_h, ichart, tree):
    '''Return the part of ichart that is reachable from one chart entry.

    Starting at the entry (s, t, LHS, loc_h), follow the daughter lists
    stored in tree and copy every visited entry from ichart into a fresh
    dict.  ichart itself is not modified.

    s, t, LHS, loc_h -- the four components of the starting chart key
    ichart           -- dict keyed by (s, t, LHS, loc_h) tuples
    tree             -- mapping from such a key to an iterable of
                        daughters; only d[0]..d[3] of each daughter are
                        read, and they form the daughter's chart key

    Returns the new dict.  Raises KeyError if a visited key is missing
    from ichart or from tree.
    '''
    newichart = {}
    # An explicit stack instead of recursion: deep trees cannot hit the
    # interpreter's recursion limit.
    pending = [(s, t, LHS, loc_h)]
    while pending:
        node = pending.pop()
        if node in newichart:
            # Already copied via another mother; walking the same
            # subtree again could not add anything new.
            continue
        newichart[node] = ichart[node]
        for d in tree[node]:
            pending.append((d[0], d[1], d[2], d[3]))
    return newichart
# prune(s, t, LHS, g, sent_nums, ichart): in-place pruning of `ichart`,
# a dict keyed by (s, t, LHS, loc_h) tuples.
#
# What the visible lines do:
#   * For every index loc_h of sent_nums, the entry (s, t, LHS, loc_h) gets
#     keep = (its chart value > 0.0), that flag is stored in keepichart and
#     prune_helper() is started from the entry.
#   * prune_helper() combines the inherited flag with "this entry's value
#     is > 0.0", iterates over g.sent_rules(LHS, sent_nums) and calls itself
#     on daughter entries that are present in ichart: same-span entries for
#     L / R, and (s, r, rule.L(), loc_L) / (r+1, t, rule.R(), loc_R) with
#     positions taken from locs(head(...), sent_nums, ...).
#   * A key's flag is stored on first sight and `+=`-ed afterwards, so one
#     keeping mother is enough to make the stored value truthy.
#   * Entries of keepichart are popped from ichart; when 'PRUNE' is in
#     io.DEBUG a "popping ..." line is printed first.
#
# NOTE(review): the embedded original numbering skips 42-43, 45, 49-51, 54,
# 57-58, 71-73 and 80, so the bindings of L, R, r and keepichart, the end of
# the docstring and the conditions that select between the three daughter
# cases are not visible here.  Presumably the final loop only pops entries
# whose flag `v` is false -- as shown, `v` is never read.  Confirm against
# the complete file before relying on any of this.
40 def prune(s
,t
,LHS
, g
, sent_nums
, ichart
):
41 '''Removes unused subtrees with positive probability from the
44 Unused := any and all mothers (or grandmothers etc.) have
46 def prune_helper(keep
,s
,t
,LHS
,loc_h
):
47 keep
= keep
and ichart
[(s
,t
,LHS
,loc_h
)] > 0.0
48 for rule
in g
.sent_rules(LHS
, sent_nums
):
52 if (s
,t
,L
,loc_h
) in ichart
:
53 prune_helper(keep
, s
,t
, L
,loc_h
)
55 if (s
,t
,R
,loc_h
) in ichart
:
56 prune_helper(keep
, s
,t
, R
,loc_h
)
59 for loc_L
in locs(head(L
), sent_nums
, s
, r
):
60 if (s
,r
,rule
.L(),loc_L
) in ichart
:
61 prune_helper(keep
, s
,r
,rule
.L(),loc_L
)
62 for loc_R
in locs(head(R
), sent_nums
, r
+1, t
):
63 if (r
+1,t
,rule
.R(),loc_R
) in ichart
:
64 prune_helper(keep
,r
+1,t
,rule
.R(),loc_R
)
66 if not (s
,t
,LHS
,loc_h
) in keepichart
:
67 keepichart
[(s
,t
,LHS
,loc_h
)] = keep
68 else: # eg. if previously some parent rule had 0.0, but then a
69 # later rule said "No, I've got a use for this subtree"
70 keepichart
[(s
,t
,LHS
,loc_h
)] += keep
74 for loc_h
,h
in enumerate(sent_nums
):
75 keep
= ichart
[(s
,t
,LHS
,loc_h
)] > 0.0
76 keepichart
[(s
,t
,LHS
,loc_h
)] = keep
77 prune_helper(keep
,s
,t
,LHS
,loc_h
)
79 for (s
,t
,LHS
,loc_h
),v
in keepichart
.iteritems():
81 if 'PRUNE' in io
.DEBUG
:
82 print "popping s:%d t:%d LHS:%s loc_h:%d" % (s
,t
,LHS
,loc_h
)
83 ichart
.pop((s
,t
,LHS
,loc_h
))
84 # end prune(s,t,LHS,loc_h, g, sent_nums, ichart)
def prune_sent(g, sent_nums, ichart):
    '''Prune the chart of one whole sentence.

    Delegates to prune() with a span running from position 0 to the last
    index of sent_nums and ROOT as the left-hand side, and hands back
    whatever prune() returns.
    '''
    first, last = 0, len(sent_nums) - 1
    return prune(first, last, ROOT, g, sent_nums, ichart)
# P_STOP(STOP, h, dir, adj, g, corpus): returns P_STOP_num / P_STOP_den,
# two sums of chart values.
#
# What the visible lines do:
#   * locs_h = locs(h_tag, sent) collects positions for h_tag in sent.
#   * inner(0, len(sent)-1, ROOT, loc_h, g, sent, chart) is called and its
#     return value discarded -- presumably it is run to fill `chart`.
#   * For every span with s < loc_h <= t < len(sent):
#       chart[(s, t, (LRBAR, h), loc_h)] is added to P_STOP_num and
#       chart[(s, t, (RBAR, h), loc_h)]  is added to P_STOP_den,
#     each only if the key is present; every step is logged through
#     io.debug under the 'PSTOP' tag.
#   * The quotient is returned; the trailing comment says the article
#     prints this fraction the other way up.
#
# NOTE(review): the embedded original numbering skips 92-96, 99 and 112, so
# the bindings of h_tag, sent, chart, loc_h and the two accumulators (and
# whatever loops over corpus / locs_h exist) are not visible here.  The
# parameters STOP, dir, adj and corpus are not read in the visible lines.
# If line 112 is not a guard on P_STOP_den, the division raises
# ZeroDivisionError when no RBAR entry was found -- confirm.
91 def P_STOP(STOP
, h
, dir, adj
, g
, corpus
):
97 locs_h
= locs(h_tag
, sent
)
98 io
.debug( "locs_h:%s, sent:%s"%(locs_h
,sent
) , 'PSTOP')
100 inner(0, len(sent
)-1, ROOT
, loc_h
, g
, sent
, chart
)
101 for s
in range(loc_h
): # s<loc(h), range gives strictly less
102 for t
in range(loc_h
, len(sent
)):
103 io
.debug( "s:%s t:%s loc:%d"%(s
,t
,loc_h
) , 'PSTOP')
104 if (s
, t
, (LRBAR
,h
), loc_h
) in chart
:
105 io
.debug( "num+=%s"%chart
[(s
, t
, (LRBAR
,h
), loc_h
)] , 'PSTOP')
106 P_STOP_num
+= chart
[(s
, t
, (LRBAR
,h
), loc_h
)]
107 if (s
, t
, (RBAR
,h
), loc_h
) in chart
:
108 io
.debug( "den+=%s"%chart
[(s
, t
, (RBAR
,h
), loc_h
)] , 'PSTOP')
109 P_STOP_den
+= chart
[(s
, t
, (RBAR
,h
), loc_h
)]
111 io
.debug( "num/den: %s / %s"%(P_STOP_num
, P_STOP_den
) , 'PSTOP')
113 io
.debug( "num/den: %s / %s = %s"%(P_STOP_num
, P_STOP_den
,P_STOP_num
/ P_STOP_den
) , 'PSTOP')
114 return P_STOP_num
/ P_STOP_den
# upside down in article
# NOTE(review): the `def` line(s) that own this docstring and body are not
# visible in this listing (embedded original numbers 115-119 are absent), and
# lines 122-123, 126-127, 130 and 132 are missing as well.
#
# What the visible lines show: for each direction 'l' and 'r' and each `a`
# in deps(h, dir), a product of P_STOP(0, h, dir, adj), P_CHOOSE(a, h, dir),
# at least one factor on the missing line 130, and P_STOP(STOP | h, dir, adj)
# is formed; the enclosing function finally returns P_h(root(sent)).
# The P_STOP calls here take 4 and 3 arguments, unlike the six-parameter
# P_STOP defined in this file, so this looks like a transcription of the
# article's formula rather than runnable code -- confirm before using it.
120 '''Here it seems like they store rule information on a per-head (per
121 direction) basis, in deps_D(h, dir) which gives us a list. '''
124 for dir in ['l', 'r']:
125 for a
in deps(h
, dir):
128 P_STOP (0, h
, dir, adj
) * \
129 P_CHOOSE (a
, h
, dir) * \
131 P_STOP (STOP | h
, dir, adj
)
133 return P_h(root(sent
))
if __name__ == "__main__":  # from dmv.py
    # These are not Real rules, they only exercise the classes.
    # todo: make a rule-set to test inner() on.

    # Terminal probabilities, keyed by (LHS, word).  The dict has to exist
    # before the subscript assignments below can run.
    b = {}
    s = DMV_Rule((LRBAR, 0), (NOBAR, 1), (NOBAR, 2), 1.0, 0.0)  # s->np vp
    np = DMV_Rule((NOBAR, 1), (NOBAR, 3), (NOBAR, 4), 0.3, 0.0)  # np->n p
    b[(NOBAR, 1), 'n'] = 0.7  # np->'n'
    b[(NOBAR, 3), 'n'] = 1.0  # n->'n'
    b[(NOBAR, 4), 'p'] = 1.0  # p->'p'
    # vp->v np (two parses use this rule)
    vp = DMV_Rule((NOBAR, 2), (NOBAR, 5), (NOBAR, 1), 0.1, 0.0)
    vp2 = DMV_Rule((NOBAR, 2), (NOBAR, 2), (NOBAR, 4), 0.9, 0.0)  # vp->vp p
    b[(NOBAR, 5), 'v'] = 1.0  # v->'v'

    g = DMV_Grammar([s, np, vp, vp2], b, "todo", "todo", "todo")

    # io.inner() results are indexed as [0] = probability, [1] = chart.
    # Each check compares the value rounded to two decimals and only
    # reports on a mismatch.  Single-argument print(...) behaves the same
    # as the print statement on Python 2.
    test1 = io.inner(0, 0, (NOBAR, 1), g, ['n'], {})
    if "%.2f" % test1[0] != "0.70":
        print("should be 0.70 : %.2f" % test1[0])

    test2 = io.inner(0, 2, (NOBAR, 2), g, ['v', 'n', 'p'], {})
    if "%.2f" % test2[0] != "0.09":  # 0.092999 etc, don't care about that
        print("should be 0.09 if the io.py-test is right : %.2f" % test2[0])

    # the following should manage to look stuff up in the chart:
    test2 = io.inner(0, 2, (NOBAR, 2), g, ['v', 'n', 'p'], test2[1])
    if "%.2f" % test2[0] != "0.09":
        print("should be 0.09 if the io.py-test is right : %.2f" % test2[0])