rewrote loc_h_harmonic's STOP initialization to reflect report.pdf; simpler now
# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
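# A minimal sketch of that caching idea (hypothetical, not yet in
# loc_h_dmv): keep one ichart per sentence in a module-level dict so that
# evaluate and corpus_likelihood can reuse the charts built during
# reestimation. The cache would have to be flushed after every reestimate,
# since the charts depend on the current grammar.
#
#     ICHARTS = {}
#     def ichart_for(sent):
#         key = tuple(sent)       # lists are unhashable, so key on a tuple
#         return ICHARTS.setdefault(key, {})
#
#     # ...then inner_sent(g, sent, ichart_for(sent)) instead of
#     # inner_sent(g, sent, {}); call ICHARTS.clear() after reestimate.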

from common_dmv import MPPROOT, test, node_str
from wsjdep import WSJDepCorpusReader

#HARMONIC_C: 509.637290698, FNONSTOP_MIN: 30.1124584139, FSTOP_MIN: 13.0830178845
def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    reload(loc_h_harmonic)
    import random
    # loc_h_harmonic.HARMONIC_C = 380.111684914
    # loc_h_harmonic.FSTOP_MIN = 13.5744632704
    # loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
    loc_h_harmonic.HARMONIC_C = random.random() # 0.0 # 509.63 #1000.0 * random.random()
    loc_h_harmonic.FSTOP_MIN = random.random() # 1.0 # 13.08 #20.0 * random.random()
    loc_h_harmonic.STOP_C = random.random()
    loc_h_harmonic.NSTOP_C = random.random()

    loc_h_harmonic.RIGHT_FIRST = 1.0
    loc_h_harmonic.OLD_STOP_CALC = False
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s
RIGHT_FIRST: %s, OLD_STOP_CALC: %s''' % (loc_h_harmonic.HARMONIC_C,
                                         loc_h_harmonic.STOP_C,
                                         loc_h_harmonic.NSTOP_C,
                                         loc_h_harmonic.FSTOP_MIN,
                                         loc_h_harmonic.RIGHT_FIRST,
                                         loc_h_harmonic.OLD_STOP_CALC)
    g = loc_h_harmonic.initialize(tagonlys)
    return g
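# Reproducibility note (not in the original): the random restarts above give
# a different grammar on every run; fixing the RNG seed first, e.g.
#
#     random.seed(1234)   # any fixed seed
#
# before the random.random() calls would make a run repeatable.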
def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    reload(cnf_harmonic)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)

def test_likelihood(reestimate, initialize, inner_sent,
                    corpus_size=20, corpus_offset=1000, iterations=4, eval=False):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        sumlog, msg = corpus_likelihood(g, tagonlys)
        print msg
        if eval: print evaluate(g, tags_and_parses)
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done" % i
            if eval: print evaluate(g, tags_and_parses)

            prev_sumlog = sumlog
            sumlog, msg = corpus_likelihood(g, tagonlys)
            # EM should never decrease corpus likelihood; a drop means a bug
            if sumlog < prev_sumlog:
                raise Exception, msg+"but previous was %s"%prev_sumlog
            print msg
        return g

    def corpus_likelihood(g, tagsonly):
        from math import log
        sumlog = 0.0
        for sent in tagsonly:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!" % sent
            else:
                sumlog += log(p_sent)
        avg = sumlog / len(tagsonly)
        return (sumlog, "Sum of log P_{sentence}: %.4f (should move towards 0), avg: %s\n" % (sumlog, avg))
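    # Sanity check of the numbers corpus_likelihood prints, with hypothetical
    # values: two sentences with P = 0.01 and 0.001 give
    # sumlog = log(0.01) + log(0.001) = -4.6052 + -6.9078 = -11.5129,
    # avg = -5.7565. Every P_{sentence} <= 1, so sumlog <= 0, and each
    # reestimation step should move it towards 0.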

    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size]

    # from loc_h_dmv import testcorpus
    # tagonlys = testcorpus

    print "initializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)
    print "initialized"

    g = run_IO(g, iterations, tagonlys, tags_and_parses) # make iterations argument, todo
    return g

def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R
    '''
    from loc_h_dmv import mpp
    from wsjdep import add_root

    R, R_r, P, P_r = {}, {}, {}, {}
    for nd in ['num', 'den']:
        R[nd], R_r[nd], P[nd], P_r[nd] = 0, 0, 0, 0
    unrooted = 0 # parses where we couldn't add_root

    for sent, gold_parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        try: gold_parse = add_root(gold_parse)
        except ValueError: unrooted += 1

        for pair in gold_parse:
            d = R
            if pair[0] == MPPROOT: d = R_r
            d['den'] += 1
            if pair in mpp_sent: d['num'] += 1

        for pair in mpp_sent:
            d = P
            if pair[0] == MPPROOT: d = P_r
            d['den'] += 1
            if pair in gold_parse: d['num'] += 1

    recall = float(R['num']) / float(R['den'])
    precision = float(P['num']) / float(P['den'])
    recall_r = float(R['num']+R_r['num']) / float(R['den']+R_r['den'])
    precision_r = float(P['num']+P_r['num']) / float(P['den']+P_r['den'])
    F1, F1_r = 0.0, 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)
    if (precision_r + recall_r) > 0.0:
        F1_r = (2 * recall_r * precision_r) / (precision_r + recall_r)

    str_vals = (R['num'],R['den'],recall, R['num']+R_r['num'], R['den']+R_r['den'], recall_r,
                P['num'],P['den'],precision, P['num']+P_r['num'], P['den']+P_r['den'], precision_r,
                F1, F1_r, unrooted)
    return '''Recall: %d/%d = %.4f\tRecall_r: %d/%d = %.4f
Precision: %d/%d = %.4f\tPrecision_r: %d/%d = %.4f
F1: \t\t%.4f\t\tF1_r: \t\t%.4f (unrooted gold parses: %d)''' % str_vals
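# Worked example of the scoring in evaluate, with hypothetical counts: if a
# gold parse has 4 non-root pairs of which the mpp recovers 3, and the mpp
# proposes 5 non-root pairs of which 3 are in gold, then
#
#     recall    = 3/4 = 0.75
#     precision = 3/5 = 0.60
#     F1 = (2 * 0.75 * 0.60) / (0.75 + 0.60) = 0.9 / 1.35 = 0.6667
#
# The _r variants simply fold the MPPROOT pairs into both numerators and
# denominators.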

def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    corpus_offset = 1000
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b.keys(): raise Warning, "a[%s]=%s, but %s not in b" % (k, v, k)
            if (k, v) not in b.iteritems(): raise Warning, "a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k])

    import loc_h_dmv, cnf_dmv
    from common_dmv import GOR
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w+1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w+1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s" % (sent, node_str(w_node), loc_w)
            test("%s" % o_l, "%s" % o_c, "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")
# end compare_loc_h_cnf()

def init_nothing(g, H, S, N, M):
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s''' % (H, S, N, M)
    return lambda corpus: g
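# Usage sketch for init_nothing: it lets test_likelihood continue EM from an
# already-initialized grammar g instead of building a new one, e.g.
#
#     init = init_nothing(g, H, S, N, M)
#     g = test_likelihood(loc_h_dmv.reestimate, init, loc_h_dmv.inner_sent,
#                         corpus_size=6268, corpus_offset=0, iterations=1)
#
# (H, S, N and M are only echoed for logging; the returned lambda ignores
# its corpus argument.)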

def rnd_grammars_test():
    import loc_h_dmv
    reload(loc_h_dmv)

    rnd_grammars0 = []
    for i in xrange(20):
        g = test_likelihood(loc_h_dmv.reestimate,
                            initialize_loc_h,
                            loc_h_dmv.inner_sent,
                            corpus_size=6268,
                            iterations=0,
                            corpus_offset=0,
                            eval=True)
        rnd_grammars0 += [(g, g.HARMONIC_C, g.STOP_C, g.NSTOP_C, g.FSTOP_MIN)]

    rnd_grammars1 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars0]
    rnd_grammars2 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars1]
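# Note: rnd_grammars2 is computed but never returned, so the restarted
# grammars are lost when rnd_grammars_test() exits. A possible follow-up
# (hypothetical) would be to return rnd_grammars2, keep the restart whose
# evaluation scores (printed via eval=True above) look best, and run more
# iterations on that grammar alone using init_nothing.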

if __name__ == "__main__":
    print "main.py:"

    # compare_loc_h_cnf()

    # import cnf_dmv
    # reload(cnf_dmv)
    # print "\ntrying cnf-reestimate ##############################"
    # g = test_likelihood(cnf_dmv.reestimate,
    #                     initialize_cnf,
    #                     cnf_dmv.inner_sent,
    #                     corpus_size=5,
    #                     iterations=4)

    rnd_grammars_test()

    # import loc_h_dmv
    # reload(loc_h_dmv)
    # print "\ntrying reestimate v.1 ##############################"
    # g = test_likelihood(loc_h_dmv.reestimate,
    #                     initialize_loc_h,
    #                     loc_h_dmv.inner_sent,
    #                     corpus_size=6268,
    #                     iterations=100,
    #                     corpus_offset=0,
    #                     eval=True)
    # print g

    # print "\ntrying reestimate v.2 ##############################"
    # g = test_likelihood(loc_h_dmv.reestimate2,
    #                     initialize_loc_h,
    #                     loc_h_dmv.inner_sent,
    #                     corpus_size=5,
    #                     iterations=4,
    #                     corpus_offset=0)
    # print "main.py: done"
    # print g