# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had those available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if
# trading space for time is faster.
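# A minimal sketch of that idea, with hypothetical names (not wired in):
#
#     _icharts = {}                      # would live as a loc_h_dmv global
#     def inner_sent_cached(g, sent):
#         # reuse the ichart for sent; clear _icharts whenever g changes,
#         # i.e. after every reestimate
#         chart = _icharts.setdefault(tuple(sent), {})
#         return inner_sent(g, sent, chart)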
from common_dmv import MPPROOT, test, node_str
from wsjdep import WSJDepCorpusReader

# HARMONIC_C: 509.637290698, FNONSTOP_MIN: 30.1124584139, FSTOP_MIN: 13.0830178845
def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    reload(loc_h_harmonic)
    import random
    # loc_h_harmonic.HARMONIC_C = 380.111684914
    # loc_h_harmonic.FSTOP_MIN = 13.5744632704
    # loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
    loc_h_harmonic.HARMONIC_C = 0.0  # 120.0 * random.random() # 509.63
    loc_h_harmonic.STOP_C = 3.0      # 3.0 * random.random()
    loc_h_harmonic.NSTOP_C = 1.0     # 5.0 * random.random() # 0.1
    loc_h_harmonic.FSTOP_MIN = 10.0  # 20.0 * random.random() # 13.08

    loc_h_harmonic.RIGHT_FIRST = 1.0
    loc_h_harmonic.OLD_STOP_CALC = False
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s
RIGHT_FIRST: %s, OLD_STOP_CALC: %s''' % (loc_h_harmonic.HARMONIC_C,
                                         loc_h_harmonic.STOP_C,
                                         loc_h_harmonic.NSTOP_C,
                                         loc_h_harmonic.FSTOP_MIN,
                                         loc_h_harmonic.RIGHT_FIRST,
                                         loc_h_harmonic.OLD_STOP_CALC)
    g = loc_h_harmonic.initialize(tagonlys)
    return g
def initialize_uniform_loc_h(tagonlys):
    import loc_h_harmonic
    return loc_h_harmonic.uniform_init(tagonlys)
def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    reload(cnf_harmonic)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)
def test_likelihood(reestimate, initialize, inner_sent,
                    corpus_size=20, corpus_offset=1000, iterations=4, EVAL=False):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        sumlog, msg = corpus_likelihood(g, tagonlys)
        print msg
        if EVAL:
            g.E = evaluate(g, tags_and_parses)
            print g.E
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done\n" % i
            if EVAL:
                g.E = evaluate(g, tags_and_parses)
                print g.E
            prev_sumlog = sumlog
            sumlog, msg = corpus_likelihood(g, tagonlys)
            if sumlog < prev_sumlog: # likelihood should never decrease under EM
                raise Exception(msg + " but previous was %s" % prev_sumlog)
            print msg
        # store the grammar, since I want to be able to do stuff with it afterwards:
        from pickle import dump # let us say g = pickle.load(open('..','rb'))
        filehandler = open('current_grammar.obj', 'wb') # 'wb', since pickles are binary
        dump(g, filehandler)
        filehandler.close()
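        # restoring it later would look something like this (a sketch,
        # using the filename written just above):
        #     from pickle import load
        #     g = load(open('current_grammar.obj', 'rb'))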

        return g
    def corpus_likelihood(g, tagsonly):
        from math import log
        sumlog = 0.0
        for sent in tagsonly:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!" % sent
            else:
                sumlog += log(p_sent)
        avg = sumlog / len(tagsonly)
        return (sumlog, "Sum of log P_{sentence}: %.4f (should move towards 0), avg: %s" % (sumlog, avg))
    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset + corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset + corpus_size]

    print "\ninitializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)
    print "initialized"

    g = run_IO(g, iterations, tagonlys, tags_and_parses) # make iterations argument, todo
    if EVAL:
        import pprint
        print "underproposed:"
        pprint.pprint(g.E.underproposed)
        print "overproposed:"
        pprint.pprint(g.E.overproposed)

    return g
class Evaluation(object):
    "Just a class to hold evaluation-relevant information, sum it up, and print it."
    def __init__(self):
        self.underproposed, self.overproposed = {}, {}
        self.R, self.R_r, self.P, self.P_r = {}, {}, {}, {}
        for nd in ['num', 'den']:
            self.R[nd], self.R_r[nd], self.P[nd], self.P_r[nd] = 0, 0, 0, 0

        self.unrooted = 0     # parses where we couldn't add_root
        self.double_heads = 0 # parses w/ two heads to one argument
        self._precision, self._recall, self._precision_r, self._recall_r = 0.0, 0.0, 0.0, 0.0
        self._F1, self._F1_r = 0.0, 0.0
    def calc_F1_P_R(self):
        "F1 = (2 * P * R) / (P + R), the harmonic average of P and R"
        self._recall = float(self.R['num']) / float(self.R['den'])
        self._precision = float(self.P['num']) / float(self.P['den'])
        self._recall_r = float(self.R['num'] + self.R_r['num']) / \
                         float(self.R['den'] + self.R_r['den'])
        self._precision_r = float(self.P['num'] + self.P_r['num']) / \
                            float(self.P['den'] + self.P_r['den'])

        if (self._precision + self._recall) > 0.0:
            self._F1 = (2 * self._recall * self._precision) / (self._precision + self._recall)
        if (self._precision_r + self._recall_r) > 0.0:
            self._F1_r = (2 * self._recall_r * self._precision_r) / (self._precision_r + self._recall_r)
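    # a quick worked instance of that formula (illustrative numbers only):
    # P = 0.6 and R = 0.3 give F1 = (2 * 0.6 * 0.3) / (0.6 + 0.3) = 0.4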
    def __str__(self):
        self.calc_F1_P_R()
        R_rnum = self.R['num'] + self.R_r['num']
        R_rden = self.R['den'] + self.R_r['den']
        P_rnum = self.P['num'] + self.P_r['num']
        P_rden = self.P['den'] + self.P_r['den']
        str_vals = (self.P['num'], self.P['den'], self._precision, P_rnum, P_rden, self._precision_r,
                    self.R['num'], self.R['den'], self._recall, R_rnum, R_rden, self._recall_r,
                    self._F1, self._F1_r, self.unrooted, self.double_heads)
        regular_str = '''P: %5d/%5d = %s | P_r: %5d/%5d = %s
R: %5d/%5d = %s | R_r: %5d/%5d = %s
F1: %s | F1_r: %s (unrooted gold parses: %d, double-headed: %d)''' % str_vals

        if self._precision != self._recall: print regular_str # raise Exception todo

        tex_str_vals = tuple([p * 100 for p in (self._F1, self._F1_r)])
        tex_str = "$C_A=; C_S=;C_N=;C_M=$ & %.1f & %.1f \\\\" % tex_str_vals # \\\\ gives the literal \\ ending a LaTeX table row

        return tex_str # todo: make variable
def evaluate(g, tagged_and_parsed_sents):
    ''' tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if a pair from the gold parse is in the mpp
    R_den += 1 per pair in the gold parse

    P_num += 1 if a pair from the mpp is in the gold parse
    P_den += 1 per pair in the mpp '''
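    # e.g. (illustrative numbers only): gold pairs {(1,2), (3,2)} against
    # mpp pairs {(1,2), (1,3)} share one pair, so R = 1/2 and P = 1/2 here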
    from loc_h_dmv import mpp
    from wsjdep import add_root
    E = Evaluation()

    for sent, gold_parse in tagged_and_parsed_sents:
        if len(sent) - 1 != len(gold_parse):
            E.double_heads += 1
            continue
        mpp_sent = mpp(g, sent)
        try: gold_parse = add_root(gold_parse)
        except RuntimeError: E.unrooted += 1
        for pair in gold_parse:
            d = E.R
            if pair[0] == MPPROOT: d = E.R_r
            d['den'] += 1
            if pair in mpp_sent: d['num'] += 1
            else:
                try: E.underproposed[pair] += 1
                except KeyError: E.underproposed[pair] = 1

        for pair in mpp_sent:
            d = E.P
            if pair[0] == MPPROOT: d = E.P_r
            d['den'] += 1
            if pair in gold_parse: d['num'] += 1
            else:
                try: E.overproposed[pair] += 1
                except KeyError: E.overproposed[pair] = 1

    return E
def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    corpus_offset = 1000
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset + corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b: raise Warning("a[%s]=%s, but %s not in b" % (k, v, k))
            if b[k] != v: raise Warning("a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k]))
    import loc_h_dmv, cnf_dmv
    from common_dmv import GOR
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w + 1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w + 1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s" % (sent, node_str(w_node), loc_w)
            test("%s" % o_l, "%s" % o_c, "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")
# end of compare_loc_h_cnf()
def init_nothing(g, H, S, N, M):
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s''' % (H, S, N, M)
    return lambda corpus: g
def rnd_grammars_test():
    import loc_h_dmv
    reload(loc_h_dmv)

    rnd_grammars0 = []
    for i in xrange(20):
        g = test_likelihood(loc_h_dmv.reestimate,
                            initialize_loc_h,
                            loc_h_dmv.inner_sent,
                            corpus_size=6268,
                            iterations=0,
                            corpus_offset=0,
                            EVAL=True)
        rnd_grammars0 += [(g, g.HARMONIC_C, g.STOP_C, g.NSTOP_C, g.FSTOP_MIN)]

    rnd_grammars1 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      EVAL=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars0]
    rnd_grammars2 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      EVAL=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars1]
if __name__ == "__main__":
    print "main.py:"

    if False:
        rnd_grammars_test()
    else:
        import loc_h_dmv
        reload(loc_h_dmv)
        print "\ntrying reestimate v.1 ##############################"
        g = test_likelihood(loc_h_dmv.reestimate,
                            initialize_loc_h,
                            loc_h_dmv.inner_sent,
                            corpus_size=6268,
                            iterations=30,
                            corpus_offset=0,
                            EVAL=True)
        print g
#     print "\ntrying reestimate v.2 ##############################"
#     g = test_likelihood(loc_h_dmv.reestimate2,
#                         initialize_loc_h,
#                         loc_h_dmv.inner_sent,
#                         corpus_size=5,
#                         iterations=4,
#                         corpus_offset=0)
#     print "main.py: done"
#     print g

#     compare_loc_h_cnf()
#     import cnf_dmv
#     reload(cnf_dmv)
#     print "\ntrying cnf-reestimate ##############################"
#     g = test_likelihood(cnf_dmv.reestimate,
#                         initialize_cnf,
#                         cnf_dmv.inner_sent,
#                         corpus_size=5,
#                         iterations=4)