# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
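
# A minimal sketch of the space-for-time idea above. Everything here
# is hypothetical (_ichart_cache and cached_inner_sent are not part of
# loc_h_dmv): reestimate would fill the cache, and evaluate /
# corpus_likelihood would then reuse the stored icharts. The cache
# must be emptied whenever g changes, i.e. once per iteration.
_ichart_cache = {}

def cached_inner_sent(g, sent, inner_sent):
    # key on the tag sequence, so repeated sentences share one chart
    key = tuple(sent)
    ichart = _ichart_cache.setdefault(key, {})
    return inner_sent(g, sent, ichart)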

from common_dmv import MPPROOT, GOR, test, node_str
from wsjdep import WSJDepCorpusReader

def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    loc_h_harmonic.HARMONIC_C = 0.0
    loc_h_harmonic.FNONSTOP_MIN = 25
    loc_h_harmonic.FSTOP_MIN = 5
    loc_h_harmonic.RIGHT_FIRST = 1.0
    return loc_h_harmonic.initialize(tagonlys)

def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)
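
# One possible answer to "is there a better way?": pass the constants
# as arguments instead of mutating module globals, so two
# initializations cannot leak settings into each other. Only a sketch;
# cnf_harmonic.initialize would first have to grow these keyword
# arguments, so the refactor stays commented out:
#
# def initialize_cnf(tagonlys, harmonic_c=0.0, fnonstop_min=25, fstop_min=5):
#     import cnf_harmonic
#     return cnf_harmonic.initialize(tagonlys, harmonic_c=harmonic_c,
#                                    fnonstop_min=fnonstop_min,
#                                    fstop_min=fstop_min)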

def test_likelihood(reestimate, initialize, inner_sent, corpus_size=20, corpus_offset=1000):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        print corpus_likelihood(g, tagonlys)
        # print evaluate(g, tags_and_parses)
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done" % i
            # print evaluate(g, tags_and_parses)
            print corpus_likelihood(g, tagonlys)
        return g

    def corpus_likelihood(g, tagonlys):
        from math import log
        sumlog = 0.0
        for sent in tagonlys:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!" % sent
            else:
                sumlog += log(p_sent)
        return "Sum of log P_{sentence}: %.4f (should move towards 0)\n" % sumlog

    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size]

    # from loc_h_dmv import testcorpus
    # tagonlys = testcorpus

    print "initializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)
    print "initialized"

    g = run_IO(g, 4, tagonlys, tags_and_parses)
    return g

def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R
    '''
    from loc_h_dmv import mpp

    recall_num = 0
    recall_den = 0
    precision_num = 0
    precision_den = 0

    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        for pair in parse:
            recall_den += 1
            if pair in mpp_sent: recall_num += 1
        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue # todo: add ROOT to parses? (see below)
            precision_den += 1
            if pair in parse: precision_num += 1

        # try:
        #     rooted_parse = add_root(parse) # use? todo
        # except:
        #     print "No single possible root, todo what?"

    recall = float(recall_num) / float(recall_den)
    precision = float(precision_num) / float(precision_den)
    F1 = 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    return '''Recall: %d/%d = %.4f
Precision: %d/%d = %.4f
F1: \t\t%.4f''' % (recall_num, recall_den, recall,
                   precision_num, precision_den, precision, F1)
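
# A hypothetical add_root for the todo in evaluate above: a parse has
# a single possible root iff exactly one word never occurs as a
# dependent. This assumes pairs are (head, dependent), matching the
# pair[0] == MPPROOT check on the mpp output.
def add_root(parse):
    heads = set([pair[0] for pair in parse])
    deps = set([pair[1] for pair in parse])
    roots = [head for head in heads if head not in deps]
    if len(roots) != 1:
        raise RuntimeError, "no single possible root in %s" % (parse,)
    return set(parse) | set([(MPPROOT, roots[0])])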

def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    corpus_offset = 1000
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b: raise Warning, "a[%s]=%s, but %s not in b" % (k, v, k)
            if b[k] != v: raise Warning, "a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k])

    import loc_h_dmv, cnf_dmv
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w+1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w+1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s" % (sent, node_str(w_node), loc_w)
            test("%s" % o_l, "%s" % o_c, "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")
# end compare_loc_h_cnf()

if __name__ == "__main__":
    print "main.py:"
    # compare_loc_h_cnf()

    import cnf_dmv
    print "\ntrying cnf-reestimate ##############################"
    g = test_likelihood(cnf_dmv.reestimate,
                        initialize_cnf,
                        cnf_dmv.inner_sent,
                        corpus_size=20)

    import loc_h_dmv
    print "\ntrying reestimate v.1 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20)

    print "\ntrying reestimate v.2 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate2,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20)

    print "main.py: done"