inner and outer done for cnf_dmv.py, seems to work. todo: reestimation for cnf
[dmvccm.git] / src / main.py
blob1384d08600e5ab9f37f33ec281aeb67abc5fb8d7
1 # Todo: since we evaluate _after_ we reestimate, we lose the icharts
2 # made while reestimating. If we had these available, evaluate and
3 # corpus_likelihood would be a lot faster, but since they need to run
4 # _after_ reestimate, we'll have to store an ichart per sentence. So
5 # try storing those icharts in some loc_h_dmv global, and see if it's
6 # faster using space rather than time.
8 from common_dmv import MPPROOT
9 from wsjdep import WSJDepCorpusReader
10 from loc_h_dmv import DMV_Grammar, reestimate, DEBUG, mpp
11 from loc_h_harmonic import initialize
def corpus_likelihood(g, tagsonly):
    '''Return a one-line report of the summed log-probability of a corpus.

    g        -- grammar, passed unchanged to loc_h_dmv.inner_sent
    tagsonly -- iterable of sentences, each passed to inner_sent

    The result is the string "Sum of log P_{sentence}: " followed by the
    sum formatted with four decimals and a trailing newline.

    math.log raises ValueError if inner_sent returns 0 for a sentence.
    '''
    from math import log
    from loc_h_dmv import inner_sent

    sumlog = 0.0
    for sent in tagsonly:
        p_sent = inner_sent(g, sent)
        # Only the name `log` is imported above; calling `math.log` here
        # (as the code used to) raised NameError.
        sumlog += log(p_sent)
    return "Sum of log P_{sentence}: %.4f\n" % sumlog
def evaluate(g, tagged_and_parsed_sents):
    '''Score the parses mpp(g, sent) against the given parses.

    tagged_and_parsed_sents is a list of pairs:
        (tagonly_sent, parsed_sent)

    For every sentence the counters are updated as follows:

        R_num += 1 if pair from parsed is in mpp
        R_den += 1 per pair from parsed

        P_num += 1 if pair from mpp is in parsed
        P_den += 1 per pair from mpp
                   (pairs whose first element is MPPROOT are skipped)

        F1 = (2 * P * R)/(P + R), the harmonic mean of P and R

    Returns a three-line report string with recall, precision and F1.
    A ratio whose denominator is zero (e.g. an empty corpus) is reported
    as 0.0 rather than raising ZeroDivisionError.
    '''
    recall_num = 0
    recall_den = 0
    precision_num = 0
    precision_den = 0

    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)

        for pair in parse:
            recall_den += 1
            if pair in mpp_sent:
                recall_num += 1

        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue  # todo: add ROOT to parses? (see note below)
            precision_den += 1
            if pair in parse:
                precision_num += 1

        # todo: consider scoring against a rooted version of `parse`
        # (e.g. rooted_parse = add_root(parse)); it is still undecided
        # what to do when there is no single possible root.

    # Guard the divisions the same way the F1 computation is guarded.
    recall = 0.0
    if recall_den > 0:
        recall = float(recall_num) / float(recall_den)

    precision = 0.0
    if precision_den > 0:
        precision = float(precision_num) / float(precision_den)

    F1 = 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    report = ("Recall: %d/%d = %.4f\n"
              "Precision: %d/%d = %.4f\n"
              "F1: \t\t%.4f")
    return report % (recall_num, recall_den, recall,
                     precision_num, precision_den, precision, F1)
def run_IO(iterations, tagonlys, tags_and_parses, g=None):
    '''Run `iterations` rounds of reestimate(g, tagonlys), printing the
    corpus likelihood after each round.

    iterations      -- number of reestimation rounds
    tagonlys        -- sentences passed to reestimate and corpus_likelihood
    tags_and_parses -- only needed by the (currently disabled) evaluate call
    g               -- grammar to reestimate; when omitted, a grammar is
                       built with initialize(tagonlys)

    This function used to read an undefined name `g`, which raised
    NameError unless a module-level `g` happened to exist; the grammar is
    now an explicit, optional argument.
    '''
    if g is None:
        g = initialize(tagonlys)

    for i in range(iterations):
        # NOTE(review): the return value of reestimate was never used, so g
        # is presumably updated in place -- confirm against loc_h_dmv.
        reestimate(g, tagonlys)
        print("reestimation number %d done" % i)
        # print(evaluate(g, tags_and_parses))
        print(corpus_likelihood(g, tagonlys))


def test_likelihood():
    '''Initialize a grammar on a 100-sentence slice of the corpus, print
    its evaluation and corpus likelihood, then run 4 reestimation rounds.
    '''
    reader = WSJDepCorpusReader(None)
    corpus_size = 100
    first = 1000
    tagonlys = reader.tagonly_sents()[first:first + corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[first:first + corpus_size]

    print("initializing %d sentences" % corpus_size)
    # The module itself is imported so that its constants can be overridden
    # before initialize() runs (is there a better way?).
    import loc_h_harmonic
    loc_h_harmonic.HARMONIC_C = 0.0
    loc_h_harmonic.FNONSTOP_MIN = 25
    loc_h_harmonic.FSTOP_MIN = 5
    loc_h_harmonic.RIGHT_FIRST = 1.0
    g = initialize(tagonlys)
    print("initialized")

    print(evaluate(g, tags_and_parses))
    print(corpus_likelihood(g, tagonlys))

    # Pass the grammar explicitly; run_IO cannot see this function's local g.
    run_IO(4, tagonlys, tags_and_parses, g)
def compare_loc_h_cnf():
    '''Check that the loc_h and cnf implementations agree on a
    200-sentence slice of the corpus.

    First the p_ROOT, p_STOP and p_ATTACH tables of the two initialized
    grammars are compared in both directions, then the result of
    inner_sent is compared sentence by sentence.

    Raises Warning on the first mismatch found.
    '''
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    tagonlys = reader.tagonly_sents()[1000:1000 + corpus_size]

    import loc_h_harmonic
    import cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    # Each table is checked both ways so that an entry missing from either
    # grammar is detected.
    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH),
    ]
    for a_items, b in initials:
        for k, v in a_items:
            # The messages below previously lacked the `%` operator, so a
            # mismatch raised TypeError instead of the intended Warning.
            if k not in b:
                raise Warning("a[%s]=%s, but %s not in b" % (k, v, k))
            if b[k] != v:
                raise Warning("a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k]))

    import loc_h_dmv
    import cnf_dmv
    for sent in tagonlys:
        i_l = loc_h_dmv.inner_sent(g_l, sent, {})
        i_c = cnf_dmv.inner_sent(g_c, sent, {})
        # print("%s\n%s\n" % (i_l, i_c))
        # The string renderings are compared rather than the raw values --
        # presumably to tolerate tiny floating-point differences; confirm.
        if "%s" % i_l != "%s" % i_c:
            raise Warning("i_l: %s but i_c: %s" % (i_l, i_c))
# end compare_loc_h_cnf()
# Script entry point: run the loc_h/cnf comparison, bracketed by progress
# messages. test_likelihood() is kept as a disabled alternative.
if __name__ == "__main__":
    print("main.py:")
    compare_loc_h_cnf()
    # test_likelihood()
    print("main.py: done")