# Todo: since we evaluate _after_ we reestimate, we lose the icharts
2 # made while reestimating. If we had these available, evaluate and
3 # corpus_likelihood would be a lot faster, but since they need to run
4 # _after_ reestimate, we'll have to store an ichart per sentence. So
5 # try storing those icharts in some loc_h_dmv global, and see if it's
6 # faster using space rather than time.
import math

from common_dmv import MPPROOT
from wsjdep import WSJDepCorpusReader
from loc_h_dmv import DMV_Grammar, reestimate, DEBUG, mpp
from loc_h_harmonic import initialize
def corpus_likelihood(g, tagsonly):
    """Return a report string with the sum of log P(sentence) over the corpus.

    g        -- the (reestimated) DMV grammar
    tagsonly -- iterable of tag-only sentences to score

    NOTE(review): the accumulator initialization and the loop header were
    missing from the visible source; reconstructed here as the obvious
    sum of log inside-probabilities over `tagsonly` -- confirm against
    the full file.
    """
    from loc_h_dmv import inner_sent
    sumlog = 0.0
    for sent in tagsonly:
        p_sent = inner_sent(g, sent)
        # math.log raises ValueError on p_sent == 0.0 (unparsable sentence)
        sumlog += math.log(p_sent)
    return "Sum of log P_{sentence}: %.4f\n"%sumlog
def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), harmonic mean of P and R
    '''
    # Bugfix: in the visible source the counters and F1 were never
    # initialized, so the first += / the final return would raise NameError.
    recall_num = 0
    recall_den = 0
    precision_num = 0
    precision_den = 0
    F1 = 0.0  # stays 0.0 when precision + recall == 0.0
    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        # Recall: how many gold-parse pairs does the mpp recover?
        for pair in parse:
            recall_den += 1
            if pair in mpp_sent: recall_num += 1
        # Precision: how many mpp pairs are in the gold parse?
        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue # todo: add ROOT to parses? (see below)
            precision_den += 1
            if pair in parse: precision_num += 1
        # rooted_parse = add_root(parse) # use? todo
        # print "No single possible root, todo what?"
    # NOTE(review): raises ZeroDivisionError on an empty corpus / empty
    # parses, as the visible original would -- confirm intended.
    recall = float(recall_num) / float(recall_den)
    precision = float(precision_num) / float(precision_den)
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    return '''Recall: %d/%d = %.4f
Precision: %d/%d = %.4f
F1: \t\t%.4f'''%(recall_num,recall_den,recall,precision_num,precision_den, precision, F1)
def run_IO(iterations, tagonlys, tags_and_parses):
    # Run `iterations` rounds of reestimation over the tag-only corpus,
    # printing the corpus likelihood after each round.
    # NOTE(review): `g` is read here but defined neither as a parameter nor
    # locally -- presumably a module-level grammar set elsewhere; confirm.
    # NOTE(review): `f` receives reestimate()'s result but is never used;
    # this looks like it should be `g = reestimate(g, tagonlys)` unless
    # reestimate mutates g in place -- verify against loc_h_dmv.reestimate.
    for i in range(iterations):
        f = reestimate(g, tagonlys)
        print "reestimation number %d done"%i
        #print evaluate(g, tags_and_parses)
        print corpus_likelihood(g, tagonlys)
76 def test_likelihood():
77 reader
= WSJDepCorpusReader(None)
79 tagonlys
= reader
.tagonly_sents()[1000:1000+corpus_size
]
80 tags_and_parses
= reader
.tagged_and_parsed_sents()[1000:1000+corpus_size
]
82 print "initializing %d sentences"%corpus
_size
83 import loc_h_harmonic
# since we need to change constants (is there a better way?)
84 loc_h_harmonic
.HARMONIC_C
= 0.0
85 loc_h_harmonic
.FNONSTOP_MIN
= 25
86 loc_h_harmonic
.FSTOP_MIN
= 5
87 loc_h_harmonic
.RIGHT_FIRST
= 1.0
88 g
= initialize(tagonlys
)
91 print evaluate(g
, tags_and_parses
)
92 print corpus_likelihood(g
, tagonlys
)
94 run_IO(4, tagonlys
, tags_and_parses
)
97 def compare_loc_h_cnf():
98 reader
= WSJDepCorpusReader(None)
100 tagonlys
= reader
.tagonly_sents()[1000:1000+corpus_size
]
102 import loc_h_harmonic
, cnf_harmonic
103 g_l
= loc_h_harmonic
.initialize(tagonlys
)
104 g_c
= cnf_harmonic
.initialize(tagonlys
)
107 (g_l
.p_ROOT
.iteritems(), g_c
.p_ROOT
),
108 (g_c
.p_ROOT
.iteritems(), g_l
.p_ROOT
),
109 (g_l
.p_STOP
.iteritems(), g_c
.p_STOP
),
110 (g_c
.p_STOP
.iteritems(), g_l
.p_STOP
),
111 (g_l
.p_ATTACH
.iteritems(), g_c
.p_ATTACH
),
112 (g_c
.p_ATTACH
.iteritems(), g_l
.p_ATTACH
)]
113 for a_items
, b
in initials
:
115 if k
not in b
.keys(): raise Warning, "a[%s]=%s, but %s not in b"(k
,v
,k
)
116 if (k
,v
) not in b
.iteritems(): raise Warning, "a[%s]=%s, but b[%s]=%s"(k
,v
,k
,b
[k
])
119 import loc_h_dmv
, cnf_dmv
120 for sent
in tagonlys
:
121 i_l
= loc_h_dmv
.inner_sent(g_l
, sent
, {})
122 i_c
= cnf_dmv
.inner_sent(g_c
, sent
, {})
123 # print "%s\n%s\n"%(i_l,i_c)
124 if "%s"%i_l != "%s"%i_c
:
125 raise Warning, "i_l: %s but i_c: %s"%(i_l
,i_c
)
126 # end compare_loc_h_cnf()
if __name__ == "__main__":
    # NOTE(review): the statements originally between this guard and the
    # final print (source lines 129-131) are not visible in this chunk --
    # presumably a call to test_likelihood() or compare_loc_h_cnf();
    # confirm against the full file.
    print "main.py: done"