# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
import math

from common_dmv import MPPROOT, GOR, test, node_str
from wsjdep import WSJDepCorpusReader
def initialize_loc_h(tagonlys):
    """Initialize via loc_h_harmonic after overriding four of its constants.

    HARMONIC_C, FNONSTOP_MIN, FSTOP_MIN and RIGHT_FIRST are assigned on the
    loc_h_harmonic module itself, so the new values stay in effect for any
    later user of that module in the same process.

    Returns whatever loc_h_harmonic.initialize(tagonlys) returns.
    """
    # Imported here because we need to change the module's constants
    # (is there a better way?).
    import loc_h_harmonic as harmonic

    overrides = (
        ("HARMONIC_C", 0.0),
        ("FNONSTOP_MIN", 25),
        ("FSTOP_MIN", 5),
        ("RIGHT_FIRST", 1.0),
    )
    for constant, value in overrides:
        setattr(harmonic, constant, value)

    return harmonic.initialize(tagonlys)
def initialize_cnf(tagonlys):
    """Initialize via cnf_harmonic after overriding three of its constants.

    HARMONIC_C, FNONSTOP_MIN and FSTOP_MIN are assigned on the cnf_harmonic
    module itself, so the new values stay in effect for any later user of
    that module in the same process.

    Returns whatever cnf_harmonic.initialize(tagonlys) returns.
    """
    # Imported here because we need to change the module's constants
    # (is there a better way?).
    import cnf_harmonic as harmonic

    overrides = (
        ("HARMONIC_C", 0.0),
        ("FNONSTOP_MIN", 25),
        ("FSTOP_MIN", 5),
    )
    for constant, value in overrides:
        setattr(harmonic, constant, value)

    return harmonic.initialize(tagonlys)
def test_likelihood(reestimate, initialize, inner_sent, corpus_size=100):
    """Print the corpus log-likelihood before and after each of 4 reestimations.

    Args:
        reestimate: called as reestimate(g, tagonlys) once per iteration.
            Its return value is ignored (presumably it updates g in place;
            verify against the implementation).
        initialize: called as initialize(tagonlys); the result is the
            grammar g that is handed to reestimate and inner_sent.
        inner_sent: called as inner_sent(g, sent); must return the
            probability of the sentence as a positive number.
        corpus_size: how many sentences to read, starting at sentence 1000.
            NOTE(review): the statement that originally bound corpus_size is
            not visible in this part of the file; 100 is a placeholder
            default -- confirm the intended value.

    Raises:
        ValueError: from math.log, if inner_sent returns 0 (or a negative
            number) for some sentence.
    """
    def corpus_likelihood(g, tagsonly):
        # Sum log-probabilities rather than multiplying probabilities.
        sumlog = 0.0
        for sent in tagsonly:
            p_sent = inner_sent(g, sent)
            sumlog += math.log(p_sent)
        return "Sum of log P_{sentence}: %.4f\n" % sumlog

    def run_IO(iterations, tagonlys, tags_and_parses):
        # g is the grammar bound in the enclosing scope before run_IO is
        # called.  tags_and_parses is only needed if evaluation is enabled:
        #     print(evaluate(g, tags_and_parses))
        print(corpus_likelihood(g, tagonlys))
        for i in range(iterations):
            reestimate(g, tagonlys)
            print("reestimation number %d done" % i)
            print(corpus_likelihood(g, tagonlys))

    reader = WSJDepCorpusReader(None)
    first, last = 1000, 1000 + corpus_size
    tagonlys = reader.tagonly_sents()[first:last]
    tags_and_parses = reader.tagged_and_parsed_sents()[first:last]

    print("initializing %d sentences" % corpus_size)
    g = initialize(tagonlys)

    run_IO(4, tagonlys, tags_and_parses)
def evaluate(g, tagged_and_parsed_sents):
    """Score the most probable parses of g against the given parses.

    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    For every sentence, mpp(g, tagonly_sent) is compared with parsed_sent:

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp (pairs whose first element is MPPROOT
                                  are skipped and not counted)

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R

    A ratio whose denominator is zero (e.g. for empty input) is reported
    as 0.0, and F1 is 0.0 when P + R is zero.

    Returns a three-line string with recall, precision and F1.
    """
    from loc_h_dmv import mpp

    recall_num = 0
    recall_den = 0
    precision_num = 0
    precision_den = 0

    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)

        for pair in parse:
            recall_den += 1
            if pair in mpp_sent:
                recall_num += 1

        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue  # todo: add ROOT to parses? (see below)
            precision_den += 1
            if pair in parse:
                precision_num += 1

        # rooted_parse = add_root(parse) # use? todo
        # print "No single possible root, todo what?"

    # Guard the divisions so that empty input yields zeros, not an error.
    recall = float(recall_num) / recall_den if recall_den else 0.0
    precision = float(precision_num) / precision_den if precision_den else 0.0

    F1 = 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    report = "Recall: %d/%d = %.4f\nPrecision: %d/%d = %.4f\nF1: \t\t%.4f"
    return report % (recall_num, recall_den, recall,
                     precision_num, precision_den, precision,
                     F1)
def compare_loc_h_cnf(corpus_size=100):
    """Check that the loc_h and cnf implementations agree on a corpus slice.

    Both harmonic initializers are run on the same tag-only sentences.
    Their p_ROOT, p_STOP and p_ATTACH tables must hold the same keys and
    values, otherwise a Warning is raised.  The inner probability of each
    sentence, and the outer probability at each word position, are then
    passed as strings to common_dmv.test for comparison.

    Args:
        corpus_size: how many sentences to read, starting at sentence 1000.
            NOTE(review): the statement that originally bound corpus_size is
            not visible in this part of the file; 100 is a placeholder
            default -- confirm the intended value.

    Raises:
        Warning: if an initial probability table differs between the two
            grammars.
    """
    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[1000:1000 + corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    # Each table is checked in both directions so that a key missing on
    # either side is caught.
    tables = [(g_l.p_ROOT, g_c.p_ROOT),
              (g_l.p_STOP, g_c.p_STOP),
              (g_l.p_ATTACH, g_c.p_ATTACH)]
    for table_l, table_c in tables:
        for a, b in ((table_l, table_c), (table_c, table_l)):
            for k, v in a.items():
                if k not in b:
                    raise Warning("a[%s]=%s, but %s not in b" % (k, v, k))
                if b[k] != v:
                    raise Warning("a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k]))

    import loc_h_dmv, cnf_dmv
    for sent in tagonlys:
        i_l = loc_h_dmv.inner_sent(g_l, sent, {})
        i_c = cnf_dmv.inner_sent(g_c, sent, {})
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            # loc_h_dmv.outer takes the head location as an extra argument.
            o_l = loc_h_dmv.outer(loc_w, loc_w + 1, w_node, loc_w, g_l, sent, {}, {})
            o_c = cnf_dmv.outer(loc_w, loc_w + 1, w_node, g_c, sent, {}, {})
            print("%s, %s, %s" % (sent, node_str(w_node), loc_w))
            test("%s" % o_l, "%s" % o_c,
                 "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")

# end compare_loc_h_cnf()
if __name__ == "__main__":
    import loc_h_dmv
    import loc_h_harmonic

    # Run the likelihood experiment with the loc_h implementation.
    dmv, harmonic = loc_h_dmv, loc_h_harmonic
    test_likelihood(dmv.reestimate, harmonic.initialize, dmv.inner_sent)

    print("main.py: done")