rewrote loc_h_harmonic's STOP initialization to reflect report.pdf; simpler now
# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
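# A minimal sketch of that caching idea (hypothetical, not yet in
# loc_h_dmv): keep one ichart per sentence in a module-level dict so that
# evaluate and corpus_likelihood can reuse the charts built during
# reestimation. The cache would have to be flushed after every reestimate,
# since the charts depend on the current grammar.
#
#     ICHARTS = {}
#     def ichart_for(sent):
#         key = tuple(sent)       # lists are unhashable, so key on a tuple
#         return ICHARTS.setdefault(key, {})
#
#     # ...then inner_sent(g, sent, ichart_for(sent)) instead of
#     # inner_sent(g, sent, {}); call ICHARTS.clear() after reestimate.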

from common_dmv import MPPROOT, test, node_str
from wsjdep import WSJDepCorpusReader

#HARMONIC_C: 509.637290698, FNONSTOP_MIN: 30.1124584139, FSTOP_MIN: 13.0830178845
def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    reload(loc_h_harmonic)
    import random
    # loc_h_harmonic.HARMONIC_C = 380.111684914
    # loc_h_harmonic.FSTOP_MIN = 13.5744632704
    # loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
    loc_h_harmonic.HARMONIC_C = random.random() # 0.0 # 509.63 #1000.0 * random.random()
    loc_h_harmonic.FSTOP_MIN = random.random() # 1.0 # 13.08 #20.0 * random.random()
    loc_h_harmonic.STOP_C = random.random()
    loc_h_harmonic.NSTOP_C = random.random()

    loc_h_harmonic.RIGHT_FIRST = 1.0
    loc_h_harmonic.OLD_STOP_CALC = False
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s
RIGHT_FIRST: %s, OLD_STOP_CALC: %s''' % (loc_h_harmonic.HARMONIC_C,
                                         loc_h_harmonic.STOP_C,
                                         loc_h_harmonic.NSTOP_C,
                                         loc_h_harmonic.FSTOP_MIN,
                                         loc_h_harmonic.RIGHT_FIRST,
                                         loc_h_harmonic.OLD_STOP_CALC)
    g = loc_h_harmonic.initialize(tagonlys)
    return g
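# Reproducibility note (not in the original): the random restarts above give
# a different grammar on every run; fixing the RNG seed first, e.g.
#
#     random.seed(1234)   # any fixed seed
#
# before the random.random() calls would make a run repeatable.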
def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    reload(cnf_harmonic)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)

def test_likelihood(reestimate, initialize, inner_sent,
                    corpus_size=20, corpus_offset=1000, iterations=4, eval=False):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        sumlog, msg = corpus_likelihood(g, tagonlys)
        print msg
        if eval: print evaluate(g, tags_and_parses)
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done" % i
            if eval: print evaluate(g, tags_and_parses)

            prev_sumlog = sumlog
            sumlog, msg = corpus_likelihood(g, tagonlys)
            # EM should never decrease corpus likelihood; a drop means a bug
            if sumlog < prev_sumlog:
                raise Exception, msg+"but previous was %s"%prev_sumlog
            print msg
        return g

    def corpus_likelihood(g, tagsonly):
        from math import log
        sumlog = 0.0
        for sent in tagsonly:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!" % sent
            else:
                sumlog += log(p_sent)
        avg = sumlog / len(tagsonly)
        return (sumlog, "Sum of log P_{sentence}: %.4f (should move towards 0), avg: %s\n" % (sumlog, avg))
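    # Sanity check of the numbers corpus_likelihood prints, with hypothetical
    # values: two sentences with P = 0.01 and 0.001 give
    # sumlog = log(0.01) + log(0.001) = -4.6052 + -6.9078 = -11.5129,
    # avg = -5.7565. Every P_{sentence} <= 1, so sumlog <= 0, and each
    # reestimation step should move it towards 0.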

    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size]

    # from loc_h_dmv import testcorpus
    # tagonlys = testcorpus

    print "initializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)
    print "initialized"

    g = run_IO(g, iterations, tagonlys, tags_and_parses) # make iterations argument, todo
    return g

def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R
    '''
    from loc_h_dmv import mpp
    from wsjdep import add_root

    R, R_r, P, P_r = {}, {}, {}, {}
    for nd in ['num', 'den']:
        R[nd], R_r[nd], P[nd], P_r[nd] = 0, 0, 0, 0
    unrooted = 0 # parses where we couldn't add_root

    for sent, gold_parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        try: gold_parse = add_root(gold_parse)
        except ValueError: unrooted += 1

        for pair in gold_parse:
            d = R
            if pair[0] == MPPROOT: d = R_r
            d['den'] += 1
            if pair in mpp_sent: d['num'] += 1

        for pair in mpp_sent:
            d = P
            if pair[0] == MPPROOT: d = P_r
            d['den'] += 1
            if pair in gold_parse: d['num'] += 1

    recall = float(R['num']) / float(R['den'])
    precision = float(P['num']) / float(P['den'])
    recall_r = float(R['num']+R_r['num']) / float(R['den']+R_r['den'])
    precision_r = float(P['num']+P_r['num']) / float(P['den']+P_r['den'])
    F1, F1_r = 0.0, 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)
    if (precision_r + recall_r) > 0.0:
        F1_r = (2 * recall_r * precision_r) / (precision_r + recall_r)

    str_vals = (R['num'],R['den'],recall, R['num']+R_r['num'], R['den']+R_r['den'], recall_r,
                P['num'],P['den'],precision, P['num']+P_r['num'], P['den']+P_r['den'], precision_r,
                F1, F1_r, unrooted)
    return '''Recall: %d/%d = %.4f\tRecall_r: %d/%d = %.4f
Precision: %d/%d = %.4f\tPrecision_r: %d/%d = %.4f
F1: \t\t%.4f\t\tF1_r: \t\t%.4f (unrooted gold parses: %d)''' % str_vals
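# Worked example of the scoring in evaluate, with hypothetical counts: if a
# gold parse has 4 non-root pairs of which the mpp recovers 3, and the mpp
# proposes 5 non-root pairs of which 3 are in gold, then
#
#     recall    = 3/4 = 0.75
#     precision = 3/5 = 0.60
#     F1 = (2 * 0.75 * 0.60) / (0.75 + 0.60) = 0.9 / 1.35 = 0.6667
#
# The _r variants simply fold the MPPROOT pairs into both numerators and
# denominators.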

def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    corpus_offset = 1000
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b.keys(): raise Warning, "a[%s]=%s, but %s not in b" % (k, v, k)
            if (k, v) not in b.iteritems(): raise Warning, "a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k])

    import loc_h_dmv, cnf_dmv
    from common_dmv import GOR
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w+1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w+1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s" % (sent, node_str(w_node), loc_w)
            test("%s" % o_l, "%s" % o_c, "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")
# end compare_loc_h_cnf()

def init_nothing(g, H, S, N, M):
    print '''
HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s''' % (H, S, N, M)
    return lambda corpus: g
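# Usage sketch for init_nothing: it lets test_likelihood continue EM from an
# already-initialized grammar g instead of building a new one, e.g.
#
#     init = init_nothing(g, H, S, N, M)
#     g = test_likelihood(loc_h_dmv.reestimate, init, loc_h_dmv.inner_sent,
#                         corpus_size=6268, corpus_offset=0, iterations=1)
#
# (H, S, N and M are only echoed for logging; the returned lambda ignores
# its corpus argument.)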

def rnd_grammars_test():
    import loc_h_dmv
    reload(loc_h_dmv)

    rnd_grammars0 = []
    for i in xrange(20):
        g = test_likelihood(loc_h_dmv.reestimate,
                            initialize_loc_h,
                            loc_h_dmv.inner_sent,
                            corpus_size=6268,
                            iterations=0,
                            corpus_offset=0,
                            eval=True)
        rnd_grammars0 += [(g, g.HARMONIC_C, g.STOP_C, g.NSTOP_C, g.FSTOP_MIN)]

    rnd_grammars1 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars0]
    rnd_grammars2 = [(test_likelihood(loc_h_dmv.reestimate,
                                      init_nothing(g, H, S, N, M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
                      H, S, N, M)
                     for g, H, S, N, M in rnd_grammars1]
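# Note: rnd_grammars2 is computed but never returned, so the restarted
# grammars are lost when rnd_grammars_test() exits. A possible follow-up
# (hypothetical) would be to return rnd_grammars2, keep the restart whose
# evaluation scores (printed via eval=True above) look best, and run more
# iterations on that grammar alone using init_nothing.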

if __name__ == "__main__":
    print "main.py:"

    # compare_loc_h_cnf()

    # import cnf_dmv
    # reload(cnf_dmv)
    # print "\ntrying cnf-reestimate ##############################"
    # g = test_likelihood(cnf_dmv.reestimate,
    #                     initialize_cnf,
    #                     cnf_dmv.inner_sent,
    #                     corpus_size=5,
    #                     iterations=4)

    rnd_grammars_test()

    # import loc_h_dmv
    # reload(loc_h_dmv)
    # print "\ntrying reestimate v.1 ##############################"
    # g = test_likelihood(loc_h_dmv.reestimate,
    #                     initialize_loc_h,
    #                     loc_h_dmv.inner_sent,
    #                     corpus_size=6268,
    #                     iterations=100,
    #                     corpus_offset=0,
    #                     eval=True)
    # print g

    # print "\ntrying reestimate v.2 ##############################"
    # g = test_likelihood(loc_h_dmv.reestimate2,
    #                     initialize_loc_h,
    #                     loc_h_dmv.inner_sent,
    #                     corpus_size=5,
    #                     iterations=4,
    #                     corpus_offset=0)
    # print "main.py: done"
    # print g