# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
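
# A minimal sketch of the space-for-time idea above. Everything here
# is hypothetical (_ichart_cache and cached_inner_sent are not part of
# loc_h_dmv): reestimate would fill the cache, and evaluate /
# corpus_likelihood would then reuse the stored icharts. The cache
# must be emptied whenever g changes, i.e. once per iteration.
_ichart_cache = {}

def cached_inner_sent(g, sent, inner_sent):
    # key on the tag sequence, so repeated sentences share one chart
    key = tuple(sent)
    ichart = _ichart_cache.setdefault(key, {})
    return inner_sent(g, sent, ichart)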

from common_dmv import MPPROOT, GOR, test, node_str
from wsjdep import WSJDepCorpusReader

def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    loc_h_harmonic.HARMONIC_C = 0.0
    loc_h_harmonic.FNONSTOP_MIN = 25
    loc_h_harmonic.FSTOP_MIN = 5
    loc_h_harmonic.RIGHT_FIRST = 1.0
    return loc_h_harmonic.initialize(tagonlys)

def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)
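
# One possible answer to "is there a better way?": pass the constants
# as arguments instead of mutating module globals, so two
# initializations cannot leak settings into each other. Only a sketch;
# cnf_harmonic.initialize would first have to grow these keyword
# arguments, so the refactor stays commented out:
#
# def initialize_cnf(tagonlys, harmonic_c=0.0, fnonstop_min=25, fstop_min=5):
#     import cnf_harmonic
#     return cnf_harmonic.initialize(tagonlys, harmonic_c=harmonic_c,
#                                    fnonstop_min=fnonstop_min,
#                                    fstop_min=fstop_min)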

def test_likelihood(reestimate, initialize, inner_sent, corpus_size=20, corpus_offset=1000):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        print corpus_likelihood(g, tagonlys)
        # print evaluate(g, tags_and_parses)
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done" % i
            # print evaluate(g, tags_and_parses)
            print corpus_likelihood(g, tagonlys)
        return g

    def corpus_likelihood(g, tagonlys):
        from math import log
        sumlog = 0.0
        for sent in tagonlys:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!" % sent
            else:
                sumlog += log(p_sent)
        return "Sum of log P_{sentence}: %.4f (should move towards 0)\n" % sumlog

    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size]

    # from loc_h_dmv import testcorpus
    # tagonlys = testcorpus

    print "initializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)
    print "initialized"

    g = run_IO(g, 4, tagonlys, tags_and_parses)
    return g

def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R
    '''
    from loc_h_dmv import mpp

    recall_num = 0
    recall_den = 0
    precision_num = 0
    precision_den = 0

    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        for pair in parse:
            recall_den += 1
            if pair in mpp_sent: recall_num += 1
        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue # todo: add ROOT to parses? (see below)
            precision_den += 1
            if pair in parse: precision_num += 1

        # try:
        #     rooted_parse = add_root(parse) # use? todo
        # except:
        #     print "No single possible root, todo what?"

    recall = float(recall_num) / float(recall_den)
    precision = float(precision_num) / float(precision_den)
    F1 = 0.0
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    return '''Recall: %d/%d = %.4f
Precision: %d/%d = %.4f
F1: \t\t%.4f''' % (recall_num, recall_den, recall,
                   precision_num, precision_den, precision, F1)
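
# A hypothetical add_root for the todo in evaluate above: a parse has
# a single possible root iff exactly one word never occurs as a
# dependent. This assumes pairs are (head, dependent), matching the
# pair[0] == MPPROOT check on the mpp output.
def add_root(parse):
    heads = set([pair[0] for pair in parse])
    deps = set([pair[1] for pair in parse])
    roots = [head for head in heads if head not in deps]
    if len(roots) != 1:
        raise RuntimeError, "no single possible root in %s" % (parse,)
    return set(parse) | set([(MPPROOT, roots[0])])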

def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 200
    corpus_offset = 1000
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b: raise Warning, "a[%s]=%s, but %s not in b" % (k, v, k)
            if b[k] != v: raise Warning, "a[%s]=%s, but b[%s]=%s" % (k, v, k, b[k])

    import loc_h_dmv, cnf_dmv
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s" % i_l, "%s" % i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w+1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w+1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s" % (sent, node_str(w_node), loc_w)
            test("%s" % o_l, "%s" % o_c, "o_l(0,1,(GOR,%s),%d,...)" % (w, loc_w), "o_c")
# end compare_loc_h_cnf()

if __name__ == "__main__":
    print "main.py:"
    # compare_loc_h_cnf()

    import cnf_dmv
    print "\ntrying cnf-reestimate ##############################"
    g = test_likelihood(cnf_dmv.reestimate,
                        initialize_cnf,
                        cnf_dmv.inner_sent,
                        corpus_size=20)

    import loc_h_dmv
    print "\ntrying reestimate v.1 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20)

    print "\ntrying reestimate v.2 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate2,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20)

    print "main.py: done"