src/main.py

   1 # Todo: since we evaluate _after_ we reestimate, we loose the icharts
   2 # made while reestimating. If we had these available, evaluate and
   3 # corpus_likelihood would be a lot faster, but since they need to run
   4 # _after_ reestimate, we'll have to store an ichart per sentence. So
   5 # try storing those icharts in some loc_h_dmv global, and see if it's
   6 # faster using space rather than time.
   7
   8 from common_dmv import MPPROOT, GOR, test, node_str
   9 from wsjdep import WSJDepCorpusReader
  10
  11 def initialize_loc_h(tagonlys):
  12     import loc_h_harmonic # since we need to change constants (is there a better way?)
  13     loc_h_harmonic.HARMONIC_C = 0.0
  14     loc_h_harmonic.FNONSTOP_MIN = 25
  15     loc_h_harmonic.FSTOP_MIN = 5
  16     loc_h_harmonic.RIGHT_FIRST = 1.0
  17     return loc_h_harmonic.initialize(tagonlys)
  18
  19 def initialize_cnf(tagonlys):
  20     import cnf_harmonic # since we need to change constants (is there a better way?)
  21     cnf_harmonic.HARMONIC_C = 0.0
  22     cnf_harmonic.FNONSTOP_MIN = 25
  23     cnf_harmonic.FSTOP_MIN = 5
  24     return cnf_harmonic.initialize(tagonlys)
  25
  26
  27 def test_likelihood(reestimate, initialize, inner_sent):
  28     def run_IO_cnf(iterations, tagonlys, tags_and_parses):
  29         print corpus_likelihood(g, tagonlys)
  30         #print evaluate(g, tags_and_parses)
  31         for i in range(iterations):
  32             f = reestimate(g, tagonlys)
  33             print "reestimation number %d done"%i
  34             #print evaluate(g, tags_and_parses)
  35             print corpus_likelihood(g, tagonlys)
  36
  37     def corpus_likelihood_cnf(g, tagsonly):
  38         from math import log
  39         sumlog = 0.0
  40         for sent in tagsonly:
  41             p_sent = inner_sent(g, sent)
  42             sumlog += math.log(p_sent)
  43         return "Sum of log P_{sentence}: %.4f\n"%sumlog
  44
  45     reader = WSJDepCorpusReader(None)
  46     corpus_size = 100
  47     tagonlys = reader.tagonly_sents()[1000:1000+corpus_size]
  48     tags_and_parses = reader.tagged_and_parsed_sents()[1000:1000+corpus_size]
  49
  50     print "initializing %d sentences"%corpus_size
  51     g = initialize(tagonlys)
  52     print "initialized"
  53
  54     run_IO(4, tagonlys, tags_and_parses)
  55
  56
  57
  58 def evaluate(g, tagged_and_parsed_sents):
  59     '''
  60     tagged_and_parsed_sents is a list of pairs:
  61     (tagonly_sent, parsed_sent)
  62
  63     R_num += 1 if pair from parsed is in mpp
  64     R_den += 1 per pair from parsed
  65
  66     P_num += 1 if pair from mpp is in parsed
  67     P_den += 1 per pair from mpp
  68
  69     F1 = (2 * P * R)/(P + R), harmonisk snitt av P og R
  70     '''
  71     from loc_h_dmv import mpp
  72
  73     recall_num = 0
  74     recall_den = 0
  75     precision_num = 0
  76     precision_den = 0
  77
  78     for sent, parse in tagged_and_parsed_sents:
  79         mpp_sent = mpp(g, sent)
  80         for pair in parse:
  81             recall_den += 1
  82             if pair in mpp_sent: recall_num += 1
  83         for pair in mpp_sent:
  84             if pair[0] == MPPROOT:
  85                 continue # todo: add ROOT to parses? (see below)
  86             precision_den += 1
  87             if pair in parse: precision_num += 1
  88
  89 #             try:
  90 #                 rooted_parse = add_root(parse) # use? todo
  91 #             except:
  92 #                 print "No single possible root, todo what?"
  93
  94     recall = float(recall_num) / float(recall_den)
  95     precision = float(precision_num) / float(precision_den)
  96     F1 = 0.0
  97     if (precision + recall) > 0.0:
  98         F1 = (2 * recall * precision) / (precision + recall)
  99
 100     return '''Recall: %d/%d = %.4f
 101 Precision: %d/%d = %.4f
 102 F1: \t\t%.4f'''%(recall_num,recall_den,recall,precision_num,precision_den, precision, F1)
 103
 104
 105
 106
 107 def compare_loc_h_cnf():
 108     reader = WSJDepCorpusReader(None)
 109     corpus_size = 200
 110     tagonlys = reader.tagonly_sents()[1000:1000+corpus_size]
 111
 112     import loc_h_harmonic, cnf_harmonic
 113     g_l = loc_h_harmonic.initialize(tagonlys)
 114     g_c = cnf_harmonic.initialize(tagonlys)
 115
 116     initials = [
 117          (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
 118          (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
 119          (g_l.p_STOP.iteritems(), g_c.p_STOP),
 120          (g_c.p_STOP.iteritems(), g_l.p_STOP),
 121          (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
 122          (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
 123     for a_items, b in initials:
 124         for k,v in a_items:
 125             if k not in b.keys(): raise Warning, "a[%s]=%s, but %s not in b"(k,v,k)
 126             if (k,v) not in b.iteritems(): raise Warning, "a[%s]=%s, but b[%s]=%s"(k,v,k,b[k])
 127
 128
 129     import loc_h_dmv, cnf_dmv
 130     for sent in tagonlys:
 131         i_l = loc_h_dmv.inner_sent(g_l, sent, {})
 132         i_c = cnf_dmv.inner_sent(g_c, sent, {})
 133         test( "%s"%i_l, "%s"%i_c, "i_l","i_c")
 134
 135         for loc_w,w in enumerate(sent):
 136             w_node = (GOR, g_l.tagnum(w))
 137             o_l = loc_h_dmv.outer(loc_w,loc_w+1,w_node,loc_w, g_l, sent, {},{})
 138             o_c =   cnf_dmv.outer(loc_w,loc_w+1,w_node,       g_c, sent, {},{})
 139             print "%s, %s, %s"%(sent,node_str(w_node),loc_w)
 140             test("%s"%o_l, "%s"%o_c, "o_l(0,1,(GOR,%s),%d,...)"%(w,loc_w),"o_c")
 141
 142 # end compare_loc_h_cnf()
 143
 144 if __name__ == "__main__":
 145     print "main.py:"
 146     #compare_loc_h_cnf()
 147     import loc_h_dmv, loc_h_harmonic
 148     test_likelihood(loc_h_dmv.reestimate,
 149                     loc_h_harmonic.initialize,
 150                     loc_h_dmv.inner_sent)
 151     print "main.py: done"