rewrote loc_h_harmonic's STOP initialization to reflect report.pdf; simpler now
author Kevin Brubeck Unhammer <pixiemotion@gmail.com>
Thu, 18 Sep 2008 10:00:43 +0000 (12:00 +0200)
committer Kevin Brubeck Unhammer <pixiemotion@gmail.com>
Thu, 18 Sep 2008 10:00:43 +0000 (12:00 +0200)
report/report.pdf
report/report.tex
src/loc_h_dmv.py
src/loc_h_dmv.pyc
src/loc_h_harmonic.py
src/main.py

diff --git a/report/report.pdf b/report/report.pdf
index d5bf696..44b6911 100644
Binary files a/report/report.pdf and b/report/report.pdf differ
diff --git a/report/report.tex b/report/report.tex
index 0b02970..df06de3 100644
@@ -375,19 +375,19 @@ border\footnote{For non-adjacent stopping we checked for occurrence at
 
 \begin{align*}
   f(stop:\LOC{h},left,adj)=\begin{cases}
-    1 \text{, if } loc(\LOC{h}) = 0,\\
+    C_S \text{, if } loc(\LOC{h}) = 0,\\
     0 \text{, otherwise}
   \end{cases}
 \end{align*}
 
 \begin{align*}
   P_{STOP}(stop|h,left,adj) = \frac
-  {C_S + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)} 
-  {C_S + C_N + \sum_{s \in S}\sum_{\LOC{h} \in s} 1}
+  {C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)} 
+  {C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} C_N}
 \end{align*}
 
 \subsection{TODO: Results}
-We tried various values for the initialization constants $C_A, C_S$
+We tried various values for the initialization constants $C_A, C_M, C_S$
 and $C_N$; but it was hard to find any clear pattern for what worked
 best. 
 
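For reference, a minimal sketch (ours, not part of the commit) of what the new formula computes: a smoothed P_STOP where each observed stop position contributes C_S to the numerator and every occurrence of the head contributes C_N to the denominator, with C_M as additive smoothing on both. The helper name is ours, and the mapping of the report's C_M, C_S, C_N onto FSTOP_MIN, STOP_C and NSTOP_C in loc_h_harmonic.py below is our reading of the diff:

    # hedged sketch of the new P_STOP initialization (left, adjacent case)
    def p_stop_left_adj(corpus, h, C_M=1.0, C_S=1.0, C_N=1.0):
        stops = sum(C_S for sent in corpus
                    for loc_h, head in enumerate(sent)
                    if head == h and loc_h == 0)    # h is sentence-initial
        occurrences = sum(C_N for sent in corpus
                          for head in sent if head == h)
        return (C_M + stops) / (C_M + occurrences)

    corpus = [['n', 'v', 'n'], ['v', 'n']]
    # 'n' starts one sentence and occurs three times in total:
    # (1.0 + 1.0) / (1.0 + 3.0) = 0.5
    assert p_stop_left_adj(corpus, 'n') == 0.5
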
diff --git a/src/loc_h_dmv.py b/src/loc_h_dmv.py
index 99b6879..b6f9ac0 100644
@@ -38,7 +38,9 @@ class DMV_Grammar(io.Grammar):
         def t(n):
             return "%d=%s" % (n, self.numtag(n))
         def p(dict,key):
-            if key in dict: return dict[key]
+            if key in dict:
+                if dict[key] > 1.0: raise Exception, "probability > 1.0:%s"%key
+                return dict[key]
             else: return 0.0
         def no_zeroL(str,tagstr,prob):
             if prob > 0.0: return (str%(tagstr,prob)).ljust(LJUST)
@@ -885,7 +887,7 @@ def testgrammar():
     loc_h_harmonic.FNONSTOP_MIN = 25
     loc_h_harmonic.FSTOP_MIN = 5
     loc_h_harmonic.RIGHT_FIRST = 1.0 
-    loc_h_harmonic.OTHER_STOP_CALC = False
+    loc_h_harmonic.OLD_STOP_CALC = True
     
     return loc_h_harmonic.initialize(testcorpus)
 
@@ -1056,8 +1058,16 @@ def compare_grammars(g1,g2):
 
 
 def testNVNgrammar():
-    from loc_h_harmonic import initialize
-    g = initialize(['n v n'.split()])
+    import loc_h_harmonic
+
+    # make sure these are the way they were when setting up the tests:
+    loc_h_harmonic.HARMONIC_C = 0.0
+    loc_h_harmonic.FNONSTOP_MIN = 25
+    loc_h_harmonic.FSTOP_MIN = 5
+    loc_h_harmonic.RIGHT_FIRST = 1.0 
+    loc_h_harmonic.OLD_STOP_CALC = True
+    
+    g = loc_h_harmonic.initialize(['n v n'.split()])
     return g # todo
 
 def testIO():
@@ -1086,6 +1096,14 @@ if __name__ == "__main__":
 #     g1 = testreestimation()
 #     g2 = testreestimation2()
 #     print compare_grammars(g1,g2)
+
+
+
+
+
+
+
+if False:
     g = testNVNgrammar()
     q_sent = inner_sent(g,'n v n'.split(),{})
     q_tree = {}
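The added check in p() makes grammar pretty-printing fail fast whenever a stored rule probability exceeds 1.0, presumably to surface normalization bugs in the new STOP initialization early. A minimal standalone sketch (ours) of the same guard:

    def p(d, key):
        # look up a probability, defaulting missing keys to 0.0, but
        # refuse to silently format an impossible value
        if key in d:
            if d[key] > 1.0:
                raise Exception("probability > 1.0: %s" % (key,))
            return d[key]
        return 0.0

    assert p({('n', 'STOP'): 0.5}, ('v', 'STOP')) == 0.0
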
diff --git a/src/loc_h_dmv.pyc b/src/loc_h_dmv.pyc
index 328748c..dcdc0ba 100644
Binary files a/src/loc_h_dmv.pyc and b/src/loc_h_dmv.pyc differ
diff --git a/src/loc_h_harmonic.py b/src/loc_h_harmonic.py
index 1b3eb1e..8932cc9 100644
@@ -2,11 +2,15 @@
 
 from loc_h_dmv import * # better way to do this?
 
-# todo: tweak these (0.0 on FSTOP_MIN gives too many zero-probabilities though)
+# todo: tweak these
 HARMONIC_C = 0.0
-FNONSTOP_MIN = 0.0
+STOP_C = 1.0
+NSTOP_C = 1.0
 FSTOP_MIN = 1.0
-OTHER_STOP_CALC = True
+
+FNONSTOP_MIN = 0.0 # for OLD_STOP_CALC. 0.0 on FSTOP_MIN gives many
+                   # zero-probabilities for OLD_STOP_CALC
+OLD_STOP_CALC = True
 
 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only 
 
@@ -45,8 +49,10 @@ def init_zeros(tags):
         f['sum', 'ROOT'] = 0
         for dir in [LEFT, RIGHT]:
             for adj in [ADJ, NON]:
-                f[tag, 'STOP', dir, adj] = FSTOP_MIN
-                f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
+                f[tag, 'STOP', dir, adj] = 0.0
+                if OLD_STOP_CALC:
+                    f[tag, 'STOP', dir, adj] = FSTOP_MIN
+                    f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
         f[tag, RIGHT] = {}
         f[tag, LEFT] = {}
         f[tag, 'sum', RIGHT] = 0.0
@@ -78,19 +84,10 @@ def init_freq(corpus, tags):
         for loc_h, head in enumerate(sent): 
             # todo grok: how is this different from just using straight head
             # frequency counts, for the ROOT probabilities?
-            f['ROOT', head] += 1
-            f['sum', 'ROOT'] += 1
+            f['ROOT', head] += 1.0
+            f['sum', 'ROOT'] += 1.0
             
-            if OTHER_STOP_CALC:
-                f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
-                f[head, '-STOP',  LEFT,NON] += (loc_h  > 1)     # after second
-                f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
-                f[head, '-STOP',  LEFT,ADJ] += (loc_h  > 0)     # not first
-                f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
-                f[head, '-STOP', RIGHT,NON] += (loc_h  < n - 1) # before second-to-last
-                f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
-                f[head, '-STOP', RIGHT,ADJ] += (loc_h  < n)     # not last
-            else: # note: int(True) == 1
+            if OLD_STOP_CALC:
                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
                 f[head, '-STOP',  LEFT,NON] += (not loc_h == 1) # not second
                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
@@ -99,6 +96,20 @@ def init_freq(corpus, tags):
                 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)         # last word
                 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n)     # not last
+            else:
+                f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
+                f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
+                f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
+                f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
+#             elif OTHER_STOP_CALC:
+#                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
+#                 f[head, '-STOP',  LEFT,NON] += (loc_h  > 1)     # after second
+#                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
+#                 f[head, '-STOP',  LEFT,ADJ] += (loc_h  > 0)     # not first
+#                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
+#                 f[head, '-STOP', RIGHT,NON] += (loc_h  < n - 1) # before second-to-last
+#                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
+#                 f[head, '-STOP', RIGHT,ADJ] += (loc_h  < n)     # not last
             
             # this is where we make the "harmonic" distribution. quite.
             for loc_a, arg in enumerate(sent):
@@ -118,32 +129,34 @@ def init_freq(corpus, tags):
     return f 
 
 def init_normalize(f, tags, numtag, tagnum):
-    '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH
-    and p_GO_AT (which is (1-p_STOP)*p_ATTACH).
-
-    Return a usable DMV_Grammar2.'''
+    '''Use frequencies (and sums) in f to create p_STOP, p_ATTACH,
+    p_ROOT.
+    
+    Return a usable DMV_Grammar.'''
     p_rules = []
     p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
     for h, head in numtag.iteritems():
-        p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']
+        # f['ROOT', head] is just a simple frequency count
+        p_ROOT[h] = f['ROOT', head] / f['sum', 'ROOT']
         
-        # p_STOP = STOP / (STOP + NOT_STOP)
         for dir in [LEFT,RIGHT]:
             for adj in [NON,ADJ]:
-                den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
-                if den > 0.0:
-                    p_STOP[h, dir, adj] = float(f[head, 'STOP', dir, adj]) / float(den)
-                else:
-                    p_STOP[h, dir, adj] = 1.0
-                    
+                if OLD_STOP_CALC:
+                    den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
+                    if den > 0.0:
+                        p_STOP[h, dir, adj] = f[head, 'STOP', dir, adj] / den
+                    else: p_STOP[h, dir, adj] = 1.0
+                else: 
+                    p_STOP[h, dir, adj] = \
+                        (FSTOP_MIN + f[head, 'STOP', dir, adj] * STOP_C) / \
+                        (FSTOP_MIN + f['ROOT',head] * NSTOP_C)
                     
-            
         p_ORDER[GOR, h] = RIGHT_FIRST
         p_ORDER[GOL, h] = 1 - RIGHT_FIRST
 
         for dir in [LEFT, RIGHT]:
             for arg, val in f[head, dir].iteritems():
-                p_ATTACH[tagnum[arg], h, dir] = float(val) / f[head,'sum',dir]
+                p_ATTACH[tagnum[arg], h, dir] = val / f[head,'sum',dir]
 
     return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
 
@@ -159,6 +172,8 @@ def initialize(corpus):
     g = init_normalize(f, tags, numtag, tagnum)
 
     g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
+    g.STOP_C = STOP_C
+    g.NSTOP_C = NSTOP_C
     g.FNONSTOP_MIN = FNONSTOP_MIN
     g.FSTOP_MIN = FSTOP_MIN
     
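Taken together, the new code path is driven entirely by module-level constants set before initialize() is called; a minimal usage sketch (ours), mirroring how testgrammar() above and initialize_loc_h in main.py below pin them:

    import loc_h_harmonic

    loc_h_harmonic.OLD_STOP_CALC = False  # use the new report.pdf formula
    loc_h_harmonic.HARMONIC_C = 0.0
    loc_h_harmonic.FSTOP_MIN = 1.0        # C_M in the report (our reading)
    loc_h_harmonic.STOP_C = 1.0           # C_S
    loc_h_harmonic.NSTOP_C = 1.0          # C_N
    loc_h_harmonic.RIGHT_FIRST = 1.0

    g = loc_h_harmonic.initialize([['n', 'v', 'n']])
    # the constants are stored on g for later evaluation in main.py
    assert g.STOP_C == 1.0 and g.NSTOP_C == 1.0
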
diff --git a/src/main.py b/src/main.py
index 3facf68..b5b6336 100644
@@ -15,19 +15,21 @@ def initialize_loc_h(tagonlys):
 #     loc_h_harmonic.HARMONIC_C = 380.111684914
 #     loc_h_harmonic.FSTOP_MIN = 13.5744632704
 #     loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
-    loc_h_harmonic.HARMONIC_C = 0 # 509.63 #1000.0 * random.random()
-    loc_h_harmonic.FSTOP_MIN = 13.08 #20.0 * random.random()
-    loc_h_harmonic.FNONSTOP_MIN = 30.11 #50.0 * random.random() + loc_h_harmonic.FSTOP_MIN
+    loc_h_harmonic.HARMONIC_C = random.random() # 0.0 # 509.63 #1000.0 * random.random()
+    loc_h_harmonic.FSTOP_MIN = random.random() # 1.0 # 13.08 #20.0 * random.random()
+    loc_h_harmonic.STOP_C = random.random()
+    loc_h_harmonic.NSTOP_C = random.random()
 
     loc_h_harmonic.RIGHT_FIRST = 1.0
-    loc_h_harmonic.OTHER_STOP_CALC = False
+    loc_h_harmonic.OLD_STOP_CALC = False
     print '''
-HARMONIC_C: %s, FNONSTOP_MIN: %s, FSTOP_MIN: %s
-RIGHT_FIRST: %s, OTHER_STOP_CALC: %s'''%(loc_h_harmonic.HARMONIC_C,
-                                         loc_h_harmonic.FNONSTOP_MIN,
-                                         loc_h_harmonic.FSTOP_MIN,
-                                         loc_h_harmonic.RIGHT_FIRST,
-                                         loc_h_harmonic.OTHER_STOP_CALC)
+HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s
+RIGHT_FIRST: %s, OLD_STOP_CALC: %s'''%(loc_h_harmonic.HARMONIC_C,
+                                       loc_h_harmonic.STOP_C,
+                                       loc_h_harmonic.NSTOP_C,
+                                       loc_h_harmonic.FSTOP_MIN,
+                                       loc_h_harmonic.RIGHT_FIRST,
+                                       loc_h_harmonic.OLD_STOP_CALC)
     g = loc_h_harmonic.initialize(tagonlys)
     return g
 
@@ -184,9 +186,9 @@ def compare_loc_h_cnf():
 # end compare_loc_h_cnf()
 
 
-def init_nothing(g,H,N,S):
+def init_nothing(g,H,S,N,M):
     print '''
-HARMONIC_C: %s, FNONSTOP_MIN: %s, FSTOP_MIN: %s'''%(H,N,S)
+HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s'''%(H,S,N,M)
     return lambda corpus:g
 
 def rnd_grammars_test():
@@ -202,26 +204,26 @@ def rnd_grammars_test():
                             iterations=0,
                             corpus_offset=0,
                             eval=True)
-        rnd_grammars0 += [(g, g.HARMONIC_C, g.FNONSTOP_MIN, g.FSTOP_MIN)]
+        rnd_grammars0 += [(g, g.HARMONIC_C, g.STOP_C, g.NSTOP_C, g.FSTOP_MIN)]
 
     rnd_grammars1 = [(test_likelihood(loc_h_dmv.reestimate,
-                                      init_nothing(g,H,N,S),
+                                      init_nothing(g,H,S,N,M),
                                       loc_h_dmv.inner_sent,
                                       corpus_size=6268,
                                       iterations=1,
                                       corpus_offset=0,
                                       eval=True),
-                      H,N,S)
-                    for g,H,N,S in rnd_grammars0]
+                      H,S,N,M)
+                    for g,H,S,N,M in rnd_grammars0]
     rnd_grammars2 = [(test_likelihood(loc_h_dmv.reestimate,
-                                      init_nothing(g,H,N,S),
+                                      init_nothing(g,H,S,N,M),
                                       loc_h_dmv.inner_sent,
                                       corpus_size=6268,
                                       iterations=1,
                                       corpus_offset=0,
                                       eval=True),
-                      H,N,S)
-                    for g,H,N,S in rnd_grammars1]
+                      H,S,N,M)
+                    for g,H,S,N,M in rnd_grammars1]
 
 if __name__ == "__main__":
     print "main.py:"
@@ -237,24 +239,26 @@ if __name__ == "__main__":
 #                         corpus_size=5,
 #                         iterations=4)
 
-    import loc_h_dmv
+    rnd_grammars_test()
+
+#     import loc_h_dmv
 #     reload(loc_h_dmv)
-#     rnd_grammars_test()
-    print "\ntrying reestimate v.1 ##############################"
-    g = test_likelihood(loc_h_dmv.reestimate,
-                        initialize_loc_h,
-                        loc_h_dmv.inner_sent,
-                        corpus_size=5,
-                        iterations=4,
-                        corpus_offset=0,
-                        eval=True)
-    print g 
-
-    print "\ntrying reestimate v.2 ##############################"
-    g = test_likelihood(loc_h_dmv.reestimate2,
-                        initialize_loc_h,
-                        loc_h_dmv.inner_sent,
-                        corpus_size=5,
-                        iterations=4,
-                        corpus_offset=0)
-    print "main.py: done"
+#     print "\ntrying reestimate v.1 ##############################"
+#     g = test_likelihood(loc_h_dmv.reestimate,
+#                         initialize_loc_h,
+#                         loc_h_dmv.inner_sent,
+#                         corpus_size=6268,
+#                         iterations=100,
+#                         corpus_offset=0,
+#                         eval=True)
+#     print g 
+
+#     print "\ntrying reestimate v.2 ##############################"
+#     g = test_likelihood(loc_h_dmv.reestimate2,
+#                         initialize_loc_h,
+#                         loc_h_dmv.inner_sent,
+#                         corpus_size=5,
+#                         iterations=4,
+#                         corpus_offset=0)
+#     print "main.py: done"
+#     print g