DMV/CCM – todo-list / progress

From d1472a068edf1088a7e439b2f81c0f34b2305fda Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Fri, 6 Jun 2008 12:54:40 +0200 Subject: [PATCH] started working on new version of inner --- DMVCCM.html | 36 ++++++++++++--- DMVCCM.org | 17 ++++++-- src/dmv.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++------- src/dmv.pyc | Bin 14599 -> 14782 bytes 4 files changed, 169 insertions(+), 26 deletions(-) diff --git a/DMVCCM.html b/DMVCCM.html index 6cf8768..d072e6a 100755 --- a/DMVCCM.html +++ b/DMVCCM.html @@ -6,7 +6,7 @@ lang="en" xml:lang="en"> DMV/CCM – todo-list / progress - + @@ -132,12 +132,30 @@ b[(LRBAR, n_h), 'h'] = h_.probA * _ h_.probA

How is the P_STOP formula different given other values for dir and adj?
-(Presumably, the P_STOP formula where STOP is True is just the -rule-probability of _ h_ -> STOP h_ or h_ -> h STOP, but how does -adjacency fit in here?) +Assuming this: +

+P_STOP(STOP|h,L,non_adj) = ∑_corpus ∑_s<loc(h) ∑_t +inner(s,t,(LRBAR,h)…) / ∑_corpus ∑_s<loc(h) ∑_t inner(s,t,(RBAR,h)…) +
+P_STOP(STOP|h,L,adj) = ∑_corpus ∑_s=loc(h) ∑_t +inner(s,t,(LRBAR,h)…) / ∑_corpus ∑_s=loc(h) ∑_t inner(s,t,(RBAR,h)…) +
+P_STOP(STOP|h,R,non_adj) = ∑_corpus ∑_s ∑_t>loc(h) +inner(s,t,(LRBAR,h)…) / ∑_corpus ∑_s ∑_t>loc(h) inner(s,t,(RBAR,h)…) +
+P_STOP(STOP|h,R,adj) = ∑_corpus ∑_s ∑_t=loc(h) +inner(s,t,(LRBAR,h)…) / ∑_corpus ∑_s ∑_t=loc(h) inner(s,t,(RBAR,h)…) -
-(And P_STOP(-STOP|…) = 1 - P_STOP(STOP|…) ) + + +

+ +

(And P_STOP(-STOP|…) = 1 - P_STOP(STOP|…) )

@@ -445,6 +463,10 @@ Then push stuff up to the remote server:

+(eval `ssh-agent` and ssh-add to avoid having to type in keyphrase all +the time) +

Make a copy of the (remote) master branch:

  git clone git://repo.or.cz/dmvccm.git 
@@ -483,6 +505,6 @@ Good tutorial:
  Author: Kevin Brubeck Unhammer
 <K.BrubeckUnhammer at student uva nl >
 
- Date: 2008/06/05 12:49:53
+ Date: 2008/06/06 11:55:47
 
Skrive vha. emacs + org-mode
 
diff --git a/DMVCCM.org b/DMVCCM.org
index cf96a18..c060a97 100755
--- a/DMVCCM.org
+++ b/DMVCCM.org
@@ -67,9 +67,17 @@ have to be updated along with the other P_{STOP} updates:
 - b[(LRBAR, n_{h}), 'h'] = h_.probA * _ h_.probA
 
 ** How is the P_STOP formula different given other values for dir and adj?
-(Presumably, the P_{STOP} formula where STOP is True is just the
-rule-probability of _ h_ -> STOP h_ or h_ -> h STOP, but how does
-adjacency fit in here?)
+Assuming this:
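+- P_{STOP}(STOP|h,L,non_adj) = \sum_{corpus} \sum_{s<loc(h)} \sum_{t}
+  inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s<loc(h)} \sum_{t} inner(s,t,(RBAR,h)...)
+- P_{STOP}(STOP|h,L,adj) = \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
+  inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{t} inner(s,t,(RBAR,h)...)
+- P_{STOP}(STOP|h,R,non_adj) = \sum_{corpus} \sum_{s} \sum_{t>loc(h)}
+  inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s} \sum_{t>loc(h)} inner(s,t,(RBAR,h)...)
+- P_{STOP}(STOP|h,R,adj) = \sum_{corpus} \sum_{s} \sum_{t=loc(h)}
+  inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s} \sum_{t=loc(h)} inner(s,t,(RBAR,h)...)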
+
+
 
 (And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) )
 * TODO P_CHOOSE for IO/EM
@@ -252,6 +260,9 @@ Later on: (=-a= does =git rm= and =git add= automatically)
 Then push stuff up to the remote server:
 : git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master
 
+(=eval `ssh-agent`= and =ssh-add= to avoid having to type in keyphrase all
+the time)
+
 Make a copy of the (remote) master branch:
 : git clone git://repo.or.cz/dmvccm.git 
 
diff --git a/src/dmv.py b/src/dmv.py
index 4961cea..79a6381 100755
--- a/src/dmv.py
+++ b/src/dmv.py
@@ -350,6 +350,107 @@ def inner_dmv(s, t, LHS, g, sent, chart):
 
 
 
+def inner_dmv2(s, t, LHS, loc_h, g, sent, chart):
+    ''' A rewrite of inner in io.py, to take adjacency into account.
+
+    The chart is keyed on chart[(s,t,LHS, Lattach, Rattach)], so one
+    (s,t,LHS) span can hold up to four entries, one per combination
+    of the two attachment flags.  If Rattach==True then the rule has
+    a right-attachment or there is one lower in the tree (meaning
+    we're no longer adjacent), cf. DMV_Rule.p(LRattach, RLattach).
+    loc_h is accepted but not read anywhere in this function yet.
+    Returns [inner_prob, chart]: inner_prob sums e(s,t,LHS,...) over
+    all four flag combinations; chart is the dict passed in, updated.
+
+    Todo: if possible, refactor (move dmv-specific stuff back into
+    dmv, so this is "general" enough to be in io.py)
+    '''
+    
+    def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
+        if io.DEBUG:
+            attach = {
+                (True, True): "left and right attachments below",
+                (True, False): "left attachment(s) below",
+                (False, True): "right attachment(s) below",
+                (False, False): "no attachments below" }
+            info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
+            print "%sTrying from  %s_%d  to  %s_%d  with %s, %s:" % info 
+            
+    def O(s):
+        return sent[s]
+
+    sent_nums = [g.tagnum(tag) for tag in sent]
+    
+    def e(s,t,LHS, Lattach, Rattach, n_t):
+        def tab():
+            "Tabs for debug output"
+            return "\t"*n_t
+        
+        if (s, t, LHS, Lattach, Rattach) in chart:
+            return chart[(s, t, LHS, Lattach, Rattach)]
+        else:
+            debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
+            if s == t:
+                if Lattach or Rattach:
+                    # terminals are always F,F for attachment
+                    io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
+                    return 0.0
+                elif (LHS, O(s)) in g.p_terminals:
+                    prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
+                else:
+                    # todo: assuming this is how to deal with lacking
+                    # rules, since we add prob.s, and 0 is identity
+                    prob = 0.0 
+                    io.debug( "%sLACKING TERMINAL:" % tab())
+                # todo: add to chart perhaps? Although, it _is_ simple lookup..
+                io.debug( "%s= %.1f (terminal: %s -> %s)" % (tab(),prob,
+                                                             DMV_Rule.bar_str(LHS),
+                                                             O(s)) )
+                return prob
+            else:
+                if (s,t,LHS,Lattach, Rattach) not in chart:
+                    chart[(s,t,LHS,Lattach,Rattach)] = 0.0
+                for rule in g.sent_rules(LHS, sent_nums): # summing over j,k in a[LHS,j,k]
+                    io.debug( "%ssumming rule %s" % (tab(),rule) ) 
+                    L = rule.L()
+                    R = rule.R()
+                    # if it's a STOP rule, rewrite for the same range:
+                    if (L == STOP) or (R == STOP):
+                        if L == STOP:
+                            p = rule.p(Lattach, False) # todo check
+                            pLR = e(s, t, R, Lattach, Rattach, n_t+1)
+                        elif R == STOP:
+                            p = rule.p(False, Rattach) # todo check
+                            pLR = e(s, t, L, Lattach, Rattach, n_t+1)
+                        chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
+                        
+                    # not a STOP, an attachment rewrite:
+                    else:
+                        for r in range(s, t):
+                            if head(L) in sent_nums[s:r+1] and head(R) in sent_nums[r+1:t+1]:
+                                # LL etc are boolean attachment values
+                                for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
+                                    p = rule.p(LR, RL) # probN or probA
+                                    pL = e(s,   r, L, LL, LR, n_t+1)
+                                    pR = e(r+1, t, R, RL, RR, n_t+1)
+                                    chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR 
+                                
+                return chart[(s, t, LHS,Lattach,Rattach)]
+    # end of e-function
+    # total inner probability: sum over all four (Lattach, Rattach) combinations
+    inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
+    if io.DEBUG:
+        print "---CHART:---"
+        for (s,t,LHS,L,R),v in chart.iteritems():
+            print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(LHS,g.numtag),
+                                                                   O(s), s,
+                                                                   O(t), t,
+                                                                   L, R, v)
+        print "---CHART:end---"
+    return [inner_prob, chart] 
+
+
+
 if __name__ == "__main__":                      # Non, Adj
     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
@@ -432,14 +533,21 @@ P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
     for sent in corpus:
         # have to go through _all_ places where h appears in the
         # sentence...how? how to make sure it _works_?
+        chart = {} # cuts time from 17s to 7s !
         if h_tag in sent:
-            locs_h = [i for i,w in enumerate(sent) if w == h_tag]
+            locs_h = [i2 for i2,w in enumerate(sent) if w == h_tag]
             io.debug( "locs_h:%s, sent:%s"%(locs_h,sent))
             for loc_h in locs_h:
-                for s in range(loc_h): # i 0.0:
         return P_STOP_num / P_STOP_den # upside down in article
@@ -448,9 +556,8 @@ P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 
 
 
-if __name__ == "__main__":
+def testreestimation():
     io.DEBUG = 0
-
     testcorpu2 = [s.split() for s in ['det nn vbd c nn vbd nn','det nn vbd c nn vbd pp nn',
                                       'det nn vbd nn','det nn vbd c nn vbd pp nn', 
                                       'det nn vbd nn','det nn vbd c nn vbd pp nn', 
@@ -458,7 +565,7 @@ if __name__ == "__main__":
                                       'det nn vbd nn','det nn vbd c nn vbd pp nn', 
                                       'det nn vbd pp nn','det nn vbd det nn', ]]
     testcorpus = [s.split() for s in ['det nn vbd c vbd','det nn vbd c vbd pp',
-                                      'det nn vbd','det nn vbd c vbd pp', 
+                                      'det nn vbd','det nn vbd c nn vbd pp', 
                                       'det nn vbd','det nn vbd c vbd pp', 
                                       'det nn vbd','det nn vbd c vbd pp', 
                                       'det nn vbd','det nn vbd c det vbd pp', 
@@ -471,16 +578,15 @@ if __name__ == "__main__":
 
     inner_dmv(0, 2, ROOT, g, 'det nn vbd'.split(), {})
 
-    h_tag = 'det'
+    h_tag = 'nn'
     h = g.tagnum(h_tag)
-
+    print "This will take some time."
     for r in g.h_rules(h):
         if r.L()==STOP:
+#             print r
+#             print "off-set the rule, see what happens:"
+#             r.probN = 0.7
             print r
-            print "off-set the rule, see what happens: (this takes some time)"
-            r.probN = 0.5
-            print r
-            
             for i in range(10):
                 pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus)
                 print "p(STOP|%s,L,N):%s"%(h_tag,pstophln)
@@ -490,8 +596,12 @@ if __name__ == "__main__":
                         print r
                         r.probN = pstophln
                         print r
-                        
-                        
+    return "todo"
+
+if __name__ == "__main__":
+    pass
+
+#    testreestimation()
 
 
 # todo: some more testing on the Brown corpus:
diff --git a/src/dmv.pyc b/src/dmv.pyc
index 4998a44dbb9612cf9fa07a7c7fd6c56f17da7626..4854a586af33ce555bccf8b61743fe3d5eaf0dff 100644
GIT binary patch
delta 1347
zcwTi=&u<%55dLPpUjJOLrH{4?%E)!yXW-#1)Af@}&||Q4etI|Gkrc4yq5T(V18_|Vn#8wH9C;R>I51@x
zABiu)!c_t#B_w6kI&Xv6_DV1nXa_2T$U#z(^YV4V4Dl+PsCkvUxHZ~=ti0AA4cEsARx6U2m9c5
zFJq6V<{aeW=p3Rvq5`6##r!cWSu7v(_($ssq5vBsmP05q$0;6TF9YL3XV6!!w2%j4
zvN9YlC92vsXC>9a36>Fem!csUyPW5s3
zmH&DL_N()Vh;6m=DqFzoV2D9N~=aoJ!*z)55Z
zzI0|fye+O+pe!I^{N|a=^6zwYduOv1r=8O1R8{txUA^06P}>cy+VQU(+kH#=K?O>G
zUY@>6S*r3|7?TJ1h*38&AG>ikuCYm)LT!tw3Zlr0L`Opi-ka=5x*yqF
zx|Er2Az+~|rOaCeMd(W(OI}JJq>p{D^tu0oPc8UX^qk!&LG;4!H*Z+
z9^L)dO+R>k>GD17tr5njjP_&u*U??8t8-8e?ptU)BT^PLVJ(kM!1kt~PC%W6%D;!l
zpvJM8!Sd=bylLnu=xO+4*Z~f-URZag;1&9zicqJZCZHzY;%@IT)#K@X2Zh(~>^XzY
zgKu&a_vg4ByHkWW18u=S4LyS$f>(k*0=|i|p%c(0iPU$f0Xq&zorYJI1Ai7>UG6!J
zqVso8Is-i_%JS;(@G8(1_-DmeCwonL0%GaXWmS@+wK=FV?|M?_0rWYj*}k`d_W|@I
zbV>rMGf)+YFEu*w=Aq7lQ|scs@?ofGt$ebZr6A5#I25K3BoU-=
z-$CO)kjyho!;ombN+)pFLO6ghgCHZul5qeOq!A48lrDm+!#8P!E(}j9CesOA@t3=E
z4C*`-VR`ijgjsMSP1i<{?T7Cm%%RhOu_c^|@8#XG`0N(KK?DwrgCK`s5GH|uBy3=d
zU4PeSOemweTVPd?nAayuQ&E~dWH`QaZPcqfk
zx4&rA{d>{shrgw{Q#{S)mT#hwrIDzySh-@-QhePv{`!{xwXWXgZM80RN=QC|>VVL&
zP*LcrP~&xiX4^EHwXJ#qQS0E5sJe839z}ZTdVJBPlokEII6Y|%^2LWKPANv*l2#m}
K+&=ZV