From d1472a068edf1088a7e439b2f81c0f34b2305fda Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Fri, 6 Jun 2008 12:54:40 +0200 Subject: [PATCH] started working on new version of inner --- DMVCCM.html | 36 ++++++++++++--- DMVCCM.org | 17 ++++++-- src/dmv.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++------- src/dmv.pyc | Bin 14599 -> 14782 bytes 4 files changed, 169 insertions(+), 26 deletions(-) diff --git a/DMVCCM.html b/DMVCCM.html index 6cf8768..d072e6a 100755 --- a/DMVCCM.html +++ b/DMVCCM.html @@ -6,7 +6,7 @@ lang="en" xml:lang="en"> DMV/CCM – todo-list / progress - + @@ -132,12 +132,30 @@ b[(LRBAR, nh), 'h'] = h_.probA * _ h_.probA
  • How is the P_STOP formula different given other values for dir and adj?
    -(Presumably, the PSTOP formula where STOP is True is just the -rule-probability of _ h_ -> STOP h_ or h_ -> h STOP, but how does -adjacency fit in here?) +Assuming this: + + +

    (And P_{STOP}(-STOP|…) = 1 - P_{STOP}(STOP|…) )

  • @@ -445,6 +463,10 @@ Then push stuff up to the remote server:

    +(eval `ssh-agent` and ssh-add to avoid having to type in keyphrase all +the time) +

    +

    Make a copy of the (remote) master branch:

      git clone git://repo.or.cz/dmvccm.git 
    @@ -483,6 +505,6 @@ Good tutorial:
     

    Author: Kevin Brubeck Unhammer <K.BrubeckUnhammer at student uva nl >

    -

    Date: 2008/06/05 12:49:53

    +

    Date: 2008/06/06 11:55:47

    Skrive vha. emacs + org-mode

    diff --git a/DMVCCM.org b/DMVCCM.org index cf96a18..c060a97 100755 --- a/DMVCCM.org +++ b/DMVCCM.org @@ -67,9 +67,17 @@ have to be updated along with the other P_{STOP} updates: - b[(LRBAR, n_{h}), 'h'] = h_.probA * _ h_.probA ** How is the P_STOP formula different given other values for dir and adj? -(Presumably, the P_{STOP} formula where STOP is True is just the -rule-probability of _ h_ -> STOP h_ or h_ -> h STOP, but how does -adjacency fit in here?) +Assuming this: +- P_{STOP}(STOP|h,L,non_adj) = \sum_{corpus} \sum_{sloc(h)} + inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s} \sum_{t>loc(h)} inner(s,t,(RBAR,h)...) +- P_{STOP}(STOP|h,R,adj) = \sum_{corpus} \sum_{s} \sum_{t=loc(h)} + inner(s,t,(LRBAR,h)...) / \sum_{corpus} \sum_{s} \sum_{t=loc(h)} inner(s,t,(RBAR,h)...) + + (And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) ) * TODO P_CHOOSE for IO/EM @@ -252,6 +260,9 @@ Later on: (=-a= does =git rm= and =git add= automatically) Then push stuff up to the remote server: : git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master +(=eval `ssh-agent`= and =ssh-add= to avoid having to type in keyphrase all +the time) + Make a copy of the (remote) master branch: : git clone git://repo.or.cz/dmvccm.git diff --git a/src/dmv.py b/src/dmv.py index 4961cea..79a6381 100755 --- a/src/dmv.py +++ b/src/dmv.py @@ -350,6 +350,107 @@ def inner_dmv(s, t, LHS, g, sent, chart): +def inner_dmv2(s, t, LHS, loc_h, g, sent, chart): + ''' A rewrite of inner in io.py, to take adjacency into accord. + + The chart is now 2 times bigger, since there are different values + for with or without L/R attachments: + chart[(s,t,LHS, Lattach, Rattach)] + + If Rattach==True then the rule has a right-attachment or there is + one lower in the tree (meaning we're no longer + adjacent). Adjacency depends on whether there is an attachment + lower in the tree, cf. DMV_Rule.p(LRattach, RLattach). 
+ + Todo: if possible, refactor (move dmv-specific stuff back into + dmv, so this is "general" enough to be in io.py) + ''' + + def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach): + if io.DEBUG: + attach = { + (True, True): "left and right attachments below", + (True, False): "left attachment(s) below", + (False, True): "right attachment(s) below", + (False, False): "no attachments below" } + info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach]) + print "%sTrying from %s_%d to %s_%d with %s, %s:" % info + + def O(s): + return sent[s] + + sent_nums = [g.tagnum(tag) for tag in sent] + + def e(s,t,LHS, Lattach, Rattach, n_t): + def tab(): + "Tabs for debug output" + return "\t"*n_t + + if (s, t, LHS, Lattach, Rattach) in chart: + return chart[(s, t, LHS, Lattach, Rattach)] + else: + debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach) + if s == t: + if Lattach or Rattach: + # terminals are always F,F for attachment + io.debug("%s= 0.0 (1 word, no lower attach)" % tab()) + return 0.0 + elif (LHS, O(s)) in g.p_terminals: + prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young + else: + # todo: assuming this is how to deal with lacking + # rules, since we add prob.s, and 0 is identity + prob = 0.0 + io.debug( "%sLACKING TERMINAL:" % tab()) + # todo: add to chart perhaps? Although, it _is_ simple lookup.. 
+ io.debug( "%s= %.1f (terminal: %s -> %s)" % (tab(),prob, + DMV_Rule.bar_str(LHS), + O(s)) ) + return prob + else: + if (s,t,LHS,Lattach, Rattach) not in chart: + chart[(s,t,LHS,Lattach,Rattach)] = 0.0 + for rule in g.sent_rules(LHS, sent_nums): # summing over j,k in a[LHS,j,k] + io.debug( "%ssumming rule %s" % (tab(),rule) ) + L = rule.L() + R = rule.R() + # if it's a STOP rule, rewrite for the same range: + if (L == STOP) or (R == STOP): + if L == STOP: + p = rule.p(Lattach, False) # todo check + pLR = e(s, t, R, Lattach, Rattach, n_t+1) + elif R == STOP: + p = rule.p(False, Rattach) # todo check + pLR = e(s, t, L, Lattach, Rattach, n_t+1) + chart[(s, t, LHS, Lattach, Rattach)] += p * pLR + + # not a STOP, an attachment rewrite: + else: + for r in range(s, t): + if head(L) in sent_nums[s:r+1] and head(R) in sent_nums[r+1:t+1]: + # LL etc are boolean attachment values + for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach): + p = rule.p(LR, RL) # probN or probA + pL = e(s, r, L, LL, LR, n_t+1) + pR = e(r+1, t, R, RL, RR, n_t+1) + chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR + + return chart[(s, t, LHS,Lattach,Rattach)] + # end of e-function + + inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0) + if io.DEBUG: + print "---CHART:---" + for (s,t,LHS,L,R),v in chart.iteritems(): + print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(LHS,g.numtag), + O(s), s, + O(s), t, + L, R, v) + print "---CHART:end---" + return [inner_prob, chart] + + + if __name__ == "__main__": # Non, Adj _h_ = DMV_Rule((LRBAR,0), STOP, ( RBAR,0), 1.0, 1.0) # LSTOP h_S = DMV_Rule(( RBAR,0),(NOBAR,0), STOP, 0.4, 0.3) # RSTOP @@ -432,14 +533,21 @@ P_STOP(-STOP|...) = 1 - P_STOP(STOP|...) for sent in corpus: # have to go through _all_ places where h appears in the # sentence...how? how to make sure it _works_? + chart = {} # cuts time from 17s to 7s ! 
if h_tag in sent: - locs_h = [i for i,w in enumerate(sent) if w == h_tag] + locs_h = [i2 for i2,w in enumerate(sent) if w == h_tag] io.debug( "locs_h:%s, sent:%s"%(locs_h,sent)) for loc_h in locs_h: - for s in range(loc_h): # i 0.0: return P_STOP_num / P_STOP_den # upside down in article @@ -448,9 +556,8 @@ P_STOP(-STOP|...) = 1 - P_STOP(STOP|...) -if __name__ == "__main__": +def testreestimation(): io.DEBUG = 0 - testcorpu2 = [s.split() for s in ['det nn vbd c nn vbd nn','det nn vbd c nn vbd pp nn', 'det nn vbd nn','det nn vbd c nn vbd pp nn', 'det nn vbd nn','det nn vbd c nn vbd pp nn', @@ -458,7 +565,7 @@ if __name__ == "__main__": 'det nn vbd nn','det nn vbd c nn vbd pp nn', 'det nn vbd pp nn','det nn vbd det nn', ]] testcorpus = [s.split() for s in ['det nn vbd c vbd','det nn vbd c vbd pp', - 'det nn vbd','det nn vbd c vbd pp', + 'det nn vbd','det nn vbd c nn vbd pp', 'det nn vbd','det nn vbd c vbd pp', 'det nn vbd','det nn vbd c vbd pp', 'det nn vbd','det nn vbd c det vbd pp', @@ -471,16 +578,15 @@ if __name__ == "__main__": inner_dmv(0, 2, ROOT, g, 'det nn vbd'.split(), {}) - h_tag = 'det' + h_tag = 'nn' h = g.tagnum(h_tag) - + print "This will take some time." 
for r in g.h_rules(h): if r.L()==STOP: +# print r +# print "off-set the rule, see what happens:" +# r.probN = 0.7 print r - print "off-set the rule, see what happens: (this takes some time)" - r.probN = 0.5 - print r - for i in range(10): pstophln = P_STOP(True, h, 'L', 'N', g, testcorpus) print "p(STOP|%s,L,N):%s"%(h_tag,pstophln) @@ -490,8 +596,12 @@ if __name__ == "__main__": print r r.probN = pstophln print r - - + return "todo" + +if __name__ == "__main__": + pass + +# testreestimation() # todo: some more testing on the Brown corpus: diff --git a/src/dmv.pyc b/src/dmv.pyc index 4998a44dbb9612cf9fa07a7c7fd6c56f17da7626..4854a586af33ce555bccf8b61743fe3d5eaf0dff 100644 GIT binary patch delta 1347 zcwTi=&u<%55dLPpUjJOLrH{4?%E)!yXW-#1)Af@}&||Q4etI|Gkrc4yq5T(V18_|Vn#8wH9C;R>I51@x zABiu)!c_t#B_w6kI&Xv6_DV1nXa_2T$U#z(^YV4V4Dl+PsCkvUxHZ~=ti0AA4cEsARx6U2m9c5 zFJq6V<{aeW=p3Rvq5`6##r!cWSu7v(_($ssq5vBsmP05q$0;6TF9YL3XV6!!w2%j4 zvN9YlC92vsXC>9a36>Fem!csUyPW5s3 zmH&DL_N()Vh;6m=DqFzoV2D9N~=aoJ!*z)55Z zzI0|fye+O+pe!I^{N|a=^6zwYduOv1r=8O1R8{txUA^06P}>cy+VQU(+kH#=K?O>G zUY@>6S*r3|7?TJ1h*38&AG>ikuCYm)LT!tw3Zlr0L`Opi-ka=5x*yqF zx|Er2Az+~|rOaCeMd(W(OI}JJq>p{D^tu0oPc8UX^qk!&LG;4!H*Z+ z9^L)dO+R>k>GD17tr5njjP_&u*U??8t8-8e?ptU)BT^PLVJ(kM!1kt~PC%W6%D;!l zpvJM8!Sd=bylLnu=xO+4*Z~f-URZag;1&9zicqJZCZHzY;%@IT)#K@X2Zh(~>^XzY zgKu&a_vg4ByHkWW18u=S4LyS$f>(k*0=|i|p%c(0iPU$f0Xq&zorYJI1Ai7>UG6!J zqVso8Is-i_%JS;(@G8(1_-DmeCwonL0%GaXWmS@+wK=FV?|M?_0rWYj*}k`d_W|@I zbV>rMGf)+YFEu*w=Aq7lQ|scs@?ofGt$ebZr6A5#I25K3BoU-= z-$CO)kjyho!;ombN+)pFLO6ghgCHZul5qeOq!A48lrDm+!#8P!E(}j9CesOA@t3=E z4C*`-VR`ijgjsMSP1i<{?T7Cm%%RhOu_c^|@8#XG`0N(KK?DwrgCK`s5GH|uBy3=d zU4PeSOemweTVPd?nAayuQ&E~dWH`QaZPcqfk zx4&rA{d>{shrgw{Q#{S)mT#hwrIDzySh-@-QhePv{`!{xwXWXgZM80RN=QC|>VVL& zP*LcrP~&xiX4^EHwXJ#qQS0E5sJe839z}ZTdVJBPlokEII6Y|%^2LWKPANv*l2#m} K+&=ZV