From: Kevin Brubeck Unhammer Date: Wed, 20 Aug 2008 12:27:34 +0000 (+0200) Subject: found a bug in w-formula, 1/P_s should be outside sums, starting to think we should... X-Git-Url: https://repo.or.cz/w/dmvccm.git/commitdiff_plain/74439962bd206e078ab24810cdaff81286c0555f found a bug in w-formula, 1/P_s should be outside sums, starting to think we should store charts and have args as outer loop... --- diff --git a/src/cnf_dmv.py b/src/cnf_dmv.py index 5e2a8be..d596dec 100755 --- a/src/cnf_dmv.py +++ b/src/cnf_dmv.py @@ -50,15 +50,22 @@ class CNF_DMV_Grammar(io.Grammar): # used in outer: def mothersR(self, w_node, argnums): '''For all LHS and x, return all rules of the form 'LHS->x w_node'.''' - return [r for LHS in self.LHSs() - for r in self.arg_rules(LHS, argnums) - if r.R() == w_node] + if w_node not in self.__mothersR: + self.__mothersR[w_node] = [r for LHS in self.LHSs() + for r in self.rules(LHS) + if r.R() == w_node] + return [r for r in self.__mothersR[w_node] + if POS(r.L()) in argnums] def mothersL(self, w_node, argnums): '''For all LHS and x, return all rules of the form 'LHS->w_node x'.''' - return [r for LHS in self.LHSs() - for r in self.arg_rules(LHS, argnums) - if r.L() == w_node] + if w_node not in self.__mothersL: + self.__mothersL[w_node] = [r for LHS in self.LHSs() + for r in self.rules(LHS) + if r.L() == w_node] + return [r for r in self.__mothersL[w_node] + if POS(r.R()) in argnums] + # used in inner: def arg_rules(self, LHS, argnums): @@ -105,6 +112,8 @@ class CNF_DMV_Grammar(io.Grammar): self.p_ROOT = p_ROOT self.p_GO_AT = make_GO_AT(self.p_STOP, self.p_ATTACH) self.make_all_rules() + self.__mothersL = {} + self.__mothersR = {} class CNF_DMV_Rule(io.CNF_Rule): @@ -350,14 +359,14 @@ def reest_freq(g, corpus): def f_g(i,j,LHS,sent): if (i,j,LHS) in ochart: - print ".", +# print ".", return ochart[i,j,LHS] else: return outer(i,j,LHS,g,sent,ichart,ochart) def e_g(i,j,LHS,sent): if (i,j,LHS) in ichart: - print ".", +# print ".", return ichart[i,j,LHS] else: return inner(i,j,LHS,g,sent,ichart) @@ -415,7 +424,7 @@ def testgrammar(): def testreestimation(): from loc_h_dmv import testcorpus g = testgrammar() - f = reestimate(g, testcorpus) + f = reestimate(g, testcorpus[0:4]) return (f,g) def testgrammar_a(): # Non, Adj @@ -503,3 +512,4 @@ if __name__ == "__main__": regression_tests() # g = testgrammar() # print g + print "TODO!!!! fix outer (also, make mothersL and R faster somehow)" diff --git a/src/loc_h_dmv.py b/src/loc_h_dmv.py index 6c5149b..35ea49d 100644 --- a/src/loc_h_dmv.py +++ b/src/loc_h_dmv.py @@ -310,6 +310,11 @@ def outer(i,j,w_node,loc_w, g, sent, ichart={}, ochart={}): # Reestimation: # ################################################### +# todo: it seems we have to rewrite attachment reestimation so that we +# have 'a´ as the outer loop, then sentences... but this means running +# through sentences several times, and that would require storing +# inner probabilites...agh! + def reest_zeros(h_nums): '''A dict to hold numerators and denominators for our 6+ reestimation formulas. ''' @@ -361,10 +366,10 @@ def reest_freq(g, corpus): # end reest_freq.e() def w_left(i,j, x,loc_h,sent,sent_nums): + if not p_sent > 0.0: return + h = POS(x) - if not p_sent > 0.0: - return p_sent - + a_k = {} for k in xtween(i, j): p_out = f(i,j, x,loc_h, sent) if not p_out > 0.0: @@ -376,18 +381,19 @@ def reest_freq(g, corpus): for loc_a,a in locs(sent_nums, i,k): # i<=loc_l(a) 0.0: - return p_sent + if not p_sent > 0.0: return + h = POS(x) for k in xtween(i, j): p_out = f(i,j, x,loc_h, sent) if not p_out > 0.0: @@ -399,7 +405,7 @@ def reest_freq(g, corpus): for loc_a,a in locs(sent_nums, k,j): # k<=loc_l(a)