From: Kevin Brubeck Unhammer Date: Mon, 25 Oct 2010 12:10:19 +0000 (+0200) Subject: remove backups, chmod -x X-Git-Url: https://repo.or.cz/w/dmvccm.git/commitdiff_plain/4caa76786f42c2fcaa8229c6f0b457ff7ad53ca3 remove backups, chmod -x --- diff --git a/src/cnf_dmv.py b/src/cnf_dmv.py old mode 100755 new mode 100644 diff --git a/src/io.py.~11~ b/src/io.py.~11~ deleted file mode 100755 index fd34f25..0000000 --- a/src/io.py.~11~ +++ /dev/null @@ -1,251 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - head -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob) - def __init__(self, head, L, R, prob): - self.head = head - self.R = R - self.L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.head - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def rules(self, head): - return [rule for rule in self.p_rules if rule.head == head] - - def __init__(self, p_rules, p_terminals): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.p_rules = p_rules # could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - - - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L - R = rule.R - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p() - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~12~ b/src/io.py.~12~ deleted file mode 100755 index 7cbb25f..0000000 --- a/src/io.py.~12~ +++ /dev/null @@ -1,251 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - head -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob) - def __init__(self, head, L, R, prob): - self.head = head - self.R = R - self.L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.head - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def rules(self, head): - return [rule for rule in self.p_rules if rule.head == head] - - def __init__(self, p_rules, p_terminals): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.p_rules = p_rules # could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - - - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L - R = rule.R - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p("todo","todo") - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~13~ b/src/io.py.~13~ deleted file mode 100755 index 6fd820a..0000000 --- a/src/io.py.~13~ +++ /dev/null @@ -1,256 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - LHS -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L() - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L() - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob) - def __init__(self, LHS, L, R, prob): - self.__LHS = LHS - self.__R = R - self.__L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.__LHS - def L(self): - return self.__L - def R(self): - return self.__R - - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def rules(self, LHS): - return [rule for rule in self.p_rules if rule.LHS() == LHS] - - def __init__(self, p_rules, p_terminals): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.p_rules = p_rules # could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - - - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L() - R = rule.R() - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p("todo","todo") - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~14~ b/src/io.py.~14~ deleted file mode 100755 index 6c146e8..0000000 --- a/src/io.py.~14~ +++ /dev/null @@ -1,264 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Public members: - numtag, p_terminals - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def all_rules(self): - return self.__p_rules - - def rules(self, LHS): - return [rule for rule in self.all_rules() if rule.LHS() == LHS] - - def __init__(self, p_rules, p_terminals, numtag): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - self.numtag = numtag - - - - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - LHS -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L() - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L() - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob) - def __init__(self, LHS, L, R, prob): - self.__LHS = LHS - self.__R = R - self.__L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.__LHS - def L(self): - return self.__L - def R(self): - return self.__R - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L() - R = rule.R() - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p("todo","todo") - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b, {0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'}) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~15~ b/src/io.py.~15~ deleted file mode 100755 index 2b20ad7..0000000 --- a/src/io.py.~15~ +++ /dev/null @@ -1,271 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Public members: - p_terminals - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def all_rules(self): - return self.__p_rules - - def rules(self, LHS): - return [rule for rule in self.all_rules() if rule.LHS() == LHS] - - def numtag(self): - return __numtag - - def tagnum(self): - return __tagnum - - def __init__(self, p_rules, p_terminals, numtag, tagnum): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon) - self.__numtag = numtag - self.__tagnum = tagnum - self.p_terminals = p_terminals - - - - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - LHS -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L() - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L() - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob) - def __init__(self, LHS, L, R, prob): - self.__LHS = LHS - self.__R = R - self.__L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.__LHS - def L(self): - return self.__L - def R(self): - return self.__R - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L() - R = rule.R() - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p("todo","todo") - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b, {0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'}) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~16~ b/src/io.py.~16~ deleted file mode 100755 index 5388e9a..0000000 --- a/src/io.py.~16~ +++ /dev/null @@ -1,272 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 - -# some of dmv-module bleeding in here... todo: prettier (in inner()) -NOBAR = 0 -STOP = (NOBAR, -2) - -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Public members: - p_terminals - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def all_rules(self): - return self.__p_rules - - def rules(self, LHS): - return [rule for rule in self.all_rules() if rule.LHS() == LHS] - - def numtag(self): - return __numtag - - def tagnum(self): - return __tagnum - - def __init__(self, p_rules, p_terminals, numtag, tagnum): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon) - self.__numtag = numtag - self.__tagnum = tagnum - self.p_terminals = p_terminals - - - - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - LHS -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L() - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L() - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob) - def __init__(self, LHS, L, R, prob): - self.__LHS = LHS - self.__R = R - self.__L = L - self.prob = prob - def p(self, *arg): - "Return a probability, doesn't care about attachment..." - return self.prob - def LHS(self): - return self.__LHS - def L(self): - return self.__L - def R(self): - return self.__R - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - '''Chart has lists of probability and whether or not we've attached -yet to L and R, each entry is a list [p, Rattach, Lattach], where if -Rattach==True then the rule has a right-attachment or there is one -lower in the tree (meaning we're no longer adjacent).''' - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - prob = 0.0 # todo: is this the right way to deal with lacking rules? - print "\t LACKING TERMINAL:" - debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - # terminals have no attachment - return prob - else: - if (s,t,LHS) not in chart: - # by default, not attachment yet - chart[(s,t,LHS)] = 0.0 #, False, False] - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L() - R = rule.R() - for r in range(s, t): # summing over r = s to r = t-1 - p_L = e(s, r, L) - p_R = e(r + 1, t, R) - p = rule.p("todo","todo") - chart[(s, t, LHS)] += p * p_L * p_R - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b, {0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'}, - {'s':0,'np':1,'vp':2,'n':3,'p':4,'v':5}) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~1~ b/src/io.py.~1~ deleted file mode 100755 index e5e5f5b..0000000 --- a/src/io.py.~1~ +++ /dev/null @@ -1,235 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - head -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob) - def __init__(self, head, L, R, prob): - self.head = head - self.R = R - self.L = L - self.prob = prob - def LHS(self): - return self.head - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def rules(self, head): - return [rule for rule in self.p_rules if rule.head == head] - - def __init__(self, p_rules, p_terminals): - '''p_rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and p_rules are CNF_Rule's.''' - self.p_rules = p_rules # could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - - - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - # if a left stop, we need a right stop below, if a right stop, - # we need the terminal below. todo. - prob = 0.0 # todo: is this the right way to deal with lacking rules? - debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - return prob - else: - if (s,t,LHS) not in chart: - chart[(s,t,LHS)] = 0.0 - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L - R = rule.R - for r in range(s, t): # summing over r = s to r = t-1 - chart[(s, t, LHS)] += rule.prob * e(s, r, L) * e(r + 1, t, R) - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.p_rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.p_rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.p_rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.py.~2~ b/src/io.py.~2~ deleted file mode 100755 index b4da691..0000000 --- a/src/io.py.~2~ +++ /dev/null @@ -1,237 +0,0 @@ -################################################################## -# Changes: # -################################################################## -# 2008-05-24, KBU: -# - more elaborate pseudo-code in the io()-function -# -# 2008-05-25, KBU: -# - CNF_Rule has a function LHS() which returns head, in DMV_Rule this -# returns the pair of (bars, head). -# -# 2008-05-27 -# - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and -# != on two such rules - - -# import numpy # numpy provides Fast Arrays, for future optimization -# import pprint # for pretty-printing - -DEBUG = 0 -def debug(string): - '''Easily turn on/off inline debug printouts with this global. There's -a lot of cluttering debug statements here, todo: clean up''' - if DEBUG: - print string - -class CNF_Rule(): - '''A single CNF rule in the PCFG, of the form - head -> L R - where these are just integers - (where do we save the connection between number and symbol? - symbols being 'vbd' etc.)''' - def __eq__(self, other): - return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L - def __ne__(self, other): - return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L - def __str__(self): - return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob) - def __init__(self, head, L, R, prob): - self.head = head - self.R = R - self.L = L - self.prob = prob - def p(self, s, t, g): - return self.prob - def LHS(self): - return self.head - -class Grammar(): - '''The PCFG used in the I/O-algorithm. - - Todo: as of now, this allows duplicate rules... should we check - for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give - inner probabilities of 2.)''' - def rules(self, head): - return [rule for rule in self.rules if rule.head == head] - - def __init__(self, rules, p_terminals): - '''rules and p_terminals should be arrays, where p_terminals are of - the form [preterminal, terminal], and rules are CNF_Rule's.''' - self.rules = rules # could check for summing to 1 (+/- epsilon) - self.p_terminals = p_terminals - - - -def inner(s, t, LHS, g, sent, chart): - ''' Give the inner probability of having the node LHS cover whatever's - between s and t in sentence sent, using grammar g. - - Returns a pair of the inner probability and the chart - - For DMV, LHS is a pair (bar, h), but this function ought to be - agnostic about that. - - e() is an internal function, so the variable chart (a dictionary) - is available to all calls of e(). - - Since terminal probabilities are just simple lookups, they are not - put in the chart (although we could put them in there later to - optimize) - ''' - - def O(s): - return sent[s] - - def e(s,t,LHS): - if (s, t, LHS) in chart: - return chart[(s, t, LHS)] - else: - debug( "trying from %d to %d with %s" % (s,t,LHS) ) - if s == t: - if (LHS, O(s)) in g.p_terminals: - prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] - else: - # if a left stop, we need a right stop below, if a right stop, - # we need the terminal below. todo. - prob = 0.0 # todo: is this the right way to deal with lacking rules? - debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) ) - return prob - else: - if (s,t,LHS) not in chart: - chart[(s,t,LHS)] = 0.0 - for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k] - debug( "\tsumming rule %s" % rule ) - L = rule.L - R = rule.R - for r in range(s, t): # summing over r = s to r = t-1 - chart[(s, t, LHS)] += rule.p(s,t,g) * e(s, r, L) * e(r + 1, t, R) - debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) ) - return chart[(s, t, LHS)] - # end of e-function - - inner_prob = e(s,t,LHS) - if DEBUG: - print "---CHART:---" - for k,v in chart.iteritems(): - print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v) - print "---CHART:end---" - return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)] - - - - - - - - -if __name__ == "__main__": - print "IO-module tests:" - b = {} - s = CNF_Rule(0,1,2, 1.0) # s->np vp - np = CNF_Rule(1,3,4, 0.3) # np->n p - b[1, 'n'] = 0.7 # np->'n' - b[3, 'n'] = 1.0 # n->'n' - b[4, 'p'] = 1.0 # p->'p' - vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule) - vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p - b[5, 'v'] = 1.0 # v->'v' - - g = Grammar([s,np,vp,vp2], b) - - print "The rules:" - for i in range(0,5): - for r in g.rules(i): - print r - print "" - - test1 = inner(0,0, 1, g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - DEBUG = 1 - test2 = inner(0,2, 2, g, ['v','n','p'], test1[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - print "------ trying the same again:----------" - test2 = inner(0,2, 2, g, ['v','n','p'], test2[1]) - print "should be 0.?? (.09??) : %.2f" % test2[0] - - -################################################################## -# just junk from here on down: # -################################################################## - -# def io(corpus): -# "(pseudo-code / wishful thinking) " -# g = initialize(corpus) # or corpus.tagset ? - -# P = {('v','n','p'):0.09} -# # P is used in v_q, w_q (expectation), so each sentence in the -# # corpus needs some initial P. - -# # --- Maximation: --- -# # -# # actually, this step (from Lari & Young) probably never happens -# # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we -# # have P_STOP and P_CHOOSE... or, in a sense it happens only we -# # calculate P_STOP and P_CHOOSE..somehow. -# for rule in g.rules: -# rule.num = 0 -# rule.den = 0 -# for pre_term in range(len(g.p_terminals)): -# ptnum[pre_term] = 0 -# ptden[pre_term] = 0 - -# # we could also flip this to make rules the outer loop, then we -# # wouldn't have to initialize den/num in loops of their own -# for sent in corpus: -# for rule in g.rules # Equation 20 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent]) -# rule.den += v(s,t, rule.LHS(), g, sent, P[sent]) -# # todo: do we need a "new-prob" vs "old-prob" distinction here? -# probably, since we use inner/outer which checks rule.prob() -# # todo: also, this wouldn't work, since for each sentence, we'd -# # discard the old probability; should rules be the outer -# # loop then? -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): # Equation 21 -# num = 0 -# den = 0 -# for s in range(len(sent)): -# for t in range(s, len(sent)): -# num += v(t,t,pre_term, g, sent, P[sent]) -# den += v(s,t,pre_term, g, sent, P[sent]) - -# for rule in g.rules: -# rule.prob = rule.num / rule.den -# for pre_term in range(len(g.p_terminals)): -# g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term] - - -# # --- Expectation: --- -# for sent in corpus: # Equation 11 -# inside = inner(0, len(sent), ROOT, g, sent) -# P[sent] = inside[0] - -# # todo: set inner.chart to {} again, how? - -# # todo: need a old-P new-P distinction to check if we're below -# # threshold difference -# return "todo" - -# def w(s,t, LHS,L,R, g, sent, P_sent): -# w = 0 -# rule = g.rule(LHS, L, R) -# for r in range(s, t): -# w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent) -# return w / P_sent - -# def v(s,t, LHS, g, sent, P_sent): -# return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent - - - - - diff --git a/src/io.pyc b/src/io.pyc dissimilarity index 74% index d85f16d..f7e705b 100644 Binary files a/src/io.pyc and b/src/io.pyc differ diff --git a/src/junk.py b/src/junk.py old mode 100755 new mode 100644 diff --git a/src/junk.py.~1~ b/src/junk.py.~1~ deleted file mode 100755 index f1b1a52..0000000 --- a/src/junk.py.~1~ +++ /dev/null @@ -1,28 +0,0 @@ -if __name__ == "__main__": - # these are not Real rules, just testing the classes. todo: make - # a rule-set to test inner() on. - b = {} - s = DMV_Rule((LRBAR,0), (NOBAR,1),(NOBAR,2), 1.0, 0.0) # s->np vp - np = DMV_Rule((NOBAR,1), (NOBAR,3),(NOBAR,4), 0.3, 0.0) # np->n p - b[(NOBAR,1), 'n'] = 0.7 # np->'n' - b[(NOBAR,3), 'n'] = 1.0 # n->'n' - b[(NOBAR,4), 'p'] = 1.0 # p->'p' - vp = DMV_Rule((NOBAR,2), (NOBAR,5),(NOBAR,1), 0.1, 0.0) # vp->v np (two parses use this rule) - vp2 = DMV_Rule((NOBAR,2), (NOBAR,2),(NOBAR,4), 0.9, 0.0) # vp->vp p - b[(NOBAR,5), 'v'] = 1.0 # v->'v' - - g = DMV_Grammar([s,np,vp,vp2], b, "todo","todo", "todo") - - io.DEBUG = 0 - test1 = io.inner(0,0, (NOBAR,1), g, ['n'], {}) - if test1[0] != 0.7: - print "should be 0.70 : %.2f" % test1[0] - print "" - - test2 = io.inner(0,2, (NOBAR,2), g, ['v','n','p'], {}) - if "%.2f" % test2[0] != "0.09": # 0.092999 etc, don't care about that - print "should be 0.09 if the io.py-test is right : %.2f" % test2[0] - # the following should manage to look stuff up in the chart: - test2 = io.inner(0,2, (NOBAR,2), g, ['v','n','p'], test2[1]) - if "%.2f" % test2[0] != "0.09": - print "should be 0.09 if the io.py-test is right : %.2f" % test2[0] diff --git a/src/loc_h_dmv.pyc b/src/loc_h_dmv.pyc dissimilarity index 85% index 0081392..11efe3f 100644 Binary files a/src/loc_h_dmv.pyc and b/src/loc_h_dmv.pyc differ diff --git a/src/pcnf_dmv.py b/src/pcnf_dmv.py old mode 100755 new mode 100644 diff --git a/tex/DMVCCM.org.~1~ b/tex/DMVCCM.org.~1~ deleted file mode 100755 index b96d7bd..0000000 --- a/tex/DMVCCM.org.~1~ +++ /dev/null @@ -1,386 +0,0 @@ -# -*- coding: mule-utf-8-unix -*- - -#+STARTUP: overview -#+TAGS: OPTIMIZE PRETTIER -#+STARTUP: hidestars -#+TITLE: DMV/CCM -- todo-list / progress -#+AUTHOR: Kevin Brubeck Unhammer -#+EMAIL: K.BrubeckUnhammer at student uva nl -#+OPTIONS: H:4 -#+OPTIONS: toc:3 -#+OPTIONS: ^:{} -#+LANGUAGE: en -#+SEQ_TODO: TOGROK TODO DONE - -* dmvccm report and project - DEADLINE: <2008-06-30 Mon> -- [[file:src/dmv.py][dmv.py]] -- [[file:src/io.py][io.py]] -- [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]] - -(But absolute, extended, really-quite-dead-now deadline: August 31...) -* DONE outer probabilities - CLOSED: [2008-06-12 Thu 11:11] -# <> -There are 6 different configurations here, based on what the above -(mother) node is, and what the Node for which we're computing is. -Here *r* is not between *s* and *t* as in inner(), but an /outer/ index. *loc_N* -is the location of the Node head in the sentence, *loc_m* for the head -of the mother of Node. - -+ mother is a RIGHT-stop: - - outer(*s, t*, mom.LHS, *loc_N*), no inner-call - - adjacent iff *t* == *loc_m* -+ mother is a LEFT-stop: - - outer(*s, t*, mom.LHS, *loc_N*), no inner-call - - adjacent iff *s* == *loc_m* - -+ Node is on the LEFT branch (mom.L == Node) - * and mother is a LEFT attachment: - - *loc_N* will be in the LEFT branch, can be anything here. - - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_m*) for - all possible *loc_m* in the right part of the sentence. - - outer(*s, r*, mom.LHS, *loc_m*). - - adjacent iff *t+1* == *loc_m* - * and mother is a RIGHT attachment: - - *loc_m* = *loc_N*. - - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_R*) for - all possible *loc_R* in the right part of the sentence. - - outer(*s, r*, mom.LHS, *loc_N*). - - adjacent iff *t* == *loc_m* - -+ Node is on the RIGHT branch (mom.R == Node) - * and mother is a LEFT attachment: - - *loc_m* = *loc_N*. - - In the LEFT, attached, branch we find inner(*r, s-1*, mom.L, *loc_L*) for - all possible *loc_L* in the left part of the sentence. - - outer(*r, t*, mom.LHS, *loc_m*). - - adjacent iff *s* == *loc_m* - * and mother is a RIGHT attachment: - - *loc_N* will be in the RIGHT branch, can be anything here. - - In the RIGHT, attached, branch we find inner(*r, s-1*, mom.L, *loc_m*) for - all possible *loc_m* in the right part of the sentence. - - outer(*r, t*, mom.LHS, *loc_N*). - - adjacent iff *s-1* == *loc_m* - -[[file:outer_attachments.jpg]] - -Also, unlike in [[http://bibsonomy.org/bibtex/2b9f6798bb092697da7042ca3f5dee795][Lari & Young]], non-ROOT ('S') symbols may cover the -whole sentence, but ROOT may /only/ appear if it covers the whole -sentence. -** COMMENT qtrees - -\Tree [.{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\ - Node} ] {$_{t+1} h\urcorner _r$\\ - R} ] -\Tree [.{$h$} {$_{s} h _{t}$\\ - Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\ - R} ] ] -\Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\ - L} ] {$_{s} h\urcorner _t$\\ - Node} ] -\Tree [.{$h$} {$_{r} h _{s-1}$\\ - L} [.{$_{s} \ulcorner a\urcorner _t$\\ - Node} ] ] -* TODO [#B] P_STOP and P_CHOOSE for IO/EM (reestimation) -[[file:src/dmv.py::DMV%20probabilities][dmv-P_STOP]] -Remember: The P_{STOP} formula is upside-down (left-to-right also). -(In the article..not the [[http://www.eecs.berkeley.edu/~klein/papers/klein_thesis.pdf][thesis]]) - -** P_STOP formulas for various dir and adj: -Assuming this: -- P_{STOP}(STOP|h,L,non_adj) = \sum_{corpus} \sum_{sloc(h)} - inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t>loc(h)} - inner(s,t,h_, ...) -- P_{STOP}(STOP|h,R,adj) = \sum_{corpus} \sum_{s} \sum_{t=loc(h)} - inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t=loc(h)} - inner(s,t,h_, ...) - -(And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) ) -** P_CHOOSE formula: -Assuming this: -- P_{CHOOSE}(a|h,L) = \sum_{corpus} \sum_{s=loc(h)} \sum_{r=loc(h)} - inner(s,t,h_,...) - - t >= loc(h) since there are many possibilites for - right-attachments below, and each of them alone gives a lower - probability (through multiplication) to the upper tree (so sum - them all) -- P_{CHOOSE}(a|h,R) = \sum_{corpus} \sum_{s=loc(h)} \sum_{r>s} - inner(s,r,_a_,...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{t} - inner(s,t,h, ...) -** DONE [#A] How do we only count from completed trees? - CLOSED: [2008-06-13 Fri 11:40] -Use c(s,t,Node); inner * outer / P_sent - -** DONE [#A] c(s,t,Node) - CLOSED: [2008-06-13 Fri 11:38] -= inner * outer / P_sent - -implemented as inner * outer / inner_sent - -* TOGROK Find Most Probable Parse of given test sentence -We could probably do it pretty easily from the chart: -- for all loc_h, select the (0,len(sent),ROOT,loc_h) that has the highest p -- for stops, just keep going -- for rewrites, for loc_dep select the one that has the highest p -* TOGROK Combine CCM with DMV -* TODO Get a dependency parsed version of WSJ to test with -* Initialization -[[file:~/Documents/Skole/V08/Probability/dmvccm/src/dmv.py::Initialization%20todo][dmv-inits]] - -We go through the corpus, since the probabilities are based on how far -away in the sentence arguments are from their heads. -** TOGROK CCM Initialization -P_{SPLIT} used here... how, again? -** DONE Separate initialization to another file? :PRETTIER: - CLOSED: [2008-06-08 Sun 12:51] -[[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]] -** DONE DMV Initialization probabilities -(from initialization frequency) -** DONE DMV Initialization frequencies - CLOSED: [2008-05-27 Tue 20:04] -*** P_STOP -P_{STOP} is not well defined by K&M. One possible interpretation given -the sentence [det nn vb nn] is -: f_{STOP}( STOP|det, L, adj) +1 -: f_{STOP}(-STOP|det, L, adj) +0 -: f_{STOP}( STOP|det, L, non_adj) +1 -: f_{STOP}(-STOP|det, L, non_adj) +0 -: f_{STOP}( STOP|det, R, adj) +0 -: f_{STOP}(-STOP|det, R, adj) +1 -: -: f_{STOP}( STOP|nn, L, adj) +0 -: f_{STOP}(-STOP|nn, L, adj) +1 -: f_{STOP}( STOP|nn, L, non_adj) +1 # since there's at least one to the left -: f_{STOP}(-STOP|nn, L, non_adj) +0 -**** TODO tweak -# <> -: f[head, 'STOP', 'LN'] += (i_h <= 1) # first two words -: f[head, '-STOP', 'LN'] += (not i_h <= 1) -: f[head, 'STOP', 'LA'] += (i_h == 0) # very first word -: f[head, '-STOP', 'LA'] += (not i_h == 0) -: f[head, 'STOP', 'RN'] += (i_h >= n - 2) # last two words -: f[head, '-STOP', 'RN'] += (not i_h >= n - 2) -: f[head, 'STOP', 'RA'] += (i_h == n - 1) # very last word -: f[head, '-STOP', 'RA'] += (not i_h == n - 1) -vs -: # this one requires some additional rewriting since it -: # introduces divisions by zero -: f[head, 'STOP', 'LN'] += (i_h == 1) # second word -: f[head, '-STOP', 'LN'] += (not i_h <= 1) # not first two -: f[head, 'STOP', 'LA'] += (i_h == 0) # first word -: f[head, '-STOP', 'LA'] += (not i_h == 0) # not first -: f[head, 'STOP', 'RN'] += (i_h == n - 2) # second-to-last -: f[head, '-STOP', 'RN'] += (not i_h >= n - 2) # not last two -: f[head, 'STOP', 'RA'] += (i_h == n - 1) # last word -: f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last -vs -: f[head, 'STOP', 'LN'] += (i_h == 1) # second word -: f[head, '-STOP', 'LN'] += (not i_h == 1) # not second -: f[head, 'STOP', 'LA'] += (i_h == 0) # first word -: f[head, '-STOP', 'LA'] += (not i_h == 0) # not first -: f[head, 'STOP', 'RN'] += (i_h == n - 2) # second-to-last -: f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last -: f[head, 'STOP', 'RA'] += (i_h == n - 1) # last word -: f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last -vs -"all words take the same number of arguments" interpreted as -:for all heads: -: p_STOP(head, 'STOP', 'LN') = 0.3 -: p_STOP(head, 'STOP', 'LA') = 0.5 -: p_STOP(head, 'STOP', 'RN') = 0.4 -: p_STOP(head, 'STOP', 'RA') = 0.7 -(which we easily may tweak in init_zeros()) -*** P_CHOOSE -Go through the corpus, counting distances between heads and -arguments. In [det nn vb nn], we give -- f_{CHOOSE}(nn|det, R) +1/1 + C -- f_{CHOOSE}(vb|det, R) +1/2 + C -- f_{CHOOSE}(nn|det, R) +1/3 + C - - If this were the full corpus, P_{CHOOSE}(nn|det, R) would have - (1+1/3+2C) / sum_a f_{CHOOSE}(a|det, R) - -The ROOT gets "each argument with equal probability", so in a sentence -of three words, 1/3 for each (in [nn vb nn], 'nn' gets 2/3). Basically -just a frequency count of the corpus... - -In a sense there are no terminal probabilities, since an /h/ can only -rewrite to an 'h' anyway. -* [#C] Deferred -http://wiki.python.org/moin/PythonSpeed/PerformanceTips Eg., use -map/reduce/filter/[i for i in [i's]]/(i for i in [i's]) instead of -for-loops; use local variables for globals (global variables or or -functions), etc. - -** TODO when reestimating P_STOP etc, remove rules with p < epsilon :OPTIMIZE: -** TODO inner_dmv, short ranges and impossible attachment :OPTIMIZE: -If s-t <= 2, there can be only one attachment below, so don't recurse -with both Lattach=True and Rattach=True. - -If s-t <= 1, there can be no attachment below, so only recurse with -Lattach=False, Rattach=False. - -Put this in the loop under rewrite rules (could also do it in the STOP -section, but that would only have an effect on very short sentences). -** TODO clean up the module files :PRETTIER: -Is there better way to divide dmv and harmonic? There's a two-way -dependency between the modules. Guess there could be a third file that -imports both the initialization and the actual EM stuff, while a file -containing constants and classes could be imported by all others: -: dmv.py imports dmv_EM.py imports dmv_classes.py -: dmv.py imports dmv_inits.py imports dmv_classes.py - -** TOGROK Some (tagged) sentences are bound to come twice :OPTIMIZE: -Eg, first sort and count, so that the corpus -[['nn','vbd','det','nn'], - ['vbd','nn','det','nn'], - ['nn','vbd','det','nn']] -becomes -[(['nn','vbd','det','nn'],2), - (['vbd','nn','det','nn'],1)] -and then in each loop through sentences, make sure we handle the -frequency correctly. - -Is there much to gain here? - -** TOGROK tags as numbers or tags as strings? :OPTIMIZE: -Need to clean up the representation. - -Stick with tag-strings in initialization then switch to numbers for -IO-algorithm perhaps? Can probably afford more string-matching in -initialization.. -** DONE if loc_h == t, no need to try right-attachment rules &v.v. :OPTIMIZE: - CLOSED: [2008-06-10 Tue 14:34] -(and if loc_h == s, no need to try left-attachment rules.) - -Modest speed increase (5%). -** DONE io.debug parameters should not call functions :OPTIMIZE: - CLOSED: [2008-06-10 Tue 12:26] -Exchanged all io.debug(str,'level') calls with statements of the form: -:if 'level' in io.DEBUG: -: print str - -and got an almost threefold speed increase on inner(). -** DONE inner_dmv() should disregard rules with heads not in sent :OPTIMIZE: - CLOSED: [2008-06-08 Sun 10:18] -If the sentence is "nn vbd det nn", we should not even look at rules -where -: rule.head() not in "nn vbd det nn".split() -This is ruled out by getting rules from g.rules(LHS, sent). - -Also, we optimize this further by saying we don't even recurse into -attachment rules where -: rule.head() not in sent[ s :r+1] -: rule.head() not in sent[r+1:t+1] -meaning, if we're looking at the span "vbd det", we only use -attachment rules where both daughters are members of ['vbd','det'] -(although we don't (yet) care about removing rules that rewrite to the -same tag if there are no duplicate tags in the span, etc., that would -be a lot of trouble for little potential gain). -* Adjacency and combining it with the inside-outside algorithm -Each DMV_Rule has both a probN and a probA, for adjacencies. inner() -and outer() needs the correct one in each case. - -In each inner() call, loc_h is the location of the head of this -dependency structure. In each outer() call, it's the head of the /Node/, -the structure we're looking outside of. - -We call inner() for each location of a head, and on each terminal, -loc_h must equal s (and t). In the recursive attachment calls, we use -the locations (sentence indices) of words to the left or right of the -head in calls to inner(). /loc_h lets us check whether we need probN or -probA/. -** Possible alternate type of adjacency -K&M's adjacency is just whether or not an argument has been generated -in the current direction yet. One could also make a stronger type of -adjacency, where h and a are not adjacent if b is in between, eg. with -the sentence "a b h" and the structure ((h->a), (a->b)), h is -K&M-adjacent to a, but not next to a, since b is in between. It's easy -to check this type of adjacency in inner(), but it needs new rules for -P_STOP reestimation. -* Expectation Maximation in IO/DMV-terms -outer(s,t,Node) and inner(s,t,Node) calculates the expected number of -trees (CNF-)headed by Node from *s* to *t* (sentence positions). This uses -the P_STOP and P_CHOOSE values, which have been conveniently -distributed into CNF rules as probN and probA (non-adjacent and -adjacent probabilites). - -When re-estimating, we use the expected values from outer() and -inner() to get new values for P_STOP and P_CHOOSE. When we've -re-estimated for the entire corpus, we distribute P_STOP and P_CHOOSE -into the CNF rules again, so that in the next round we use new probN -and probA to find outer- and inner-probabilites. - -The distribution of P_STOP and P_CHOOSE into CNF rules also happens in -init_normalize() (here along with the creation of P_STOP and -P_CHOOSE); P_STOP is used to create CNF rules where one branch of the -rule is STOP, P_CHOOSE is used to create rules of the form -: h -> h _a_ -: h_ -> h_ _a_ - -Since "adjacency" is not captured in regular CNF rules, we need two -probabilites for each rule, and outer() and inner() have to know when -to use which. - - -* Python-stuff -# <> -Make those debug statements steal a bit less attention in emacs: -:(font-lock-add-keywords -: 'python-mode ; not really regexp, a bit slow -: '(("^\\( *\\)\\(\\if +'.+' +in +io.DEBUG. *\\( -:\\1 .+$\\)+\\)" 2 font-lock-preprocessor-face t))) -:(font-lock-add-keywords -: 'python-mode -: '(("\\<\\(\\(io\\.\\)?debug(.+)\\)" 1 font-lock-preprocessor-face t))) - -- [[file:src/pseudo.py][pseudo.py]] -- http://nltk.org/doc/en/structured-programming.html recursive dynamic -- http://nltk.org/doc/en/advanced-parsing.html -- http://jaynes.colorado.edu/PythonIdioms.html - - - -* Git -Repository web page: http://repo.or.cz/w/dmvccm.git - -Setting up a new project: -: git init -: git add . -: git commit -m "first release" - -Later on: (=-a= does =git rm= and =git add= automatically) -: git init -: git commit -a -m "some subsequent release" - -Then push stuff up to the remote server: -: git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master - -(=eval `ssh-agent`= and =ssh-add= to avoid having to type in keyphrase all -the time) - -Make a copy of the (remote) master branch: -: git clone git://repo.or.cz/dmvccm.git - -Make and name a new branch in this folder -: git checkout -b mybranch - -To save changes in =mybranch=: -: git commit -a - -Go back to the master branch (uncommitted changes from =mybranch= are -carried over): -: git checkout master - -Try out: -: git add --interactive - -Good tutorial: -http://www-cs-students.stanford.edu/~blynn//gitmagic/ diff --git a/tex/test.org.~1~ b/tex/test.org.~1~ deleted file mode 100755 index a91acd7..0000000 --- a/tex/test.org.~1~ +++ /dev/null @@ -1,15 +0,0 @@ -# -*- coding: mule-utf-8-unix -*- - -#+STARTUP: overview -#+TAGS: OPTIMIZE PRETTIER -#+STARTUP: hidestars -#+TITLE: DMV/CCM -- todo-list / progress -#+AUTHOR: Kevin Brubeck Unhammer -#+EMAIL: K.BrubeckUnhammer at student uva nl -#+OPTIONS: H:4 -#+OPTIONS: toc:3 -#+OPTIONS: ^:{} -#+LANGUAGE: en -#+SEQ_TODO: TOGROK TODO DONE - -* dmvccm report and project diff --git a/tex/test.tex.~1~ b/tex/test.tex.~1~ deleted file mode 100644 index a75e084..0000000 --- a/tex/test.tex.~1~ +++ /dev/null @@ -1,49 +0,0 @@ -% Created 2008-06-13 Fri 17:05 -\documentclass[11pt,a4paper]{article} -\usepackage[utf8]{inputenc} -\usepackage[T1]{fontenc} -\usepackage{hyperref} -\usepackage{natbib} - -\usepackage{pslatex} -\usepackage{pdfsync} -\pdfoutput=1 - -\usepackage{avm} -\avmfont{\sc} -\avmoptions{sorted,active} -\avmvalfont{\rm} -\avmsortfont{\scriptsize\it} - - -\title{DMV/CCM -- tex stuff} -\author{Kevin Brubeck Unhammer} -\date{13 June 2008} - -\begin{document} - -\maketitle - -\setcounter{tocdepth}{3} -\tableofcontents - - - -\section{stuff} - - -\Tree [.\{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\ - Node} ] {$_{t+1} h\urcorner _r$\\ - R} ] -\Tree [.{$h$} {$_{s} h _{t}$\\ - Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\ - R} ] ] -\Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\ - L} ] {$_{s} h\urcorner _t$\\ - Node} ] -\Tree [.{$h$} {$_{r} h _{s-1}$\\ - L} [.{$_{s} \ulcorner a\urcorner _t$\\ - Node} ] ] - - -\end{document} \ No newline at end of file