1 ##################################################################
3 ##################################################################
5 # - more elaborate pseudo-code in the io()-function
8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
9 # returns the pair of (bars, head).
12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
13 # != on two such rules
16 # import numpy # numpy provides Fast Arrays, for future optimization
17 # import pprint # for pretty-printing
21 # some of dmv-module bleeding in here... todo: prettier (in inner())
26 '''Easily turn on/off inline debug printouts with this global. There's
27 a lot of cluttering debug statements here, todo: clean up'''
32 '''A single CNF rule in the PCFG, of the form
34 where these are just integers
35 (where do we save the connection between number and symbol?
36 symbols being 'vbd' etc.)'''
37 def __eq__(self, other):
38 return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L()
39 def __ne__(self, other):
40 return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L()
42 return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob)
43 def __init__(self, LHS, L, R, prob):
49 "Return a probability, doesn't care about attachment..."
60 '''The PCFG used in the I/O-algorithm.
62 Todo: as of now, this allows duplicate rules... should we check
63 for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
64 inner probabilities of 2.)'''
66 return [rule for rule in self.p_rules if rule.LHS() == LHS]
68 def __init__(self, p_rules, p_terminals):
69 '''rules and p_terminals should be arrays, where p_terminals are of
70 the form [preterminal, terminal], and rules are CNF_Rule's.'''
71 self.p_rules = p_rules # could check for summing to 1 (+/- epsilon)
72 self.p_terminals = p_terminals
76 def inner(s, t, LHS, g, sent, chart):
77 ''' Give the inner probability of having the node LHS cover whatever's
78 between s and t in sentence sent, using grammar g.
80 Returns a pair of the inner probability and the chart
82 For DMV, LHS is a pair (bar, h), but this function ought to be
85 e() is an internal function, so the variable chart (a dictionary)
86 is available to all calls of e().
88 Since terminal probabilities are just simple lookups, they are not
89 put in the chart (although we could put them in there later to
97 '''Chart has lists of probability and whether or not we've attached
98 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
99 Rattach==True then the rule has a right-attachment or there is one
100 lower in the tree (meaning we're no longer adjacent).'''
101 if (s, t, LHS) in chart:
102 return chart[(s, t, LHS)]
104 debug( "trying from %d to %d with %s" % (s,t,LHS) )
106 if (LHS, O(s)) in g.p_terminals:
107 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
109 prob = 0.0 # todo: is this the right way to deal with lacking rules?
110 print "\t LACKING TERMINAL:"
111 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
112 # terminals have no attachment
115 if (s,t,LHS) not in chart:
116 # by default, not attachment yet
117 chart[(s,t,LHS)] = 0.0 #, False, False]
118 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
119 debug( "\tsumming rule %s" % rule )
122 for r in range(s, t): # summing over r = s to r = t-1
125 p = rule.p("todo","todo")
126 chart[(s, t, LHS)] += p * p_L * p_R
127 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
128 return chart[(s, t, LHS)]
131 inner_prob = e(s,t,LHS)
134 for k,v in chart.iteritems():
135 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
136 print "---CHART:end---"
137 return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)]
146 if __name__ == "__main__":
147 print "IO-module tests:"
149 s = CNF_Rule(0,1,2, 1.0) # s->np vp
150 np = CNF_Rule(1,3,4, 0.3) # np->n p
151 b[1, 'n'] = 0.7 # np->'n'
152 b[3, 'n'] = 1.0 # n->'n'
153 b[4, 'p'] = 1.0 # p->'p'
154 vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
155 vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
156 b[5, 'v'] = 1.0 # v->'v'
158 g = Grammar([s,np,vp,vp2], b)
166 test1 = inner(0,0, 1, g, ['n'], {})
168 print "should be 0.70 : %.2f" % test1[0]
172 test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
173 print "should be 0.?? (.09??) : %.2f" % test2[0]
174 print "------ trying the same again:----------"
175 test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
176 print "should be 0.?? (.09??) : %.2f" % test2[0]
179 ##################################################################
180 # just junk from here on down: #
181 ##################################################################
184 # "(pseudo-code / wishful thinking) "
185 # g = initialize(corpus) # or corpus.tagset ?
187 # P = {('v','n','p'):0.09}
188 # # P is used in v_q, w_q (expectation), so each sentence in the
189 # # corpus needs some initial P.
191 # # --- Maximation: ---
193 # # actually, this step (from Lari & Young) probably never happens
194 # # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
195 # # have P_STOP and P_CHOOSE... or, in a sense it happens only we
196 # # calculate P_STOP and P_CHOOSE..somehow.
197 # for rule in g.p_rules:
200 # for pre_term in range(len(g.p_terminals)):
201 # ptnum[pre_term] = 0
202 # ptden[pre_term] = 0
204 # # we could also flip this to make rules the outer loop, then we
205 # # wouldn't have to initialize den/num in loops of their own
206 # for sent in corpus:
207 # for rule in g.p_rules # Equation 20
208 # for s in range(len(sent)):
209 # for t in range(s, len(sent)):
210 # rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
211 # rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
212 # # todo: do we need a "new-prob" vs "old-prob" distinction here?
213 # probably, since we use inner/outer which checks rule.prob()
214 # # todo: also, this wouldn't work, since for each sentence, we'd
215 # # discard the old probability; should rules be the outer
217 # rule.prob = rule.num / rule.den
218 # for pre_term in range(len(g.p_terminals)): # Equation 21
221 # for s in range(len(sent)):
222 # for t in range(s, len(sent)):
223 # num += v(t,t,pre_term, g, sent, P[sent])
224 # den += v(s,t,pre_term, g, sent, P[sent])
226 # for rule in g.rules:
227 # rule.prob = rule.num / rule.den
228 # for pre_term in range(len(g.p_terminals)):
229 # g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
232 # # --- Expectation: ---
233 # for sent in corpus: # Equation 11
234 # inside = inner(0, len(sent), ROOT, g, sent)
235 # P[sent] = inside[0]
237 # # todo: set inner.chart to {} again, how?
239 # # todo: need a old-P new-P distinction to check if we're below
240 # # threshold difference
243 # def w(s,t, LHS,L,R, g, sent, P_sent):
245 # rule = g.rule(LHS, L, R)
246 # for r in range(s, t):
247 # w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
250 # def v(s,t, LHS, g, sent, P_sent):
251 # return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent