1 ##################################################################
3 ##################################################################
5 # - more elaborate pseudo-code in the io()-function
8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
9 # returns the pair of (bars, head).
12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
13 # != on two such rules
16 # import numpy # numpy provides Fast Arrays, for future optimization
17 # import pprint # for pretty-printing
21 '''Easily turn on/off inline debug printouts with this global. There's
22 a lot of cluttering debug statements here, todo: clean up'''
27 '''A single CNF rule in the PCFG, of the form
29 where these are just integers
30 (where do we save the connection between number and symbol?
31 symbols being 'vbd' etc.)'''
32 def __eq__(self, other):
33 return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L
34 def __ne__(self, other):
35 return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L
37 return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob)
38 def __init__(self, head, L, R, prob):
49 '''The PCFG used in the I/O-algorithm.
51 Todo: as of now, this allows duplicate rules... should we check
52 for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
53 inner probabilities of 2.)'''
54 def rules(self, head):
55 return [rule for rule in self.rules if rule.head == head]
57 def __init__(self, rules, p_terminals):
58 '''rules and p_terminals should be arrays, where p_terminals are of
59 the form [preterminal, terminal], and rules are CNF_Rule's.'''
60 self.rules = rules # could check for summing to 1 (+/- epsilon)
61 self.p_terminals = p_terminals
65 def inner(s, t, LHS, g, sent, chart):
66 ''' Give the inner probability of having the node LHS cover whatever's
67 between s and t in sentence sent, using grammar g.
69 Returns a pair of the inner probability and the chart
71 For DMV, LHS is a pair (bar, h), but this function ought to be
74 e() is an internal function, so the variable chart (a dictionary)
75 is available to all calls of e().
77 Since terminal probabilities are just simple lookups, they are not
78 put in the chart (although we could put them in there later to
86 if (s, t, LHS) in chart:
87 return chart[(s, t, LHS)]
89 debug( "trying from %d to %d with %s" % (s,t,LHS) )
91 if (LHS, O(s)) in g.p_terminals:
92 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
94 # if a left stop, we need a right stop below, if a right stop,
95 # we need the terminal below. todo.
96 prob = 0.0 # todo: is this the right way to deal with lacking rules?
97 debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
100 if (s,t,LHS) not in chart:
101 chart[(s,t,LHS)] = 0.0
102 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
103 debug( "\tsumming rule %s" % rule )
106 for r in range(s, t): # summing over r = s to r = t-1
107 chart[(s, t, LHS)] += rule.p(s,t,g) * e(s, r, L) * e(r + 1, t, R)
108 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
109 return chart[(s, t, LHS)]
112 inner_prob = e(s,t,LHS)
115 for k,v in chart.iteritems():
116 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
117 print "---CHART:end---"
118 return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)]
127 if __name__ == "__main__":
128 print "IO-module tests:"
130 s = CNF_Rule(0,1,2, 1.0) # s->np vp
131 np = CNF_Rule(1,3,4, 0.3) # np->n p
132 b[1, 'n'] = 0.7 # np->'n'
133 b[3, 'n'] = 1.0 # n->'n'
134 b[4, 'p'] = 1.0 # p->'p'
135 vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
136 vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
137 b[5, 'v'] = 1.0 # v->'v'
139 g = Grammar([s,np,vp,vp2], b)
147 test1 = inner(0,0, 1, g, ['n'], {})
149 print "should be 0.70 : %.2f" % test1[0]
153 test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
154 print "should be 0.?? (.09??) : %.2f" % test2[0]
155 print "------ trying the same again:----------"
156 test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
157 print "should be 0.?? (.09??) : %.2f" % test2[0]
160 ##################################################################
161 # just junk from here on down: #
162 ##################################################################
165 # "(pseudo-code / wishful thinking) "
166 # g = initialize(corpus) # or corpus.tagset ?
168 # P = {('v','n','p'):0.09}
169 # # P is used in v_q, w_q (expectation), so each sentence in the
170 # # corpus needs some initial P.
172 # # --- Maximation: ---
174 # # actually, this step (from Lari & Young) probably never happens
175 # # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
176 # # have P_STOP and P_CHOOSE... or, in a sense it happens only we
177 # # calculate P_STOP and P_CHOOSE..somehow.
178 # for rule in g.rules:
181 # for pre_term in range(len(g.p_terminals)):
182 # ptnum[pre_term] = 0
183 # ptden[pre_term] = 0
185 # # we could also flip this to make rules the outer loop, then we
186 # # wouldn't have to initialize den/num in loops of their own
187 # for sent in corpus:
188 # for rule in g.rules # Equation 20
189 # for s in range(len(sent)):
190 # for t in range(s, len(sent)):
191 # rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
192 # rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
193 # # todo: do we need a "new-prob" vs "old-prob" distinction here?
194 # probably, since we use inner/outer which checks rule.prob()
195 # # todo: also, this wouldn't work, since for each sentence, we'd
196 # # discard the old probability; should rules be the outer
198 # rule.prob = rule.num / rule.den
199 # for pre_term in range(len(g.p_terminals)): # Equation 21
202 # for s in range(len(sent)):
203 # for t in range(s, len(sent)):
204 # num += v(t,t,pre_term, g, sent, P[sent])
205 # den += v(s,t,pre_term, g, sent, P[sent])
207 # for rule in g.rules:
208 # rule.prob = rule.num / rule.den
209 # for pre_term in range(len(g.p_terminals)):
210 # g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
213 # # --- Expectation: ---
214 # for sent in corpus: # Equation 11
215 # inside = inner(0, len(sent), ROOT, g, sent)
216 # P[sent] = inside[0]
218 # # todo: set inner.chart to {} again, how?
220 # # todo: need a old-P new-P distinction to check if we're below
221 # # threshold difference
224 # def w(s,t, LHS,L,R, g, sent, P_sent):
226 # rule = g.rule(LHS, L, R)
227 # for r in range(s, t):
228 # w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
231 # def v(s,t, LHS, g, sent, P_sent):
232 # return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent