1 ##################################################################
3 ##################################################################
5 # - more elaborate pseudo-code in the io()-function
8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
9 # returns the pair of (bars, head).
12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
13 # != on two such rules
16 # import numpy # numpy provides Fast Arrays, for future optimization
17 # import pprint # for pretty-printing
21 '''Easily turn on/off inline debug printouts with this global. There's
22 a lot of cluttering debug statements here, todo: clean up'''
27 '''A single CNF rule in the PCFG, of the form
29 where these are just integers
30 (where do we save the connection between number and symbol?
31 symbols being 'vbd' etc.)'''
32 def __eq__(self, other):
33 return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L
34 def __ne__(self, other):
35 return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L
37 return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob)
38 def __init__(self, head, L, R, prob):
47 '''The PCFG used in the I/O-algorithm.
49 Todo: as of now, this allows duplicate rules... should we check
50 for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
51 inner probabilities of 2.)'''
52 def rules(self, head):
53 return [rule for rule in self.p_rules if rule.head == head]
55 def __init__(self, p_rules, p_terminals):
56 '''p_rules and p_terminals should be arrays, where p_terminals are of
57 the form [preterminal, terminal], and p_rules are CNF_Rule's.'''
58 self.p_rules = p_rules # could check for summing to 1 (+/- epsilon)
59 self.p_terminals = p_terminals
63 def inner(s, t, LHS, g, sent, chart):
64 ''' Give the inner probability of having the node LHS cover whatever's
65 between s and t in sentence sent, using grammar g.
67 Returns a pair of the inner probability and the chart
69 For DMV, LHS is a pair (bar, h), but this function ought to be
72 e() is an internal function, so the variable chart (a dictionary)
73 is available to all calls of e().
75 Since terminal probabilities are just simple lookups, they are not
76 put in the chart (although we could put them in there later to
84 if (s, t, LHS) in chart:
85 return chart[(s, t, LHS)]
87 debug( "trying from %d to %d with %s" % (s,t,LHS) )
89 if (LHS, O(s)) in g.p_terminals:
90 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
92 # if a left stop, we need a right stop below, if a right stop,
93 # we need the terminal below. todo.
94 prob = 0.0 # todo: is this the right way to deal with lacking rules?
95 debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
98 if (s,t,LHS) not in chart:
99 chart[(s,t,LHS)] = 0.0
100 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
101 debug( "\tsumming rule %s" % rule )
104 for r in range(s, t): # summing over r = s to r = t-1
105 chart[(s, t, LHS)] += rule.prob * e(s, r, L) * e(r + 1, t, R)
106 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
107 return chart[(s, t, LHS)]
110 inner_prob = e(s,t,LHS)
113 for k,v in chart.iteritems():
114 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
115 print "---CHART:end---"
116 return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)]
125 if __name__ == "__main__":
126 print "IO-module tests:"
128 s = CNF_Rule(0,1,2, 1.0) # s->np vp
129 np = CNF_Rule(1,3,4, 0.3) # np->n p
130 b[1, 'n'] = 0.7 # np->'n'
131 b[3, 'n'] = 1.0 # n->'n'
132 b[4, 'p'] = 1.0 # p->'p'
133 vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
134 vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
135 b[5, 'v'] = 1.0 # v->'v'
137 g = Grammar([s,np,vp,vp2], b)
145 test1 = inner(0,0, 1, g, ['n'], {})
147 print "should be 0.70 : %.2f" % test1[0]
151 test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
152 print "should be 0.?? (.09??) : %.2f" % test2[0]
153 print "------ trying the same again:----------"
154 test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
155 print "should be 0.?? (.09??) : %.2f" % test2[0]
158 ##################################################################
159 # just junk from here on down: #
160 ##################################################################
163 # "(pseudo-code / wishful thinking) "
164 # g = initialize(corpus) # or corpus.tagset ?
166 # P = {('v','n','p'):0.09}
167 # # P is used in v_q, w_q (expectation), so each sentence in the
168 # # corpus needs some initial P.
170 # # --- Maximation: ---
172 # # actually, this step (from Lari & Young) probably never happens
173 # # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
174 # # have P_STOP and P_CHOOSE... or, in a sense it happens only we
175 # # calculate P_STOP and P_CHOOSE..somehow.
176 # for rule in g.p_rules:
179 # for pre_term in range(len(g.p_terminals)):
180 # ptnum[pre_term] = 0
181 # ptden[pre_term] = 0
183 # # we could also flip this to make rules the outer loop, then we
184 # # wouldn't have to initialize den/num in loops of their own
185 # for sent in corpus:
186 # for rule in g.p_rules # Equation 20
187 # for s in range(len(sent)):
188 # for t in range(s, len(sent)):
189 # rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
190 # rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
191 # # todo: do we need a "new-prob" vs "old-prob" distinction here?
192 # probably, since we use inner/outer which checks rule.prob
193 # # todo: also, this wouldn't work, since for each sentence, we'd
194 # # discard the old probability; should rules be the outer
196 # rule.prob = rule.num / rule.den
197 # for pre_term in range(len(g.p_terminals)): # Equation 21
200 # for s in range(len(sent)):
201 # for t in range(s, len(sent)):
202 # num += v(t,t,pre_term, g, sent, P[sent])
203 # den += v(s,t,pre_term, g, sent, P[sent])
205 # for rule in g.p_rules:
206 # rule.prob = rule.num / rule.den
207 # for pre_term in range(len(g.p_terminals)):
208 # g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
211 # # --- Expectation: ---
212 # for sent in corpus: # Equation 11
213 # inside = inner(0, len(sent), ROOT, g, sent)
214 # P[sent] = inside[0]
216 # # todo: set inner.chart to {} again, how?
218 # # todo: need a old-P new-P distinction to check if we're below
219 # # threshold difference
222 # def w(s,t, LHS,L,R, g, sent, P_sent):
224 # rule = g.rule(LHS, L, R)
225 # for r in range(s, t):
226 # w += rule.prob * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
229 # def v(s,t, LHS, g, sent, P_sent):
230 # return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent