added ccm.py
[dmvccm.git] / src / ccm.py
blob677ad02fb70d56c9ac922bc80e1273172bc1cdac
1 # ccm.py -- algorithm for the CCM model,
2 # http://repo.or.cz/w/dmvccm.git
3 #
4 # Copyright (C) 2008 Emily Morgan
6 testcorpus = [tuple(s.split()) for s in ['a b c', 'b c a']]
8 ##########################################################
9 #functions for dealing with sentences
10 #indices refer to spaced between words and are 0-indexed
12 #beginning and end of sentence tokens, needed for contexts
13 BEGIN = "BEGIN"
14 END = "END"
17 def alpha(i, j, sen):
18 '''returns span from i to j'''
19 return sen[i:j]
21 def beta(i,j,sen):
22 '''returns context [i-1,j+1]'''
23 if i==0:
24 first = BEGIN
25 else:
26 first = sen[i-1]
27 if j==len(sen):
28 second = END
29 else:
30 second = sen[j]
31 return (first, second)
33 #######################################
34 #stuff that just needs to happen once
36 #constants for smoothing
37 const=1
38 dist=2
40 def spancontextlists(corpus):
41 '''returns sets of all spans and all contexts'''
42 spanslist = set([])
43 contextslist = set([])
44 for s in corpus:
45 for i in range(len(s)+1):
46 for j in range(len(s)-i+1):
47 spanslist.add(alpha(i,i+j,s))
48 contextslist.add(beta(i,i+j,s))
49 return [spanslist, contextslist]
51 def p_split(i,j,s):
52 l=j-i
53 n=len(s)
54 if l==0 or l>n:
55 return 0.0
56 elif l==1 or l==n:
57 return 1.0
58 elif n==3:
59 return .5
60 elif n==4:
61 return .4
62 elif n==5:
63 if l==2 or l==4:
64 return 5.0/14
65 else:
66 return 4.0/14
67 else:
68 return -1
70 def initP_BRACKET(corpus):
71 #no smoothing used yet
72 p_bracket={}
73 for s in corpus:
74 for i in range(len(s)+1):
75 for j in range(len(s)-i+1):
76 p_bracket[i,i+j,s]=p_split(i,i+j,s)
77 return p_bracket
79 def initP_SPANandCONTEXT(corpus,P_BRACKET):
80 [totalspans,totalconstits]=counts(corpus)
81 [spanlist,contextlist]=spancontextlists(corpus)
82 apbracketsums={}
83 bpbracketsums={}
84 acounts={}
85 bcounts={}
86 P_SPAN={}
87 P_CONTEXT={}
88 for s in corpus:
89 for i in range(len(s)+1):
90 for j in range(len(s)-i+1):
91 a=alpha(i,i+j,s)
92 b=beta(i,i+j,s)
93 try:
94 acounts[a]+=1
95 apbracketsums[a]+=P_BRACKET[i,i+j,s]
96 except KeyError:
97 acounts[a]=1
98 apbracketsums[a]=P_BRACKET[i,i+j,s]
99 try:
100 bcounts[b]+=1
101 bpbracketsums[b]+=P_BRACKET[i,i+j,s]
102 except KeyError:
103 bcounts[b]=1
104 bpbracketsums[b]=P_BRACKET[i,i+j,s]
105 for a in spanlist:
106 P_SPAN[a,1]=(apbracketsums[a]+const)/(totalconstits+const*len(spanlist))
107 P_SPAN[a,0]=(acounts[a]-apbracketsums[a]+dist)/\
108 (totalspans-totalconstits+dist*len(spanlist))
109 for b in contextlist:
110 P_CONTEXT[b,1]=(bpbracketsums[b]+const)/\
111 (totalconstits+const*len(contextlist))
112 P_CONTEXT[b,0]=(bcounts[b]-bpbracketsums[b]+dist)/\
113 (totalspans-totalconstits+dist*len(contextlist))
114 return [P_SPAN,P_CONTEXT]
117 #######################################
118 #relative frequency estimates
120 def counts(corpus):
121 '''returns total numbers of spans and constituents in the corpus'''
122 spans = 0
123 constits = 0
124 for s in corpus:
125 n = len(s)
126 spans += (n+1)*(n+2)/2 #counting empty spans
127 constits += 2*n-1
128 return [spans, constits]
130 def p_relfreqs(corpus):
131 '''returns dictionaries p_span and p_context
132 with relative frequencies for each span and context
133 (irrespective of length)'''
134 [totalspans, totalconstits]=counts(corpus)
135 [spanslist, contextslist]=spancontextlists(corpus)
136 span_counts={}
137 context_counts={}
138 p_span_relfreq={}
139 p_context_relfreq={}
140 for s in spanslist:
141 span_counts[s]=0.0
142 for s in contextslist:
143 context_counts[s]=0.0
144 for s in corpus:
145 for i in range(len(s)+1):
146 for j in range(len(s)-i+1):
147 span_counts[alpha(i,i+j,s)]+=1
148 context_counts[beta(i,i+j,s)]+=1
149 for a in spanslist:
150 p_span_relfreq[a]=span_counts[a]/totalspans
151 for b in contextslist:
152 p_context_relfreq[b]=context_counts[b]/totalspans
153 return [p_span_relfreq, p_context_relfreq]
155 ###############
156 #reestimation
158 # def calculatephi(corpus, p_span, p_context):
159 # phi={}
160 # for s in corpus:
161 # for i in range(len(s)+1):
162 # for j in range(len(s)-i+1):
163 # a=alpha(i,i+j,s)
164 # b=beta(i,i+j,s)
165 # if j-i==1:
166 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
167 # p_context[b,0]
168 # elif j-i==len(s):
169 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
170 # p_span[a,0]
171 # else:
172 # try:
173 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
174 # (p_span[a,0]*p_context[b,0])
175 # except ZeroDivisionError:
176 # print \
177 # '''Error! a=%s, b=%s, p_span[a,0]=%s, and p_context[b,0]=%s\n''' % (a,b,p_span[a,0],p_context[b,0])
178 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]
180 # return phi
182 def calculatephi(corpus, p_span, p_context):
183 phi={}
184 for s in corpus:
185 for i in range(len(s)+1):
186 for j in range(len(s)-i+1):
187 a=alpha(i,i+j,s)
188 b=beta(i,i+j,s)
189 phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
190 (p_span[a,0]*p_context[b,0])
191 return phi
193 def calculateI(corpus,phi):
194 I={}
195 for s in corpus:
196 for jcounter in range(len(s)+1):
197 for i in range(len(s)+1-jcounter):
198 j=i+jcounter
199 if j-i==0:
200 I[i,j,s]=0
201 elif j-i==1:
202 I[i,j,s]=phi[i,j,s]
203 else:
204 sum = 0
205 for kcounter in range(j-i-1):
206 k=i+1+kcounter
207 sum += I[i,k,s]*I[k,j,s]
208 I[i,j,s]=phi[i,j,s]*sum
209 return I
211 def calculateO(corpus,phi,I):
212 O={}
213 for s in corpus:
214 O[0,len(s),s]=1
215 for jcounter in [len(s)-1-x for x in range(len(s))]:
216 for i in range(len(s)+1-jcounter):
217 j=i+jcounter
218 sum = 0
219 for k in range(i):
220 sum+=I[k,i,s]*phi[k,j,s]*O[k,j,s]
221 for k in [j+1+x for x in range(len(s)-j)]:
222 sum+=I[j,k,s]*phi[i,k,s]*O[i,k,s]
223 O[i,j,s]=sum
224 return O
226 def calculateP_BRACKET(corpus,I,O):
227 P_BRACKET={}
228 for s in corpus:
229 for i in range(len(s)+1):
230 for j in range(len(s)+1-i):
231 P_BRACKET[i,i+j,s]=I[i,i+j,s]*O[i,i+j,s]/I[0,len(s),s]
232 return P_BRACKET
234 def calculateP_SPANandCONTEXT(corpus,P_BRACKET):
235 P_SPAN={}
236 P_CONTEXT={}
237 for s in corpus:
238 N=(len(s)+1)*(len(s)+2)/2
239 acounts={}
240 apbracketsums={}
241 bcounts={}
242 bpbracketsums={}
243 for i in range(len(s)+1):
244 for j in range(len(s)+1-i):
245 a=alpha(i,i+j,s)
246 b=beta(i,i+j,s)
247 try:
248 acounts[a]+=1
249 apbracketsums[a]+=P_BRACKET[i,i+j,s]
250 except KeyError:
251 acounts[a]=1
252 apbracketsums[a]=P_BRACKET[i,i+j,s]
253 try:
254 bcounts[b]+=1
255 bpbracketsums[b]+=P_BRACKET[i,i+j,s]
256 except KeyError:
257 bcounts[b]=1
258 bpbracketsums[b]=P_BRACKET[i,i+j,s]
259 for a in acounts.keys():
260 if len(s)==1:
261 if a==s:
262 trueval=1
263 else:
264 trueval=0
265 else:
266 trueval=apbracketsums[a]/(2*len(s)-1)
267 falseval=(acounts[a]-apbracketsums[a])/(N-2*len(s)+1)
268 try:
269 P_SPAN[a,1]+=trueval
270 P_SPAN[a,0]+=falseval
271 except KeyError:
272 P_SPAN[a,1]=trueval
273 P_SPAN[a,0]=falseval
274 for b in bcounts.keys():
275 if len(s)==1:
276 if b==(BEGIN, END):
277 trueval=1
278 else:
279 trueval=0
280 else:
281 trueval=bpbracketsums[b]/(2*len(s)-1)
282 falseval=(bcounts[b]-bpbracketsums[b])/(N-2*len(s)+1)
283 try:
284 P_CONTEXT[b,1]+=trueval
285 P_CONTEXT[b,0]+=falseval
286 except KeyError:
287 P_CONTEXT[b,1]=trueval
288 P_CONTEXT[b,0]=falseval
289 [totalspans,totalconstits]=counts(corpus)
290 mtrue, mfalse=.01,.02
291 for k,v in P_SPAN.items():
292 if k[1]==1:
293 P_SPAN[k]=(v+mtrue)/(len(corpus)*(1+mtrue*totalspans))
294 else:
295 P_SPAN[k]=(v+mfalse)/(len(corpus)*(1+mfalse*totalspans))
296 for k,v in P_CONTEXT.items():
297 if k[1]==1:
298 P_CONTEXT[k]=(v+mtrue)/(len(corpus)*(1+mtrue*totalconstits))
299 else:
300 P_CONTEXT[k]=(v+mfalse)/(len(corpus)*(1+mfalse*totalconstits))
301 return [P_SPAN,P_CONTEXT]
303 def reestimate(corpus,p_span,p_context):
304 phi=calculatephi(corpus,p_span,p_context)
305 I=calculateI(corpus,phi)
306 O=calculateO(corpus,phi,I)
307 P_BRACKET = calculateP_BRACKET(corpus,I,O)
308 [newP_SPAN,newP_CONTEXT]=calculateP_SPANandCONTEXT(corpus,P_BRACKET)
309 return [newP_SPAN,newP_CONTEXT]
313 #############
314 #test stuff
316 def initialp(corpus):
317 '''a stupid initialization that always gives phi=1'''
318 [p_span, p_context]=p_relfreqs(corpus)
319 for k,v in p_span.items():
320 p_span[k,1]= v
321 p_span[k,0]= v
322 for k,v in p_context.items():
323 p_context[k,1]= v
324 p_context[k,0]= v
325 return [p_span, p_context]
327 if __name__ == "__main__":
328 p_bracket=initP_BRACKET(testcorpus)
329 # print "\n".join(["%s : %s" % (k,v) for k,v in p_bracket.items()])
330 [P_SPAN,P_CONTEXT]=initP_SPANandCONTEXT(testcorpus,p_bracket)
331 for n in range(2):
332 # print "\n".join(["%s : %s" % (k,v) for k,v in P_CONTEXT.items()])
333 [P_SPAN,P_CONTEXT]=reestimate(testcorpus,P_SPAN,P_CONTEXT)
334 print "\n".join(["%s : %s" % (k,v) for k,v in P_SPAN.items()])
338 # [p_span, p_context]=initialp(testcorpus)
339 # phi=calculatephi(testcorpus,p_span,p_context)
340 # I = calculateI(testcorpus,phi)
341 # O = calculateO(testcorpus,phi,I)
342 # P_BRACKET = calculateP_BRACKET(testcorpus,I,O)
343 # [P_SPAN,P_CONTEXT]=calculateP_SPANandCONTEXT(testcorpus,P_BRACKET)
344 # print "\n".join(["%s : %s" % (k,v) for k,v in P_SPAN.items()])
345 # print counts(testcorpus)
346 # print p_relfreqs(testcorpus)