1 # ccm.py -- algorithm for the CCM model,
2 # http://repo.or.cz/w/dmvccm.git
4 # Copyright (C) 2008 Emily Morgan
6 testcorpus
= [tuple(s
.split()) for s
in ['a b c', 'b c a']]
8 ##########################################################
9 #functions for dealing with sentences
10 #indices refer to spaced between words and are 0-indexed
12 #beginning and end of sentence tokens, needed for contexts
18 '''returns span from i to j'''
22 '''returns context [i-1,j+1]'''
31 return (first
, second
)
33 #######################################
34 #stuff that just needs to happen once
36 #constants for smoothing
40 def spancontextlists(corpus
):
41 '''returns sets of all spans and all contexts'''
43 contextslist
= set([])
45 for i
in range(len(s
)+1):
46 for j
in range(len(s
)-i
+1):
47 spanslist
.add(alpha(i
,i
+j
,s
))
48 contextslist
.add(beta(i
,i
+j
,s
))
49 return [spanslist
, contextslist
]
70 def initP_BRACKET(corpus
):
71 #no smoothing used yet
74 for i
in range(len(s
)+1):
75 for j
in range(len(s
)-i
+1):
76 p_bracket
[i
,i
+j
,s
]=p_split(i
,i
+j
,s
)
79 def initP_SPANandCONTEXT(corpus
,P_BRACKET
):
80 [totalspans
,totalconstits
]=counts(corpus
)
81 [spanlist
,contextlist
]=spancontextlists(corpus
)
89 for i
in range(len(s
)+1):
90 for j
in range(len(s
)-i
+1):
95 apbracketsums
[a
]+=P_BRACKET
[i
,i
+j
,s
]
98 apbracketsums
[a
]=P_BRACKET
[i
,i
+j
,s
]
101 bpbracketsums
[b
]+=P_BRACKET
[i
,i
+j
,s
]
104 bpbracketsums
[b
]=P_BRACKET
[i
,i
+j
,s
]
106 P_SPAN
[a
,1]=(apbracketsums
[a
]+const
)/(totalconstits
+const
*len(spanlist
))
107 P_SPAN
[a
,0]=(acounts
[a
]-apbracketsums
[a
]+dist
)/\
108 (totalspans
-totalconstits
+dist
*len(spanlist
))
109 for b
in contextlist
:
110 P_CONTEXT
[b
,1]=(bpbracketsums
[b
]+const
)/\
111 (totalconstits
+const
*len(contextlist
))
112 P_CONTEXT
[b
,0]=(bcounts
[b
]-bpbracketsums
[b
]+dist
)/\
113 (totalspans
-totalconstits
+dist
*len(contextlist
))
114 return [P_SPAN
,P_CONTEXT
]
117 #######################################
118 #relative frequency estimates
121 '''returns total numbers of spans and constituents in the corpus'''
126 spans
+= (n
+1)*(n
+2)/2 #counting empty spans
128 return [spans
, constits
]
130 def p_relfreqs(corpus
):
131 '''returns dictionaries p_span and p_context
132 with relative frequencies for each span and context
133 (irrespective of length)'''
134 [totalspans
, totalconstits
]=counts(corpus
)
135 [spanslist
, contextslist
]=spancontextlists(corpus
)
142 for s
in contextslist
:
143 context_counts
[s
]=0.0
145 for i
in range(len(s
)+1):
146 for j
in range(len(s
)-i
+1):
147 span_counts
[alpha(i
,i
+j
,s
)]+=1
148 context_counts
[beta(i
,i
+j
,s
)]+=1
150 p_span_relfreq
[a
]=span_counts
[a
]/totalspans
151 for b
in contextslist
:
152 p_context_relfreq
[b
]=context_counts
[b
]/totalspans
153 return [p_span_relfreq
, p_context_relfreq
]
158 # def calculatephi(corpus, p_span, p_context):
161 # for i in range(len(s)+1):
162 # for j in range(len(s)-i+1):
166 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
169 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
173 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
174 # (p_span[a,0]*p_context[b,0])
175 # except ZeroDivisionError:
177 # '''Error! a=%s, b=%s, p_span[a,0]=%s, and p_context[b,0]=%s\n''' % (a,b,p_span[a,0],p_context[b,0])
178 # phi[i,i+j,s]=p_span[a,1]*p_context[b,1]
182 def calculatephi(corpus
, p_span
, p_context
):
185 for i
in range(len(s
)+1):
186 for j
in range(len(s
)-i
+1):
189 phi
[i
,i
+j
,s
]=p_span
[a
,1]*p_context
[b
,1]/ \
190 (p_span
[a
,0]*p_context
[b
,0])
193 def calculateI(corpus
,phi
):
196 for jcounter
in range(len(s
)+1):
197 for i
in range(len(s
)+1-jcounter
):
205 for kcounter
in range(j
-i
-1):
207 sum += I
[i
,k
,s
]*I
[k
,j
,s
]
208 I
[i
,j
,s
]=phi
[i
,j
,s
]*sum
211 def calculateO(corpus
,phi
,I
):
215 for jcounter
in [len(s
)-1-x
for x
in range(len(s
))]:
216 for i
in range(len(s
)+1-jcounter
):
220 sum+=I
[k
,i
,s
]*phi
[k
,j
,s
]*O
[k
,j
,s
]
221 for k
in [j
+1+x
for x
in range(len(s
)-j
)]:
222 sum+=I
[j
,k
,s
]*phi
[i
,k
,s
]*O
[i
,k
,s
]
226 def calculateP_BRACKET(corpus
,I
,O
):
229 for i
in range(len(s
)+1):
230 for j
in range(len(s
)+1-i
):
231 P_BRACKET
[i
,i
+j
,s
]=I
[i
,i
+j
,s
]*O
[i
,i
+j
,s
]/I
[0,len(s
),s
]
234 def calculateP_SPANandCONTEXT(corpus
,P_BRACKET
):
238 N
=(len(s
)+1)*(len(s
)+2)/2
243 for i
in range(len(s
)+1):
244 for j
in range(len(s
)+1-i
):
249 apbracketsums
[a
]+=P_BRACKET
[i
,i
+j
,s
]
252 apbracketsums
[a
]=P_BRACKET
[i
,i
+j
,s
]
255 bpbracketsums
[b
]+=P_BRACKET
[i
,i
+j
,s
]
258 bpbracketsums
[b
]=P_BRACKET
[i
,i
+j
,s
]
259 for a
in acounts
.keys():
266 trueval
=apbracketsums
[a
]/(2*len(s
)-1)
267 falseval
=(acounts
[a
]-apbracketsums
[a
])/(N
-2*len(s
)+1)
270 P_SPAN
[a
,0]+=falseval
274 for b
in bcounts
.keys():
281 trueval
=bpbracketsums
[b
]/(2*len(s
)-1)
282 falseval
=(bcounts
[b
]-bpbracketsums
[b
])/(N
-2*len(s
)+1)
284 P_CONTEXT
[b
,1]+=trueval
285 P_CONTEXT
[b
,0]+=falseval
287 P_CONTEXT
[b
,1]=trueval
288 P_CONTEXT
[b
,0]=falseval
289 [totalspans
,totalconstits
]=counts(corpus
)
290 mtrue
, mfalse
=.01,.02
291 for k
,v
in P_SPAN
.items():
293 P_SPAN
[k
]=(v
+mtrue
)/(len(corpus
)*(1+mtrue
*totalspans
))
295 P_SPAN
[k
]=(v
+mfalse
)/(len(corpus
)*(1+mfalse
*totalspans
))
296 for k
,v
in P_CONTEXT
.items():
298 P_CONTEXT
[k
]=(v
+mtrue
)/(len(corpus
)*(1+mtrue
*totalconstits
))
300 P_CONTEXT
[k
]=(v
+mfalse
)/(len(corpus
)*(1+mfalse
*totalconstits
))
301 return [P_SPAN
,P_CONTEXT
]
303 def reestimate(corpus
,p_span
,p_context
):
304 phi
=calculatephi(corpus
,p_span
,p_context
)
305 I
=calculateI(corpus
,phi
)
306 O
=calculateO(corpus
,phi
,I
)
307 P_BRACKET
= calculateP_BRACKET(corpus
,I
,O
)
308 [newP_SPAN
,newP_CONTEXT
]=calculateP_SPANandCONTEXT(corpus
,P_BRACKET
)
309 return [newP_SPAN
,newP_CONTEXT
]
316 def initialp(corpus
):
317 '''a stupid initialization that always gives phi=1'''
318 [p_span
, p_context
]=p_relfreqs(corpus
)
319 for k
,v
in p_span
.items():
322 for k
,v
in p_context
.items():
325 return [p_span
, p_context
]
327 if __name__
== "__main__":
328 p_bracket
=initP_BRACKET(testcorpus
)
329 # print "\n".join(["%s : %s" % (k,v) for k,v in p_bracket.items()])
330 [P_SPAN
,P_CONTEXT
]=initP_SPANandCONTEXT(testcorpus
,p_bracket
)
332 # print "\n".join(["%s : %s" % (k,v) for k,v in P_CONTEXT.items()])
333 [P_SPAN
,P_CONTEXT
]=reestimate(testcorpus
,P_SPAN
,P_CONTEXT
)
334 print "\n".join(["%s : %s" % (k
,v
) for k
,v
in P_SPAN
.items()])
338 # [p_span, p_context]=initialp(testcorpus)
339 # phi=calculatephi(testcorpus,p_span,p_context)
340 # I = calculateI(testcorpus,phi)
341 # O = calculateO(testcorpus,phi,I)
342 # P_BRACKET = calculateP_BRACKET(testcorpus,I,O)
343 # [P_SPAN,P_CONTEXT]=calculateP_SPANandCONTEXT(testcorpus,P_BRACKET)
344 # print "\n".join(["%s : %s" % (k,v) for k,v in P_SPAN.items()])
345 # print counts(testcorpus)
346 # print p_relfreqs(testcorpus)