added ccm.py
authorKevin Brubeck Unhammer <pixiemotion@gmail.com>
Tue, 30 Sep 2008 09:48:19 +0000 (30 11:48 +0200)
committerKevin Brubeck Unhammer <pixiemotion@gmail.com>
Tue, 30 Sep 2008 09:48:19 +0000 (30 11:48 +0200)
report/report.pdf
report/report.tex
src/ccm.py [new file with mode: 0644]
src/main.py

index 5d07133..2ae2549 100644 (file)
Binary files a/report/report.pdf and b/report/report.pdf differ
index d630da1..328438b 100644 (file)
@@ -127,7 +127,20 @@ A similar calculation shows that $$P_{\text{CONTEXT}}(\beta|\text{true},s)=\frac
 Now let us consider $P_{\text{SPAN}}(\alpha|\text{false},s)$. Again by Bayes' Theorem we have $$P_{\text{SPAN}}(\alpha|\text{false},s)=\frac{P_{\text{SPAN}}(\text{false}|\alpha,s)P_{\text{SPAN}}(\alpha|s)}{P_{\text{SPAN}}(\text{false}|s)}$$
 
 $P_{\text{SPAN}}(\alpha|s)$ is as before. $P_{\text{SPAN}}(\text{false}|s)=1-P_{\text{SPAN}}(\text{true}|s)$. And $P_{\text{SPAN}}(\text{false}|\alpha,s)=1-P_{\text{SPAN}}(\text{true}|\alpha,s)$. Thus 
-$$P_{\text{SPAN}}(\alpha|\text{false},s)=\frac{(1-\frac{1}{\#_s(\alpha)}\displaystyle\sum_{\langle i,j\rangle=\alpha}P_{\text{BRACKET}}(i,j,s))\frac{2n_s-1}{N_s}}{1-\frac{2n_s-1}{N_s}}$$$$=\frac{\#(\alpha)-\Sigma_{<i,j>=\alpha}P_{BRACKET}(i,j,s)}{N_s-2n_s+1}$$
+$$
+P_{\text{SPAN}}(\alpha|\text{false},s)=\frac
+{(1-\frac
+  {1}
+  {\#_s(\alpha)}
+  \displaystyle\sum_{\langle i,j\rangle=\alpha}
+  P_{\text{BRACKET}}(i,j,s))\frac
+  {\#_s(\alpha)}
+  {N_s}}
+{1-\frac{2n_s-1}{N_s}}
+$$
+$$=\frac
+{\#(\alpha)-\Sigma_{<i,j>=\alpha}P_{BRACKET}(i,j,s)}
+{N_s-2n_s+1}$$
 
 Similarly, we have $$P_{CONTEXT}(\beta|false,s) = \frac{\#(\beta)-\Sigma_{>i,j<=\beta}P_{BRACKET}(i,j,s)}{N_s-2n_s+1}$$
 
@@ -549,22 +562,23 @@ possible\footnote{221 parses in the dependency parsed WSJ-10 had
 
 \begin{table*}[hb]
   \centering
-  \begin{tabular}{l|ccc}
-    Model                          & P             & R              & F1          \\
+  \begin{tabular}{l|cc}
+    Model                          & \multicolumn{2}{c}{F1, directed}  \\
     \hline 
-    LBRANCH/RHEAD                  &               &                & 33.6        \\
-    RANDOM                         &               &                & 30.1        \\ 
-    RBRANCH/LHEAD                  &               &                & 24.0        \\
-    K\&M's DMV                     &               &                & 43.2        \\
-    Our DMV:                                                                      \\
-    Uniform initial distribution   & 21.0 (18.7)   & 20.1 (18.1)    & 20.5 (18.4) \\
-    $C_A=0; C_S=1;C_N=0.1;C_M=1$    & 24.8 (24.5)   & 23.7 (23.7)    & 24.2 (24.1) \\ % local screen
-    $C_A=0; C_S=1;C_N=0.1;C_M=10$   & 26.7 (31.6)   & 25.5 (30.5)    & 26.1 (31.0) \\ % uib screen
-%    $C_A=10;C_S=1;C_N=0.1;C_M=10$   & 25.6 (30.6)    & 24.5 (29.6)   & 25.0 (30.1) \\
-    $C_A=10;C_S=1;C_N=3  ;C_M=10$   &               &                & 26.3 (31.4) \\ 
-    $C_A=15;C_S=3;C_N=1  ;C_M=20$   &               &                & 27.2 (32.2) \\
+    LBRANCH/RHEAD                  & \multicolumn{2}{c}{33.6} \\
+    RANDOM                         & \multicolumn{2}{c}{30.1} \\ 
+    RBRANCH/LHEAD                  & \multicolumn{2}{c}{24.0} \\
+    K\&M's DMV                     & \multicolumn{2}{c}{43.2} \\[3pt]
+    Our DMV:                       & no ROOT & ROOT added     \\
+    \hline 
+    Uniform initial distribution   & 24.9    & 22.1 \\
+    $C_A=0; C_S=1;C_N=0.1;C_M=1$    & 25.2    & 24.9 \\ 
+    $C_A=0; C_S=1;C_N=0.1;C_M=10$   & 26.9    & 31.9 \\ 
+%    $C_A=10;C_S=1;C_N=0.1;C_M=10$ & 25.0    & 30.1 \\
+    $C_A=10;C_S=1;C_N=3  ;C_M=10$   & 26.3    & 31.4 \\ 
+    $C_A=15;C_S=3;C_N=1  ;C_M=20$   & 27.2    & 32.2 \\
   \end{tabular}
-  \caption{DMV results on the WSJ-10 for various initialization values (numbers in parentheses are when counting added ROOT links)}
+  \caption{DMV results on the WSJ-10 for various initialization values (the right column is when we counted ROOT links added to the gold parses)}
   \label{tab:dmv-wsj}
 \end{table*}
 
diff --git a/src/ccm.py b/src/ccm.py
new file mode 100644 (file)
index 0000000..677ad02
--- /dev/null
@@ -0,0 +1,346 @@
+# ccm.py -- algorithm for the CCM model,
+# http://repo.or.cz/w/dmvccm.git
+# 
+# Copyright (C) 2008 Emily Morgan
+
+testcorpus = [tuple(s.split()) for s in ['a b c', 'b c a']]
+
+##########################################################
+#functions for dealing with sentences
+#indices refer to spaced between words and are 0-indexed
+
+#beginning and end of sentence tokens, needed for contexts
+BEGIN = "BEGIN"
+END = "END"
+
+
+def alpha(i, j, sen):
+    '''returns span from i to j'''
+    return sen[i:j]
+
+def beta(i,j,sen):
+    '''returns context [i-1,j+1]'''
+    if i==0:
+        first = BEGIN
+    else:
+        first = sen[i-1]
+    if j==len(sen):
+        second = END
+    else:
+        second = sen[j]
+    return (first, second)
+
+#######################################
+#stuff that just needs to happen once
+
+#constants for smoothing
+const=1
+dist=2
+
+def spancontextlists(corpus):
+    '''returns sets of all spans and all contexts'''
+    spanslist = set([])
+    contextslist = set([])
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)-i+1):
+                spanslist.add(alpha(i,i+j,s))
+                contextslist.add(beta(i,i+j,s))                
+    return [spanslist, contextslist]
+
+def p_split(i,j,s):
+    l=j-i
+    n=len(s)
+    if l==0 or l>n:
+        return 0.0
+    elif l==1 or l==n:
+        return 1.0
+    elif n==3:
+        return .5
+    elif n==4:
+        return .4
+    elif n==5:
+        if l==2 or l==4:
+            return 5.0/14
+        else:
+            return 4.0/14
+    else:
+        return -1
+
+def initP_BRACKET(corpus):
+    #no smoothing used yet
+    p_bracket={}
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)-i+1):
+                p_bracket[i,i+j,s]=p_split(i,i+j,s)
+    return p_bracket
+    
+def initP_SPANandCONTEXT(corpus,P_BRACKET):
+    [totalspans,totalconstits]=counts(corpus)
+    [spanlist,contextlist]=spancontextlists(corpus)
+    apbracketsums={}
+    bpbracketsums={}
+    acounts={}
+    bcounts={}
+    P_SPAN={}
+    P_CONTEXT={}
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)-i+1):
+                a=alpha(i,i+j,s)
+                b=beta(i,i+j,s)
+                try:
+                    acounts[a]+=1
+                    apbracketsums[a]+=P_BRACKET[i,i+j,s]
+                except KeyError:
+                    acounts[a]=1
+                    apbracketsums[a]=P_BRACKET[i,i+j,s]
+                try:
+                    bcounts[b]+=1
+                    bpbracketsums[b]+=P_BRACKET[i,i+j,s]
+                except KeyError:
+                    bcounts[b]=1
+                    bpbracketsums[b]=P_BRACKET[i,i+j,s]
+    for a in spanlist:
+        P_SPAN[a,1]=(apbracketsums[a]+const)/(totalconstits+const*len(spanlist))
+        P_SPAN[a,0]=(acounts[a]-apbracketsums[a]+dist)/\
+            (totalspans-totalconstits+dist*len(spanlist))
+    for b in contextlist:
+        P_CONTEXT[b,1]=(bpbracketsums[b]+const)/\
+            (totalconstits+const*len(contextlist))
+        P_CONTEXT[b,0]=(bcounts[b]-bpbracketsums[b]+dist)/\
+            (totalspans-totalconstits+dist*len(contextlist))
+    return [P_SPAN,P_CONTEXT]
+        
+
+#######################################
+#relative frequency estimates
+
+def counts(corpus):
+    '''returns total numbers of spans and constituents in the corpus'''
+    spans = 0
+    constits = 0
+    for s in corpus:
+        n = len(s)
+        spans += (n+1)*(n+2)/2 #counting empty spans
+        constits += 2*n-1
+    return [spans, constits]
+
+def p_relfreqs(corpus):
+    '''returns dictionaries p_span and p_context
+    with relative frequencies for each span and context
+    (irrespective of length)'''
+    [totalspans, totalconstits]=counts(corpus)
+    [spanslist, contextslist]=spancontextlists(corpus)
+    span_counts={}
+    context_counts={}
+    p_span_relfreq={}
+    p_context_relfreq={}
+    for s in spanslist:
+        span_counts[s]=0.0
+    for s in contextslist:
+        context_counts[s]=0.0
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)-i+1):
+                span_counts[alpha(i,i+j,s)]+=1
+                context_counts[beta(i,i+j,s)]+=1
+    for a in spanslist:
+        p_span_relfreq[a]=span_counts[a]/totalspans
+    for b in contextslist:
+        p_context_relfreq[b]=context_counts[b]/totalspans
+    return [p_span_relfreq, p_context_relfreq]
+
+###############
+#reestimation
+
+# def calculatephi(corpus, p_span, p_context):
+#     phi={}
+#     for s in corpus:
+#         for i in range(len(s)+1):
+#             for j in range(len(s)-i+1):
+#                 a=alpha(i,i+j,s)
+#                 b=beta(i,i+j,s)
+#                 if j-i==1:
+#                     phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
+#                         p_context[b,0]
+#                 elif j-i==len(s):
+#                      phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
+#                         p_span[a,0]
+#                 else:
+#                     try:
+#                         phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
+#                             (p_span[a,0]*p_context[b,0])
+#                     except ZeroDivisionError:
+#                         print \
+#                             '''Error! a=%s, b=%s, p_span[a,0]=%s, and p_context[b,0]=%s\n''' % (a,b,p_span[a,0],p_context[b,0])
+#                         phi[i,i+j,s]=p_span[a,1]*p_context[b,1]
+                        
+#     return phi
+
+def calculatephi(corpus, p_span, p_context):
+    phi={}
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)-i+1):
+                a=alpha(i,i+j,s)
+                b=beta(i,i+j,s)
+                phi[i,i+j,s]=p_span[a,1]*p_context[b,1]/ \
+                    (p_span[a,0]*p_context[b,0]) 
+    return phi
+
+def calculateI(corpus,phi):
+    I={}
+    for s in corpus:
+        for jcounter in range(len(s)+1):
+            for i in range(len(s)+1-jcounter):
+                j=i+jcounter
+                if j-i==0:
+                    I[i,j,s]=0
+                elif j-i==1:
+                    I[i,j,s]=phi[i,j,s]
+                else:
+                    sum = 0
+                    for kcounter in range(j-i-1):
+                        k=i+1+kcounter
+                        sum += I[i,k,s]*I[k,j,s]
+                    I[i,j,s]=phi[i,j,s]*sum
+    return I
+
+def calculateO(corpus,phi,I):
+    O={}
+    for s in corpus:
+        O[0,len(s),s]=1
+        for jcounter in [len(s)-1-x for x in range(len(s))]:
+            for i in range(len(s)+1-jcounter):
+                j=i+jcounter
+                sum = 0
+                for k in range(i):
+                    sum+=I[k,i,s]*phi[k,j,s]*O[k,j,s]
+                for k in [j+1+x for x in range(len(s)-j)]:
+                    sum+=I[j,k,s]*phi[i,k,s]*O[i,k,s]
+                O[i,j,s]=sum
+    return O
+                 
+def calculateP_BRACKET(corpus,I,O):
+    P_BRACKET={}
+    for s in corpus:
+        for i in range(len(s)+1):
+            for j in range(len(s)+1-i):
+                P_BRACKET[i,i+j,s]=I[i,i+j,s]*O[i,i+j,s]/I[0,len(s),s]
+    return P_BRACKET
+
+def calculateP_SPANandCONTEXT(corpus,P_BRACKET):
+    P_SPAN={}
+    P_CONTEXT={}
+    for s in corpus:
+        N=(len(s)+1)*(len(s)+2)/2
+        acounts={}
+        apbracketsums={}
+        bcounts={}
+        bpbracketsums={}
+        for i in range(len(s)+1):
+            for j in range(len(s)+1-i):
+                a=alpha(i,i+j,s)
+                b=beta(i,i+j,s)
+                try:
+                    acounts[a]+=1
+                    apbracketsums[a]+=P_BRACKET[i,i+j,s]
+                except KeyError:
+                    acounts[a]=1
+                    apbracketsums[a]=P_BRACKET[i,i+j,s]
+                try:
+                    bcounts[b]+=1
+                    bpbracketsums[b]+=P_BRACKET[i,i+j,s]
+                except KeyError:
+                    bcounts[b]=1
+                    bpbracketsums[b]=P_BRACKET[i,i+j,s]
+        for a in acounts.keys():
+            if len(s)==1:
+                if a==s:
+                    trueval=1
+                else:
+                    trueval=0
+            else:
+                trueval=apbracketsums[a]/(2*len(s)-1)
+            falseval=(acounts[a]-apbracketsums[a])/(N-2*len(s)+1)
+            try:
+                P_SPAN[a,1]+=trueval
+                P_SPAN[a,0]+=falseval
+            except KeyError:
+                P_SPAN[a,1]=trueval
+                P_SPAN[a,0]=falseval
+        for b in bcounts.keys():
+            if len(s)==1:
+                if b==(BEGIN, END):
+                    trueval=1
+                else:
+                    trueval=0
+            else:
+                trueval=bpbracketsums[b]/(2*len(s)-1)
+            falseval=(bcounts[b]-bpbracketsums[b])/(N-2*len(s)+1)
+            try:
+                P_CONTEXT[b,1]+=trueval
+                P_CONTEXT[b,0]+=falseval
+            except KeyError:
+                P_CONTEXT[b,1]=trueval
+                P_CONTEXT[b,0]=falseval
+    [totalspans,totalconstits]=counts(corpus)
+    mtrue, mfalse=.01,.02
+    for k,v in P_SPAN.items():
+        if k[1]==1:
+            P_SPAN[k]=(v+mtrue)/(len(corpus)*(1+mtrue*totalspans))
+        else:
+            P_SPAN[k]=(v+mfalse)/(len(corpus)*(1+mfalse*totalspans))
+    for k,v in P_CONTEXT.items():
+        if k[1]==1:
+            P_CONTEXT[k]=(v+mtrue)/(len(corpus)*(1+mtrue*totalconstits))
+        else:
+            P_CONTEXT[k]=(v+mfalse)/(len(corpus)*(1+mfalse*totalconstits))
+    return [P_SPAN,P_CONTEXT]
+
+def reestimate(corpus,p_span,p_context):
+    phi=calculatephi(corpus,p_span,p_context)
+    I=calculateI(corpus,phi)
+    O=calculateO(corpus,phi,I)
+    P_BRACKET = calculateP_BRACKET(corpus,I,O)
+    [newP_SPAN,newP_CONTEXT]=calculateP_SPANandCONTEXT(corpus,P_BRACKET)
+    return [newP_SPAN,newP_CONTEXT]
+
+    
+
+#############
+#test stuff
+
+def initialp(corpus):
+    '''a stupid initialization that always gives phi=1'''
+    [p_span, p_context]=p_relfreqs(corpus)
+    for k,v in p_span.items():
+        p_span[k,1]= v
+        p_span[k,0]= v
+    for k,v in p_context.items():
+        p_context[k,1]= v
+        p_context[k,0]= v
+    return [p_span, p_context]
+
+if __name__ == "__main__":
+    p_bracket=initP_BRACKET(testcorpus)
+#    print "\n".join(["%s : %s" % (k,v) for k,v in p_bracket.items()])
+    [P_SPAN,P_CONTEXT]=initP_SPANandCONTEXT(testcorpus,p_bracket)
+    for n in range(2):
+#        print "\n".join(["%s : %s" % (k,v) for k,v in P_CONTEXT.items()])
+        [P_SPAN,P_CONTEXT]=reestimate(testcorpus,P_SPAN,P_CONTEXT)
+    print "\n".join(["%s : %s" % (k,v) for k,v in P_SPAN.items()])
+
+
+
+#    [p_span, p_context]=initialp(testcorpus)
+#    phi=calculatephi(testcorpus,p_span,p_context)
+#    I = calculateI(testcorpus,phi)
+#    O = calculateO(testcorpus,phi,I)
+#    P_BRACKET = calculateP_BRACKET(testcorpus,I,O)
+#    [P_SPAN,P_CONTEXT]=calculateP_SPANandCONTEXT(testcorpus,P_BRACKET)
+#    print "\n".join(["%s : %s" % (k,v) for k,v in P_SPAN.items()])
+#    print counts(testcorpus)
+#    print p_relfreqs(testcorpus)
index a4384ba..e6798fd 100644 (file)
@@ -17,10 +17,10 @@ def initialize_loc_h(tagonlys):
 #     loc_h_harmonic.FSTOP_MIN = 13.5744632704
 #     loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
     loc_h_harmonic.HARMONIC_C =  0.0 #120.0 * random.random() # 509.63
-    loc_h_harmonic.STOP_C =      1.0 #3.0 * random.random() 
-    loc_h_harmonic.NSTOP_C =     0.1 #5.0 * random.random() # 0.1
+    loc_h_harmonic.STOP_C =      3.0 #3.0 * random.random() 
+    loc_h_harmonic.NSTOP_C =     1.0 #5.0 * random.random() # 0.1
     loc_h_harmonic.FSTOP_MIN =  10.0 #20.0 * random.random() # 13.08 
-#   $C_A=0; C_S=1;C_N=0.1;C_M=10$
+    
     loc_h_harmonic.RIGHT_FIRST = 1.0
     loc_h_harmonic.OLD_STOP_CALC = False
     print '''
@@ -146,8 +146,10 @@ class Evaluation():
 R:  %5d/%5d = %s | R_r:  %5d/%5d = %s
 F1:               %s | F1_r:               %s (unrooted gold parses: %d, double-headed: %d)'''%str_vals
 
-        tex_str_vals = tuple([p * 100 for p in (self._precision,self._precision_r,self._recall,self._recall_r,self._F1,self._F1_r)])
-        tex_str = "$C_A=; C_S=;C_N=;C_M=$   & %.1f (%.1f)   & %.1f (%.1f)    & %.1f (%.1f) \\"%tex_str_vals
+        if self._precision != self._recall: print regular_str # raise Exception todo
+        
+        tex_str_vals = tuple([p * 100 for p in (self._F1,self._F1_r)])
+        tex_str = "$C_A=; C_S=;C_N=;C_M=$   & %.1f   & %.1f \\"%tex_str_vals
 
         return tex_str # todo make variable