src/io.py.~2~

   1 ##################################################################
   2 #                            Changes:                            #
   3 ##################################################################
   4 # 2008-05-24, KBU:
   5 # - more elaborate pseudo-code in the io()-function
   6 #
   7 # 2008-05-25, KBU:
   8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
   9 #   returns the pair of (bars, head).
  10 #
  11 # 2008-05-27
  12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
  13 #   != on two such rules
  14
  15
  16 # import numpy # numpy provides Fast Arrays, for future optimization
  17 # import pprint # for pretty-printing
  18
  19 DEBUG = 0
  20 def debug(string):
  21     '''Easily turn on/off inline debug printouts with this global. There's
  22 a lot of cluttering debug statements here, todo: clean up'''
  23     if DEBUG:
  24         print string
  25
  26 class CNF_Rule():
  27     '''A single CNF rule in the PCFG, of the form
  28     head -> L R
  29     where these are just integers
  30     (where do we save the connection between number and symbol?
  31     symbols being 'vbd' etc.)'''
  32     def __eq__(self, other):
  33         return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L
  34     def __ne__(self, other):
  35         return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L
  36     def __str__(self):
  37         return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob)
  38     def __init__(self, head, L, R, prob):
  39         self.head = head
  40         self.R = R
  41         self.L = L
  42         self.prob = prob
  43     def p(self, s, t, g):
  44         return self.prob
  45     def LHS(self):
  46         return self.head
  47
  48 class Grammar():
  49     '''The PCFG used in the I/O-algorithm.
  50
  51     Todo: as of now, this allows duplicate rules... should we check
  52     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  53     inner probabilities of 2.)'''
  54     def rules(self, head):
  55         return [rule for rule in self.rules if rule.head == head]
  56
  57     def __init__(self, rules, p_terminals):
  58         '''rules and p_terminals should be arrays, where p_terminals are of
  59         the form [preterminal, terminal], and rules are CNF_Rule's.'''
  60         self.rules = rules # could check for summing to 1 (+/- epsilon)
  61         self.p_terminals = p_terminals
  62
  63
  64
  65 def inner(s, t, LHS, g, sent, chart):
  66     ''' Give the inner probability of having the node LHS cover whatever's
  67     between s and t in sentence sent, using grammar g.
  68
  69     Returns a pair of the inner probability and the chart
  70
  71     For DMV, LHS is a pair (bar, h), but this function ought to be
  72     agnostic about that.
  73
  74     e() is an internal function, so the variable chart (a dictionary)
  75     is available to all calls of e().
  76
  77     Since terminal probabilities are just simple lookups, they are not
  78     put in the chart (although we could put them in there later to
  79     optimize)
  80     '''
  81
  82     def O(s):
  83         return sent[s]
  84
  85     def e(s,t,LHS):
  86         if (s, t, LHS) in chart:
  87             return chart[(s, t, LHS)]
  88         else:
  89             debug( "trying from %d to %d with %s" % (s,t,LHS) )
  90             if s == t:
  91                 if (LHS, O(s)) in g.p_terminals:
  92                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
  93                 else:
  94                     # if a left stop, we need a right stop below, if a right stop,
  95                     # we need the terminal below. todo.
  96                     prob = 0.0 # todo: is this the right way to deal with lacking rules?
  97                     debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
  98                 return prob
  99             else:
 100                 if (s,t,LHS) not in chart:
 101                     chart[(s,t,LHS)] = 0.0
 102                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 103                     debug( "\tsumming rule %s" % rule )
 104                     L = rule.L
 105                     R = rule.R
 106                     for r in range(s, t): # summing over r = s to r = t-1
 107                         chart[(s, t, LHS)] += rule.p(s,t,g) * e(s, r, L) * e(r + 1, t, R)
 108                 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
 109                 return chart[(s, t, LHS)]
 110     # end of e-function
 111
 112     inner_prob = e(s,t,LHS)
 113     if DEBUG:
 114         print "---CHART:---"
 115         for k,v in chart.iteritems():
 116             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 117         print "---CHART:end---"
 118     return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)]
 119
 120
 121
 122
 123
 124
 125
 126
 127 if __name__ == "__main__":
 128     print "IO-module tests:"
 129     b = {}
 130     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 131     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 132     b[1, 'n'] = 0.7 # np->'n'
 133     b[3, 'n'] = 1.0 # n->'n'
 134     b[4, 'p'] = 1.0 # p->'p'
 135     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 136     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 137     b[5, 'v'] = 1.0 # v->'v'
 138
 139     g = Grammar([s,np,vp,vp2], b)
 140
 141     print "The rules:"
 142     for i in range(0,5):
 143         for r in g.rules(i):
 144             print r
 145     print ""
 146
 147     test1 = inner(0,0, 1, g, ['n'], {})
 148     if test1[0] != 0.7:
 149         print "should be 0.70 : %.2f" % test1[0]
 150         print ""
 151
 152     DEBUG = 1
 153     test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
 154     print "should be 0.?? (.09??) : %.2f" % test2[0]
 155     print "------ trying the same again:----------"
 156     test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
 157     print "should be 0.?? (.09??) : %.2f" % test2[0]
 158
 159
 160 ##################################################################
 161 #            just junk from here on down:                        #
 162 ##################################################################
 163
 164 # def io(corpus):
 165 #     "(pseudo-code / wishful thinking) "
 166 #     g = initialize(corpus) # or corpus.tagset ?
 167
 168 #     P = {('v','n','p'):0.09}
 169 #     # P is used in v_q, w_q (expectation), so each sentence in the
 170 #     # corpus needs some initial P.
 171
 172 #     # --- Maximation: ---
 173 #     #
 174 #     # actually, this step (from Lari & Young) probably never happens
 175 #     # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
 176 #     # have P_STOP and P_CHOOSE... or, in a sense it happens only we
 177 #     # calculate P_STOP and P_CHOOSE..somehow.
 178 #     for rule in g.rules:
 179 #         rule.num = 0
 180 #         rule.den = 0
 181 #     for pre_term in range(len(g.p_terminals)):
 182 #         ptnum[pre_term] = 0
 183 #         ptden[pre_term] = 0
 184
 185 #     # we could also flip this to make rules the outer loop, then we
 186 #     # wouldn't have to initialize den/num in loops of their own
 187 #     for sent in corpus:
 188 #         for rule in g.rules # Equation 20
 189 #             for s in range(len(sent)):
 190 #                 for t in range(s, len(sent)):
 191 #                     rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
 192 #                     rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
 193 #             # todo: do we need a "new-prob" vs "old-prob" distinction here?
 194 #                     probably, since we use inner/outer which checks rule.prob()
 195 #             # todo: also, this wouldn't work, since for each sentence, we'd
 196 #             #       discard the old probability; should rules be the outer
 197 #             #       loop then?
 198 #             rule.prob = rule.num / rule.den
 199 #         for pre_term in range(len(g.p_terminals)): # Equation 21
 200 #             num = 0
 201 #             den = 0
 202 #             for s in range(len(sent)):
 203 #                 for t in range(s, len(sent)):
 204 #                     num += v(t,t,pre_term, g, sent, P[sent])
 205 #                     den += v(s,t,pre_term, g, sent, P[sent])
 206
 207 #     for rule in g.rules:
 208 #         rule.prob = rule.num / rule.den
 209 #     for pre_term in range(len(g.p_terminals)):
 210 #         g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
 211
 212
 213 #     # --- Expectation: ---
 214 #     for sent in corpus: # Equation 11
 215 #         inside = inner(0, len(sent), ROOT, g, sent)
 216 #         P[sent] = inside[0]
 217
 218 #     # todo: set inner.chart to {} again, how?
 219
 220 #     # todo: need a old-P new-P distinction to check if we're below
 221 #     # threshold difference
 222 #     return "todo"
 223
 224 # def w(s,t, LHS,L,R, g, sent, P_sent):
 225 #     w = 0
 226 #     rule = g.rule(LHS, L, R)
 227 #     for r in range(s, t):
 228 #         w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
 229 #     return w / P_sent
 230
 231 # def v(s,t, LHS, g, sent, P_sent):
 232 #     return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent
 233
 234
 235
 236
 237