src/io.py.~1~

   1 ##################################################################
   2 #                            Changes:                            #
   3 ##################################################################
   4 # 2008-05-24, KBU:
   5 # - more elaborate pseudo-code in the io()-function
   6 #
   7 # 2008-05-25, KBU:
   8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
   9 #   returns the pair of (bars, head).
  10 #
  11 # 2008-05-27
  12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
  13 #   != on two such rules
  14
  15
  16 # import numpy # numpy provides Fast Arrays, for future optimization
  17 # import pprint # for pretty-printing
  18
  19 DEBUG = 0
  20 def debug(string):
  21     '''Easily turn on/off inline debug printouts with this global. There's
  22 a lot of cluttering debug statements here, todo: clean up'''
  23     if DEBUG:
  24         print string
  25
  26 class CNF_Rule():
  27     '''A single CNF rule in the PCFG, of the form
  28     head -> L R
  29     where these are just integers
  30     (where do we save the connection between number and symbol?
  31     symbols being 'vbd' etc.)'''
  32     def __eq__(self, other):
  33         return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L
  34     def __ne__(self, other):
  35         return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L
  36     def __str__(self):
  37         return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob)
  38     def __init__(self, head, L, R, prob):
  39         self.head = head
  40         self.R = R
  41         self.L = L
  42         self.prob = prob
  43     def LHS(self):
  44         return self.head
  45
  46 class Grammar():
  47     '''The PCFG used in the I/O-algorithm.
  48
  49     Todo: as of now, this allows duplicate rules... should we check
  50     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  51     inner probabilities of 2.)'''
  52     def rules(self, head):
  53         return [rule for rule in self.p_rules if rule.head == head]
  54
  55     def __init__(self, p_rules, p_terminals):
  56         '''p_rules and p_terminals should be arrays, where p_terminals are of
  57         the form [preterminal, terminal], and p_rules are CNF_Rule's.'''
  58         self.p_rules = p_rules # could check for summing to 1 (+/- epsilon)
  59         self.p_terminals = p_terminals
  60
  61
  62
  63 def inner(s, t, LHS, g, sent, chart):
  64     ''' Give the inner probability of having the node LHS cover whatever's
  65     between s and t in sentence sent, using grammar g.
  66
  67     Returns a pair of the inner probability and the chart
  68
  69     For DMV, LHS is a pair (bar, h), but this function ought to be
  70     agnostic about that.
  71
  72     e() is an internal function, so the variable chart (a dictionary)
  73     is available to all calls of e().
  74
  75     Since terminal probabilities are just simple lookups, they are not
  76     put in the chart (although we could put them in there later to
  77     optimize)
  78     '''
  79
  80     def O(s):
  81         return sent[s]
  82
  83     def e(s,t,LHS):
  84         if (s, t, LHS) in chart:
  85             return chart[(s, t, LHS)]
  86         else:
  87             debug( "trying from %d to %d with %s" % (s,t,LHS) )
  88             if s == t:
  89                 if (LHS, O(s)) in g.p_terminals:
  90                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
  91                 else:
  92                     # if a left stop, we need a right stop below, if a right stop,
  93                     # we need the terminal below. todo.
  94                     prob = 0.0 # todo: is this the right way to deal with lacking rules?
  95                     debug( "\tterminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
  96                 return prob
  97             else:
  98                 if (s,t,LHS) not in chart:
  99                     chart[(s,t,LHS)] = 0.0
 100                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 101                     debug( "\tsumming rule %s" % rule )
 102                     L = rule.L
 103                     R = rule.R
 104                     for r in range(s, t): # summing over r = s to r = t-1
 105                         chart[(s, t, LHS)] += rule.prob * e(s, r, L) * e(r + 1, t, R)
 106                 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
 107                 return chart[(s, t, LHS)]
 108     # end of e-function
 109
 110     inner_prob = e(s,t,LHS)
 111     if DEBUG:
 112         print "---CHART:---"
 113         for k,v in chart.iteritems():
 114             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 115         print "---CHART:end---"
 116     return (inner_prob, chart) # inner_prob == chart[(s,t,LHS)]
 117
 118
 119
 120
 121
 122
 123
 124
 125 if __name__ == "__main__":
 126     print "IO-module tests:"
 127     b = {}
 128     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 129     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 130     b[1, 'n'] = 0.7 # np->'n'
 131     b[3, 'n'] = 1.0 # n->'n'
 132     b[4, 'p'] = 1.0 # p->'p'
 133     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 134     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 135     b[5, 'v'] = 1.0 # v->'v'
 136
 137     g = Grammar([s,np,vp,vp2], b)
 138
 139     print "The rules:"
 140     for i in range(0,5):
 141         for r in g.rules(i):
 142             print r
 143     print ""
 144
 145     test1 = inner(0,0, 1, g, ['n'], {})
 146     if test1[0] != 0.7:
 147         print "should be 0.70 : %.2f" % test1[0]
 148         print ""
 149
 150     DEBUG = 1
 151     test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
 152     print "should be 0.?? (.09??) : %.2f" % test2[0]
 153     print "------ trying the same again:----------"
 154     test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
 155     print "should be 0.?? (.09??) : %.2f" % test2[0]
 156
 157
 158 ##################################################################
 159 #            just junk from here on down:                        #
 160 ##################################################################
 161
 162 # def io(corpus):
 163 #     "(pseudo-code / wishful thinking) "
 164 #     g = initialize(corpus) # or corpus.tagset ?
 165
 166 #     P = {('v','n','p'):0.09}
 167 #     # P is used in v_q, w_q (expectation), so each sentence in the
 168 #     # corpus needs some initial P.
 169
 170 #     # --- Maximation: ---
 171 #     #
 172 #     # actually, this step (from Lari & Young) probably never happens
 173 #     # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
 174 #     # have P_STOP and P_CHOOSE... or, in a sense it happens only we
 175 #     # calculate P_STOP and P_CHOOSE..somehow.
 176 #     for rule in g.p_rules:
 177 #         rule.num = 0
 178 #         rule.den = 0
 179 #     for pre_term in range(len(g.p_terminals)):
 180 #         ptnum[pre_term] = 0
 181 #         ptden[pre_term] = 0
 182
 183 #     # we could also flip this to make rules the outer loop, then we
 184 #     # wouldn't have to initialize den/num in loops of their own
 185 #     for sent in corpus:
 186 #         for rule in g.p_rules # Equation 20
 187 #             for s in range(len(sent)):
 188 #                 for t in range(s, len(sent)):
 189 #                     rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
 190 #                     rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
 191 #             # todo: do we need a "new-prob" vs "old-prob" distinction here?
 192 #                     probably, since we use inner/outer which checks rule.prob
 193 #             # todo: also, this wouldn't work, since for each sentence, we'd
 194 #             #       discard the old probability; should rules be the outer
 195 #             #       loop then?
 196 #             rule.prob = rule.num / rule.den
 197 #         for pre_term in range(len(g.p_terminals)): # Equation 21
 198 #             num = 0
 199 #             den = 0
 200 #             for s in range(len(sent)):
 201 #                 for t in range(s, len(sent)):
 202 #                     num += v(t,t,pre_term, g, sent, P[sent])
 203 #                     den += v(s,t,pre_term, g, sent, P[sent])
 204
 205 #     for rule in g.p_rules:
 206 #         rule.prob = rule.num / rule.den
 207 #     for pre_term in range(len(g.p_terminals)):
 208 #         g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
 209
 210
 211 #     # --- Expectation: ---
 212 #     for sent in corpus: # Equation 11
 213 #         inside = inner(0, len(sent), ROOT, g, sent)
 214 #         P[sent] = inside[0]
 215
 216 #     # todo: set inner.chart to {} again, how?
 217
 218 #     # todo: need a old-P new-P distinction to check if we're below
 219 #     # threshold difference
 220 #     return "todo"
 221
 222 # def w(s,t, LHS,L,R, g, sent, P_sent):
 223 #     w = 0
 224 #     rule = g.rule(LHS, L, R)
 225 #     for r in range(s, t):
 226 #         w += rule.prob * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
 227 #     return w / P_sent
 228
 229 # def v(s,t, LHS, g, sent, P_sent):
 230 #     return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent
 231
 232
 233
 234
 235