src/io.py.~13~

   1 ##################################################################
   2 #                            Changes:                            #
   3 ##################################################################
   4 # 2008-05-24, KBU:
   5 # - more elaborate pseudo-code in the io()-function
   6 #
   7 # 2008-05-25, KBU:
   8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
   9 #   returns the pair of (bars, head).
  10 #
  11 # 2008-05-27
  12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
  13 #   != on two such rules
  14
  15
  16 # import numpy # numpy provides Fast Arrays, for future optimization
  17 # import pprint # for pretty-printing
  18
  19 DEBUG = 0
  20
  21 # some of dmv-module bleeding in here... todo: prettier (in inner())
  22 NOBAR = 0
  23 STOP = (NOBAR, -2)
  24
  25 def debug(string):
  26     '''Easily turn on/off inline debug printouts with this global. There's
  27 a lot of cluttering debug statements here, todo: clean up'''
  28     if DEBUG:
  29         print string
  30
  31 class CNF_Rule():
  32     '''A single CNF rule in the PCFG, of the form
  33     LHS -> L R
  34     where these are just integers
  35     (where do we save the connection between number and symbol?
  36     symbols being 'vbd' etc.)'''
  37     def __eq__(self, other):
  38         return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L()
  39     def __ne__(self, other):
  40         return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L()
  41     def __str__(self):
  42         return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob)
  43     def __init__(self, LHS, L, R, prob):
  44         self.__LHS = LHS
  45         self.__R = R
  46         self.__L = L
  47         self.prob = prob
  48     def p(self, *arg):
  49         "Return a probability, doesn't care about attachment..."
  50         return self.prob
  51     def LHS(self):
  52         return self.__LHS
  53     def L(self):
  54         return self.__L
  55     def R(self):
  56         return self.__R
  57
  58
  59 class Grammar():
  60     '''The PCFG used in the I/O-algorithm.
  61
  62     Todo: as of now, this allows duplicate rules... should we check
  63     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  64     inner probabilities of 2.)'''
  65     def rules(self, LHS):
  66         return [rule for rule in self.p_rules if rule.LHS() == LHS]
  67
  68     def __init__(self, p_rules, p_terminals):
  69         '''rules and p_terminals should be arrays, where p_terminals are of
  70         the form [preterminal, terminal], and rules are CNF_Rule's.'''
  71         self.p_rules = p_rules # could check for summing to 1 (+/- epsilon)
  72         self.p_terminals = p_terminals
  73
  74
  75
  76 def inner(s, t, LHS, g, sent, chart):
  77     ''' Give the inner probability of having the node LHS cover whatever's
  78     between s and t in sentence sent, using grammar g.
  79
  80     Returns a pair of the inner probability and the chart
  81
  82     For DMV, LHS is a pair (bar, h), but this function ought to be
  83     agnostic about that.
  84
  85     e() is an internal function, so the variable chart (a dictionary)
  86     is available to all calls of e().
  87
  88     Since terminal probabilities are just simple lookups, they are not
  89     put in the chart (although we could put them in there later to
  90     optimize)
  91     '''
  92
  93     def O(s):
  94         return sent[s]
  95
  96     def e(s,t,LHS):
  97         '''Chart has lists of probability and whether or not we've attached
  98 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
  99 Rattach==True then the rule has a right-attachment or there is one
 100 lower in the tree (meaning we're no longer adjacent).'''
 101         if (s, t, LHS) in chart:
 102             return chart[(s, t, LHS)]
 103         else:
 104             debug( "trying from %d to %d with %s" % (s,t,LHS) )
 105             if s == t:
 106                 if (LHS, O(s)) in g.p_terminals:
 107                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
 108                 else:
 109                     prob = 0.0 # todo: is this the right way to deal with lacking rules?
 110                     print "\t LACKING TERMINAL:"
 111                 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
 112                 # terminals have no attachment
 113                 return prob
 114             else:
 115                 if (s,t,LHS) not in chart:
 116                     # by default, not attachment yet
 117                     chart[(s,t,LHS)] = 0.0 #, False, False]
 118                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 119                     debug( "\tsumming rule %s" % rule )
 120                     L = rule.L()
 121                     R = rule.R()
 122                     for r in range(s, t): # summing over r = s to r = t-1
 123                         p_L = e(s, r, L)
 124                         p_R = e(r + 1, t, R)
 125                         p = rule.p("todo","todo")
 126                         chart[(s, t, LHS)] += p * p_L * p_R
 127                 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
 128                 return chart[(s, t, LHS)]
 129     # end of e-function
 130
 131     inner_prob = e(s,t,LHS)
 132     if DEBUG:
 133         print "---CHART:---"
 134         for k,v in chart.iteritems():
 135             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 136         print "---CHART:end---"
 137     return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)]
 138
 139
 140
 141
 142
 143
 144
 145
 146 if __name__ == "__main__":
 147     print "IO-module tests:"
 148     b = {}
 149     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 150     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 151     b[1, 'n'] = 0.7 # np->'n'
 152     b[3, 'n'] = 1.0 # n->'n'
 153     b[4, 'p'] = 1.0 # p->'p'
 154     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 155     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 156     b[5, 'v'] = 1.0 # v->'v'
 157
 158     g = Grammar([s,np,vp,vp2], b)
 159
 160     print "The rules:"
 161     for i in range(0,5):
 162         for r in g.rules(i):
 163             print r
 164     print ""
 165
 166     test1 = inner(0,0, 1, g, ['n'], {})
 167     if test1[0] != 0.7:
 168         print "should be 0.70 : %.2f" % test1[0]
 169         print ""
 170
 171     DEBUG = 1
 172     test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
 173     print "should be 0.?? (.09??) : %.2f" % test2[0]
 174     print "------ trying the same again:----------"
 175     test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
 176     print "should be 0.?? (.09??) : %.2f" % test2[0]
 177
 178
 179 ##################################################################
 180 #            just junk from here on down:                        #
 181 ##################################################################
 182
 183 # def io(corpus):
 184 #     "(pseudo-code / wishful thinking) "
 185 #     g = initialize(corpus) # or corpus.tagset ?
 186
 187 #     P = {('v','n','p'):0.09}
 188 #     # P is used in v_q, w_q (expectation), so each sentence in the
 189 #     # corpus needs some initial P.
 190
 191 #     # --- Maximation: ---
 192 #     #
 193 #     # actually, this step (from Lari & Young) probably never happens
 194 #     # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
 195 #     # have P_STOP and P_CHOOSE... or, in a sense it happens only we
 196 #     # calculate P_STOP and P_CHOOSE..somehow.
 197 #     for rule in g.p_rules:
 198 #         rule.num = 0
 199 #         rule.den = 0
 200 #     for pre_term in range(len(g.p_terminals)):
 201 #         ptnum[pre_term] = 0
 202 #         ptden[pre_term] = 0
 203
 204 #     # we could also flip this to make rules the outer loop, then we
 205 #     # wouldn't have to initialize den/num in loops of their own
 206 #     for sent in corpus:
 207 #         for rule in g.p_rules # Equation 20
 208 #             for s in range(len(sent)):
 209 #                 for t in range(s, len(sent)):
 210 #                     rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
 211 #                     rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
 212 #             # todo: do we need a "new-prob" vs "old-prob" distinction here?
 213 #                     probably, since we use inner/outer which checks rule.prob()
 214 #             # todo: also, this wouldn't work, since for each sentence, we'd
 215 #             #       discard the old probability; should rules be the outer
 216 #             #       loop then?
 217 #             rule.prob = rule.num / rule.den
 218 #         for pre_term in range(len(g.p_terminals)): # Equation 21
 219 #             num = 0
 220 #             den = 0
 221 #             for s in range(len(sent)):
 222 #                 for t in range(s, len(sent)):
 223 #                     num += v(t,t,pre_term, g, sent, P[sent])
 224 #                     den += v(s,t,pre_term, g, sent, P[sent])
 225
 226 #     for rule in g.rules:
 227 #         rule.prob = rule.num / rule.den
 228 #     for pre_term in range(len(g.p_terminals)):
 229 #         g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
 230
 231
 232 #     # --- Expectation: ---
 233 #     for sent in corpus: # Equation 11
 234 #         inside = inner(0, len(sent), ROOT, g, sent)
 235 #         P[sent] = inside[0]
 236
 237 #     # todo: set inner.chart to {} again, how?
 238
 239 #     # todo: need a old-P new-P distinction to check if we're below
 240 #     # threshold difference
 241 #     return "todo"
 242
 243 # def w(s,t, LHS,L,R, g, sent, P_sent):
 244 #     w = 0
 245 #     rule = g.rule(LHS, L, R)
 246 #     for r in range(s, t):
 247 #         w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
 248 #     return w / P_sent
 249
 250 # def v(s,t, LHS, g, sent, P_sent):
 251 #     return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent
 252
 253
 254
 255
 256