src/dmv.py~20080527~

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7
   8 # import numpy # numpy provides Fast Arrays, for future optimization
   9
  10 import io
  11
  12 # the following will only print when in this module:
  13 if __name__ == "__main__":
  14     print "DMV-module tests:"
  15     a = io.CNF_Rule(3,1,3,0.1)
  16     if(a.head == 3):
  17         print "import io works"
  18
  19 class DMV_Grammar(io.Grammar):
  20     '''The DMV-PCFG.
  21
  22     We need to be able to access rules per mother node, sum every H, every
  23     H_, ..., every H', every H'_, etc. for the IO-algorithm.
  24
  25     What other representations do we need? (P_STOP formula uses
  26     deps_D(h,l/r) at least)'''
  27     def rules(self, b_h):
  28         bars = b_h[0]
  29         head = b_h[1]
  30         return [r for r in self.p_rules if r.head == head and r.bars == bars]
  31
  32     def heads(self):
  33         return "some structure full of rule heads.."
  34
  35     def deps(self, h, dir):
  36         return "all dir-dependents of rules with head h"
  37
  38     def __init__(self, p_rules, p_terminals):
  39         io.Grammar.__init__(self, p_rules, p_terminals)
  40
  41
  42 class DMV_Rule(io.CNF_Rule):
  43     '''A single CNF rule in the PCFG, of the form
  44     LHS -> L R
  45     where LHS = (bars, head)
  46
  47     todo: possibly just store b_h instead of bars and head? (then b_h
  48     = LHS, while we need new accessor functions for bars and head)
  49
  50     Different rule-types have different probabilities associated with
  51     them:
  52
  53     _h_ -> STOP  h_     P( STOP|h,L,    adj)
  54     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
  55      h_ ->  h  STOP     P( STOP|h,R,    adj)
  56      h_ ->  h  STOP     P( STOP|h,R,non_adj)
  57      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
  58      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
  59      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
  60      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
  61
  62     Todo, togrok: How do we know whether we use adj or non in inner()?
  63
  64     Todo, togrok: How does "STOP" work in the inner-function?
  65     '''
  66     def __str__(self):
  67         def bar_str(b_h):
  68             str = "%d" % b_h[1]
  69             if(b_h[0] == 1):
  70                 str = "_%d" % b_h[1]
  71             if(b_h[0] == 2):
  72                 str = "_%d_" % b_h[1]
  73             return str
  74         return "%s -> %s %s [%.2f]" % (bar_str((self.bars,self.head)),
  75                                        bar_str(self.L),
  76                                        bar_str(self.R),
  77                                        self.prob)
  78
  79     def LHS(self):
  80         return (self.bars, self.head)
  81
  82     def __init__(self, b_h, b_L, b_R, prob):
  83         io.CNF_Rule.__init__(self, b_h[1], b_L, b_R, prob)
  84         if b_h[0] in [0,1,2]:
  85             self.bars = b_h[0]
  86         else: # hmm, should perhaps check b_L and b_R too? todo
  87             raise ValueError("bars must be either 0, 1 or 2; was given: %s" % bars)
  88
  89
  90 # the following will only print when in this module:
  91 if __name__ == "__main__":
  92     # these are not Real rules, just testing the classes. todo: make
  93     # a rule-set to test inner() on.
  94     b = {}
  95     s   = DMV_Rule((2,0), (0,1),(0,2), 1.0) # s->np vp
  96     np  = DMV_Rule((0,1), (0,3),(0,4), 0.3) # np->n p
  97     b[(0,1), 'n'] = 0.7 # np->'n'
  98     b[(0,3), 'n'] = 1.0 # n->'n'
  99     b[(0,4), 'p'] = 1.0 # p->'p'
 100     vp  = DMV_Rule((0,2), (0,5),(0,1), 0.1) # vp->v np (two parses use this rule)
 101     vp2 = DMV_Rule((0,2), (0,2),(0,4), 0.9) # vp->vp p
 102     b[(0,5), 'v'] = 1.0 # v->'v'
 103
 104     g = DMV_Grammar([s,np,vp,vp2], b)
 105
 106     io.DEBUG = 0
 107     test1 = io.inner(0,0, (0,1), g, ['n'], {})
 108     if test1[0] != 0.7:
 109         print "should be 0.70 : %.2f" % test1[0]
 110         print ""
 111
 112     io.DEBUG = 1
 113     test2 = io.inner(0,2, (0,2), g, ['v','n','p'], {})
 114     print "should be 0.09 if the io.py-test is right : %.2f" % test2[0]
 115     # todo: does this manage to look it up in the chart???
 116     test2 = io.inner(0,2, (0,2), g, ['v','n','p'], test2[1])
 117     print "should be 0.09 if the io.py-test is right : %.2f" % test2[0]
 118
 119
 120
 121
 122 def all_inner_dmv(sent, g):
 123     for h in g.heads():
 124         for bars in [0, 1, 2]:
 125             for s in range(s, len(sent)): # summing over r = s to r = t-1
 126                 for t in range(s+1, len(sent)):
 127                     io.inner(s, t, (bars, h), g, sent)
 128
 129
 130
 131 # DMV-probabilities, todo:
 132
 133 def P_STOP(STOP, h, dir, adj, corpus):
 134     '''corpus is a list of sentences s.
 135
 136 This is based on the formula where STOP is True... not sure how we
 137 calculate if STOP is False.
 138
 139
 140 I thought about instead having this:
 141
 142 for rule in g.p_rules:
 143     rule.num = 0
 144     rule.den = 0
 145 for sent in corpus:
 146     for rule in g.p_rules:
 147        for s:
 148            for t:
 149                set num and den using inner
 150 for rule in g.p_rules
 151     rule.prob = rule.num / rule.den
 152
 153 ..the way I'm assuming we do it in the commented out io-function in
 154 io.py. Having sentences as the outer loop at least we can easily just
 155 go through the heads that are actually in the sentence... BUT, this
 156 means having to go through rules 3 times, not sure what is slower.
 157
 158 oh, and:
 159 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 160 =D
 161
 162 '''
 163     P_STOP_num = 0
 164     P_STOP_den = 0
 165     for sent in corpus:
 166         # here we should somehow make each word in the sentence
 167         # unique, decorate them with subscripts or something. We have
 168         # to run through the sentence as many times as h appears
 169         # there. This also means changing inner(), I suspect.  Have to
 170         # make sure we separate reading of inner_prob from changing of
 171         # inner_prob...
 172         for s in range(loc(h)): # i<loc(h), where h is in the sentence.
 173             for t in range(i, len(sent)):
 174                 P_STOP_num += inner(s, t, h-r, g, sent, chart)
 175                 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
 176     return P_STOP_num / P_STOP_den # possibly other way round? todo
 177
 178
 179 def P_CHOOSE():
 180     return "todo"
 181
 182 def DMV(sent, g):
 183     '''Here it seems like they store rule information on a per-head (per
 184     direction) basis, in deps_D(h, dir) which gives us a list. '''
 185     def P_h(h):
 186         P_h = 1 # ?
 187         for dir in ['l', 'r']:
 188             for a in deps(h, dir):
 189                 # D(a)??
 190                 P_h *= \
 191                     P_STOP (0, h, dir, adj) * \
 192                     P_CHOOSE (a, h, dir) * \
 193                     P_h(D(a)) * \
 194                 P_STOP (STOP | h, dir, adj)
 195         return P_h
 196     return P_h(root(sent))
 197
 198
 199
 200
 201
 202
 203
 204 ##################################################################
 205 #            just junk from here on down:                        #
 206 ##################################################################
 207
 208 def initialize(taglist):
 209     '''If H and A are non-equal tags, we want every rule
 210     of these forms:
 211     _H_ -> STOP H_
 212     H_ -> H STOP
 213     H_ -> _A_ H
 214     H  -> H _A_
 215
 216     to have some initial probability. It's supposed to be harmonic,
 217     which we get from an M-step of the IO-algorithm; do we need a full
 218     rule count? -- Where we check the distance from H to A in each case,
 219     and the probability is C/distance + C' ... but then what constants
 220     do we need in order to sum to 1?
 221
 222     p.4 in Carroll & Charniak gives the number of DG rules for
 223     sentences of length n as n*(2^{n-1}+1).
 224
 225         "harmonic" completion where all non-ROOT words took the same
 226         number of arguments, and each took other words as arguments in
 227         inverse proportion to (a constant plus) the distance between
 228         them. The ROOT always had a single argument and took each word
 229         with equal probability. (p.5 in DMV-article)
 230
 231     ---
 232
 233     Given their model, we have these types of probabilities:
 234
 235     - P_STOP(-STOP | h, dir, adj)
 236     - P_STOP(STOP | h, dir, adj)
 237     - P_CHOOSE(a | h, dir)
 238
 239     and then each has to sum to 1 given certain values for h, dir, adj.
 240     (so P_STOP(-STOP | VBD, L, non-adj)+P_STOP( STOP | VBD, L, non-adj)=1;
 241     and summing P_CHOOSE(a | VBD, L), over all a, equals 1).
 242
 243     '''
 244     rules = [ ] # should be a simple array where indexes correspond to
 245                 # heads h, so if h->_a_ h_, then h == heads[i] iff
 246                 # rules[i] == (_a_, h_)
 247     heads = ['ROOT']
 248     for h in taglist:
 249
 250         rules['ROOT'] = ('ROOT', h)
 251         rules['_' + h + '_'] = ('STOP', h + '_') # _H_ -> STOP H_
 252         rules[      h + '_'] = ('h'   , 'STOP' ) # H_ -> H STOP
 253         for a in taglist:
 254             if a != h :
 255                 rules[h] = (h      , '_' + a + '_') # H  ->  H _A_
 256                 rules[h + '_'] = ('_' + a + '_', h + '_') # H_ -> _A_ H_
 257
 258     # not sure what kind of rule-access we'll need
 259     # (dictionary?).. also, rules should be stored somewhere "globally
 260     # accessible"..
 261
 262     # put the probabilities in a single array, indexed on rule-number,
 263     # (anticipating the need for quick and simple access)
 264     p_rules = numpy.zeros( (len(rules)) )
 265
 266     return (p_rules, rules)
 267
 268
 269 def bracket(tags):
 270     p = tuple([(tag, 1) for tag in tags])
 271     test = numpy.zeros((len(tags)+1, len(tags)+1))
 272     return test
 273
 274 #if __name__ == "__main__":
 275     # pprint.pprint(initialize(['vbd', 'nn', 'jj', 'dt']))
 276     # print bracket(("foo", "bar", "fie","foe"))