src/dmv.py~20080524~

   1 # import numpy # numpy provides Fast Arrays, for future optimization
   2 import io
   3
   4 # the following will only print when in this module:
   5 if __name__ == "__main__":
   6     print "DMV-module tests:"
   7     a = io.CNF_Rule(3,1,3,0.1)
   8     if(a.head == 3):
   9         print "import io works"
  10
  11 class DMV_Grammar(io.Grammar):
  12     '''The DMV-PCFG.
  13
  14     We need to be able to access rules per mother node, sum every H, every
  15     H_, ..., every H', every H'_, etc. for the IO-algorithm.
  16
  17     What other representations do we need? (P_STOP formula uses
  18     deps_D(h,l/r) at least)'''
  19     def rules(self, b_h):
  20         bars = b_h[0]
  21         head = b_h[1]
  22         return [r for r in self.p_rules if r.head == head and r.bars == bars]
  23
  24     def headed_rule_probs(self):
  25         '''Give all rules sorted by head, for use in DMV-specific
  26         probabilities (need deps_D(h, l/r) and so on also)'''
  27         return "todo"
  28
  29     def __init__(self, p_rules, p_terminals):
  30         io.Grammar.__init__(self, p_rules, p_terminals)
  31
  32
  33 class DMV_Rule(io.CNF_Rule):
  34     '''A single CNF rule in the PCFG, of the form
  35     head -> l r
  36     with the added information of bars
  37     '''
  38     def __str__(self):
  39         b = ""
  40         if(self.bars == 1):
  41             b = "_"
  42         if(self.bars == 2): # todo: print prettier?
  43             b = "__"
  44         return b + io.CNF_Rule.__str__(self)
  45
  46     def __init__(self, head, L, R, prob, bars):
  47         io.CNF_Rule.__init__(self, head, L, R, prob)
  48         if bars in [0,1,2]:
  49             self.bars = bars
  50         else:
  51             raise ValueError("bars must be either 0, 1 or 2; was given: %s" % bars)
  52
  53
  54 # the following will only print when in this module:
  55 if __name__ == "__main__":
  56     # these are not Real rules, just testing the classes. todo: make
  57     # a rule-set to test inner() on.
  58     b = {}
  59     s   = DMV_Rule(0,1,2, 1.0, 0) # s->np vp
  60     np  = DMV_Rule(1,3,4, 0.3, 0) # np->n p
  61     b[1, 'n'] = 0.7 # np->'n'
  62     b[3, 'n'] = 1.0 # n->'n'
  63     b[4, 'p'] = 1.0 # p->'p'
  64     vp  = DMV_Rule(2,5,1, 0.1, 0) # vp->v np (two parses use this rule)
  65     vp2 = DMV_Rule(2,2,4, 0.9, 0) # vp->vp p
  66     b[5, 'v'] = 1.0 # v->'v'
  67
  68     g = DMV_Grammar([s,np,vp,vp2], b)
  69
  70     test1 = io.inner(0,0, 1, g, ['n'])
  71     if test1 != 0.7:
  72         print "should be 0.70 : %.2f" % test1
  73         print ""
  74
  75     test2 = io.inner(0,2, (2,0), g, ['v','n','p'])
  76     print "should be 0.?? : %.2f" % test2
  77
  78
  79 def all_inner_dmv(sent, g):
  80     for h in g.heads():
  81         for bars in [0, 1, 2]:
  82             for s in range(s, len(sent)): # summing over r = s to r = t-1
  83                 for t in range(s+1, len(sent)):
  84                     io.inner(s, t, (bars, h), g, sent)
  85
  86
  87
  88 # DMV-probabilities:
  89
  90 # def DMV(sentence):
  91 #     '''Here it seems like they store bracketing information on a per-head
  92 #     (per direction) basis, in deps_D(h, dir) which gives us a list. '''
  93 #     P_D = array indexed on h`s from sentence
  94 #     for dir from {l,r}:
  95 #         for a from deps_D(h, dir):
  96 #             P_D[h] *= \
  97 #                 P_STOP (notSTOP | h, dir, adj) * \
  98 #                 P_CHOOSE (a | h,dir) * \
  99 #                 P (D(a)) * \
 100 #                 P_STOP (STOP | h, dir, adj)
 101 #             return P_D_h
 102
 103 # def P_STOP(STOP | h, dir, adj):
 104 #     P_STOP_num = 0
 105 #     for s from S:
 106 #         for i<loc(h): # where h is in the sentence
 107 #             for k:
 108 #                 P_STOP_num += c(h-r : i, k)
 109 #     P_STOP_den = 0
 110 #     for s from S:
 111 #         for i<loc(h):
 112 #             for k:
 113 #                 P_STOP_den += c(l-h-r : i, k)
 114 #     return P_STOP_num / P_STOP_den
 115
 116
 117
 118 ##################################################################
 119 #            just junk from here on down:                        #
 120 ##################################################################
 121
 122 def initialize(taglist):
 123     '''If H and A are non-equal tags, we want every rule
 124     of these forms:
 125     _H_ -> STOP H_
 126     H_ -> H STOP
 127     H_ -> _A_ H
 128     H  -> H _A_
 129
 130     to have some initial probability. It's supposed to be harmonic,
 131     which we get from an M-step of the IO-algorithm; do we need a full
 132     rule count? -- Where we check the distance from H to A in each case,
 133     and the probability is C/distance + C' ... but then what constants
 134     do we need in order to sum to 1?
 135
 136     p.4 in Carroll & Charniak gives the number of DG rules for
 137     sentences of length n as n*(2^{n-1}+1).
 138
 139         "harmonic" completion where all non-ROOT words took the same
 140         number of arguments, and each took other words as arguments in
 141         inverse proportion to (a constant plus) the distance between
 142         them. The ROOT always had a single argument and took each word
 143         with equal probability. (p.5 in DMV-article)
 144
 145     ---
 146
 147     Given their model, we have these types of probabilities:
 148
 149     - P_STOP(-STOP | h, dir, adj)
 150     - P_STOP(STOP | h, dir, adj)
 151     - P_CHOOSE(a | h, dir)
 152
 153     and then each has to sum to 1 given certain values for h, dir, adj.
 154     (so P_STOP(-STOP | VBD, L, non-adj)+P_STOP( STOP | VBD, L, non-adj)=1;
 155     and summing P_CHOOSE(a | VBD, L), over all a, equals 1).
 156
 157     '''
 158     rules = [ ] # should be a simple array where indexes correspond to
 159                 # heads h, so if h->_a_ h_, then h == heads[i] iff
 160                 # rules[i] == (_a_, h_)
 161     heads = ['ROOT']
 162     for h in taglist:
 163
 164         rules['ROOT'] = ('ROOT', h)
 165         rules['_' + h + '_'] = ('STOP', h + '_') # _H_ -> STOP H_
 166         rules[      h + '_'] = ('h'   , 'STOP' ) # H_ -> H STOP
 167         for a in taglist:
 168             if a != h :
 169                 rules[h] = (h      , '_' + a + '_') # H  ->  H _A_
 170                 rules[h + '_'] = ('_' + a + '_', h + '_') # H_ -> _A_ H_
 171
 172     # not sure what kind of rule-access we'll need
 173     # (dictionary?).. also, rules should be stored somewhere "globally
 174     # accessible"..
 175
 176     # put the probabilities in a single array, indexed on rule-number,
 177     # (anticipating the need for quick and simple access)
 178     p_rules = numpy.zeros( (len(rules)) )
 179
 180     return (p_rules, rules)
 181
 182
 183 def bracket(tags):
 184     p = tuple([(tag, 1) for tag in tags])
 185     test = numpy.zeros((len(tags)+1, len(tags)+1))
 186     return test
 187
 188 #if __name__ == "__main__":
 189     # pprint.pprint(initialize(['vbd', 'nn', 'jj', 'dt']))
 190     # print bracket(("foo", "bar", "fie","foe"))