src/io_at_words.py

   1 # io_at_words.py, sentence positions at word locations
   2
   3 # import numpy # numpy provides Fast Arrays, for future optimization
   4 # import pprint # for pretty-printing
   5
   6 DEBUG = set(['TODO'])
   7
   8 # some of dmv-module bleeding in here... todo: prettier (in inner())
   9 NOBAR = 0
  10 STOP = (NOBAR, -2)
  11
  12 def debug(string, level='TODO'):
  13     '''Easily turn on/off inline debug printouts with this global. There's
  14 a lot of cluttering debug statements here, todo: clean up'''
  15     if level in DEBUG:
  16         print string
  17
  18
  19 class Grammar():
  20     '''The PCFG used in the I/O-algorithm.
  21
  22     Public members:
  23     p_terminals
  24
  25     Todo: as of now, this allows duplicate rules... should we check
  26     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  27     inner probabilities of 2.)'''
  28     def all_rules(self):
  29         return self.__p_rules
  30
  31     def rules(self, LHS):
  32         return [rule for rule in self.all_rules() if rule.LHS() == LHS]
  33
  34     def nums(self):
  35         return self.__numtag
  36
  37     def sent_nums(self, sent):
  38         return [self.tagnum(tag) for tag in sent]
  39
  40     def numtag(self, num):
  41         return self.__numtag[num]
  42
  43     def tagnum(self, tag):
  44         return self.__tagnum[tag]
  45
  46     def __init__(self, p_rules, p_terminals, numtag, tagnum):
  47         '''rules and p_terminals should be arrays, where p_terminals are of
  48         the form [preterminal, terminal], and rules are CNF_Rule's.'''
  49         self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon)
  50         self.__numtag = numtag
  51         self.__tagnum = tagnum
  52         self.p_terminals = p_terminals
  53
  54
  55
  56
  57 class CNF_Rule():
  58     '''A single CNF rule in the PCFG, of the form
  59     LHS -> L R
  60     where these are just integers
  61     (where do we save the connection between number and symbol?
  62     symbols being 'vbd' etc.)'''
  63     def __eq__(self, other):
  64         return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L()
  65     def __ne__(self, other):
  66         return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L()
  67     def __str__(self):
  68         return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob)
  69     def __init__(self, LHS, L, R, prob):
  70         self.__LHS = LHS
  71         self.__R = R
  72         self.__L = L
  73         self.prob = prob
  74     def p(self, *arg):
  75         "Return a probability, doesn't care about attachment..."
  76         return self.prob
  77     def LHS(self):
  78         return self.__LHS
  79     def L(self):
  80         return self.__L
  81     def R(self):
  82         return self.__R
  83
  84 def inner(s, t, LHS, g, sent, chart):
  85     ''' Give the inner probability of having the node LHS cover whatever's
  86     between s and t in sentence sent, using grammar g.
  87
  88     Returns a pair of the inner probability and the chart
  89
  90     For DMV, LHS is a pair (bar, h), but this function ought to be
  91     agnostic about that.
  92
  93     e() is an internal function, so the variable chart (a dictionary)
  94     is available to all calls of e().
  95
  96     Since terminal probabilities are just simple lookups, they are not
  97     put in the chart (although we could put them in there later to
  98     optimize)
  99     '''
 100
 101     def O(s):
 102         return sent[s]
 103
 104     def e(s,t,LHS):
 105         '''Chart has lists of probability and whether or not we've attached
 106 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
 107 Rattach==True then the rule has a right-attachment or there is one
 108 lower in the tree (meaning we're no longer adjacent).'''
 109         if (s, t, LHS) in chart:
 110             return chart[s, t, LHS]
 111         else:
 112             debug( "trying from %d to %d with %s" % (s,t,LHS) )
 113             if s == t:
 114                 if (LHS, O(s)) in g.p_terminals:
 115                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
 116                 else:
 117                     prob = 0.0 # todo: is this the right way to deal with lacking rules?
 118                     print "\t LACKING TERMINAL:"
 119                 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
 120                 # terminals have no attachment
 121                 return prob
 122             else:
 123                 if (s,t,LHS) not in chart:
 124                     # by default, not attachment yet
 125                     chart[s,t,LHS] = 0.0 #, False, False]
 126                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 127                     debug( "\tsumming rule %s" % rule )
 128                     L = rule.L()
 129                     R = rule.R()
 130                     for r in range(s, t): # summing over r = s to r = t-1
 131                         p_L = e(s, r, L)
 132                         p_R = e(r + 1, t, R)
 133                         p = rule.p("todo","todo")
 134                         chart[s, t, LHS] += p * p_L * p_R
 135                 debug( "\tchart[%d,%d,%s] = %.2f" % (s,t,LHS, chart[s,t,LHS]) )
 136                 return chart[s, t, LHS]
 137     # end of e-function
 138
 139     inner_prob = e(s,t,LHS)
 140     if 1 in DEBUG:
 141         print "---CHART:---"
 142         for k,v in chart.iteritems():
 143             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 144         print "---CHART:end---"
 145     return [inner_prob, chart]
 146
 147
 148
 149
 150
 151
 152
 153
 154 if __name__ == "__main__":
 155     print "IO-module tests:"
 156     b = {}
 157     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 158     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 159     b[1, 'n'] = 0.7 # np->'n'
 160     b[3, 'n'] = 1.0 # n->'n'
 161     b[4, 'p'] = 1.0 # p->'p'
 162     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 163     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 164     b[5, 'v'] = 1.0 # v->'v'
 165
 166     g = Grammar([s,np,vp,vp2], b, {0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'},
 167                 {'s':0,'np':1,'vp':2,'n':3,'p':4,'v':5})
 168
 169     print "The rules:"
 170     for i in range(0,5):
 171         for r in g.rules(i):
 172             print r
 173     print ""
 174
 175     test1 = inner(0,0, 1, g, ['n'], {})
 176     if test1[0] != 0.7:
 177         print "should be 0.70 : %.2f" % test1[0]
 178         print ""
 179
 180     test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
 181     print "should be 0.0930 : %.4f" % test2[0]
 182     print "------ trying the same again:----------"
 183     test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
 184     print "should be 0.0930 : %.4f" % test2[0]
 185
 186