remove old backups (trust in vc)
[dmvccm.git] / src / io_at_words.py
blob1560e78a24ac93daa1df92408a00386c57dbb623
1 # io_at_words.py, sentence positions at word locations
3 # import numpy # numpy provides Fast Arrays, for future optimization
4 # import pprint # for pretty-printing
6 DEBUG = set(['TODO'])
8 # some of dmv-module bleeding in here... todo: prettier (in inner())
9 NOBAR = 0
10 STOP = (NOBAR, -2)
12 def debug(string, level='TODO'):
13 '''Easily turn on/off inline debug printouts with this global. There's
14 a lot of cluttering debug statements here, todo: clean up'''
15 if level in DEBUG:
16 print string
19 class Grammar():
20 '''The PCFG used in the I/O-algorithm.
22 Public members:
23 p_terminals
25 Todo: as of now, this allows duplicate rules... should we check
26 for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
27 inner probabilities of 2.)'''
28 def all_rules(self):
29 return self.__p_rules
31 def rules(self, LHS):
32 return [rule for rule in self.all_rules() if rule.LHS() == LHS]
34 def nums(self):
35 return self.__numtag
37 def sent_nums(self, sent):
38 return [self.tagnum(tag) for tag in sent]
40 def numtag(self, num):
41 return self.__numtag[num]
43 def tagnum(self, tag):
44 return self.__tagnum[tag]
46 def __init__(self, p_rules, p_terminals, numtag, tagnum):
47 '''rules and p_terminals should be arrays, where p_terminals are of
48 the form [preterminal, terminal], and rules are CNF_Rule's.'''
49 self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon)
50 self.__numtag = numtag
51 self.__tagnum = tagnum
52 self.p_terminals = p_terminals
57 class CNF_Rule():
58 '''A single CNF rule in the PCFG, of the form
59 LHS -> L R
60 where these are just integers
61 (where do we save the connection between number and symbol?
62 symbols being 'vbd' etc.)'''
63 def __eq__(self, other):
64 return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L()
65 def __ne__(self, other):
66 return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L()
67 def __str__(self):
68 return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob)
69 def __init__(self, LHS, L, R, prob):
70 self.__LHS = LHS
71 self.__R = R
72 self.__L = L
73 self.prob = prob
74 def p(self, *arg):
75 "Return a probability, doesn't care about attachment..."
76 return self.prob
77 def LHS(self):
78 return self.__LHS
79 def L(self):
80 return self.__L
81 def R(self):
82 return self.__R
84 def inner(s, t, LHS, g, sent, chart):
85 ''' Give the inner probability of having the node LHS cover whatever's
86 between s and t in sentence sent, using grammar g.
88 Returns a pair of the inner probability and the chart
90 For DMV, LHS is a pair (bar, h), but this function ought to be
91 agnostic about that.
93 e() is an internal function, so the variable chart (a dictionary)
94 is available to all calls of e().
96 Since terminal probabilities are just simple lookups, they are not
97 put in the chart (although we could put them in there later to
98 optimize)
99 '''
101 def O(s):
102 return sent[s]
104 def e(s,t,LHS):
105 '''Chart has lists of probability and whether or not we've attached
106 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
107 Rattach==True then the rule has a right-attachment or there is one
108 lower in the tree (meaning we're no longer adjacent).'''
109 if (s, t, LHS) in chart:
110 return chart[s, t, LHS]
111 else:
112 debug( "trying from %d to %d with %s" % (s,t,LHS) )
113 if s == t:
114 if (LHS, O(s)) in g.p_terminals:
115 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
116 else:
117 prob = 0.0 # todo: is this the right way to deal with lacking rules?
118 print "\t LACKING TERMINAL:"
119 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
120 # terminals have no attachment
121 return prob
122 else:
123 if (s,t,LHS) not in chart:
124 # by default, not attachment yet
125 chart[s,t,LHS] = 0.0 #, False, False]
126 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
127 debug( "\tsumming rule %s" % rule )
128 L = rule.L()
129 R = rule.R()
130 for r in range(s, t): # summing over r = s to r = t-1
131 p_L = e(s, r, L)
132 p_R = e(r + 1, t, R)
133 p = rule.p("todo","todo")
134 chart[s, t, LHS] += p * p_L * p_R
135 debug( "\tchart[%d,%d,%s] = %.2f" % (s,t,LHS, chart[s,t,LHS]) )
136 return chart[s, t, LHS]
137 # end of e-function
139 inner_prob = e(s,t,LHS)
140 if 1 in DEBUG:
141 print "---CHART:---"
142 for k,v in chart.iteritems():
143 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
144 print "---CHART:end---"
145 return [inner_prob, chart]
154 if __name__ == "__main__":
155 print "IO-module tests:"
156 b = {}
157 s = CNF_Rule(0,1,2, 1.0) # s->np vp
158 np = CNF_Rule(1,3,4, 0.3) # np->n p
159 b[1, 'n'] = 0.7 # np->'n'
160 b[3, 'n'] = 1.0 # n->'n'
161 b[4, 'p'] = 1.0 # p->'p'
162 vp = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
163 vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
164 b[5, 'v'] = 1.0 # v->'v'
166 g = Grammar([s,np,vp,vp2], b, {0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'},
167 {'s':0,'np':1,'vp':2,'n':3,'p':4,'v':5})
169 print "The rules:"
170 for i in range(0,5):
171 for r in g.rules(i):
172 print r
173 print ""
175 test1 = inner(0,0, 1, g, ['n'], {})
176 if test1[0] != 0.7:
177 print "should be 0.70 : %.2f" % test1[0]
178 print ""
180 test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
181 print "should be 0.0930 : %.4f" % test2[0]
182 print "------ trying the same again:----------"
183 test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
184 print "should be 0.0930 : %.4f" % test2[0]