another little fix to stop initialization
[dmvccm.git] / src / loc_h_harmonic.py
blob352a5c6eb3dd757983ee2fcec5d49d170764e7c7
1 # loc_h_harmonic.py, initialization for loc_h_dmv.py
3 from loc_h_dmv import * # better way to do this?
5 # todo: tweak these
6 HARMONIC_C = 0.0
7 STOP_C = 1.0
8 NSTOP_C = 1.0
9 FSTOP_MIN = 1.0
11 FNONSTOP_MIN = 0.0 # for OLD_STOP_CALC. 0.0 on FSTOP_MIN gives many
12 # zero-probabilities for OLD_STOP_CALC
13 OLD_STOP_CALC = True
15 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
17 ##############################
18 # Initialization #
19 ##############################
20 def taglist(corpus):
21 '''sents is of this form:
22 [['tag', ...], ['tag2', ...], ...]
24 Return a list of the tags. (Has to be ordered for enumerating to be
25 consistent.)
27 Fortunately only has to run once.
28 '''
29 tagset = set()
30 for sent in corpus:
31 for tag in sent:
32 tagset.add(tag)
33 if 'ROOT' in tagset:
34 raise ValueError("it seems we must have a new ROOT symbol")
35 return list(tagset)
41 def init_zeros(tags):
42 '''Return a frequency dictionary with DMV-relevant keys set to 0 or
43 {}.
45 Todo: tweak (especially for f_STOP).'''
46 f = {}
47 for tag in tags:
48 f['ROOT', tag] = 0
49 f['sum', 'ROOT'] = 0
50 for dir in [LEFT, RIGHT]:
51 for adj in [ADJ, NON]:
52 f[tag, 'STOP', dir, adj] = 0.0
53 if OLD_STOP_CALC:
54 f[tag, 'STOP', dir, adj] = FSTOP_MIN
55 f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
56 f[tag, RIGHT] = {}
57 f[tag, LEFT] = {}
58 f[tag, 'sum', RIGHT] = 0.0
59 f[tag, 'sum', LEFT] = 0.0
60 return f
62 def init_freq(corpus, tags):
63 '''Returns f, a dictionary with these types of keys:
64 - ('ROOT', tag) is basically just the frequency of tag
65 - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
66 etc. for 'RN', 'LA', 'LN', '-STOP'.
67 - (tag, LEFT) is a dictionary of arg:f, where head could take arg
68 to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
69 by distance, used for finding P_CHOOSE
71 Does this stuff:
72 1. counts word frequencies for f_ROOT
73 2. adds to certain f_STOP counters if a word is found first,
74 last, first or second, or last or second to last in the sentence
75 (Left Adjacent, Left Non-Adjacent, etc)
76 3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
77 distance between arg and head)
78 '''
79 f = init_zeros(tags)
81 for sent in corpus: # sent is ['VBD', 'NN', ...]
82 n = len(sent) - 1
83 # NOTE: head in DMV_Rule is a number, while this is the string
84 for loc_h, head in enumerate(sent):
85 # todo grok: how is this different from just using straight head
86 # frequency counts, for the ROOT probabilities?
87 f['ROOT', head] += 1.0
88 f['sum', 'ROOT'] += 1.0
90 if OLD_STOP_CALC:
91 f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
92 f[head, '-STOP', LEFT,NON] += (not loc_h == 1) # not second
93 f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
94 f[head, '-STOP', LEFT,ADJ] += (not loc_h == 0) # not first
95 f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
96 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
97 f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
98 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n) # not last
99 else:
100 f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
101 f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
102 f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
103 f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
104 # elif OTHER_STOP_CALC:
105 # f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
106 # f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
107 # f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
108 # f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
109 # f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
110 # f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
111 # f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
112 # f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last
114 # this is where we make the "harmonic" distribution. quite.
115 for loc_a, arg in enumerate(sent):
116 if loc_h != loc_a:
117 harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
118 if loc_h > loc_a:
119 dir = LEFT
120 else:
121 dir = RIGHT
122 if arg not in f[head, dir]:
123 f[head, dir][arg] = 0.0
124 f[head, dir][arg] += harmony
125 f[head, 'sum', dir] += harmony
126 # todo, optimization: possible to do both directions
127 # at once here, and later on rule out the ones we've
128 # done? does it actually speed things up?
129 return f
131 def init_normalize(f, tags, numtag, tagnum):
132 '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH,
133 p_ROOT.
135 Return a usable DMV_Grammar.'''
136 p_rules = []
137 p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
138 for h, head in numtag.iteritems():
139 # f['ROOT', head] is just a simple frequency count
140 p_ROOT[h] = f['ROOT', head] / f['sum', 'ROOT']
142 for dir in [LEFT,RIGHT]:
143 for adj in [NON,ADJ]:
144 if OLD_STOP_CALC:
145 den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
146 if den > 0.0:
147 p_STOP[h, dir, adj] = f[head, 'STOP', dir, adj] / den
148 else: p_STOP[h, dir, adj] = 1.0
149 else:
150 p_STOP[h, dir, adj] = \
151 (FSTOP_MIN + f[head, 'STOP', dir, adj] * STOP_C) / \
152 (FSTOP_MIN + f['ROOT',head] * (STOP_C + NSTOP_C))
154 p_ORDER[GOR, h] = RIGHT_FIRST
155 p_ORDER[GOL, h] = 1 - RIGHT_FIRST
157 for dir in [LEFT, RIGHT]:
158 for arg, val in f[head, dir].iteritems():
159 p_ATTACH[tagnum[arg], h, dir] = val / f[head,'sum',dir]
161 return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
163 def initialize(corpus):
164 ''' corpus is a list of lists of tags.'''
165 tags = taglist(corpus)
166 numtag, tagnum = {}, {}
167 for num, tag in enumerate(tags):
168 tagnum[tag] = num
169 numtag[num] = tag
170 # f: frequency counts used in initialization, mostly distances
171 f = init_freq(corpus, tags)
172 g = init_normalize(f, tags, numtag, tagnum)
174 g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
175 g.STOP_C = STOP_C
176 g.NSTOP_C = NSTOP_C
177 g.FNONSTOP_MIN = FNONSTOP_MIN
178 g.FSTOP_MIN = FSTOP_MIN
180 return g
185 ##############################
186 # testing functions: #
187 ##############################
189 if __name__ == "__main__":
190 print "--------initialization testing------------"
191 g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])
192 print g2