1b3eb1e6652de351fa24fb370b483e86b4fbc964
[dmvccm.git] / src / loc_h_harmonic.py
blob1b3eb1e6652de351fa24fb370b483e86b4fbc964
1 # loc_h_harmonic.py, initialization for loc_h_dmv.py
3 from loc_h_dmv import * # better way to do this?
5 # todo: tweak these (0.0 on FSTOP_MIN gives too many zero-probabilities though)
6 HARMONIC_C = 0.0
7 FNONSTOP_MIN = 0.0
8 FSTOP_MIN = 1.0
9 OTHER_STOP_CALC = True
11 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
13 ##############################
14 # Initialization #
15 ##############################
16 def taglist(corpus):
17 '''sents is of this form:
18 [['tag', ...], ['tag2', ...], ...]
20 Return a list of the tags. (Has to be ordered for enumerating to be
21 consistent.)
23 Fortunately only has to run once.
24 '''
25 tagset = set()
26 for sent in corpus:
27 for tag in sent:
28 tagset.add(tag)
29 if 'ROOT' in tagset:
30 raise ValueError("it seems we must have a new ROOT symbol")
31 return list(tagset)
37 def init_zeros(tags):
38 '''Return a frequency dictionary with DMV-relevant keys set to 0 or
39 {}.
41 Todo: tweak (especially for f_STOP).'''
42 f = {}
43 for tag in tags:
44 f['ROOT', tag] = 0
45 f['sum', 'ROOT'] = 0
46 for dir in [LEFT, RIGHT]:
47 for adj in [ADJ, NON]:
48 f[tag, 'STOP', dir, adj] = FSTOP_MIN
49 f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
50 f[tag, RIGHT] = {}
51 f[tag, LEFT] = {}
52 f[tag, 'sum', RIGHT] = 0.0
53 f[tag, 'sum', LEFT] = 0.0
54 return f
56 def init_freq(corpus, tags):
57 '''Returns f, a dictionary with these types of keys:
58 - ('ROOT', tag) is basically just the frequency of tag
59 - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
60 etc. for 'RN', 'LA', 'LN', '-STOP'.
61 - (tag, LEFT) is a dictionary of arg:f, where head could take arg
62 to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
63 by distance, used for finding P_CHOOSE
65 Does this stuff:
66 1. counts word frequencies for f_ROOT
67 2. adds to certain f_STOP counters if a word is found first,
68 last, first or second, or last or second to last in the sentence
69 (Left Adjacent, Left Non-Adjacent, etc)
70 3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
71 distance between arg and head)
72 '''
73 f = init_zeros(tags)
75 for sent in corpus: # sent is ['VBD', 'NN', ...]
76 n = len(sent) - 1
77 # NOTE: head in DMV_Rule is a number, while this is the string
78 for loc_h, head in enumerate(sent):
79 # todo grok: how is this different from just using straight head
80 # frequency counts, for the ROOT probabilities?
81 f['ROOT', head] += 1
82 f['sum', 'ROOT'] += 1
84 if OTHER_STOP_CALC:
85 f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
86 f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
87 f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
88 f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
89 f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
90 f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
91 f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
92 f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last
93 else: # note: int(True) == 1
94 f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
95 f[head, '-STOP', LEFT,NON] += (not loc_h == 1) # not second
96 f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
97 f[head, '-STOP', LEFT,ADJ] += (not loc_h == 0) # not first
98 f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
99 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
100 f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
101 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n) # not last
103 # this is where we make the "harmonic" distribution. quite.
104 for loc_a, arg in enumerate(sent):
105 if loc_h != loc_a:
106 harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
107 if loc_h > loc_a:
108 dir = LEFT
109 else:
110 dir = RIGHT
111 if arg not in f[head, dir]:
112 f[head, dir][arg] = 0.0
113 f[head, dir][arg] += harmony
114 f[head, 'sum', dir] += harmony
115 # todo, optimization: possible to do both directions
116 # at once here, and later on rule out the ones we've
117 # done? does it actually speed things up?
118 return f
120 def init_normalize(f, tags, numtag, tagnum):
121 '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH
122 and p_GO_AT (which is (1-p_STOP)*p_ATTACH).
124 Return a usable DMV_Grammar2.'''
125 p_rules = []
126 p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
127 for h, head in numtag.iteritems():
128 p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']
130 # p_STOP = STOP / (STOP + NOT_STOP)
131 for dir in [LEFT,RIGHT]:
132 for adj in [NON,ADJ]:
133 den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
134 if den > 0.0:
135 p_STOP[h, dir, adj] = float(f[head, 'STOP', dir, adj]) / float(den)
136 else:
137 p_STOP[h, dir, adj] = 1.0
141 p_ORDER[GOR, h] = RIGHT_FIRST
142 p_ORDER[GOL, h] = 1 - RIGHT_FIRST
144 for dir in [LEFT, RIGHT]:
145 for arg, val in f[head, dir].iteritems():
146 p_ATTACH[tagnum[arg], h, dir] = float(val) / f[head,'sum',dir]
148 return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
150 def initialize(corpus):
151 ''' corpus is a list of lists of tags.'''
152 tags = taglist(corpus)
153 numtag, tagnum = {}, {}
154 for num, tag in enumerate(tags):
155 tagnum[tag] = num
156 numtag[num] = tag
157 # f: frequency counts used in initialization, mostly distances
158 f = init_freq(corpus, tags)
159 g = init_normalize(f, tags, numtag, tagnum)
161 g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
162 g.FNONSTOP_MIN = FNONSTOP_MIN
163 g.FSTOP_MIN = FSTOP_MIN
165 return g
170 ##############################
171 # testing functions: #
172 ##############################
174 if __name__ == "__main__":
175 print "--------initialization testing------------"
176 g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])
177 print g2