# harmonic.py, initialization for dmv
#
# initialization is a package within dmvccm

from dmv import * # better way to do this?

##############################
##############################

def taglist(sents):
    '''sents is of this form:
    [['tag', ...], ['tag2', ...], ...]

    Return a list of the tags. (Has to be ordered for enumerating to be
    consistent.)

    Fortunately only has to run once.'''
    tags = set(tag for sent in sents for tag in sent)
    if 'ROOT' in tags:
        raise ValueError("it seems we must have a new ROOT symbol")
    return sorted(tags)
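
# (taglist: the position of a tag in the returned list becomes its
#  number in tagnum/numtag, cf. initialize() below, which is why the
#  ordering has to be deterministic between runs.)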

def init_zero_f(tags):
    '''Return a frequency dictionary with DMV-relevant keys set to 0 or
    to the minimum counts FSTOP_MIN / FNONSTOP_MIN.

    Todo: tweak (especially for f_STOP).'''
    f = {}
    f['sum', 'ROOT'] = 0.0
    for tag in tags:
        f['ROOT', tag] = 0.0
        for dir_adj in ['LN','LA','RN','RA']:
            f[tag, 'STOP', dir_adj] = FSTOP_MIN
            f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
        f[tag, 'L'] = {}
        f[tag, 'R'] = {}
        f[tag, 'sum', 'R'] = 0.0
        f[tag, 'sum', 'L'] = 0.0
    return f
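
# (FSTOP_MIN and FNONSTOP_MIN come from dmv; assuming they are positive,
#  they act as smoothing pseudo-counts so that no p_STOP computed in
#  init_normalize() below comes out as exactly 0 or 1.)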

def init_freq(corpus, tags):
    '''Returns f, a dictionary with these types of keys:
    - ('ROOT', tag) is basically just the frequency of tag
    - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
      etc. for 'LA', 'RN', 'RA' and for '-STOP'.
    - (tag, 'L') is a dictionary of arg:f, where head could take arg
      to direction 'L' (etc. for 'R') and f is "harmonically" divided
      by distance, used for finding P_CHOOSE

    Going through the corpus, this
    1. counts word frequencies for f_ROOT
    2. adds to the relevant f_STOP counters depending on whether a word
       is first, second, last or second to last in the sentence
       (Left Adjacent, Left Non-Adjacent, etc.)
    3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by the
       distance between arg and head)
    '''
    f = init_zero_f(tags)
    for sent in corpus: # sent is ['VBD', 'NN', ...]
        n = len(sent)
        # NOTE: head in DMV_Rule is a number, while here it is the tag string
        for i_h, head in enumerate(sent):
            # todo grok: how is this different from just using straight head
            # frequency counts, for the ROOT probabilities?
            f['ROOT', head] += 1.0
            f['sum', 'ROOT'] += 1.0
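
            # The direction+adjacency codes: 'LA' = left-adjacent,
            # 'LN' = left non-adjacent, 'RA'/'RN' likewise from the right.
            # A sentence-initial head counts as stopping left-adjacently,
            # one in second position as stopping left-non-adjacently, and
            # symmetrically for the last / second-to-last positions.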
            # True = 1, False = 0. todo: make prettier
            f[head, 'STOP', 'LN']  += (i_h == 1)     # second word
            f[head, '-STOP', 'LN'] += (i_h != 1)     # not second
            f[head, 'STOP', 'LA']  += (i_h == 0)     # first word
            f[head, '-STOP', 'LA'] += (i_h != 0)     # not first
            f[head, 'STOP', 'RN']  += (i_h == n - 2) # second-to-last
            f[head, '-STOP', 'RN'] += (i_h != n - 2) # not second-to-last
            f[head, 'STOP', 'RA']  += (i_h == n - 1) # last word
            f[head, '-STOP', 'RA'] += (i_h != n - 1) # not last
            # this is where we make the "harmonic" distribution
            for i_a, arg in enumerate(sent):
                if i_a != i_h:
                    harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
                    if i_a > i_h:
                        dir = 'R'
                    else:
                        dir = 'L'
                    if arg not in f[head, dir]:
                        f[head, dir][arg] = 0.0
                    f[head, dir][arg] += harmony
                    f[head, 'sum', dir] += harmony
                    # todo, optimization: possible to do both directions
                    # at once here, and later on rule out the ones we've
                    # done? does it actually speed things up?
    return f
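
# A small worked example of the harmonic scores (illustrative only;
# HARMONIC_C is whatever dmv defines it to be): in the sentence
# ['DT', 'NN', 'VBD'], the head 'NN' is one word from both 'DT' and
# 'VBD', so each gets 1.0/1 + HARMONIC_C added under f['NN','L']['DT']
# and f['NN','R']['VBD'], while 'DT' only adds 1.0/2 + HARMONIC_C for
# 'VBD' under f['DT','R']['VBD'].  Nearer head-argument pairs thus start
# out with more of the P_CHOOSE mass.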

def init_normalize(f, tags, tagnum, numtag):
    '''Use the frequencies (and sums) in f to create p_STOP and
    p_CHOOSE, while at the same time adding the context-free rules to
    the grammar using these probabilities.

    Return a usable grammar.'''
    p_rules = []
    p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
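    # Each DMV_Rule below carries two probabilities: probN for the
    # non-adjacent case and probA for the adjacent case (cf. the
    # dir+'N' / dir+'A' lookups into p_STOP further down).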
    for n_h, head in enumerate(tags):
        p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
        p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
                                 p_ROOT[n_h],
                                 p_ROOT[n_h]) )
        # p_STOP = STOP / (STOP + NOT_STOP)
        for dir in ['L','R']:
            for adj in ['N','A']:
                p_STOP[n_h, dir+adj] = \
                    float(f[head, 'STOP', dir+adj]) / \
                    (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
        # make the stop rules using the previously found probN and probA:
        p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
                                 p_STOP[n_h, 'RN'],
                                 p_STOP[n_h, 'RA']) )
        p_rules.append( DMV_Rule((LRBAR, n_h), STOP, (RBAR, n_h),
                                 p_STOP[n_h, 'LN'],
                                 p_STOP[n_h, 'LA']) )
        # inner() shouldn't have to deal with those long non-branching stops:
        p_terminals[(NOBAR, n_h), head] = 1.0
        p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
        p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
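        # (Rewriting (RBAR,h) or (LRBAR,h) straight to the terminal
        # amounts to taking the adjacent STOP on the remaining side(s),
        # hence the p_STOP factors just above.)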
        for dir in ['L', 'R']:
            for arg, val in f[head, dir].iteritems():
                p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
    # after the head tag-loop, add every head-argument rule:
    for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
        if dir == 'L': # arg is to the left of head
            p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
                                     p_C*(1-p_STOP[n_h, dir+'N']),
                                     p_C*(1-p_STOP[n_h, dir+'A'])) )
        if dir == 'R': # arg is to the right of head
            p_rules.append( DMV_Rule((NOBAR,n_h), (NOBAR,n_h), (LRBAR,n_a),
                                     p_C*(1-p_STOP[n_h, dir+'N']),
                                     p_C*(1-p_STOP[n_h, dir+'A'])) )

    return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum)
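
# A quick sanity check of the p_STOP normalization (illustrative numbers;
# FSTOP_MIN/FNONSTOP_MIN are whatever dmv defines): if a tag occurred
# sentence-initially 3 times and non-initially 7 times, init_freq leaves
# f[tag,'STOP','LA'] = FSTOP_MIN + 3 and f[tag,'-STOP','LA'] =
# FNONSTOP_MIN + 7, so p_STOP[n_h,'LA'] comes out near 3/10, nudged
# only slightly by the pseudo-counts.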

def initialize(corpus):
    '''Return an initialized DMV_Grammar.
    corpus is a list of lists of tags.'''
    tags = taglist(corpus)
    tagnum, numtag = {}, {}
    for num, tag in enumerate(tags):
        tagnum[tag] = num
        numtag[num] = tag
    # f: frequency counts used in initialization, mostly distances
    f = init_freq(corpus, tags)
    g = init_normalize(f, tags, tagnum, numtag)
    return g

if __name__ == "__main__":
    print "--------initialization------------"
    print initialize([['foo', 'two', 'foo', 'foo'],
                      ['zero', 'one', 'two', 'three']])