# harmonic.py, initialization for dmv
# initialization is a package within dmvccm
from dmv import * # better way to do this?
##############################
##############################
def taglist(sents):
    '''sents is of this form:
    [['tag', ...], ['tag2', ...], ...]

    Return a list of the tags. (Has to be ordered for enumerating to be
    consistent.)

    Fortunately only has to run once.
    '''
    # NOTE(review): the body of this function was lost in the garbled
    # source; rebuilt here.  First-seen order is used so the returned
    # list is deterministic, as the docstring requires.
    tags = []
    seen = set()
    for sent in sents:
        for tag in sent:
            if tag not in seen:
                seen.add(tag)
                tags.append(tag)
    # 'ROOT' is reserved for the root symbol of the grammar
    if 'ROOT' in seen:
        raise ValueError("it seems we must have a new ROOT symbol")
    return tags
# NOTE(review): the def line of this function was lost in the garbled
# source; the name init_zeros is a reconstruction -- confirm against
# callers.
def init_zeros(tags):
    '''Return a frequency dictionary with DMV-relevant keys set to 0 or
    {}.

    Todo: tweak (especially for f_STOP).'''
    f = {}
    # root-attachment counts start at zero
    f['sum', 'ROOT'] = 0
    for tag in tags:
        f['ROOT', tag] = 0
        # STOP counters are smoothed with the FSTOP_MIN/FNONSTOP_MIN
        # minimum counts, for each direction x adjacency combination
        for dir_adj in ['LN', 'LA', 'RN', 'RA']:
            f[tag, 'STOP', dir_adj] = FSTOP_MIN
            f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
        # per-head argument dictionaries and their normalization sums
        f[tag, 'L'] = {}
        f[tag, 'R'] = {}
        f[tag, 'sum', 'L'] = 0.0
        f[tag, 'sum', 'R'] = 0.0
    return f
def init_freq(corpus, tags):
    '''Returns f, a dictionary with these types of keys:
    - ('ROOT', tag) is basically just the frequency of tag
    - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
      etc. for 'RN', 'LA', 'RA', '-STOP'.
    - (tag, 'L') is a dictionary of arg:f, where head could take arg
      to direction 'L' (etc. for 'R') and f is "harmonically" divided
      by distance, used for finding P_CHOOSE

    1. counts word frequencies for f_ROOT
    2. adds to certain f_STOP counters if a word is found first,
       last, first or second, or last or second to last in the sentence
       (Left Adjacent, Left Non-Adjacent, etc)
    3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
       distance between arg and head)
    '''
    # NOTE(review): the original setup of f was lost in the garbled
    # source; rebuilt inline.  STOP counters start at the smoothing
    # minima, ROOT and CHOOSE counters at zero.
    f = {}
    f['sum', 'ROOT'] = 0
    for tag in tags:
        f['ROOT', tag] = 0
        for dir_adj in ['LN', 'LA', 'RN', 'RA']:
            f[tag, 'STOP', dir_adj] = FSTOP_MIN
            f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
        for dir in ['L', 'R']:
            f[tag, dir] = {}          # arg -> harmonic weight
            f[tag, 'sum', dir] = 0.0  # running total for normalization

    for sent in corpus: # sent is ['VBD', 'NN', ...]
        n = len(sent)
        # NOTE: head in DMV_Rule is a number, while this is the string
        for i_h, head in enumerate(sent):
            # todo grok: how is this different from just using straight head
            # frequency counts, for the ROOT probabilities?
            f['ROOT', head] += 1
            f['sum', 'ROOT'] += 1

            # True = 1, False = 0. todo: make prettier
            f[head, 'STOP', 'LN'] += (i_h == 1)          # second word
            f[head, '-STOP', 'LN'] += (not i_h == 1)     # not second
            f[head, 'STOP', 'LA'] += (i_h == 0)          # first word
            f[head, '-STOP', 'LA'] += (not i_h == 0)     # not first
            f[head, 'STOP', 'RN'] += (i_h == n - 2)      # second-to-last
            f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
            f[head, 'STOP', 'RA'] += (i_h == n - 1)      # last word
            f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last

            # this is where we make the "harmonic" distribution. quite.
            for i_a, arg in enumerate(sent):
                if i_a != i_h:  # a head never chooses itself
                    harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
                    if i_a < i_h:
                        dir = 'L'
                    else:
                        dir = 'R'
                    if arg not in f[head, dir]:
                        f[head, dir][arg] = 0.0
                    f[head, dir][arg] += harmony
                    f[head, 'sum', dir] += harmony
                # todo, optimization: possible to do both directions
                # at once here, and later on rule out the ones we've
                # done? does it actually speed things up?
    return f
def init_normalize(f, tags, tagnum, numtag):
    '''Use frequencies (and sums) in f to return create p_STOP and
    p_CHOOSE; at the same time adding the context-free rules to the
    grammar using these probabilities.

    Return a usable grammar.'''
    p_rules = []
    p_STOP, p_ROOT, p_CHOOSE, p_terminals = {}, {}, {}, {}
    for n_h, head in enumerate(tags):
        # relative frequency of head among all root attachments
        p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
        # NOTE(review): the probability arguments of this rule were lost
        # in the garbled source; using p_ROOT for both the non-adjacent
        # and adjacent slot is the natural reading -- confirm against
        # DMV_Rule's signature.
        p_rules.append( DMV_Rule(ROOT, (LRBAR, n_h), STOP,
                                 p_ROOT[n_h], p_ROOT[n_h]) )

        # p_STOP = STOP / (STOP + NOT_STOP)
        for dir in ['L', 'R']:
            for adj in ['N', 'A']:
                p_STOP[n_h, dir+adj] = \
                    float(f[head, 'STOP', dir+adj]) / \
                    (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
        # make rule using the previously found probN and probA:
        # NOTE(review): probability arguments reconstructed -- presumably
        # the right-stop probabilities for the RBAR rule and the
        # left-stop probabilities for the LRBAR rule; verify.
        p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
                                 p_STOP[n_h, 'RN'], p_STOP[n_h, 'RA']) )
        p_rules.append( DMV_Rule((LRBAR, n_h), STOP, (RBAR, n_h),
                                 p_STOP[n_h, 'LN'], p_STOP[n_h, 'LA']) )

        # inner() shouldn't have to deal with those long non-branching stops:
        p_terminals[(NOBAR, n_h), head] = 1.0
        p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
        p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']

        # normalize the harmonic CHOOSE counts per head and direction
        for dir in ['L', 'R']:
            for arg, val in f[head, dir].iteritems():
                p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head, 'sum', dir]

    # after the head tag-loop, add every head-argument rule:
    for (n_a, n_h, dir), p_C in p_CHOOSE.iteritems():
        if dir == 'L': # arg is to the left of head
            p_rules.append( DMV_Rule((RBAR, n_h), (LRBAR, n_a), (RBAR, n_h),
                                     p_C*(1-p_STOP[n_h, dir+'N']),
                                     p_C*(1-p_STOP[n_h, dir+'A'])) )
        else: # dir == 'R', arg is to the right of head
            p_rules.append( DMV_Rule((NOBAR, n_h), (NOBAR, n_h), (LRBAR, n_a),
                                     p_C*(1-p_STOP[n_h, dir+'N']),
                                     p_C*(1-p_STOP[n_h, dir+'A'])) )

    return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT,
                       numtag, tagnum)
def initialize(corpus):
    '''Return an initialized DMV_Grammar
    corpus is a list of lists of tags.'''
    tags = taglist(corpus)
    # two-way tag <-> number mapping, since rules use tag numbers
    tagnum, numtag = {}, {}
    for num, tag in enumerate(tags):
        tagnum[tag] = num
        numtag[num] = tag
    # f: frequency counts used in initialization, mostly distances
    f = init_freq(corpus, tags)
    g = init_normalize(f, tags, tagnum, numtag)
    return g
175 if __name__
== "__main__":
176 print "--------initialization------------"
177 print initialize([['foo', 'two','foo','foo'],
178 ['zero', 'one','two','three']])