1 # loc_h_harmonic.py, initialization for loc_h_dmv.py
3 from loc_h_dmv
import * # better way to do this?
# Tunable constants for the harmonic initialization; the C_* names refer
# to the corresponding constants in report.pdf.
HARMONIC_C = 0.0 # C_A in report.pdf
STOP_C = 1.0 # C_S in report.pdf
NSTOP_C = 1.0 # C_N in report.pdf
FSTOP_MIN = 1.0 # C_M in report.pdf

FNONSTOP_MIN = 0.0 # for OLD_STOP_CALC. 0.0 on FSTOP_MIN gives many
                   # zero-probabilities for OLD_STOP_CALC

# Probability mass given to taking RIGHT dependents before LEFT ones
# (used for p_ORDER in init_normalize below).
RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
##############################
##############################

def taglist(sents):
    '''Collect the distinct tags used in a corpus.

    sents is of this form:
    [['tag', ...], ['tag2', ...], ...]

    Return a list of the tags. (Has to be ordered for enumerating to be
    consistent between runs, so the result is sorted.)

    Fortunately only has to run once.

    Raises ValueError if the reserved 'ROOT' symbol occurs as a tag.
    '''
    tagset = set()
    for sent in sents:
        for tag in sent:
            if tag == 'ROOT':
                # 'ROOT' is reserved for the artificial root head
                raise ValueError("it seems we must have a new ROOT symbol")
            tagset.add(tag)
    # sort so that enumerate() over the result is deterministic
    tags = list(tagset)
    tags.sort()
    return tags
# NOTE(review): several original lines are missing from this chunk; the
# fragment below is the tail of a helper whose 'def' line (and name) is
# not visible.  From the docstring and the keys it sets, it apparently
# builds the zero/minimum-count frequency dictionary 'f' that
# init_freq() below increments -- presumably inside a 'for tag in tags'
# loop.  TODO confirm against the full file.
'''Return a frequency dictionary with DMV-relevant keys set to 0 or
{} (the middle of this docstring is missing from this chunk).

Todo: tweak (especially for f_STOP).'''
for dir in [LEFT, RIGHT]:
    for adj in [ADJ, NON]:
        f[tag, 'STOP', dir, adj] = 0.0
        # NOTE(review): the 0.0 just above is immediately overwritten by
        # FSTOP_MIN below; an intervening original line (likely a
        # conditional or a comment) is missing from this chunk.
        f[tag, 'STOP', dir, adj] = FSTOP_MIN
        f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
# running sums later used by init_normalize to normalize p_ATTACH
f[tag, 'sum', RIGHT] = 0.0
f[tag, 'sum', LEFT] = 0.0
def init_freq(corpus, tags):
    '''Returns f, a dictionary with these types of keys:
    - ('ROOT', tag) is basically just the frequency of tag
    - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
    etc. for 'RN', 'LA', 'LN', '-STOP'.
    - (tag, LEFT) is a dictionary of arg:f, where head could take arg
    to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
    by distance, used for finding P_CHOOSE

    1. counts word frequencies for f_ROOT
    2. adds to certain f_STOP counters if a word is found first,
    last, first or second, or last or second to last in the sentence
    (Left Adjacent, Left Non-Adjacent, etc)
    3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
    distance between arg and head)

    (NOTE(review): the original end of this docstring is missing from
    this chunk; closed here so the fragment reads as one unit.)
    '''
    # NOTE(review): original lines are missing here.  'f' is used with
    # '+=' below but is never initialized in the visible code --
    # presumably f = <the zero/min-initialized dict built by the helper
    # above>.  TODO confirm.
    for sent in corpus: # sent is ['VBD', 'NN', ...]
        # NOTE(review): 'n' below is also never assigned in the visible
        # code; given the "last word"/"second-to-last" comments and the
        # 0-based enumerate, presumably n = len(sent) - 1.  TODO confirm.
        # NOTE: head in DMV_Rule is a number, while this is the string
        for loc_h, head in enumerate(sent):
            # todo grok: how is this different from just using straight head
            # frequency counts, for the ROOT probabilities?
            f['ROOT', head] += 1.0
            f['sum', 'ROOT'] += 1.0

            # f_STOP / f_-STOP counters; the boolean comparisons add 0/1.
            # NOTE(review): the two near-duplicate groups below were
            # presumably alternative branches of an if/elif on a
            # STOP_CALC setting (cf. the commented-out
            # "elif OTHER_STOP_CALC:" lines); the guard lines are missing
            # from this chunk.  TODO confirm.
            f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
            f[head, '-STOP', LEFT,NON] += (not loc_h == 1) # not second
            f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
            f[head, '-STOP', LEFT,ADJ] += (not loc_h == 0) # not first
            f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
            f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
            f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
            f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n) # not last

            f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
            f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
            f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
            f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
            # elif OTHER_STOP_CALC:
            #     f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
            #     f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
            #     f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
            #     f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
            #     f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
            #     f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
            #     f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
            #     f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last

            # this is where we make the "harmonic" distribution. quite.
            for loc_a, arg in enumerate(sent):
                # NOTE(review): missing lines again -- 'dir' is never set
                # in the visible code, and loc_a == loc_h would divide by
                # zero below; presumably the missing lines skip
                # loc_a == loc_h and set dir to LEFT or RIGHT depending
                # on which side of the head loc_a lies.  TODO confirm.
                harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
                if arg not in f[head, dir]:
                    f[head, dir][arg] = 0.0
                f[head, dir][arg] += harmony
                f[head, 'sum', dir] += harmony # normalizer for p_ATTACH
                # todo, optimization: possible to do both directions
                # at once here, and later on rule out the ones we've
                # done? does it actually speed things up?
def init_normalize(f, tags, numtag, tagnum):
    '''Use frequencies (and sums) in f to create p_STOP, p_ATTACH,
    p_ROOT and p_ORDER (part of this docstring is missing from this
    chunk).

    Return a usable DMV_Grammar.'''
    p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
    # numtag maps tag number h -> tag string head
    for h, head in numtag.iteritems():
        # f['ROOT', head] is just a simple frequency count
        p_ROOT[h] = f['ROOT', head] / f['sum', 'ROOT']

        for dir in [LEFT,RIGHT]:
            for adj in [NON,ADJ]:
                den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
                # NOTE(review): the 'if' matching the orphan 'else:'
                # below is missing from this chunk -- presumably
                # 'if den > 0.0:' guarding the division.  TODO confirm.
                p_STOP[h, dir, adj] = f[head, 'STOP', dir, adj] / den
                else: p_STOP[h, dir, adj] = 1.0
                # NOTE(review): as visible, the smoothed estimate below
                # unconditionally overwrites the ratio computed above;
                # missing original lines may have made the two variants
                # alternatives.  TODO confirm.
                p_STOP[h, dir, adj] = \
                    (FSTOP_MIN + f[head, 'STOP', dir, adj] * STOP_C) / \
                    (FSTOP_MIN + f['ROOT',head] * (STOP_C + NSTOP_C))

        # attachment-order probabilities: RIGHT_FIRST chance of taking
        # right dependents before left ones
        p_ORDER[GOR, h] = RIGHT_FIRST
        p_ORDER[GOL, h] = 1 - RIGHT_FIRST

        for dir in [LEFT, RIGHT]:
            for arg, val in f[head, dir].iteritems():
                # harmonic counts normalized by the per-head sum
                p_ATTACH[tagnum[arg], h, dir] = val / f[head,'sum',dir]
    return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
def initialize(corpus):
    '''Build an initial "harmonic" DMV_Grammar from corpus.

    corpus is a list of lists of tags.

    Returns the DMV_Grammar, with the initialization constants attached
    as attributes (for evaluations in main.py, todo: remove).
    '''
    tags = taglist(corpus)
    numtag, tagnum = {}, {}
    # two-way mapping between tag numbers and tag strings; numbering
    # follows the ordered taglist so it is stable between runs
    # (init_normalize iterates numtag num->tag and looks up tagnum tag->num)
    for num, tag in enumerate(tags):
        numtag[num] = tag
        tagnum[tag] = num
    # f: frequency counts used in initialization, mostly distances
    f = init_freq(corpus, tags)
    g = init_normalize(f, tags, numtag, tagnum)
    g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
    g.FNONSTOP_MIN = FNONSTOP_MIN
    g.FSTOP_MIN = FSTOP_MIN
    return g
##############################
# testing functions: #
##############################

if __name__ == "__main__":
    # Quick smoke test: build a grammar from a single four-word sentence.
    # NOTE(review): Python 2 print statement; any further test code after
    # this point is missing from this chunk.
    print "--------initialization testing------------"
    g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])