1b3eb1e6652de351fa24fb370b483e86b4fbc964
# loc_h_harmonic.py, initialization for loc_h_dmv.py
from loc_h_dmv import * # better way to do this?

# todo: tweak these (0.0 on FSTOP_MIN gives too many zero-probabilities though)
# NOTE(review): the bindings of HARMONIC_C, FNONSTOP_MIN and FSTOP_MIN
# (original lines ~6-10) are missing from this copy, although they are read
# by the functions below -- presumably small smoothing constants; confirm
# against the repository version.
RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only

##############################
##############################
# NOTE(review): fragment of taglist(corpus) -- the `def` line (original
# line 16) and the set-building loop body (original lines 19-29) are
# missing from this copy; only the docstring and the guard's raise survive.
'''sents is of this form:
[['tag', ...], ['tag2', ...], ...]

Return a list of the tags. (Has to be ordered for enumerating to be
-- NOTE(review): rest of this sentence missing; presumably "consistent.")

Fortunately only has to run once.
'''
# Reject corpora that already use the reserved 'ROOT' symbol.
# NOTE(review): the surrounding `if` guard testing for 'ROOT' in the tag
# set is missing from this copy -- confirm against the repository version.
raise ValueError("it seems we must have a new ROOT symbol")
# NOTE(review): fragment of the frequency-dictionary initializer -- its
# `def` line, the `f = {}` setup and the enclosing `for tag in tags:` loop
# (original lines 36-45 and 50-51) are missing from this copy, so `f` and
# `tag` below are unbound as shown.
'''Return a frequency dictionary with DMV-relevant keys set to 0 or
-- NOTE(review): remainder of this sentence missing.

Todo: tweak (especially for f_STOP).'''
# Smoothing: every (tag, STOP/-STOP, direction, adjacency) counter starts
# at a small pseudo-count rather than 0, so the later normalization in
# init_normalize never divides pure zeros (cf. the module-top todo about
# FSTOP_MIN being 0.0).
for dir in [LEFT, RIGHT]:
    for adj in [ADJ, NON]:
        f[tag, 'STOP', dir, adj] = FSTOP_MIN
        f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
# Per-tag normalization sums for the "harmonic" CHOOSE counts, one per
# attachment direction; incremented in init_freq.
f[tag, 'sum', RIGHT] = 0.0
f[tag, 'sum', LEFT] = 0.0
def init_freq(corpus, tags):
    '''Returns f, a dictionary with these types of keys:
    - ('ROOT', tag) is basically just the frequency of tag
    - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
      etc. for 'RN', 'LA', 'LN', '-STOP'.
    - (tag, LEFT) is a dictionary of arg:f, where head could take arg
      to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
      by distance, used for finding P_CHOOSE

    1. counts word frequencies for f_ROOT
    2. adds to certain f_STOP counters if a word is found first,
       last, first or second, or last or second to last in the sentence
       (Left Adjacent, Left Non-Adjacent, etc)
    3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
       distance between arg and head)
    '''
    # NOTE(review): original lines 72-76, 81-84, 102, 105, 107-110 and
    # 118-119 are missing from this copy. The reconstruction below keeps
    # every surviving statement, but is NOT runnable as shown: `f` (the
    # zero-initialized dict), `n` (presumably len(sent) - 1, i.e. the last
    # index), the `if` matching the `else:` below, the per-pair `dir`
    # binding and the final `return f` all came from the missing lines.
    for sent in corpus: # sent is ['VBD', 'NN', ...]
        # NOTE: head in DMV_Rule is a number, while this is the string
        for loc_h, head in enumerate(sent):
            # todo grok: how is this different from just using straight head
            # frequency counts, for the ROOT probabilities?
            # (ROOT-frequency counting, original lines 81-82, missing here.)

            # First STOP/-STOP branch; its `if ...:` condition (original
            # line 84) is missing -- note that unlike the else-branch it
            # uses strict `<` / `>` comparisons, so when n is not the true
            # last index the two branches differ. TODO confirm condition.
            f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
            f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
            f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
            f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
            f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
            f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
            f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
            f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last
            else: # note: int(True) == 1
                f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
                f[head, '-STOP', LEFT,NON] += (not loc_h == 1) # not second
                f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
                f[head, '-STOP', LEFT,ADJ] += (not loc_h == 0) # not first
                f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
                f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
                f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
                f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n) # not last

            # this is where we make the "harmonic" distribution. quite.
            for loc_a, arg in enumerate(sent):
                # NOTE(review): the guard skipping loc_a == loc_h (which
                # would divide by zero below) and the binding of `dir`
                # (LEFT/RIGHT depending on which side arg is of head) are
                # on missing original lines 105 and 107-110 -- confirm.
                # Klein & Manning-style harmonic weight: closer args get
                # more initial attachment mass; HARMONIC_C is an additive
                # smoothing constant.
                harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
                if arg not in f[head, dir]:
                    f[head, dir][arg] = 0.0
                f[head, dir][arg] += harmony
                f[head, 'sum', dir] += harmony
                # todo, optimization: possible to do both directions
                # at once here, and later on rule out the ones we've
                # done? does it actually speed things up?
def init_normalize(f, tags, numtag, tagnum):
    '''Use frequencies (and sums) in f to create p_STOP, p_ROOT, p_ATTACH
    and p_ORDER.

    Return a usable DMV_Grammar.'''
    p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
    # numtag maps tag-number -> tag-string; the grammar's probability
    # tables are keyed on the numeric id h.
    for h, head in numtag.iteritems():
        p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']

        # p_STOP = STOP / (STOP + NOT_STOP)
        for dir in [LEFT,RIGHT]:
            for adj in [NON,ADJ]:
                den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
                # NOTE(review): original lines 134 and 136 are missing from
                # this copy -- presumably `if den > 0.0:` guarding the
                # division and an `else:` making the 1.0 assignment the
                # zero-denominator fallback. As shown, the second statement
                # would clobber the first; do NOT read these as sequential.
                p_STOP[h, dir, adj] = float(f[head, 'STOP', dir, adj]) / float(den)
                p_STOP[h, dir, adj] = 1.0

        # Fixed (non-estimated) direction-order preference, from the
        # module-level RIGHT_FIRST constant.
        p_ORDER[GOR, h] = RIGHT_FIRST
        p_ORDER[GOL, h] = 1 - RIGHT_FIRST

        # p_ATTACH: harmonic CHOOSE counts normalized by the per-direction
        # sums accumulated in init_freq.
        for dir in [LEFT, RIGHT]:
            for arg, val in f[head, dir].iteritems():
                p_ATTACH[tagnum[arg], h, dir] = float(val) / f[head,'sum',dir]

    return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
def initialize(corpus):
    ''' corpus is a list of lists of tags.'''
    tags = taglist(corpus)
    numtag, tagnum = {}, {}
    for num, tag in enumerate(tags):
        # NOTE(review): the loop body (original lines 155-156) is missing
        # from this copy -- presumably numtag[num] = tag and
        # tagnum[tag] = num, matching how init_normalize iterates
        # numtag.iteritems() and indexes tagnum[arg]. Confirm.
    # f: frequency counts used in initialization, mostly distances
    f = init_freq(corpus, tags)
    g = init_normalize(f, tags, numtag, tagnum)
    # Stash the initialization constants on the grammar for later
    # inspection.
    g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
    g.FNONSTOP_MIN = FNONSTOP_MIN
    g.FSTOP_MIN = FSTOP_MIN
    # NOTE(review): the tail of this function (original lines 164-169) is
    # missing; g is presumably returned -- the __main__ block below does
    # `g2 = initialize(...)`.
##############################
# testing functions: #
##############################

# Quick smoke test: build a grammar from a single four-word sentence.
# (Python 2 print statement; the file uses iteritems elsewhere, so this is
# Python 2 throughout.)
if __name__ == "__main__":
    print "--------initialization testing------------"
    g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])