3 # - prettier printout for DMV_Rule
4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
6 # - Started on P_STOP, a bit less pseudo now..
9 # - started on initialization. So far, I have frequencies for
10 # everything, very harmonic. Still need to make these into 1-summing
14 # - more work on initialization (init_freq and init_normalize),
15 # getting closer to probabilities now.
18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
19 # and also adds the relevant probabilities to p_rules in a grammar.
20 # Still, each individual rule has to store both adjacent and non_adj
21 # probabilities, and inner() should be able to send some parameter
22 # which lets the rule choose... hopefully... Is this possible to do
23 # top-down even? when the sentence could be all the same words?
24 # todo: extensive testing of identical words in sentences!
25 # - frequencies (only used in initialization) are stored as strings,
26 # but in the rules and p_STOP etc, there are only numbers.
29 # - copied inner() into this file, to make the very dmv-specific
30 # adjacency stuff work (have to factor that out later on, when it
35 # - finished typing in inner_dmv(), still have to test and debug
36 # it. The chart is now four times as big since for any rule we may
37 # have attachments to either the left or the right below, which
38 # upper rules depend on, for selecting probN or probA
41 # import numpy # numpy provides Fast Arrays, for future optimization
48 # is this the best way to represent ROOT and STOP?
53 # the following will only print when in this module:
# Module smoke test: runs only when this file is executed directly.
# NOTE(review): the original numbering jumps 56 -> 58, so at least one
# line is missing here -- presumably a try/except around the io.CNF_Rule
# construction (the second print reads like an ImportError handler).
# Confirm against the full file before relying on this fragment.
54 if __name__ == "__main__":
55 print "DMV-module tests:"
56 a = io.CNF_Rule(3,1,3,0.1)
58 print "import io not working"
# DMV_Grammar: io.Grammar subclass that additionally carries the
# DMV-specific probability tables p_STOP, p_CHOOSE and p_ROOT
# (stored by __init__ below; p_rules/p_terminals are delegated to
# io.Grammar).
# NOTE(review): original lines 61-62, 65, 68-69, 71-75, 77-78, 80, 83,
# 86 and 88 are missing from this excerpt; the opening ''' of the class
# docstring and several method 'def' headers are among them, so the
# orphaned for/return lines below belong to methods whose names are not
# visible here.
60 class DMV_Grammar(io.Grammar):
63 We need to be able to access rules per mother node, sum every H, every
64 H_, ..., every H', every H'_, etc. for the IO-algorithm.
66 What other representations do we need? (P_STOP formula uses
67 deps_D(h,l/r) at least)'''
# body line of a method whose header is missing from this excerpt:
70 for r in self.p_rules:
# selects the p_rules entries matching a given head and bar-level;
# the enclosing 'def' (taking head and bars) is in the missing lines:
76 return [r for r in self.p_rules if r.head == head and r.bars == bars]
# placeholder return -- still pseudo-code in the original:
79 return "some structure full of rule heads.."
# deps: placeholder; returns a descriptive string, not real dependents yet.
81 def deps(self, h, dir):
82 return "all dir-dependents of rules with head h"
# Constructor: rules/terminals go to io.Grammar; the DMV tables are
# stored on self. The self.p_STOP / self.p_ROOT assignments presumably
# sit on the missing lines 86 and 88 -- confirm against the full file.
84 def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT):
85 io.Grammar.__init__(self, p_rules, p_terminals)
87 self.p_CHOOSE = p_CHOOSE
# DMV_Rule: one CNF rule with TWO probabilities -- probA for the
# adjacent case and probN for the non-adjacent case -- selected by p()
# depending on the attachment flags of the children. LHS is the pair
# (bars, head). The rule table in the docstring below enumerates the
# DMV rule types (STOP rules and attachment rules, left/right,
# adjacent/non-adjacent).
# NOTE(review): many original lines (93, 95, 98, 100-101, 110, 115,
# 117-121, 123, 128-129, 132-138, 140-144) are missing from this
# excerpt: the branches of p() that actually return probA/probN, the
# 'def' of the accessor that returns (self.bars, self.head), the bars
# validation 'if' matching the 'else:' at 130, and most of bar_str/
# __str__ are not visible here.
91 class DMV_Rule(io.CNF_Rule):
92 '''A single CNF rule in the PCFG, of the form
94 where LHS = (bars, head)
96 todo: possibly just store b_h instead of bars and head? (then b_h
97 = LHS, while we need new accessor functions for bars and head)
99 Different rule-types have different probabilities associated with
102 _h_ -> STOP h_ P( STOP|h,L, adj)
103 _h_ -> STOP h_ P( STOP|h,L,non_adj)
104 h_ -> h STOP P( STOP|h,R, adj)
105 h_ -> h STOP P( STOP|h,R,non_adj)
106 h_ -> _a_ h_ P(-STOP|h,L, adj) * P(a|h,L)
107 h_ -> _a_ h_ P(-STOP|h,L,non_adj) * P(a|h,L)
108 h -> h _a_ P(-STOP|h,R, adj) * P(a|h,R)
109 h -> h _a_ P(-STOP|h,R,non_adj) * P(a|h,R)
# p(): pick probA (adjacent) when neither side has a lower attachment,
# otherwise probN -- the return statements live in the missing lines.
111 def p(self, LRattach, RLattach, *arg):
112 '''Returns the correct probability, adjacent or non-adjacent,
113 depending on whether or not there is a some lower attachment either on
114 the right side of the left child, or the left side of the right child.
116 if (not LRattach) and (not RLattach):
# accessor body (its 'def' line is missing); returns the LHS pair:
122 return (self.bars, self.head)
# Constructor: b_h is the (bars, head) pair; probN is also passed to
# io.CNF_Rule as the base probability.
124 def __init__(self, b_h, b_L, b_R, probN, probA):
125 io.CNF_Rule.__init__(self, b_h[1], b_L, b_R, probN)
126 self.probA = probA # adjacent
127 self.probN = probN # non_adj
# validation: the matching 'if b_h[0] ... BARS' test is in the missing
# lines 128-129 -- confirm against the full file.
130 else: # hmm, should perhaps check b_L and b_R too? todo
131 raise ValueError("bars must be in %s; was given: %s" % (BARS, b_h[0]))
# fragment of a bar_str-style pretty-printer (shadows builtin 'str'
# locally in the original):
139 str = "_%d_" % b_h[1]
# fragment of __str__: prints head, children and both probabilities:
145 return "%s\t->\t%s\t%s\t[%.2f] [%.2f]" % (bar_str((self.bars,self.head)),
152 # the following will only print when in this module:
# Smoke test: builds the four toy rules over a single head tag 0 and a
# tiny terminal table b2, then checks io.inner on "h h".
# NOTE(review): line 158 is missing from this excerpt -- presumably
# 'b2 = {}' -- and lines 162, 164-165, 169-170 as well; confirm
# against the full file.
153 if __name__ == "__main__":
154 h2 = DMV_Rule((LRBAR,0),STOP,(RBAR,0), 1.0,1.0) #Lstop
155 h1 = DMV_Rule((RBAR,0),(NOBAR,0),STOP, 0.5,0.5) #Rstop
156 h3 = DMV_Rule((RBAR,0),(LRBAR,0),(RBAR,0), 0.5,0.5) #Lattach
157 h0 = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0,1.0) #Rattach
159 b2[(NOBAR, 0), 'h'] = 1.0
160 b2[(RBAR, 0), 'h'] = h1.prob
161 b2[(LRBAR, 0), 'h'] = h1.prob * h2.prob
163 g2 = DMV_Grammar([h0,h1,h2,h3],b2,0,0,0)
166 test1 = io.inner(0,1, (LRBAR,0), g2, "h h".split(), {})
167 if test1[0] != 0.375:
# NOTE(review): BUG -- this format string contains no %-specifier,
# so applying '% test1[0]' raises TypeError ("not all arguments
# converted") whenever this branch triggers; it should read
# something like "...; was %s" % test1[0]. Flagged only, since
# surrounding lines are missing from this excerpt.
168 print "Should be 0.375 = 0.5^3 + 0.5^2" % test1[0]
171 pprint.pprint( io.inner(0,2, (LRBAR,0), g2, "h h h".split(), {}) )
177 ###################################
178 # dmv-specific version of inner() #
179 ###################################
# rewrite_adj: enumerate the (LL, LR, RL, RR) boolean attachment-flag
# combinations allowed for the two children of a rule, given the rule's
# bar level and the parent's Lattach/Rattach flags. Used by inner_dmv
# to decide probA vs probN per split.
# NOTE(review): the original numbering jumps 201 -> 203 and ends at 206,
# so the LRBAR branch (if any) and the fall-through 'return ()' the
# trailing comment describes are not visible in this excerpt.
180 def rewrite_adj(bars, Lattach, Rattach):
181 # todo: make prettier? Although since we call this so many times,
182 # having it spelled out here is probably faster
# NOBAR (h -> h _a_): only reachable with no left attachment and a
# right attachment; all 8 combinations of the remaining three flags:
183 if bars == NOBAR and not Lattach and Rattach:
184 return ( (Lattach, False, False, False),
185 (Lattach, False, False, True),
186 (Lattach, False, True, False),
187 (Lattach, False, True, True),
188 (Lattach, True, False, False),
189 (Lattach, True, False, True),
190 (Lattach, True, True, False),
191 (Lattach, True, True, True), )
# RBAR (h_ -> _a_ h_): requires a left attachment; Rattach is passed
# through unchanged:
192 elif bars == RBAR and Lattach:
193 # Rattach may be either true or false here!
194 return ( (False, False, False, Rattach),
195 (False, False, True, Rattach),
196 (False, True, False, Rattach),
197 (False, True, True, Rattach),
198 (True, False, False, Rattach),
199 (True, False, True, Rattach),
200 (True, True, False, Rattach),
201 (True, True, True, Rattach) )
203 # NOBAR rewrite rules cannot have Lattach below, and must
204 # have/add Rattach. RBAR rewrite rules must add Lattach, but
205 # don't care about Rattach. Returning () should ensure we
206 # don't add any probability to such "false" situations
# inner_dmv: DMV-specific inside probability. Memoizes into 'chart'
# keyed by (s, t, LHS, Lattach, Rattach) -- 4x the plain inner() chart,
# so STOP rules can distinguish adjacent (probA) from non-adjacent
# (probN) contexts. Returns [inner_prob, chart] where inner_prob sums
# over all four attachment-flag combinations for the requested span.
# NOTE(review): many original lines are missing from this excerpt
# (211, 215, 218, 221-225, 229, 231, 235, 239, 243-244, 249-250, 253,
# 256, 260, 262, 270, 273-274, 276-277): the local helpers debug() and
# O() (word lookup), the terminal-case returns, the L/R unpacking of
# each rule, and the STOP if/else structure are not visible here --
# read alongside the full file.
209 def inner_dmv(s, t, LHS, g, sent, chart):
210 ''' A rewrite of inner in io.py, to take adjacency into accord.
212 The chart is now 4 times bigger, since there are different values
213 for with or without L/R attachments:
214 chart[(s,t,LHS, Lattach, Rattach)]
216 If Rattach==True then the rule has a right-attachment or there is
217 one lower in the tree (meaning we're no longer adjacent).
219 Todo: make this work, then, if possible, refactor (move
220 dmv-specific stuff back into dmv, so this is "general" again)
# e(): the memoized recursive worker over spans.
226 def e(s,t,LHS, Lattach, Rattach):
227 if (s, t, LHS, Lattach, Rattach) in chart:
228 return chart[(s, t, LHS, Lattach, Rattach)]
230 debug( "trying from %d to %d with %s (L:%s, R:%s)" % (s,t,LHS, Lattach, Rattach) )
# terminal case (single-word span, presumably guarded by s == t in a
# missing line):
232 if (LHS, O(s)) in g.p_terminals:
233 # terminals are always F,F for attachment
234 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
236 # todo: is this the right way to deal with lacking
237 # rules? assuming it is so, since we add
238 # probabilities, and 0 is identity
240 print "\t LACKING TERMINAL:"
241 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
242 # todo: add to chart perhaps? Although, it _is_ simple lookup..
# non-terminal case: initialize the cell, then sum over rules.
245 if (s,t,LHS,Lattach, Rattach) not in chart:
246 chart[(s,t,LHS,Lattach,Rattach)] = 0.0
247 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
248 debug( "\tsumming rule %s" % rule )
# STOP rules rewrite the SAME span (no split); the branch choosing
# between the L==STOP and R==STOP cases is in the missing lines:
251 # if it's a STOP rule, rewrite for the same range:
252 if (L == STOP) or (R == STOP):
254 p = rule.p(Lattach, False) # todo check
255 pLR = e(s, t, R, Lattach, Rattach)
257 p = rule.p(False, Rattach) # todo check
258 pLR = e(s, t, L, Lattach, Rattach)
259 chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
# attachment rules split the span at r and sum over all child
# attachment-flag combinations from rewrite_adj:
261 # not a STOP, an attachment rewrite:
263 for r in range(s, t):
264 # LL etc are boolean attachment values
265 for (LL, LR, RL, RR) in rewrite_adj(rule.bars, Lattach, Rattach):
266 p = rule.p(LR, RL) # probN or probA
267 pL = e(s, r, L, LL, LR)
268 pR = e(r+1, t, R, RL, RR)
269 chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
271 debug( "\tchart[(%d,%d,%s,%s%s)] = %.2f" % (s,t,LHS,Lattach,Rattach, chart[(s,t,LHS,Lattach,Rattach)]) )
272 return chart[(s, t, LHS,Lattach,Rattach)]
# top level: total inside probability is the sum over all four
# attachment-flag settings for the requested span.
275 inner_prob = e(s,t,LHS,True,True) + e(s,t,LHS,True,False) + e(s,t,LHS,False,True) + e(s,t,LHS,False,False)
# debug dump of the chart (Python 2 dict.iteritems):
278 for k,v in chart.iteritems():
279 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
280 print "---CHART:end---"
281 return [inner_prob, chart]
289 ##############################
290 # DMV-probabilities, todo: #
291 ##############################
# P_STOP: reestimation of the stop probability, still largely
# pseudo-code per the header comments of this file ("a bit less pseudo
# now"). Accumulates numerator/denominator from inner() over spans.
# NOTE(review): lines 295, 298-299, 301, 303-305, 307-308, 312, 317-318,
# 320-325, 331 are missing; g, sent, chart, loc(), r and l used below
# are not defined in the visible fragment.
293 def P_STOP(STOP, h, dir, adj, corpus):
294 '''corpus is a list of sentences s.
296 This is based on the formula where STOP is True... not sure how we
297 calculate if STOP is False.
300 I thought about instead having this:
302 for rule in g.p_rules:
306 for rule in g.p_rules:
309 set num and den using inner
310 for rule in g.p_rules
311 rule.prob = rule.num / rule.den
313 ..the way I'm assuming we do it in the commented out io-function in
314 io.py. Having sentences as the outer loop at least we can easily just
315 go through the heads that are actually in the sentence... BUT, this
316 means having to go through p_rules 3 times, not sure what is slower.
319 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
326 # here we should somehow make each word in the sentence
327 # unique, decorate them with subscripts or something. We have
328 # to run through the sentence as many times as h appears
329 # there. This also means changing inner(), I suspect. Have to
330 # make sure we separate reading of inner_prob from changing of
332 for s in range(loc(h)): # i<loc(h), where h is in the sentence.
# NOTE(review): 'i' is not defined in the visible fragment -- the
# comment on the line above suggests 's' plays the role of i, so this
# likely should be range(s, ...) or range(loc(h), ...); confirm
# against the full file before trusting this loop.
333 for t in range(i, len(sent)):
334 P_STOP_num += inner(s, t, h-r, g, sent, chart)
335 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
# NOTE(review): acknowledged uncertainty in the original; also no guard
# against a zero denominator.
336 return P_STOP_num / P_STOP_den # possibly other way round? todo
# NOTE(review): headerless fragment -- the enclosing 'def' line
# (original line ~342) is missing from this excerpt. From the body it
# appears to sketch a sentence-probability computation: a product of
# P_STOP and P_CHOOSE terms over each head's dependents in both
# directions, returned as P_h of the sentence root. Still pseudo-code
# (P_STOP(STOP | h, dir, adj) is not valid Python).
343 '''Here it seems like they store rule information on a per-head (per
344 direction) basis, in deps_D(h, dir) which gives us a list. '''
347 for dir in ['l', 'r']:
348 for a in deps(h, dir):
351 P_STOP (0, h, dir, adj) * \
352 P_CHOOSE (a, h, dir) * \
354 P_STOP (STOP | h, dir, adj)
356 return P_h(root(sent))
360 ##############################
361 # Initialization, todo #
362 ##############################
# NOTE(review): headerless fragment -- the 'def taglist(...)' line
# (original ~363) and most of the body are missing from this excerpt.
# Purpose per the docstring: extract an ordered list of the distinct
# tags from a corpus of tag-lists; raises if the ROOT symbol collides
# with a corpus tag.
364 '''sents is of this form:
365 [['tag', ...], ['tag2', ...], ...]
367 Return a list of the tags. (Has to be ordered for enumerating to be
370 Fortunately only has to run once.
377 raise ValueError("it seems we must have a new ROOT symbol")
# init_zeros: build the empty frequency table consumed by init_freq.
# Keys follow the (tag, 'STOP'/'-STOP', dir+adj) and (tag, 'sum', dir)
# conventions; dir+adj is one of LN/LA/RN/RA (Left/Right x
# Non-adjacent/Adjacent).
# NOTE(review): lines 386-389, 393-394 and 397-398 are missing here --
# presumably the 'f = {}' initialization, the 'for tag in tags:' loop
# header, the (tag, 'L')/(tag, 'R') dict entries and 'return f'.
384 def init_zeros(tags):
385 "Return a frequency dictionary with DMV-relevant keys set to 0 / {}."
390 for dir_adj in ['LN','LA','RN','RA']:
391 f[tag, 'STOP', dir_adj] = 0
392 f[tag, '-STOP', dir_adj] = 0
395 f[tag, 'sum', 'R'] = 0.0
396 f[tag, 'sum', 'L'] = 0.0
# init_freq: harvest the harmonic-initialization counts from the
# corpus: ROOT counts, STOP/-STOP counts per direction and adjacency
# (based on the head's position in the sentence), and distance-weighted
# head-argument counts used later for P_CHOOSE.
# NOTE(review): lines 407-409, 411, 416, 418, 428, 431, 433-436 and
# 444-446 are missing here -- presumably 'f = init_zeros(tags)',
# 'n = len(sent)', the f['ROOT', head] increment, the i_a != i_h guard,
# the 'dir' assignment ('L' vs 'R' from i_a vs i_h) and 'return f'.
399 def init_freq(corpus, tags):
400 '''Returns f, a dictionary with these types of keys:
401 - ('ROOT', tag) is basically just the frequency of tag
402 - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
403 etc. for 'RN', 'LA', 'LN', '-STOP'.
404 - (tag, 'L') is a dictionary of arg:f, where head could take arg
405 to direction 'L' (etc. for 'R') and f is "harmonically" divided
406 by distance, used for finding P_CHOOSE
410 for sent in corpus: # sent is ['VBD', 'NN', ...]
412 # NOTE: head in DMV_Rule is a number, while this is the string
413 for i_h, head in enumerate(sent):
414 # todo grok: how is this different from just using straight head
415 # frequency counts, for the ROOT probabilities?
417 f['sum', 'ROOT'] += 1
# Position-based STOP counts; booleans add as 0/1 in Python.
419 # True = 1, False = 0. todo: make prettier
420 f[head, 'STOP', 'LN'] += (i_h <= 1) # first two words
421 f[head, '-STOP', 'LN'] += (not i_h <= 1)
422 f[head, 'STOP', 'LA'] += (i_h == 0) # very first word
423 f[head, '-STOP', 'LA'] += (not i_h == 0)
424 f[head, 'STOP', 'RN'] += (i_h >= n - 2) # last two words
425 f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
426 f[head, 'STOP', 'RA'] += (i_h == n - 1) # very last word
427 f[head, '-STOP', 'RA'] += (not i_h == n - 1)
429 # this is where we make the "harmonic" distribution. quite.
430 for i_a, arg in enumerate(sent):
# NOTE(review): operator precedence makes this (1/dist) + HARMONIC_C,
# not 1/(dist + HARMONIC_C) -- confirm which harmonic formula was
# intended before changing anything.
432 harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
437 if arg not in f[head, dir]:
438 f[head, dir][arg] = 0.0
439 f[head, dir][arg] += harmony
440 f[head, 'sum', dir] += harmony
441 # todo, optimization: possible to do both directions
442 # at once here, and later on rule out the ones we've
443 # done? does it actually speed things up?
# init_normalize: turn the raw counts in f into probabilities
# (p_ROOT, p_STOP, p_CHOOSE, p_terminals) and build the DMV_Rule list,
# returning a complete DMV_Grammar. tagnum maps tag-string -> index
# (used to key p_CHOOSE numerically).
# NOTE(review): lines 451, 453, 458-460, 467, 471, 476, 480, 487 and
# 491 are missing here -- presumably including 'p_rules = []', the
# probability arguments of the ROOT rule append at 457, an LRBAR STOP
# rule, and the 'else:' matching the dir == 'L' branch at 483.
447 def init_normalize(f, tags, tagnum):
448 '''Use frequencies (and sums) in f to return create p_STOP and
449 p_CHOOSE; at the same time adding the context-free rules to the
450 grammar using these probabilities.
452 Return a usable grammar.'''
454 p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
455 for n_h, head in enumerate(tags):
456 p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
457 p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
# Relative-frequency estimate of stopping, per direction and adjacency.
# NOTE(review): a tag with zero STOP and -STOP counts would divide by
# zero here -- confirm the full file guards against that.
461 # p_STOP = STOP / (STOP + NOT_STOP)
462 for dir in ['L','R']:
463 for adj in ['N','A']:
464 p_STOP[n_h, dir+adj] = \
465 float(f[head, 'STOP', dir+adj]) / \
466 (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
# NOTE(review): at original line 468 'dir' is whatever value the loop
# above left behind ('R' on its last iteration) -- if a left-stop rule
# was intended as well it must be in the missing line 467; verify.
468 p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
469 p_STOP[n_h, dir+'N'],
470 p_STOP[n_h, dir+'A']) )
# Terminal shortcuts so inner() can close a one-word span directly
# instead of expanding the non-branching STOP chain.
472 # inner() shouldn't have to deal with those long non-branching stops:
473 p_terminals[(NOBAR, n_h), head] = 1.0
474 p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
475 p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
# P_CHOOSE: normalize the harmonic attachment counts per head/direction.
477 for dir in ['L', 'R']:
478 for arg, val in f[head, dir].iteritems():
479 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
# Attachment rules combine P(-STOP) with P_CHOOSE (cf. the rule table
# on DMV_Rule): left attachments rewrite RBAR, right ones NOBAR.
481 # after the head tag-loop, add every head-argument rule:
482 for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
483 if dir == 'L': # arg is to the left of head
484 p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
485 p_C*(1-p_STOP[n_h, dir+'N']),
486 p_C*(1-p_STOP[n_h, dir+'A'])) )
488 p_rules.append( DMV_Rule((NOBAR,n_h), (LRBAR,n_a), (NOBAR,n_h),
489 p_C*(1-p_STOP[n_h, dir+'N']),
490 p_C*(1-p_STOP[n_h, dir+'A'])) )
492 return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT)
# initialize: end-to-end harmonic initialization -- extract the tag
# list, build the tag -> index map, count frequencies, then normalize
# into a DMV_Grammar.
# NOTE(review): lines 499 and 501-502 are missing here -- presumably
# 'tagnum = {}' and the loop body 'tagnum[tag] = num' (tagnum is
# consumed by init_normalize at 506) -- as is the 'return g' promised
# by the docstring (line ~507). Confirm against the full file.
495 def initialize(corpus):
496 '''Return an initialized DMV_Grammar
497 corpus is a list of lists of tags.'''
498 tags = taglist(corpus)
500 for num, tag in enumerate(tags):
503 # f: frequency counts used in initialization, mostly distances
504 f = init_freq(corpus, tags)
506 g = init_normalize(f, tags, tagnum)
# Smoke test: run the full initialization on a two-sentence toy corpus
# (only executes when this file is run directly).
511 if __name__ == "__main__":
512 # print "--------------------"
513 initialize([['foo', 'two','foo','foo'],
514 ['zero', 'one','two','three']])
528 # todo: some testing on the Brown corpus:
530 if __name__ == "__main__":
531 # first five sentences of the Brown corpus:
# NOTE(review): despite the comment, the literal below contains ten
# sentences' worth of tag sequences; purely a hard-coded smoke test.
532 initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
534 # this will give the tag sequences of all the 6218 Brown corpus
535 # sentences of length < 7:
536 # [[tag for (w, tag) in sent]
537 # for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
541 "472 tags, takes a while to extract with tagset(), hardcoded here."
542 return set(['BEDZ-NC', 'NP$', 'AT-TL', 'CS', 'NP+HVZ', 'IN-TL-HL', 'NR-HL', 'CC-TL-HL', 'NNS$-HL', 'JJS-HL', 'JJ-HL', 'WRB-TL', 'JJT-TL', 'WRB', 'DOD*', 'BER*-NC', ')-HL', 'NPS$-HL', 'RB-HL', 'FW-PPSS', 'NP+HVZ-NC', 'NNS$', '--', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'PPSS+MD', 'NPS', 'RBR+CS', 'DTI', 'NPS-TL', 'BEM', 'FW-AT+NP-TL', 'EX+BEZ', 'BEG', 'BED', 'BEZ', 'DTX', 'DOD*-TL', 'FW-VB-NC', 'DTS', 'DTS+BEZ', 'QL-HL', 'NP$-TL', 'WRB+DOD*', 'JJR+CS', 'NN+MD', 'NN-TL-HL', 'HVD-HL', 'NP+BEZ-NC', 'VBN+TO', '*-TL', 'WDT-HL', 'MD', 'NN-HL', 'FW-BE', 'DT$', 'PN-TL', 'DT-HL', 'FW-NR-TL', 'VBG', 'VBD', 'VBN', 'DOD', 'FW-VBG-TL', 'DOZ', 'ABN-TL', 'VB+JJ-NC', 'VBZ', 'RB+CS', 'FW-PN', 'CS-NC', 'VBG-NC', 'BER-HL', 'MD*', '``', 'WPS-TL', 'OD-TL', 'PPSS-HL', 'PPS+MD', 'DO*', 'DO-HL', 'HVG-HL', 'WRB-HL', 'JJT', 'JJS', 'JJR', 'HV+TO', 'WQL', 'DOD-NC', 'CC-HL', 'FW-PPSS+HV', 'FW-NP-TL', 'MD+TO', 'VB+IN', 'JJT-NC', 'WDT+BEZ-TL', '---HL', 'PN$', 'VB+PPO', 'BE-TL', 'VBG-TL', 'NP$-HL', 'VBZ-TL', 'UH', 'FW-WPO', 'AP+AP-NC', 'FW-IN', 'NRS-TL', 'ABL', 'ABN', 'TO-TL', 'ABX', '*-HL', 'FW-WPS', 'VB-NC', 'HVD*', 'PPS+HVD', 'FW-IN+AT', 'FW-NP', 'QLP', 'FW-NR', 'FW-NN', 'PPS+HVZ', 'NNS-NC', 'DT+BEZ-NC', 'PPO', 'PPO-NC', 'EX-HL', 'AP$', 'OD-NC', 'RP', 'WPS+BEZ', 'NN+BEZ', '.-TL', ',', 'FW-DT+BEZ', 'RB', 'FW-PP$-NC', 'RN', 'JJ$-TL', 'MD-NC', 'VBD-NC', 'PPSS+BER-N', 'RB+BEZ-NC', 'WPS-HL', 'VBN-NC', 'BEZ-HL', 'PPL-NC', 'BER-TL', 'PP$$', 'NNS+MD', 'PPS-NC', 'FW-UH-NC', 'PPS+BEZ-NC', 'PPSS+BER-TL', 'NR-NC', 'FW-JJ', 'PPS+BEZ-HL', 'NPS$', 'RB-TL', 'VB-TL', 'BEM*', 'MD*-HL', 'FW-CC', 'NP+MD', 'EX+HVZ', 'FW-CD', 'EX+HVD', 'IN-HL', 'FW-CS', 'JJR-HL', 'FW-IN+NP-TL', 'JJ-TL-HL', 'FW-UH', 'EX', 'FW-NNS-NC', 'FW-JJ-NC', 'VBZ-HL', 'VB+RP', 'BEZ-NC', 'PPSS+HV-TL', 'HV*', 'IN', 'PP$-NC', 'NP-NC', 'BEN', 'PP$-TL', 'FW-*-TL', 'FW-OD-TL', 'WPS', 'WPO', 'MD+PPSS', 'WDT+BER', 'WDT+BEZ', 'CD-HL', 'WDT+BEZ-NC', 'WP$', 'DO+PPSS', 'HV-HL', 'DT-NC', 'PN-NC', 'FW-VBZ', 'HVD', 'HVG', 'NN+BEZ-TL', 'HVZ', 'FW-VBD', 'FW-VBG', 
'NNS$-TL', 'JJ-TL', 'FW-VBN', 'MD-TL', 'WDT+DOD', 'HV-TL', 'NN-TL', 'PPSS', 'NR$', 'BER', 'FW-VB', 'DT', 'PN+BEZ', 'VBG-HL', 'FW-PPL+VBZ', 'FW-NPS-TL', 'RB$', 'FW-IN+NN', 'FW-CC-TL', 'RBT', 'RBR', 'PPS-TL', 'PPSS+HV', 'JJS-TL', 'NPS-HL', 'WPS+BEZ-TL', 'NNS-TL-HL', 'VBN-TL-NC', 'QL-TL', 'NN+NN-NC', 'JJR-TL', 'NN$-TL', 'FW-QL', 'IN-TL', 'BED-NC', 'NRS', '.-HL', 'QL', 'PP$-HL', 'WRB+BER', 'JJ', 'WRB+BEZ', 'NNS$-TL-HL', 'PPSS+BEZ', '(', 'PPSS+BER', 'DT+MD', 'DOZ-TL', 'PPSS+BEM', 'FW-PP$', 'RB+BEZ-HL', 'FW-RB+CC', 'FW-PPS', 'VBG+TO', 'DO*-HL', 'NR+MD', 'PPLS', 'IN+IN', 'BEZ*', 'FW-PPL', 'FW-PPO', 'NNS-HL', 'NIL', 'HVN', 'PPSS+BER-NC', 'AP-TL', 'FW-DT', '(-HL', 'DTI-TL', 'JJ+JJ-NC', 'FW-RB', 'FW-VBD-TL', 'BER-NC', 'NNS$-NC', 'JJ-NC', 'NPS$-TL', 'VB+VB-NC', 'PN', 'VB+TO', 'AT-TL-HL', 'BEM-NC', 'PPL-TL', 'ABN-HL', 'RB-NC', 'DO-NC', 'BE-HL', 'WRB+IN', 'FW-UH-TL', 'PPO-HL', 'FW-CD-TL', 'TO-HL', 'PPS+BEZ', 'CD$', 'DO', 'EX+MD', 'HVZ-TL', 'TO-NC', 'IN-NC', '.', 'WRB+DO', 'CD-NC', 'FW-PPO+IN', 'FW-NN$-TL', 'WDT+BEZ-HL', 'RP-HL', 'CC', 'NN+HVZ-TL', 'FW-NNS-TL', 'DT+BEZ', 'WPS+HVZ', 'BEDZ*', 'NP-TL', ':-TL', 'NN-NC', 'WPO-TL', 'QL-NC', 'FW-AT+NN-TL', 'WDT+HVZ', '.-NC', 'FW-DTS', 'NP-HL', ':-HL', 'RBR-NC', 'OD-HL', 'BEDZ-HL', 'VBD-TL', 'NPS-NC', ')', 'TO+VB', 'FW-IN+NN-TL', 'PPL', 'PPS', 'PPSS+VB', 'DT-TL', 'RP-NC', 'VB', 'FW-VB-TL', 'PP$', 'VBD-HL', 'DTI-HL', 'NN-TL-NC', 'PPL-HL', 'DOZ*', 'NR-TL', 'WRB+MD', 'PN+HVZ', 'FW-IN-TL', 'PN+HVD', 'BEN-TL', 'BE', 'WDT', 'WPS+HVD', 'DO-TL', 'FW-NN-NC', 'WRB+BEZ-TL', 'UH-TL', 'JJR-NC', 'NNS', 'PPSS-NC', 'WPS+BEZ-NC', ',-TL', 'NN$', 'VBN-TL-HL', 'WDT-NC', 'OD', 'FW-OD-NC', 'DOZ*-TL', 'PPSS+HVD', 'CS-TL', 'WRB+DOZ', 'CC-NC', 'HV', 'NN$-HL', 'FW-WDT', 'WRB+DOD', 'NN+HVZ', 'AT-NC', 'NNS-TL', 'FW-BEZ', 'CS-HL', 'WPO-NC', 'FW-BER', 'NNS-TL-NC', 'BEZ-TL', 'FW-IN+AT-T', 'ABN-NC', 'NR-TL-HL', 'BEDZ', 'NP+BEZ', 'FW-AT-TL', 'BER*', 'WPS+MD', 'MD-HL', 'BED*', 'HV-NC', 'WPS-NC', 'VBN-HL', 'FW-TO+VB', 'PPSS+MD-NC', 'HVZ*', 'PPS-HL', 'WRB-NC', 'VBN-TL', 
'CD-TL-HL', ',-NC', 'RP-TL', 'AP-HL', 'FW-HV', 'WQL-TL', 'FW-AT', 'NN', 'NR$-TL', 'VBZ-NC', '*', 'PPSS-TL', 'JJT-HL', 'FW-NNS', 'NP', 'UH-HL', 'NR', ':', 'FW-NN$', 'RP+IN', ',-HL', 'JJ-TL-NC', 'AP-NC', '*-NC', 'VB-HL', 'HVZ-NC', 'DTS-HL', 'FW-JJT', 'FW-JJR', 'FW-JJ-TL', 'FW-*', 'RB+BEZ', "''", 'VB+AT', 'PN-HL', 'PPO-TL', 'CD-TL', 'UH-NC', 'FW-NN-TL-NC', 'EX-NC', 'PPSS+BEZ*', 'TO', 'WDT+DO+PPS', 'IN+PPO', 'AP', 'AT', 'DOZ-HL', 'FW-RB-TL', 'CD', 'NN+IN', 'FW-AT-HL', 'PN+MD', "'", 'FW-PP$-TL', 'FW-NPS', 'WDT+BER+PP', 'NN+HVD-TL', 'MD+HV', 'AT-HL', 'FW-IN+AT-TL'])