3 # - prettier printout for DMV_Rule
4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
6 # - Started on P_STOP, a bit less pseudo now..
9 # - started on initialization. So far, I have frequencies for
10 # everything, very harmonic. Still need to make these into 1-summing
14 # - more work on initialization (init_freq and init_normalize),
15 # getting closer to probabilities now.
18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
19 # and also adds the relevant probabilities to p_rules in a grammar.
20 # Still, each individual rule has to store both adjacent and non_adj
21 # probabilities, and inner() should be able to send some parameter
22 # which lets the rule choose... hopefully... Is this possible to do
23 # top-down even? when the sentence could be all the same words?
24 # todo: extensive testing of identical words in sentences!
25 # - frequencies (only used in initialization) are stored as strings,
26 # but in the rules and p_STOP etc, there are only numbers.
29 # - copied inner() into this file, to make the very dmv-specific
30 # adjacency stuff work (have to factor that out later on, when it
34 # - finished typing in inner_dmv(), still have to test and debug
35 # it. The chart is now four times as big since for any rule we may
36 # have attachments to either the left or the right below, which
37 # upper rules depend on, for selecting probN or probA
40 # - fixed a number of little bugs in initialization, where certain
41 # rules were simply not created, or created "backwards"
42 # - inner_dmv() should Work now...
45 # import numpy # numpy provides Fast Arrays, for future optimization
54 # non-tweakable/constant "lookup" globals
61 # (is this the best way to represent ROOT and STOP?)
63 # todo: use these instead for attachment constants. Requires making
64 # the last two indices of chart[] one single pair, boring retyping and
69 NOATT = (False, False)
71 if __name__ == "__main__":
72 print "DMV module tests:"
74 class DMV_Grammar(io.Grammar):
78 p_STOP, p_ROOT, p_CHOOSE, p_terminals
79 These are changed in the Maximation step, then used to set the
80 new probabilities of each DMV_Rule.
82 Todo: make p_terminals private? (But it has to be changable in
83 maximation step due to the short-cutting rules... could of course
84 make a DMV_Grammar function to update the short-cut rules...)
86 __p_rules is private, but we can still say stuff like:
87 for r in g.all_rules():
90 What other representations do we need? (P_STOP formula uses
91 deps_D(h,l/r) at least)'''
94 for r in self.all_rules():
101 return [r for r in self.all_rules() if r.head() == head and r.bars() == bars]
103 def sent_rules(self, b_h, sent_nums):
106 sent_nums += [STOP[1]] # (but not ROOT though)
107 return [r for r in self.all_rules()
108 if r.head() == head and r.bars() == bars
109 and r.L()[1] in sent_nums and r.R()[1] in sent_nums]
112 '''Not sure yet what is needed here, or where this is needed'''
115 def deps_L(self, head):
116 # todo test, probably this list comprehension doesn't work
117 return [a for r in self.all_rules() if r.head() == head and a == r.L()]
119 def deps_R(self, head):
120 # todo test, probably this list comprehension doesn't work
121 return [a for r in self.all_rules() if r.head() == head and a == r.R()]
123 def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum):
124 io.Grammar.__init__(self, p_rules, p_terminals, numtag, tagnum)
126 self.p_CHOOSE = p_CHOOSE
130 class DMV_Rule(io.CNF_Rule):
131 '''A single CNF rule in the PCFG, of the form
133 where LHS = (bars, head)
141 Different rule-types have different probabilities associated with
144 _h_ -> STOP h_ P( STOP|h,L, adj)
145 _h_ -> STOP h_ P( STOP|h,L,non_adj)
146 h_ -> h STOP P( STOP|h,R, adj)
147 h_ -> h STOP P( STOP|h,R,non_adj)
148 h_ -> _a_ h_ P(-STOP|h,L, adj) * P(a|h,L)
149 h_ -> _a_ h_ P(-STOP|h,L,non_adj) * P(a|h,L)
150 h -> h _a_ P(-STOP|h,R, adj) * P(a|h,R)
151 h -> h _a_ P(-STOP|h,R,non_adj) * P(a|h,R)
153 def p(self, LRattach, RLattach, *arg):
154 '''Returns the correct probability, adjacent or non-adjacent,
155 depending on whether or not there is a some lower attachment
156 either on the right side of the left child, or the left side
157 of the right child. '''
158 if (not LRattach) and (not RLattach):
169 def __init__(self, LHS, L, R, probN, probA):
170 for b_h in [LHS, L, R]:
171 if b_h[0] not in BARS:
172 raise ValueError("bars must be in %s; was given: %s" % (BARS, b_h[0]))
173 io.CNF_Rule.__init__(self, LHS, L, R, probN)
174 self.probA = probA # adjacent
175 self.probN = probN # non_adj
177 @classmethod # so we can call DMV_Rule.bar_str(b_h)
178 def bar_str(cls, b_h):
179 str = " %d " % b_h[1]
181 str = " %d_ " % b_h[1]
183 str = "_%d_ " % b_h[1]
191 return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS()),
192 self.bar_str(self.L()),
193 self.bar_str(self.R()),
203 ###################################
204 # dmv-specific version of inner() #
205 ###################################
206 def rewrite_adj(bars, Lattach, Rattach):
207 # todo: make prettier? Although since we call this so many times,
208 # having it spelled out here is probably faster
209 if bars == NOBAR and not Lattach and Rattach:
210 return ( (Lattach, False, False, False),
211 (Lattach, False, False, True),
212 (Lattach, False, True, False),
213 (Lattach, False, True, True),
214 (Lattach, True, False, False),
215 (Lattach, True, False, True),
216 (Lattach, True, True, False),
217 (Lattach, True, True, True), )
218 elif bars == RBAR and Lattach:
219 # Rattach may be either true or false here!
220 return ( (False, False, False, Rattach),
221 (False, False, True, Rattach),
222 (False, True, False, Rattach),
223 (False, True, True, Rattach),
224 (True, False, False, Rattach),
225 (True, False, True, Rattach),
226 (True, True, False, Rattach),
227 (True, True, True, Rattach) )
229 # NOBAR rewrite rules cannot have Lattach below, and must
230 # have/add Rattach. RBAR rewrite rules must add Lattach, but
231 # don't care about Rattach. Returning () should ensure we
232 # don't add any probability to such "false" situations
235 def inner_dmv(s, t, LHS, g, sent, chart):
236 ''' A rewrite of inner in io.py, to take adjacency into accord.
238 The chart is now 4 times bigger, since there are different values
239 for with or without L/R attachments:
240 chart[(s,t,LHS, Lattach, Rattach)]
242 If Rattach==True then the rule has a right-attachment or there is
243 one lower in the tree (meaning we're no longer
244 adjacent). Adjacency depends on whether there is an attachment
245 lower in the tree, cf. DMV_Rule.p(LRattach, RLattach).
247 Todo: make this work, then, if possible, refactor (move
248 dmv-specific stuff back into dmv, so this is "general" again)
250 Or, if that doesn't work, we might as well make it a method of
254 def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
257 (True, True): "left and right attachments below",
258 (True, False): "left attachment(s) below",
259 (False, True): "right attachment(s) below",
260 (False, False): "no attachments below" }
261 info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
262 print "%sTrying from %s_%d to %s_%d with %s, %s:" % info
267 sent_nums = [g.tagnum(tag) for tag in sent]
269 def e(s,t,LHS, Lattach, Rattach, n_t):
273 if (s, t, LHS, Lattach, Rattach) in chart:
274 return chart[(s, t, LHS, Lattach, Rattach)]
276 debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
278 if Lattach or Rattach:
279 # terminals are always F,F for attachment
280 io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
282 elif (LHS, O(s)) in g.p_terminals:
283 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
285 # todo: assuming this is how to deal with lacking
286 # rules, since we add prob.s, and 0 is identity
288 io.debug( "%sLACKING TERMINAL:" % tab())
289 # todo: add to chart perhaps? Although, it _is_ simple lookup..
290 io.debug( "%s= %.1f (terminal: %s -> %s)" % (tab(),prob,
291 DMV_Rule.bar_str(LHS),
295 if (s,t,LHS,Lattach, Rattach) not in chart:
296 chart[(s,t,LHS,Lattach,Rattach)] = 0.0
297 for rule in g.sent_rules(LHS, sent_nums): # summing over j,k in a[LHS,j,k]
298 io.debug( "%ssumming rule %s" % (tab(),rule) )
301 # if it's a STOP rule, rewrite for the same range:
302 if (L == STOP) or (R == STOP):
304 p = rule.p(Lattach, False) # todo check
305 pLR = e(s, t, R, Lattach, Rattach, n_t+1)
307 p = rule.p(False, Rattach) # todo check
308 pLR = e(s, t, L, Lattach, Rattach, n_t+1)
309 chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
311 # not a STOP, an attachment rewrite:
313 for r in range(s, t):
314 if L[1] in sent_nums[s:r+1] and R[1] in sent_nums[r+1:t+1]:
315 # LL etc are boolean attachment values
316 for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
317 p = rule.p(LR, RL) # probN or probA
318 pL = e(s, r, L, LL, LR, n_t+1)
319 pR = e(r+1, t, R, RL, RR, n_t+1)
320 chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
322 return chart[(s, t, LHS,Lattach,Rattach)]
325 inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
328 for k,v in chart.iteritems():
329 print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(k[2]),
333 print "---CHART:end---"
334 return [inner_prob, chart]
338 if __name__ == "__main__": # Non, Adj
339 _h_ = DMV_Rule((LRBAR,0), STOP, ( RBAR,0), 1.0, 1.0) # LSTOP
340 h_S = DMV_Rule(( RBAR,0),(NOBAR,0), STOP, 0.4, 0.3) # RSTOP
341 h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
342 h = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
344 b2[(NOBAR, 0), 'h'] = 1.0
345 b2[(RBAR, 0), 'h'] = h_S.probA
346 b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
348 g2 = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'}, {'h':0})
351 test1 = inner_dmv(0, 1, (LRBAR,0), g2, 'h h'.split(), {})
352 print "Should be 0.183: %.3f" % test1[0]
356 ##############################
357 # DMV-probabilities, todo: #
358 ##############################
360 def P_STOP(STOP, h, dir, adj, corpus):
361 '''corpus is a list of sentences s.
363 This is based on the formula where STOP is True... not sure how we
364 calculate if STOP is False.
367 I thought about instead having this:
369 for rule in g.p_rules:
373 for rule in g.p_rules:
376 set num and den using inner
377 for rule in g.p_rules
378 rule.prob = rule.num / rule.den
380 ..the way I'm assuming we do it in the commented out io-function in
381 io.py. Having sentences as the outer loop at least we can easily just
382 go through the heads that are actually in the sentence... BUT, this
383 means having to go through p_rules 3 times, not sure what is slower.
386 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
393 # here we should somehow make each word in the sentence
394 # unique, decorate them with subscripts or something. We have
395 # to run through the sentence as many times as h appears
396 # there. This also means changing inner(), I suspect. Have to
397 # make sure we separate reading of inner_prob from changing of
399 for s in range(loc(h)): # i<loc(h), where h is in the sentence.
400 for t in range(i, len(sent)):
401 P_STOP_num += inner(s, t, h-r, g, sent, chart)
402 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
403 return P_STOP_num / P_STOP_den # possibly other way round? todo
410 '''Here it seems like they store rule information on a per-head (per
411 direction) basis, in deps_D(h, dir) which gives us a list. '''
414 for dir in ['l', 'r']:
415 for a in deps(h, dir):
418 P_STOP (0, h, dir, adj) * \
419 P_CHOOSE (a, h, dir) * \
421 P_STOP (STOP | h, dir, adj)
423 return P_h(root(sent))
427 ##############################
428 # Initialization, todo #
429 ##############################
431 '''sents is of this form:
432 [['tag', ...], ['tag2', ...], ...]
434 Return a list of the tags. (Has to be ordered for enumerating to be
437 Fortunately only has to run once.
444 raise ValueError("it seems we must have a new ROOT symbol")
451 def init_zeros(tags):
452 '''Return a frequency dictionary with DMV-relevant keys set to 0 or
455 Todo: tweak (especially for f_STOP).'''
460 for dir_adj in ['LN','LA','RN','RA']:
461 f[tag, 'STOP', dir_adj] = FSTOP_MIN
462 f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
465 f[tag, 'sum', 'R'] = 0.0
466 f[tag, 'sum', 'L'] = 0.0
469 def init_freq(corpus, tags):
470 '''Returns f, a dictionary with these types of keys:
471 - ('ROOT', tag) is basically just the frequency of tag
472 - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
473 etc. for 'RN', 'LA', 'LN', '-STOP'.
474 - (tag, 'L') is a dictionary of arg:f, where head could take arg
475 to direction 'L' (etc. for 'R') and f is "harmonically" divided
476 by distance, used for finding P_CHOOSE
479 1. counts word frequencies for f_ROOT
480 2. adds to certain f_STOP counters if a word is found first,
481 last, first or second, or last or second to last in the sentence
482 (Left Adjacent, Left Non-Adjacent, etc)
483 3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
484 distance between arg and head)
488 for sent in corpus: # sent is ['VBD', 'NN', ...]
490 # NOTE: head in DMV_Rule is a number, while this is the string
491 for i_h, head in enumerate(sent):
492 # todo grok: how is this different from just using straight head
493 # frequency counts, for the ROOT probabilities?
495 f['sum', 'ROOT'] += 1
497 # True = 1, False = 0. todo: make prettier
498 f[head, 'STOP', 'LN'] += (i_h == 1) # second word
499 f[head, '-STOP', 'LN'] += (not i_h == 1) # not second
500 f[head, 'STOP', 'LA'] += (i_h == 0) # first word
501 f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
502 f[head, 'STOP', 'RN'] += (i_h == n - 2) # second-to-last
503 f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
504 f[head, 'STOP', 'RA'] += (i_h == n - 1) # last word
505 f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
507 # this is where we make the "harmonic" distribution. quite.
508 for i_a, arg in enumerate(sent):
510 harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
515 if arg not in f[head, dir]:
516 f[head, dir][arg] = 0.0
517 f[head, dir][arg] += harmony
518 f[head, 'sum', dir] += harmony
519 # todo, optimization: possible to do both directions
520 # at once here, and later on rule out the ones we've
521 # done? does it actually speed things up?
525 def init_normalize(f, tags, tagnum, numtag):
526 '''Use frequencies (and sums) in f to return create p_STOP and
527 p_CHOOSE; at the same time adding the context-free rules to the
528 grammar using these probabilities.
530 Return a usable grammar.'''
532 p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
533 for n_h, head in enumerate(tags):
534 p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
535 p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
539 # p_STOP = STOP / (STOP + NOT_STOP)
540 for dir in ['L','R']:
541 for adj in ['N','A']:
542 p_STOP[n_h, dir+adj] = \
543 float(f[head, 'STOP', dir+adj]) / \
544 (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
545 # make rule using the previously found probN and probA:
546 p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
549 p_rules.append( DMV_Rule((LRBAR, n_h), STOP, (RBAR, n_h),
553 # inner() shouldn't have to deal with those long non-branching stops:
554 p_terminals[(NOBAR, n_h), head] = 1.0
555 p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
556 p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
558 for dir in ['L', 'R']:
559 for arg, val in f[head, dir].iteritems():
560 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
562 # after the head tag-loop, add every head-argument rule:
563 for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
564 if dir == 'L': # arg is to the left of head
565 p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
566 p_C*(1-p_STOP[n_h, dir+'N']),
567 p_C*(1-p_STOP[n_h, dir+'A'])) )
569 p_rules.append( DMV_Rule((NOBAR,n_h), (NOBAR,n_h), (LRBAR,n_a),
570 p_C*(1-p_STOP[n_h, dir+'N']),
571 p_C*(1-p_STOP[n_h, dir+'A'])) )
573 return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum)
576 def initialize(corpus):
577 '''Return an initialized DMV_Grammar
578 corpus is a list of lists of tags.'''
579 tags = taglist(corpus)
580 tagnum, numtag = {}, {}
581 for num, tag in enumerate(tags):
584 # f: frequency counts used in initialization, mostly distances
585 f = init_freq(corpus, tags)
586 g = init_normalize(f, tags, tagnum, numtag)
590 if __name__ == "__main__":
591 # print "--------initialization------------"
592 # print initialize([['foo', 'two','foo','foo'],
593 # ['zero', 'one','two','three']])
607 # todo: some testing on the Brown corpus:
609 if __name__ == "__main__":
610 # first five sentences of the Brown corpus:
611 g_brown = initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
612 # 36:'AT' in g_brown.numtag, 40:'NP-TL'
615 test_brown = inner_dmv(0,3, (LRBAR,36), g_brown, ['AT', 'NP-TL' ,'NN-TL','JJ-TL'], {})
617 for r in g_brown.rules((2,36)) + g_brown.rules((1,36)) + g_brown.rules((0,36)):
620 if L[1] in [36,40,-2] and R[1] in [36,40,-2]:
622 print "Brown-test gives: %.8f" % test_brown[0]
626 # this will give the tag sequences of all the 6218 Brown corpus
627 # sentences of length < 7:
628 # [[tag for (w, tag) in sent]
629 # for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
633 "472 tags, takes a while to extract with tagset(), hardcoded here."
634 return set(['BEDZ-NC', 'NP$', 'AT-TL', 'CS', 'NP+HVZ', 'IN-TL-HL', 'NR-HL', 'CC-TL-HL', 'NNS$-HL', 'JJS-HL', 'JJ-HL', 'WRB-TL', 'JJT-TL', 'WRB', 'DOD*', 'BER*-NC', ')-HL', 'NPS$-HL', 'RB-HL', 'FW-PPSS', 'NP+HVZ-NC', 'NNS$', '--', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'PPSS+MD', 'NPS', 'RBR+CS', 'DTI', 'NPS-TL', 'BEM', 'FW-AT+NP-TL', 'EX+BEZ', 'BEG', 'BED', 'BEZ', 'DTX', 'DOD*-TL', 'FW-VB-NC', 'DTS', 'DTS+BEZ', 'QL-HL', 'NP$-TL', 'WRB+DOD*', 'JJR+CS', 'NN+MD', 'NN-TL-HL', 'HVD-HL', 'NP+BEZ-NC', 'VBN+TO', '*-TL', 'WDT-HL', 'MD', 'NN-HL', 'FW-BE', 'DT$', 'PN-TL', 'DT-HL', 'FW-NR-TL', 'VBG', 'VBD', 'VBN', 'DOD', 'FW-VBG-TL', 'DOZ', 'ABN-TL', 'VB+JJ-NC', 'VBZ', 'RB+CS', 'FW-PN', 'CS-NC', 'VBG-NC', 'BER-HL', 'MD*', '``', 'WPS-TL', 'OD-TL', 'PPSS-HL', 'PPS+MD', 'DO*', 'DO-HL', 'HVG-HL', 'WRB-HL', 'JJT', 'JJS', 'JJR', 'HV+TO', 'WQL', 'DOD-NC', 'CC-HL', 'FW-PPSS+HV', 'FW-NP-TL', 'MD+TO', 'VB+IN', 'JJT-NC', 'WDT+BEZ-TL', '---HL', 'PN$', 'VB+PPO', 'BE-TL', 'VBG-TL', 'NP$-HL', 'VBZ-TL', 'UH', 'FW-WPO', 'AP+AP-NC', 'FW-IN', 'NRS-TL', 'ABL', 'ABN', 'TO-TL', 'ABX', '*-HL', 'FW-WPS', 'VB-NC', 'HVD*', 'PPS+HVD', 'FW-IN+AT', 'FW-NP', 'QLP', 'FW-NR', 'FW-NN', 'PPS+HVZ', 'NNS-NC', 'DT+BEZ-NC', 'PPO', 'PPO-NC', 'EX-HL', 'AP$', 'OD-NC', 'RP', 'WPS+BEZ', 'NN+BEZ', '.-TL', ',', 'FW-DT+BEZ', 'RB', 'FW-PP$-NC', 'RN', 'JJ$-TL', 'MD-NC', 'VBD-NC', 'PPSS+BER-N', 'RB+BEZ-NC', 'WPS-HL', 'VBN-NC', 'BEZ-HL', 'PPL-NC', 'BER-TL', 'PP$$', 'NNS+MD', 'PPS-NC', 'FW-UH-NC', 'PPS+BEZ-NC', 'PPSS+BER-TL', 'NR-NC', 'FW-JJ', 'PPS+BEZ-HL', 'NPS$', 'RB-TL', 'VB-TL', 'BEM*', 'MD*-HL', 'FW-CC', 'NP+MD', 'EX+HVZ', 'FW-CD', 'EX+HVD', 'IN-HL', 'FW-CS', 'JJR-HL', 'FW-IN+NP-TL', 'JJ-TL-HL', 'FW-UH', 'EX', 'FW-NNS-NC', 'FW-JJ-NC', 'VBZ-HL', 'VB+RP', 'BEZ-NC', 'PPSS+HV-TL', 'HV*', 'IN', 'PP$-NC', 'NP-NC', 'BEN', 'PP$-TL', 'FW-*-TL', 'FW-OD-TL', 'WPS', 'WPO', 'MD+PPSS', 'WDT+BER', 'WDT+BEZ', 'CD-HL', 'WDT+BEZ-NC', 'WP$', 'DO+PPSS', 'HV-HL', 'DT-NC', 'PN-NC', 'FW-VBZ', 'HVD', 'HVG', 'NN+BEZ-TL', 'HVZ', 'FW-VBD', 'FW-VBG', 'NNS$-TL', 'JJ-TL', 'FW-VBN', 'MD-TL', 'WDT+DOD', 'HV-TL', 'NN-TL', 'PPSS', 'NR$', 'BER', 'FW-VB', 'DT', 'PN+BEZ', 'VBG-HL', 'FW-PPL+VBZ', 'FW-NPS-TL', 'RB$', 'FW-IN+NN', 'FW-CC-TL', 'RBT', 'RBR', 'PPS-TL', 'PPSS+HV', 'JJS-TL', 'NPS-HL', 'WPS+BEZ-TL', 'NNS-TL-HL', 'VBN-TL-NC', 'QL-TL', 'NN+NN-NC', 'JJR-TL', 'NN$-TL', 'FW-QL', 'IN-TL', 'BED-NC', 'NRS', '.-HL', 'QL', 'PP$-HL', 'WRB+BER', 'JJ', 'WRB+BEZ', 'NNS$-TL-HL', 'PPSS+BEZ', '(', 'PPSS+BER', 'DT+MD', 'DOZ-TL', 'PPSS+BEM', 'FW-PP$', 'RB+BEZ-HL', 'FW-RB+CC', 'FW-PPS', 'VBG+TO', 'DO*-HL', 'NR+MD', 'PPLS', 'IN+IN', 'BEZ*', 'FW-PPL', 'FW-PPO', 'NNS-HL', 'NIL', 'HVN', 'PPSS+BER-NC', 'AP-TL', 'FW-DT', '(-HL', 'DTI-TL', 'JJ+JJ-NC', 'FW-RB', 'FW-VBD-TL', 'BER-NC', 'NNS$-NC', 'JJ-NC', 'NPS$-TL', 'VB+VB-NC', 'PN', 'VB+TO', 'AT-TL-HL', 'BEM-NC', 'PPL-TL', 'ABN-HL', 'RB-NC', 'DO-NC', 'BE-HL', 'WRB+IN', 'FW-UH-TL', 'PPO-HL', 'FW-CD-TL', 'TO-HL', 'PPS+BEZ', 'CD$', 'DO', 'EX+MD', 'HVZ-TL', 'TO-NC', 'IN-NC', '.', 'WRB+DO', 'CD-NC', 'FW-PPO+IN', 'FW-NN$-TL', 'WDT+BEZ-HL', 'RP-HL', 'CC', 'NN+HVZ-TL', 'FW-NNS-TL', 'DT+BEZ', 'WPS+HVZ', 'BEDZ*', 'NP-TL', ':-TL', 'NN-NC', 'WPO-TL', 'QL-NC', 'FW-AT+NN-TL', 'WDT+HVZ', '.-NC', 'FW-DTS', 'NP-HL', ':-HL', 'RBR-NC', 'OD-HL', 'BEDZ-HL', 'VBD-TL', 'NPS-NC', ')', 'TO+VB', 'FW-IN+NN-TL', 'PPL', 'PPS', 'PPSS+VB', 'DT-TL', 'RP-NC', 'VB', 'FW-VB-TL', 'PP$', 'VBD-HL', 'DTI-HL', 'NN-TL-NC', 'PPL-HL', 'DOZ*', 'NR-TL', 'WRB+MD', 'PN+HVZ', 'FW-IN-TL', 'PN+HVD', 'BEN-TL', 'BE', 'WDT', 'WPS+HVD', 'DO-TL', 'FW-NN-NC', 'WRB+BEZ-TL', 'UH-TL', 'JJR-NC', 'NNS', 'PPSS-NC', 'WPS+BEZ-NC', ',-TL', 'NN$', 'VBN-TL-HL', 'WDT-NC', 'OD', 'FW-OD-NC', 'DOZ*-TL', 'PPSS+HVD', 'CS-TL', 'WRB+DOZ', 'CC-NC', 'HV', 'NN$-HL', 'FW-WDT', 'WRB+DOD', 'NN+HVZ', 'AT-NC', 'NNS-TL', 'FW-BEZ', 'CS-HL', 'WPO-NC', 'FW-BER', 'NNS-TL-NC', 'BEZ-TL', 'FW-IN+AT-T', 'ABN-NC', 'NR-TL-HL', 'BEDZ', 'NP+BEZ', 'FW-AT-TL', 'BER*', 'WPS+MD', 'MD-HL', 'BED*', 'HV-NC', 'WPS-NC', 'VBN-HL', 'FW-TO+VB', 'PPSS+MD-NC', 'HVZ*', 'PPS-HL', 'WRB-NC', 'VBN-TL', 'CD-TL-HL', ',-NC', 'RP-TL', 'AP-HL', 'FW-HV', 'WQL-TL', 'FW-AT', 'NN', 'NR$-TL', 'VBZ-NC', '*', 'PPSS-TL', 'JJT-HL', 'FW-NNS', 'NP', 'UH-HL', 'NR', ':', 'FW-NN$', 'RP+IN', ',-HL', 'JJ-TL-NC', 'AP-NC', '*-NC', 'VB-HL', 'HVZ-NC', 'DTS-HL', 'FW-JJT', 'FW-JJR', 'FW-JJ-TL', 'FW-*', 'RB+BEZ', "''", 'VB+AT', 'PN-HL', 'PPO-TL', 'CD-TL', 'UH-NC', 'FW-NN-TL-NC', 'EX-NC', 'PPSS+BEZ*', 'TO', 'WDT+DO+PPS', 'IN+PPO', 'AP', 'AT', 'DOZ-HL', 'FW-RB-TL', 'CD', 'NN+IN', 'FW-AT-HL', 'PN+MD', "'", 'FW-PP$-TL', 'FW-NPS', 'WDT+BER+PP', 'NN+HVD-TL', 'MD+HV', 'AT-HL', 'FW-IN+AT-TL'])