src/harmonic.py

   1 # harmonic.py, initialization for dmv
   2 #
   3 # initialization is a package within dmvccm
   4
   5 from dmv import * # better way to do this?
   6
   7 # todo: tweak this
   8 HARMONIC_C = 0.0
   9 FNONSTOP_MIN = 10
  10 FSTOP_MIN = 5
  11
  12 ##############################
  13 #      Initialization        #
  14 ##############################
  15 def taglist(corpus):
  16     '''sents is of this form:
  17 [['tag', ...], ['tag2', ...], ...]
  18
  19 Return a list of the tags. (Has to be ordered for enumerating to be
  20 consistent.)
  21
  22 Fortunately only has to run once.
  23 '''
  24     tagset = set()
  25     for sent in corpus:
  26         for tag in sent:
  27             tagset.add(tag)
  28     if 'ROOT' in tagset:
  29         raise ValueError("it seems we must have a new ROOT symbol")
  30     return list(tagset)
  31
  32
  33
  34
  35
  36 def init_zeros(tags):
  37     '''Return a frequency dictionary with DMV-relevant keys set to 0 or
  38     {}.
  39
  40     Todo: tweak (especially for f_STOP).'''
  41     f = {}
  42     for tag in tags:
  43         f['ROOT', tag] = 0
  44         f['sum', 'ROOT'] = 0
  45         for dir_adj in ['LN','LA','RN','RA']:
  46             f[tag, 'STOP', dir_adj] = FSTOP_MIN
  47             f[tag, '-STOP', dir_adj] = FNONSTOP_MIN
  48         f[tag, 'R'] = {}
  49         f[tag, 'L'] = {}
  50         f[tag, 'sum', 'R'] = 0.0
  51         f[tag, 'sum', 'L'] = 0.0
  52     return f
  53
  54 def init_freq(corpus, tags):
  55     '''Returns f, a dictionary with these types of keys:
  56     - ('ROOT', tag) is basically just the frequency of tag
  57     - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
  58       etc. for 'RN', 'LA', 'LN', '-STOP'.
  59     - (tag, 'L') is a dictionary of arg:f, where head could take arg
  60       to direction 'L' (etc. for 'R') and f is "harmonically" divided
  61       by distance, used for finding P_CHOOSE
  62
  63       Does this stuff:
  64       1. counts word frequencies for f_ROOT
  65       2. adds to certain f_STOP counters if a word is found first,
  66          last, first or second, or last or second to last in the sentence
  67          (Left Adjacent, Left Non-Adjacent, etc)
  68       3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
  69          distance between arg and head)
  70     '''
  71     f = init_zeros(tags)
  72
  73     for sent in corpus: # sent is ['VBD', 'NN', ...]
  74         n = len(sent)
  75         # NOTE: head in DMV_Rule is a number, while this is the string
  76         for i_h, head in enumerate(sent):
  77             # todo grok: how is this different from just using straight head
  78             # frequency counts, for the ROOT probabilities?
  79             f['ROOT', head] += 1
  80             f['sum', 'ROOT'] += 1
  81
  82             # True = 1, False = 0. todo: make prettier
  83             f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
  84             f[head, '-STOP', 'LN'] += (not i_h == 1) # not second
  85             f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
  86             f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
  87             f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
  88             f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
  89             f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
  90             f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
  91
  92             # this is where we make the "harmonic" distribution. quite.
  93             for i_a, arg in enumerate(sent):
  94                 if i_h != i_a:
  95                     harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
  96                     if i_h > i_a:
  97                         dir = 'L'
  98                     else:
  99                         dir = 'R'
 100                     if arg not in f[head, dir]:
 101                         f[head, dir][arg] = 0.0
 102                     f[head, dir][arg] += harmony
 103                     f[head, 'sum', dir] += harmony
 104                 # todo, optimization: possible to do both directions
 105                 # at once here, and later on rule out the ones we've
 106                 # done? does it actually speed things up?
 107
 108     return f
 109
 110 def init_normalize(f, tags, tagnum, numtag):
 111     '''Use frequencies (and sums) in f to return create p_STOP and
 112     p_CHOOSE; at the same time adding the context-free rules to the
 113     grammar using these probabilities.
 114
 115     Return a usable grammar.'''
 116     p_rules = []
 117     p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
 118     for n_h, head in enumerate(tags):
 119         p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
 120         p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
 121                                  p_ROOT[n_h],
 122                                  p_ROOT[n_h]))
 123
 124         # p_STOP = STOP / (STOP + NOT_STOP)
 125         for dir in ['L','R']:
 126             for adj in ['N','A']:
 127                 p_STOP[n_h, dir+adj] = \
 128                     float(f[head, 'STOP', dir+adj]) / \
 129                     (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
 130         # make rule using the previously found probN and probA:
 131         p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
 132                                  p_STOP[n_h, 'RN'],
 133                                  p_STOP[n_h, 'RA']) )
 134         p_rules.append( DMV_Rule((LRBAR, n_h), STOP, (RBAR, n_h),
 135                                  p_STOP[n_h, 'LN'],
 136                                  p_STOP[n_h, 'LA']) )
 137
 138         # inner() shouldn't have to deal with those long non-branching stops:
 139         p_terminals[(NOBAR, n_h), head] = 1.0
 140         p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
 141         p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
 142
 143         for dir in ['L', 'R']:
 144             for arg, val in f[head, dir].iteritems():
 145                 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
 146
 147     # after the head tag-loop, add every head-argument rule:
 148     for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
 149         if dir == 'L': # arg is to the left of head
 150             p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
 151                                      p_C*(1-p_STOP[n_h, dir+'N']),
 152                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 153         if dir == 'R':
 154             p_rules.append( DMV_Rule((NOBAR,n_h), (NOBAR,n_h), (LRBAR,n_a),
 155                                      p_C*(1-p_STOP[n_h, dir+'N']),
 156                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 157
 158     return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag, tagnum)
 159
 160
 161 def initialize(corpus):
 162     '''Return an initialized DMV_Grammar
 163     corpus is a list of lists of tags.'''
 164     tags = taglist(corpus)
 165     tagnum, numtag = {}, {}
 166     for num, tag in enumerate(tags):
 167         tagnum[tag] = num
 168         numtag[num] = tag
 169     # f: frequency counts used in initialization, mostly distances
 170     f = init_freq(corpus, tags)
 171     g = init_normalize(f, tags, tagnum, numtag)
 172     return g
 173
 174
 175 if __name__ == "__main__":
 176     print "--------initialization------------"
 177     print initialize([['foo', 'two','foo','foo'],
 178                       ['zero', 'one','two','three']])
 179     pass
 180
 181
 182