src/loc_h_harmonic.py

   1 # loc_h_harmonic.py, initialization for loc_h_dmv.py
   2
   3 from loc_h_dmv import * # better way to do this?
   4
   5 # todo: tweak these (0.0 on FSTOP_MIN gives too many zero-probabilities though)
   6 HARMONIC_C = 0.0
   7 FNONSTOP_MIN = 0.0
   8 FSTOP_MIN = 1.0
   9 OTHER_STOP_CALC = True
  10
  11 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
  12
  13 ##############################
  14 #      Initialization        #
  15 ##############################
  16 def taglist(corpus):
  17     '''sents is of this form:
  18 [['tag', ...], ['tag2', ...], ...]
  19
  20 Return a list of the tags. (Has to be ordered for enumerating to be
  21 consistent.)
  22
  23 Fortunately only has to run once.
  24 '''
  25     tagset = set()
  26     for sent in corpus:
  27         for tag in sent:
  28             tagset.add(tag)
  29     if 'ROOT' in tagset:
  30         raise ValueError("it seems we must have a new ROOT symbol")
  31     return list(tagset)
  32
  33
  34
  35
  36
  37 def init_zeros(tags):
  38     '''Return a frequency dictionary with DMV-relevant keys set to 0 or
  39     {}.
  40
  41     Todo: tweak (especially for f_STOP).'''
  42     f = {}
  43     for tag in tags:
  44         f['ROOT', tag] = 0
  45         f['sum', 'ROOT'] = 0
  46         for dir in [LEFT, RIGHT]:
  47             for adj in [ADJ, NON]:
  48                 f[tag, 'STOP', dir, adj] = FSTOP_MIN
  49                 f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
  50         f[tag, RIGHT] = {}
  51         f[tag, LEFT] = {}
  52         f[tag, 'sum', RIGHT] = 0.0
  53         f[tag, 'sum', LEFT] = 0.0
  54     return f
  55
  56 def init_freq(corpus, tags):
  57     '''Returns f, a dictionary with these types of keys:
  58     - ('ROOT', tag) is basically just the frequency of tag
  59     - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
  60       etc. for 'RN', 'LA', 'LN', '-STOP'.
  61     - (tag, LEFT) is a dictionary of arg:f, where head could take arg
  62       to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
  63       by distance, used for finding P_CHOOSE
  64
  65       Does this stuff:
  66       1. counts word frequencies for f_ROOT
  67       2. adds to certain f_STOP counters if a word is found first,
  68          last, first or second, or last or second to last in the sentence
  69          (Left Adjacent, Left Non-Adjacent, etc)
  70       3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
  71          distance between arg and head)
  72     '''
  73     f = init_zeros(tags)
  74
  75     for sent in corpus: # sent is ['VBD', 'NN', ...]
  76         n = len(sent) - 1
  77         # NOTE: head in DMV_Rule is a number, while this is the string
  78         for loc_h, head in enumerate(sent):
  79             # todo grok: how is this different from just using straight head
  80             # frequency counts, for the ROOT probabilities?
  81             f['ROOT', head] += 1
  82             f['sum', 'ROOT'] += 1
  83
  84             if OTHER_STOP_CALC:
  85                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
  86                 f[head, '-STOP',  LEFT,NON] += (loc_h  > 1)     # after second
  87                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
  88                 f[head, '-STOP',  LEFT,ADJ] += (loc_h  > 0)     # not first
  89                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
  90                 f[head, '-STOP', RIGHT,NON] += (loc_h  < n - 1) # before second-to-last
  91                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
  92                 f[head, '-STOP', RIGHT,ADJ] += (loc_h  < n)     # not last
  93             else: # note: int(True) == 1
  94                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
  95                 f[head, '-STOP',  LEFT,NON] += (not loc_h == 1) # not second
  96                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
  97                 f[head, '-STOP',  LEFT,ADJ] += (not loc_h == 0) # not first
  98                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1)     # second-to-last
  99                 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
 100                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)         # last word
 101                 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n)     # not last
 102
 103             # this is where we make the "harmonic" distribution. quite.
 104             for loc_a, arg in enumerate(sent):
 105                 if loc_h != loc_a:
 106                     harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
 107                     if loc_h > loc_a:
 108                         dir = LEFT
 109                     else:
 110                         dir = RIGHT
 111                     if arg not in f[head, dir]:
 112                         f[head, dir][arg] = 0.0
 113                     f[head, dir][arg] += harmony
 114                     f[head, 'sum', dir] += harmony
 115                 # todo, optimization: possible to do both directions
 116                 # at once here, and later on rule out the ones we've
 117                 # done? does it actually speed things up?
 118     return f
 119
 120 def init_normalize(f, tags, numtag, tagnum):
 121     '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH
 122     and p_GO_AT (which is (1-p_STOP)*p_ATTACH).
 123
 124     Return a usable DMV_Grammar2.'''
 125     p_rules = []
 126     p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
 127     for h, head in numtag.iteritems():
 128         p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']
 129
 130         # p_STOP = STOP / (STOP + NOT_STOP)
 131         for dir in [LEFT,RIGHT]:
 132             for adj in [NON,ADJ]:
 133                 den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
 134                 if den > 0.0:
 135                     p_STOP[h, dir, adj] = float(f[head, 'STOP', dir, adj]) / float(den)
 136                 else:
 137                     p_STOP[h, dir, adj] = 1.0
 138
 139
 140
 141         p_ORDER[GOR, h] = RIGHT_FIRST
 142         p_ORDER[GOL, h] = 1 - RIGHT_FIRST
 143
 144         for dir in [LEFT, RIGHT]:
 145             for arg, val in f[head, dir].iteritems():
 146                 p_ATTACH[tagnum[arg], h, dir] = float(val) / f[head,'sum',dir]
 147
 148     return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
 149
 150 def initialize(corpus):
 151     ''' corpus is a list of lists of tags.'''
 152     tags = taglist(corpus)
 153     numtag, tagnum = {}, {}
 154     for num, tag in enumerate(tags):
 155         tagnum[tag] = num
 156         numtag[num] = tag
 157     # f: frequency counts used in initialization, mostly distances
 158     f = init_freq(corpus, tags)
 159     g = init_normalize(f, tags, numtag, tagnum)
 160
 161     g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
 162     g.FNONSTOP_MIN = FNONSTOP_MIN
 163     g.FSTOP_MIN = FSTOP_MIN
 164
 165     return g
 166
 167
 168
 169
 170 ##############################
 171 #     testing functions:     #
 172 ##############################
 173
 174 if __name__ == "__main__":
 175     print "--------initialization testing------------"
 176     g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])
 177     print g2
 178
 179
 180