src/loc_h_harmonic.py

   1 # loc_h_harmonic.py, initialization for loc_h_dmv.py
   2
   3 from loc_h_dmv import * # better way to do this?
   4
   5 # todo: tweak these
   6 HARMONIC_C = 0.0 # C_A in report.pdf
   7 STOP_C = 1.0     # C_S in report.pdf
   8 NSTOP_C = 1.0    # C_N in report.pdf
   9 FSTOP_MIN = 1.0  # C_M in report.pdf
  10
  11 FNONSTOP_MIN = 0.0 # for OLD_STOP_CALC. 0.0 on FSTOP_MIN gives many
  12                    # zero-probabilities for OLD_STOP_CALC
  13 OLD_STOP_CALC = True
  14
  15 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
  16
  17 ##############################
  18 #      Initialization        #
  19 ##############################
  20 def taglist(corpus):
  21     '''sents is of this form:
  22 [['tag', ...], ['tag2', ...], ...]
  23
  24 Return a list of the tags. (Has to be ordered for enumerating to be
  25 consistent.)
  26
  27 Fortunately only has to run once.
  28 '''
  29     tagset = set()
  30     for sent in corpus:
  31         for tag in sent:
  32             tagset.add(tag)
  33     if 'ROOT' in tagset:
  34         raise ValueError("it seems we must have a new ROOT symbol")
  35     return list(tagset)
  36
  37
  38
  39
  40
  41 def init_zeros(tags):
  42     '''Return a frequency dictionary with DMV-relevant keys set to 0 or
  43     {}.
  44
  45     Todo: tweak (especially for f_STOP).'''
  46     f = {}
  47     for tag in tags:
  48         f['ROOT', tag] = 0
  49         f['sum', 'ROOT'] = 0
  50         for dir in [LEFT, RIGHT]:
  51             for adj in [ADJ, NON]:
  52                 f[tag, 'STOP', dir, adj] = 0.0
  53                 if OLD_STOP_CALC:
  54                     f[tag, 'STOP', dir, adj] = FSTOP_MIN
  55                     f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
  56         f[tag, RIGHT] = {}
  57         f[tag, LEFT] = {}
  58         f[tag, 'sum', RIGHT] = 0.0
  59         f[tag, 'sum', LEFT] = 0.0
  60     return f
  61
  62 def init_freq(corpus, tags):
  63     '''Returns f, a dictionary with these types of keys:
  64     - ('ROOT', tag) is basically just the frequency of tag
  65     - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
  66       etc. for 'RN', 'LA', 'LN', '-STOP'.
  67     - (tag, LEFT) is a dictionary of arg:f, where head could take arg
  68       to direction LEFT (etc. for RIGHT) and f is "harmonically" divided
  69       by distance, used for finding P_CHOOSE
  70
  71       Does this stuff:
  72       1. counts word frequencies for f_ROOT
  73       2. adds to certain f_STOP counters if a word is found first,
  74          last, first or second, or last or second to last in the sentence
  75          (Left Adjacent, Left Non-Adjacent, etc)
  76       3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
  77          distance between arg and head)
  78     '''
  79     f = init_zeros(tags)
  80
  81     for sent in corpus: # sent is ['VBD', 'NN', ...]
  82         n = len(sent) - 1
  83         # NOTE: head in DMV_Rule is a number, while this is the string
  84         for loc_h, head in enumerate(sent):
  85             # todo grok: how is this different from just using straight head
  86             # frequency counts, for the ROOT probabilities?
  87             f['ROOT', head] += 1.0
  88             f['sum', 'ROOT'] += 1.0
  89
  90             if OLD_STOP_CALC:
  91                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
  92                 f[head, '-STOP',  LEFT,NON] += (not loc_h == 1) # not second
  93                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
  94                 f[head, '-STOP',  LEFT,ADJ] += (not loc_h == 0) # not first
  95                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1)     # second-to-last
  96                 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
  97                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)         # last word
  98                 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n)     # not last
  99             else:
 100                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
 101                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
 102                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
 103                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
 104 #             elif OTHER_STOP_CALC:
 105 #                 f[head,  'STOP',  LEFT,NON] += (loc_h == 1)     # second word
 106 #                 f[head, '-STOP',  LEFT,NON] += (loc_h  > 1)     # after second
 107 #                 f[head,  'STOP',  LEFT,ADJ] += (loc_h == 0)     # first word
 108 #                 f[head, '-STOP',  LEFT,ADJ] += (loc_h  > 0)     # not first
 109 #                 f[head,  'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
 110 #                 f[head, '-STOP', RIGHT,NON] += (loc_h  < n - 1) # before second-to-last
 111 #                 f[head,  'STOP', RIGHT,ADJ] += (loc_h == n)     # last word
 112 #                 f[head, '-STOP', RIGHT,ADJ] += (loc_h  < n)     # not last
 113
 114             # this is where we make the "harmonic" distribution. quite.
 115             for loc_a, arg in enumerate(sent):
 116                 if loc_h != loc_a:
 117                     harmony = 1.0/abs(loc_h - loc_a) + HARMONIC_C
 118                     if loc_h > loc_a:
 119                         dir = LEFT
 120                     else:
 121                         dir = RIGHT
 122                     if arg not in f[head, dir]:
 123                         f[head, dir][arg] = 0.0
 124                     f[head, dir][arg] += harmony
 125                     f[head, 'sum', dir] += harmony
 126                 # todo, optimization: possible to do both directions
 127                 # at once here, and later on rule out the ones we've
 128                 # done? does it actually speed things up?
 129     return f
 130
 131 def init_normalize(f, tags, numtag, tagnum):
 132     '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH,
 133     p_ROOT.
 134
 135     Return a usable DMV_Grammar.'''
 136     p_rules = []
 137     p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
 138     for h, head in numtag.iteritems():
 139         # f['ROOT', head] is just a simple frequency count
 140         p_ROOT[h] = f['ROOT', head] / f['sum', 'ROOT']
 141
 142         for dir in [LEFT,RIGHT]:
 143             for adj in [NON,ADJ]:
 144                 if OLD_STOP_CALC:
 145                     den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
 146                     if den > 0.0:
 147                         p_STOP[h, dir, adj] = f[head, 'STOP', dir, adj] / den
 148                     else: p_STOP[h, dir, adj] = 1.0
 149                 else:
 150                     p_STOP[h, dir, adj] = \
 151                         (FSTOP_MIN + f[head, 'STOP', dir, adj] * STOP_C) / \
 152                         (FSTOP_MIN + f['ROOT',head] * (STOP_C + NSTOP_C))
 153
 154         p_ORDER[GOR, h] = RIGHT_FIRST
 155         p_ORDER[GOL, h] = 1 - RIGHT_FIRST
 156
 157         for dir in [LEFT, RIGHT]:
 158             for arg, val in f[head, dir].iteritems():
 159                 p_ATTACH[tagnum[arg], h, dir] = val / f[head,'sum',dir]
 160
 161     return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
 162
 163 def initialize(corpus):
 164     ''' corpus is a list of lists of tags.'''
 165     tags = taglist(corpus)
 166     numtag, tagnum = {}, {}
 167     for num, tag in enumerate(tags):
 168         tagnum[tag] = num
 169         numtag[num] = tag
 170     # f: frequency counts used in initialization, mostly distances
 171     f = init_freq(corpus, tags)
 172     g = init_normalize(f, tags, numtag, tagnum)
 173
 174     g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
 175     g.STOP_C = STOP_C
 176     g.NSTOP_C = NSTOP_C
 177     g.FNONSTOP_MIN = FNONSTOP_MIN
 178     g.FSTOP_MIN = FSTOP_MIN
 179
 180     return g
 181
 182
 183
 184
 185 ##############################
 186 #     testing functions:     #
 187 ##############################
 188
 189 if __name__ == "__main__":
 190     print "--------initialization testing------------"
 191     g2 = initialize([ ['Leftmost', 'Midleft','Midright','Rightmost']])
 192     print g2
 193
 194
 195