From: Kevin Brubeck Unhammer
Date: Thu, 18 Sep 2008 10:00:43 +0000 (+0200)
Subject: rewrote loc_h_harmonic's STOP initialization to reflect report.pdf; simpler now
X-Git-Url: https://repo.or.cz/w/dmvccm.git/commitdiff_plain/fb4430267a814dc894b27d4c15f92954269d4787

rewrote loc_h_harmonic's STOP initialization to reflect report.pdf; simpler now
---
diff --git a/report/report.pdf b/report/report.pdf
index d5bf696..44b6911 100644
Binary files a/report/report.pdf and b/report/report.pdf differ
diff --git a/report/report.tex b/report/report.tex
index 0b02970..df06de3 100644
--- a/report/report.tex
+++ b/report/report.tex
@@ -375,19 +375,19 @@ border\footnote{For non-adjacent stopping we checked for occurrence at
 
 \begin{align*}
   f(stop:\LOC{h},left,adj)=\begin{cases}
-    1 \text{, if } loc(\LOC{h}) = 0,\\
+    C_S \text{, if } loc(\LOC{h}) = 0,\\
     0 \text{, otherwise}
   \end{cases}
 \end{align*}
 
 \begin{align*}
   P_{STOP}(stop|h,left,adj) = \frac
-  {C_S + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)}
-  {C_S + C_N + \sum_{s \in S}\sum_{\LOC{h} \in s} 1}
+  {C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} f(stop:\LOC{h},left,adj)}
+  {C_{M} + \sum_{s \in S}\sum_{\LOC{h} \in s} C_N}
 \end{align*}
 
 \subsection{TODO: Results}
-We tried various values for the initialization constants $C_A, C_S$
+We tried various values for the initialization constants $C_A, C_M, C_S$
 and $C_N$; but it was hard to find any clear pattern for what worked
 best.
diff --git a/src/loc_h_dmv.py b/src/loc_h_dmv.py
index 99b6879..b6f9ac0 100644
--- a/src/loc_h_dmv.py
+++ b/src/loc_h_dmv.py
@@ -38,7 +38,9 @@ class DMV_Grammar(io.Grammar):
         def t(n): return "%d=%s" % (n, self.numtag(n))
         def p(dict,key):
-            if key in dict: return dict[key]
+            if key in dict:
+                if dict[key] > 1.0: raise Exception, "probability > 1.0:%s"%key
+                return dict[key]
             else: return 0.0
         def no_zeroL(str,tagstr,prob):
             if prob > 0.0: return (str%(tagstr,prob)).ljust(LJUST)
 
@@ -885,7 +887,7 @@ def testgrammar():
     loc_h_harmonic.FNONSTOP_MIN = 25
     loc_h_harmonic.FSTOP_MIN = 5
     loc_h_harmonic.RIGHT_FIRST = 1.0
-    loc_h_harmonic.OTHER_STOP_CALC = False
+    loc_h_harmonic.OLD_STOP_CALC = True
 
     return loc_h_harmonic.initialize(testcorpus)
 
@@ -1056,8 +1058,16 @@ def compare_grammars(g1,g2):
 
 def testNVNgrammar():
-    from loc_h_harmonic import initialize
-    g = initialize(['n v n'.split()])
+    import loc_h_harmonic
+
+    # make sure these are the way they were when setting up the tests:
+    loc_h_harmonic.HARMONIC_C = 0.0
+    loc_h_harmonic.FNONSTOP_MIN = 25
+    loc_h_harmonic.FSTOP_MIN = 5
+    loc_h_harmonic.RIGHT_FIRST = 1.0
+    loc_h_harmonic.OLD_STOP_CALC = True
+
+    g = loc_h_harmonic.initialize(['n v n'.split()])
     return g # todo
 
 def testIO():
@@ -1086,6 +1096,14 @@ if __name__ == "__main__":
 #     g1 = testreestimation()
 #     g2 = testreestimation2()
 #     print compare_grammars(g1,g2)
+
+
+
+
+
+
+
+if False:
     g = testNVNgrammar()
     q_sent = inner_sent(g,'n v n'.split(),{})
     q_tree = {}
diff --git a/src/loc_h_dmv.pyc b/src/loc_h_dmv.pyc
index 328748c..dcdc0ba 100644
Binary files a/src/loc_h_dmv.pyc and b/src/loc_h_dmv.pyc differ
diff --git a/src/loc_h_harmonic.py b/src/loc_h_harmonic.py
index 1b3eb1e..8932cc9 100644
--- a/src/loc_h_harmonic.py
+++ b/src/loc_h_harmonic.py
@@ -2,11 +2,15 @@
 from loc_h_dmv import * # better way to do this?
 
-# todo: tweak these (0.0 on FSTOP_MIN gives too many zero-probabilities though)
+# todo: tweak these
 HARMONIC_C = 0.0
-FNONSTOP_MIN = 0.0
+STOP_C = 1.0
+NSTOP_C = 1.0
 FSTOP_MIN = 1.0
-OTHER_STOP_CALC = True
+
+FNONSTOP_MIN = 0.0 # for OLD_STOP_CALC. 0.0 on FSTOP_MIN gives many
+                   # zero-probabilities for OLD_STOP_CALC
+OLD_STOP_CALC = True
 
 RIGHT_FIRST = 1.0 # apparently right-first is best for DMV-only
 
@@ -45,8 +49,10 @@ def init_zeros(tags):
     f['sum', 'ROOT'] = 0
     for dir in [LEFT, RIGHT]:
         for adj in [ADJ, NON]:
-            f[tag, 'STOP', dir, adj] = FSTOP_MIN
-            f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
+            f[tag, 'STOP', dir, adj] = 0.0
+            if OLD_STOP_CALC:
+                f[tag, 'STOP', dir, adj] = FSTOP_MIN
+                f[tag, '-STOP', dir, adj] = FNONSTOP_MIN
     f[tag, RIGHT] = {}
     f[tag, LEFT] = {}
     f[tag, 'sum', RIGHT] = 0.0
@@ -78,19 +84,10 @@ def init_freq(corpus, tags):
         for loc_h, head in enumerate(sent):
             # todo grok: how is this different from just using straight head
             # frequency counts, for the ROOT probabilities?
-            f['ROOT', head] += 1
-            f['sum', 'ROOT'] += 1
+            f['ROOT', head] += 1.0
+            f['sum', 'ROOT'] += 1.0
 
-            if OTHER_STOP_CALC:
-                f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
-                f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
-                f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
-                f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
-                f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
-                f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
-                f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
-                f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last
-            else: # note: int(True) == 1
+            if OLD_STOP_CALC:
                 f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
                 f[head, '-STOP', LEFT,NON] += (not loc_h == 1) # not second
                 f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
@@ -99,6 +96,20 @@
                 f[head, '-STOP', RIGHT,NON] += (not loc_h == n - 1) # not second-to-last
                 f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
                 f[head, '-STOP', RIGHT,ADJ] += (not loc_h == n) # not last
+            else:
+                f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
+                f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
+                f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
+                f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
+#            elif OTHER_STOP_CALC:
+#                f[head, 'STOP', LEFT,NON] += (loc_h == 1) # second word
+#                f[head, '-STOP', LEFT,NON] += (loc_h > 1) # after second
+#                f[head, 'STOP', LEFT,ADJ] += (loc_h == 0) # first word
+#                f[head, '-STOP', LEFT,ADJ] += (loc_h > 0) # not first
+#                f[head, 'STOP', RIGHT,NON] += (loc_h == n - 1) # second-to-last
+#                f[head, '-STOP', RIGHT,NON] += (loc_h < n - 1) # before second-to-last
+#                f[head, 'STOP', RIGHT,ADJ] += (loc_h == n) # last word
+#                f[head, '-STOP', RIGHT,ADJ] += (loc_h < n) # not last
 
         # this is where we make the "harmonic" distribution. quite.
         for loc_a, arg in enumerate(sent):
@@ -118,32 +129,34 @@
     return f
 
 def init_normalize(f, tags, numtag, tagnum):
-    '''Use frequencies (and sums) in f to return create p_STOP, p_ATTACH
-    and p_GO_AT (which is (1-p_STOP)*p_ATTACH).
-
-    Return a usable DMV_Grammar2.'''
+    '''Use frequencies (and sums) in f to create and return p_STOP, p_ATTACH,
+    p_ROOT.
+
+    Return a usable DMV_Grammar.'''
     p_rules = []
     p_STOP, p_ROOT, p_ATTACH, p_ORDER = {},{},{},{}
     for h, head in numtag.iteritems():
-        p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']
+        # f['ROOT', head] is just a simple frequency count
+        p_ROOT[h] = f['ROOT', head] / f['sum', 'ROOT']
 
-        # p_STOP = STOP / (STOP + NOT_STOP)
         for dir in [LEFT,RIGHT]:
             for adj in [NON,ADJ]:
-                den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
-                if den > 0.0:
-                    p_STOP[h, dir, adj] = float(f[head, 'STOP', dir, adj]) / float(den)
-                else:
-                    p_STOP[h, dir, adj] = 1.0
-
+                if OLD_STOP_CALC:
+                    den = f[head, 'STOP', dir, adj] + f[head, '-STOP', dir, adj]
+                    if den > 0.0:
+                        p_STOP[h, dir, adj] = f[head, 'STOP', dir, adj] / den
+                    else: p_STOP[h, dir, adj] = 1.0
+                else:
+                    p_STOP[h, dir, adj] = \
+                        (FSTOP_MIN + f[head, 'STOP', dir, adj] * STOP_C) / \
+                        (FSTOP_MIN + f['ROOT',head] * NSTOP_C)
-
         p_ORDER[GOR, h] = RIGHT_FIRST
         p_ORDER[GOL, h] = 1 - RIGHT_FIRST
         for dir in [LEFT, RIGHT]:
             for arg, val in f[head, dir].iteritems():
-                p_ATTACH[tagnum[arg], h, dir] = float(val) / f[head,'sum',dir]
+                p_ATTACH[tagnum[arg], h, dir] = val / f[head,'sum',dir]
 
     return DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH, p_ORDER)
 
@@ -159,6 +172,8 @@ def initialize(corpus):
     g = init_normalize(f, tags, numtag, tagnum)
 
     g.HARMONIC_C = HARMONIC_C # for evaluations in main.py, todo: remove
+    g.STOP_C = STOP_C
+    g.NSTOP_C = NSTOP_C
     g.FNONSTOP_MIN = FNONSTOP_MIN
     g.FSTOP_MIN = FSTOP_MIN
diff --git a/src/main.py b/src/main.py
index 3facf68..b5b6336 100644
--- a/src/main.py
+++ b/src/main.py
@@ -15,19 +15,21 @@ def initialize_loc_h(tagonlys):
 #     loc_h_harmonic.HARMONIC_C = 380.111684914
 #     loc_h_harmonic.FSTOP_MIN = 13.5744632704
 #     loc_h_harmonic.FNONSTOP_MIN = 34.8939452454
-    loc_h_harmonic.HARMONIC_C = 0 # 509.63 #1000.0 * random.random()
-    loc_h_harmonic.FSTOP_MIN = 13.08 #20.0 * random.random()
-    loc_h_harmonic.FNONSTOP_MIN = 30.11 #50.0 * random.random() + loc_h_harmonic.FSTOP_MIN
+    loc_h_harmonic.HARMONIC_C = random.random() # 0.0 # 509.63 #1000.0 * random.random()
+    loc_h_harmonic.FSTOP_MIN = random.random() # 1.0 # 13.08 #20.0 * random.random()
+    loc_h_harmonic.STOP_C = random.random()
+    loc_h_harmonic.NSTOP_C = random.random()
     loc_h_harmonic.RIGHT_FIRST = 1.0
-    loc_h_harmonic.OTHER_STOP_CALC = False
+    loc_h_harmonic.OLD_STOP_CALC = False
     print '''
-HARMONIC_C: %s, FNONSTOP_MIN: %s, FSTOP_MIN: %s
-RIGHT_FIRST: %s, OTHER_STOP_CALC: %s'''%(loc_h_harmonic.HARMONIC_C,
-                                         loc_h_harmonic.FNONSTOP_MIN,
-                                         loc_h_harmonic.FSTOP_MIN,
-                                         loc_h_harmonic.RIGHT_FIRST,
-                                         loc_h_harmonic.OTHER_STOP_CALC)
+HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s
+RIGHT_FIRST: %s, OLD_STOP_CALC: %s'''%(loc_h_harmonic.HARMONIC_C,
+                                       loc_h_harmonic.STOP_C,
+                                       loc_h_harmonic.NSTOP_C,
+                                       loc_h_harmonic.FSTOP_MIN,
+                                       loc_h_harmonic.RIGHT_FIRST,
+                                       loc_h_harmonic.OLD_STOP_CALC)
     g = loc_h_harmonic.initialize(tagonlys)
     return g
 
@@ -184,9 +186,9 @@ def compare_loc_h_cnf():
 # end compare_loc_h_cnf()
 
-def init_nothing(g,H,N,S):
+def init_nothing(g,H,S,N,M):
     print '''
-HARMONIC_C: %s, FNONSTOP_MIN: %s, FSTOP_MIN: %s'''%(H,N,S)
+HARMONIC_C: %s, STOP_C: %s, NSTOP_C: %s, FSTOP_MIN: %s'''%(H,S,N,M)
     return lambda corpus:g
 
 def rnd_grammars_test():
@@ -202,26 +204,26 @@
                                 iterations=0,
                                 corpus_offset=0,
                                 eval=True)
-        rnd_grammars0 += [(g, g.HARMONIC_C, g.FNONSTOP_MIN, g.FSTOP_MIN)]
+        rnd_grammars0 += [(g, g.HARMONIC_C, g.STOP_C, g.NSTOP_C, g.FSTOP_MIN)]
 
     rnd_grammars1 = [(test_likelihood(loc_h_dmv.reestimate,
-                                      init_nothing(g,H,N,S),
+                                      init_nothing(g,H,S,N,M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
-                      H,N,S)
-                     for g,H,N,S in rnd_grammars0]
+                      H,S,N,M)
+                     for g,H,S,N,M in rnd_grammars0]
     rnd_grammars2 = [(test_likelihood(loc_h_dmv.reestimate,
-                                      init_nothing(g,H,N,S),
+                                      init_nothing(g,H,S,N,M),
                                      loc_h_dmv.inner_sent,
                                      corpus_size=6268,
                                      iterations=1,
                                      corpus_offset=0,
                                      eval=True),
-                      H,N,S)
-                     for g,H,N,S in rnd_grammars1]
+                      H,S,N,M)
+                     for g,H,S,N,M in rnd_grammars1]
 
 if __name__ == "__main__":
     print "main.py:"
@@ -237,24 +239,26 @@ if __name__ == "__main__":
 #                         corpus_size=5,
 #                         iterations=4)
 
-    import loc_h_dmv
+    rnd_grammars_test()
+
+#     import loc_h_dmv
 #     reload(loc_h_dmv)
-#     rnd_grammars_test()
-    print "\ntrying reestimate v.1 ##############################"
-    g = test_likelihood(loc_h_dmv.reestimate,
-                        initialize_loc_h,
-                        loc_h_dmv.inner_sent,
-                        corpus_size=5,
-                        iterations=4,
-                        corpus_offset=0,
-                        eval=True)
-    print g
-
-    print "\ntrying reestimate v.2 ##############################"
-    g = test_likelihood(loc_h_dmv.reestimate2,
-                        initialize_loc_h,
-                        loc_h_dmv.inner_sent,
-                        corpus_size=5,
-                        iterations=4,
-                        corpus_offset=0)
-    print "main.py: done"
+#     print "\ntrying reestimate v.1 ##############################"
+#     g = test_likelihood(loc_h_dmv.reestimate,
+#                         initialize_loc_h,
+#                         loc_h_dmv.inner_sent,
+#                         corpus_size=6268,
+#                         iterations=100,
+#                         corpus_offset=0,
+#                         eval=True)
+#     print g
+
+#     print "\ntrying reestimate v.2 ##############################"
+#     g = test_likelihood(loc_h_dmv.reestimate2,
+#                         initialize_loc_h,
+#                         loc_h_dmv.inner_sent,
+#                         corpus_size=5,
+#                         iterations=4,
+#                         corpus_offset=0)
+#     print "main.py: done"
+#     print g
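
A minimal standalone sketch (not part of the commit) of the new STOP
initialization introduced above, i.e. the OLD_STOP_CALC = False branch of
init_freq/init_normalize. The toy corpus, the helper names stop_counts and
p_stop, and the flat dicts are invented for illustration; in the repo the
counts live in loc_h_harmonic.py's f dict, and FSTOP_MIN, STOP_C, NSTOP_C
play the roles of C_M, C_S, C_N in report.tex.

FSTOP_MIN, STOP_C, NSTOP_C = 1.0, 1.0, 1.0  # C_M, C_S, C_N in report.tex

def stop_counts(corpus):
    """Count sentence-border STOP events and raw head frequencies."""
    f_stop, f_head = {}, {}
    for sent in corpus:
        n = len(sent) - 1  # index of the last word (sketch-local convention)
        for loc_h, head in enumerate(sent):
            f_head[head] = f_head.get(head, 0.0) + 1.0
            for dir, adj, hit in [('LEFT',  'ADJ', loc_h == 0),      # first word
                                  ('LEFT',  'NON', loc_h == 1),      # second word
                                  ('RIGHT', 'ADJ', loc_h == n),      # last word
                                  ('RIGHT', 'NON', loc_h == n - 1)]: # second-to-last
                key = (head, dir, adj)
                f_stop[key] = f_stop.get(key, 0.0) + hit  # bool adds as 0/1
    return f_stop, f_head

def p_stop(head, dir, adj, f_stop, f_head):
    # P_STOP = (C_M + C_S*f(stop)) / (C_M + C_N*freq(head)), as in report.tex
    return (FSTOP_MIN + STOP_C * f_stop.get((head, dir, adj), 0.0)) / \
           (FSTOP_MIN + NSTOP_C * f_head[head])

f_stop, f_head = stop_counts([['n', 'v', 'n']])
print(p_stop('n', 'LEFT', 'ADJ', f_stop, f_head))  # (1+1)/(1+2) = 0.666...

Unlike the old STOP/(STOP+NOT_STOP) estimate, the denominator here is the
total frequency of the head, so no -STOP counts (and hence no FNONSTOP_MIN)
are needed; that is what makes the new scheme simpler.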