started on pure cnf version, see sec6 of formulas.pdf
[dmvccm.git] / src / pcnf_harmonic.py
blob577cf6901785fcb88acf21f89452c742598a8af4
1 # pcnf_harmonic.py, initialization for pcnf_dmv.py
3 from pcnf_dmv import * # better way to do this?
4 from loc_h_harmonic import taglist, init_freq # neutral as regards cnf/loc_h
# todo: tweak this
# NOTE(review): no function visible in this module reads these three
# constants; the __main__ block only rebinds FNONSTOP_MIN and FSTOP_MIN.
# Presumably they mirror tuning constants consumed by init_freq in
# loc_h_harmonic -- confirm before relying on changes made here.
HARMONIC_C = 0.0
FNONSTOP_MIN = 25
FSTOP_MIN = 5
11 ##############################
12 # Initialization #
13 ##############################
def init_normalize(f, tags, tagnum, numtag):
    '''Normalize the frequency counts in f into probabilities and
    return a grammar built from them.

    Arguments:
    f      -- frequency table; the entries read here are
              f['ROOT', head] and f['sum', 'ROOT'],
              f[head, 'STOP', dir, adj] and f[head, '-STOP', dir, adj],
              f[head, dir] (a mapping from argument tag to count) and
              f[head, 'sum', dir].
    tags   -- not used by this function; kept so that existing callers
              keep working.
    tagnum -- dict mapping tag -> number.
    numtag -- dict mapping number -> tag.

    Returns PCNF_DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH,
    p_terminals).

    No denominator is checked for zero.  NOTE(review): presumably the
    counts produced by init_freq are smoothed so that this cannot
    happen -- confirm.'''
    p_STOP, p_ROOT, p_ATTACH, p_terminals = {}, {}, {}, {}

    # .items() rather than .iteritems(): same result, and it also runs
    # on Python 3.
    for h, head in numtag.items():
        p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']

        # p_STOP = STOP / (STOP + NOT_STOP)
        for direction in (LEFT, RIGHT):
            for adj in (NON, ADJ):
                stop = f[head, 'STOP', direction, adj]
                nonstop = f[head, '-STOP', direction, adj]
                p_STOP[h, direction, adj] = float(stop) / (stop + nonstop)

        # each tag gets exactly one terminal rule, with probability 1
        p_terminals[(GOR, h), head] = 1.0

        # p_ATTACH = count(arg attached to head in this direction)
        #            / f[head, 'sum', direction]
        for direction in (LEFT, RIGHT):
            for arg, val in f[head, direction].items():
                p_ATTACH[tagnum[arg], h, direction] = \
                    float(val) / f[head, 'sum', direction]

    return PCNF_DMV_Grammar(numtag, tagnum,
                            p_ROOT, p_STOP, p_ATTACH, p_terminals)
def initialize(corpus):
    '''Build and return an initialized PCNF_DMV_Grammar for corpus.

    corpus is a list of lists of tags (one inner list per sentence).
    Every tag returned by taglist(corpus) is given a number; the two
    lookup tables (tag -> number, number -> tag) are handed to
    init_normalize together with the counts from init_freq.'''
    tags = taglist(corpus)

    # Number the tags once, then derive both lookup directions from the
    # same (number, tag) pairs.
    numbered = list(enumerate(tags))
    numtag = dict(numbered)
    tagnum = dict((tag, num) for num, tag in numbered)

    # frequency counts used in initialization, mostly distances
    freqs = init_freq(corpus, tags)
    return init_normalize(freqs, tags, tagnum, numtag)
if __name__ == "__main__":
    # Manual smoke test: initialize a grammar from a toy corpus and
    # print it, then re-run initialization on a slightly larger corpus
    # for two (FNONSTOP_MIN, FSTOP_MIN) settings.
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("--------initialization testing------------")
    print(initialize([['foo', 'two', 'foo', 'foo'],
                      ['zero', 'one', 'two', 'three']]))

    # The corpus does not depend on the settings below, so build it once.
    # The comprehension variable is deliberately not called `s`: in
    # Python 2 it would leak and overwrite the loop variable `s`.
    sentences = [
        'det nn vbd c nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd pp nn', 'det nn vbd det nn',
    ]
    testcorpus = [sentence.split() for sentence in sentences]

    # NOTE(review): these assignments rebind this module's globals only.
    # init_freq is imported from loc_h_harmonic, so it may never see the
    # new values -- confirm that the two runs really differ.
    for (n, s) in [(95, 5), (5, 5)]:
        FNONSTOP_MIN = n
        FSTOP_MIN = s

        g = initialize(testcorpus)