1 # pcnf_harmonic.py, initialization for pcnf_dmv.py
3 from pcnf_dmv
import * # better way to do this?
4 from loc_h_harmonic
import taglist
, init_freq
# neutral as regards cnf/loc_h
11 ##############################
13 ##############################
def init_normalize(f, tags, tagnum, numtag):
    '''Normalize the frequency counts in f into probabilities and
    return a PCNF_DMV_Grammar built from them.

    Arguments:
    f      -- frequency table, read with these keys:
                f['ROOT', tag], f['sum', 'ROOT']
                f[tag, 'STOP', dir, adj], f[tag, '-STOP', dir, adj]
                f[tag, dir]         (dict: argument tag -> count)
                f[tag, 'sum', dir]
    tags   -- not read by this function; kept so existing callers
              keep working.
    tagnum -- dict mapping tag -> number
    numtag -- dict mapping number -> tag

    The probability tables are keyed by tag *numbers*:
      p_ROOT[h]
      p_STOP[h, dir, adj]
      p_ATTACH[a, h, dir]
      p_terminals[(GOR, h), tag]

    Return PCNF_DMV_Grammar(numtag, tagnum, p_ROOT, p_STOP, p_ATTACH,
    p_terminals).'''
    p_STOP, p_ROOT, p_ATTACH, p_terminals = {}, {}, {}, {}

    for h, head in numtag.items():
        p_ROOT[h] = float(f['ROOT', head]) / f['sum', 'ROOT']

        # p_STOP = STOP / (STOP + NOT_STOP)
        for direction in [LEFT, RIGHT]:
            # NOTE(review): `adj` was used here without ever being bound;
            # the loop over adjacency is restored.  NON/ADJ are assumed to
            # be provided by `from pcnf_dmv import *` -- confirm.
            for adj in [NON, ADJ]:
                stop = f[head, 'STOP', direction, adj]
                not_stop = f[head, '-STOP', direction, adj]
                p_STOP[h, direction, adj] = float(stop) / (stop + not_stop)

        # each head has exactly one terminal rule, with probability 1
        p_terminals[(GOR, h), head] = 1.0

        for direction in [LEFT, RIGHT]:
            total = f[head, 'sum', direction]
            for arg, val in f[head, direction].items():
                p_ATTACH[tagnum[arg], h, direction] = float(val) / total

    return PCNF_DMV_Grammar(numtag, tagnum,
                            p_ROOT, p_STOP, p_ATTACH, p_terminals)
def initialize(corpus):
    '''Return an initialized PCNF_DMV_Grammar.

    corpus is a list of lists of tags.

    The tags returned by taglist(corpus) are numbered in enumeration
    order; tagnum maps tag -> number and numtag maps number -> tag.
    Both maps, together with the frequency counts from init_freq, are
    handed to init_normalize, whose result is returned.'''
    tags = taglist(corpus)

    tagnum, numtag = {}, {}
    for num, tag in enumerate(tags):
        # the loop had no body, leaving both maps empty; fill them so
        # that init_normalize can translate in either direction
        tagnum[tag] = num
        numtag[num] = tag

    # f: frequency counts used in initialization, mostly distances
    f = init_freq(corpus, tags)
    g = init_normalize(f, tags, tagnum, numtag)
    return g
if __name__ == "__main__":
    # Smoke test: build grammars from two small corpora.
    print("--------initialization testing------------")
    print(initialize([['foo', 'two', 'foo', 'foo'],
                      ['zero', 'one', 'two', 'three']]))

    sentences = [
        'det nn vbd c nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd nn', 'det nn vbd c nn vbd pp nn',
        'det nn vbd pp nn', 'det nn vbd det nn',
    ]

    # NOTE(review): `n` is never read in the visible code, and lines
    # appear to be missing right after the loop header -- confirm what
    # the loop was meant to do with (n, s) before relying on this.
    for (n, s) in [(95, 5), (5, 5)]:
        # In Python 2 the comprehension variable rebinds the loop's `s`.
        testcorpus = [s.split() for s in sentences]
        g = initialize(testcorpus)