3 # -*- encoding: utf-8 -*-
11 from itertools
import izip
12 from subprocess
import Popen
, PIPE
14 # In order to use this script, you need to install 4Suite's cDomlette.
15 from Ft
.Xml
.Domlette
import NonvalidatingReader
;
16 from Ft
.Xml
.Domlette
import Print
, PrettyPrint
;
17 from Ft
.Xml
.XPath
import Evaluate
;
"""
This tool is used to find entries which are in the bidix but not in a monodix.

If a word is in a monodix, then lt-proc will produce one or more morphological
analyses for the word. For example, the Afrikaans word 'vat' is both a noun and
a verb. The Afrikaans monodix contains both these meanings. Therefore, when
we feed 'vat' to 'lt-proc af-en.automorf.bin', it will produce:
    ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$^./.<sent>

In the bidix, we are likely to have two entries:
    <e><p><l>barrel<s n=\"n\"/></l><r>vat<s n=\"n\"/></r></p></e>
    <e><p><l>take<s n=\"vblex\"/></l><r>vat<s n=\"vblex\"/></r></p></e>

So, we can extract 'vat<n>' and 'vat<vblex>' from the bidix. One can see that
    vat<n> is a substring of vat<n><sg>
    vat<vblex> is a substring of vat<vblex><pres> as well as vat<vblex><inf>

In other words, if a bidix definition such as 'vat<n>' has a corresponding
monodix entry, then the morphological analyser, when given the word 'vat', must
produce at least one string of which 'vat<n>' will be a substring. In the
above example, that string is 'vat<n><sg>'.

Conversely, if the morphological analyser produces no such string, then
there is no corresponding monodix entry. This is exactly the property we use
to find the missing monodix entries.
"""
# Change this for your language.
# Maps a tag sequence (the second element returned by extract_tags) to the
# paradigm reference that write_entries_to_file emits for entries with
# those tags.
template = {
    '<vblex>': '<par n="breek__vblex"/>',
    '<adj>': '<par n="dadelik__adj"/>',
    '<n>': '<par n="artikel__n"/>',
    '<n><unc>': '<par n="poësie__n__unc"/>',
    '<np>': '<par n="Engeland__np"/>',
    '<adv>': '<par n="miskien__adv"/>',
    '<preadv>': '<par n="taamlik__preadv"/>'
}
# Default global values
class Globals:
    """Holder for script-wide settings.

    main() assigns lt_proc_path before any work is done, and run_ltproc
    reads it when building the lt-proc command line.
    """
    lt_proc_path = '/usr/local/bin/lt-proc'  # default location for lt-proc
def load_xml_file(filename):
    """Parse the XML document stored at `filename` and return its DOM tree.

    The path is resolved with os.path.realpath and passed to
    NonvalidatingReader.parseUri as a 'file://' URI.
    """
    uri = 'file://' + os.path.realpath(filename)
    return NonvalidatingReader.parseUri(uri)
def extract_entries(doc, side='l'):
    """Return the nodes selected from `doc` for one side of the entries.

    `side` is appended to the XPath query as the final step, so 'l'
    (the default) selects /dictionary/section[@id='main']/e/p/l and 'r'
    selects the corresponding r elements.
    """
    query = "/dictionary/section[@id='main']/e/p/" + side
    return doc.xpath(query)
def make_input_list(lst):
    """Return one string per node in `lst`, built from its child text nodes.

    For each node, the direct child text nodes (XPath './text()') are
    serialised with nodes_to_string and concatenated.

    If n is the tree <p>foo<b/>bar</p>, 'foobar' is extracted from n.
    <b/> tags probably deserve special care; none is taken here.
    """
    words = []
    for node in lst:
        text_nodes = node.xpath('./text()')
        words.append(nodes_to_string(text_nodes))
    return words
# Matches a symbol element such as <s n="vblex"/>, capturing the tag name.
pattern = re.compile(r'<s n="([a-zA-Z0-9]*)"/>')


def transform_tags(tag):
    """Rewrite every <s n=\"tagname\"/> in `tag` as <tagname>."""
    def as_short_tag(match):
        return '<' + match.group(1) + '>'

    return pattern.sub(as_short_tag, tag)
def make_compare_list(lst):
    """Return, for each node in `lst`, its serialised children in short-tag form.

    All child nodes are serialised with nodes_to_string and the result is
    passed through transform_tags, which turns <s n="x"/> into <x>.
    """
    compare = []
    for node in lst:
        serialised = nodes_to_string(node.childNodes)
        compare.append(transform_tags(serialised))
    return compare
def process_output(output):
    """Turn newline-separated morphological analyses into a list of lists.

    Each line has leading and trailing '^' and '$' characters stripped and
    is then split on '/'. For example, the input

        ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$
        ^kla/kla<vblex><pres>/kla<vblex><inf>$
        ^nag/nag<n><sg>$

    becomes

        [['vat', 'vat<n><sg>', 'vat<vblex><pres>', 'vat<vblex><inf>'],
         ['kla', 'kla<vblex><pres>', 'kla<vblex><inf>'],
         ['nag', 'nag<n><sg>']]
    """
    analyses = []
    for line in output.split('\n'):
        bare = line.strip('^$')
        analyses.append(bare.split('/'))
    return analyses
def extract_tags(s):
    """Given an entry such as: 'foo<bar><baz>', return ('foo', '<bar><baz>').
    We ignore <g> and <b/> tags. Thus, we will split 'foo<g><b/>bar</g><baz>'
    into ('foo<g><b/>bar</g>', '<baz>').

    If `s` contains no tag other than <g>, </g> and <b/>, the whole string
    is returned as the lemma together with an empty tag string.
    """
    i = s.find('<')
    while i != -1:
        if s[i+1:i+3] == 'g>':
            # Opening <g>: belongs to the lemma, keep scanning after it.
            i = s.find('<', i + 3)
        elif s[i+1:i+4] in ('/g>', 'b/>'):
            # Closing </g> or blank <b/>: also part of the lemma.
            i = s.find('<', i + 4)
        else:
            # First real tag: everything before it is the lemma.
            return s[:i], s[i:]
    return s, ''
def call(name, input=''):
    """A convenience function to invoke a subprocess with the
    parameter list name (where the first argument is the name
    of an executable). The subprocess is fed the contents of
    input via stdin. We collect the output of both stdout and
    stderr from the subprocess. If stderr is not empty, we
    raise an exception, otherwise we return the contents of
    stdout.

    Raises:
        Exception: carrying the captured stderr text when the subprocess
            wrote anything to stderr.
    """
    proc = Popen(name, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = proc.communicate(input)

    # Truthiness covers None, '' and b'' alike, so an empty stderr is never
    # mistaken for an error regardless of the string type returned.
    if err:
        raise Exception(err)

    return out
def run_ltproc(morfo_path, input_list):
    """Run lt-proc on `input_list` and return the parsed analyses.

    The executable is taken from Globals.lt_proc_path and given
    `morfo_path` as its only argument. The items of `input_list` are
    joined with newlines and sent on stdin; the captured stdout is handed
    to process_output.
    """
    command = [Globals.lt_proc_path, morfo_path]
    stdin_text = "\n".join(input_list)
    return process_output(call(command, stdin_text))
def find_missing_entries(morfo_path, lst):
    """Return the entries of `lst` for which lt-proc yields no matching analysis.

    `lst` is a list of DOM nodes. Each node is turned into an input word
    (make_input_list) and into a comparison string such as 'vat<n>'
    (make_compare_list). The words are analysed by lt-proc using the
    analyser file `morfo_path`; an entry counts as missing when none of
    the analyses paired with it starts with its comparison string.

    Returns a list of comparison strings, in input order.
    """
    i = make_input_list(lst)
    c = make_compare_list(lst)
    o = run_ltproc(morfo_path, i)

    missing = []

    # izip pairs the n-th entry with the n-th analysis line and stops at the
    # shorter of the two sequences.
    for entry, morpho_entries in izip(c, o):
        if not any(e.startswith(entry) for e in morpho_entries):
            missing.append(entry)

    return missing
def write_entries_to_file(f, lst):
    """Write one <e> element per entry of `lst` to the file object `f`.

    Each entry is split by extract_tags into a lemma and a tag string; the
    tag string selects the paradigm reference from `template`. A tag
    string that is not a key of `template` raises KeyError.
    """
    line_format = u' <e lm="%(lemma)s"><i>%(lemma)s</i>%(template)s</e>\n'
    for e in lst:
        lemma, tags = extract_tags(e)
        fields = {'lemma': lemma, 'template': template[tags]}
        f.write(line_format % fields)
def node_to_string(node):
    """Serialise `node` with Print (utf-8 encoding) and return the text."""
    sink = cStringIO.StringIO()
    Print(node, stream=sink, encoding='utf-8')
    return sink.getvalue()
def nodes_to_string(lst):
    """Serialise every node in `lst` and return the pieces concatenated."""
    pieces = [node_to_string(n) for n in lst]
    return ''.join(pieces)
178 ##def last_component(pathname):
179 ## return path.split(pathname)[2]
181 ## def get_automorf_pair(automorf_path):
183 ## return re.match('([^.]*[.]automorf[.]bin)', last_component(automorf_path))
185 ## raise Exception("%s is not a valid automorf path" % automorf_path)
187 ## def get_bidix_pair(bidix_path):
189 ## return re.match('apertium-([^.]*)[.]([^.]*)[.]dix([.]xml', last_component(bidix_path)).group(1)
191 ## raise Exception("%s is not a valid bidix path" % bidix_path)
def eliminate_duplicates(lst):
    """Return the items of `lst` with repeats removed, keeping first-seen order.

    Items must be hashable. An empty input gives an empty list.
    """
    seen_before = set()
    unique = []
    for item in lst:
        if item not in seen_before:
            seen_before.add(item)
            unique.append(item)
    return unique
def main(automorf_path, bidix_path, lt_proc_path=Globals.lt_proc_path, output=None, side='l'):
    """usage: find-missing-monodix-entries.py <options>

    <options> are the following

    --lt_proc_path - path to lt-proc; the default is /usr/local/bin/lt-proc
    --automorf_path - path to the morphological analysis file used by lt-proc; e.g. af-en.automorf.bin
    --bidix_path - path to the bidix file to be used
    --side - either l or r; used to determine which side in the bidix is to be read; the default is l.
    --output - filename to dump the data to; this defaults to stdout

    python find-missing-monodix-entries.py --bidix_path=apertium-en-af.en-af.dix.xml \\
    --automorf_path=/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin --side=r

    Read the bidix 'apertium-en-af.en-af.dix.xml' in the current directory. Use the morphological
    analyser file '/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin'. Read entries from the
    right hand side of the bidix. Everything will be written to stdout.
    """
    # NOTE(review): the docstring above reads as command-line usage text and
    # is presumably printed by the script entry point; keep its wording.

    # run_ltproc reads the lt-proc location from Globals, so publish it first.
    Globals.lt_proc_path = lt_proc_path

    # Write to stdout unless an output filename was supplied.
    if output is None:
        out_file = sys.stdout
    else:
        out_file = open(output, 'w')

    try:
        doc = load_xml_file(bidix_path)
        entries = extract_entries(doc, side)
        missing = find_missing_entries(automorf_path, entries)
        write_entries_to_file(out_file, eliminate_duplicates(missing))
    finally:
        # Close only a file opened here; sys.stdout is left open.
        if out_file is not sys.stdout:
            out_file.close()
245 if __name__
== '__main__':
249 param_re
= re
.compile("[-][-]([_a-zA-Z0-9]*)[=](.*)$")
250 param_dict
= dict((match
.group(1), match
.group(2)) for match
in (param_re
.match(p
) for p
in sys
.argv
[1:]))