Moving more modules
[apertium.git] / trunk / apertium-tools / find-missing-monodix-entries.py
blobc10f1ada36cd1a9c59eb2e2a9146a332c5669d9d
1 #!/usr/bin/env python
2 # coding=utf-8
3 # -*- encoding: utf-8 -*-
5 import os
6 import os.path as path
7 import re
8 import sys
9 import cStringIO
10 import pprint
11 from itertools import izip
12 from subprocess import Popen, PIPE
14 # In order to use this script, you need to install 4Suite's cDomlette.
15 from Ft.Xml.Domlette import NonvalidatingReader;
16 from Ft.Xml.Domlette import Print, PrettyPrint;
17 from Ft.Xml.XPath import Evaluate;
19 """
20 This tool is used to find entries which are in the bidix but not in a monodix.
22 If a word is in a monodix, then lt-proc will produce one or more morphological
23 analyses for the word. For example, the Afrikaans word 'vat' is both a noun and
24 a verb. The Afrikaans monodix contains both these meanings. Therefore, when
25 we feed 'vat' to 'lt-proc af-en.automorf.bin', it will produce:
26 ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$^./.<sent>
28 In the bidix, we are likely to have two entries:
29 <e><p><l>barrel<s n=\"n\"/></l><r>vat<s n=\"n\"/></r></p></e>
30 and
31 <e><p><l>take<s n=\"vblex\"/></l><r>vat<s n=\"vblex\"/></r></p></e>
33 So, we can extract 'vat<n>' and 'vat<vblex>' from the bidix. One can see that
34 vat<n> is a substring of vat<n><sg>
35 and that
36 vat<vblex> is a substring of vat<vblex><pres> as well as vat<vblex><inf>
38 In other words, if a bidix definition such as 'vat<n>' has a corresponding
39 monodix entry, then the morphological analyser when given the word 'vat', must
40 produce at least one string of which 'vat<n>' will be a substring. In the
41 above example, that string is 'vat<n><sg>'.
43 Conversely, if the morphological analyser produces no such string, then
44 there is no corresponding monodix entry. This is exactly the property we use
45 to find the missing monodix entries.
46 """
# Change this for your language.
# Maps the tag sequence of a bidix entry (as returned by extract_tags) to the
# paradigm reference that write_entries_to_file emits for it.  Entries whose
# tag sequence is not listed here are not written at all.
template = {
    '<vblex>': '<par n="breek__vblex"/>',
    '<adj>': '<par n="dadelik__adj"/>',
    '<n>': '<par n="artikel__n"/>',
    '<n><unc>': '<par n="poësie__n__unc"/>',
    '<np>': '<par n="Engeland__np"/>',
    '<adv>': '<par n="miskien__adv"/>',
    '<preadv>': '<par n="taamlik__preadv"/>',
}
# Default global values
class Globals:
    """Holder for settings shared by the functions in this script."""

    # Path of the lt-proc executable.  This is only the default; main()
    # overwrites the attribute with its lt_proc_path argument.
    lt_proc_path = '/usr/local/bin/lt-proc'
# Parse an XML document and return a DOM tree.
def load_xml_file(filename):
    """Parse the XML file *filename* and return the resulting document.

    The path is first resolved with os.path.realpath and then passed to
    NonvalidatingReader.parseUri as a 'file://' URI.
    """
    resolved_path = os.path.realpath(filename)
    uri = 'file://' + resolved_path
    return NonvalidatingReader.parseUri(uri)
def extract_entries(doc, side='l'):
    """Select one side of every entry in the dictionary's 'main' section.

    doc  -- parsed dictionary document; it must provide an xpath() method.
    side -- element name appended to the XPath query: 'l' (the default)
            or 'r'.

    Returns whatever doc.xpath() yields for the query.
    """
    query = ''.join(("/dictionary/section[@id='main']/e/p/", side))
    return doc.xpath(query)
def make_input_list(lst):
    """Build the list of plain-text strings for a list of DOM nodes.

    For every node in lst, only its direct child text nodes are selected
    ('./text()'), serialised and concatenated; the result is one string
    per node, in the same order as lst.

    For the tree <p>foo<b/>bar</p> this gives 'foobar'.  The <b/> tag
    probably deserves special treatment; none is applied at present.
    """
    strings = []
    for node in lst:
        text_nodes = node.xpath('./text()')
        strings.append(nodes_to_string(text_nodes))
    return strings
# Matches a serialised symbol element such as <s n="vblex"/>.  The captured
# name is limited to ASCII letters and digits (it may be empty).
pattern = re.compile(r'<s n="([a-zA-Z0-9]*)"/>')


def transform_tags(tag):
    """Rewrite every <s n="tagname"/> in *tag* as <tagname>.

    Text that does not match the pattern is returned unchanged.
    """
    def _short_form(match):
        return '<' + match.group(1) + '>'

    return pattern.sub(_short_form, tag)
def make_compare_list(lst):
    """Build the strings that are compared against the lt-proc analyses.

    For every node in lst, all of its child nodes are serialised and
    concatenated, and each <s n="tagname"/> in the result is rewritten as
    <tagname>.  One string is returned per node, in the order of lst.
    """
    compare_list = []
    for node in lst:
        serialised = nodes_to_string(node.childNodes)
        compare_list.append(transform_tags(serialised))
    return compare_list
def process_output(output):
    """Turn newline separated morphological analyses into a list of lists.

    The text is cut at every newline.  Each resulting line has any leading
    and trailing '^' and '$' characters removed and is then cut at every
    '/'.  An empty line therefore becomes [''].

    For example,
      ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$
      ^kla/kla<vblex><pres>/kla<vblex><inf>$
      ^nag/nag<n><sg>$

    becomes
      [['vat', 'vat<n><sg>', 'vat<vblex><pres>', 'vat<vblex><inf>'],
       ['kla', 'kla<vblex><pres>', 'kla<vblex><inf>'],
       ['nag', 'nag<n><sg>']]
    """
    analyses = []
    for raw_line in output.split('\n'):
        unit = raw_line.strip('^$')
        analyses.append(unit.split('/'))
    return analyses
# Matches the first '<' that opens a real tag, i.e. a '<' that is not the
# start of '<g>', '</g>' or '<b/>'.
_TAG_START = re.compile(r'<(?!g>|/g>|b/>)')


def extract_tags(s):
    """Split an entry into its lemma and its tag sequence.

    Given an entry such as 'foo<bar><baz>', return ('foo', '<bar><baz>').
    <g>, </g> and <b/> are treated as part of the lemma, so
    'foo<g><b/>bar</g><baz>' is split into ('foo<g><b/>bar</g>', '<baz>').

    If the entry contains no tag at all (this includes the empty string),
    the whole entry is the lemma and the tag sequence is '': the result is
    (s, '').  The previous character-by-character scan ran past the end of
    the string in that case and raised IndexError.
    """
    match = _TAG_START.search(s)
    if match is None:
        return s, ''

    split_at = match.start()
    return s[:split_at], s[split_at:]
def call(name, input=''):
    """Run a subprocess, feed it *input* on stdin and return its stdout.

    name  -- argument list for the subprocess; the first element is the
             executable.
    input -- data written to the subprocess's stdin.

    Both stdout and stderr of the subprocess are collected.

    Raises Exception carrying the stderr contents if the subprocess wrote
    anything to stderr, and Exception naming the command and its exit
    status if the subprocess exited with a non-zero status without writing
    to stderr.  Otherwise the contents of stdout are returned.
    """
    proc = Popen(name, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = proc.communicate(input)

    # A truthiness test treats None and an empty value alike, whatever
    # string type the pipe delivers.
    if err:
        raise Exception(err)

    # A failing process does not necessarily write to stderr; without this
    # check its (possibly empty) stdout would be returned as if valid.
    if proc.returncode != 0:
        raise Exception('%r exited with status %d' % (name, proc.returncode))

    return out
# Execute lt-proc
def run_ltproc(morfo_path, input_list):
    """Analyse the strings in input_list with lt-proc.

    The executable named by Globals.lt_proc_path is started with
    morfo_path as its only argument, and the strings of input_list are
    written to its stdin joined by newlines.  The output is returned in
    the list-of-lists form produced by process_output.
    """
    command = [Globals.lt_proc_path, morfo_path]
    stdin_text = "\n".join(input_list)
    raw_output = call(command, stdin_text)
    return process_output(raw_output)
def find_missing_entries(morfo_path, lst):
    """Return the entries of lst that lt-proc has no analysis for.

    morfo_path -- analyser file handed to lt-proc.
    lst        -- list of DOM nodes taken from the bidix.

    Each node is turned into a string of the form 'lemma<tag>...'.  That
    string counts as present when at least one element of the matching
    lt-proc result line starts with it; otherwise it is added to the
    returned list.  Entries and result lines are paired by position, and
    pairing stops at the end of the shorter of the two sequences.
    """
    surface_forms = make_input_list(lst)
    expected = make_compare_list(lst)
    analysed = run_ltproc(morfo_path, surface_forms)

    missing = []
    for entry, morpho_entries in izip(expected, analysed):
        for candidate in morpho_entries:
            if candidate.startswith(entry):
                break
        else:
            # No element of the result line started with the entry.
            missing.append(entry)

    return missing
def write_entries_to_file(f, lst):
    """Write an <e> element to f for every entry that has a template.

    f   -- object with a write() method.
    lst -- entries of the form 'lemma<tag>...'.

    Each entry is split with extract_tags.  If its tag sequence is a key of
    the module-level template table, one line
      <e lm="LEMMA"><i>LEMMA</i>TEMPLATE</e>
    is written; entries with any other tag sequence are skipped silently.
    """
    # Deliberately NOT a u'' literal.  The template values in this file, and
    # presumably the lemmas too (node_to_string serialises with
    # encoding='utf-8'), are byte strings under Python 2.  Interpolating them
    # into a unicode format string forces an implicit ASCII decode, which
    # raises UnicodeDecodeError for any non-ASCII character (e.g. the
    # 'poësie' paradigm name).
    entry_format = ' <e lm="%(lemma)s"><i>%(lemma)s</i>%(template)s</e>\n'

    for lemma, tags in (extract_tags(e) for e in lst):
        if tags in template:
            f.write(entry_format % {'lemma': lemma,
                                    'template': template[tags]})
def node_to_string(node):
    """Serialise a single DOM node and return the text.

    The node is written with Print, using the utf-8 encoding, into an
    in-memory buffer whose contents are returned.
    """
    stream = cStringIO.StringIO()
    Print(node, stream=stream, encoding='utf-8')
    return stream.getvalue()
def nodes_to_string(lst):
    """Serialise every node in lst and return the pieces joined together."""
    pieces = [node_to_string(n) for n in lst]
    return ''.join(pieces)
178 ##def last_component(pathname):
179 ## return path.split(pathname)[2]
181 ## def get_automorf_pair(automorf_path):
182 ## try:
183 ## return re.match('([^.]*[.]automorf[.]bin)', last_component(automorf_path))
184 ## except:
185 ## raise Exception("%s is not a valid automorf path" % automorf_path)
187 ## def get_bidix_pair(bidix_path):
188 ## try:
189 ## return re.match('apertium-([^.]*)[.]([^.]*)[.]dix([.]xml', last_component(bidix_path)).group(1)
190 ## except:
191 ## raise Exception("%s is not a valid bidix path" % bidix_path)
def eliminate_duplicates(lst):
    """Return a new list with the items of lst, keeping first occurrences.

    The order of the remaining items is the order in which they first
    appear in lst.  Items must be hashable.
    """
    seen = set()
    unique = []

    for item in lst:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)

    return unique
def main(automorf_path, bidix_path, lt_proc_path=Globals.lt_proc_path, output=None, side='l'):
    """usage: find-missing-monodix-entries.py <options>

    <options> are the following

    --lt_proc_path  - path to lt-proc; the default is /usr/local/bin/lt-proc
    --automorf_path - path to the morphological analysis file used by lt-proc; e.g. af-en.automorf.bin
    --bidix_path    - path to the bidix file to be used
    --side          - either l or r; used to determine which side in the bidix is to be read; the default is l.
    --output        - filename to dump the data to; this defaults to stdout

    Example:
    python find-missing-monodix-entries.py --bidix_path=apertium-en-af.en-af.dix.xml \\
    --automorf_path=/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin --side=r

    Read the bidix 'apertium-en-af.en-af.dix.xml' in the current directory. Use the morphological
    analyser file '/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin'. Read entries from the
    right hand side of the bidix. Everything will be written to stdout.
    """
    # NOTE: the docstring above doubles as the usage message printed by the
    # command line driver, so implementation notes live in comments instead.
    #
    # Flow: record the lt-proc location, load the bidix, collect the entries
    # lt-proc cannot analyse, write the de-duplicated result, then leave the
    # interpreter with status 0 (SystemExit is raised even on success).
    Globals.lt_proc_path = lt_proc_path

    # Only a file opened here is ours to close; sys.stdout must stay usable
    # for whatever runs after this function.
    writing_to_stdout = output is None
    if writing_to_stdout:
        out_file = sys.stdout
    else:
        out_file = open(output, 'w')

    try:
        doc = load_xml_file(bidix_path)
        entries = extract_entries(doc, side)
        missing = find_missing_entries(automorf_path, entries)
        write_entries_to_file(out_file, eliminate_duplicates(missing))
    finally:
        # Release the output file even when one of the steps above raises.
        if not writing_to_stdout:
            out_file.close()

    # sys.exit rather than the bare exit(), which is only injected by the
    # site module and is absent when the interpreter runs without it.
    sys.exit(0)
if __name__ == '__main__':
    # Command line driver.  Every argument has to look like --name=value;
    # the name/value pairs are passed to main() as keyword arguments.
    param_dict = {}

    try:
        param_re = re.compile("[-][-]([_a-zA-Z0-9]*)[=](.*)$")
        for argument in sys.argv[1:]:
            # match is None for a malformed argument, so .group() raises
            # and the usage text is shown by the handler below.
            match = param_re.match(argument)
            param_dict[match.group(1)] = match.group(2)

    except Exception:
        print(main.__doc__)
        exit(1)

    try:
        main(**param_dict)

    except Exception:
        # Report the error first, then remind the user of the usage.
        print(sys.exc_info()[1])
        print(main.__doc__)
        exit(1)