trunk/apertium-tools/find-missing-monodix-entries.py

   1 #!/usr/bin/env python
   2 # coding=utf-8
   3 # -*- encoding: utf-8 -*-
   4
   5 import os
   6 import os.path as path
   7 import re
   8 import sys
   9 import cStringIO
  10 import pprint
  11 from itertools import izip
  12 from subprocess import Popen, PIPE
  13
  14 # In order to use this script, you need to install 4Suite's cDomlette.
  15 from Ft.Xml.Domlette import NonvalidatingReader;
  16 from Ft.Xml.Domlette import Print, PrettyPrint;
  17 from Ft.Xml.XPath import Evaluate;
  18
  19 """
  20 This tool is used to find entries which are in the bidix but not in a monodix.
  21
  22 If a word is in a monodix, then lt-proc will produce one or more morphological
  23 analyses for the word. For example, the Afrikaans word 'vat' is both a noun and
  24 a verb. The Afrikaans monodix contains both these meanings. Therefore, when
  25 we feed 'vat' to 'lt-proc af-en.automorf.bin', it will produce:
  26   ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$^./.<sent>$
  27
  28 In the bidix, we are likely to have two entries:
  29   <e><p><l>barrel<s n=\"n\"/></l><r>vat<s n=\"n\"/></r></p></e>
  30 and
  31   <e><p><l>take<s n=\"vblex\"/></l><r>vat<s n=\"vblex\"/></r></p></e>
  32
  33 So, we can extract 'vat<n>' and 'vat<vblex>' from the bidix. One can see that
  34   vat<n> is a substring of vat<n><sg>
  35 and that
  36   vat<vblex> is a substring of vat<vblex><pres> as well as vat<vblex><inf>
  37
  38 In other words, if a bidix definition such as 'vat<n>' has a corresponding
  39 monodix entry, then the morphological analyser, when given the word 'vat', must
  40 produce at least one string of which 'vat<n>' will be a substring. In the
  41 above example, that string is 'vat<n><sg>'.
  42
  43 Conversely, if the morphological analyser produces no such string, then
  44 there is no corresponding monodix entry. This is exactly the property we use
  45 to find the missing monodix entries.
  46 """
  47
  48
  49 # Change this for your language
  50 template = {
  51     '<vblex>'  : '<par n="breek__vblex"/>',
  52     '<adj>'    : '<par n="dadelik__adj"/>',
  53     '<n>'      : '<par n="artikel__n"/>',
  54     '<n><unc>' : '<par n="poësie__n__unc"/>',
  55     '<np>'     : '<par n="Engeland__np"/>',
  56     '<adv>'    : '<par n="miskien__adv"/>',
  57     '<preadv>' : '<par n="taamlik__preadv"/>'
  58 }
  59
  60 # Default global values
  61 class Globals:
  62     lt_proc_path  = '/usr/local/bin/lt-proc' # default location for lt-proc
  63
  64 # Parse an XML document and return a DOM tree.
  65 def load_xml_file(filename):
  66     return NonvalidatingReader.parseUri('file://' + os.path.realpath(filename))
  67
  68
  69 def extract_entries(doc, side='l'):
  70     return doc.xpath("/dictionary/section[@id='main']/e/p/" + side)
  71
  72 def make_input_list(lst):
  73     """lst is list of DOM nodes. Get all the child text nodes for each
  74     node and concatenate the text in those nodes. Form a list of these
  75     new strings.
  76
  77     If n is the tree <p>foo</b>bar</p>, we will extract 'foobar' from n.
  78     One should probably take special care of the <b/> tags; this is not
  79     currently done.
  80     """
  81     return [nodes_to_string(node.xpath('./text()')) for node in lst]
  82
  83 pattern = re.compile(r'<s n="([a-zA-Z0-9]*)"/>')
  84 def transform_tags(tag):
  85     """Change a tag of the form <s n=\"tagname\"> to <tagname>"""
  86     return pattern.sub(r'<\1>', tag)
  87
  88 def make_compare_list(lst):
  89     return [transform_tags(nodes_to_string(node.childNodes)) for node in lst]
  90
  91 def process_output(output):
  92     """Split a set of newline separated morphological analyses,
  93 and for each morphological analysis, strip off '^' and '$'
  94 and split the analysis along the character '/'. This creates
  95 a list of lists.
  96
  97     For example,
  98       ^vat/vat<n><sg>/vat<vblex><pres>/vat<vblex><inf>$
  99       ^kla/kla<vblex><pres>/kla<vblex><inf>$
 100       ^nag/nag<n><sg>$
 101
 102     will be split into
 103       [['vat', 'vat<n><sg>', 'vat<vblex><pres>', 'vat<vblex><inf>'],
 104        ['kla', 'kla<vblex><pres>', 'kla<vblex><inf>'],
 105        ['nag', 'nag<n><sg>']]"""
 106     return [line.strip('^$').split('/') for line in output.split('\n')]
 107
 108 def extract_tags(s):
 109     """Given an entry such as: 'foo<bar><baz>', return ('foo', '<bar><baz>').
 110     We ignore <g> and <b/> tags. Thus, we will split 'foo<g><b/>bar</g><baz>'
 111     into ('foo<g><b/>bar</g>', '<baz>')."""
 112     i = 0
 113     while True:
 114         if s[i] == '<':
 115             if s[i+1:i+3] == 'g>':
 116                 i = i + 2
 117
 118             elif s[i+1:i+4] in ('/g>', 'b/>'):
 119                 i = i + 3
 120
 121             else:
 122                 return s[0:i], s[i:]
 123
 124         i += 1
 125
 126 def call(name, input=''):
 127     """A convenience function to invoke a subprocess with the
 128     parameter list name (where the first argument is the name
 129     of an executable). The subprocess is fed the contents of
 130     input via stdin. We collect the output of both stdout and
 131     stderr from the subprocess. If stderr is not empty, we
 132     raise an exception, otherwise we return the contents of
 133     stdout."""
 134     proc = Popen(name, stdin=PIPE, stdout=PIPE, stderr=PIPE)
 135     out, err = proc.communicate(input)
 136
 137     if not (err == None or err == ''):
 138         raise Exception(err)
 139
 140     return out
 141
 142 # Execute lt-proc
 143 def run_ltproc(morfo_path, input_list):
 144     out = call([Globals.lt_proc_path, morfo_path], "\n".join(input_list))
 145     return process_output(out)
 146
 147 def find_missing_entries(morfo_path, lst):
 148     i = make_input_list(lst)
 149     c = make_compare_list(lst)
 150     o = run_ltproc(morfo_path, i)
 151
 152     missing = []
 153
 154     for entry, morpho_entries in izip(c, o):
 155         matches = [e for e in morpho_entries if e.startswith(entry)]
 156
 157         if len(matches) == 0:
 158             missing.append(entry)
 159
 160     return missing
 161
 162 def write_entries_to_file(f, lst):
 163     for lemma, tags in (extract_tags(e) for e in lst):
 164         if tags in template:
 165             f.write(u'    <e lm="%(lemma)s"><i>%(lemma)s</i>%(template)s</e>\n' % { 'lemma': lemma,
 166                                                                                     'template': template[tags] })
 167
 168 def node_to_string(node):
 169        buf = cStringIO.StringIO();
 170        Print(node, stream=buf, encoding='utf-8');
 171        return buf.getvalue();
 172
 173 def nodes_to_string(lst):
 174     return ''.join(node_to_string(n) for n in lst)
 175
 176
 177
 178 ##def last_component(pathname):
 179 ##     return path.split(pathname)[2]
 180
 181 ## def get_automorf_pair(automorf_path):
 182 ##     try:
 183 ##         return re.match('([^.]*[.]automorf[.]bin)', last_component(automorf_path))
 184 ##     except:
 185 ##         raise Exception("%s is not a valid automorf path" % automorf_path)
 186
 187 ## def get_bidix_pair(bidix_path):
 188 ##     try:
 189 ##         return re.match('apertium-([^.]*)[.]([^.]*)[.]dix([.]xml', last_component(bidix_path)).group(1)
 190 ##     except:
 191 ##         raise Exception("%s is not a valid bidix path" % bidix_path)
 192
 193
 194
 195
 196 def eliminate_duplicates(lst):
 197     seen_before = {}
 198     new_lst = []
 199
 200     for item in lst:
 201         if item not in seen_before:
 202             seen_before[item] = True
 203             new_lst.append(item)
 204
 205     return new_lst
 206
 207
 208 def main(automorf_path, bidix_path, lt_proc_path=Globals.lt_proc_path, output=None, side='l'):
 209     """usage: find-missing-monodix-entries.py <options>
 210
 211     <options> are the following
 212
 213     --lt_proc_path  - path to lt-proc; the default is /usr/local/bin/lt-proc
 214     --automorf_path - path to the morphological analysis file used by lt-proc; e.g. af-en.automorf.bin
 215     --bidix_path    - path to the bidix file to be used
 216     --side          - either l or r; used to determine which side in the bidix is to be read; the default is l.
 217     --output        - filename to dump the data to; this defaults to stdout
 218
 219     Example:
 220       python find-missing-monodix-entries.py --bidix_path=apertium-en-af.en-af.dix.xml \\
 221              --automorf_path=/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin --side=r
 222
 223       Read the bidix 'apertium-en-af.en-af.dix.xml' in the current directory. Use the morphological
 224       analyser file '/usr/local/share/apertium/apertium-en-af/af-en.automorf.bin'. Read entries from the
 225       right hand side of the bidix. Everything will be written to stdout.
 226     """
 227
 228     Globals.lt_proc_path = lt_proc_path
 229     out_file = None
 230
 231     if output == None:
 232         out_file = sys.stdout
 233     else:
 234         out_file = open(output, 'w')
 235
 236     doc = load_xml_file(bidix_path)
 237     entries = extract_entries(doc, side)
 238     missing = find_missing_entries(automorf_path, entries)
 239     write_entries_to_file(out_file, eliminate_duplicates(missing))
 240     out_file.close()
 241
 242     exit(0)
 243
 244
 245 if __name__ == '__main__':
 246     param_dict = {}
 247
 248     try:
 249         param_re = re.compile("[-][-]([_a-zA-Z0-9]*)[=](.*)$")
 250         param_dict = dict((match.group(1), match.group(2)) for match in (param_re.match(p) for p in sys.argv[1:]))
 251
 252     except Exception, e:
 253         print main.__doc__
 254         exit(1)
 255
 256     try:
 257         main(**param_dict)
 258
 259     except Exception, e:
 260         print e
 261         print main.__doc__
 262         exit(1)
 263
 264
 265
 266
 267
 268
 269
 270