skripte/python/abgleich_sprachvarianten.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2011 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # Abgleich der Trennstellen zwischen Sprachvarianten
   9 # ====================================================
  10 #
  11 # * Übertragen von kategorisierten Trennstellen zwischen Sprachvarianten
  12 #   desselben Wortes, und/oder
  13 #
  14 # * Zusammenfassen von Feldern mit gleichem Inhalt wenn das Ergebnis ein
  15 #   wohlgeformter Eintrag ist.
  16 #
  17 # ::
  18
  19 import re, sys, codecs, copy
  20 from werkzeug import WordFile, WordEntry, join_word, udiff, sprachabgleich
  21
  22
  23 # Zusammenfassen von Feldern mit gleichem Inhalt z.B.
  24 #
   25 #      hallo;-2-;hal-lo;hal-lo    --> hallo;hal-lo
  26 #
  27 # in allen Einträgen von `wortliste`.
  28 # Siehe ``WordEntry.conflate_fields()`` in werkzeug.py.
  29 #
  30 # Anwendung 2012-03-13
  31 # (getestet mit ``texlua validate.lua < ../wortliste``)
  32 #
  33 # =========   ======   =======
  34 # Typ         Vorher   Nachher
  35 # ---------   ------   -------
  36 # ua          371807   374614
  37 # uxtr        41156    38349
  38 # =========   ======   =======
  39 #
  40 # ::
  41
  42 def conflate(wortliste):
  43
  44     for entry in wortliste:
  45         if len(entry) <= 2:
  46             continue # allgemeine Schreibung
  47         # Felder zusammenfassen:
  48         entry.conflate_fields()
  49         continue
  50
  51 if __name__ == '__main__':
  52
  53     # sys.stdout mit UTF8 encoding.
  54     sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
  55
  56     # Die `Wortliste`::
  57
  58     wordfile = WordFile('../../wortliste') # ≅ 400 000 Einträge/Zeilen
  59     wortliste = list(wordfile)
  60     wortliste_neu = []
  61
  62     wordfile.seek(0)            # Pointer zurücksetzen
  63     words = wordfile.asdict()
  64
  65     # Bearbeiten der wortliste "in-place"
  66     # conflate(wortliste)
  67
  68     for oldentry in wortliste:
  69         if len(oldentry) <= 2:
  70             wortliste_neu.append(oldentry)
  71             continue
  72         entry = copy.copy(oldentry)
  73         sprachabgleich(entry)
  74         if oldentry == entry and u'ss' in entry[0]:
  75             for w in entry[1:]:
  76                 if not w.startswith(u'-'):
  77                     break
  78             try:
  79                 sprachabgleich(entry, words[join_word(w.replace(u'ss', u'ß'))])
  80             except KeyError:
  81                 # print entry[0].replace(u'ss', u'ß'), "fehlt"
  82                 if entry.get('de-1901-x-GROSS'):
  83                     wort1901 = entry.get('de-1901-x-GROSS')
  84                     wort1901 = wort1901.replace(u'sst', u'ßt')
  85                     wort1901 = wort1901.replace(u'ss=', u'ß=')
  86                     wort1901 = wort1901.replace(u'-ss', u'-ß')
  87                     wort1901 = re.sub(u'ss$', u'ß', wort1901)
  88                     if not u'/' in wort1901 and len(wort1901)>3:
  89                         print u'%s;-2-;%s;-4-' % (join_word(wort1901), wort1901)
  90                 pass  # e.g. "Abfahrtßpezialisten"
  91         if oldentry == entry and u'ß' in entry[0]:
  92             try:
  93                 sprachabgleich(entry, words[entry[0].replace(u'ß', u'ss')])
  94             except KeyError:
  95                 # print entry[0].replace(u'ss', u'ß'), "fehlt"
  96                 pass
  97         if oldentry == entry:
  98             wortliste_neu.append(oldentry)
  99         else:
 100             wortliste_neu.append(entry)
 101
 102
 103
 104     # Patch erstellen::
 105
 106     patch = udiff(wortliste, wortliste_neu, 'wortliste', 'wortliste-neu',
 107                   encoding=wordfile.encoding)
 108     if patch:
 109         # print patch
 110         patchfile = open('wortliste.patch', 'w')
 111         patchfile.write(patch + '\n')
 112     else:
 113         print "empty patch"