skripte/python/edit_tools/abgleich_teilwoerter.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # Abgleich der Trennstellen zwischen Teilwoertern
   9 # ===============================================
  10 #
  11 # Übertragen von kategorisierten Trennstellen von Teilwörtern auf
  12 # Vorkommen dieser Teilwörter mit unkategorisierten Trennstellen.
  13 #
  14 # ::
  15
  16 from copy import deepcopy
  17 import re, sys, codecs
  18 from wortliste import (WordFile, WordEntry, join_word, udiff,
  19                       uebertrage, TransferError,
  20                       sprachabgleich, toggle_case)
  21 from analyse import read_teilwoerter, teilwoerter
  22
  23 # Sprachvarianten
  24 # ---------------
  25 # Sprach-Tag nach [BCP47]_::
  26
  27 # sprachvariante = 'de-1901'         # "traditionell"
  28 sprachvariante = 'de-1996'         # Reformschreibung
  29 # sprachvariante = 'de-1901-x-GROSS'   # ohne ß (Schweiz oder GROSS)
  30 # sprachvariante = 'de-1996-x-GROSS' # ohne ß (Schweiz oder GROSS)
  31 # sprachvariante = 'de-CH-1901'     # ohne ß (Schweiz) ("süssauer")
  32
  33 # Vergleichsbasis
  34 # ~~~~~~~~~~~~~~~
  35 # Verwende die Wortliste oder die mit ``analyse.py`` generierte Teilwortliste
  36 # als Quelle der kategorisierten Trennungen::
  37
  38 use_teilwoerter = False
  39 # use_teilwoerter = True
  40
  41
  42 # Funktionen
  43 # -----------
  44
  45 # Übertrag kategorisierter Trennstellen aus Teilwort-Datei auf die
  46 # `wortliste`::
  47
  48 def teilwortabgleich(wort, grossklein=False, strict=True):
  49     teile = [teilabgleich(teil, grossklein, strict)
  50              for teil in wort.split(u'=')
  51             ]
  52     return u'='.join(teile)
  53
  54 def teilabgleich(teil, grossklein=False, strict=True):
  55     if grossklein:
  56         return toggle_case(teilabgleich(toggle_case(teil), strict=strict))
  57     try:
  58         key = join_word(teil)
  59     except AssertionError, e:
  60         print e
  61         return teil
  62     if key not in words.trennvarianten:
  63         # print teil, u'not in words'
  64         if grossklein is None:
  65             return toggle_case(teilabgleich(toggle_case(teil), strict=strict))
  66     else:
  67         # Gibt es eine eindeutige Trennung für Teil?
  68         eindeutig = len(words.trennvarianten[key]) == 1
  69         for wort in words.trennvarianten[key]:
  70             # Übertrag der Trennungen
  71             try:
  72                 teil = uebertrage(wort, teil, strict, upgrade=eindeutig)
  73             except TransferError, e: # Inkompatible Wörter
  74                 # print unicode(e)
  75                 if grossklein is None:
  76                     grossklein = True # retry with case toggled
  77     return teil
  78
  79 # "Umgießen" der Wortliste in eine "Teilwörter" Instanz für den
  80 # "Grundwortabgleich" von Wortverbindungen::
  81
  82 def wortliste_to_teilwoerter(wortliste, sprachvariante=sprachvariante):
  83     words = teilwoerter()
  84     for entry in wortliste:
  85         wort = entry.get(sprachvariante)
  86         if wort is None: # Wort existiert nicht in der Sprachvariante
  87             continue
  88         if u'·' not in wort:
  89             words.add(wort)
  90     return words
  91
  92
  93 if __name__ == '__main__':
  94
  95     # sys.stdout mit UTF8 encoding.
  96     sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
  97
  98 # `Wortliste` einlesen::
  99
 100     wordfile = WordFile('../../../wortliste') # ≅ 400 000 Einträge/Zeilen
 101     wortliste = list(wordfile)
 102     wortliste_neu = deepcopy(wortliste)
 103
 104 # Vergleichswörter einlesen::
 105
 106     if use_teilwoerter:
 107         words = read_teilwoerter(path='teilwoerter-%s.txt'%sprachvariante)
 108     else: # Gesamtwörter als "Teilwörter":
 109         words = wortliste_to_teilwoerter(wortliste, sprachvariante)
 110
 111 # Bearbeiten der neuen wortliste "in-place"::
 112
 113     for entry in wortliste_neu:
 114
 115         # Wort mit Trennungen in Sprachvariante
 116         wort = entry.get(sprachvariante)
 117         if wort is None: # Wort existiert nicht in der Sprachvariante
 118             continue
 119         if u'·' not in wort and u'.' not in wort: # Alle Trennstellen kategorisiert
 120             continue
 121
 122 # Teilwortabgleich::
 123
 124         wort2 = teilwortabgleich(wort, grossklein=None, strict=False)
 125
 126 # Eintrag ändern::
 127
 128         if (wort != wort2): #and (u'·' not in wort2):
 129             entry.set(wort2, sprachvariante)
 130             print u'%s -> %s' % (wort, wort2)
 131             if len(entry) > 2:
 132                 sprachabgleich(entry)
 133
 134 # Patch erstellen::
 135
 136     patch = udiff(wortliste, wortliste_neu, 'wortliste', 'wortliste-neu')
 137     if patch:
 138         # print patch
 139         patchfile = open('wortliste.patch', 'w')
 140         patchfile.write(patch + '\n')
 141     else:
 142         print u'empty patch'