skripte/python/trennstellenkategorisierung/abgleich_suffixe.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # Abgleich der Trennstellen zwischen Woertern mit unterschiedlichem Suffix
   9 # ========================================================================
  10 #
  11 # Übertragen von kategorisierten Trennstellen von "Wortresten" nach Abtrennen
  12 # der Suffixe auf Vorkommen dieser Wortteile mit anderem Suffix.
  13 # ::
  14
  15 import re, sys, codecs
  16 import difflib
  17 from wortliste import (WordFile, WordEntry, join_word, uebertrage, TransferError, sprachabgleich, toggle_case)
  18 from analyse import read_teilwoerter, teilwoerter
  19 from abgleich_praefixe import udiff
  20 # Sprachvarianten
  21 # ---------------
  22 # Sprach-Tag nach [BCP47]_::
  23
  24 # sprachvariante = 'de-1901'         # "traditionell"
  25 sprachvariante = 'de-1996'         # Reformschreibung
  26 # sprachvariante = 'de-1901-x-GROSS'   # ohne ß (Schweiz oder GROSS)
  27 # sprachvariante = 'de-1996-x-GROSS' # ohne ß (Schweiz oder GROSS)
  28 # sprachvariante = 'de-CH-1901'     # ohne ß (Schweiz) ("süssauer")
  29
  30 # Funktionen
  31 # -----------
  32
  33 # Abtrennen von Suffixen und Eintrag aller (Teil-) Wörter in eine neue
  34 # ``teilwoerter`` Instanz::
  35
  36 def find_stems(words):
  37     stems = teilwoerter()
  38     for line in words:
  39         if u'·' in line:
  40             continue
  41         word = line.split()[0]
  42         # Wis-sen>schaft>lich>keit -> [Wis-sen, Wis-sen>schaft, Wis-sen>schaft>lich]
  43         parts = []
  44         for part in word.split(u'>'):
  45             parts.append(part)
  46             teil = u'>'.join(parts)
  47             stems.add(teil)
  48     return stems
  49
# Compare the part of the word before the first '>' with ``stems``::
  51
  52 def suffixabgleich(wort, grossklein=False):
  53
  54     teile = wort.split('>')
  55     stamm = teile[0]
  56     key = join_word(stamm)
  57     # print u' '.join([wort, key])
  58     if grossklein:
  59         key = toggle_case(key)
  60
  61     if key in stems.trennvarianten:
  62         # print u'fundum', key, teile
  63         for altstamm in stems.trennvarianten[key]:
  64             if u'·' in altstamm:
  65                 continue
  66             if grossklein:
  67                 altstamm = toggle_case(altstamm)
  68             try:
  69                 neustamm = uebertrage(altstamm, stamm)
  70                 # print u'alt/neu', wort, altstamm, neustamm
  71                 teile[0] =  neustamm
  72                 break
  73             except TransferError, e:
  74                 print unicode(e)
  75
  76     return u'>'.join(teile)
  77
  78
  79
  80 if __name__ == '__main__':
  81
  82     # sys.stdout mit UTF8 encoding.
  83     sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
  84
  85 # Teilwörter einlesen::
  86
  87     wordfile = open('teilwoerter-%s.txt'%sprachvariante)
  88     # 1. Zeile ist Kommentar:
  89     comment = wordfile.readline().decode('utf8')
  90     words = [line.decode('utf8') for line in wordfile]
  91
  92 # Vorsilben abtrennen::
  93
  94     stems = find_stems(words)
  95
  96 # Erstellen der neuen wortliste
  97 # =============================
  98 # ::
  99
 100     words2 = []
 101
 102     for line in words:
 103
 104 # Alle Trennstellen kategorisiert oder kein (markierter) Suffix::
 105
 106         if (u'·' not in line) or (u'>' not in line):
 107             words2.append(line)
 108             continue
 109
 110 # Parsen::
 111
 112         fields = line.split(' ')
 113         wort = fields[0]
 114
 115 # Suffixabgleich::
 116
 117         wort2 = suffixabgleich(wort)
 118         if wort2 == wort:
 119             wort2 = suffixabgleich(wort, grossklein=True)
 120         fields[0] = wort2
 121         words2.append(' '.join(fields))
 122
 123 # Rückmeldung::
 124
 125         if (wort != wort2): #and (u'·' not in wort2):
 126             print u'%s -> %s' % (wort, wort2)
 127
 128 # Patch erstellen::
 129
 130     words.insert(0, comment)
 131     words2.insert(0, comment)
 132     patch = udiff(words, words2, 'teilwoerter', 'teilwoerter-neu')
 133     if patch:
 134         # print patch
 135         patchfile = open('teilwoerter.patch', 'w')
 136         patchfile.write(patch + '\n')
 137     else:
 138         print u'empty patch'