From ebef0f326a04c0e35d6aed246a413be50304b37c Mon Sep 17 00:00:00 2001
From: Guenter Milde <milde@users.sf.net>
Date: Mon, 24 Mar 2014 22:31:59 +0100
Subject: [PATCH] Python-Skript Update.

---
 skripte/python/abgleich_endungen.py         |  2 ++
 skripte/python/analyse.py                   | 23 ++++++++++++++---------
 skripte/python/vorsilben_in_teilwoertern.py |  6 +++---
 skripte/python/werkzeug.py                  |  5 ++---
 skripte/python/wortfugen.py                 | 28 ++++++++++++++++------------
 5 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/skripte/python/abgleich_endungen.py b/skripte/python/abgleich_endungen.py
index 64b8f2c..e649524 100644
--- a/skripte/python/abgleich_endungen.py
+++ b/skripte/python/abgleich_endungen.py
@@ -168,6 +168,7 @@ endungen = [
             # (u'bar', u't'),
             # (u'ce', u'-cen'),
             (u'ch', u'·che'),
+            # (u'ch', u'-che'), # Test "if u'·' not in wort" auskommentieren!
             (u'ch', u'·cher'),
             (u'ck', u'·cke'),
             (u'ck', u'·cker'),
@@ -349,6 +350,7 @@ if __name__ == '__main__':
 
         # Endungsabgleich:
         for alt, neu in endungen:
+
             wort2 = endungsabgleich(wort, endung=neu, vergleichsendung=alt,
                                     use_grundwort=use_grundwort,
                                     grossklein=grossklein
diff --git a/skripte/python/analyse.py b/skripte/python/analyse.py
index f55357b..997c8e8 100644
--- a/skripte/python/analyse.py
+++ b/skripte/python/analyse.py
@@ -78,6 +78,7 @@ class teilwoerter(object):
 # >>> words.add(u'ein<tra·gen')
 # >>> words.add(u'ein·tra-gen')
 # >>> words.add(u'un<klar')
+# >>> words.add(u'un<.klar')
 # >>> words.add(u'un-klar')
 # >>> print words.trennvarianten
 # {u'unklar': [u'un<klar', u'un-klar'], u'eintragen': [u'ein<tra-gen']}
@@ -173,10 +174,16 @@ def read_teilwoerter(path):
 # Hilfsfunktion: Erkenne (Nicht-)Teile wie ``/{ll/ll``  aus
 # ``Fuß=ba[ll=/{ll/ll=l}]eh-re``::
 
+# >>> from analyse import spezialbehandlung
+# >>> print spezialbehandlung(u']er.be')
+# er.be
+# >>> print spezialbehandlung(u'er[<b/b')
+# erb
+
 def spezialbehandlung(teil):
     if re.search(ur'[\[{/\]}]', teil):
         # print teil,
-        teil = teil.replace(u'er[<st/st', 'erst')
+        teil = re.sub(ur'\[<(.+)/[^\]]+', ur'\1', teil) # z.B. er[<b/b -> erb
         teil = re.sub(ur'\{([^/]*)[^}]*$', ur'\1', teil)
         teil = re.sub(ur'\[([^/]*)[^\]]*$', ur'\1', teil)
         teil = re.sub(ur'^(.)}', ur'\1', teil)
@@ -206,10 +213,6 @@ def analyse(path='../../wortliste', sprachvariante='de-1901',
 
 # Teilwörter suchen::
 
-        # Suffixe für Wortverbindung (z.B.an-dert=halb>=fach): verwerfen:
-        if wort.find(u'>=') != -1:
-            continue
-
         # Zerlegen, leere Teile (wegen Mehrfachtrennzeichen '==') weglassen,
         # "halbe" Spezialtrennungen entfernen:
         teile = [spezialbehandlung(teil) for teil in wort.split(u'=')
@@ -234,7 +237,7 @@ def analyse(path='../../wortliste', sprachvariante='de-1901',
         # letztes Teilwort:
         teil = teile[-1]
         if (halbfertig or u'·' not in teil
-           ) and not teil.endswith(u'<'): # Präfix wie un<=wahr=schein-lich
+           ) and not teil.startswith(u'>'): # Suffixe wie an-dert=halb=>fach
             if gross: # Großschreibung übertragen
                 teil = teil[0].title() + teil[1:]
             words.add(teil)
@@ -333,10 +336,12 @@ def mehrdeutigkeiten(words):
     for teil in sorted(words.trennvarianten):
         if len(words.trennvarianten[teil]) == 1:
             continue
-        # Bekannte Mehrdeutigkeiten:
-        if teil in ('Anhalts', 'Base',  'George', 'Mode', 'Name',
+        # Bekannte Mehrdeutigkeiten (meist engl./dt.):
+        if teil in ('Anhalts', 'Base',  'George',
+                    'herzog', # Her-zog/her>zog
+                    'Mode', 'Name',
                     'Page', 'Pole', 'Planes', 'Rate', 'Real',
-                    'Spare', 'Station', 'Wales', 'Ware',
+                    'Spare', 'Station', 'Stations', 'Wales', 'Ware',
                     'griff' # gri[f-f/{ff/ff=f}]est
                    ):
             continue
diff --git a/skripte/python/vorsilben_in_teilwoertern.py b/skripte/python/vorsilben_in_teilwoertern.py
index d1aae56..de454d3 100644
--- a/skripte/python/vorsilben_in_teilwoertern.py
+++ b/skripte/python/vorsilben_in_teilwoertern.py
@@ -12,7 +12,7 @@
 #
 # Suche nach Wörtern beginnend mit::
 
-term = u'an'  # Angabe mit Trennzeichen, z.B. 'pa-ra'
+term = u'neo'  # Angabe mit Trennzeichen, z.B. 'pa-ra'
 
 # in der Datei ``teilwoerter-<sprachtag>.txt`` und analysiere das
 # folgende (Teil)wort. Schreibe Änderungen in die Datei ``teilwoerter.patch``
@@ -213,13 +213,13 @@ for line in teilwoerter:
 #
 # Ausnahmen aus der Ausnahmeliste::
 
-    if join_word(kandidat+rest) in ausnahmen:
+    if join_word(kandidat+rest.split(u'>')[0]) in ausnahmen:
         ist_ausnahme.append(wort)
 
 # Ausnahme Praefixkandidat + Suffix (z.B. ein>fach)::
 
     elif rest.startswith('>'):
-        pass
+        ist_ausnahme.append(wort)
 
 # ::
 
diff --git a/skripte/python/werkzeug.py b/skripte/python/werkzeug.py
index b242cb6..8ec69e5 100755
--- a/skripte/python/werkzeug.py
+++ b/skripte/python/werkzeug.py
@@ -605,9 +605,8 @@ def uebertrage(wort1, wort2, strict=True, upgrade=True):
         if ((t2 == u'·' and t1 != u'.') # unspezifisch
             or upgrade and
             ((t2 in (u'-', u'<') and t1 in (u'<', u'<<', u'<=')) # Praefixe
-             or (t2 in (u'-', u'<', u'<<') and t1 == u'<<<')     # Praefixe
-             or (t2 in (u'-', u'>') and t1 in (u'>', u'>>', u'>=')) # Suffixe
-             or t1 in (u'=', u'==', u'===')                      # Wortfugen
+             or (t2 in (u'-', u'>') and t1 in (u'>', u'>>', u'=>')) # Suffixe
+             or (t2 in (u'-', u'=') and t1 in (u'=', u'==', u'===')) # W-fugen
             )
            ):
             wort3 += t1
diff --git a/skripte/python/wortfugen.py b/skripte/python/wortfugen.py
index bf38616..f524a87 100755
--- a/skripte/python/wortfugen.py
+++ b/skripte/python/wortfugen.py
@@ -27,6 +27,7 @@ from copy import deepcopy
 
 from werkzeug import WordFile, join_word, udiff
 from analyse import read_teilwoerter, teilwoerter
+from abgleich_teilwoerter import wortliste_to_teilwoerter
 
 # sys.stdout mit UTF8 encoding.
 sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
@@ -59,8 +60,8 @@ sprachvariante = 'de-1996'         # Reformschreibung
 # Verwende die Wortliste oder die mit ``analyse.py`` generierte Teilwortliste 
 # als Quelle der kategorisierten Trennungen::
 
-# use_teilwoerter = False
-use_teilwoerter = True
+use_teilwoerter = False
+# use_teilwoerter = True
 
 
 # Textdateien mit Wortbestandteilen
@@ -84,10 +85,10 @@ if sprachvariante == 'de-1901':
 else:
     wgerman = set(w for w in wortdatei('/usr/share/dict/ngerman'))
 
-# Entferne Silben, die nie in Wortverbindungen vorkommen
+# Entferne Silben, die (fast) nie in Wortverbindungen vorkommen
 # TODO: Solitäre aus einer Datei lesen. ::
 
-for solitaer in ('Ra', 'He', 'As', 'Co', 'Fa'):
+for solitaer in ('Ra', 'He', 'As', 'Co', 'Fa', 'Em', 'Os', 'baren', 'RAF'):
     wgerman.discard(solitaer)
 
 # Präfixe (auch als Präfix verwendete Partikel, Adjektive, ...)::
@@ -132,7 +133,9 @@ if use_teilwoerter:
     words = read_teilwoerter(path='teilwoerter-%s.txt'%sprachvariante)
 else: # Gesamtwörter als "Teilwörter":
     words = wortliste_to_teilwoerter(wortliste, sprachvariante)
-words = words.trennvarianten
+words = set(words.trennvarianten.keys())
+
+words.update(wgerman)
 
 
 # 2. Durchlauf: Analyse
@@ -198,13 +201,14 @@ for entry in wortliste_neu:
 
 # Komposita::
 
-        if ((erstkey in words 
-             or erstkey.lower() in words
-             or erstkey.upper() in words)
-            and erstkey not in erstsilben
-            and erstkey.lower() not in vorsilben
+        if (#(erstkey in words 
+             # or erstkey.lower() in words
+             # or erstkey.upper() in words)
+            # and erstkey not in erstsilben
+            # and erstkey.lower() not in vorsilben
             # and erstkey.lower() not in praefixe
-            and (zweitkey in words
+            # and 
+            (zweitkey in words
                  or zweitkey.lower() in words
                  or zweitkey.upper() in words)
             and zweitkey.lower() not in endsilben
@@ -282,4 +286,4 @@ if patch:
     patchfile = open('wortliste.patch', 'w')
     patchfile.write(patch + '\n')
 else:
-    print 'keine Änderungen'
+    print u'keine Änderungen'
-- 
2.11.4.GIT