skripte/python/expand_teilwoerter.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # Erweitern der Wortliste um Kombinationen von Teilwörtern
   9 # ========================================================
  10 #
  11 # Zerlegen von Komposita an den Wortfugen und Übernahme der Teile als
  12 # eigenständige Einträge.
  13 #
  14 # >>> from expand_teilwoerter import *
  15 #
  16 # ::
  17
  18 import os, re, sys, codecs, copy
  19 from werkzeug import (WordFile, WordEntry, join_word,
  20                       sprachabgleich, toggle_case, sortkey_duden)
  21
  22
  23 # Funktionen
  24 # -----------
  25 #
  26 # Iterator, gibt alle geordneten Teilkombinationen zurück
  27 #
  28 # >>> list(multisplitter(u'test', u'='))
  29 # [u'test']
  30 #
  31 # >>> list(multisplitter(u'a=b', u'='))
  32 # [u'a', u'a=b', u'b']
  33 #
  34 # >>> list(multisplitter(u'a=b=c', u'='))
  35 # [u'a', u'a=b', u'a=b=c', u'b', u'b=c', u'c']
  36 #
  37 # >>> list(multisplitter('a=b=c=d', '='))
  38 # ['a', 'a=b', 'a=b=c', 'a=b=c=d', 'b', 'b=c', 'b=c=d', 'c', 'c=d', 'd']
  39 #
  40 # >>> list(multisplitter(u'a=b==c', u'=='))
  41 # [u'a=b', u'a=b==c', u'c']
  42 #
  43 # >>> list(multisplitter('a=b==c=de', '=='))
  44 # ['a=b', 'a=b==c=de', 'c=de']
  45 #
  46 # >>> list(multisplitter('a=b==c=de', '==='))
  47 # ['a=b==c=de']
  48 #
  49 # >>> list(multisplitter('er[<st/st=]ritt', u'='))
  50 # [u'er[<st/st=]ritt']
  51 # >>> list(multisplitter('Schiff[=s/s=]tau', u'='))
  52 # [u'Schiff[=s/s=]tau']
  53 # >>> list(multisplitter('a{ll/ll=l}ie-bend', u'='))
  54 # [u'a{ll/ll=l}ie-bend']
  55 # >>> list(multisplitter('Be[t=t/{tt/tt=t}]uch', u'='))
  56 # [u'Be[t=t/{tt/tt=t}]uch']
  57 #
  58 # ::
  59
  60 def multisplitter(wort, sep):
  61     specials = re.findall(ur'\[.*%s.*\]|\{[^}]*%s[^}]*\}'%(sep,sep), wort)
  62     for sp in specials:
  63         wort = wort.replace(sp, sp.replace(sep, '*'))
  64     parts = wort.split(sep)
  65     length = len(parts)
  66     for start in range(length):
  67         for end in range(start+1, length+1):
  68             part = sep.join(parts[start:end])
  69             if specials:
  70                 part = part.replace('*', sep)
  71             yield part
  72
  73
  74 # Gib eine Liste möglicher Zerlegungen eines Kompositums zurück.
  75 # Berücksichtige dabei die Bindungsstärke bis zum Level 3
  76 # ("===", zur Zeit höchste Auszeichnung in der Wortliste).
  77 #
  78 # >>> multisplit(u'test')
  79 # [u'test']
  80 #
  81 # >>> multisplit(u'a=b')
  82 # [u'a', u'a=b', u'b']
  83 #
  84 # >>> multisplit(u'a=b=c')
  85 # [u'a', u'a=b', u'a=b=c', u'b', u'b=c', u'c']
  86 #
  87 # >>> multisplit(u'a==b=c')
  88 # [u'a', u'a==b=c', u'b', u'b=c', u'c']
  89 #
  90 # >>> multisplit(u'a==b=c==d')
  91 # [u'a', u'a==b=c', u'a==b=c==d', u'b', u'b=c', u'c', u'b=c==d', u'd']
  92 #
  93 # >>> for w in multisplit(u'Brenn=stoff==zel-len===an<trieb'):
  94 # ...    print w
  95 # Brenn
  96 # Brenn=stoff
  97 # Stoff
  98 # Brenn=stoff==zel-len
  99 # Zel-len
 100 # Brenn=stoff==zel-len===an<trieb
 101 # An<trieb
 102 #
 103 # ::
 104
 105 def multisplit(wort):
 106     parts = []
 107     for p3 in multisplitter(wort, u'==='):
 108         if u'===' in p3:
 109             parts.append(p3)
 110             continue
 111         for p2 in multisplitter(p3, u'=='):
 112             if u'==' in p2:
 113                 parts.append(p2)
 114                 continue
 115             p2 = p2.replace(u'<=', u'<')
 116             p2 = p2.replace(u'=>', u'>')
 117             for p1 in multisplitter(p2, u'='):
 118                 parts.append(p1)
 119     if wort[:2].istitle():
 120         parts = [part[0].title() + part[1:] for part in parts]
 121     return parts
 122
 123 # Gib eine Liste von allen (sinnvollen) Zerlegungen eines WordEntry zurück
 124 #
 125 # >>> from werkzeug import WordEntry
 126 #
 127 # >>> split_entry(WordEntry(u'Aachen;Aa-chen'))
 128 # [[u'Aachen', u'Aa-chen']]
 129 # >>> aalbestand = WordEntry(u'Aalbestand;Aal=be<stand')
 130 # >>> split_entry(aalbestand)
 131 # [[u'Aal', u'Aal'], [u'Aalbestand', u'Aal=be<stand'], [u'Bestand', u'Be<stand']]
 132 #
 133 # >>> godi = WordEntry(u'Abendgottesdienste;-2-;Abend==got-tes=dien-ste;Abend==got-tes=diens-te')
 134 # >>> for entry in split_entry(godi):
 135 # ...     print entry
 136 # Abend;-2-;Abend;Abend
 137 # Abendgottesdienste;-2-;Abend==got-tes=dien-ste;Abend==got-tes=diens-te
 138 # Gottes;-2-;Got-tes;Got-tes
 139 # Gottesdienste;-2-;Got-tes=dien-ste;Got-tes=diens-te
 140 # Dienste;-2-;Dien-ste;Diens-te
 141 #
 142 # >>> bb = WordEntry(u'Biberbettuch;-2-;Bi-ber==be[t=t/{tt/tt=t}]uch')
 143 # >>> for entry in split_entry(bb):
 144 # ...     print entry
 145 # Biber;-2-;Bi-ber
 146 # Biberbettuch;-2-;Bi-ber==be[t=t/{tt/tt=t}]uch
 147 # Bettuch;-2-;Be[t=t/{tt/tt=t}]uch
 148 #
 149 # ::
 150
 151 def split_entry(entry):
 152
 153     entries = []
 154
 155     for col in range(1, len(entry)):
 156         wort = entry[col]
 157         if u'=' not in wort:
 158             continue # nichts zu splitten
 159         parts = multisplit(wort)
 160
 161         # Kopien des Originaleintrags erstellen
 162         if not entries:
 163             for part in parts:
 164                 entries.append(copy.copy(entry))
 165                 entries[-1][0] = join_word(part)
 166         # Entries ausfüllen
 167         for i in range(len(parts)):
 168             entries[i][col] = parts[i]
 169
 170     if entries:
 171         for e in entries:
 172             e.conflate_fields() # Sprachabgleich
 173         return entries
 174     else:
 175         return [entry]
 176
 177 # Gib ein Dictionary mit Einträgen der Wortliste und Teilwortkombinationen zurück:
 178
 179 def expand_wordfile(wordfile):
 180     words = {}  # Wörter aus der Liste
 181
 182     for entry in wordfile:
 183         try:
 184             entries = split_entry(entry)
 185         except IndexError:  # unterschiedliche Zerlegung je nach Sprache
 186             # print "problematisch", unicode(entry)
 187             words[entry[0]] = entry
 188             continue
 189
 190         for e in entries:
 191             if (len(entries) == 1
 192                 or (e[0].lower() not in words and e[0].title() not in words)
 193                 or len(e[0]) <= 3 and len(e) == 2 # kurze einfache Wörter
 194                ):
 195                 words[e[0]] = e
 196
 197     return words
 198
 199 def exists(wort):
 200     key = join_word(wort)
 201     return (key.title() in words) or (key.lower() in words) or (len(wort)<4)
 202
 203 if __name__ == '__main__':
 204
 205     # sys.stdout mit UTF8 encoding.
 206     sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
 207
 208 # `Wortliste` einlesen::
 209
 210     wordfile = WordFile('../../wortliste') # ≅ 400 000 Einträge/Zeilen
 211
 212 # Wichtung::
 213
 214     # sprachvariante = 'de-1996'
 215     # sprachvariante = 'de-1901'
 216     # words = wordfile.asdict()
 217     # for entry in words.itervalues():
 218     #     wort = entry.get(sprachvariante)
 219     #     if not wort:
 220     #         continue
 221     #     if (u'<=' in wort or u'=>' in wort or u'==' in wort):
 222     #         continue
 223     #     parts = [part for part in multisplit(wort)
 224     #              if u'=' not in part]
 225     #     if len(parts) == 3:
 226     #         if parts[1] == u'zu':
 227     #             continue
 228     #         if (exists(parts[0]) and exists(''.join(parts[1:]))
 229     #             and not(exists(parts[-1])
 230     #                     and (exists(''.join(parts[:-1])) or exists(''.join(parts[:-1])+u's')))
 231     #            ):
 232     #             for i in range(1,len(parts)):
 233     #                 parts[i] = parts[i].lower()
 234     #             wort = u'=='.join([parts[0], u'='.join(parts[1:])])
 235     #             entry.set(wort, sprachvariante)
 236     #             sprachabgleich(entry)
 237     #             print unicode(entry)
 238     #
 239     # sys.exit()
 240
 241 # expandieren und Speichern::
 242
 243     words = expand_wordfile(wordfile)
 244
 245     print len(words), "expandiert"
 246
 247     outfile = open('wortliste-expandiert', 'w')
 248
 249     for entry in sorted(words.values(), key=sortkey_duden):
 250         outfile.write(str(entry))
 251         outfile.write('\n')