skripte/python/edit_tools/wortzerlegung.py

   1 #!/usr/bin/env python3
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # Erweitern der Wortliste um Kombinationen von Teilwörtern
   9 # ========================================================
  10 #
  11 # ::
  12
  13 """Zerlegen von Komposita an den Wortfugen.
  14 """
  15
  16 # >>> from wortzerlegung import *
  17 #
  18 # ::
  19
  20 import codecs, collections, copy, argparse, os, re, sys, time
  21 from wortliste import (WordFile, WordEntry, join_word,
  22                        toggle_case, sortkey_duden)
  23
  24
  25 # Klasse für die rekursive Zerlegung eines Kompositums
  26 #
  27 # >>> print(Compound("Kunst==hand=wer-ker===markt"))
  28 # (Kunst + (hand + wer-ker)) + markt
  29 #
  30 # >>> print(Compound("Ma-kro<=le-be=we-sen"))
  31 # Ma-kro- + (le-be + we-sen)
  32 # >>> print(Compound("Ur<=ur<=gross=va-ter"))
  33 # Ur- + (ur- + (gross + va-ter))
  34 # >>> print(Compound("E·lek-tro<==trieb==fahr=zeug"))
  35 # E·lek-tro- + (trieb + (fahr + zeug))
  36 # >>> print(Compound("drei==ein=halb===mil-li.o-nen===>fach"))
  37 # ((drei + (ein + halb)) + mil-li.o-nen) + -fach
  38 # >>> print(Compound("be<=gut=acht=>bar"))
  39 # be- + (gut + acht) + -bar
  40 # >>> print(Compound("An<=al-.pha=be-ten==>tum"))
  41 # (An- + (al-.pha + be-ten)) + -tum
  42 # >>> print(Compound("Brand==o·ber<=amts=rat"))
  43 # Brand + (o·ber- + (amts + rat))
  44 #
  45 # >>> print(Compound("Kub<ok-ta<e·der=stumpf"))
  46 # (Kub- + (ok-ta- + e·der)) + stumpf
  47 # >>> print(Compound("Dop-pel=in<kli-no<<me-ter"))
  48 # Dop-pel + ((in- + kli-no-) + me-ter)
  49 #
  50 # >>> print(Compound("ver<ein>heit>li-chen"))
  51 # ver- + (ein + -heit) + -li-chen
  52 #
  53 # >>> for i in Compound("Dop-pel=in<kli-no<<me-ter").walk():
  54 # ...      print(repr(i))
  55 # Compound("Dop-pel=in<kli-no<<me-ter")
  56 # Compound("Dop-pel")
  57 # 'Dop-pel'
  58 # Compound("in<kli-no<<me-ter")
  59 # PCompound("in<kli-no")
  60 # Prefix('in')
  61 # Compound("kli-no")
  62 # 'kli-no'
  63 # Compound("me-ter")
  64 # 'me-ter'
  65 #
  66 # >>> for i in Compound("An<ge<legen>heit").walk(parts=False):
  67 # ...      print(repr(i))
  68 # Compound("An<ge<legen>heit")
  69 # Compound("ge<legen")
  70 # Compound("legen")
  71 #
  72 # >>> for i in Compound("An<ge<legen>heit").walk(compounds=False):
  73 # ...      print(i)
  74 # An-
  75 # ge-
  76 # legen
  77 # -heit
  78 #
  79 # ::
  80
  81 class Compound(list):
  82
  83     def __init__(self, word):
  84         self.source = word
  85         self.level = level = compound_level(word)
  86         sep = '='*level # (=, ==, ===, ...)
  87
  88         # Affixe
  89         prefix = []
  90         suffix = []
  91         if ('<<' + sep) in word: # Wörter mit mehrteiligem Präfix
  92             prefix, word = word.split('<<'+sep, maxsplit=1)
  93             prefix = [PCompound(prefix)]
  94         elif ('<' + sep) in word[:-1]:
  95             prefix, word = word.split('<'+sep, maxsplit=1)
  96             prefix = [Prefix(prefix)]
  97         if (sep + '>') in word:
  98             word, suffix = word.rsplit(sep+'>', maxsplit=1)
  99             suffix = [Suffix(suffix)]
 100
 101         if prefix or suffix:
 102             parts = prefix + [Compound(word)] + suffix
 103         else:
 104             if level: # rekursive Zerlegung
 105                 parts = [Compound(part)
 106                          for part in word.split(sep)]
 107             else:
 108                 parts = [word]
 109         list.__init__(self, parts)
 110
 111     # Iterator über die Bestandteile:
 112     def walk(self, compounds=True, parts=True):
 113         if compounds:
 114             yield self
 115         for part in self:
 116             if isinstance(part, Compound):
 117                 yield from part.walk(compounds, parts)
 118             elif parts:
 119                 yield part
 120
 121     def __str__(self):
 122         parts = (str(p) for p in self)
 123         parts = ('('+ p +')' if '+' in p else p for p in parts)
 124         return ' + '.join(parts)
 125
 126     def __repr__(self):
 127         return 'Compound("%s")' % self.source
 128
 129
 130 class PCompound(Compound):
 131     def __repr__(self):
 132         return 'PCompound("%s")' % self.source
 133
 134     def __str__(self):
 135         return Compound.__str__(self) + '-'
 136
 137 # Wrapper für Präfix und Suffix:
 138 #
 139 # >>> print(Prefix("ab"), 'hängig', Suffix('keit'))
 140 # ab- hängig -keit
 141 # >>> print(repr(Prefix("zu")))
 142 # Prefix('zu')
 143 # >>> Prefix("auf").raw()
 144 # 'auf'
 145 #
 146 # ::
 147
 148 class Prefix(str):
 149     # # Idempotente Klasse: Prefix(Prefix('ab')) == Prefix('ab')
 150     # def __new__(cls, value):
 151     #     if isinstance(value, cls):
 152     #         # print(value, "is already a Prefix")
 153     #         return value
 154     #     instance = super().__new__(cls, value)
 155     #     return instance
 156     #
 157     # # def split(self, *args, **nargs):
 158     # #     return [Prefix(s) for s in str.split(self, *args, **nargs)]
 159
 160     def __str__(self):
 161         return self + '-'
 162
 163     def __repr__(self):
 164         return 'Prefix(%s)' % str.__repr__(self)
 165
 166     def raw(self):
 167         return str.__str__(self)
 168
 169 class Suffix(str):
 170     def __str__(self):
 171         return '-' + self
 172
 173     def __repr__(self):
 174         return 'Suffix(%s)' % str.__repr__(self)
 175
 176 # Funktionen
 177 # -----------
 178 #
 179 #
 180 # Zahl der Ebenen der Zerlegungshierarchie:
 181 #
 182 #
 183 # ::
 184
 185 def compound_level(word):
 186     try:
 187         return max(len(s) for s in re.findall('=+', word))
 188     except ValueError:
 189         return 0
 190
 191
 192 # Iterator, gibt alle geordneten Teilkombinationen zurück
 193 #
 194 # >>> list(multisplitter('test', '='))
 195 # ['test']
 196 #
 197 # >>> list(multisplitter('a=b', '='))
 198 # ['a', 'a=b', 'b']
 199 #
 200 # >>> list(multisplitter('a=b=c', '='))
 201 # ['a', 'a=b', 'a=b=c', 'b', 'b=c', 'c']
 202 #
 203 # >>> list(multisplitter('a=b=c=d', '='))
 204 # ['a', 'a=b', 'a=b=c', 'a=b=c=d', 'b', 'b=c', 'b=c=d', 'c', 'c=d', 'd']
 205 #
 206 # >>> list(multisplitter('a=b==c', '=='))
 207 # ['a=b', 'a=b==c', 'c']
 208 # >>> list(multisplitter('a=b==c=de', '=='))
 209 # ['a=b', 'a=b==c=de', 'c=de']
 210 # >>> list(multisplitter('a=b==c=de', '==='))
 211 # ['a=b==c=de']
 212 #
 213 # >>> list(multisplitter('er[<st/st=]ritt', '='))
 214 # ['er[<st/st=]ritt']
 215 # >>> list(multisplitter('Schiff[=s/s=]tau', '='))
 216 # ['Schiff[=s/s=]tau']
 217 # >>> list(multisplitter('a{ll/ll=l}ie-bend', '='))
 218 # ['a{ll/ll=l}ie-bend']
 219 # >>> list(multisplitter('Be[t=t/{tt/tt=t}]uch', '='))
 220 # ['Be[t=t/{tt/tt=t}]uch']
 221 #
 222 # Mit `only_new` wird das Eingangswort nicht mit ausgegeben:
 223 #
 224 # >>> list(multisplitter('test', '=', only_new=True))
 225 # []
 226 # >>> list(multisplitter('a=b', '=', True))
 227 # ['a', 'b']
 228 # >>> list(multisplitter('a=b=c', '=', True))
 229 # ['a', 'a=b', 'b', 'b=c', 'c']
 230 # >>> list(multisplitter('a=b==c=de', '==', True))
 231 # ['a=b', 'c=de']
 232 # >>> list(multisplitter('a=b==c=de', '===', True))
 233 # []
 234 #
 235 # ::
 236
 237 def multisplitter(wort, sep, only_new=False):
 238     specials = re.findall(r'\[.*%s.*\]|\{[^}]*%s[^}]*\}'%(sep,sep), wort)
 239     for sp in specials:
 240         wort = wort.replace(sp, sp.replace(sep, '*'))
 241     parts = wort.split(sep)
 242     length = len(parts)
 243     for start in range(length):
 244         for end in range(start+1, length+1):
 245             if only_new and end - start == length:
 246                 continue
 247             part = sep.join(parts[start:end])
 248             if specials:
 249                 part = part.replace('*', sep)
 250             yield part
 251
 252 # Gib eine Liste möglicher Zerlegungen eines Kompositums zurück.
 253 # Berücksichtige dabei die Bindungsstärke bis zum Level 3
 254 # ("===", zur Zeit höchste Auszeichnung in der Wortliste).
 255 #
 256 # >>> multisplit('test')
 257 # ['test']
 258 # >>> multisplit('a=b')
 259 # ['a=b', 'a', 'b']
 260 # >>> multisplit('a=b=c')
 261 # ['a=b=c', 'a', 'a=b', 'b', 'b=c', 'c']
 262 # >>> multisplit('a<b=c')
 263 # ['a<b=c', 'a<b', 'c']
 264 # >>> multisplit('a==b=c')
 265 # ['a==b=c', 'a', 'b=c', 'b', 'c']
 266 # >>> multisplit('a<=b=c')
 267 # ['a<=b=c', 'b=c', 'b', 'c']
 268 # >>> multisplit('a=b=>c')
 269 # ['a=b=>c', 'a=b', 'a', 'b']
 270 # >>> multisplit('a<==b=c==d')
 271 # ['a<==b=c==d', 'b=c==d', 'b=c', 'b', 'c', 'd']
 272 # >>> multisplit('a==b=c==>d')
 273 # ['a==b=c==>d', 'a==b=c', 'a', 'b=c', 'b', 'c']
 274 # >>> multisplit('a==b=c==d')
 275 # ['a==b=c==d', 'a', 'a==b=c', 'b=c', 'b', 'c', 'b=c==d', 'd']
 276 #
 277 # >>> multisplit('test', only_new=True)
 278 # []
 279 # >>> multisplit('a=b', True)
 280 # ['a', 'b']
 281 # >>> multisplit('a=b=c', True)
 282 # ['a', 'a=b', 'b', 'b=c', 'c']
 283 # >>> multisplit('a<b=c', True)
 284 # ['a<b', 'c']
 285 # >>> multisplit('a==b=c', True)
 286 # ['a', 'b=c', 'b', 'c']
 287 # >>> multisplit('a<=b=c', True)
 288 # ['b=c', 'b', 'c']
 289 # >>> multisplit('a=b=>c', True)
 290 # ['a=b', 'a', 'b']
 291 # >>> multisplit('a<==b=c==d', True)
 292 # ['b=c==d', 'b=c', 'b', 'c', 'd']
 293 # >>> multisplit('a==b=c==>d', True)
 294 # ['a==b=c', 'a', 'b=c', 'b', 'c']
 295 # >>> multisplit('a==b=c==d', True)
 296 # ['a', 'a==b=c', 'b=c', 'b', 'c', 'b=c==d', 'd']
 297 #
 298 # >>> for w in multisplit('Brenn=stoff==zel-len===an<trieb'):
 299 # ...    print(w)
 300 # Brenn=stoff==zel-len===an<trieb
 301 # Brenn=stoff==zel-len
 302 # Brenn=stoff
 303 # Brenn
 304 # Stoff
 305 # Zel-len
 306 # An<trieb
 307 #
 308 # >>> for w in multisplit('drei==ein=halb===mil-lio-nen===>fa-che'):
 309 # ...    print(w)
 310 # drei==ein=halb===mil-lio-nen===>fa-che
 311 # drei==ein=halb===mil-lio-nen
 312 # drei==ein=halb
 313 # drei
 314 # ein=halb
 315 # ein
 316 # halb
 317 # mil-lio-nen
 318 #
 319 # Mit `only_new` wird das Eingangswort nicht mit ausgegeben:
 320 #
 321 # >>> multisplit('a=b=c', only_new=True)
 322 # ['a', 'a=b', 'b', 'b=c', 'c']
 323 # >>> for w in multisplit('drei==ein=halb===mil-lio-nen===>fa-che', True):
 324 # ...    print(w)
 325 # drei==ein=halb===mil-lio-nen
 326 # drei==ein=halb
 327 # drei
 328 # ein=halb
 329 # ein
 330 # halb
 331 # mil-lio-nen
 332 # >>> for w in multisplit('amts=ärzt=>lich', True):
 333 # ...    print(w)
 334 # amts=ärzt
 335 # amts
 336 # ärzt
 337 #
 338 # ::
 339
 340 def multisplit(wort, only_new=False, level=3):
 341
 342     if only_new:
 343         parts = []
 344     else:
 345         parts = [wort]
 346     if not level:
 347         return parts
 348
 349     fuge = '=' * level
 350     istitle = wort[:2].istitle()
 351
 352     # Globale Affixe abspalten:
 353     global_suffix = fuge + '>' # =>, ==>, ===>, ...
 354     global_prefix = '<' + fuge # <=, <==, <===, ...
 355     if global_suffix in wort or global_prefix in wort:
 356         wort = re.sub(global_suffix+'.*$', '', wort)
 357         wort = re.sub('^.*'+global_prefix, '', wort)
 358         if istitle:
 359             wort = wort[0].title() + wort[1:]
 360         parts.append(wort)
 361
 362     if fuge not in wort:
 363         # print(level, wort, only_new)
 364         return multisplit(wort, only_new, level-1)
 365
 366     # Zerlegen an Fugen mit `level`:
 367     for part in multisplitter(wort, fuge, only_new=True):
 368         # print(level,  part)
 369         if fuge in part:
 370             parts.append(part)
 371             continue
 372         parts.extend(multisplit(part, False, level-1))
 373
 374     # Großschreibung beibehalten:
 375     istitle = wort[:2].istitle()
 376     if istitle:
 377         parts = [part[0].title() + part[1:] for part in parts]
 378     return parts
 379
 380 # Gib eine Liste von allen (sinnvollen) Zerlegungen eines WordEntry zurück
 381 #
 382 # >>> from wortliste import WordEntry
 383 #
 384 # >>> split_entry(WordEntry('Aachen;Aa-chen'))
 385 # [WordEntry('Aachen;Aa-chen')]
 386 # >>> aalbestand = WordEntry('Aalbestand;Aal=be<stand')
 387 # >>> for e in split_entry(aalbestand):
 388 # ...     e
 389 # WordEntry('Aalbestand;Aal=be<stand')
 390 # WordEntry('Aal;Aal')
 391 # WordEntry('Bestand;Be<stand')
 392 #
 393 # >>> godi = WordEntry('Abendgottesdienste;-2-;Abend==got-tes=dien-ste;Abend==got-tes=diens-te')
 394 # >>> for entry in split_entry(godi):
 395 # ...     print(entry)
 396 # Abendgottesdienste;-2-;Abend==got-tes=dien-ste;Abend==got-tes=diens-te
 397 # Abend;Abend
 398 # Gottesdienste;-2-;Got-tes=dien-ste;Got-tes=diens-te
 399 # Gottes;Got-tes
 400 # Dienste;-2-;Dien-ste;Diens-te
 401 #
 402 # Mit `only_new` wird das Ausgangswort weggelassen:
 403 #
 404 # >>> for entry in split_entry(godi, only_new=True):
 405 # ...     print(entry)
 406 # Abend;Abend
 407 # Gottesdienste;-2-;Got-tes=dien-ste;Got-tes=diens-te
 408 # Gottes;Got-tes
 409 # Dienste;-2-;Dien-ste;Diens-te
 410 #
 411 # Achtung: Wenn ein Wort nur in einer Sprachvariante existiert, werden alle
 412 # Zerlegungen auch nur in dieser Variante zurückgegeben,
 413 # selbst wenn sie auch in anderen Sprachvarianten korrekt sind:
 414 #
 415 # >>> bb = WordEntry('Biberbettuch;-2-;Bi-ber==be[t=t/{tt/tt=t}]uch')
 416 # >>> for entry in split_entry(bb):
 417 # ...     print(entry)
 418 # Biberbettuch;-2-;Bi-ber==be[t=t/{tt/tt=t}]uch
 419 # Biber;-2-;Bi-ber
 420 # Bettuch;-2-;Be[t=t/{tt/tt=t}]uch
 421 #
 422 # >>> bb = WordEntry('Biberbetttuch;-2-;-3-;Bi-ber==bett=tuch')
 423 # >>> for entry in split_entry(bb):
 424 # ...     print(entry)
 425 # Biberbetttuch;-2-;-3-;Bi-ber==bett=tuch
 426 # Biber;-2-;-3-;Bi-ber
 427 # Betttuch;-2-;-3-;Bett=tuch
 428 # Bett;-2-;-3-;Bett
 429 # Tuch;-2-;-3-;Tuch
 430 #
 431 # Wenn die Zahl der Zerlegungen abweicht, wird ein Fehler erzeugt:
 432 #
 433 # >>> sa = WordEntry('Schrottanker;-2-;Schro[tt=/{tt/tt=t}]an-ker;Schrott=an-ker')
 434 # >>> split_entry(sa)
 435 # Traceback (most recent call last):
 436 #   ...
 437 # ValueError: unterschiedliche Zerlegungsanzahl für Schrottanker;-2-;Schro[tt=/{tt/tt=t}]an-ker;Schrott=an-ker
 438 #
 439 # ::
 440
 441 def split_entry(entry, only_new=False):
 442
 443     entries = []
 444
 445     for col in range(1, len(entry)):
 446         wort = entry[col]
 447         if '=' not in wort:
 448             continue # nichts zu splitten
 449         parts = multisplit(wort, only_new)
 450
 451         # (leere) Einträge und Schlüssel erstellen
 452         if not entries:
 453             for part in parts:
 454                 entries.append(copy.copy(entry))
 455                 entries[-1][0] = join_word(part)
 456         # Einträge auffüllen
 457         for i in range(len(parts)):
 458             try:
 459                 entries[i][col] = parts[i]
 460             except IndexError:
 461                 raise ValueError('unterschiedliche Zerlegungsanzahl für %s'
 462                                  %entry)
 463     if entries:
 464         for e in entries:
 465             e.prune() # Zusammenfassen von Sprachvarianten
 466         return entries
 467     else:
 468         return [entry]
 469
 470
 471
 472 # Iteriere über die Wortliste-Einträge `entries` und
 473 # gib ein Dictionary mit Teilwortkombinationen der Einträge zurück.
 474 #
 475 # Das Argument `scope` bestimmt die Auswahl der gesammelten Wörter:
 476 #
 477 #
 478 #   alle:
 479 #     alle Teile vor und nach Wortfugen,
 480 #     auch "vollständige" (in `entries` vorhandene) Wörter,
 481 #
 482 #   teile:
 483 #     Teile vor und nach Wortfugen, ohne in `entries` vorhandene Wörter.
 484 #
#   selbständige:
 486 #     ohne vollständige Wörter und Teile die (wahrscheinlich)
 487 #     keine sinnvollen Einträge für die Wortliste sind:
 488 #
 489 #     * kürzer als 3 Buchstaben,
 490 #     * eingeschobenes "zu" (gegen=zu=halten)
 491 #     * Kurzform ohne "-en" (z.B. Ab<löse=..., amts=ärzt=>lich, ...)
 492 #     * Bindungs-S (z.B. "Ab<fahrts=...")
 493 #     * "ung" + Bindungs-S (z.B. "Abrechnungs=grund")
 494 #
 495 # ::
 496
 497 def expand_words(entries, scope='teile', verbose=False):
 498     newentries = {}
 499     only_new = scope in ('teile', 'selbständige')
 500
 501     if only_new:
 502         entries = list(entries) # Sicherstellen, dass noch einmal iteriert werden kann.
 503         keys = set(entry.key() for entry in entries)
 504     for entry in entries:
 505         if "Kurzwort" in entry.comment:
 506             continue
 507         try:
 508             parts = split_entry(entry, only_new) # Liste mit Teilworteinträgen
 509         except ValueError as err:
 510             if verbose:
 511                 print(err, file=sys.stderr)
 512             continue
 513         for e in parts:
 514             key = e[0]
 515             if key in newentries:  # schon da
 516                 ve = newentries[key]
 517                 ve._duplicates += 1
 518                 if (len(ve) == 2): # alle Sprachvarianten gleich
 519                     continue
 520                 try:
 521                     ve.merge(e)
 522                 except AssertionError as err:
 523                     if verbose:
 524                         print(err, file=sys.stderr)
 525                 continue
 526             if only_new:
 527                 if (key in keys
 528                     or key.lower() in keys
 529                     or key.title() in keys):
 530                     continue # in Originalverzeichnis vorhanden
 531
 532             # Aussortieren von Einträgen die wahrscheinlich keine
 533             # selbständigen Wörter sind.
 534             if scope == 'selbständige':
 535                 if len(key) <= 3:  # zu kurz
 536                     continue
 537                 # eingeschobenes "zu" (gegen=zu=halten)
 538                 if  key.startswith('zu=') or key.endswith('=zu'):
 539                     continue
 540                 # Kurzform ohne "-en" (z.B. Ab<löse=..., amts=ärzt=>lich, ...)
 541                 if key.lower() + 'en' in keys:
 542                     continue
 543                 # Bindungs-S (z.B. "Ab<fahrts=...")
 544                 if key.endswith('s') and key[:-1] in keys:
 545                     # aber: Achs=... (ach), ...
 546                     continue
 547                 # "ung" + Bindungs-S (z.B. "Abrechnungsgrund")
 548                 if key.endswith('ungs') and key[:-4] + 'en' in keys:
 549                     continue
 550
 551             # Herkunft festhalten:
 552             e.comment = '< ' + entry.get('de-1996,de-1901,de-1901-x-versal,'
 553                                          'de-1996-x-versal,de-CH-1901')
 554             e._duplicates = 0
 555
 556             # Entfernen des "Ungünstigkeitsmarkers" nach kurzen Vorsilben:
 557             for i in range(1, len(e)):
 558                 if re.match('..<[.]', e[i]):
 559                     e[i] = e[i][:3] + e[i][4:]
 560             newentries[key] = e
 561
 562     for entry in newentries.values():
 563         if entry._duplicates:
 564             entry.comment += ' +%s×' % entry._duplicates
 565
 566     return newentries
 567
 568 # def exists(wort):
 569 #     key = join_word(wort)
 570 #     return (key.title() in words) or (key.lower() in words) or (len(wort)<4)
 571 #
 572 #
 573 # Präfixe bestimmen::
 574
 575 def check_affix(affix):
 576     if '[' in affix or '/' in affix or ']' in affix:
 577        return '' # Spezialtrennung/Alternativen -> unsicher
 578     affix = affix.lower()
 579     # Entferne Unterdrücker und führende Trennzeichen
 580     affix = affix.replace('.', '')
 581     affix = re.sub('^[·]', '', affix)
 582     # Entferne Alternativtrennung nach §113 (verblasste Morphologie):
 583     affix = re.sub('[·-]([^aeiouäöüy])$', r'\1', affix)
 584     affix = re.sub('[-]([aeiouäöüy])$', r'·\1', affix)
 585     return affix
 586
 587 # >>> words = ['Vor<silbe',
 588 # ...          'Keine=vor<silbe',
 589 # ...          'Globale<=vor<silbe',
 590 # ...          'au-to<gen',
 591 # ...          'Au-to=bahn',
 592 # ...          'un<zu<ver<läs-sig',
 593 # ...          'un<<gleich>för-mig',
 594 # ...          'An<äs-the-si·o<<lo-gie',
 595 # ...         ]
 596 # >>> for word in words:
 597 # ...   for prefix in split_prefixes(word):
 598 # ...     print(prefix)
 599 # vor
 600 # vor
 601 # globale
 602 # vor
 603 # au-to
 604 # un
 605 # zu
 606 # ver
 607 # un
 608 # an<äs-the-si·o
 609 # an
 610 #
 611 # ::
 612
 613 def split_prefixes(word):
 614     for part in Compound(word).walk():
 615         if isinstance(part, PCompound):
 616             yield check_affix(part.source)
 617         if isinstance(part, Prefix):
 618             yield check_affix(part)
 619     return
 620
 621
 622 # >>> for word in words:
 623 # ...   for suffix in split_suffixes(word):
 624 # ...     print(suffix)
 625 # för-mig
 626 #
 627 # ::
 628
 629 def split_suffixes(word):
 630     for part in Compound(word).walk(compounds=False):
 631         if isinstance(part, Suffix):
 632             yield check_affix(part)
 633     return
 634
 635 # Präfixe sammeln:
 636 # `scope`:
 637 #   * alle: alle Teile vom Wortanfang bis "<"
 638 #   * neuwörter: keine Homonyme zu Einträgen in `entries`,
 639 #   * wörter: Homonyme zu Einträgen und Teilwörtern.
 640 #
 641 # >>> entries = [WordEntry(join_word(word) + ';' + word)
 642 # ...            for word in words]
 643 # >>> for p in list_affixes(entries, split_fun=split_prefixes): print(p)
 644 # ('vor', 'Vor<silbe', 2)
 645 # ('globale', 'Globale<=vor<silbe', 0)
 646 # ('au-to', 'au-to<gen', 0)
 647 # ('un', 'un<zu<ver<läs-sig', 1)
 648 # ('zu', 'un<zu<ver<läs-sig', 0)
 649 # ('ver', 'un<zu<ver<läs-sig', 0)
 650 # ('an<äs-the-si·o', 'An<äs-the-si·o<<lo-gie', 0)
 651 # ('an', 'An<äs-the-si·o<<lo-gie', 0)
 652 #
 653 # >>> for p in list_affixes(entries, split_fun=split_prefixes,
 654 # ...                       scope='neuwörter'): print(p)
 655 # ('vor', 'Vor<silbe', 2)
 656 # ('globale', 'Globale<=vor<silbe', 0)
 657 # ('un', 'un<zu<ver<läs-sig', 1)
 658 # ('zu', 'un<zu<ver<läs-sig', 0)
 659 # ('ver', 'un<zu<ver<läs-sig', 0)
 660 # ('an<äs-the-si·o', 'An<äs-the-si·o<<lo-gie', 0)
 661 # ('an', 'An<äs-the-si·o<<lo-gie', 0)
 662 #
 663 # >>> list_affixes(entries, split_fun=split_prefixes, scope='wörter')
 664 # [('au-to', 'au-to<gen (vs. < Au-to=bahn)', 0)]
 665 #
 666 # ::
 667
 668 # Affixe sammeln::
 669
 670 def list_affixes(entries, split_fun=split_suffixes,
 671                  scope='alle', lang='de-1996'):
 672     affixes = {}
 673     if scope != 'alle':
 674         entries = list(entries)
 675     for entry in entries:
 676         word = entry.get(lang)
 677         for affix in split_fun(word):
 678             try:
 679                 affixes[affix][1] += 1
 680             except KeyError:
 681                 affixes[affix] = [word, 0]
 682     affixes.pop('', None)
 683
 684     if scope != 'alle':
 685         words = dict((e.get('de-1996').lower(), e)
 686                      for e in expand_words(entries, scope='alle').values())
 687         # print(words)
 688         for affix in list(affixes):
 689             if ((scope == 'neuwörter' and affix in words)
 690                 or (scope == 'wörter' and affix not in words)):
 691                 del(affixes[affix])
 692             elif scope == 'wörter':
 693                 affixes[affix][0] += ' (vs. %s)' % words[affix].comment[2:]
 694
 695     return [(key, affixes[key][0], affixes[key][1]) for key in affixes]
 696
 697
 698 if __name__ == '__main__':
 699
 700 # Pfad zu "../../../wortliste" unabhängig vom Arbeitsverzeichnis::
 701
 702     default_wortliste = os.path.relpath(os.path.join(
 703         os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(
 704             os.path.abspath(__file__))))),
 705         'wortliste'))
 706
 707 # Optionen::
 708
 709     usage = '%prog [Optionen]\n' + __doc__
 710
 711     parser = argparse.ArgumentParser(description = __doc__,
 712                         # formatter_class=argparse.RawDescriptionHelpFormatter
 713                                     )
 714     parser.add_argument('WORTLISTE', nargs='?', # optionales Argument
 715                         help='Eingabedatei im "Wortliste-Format" '
 716                         '("-" für Standardeingabe), '
 717                         'Vorgabe "%s".'%default_wortliste,
 718                         default=default_wortliste)
 719     parser.add_argument('-k', '--komposita',
 720                         help='Liste Konstituenten zusammengesetzter Wörter '
 721                         '(Teile vor und nach \'=\'). '
 722                         '"alle": auch vollständige Wörter der Eingabe, '
 723                         '"teile": keine vollständigen Wörter der Eingabe, '
 724                         '"selbständige": keine Konstituenten mit Bindungs-s, '
 725                         'eingeschobenem »zu« oder fehlender Endung. '
 726                         'Vorgabe "teile".',
 727                         choices=['alle', 'teile', 'selbständige', ''],
 728                         default='teile')
 729     parser.add_argument('-p', '--praefixe',
 730                         help='Präfixe (statt Komposita) bestimmen. '
 731                         '"alle": alle Präfixe (Wortanfang bis \'<\', '
 732                         '"wörter": nur Homonyme zu Einträgen und Teilwörtern, '
 733                         '"neuwörter": keine Homonyme, '
 734                         '"": keine. '
 735                         'Vorgabe "".',
 736                         choices=['alle', 'neuwörter', 'wörter', ''],
 737                         default='')
 738     parser.add_argument('-s', '--suffixe',
 739                         help='Suffixe (statt Komposita) bestimmen. '
 740                         '"alle": alle Suffixe von \'>\' bis Wortende, '
 741                         '"wörter": nur Homonyme zu Einträgen und Teilwörtern, '
 742                         '"neuwörter": keine Homonyme, '
 743                         '"": keine. '
 744                         'Vorgabe "".',
 745                         choices=['alle', 'neuwörter', 'wörter', ''],
 746                         default='')
 747     parser.add_argument('-z', '--zerlegungshierarchie', action='store_true',
 748                         help='Gib die Zerlegungshierarchie aus.')
 749     parser.add_argument('-v', '--verbose', action='store_true',
 750                         help='Kommentare zur Herkunft anhängen.')
 751     parser.add_argument('--sort', choices=['duden', 'länge', 'anzahl', ''],
 752                         default='duden',
 753                         help='Sortieren der Ergebnisse (Default: nach Duden)')
 754     parser.add_argument('-l', '--language', metavar='SPRACHE,[SPRACHE...]',
 755                         help='Sprachvariante(n), ignoriert für "-k" '
 756                         '(Vorgabe: "de-1996").',
 757                         default="de-1996")
 758
 759     args = parser.parse_args()
 760
 761
 762     if args.WORTLISTE == '-':
 763         wordfile = (WordEntry(line.rstrip()) for line in sys.stdin)
 764     else:
 765         wordfile = WordFile(args.WORTLISTE)
 766
 767
 768     sortkeys = {'duden':  lambda result : sortkey_duden(result[0]),
 769                 'länge':  lambda result : len(result[0]),
 770                 'anzahl': lambda result : result[2],
 771                }
 772
 773
 774 # `Wortliste` einlesen::
 775
 776     entries = wordfile
 777
 778 # Präfixe::
 779
 780     if args.praefixe:
 781         # affixes = list_prefixes(entries, scope=args.praefixe)
 782         affixes = list_affixes(entries, split_fun=split_prefixes,
 783                                 scope=args.praefixe)
 784
 785     elif args.suffixe:
 786         affixes = list_affixes(entries, split_fun=split_suffixes,
 787                                scope=args.suffixe)
 788
 789     elif args.zerlegungshierarchie:
 790         words = (entry.get('de-1996') for entry in entries)
 791         output = ['%s # %s' % (word, Compound(word)) for word in words]
 792
 793 # Teilwörter::
 794
 795     else:
 796         parts = expand_words(entries, scope=args.komposita,
 797                              verbose=args.verbose)
 798         output = sorted(parts.values(), key=sortkey_duden)
 799
 800 # Affixe sortieren und formatieren::
 801
 802     if args.praefixe or args.suffixe:
 803         output =  ['%s # < %s +%s×' % affix
 804                    for affix in sorted(affixes, key=sortkeys[args.sort])]
 805
 806 # Ausgabe::
 807
 808     # Header
 809     # Modifikationszeit der Eingabdatei:
 810     if args.WORTLISTE != '-':
 811         mtime = ''
 812     else:
 813         mtime = time.localtime(os.path.getmtime(args.WORTLISTE))
 814         mtime = time.strftime(' vom %d.%m.%Y', mtime)
 815     argv = list(sys.argv) # Kopie
 816     argv[0] = os.path.relpath(argv[0])
 817     print('# Extract aus "%s"%s' % (args.WORTLISTE, mtime))
 818     print('# mit `%s`' % ' '.join(argv))
 819
 820     for line in output:
 821         print(line)
 822
 823     print('#', len(output), "Einträge", file=sys.stderr)