skripte/python/long_s_conversion.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # long_s_conversion.py: Demonstrator für die Lang-S Wandlung
   9 # =============================================================================
  10
  11 u"""Rund-S nach Lang-S Wandlung über "hyphenation patterns"."""
  12
  13 import sys, codecs, re, optparse
  14 from hyphenation import Hyphenator
  15
  16
  17 # Konfiguration
  18 # -------------
  19
# The long-s pattern file, generated via "make" targets
# in the root directory of the word list ("wortliste")::

pfile = '../../de-Latf/de-Latf.pat'

# Hyphenator instance, built from the default pattern file when this module
# is loaded. It is the default `hyphenator` argument of the conversion
# functions defined further down::

h_Latf = Hyphenator(pfile)
  28
  29
  30 # ſ steht auch am Ende von Abkürzungen, wenn es im abgekürzten Wort steht
  31 # (Abſ. - Abſatz/Abſender, (de)creſc. - (de)creſcendo, daſ. - daſelbst ...)
  32 # s steht auch in der Mitte von Abkürzungen, wenn es im abgekürzten Wort steht
  33 # (Ausg. - Ausgang/Ausgabe, Hrsg. - Herausgeber, ...)
  34 # ::
  35
  36 exceptions = (u'Abſ', # Abſatz/Abſender
  37               u'beſ',  # beſonders
  38               u'coſ',  # Ko<ſinus
  39             # u'daſ',   # da<ſelbſt (nicht von Artikel "das" zu unterscheiden!)
  40               u'Diſſ',  # Diſſertation
  41               u'Hſ',    # Handschrift
  42               u'Maſſ',  # Maſſachuſetts
  43             # u'Miſſ',  # Miſſiſippi (nicht von Miſs (Frln.) zu unterscheiden)
  44               # TODO: N-Z
  45              )
  46 exceptions = dict((ex.replace(u'ſ', u's'), ex) for ex in exceptions)
  47
  48 # Konvertierung mit Hyphenator::
  49
  50 def transformiere(wort, hyphenator=h_Latf):
  51
  52     if u's' not in wort:
  53         return wort
  54     if wort in exceptions:
  55         return exceptions[wort]
  56
  57     parts = hyphenator.split_word(wort, rmin=1)
  58
  59     # Wandle in jedem Teil alle klein S zu Lang-S, außer am Schluss:
  60     parts = [part[:-1].replace(u's', u'ſ') + part[-1] for part in parts]
  61
  62     return u''.join(parts)
  63
  64 def transformiere_text(text, hyphenator=h_Latf):
  65     # Text zerlegen: finde (ggf. leere) Folgen von nicht-Wort-Zeichen
  66     # gefolgt von Wort-Zeichen. Der Iterator liefert Match-Objekte, mit
  67     # den Untergruppen 0: nicht-Wort und 1: Wort.
  68     it = re.finditer(r"([\W0-9_]*)(\w*)", text, flags=re.UNICODE)
  69     # Konvertierung und Zusammenfügen
  70     parts = [match.groups()[0] # nicht-Wort Zeichen
  71              + transformiere(match.groups()[1], hyphenator)
  72              for match in it]
  73     return u''.join(parts)
  74
  75 if __name__ == '__main__':
  76
  77     usage = u'%prog [options] [words to be transformed]\n\n' + __doc__
  78
  79     parser = optparse.OptionParser(usage=usage)
  80     parser.add_option('-f', '--pattern-file', dest='pattern_file',
  81                       help='Pattern file, Default "%s"' % pfile,
  82                       default=pfile)
  83     parser.add_option('-t', '--test', action="store_true", default=False,
  84                       help='Vergleiche Eingabe mit Rekonstruktion, '
  85                       'Melde Differenzen.')
  86
  87     (options, args) = parser.parse_args()
  88
  89     h_Latf = Hyphenator(options.pattern_file)
  90
  91     # sys.stdout mit UTF8 encoding.
  92     sys.stdout = codecs.getwriter('utf8')(sys.stdout)
  93
  94     if len(args) > 0:
  95         lines = [' '.join(args).decode('utf8')]
  96     else:
  97         lines = (line.decode('utf8') for line in sys.stdin)
  98
  99     if options.test:
 100         for line in lines:
 101             line2 = transformiere_text(line.replace(u'ſ', u's'))
 102             if line2 != line:
 103                 print line.strip(), '->', line2,
 104     else:
 105         for line in lines:
 106             print transformiere_text(line),