skripte/python/patuse/long_s_conversion.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2014 Günter Milde.
   4 #             Released without warranty under the terms of the
   5 #             GNU General Public License (v. 2 or later)
   6 # :Id: $Id:  $
   7
   8 # long_s_conversion.py: Demonstrator für die Lang-S Wandlung
   9 # =============================================================================
  10
  11 u"""Rund-S nach Lang-S Wandlung über "hyphenation patterns"."""
  12
  13 import os, sys, codecs, re, optparse
  14 from hyphenation import Hyphenator
  15
  16
  17 # Konfiguration
  18 # -------------
  19
  20 # Lang-s Pseudo-Trennmuster mit "s-" statt "ſ" (ausſagen -> auss-agen)
  21 # (mit "make de-x-long-s" im Wurzelverzeichnis der wortliste generiert)::
  22
  23 default_pfile = os.path.normpath(os.path.join(os.path.dirname(__file__),
  24                                     '../../../de-long-s/de-x-long-s.pat'))
  25
  26 # Ausnahmen
  27 # ---------
  28
  29 # ſ steht auch am Ende von Abkürzungen, wenn es im abgekürzten Wort steht
  30 # (Abſ. - Abſatz/Abſender, (de)creſc. - (de)creſcendo, daſ. - daſelbst ...)
  31 # s steht auch in der Mitte von Abkürzungen, wenn es im abgekürzten Wort steht
  32 # (Ausg. - Ausgang/Ausgabe, Hrsg. - Herausgeber, ...)
  33 # ::
  34
  35 exceptions = (u'Abſ', # Abſatz/Abſender
  36               u'beſ',  # beſonders
  37               u'coſ',  # Ko<ſinus
  38             # u'daſ',   # da<ſelbſt (nicht von Artikel "das" zu unterscheiden!)
  39               u'Diſſ',  # Diſſertation
  40               u'Hſ',    # Handschrift
  41               u'Maſſ',  # Maſſachuſetts
  42             # u'Miſſ',  # Miſſiſippi (nicht von Miſs (Frln.) zu unterscheiden)
  43               # TODO: N-Z
  44              )
  45 exceptions = dict((ex.replace(u'ſ', u's'), ex) for ex in exceptions)
  46
  47 # Konvertierung mit Hyphenator::
  48
  49 def transformiere(wort, hyphenator):
  50
  51     if u's' not in wort:
  52         return wort
  53     if wort in exceptions:
  54         return exceptions[wort]
  55
  56     wort = hyphenator.hyphenate_word(wort, hyphen=u'-', lmin=1, rmin=1)
  57
  58     # Wandle "s-" in "ſ" (auss-agen -> ausſagen):
  59
  60     return wort.replace(u's-', u'ſ').replace(u'S-', u'S')
  61
  62 def transformiere_text(text, hyphenator):
  63     # Text zerlegen: finde (ggf. leere) Folgen von nicht-Wort-Zeichen
  64     # gefolgt von Wort-Zeichen. Der Iterator liefert Match-Objekte, mit
  65     # den Untergruppen 0: nicht-Wort und 1: Wort.
  66     it = re.finditer(r"([\W0-9_]*)(\w*)", text, flags=re.UNICODE)
  67     # Konvertierung und Zusammenfügen
  68     parts = [match.groups()[0] # nicht-Wort Zeichen
  69              + transformiere(match.groups()[1], hyphenator)
  70              for match in it]
  71     return u''.join(parts)
  72
# Command line front end (only run when the file is executed as a script).
# NOTE(review): this part uses the Python 2 print statement and decodes
# byte strings read from the command line and from stdin.
if __name__ == '__main__':

    # The usage message is the call synopsis followed by the module
    # docstring.
    usage = u'%prog [options] [words to be transformed]\n\n' + __doc__

    parser = optparse.OptionParser(usage=usage)
    # -f/--pattern-file: pattern file handed to the Hyphenator below,
    # defaults to `default_pfile`.
    parser.add_option('-f', '--pattern-file', dest='pattern_file',
                      help='Pattern file, Default "%s"' % default_pfile,
                      default=default_pfile)
    # -t/--test: compare the input with its reconstruction and report
    # the differences (see the test loop below).
    parser.add_option('-t', '--test', action="store_true", default=False,
                      help='Vergleiche Eingabe mit Rekonstruktion, '
                      'Melde Differenzen.')

    (options, args) = parser.parse_args()


    # Hyphenator working with the selected (long-s) pattern file.
    h_Latf = Hyphenator(options.pattern_file)

    # Replace sys.stdout by a writer that encodes the output as UTF-8.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Input: the positional arguments joined by spaces into one line, or,
    # if there are none, the lines of standard input (read lazily).
    # Both are decoded from UTF-8.
    if len(args) > 0:
        lines = [' '.join(args).decode('utf8')]
    else:
        lines = (line.decode('utf8') for line in sys.stdin)

    # Test mode: replace every ſ in the input by s, convert the result
    # again and print "input -> reconstruction" for each line where the
    # reconstruction differs from the input.  Exit afterwards.
    if options.test:
        for line in lines:
            line2 = transformiere_text(line.replace(u'ſ', u's'), h_Latf)
            if line2 != line:
                # The trailing comma keeps print from appending a newline.
                print line.strip(), '->', line2,
        sys.exit()

    # Normal mode: print the converted input.  The trailing comma keeps
    # print from appending a newline.
    for line in lines:
        print transformiere_text(line, h_Latf),