skripte/sort.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2012 Günter Milde.
   4 # :Licence:   This work may be distributed and/or modified under
   5 #             the conditions of the `LaTeX Project Public License`,
   6 #             either version 1.3 of this license or (at your option)
   7 #             any later  version.
   8 # :Version:   0.2 (2012-03-16)
   9
  10 # sort.py
  11 # *******
  12 #
  13 # ::
  14
  15 u"""
  16 Sortiere eine oder mehrere Dateien im "Wortliste-Format".
  17
  18 Filter:
  19   ./sort.py - <../wortliste > ../wortliste.sortiert
  20
  21 Zusammenfügen:
  22   ./sort.py liste.c liste.a liste.b > liste.abc
  23
  24 Einsortieren (zusammenfügen und wieder aufteilen):
  25    ./sort.py neu.todo wl-* --split -o wl-
  26
  27 Einsortieren und Patch erstellen:
  28   ./sort.py ../wortliste neu.todo --diff -o ../wortliste.patch
  29 """
  30
# Usage text handed to ``optparse.OptionParser`` in the command-line section.
# The module docstring is appended, so the invocation examples it contains are
# part of the help output; treat that docstring as user-facing text.
usage = u'%prog [Optionen] Eingangsdatei(en)\n' + __doc__
  32
  33 # Alle Eingabedateien werden in eine Liste gelesen und dann sortiert.
  34 # Das spezielle Argument '-' steht für die Standardeingabe.
  35 #
  36 # Mit Option -o kann die Ausgabe in eine/mehrere Datei(en) gelenkt werden.
  37 # Das spezielle Argument '-' steht für die Standardausgabe (Vorgabe).
  38 #
  39 # Die Kodierung ist UTF8.
  40 #
  41 # Es wird wahlweise nach Duden oder nach der bis März 2012 für die Wortliste
  42 # genutzten Regel sortiert. Voreinstellung ist Dudensortierung.
  43 #
  44 # Siehe auch Optionen
  45
  46
  47 import unicodedata, sys, optparse, os, copy, codecs
  48
  49 # path for local Python modules
  50 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'python'))
  51
  52 from edit_tools.wortliste import WordEntry, udiff, sortkey_duden
  53 from edit_tools.split_wortliste import split_a_z, write_a_z
  54
  55 # sortkey_wl
  56 # ----------
  57 #
# Sortierschlüssel für den früher genutzten Algorithmus,
# d. h. Emulation von:
  60 #
  61 # * Sortieren nach gesamter Zeile
  62 # * mit dem Unix-Aufruf `sort -d`
  63 # * und locale DE.
  64 #
  65 # ::
  66
  67 def sortkey_wl(entry):
  68     # Sortieren nach gesamter Zeile
  69     key = unicode(entry)
  70
  71     # Ersetzungen:
  72     ersetzungen = {ord(u'ß'): u'ss'} # ß -> ss
  73     # Feldtrenner und Trennzeichen ignorieren (Simulation von `sort -d`)
  74     for char in u';-·=|[]{}':
  75         ersetzungen[ord(char)] = None
  76     key = key.translate(ersetzungen)
  77
  78     # Akzente/Umlaute weglassen:
  79     key = unicodedata.normalize('NFKD', key) # Akzente mit 2-Zeichen-Kombi
  80     key = key.encode('ascii', 'ignore')     # ignoriere nicht-ASCII Zeichen
  81     # Großschreibung ignorieren
  82     key = key.lower()
  83
  84     return key
  85
  86
  87 # Aufruf von der Kommandozeile
  88 # ============================
  89 #
  90 # ::
  91
  92 if __name__ == '__main__':
  93
  94 # Optionen::
  95
  96     parser = optparse.OptionParser(usage=usage)
  97     parser.add_option('-o', '--outfile',
  98                       help=u'Ausgangsdatei, Vorgabe: - (Standardausgabe)',
  99                       default='-')
 100     parser.add_option('-p', '--diff', action='store_true', default=False,
 101                       help=u'Erstelle Patch im "unified diff" Format.'
 102                         'Vergleicht die erste Ausgangsdatei mit dem Resultat.')
 103     parser.add_option('-s', '--split',
 104                       help=u'Aufteilen in Dateien "OUTFILEa" bis "OUTFILEz".'
 105                             'Vorgabe: False (nicht splitten)',
 106                       action="store_true", default=False)
 107     parser.add_option('-u', '--unsorted', action='store_true', default=False,
 108                       help=u'Überspringe die Sortierung.')
 109     parser.add_option('-d', '--dump', action="store_true", default=False,
 110                       help=u'Für Rückwärtskompatibilität. Obsolet, ignoriert.')
 111     parser.add_option('--legacy-sort', action="store_true",
 112                       help=u'alternative (obsolete) Sortierordnung',
 113                       default=False)
 114
 115     (options, args) = parser.parse_args()
 116
 117     if not args:
 118         parser.print_help()
 119         sys.exit()
 120
 121     # Achtung: bool(options.legacy_sort) ist immer True, daher nicht
 122     # ``if options.legacy_sort: ...`` verwenden!
 123     if options.legacy_sort is True:
 124         sortkey = sortkey_wl
 125     else:
 126         sortkey = sortkey_duden
 127
 128 # Einlesen in eine Liste::
 129
 130     filenames = ', '.join(arg.replace('-', '<stdin>') for arg in args)
 131     infiles = [sys.stdin if arg=='-' else open(arg) for arg in args]
 132
 133     # Vergleichsdatei bei patch ist erste Datei
 134     wordlist = [line.decode('utf-8') for line in infiles.pop(0)]
 135     if options.diff is True:
 136             wordlist_pre = copy.copy(wordlist)
 137
 138     # Restliche Dateien anhängen
 139     # verschachtelte Listen entflechten [i for lst in lsts for i in lst]
 140     wordlist += [line.decode('utf-8')
 141                  for infile in infiles for line in infile]
 142
 143 # Aufteilen::
 144
 145     if options.split is True:
 146         if options.diff is True:
 147             print 'Aufteilen nach a-z für Patch nicht implementiert.'
 148         elif options.outfile == '-':
 149             print 'Aufteilen nach a-z auf Standardausgabe nicht möglich.'
 150         else:
 151             lists = split_a_z(wordlist)
 152             if not options.unsorted:
 153                 for l in lists.values():
 154                     l.sort(key=sortkey)
 155             write_a_z(lists, options.outfile)
 156         sys.exit()
 157
 158 # Sortieren::
 159
 160     if not options.unsorted:
 161         wordlist.sort(key=sortkey)
 162
 163 # Patch erstellen::
 164
 165     if options.diff is True:
 166         output = udiff(wordlist_pre, wordlist,
 167                       filenames, filenames+'-sortiert', encoding='utf-8')
 168         if not output:
 169             print 'keine Änderungen'
 170             sys.exit()
 171     else:
 172         output = ''.join(unicode(line) for line in wordlist).encode('utf8')
 173
 174 # Ausgabe::
 175
 176     if options.outfile == '-':
 177         outfile = sys.stdout
 178     else:
 179         outfile = codecs.open(options.outfile, 'w', encoding='utf-8')
 180
 181     outfile.write(output)