skripte/sort.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf8 -*-
   3 # :Copyright: © 2012 Günter Milde.
   4 # :Licence:   This work may be distributed and/or modified under
   5 #             the conditions of the `LaTeX Project Public License`,
   6 #             either version 1.3 of this license or (at your option)
   7 #             any later  version.
   8 # :Version:   0.2 (2012-03-16)
   9
  10 # sort.py
  11 # *******
  12 #
  13 # ::
  14
  15 u"""
  16 Sortiere eine oder mehrere Dateien im "Wortliste-Format".
  17
  18 Filter:
  19   ./sort.py - <../wortliste > ../wortliste.sortiert
  20
  21 Zusammenfügen:
  22   ./sort.py liste.c liste.a liste.b > liste.abc
  23
  24 Einsortieren (zusammenfügen und wieder aufteilen):
  25    ./sort.py neu.todo wl_* --split -o wl
  26
  27 Einsortieren und Patch erstellen:
  28   ./sort.py ../wortliste neu.todo --patch -o ../wortliste.patch
  29 """
  30
  31 usage = u'%prog [Optionen] Eingangsdatei(en)\n' + __doc__
  32
  33 # Alle Eingabedateien werden in eine Liste gelesen und dann sortiert.
  34 # Das spezielle Argument '-' steht für die Standardeingabe.
  35 #
  36 # Mit Option -o kann die Ausgabe in eine/mehrere Datei(en) gelenkt werden.
  37 # Das spezielle Argument '-' steht für die Standardausgabe (Vorgabe).
  38 #
  39 # Die Kodierung ist UTF8.
  40 #
  41 # Es wird wahlweise nach Duden oder nach der bis März 2012 für die Wortliste
  42 # genutzten Regel sortiert. Voreinstellung ist Dudensortierung.
  43 #
  44 # Siehe auch Optionen
  45
  46
  47 import unicodedata, sys, optparse, os, copy
  48
  49 # path for local Python modules
  50 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'python'))
  51
  52 from edit_tools.wortliste import WordEntry, udiff, sortkey_duden
  53 from edit_tools.split_wortliste import split_a_z, write_a_z
  54
  55 # sortkey_wl
  56 # ----------
  57 #
  58 # Sortierschlüssel für den früher genutzten Algorithmus,
  59 # d.h. Emulation von:
  60 #
  61 # * Sortieren nach gesamter Zeile
  62 # * mit dem Unix-Aufruf `sort -d`
  63 # * und locale DE.
  64 #
  65 # ::
  66
  67 def sortkey_wl(entry):
  68     # Sortieren nach gesamter Zeile
  69     key = unicode(entry)
  70
  71     # Ersetzungen:
  72     ersetzungen = {ord(u'ß'): u'ss'} # ß -> ss
  73     # Feldtrenner und Trennzeichen ignorieren (Simulation von `sort -d`)
  74     for char in u';-·=|[]{}':
  75         ersetzungen[ord(char)] = None
  76     key = key.translate(ersetzungen)
  77
  78     # Akzente/Umlaute weglassen:
  79     key = unicodedata.normalize('NFKD', key) # Akzente mit 2-Zeichen-Kombi
  80     key = key.encode('ascii', 'ignore')     # ignoriere nicht-ASCII Zeichen
  81     # Großschreibung ignorieren
  82     key = key.lower()
  83
  84     return key
  85
  86
  87 # Aufruf von der Kommandozeile
  88 # ============================
  89 #
  90 # ::
  91
  92 if __name__ == '__main__':
  93
  94 # Optionen::
  95
  96     parser = optparse.OptionParser(usage=usage)
  97     parser.add_option('-o', '--outfile',
  98                       help='Ausgangsdatei, Vorgabe: - (Standardausgabe)',
  99                       default='-')
 100     parser.add_option('-p', '--diff', action='store_true', default=False,
 101                       help='Erstelle Patch im "unified diff" Format.'
 102                         'Vergleicht die erste Ausgangsdatei mit dem Resultat.')
 103     parser.add_option('-s', '--split',
 104                       help='Aufteilen in Dateien "OUTFILE_a" bis "OUTFILE_z".'
 105                            'Vorgabe: False (nicht splitten)',
 106                       action="store_true", default=False)
 107     parser.add_option('-u', '--unsorted', action='store_true', default=False,
 108                       help='Überspringe die Sortierung.')
 109     parser.add_option('-d', '--dump', action="store_true", default=False,
 110                       help='Für Rückwärtskompatibilität. Obsolet, ignoriert.')
 111     parser.add_option('--legacy-sort', action="store_true",
 112                       help='alternative (obsolete) Sortierordnung',
 113                       default=False)
 114
 115     (options, args) = parser.parse_args()
 116     if not args:
 117         parser.print_help()
 118         sys.exit()
 119
 120     # Achtung: bool(options.legacy_sort) ist immer True, daher nicht
 121     # ``if options.legacy_sort: ...`` verwenden!
 122     if options.legacy_sort is True:
 123         sortkey = sortkey_wl
 124     else:
 125         sortkey = sortkey_duden
 126
 127 # Einlesen in eine Liste::
 128
 129     filename = ', '.join(arg.replace('-', '<stdin>') for arg in args)
 130     infiles = [sys.stdin if arg=='-' else open(arg) for arg in args]
 131
 132     # Vergleichsdatei bei patch ist erste Datei
 133     wordlist = [line.decode('utf-8') for line in infiles.pop(0)]
 134     if options.diff is True:
 135             wordlist_pre = copy.copy(wordlist)
 136
 137     # Restliche Dateien anhängen
 138     # verschachtelte Listen entflechten [i for lst in lsts for i in lst]
 139     wordlist += [line.decode('utf-8')
 140                  for infile in infiles for line in infile]
 141
 142 # Aufteilen::
 143
 144     if options.split is True:
 145         if options.diff is True:
 146             print 'Aufteilen nach a-z für Patch nicht implementiert.'
 147         elif options.outfile == '-':
 148             print 'Aufteilen nach a-z auf Standardausgabe nicht möglich.'
 149         else:
 150             lists = split_a_z(wordlist)
 151             if options.unsorted is not True:
 152                 for l in lists.values():
 153                     l.sort(key=sortkey)
 154             write_a_z(lists, options.outfile)
 155         sys.exit()
 156
 157 # Sortieren::
 158
 159     if options.unsorted is not True:
 160         wordlist.sort(key=sortkey)
 161
 162 # Patch erstellen::
 163
 164     if options.diff is True:
 165         output = udiff(wordlist_pre, wordlist,
 166                       filename, filename+'-sortiert',
 167                       encoding='utf-8')
 168         if not output:
 169             print 'keine Änderungen'
 170             sys.exit()
 171     else:
 172         output = ''.join(unicode(line) for line in wordlist).encode('utf8')
 173
 174 # Ausgabe::
 175
 176     if options.outfile == '-':
 177         outfile = sys.stdout
 178     else:
 179         outfile = open(options.outfile, 'w')
 180
 181     outfile.write(output + '\n')