patgen-list-diff.awk

   1 #!/usr/bin/awk -f
   2 #
   3 # Dieses Skript liest eine DIFF-Datei der Patgen-Eingabelisten (siehe
   4 # Skript patgen-list-diff.sh) und zerlegt sie in Wörter,
   5 #
   6 #  * die neu hinzugefügt,
   7 #  * die entfernt,
   8 #  * deren Trennung korrigiert und
   9 #  * deren Klein- und Großschreibung korrigiert
  10 #
  11 # wurde.  Die Wörter werden in Dateien der Form <Eingabedatei>.<ext>
  12 # gespeichert.  <ext> ist entsprechend 'added', 'removed', 'hyph' oder
  13 # 'case'.  Beim Aufruf des Skripts muss die Variable 'ftr' mit dem Namen
  14 # der Translate-Datei für Patgen vorbelegt werden:
  15 #   gawk -v ftr=<translate datei> ...
  16
  17
  18
  19 # Translate a string to lower case characters as defined by the rules in
  20 # the translate file.
  21 function tr_tolower(s) {
  22     l = length(s)
  23     trs = ""
  24     for (i=1; i<=l; ++i) {
  25         ch = substr(s, i, 1)
  26         if (tr[ch] == "") {
  27             printf("%s", "Error: Bad character '" ch "' in string " s " (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
  28             error = 1
  29             exit
  30         }
  31         trs = trs tr[ch]
  32     }
  33     return trs
  34 }
  35
  36
  37
  38 # Normalize a word.
  39 #   * Remove hyphens.
  40 #   * All lower case.
  41 function normalize_word(word) {
  42     gsub("-", "", word)
  43     word = tr_tolower(word)
  44     return word
  45 }
  46
  47
  48
  49 # Output all words of a class (added, removed, ...) to a file.  Output
  50 # number of words in class on command-line.
  51 function output_word_class(clarr, clname) {
  52     fname = fdiff "." clname
  53     i = 0
  54     # Create output file and output word class unsorted.
  55     printf "" > fname ".unsort"
  56     for (word in clarr) {
  57         ++i
  58         print(clarr[word]) >> fname ".unsort"
  59     }
  60     print(clname ": " i)
  61     # Sort output file on shell.
  62     system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " fname ".unsort > " fname)
  63     system("rm -f " fname ".unsort")
  64     return i
  65 }
  66
  67
  68
  69 # Read translate file via getline and build translate table used for
  70 # validating and normalizing words.  A word is valid, if it consists
  71 # entirely of characters that are indices in table tr.
  72 function read_translate_file(ftr) {
  73     # Switch to fixed-width field splitting mode.  The separator in
  74     # translate files is not known in advance and can change from line
  75     # to line.
  76     FIELDWIDTHS = "1 1 1 1 1 1 1"
  77     # Skip first line containing hyphenation minima and hyphenation
  78     # characters.
  79     getline < ftr
  80     # The hyphen is a valid character.
  81     tr["-"] = "-"
  82     # NR and FNR aren't updated when reading a file via getline.  So we
  83     # count lines manually.
  84     ln = 1
  85     # Read lines from translate file.
  86     while (getline < ftr > 0) {
  87         ++ln
  88         # Skip comments (column 1 == column 2).
  89         if ($1 != $2) {
  90             # Determine number of columns on line.
  91             cols=0
  92             while ($(cols+1) != "") {
  93                 cols++
  94             }
  95             # Check character translation table format.  Check if
  96             # separators are all equal to that in column 1.
  97             for (i=3; i<=cols; i=i+2) {
  98                 if ($i != $1) {
  99                     printf("%s", "Error: Bad character translation table in file " ftr ", line " ln "\n") > "/dev/stderr"
 100                     error = 1
 101                     exit
 102                 }
 103             }
 104             # Store characters in translation table.
 105             for (i=2; i<=cols; i=i+2)
 106                 tr[$i] = $2
 107         }
 108     }
 109 #    print(ln " lines from translation file " ftr " read OK.")
 110 #    for (ch in tr)
 111 #        print(ch, tr[ch])
 112     # Reset regular field splitting mode.
 113     FS = FS
 114     return
 115 }
 116
 117
 118
 119 # First, read translate file, whose name is defined in variable ftr on the
 120 # command line.
 121 BEGIN {
 122     # Check if translate file name is set.
 123     if (ftr == "") {
 124         printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
 125         error = 1
 126         exit
 127     }
 128     # Read translate file and build translate table.
 129     read_translate_file(ftr)
 130 }
 131
 132
 133
 134 # Read DIFF file's added lines.
 135 /^> / {
 136     # Store added word in field with:
 137     #   key = <normalized word>,
 138     #   value = <patgen input word>.
 139     # A normalized word is lower case only with hyphens removed.  Example:
 140     #   word_in["tafelsilber"] = "Ta-fel-sil-ber"
 141     k = normalize_word($2)
 142     v = $2
 143     word_in[k] = v
 144 }
 145 # Read DIFF file's removed lines.
 146 /^< / {
 147     # Store removed word with:
 148     #   key = <normalized word>,
 149     #   value = <patgen input word>.
 150     # A normalized word is lower case only with hyphens removed.  Example:
 151     #   word_out["tafelsilber"] = "Ta-fel-sil-ber"
 152     k = normalize_word($2)
 153     v = $2
 154     word_out[k] = v
 155 }
 156
 157
 158
 159 END {
 160     if (error) {
 161         exit error
 162     }
 163     for (word in word_in) {
 164         if (word in word_out) {
 165             # Changed word.
 166             # Check for case changes only.
 167             lword_in = tr_tolower(word_in[word])
 168             lword_out = tr_tolower(word_out[word])
 169             if (lword_in == lword_out) {
 170                 # Case change only.
 171                 case[word] = word_in[word]
 172             }
 173             else {
 174                 # Hyphenation corrected.
 175                 hyph[word] = word_in[word]
 176             }
 177         }
 178         else {
 179             # Added word.
 180             added[word] = word_in[word]
 181         }
 182     }
 183     for (word in word_out) {
 184         if (word in word_in) {
 185             # Changed word.
 186             # Already processed in above loop.
 187         }
 188         else {
 189             # Removed word.
 190             removed[word] = word_out[word]
 191         }
 192     }
 193     # Save input file name.
 194     fdiff = FILENAME
 195     # Output results.
 196     print("Processed file " fdiff ".")
 197     n_added = output_word_class(added, "added")
 198     n_removed = output_word_class(removed, "removed")
 199     n_hyph = output_word_class(hyph, "hyph")
 200     n_case = output_word_class(case, "case")
 201     # Output entry for table in CHANGES file (in Markdown).
 202     printf("   %11d   %8d   %10d\n", n_added, n_removed, n_hyph) >> "CHANGES.table.txt"
 203 }