skripte/diff-patgen-input.awk

   1 #!/usr/bin/awk -f
   2 #
   3 # Dieses Skript liest eine DIFF-Datei der Patgen-Eingabelisten (siehe
   4 # Skript diff-patgen-input.sh) und zerlegt sie in Wörter,
   5 #
   6 #  * die neu hinzugefügt,
   7 #  * die entfernt,
   8 #  * deren Trennung korrigiert und
   9 #  * deren Klein- und Großschreibung korrigiert
  10 #
  11 # wurde.  Die Wörter werden in Dateien der Form <Eingabedatei>.<ext>
  12 # gespeichert.  <ext> ist entsprechend 'added', 'removed', 'hyph' oder
  13 # 'case'.  Beim Aufruf des Skripts muss die Variable 'ftr' mit dem Namen
  14 # der Translate-Datei für Patgen vorbelegt werden:
  15 #   gawk -v ftr=<translate datei> ...
  16
  17
  18
  19 # Translate a string to lower case characters as defined by the rules in
  20 # the translate file.
  21 function tr_tolower(s) {
  22     l = length(s)
  23     trs = ""
  24     for (i=1; i<=l; ++i) {
  25         ch = substr(s, i, 1)
  26         if (tr[ch] == "") {
  27             printf("%s", "Error: Bad character '" ch "' in string " s " (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
  28             error = 1
  29             exit
  30         }
  31         trs = trs tr[ch]
  32     }
  33     return trs
  34 }
  35
  36
  37
  38 # Normalize a word.
  39 #   * Remove hyphens.
  40 #   * All lower case.
  41 function normalize_word(word) {
  42     gsub("-", "", word)
  43     word = tr_tolower(word)
  44     return word
  45 }
  46
  47
  48
  49 # Output all words of a class (added, removed, ...) to a file.  Output
  50 # number of words in class on command-line.
  51 function output_word_class(clarr, clname) {
  52     fname = fdiff "." clname
  53     i = 0
  54     # Create output file and output word class unsorted.
  55     printf "" > fname ".unsort"
  56     for (word in clarr) {
  57         ++i
  58         print(clarr[word]) >> fname ".unsort"
  59     }
  60     print(clname ": " i)
  61     # Sort output file on shell.
  62     system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " fname ".unsort > " fname)
  63     system("rm -f " fname ".unsort")
  64 }
  65
  66
  67
  68 # Read translate file via getline and build translate table used for
  69 # validating and normalizing words.  A word is valid, if it consists
  70 # entirely of characters that are indices in table tr.
  71 function read_translate_file(ftr) {
  72     # Switch to fixed-width field splitting mode.  The separator in
  73     # translate files is not known in advance and can change from line
  74     # to line.
  75     FIELDWIDTHS = "1 1 1 1 1 1 1"
  76     # Skip first line containing hyphenation minima and hyphenation
  77     # characters.
  78     getline < ftr
  79     # The hyphen is a valid character.
  80     tr["-"] = "-"
  81     # NR and FNR aren't updated when reading a file via getline.  So we
  82     # count lines manually.
  83     ln = 1
  84     # Read lines from translate file.
  85     while (getline < ftr > 0) {
  86         ++ln
  87         # Skip comments (column 1 == column 2).
  88         if ($1 != $2) {
  89             # Determine number of columns on line.
  90             cols = 0
  91             while ($(cols+1) != "") {
  92                 cols++
  93             }
  94             # Ignore trailing separators.
  95             while ((cols > 0) && ($cols == $1)) {
  96                 cols--
  97             }
  98             # Check character translation table format.  Check if
  99             # separators are all equal to that in column 1.
 100             for (i=3; i<=cols; i=i+2) {
 101                 if ($i != $1) {
 102                     printf("%s", "Error: Bad character translation table in file " ftr ", line " ln "\n") > "/dev/stderr"
 103                     error = 1
 104                     exit
 105                 }
 106             }
 107             # Store characters in translation table.
 108             for (i=2; i<=cols; i=i+2)
 109                 tr[$i] = $2
 110         }
 111     }
 112 #    print(ln " lines from translation file " ftr " read OK.")
 113 #    for (ch in tr)
 114 #        print(ch, tr[ch])
 115     # Reset regular field splitting mode.
 116     FS = FS
 117     return
 118 }
 119
 120
 121
 122 # First, read translate file, whose name is defined in variable ftr on the
 123 # command line.
 124 BEGIN {
 125     # Check if translate file name is set.
 126     if (ftr == "") {
 127         printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
 128         error = 1
 129         exit
 130     }
 131     # Read translate file and build translate table.
 132     read_translate_file(ftr)
 133 }
 134
 135
 136
 137 # Read DIFF file's added lines.
 138 /^> / {
 139     # Store added word in field with:
 140     #   key = <normalized word>,
 141     #   value = <patgen input word>.
 142     # A normalized word is lower case only with hyphens removed.  Example:
 143     #   word_in["tafelsilber"] = "Ta-fel-sil-ber"
 144     k = normalize_word($2)
 145     v = $2
 146     word_in[k] = v
 147 }
 148 # Read DIFF file's removed lines.
 149 /^< / {
 150     # Store removed word with:
 151     #   key = <normalized word>,
 152     #   value = <patgen input word>.
 153     # A normalized word is lower case only with hyphens removed.  Example:
 154     #   word_out["tafelsilber"] = "Ta-fel-sil-ber"
 155     k = normalize_word($2)
 156     v = $2
 157     word_out[k] = v
 158 }
 159
 160
 161
 162 END {
 163     if (error) {
 164         exit error
 165     }
 166     for (word in word_in) {
 167         if (word in word_out) {
 168             # Changed word.
 169             # Check for case changes only.
 170             lword_in = tr_tolower(word_in[word])
 171             lword_out = tr_tolower(word_out[word])
 172             if (lword_in == lword_out) {
 173                 # Case change only.
 174                 CaSe[word] = word_in[word]
 175             }
 176             else {
 177                 # Hyphenation corrected.
 178                 hyph[word] = word_in[word]
 179             }
 180         }
 181         else {
 182             # Added word.
 183             added[word] = word_in[word]
 184         }
 185     }
 186     for (word in word_out) {
 187         if (word in word_in) {
 188             # Changed word.
 189             # Already processed in above loop.
 190         }
 191         else {
 192             # Removed word.
 193             removed[word] = word_out[word]
 194         }
 195     }
 196     # Save input file name.
 197     fdiff = FILENAME
 198     # Output results.
 199     print("Processed file " fdiff ".")
 200     output_word_class(added, "added")
 201     output_word_class(removed, "removed")
 202     output_word_class(hyph, "hyph")
 203     output_word_class(CaSe, "case")
 204 }