Thukydides I (Bredow, Heilmann 1812)
[wortliste.git] / patgen-list-diff.awk
blobd3b4544691a82d9b2f94dfa100612134487ad7a2
1 #!/usr/bin/awk -f
# This script reads a DIFF file of the Patgen input lists (see script
# patgen-list-diff.sh) and splits it into words
#
#   * that were newly added,
#   * that were removed,
#   * whose hyphenation was corrected, and
#   * whose lower/upper case was corrected.
#
# The words are stored in files named <input file>.<ext>, where <ext> is
# 'added', 'removed', 'case' or 'hyph', respectively. When calling the
# script, the variable 'ftr' must be preset with the name of the
# translate file for Patgen:
#
#   gawk -v ftr=<translate file> ...
# Translate a string to lower case characters as defined by the rules in
# the translate file.
#
# Parameters:
#   s - string to translate; every character must be an index of the
#       global translation table tr.
#
# Returns the string built from tr[<character>] for each character of s.
#
# On a character without a (non-empty) entry in tr, an error message is
# written to /dev/stderr, the global flag 'error' is set to 1 and the
# script is terminated via 'exit'.
#
# The names after the extra spaces in the parameter list are local
# variables; callers pass only s. Keeping them local prevents this
# function from clobbering global variables of the same name (e.g. a
# loop index 'i' used elsewhere).
function tr_tolower(s,    len, pos, ch, out) {
    len = length(s)
    out = ""
    for (pos = 1; pos <= len; ++pos) {
        ch = substr(s, pos, 1)
        # Test membership first: merely referencing tr[ch] for an unknown
        # character would silently create an empty element in the table.
        if (!(ch in tr) || tr[ch] == "") {
            printf("%s", "Error: Bad character '" ch "' in string " s " (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
            error = 1
            exit
        }
        out = out tr[ch]
    }
    return out
}
# Return the normalized form of a word:
#   * all hyphens are stripped,
#   * the remainder is converted to lower case via tr_tolower().
#
# The argument is a scalar and therefore passed by value; the caller's
# variable is not modified.
function normalize_word(word) {
    gsub(/-/, "", word)
    return tr_tolower(word)
}
# Output all words of a class (added, removed, ...) to a file and print
# the number of words in the class on standard output.
#
# Parameters:
#   clarr  - array holding the words of the class (values are written).
#   clname - class name; used as file extension and as label of the
#            count printed on standard output.
#
# Reads the global variable fdiff (base name of the output file). The
# words are first written unsorted to <fdiff>.<clname>.unsort, which is
# then sorted by the external 'sort' command into <fdiff>.<clname> and
# finally removed.
#
# Returns the number of words written.
#
# The names after the extra spaces in the parameter list are local
# variables; callers pass only clarr and clname.
function output_word_class(clarr, clname,    target, unsorted, count, key, qtarget, qunsorted) {
    target = fdiff "." clname
    unsorted = target ".unsort"
    count = 0
    # Create (truncate) the temporary file so that it exists even for an
    # empty class, then write the class unsorted.
    printf "" > unsorted
    for (key in clarr) {
        ++count
        print(clarr[key]) > unsorted
    }
    # Close the file before handing it to the shell: this guarantees that
    # all buffered output is on disk when 'sort' reads it and releases
    # the redirection before the file is deleted.
    close(unsorted)
    print(clname ": " count)
    # Quote file names for the shell so that names containing blanks or
    # shell metacharacters are handled correctly.
    qtarget = target
    gsub(/'/, "'\\''", qtarget)
    qtarget = "'" qtarget "'"
    qunsorted = unsorted
    gsub(/'/, "'\\''", qunsorted)
    qunsorted = "'" qunsorted "'"
    # Sort output file on shell.
    system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " qunsorted " > " qtarget)
    system("rm -f " qunsorted)
    return count
}
# Read translate file via getline and build the global translate table
# tr used for validating and normalizing words. A word is valid if it
# consists entirely of characters that are indices in table tr.
#
# Parameters:
#   ftr - name of the translate file.
#
# File format as handled here: the first line is skipped. Every further
# line is split into single-character columns. A line whose first two
# columns are equal is treated as a comment. Otherwise column 1 is the
# separator, the odd columns 3, 5, ... must repeat that separator and
# the even columns 2, 4, ... are characters that are all mapped to the
# character in column 2.
#
# On a read error or a malformed line, a message is written to
# /dev/stderr, the global flag 'error' is set to 1 and the script is
# terminated via 'exit'.
#
# The names after the extra spaces in the parameter list are local
# variables; callers pass only ftr.
function read_translate_file(ftr,    ln, cols, col, status) {
    # Switch to fixed-width field splitting mode (gawk extension). The
    # separator in translate files is not known in advance and can
    # change from line to line.
    FIELDWIDTHS = "1 1 1 1 1 1 1"
    # Skip first line containing hyphenation minima and hyphenation
    # characters. A negative result means the file could not be read;
    # report that here instead of failing later with a misleading
    # "Bad character" message for every word.
    status = (getline < ftr)
    if (status < 0) {
        printf("%s", "Error: Cannot read translate file " ftr "\n") > "/dev/stderr"
        error = 1
        exit
    }
    # The hyphen is a valid character.
    tr["-"] = "-"
    # NR and FNR aren't updated when reading a file via getline. So we
    # count lines manually.
    ln = 1
    # Read lines from translate file. The getline expression is
    # parenthesized so that the comparison cannot be parsed as part of
    # the file name expression.
    while ((status = (getline < ftr)) > 0) {
        ++ln
        # Skip comments (column 1 == column 2).
        if ($1 != $2) {
            # Determine number of columns on line.
            cols = 0
            while ($(cols + 1) != "") {
                cols++
            }
            # Check character translation table format. Check if
            # separators are all equal to that in column 1.
            for (col = 3; col <= cols; col += 2) {
                if ($col != $1) {
                    printf("%s", "Error: Bad character translation table in file " ftr ", line " ln "\n") > "/dev/stderr"
                    error = 1
                    exit
                }
            }
            # Store characters in translation table.
            for (col = 2; col <= cols; col += 2) {
                tr[$col] = $2
            }
        }
    }
    if (status < 0) {
        printf("%s", "Error: Cannot read translate file " ftr "\n") > "/dev/stderr"
        error = 1
        exit
    }
    # Release the input redirection; the file is not needed any more.
    close(ftr)
    # Reset regular field splitting mode.
    FS = FS
    return
}
# Start-up: the translate file named by variable ftr (set on the command
# line) is loaded before any input is processed.
BEGIN {
    if (length(ftr) > 0) {
        # Build the translate table from the given file.
        read_translate_file(ftr)
    } else {
        # No translate file name given: report and stop.
        printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
        error = 1
        exit
    }
}
# Lines added in the DIFF file (marked with '> ').
/^> / {
    # Remember the added word as it appears in the Patgen input list,
    # indexed by its normalized form (lower case, hyphens removed), e.g.
    #   word_in["tafelsilber"] = "Ta-fel-sil-ber"
    v = $2
    k = normalize_word(v)
    word_in[k] = v
}
# Lines removed in the DIFF file (marked with '< ').
/^< / {
    # Remember the removed word as it appears in the Patgen input list,
    # indexed by its normalized form (lower case, hyphens removed), e.g.
    #   word_out["tafelsilber"] = "Ta-fel-sil-ber"
    v = $2
    k = normalize_word(v)
    word_out[k] = v
}
# After all input has been read: classify the collected words, write one
# file per class and append a summary line to CHANGES.table.txt.
END {
    # An earlier rule or function flagged an error: stop with that status.
    if (error) {
        exit error
    }
    # Classify every word found on an added line.
    for (word in word_in) {
        if (word in word_out) {
            # Changed word.
            # Check for case changes only.
            lword_in = tr_tolower(word_in[word])
            lword_out = tr_tolower(word_out[word])
            if (lword_in == lword_out) {
                # Case change only.
                # Note: the array must not be named 'case', which is a
                # reserved word of gawk's switch statement.
                case_changed[word] = word_in[word]
            }
            else {
                # Hyphenation corrected.
                hyph[word] = word_in[word]
            }
        }
        else {
            # Added word.
            added[word] = word_in[word]
        }
    }
    # Words found only on removed lines; changed words were already
    # handled in the loop above.
    for (word in word_out) {
        if (!(word in word_in)) {
            # Removed word.
            removed[word] = word_out[word]
        }
    }
    # Save input file name.
    fdiff = FILENAME
    # Output results.
    print("Processed file " fdiff ".")
    n_added = output_word_class(added, "added")
    n_removed = output_word_class(removed, "removed")
    n_hyph = output_word_class(hyph, "hyph")
    n_case = output_word_class(case_changed, "case")
    # Output entry for table in CHANGES file (in Markdown).
    printf(" %11d %8d %10d\n", n_added, n_removed, n_hyph) >> "CHANGES.table.txt"
}