Python Skript update.
[wortliste.git] / skripte / diff-patgen-input.awk
blobb77ebf1e91d488459a9184f9adb55322202c5bcc
1 #!/usr/bin/awk -f
3 # Dieses Skript liest eine DIFF-Datei der Patgen-Eingabelisten (siehe
4 # Skript diff-patgen-input.sh) und zerlegt sie in Wörter,
6 # * die neu hinzugefügt,
7 # * die entfernt,
8 # * deren Trennung korrigiert und
9 # * deren Klein- und Großschreibung korrigiert
11 # wurde. Die Wörter werden in Dateien der Form <Eingabedatei>.<ext>
12 # gespeichert. <ext> ist entsprechend 'added', 'removed', 'case' oder
13 # 'hyph'. Beim Aufruf des Skripts muss die Variable 'ftr' mit dem Namen
14 # der Translate-Datei für Patgen vorbelegt werden:
15 # gawk -v ftr=<translate datei> ...
# Translate a string to lower case characters as defined by the rules in
# the translate file.
#
# Parameters:
#   s - the string to translate.  Every character of s must be an index
#       of the global translation table 'tr'.
# Returns:
#   The concatenation of tr[c] for every character c of s.
# Errors:
#   If a character has no (non-empty) entry in 'tr', an error message is
#   written to /dev/stderr, the global 'error' is set to 1 and 'exit' is
#   called.
#
# The names after the wide gap in the parameter list (len, pos, ch, trs)
# are local variables; callers pass only 's'.  Declaring them here keeps
# the function from clobbering globals such as loop counters of callers.
function tr_tolower(s,    len, pos, ch, trs) {
    len = length(s)
    trs = ""
    for (pos = 1; pos <= len; ++pos) {
        ch = substr(s, pos, 1)
        # Test membership with 'in' first, so that probing an unknown
        # character does not create an empty entry in the table.
        if (!(ch in tr) || tr[ch] == "") {
            printf("%s", "Error: Bad character '" ch "' in string " s " (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
            error = 1
            exit
        }
        trs = trs tr[ch]
    }
    return trs
}
# Bring a word into its normalized form, which is used as the lookup key
# for the word tables:
#   * every hyphen is removed,
#   * the remaining characters are translated via tr_tolower().
#
# Parameters:
#   word - the word as it appears in the patgen input list.
# Returns:
#   The normalized word.
function normalize_word(word) {
    # 'word' is a by-value copy, so stripping hyphens in place is safe.
    gsub(/-/, "", word)
    return tr_tolower(word)
}
# Output all words of a class (added, removed, ...) to a file.  Output
# number of words in class on command-line.
#
# Parameters:
#   clarr  - array holding the words of the class (the values are written,
#            the indices are ignored).
#   clname - name of the class; used as the extension of the output file
#            and as the label of the count printed on standard output.
#
# The output file is named <fdiff>.<clname>, where 'fdiff' is a global
# that must be set by the caller.  Words are first written to
# <fdiff>.<clname>.unsort, then sorted by the external 'sort' command
# into the final file, after which the temporary file is removed.
#
# The names after the wide gap in the parameter list are local variables;
# callers pass only 'clarr' and 'clname'.
function output_word_class(clarr, clname,    fname, unsorted, count, word) {
    fname = fdiff "." clname
    unsorted = fname ".unsort"
    count = 0
    # Create (or truncate) the temporary file, so that it exists even if
    # the class is empty.
    printf "" > unsorted
    for (word in clarr) {
        ++count
        print clarr[word] > unsorted
    }
    # Flush and close the temporary file before the external commands
    # read and delete it.
    close(unsorted)
    print(clname ": " count)
    # Sort output file on shell.
    system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " unsorted " > " fname)
    system("rm -f " unsorted)
}
# Read translate file via getline and build translate table used for
# validating and normalizing words.  A word is valid, if it consists
# entirely of characters that are indices in table tr.
#
# Parameters:
#   ftr - name of the translate file.
#
# Side effects:
#   * Fills the global array 'tr': every character found in an even
#     column of a non-comment line maps to the character in column 2 of
#     that line.  The hyphen maps to itself.
#   * Temporarily sets FIELDWIDTHS (a gawk extension) and switches back
#     to FS based field splitting before returning.
#   * $0 and the fields are overwritten by the lines read.
#
# Errors:
#   If the file cannot be read, or a line does not use the separator of
#   column 1 in every odd column, an error message is written to
#   /dev/stderr, the global 'error' is set to 1 and 'exit' is called.
#
# The names after the wide gap in the parameter list (ln, cols, i) are
# local variables; callers pass only 'ftr'.
function read_translate_file(ftr,    ln, cols, i) {
    # Switch to fixed-width field splitting mode.  The separator in
    # translate files is not known in advance and can change from line
    # to line.
    FIELDWIDTHS = "1 1 1 1 1 1 1"
    # Skip first line containing hyphenation minima and hyphenation
    # characters.  A negative result means the file could not be read;
    # report that instead of continuing with an empty table.
    if ((getline < ftr) < 0) {
        printf("%s", "Error: Cannot read translate file " ftr "\n") > "/dev/stderr"
        error = 1
        exit
    }
    # The hyphen is a valid character.
    tr["-"] = "-"
    # NR and FNR aren't updated when reading a file via getline.  So we
    # count lines manually.
    ln = 1
    # Read lines from translate file.
    while ((getline < ftr) > 0) {
        ++ln
        # Skip comments (column 1 == column 2).
        if ($1 != $2) {
            # Determine number of columns on line.
            cols = 0
            while ($(cols+1) != "") {
                cols++
            }
            # Ignore trailing separators.
            while ((cols > 0) && ($cols == $1)) {
                cols--
            }
            # Check character translation table format.  Check if
            # separators are all equal to that in column 1.
            for (i = 3; i <= cols; i = i + 2) {
                if ($i != $1) {
                    printf("%s", "Error: Bad character translation table in file " ftr ", line " ln "\n") > "/dev/stderr"
                    error = 1
                    exit
                }
            }
            # Store characters in translation table.
            for (i = 2; i <= cols; i = i + 2)
                tr[$i] = $2
        }
    }
    # Release the file; it is not needed any more.
    close(ftr)
    # Reset regular field splitting mode (assigning FS makes gawk leave
    # FIELDWIDTHS mode).
    FS = FS
    return
}
# Start-up: before any input is processed, load the translate table from
# the file whose name was passed in variable 'ftr' on the command line.
BEGIN {
    if (ftr != "") {
        # Read translate file and build translate table.
        read_translate_file(ftr)
    }
    else {
        # No translate file name given: report, remember the failure for
        # the END rule and stop.
        printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
        error = 1
        exit
    }
}
# Added lines of the DIFF file (lines starting with "> ").
#
# The word in the second field is recorded in table word_in, indexed by
# its normalized form (lower case only, hyphens removed) and holding the
# word exactly as it appears in the patgen input.  Example:
#     word_in["tafelsilber"] = "Ta-fel-sil-ber"
/^> / {
    word_in[normalize_word($2)] = $2
}
# Removed lines of the DIFF file (lines starting with "< ").
#
# The word in the second field is recorded in table word_out, indexed by
# its normalized form (lower case only, hyphens removed) and holding the
# word exactly as it appears in the patgen input.  Example:
#     word_out["tafelsilber"] = "Ta-fel-sil-ber"
/^< / {
    word_out[normalize_word($2)] = $2
}
# Final step: sort every word seen in the DIFF file into one of the
# classes added / removed / hyph / case and write each class to its file.
END {
    # An error was flagged earlier: pass it on as exit status and do not
    # produce any output files.
    if (error) {
        exit error
    }
    # Classify the words that occur on added lines.
    for (word in word_in) {
        if (!(word in word_out)) {
            # Only on an added line: a new word.
            added[word] = word_in[word]
            continue
        }
        # On both sides: the word was changed.  Compare both spellings
        # after lower-casing (hyphens are kept) to tell the two kinds of
        # change apart.
        lower_in = tr_tolower(word_in[word])
        lower_out = tr_tolower(word_out[word])
        if (lower_in != lower_out) {
            # Hyphenation corrected.
            hyph[word] = word_in[word]
        }
        else {
            # Case change only.  (The array is spelled 'CaSe', presumably
            # because 'case' is a reserved word in gawk.)
            CaSe[word] = word_in[word]
        }
    }
    # Words that occur only on removed lines were removed; changed words
    # have already been handled by the loop above.
    for (word in word_out) {
        if (!(word in word_in)) {
            removed[word] = word_out[word]
        }
    }
    # Save input file name; output_word_class() derives the output file
    # names from it.
    fdiff = FILENAME
    # Output results.
    print("Processed file " fdiff ".")
    output_word_class(added, "added")
    output_word_class(removed, "removed")
    output_word_class(hyph, "hyph")
    output_word_class(CaSe, "case")
}