3 # Dieses Skript liest eine DIFF-Datei der Patgen-Eingabelisten (siehe
4 # Skript patgen-list-diff.sh) und zerlegt sie in Wörter,
6 # * die neu hinzugefügt,
7 # * die entfernt,
8 # * deren Trennung korrigiert und
9 # * deren Klein- und Großschreibung korrigiert
11 # wurde. Die Wörter werden in Dateien der Form <Eingabedatei>.<ext>
12 # gespeichert. <ext> ist entsprechend 'added', 'removed', 'case' oder
13 # 'hyph'. Beim Aufruf des Skripts muss die Variable 'ftr' mit dem Namen
14 # der Translate-Datei für Patgen vorbelegt werden:
15 # gawk -v ftr=<translate datei> ...
19 # Translate a string to lower case characters as defined by the rules in the translate file.
21 function tr_tolower
(s
) {
24 for (i=
1; i
<=l
; ++i
) {
27 printf("%s", "Error: Bad character '" ch
"' in string " s
" (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
41 function normalize_word
(word
) {
43 word = tr_tolower
(word
)
49 # Output all words of a class (added, removed, ...) to a file. Output
50 # number of words in class on command-line.
51 function output_word_class
(clarr
, clname
) {
52 fname = fdiff
"." clname
54 # Create output file and output word class unsorted.
55 printf "" > fname
".unsort"
58 print(clarr
[word
]) >> fname
".unsort"
61 # Sort output file on shell.
62 system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " fname
".unsort > " fname
)
63 system("rm -f " fname
".unsort")
69 # Read translate file via getline and build translate table used for
70 # validating and normalizing words. A word is valid, if it consists
71 # entirely of characters that are indices in table tr.
72 function read_translate_file
(ftr
) {
73 # Switch to fixed-width field splitting mode. The separator in
74 # translate files is not known in advance and can change from line to line.
76 FIELDWIDTHS =
"1 1 1 1 1 1 1"
77 # Skip first line containing hyphenation minima and hyphenation characters.
80 # The hyphen is a valid character.
82 # NR and FNR aren't updated when reading a file via getline. So we
83 # count lines manually.
85 # Read lines from translate file.
86 while (getline < ftr
> 0) {
88 # Skip comments (column 1 == column 2).
90 # Determine number of columns on line.
92 while ($
(cols
+1) != "") {
95 # Check character translation table format. Check if
96 # separators are all equal to that in column 1.
97 for (i=
3; i
<=cols
; i=i
+2) {
99 printf("%s", "Error: Bad character translation table in file " ftr
", line " ln
"\n") > "/dev/stderr"
104 # Store characters in translation table.
105 for (i=
2; i
<=cols
; i=i
+2)
109 # print(ln " lines from translation file " ftr " read OK.")
112 # Reset regular field splitting mode.
119 # First, read translate file, whose name is defined in variable ftr on the command line.
122 # Check if translate file name is set.
124 printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
128 # Read translate file and build translate table.
129 read_translate_file
(ftr
)
134 # Read DIFF file's added lines.
136 # Store added word in field with:
137 # key = <normalized word>,
138 # value = <patgen input word>.
139 # A normalized word is lower case only with hyphens removed. Example:
140 # word_in["tafelsilber"] = "Ta-fel-sil-ber"
141 k = normalize_word
($
2)
145 # Read DIFF file's removed lines.
147 # Store removed word with:
148 # key = <normalized word>,
149 # value = <patgen input word>.
150 # A normalized word is lower case only with hyphens removed. Example:
151 # word_out["tafelsilber"] = "Ta-fel-sil-ber"
152 k = normalize_word
($
2)
163 for (word in word_in
) {
164 if (word in word_out
) {
166 # Check for case changes only.
167 lword_in = tr_tolower
(word_in
[word
])
168 lword_out = tr_tolower
(word_out
[word
])
169 if (lword_in == lword_out
) {
171 case
[word
] = word_in
[word
]
174 # Hyphenation corrected.
175 hyph
[word
] = word_in
[word
]
180 added
[word
] = word_in
[word
]
183 for (word in word_out
) {
184 if (word in word_in
) {
186 # Already processed in above loop.
190 removed
[word
] = word_out
[word
]
193 # Save input file name.
196 print("Processed file " fdiff
".")
197 n_added = output_word_class
(added
, "added")
198 n_removed = output_word_class
(removed
, "removed")
199 n_hyph = output_word_class
(hyph
, "hyph")
200 n_case = output_word_class
(case
, "case")
201 # Output entry for table in CHANGES file (in Markdown).
202 printf(" %11d %8d %10d\n", n_added
, n_removed
, n_hyph
) >> "CHANGES.table.txt"