3 # Dieses Skript liest eine DIFF-Datei der Patgen-Eingabelisten (siehe
4 # Skript diff-patgen-input.sh) und zerlegt sie in Wörter,
6 # * die neu hinzugefügt,
7 # * die entfernt,
8 # * deren Trennung korrigiert und
9 # * deren Klein- und Großschreibung korrigiert
11 # wurde. Die Wörter werden in Dateien der Form <Eingabedatei>.<ext>
12 # gespeichert. <ext> ist entsprechend 'added', 'removed', 'case' oder
13 # 'hyph'. Beim Aufruf des Skripts muss die Variable 'ftr' mit dem Namen
14 # der Translate-Datei für Patgen vorbelegt werden:
15 # gawk -v ftr=<translate datei> ...
19 # Translate a string to lower case characters as defined by the rules in
20 # the translate file.
21 function tr_tolower
(s
) {
24 for (i=
1; i
<=l
; ++i
) {
27 printf("%s", "Error: Bad character '" ch
"' in string " s
" (" FILENAME ", line " FNR ")\n") > "/dev/stderr"
41 function normalize_word
(word
) {
43 word = tr_tolower
(word
)
49 # Output all words of a class (added, removed, ...) to a file. Output
50 # number of words in class on command-line.
51 function output_word_class
(clarr
, clname
) {
52 fname = fdiff
"." clname
54 # Create output file and output word class unsorted.
55 printf "" > fname
".unsort"
58 print(clarr
[word
]) >> fname
".unsort"
61 # Sort output file on shell.
62 system("LC_COLLATE=de_DE.ISO8859-15 LC_CTYPE=de_DE.ISO8859-15 sort -f " fname
".unsort > " fname
)
63 system("rm -f " fname
".unsort")
68 # Read translate file via getline and build translate table used for
69 # validating and normalizing words. A word is valid, if it consists
70 # entirely of characters that are indices in table tr.
71 function read_translate_file
(ftr
) {
72 # Switch to fixed-width field splitting mode. The separator in
73 # translate files is not known in advance and can change from line
74 # to line.
75 FIELDWIDTHS =
"1 1 1 1 1 1 1"
76 # Skip first line containing hyphenation minima and hyphenation
79 # The hyphen is a valid character.
81 # NR and FNR aren't updated when reading a file via getline. So we
82 # count lines manually.
84 # Read lines from translate file.
85 while (getline < ftr
> 0) {
87 # Skip comments (column 1 == column 2).
89 # Determine number of columns on line.
91 while ($
(cols
+1) != "") {
94 # Ignore trailing separators.
95 while ((cols
> 0) && ($cols == $
1)) {
98 # Check character translation table format. Check if
99 # separators are all equal to that in column 1.
100 for (i=
3; i
<=cols
; i=i
+2) {
102 printf("%s", "Error: Bad character translation table in file " ftr
", line " ln
"\n") > "/dev/stderr"
107 # Store characters in translation table.
108 for (i=
2; i
<=cols
; i=i
+2)
112 # print(ln " lines from translation file " ftr " read OK.")
115 # Reset regular field splitting mode.
122 # First, read translate file, whose name is defined in variable ftr on the
123 # command-line.
125 # Check if translate file name is set.
127 printf("%s", "Error: Translate file name missing!\nPlease set-up variable 'ftr' like: gawk -v ftr=<translate file> ...\n") > "/dev/stderr"
131 # Read translate file and build translate table.
132 read_translate_file
(ftr
)
137 # Read DIFF file's added lines.
139 # Store added word in field with:
140 # key = <normalized word>,
141 # value = <patgen input word>.
142 # A normalized word is lower case only with hyphens removed. Example:
143 # word_in["tafelsilber"] = "Ta-fel-sil-ber"
144 k = normalize_word
($
2)
148 # Read DIFF file's removed lines.
150 # Store removed word with:
151 # key = <normalized word>,
152 # value = <patgen input word>.
153 # A normalized word is lower case only with hyphens removed. Example:
154 # word_out["tafelsilber"] = "Ta-fel-sil-ber"
155 k = normalize_word
($
2)
166 for (word in word_in
) {
167 if (word in word_out
) {
169 # Check for case changes only.
170 lword_in = tr_tolower
(word_in
[word
])
171 lword_out = tr_tolower
(word_out
[word
])
172 if (lword_in == lword_out
) {
174 CaSe
[word
] = word_in
[word
]
177 # Hyphenation corrected.
178 hyph
[word
] = word_in
[word
]
183 added
[word
] = word_in
[word
]
186 for (word in word_out
) {
187 if (word in word_in
) {
189 # Already processed in above loop.
193 removed
[word
] = word_out
[word
]
196 # Save input file name.
199 print("Processed file " fdiff
".")
200 output_word_class
(added
, "added")
201 output_word_class
(removed
, "removed")
202 output_word_class
(hyph
, "hyph")
203 output_word_class
(CaSe
, "case")