Imported from antiword-0.30.tar.gz.
[antiword.git] / chartrans.c
blob43915920a5338bddd0ad718c42e730d4e502365e
/*
 * chartrans.c
 * Copyright (C) 1999,2000 A.J. van Os
 *
 * Description:
 * Translate characters to ISO-8859-1
 */
9 #include <ctype.h>
10 #include <limits.h>
11 #include "antiword.h"
14 * iTranslateCharacters - Translate characters to ISO-8859-1
16 * Translate Macintosh characters, Window characters and some Unicode
17 * characters into ISO-8859-1 (aka Latin1).
19 * returns the translated character or EOF if the input character should
20 * be ignored
22 int
23 iTranslateCharacters(int iChar, int iFileOffset, BOOL bMacFile)
25 if (bMacFile) {
26 /* Translate special Macintosh characters */
27 switch (iChar) {
28 case MAC_CAPTAL_A_DIAERESIS:
29 return OUR_CAPTAL_A_DIAERESIS;
30 case MAC_CAPTAL_A_RING_ABOVE:
31 return OUR_CAPTAL_A_RING_ABOVE;
32 case MAC_CAPTAL_C_CEDILLA:
33 return OUR_CAPTAL_C_CEDILLA;
34 case MAC_CAPTAL_O_ACUTE_ACCENT:
35 return OUR_CAPTAL_O_ACUTE_ACCENT;
36 case MAC_CAPTAL_N_TILDE:
37 return OUR_CAPTAL_N_TILDE;
38 case MAC_CAPTAL_O_DIAERESIS:
39 return OUR_CAPTAL_O_DIAERESIS;
40 case MAC_CAPTAL_U_DIAERESIS:
41 return OUR_CAPTAL_U_DIAERESIS;
42 case MAC_SMALL_A_ACUTE_ACCENT:
43 return OUR_SMALL_A_ACUTE_ACCENT;
44 case MAC_SMALL_A_GRAVE_ACCENT:
45 return OUR_SMALL_A_GRAVE_ACCENT;
46 case MAC_SMALL_A_CIRCUMFLEX_ACCENT:
47 return OUR_SMALL_A_CIRCUMFLEX_ACCENT;
48 case MAC_SMALL_A_DIAERESIS:
49 return OUR_SMALL_A_DIAERESIS;
50 case MAC_SMALL_A_TILDE:
51 return OUR_SMALL_A_TILDE;
52 case MAC_SMALL_A_RING_ABOVE:
53 return OUR_SMALL_A_RING_ABOVE;
54 case MAC_SMALL_C_CEDILLA:
55 return OUR_SMALL_C_CEDILLA;
56 case MAC_SMALL_E_ACUTE_ACCENT:
57 return OUR_SMALL_E_ACUTE_ACCENT;
58 case MAC_SMALL_E_GRAVE_ACCENT:
59 return OUR_SMALL_E_GRAVE_ACCENT;
60 case MAC_SMALL_E_CIRCUMFLEX_ACCENT:
61 return OUR_SMALL_E_CIRCUMFLEX_ACCENT;
62 case MAC_SMALL_E_DIAERESIS:
63 return OUR_SMALL_E_DIAERESIS;
64 case MAC_SMALL_I_ACUTE_ACCENT:
65 return OUR_SMALL_I_ACUTE_ACCENT;
66 case MAC_SMALL_I_GRAVE_ACCENT:
67 return OUR_SMALL_I_GRAVE_ACCENT;
68 case MAC_SMALL_I_CIRCUMFLEX_ACCENT:
69 return OUR_SMALL_I_CIRCUMFLEX_ACCENT;
70 case MAC_SMALL_I_DIAERESIS:
71 return OUR_SMALL_I_DIAERESIS;
72 case MAC_SMALL_N_TILDE:
73 return OUR_SMALL_N_TILDE;
74 case MAC_SMALL_O_ACUTE_ACCENT:
75 return OUR_SMALL_O_ACUTE_ACCENT;
76 case MAC_SMALL_O_GRAVE_ACCENT:
77 return OUR_SMALL_O_GRAVE_ACCENT;
78 case MAC_SMALL_O_CIRCUMFLEX_ACCENT:
79 return OUR_SMALL_O_CIRCUMFLEX_ACCENT;
80 case MAC_SMALL_O_DIAERESIS:
81 return OUR_SMALL_O_DIAERESIS;
82 case MAC_SMALL_O_TILDE:
83 return OUR_SMALL_O_TILDE;
84 case MAC_SMALL_U_ACUTE_ACCENT:
85 return OUR_SMALL_U_ACUTE_ACCENT;
86 case MAC_SMALL_U_GRAVE_ACCENT:
87 return OUR_SMALL_U_GRAVE_ACCENT;
88 case MAC_SMALL_U_CIRCUMFLEX_ACCENT:
89 return OUR_SMALL_U_CIRCUMFLEX_ACCENT;
90 case MAC_SMALL_U_DIAERESIS:
91 return OUR_SMALL_U_DIAERESIS;
92 case MAC_SMALL_SHARP_S:
93 return OUR_SMALL_SHARP_S;
94 case MAC_LEFT_DOUBLE_QMARK:
95 return OUR_LEFT_DOUBLE_QMARK;
96 case MAC_RIGHT_DOUBLE_QMARK:
97 return OUR_RIGHT_DOUBLE_QMARK;
98 case MAC_EN_DASH:
99 return OUR_EN_DASH;
100 case MAC_EM_DASH:
101 return OUR_EM_DASH;
102 case MAC_OPENING_DOUBLE_QUOTE:
103 return OUR_OPENING_DOUBLE_QUOTE;
104 case MAC_CLOSING_DOUBLE_QUOTE:
105 return OUR_CLOSING_DOUBLE_QUOTE;
106 case MAC_LEFT_SINGLE_QUOTE:
107 return OUR_LEFT_SINGLE_QUOTE;
108 case MAC_RIGHT_SINGLE_QUOTE:
109 return OUR_RIGHT_SINGLE_QUOTE;
110 default:
111 break;
115 /* Translate characters to ISO-8859-1 */
116 switch (iChar) {
117 case IGNORE_CHAR:
118 case ANNOTATION:
119 case FRAME:
120 case WORD_SOFT_HYPHEN:
121 case UNICODE_HYPHENATION_POINT:
122 return EOF;
123 case PICTURE:
124 case TABLE_SEPARATOR:
125 case TAB:
126 case HARD_RETURN:
127 case FORM_FEED:
128 case PAR_END:
129 case COLUMN_FEED:
130 return iChar;
131 case FOOTNOTE_OR_ENDNOTE:
132 NO_DBG_HEX(iFileOffset);
133 switch (eGetNotetype(iFileOffset)) {
134 case notetype_is_footnote:
135 return FOOTNOTE_CHAR;
136 case notetype_is_endnote:
137 return ENDNOTE_CHAR;
138 default:
139 return UNKNOWN_NOTE_CHAR;
141 case WORD_UNBREAKABLE_JOIN:
142 case UNICODE_NON_BREAKING_HYPHEN:
143 return OUR_UNBREAKABLE_JOIN;
144 case WORD_EURO_SIGN:
145 case UNICODE_EURO_SIGN:
146 return OUR_EURO_SIGN;
147 case WORD_CEDILLA:
148 return OUR_CEDILLA;
149 case WORD_DUTCH_GUILDER_SIGN:
150 return OUR_DUTCH_GUILDER_SIGN;
151 case WORD_LOW_DOUBLE_QUOTE:
152 case UNICODE_LOW_DOUBLE_QUOTE:
153 return OUR_LOW_DOUBLE_QUOTE;
154 case WORD_ELLIPSIS:
155 case UNICODE_ELLIPSIS:
156 return OUR_ELLIPSIS;
157 case WORD_DAGGER:
158 case UNICODE_DAGGER:
159 return OUR_DAGGER;
160 case WORD_DOUBLE_DAGGER:
161 case UNICODE_DOUBLE_DAGGER:
162 return OUR_DOUBLE_DAGGER;
163 case WORD_NON_SPACING_CIRCUMFLEX_ACCENT:
164 return OUR_NON_SPACING_CIRCUMFLEX_ACCENT;
165 case WORD_PER_MILLE_SIGN:
166 case UNICODE_PER_MILLE_SIGN:
167 return OUR_PER_MILLE_SIGN;
168 case WORD_LEFT_SINGLE_QMARK:
169 case UNICODE_LEFT_SINGLE_QMARK:
170 return OUR_LEFT_SINGLE_QMARK;
171 case WORD_CAPITAL_LIGATURE_OE:
172 return OUR_CAPITAL_LIGATURE_OE;
173 case WORD_LEFT_SINGLE_QUOTE:
174 case UNICODE_LEFT_SINGLE_QUOTE:
175 case UNICODE_LEFT_SINGLE_QUOTE_ALT:
176 return OUR_LEFT_SINGLE_QUOTE;
177 case WORD_RIGHT_SINGLE_QUOTE:
178 case UNICODE_RIGHT_SINGLE_QUOTE:
179 case UNICODE_RIGHT_SINGLE_QUOTE_ALT:
180 return OUR_RIGHT_SINGLE_QUOTE;
181 case WORD_OPENING_DOUBLE_QUOTE:
182 case UNICODE_OPENING_DOUBLE_QUOTE:
183 return OUR_OPENING_DOUBLE_QUOTE;
184 case WORD_CLOSING_DOUBLE_QUOTE:
185 case UNICODE_CLOSING_DOUBLE_QUOTE:
186 return OUR_CLOSING_DOUBLE_QUOTE;
187 case WORD_BULLET:
188 case UNICODE_BULLET:
189 case UNICODE_BULLET_ALT:
190 return OUR_BULLET;
191 case WORD_EM_DASH:
192 case UNICODE_EM_DASH:
193 case UNICODE_HORIZONTAL_BAR:
194 return OUR_EM_DASH;
195 case WORD_EN_DASH:
196 case UNICODE_EN_DASH:
197 case UNICODE_FIGURE_DASH:
198 return OUR_EN_DASH;
199 case WORD_NON_SPACING_TILDE:
200 return OUR_NON_SPACING_TILDE;
201 case WORD_TRADEMARK:
202 case UNICODE_TRADEMARK:
203 return OUR_TRADEMARK;
204 case WORD_RIGHT_SINGLE_QMARK:
205 case UNICODE_RIGHT_SINGLE_QMARK:
206 return OUR_RIGHT_SINGLE_QMARK;
207 case WORD_SMALL_LIGATURE_OE:
208 return OUR_SMALL_LIGATURE_OE;
209 case UNICODE_CAPITAL_W_CIRCUMFLEX_ACCENT:
210 return OUR_CAPITAL_W_CIRCUMFLEX_ACCENT;
211 case UNICODE_SMALL_W_CIRCUMFLEX_ACCENT:
212 return OUR_SMALL_W_CIRCUMFLEX_ACCENT;
213 case UNICODE_CAPITAL_Y_CIRCUMFLEX_ACCENT:
214 return OUR_CAPITAL_Y_CIRCUMFLEX_ACCENT;
215 case UNICODE_SMALL_Y_CIRCUMFLEX_ACCENT:
216 return OUR_SMALL_Y_CIRCUMFLEX_ACCENT;
217 case UNICODE_HYPHEN:
218 return OUR_HYPHEN;
219 case UNICODE_DOUBLE_VERTICAL_LINE:
220 return OUR_DOUBLE_VERTICAL_LINE;
221 case UNICODE_DOUBLE_LOW_LINE:
222 return OUR_DOUBLE_LOW_LINE;
223 case UNICODE_FRACTION_SLASH:
224 return OUR_FRACTION_SLASH;
225 case UNICODE_WHITE_SMILING_FACE:
226 return OUR_WHITE_SMILING_FACE;
227 case UNICODE_BLACK_SMILING_FACE:
228 return OUR_BLACK_SMILING_FACE;
229 case UNICODE_DIAMOND:
230 return OUR_DIAMOND;
231 case UNICODE_COPYRIGHT:
232 return OUR_COPYRIGHT;
233 default:
234 DBG_HEX_C(iChar >= 0x80 && iChar <= 0x9f, iFileOffset);
235 DBG_HEX_C(iChar >= 0x80 && iChar <= 0x9f, iChar);
236 DBG_HEX_C(iChar < 0x20 || iChar > 0xff, iFileOffset);
237 DBG_HEX_C(iChar < 0x20 || iChar > 0xff, iChar);
238 if (iChar < 0x20) {
239 /* A control character slipped through */
240 return EOF;
242 if (iChar > 0xff) {
243 /* Untranslated Unicode character */
244 return '?';
246 return iChar;
248 } /* end of iTranslateCharacters */
/*
 * iToUpper - convert letter to upper case
 *
 * This function converts a letter to upper case. Unlike toupper(3) this
 * function is independent from the settings of locale. This comes in handy
 * for people who have to read Word documents in more than one language or
 * documents that contain more than one language.
 *
 * iChar	the character to convert (US ASCII or ISO-8859-1)
 *
 * returns the converted letter, or iChar if the conversion was not possible.
 */
int
iToUpper(int iChar)
{
	if (iChar >= 'a' && iChar <= 'z') {
		/*
		 * US ASCII lower case letter.
		 * Converted by hand: toupper(3) follows the current locale,
		 * which would break the promise made above.
		 */
		return iChar - 'a' + 'A';
	}
	if (iChar >= 0xe0 && iChar <= 0xfe && iChar != 0xf7) {
		/*
		 * Lower case accented characters
		 * 0xf7 is Division sign; 0xd7 is Multiplication sign
		 * 0xff is y with diaeresis; 0xdf is Sharp s
		 */
		return iChar & ~0x20;
	}
	/* Everything else has no upper case counterpart in ISO-8859-1 */
	return iChar;
} /* end of iToUpper */