Imported from antiword-0.34.tar.gz.
[antiword.git] / chartrans.c
blob25cf604592f64821ca5b5de35b34b51771b8e89f
1 /*
2 * chartrans.c
3 * Copyright (C) 1999-2003 A.J. van Os; Released under GNU GPL
5 * Description:
6 * Translate Word characters to local representation
7 */
9 #include <stdlib.h>
10 #include <string.h>
11 #include <ctype.h>
12 #if defined(__STDC_ISO_10646__)
13 #include <wctype.h>
14 #endif /* __STDC_ISO_10646__ */
15 #include "antiword.h"
/*
 * Code Page 850 to Unicode, for the upper half of the character set only.
 * Indexed by (character - 0x80), so entry 0 belongs to 0x80 and the last
 * entry to 0xff. Consulted by ulTranslateCharacters when iWordVersion is 0.
 */
17 static const USHORT usCp850[] = {
18 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
19 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
20 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
21 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192,
22 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
23 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
24 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0,
25 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510,
26 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
27 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
28 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce,
29 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
30 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
31 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
32 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
33 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
/*
 * Code Page 1250 to Unicode, for the upper half of the character set only.
 * Indexed by (character - 0x80). ulTranslateCharacters uses it when the
 * output encoding is ISO 8859-2: always for 0x80..0x9f, and for 0xa0..0xff
 * only when iWordVersion is below 8.
 * NOTE(review): the 0x003f ('?') entries presumably mark code points that
 * Code Page 1250 leaves undefined - confirm against the code page chart.
 */
36 static const USHORT usCp1250[] = {
37 0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021,
38 0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
39 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
40 0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
41 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
42 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
43 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
44 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
45 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
46 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
47 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
48 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
49 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
50 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
51 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
52 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
/*
 * Code Page 1252 to Unicode for 0x80..0x9f only (32 entries), indexed by
 * (character - 0x80). ulTranslateCharacters uses it for every output
 * encoding other than ISO 8859-2; it never indexes past 0x9f.
 */
55 static const USHORT usCp1252[] = {
56 0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
57 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f,
58 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
59 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178,
/*
 * Macintosh Roman to Unicode, for the upper half of the character set only.
 * Indexed by (character - 0x80). ulTranslateCharacters uses it for
 * 0x80..0xff whenever bUseMacCharSet is set.
 */
62 static const USHORT usMacRoman[] = {
63 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
64 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
65 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
66 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
67 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
68 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
69 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
70 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8,
71 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
72 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
73 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
74 0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02,
75 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
76 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
77 0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
78 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
/*
 * "Microsoft Unicode" to real Unicode: replacement values for the private
 * use code points 0xf020..0xf0ff (224 entries), indexed by
 * (character - 0xf020).
 * An entry of 0x003f ('?') makes ulTranslateCharacters emit a debug trace
 * for the original code point before substituting it.
 * NOTE(review): the Greek letters and mathematical operators suggest this
 * mirrors the layout of the Symbol font - confirm before relying on it.
 */
81 static const USHORT usPrivateArea[] = {
82 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d,
83 0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f,
84 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
85 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f,
86 0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
87 0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
88 0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
89 0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f,
90 0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
91 0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
92 0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
93 0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f,
94 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
95 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
96 0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f,
97 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
98 0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663,
99 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
100 0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
101 0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5,
102 0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
103 0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
104 0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
105 0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,
106 0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c,
107 0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa,
108 0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f,
109 0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f,
/*
 * One entry of the run-time mapping between the local (output) character
 * set and Unicode.
 */
112 typedef struct char_table_tag {
/* Character value in the local character set (stored values are 0x80..0xff) */
113 USHORT usLocal;
/* The Unicode value of that character; this is the sort and search key */
114 USHORT usUnicode;
115 } char_table_type;
/*
 * The mapping table itself: 128 entries, one per local character 0x80..0xff.
 * bReadCharacterMappingTable fills it from a file and sorts it by usUnicode;
 * pGetCharTableRecord searches it with bsearch.
 */
117 static char_table_type atCharTable[128];
121 * iCompare - compare two records
123 * Compares two records. For use by qsort(3C) and bsearch(3C).
125 * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2
127 static int
128 iCompare(const void *pvRecord1, const void *pvRecord2)
130 USHORT usUnicode1, usUnicode2;
132 usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
133 usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
135 if (usUnicode1 < usUnicode2) {
136 return -1;
138 if (usUnicode1 > usUnicode2) {
139 return 1;
141 return 0;
142 } /* end of iCompare */
145 * pGetCharTableRecord - get the character table record
147 * returns a pointer to the record when found, otherwise NULL
149 static const char_table_type *
150 pGetCharTableRecord(USHORT usUnicode)
152 char_table_type tKey;
154 tKey.usUnicode = usUnicode;
155 tKey.usLocal = 0;
156 return (char_table_type *)bsearch(&tKey,
157 atCharTable,
158 elementsof(atCharTable), sizeof(atCharTable[0]),
159 iCompare);
160 } /* end of pGetCharTableRecord */
163 * ucGetNbspValue - get the local representation of the non-breaking space
165 UCHAR
166 ucGetNbspValue(void)
168 const char_table_type *pRec;
170 pRec = pGetCharTableRecord(0x00a0); /* Unicode non-breaking space */
171 if (pRec == NULL || pRec->usLocal > 0xff) {
172 DBG_MSG_C(pRec == NULL, "Non-breaking space record not found");
173 DBG_HEX_C(pRec != NULL, pRec->usLocal);
174 /* No value found, use the best guess */
175 return (UCHAR)0xa0;
177 return (UCHAR)pRec->usLocal;
178 } /* end of ucGetNbspValue */
181 * bReadCharacterMappingTable - read the mapping table
183 * Read the character mapping table from file and have the contents sorted
185 * returns TRUE if successful, otherwise FALSE
187 BOOL
188 bReadCharacterMappingTable(const char *szFilename)
190 FILE *pFile;
191 char *pcTmp;
192 ULONG ulUnicode;
193 UINT uiLocal;
194 int iFields;
195 char szLine[81];
197 DBG_MSG(szFilename);
199 fail(szFilename == NULL);
201 if (szFilename == NULL || szFilename[0] == '\0') {
202 return FALSE;
204 pFile = fopen(szFilename, "r");
205 if (pFile == NULL) {
206 DBG_MSG(szFilename);
207 return FALSE;
209 (void)memset(atCharTable, 0, sizeof(atCharTable));
211 while (fgets(szLine, (int)sizeof(szLine), pFile)) {
212 if (szLine[0] == '#' ||
213 szLine[0] == '\r' ||
214 szLine[0] == '\n') {
215 /* Comment or empty line */
216 continue;
218 iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
219 if (iFields != 2) {
220 pcTmp = strchr(szLine, '\r');
221 if (pcTmp != NULL) {
222 *pcTmp = '\0';
224 pcTmp = strchr(szLine, '\n');
225 if (pcTmp != NULL) {
226 *pcTmp = '\0';
228 werr(0, "Syntax error in: '%s'", szLine);
229 continue;
231 if (uiLocal > 0xff || ulUnicode > 0xffff) {
232 werr(0, "Syntax error in: '%02x %04lx'",
233 uiLocal, ulUnicode);
234 continue;
236 if (uiLocal >= 0x80) {
237 atCharTable[uiLocal - 0x80].usLocal =
238 (USHORT)uiLocal;
239 atCharTable[uiLocal - 0x80].usUnicode =
240 (USHORT)ulUnicode;
243 (void)fclose(pFile);
245 DBG_HEX(atCharTable[0].usUnicode);
246 DBG_HEX(atCharTable[elementsof(atCharTable)-1].usUnicode);
248 qsort(atCharTable,
249 elementsof(atCharTable), sizeof(atCharTable[0]),
250 iCompare);
252 DBG_HEX(atCharTable[0].usUnicode);
253 DBG_HEX(atCharTable[elementsof(atCharTable)-1].usUnicode);
255 return TRUE;
256 } /* end of bReadCharacterMappingTable */
259 * ulTranslateCharacters - Translate characters to local representation
261 * Translate all characters to local representation
263 * returns the translated character
/*
 * Maps one character read from a Word document to the value that is to be
 * written to the output.
 *
 * usChar          - the character as read from the document
 * ulFileOffset    - file offset of the character; used to find the note
 *                   type of a footnote/endnote marker and in debug output
 * iWordVersion    - 0 selects the Code Page 850 table; a value below 8
 *                   additionally enables Code Page 1250 for 0xa0..0xff
 *                   when the encoding is ISO 8859-2
 * eConversionType - only compared against conversion_ps
 * eEncoding       - the output encoding
 * bUseMacCharSet  - when set, 0x80..0xff is read as Macintosh Roman
 *
 * The stages run in this order and the order matters:
 *  1. code page / Macintosh byte to Unicode
 *  2. private use area 0xf020..0xf0ff to real Unicode
 *  3. characters with a special meaning in Word
 *  4. fullwidth Latin forms to ASCII (not for UTF-8 output)
 *  5. fixed codes 140..159 for PostScript output in ISO 8859-1
 *  6. US ASCII passes through, control characters are dropped
 *  7. UTF-8 output needs no further conversion
 *  8. lookup in the character mapping table
 *  9. hand-picked US ASCII stand-ins
 * 10. 0xa0..0xff passes through unchanged
 *
 * Returns the translated character, IGNORE_CHARACTER when the character
 * must be dropped, or 0x3f ('?') when no translation exists.
 */
265 ULONG
266 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
267 conversion_type eConversionType, encoding_type eEncoding,
268 BOOL bUseMacCharSet)
270 const char_table_type *pTmp;
/* Stage 1: the three branches below are mutually exclusive */
272 if (bUseMacCharSet) {
273 /* Translate special Macintosh characters */
274 if (usChar >= 0x80 && usChar <= 0xff) {
275 usChar = usMacRoman[usChar - 0x80];
277 } else if (iWordVersion == 0) {
278 /* From Code Page 850 to Unicode */
279 if (usChar >= 0x80 && usChar <= 0xff) {
280 usChar = usCp850[usChar - 0x80];
282 } else {
283 if (eEncoding == encoding_iso_8859_2) {
284 /* Translate implementation defined characters */
285 if (usChar >= 0x80 && usChar <= 0x9f) {
286 usChar = usCp1250[usChar - 0x80];
288 /* From Code Page 1250 to Unicode */
289 if (iWordVersion < 8 &&
290 usChar >= 0xa0 && usChar <= 0xff) {
291 usChar = usCp1250[usChar - 0x80];
293 } else {
294 /* Translate implementation defined characters */
295 if (usChar >= 0x80 && usChar <= 0x9f) {
296 usChar = usCp1252[usChar - 0x80];
301 /* Microsoft Unicode to real Unicode */
302 if (usChar >= 0xf020 && usChar <= 0xf0ff) {
303 DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
304 usChar = usPrivateArea[usChar - 0xf020];
307 /* Characters with a special meaning in Word */
308 switch (usChar) {
309 case IGNORE_CHARACTER:
310 case FOOTNOTE_SEPARATOR:
311 case FOOTNOTE_CONTINUATION:
312 case ANNOTATION:
313 case FRAME:
314 case LINE_FEED:
315 case WORD_SOFT_HYPHEN:
316 case UNICODE_HYPHENATION_POINT:
317 return IGNORE_CHARACTER;
/* These are handed back untouched for the caller to act on */
318 case PICTURE:
319 case TABLE_SEPARATOR:
320 case TAB:
321 case HARD_RETURN:
322 case PAGE_BREAK:
323 case PAR_END:
324 case COLUMN_FEED:
325 return (ULONG)usChar;
/* The same marker serves both note kinds; the file offset tells which */
326 case FOOTNOTE_OR_ENDNOTE:
327 NO_DBG_HEX(ulFileOffset);
328 switch (eGetNotetype(ulFileOffset)) {
329 case notetype_is_footnote:
330 return FOOTNOTE_CHAR;
331 case notetype_is_endnote:
332 return ENDNOTE_CHAR;
333 default:
334 return UNKNOWN_NOTE_CHAR;
336 case WORD_UNBREAKABLE_JOIN:
337 return (ULONG)OUR_UNBREAKABLE_JOIN;
338 default:
339 break;
342 if (eEncoding != encoding_utf8) {
343 /* Latin characters in an oriental text */
/* 0xff01..0xff5e minus 0xfee0 gives 0x21..0x7e, the printable ASCII range */
344 if (usChar >= 0xff01 && usChar <= 0xff5e) {
345 usChar -= 0xfee0;
/*
 * PostScript output in ISO 8859-1: these characters get fixed codes in the
 * range 140..159.
 * NOTE(review): presumably those codes are defined by the encoding vector
 * in the PostScript prologue - confirm there before changing a number.
 */
349 if (eConversionType == conversion_ps &&
350 eEncoding == encoding_iso_8859_1) {
351 switch (usChar) {
352 case UNICODE_ELLIPSIS:
353 return 140;
354 case UNICODE_TRADEMARK_SIGN:
355 return 141;
356 case UNICODE_PER_MILLE_SIGN:
357 return 142;
358 case UNICODE_BULLET:
359 case UNICODE_BLACK_CLUB_SUIT:
360 return (ULONG)(UCHAR)OUR_BULLET_PS;
361 case UNICODE_LEFT_SINGLE_QMARK:
362 return 144;
363 case UNICODE_RIGHT_SINGLE_QMARK:
364 return 145;
365 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
366 return 146;
367 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
368 return 147;
369 case UNICODE_LEFT_DOUBLE_QMARK:
370 return 148;
371 case UNICODE_RIGHT_DOUBLE_QMARK:
372 return 149;
373 case UNICODE_DOUBLE_LOW_9_QMARK:
374 return 150;
375 case UNICODE_EN_DASH:
376 return 151;
377 case UNICODE_EM_DASH:
378 return 152;
379 case UNICODE_MINUS_SIGN:
380 return 153;
381 case UNICODE_CAPITAL_LIGATURE_OE:
382 return 154;
383 case UNICODE_SMALL_LIGATURE_OE:
384 return 155;
385 case UNICODE_DAGGER:
386 return 156;
387 case UNICODE_DOUBLE_DAGGER:
388 return 157;
389 case UNICODE_SMALL_LIGATURE_FI:
390 return 158;
391 case UNICODE_SMALL_LIGATURE_FL:
392 return 159;
393 default:
394 break;
398 if (usChar < 0x80) {
399 /* US ASCII */
400 if (usChar < 0x20 || usChar == 0x7f) {
401 /* Ignore control characters */
402 DBG_HEX(usChar);
403 DBG_FIXME();
404 return IGNORE_CHARACTER;
406 return (ULONG)usChar;
409 if (eEncoding == encoding_utf8) {
410 /* No need to convert Unicode characters */
411 return (ULONG)usChar;
414 /* Unicode to local representation */
415 pTmp = pGetCharTableRecord(usChar);
416 if (pTmp != NULL) {
417 DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
418 return (ULONG)pTmp->usLocal;
/* Reached only when the mapping table has no entry for the character */
421 /* Fancy characters to simple US ASCII */
422 switch (usChar) {
423 case UNICODE_SMALL_F_HOOK:
424 return (ULONG)'f';
425 case UNICODE_GREEK_CAPITAL_CHI:
426 return (ULONG)'X';
427 case UNICODE_GREEK_SMALL_UPSILON:
428 return (ULONG)'v';
429 case UNICODE_MODIFIER_CIRCUMFLEX:
430 case UNICODE_UPWARDS_ARROW:
431 return (ULONG)'^';
432 case UNICODE_SMALL_TILDE:
433 case UNICODE_TILDE_OPERATOR:
434 return (ULONG)'~';
435 case UNICODE_EN_QUAD:
436 case UNICODE_EM_QUAD:
437 case UNICODE_EN_SPACE:
438 case UNICODE_EM_SPACE:
439 case UNICODE_THREE_PER_EM_SPACE:
440 case UNICODE_FOUR_PER_EM_SPACE:
441 case UNICODE_SIX_PER_EM_SPACE:
442 case UNICODE_FIGURE_SPACE:
443 case UNICODE_PUNCTUATION_SPACE:
444 case UNICODE_THIN_SPACE:
445 case UNICODE_LIGHT_SHADE:
446 case UNICODE_MEDIUM_SHADE:
447 case UNICODE_DARK_SHADE:
448 return (ULONG)' ';
449 case UNICODE_LEFT_DOUBLE_QMARK:
450 case UNICODE_RIGHT_DOUBLE_QMARK:
451 case UNICODE_DOUBLE_LOW_9_QMARK:
452 case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
453 case UNICODE_DOUBLE_PRIME:
454 return (ULONG)'"';
455 case UNICODE_LEFT_SINGLE_QMARK:
456 case UNICODE_RIGHT_SINGLE_QMARK:
457 case UNICODE_SINGLE_LOW_9_QMARK:
458 case UNICODE_SINGLE_HIGH_REV_9_QMARK:
459 case UNICODE_PRIME:
460 return (ULONG)'\'';
461 case UNICODE_HYPHEN:
462 case UNICODE_NON_BREAKING_HYPHEN:
463 case UNICODE_FIGURE_DASH:
464 case UNICODE_EN_DASH:
465 case UNICODE_EM_DASH:
466 case UNICODE_HORIZONTAL_BAR:
467 case UNICODE_MINUS_SIGN:
468 case UNICODE_BD_LIGHT_HORIZONTAL:
469 case UNICODE_BD_DOUBLE_HORIZONTAL:
470 return (ULONG)'-';
471 case UNICODE_DOUBLE_VERTICAL_LINE:
472 case UNICODE_BD_LIGHT_VERTICAL:
473 case UNICODE_BD_DOUBLE_VERTICAL:
474 return (ULONG)'|';
475 case UNICODE_DOUBLE_LOW_LINE:
476 return (ULONG)'_';
477 case UNICODE_DAGGER:
478 return (ULONG)'+';
479 case UNICODE_DOUBLE_DAGGER:
480 return (ULONG)'#';
481 case UNICODE_BULLET:
482 case UNICODE_BLACK_CLUB_SUIT:
483 return (ULONG)OUR_BULLET_TEXT;
484 case UNICODE_ONE_DOT_LEADER:
485 return (ULONG)'.';
486 case UNICODE_ELLIPSIS:
487 return (ULONG)OUR_ELLIPSIS;
488 case UNICODE_TRIANGULAR_BULLET:
489 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
490 case UNICODE_LEFTWARDS_ARROW:
491 return (ULONG)'<';
492 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
493 case UNICODE_RIGHTWARDS_ARROW:
494 return (ULONG)'>';
495 case UNICODE_UNDERTIE:
496 return (ULONG)'-';
497 case UNICODE_N_ARY_SUMMATION:
498 return (ULONG)'S';
499 case UNICODE_EURO_SIGN:
500 return (ULONG)'E';
501 case UNICODE_CIRCLE:
502 case UNICODE_SQUARE:
503 return (ULONG)'O';
504 case UNICODE_DIAMOND:
505 return (ULONG)OUR_DIAMOND;
506 case UNICODE_KELVIN_SIGN:
507 return (ULONG)'K';
508 case UNICODE_DOWNWARDS_ARROW:
509 return (ULONG)'v';
510 case UNICODE_FRACTION_SLASH:
511 case UNICODE_DIVISION_SLASH:
512 return (ULONG)'/';
513 case UNICODE_ASTERISK_OPERATOR:
514 return (ULONG)'*';
515 case UNICODE_RATIO:
516 return (ULONG)':';
/* Every box drawing corner and junction collapses to a plus sign */
517 case UNICODE_BD_LIGHT_DOWN_RIGHT:
518 case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
519 case UNICODE_BD_LIGHT_UP_AND_RIGHT:
520 case UNICODE_BD_LIGHT_UP_AND_LEFT:
521 case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
522 case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
523 case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
524 case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
525 case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
526 case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
527 case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
528 case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
529 case UNICODE_BD_DOUBLE_UP_AND_LEFT:
530 case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
531 case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
532 case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
533 case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
534 case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
535 case UNICODE_BLACK_SQUARE:
536 return (ULONG)'+';
/* Invisible spacing and direction controls are dropped from the output */
537 case UNICODE_HAIR_SPACE:
538 case UNICODE_ZERO_WIDTH_SPACE:
539 case UNICODE_ZERO_WIDTH_NON_JOINER:
540 case UNICODE_ZERO_WIDTH_JOINER:
541 case UNICODE_LEFT_TO_RIGHT_MARK:
542 case UNICODE_RIGHT_TO_LEFT_MARK:
543 case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
544 case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
545 return IGNORE_CHARACTER;
546 default:
547 break;
550 if (usChar == UNICODE_TRADEMARK_SIGN) {
552 * No local representation, it doesn't look like anything in
553 * US-ASCII and a question mark does more harm than good.
555 return IGNORE_CHARACTER;
558 if (usChar >= 0xa0 && usChar <= 0xff) {
559 /* Before Word 97, Word didn't use Unicode */
560 return (ULONG)usChar;
/* Nothing matched: trace the character and fall back to a question mark */
563 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
564 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
565 DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
567 /* Untranslated Unicode character */
568 return 0x3f;
569 } /* end of ulTranslateCharacters */
572 * ulToUpper - convert letter to upper case
574 * This function converts a letter to upper case. Unlike toupper(3) this
575 * function is independent from the settings of locale. This comes in handy
576 * for people who have to read Word documents in more than one language or
577 * contain more than one language.
579 * returns the converted letter, or ulChar if the conversion was not possible.
581 ULONG
582 ulToUpper(ULONG ulChar)
584 if (ulChar < 0x80) {
585 /* US ASCII: use standard function */
586 return (ULONG)toupper((int)ulChar);
588 if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
590 * Lower case accented characters
591 * 0xf7 is Division sign; 0xd7 is Multiplication sign
592 * 0xff is y with diaeresis; 0xdf is Sharp s
594 return ulChar & ~0x20;
596 #if defined(__STDC_ISO_10646__)
598 * If this is ISO C99 and all locales have wchar_t = ISO 10646
599 * (e.g., glibc 2.2 or newer), then use standard function
601 if (ulChar > 0xff) {
602 return (ULONG)towupper((wint_t)ulChar);
604 #endif /* __STDC_ISO_10646__ */
605 return ulChar;
606 } /* end of ulToUpper */