Imported from antiword-0.37.tar.gz.
[antiword.git] / chartrans.c
blob5edaae9b4f1ae6c2c9fc7ba8e7ba53150704229e
/*
 * chartrans.c
 * Copyright (C) 1999-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Translate Word characters to local representation
 */
9 #include <stdlib.h>
10 #include <string.h>
11 #include <ctype.h>
12 #if defined(__STDC_ISO_10646__)
13 #include <wctype.h>
14 #endif /* __STDC_ISO_10646__ */
15 #include "antiword.h"
17 static const USHORT usCp850[] = { /* DOS implementation of Latin1 */
18 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
19 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
20 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
21 0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192,
22 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
23 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
24 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0,
25 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510,
26 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
27 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
28 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce,
29 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
30 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
31 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
32 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
33 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
36 static const USHORT usCp1250[] = { /* Windows implementation of Latin2 */
37 0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021,
38 0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
39 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
40 0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
41 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
42 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
43 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
44 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
45 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
46 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
47 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
48 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
49 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
50 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
51 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
52 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
55 static const USHORT usCp1251[] = { /* Windows implementation of Cyrillic */
56 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
57 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
58 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
59 0x00f3, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
60 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
61 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
62 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
63 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
64 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
65 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
66 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
67 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
68 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
69 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
70 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
71 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
74 static const USHORT usCp1252[] = { /* Windows implementation of Latin1 */
75 0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
76 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f,
77 0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
78 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178,
79 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
80 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
81 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
82 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
83 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
84 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
85 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
86 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
87 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
88 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
89 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
90 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
93 static const USHORT usMacRoman[] = { /* Apple implementation of Latin1 */
94 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
95 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
96 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
97 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
98 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
99 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
100 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
101 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8,
102 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
103 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
104 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
105 0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02,
106 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
107 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
108 0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
109 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
112 static const USHORT usPrivateArea[] = {
113 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d,
114 0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f,
115 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
116 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f,
117 0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
118 0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
119 0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
120 0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f,
121 0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
122 0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
123 0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
124 0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f,
125 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
126 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
127 0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f,
128 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
129 0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663,
130 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
131 0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
132 0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5,
133 0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
134 0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
135 0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
136 0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,
137 0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c,
138 0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa,
139 0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f,
140 0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f,
143 typedef struct char_table_tag {
144 UCHAR ucLocal;
145 USHORT usUnicode;
146 } char_table_type;
148 static char_table_type atCharTable[256];
149 static size_t tNextPosFree = 0;
153 * iCompare - compare two records
155 * Compares two records. For use by qsort(3C) and bsearch(3C).
157 * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2
159 static int
160 iCompare(const void *pvRecord1, const void *pvRecord2)
162 USHORT usUnicode1, usUnicode2;
164 usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
165 usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
167 if (usUnicode1 < usUnicode2) {
168 return -1;
170 if (usUnicode1 > usUnicode2) {
171 return 1;
173 return 0;
174 } /* end of iCompare */
177 * pGetCharTableRecord - get the character table record
179 * returns a pointer to the record when found, otherwise NULL
181 static const char_table_type *
182 pGetCharTableRecord(USHORT usUnicode)
184 char_table_type tKey;
186 if (tNextPosFree == 0) {
187 return NULL;
189 tKey.usUnicode = usUnicode;
190 tKey.ucLocal = 0;
191 return (char_table_type *)bsearch(&tKey,
192 atCharTable,
193 tNextPosFree, sizeof(atCharTable[0]),
194 iCompare);
195 } /* end of pGetCharTableRecord */
198 * ucGetBulletCharacter - get the local representation of the bullet
200 UCHAR
201 ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding)
203 #if defined(__riscos)
204 return 0x8f;
205 #else
206 const char_table_type *pRec;
208 fail(eEncoding == encoding_utf_8);
210 if (eEncoding == encoding_latin_1 &&
211 (eConversionType == conversion_ps ||
212 eConversionType == conversion_pdf)) {
213 /* Ugly, but it makes the PostScript and PDF look better */
214 return (UCHAR)143;
216 if (eConversionType != conversion_text &&
217 eConversionType != conversion_fmt_text) {
218 pRec = pGetCharTableRecord(UNICODE_BULLET);
219 if (pRec != NULL) {
220 return pRec->ucLocal;
222 pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR);
223 if (pRec != NULL) {
224 return pRec->ucLocal;
226 pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT);
227 if (pRec != NULL) {
228 return pRec->ucLocal;
231 return (UCHAR)'.';
232 #endif /* __riscos */
233 } /* end of ucGetBulletCharacter */
236 * ucGetNbspCharacter - get the local representation of the non-breaking space
238 UCHAR
239 ucGetNbspCharacter(void)
241 const char_table_type *pRec;
243 pRec = pGetCharTableRecord(0x00a0); /* Unicode non-breaking space */
244 if (pRec == NULL) {
245 DBG_MSG("Non-breaking space record not found");
246 /* No value found, use the best guess */
247 return (UCHAR)0xa0;
249 return pRec->ucLocal;
250 } /* end of ucGetNbspCharacter */
253 * bReadCharacterMappingTable - read the mapping table
255 * Read the character mapping table from file and have the contents sorted
257 * returns TRUE if successful, otherwise FALSE
259 BOOL
260 bReadCharacterMappingTable(FILE *pFile)
262 char *pcTmp;
263 ULONG ulUnicode;
264 UINT uiLocal;
265 int iFields;
266 char szLine[81];
268 if (pFile == NULL) {
269 return FALSE;
272 /* Clean the table first */
273 (void)memset(atCharTable, 0, sizeof(atCharTable));
275 /* Fill the table */
276 while (fgets(szLine, (int)sizeof(szLine), pFile)) {
277 if (szLine[0] == '#' ||
278 szLine[0] == '\r' ||
279 szLine[0] == '\n') {
280 /* Comment or empty line */
281 continue;
283 iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
284 if (iFields != 2) {
285 pcTmp = strchr(szLine, '\r');
286 if (pcTmp != NULL) {
287 *pcTmp = '\0';
289 pcTmp = strchr(szLine, '\n');
290 if (pcTmp != NULL) {
291 *pcTmp = '\0';
293 werr(0, "Syntax error in: '%s'", szLine);
294 continue;
296 if (uiLocal > 0xff || ulUnicode > 0xffff) {
297 werr(0, "Syntax error in: '%02x %04lx'",
298 uiLocal, ulUnicode);
299 continue;
301 /* Store only the relevant entries */
302 if (uiLocal != ulUnicode || uiLocal >= 0x80) {
303 atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal;
304 atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode;
305 tNextPosFree++;
307 if (tNextPosFree >= elementsof(atCharTable)) {
308 werr(0, "Too many entries in the character mapping "
309 "file. Ignoring the rest.");
310 break;
314 if (tNextPosFree != 0) {
315 DBG_HEX(atCharTable[0].usUnicode);
316 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
318 qsort(atCharTable,
319 tNextPosFree, sizeof(atCharTable[0]),
320 iCompare);
322 DBG_HEX(atCharTable[0].usUnicode);
323 DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
326 return TRUE;
327 } /* end of bReadCharacterMappingTable */
330 * ulTranslateCharacters - Translate characters to local representation
332 * Translate all characters to local representation
334 * returns the translated character
336 ULONG
337 ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
338 conversion_type eConversionType, encoding_type eEncoding,
339 BOOL bUseMacCharSet)
341 const char_table_type *pTmp;
342 const USHORT *usCharSet;
344 usCharSet = NULL;
345 if (bUseMacCharSet) {
346 /* Macintosh character set */
347 usCharSet = usMacRoman;
348 } else if (iWordVersion == 0) {
349 /* DOS character set */
350 usCharSet = usCp850;
351 } else {
352 /* Windows character set */
353 switch (eEncoding) {
354 case encoding_latin_2:
355 usCharSet = usCp1250;
356 break;
357 case encoding_cyrillic:
358 usCharSet = usCp1251;
359 break;
360 case encoding_latin_1:
361 default:
362 usCharSet = usCp1252;
363 break;
366 fail(usCharSet == NULL);
367 if (usChar >= 0x80 && usChar <= 0x9f) {
368 /* Translate implementation defined characters */
369 usChar = usCharSet[usChar - 0x80];
370 } else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
371 /* Translate old character set to Unixcode */
372 usChar = usCharSet[usChar - 0x80];
375 /* Microsoft Unicode to real Unicode */
376 if (usChar >= 0xf020 && usChar <= 0xf0ff) {
377 DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
378 usChar = usPrivateArea[usChar - 0xf020];
381 /* Characters with a special meaning in Word */
382 switch (usChar) {
383 case IGNORE_CHARACTER:
384 case FOOTNOTE_SEPARATOR:
385 case FOOTNOTE_CONTINUATION:
386 case ANNOTATION:
387 case FRAME:
388 case LINE_FEED:
389 case WORD_SOFT_HYPHEN:
390 case UNICODE_HYPHENATION_POINT:
391 return IGNORE_CHARACTER;
392 case PICTURE:
393 case TABLE_SEPARATOR:
394 case TAB:
395 case HARD_RETURN:
396 case PAGE_BREAK:
397 case PAR_END:
398 case COLUMN_FEED:
399 return (ULONG)usChar;
400 case FOOTNOTE_OR_ENDNOTE:
401 NO_DBG_HEX(ulFileOffset);
402 switch (eGetNotetype(ulFileOffset)) {
403 case notetype_is_footnote:
404 return FOOTNOTE_CHAR;
405 case notetype_is_endnote:
406 return ENDNOTE_CHAR;
407 default:
408 return UNKNOWN_NOTE_CHAR;
410 case WORD_UNBREAKABLE_JOIN:
411 return (ULONG)OUR_UNBREAKABLE_JOIN;
412 default:
413 break;
416 if (eEncoding != encoding_utf_8) {
417 /* Latin characters in an oriental text */
418 if (usChar >= 0xff01 && usChar <= 0xff5e) {
419 usChar -= 0xfee0;
423 if (eEncoding == encoding_latin_1 &&
424 (eConversionType == conversion_ps ||
425 eConversionType == conversion_pdf)) {
426 /* Ugly, but it makes the PostScript and PDF look better */
427 switch (usChar) {
428 case UNICODE_ELLIPSIS:
429 return 140;
430 case UNICODE_TRADEMARK_SIGN:
431 return 141;
432 case UNICODE_PER_MILLE_SIGN:
433 return 142;
434 case UNICODE_BULLET:
435 case UNICODE_BULLET_OPERATOR:
436 case UNICODE_BLACK_CLUB_SUIT:
437 return 143;
438 case UNICODE_LEFT_SINGLE_QMARK:
439 return 144;
440 case UNICODE_RIGHT_SINGLE_QMARK:
441 return 145;
442 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
443 return 146;
444 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
445 return 147;
446 case UNICODE_LEFT_DOUBLE_QMARK:
447 return 148;
448 case UNICODE_RIGHT_DOUBLE_QMARK:
449 return 149;
450 case UNICODE_DOUBLE_LOW_9_QMARK:
451 return 150;
452 case UNICODE_EN_DASH:
453 return 151;
454 case UNICODE_EM_DASH:
455 return 152;
456 case UNICODE_MINUS_SIGN:
457 return 153;
458 case UNICODE_CAPITAL_LIGATURE_OE:
459 return 154;
460 case UNICODE_SMALL_LIGATURE_OE:
461 return 155;
462 case UNICODE_DAGGER:
463 return 156;
464 case UNICODE_DOUBLE_DAGGER:
465 return 157;
466 case UNICODE_SMALL_LIGATURE_FI:
467 return 158;
468 case UNICODE_SMALL_LIGATURE_FL:
469 return 159;
470 default:
471 break;
475 if (eConversionType == conversion_pdf) {
476 if (eEncoding == encoding_latin_1) {
477 switch (usChar) {
478 case UNICODE_EURO_SIGN:
479 return 128;
480 default:
481 break;
483 } else if (eEncoding == encoding_latin_2) {
484 switch (usChar) {
485 case UNICODE_CAPITAL_D_WITH_STROKE:
486 case UNICODE_SMALL_D_WITH_STROKE:
487 return 0x3f;
488 default:
489 break;
494 if (usChar < 0x80) {
495 /* US ASCII */
496 if (usChar < 0x20 || usChar == 0x7f) {
497 /* Ignore control characters */
498 DBG_HEX(usChar);
499 DBG_FIXME();
500 return IGNORE_CHARACTER;
502 return (ULONG)usChar;
505 if (eEncoding == encoding_utf_8) {
506 /* No need to convert Unicode characters */
507 return (ULONG)usChar;
510 /* Unicode to local representation */
511 pTmp = pGetCharTableRecord(usChar);
512 if (pTmp != NULL) {
513 DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
514 return (ULONG)pTmp->ucLocal;
517 /* Fancy characters to simple US ASCII */
518 switch (usChar) {
519 case UNICODE_SMALL_F_HOOK:
520 return (ULONG)'f';
521 case UNICODE_GREEK_CAPITAL_CHI:
522 return (ULONG)'X';
523 case UNICODE_GREEK_SMALL_UPSILON:
524 return (ULONG)'v';
525 case UNICODE_MODIFIER_CIRCUMFLEX:
526 case UNICODE_UPWARDS_ARROW:
527 return (ULONG)'^';
528 case UNICODE_SMALL_TILDE:
529 case UNICODE_TILDE_OPERATOR:
530 return (ULONG)'~';
531 case UNICODE_EN_QUAD:
532 case UNICODE_EM_QUAD:
533 case UNICODE_EN_SPACE:
534 case UNICODE_EM_SPACE:
535 case UNICODE_THREE_PER_EM_SPACE:
536 case UNICODE_FOUR_PER_EM_SPACE:
537 case UNICODE_SIX_PER_EM_SPACE:
538 case UNICODE_FIGURE_SPACE:
539 case UNICODE_PUNCTUATION_SPACE:
540 case UNICODE_THIN_SPACE:
541 case UNICODE_NARROW_NO_BREAK_SPACE:
542 case UNICODE_LIGHT_SHADE:
543 case UNICODE_MEDIUM_SHADE:
544 case UNICODE_DARK_SHADE:
545 return (ULONG)' ';
546 case UNICODE_LEFT_DOUBLE_QMARK:
547 case UNICODE_RIGHT_DOUBLE_QMARK:
548 case UNICODE_DOUBLE_LOW_9_QMARK:
549 case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
550 case UNICODE_DOUBLE_PRIME:
551 return (ULONG)'"';
552 case UNICODE_LEFT_SINGLE_QMARK:
553 case UNICODE_RIGHT_SINGLE_QMARK:
554 case UNICODE_SINGLE_LOW_9_QMARK:
555 case UNICODE_SINGLE_HIGH_REV_9_QMARK:
556 case UNICODE_PRIME:
557 return (ULONG)'\'';
558 case UNICODE_HYPHEN:
559 case UNICODE_NON_BREAKING_HYPHEN:
560 case UNICODE_FIGURE_DASH:
561 case UNICODE_EN_DASH:
562 case UNICODE_EM_DASH:
563 case UNICODE_HORIZONTAL_BAR:
564 case UNICODE_MINUS_SIGN:
565 case UNICODE_BD_LIGHT_HORIZONTAL:
566 case UNICODE_BD_DOUBLE_HORIZONTAL:
567 return (ULONG)'-';
568 case UNICODE_DOUBLE_VERTICAL_LINE:
569 case UNICODE_BD_LIGHT_VERTICAL:
570 case UNICODE_BD_DOUBLE_VERTICAL:
571 return (ULONG)'|';
572 case UNICODE_DOUBLE_LOW_LINE:
573 return (ULONG)'_';
574 case UNICODE_DAGGER:
575 return (ULONG)'+';
576 case UNICODE_DOUBLE_DAGGER:
577 return (ULONG)'#';
578 case UNICODE_BULLET:
579 case UNICODE_BULLET_OPERATOR:
580 case UNICODE_BLACK_CLUB_SUIT:
581 return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
582 case UNICODE_ONE_DOT_LEADER:
583 case UNICODE_TWO_DOT_LEADER:
584 return (ULONG)'.';
585 case UNICODE_ELLIPSIS:
586 #if defined(__riscos)
587 return (ULONG)OUR_ELLIPSIS;
588 #else
589 if (ulFileOffset == 0) {
590 return (ULONG)OUR_ELLIPSIS;
592 return UNICODE_ELLIPSIS;
593 #endif /* __riscos */
594 case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
595 case UNICODE_TRIANGULAR_BULLET:
596 case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
597 case UNICODE_LEFTWARDS_ARROW:
598 return (ULONG)'<';
599 case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
600 case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
601 case UNICODE_RIGHTWARDS_ARROW:
602 return (ULONG)'>';
603 case UNICODE_UNDERTIE:
604 return (ULONG)'-';
605 case UNICODE_N_ARY_SUMMATION:
606 return (ULONG)'S';
607 case UNICODE_EURO_SIGN:
608 return (ULONG)'E';
609 case UNICODE_CIRCLE:
610 case UNICODE_SQUARE:
611 return (ULONG)'O';
612 case UNICODE_DIAMOND:
613 return (ULONG)OUR_DIAMOND;
614 case UNICODE_NUMERO_SIGN:
615 return (ULONG)'N';
616 case UNICODE_KELVIN_SIGN:
617 return (ULONG)'K';
618 case UNICODE_DOWNWARDS_ARROW:
619 return (ULONG)'v';
620 case UNICODE_FRACTION_SLASH:
621 case UNICODE_DIVISION_SLASH:
622 return (ULONG)'/';
623 case UNICODE_ASTERISK_OPERATOR:
624 return (ULONG)'*';
625 case UNICODE_RATIO:
626 return (ULONG)':';
627 case UNICODE_BD_LIGHT_DOWN_RIGHT:
628 case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
629 case UNICODE_BD_LIGHT_UP_AND_RIGHT:
630 case UNICODE_BD_LIGHT_UP_AND_LEFT:
631 case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
632 case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
633 case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
634 case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
635 case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
636 case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
637 case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
638 case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
639 case UNICODE_BD_DOUBLE_UP_AND_LEFT:
640 case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
641 case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
642 case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
643 case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
644 case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
645 case UNICODE_BLACK_SQUARE:
646 return (ULONG)'+';
647 case UNICODE_HAIR_SPACE:
648 case UNICODE_ZERO_WIDTH_SPACE:
649 case UNICODE_ZERO_WIDTH_NON_JOINER:
650 case UNICODE_ZERO_WIDTH_JOINER:
651 case UNICODE_LEFT_TO_RIGHT_MARK:
652 case UNICODE_RIGHT_TO_LEFT_MARK:
653 case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
654 case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
655 case UNICODE_POP_DIRECTIONAL_FORMATTING:
656 case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
657 case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
658 case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
659 return IGNORE_CHARACTER;
660 default:
661 break;
664 if (usChar == UNICODE_TRADEMARK_SIGN) {
666 * No local representation, it doesn't look like anything in
667 * US-ASCII and a question mark does more harm than good.
669 return IGNORE_CHARACTER;
672 if (usChar >= 0xa0 && usChar <= 0xff) {
673 /* Before Word 97, Word did't use Unicode */
674 return (ULONG)usChar;
677 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
678 DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
679 DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
681 /* Untranslated Unicode character */
682 return 0x3f;
683 } /* end of ulTranslateCharacters */
686 * ulToUpper - convert letter to upper case
688 * This function converts a letter to upper case. Unlike toupper(3) this
689 * function is independent from the settings of locale. This comes in handy
690 * for people who have to read Word documents in more than one language or
691 * contain more than one language.
693 * returns the converted letter, or ulChar if the conversion was not possible.
695 ULONG
696 ulToUpper(ULONG ulChar)
698 if (ulChar < 0x80) {
699 /* US ASCII: use standard function */
700 return (ULONG)toupper((int)ulChar);
702 if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
704 * Lower case accented characters
705 * 0xf7 is Division sign; 0xd7 is Multiplication sign
706 * 0xff is y with diaeresis; 0xdf is Sharp s
708 return ulChar & ~0x20;
710 #if defined(__STDC_ISO_10646__)
712 * If this is ISO C99 and all locales have wchar_t = ISO 10646
713 * (e.g., glibc 2.2 or newer), then use standard function
715 if (ulChar > 0xff) {
716 return (ULONG)towupper((wint_t)ulChar);
718 #endif /* __STDC_ISO_10646__ */
719 return ulChar;
720 } /* end of ulToUpper */