Corrections to SVN properties.
[AROS.git] / workbench / libs / codesets / src / codesets.c
blob43e63fbccc3e592587eb0aa589e395ef6395dedf
1 /***************************************************************************
3 codesets.library - Amiga shared library for handling different codesets
4 Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
5 Copyright (C) 2005-2013 by codesets.library Open Source Team
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 codesets.library project: http://sourceforge.net/projects/codesetslib/
19 Most of the code included in this file was relicensed from GPL to LGPL
20 from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
21 with full permissions by its authors.
23 $Id$
25 ***************************************************************************/
27 #include "lib.h"
29 #include <clib/alib_protos.h>
31 #include <diskfont/glyph.h>
32 #include <diskfont/diskfonttag.h>
33 #include <proto/diskfont.h>
34 #include <ctype.h>
35 #include <limits.h>
37 #ifdef __MORPHOS__
38 #include <proto/keymap.h>
39 #include <proto/locale.h>
40 #endif
42 #include "codesets_table.h"
43 #include "convertUTF.h"
44 #include "codepages.h"
46 #include "SDI_stdarg.h"
48 #include "debug.h"
50 /**************************************************************************/
52 // a union used for various type casts while avoiding the annoying "dereferencing
53 // type punned pointer is breaking strict alias rules" warnings of GCC4+
54 union TypeAliases
56 void **voidptr;
57 char **schar;
58 unsigned char **uchar;
59 STRPTR *strptr;
60 UTF8 **utf8;
61 const UTF8 **cutf8;
62 UTF16 **utf16;
63 const UTF16 **cutf16;
64 UTF32 **utf32;
65 const UTF32 **cutf32;
/// BIN_SEARCH()
// Binary search over a sorted array in O(log n).
//
//   array   - the sorted array to search
//   low     - index of the first element to consider
//   high    - index of the LAST element to consider (inclusive, i.e. count-1)
//   compare - expression yielding <0, 0 or >0 for "key versus array[m]"; it
//             is expected to refer to the probe index 'm' declared by the macro
//   result  - lvalue receiving a pointer to the matching element, or NULL
//
// The macro declares the locals l, h, m and d in its own block, so the
// arguments must not depend on identically named variables of the caller.
//
// Example:
//   BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0])-1,strcmp(key,strings[m]),res);
#define BIN_SEARCH(array,low,high,compare,result) \
  {\
    int l = (low);\
    int h = (high);\
    int m = (l + h)/2;\
    (result) = NULL;\
    while(l <= h)\
    {\
      int d = (compare);\
      if(d == 0) { (result) = &(array)[m]; break; }\
      if(d < 0) h = m - 1;\
      else l = m + 1;\
      m = (l + h)/2;\
    }\
  }
87 ///
88 /// mystrdup()
89 static STRPTR mystrdup(const char *str)
91 STRPTR newStr = NULL;
93 ENTER();
95 if(str != NULL)
97 int len;
99 if((len = strlen(str)) > 0)
101 if((newStr = allocArbitrateVecPooled(len+1)) != NULL)
102 strlcpy(newStr, str, len+1);
106 RETURN(newStr);
107 return newStr;
111 /// mystrndup()
112 static STRPTR mystrndup(const char *str1, int n)
114 STRPTR dest;
116 ENTER();
118 if((dest = allocArbitrateVecPooled(n+1)) != NULL)
120 if(str1 != NULL)
121 strlcpy(dest, str1, n+1);
122 else
123 dest[0] = '\0';
125 dest[n] = '\0';
128 RETURN(dest);
129 return dest;
133 /// readLine()
134 static BOOL readLine(BPTR fh, char *buf, ULONG size)
136 BOOL success = FALSE;
137 char *c;
139 ENTER();
141 if((c = FGets(fh, buf, size)) != NULL)
143 // we succeeded in reading something
144 success = TRUE;
146 // now find the end of the line and strip the LF/CR character
147 for(; *c; c++)
149 if(*c == '\n' || *c == '\r')
151 *c = '\0';
152 break;
157 RETURN(success);
158 return success;
162 /// getConfigItem()
// Checks whether the line 'buf' is an assignment of the form "<item> = <value>".
// The item name is compared case-insensitively and the blanks around the '='
// are optional.
//
//   buf  - the NUL-terminated line to inspect
//   item - the NUL-terminated keyword expected at the very start of the line
//
// Returns a pointer into 'buf' at the first non-blank character of the value
// or NULL if the line does not start with 'item' followed by a '='.
static const char *getConfigItem(const char *buf, const char *item)
{
  const char *configItem = NULL;
  int len;

  ENTER();

  len = strlen(item);

  if(strnicmp(buf, item, len) == 0)
  {
    buf += len;

    // skip spaces
    // the cast keeps isspace() well defined for bytes >= 0x80 on platforms
    // where plain char is signed
    while(*buf != '\0' && isspace((unsigned char)*buf))
      buf++;

    if(*buf == '=')
    {
      buf++;

      // skip spaces
      while(*buf != '\0' && isspace((unsigned char)*buf))
        buf++;

      configItem = buf;
    }
  }

  RETURN(configItem);
  return configItem;
}
199 /// parseUtf8()
200 static int parseUtf8(STRPTR *ps)
202 STRPTR s = *ps;
203 int wc, n, i;
205 ENTER();
207 if(*s<0x80)
209 *ps = s+1;
211 RETURN(*s);
212 return *s;
215 if(*s<0xc2)
217 RETURN(-1);
218 return -1;
220 else
222 if(*s<0xe0)
224 if((s[1] & 0xc0)!=0x80)
226 RETURN(-1);
227 return -1;
230 *ps = s+2;
232 RETURN(((s[0] & 0x1f)<<6) | (s[1] & 0x3f));
233 return ((s[0] & 0x1f)<<6) | (s[1] & 0x3f);
235 else
237 if(*s<0xf0)
239 n = 3;
241 else
243 if(*s<0xf8)
245 n = 4;
247 else
249 if(*s<0xfc)
251 n = 5;
253 else
255 if(*s<0xfe)
257 n = 6;
259 else
261 RETURN(-1);
262 return -1;
270 wc = *s++ & ((1<<(7-n))-1);
272 for(i = 1; i<n; i++)
274 if((*s & 0xc0) != 0x80)
276 RETURN(-1);
277 return -1;
280 wc = (wc << 6) | (*s++ & 0x3f);
283 if(wc < (1 << (5 * n - 4)))
285 RETURN(-1);
286 return -1;
289 *ps = s;
291 RETURN(wc);
292 return wc;
296 /// countCodesets()
297 static int countCodesets(struct codesetList *csList)
299 struct Node *node;
300 int num = 0;
302 for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
303 num++;
305 return num;
309 /// mapUTF8toASCII()
310 // in case some UTF8 sequences can not be converted during CodesetsUTF8ToStrA(), this
311 // function is used to replace these unknown sequences with lookalike characters that
312 // still make the text more readable. For more replacement see
313 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
315 // The conversion table in this function is partly borrowed from the awebcharset plugin
316 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
// one entry of the replacement table used by mapUTF8toASCII()
struct UTF8Replacement
{
  const char *utf8;   // the UTF8 sequence to be replaced
  const int utf8len;  // number of bytes of that UTF8 sequence
  const char *rep;    // the replacement string
  const int replen;   // length of the replacement, negative if it is an UTF8 sequence itself
};

// Ordering function for bsearch(): entries are ordered by the length of
// their UTF8 sequence first and by the bytes of the sequence second.
static int compareUTF8Replacements(const void *p1, const void *p2)
{
  const struct UTF8Replacement *lhs = p1;
  const struct UTF8Replacement *rhs = p2;

  // different lengths decide the order right away
  if(lhs->utf8len != rhs->utf8len)
    return lhs->utf8len - rhs->utf8len;

  // same length, let the bytes decide
  return memcmp(lhs->utf8, rhs->utf8, lhs->utf8len);
}
340 static int mapUTF8toASCII(const char **dst, const unsigned char *src, const int utf8len)
342 int len = 0;
343 struct UTF8Replacement key = { (char *)src, utf8len, NULL, 0 };
344 struct UTF8Replacement *rep;
346 static struct UTF8Replacement const utf8map[] =
348 // U+0100 ... U+017F (Latin Extended-A)
349 { "\xC4\x80", 2, "A", 1 }, // U+0100 -> A (LATIN CAPITAL LETTER A WITH MACRON)
350 { "\xC4\x81", 2, "a", 1 }, // U+0101 -> a (LATIN SMALL LETTER A WITH MACRON)
351 { "\xC4\x82", 2, "A", 1 }, // U+0102 -> A (LATIN CAPITAL LETTER A WITH BREVE)
352 { "\xC4\x83", 2, "a", 1 }, // U+0103 -> a (LATIN SMALL LETTER A WITH BREVE)
353 { "\xC4\x84", 2, "A", 1 }, // U+0104 -> A (LATIN CAPITAL LETTER A WITH OGONEK)
354 { "\xC4\x85", 2, "a", 1 }, // U+0105 -> a (LATIN SMALL LETTER A WITH OGONEK)
355 { "\xC4\x86", 2, "C", 1 }, // U+0106 -> C (LATIN CAPITAL LETTER C WITH ACUTE)
356 { "\xC4\x87", 2, "c", 1 }, // U+0107 -> c (LATIN SMALL LETTER C WITH ACUTE)
357 { "\xC4\x88", 2, "C", 1 }, // U+0108 -> C (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
358 { "\xC4\x89", 2, "c", 1 }, // U+0109 -> c (LATIN SMALL LETTER C WITH CIRCUMFLEX)
359 { "\xC4\x8A", 2, "C", 1 }, // U+010A -> C (LATIN CAPITAL LETTER C WITH DOT ABOVE)
360 { "\xC4\x8B", 2, "c", 1 }, // U+010B -> c (LATIN SMALL LETTER C WITH DOT ABOVE)
361 { "\xC4\x8C", 2, "C", 1 }, // U+010C -> C (LATIN CAPITAL LETTER C WITH CARON)
362 { "\xC4\x8D", 2, "c", 1 }, // U+010D -> c (LATIN SMALL LETTER C WITH CARON)
363 { "\xC4\x8E", 2, "D", 1 }, // U+010E -> D (LATIN CAPITAL LETTER D WITH CARON)
364 { "\xC4\x8F", 2, "d", 1 }, // U+010F -> d (LATIN SMALL LETTER D WITH CARON)
365 { "\xC4\x90", 2, "D", 1 }, // U+0110 -> D (LATIN CAPITAL LETTER D WITH STROKE)
366 { "\xC4\x91", 2, "d", 1 }, // U+0111 -> d (LATIN SMALL LETTER D WITH STROKE)
367 { "\xC4\x92", 2, "E", 1 }, // U+0112 -> E (LATIN CAPITAL LETTER E WITH MACRON)
368 { "\xC4\x93", 2, "e", 1 }, // U+0113 -> e (LATIN SMALL LETTER E WITH MACRON)
369 { "\xC4\x94", 2, "E", 1 }, // U+0114 -> E (LATIN CAPITAL LETTER E WITH BREVE)
370 { "\xC4\x95", 2, "e", 1 }, // U+0115 -> e (LATIN SMALL LETTER E WITH BREVE)
371 { "\xC4\x96", 2, "E", 1 }, // U+0116 -> E (LATIN CAPITAL LETTER E WITH DOT ABOVE)
372 { "\xC4\x97", 2, "e", 1 }, // U+0117 -> e (LATIN SMALL LETTER E WITH DOT ABOVE)
373 { "\xC4\x98", 2, "E", 1 }, // U+0118 -> E (LATIN CAPITAL LETTER E WITH OGONEK)
374 { "\xC4\x99", 2, "e", 1 }, // U+0119 -> e (LATIN SMALL LETTER E WITH OGONEK)
375 { "\xC4\x9A", 2, "E", 1 }, // U+011A -> E (LATIN CAPITAL LETTER E WITH CARON)
376 { "\xC4\x9B", 2, "e", 1 }, // U+011B -> e (LATIN SMALL LETTER E WITH CARON)
377 { "\xC4\x9C", 2, "G", 1 }, // U+011C -> G (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
378 { "\xC4\x9D", 2, "g", 1 }, // U+011D -> g (LATIN SMALL LETTER G WITH CIRCUMFLEX)
379 { "\xC4\x9E", 2, "G", 1 }, // U+011E -> G (LATIN CAPITAL LETTER G WITH BREVE)
380 { "\xC4\x9F", 2, "g", 1 }, // U+011F -> g (LATIN SMALL LETTER G WITH BREVE)
381 { "\xC4\xA0", 2, "G", 1 }, // U+0120 -> G (LATIN CAPITAL LETTER G WITH DOT ABOVE)
382 { "\xC4\xA1", 2, "g", 1 }, // U+0121 -> g (LATIN SMALL LETTER G WITH DOT ABOVE)
383 { "\xC4\xA2", 2, "G", 1 }, // U+0122 -> G (LATIN CAPITAL LETTER G WITH CEDILLA)
384 { "\xC4\xA3", 2, "g", 1 }, // U+0123 -> g (LATIN SMALL LETTER G WITH CEDILLA)
385 { "\xC4\xA4", 2, "H", 1 }, // U+0124 -> H (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
386 { "\xC4\xA5", 2, "h", 1 }, // U+0125 -> h (LATIN SMALL LETTER H WITH CIRCUMFLEX)
387 { "\xC4\xA6", 2, "H", 1 }, // U+0126 -> H (LATIN CAPITAL LETTER H WITH STROKE)
388 { "\xC4\xA7", 2, "h", 1 }, // U+0127 -> h (LATIN SMALL LETTER H WITH STROKE)
389 { "\xC4\xA8", 2, "I", 1 }, // U+0128 -> I (LATIN CAPITAL LETTER I WITH TILDE)
390 { "\xC4\xA9", 2, "i", 1 }, // U+0129 -> i (LATIN SMALL LETTER I WITH TILDE)
391 { "\xC4\xAA", 2, "I", 1 }, // U+012A -> I (LATIN CAPITAL LETTER I WITH MACRON)
392 { "\xC4\xAB", 2, "i", 1 }, // U+012B -> i (LATIN SMALL LETTER I WITH MACRON)
393 { "\xC4\xAC", 2, "I", 1 }, // U+012C -> I (LATIN CAPITAL LETTER I WITH BREVE)
394 { "\xC4\xAD", 2, "i", 1 }, // U+012D -> i (LATIN SMALL LETTER I WITH BREVE)
395 { "\xC4\xAE", 2, "I", 1 }, // U+012E -> I (LATIN CAPITAL LETTER I WITH OGONEK)
396 { "\xC4\xAF", 2, "i", 1 }, // U+012F -> i (LATIN SMALL LETTER I WITH OGONEK)
397 { "\xC4\xB0", 2, "I", 1 }, // U+0130 -> I (LATIN CAPITAL LETTER I WITH DOT ABOVE)
398 { "\xC4\xB1", 2, "i", 1 }, // U+0131 -> i (LATIN SMALL LETTER DOTLESS I)
399 { "\xC4\xB2", 2, "Ij", 2 }, // U+0132 -> Ij (LATIN CAPITAL LIGATURE IJ)
400 { "\xC4\xB3", 2, "ij", 2 }, // U+0133 -> ij (LATIN SMALL LIGATURE IJ)
401 { "\xC4\xB4", 2, "J", 1 }, // U+0134 -> J (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
402 { "\xC4\xB5", 2, "j", 1 }, // U+0135 -> j (LATIN SMALL LETTER J WITH CIRCUMFLEX)
403 { "\xC4\xB6", 2, "K", 1 }, // U+0136 -> K (LATIN CAPITAL LETTER K WITH CEDILLA)
404 { "\xC4\xB7", 2, "k", 1 }, // U+0137 -> k (LATIN SMALL LETTER K WITH CEDILLA)
405 { "\xC4\xB8", 2, "k", 1 }, // U+0138 -> k (LATIN SMALL LETTER KRA)
406 { "\xC4\xB9", 2, "L", 1 }, // U+0139 -> L (LATIN CAPITAL LETTER L WITH ACUTE)
407 { "\xC4\xBA", 2, "l", 1 }, // U+013A -> l (LATIN SMALL LETTER L WITH ACUTE)
408 { "\xC4\xBB", 2, "L", 1 }, // U+013B -> L (LATIN CAPITAL LETTER L WITH CEDILLA)
409 { "\xC4\xBC", 2, "l", 1 }, // U+013C -> l (LATIN SMALL LETTER L WITH CEDILLA)
410 { "\xC4\xBD", 2, "L", 1 }, // U+013D -> L (LATIN CAPITAL LETTER L WITH CARON)
411 { "\xC4\xBE", 2, "l", 1 }, // U+013E -> l (LATIN SMALL LETTER L WITH CARON)
412 { "\xC4\xBF", 2, "L", 1 }, // U+013F -> L (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
413 { "\xC5\x80", 2, "l", 1 }, // U+0140 -> l (LATIN SMALL LETTER L WITH MIDDLE DOT)
414 { "\xC5\x81", 2, "L", 1 }, // U+0141 -> L (LATIN CAPITAL LETTER L WITH STROKE)
415 { "\xC5\x82", 2, "l", 1 }, // U+0142 -> l (LATIN SMALL LETTER L WITH STROKE)
416 { "\xC5\x83", 2, "N", 1 }, // U+0143 -> N (LATIN CAPITAL LETTER N WITH ACUTE)
417 { "\xC5\x84", 2, "n", 1 }, // U+0144 -> n (LATIN SMALL LETTER N WITH ACUTE)
418 { "\xC5\x85", 2, "N", 1 }, // U+0145 -> N (LATIN CAPITAL LETTER N WITH CEDILLA)
419 { "\xC5\x86", 2, "n", 1 }, // U+0146 -> n (LATIN SMALL LETTER N WITH CEDILLA)
420 { "\xC5\x87", 2, "N", 1 }, // U+0147 -> N (LATIN CAPITAL LETTER N WITH CARON)
421 { "\xC5\x88", 2, "n", 1 }, // U+0148 -> n (LATIN SMALL LETTER N WITH CARON)
422 { "\xC5\x89", 2, "'n", 2 }, // U+0149 -> 'n (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
423 { "\xC5\x8A", 2, "Ng", 2 }, // U+014A -> Ng (LATIN CAPITAL LETTER ENG)
424 { "\xC5\x8B", 2, "ng", 2 }, // U+014B -> ng (LATIN SMALL LETTER ENG)
425 { "\xC5\x8C", 2, "O", 1 }, // U+014C -> O (LATIN CAPITAL LETTER O WITH MACRON)
426 { "\xC5\x8D", 2, "o", 1 }, // U+014D -> o (LATIN SMALL LETTER O WITH MACRON)
427 { "\xC5\x8E", 2, "O", 1 }, // U+014E -> O (LATIN CAPITAL LETTER O WITH BREVE)
428 { "\xC5\x8F", 2, "o", 1 }, // U+014F -> o (LATIN SMALL LETTER O WITH BREVE)
429 { "\xC5\x90", 2, "O", 1 }, // U+0150 -> O (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
430 { "\xC5\x91", 2, "o", 1 }, // U+0151 -> o (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
431 { "\xC5\x92", 2, "Oe", 2 }, // U+0152 -> Oe (LATIN CAPITAL LIGATURE OE)
432 { "\xC5\x93", 2, "oe", 2 }, // U+0153 -> oe (LATIN SMALL LIGATURE OE)
433 { "\xC5\x94", 2, "R", 1 }, // U+0154 -> R (LATIN CAPITAL LETTER R WITH ACUTE)
434 { "\xC5\x95", 2, "r", 1 }, // U+0155 -> r (LATIN SMALL LETTER R WITH ACUTE)
435 { "\xC5\x96", 2, "R", 1 }, // U+0156 -> R (LATIN CAPITAL LETTER R WITH CEDILLA)
436 { "\xC5\x97", 2, "r", 1 }, // U+0157 -> r (LATIN SMALL LETTER R WITH CEDILLA)
437 { "\xC5\x98", 2, "R", 1 }, // U+0158 -> R (LATIN CAPITAL LETTER R WITH CARON)
438 { "\xC5\x99", 2, "r", 1 }, // U+0159 -> r (LATIN SMALL LETTER R WITH CARON)
439 { "\xC5\x9A", 2, "S", 1 }, // U+015A -> S (LATIN CAPITAL LETTER S WITH ACUTE)
440 { "\xC5\x9B", 2, "s", 1 }, // U+015B -> s (LATIN SMALL LETTER S WITH ACUTE)
441 { "\xC5\x9C", 2, "S", 1 }, // U+015C -> S (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
442 { "\xC5\x9D", 2, "s", 1 }, // U+015D -> s (LATIN SMALL LETTER S WITH CIRCUMFLEX)
443 { "\xC5\x9E", 2, "S", 1 }, // U+015E -> S (LATIN CAPITAL LETTER S WITH CEDILLA)
444 { "\xC5\x9F", 2, "s", 1 }, // U+015F -> s (LATIN SMALL LETTER S WITH CEDILLA)
445 { "\xC5\xA0", 2, "S", 1 }, // U+0160 -> S (LATIN CAPITAL LETTER S WITH CARON)
446 { "\xC5\xA1", 2, "s", 1 }, // U+0161 -> s (LATIN SMALL LETTER S WITH CARON)
447 { "\xC5\xA2", 2, "T", 1 }, // U+0162 -> T (LATIN CAPITAL LETTER T WITH CEDILLA)
448 { "\xC5\xA3", 2, "t", 1 }, // U+0163 -> t (LATIN SMALL LETTER T WITH CEDILLA)
449 { "\xC5\xA4", 2, "T", 1 }, // U+0164 -> T (LATIN CAPITAL LETTER T WITH CARON)
450 { "\xC5\xA5", 2, "t", 1 }, // U+0165 -> t (LATIN SMALL LETTER T WITH CARON)
451 { "\xC5\xA6", 2, "T", 1 }, // U+0166 -> T (LATIN CAPITAL LETTER T WITH STROKE)
452 { "\xC5\xA7", 2, "t", 1 }, // U+0167 -> t (LATIN SMALL LETTER T WITH STROKE)
453 { "\xC5\xA8", 2, "U", 1 }, // U+0168 -> U (LATIN CAPITAL LETTER U WITH TILDE)
454 { "\xC5\xA9", 2, "u", 1 }, // U+0169 -> u (LATIN SMALL LETTER U WITH TILDE)
455 { "\xC5\xAA", 2, "U", 1 }, // U+016A -> U (LATIN CAPITAL LETTER U WITH MACRON)
456 { "\xC5\xAB", 2, "u", 1 }, // U+016B -> u (LATIN SMALL LETTER U WITH MACRON)
457 { "\xC5\xAC", 2, "U", 1 }, // U+016C -> U (LATIN CAPITAL LETTER U WITH BREVE)
458 { "\xC5\xAD", 2, "u", 1 }, // U+016D -> u (LATIN SMALL LETTER U WITH BREVE)
459 { "\xC5\xAE", 2, "U", 1 }, // U+016E -> U (LATIN CAPITAL LETTER U WITH RING ABOVE)
460 { "\xC5\xAF", 2, "u", 1 }, // U+016F -> u (LATIN SMALL LETTER U WITH RING ABOVE)
461 { "\xC5\xB0", 2, "U", 1 }, // U+0170 -> U (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
462 { "\xC5\xB1", 2, "u", 1 }, // U+0171 -> u (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
463 { "\xC5\xB2", 2, "U", 1 }, // U+0172 -> U (LATIN CAPITAL LETTER U WITH OGONEK)
464 { "\xC5\xB3", 2, "u", 1 }, // U+0173 -> u (LATIN SMALL LETTER U WITH OGONEK)
465 { "\xC5\xB4", 2, "W", 1 }, // U+0174 -> W (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
466 { "\xC5\xB5", 2, "w", 1 }, // U+0175 -> w (LATIN SMALL LETTER W WITH CIRCUMFLEX)
467 { "\xC5\xB6", 2, "Y", 1 }, // U+0176 -> Y (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
468 { "\xC5\xB7", 2, "y", 1 }, // U+0177 -> y (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
469 { "\xC5\xB8", 2, "Y", 1 }, // U+0178 -> Y (LATIN CAPITAL LETTER Y WITH DIAERESIS)
470 { "\xC5\xB9", 2, "Z", 1 }, // U+0179 -> Z (LATIN CAPITAL LETTER Z WITH ACUTE)
471 { "\xC5\xBA", 2, "z", 1 }, // U+017A -> z (LATIN SMALL LETTER Z WITH ACUTE)
472 { "\xC5\xBB", 2, "Z", 1 }, // U+017B -> Z (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
473 { "\xC5\xBC", 2, "z", 1 }, // U+017C -> z (LATIN SMALL LETTER Z WITH DOT ABOVE)
474 { "\xC5\xBD", 2, "Z", 1 }, // U+017D -> Z (LATIN CAPITAL LETTER Z WITH CARON)
475 { "\xC5\xBE", 2, "z", 1 }, // U+017E -> z (LATIN SMALL LETTER Z WITH CARON)
476 { "\xC5\xBF", 2, "s", 1 }, // U+017F -> s (LATIN SMALL LETTER LONG S
478 // U+2000 ... U+206F (General Punctuation)
479 { "\xE2\x80\x90", 3, "-", 1 }, // U+2010 -> - (HYPHEN)
480 { "\xE2\x80\x91", 3, "-", 1 }, // U+2011 -> - (NON-BREAKING HYPHEN)
481 { "\xE2\x80\x92", 3, "--", 2 }, // U+2012 -> -- (FIGURE DASH)
482 { "\xE2\x80\x93", 3, "--", 2 }, // U+2013 -> -- (EN DASH)
483 { "\xE2\x80\x94", 3, "---", 3 }, // U+2014 -> --- (EM DASH)
484 { "\xE2\x80\x95", 3, "---", 3 }, // U+2015 -> --- (HORIZONTAL BAR)
485 { "\xE2\x80\x96", 3, "||", 2 }, // U+2016 -> || (DOUBLE VERTICAL LINE)
486 { "\xE2\x80\x97", 3, "_", 1 }, // U+2017 -> _ (DOUBLE LOW LINE)
487 { "\xE2\x80\x98", 3, "`", 1 }, // U+2018 -> ` (LEFT SINGLE QUOTATION MARK)
488 { "\xE2\x80\x99", 3, "'", 1 }, // U+2019 -> ' (RIGHT SINGLE QUOTATION MARK)
489 { "\xE2\x80\x9A", 3, ",", 1 }, // U+201A -> , (SINGLE LOW-9 QUOTATION MARK)
490 { "\xE2\x80\x9B", 3, "'", 1 }, // U+201B -> ' (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
491 { "\xE2\x80\x9C", 3, "\"", 1 }, // U+201C -> " (LEFT DOUBLE QUOTATION MARK)
492 { "\xE2\x80\x9D", 3, "\"", 1 }, // U+201D -> " (RIGHT DOUBLE QUOTATION MARK)
493 { "\xE2\x80\x9E", 3, ",,", 2 }, // U+201E -> ,, (DOUBLE LOW-9 QUOTATION MARK)
494 { "\xE2\x80\x9F", 3, "``", 2 }, // U+201F -> `` (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
495 { "\xE2\x80\xA0", 3, "+", 1 }, // U+2020 -> + (DAGGER)
496 { "\xE2\x80\xA1", 3, "+", 1 }, // U+2021 -> + (DOUBLE DAGGER)
497 { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7 (BULLET) -> (MIDDLE POINT)
498 { "\xE2\x80\xA3", 3, ".", 1 }, // U+2023 -> . (TRIANGULAR BULLET)
499 { "\xE2\x80\xA4", 3, ".", 1 }, // U+2024 -> . (ONE DOT LEADER)
500 { "\xE2\x80\xA5", 3, "..", 2 }, // U+2025 -> .. (TWO DOT LEADER)
501 { "\xE2\x80\xA6", 3, "...", 3 }, // U+2026 -> ... (HORIZONTAL ELLIPSIS)
502 { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7 (HYPHENATION POINT) -> (MIDDLE POINT)
503 { "\xE2\x80\xB0", 3, "%.", 2 }, // U+2030 -> %. (PER MILLE SIGN)
504 { "\xE2\x80\xB1", 3, "%..", 3 }, // U+2031 -> %.. (PER TEN THOUSAND SIGN)
505 { "\xE2\x80\xB2", 3, "'", 1 }, // U+2032 -> ` (PRIME)
506 { "\xE2\x80\xB3", 3, "''", 2 }, // U+2033 -> '' (DOUBLE PRIME)
507 { "\xE2\x80\xB4", 3, "'''", 3 }, // U+2034 -> ''' (TRIPLE PRIME)
508 { "\xE2\x80\xB5", 3, "`", 1 }, // U+2035 -> ` (REVERSED PRIME)
509 { "\xE2\x80\xB6", 3, "``", 2 }, // U+2036 -> `` (REVERSED DOUBLE PRIME)
510 { "\xE2\x80\xB7", 3, "```", 3 }, // U+2037 -> ``` (REVERSED TRIPLE PRIME)
511 { "\xE2\x80\xB8", 3, "^", 1 }, // U+2038 -> ^ (CARET)
512 { "\xE2\x80\xB9", 3, "<", 1 }, // U+2039 -> < (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
513 { "\xE2\x80\xBA", 3, ">", 1 }, // U+203A -> > (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
514 { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7 (REFERENCE MARK) -> (MULTIPLICATION SIGN)
515 { "\xE2\x80\xBC", 3, "!!", 2 }, // U+203C -> !! (DOUBLE EXCLAMATION MARK)
516 { "\xE2\x80\xBD", 3, "?", 1 }, // U+203D -> ? (INTERROBANG)
517 { "\xE2\x81\x82", 3, "*", 1 }, // U+2042 -> * (ASTERISM)
518 { "\xE2\x81\x83", 3, ".", 1 }, // U+2043 -> . (HYPHEN BULLET)
519 { "\xE2\x81\x84", 3, "/", 1 }, // U+2044 -> / (FRACTION SLASH)
520 { "\xE2\x81\x87", 3, "??", 2 }, // U+2047 -> ?? (DOUBLE QUESTION MARK)
521 { "\xE2\x81\x88", 3, "?!", 2 }, // U+2048 -> ?! (QUESTION EXCLAMATION MARK)
522 { "\xE2\x81\x89", 3, "!?", 2 }, // U+2049 -> !? (EXCLAMATION QUESTION MARK)
523 { "\xE2\x81\x8E", 3, "*", 1 }, // U+204E -> * (LOW ASTERISK)
524 { "\xE2\x81\x8F", 3, ";", 1 }, // U+204F -> ; (REVERSED SEMICOLON)
525 { "\xE2\x81\x91", 3, "*", 1 }, // U+2051 -> * (TWO ASTERISKS ALIGNED VERTICALLY)
526 { "\xE2\x81\x92", 3, "-", 1 }, // U+2052 -> - (COMMERCIAL MINUS SIGN)
527 { "\xE2\x81\x93", 3, "~", 1 }, // U+2053 -> ~ (SWUNG DASH)
528 { "\xE2\x81\x95", 3, "*", 1 }, // U+2055 -> * (FLOWER PUNCTUATION MARK)
529 { "\xE2\x81\x97", 3, "''''", 4 }, // U+2057 -> '''' (QUADRUPLE PRIME)
530 { "\xE2\x81\x9A", 3, ":", 1 }, // U+205A -> : (TWO DOT PUNCTUATION)
531 { "\xE2\x81\x9C", 3, "+", 1 }, // U+205C -> + (DOTTED CROSS)
533 // U+20A0 ... U+20CF (Currency Symbols)
534 { "\xE2\x82\xA0", 3, "ECU", 3 }, // U+20A0 -> ECU (EURO-CURRENCY SIGN)
535 { "\xE2\x82\xA1", 3, "CRC", 3 }, // U+20A1 -> CRC (COLON SIGN)
536 { "\xE2\x82\xA2", 3, "BRC", 3 }, // U+20A2 -> BRC (CRUZEIRO SIGN)
537 { "\xE2\x82\xA3", 3, "BEF", 3 }, // U+20A3 -> BEF (FRENCH FRANC SIGN)
538 { "\xE2\x82\xA4", 3, "ITL", 3 }, // U+20A4 -> ITL (LIRA SIGN)
539 { "\xE2\x82\xA6", 3, "NGN", 3 }, // U+20A6 -> NGN (NEIRA SIGN)
540 { "\xE2\x82\xA7", 3, "ESP", 3 }, // U+20A7 -> ESP (PESETA SIGN)
541 { "\xE2\x82\xA8", 3, "MVQ", 3 }, // U+20A8 -> MVQ (RUPEE SIGN)
542 { "\xE2\x82\xA9", 3, "KPW", 3 }, // U+20A9 -> KPW (WON SIGN)
543 { "\xE2\x82\xAA", 3, "ILS", 3 }, // U+20AA -> ILS (NEW SHEQEL SIGN)
544 { "\xE2\x82\xAB", 3, "VNC", 3 }, // U+20AB -> VNC (DONG SIGN)
545 { "\xE2\x82\xAC", 3, "EUR", 3 }, // U+20AC -> EUR (EURO SIGN)
546 { "\xE2\x82\xAD", 3, "LAK", 3 }, // U+20AD -> LAK (KIP SIGN)
547 { "\xE2\x82\xAE", 3, "MNT", 3 }, // U+20AE -> MNT (TUGRIK SIGN)
548 { "\xE2\x82\xAF", 3, "GRD", 3 }, // U+20AF -> GRD (DRACHMA SIGN)
549 { "\xE2\x82\xB0", 3, "Pf", 2 }, // U+20B0 -> Pf (GERMAN PENNY SIGN)
550 { "\xE2\x82\xB1", 3, "P", 1 }, // U+20B1 -> P (PESO SIGN)
551 { "\xE2\x82\xB2", 3, "PYG", 3 }, // U+20B2 -> PYG (GUARANI SIGN)
552 { "\xE2\x82\xB3", 3, "ARA", 3 }, // U+20B3 -> ARA (AUSTRAL SIGN)
553 { "\xE2\x82\xB4", 3, "UAH", 3 }, // U+20B4 -> UAH (HRYVNIA SIGN)
554 { "\xE2\x82\xB5", 3, "GHS", 3 }, // U+20B5 -> GHS (CEDI SIGN)
556 // U+2190 ... U+21FF (Arrows)
557 { "\xE2\x86\x90", 3, "<-", 2 }, // U+2190 -> <- (LEFTWARDS ARROW)
558 { "\xE2\x86\x92", 3, "->", 2 }, // U+2192 -> -> (RIGHTWARDS ARROW)
561 ENTER();
563 // start with no replacement string
564 *dst = NULL;
566 // perform a binary search in the lookup table
567 if((rep = bsearch(&key, utf8map, sizeof(utf8map) / sizeof(utf8map[0]), sizeof(utf8map[0]), compareUTF8Replacements)) != NULL)
569 // if we found something, then copy this over to the result variables
570 *dst = rep->rep;
571 len = rep->replen;
574 RETURN(len);
575 return len;
579 /// matchCodesetAlias()
// maps an official MIME codeset name to the aliases it is also known by
struct CodesetAliases
{
  const char *MIMEname; // The official and correct MIME name for a codeset
  const char *Aliases;  // A space separated array with well-known aliases
};

// The table is terminated by an entry with a NULL MIMEname. It is scanned
// from top to bottom and the first matching entry wins, hence every MIME
// name must be listed only once.
const struct CodesetAliases codesetAliases[] =
{
  // MIME name       Aliases
  { "Amiga-1251",   "Ami1251 Amiga1251" },
  { "AmigaPL",      "AmiPL Amiga-PL" },
  { "ISO-8859-1",   "ISO8859-1 8859-1" },
  { "ISO-8859-2",   "ISO8859-2 8859-2" },
  { "ISO-8859-3",   "ISO8859-3 8859-3" },
  { "ISO-8859-4",   "ISO8859-4 8859-4" },
  { "ISO-8859-5",   "ISO8859-5 8859-5" },
  { "ISO-8859-6",   "ISO8859-6 8859-6" },
  { "ISO-8859-7",   "ISO8859-7 8859-7" },
  { "ISO-8859-8",   "ISO8859-8 8859-8" },
  { "ISO-8859-9",   "ISO8859-9 8859-9" },
  { "ISO-8859-10",  "ISO8859-10 8859-10" },
  { "ISO-8859-11",  "ISO8859-11 8859-11" },
  { "ISO-8859-12",  "ISO8859-12 8859-12" },
  { "ISO-8859-13",  "ISO8859-13 8859-13" },
  { "ISO-8859-14",  "ISO8859-14 8859-14" },
  { "ISO-8859-15",  "ISO8859-15 8859-15" },
  { "ISO-8859-16",  "ISO8859-16 8859-16" },
  { "KOI8-R",       "KOI8R" },
  { "US-ASCII",     "ASCII" },
  { "UTF-8",        "UTF8 UTF" },
  { "UTF-16",       "UTF16" },
  { "UTF-32",       "UTF32" },
  { "windows-1250", "cp1250 windows1250" },
  { "windows-1251", "cp1251 windows1251" },
  { "windows-1252", "cp1252 windows1252" },
  { "windows-1253", "cp1253 windows1253" },
  { "windows-1254", "cp1254 windows1254" },
  { "windows-1255", "cp1255 windows1255" },
  { "windows-1256", "cp1256 windows1256" },
  { "windows-1257", "cp1257 windows1257" },
  { NULL,           NULL, }
};
625 static const char *matchCodesetAlias(const char *search)
627 const char *result = NULL;
628 size_t len = strlen(search);
629 int i;
631 ENTER();
633 for(i=0; codesetAliases[i].MIMEname != NULL; i++)
635 BOOL found = FALSE;
637 // search the MIMEname first
638 if(stricmp(search, codesetAliases[i].MIMEname) == 0)
639 found = TRUE;
640 else
642 const char *s = codesetAliases[i].Aliases;
644 // loop through space separated list of aliases
645 while(s != NULL && *s != '\0')
647 if(strnicmp(search, s, len) == 0)
649 found = TRUE;
650 break;
653 if((s = strpbrk(s, " ")) != NULL)
654 s++;
658 if(found == TRUE)
660 result = codesetAliases[i].MIMEname;
662 break;
666 RETURN(result);
667 return result;
672 /**************************************************************************/
674 /// defaultCodeset()
675 static struct codeset *defaultCodeset(BOOL useSemaphore)
677 char buf[256];
678 struct codeset *codeset;
680 ENTER();
682 if(useSemaphore == TRUE)
683 ObtainSemaphoreShared(&CodesetsBase->libSem);
685 buf[0] = '\0';
686 GetVar("codeset_default" ,buf, sizeof(buf), GVF_GLOBAL_ONLY);
688 if(buf[0] == '\0' || (codeset = codesetsFind(&CodesetsBase->codesets, buf)) == NULL)
689 codeset = CodesetsBase->systemCodeset;
691 if(useSemaphore == TRUE)
692 ReleaseSemaphore(&CodesetsBase->libSem);
694 RETURN(codeset);
695 return codeset;
699 /// codesetsCmpUnicode()
700 // The compare function
701 static int codesetsCmpUnicode(const void *a1, const void *a2)
703 struct single_convert *arg1 = (struct single_convert *)a1;
704 struct single_convert *arg2 = (struct single_convert *)a2;
706 return strcmp((char*)&arg1->utf8[1], (char*)&arg2->utf8[1]);
710 /// codesetsReadTable()
712 #define ITEM_STANDARD "Standard"
713 #define ITEM_ALTSTANDARD "AltStandard"
714 #define ITEM_READONLY "ReadOnly"
715 #define ITEM_CHARACTERIZATION "Characterization"
717 // Reads a coding table and adds it
718 static BOOL codesetsReadTable(struct codesetList *csList, STRPTR name)
720 BPTR fh;
721 BOOL res = FALSE;
723 ENTER();
725 D(DBF_STARTUP, "trying to read charset file '%s'...", name);
727 if((fh = Open(name, MODE_OLDFILE)) != (BPTR)NULL)
729 struct codeset *codeset;
731 if((codeset = (struct codeset *)allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
733 int i;
734 char buf[512];
736 memset(codeset, 0, sizeof(*codeset));
738 for(i = 0; i<256; i++)
740 codeset->table[i].code = i;
741 codeset->table[i].ucs4 = i;
744 while(readLine(fh, buf, sizeof(buf)) == TRUE)
746 const char *result;
748 if(buf[0] != '#')
750 if((result = getConfigItem(buf, ITEM_STANDARD)) != NULL)
751 codeset->name = mystrdup(result);
752 else if(codeset->name == NULL) // a valid file starts with "Standard" and nothing else!!
753 break;
754 else if((result = getConfigItem(buf, ITEM_ALTSTANDARD)) != NULL)
755 codeset->alt_name = mystrdup(result);
756 else if((result = getConfigItem(buf, ITEM_READONLY)) != NULL)
757 codeset->read_only = (atoi(result) == 0) ? 0 : 1;
758 else if((result = getConfigItem(buf, ITEM_CHARACTERIZATION)) != NULL)
760 if(result[0] == '_' && result[1] == '(' && result[2] == '"')
762 char *end = strchr(result + 3, '"');
764 if(end != NULL)
765 codeset->characterization = mystrndup(result+3, end-(result+3));
767 else
768 codeset->characterization = mystrdup(result);
770 else
772 char *p = buf;
773 int fmt2 = 0;
775 if(*p == '=' || (fmt2 = ((*p=='0') || (*(p+1)=='x'))))
777 p++;
778 p += fmt2;
780 i = strtol(p, &p, 16);
781 if(i>0 && i<256)
783 while(isspace(*p))
784 p++;
786 if(strnicmp(p, "U+", 2) == 0)
788 p += 2;
789 codeset->table[i].ucs4 = strtol(p, &p, 16);
791 else if(*p != '#')
793 codeset->table[i].ucs4 = strtol(p, &p, 0);
801 // check if there is not already codeset with the same name in here
802 if(codeset->name != NULL && codesetsFind(csList, codeset->name) == NULL)
804 for(i=0; i<256; i++)
806 UTF32 src = codeset->table[i].ucs4;
807 UTF32 *src_ptr = &src;
808 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
810 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
811 *dest_ptr = 0;
812 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)(&codeset->table[i].utf8[1]);
815 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
816 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
817 D(DBF_STARTUP, "adding external codeset '%s'", codeset->name);
818 AddTail((struct List *)csList, (struct Node *)&codeset->node);
820 res = TRUE;
822 else
824 // cleanup
825 if(codeset->name != NULL)
826 freeArbitrateVecPooled(codeset->name);
827 if(codeset->alt_name != NULL)
828 freeArbitrateVecPooled(codeset->alt_name);
829 if(codeset->characterization != NULL)
830 freeArbitrateVecPooled(codeset->characterization);
831 freeArbitrateVecPooled(codeset);
835 Close(fh);
838 RETURN(res);
839 return res;
842 /// codesetsScanDir()
843 static void codesetsScanDir(struct codesetList *csList, const char *dirPath)
845 ENTER();
847 if(dirPath != NULL && dirPath[0] != '\0')
849 #if defined(__amigaos4__)
850 APTR dirContext;
852 if((dirContext = ObtainDirContextTags(EX_StringNameInput, dirPath,
853 EX_DataFields, EXF_NAME|EXF_TYPE,
854 TAG_END)) != NULL)
856 struct ExamineData *exd;
858 D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
860 while((exd = ExamineDir(dirContext)) != NULL)
862 if(EXD_IS_FILE(exd))
864 char filePath[620];
866 strlcpy(filePath, dirPath, sizeof(filePath));
867 AddPart(filePath, exd->Name, sizeof(filePath));
869 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
871 codesetsReadTable(csList, filePath);
875 ReleaseDirContext(dirContext);
877 #else
878 BPTR dirLock;
880 if((dirLock = Lock(dirPath, ACCESS_READ)))
882 struct ExAllControl *eac;
884 D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
886 if((eac = AllocDosObject(DOS_EXALLCONTROL, NULL)) != NULL)
888 struct ExAllData *ead;
889 struct ExAllData *eabuffer;
890 LONG more;
892 eac->eac_LastKey = 0;
893 eac->eac_MatchString = NULL;
894 eac->eac_MatchFunc = NULL;
896 if((eabuffer = allocVecPooled(CodesetsBase->pool, 10*sizeof(struct ExAllData))) != NULL)
898 char filePath[620];
902 more = ExAll(dirLock, eabuffer, 10*sizeof(struct ExAllData), ED_TYPE, eac);
903 if(!more && IoErr() != ERROR_NO_MORE_ENTRIES)
904 break;
906 if(eac->eac_Entries == 0)
907 continue;
909 ead = (struct ExAllData *)eabuffer;
912 // we only take that ead if it is a file (ed_Type < 0)
913 if(ead->ed_Type < 0)
915 strlcpy(filePath, dirPath, sizeof(filePath));
916 AddPart(filePath, (char *)ead->ed_Name, sizeof(filePath));
918 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
920 codesetsReadTable(csList, filePath);
922 ead = ead->ed_Next;
924 while(ead != NULL);
926 while(more);
928 freeVecPooled(CodesetsBase->pool, eabuffer);
931 FreeDosObject(DOS_EXALLCONTROL, eac);
934 UnLock(dirLock);
936 #endif
939 LEAVE();
943 /// codesetsInit()
944 // Initialized and loads the codesets
945 BOOL codesetsInit(struct codesetList *csList)
947 BOOL success = FALSE;
948 struct codeset *codeset;
949 UTF32 src;
950 int i;
951 #if defined(__amigaos4__)
952 ULONG nextMIB = 3;
953 #endif
955 ENTER();
957 NewList((struct List *)csList);
959 // to make the list of the supported codesets complete we also add fake
960 // 'UTF-8', 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
961 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
962 goto end;
964 memset(codeset, 0, sizeof(*codeset));
965 codeset->name = mystrdup("UTF-8");
966 codeset->alt_name = mystrdup("UTF8");
967 codeset->characterization = mystrdup("Unicode");
968 codeset->read_only = 0;
969 D(DBF_STARTUP, "adding internal codeset 'UTF-8'");
970 AddTail((struct List *)csList, (struct Node *)&codeset->node);
971 CodesetsBase->utf8Codeset = codeset;
973 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
974 goto end;
976 memset(codeset, 0, sizeof(*codeset));
977 codeset->name = mystrdup("UTF-16");
978 codeset->alt_name = mystrdup("UTF16");
979 codeset->characterization = mystrdup("16-bit Unicode");
980 codeset->read_only = 0;
981 D(DBF_STARTUP, "adding internal codeset 'UTF-16'");
982 AddTail((struct List *)csList, (struct Node *)&codeset->node);
983 CodesetsBase->utf16Codeset = codeset;
985 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
986 goto end;
988 memset(codeset, 0, sizeof(*codeset));
989 codeset->name = mystrdup("UTF-32");
990 codeset->alt_name = mystrdup("UTF32");
991 codeset->characterization = mystrdup("32-bit Unicode");
992 codeset->read_only = 0;
993 D(DBF_STARTUP, "adding internal codeset 'UTF-32'");
994 AddTail((struct List *)csList, (struct Node *)&codeset->node);
995 CodesetsBase->utf32Codeset = codeset;
997 // on AmigaOS4 we can use diskfont.library to inquire charset information as
998 // it comes with a quite rich implementation of different charsets.
999 #if defined(__amigaos4__)
1000 D(DBF_STARTUP, "OS4, asking diskfont.library for codesets");
1003 char *mimename;
1004 char *ianaName;
1005 ULONG *mapTable;
1006 ULONG curMIB = nextMIB;
1008 nextMIB = ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NEXTNUMBER);
1009 if(nextMIB == 0)
1010 break;
1012 mapTable = (ULONG *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MAPTABLE);
1013 mimename = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MIMENAME);
1014 ianaName = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NAME);
1015 if(mapTable != NULL && mimename != NULL && codesetsFind(csList, mimename) == NULL)
1017 D(DBF_STARTUP, "loading charset '%s' from diskfont.library...", mimename);
1019 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1020 goto end;
1022 codeset->name = mystrdup(mimename);
1023 codeset->alt_name = NULL;
1024 codeset->characterization = mystrdup(ianaName);
1025 codeset->read_only = 0;
1027 for(i=0; i<256; i++)
1029 UTF32 *src_ptr = &src;
1030 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1032 src = mapTable[i];
1034 codeset->table[i].code = i;
1035 codeset->table[i].ucs4 = src;
1036 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1037 *dest_ptr = 0;
1038 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1041 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1042 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1044 D(DBF_STARTUP, "adding diskfont.library codeset '%s'", codeset->name);
1045 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1048 while(TRUE);
1049 #endif
1051 #if defined(__MORPHOS__)
1053 struct Library *KeymapBase;
1054 struct Library *LocaleBase;
1055 // assume success at first
1056 BOOL success = TRUE;
1058 D(DBF_STARTUP, "MorphOS, asking keymap.library for codesets");
1059 if((KeymapBase = OpenLibrary("keymap.library", 51)) != NULL)
1061 if((LocaleBase = OpenLibrary("locale.library", 51)) != NULL)
1063 struct KeyMap *keymap = AskKeyMapDefault();
1064 // it doesn't matter if this call fails, as we don't depend on the system codesets
1065 CONST_STRPTR name = GetKeyMapCodepage(keymap);
1067 // legacy keymaps dont have codepage or Unicode mappings
1068 if(name != NULL && keymap != NULL)
1070 D(DBF_STARTUP, "loading charset '%s' from keymap.library...", name);
1072 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
1074 codeset->name = mystrdup(name);
1075 codeset->alt_name = NULL;
1076 codeset->characterization = mystrdup(name); // No further information available
1077 codeset->read_only = 0;
1079 for(i=0; i<256; i++)
1081 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1082 LONG rc;
1084 codeset->table[i].code = i;
1085 codeset->table[i].ucs4 = src = ToUCS4(i, keymap);
1087 // here we use UTF8_Encode() instead of ConvertUCS4ToUTF8() because
1088 // of an internal bug in MorphOS 2.2.
1089 rc = UTF8_Encode(src, dest_ptr);
1090 rc = rc > 0 ? rc : 1;
1092 dest_ptr[rc] = '\0';
1093 codeset->table[i].utf8[0] = rc;
1096 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1097 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1099 D(DBF_STARTUP, "adding keymap.library codeset '%s'", codeset->name);
1100 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1102 else
1104 // only failed memory allocations are treated as error
1105 success = FALSE;
1109 CloseLibrary(LocaleBase);
1112 CloseLibrary(KeymapBase);
1115 if(success == FALSE)
1116 goto end;
1118 #endif
1120 D(DBF_STARTUP, "loading charsets from LIBS:Charsets...");
1122 // we try to walk to the LIBS:Charsets directory on our own and readin our
1123 // own charset tables
1124 codesetsScanDir(csList, "LIBS:Charsets");
1127 // now we go and initialize our internally supported codesets but only if
1128 // we have not already loaded a charset with the same name
1130 D(DBF_STARTUP, "initializing internal charsets...");
1132 // ISO-8859-1 + EURO
1133 if(codesetsFind(csList, "ISO-8859-1 + Euro") == NULL)
1135 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1136 goto end;
1138 codeset->name = mystrdup("ISO-8859-1 + Euro");
1139 codeset->alt_name = NULL;
1140 codeset->characterization = mystrdup("West European (with EURO)");
1141 codeset->read_only = 1;
1143 for(i = 0; i<256; i++)
1145 UTF32 *src_ptr = &src;
1146 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1148 if(i==164)
1149 src = 0x20AC; // the EURO sign
1150 else
1151 src = i;
1153 codeset->table[i].code = i;
1154 codeset->table[i].ucs4 = src;
1155 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1156 *dest_ptr = 0;
1157 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1159 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1160 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1162 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1163 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1166 // ISO-8859-1
1167 if(codesetsFind(csList, "ISO-8859-1") == NULL)
1169 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1170 goto end;
1172 codeset->name = mystrdup("ISO-8859-1");
1173 codeset->alt_name = mystrdup("ISO8859-1");
1174 codeset->characterization = mystrdup("West European");
1175 codeset->read_only = 0;
1177 for(i = 0; i<256; i++)
1179 UTF32 *src_ptr = &src;
1180 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1182 src = i;
1184 codeset->table[i].code = i;
1185 codeset->table[i].ucs4 = src;
1186 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1187 *dest_ptr = 0;
1188 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1190 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1191 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1193 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1194 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1197 // ISO-8859-2
1198 if(codesetsFind(csList, "ISO-8859-2") == NULL)
1200 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1201 goto end;
1203 codeset->name = mystrdup("ISO-8859-2");
1204 codeset->alt_name = mystrdup("ISO8859-2");
1205 codeset->characterization = mystrdup("Central/East European");
1206 codeset->read_only = 0;
1208 for(i = 0; i<256; i++)
1210 UTF32 *src_ptr = &src;
1211 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1213 if(i<0xa0)
1214 src = i;
1215 else
1216 src = iso_8859_2_to_ucs4[i-0xa0];
1218 codeset->table[i].code = i;
1219 codeset->table[i].ucs4 = src;
1220 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr,dest_ptr+6, CSF_StrictConversion);
1221 *dest_ptr = 0;
1222 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1224 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1225 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1227 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1228 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1231 // ISO-8859-3
1232 if(codesetsFind(csList, "ISO-8859-3") == NULL)
1234 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1235 goto end;
1237 codeset->name = mystrdup("ISO-8859-3");
1238 codeset->alt_name = mystrdup("ISO8859-3");
1239 codeset->characterization = mystrdup("South European");
1240 codeset->read_only = 0;
1242 for(i = 0; i<256; i++)
1244 UTF32 *src_ptr = &src;
1245 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1247 if(i<0xa0)
1248 src = i;
1249 else
1250 src = iso_8859_3_to_ucs4[i-0xa0];
1252 codeset->table[i].code = i;
1253 codeset->table[i].ucs4 = src;
1254 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1255 *dest_ptr = 0;
1256 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1258 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1259 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1261 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1262 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1265 // ISO-8859-4
1266 if(codesetsFind(csList, "ISO-8859-4") == NULL)
1268 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1269 goto end;
1271 codeset->name = mystrdup("ISO-8859-4");
1272 codeset->alt_name = mystrdup("ISO8859-4");
1273 codeset->characterization = mystrdup("North European");
1274 codeset->read_only = 0;
1276 for(i = 0; i<256; i++)
1278 UTF32 *src_ptr = &src;
1279 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1281 if(i<0xa0)
1282 src = i;
1283 else
1284 src = iso_8859_4_to_ucs4[i-0xa0];
1286 codeset->table[i].code = i;
1287 codeset->table[i].ucs4 = src;
1288 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1289 *dest_ptr = 0;
1290 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1292 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1293 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1295 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1296 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1299 // ISO-8859-5
1300 if(codesetsFind(csList, "ISO-8859-5") == NULL)
1302 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1303 goto end;
1305 codeset->name = mystrdup("ISO-8859-5");
1306 codeset->alt_name = mystrdup("ISO8859-5");
1307 codeset->characterization = mystrdup("Slavic languages");
1308 codeset->read_only = 0;
1310 for(i = 0; i<256; i++)
1312 UTF32 *src_ptr = &src;
1313 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1315 if(i<0xa0)
1316 src = i;
1317 else
1318 src = iso_8859_5_to_ucs4[i-0xa0];
1320 codeset->table[i].code = i;
1321 codeset->table[i].ucs4 = src;
1322 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1323 *dest_ptr = 0;
1324 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1326 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1327 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1329 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1330 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1333 // ISO-8859-9
1334 if(codesetsFind(csList, "ISO-8859-9") == NULL)
1336 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1337 goto end;
1339 codeset->name = mystrdup("ISO-8859-9");
1340 codeset->alt_name = mystrdup("ISO8859-9");
1341 codeset->characterization = mystrdup("Turkish");
1342 codeset->read_only = 0;
1344 for(i = 0; i<256; i++)
1346 UTF32 *src_ptr = &src;
1347 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1349 if(i<0xa0)
1350 src = i;
1351 else
1352 src = iso_8859_9_to_ucs4[i-0xa0];
1354 codeset->table[i].code = i;
1355 codeset->table[i].ucs4 = src;
1356 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1357 *dest_ptr = 0;
1358 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1360 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1361 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1363 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1364 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1367 // ISO-8859-15
1368 if(codesetsFind(csList, "ISO-8859-15") == NULL)
1370 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1371 goto end;
1373 codeset->name = mystrdup("ISO-8859-15");
1374 codeset->alt_name = mystrdup("ISO8859-15");
1375 codeset->characterization = mystrdup("West European II");
1376 codeset->read_only = 0;
1378 for(i = 0; i<256; i++)
1380 UTF32 *src_ptr = &src;
1381 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1383 if(i<0xa0)
1384 src = i;
1385 else
1386 src = iso_8859_15_to_ucs4[i-0xa0];
1388 codeset->table[i].code = i;
1389 codeset->table[i].ucs4 = src;
1390 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1391 *dest_ptr = 0;
1392 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1394 memcpy(codeset->table_sorted,codeset->table,sizeof (codeset->table));
1395 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1397 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1398 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1401 // ISO-8859-16
1402 if(codesetsFind(csList, "ISO-8859-16") == NULL)
1404 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1405 goto end;
1407 codeset->name = mystrdup("ISO-8859-16");
1408 codeset->alt_name = mystrdup("ISO8869-16");
1409 codeset->characterization = mystrdup("South-Eastern European");
1410 codeset->read_only = 0;
1412 for(i=0;i<256;i++)
1414 UTF32 *src_ptr = &src;
1415 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1417 if(i < 0xa0)
1418 src = i;
1419 else
1420 src = iso_8859_16_to_ucs4[i-0xa0];
1422 codeset->table[i].code = i;
1423 codeset->table[i].ucs4 = src;
1424 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1425 *dest_ptr = 0;
1426 codeset->table[i].utf8[0] = (IPTR)dest_ptr - (IPTR)&codeset->table[i].utf8[1];
1428 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1429 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1431 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1432 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1435 // KOI8-R
1436 if(codesetsFind(csList, "KOI8-R") == NULL)
1438 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1439 goto end;
1441 codeset->name = mystrdup("KOI8-R");
1442 codeset->alt_name = mystrdup("KOI8R");
1443 codeset->characterization = mystrdup("Russian");
1444 codeset->read_only = 0;
1446 for(i = 0; i<256; i++)
1448 UTF32 *src_ptr = &src;
1449 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1451 if(i<0x80)
1452 src = i;
1453 else
1454 src = koi8r_to_ucs4[i-0x80];
1456 codeset->table[i].code = i;
1457 codeset->table[i].ucs4 = src;
1458 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1459 *dest_ptr = 0;
1460 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1462 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1463 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1465 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1466 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1469 // AmigaPL
1470 if(codesetsFind(csList, "AmigaPL") == NULL)
1472 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1473 goto end;
1475 codeset->name = mystrdup("AmigaPL");
1476 codeset->alt_name = mystrdup("AmiPL");
1477 codeset->characterization = mystrdup("Polish (Amiga)");
1478 codeset->read_only = 1;
1480 for(i=0; i<256; i++)
1482 UTF32 *src_ptr = &src;
1483 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1485 if(i<0xa0)
1486 src = i;
1487 else
1488 src = amigapl_to_ucs4[i-0xa0];
1490 codeset->table[i].code = i;
1491 codeset->table[i].ucs4 = src;
1492 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1493 *dest_ptr = 0;
1494 codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1496 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1497 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1499 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1500 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1503 // Amiga-1251
1504 if(codesetsFind(csList, "Amiga-1251") == NULL)
1506 if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1507 goto end;
1509 codeset->name = mystrdup("Amiga-1251");
1510 codeset->alt_name = mystrdup("Ami1251");
1511 codeset->characterization = mystrdup("Cyrillic (Amiga)");
1512 codeset->read_only = 1;
1514 for(i=0; i<256; i++)
1516 UTF32 *src_ptr = &src;
1517 UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1519 if(i < 0xa0)
1520 src = i;
1521 else
1522 src = amiga1251_to_ucs4[i-0xa0];
1524 codeset->table[i].code = i;
1525 codeset->table[i].ucs4 = src;
1526 CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1527 *dest_ptr = 0;
1528 codeset->table[i].utf8[0] = (char*)dest_ptr - (char*)&codeset->table[i].utf8[1];
1530 memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1531 qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1533 D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1534 AddTail((struct List *)csList, (struct Node *)&codeset->node);
1537 success = TRUE;
1539 end:
1540 RETURN(success);
1541 return success;
1545 /// codesetsCleanup()
1546 // Cleanup the memory for the codeset
1547 void codesetsCleanup(struct codesetList *csList)
1549 struct codeset *code;
1551 ENTER();
1553 while((code = (struct codeset *)RemHead((struct List *)csList)) != NULL)
1555 if(code->name != NULL)
1556 freeArbitrateVecPooled(code->name);
1557 if(code->alt_name != NULL)
1558 freeArbitrateVecPooled(code->alt_name);
1559 if(code->characterization != NULL)
1560 freeArbitrateVecPooled(code->characterization);
1562 freeArbitrateVecPooled(code);
1565 LEAVE();
1569 /// codesetsFind()
1570 // Returns the given codeset.
1571 struct codeset *codesetsFind(struct codesetList *csList, const char *name)
1573 struct codeset *res = NULL;
1575 ENTER();
1577 if(name != NULL && name[0] != '\0')
1579 struct Node *node;
1580 const char *matchedName;
1582 if((matchedName = matchCodesetAlias(name)) != NULL)
1583 name = matchedName;
1585 for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1587 struct codeset *mstate = (struct codeset *)node;
1589 if(stricmp(name, mstate->name) == 0 ||
1590 (mstate->alt_name != NULL && stricmp(name, mstate->alt_name) == 0))
1592 // break out
1593 res = mstate;
1594 break;
1599 RETURN(res);
1600 return res;
1604 /// checkTextAgainstSingleCodeset
1605 // check how good a text can be represented by a specific codeset
1606 static int checkTextAgainstSingleCodeset(CONST_STRPTR text, ULONG textLen, struct codeset *codeset)
1608 int errors = textLen;
1610 ENTER();
1612 if(codeset->read_only == 0 &&
1613 codeset != CodesetsBase->utf8Codeset &&
1614 codeset != CodesetsBase->utf16Codeset &&
1615 codeset != CodesetsBase->utf32Codeset)
1617 CONST_STRPTR text_ptr = text;
1618 ULONG i;
1620 errors = 0;
1622 // the following identification/detection routine is NOT really smart.
1623 // we just see how each UTF8 string is the representation of each char
1624 // in our source text and then check if they are valid or not. As said,
1625 // not very smart, but we don't have anything better right now :(
1626 for(i=0; i < textLen; i++)
1628 unsigned char c = *text_ptr++;
1630 if(c != '\0')
1632 struct single_convert *f = &codeset->table[c];
1634 if(f->utf8[0] == 0x00 || f->utf8[1] == 0x00)
1635 errors++;
1637 else
1638 break;
1641 else
1642 W(DBF_STARTUP, "codeset '%s' is either read-only (%ld) or UTF8/16/32 (%ld)", codeset->name, codeset->read_only, codeset == CodesetsBase->utf8Codeset || codeset == CodesetsBase->utf16Codeset || codeset == CodesetsBase->utf32Codeset);
1644 D(DBF_STARTUP, "tried to identify text as '%s' text with %ld of %ld errors", codeset->name, errors, textLen);
1646 RETURN(errors);
1647 return errors;
1651 /// checkTextAgainstCodesetList
1652 static int checkTextAgainstCodesetList(CONST_STRPTR text, ULONG textLen, struct codesetList *csList, struct codeset **bestCodeset)
1654 struct Node *node;
1655 int bestErrors = textLen;
1657 ENTER();
1659 *bestCodeset = NULL;
1661 for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1663 struct codeset *codeset = (struct codeset *)node;
1664 int errors;
1666 errors = checkTextAgainstSingleCodeset(text, textLen, codeset);
1667 if(errors < bestErrors)
1669 *bestCodeset = codeset;
1670 bestErrors = errors;
1672 if(bestErrors == 0)
1673 break;
1677 RETURN(bestErrors);
1678 return bestErrors;
1682 /// codesetsFindBest()
1683 // Returns the best codeset for the given text
1684 static struct codeset *codesetsFindBest(struct TagItem *attrs, ULONG csFamily, CONST_STRPTR text, ULONG textLen, int *errorPtr)
1686 struct codeset *bestCodeset = NULL;
1687 int bestErrors = textLen;
1688 BOOL found = FALSE;
1690 ENTER();
1692 ObtainSemaphoreShared(&CodesetsBase->libSem);
1694 // in case the user specified the codeset family as a
1695 // cyrillic one we go and do our cyrillic specific analysis first
1696 if(csFamily == CSV_CodesetFamily_Cyrillic)
1698 #define NUM_CYRILLIC 3
1700 struct CodesetSearch
1702 const char *name;
1703 const char *data;
1706 struct CodesetSearch search[NUM_CYRILLIC];
1707 unsigned char *p;
1708 unsigned char *tp;
1709 int ctr[NUM_CYRILLIC];
1710 int Nmax;
1711 int NGlob = 1;
1712 int max;
1713 int gr = 0;
1714 int lr = 0;
1716 D(DBF_STARTUP, "performing cyrillic analysis");
1718 search[0].name = "windows-1251";
1719 search[0].data = cp1251_data;
1720 search[1].name = "IBM866";
1721 search[1].data = cp866_data;
1722 search[2].name = "KOI8-R";
1723 search[2].data = koi8r_data;
1725 memset(&ctr, 0, sizeof(ctr));
1727 tp = (unsigned char *)text;
1731 int n;
1732 int mid = max = -466725766; // TODO: what's the magic behind this constant?
1733 Nmax = 0;
1735 for(n=0; n < NUM_CYRILLIC; n++)
1737 unsigned char la = 0;
1738 unsigned char *tptr = (unsigned char *)search[n].data;
1740 p = tp;
1744 unsigned char lb = (*p++) ^ 128;
1746 if(!((la | lb) & 128))
1747 ctr[n] += (signed char)tptr[(la << 7) + lb];
1749 la = lb;
1751 while(*p);
1753 if(max < ctr[n])
1755 mid = max;
1756 max = ctr[n];
1757 Nmax = n+1;
1761 tp = p;
1762 if((max >= 500) && ((max-mid) >= 1000))
1764 lr = gr = 1;
1765 NGlob = Nmax;
1768 while((*p) && (!gr));
1770 if(gr || ((!(*p)) && lr))
1771 Nmax = NGlob;
1773 // if our analysis found something, we go and try
1774 // to find the corresponding codeset in out codeset list
1775 if(max != 0)
1777 struct TagItem *tstate = attrs;
1778 struct TagItem *tag;
1780 D(DBF_STARTUP, "identified text as '%s", search[Nmax-1].name);
1782 // now we walk through our taglist and check if the user
1783 // supplied
1784 while((tag = NextTagItem((APTR)&tstate)) != NULL)
1786 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1788 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1790 if((bestCodeset = codesetsFind(csList, search[Nmax-1].name)) != NULL)
1791 break;
1795 // if we still haven't found the matching codeset
1796 // we search the internal list
1797 if(bestCodeset == NULL)
1798 bestCodeset = codesetsFind(&CodesetsBase->codesets, search[Nmax-1].name);
1800 bestErrors = 0;
1802 found = TRUE;
1806 // if we haven't found the best codeset (through the cyrillic analysis)
1807 // we go and do the dumb latin search in our codesetlist
1808 if(found == FALSE)
1810 struct TagItem *tstate = attrs;
1811 struct TagItem *tag;
1813 // check text against all codesets in all supplied lists of codesets
1814 while((tag = NextTagItem((APTR)&tstate)) != NULL)
1816 switch(tag->ti_Tag)
1818 case CSA_CodesetList:
1820 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1821 struct codeset *bestCodesetInList;
1822 int bestErrorsInList;
1824 D(DBF_STARTUP, "checking against external codeset list");
1825 bestErrorsInList = checkTextAgainstCodesetList(text, textLen, csList, &bestCodesetInList);
1826 if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1828 bestCodeset = bestCodesetInList;
1829 bestErrors = bestErrorsInList;
1831 if(bestErrors == 0)
1832 break;
1835 break;
1839 // we didn't find a "best" codeset in the supplied codesets lists so far,
1840 // so now we check against our internal list
1841 if(bestErrors != 0)
1843 struct codeset *bestCodesetInList;
1844 int bestErrorsInList;
1846 D(DBF_STARTUP, "checking against internal codeset list");
1847 bestErrorsInList = checkTextAgainstCodesetList(text, textLen, &CodesetsBase->codesets, &bestCodesetInList);
1848 if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1850 bestCodeset = bestCodesetInList;
1851 bestErrors = bestErrorsInList;
1856 ReleaseSemaphore(&CodesetsBase->libSem);
1858 if(errorPtr != NULL)
1859 *errorPtr = bestErrors;
1861 RETURN(bestCodeset);
1862 return bestCodeset;
1867 /**************************************************************************/
1869 /// CodesetsSupportedA()
1870 STRPTR * LIBFUNC CodesetsSupportedA(REG(a0, UNUSED struct TagItem *attrs))
1872 STRPTR *array = NULL;
1873 struct TagItem *tstate = attrs;
1874 struct TagItem *tag;
1875 int numCodesets;
1877 ENTER();
1879 ObtainSemaphoreShared(&CodesetsBase->libSem);
1881 // first we need to check how many codesets our supplied
1882 // lists carry.
1883 numCodesets = countCodesets(&CodesetsBase->codesets);
1884 while((tag = NextTagItem((APTR)&tstate)) != NULL)
1886 switch(tag->ti_Tag)
1888 case CSA_CodesetList:
1890 numCodesets += countCodesets((struct codesetList *)tag->ti_Data);
1892 break;
1896 // now that we know how many codesets we have in our lists we
1897 // can put their names into our string arrays
1898 if(numCodesets > 0)
1900 if((array = allocArbitrateVecPooled((numCodesets+1)*sizeof(STRPTR))) != NULL)
1902 struct Node *node;
1903 int i=0;
1905 // first we walk through the internal codesets list and
1906 // add the names
1907 for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
1909 struct codeset *code = (struct codeset *)node;
1911 array[i] = code->name;
1912 i++;
1915 // reset the tstate
1916 tstate = attrs;
1918 // then we also iterate through our private codesets list
1919 while((tag = NextTagItem((APTR)&tstate)) != NULL)
1921 switch(tag->ti_Tag)
1923 case CSA_CodesetList:
1925 for(node = GetHead((struct List *)tag->ti_Data); node != NULL; node = GetSucc(node))
1927 struct codeset *code = (struct codeset *)node;
1929 array[i] = code->name;
1930 i++;
1933 break;
1937 array[i] = NULL;
1941 ReleaseSemaphore(&CodesetsBase->libSem);
1943 RETURN(array);
1944 return array;
1948 /// CodesetsFreeA()
1949 void LIBFUNC CodesetsFreeA(REG(a0, APTR obj), REG(a1, UNUSED struct TagItem *attrs))
1951 ENTER();
1953 if(obj != NULL)
1954 freeArbitrateVecPooled(obj);
1956 LEAVE();
1960 /// CodesetsSetDefaultA()
1961 struct codeset * LIBFUNC CodesetsSetDefaultA(REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
1963 struct codeset *codeset;
1965 ENTER();
1967 ObtainSemaphoreShared(&CodesetsBase->libSem);
1969 if((codeset = codesetsFind(&CodesetsBase->codesets, name)) != NULL)
1971 ULONG flags;
1973 flags = GVF_SAVE_VAR;
1974 if(GetTagData(CSA_Save, FALSE, attrs))
1975 SET_FLAG(flags, GVF_GLOBAL_ONLY);
1977 SetVar("codeset_default", codeset->name, strlen(codeset->name), flags);
1980 ReleaseSemaphore(&CodesetsBase->libSem);
1982 RETURN(codeset);
1983 return codeset;
1987 /// CodesetsFindA()
1988 struct codeset * LIBFUNC CodesetsFindA(REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
1990 struct codeset *codeset = NULL;
1992 ENTER();
1994 ObtainSemaphoreShared(&CodesetsBase->libSem);
1996 // if no name pointer was supplied we have to return
1997 // the default codeset only.
1998 if(name != NULL)
2000 // we first walk through our internal list and check if we
2001 // can find the requested codeset
2002 codeset = codesetsFind(&CodesetsBase->codesets, name);
2004 if(codeset == NULL)
2006 struct TagItem *tstate = attrs;
2007 struct TagItem *tag;
2009 // now we walk through our taglist and check if the user
2010 // supplied
2011 while((tag = NextTagItem((APTR)&tstate)) != NULL)
2013 if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
2015 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
2017 if((codeset = codesetsFind(csList, name)) != NULL)
2018 break;
2024 // check if we found something or not.
2025 if(codeset == NULL && GetTagData(CSA_FallbackToDefault, TRUE, attrs))
2026 codeset = defaultCodeset(FALSE);
2028 ReleaseSemaphore(&CodesetsBase->libSem);
2030 RETURN(codeset);
2031 return codeset;
2035 /// CodesetsFindBestA()
2036 struct codeset * LIBFUNC CodesetsFindBestA(REG(a0, struct TagItem *attrs))
2038 struct codeset *codeset = NULL;
2039 char *text;
2040 ULONG textLen;
2042 ENTER();
2044 ObtainSemaphoreShared(&CodesetsBase->libSem);
2046 text = (char *)GetTagData(CSA_Source, 0, attrs);
2047 textLen = GetTagData(CSA_SourceLen, text != NULL ? strlen(text) : 0, attrs);
2049 if(text != NULL && textLen != 0)
2051 int numErrors = 0;
2052 ULONG csFamily = GetTagData(CSA_CodesetFamily, CSV_CodesetFamily_Latin, attrs);
2053 int *errorPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2055 codeset = codesetsFindBest(attrs, csFamily, text, textLen, &numErrors);
2057 if(errorPtr != NULL)
2058 *errorPtr = numErrors;
2060 // if we still haven't got the codeset we fallback to the default
2061 if(codeset == NULL && GetTagData(CSA_FallbackToDefault, FALSE, attrs))
2062 codeset = defaultCodeset(FALSE);
2065 ReleaseSemaphore(&CodesetsBase->libSem);
2067 RETURN(codeset);
2068 return codeset;
2072 /// CodesetsUTF8Len()
2073 // Returns the number of characters a utf8 string has. This is not
2074 // identically with the size of memory is required to hold the string.
2075 ULONG LIBFUNC CodesetsUTF8Len(REG(a0, UTF8 *str))
2077 int len = 0;
2078 unsigned char c;
2080 ENTER();
2082 if(str != NULL)
2084 while((c = *str++))
2086 len++;
2087 str += trailingBytesForUTF8[c];
2091 RETURN((ULONG)len);
2092 return (ULONG)len;
2096 /// CodesetsStrLenA()
2097 ULONG LIBFUNC CodesetsStrLenA(REG(a0, STRPTR str), REG(a1, struct TagItem *attrs))
2099 ULONG res = 0;
2101 ENTER();
2103 if(str != NULL)
2105 struct codeset *codeset;
2106 int len;
2107 STRPTR src;
2108 int utf;
2110 if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2111 codeset = defaultCodeset(TRUE);
2113 if(codeset == CodesetsBase->utf32Codeset)
2115 utf = 32;
2116 len = utf32_strlen((UTF32 *)str);
2118 else if(codeset == CodesetsBase->utf16Codeset)
2120 utf = 16;
2121 len = utf16_strlen((UTF16 *)str);
2123 else
2125 utf = 0;
2126 len = strlen(str);
2129 len = GetTagData(CSA_SourceLen, len, attrs);
2131 src = str;
2133 if(utf != 0)
2135 void *srcend = src + len;
2136 UTF8 *dstlen = NULL;
2137 union TypeAliases srcAlias;
2138 union TypeAliases dstAlias;
2140 srcAlias.strptr = &src;
2141 dstAlias.utf8 = &dstlen;
2143 switch(utf)
2145 case 16:
2146 CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2147 break;
2149 case 32:
2150 CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2151 break;
2153 res = (IPTR)dstlen;
2155 else
2157 UBYTE c;
2159 res = 0;
2161 while((c = *src++) != '\0' && len != 0)
2163 res += codeset->table[c].utf8[0];
2164 len--;
2169 RETURN(res);
2170 return res;
2174 /// CodesetsUTF8ToStrA()
2175 // Converts an UTF8 string to a given charset and returns the destination
2176 // buffer. NULL is returned if no (or an empty) source string is supplied,
2177 // if an allocation fails, or if the output is delivered via CSA_DestHook
2178 // (dest is never assigned on the hook path).
//
// Tags read from attrs:
//   CSA_Source / CSA_SourceLen - UTF8 input and its length (defaults to
//                                strlen() of the source)
//   CSA_DestCodeset            - target codeset, defaultCodeset(TRUE) if absent
//   CSA_Dest / CSA_DestLen     - caller supplied destination buffer and size
//   CSA_AllocIfNeeded          - defaults to TRUE
//   CSA_Pool / CSA_PoolSem     - pool (and optional semaphore) to allocate from
//   CSA_DestHook               - receives the output in chunks via struct convertMsg
//   CSA_MapForeignChars        - use mapUTF8toASCII() for unmappable characters
//   CSA_MapForeignCharsHook    - hook asked when no replacement was found
//   CSA_ErrPtr                 - receives the number of '?' substitutions
//   CSA_DestLenPtr             - receives destLen-1 (or 0 if destLen is 0)
2179 STRPTR LIBFUNC CodesetsUTF8ToStrA(REG(a0, struct TagItem *attrs))
2181 UTF8 *src;
2182 ULONG srcLen;
2183 ULONG destLen = 0;
2184 ULONG *destLenPtr;
2185 ULONG n = 0;
2186 STRPTR dest = NULL;
2188 ENTER();
2190 if((src = (UTF8 *)GetTagData(CSA_Source, 0, attrs)) != NULL &&
2191 (srcLen = GetTagData(CSA_SourceLen, src != NULL ? strlen((char *)src) : 0, attrs)) > 0)
2193 struct convertMsg msg;
2194 struct codeset *codeset;
2195 struct Hook *destHook;
2196 struct Hook *mapForeignCharsHook;
2197 char buf[256];
2198 STRPTR destIter = NULL;
2199 char *b = NULL;
2200 int i = 0;
2201 unsigned char *s = src;
2202 unsigned char *e = (src+srcLen);
2203 int numConvErrors = 0;
2204 int *numConvErrorsPtr;
2205 BOOL mapForeignChars;
2206 APTR pool = NULL;
2207 struct SignalSemaphore *sem = NULL;
2208 int utf;
2209 ULONG char_size;
2211 // get some more optional attributes
2212 destHook = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2213 destLen = GetTagData(CSA_DestLen, 0, attrs);
2214 numConvErrorsPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2215 mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2216 mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
2218 // get the destination codeset pointer
2219 if((codeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
2220 codeset = defaultCodeset(TRUE);
// char_size is the size of one terminator unit of the destination encoding
2221 if(codeset == CodesetsBase->utf32Codeset)
2223 utf = 32;
2224 char_size = 4;
2226 else if(codeset == CodesetsBase->utf16Codeset)
2228 utf = 16;
2229 char_size = 2;
2231 else
2233 utf = 0;
2234 char_size = 1;
2237 // first we make sure we allocate enough memory
2238 // for our destination buffer
2239 if(destHook != NULL)
// hook output is staged in buf: chunk sizes outside 16..sizeof(buf)
// fall back to sizeof(buf)
2241 if(destLen < 16 || destLen > sizeof(buf))
2242 destLen = sizeof(buf);
2244 msg.state = CSV_Translating;
2245 b = buf;
2246 i = 0;
2248 else
2250 // in case the user wants us to dynamically generate the
2251 // destination buffer we do it right now
2252 if((dest = (STRPTR)GetTagData(CSA_Dest, 0, attrs)) == NULL ||
2253 GetTagData(CSA_AllocIfNeeded, TRUE, attrs) != FALSE)
2255 ULONG len = 0;
2257 // calculate the destLen
2258 if(utf)
// measuring pass: the converter gets a NULL target and a NULL target end,
// the value it leaves in dstlen is then taken as the required length
// (presumably the converter only counts in this mode - TODO confirm)
2260 void *dstlen = NULL;
2261 union TypeAliases srcAlias;
2262 union TypeAliases dstAlias;
2264 srcAlias.uchar = &s;
2265 dstAlias.voidptr = &dstlen;
2267 switch(utf)
2269 case 16:
2270 CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, NULL, 0);
2271 break;
2273 case 32:
2274 CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, NULL, 0);
2275 break;
2277 len = (IPTR)dstlen;
2279 else
// 8 bit destination: one output byte per UTF8 sequence
2281 while(s < e)
2283 unsigned char c = *s++;
2285 len++;
2286 s += trailingBytesForUTF8[c];
// (re)allocate only if there is no buffer yet or the supplied one is too small
2290 if(dest == NULL || (destLen < len+1))
2292 if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2294 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2295 ObtainSemaphore(sem);
2297 // allocate the destination buffer
2298 dest = allocVecPooled(pool, len+char_size);
2300 if(sem != NULL)
2301 ReleaseSemaphore(sem);
2303 else
2304 dest = allocArbitrateVecPooled(len+char_size);
2306 destLen = len+char_size;
2309 if(dest == NULL)
2311 RETURN(NULL);
2312 return NULL;
2316 destIter = dest;
2319 // now we convert the src string to the
2320 // destination buffer.
// s is rewound here because the length calculation above may have advanced it
2321 s = src;
2322 if(utf != 0)
2324 void *dstend;
2326 if(destHook != NULL)
2328 ULONG r = CSR_TargetExhausted;
// keep room for one terminator unit at the end of the staging buffer
2330 dstend = b + destLen - char_size;
2333 union TypeAliases srcAlias;
2334 union TypeAliases dstAlias;
2336 srcAlias.uchar = &s;
2337 dstAlias.schar = &b;
2339 switch(utf)
2341 case 16:
2342 r = CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2343 break;
2345 case 32:
2346 r = CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2347 break;
// terminate the chunk and pass it on; the loop repeats as long as the
// converter reports CSR_TargetExhausted
2349 b[0] = 0;
2350 if(char_size > 1)
2351 b[1] = 0;
2352 if(r != CSR_TargetExhausted)
2353 msg.state = CSV_End;
2354 msg.len = b-buf;
2355 CallHookPkt(destHook,&msg,buf);
2357 b = buf;
2358 n += msg.len;
2360 while(r == CSR_TargetExhausted);
2362 else
2364 union TypeAliases srcAlias;
2365 union TypeAliases dstAlias;
2367 srcAlias.uchar = &s;
2368 dstAlias.strptr = &destIter;
2369 dstend = destIter + destLen - char_size;
2370 switch(utf)
2372 case 16:
2373 CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2374 break;
2376 case 32:
2377 CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2378 break;
2380 n = destIter-dest;
2383 else
// 8 bit destination: n counts the source characters handled so far
2385 for(;;n++)
2387 if(destHook == NULL && n >= destLen-1)
2388 break;
2390 // convert until we reach the end of the
2391 // source buffer.
2392 if(s < e)
2394 unsigned char c = *s;
2395 unsigned char d = '?';
2396 const char *repstr = NULL;
2397 int replen = 0;
2399 // check if the char is a >7bit char
2400 if(c > 127)
2402 struct single_convert *f;
2403 int lenAdd = trailingBytesForUTF8[c];
2404 int lenStr = lenAdd+1;
2405 unsigned char *src = s;
// replen convention used in the loop below:
//   < 0  an UTF8 replacement of -replen bytes was delivered in repstr and
//        is looked up again in the next iteration
//   = 0  no replacement was found
//   > 0  length of a plain replacement string in repstr
// A direct hit in the codeset table sets replen to -1 and leaves the
// loop via break.
2409 // start each iteration with "no replacement found yet"
2410 repstr = NULL;
2411 replen = 0;
2413 // search in the UTF8 conversion table of the current charset if
2414 // we have a replacement character for the char sequence starting at s
2415 BIN_SEARCH(codeset->table_sorted, 0, 255, strncmp((char *)src, (char *)codeset->table_sorted[m].utf8+1, lenStr), f);
2417 if(f != NULL)
2419 d = f->code;
2420 replen = -1;
2422 break;
2424 else
2426 // the analysed char sequence (s) is not convertible to a
2427 // single visible char replacement, so we normally have to put
2428 // a ? sign as a "unknown char" sign at the very position.
2430 // For convenience we, however, allow users to replace these
2431 // UTF8 characters with char sequences that "look like" the
2432 // original char.
2433 if(mapForeignChars == TRUE)
2434 replen = mapUTF8toASCII(&repstr, src, lenStr);
2436 // call the hook only, if the internal table yielded no suitable
2437 // replacement
2438 if(replen == 0 && mapForeignCharsHook != NULL)
2440 struct replaceMsg rmsg;
2442 rmsg.dst = (char **)&repstr;
2443 rmsg.src = src;
2444 rmsg.srclen = lenStr;
2445 replen = CallHookPkt(mapForeignCharsHook, &rmsg, NULL);
2448 if(replen < 0)
2450 D(DBF_UTF, "got UTF8 replacement (%ld)", replen);
2452 // stay in the loop as long as one replacement function delivers
2453 // further UTF8 replacement sequences
2454 src = (unsigned char *)repstr;
2455 // remember the length of the replaced string, as we might do another
2456 // iteration in the loop which might result in a further replacement
2457 lenStr = -replen;
2459 else if(replen == 0)
2461 D(DBF_UTF, "found no ASCII replacement for UTF8 string (%ld)", replen);
2462 repstr = NULL;
2464 else
2465 D(DBF_UTF, "got replacement string '%s' (%ld)", repstr ? repstr : "<null>", replen);
2468 while(replen < 0);
// a table hit leaves repstr NULL with replen < 0, so d keeps the code
// found above; only a real miss becomes '?' and is counted as an error
2470 if(repstr == NULL || replen == 0)
2472 if(replen >= 0)
2474 d = '?';
2475 numConvErrors++;
// skip the trailing bytes of the sequence, the lead byte itself is
// skipped by the s++ at the end of the iteration
2479 s += lenAdd;
2481 else
2482 d = c;
2484 if(destHook != NULL)
2486 if(replen > 1)
// copy the replacement string byte by byte, flushing the staging buffer
// to the hook whenever it is full
2488 while(replen > 0)
2490 *b++ = *repstr;
2491 repstr++;
2492 i++;
2493 replen--;
2495 if(i%(destLen-1)==0)
2497 *b = '\0';
2498 msg.len = i;
2499 CallHookPkt(destHook, &msg, buf);
2501 b = buf;
2502 *b = '\0';
2503 i = 0;
2507 else
2509 *b++ = replen > 0 ? *repstr : d;
2510 i++;
2513 if(i%(destLen-1)==0)
2515 *b = '\0';
2516 msg.len = i;
2517 CallHookPkt(destHook, &msg, buf);
2519 b = buf;
2520 *b = '\0';
2521 i = 0;
2524 else
2526 if(replen > 1)
// the replacement needs replen-1 bytes more than the single byte that
// was accounted for, so the destination buffer has to grow
// NOTE(review): dest may still be the caller's CSA_Dest buffer at this
// point - confirm realloc*VecPooled() copes with memory it did not allocate
2528 ULONG destPos = destIter-dest;
2530 if(pool != NULL)
2532 if(sem != NULL)
2533 ObtainSemaphore(sem);
2535 // grow the destination buffer
2536 dest = reallocVecPooled(pool, dest, destLen, destLen+replen-1);
2538 if(sem != NULL)
2539 ReleaseSemaphore(sem);
2541 else
2542 dest = reallocArbitrateVecPooled(dest, destLen, destLen+replen-1);
2544 if(dest == NULL)
2546 RETURN(NULL);
2547 return NULL;
// dest may have moved, so destIter is rebuilt from the saved offset
2550 destIter = dest+destPos;
2551 memcpy(destIter, repstr, replen);
2553 // adjust our loop pointer and destination length
2554 destIter += replen;
2555 destLen += replen-1;
2557 else if(replen == 1)
2558 *destIter++ = *repstr;
2559 else
2560 *destIter++ = d;
2563 s++;
2565 else
2566 break;
// final step of the 8 bit path: flush the last chunk to the hook
// (state CSV_End) or terminate the destination buffer
2569 if(destHook != NULL)
2571 msg.state = CSV_End;
2572 msg.len = i;
2573 *b = '\0';
2574 CallHookPkt(destHook,&msg,buf);
2576 else
2577 *destIter = '\0';
2580 // let us write the number of conversion errors
2581 // to the proper variable pointer, if wanted
2582 if(numConvErrorsPtr != NULL)
2583 *numConvErrorsPtr = numConvErrors;
2586 // report the length to the caller via CSA_DestLenPtr
// NOTE(review): the value stored is destLen-1 (derived from the buffer or
// chunk size), not the counter n maintained above - confirm this is intended
2588 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2590 if(destLen > 0)
2591 *destLenPtr = destLen-1;
2592 else
2593 *destLenPtr = 0;
2596 RETURN(dest);
2597 return dest;
2601 /// CodesetsUTF8CreateA()
2602 // Converts a string and a charset to an UTF8. Returns the UTF8.
2603 // If a destination hook is supplied always return 0.
2604 // If from is NULL, it returns NULL and doesn't call the hook.
2605 UTF8 * LIBFUNC CodesetsUTF8CreateA(REG(a0, struct TagItem *attrs))
2607 UTF8 *from;
2608 UTF8 *dest;
2609 struct codeset *codeset;
2610 ULONG fromLen, *destLenPtr;
2611 ULONG n;
2612 int utf;
2614 ENTER();
2616 dest = NULL;
2617 n = 0;
2619 if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2620 codeset = defaultCodeset(TRUE);
2621 if(codeset == CodesetsBase->utf32Codeset)
2622 utf = 32;
2623 else if(codeset == CodesetsBase->utf16Codeset)
2624 utf = 16;
2625 else
2626 utf = 0;
2628 from = (UTF8 *)GetTagData(CSA_Source, 0, attrs);
2629 if(from != NULL)
2631 switch(utf)
2633 case 32:
2634 fromLen = utf32_strlen((UTF32 *)from);
2635 break;
2637 case 16:
2638 fromLen = utf16_strlen((UTF16 *)from);
2639 break;
2641 default:
2642 fromLen = strlen((char *)from);
2643 break;
2646 else
2647 fromLen = 0;
2648 fromLen = GetTagData(CSA_SourceLen, fromLen, attrs);
2650 if(from != NULL && fromLen != 0)
2652 struct convertMsg msg;
2653 struct Hook *hook;
2654 ULONG destLen;
2655 int i = 0;
2656 TEXT buf[256];
2657 STRPTR src, destPtr = NULL, b = NULL;
2658 ULONG c;
2660 hook = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2661 destLen = GetTagData(CSA_DestLen, 0, attrs);
2663 if(hook != NULL)
2665 if(destLen<16 || destLen>sizeof(buf))
2666 destLen = sizeof(buf);
2668 msg.state = CSV_Translating;
2669 b = buf;
2670 i = 0;
2672 else
2674 if((dest = (UTF8 *)GetTagData(CSA_Dest, 0, attrs)) != NULL ||
2675 GetTagData(CSA_AllocIfNeeded, TRUE, attrs))
2677 ULONG len;
2679 src = (STRPTR)from;
2681 if(utf != 0)
2683 void *srcend = src + fromLen;
2684 UTF8 *dstlen = NULL;
2685 union TypeAliases srcAlias;
2686 union TypeAliases dstAlias;
2688 srcAlias.strptr = &src;
2689 dstAlias.utf8 = &dstlen;
2691 switch(utf)
2693 case 16:
2694 CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2695 break;
2697 case 32:
2698 CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2699 break;
2701 len = (IPTR)dstlen;
2703 else
2705 ULONG flen = fromLen;
2707 len = 0;
2708 while((c = *src++) != '\0' && flen != 0)
2710 len += codeset->table[c].utf8[0];
2711 flen--;
2714 D(DBF_UTF, "Calculated output UTF-8 buffer length: %lu", len);
2716 if(dest == NULL || (destLen<len+1))
2718 APTR pool;
2719 struct SignalSemaphore *sem;
2721 if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2723 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2724 ObtainSemaphore(sem);
2726 // allocate the destination buffer
2727 dest = allocVecPooled(pool,len+1);
2729 if(sem != NULL)
2730 ReleaseSemaphore(sem);
2732 else
2733 dest = allocArbitrateVecPooled(len+1);
2735 destLen = len;
2738 if(dest == NULL)
2740 RETURN(NULL);
2741 return NULL;
2745 destPtr = (STRPTR)dest;
2748 src = (STRPTR)from;
2749 if(utf != 0)
2751 void *srcend = src + fromLen;
2752 UTF8 *dstend;
2754 if(hook != NULL)
2756 ULONG r = CSR_TargetExhausted;
2757 union TypeAliases srcAlias;
2758 union TypeAliases dstAlias;
2760 srcAlias.strptr = &src;
2761 dstAlias.strptr = &b;
2762 dstend = (UTF8 *)(b + destLen - 1);
2765 switch(utf)
2767 case 16:
2768 r = CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2769 break;
2771 case 32:
2772 r = CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2773 break;
2775 *b = 0;
2776 if(r != CSR_TargetExhausted)
2777 msg.state = CSV_End;
2778 msg.len = b-buf;
2779 CallHookPkt(hook,&msg,buf);
2781 b = buf;
2782 n += msg.len;
2784 while(r == CSR_TargetExhausted);
2786 else
2788 union TypeAliases srcAlias;
2789 union TypeAliases dstAlias;
2791 srcAlias.strptr = &src;
2792 dstAlias.strptr = &destPtr;
2793 dstend = (UTF8 *)(destPtr + destLen);
2794 switch(utf)
2796 case 16:
2797 CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2798 break;
2800 case 32:
2801 CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2802 break;
2804 n = destPtr-(STRPTR)dest;
2807 else
2809 for(; fromLen && (c = *src); src++, fromLen--)
2811 UTF8 *utf8_seq;
2813 for(utf8_seq = &codeset->table[c].utf8[1]; (c = *utf8_seq); utf8_seq++)
2815 if(hook != NULL)
2817 *b++ = c;
2818 i++;
2820 if(i%(destLen-1)==0)
2822 *b = 0;
2823 msg.len = i;
2824 CallHookPkt(hook,&msg,buf);
2826 b = buf;
2827 *b = 0;
2828 i = 0;
2831 else
2833 if(n>=destLen)
2834 break;
2836 *destPtr++ = c;
2839 n++;
2843 if(hook != NULL)
2845 msg.state = CSV_End;
2846 msg.len = i;
2847 *b = 0;
2848 CallHookPkt(hook,&msg,buf);
2850 else
2852 *destPtr = 0;
2857 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2858 *destLenPtr = n;
2860 RETURN(dest);
2861 return dest;
2865 /// CodesetsIsValidUTF8()
2866 #define GOOD_UCS(c) \
2867 ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2868 (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
2870 BOOL LIBFUNC CodesetsIsValidUTF8(REG(a0, STRPTR s))
2872 STRPTR t = s;
2873 int n;
2875 ENTER();
2877 while((n = parseUtf8(&t)) != 0)
2879 if(!GOOD_UCS(n))
2881 RETURN(FALSE);
2882 return FALSE;
2886 RETURN(TRUE);
2887 return TRUE;
2891 /// CodesetsConvertStrA()
2892 // Converts a given string from one source Codeset to a given destination
2893 // codeset and returns the convert string
2894 STRPTR LIBFUNC CodesetsConvertStrA(REG(a0, struct TagItem *attrs))
2896 struct codeset *srcCodeset;
2897 STRPTR srcStr = NULL;
2898 STRPTR dstStr = NULL;
2899 ULONG srcLen = 0;
2900 ULONG dstLen = 0;
2901 ULONG charSize = 0;
2903 ENTER();
2905 // get the ptr to the src string we want to convert
2906 // from the source codeset to the dest codeset.
2907 srcStr = (STRPTR)GetTagData(CSA_Source, 0, attrs);
2909 // get the pointer to the codeset in which the src string is encoded
2910 if((srcCodeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2911 srcCodeset = defaultCodeset(TRUE);
2913 if(srcStr != NULL)
2915 if(srcCodeset == CodesetsBase->utf32Codeset)
2917 srcLen = utf32_strlen((UTF32 *)srcStr);
2918 charSize = sizeof(UTF32);
2920 else if(srcCodeset == CodesetsBase->utf16Codeset)
2922 srcLen = utf16_strlen((UTF16 *)srcStr);
2923 charSize = sizeof(UTF16);
2925 else
2927 srcLen = strlen(srcStr);
2928 charSize = sizeof(char);
2931 else
2932 srcLen = 0;
2933 srcLen = GetTagData(CSA_SourceLen, srcLen, attrs);
2935 if(srcStr != NULL && srcLen > 0)
2937 struct codeset *dstCodeset;
2939 // get the pointer to the codeset in which the dst string should be encoded
2940 if((dstCodeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
2941 dstCodeset = defaultCodeset(TRUE);
2943 D(DBF_UTF, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset->name, dstCodeset->name);
2945 if(srcCodeset != NULL && dstCodeset != NULL)
2947 // check that the user didn't supplied the very same codeset
2948 // or otherwise a conversion is not required.
2949 if(srcCodeset != dstCodeset)
2951 BOOL utf8Create = FALSE;
2952 BOOL strCreate = FALSE;
2953 UTF8 *utf8str;
2954 ULONG utf8strLen = 0;
2955 ULONG *destLenPtr = NULL;
2956 BOOL mapForeignChars;
2957 struct Hook *mapForeignCharsHook;
2959 mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2960 mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
2962 // if the source codeset is UTF-8 we don't have to use the UTF8Create()
2963 // function and can directly call the UTF8ToStr() function
2964 if(srcCodeset != CodesetsBase->utf8Codeset)
2966 struct TagItem tags[] = { { CSA_SourceCodeset, (IPTR)srcCodeset },
2967 { CSA_Source, (IPTR)srcStr },
2968 { CSA_SourceLen, srcLen },
2969 { CSA_DestLenPtr, (IPTR)&utf8strLen },
2970 { TAG_DONE, 0 } };
2972 utf8str = CodesetsUTF8CreateA((struct TagItem *)&tags[0]);
2974 utf8Create = TRUE;
2976 else
2978 utf8str = (UTF8 *)srcStr;
2979 utf8strLen = srcLen;
2982 // in case the destination codeset is UTF-8 we don't have to actually
2983 // use the UTF8ToStr() function and can immediately return our
2984 // UTF8 string
2985 if(utf8str != NULL && utf8strLen > 0 && dstCodeset != CodesetsBase->utf8Codeset)
2987 struct TagItem tags[] = { { CSA_DestCodeset, (IPTR)dstCodeset },
2988 { CSA_Source, (IPTR)utf8str },
2989 { CSA_SourceLen, utf8strLen },
2990 { CSA_DestLenPtr, (IPTR)&dstLen },
2991 { CSA_MapForeignChars, mapForeignChars },
2992 { CSA_MapForeignCharsHook, (IPTR)mapForeignCharsHook },
2993 { TAG_DONE, 0 } };
2995 dstStr = CodesetsUTF8ToStrA((struct TagItem *)&tags[0]);
2997 strCreate = TRUE;
2999 else
3001 dstStr = (STRPTR)utf8str;
3002 dstLen = utf8strLen;
3005 D(DBF_UTF, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr, srcLen,
3006 dstStr, dstLen,
3007 utf8Create,
3008 strCreate);
3010 // if everything was successfull we can go and finalize everything
3011 if(dstStr != NULL && utf8str != NULL)
3013 // as the conversion was a two way pass we have to either free the
3014 // memory of the utf8 string or not
3015 if(utf8Create == TRUE && strCreate == TRUE)
3016 CodesetsFreeA(utf8str, NULL);
3018 // if the user wants to be informed abour the length
3019 // of our destination string we store the length now in the supplied ptr.
3020 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3021 *destLenPtr = dstLen;
3023 D(DBF_UTF, "successfully converted string with len %ld", dstLen);
3025 else
3027 W(DBF_ALWAYS, "an error occurred while trying to convert a string");
3029 // free all memory in case the conversion didn't work out
3030 if(utf8Create == TRUE && utf8str != NULL)
3031 CodesetsFreeA(utf8str, NULL);
3033 if(strCreate == TRUE && dstStr != NULL)
3034 CodesetsFreeA(dstStr, NULL);
3036 dstStr = NULL;
3039 else
3041 // we got the same source and destination codesets passed in
3042 // instead of failing silently we just create a copy of the source string
3043 ULONG *destLenPtr = NULL;
3045 // allocate memory for the destination string, including a trailing NUL byte
3046 if((dstStr = allocArbitrateVecPooled(srcLen + charSize)) != NULL)
3048 // just copy the source string without any further modification
3049 // we must use memcpy() as the source string could be UTF16/32 encoded and
3050 // thus strcpy() would not do what we want.
3051 memcpy(dstStr, srcStr, srcLen + charSize);
3052 dstLen = srcLen;
3053 D(DBF_UTF, "successfully copied string with len %ld", dstLen);
3055 else
3056 W(DBF_ALWAYS, "no memory for dest string");
3058 // if the user wants to be informed abour the length
3059 // of our destination string we store the length now in the supplied ptr.
3060 if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3061 *destLenPtr = dstLen;
3066 RETURN(dstStr);
3067 return dstStr;
3071 /// CodesetsFreeVecPooledA()
3072 void LIBFUNC CodesetsFreeVecPooledA(REG(a0, APTR pool), REG(a1, APTR mem), REG(a2, struct TagItem *attrs))
3074 ENTER();
3076 if(pool != NULL && mem != NULL)
3078 struct SignalSemaphore *sem;
3080 if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
3081 ObtainSemaphore(sem);
3083 freeVecPooled(pool,mem);
3085 if(sem != NULL)
3086 ReleaseSemaphore(sem);
3089 LEAVE();
3093 /// CodesetsListCreateA()
3094 struct codesetList * LIBFUNC CodesetsListCreateA(REG(a0, struct TagItem *attrs))
3096 struct codesetList *csList = NULL;
3098 ENTER();
3100 // no matter what, we create a codesets list we will return to the user
3101 if((csList = allocArbitrateVecPooled(sizeof(struct codesetList))) != NULL)
3103 BOOL scanProgDir = TRUE;
3104 struct TagItem *tstate = attrs;
3105 struct TagItem *tag;
3107 // initialize the new private codeset list and put it into a separate list
3108 NewList((struct List *)csList);
3110 // first we get the path of the directory from which we go
3111 // and scan for charset tables from
3112 while((tag = NextTagItem((APTR)&tstate)) != NULL)
3114 switch(tag->ti_Tag)
3116 case CSA_CodesetDir:
3118 codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3120 scanProgDir = FALSE;
3122 break;
3124 case CSA_CodesetFile:
3126 codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3128 scanProgDir = FALSE;
3130 break;
3132 case CSA_SourceCodeset:
3134 struct codeset *cs = (struct codeset *)tag->ti_Data;
3136 AddTail((struct List *)csList, (struct Node *)&cs->node);
3138 scanProgDir = FALSE;
3140 break;
3144 // in case the user also wants us to scan PROGDIR:
3145 // we do so
3146 if(scanProgDir == TRUE)
3147 codesetsScanDir(csList, "PROGDIR:Charsets");
3150 RETURN(csList);
3151 return csList;
3155 /// CodesetsListDeleteA()
3156 BOOL LIBFUNC CodesetsListDeleteA(REG(a0, struct TagItem *attrs))
3158 BOOL result = FALSE;
3159 struct TagItem *tstate = attrs;
3160 struct TagItem *tag;
3161 BOOL freeCodesets;
3163 ENTER();
3165 // check if the caller wants us also to free the codesets
3166 freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3168 // now we iterate through or tagItems and see what the
3169 // user wants to remove from the list
3170 while((tag = NextTagItem((APTR)&tstate)) != NULL)
3172 switch(tag->ti_Tag)
3174 case CSA_CodesetList:
3176 struct codesetList *csList = (struct codesetList *)tag->ti_Data;
3178 if(csList != NULL)
3180 // cleanup the codesets within the list
3181 if(freeCodesets == TRUE)
3182 codesetsCleanup(csList);
3184 // then free the list itself
3185 freeArbitrateVecPooled(csList);
3187 result = TRUE;
3190 break;
3194 RETURN(result);
3195 return result;
3199 /// CodesetsListAddA()
3200 BOOL LIBFUNC CodesetsListAddA(REG(a0, struct codesetList *csList), REG(a1, struct TagItem *attrs))
3202 BOOL result = FALSE;
3204 ENTER();
3206 if(csList != NULL)
3208 struct TagItem *tstate = attrs;
3209 struct TagItem *tag;
3211 // now we iterate through or tagItems and see if the user
3212 // wants to scan a whole directory or just adds a file.
3213 while((tag = NextTagItem((APTR)&tstate)) != NULL)
3215 switch(tag->ti_Tag)
3217 case CSA_CodesetDir:
3219 codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3220 result = TRUE;
3222 break;
3224 case CSA_CodesetFile:
3226 codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3227 result = TRUE;
3229 break;
3231 case CSA_SourceCodeset:
3233 struct codeset *cs = (struct codeset *)tag->ti_Data;
3235 AddTail((struct List *)csList, (struct Node *)&cs->node);
3236 result = TRUE;
3238 break;
3243 RETURN(result);
3244 return result;
3248 /// CodesetsListRemoveA()
3249 BOOL LIBFUNC CodesetsListRemoveA(REG(a0, struct TagItem *attrs))
3251 BOOL result = FALSE;
3252 struct TagItem *tstate = attrs;
3253 struct TagItem *tag;
3254 BOOL freeCodesets;
3256 ENTER();
3258 // check if the caller wants us also to free the codesets
3259 freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3261 // now we iterate through or tagItems and see what the
3262 // user wants to remove from the list
3263 while((tag = NextTagItem((APTR)&tstate)) != NULL)
3265 switch(tag->ti_Tag)
3267 case CSA_SourceCodeset:
3269 struct codeset *removeCS = (struct codeset *)tag->ti_Data;
3271 if(removeCS != NULL)
3273 struct Node *node;
3274 BOOL isExternalNode = TRUE;
3276 ObtainSemaphore(&CodesetsBase->libSem);
3278 // iterate over our internal list an check whether the given
3279 // node is part of that list
3280 for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
3282 if((struct codeset *)node == removeCS)
3284 isExternalNode = FALSE;
3285 break;
3289 ReleaseSemaphore(&CodesetsBase->libSem);
3291 if(isExternalNode == TRUE)
3293 Remove((struct Node *)removeCS);
3295 // free all codesets data if requested
3296 if(freeCodesets == TRUE)
3298 if(removeCS->name != NULL)
3299 freeArbitrateVecPooled(removeCS->name);
3300 if(removeCS->alt_name != NULL)
3301 freeArbitrateVecPooled(removeCS->alt_name);
3302 if(removeCS->characterization != NULL)
3303 freeArbitrateVecPooled(removeCS->characterization);
3305 freeArbitrateVecPooled(removeCS);
3308 result = TRUE;
3310 else
3311 W(DBF_ALWAYS, "user tried to remove an internal codeset!");
3314 break;
3318 RETURN(result);
3319 return result;
3324 /**************************************************************************/