1 /***************************************************************************
3 codesets.library - Amiga shared library for handling different codesets
4 Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
5 Copyright (C) 2005-2013 by codesets.library Open Source Team
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 codesets.library project: http://sourceforge.net/projects/codesetslib/
19 Most of the code included in this file was relicensed from GPL to LGPL
20 from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
21 with full permissions by its authors.
25 ***************************************************************************/
29 #include <clib/alib_protos.h>
31 #include <diskfont/glyph.h>
32 #include <diskfont/diskfonttag.h>
33 #include <proto/diskfont.h>
38 #include <proto/keymap.h>
39 #include <proto/locale.h>
42 #include "codesets_table.h"
43 #include "convertUTF.h"
44 #include "codepages.h"
46 #include "SDI_stdarg.h"
50 /**************************************************************************/
52 // a union used for various type casts while avoiding the annoying "dereferencing
53 // type-punned pointer will break strict-aliasing rules" warnings of GCC4+
58 unsigned char **uchar
;
69 // search a sorted array in O(log n) e.g.
70 // BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0]),strcmp(key,array[mid]),res);
// The probe index is m = (low+high)/2. When `d` is zero the macro stores
// &array[m] in `result` and breaks out; a negative `d` moves the upper
// bound `h` down to m - 1.
// NOTE(review): the probe index is named `m`, while the usage example above
// refers to array[mid] - confirm which name the compare expression must use.
// NOTE(review): `low` and `high` are used unparenthesized in (low+high)/2, so
// pass plain values/identifiers rather than compound expressions.
71 #define BIN_SEARCH(array,low,high,compare,result) \
75 int m = (low+high)/2;\
80 if (!d){ result = &array[m]; break; }\
81 if (d < 0) h = m - 1;\
// mystrdup(): duplicates the C string `str`.
// For a string with strlen() > 0 a buffer of len+1 bytes is requested from
// allocArbitrateVecPooled() and, if that succeeds, filled with strlcpy()
// (size len+1, so the copy is NUL-terminated).
// NOTE(review): an empty string does not trigger an allocation; presumably
// NULL is returned in that case and on allocation failure - confirm.
89 static STRPTR
mystrdup(const char *str
)
99 if((len
= strlen(str
)) > 0)
101 if((newStr
= allocArbitrateVecPooled(len
+1)) != NULL
)
102 strlcpy(newStr
, str
, len
+1);
// mystrndup(): duplicates at most `n` characters of `str1`.
// A buffer of n+1 bytes is requested from allocArbitrateVecPooled(); on
// success strlcpy() with size n+1 copies up to n characters and terminates
// the result.
// NOTE(review): `n` is a signed int and is not range-checked in the code
// shown here - callers presumably guarantee n >= 0, confirm.
112 static STRPTR
mystrndup(const char *str1
, int n
)
118 if((dest
= allocArbitrateVecPooled(n
+1)) != NULL
)
121 strlcpy(dest
, str1
, n
+1);
// readLine(): reads the next line from file handle `fh` into `buf`.
//   fh   - file handle passed on to FGets()
//   buf  - destination buffer
//   size - size handed to FGets() for `buf`
// `success` starts out FALSE; a non-NULL result from FGets() counts as
// "something was read". The line is then scanned for the first LF or CR.
// NOTE(review): presumably that character is overwritten with a NUL and TRUE
// is returned - confirm.
134 static BOOL
readLine(BPTR fh
, char *buf
, ULONG size
)
136 BOOL success
= FALSE
;
141 if((c
= FGets(fh
, buf
, size
)) != NULL
)
143 // we succeeded in reading something
146 // now find the end of the line and strip the LF/CR character
149 if(*c
== '\n' || *c
== '\r')
// getConfigItem(): checks whether the line `buf` starts with the keyword
// `item` (case-insensitive comparison of the first `len` characters via
// strnicmp()) and, if so, locates the value that follows it.
// The result starts out as NULL, so NULL is what a non-matching line yields.
// Whitespace is skipped twice with isspace().
// NOTE(review): presumably once before and once after a separator between
// keyword and value - confirm.
163 static const char *getConfigItem(const char *buf
, const char *item
)
165 const char *configItem
= NULL
;
172 if(strnicmp(buf
, item
, len
) == 0)
// skip whitespace following the keyword
179 while((c
= *buf
) != '\0' && isspace(c
))
// skip whitespace in front of the value
187 while((c
= *buf
) != '\0' && isspace(c
))
// parseUtf8(): decodes one UTF-8 encoded character and returns its code point.
//   ps - pointer to the string pointer to decode from
// NOTE(review): presumably *ps is advanced past the consumed bytes and a
// negative value signals a malformed sequence - confirm.
200 static int parseUtf8(STRPTR
*ps
)
// two-byte form: the second byte must be a continuation byte (10xxxxxx)
224 if((s
[1] & 0xc0)!=0x80)
// combine the low 5 bits of the lead byte with the low 6 bits of the
// continuation byte
232 RETURN(((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f));
233 return ((s
[0] & 0x1f)<<6) | (s
[1] & 0x3f);
// longer forms: an n-byte sequence carries its first (7-n) payload bits in
// the lead byte
270 wc
= *s
++ & ((1<<(7-n
))-1);
// every following byte must again be a continuation byte ...
274 if((*s
& 0xc0) != 0x80)
// ... and contributes 6 more bits
280 wc
= (wc
<< 6) | (*s
++ & 0x3f);
// 1 << (5*n - 4) is 0x800 for n == 3 and 0x10000 for n == 4, i.e. the
// smallest value that needs n bytes; anything below is an overlong encoding
283 if(wc
< (1 << (5 * n
- 4)))
// countCodesets(): walks the list `csList` from GetHead() along GetSucc()
// until the end (NULL) is reached.
// NOTE(review): presumably a counter is incremented per node and returned as
// the number of codesets in the list - confirm.
297 static int countCodesets(struct codesetList
*csList
)
302 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
// In case some UTF8 sequences can not be converted during CodesetsUTF8ToStrA(),
// mapUTF8toASCII() replaces these unknown sequences with lookalike characters
// that still keep the text readable. For more replacements see
// http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
//
// The conversion table is partly borrowed from the awebcharset plugin
// written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c

// One entry of the replacement table; also used as the search key.
struct UTF8Replacement
{
  const char *utf8;   // the original UTF8 string we are going to replace
  const int utf8len;  // the length of the UTF8 string
  const char *rep;    // pointer to the replacement string
  const int replen;   // the length of the replacement string (minus for signalling an UTF8 string)
};

// Ordering function for bsearch() over the replacement table.
//   p1 - the search key (struct UTF8Replacement)
//   p2 - a table entry (struct UTF8Replacement)
// Returns <0, 0 or >0. Entries are ordered by the length of their UTF8
// sequence first and by the raw bytes of the sequence second, so the table
// has to be kept sorted in exactly that order.
static int compareUTF8Replacements(const void *p1, const void *p2)
{
  // both objects are only read, so keep the const qualifier of the arguments
  const struct UTF8Replacement *key = p1;
  const struct UTF8Replacement *rep = p2;
  int cmp;

  // compare the length first, after that compare the strings
  cmp = key->utf8len - rep->utf8len;
  if(cmp == 0)
    cmp = memcmp(key->utf8, rep->utf8, key->utf8len);

  return cmp;
}
340 static int mapUTF8toASCII(const char **dst
, const unsigned char *src
, const int utf8len
)
343 struct UTF8Replacement key
= { (char *)src
, utf8len
, NULL
, 0 };
344 struct UTF8Replacement
*rep
;
346 static struct UTF8Replacement
const utf8map
[] =
348 // U+0100 ... U+017F (Latin Extended-A)
349 { "\xC4\x80", 2, "A", 1 }, // U+0100 -> A (LATIN CAPITAL LETTER A WITH MACRON)
350 { "\xC4\x81", 2, "a", 1 }, // U+0101 -> a (LATIN SMALL LETTER A WITH MACRON)
351 { "\xC4\x82", 2, "A", 1 }, // U+0102 -> A (LATIN CAPITAL LETTER A WITH BREVE)
352 { "\xC4\x83", 2, "a", 1 }, // U+0103 -> a (LATIN SMALL LETTER A WITH BREVE)
353 { "\xC4\x84", 2, "A", 1 }, // U+0104 -> A (LATIN CAPITAL LETTER A WITH OGONEK)
354 { "\xC4\x85", 2, "a", 1 }, // U+0105 -> a (LATIN SMALL LETTER A WITH OGONEK)
355 { "\xC4\x86", 2, "C", 1 }, // U+0106 -> C (LATIN CAPITAL LETTER C WITH ACUTE)
356 { "\xC4\x87", 2, "c", 1 }, // U+0107 -> c (LATIN SMALL LETTER C WITH ACUTE)
357 { "\xC4\x88", 2, "C", 1 }, // U+0108 -> C (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
358 { "\xC4\x89", 2, "c", 1 }, // U+0109 -> c (LATIN SMALL LETTER C WITH CIRCUMFLEX)
359 { "\xC4\x8A", 2, "C", 1 }, // U+010A -> C (LATIN CAPITAL LETTER C WITH DOT ABOVE)
360 { "\xC4\x8B", 2, "c", 1 }, // U+010B -> c (LATIN SMALL LETTER C WITH DOT ABOVE)
361 { "\xC4\x8C", 2, "C", 1 }, // U+010C -> C (LATIN CAPITAL LETTER C WITH CARON)
362 { "\xC4\x8D", 2, "c", 1 }, // U+010D -> c (LATIN SMALL LETTER C WITH CARON)
363 { "\xC4\x8E", 2, "D", 1 }, // U+010E -> D (LATIN CAPITAL LETTER D WITH CARON)
364 { "\xC4\x8F", 2, "d", 1 }, // U+010F -> d (LATIN SMALL LETTER D WITH CARON)
365 { "\xC4\x90", 2, "D", 1 }, // U+0110 -> D (LATIN CAPITAL LETTER D WITH STROKE)
366 { "\xC4\x91", 2, "d", 1 }, // U+0111 -> d (LATIN SMALL LETTER D WITH STROKE)
367 { "\xC4\x92", 2, "E", 1 }, // U+0112 -> E (LATIN CAPITAL LETTER E WITH MACRON)
368 { "\xC4\x93", 2, "e", 1 }, // U+0113 -> e (LATIN SMALL LETTER E WITH MACRON)
369 { "\xC4\x94", 2, "E", 1 }, // U+0114 -> E (LATIN CAPITAL LETTER E WITH BREVE)
370 { "\xC4\x95", 2, "e", 1 }, // U+0115 -> e (LATIN SMALL LETTER E WITH BREVE)
371 { "\xC4\x96", 2, "E", 1 }, // U+0116 -> E (LATIN CAPITAL LETTER E WITH DOT ABOVE)
372 { "\xC4\x97", 2, "e", 1 }, // U+0117 -> e (LATIN SMALL LETTER E WITH DOT ABOVE)
373 { "\xC4\x98", 2, "E", 1 }, // U+0118 -> E (LATIN CAPITAL LETTER E WITH OGONEK)
374 { "\xC4\x99", 2, "e", 1 }, // U+0119 -> e (LATIN SMALL LETTER E WITH OGONEK)
375 { "\xC4\x9A", 2, "E", 1 }, // U+011A -> E (LATIN CAPITAL LETTER E WITH CARON)
376 { "\xC4\x9B", 2, "e", 1 }, // U+011B -> e (LATIN SMALL LETTER E WITH CARON)
377 { "\xC4\x9C", 2, "G", 1 }, // U+011C -> G (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
378 { "\xC4\x9D", 2, "g", 1 }, // U+011D -> g (LATIN SMALL LETTER G WITH CIRCUMFLEX)
379 { "\xC4\x9E", 2, "G", 1 }, // U+011E -> G (LATIN CAPITAL LETTER G WITH BREVE)
380 { "\xC4\x9F", 2, "g", 1 }, // U+011F -> g (LATIN SMALL LETTER G WITH BREVE)
381 { "\xC4\xA0", 2, "G", 1 }, // U+0120 -> G (LATIN CAPITAL LETTER G WITH DOT ABOVE)
382 { "\xC4\xA1", 2, "g", 1 }, // U+0121 -> g (LATIN SMALL LETTER G WITH DOT ABOVE)
383 { "\xC4\xA2", 2, "G", 1 }, // U+0122 -> G (LATIN CAPITAL LETTER G WITH CEDILLA)
384 { "\xC4\xA3", 2, "g", 1 }, // U+0123 -> g (LATIN SMALL LETTER G WITH CEDILLA)
385 { "\xC4\xA4", 2, "H", 1 }, // U+0124 -> H (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
386 { "\xC4\xA5", 2, "h", 1 }, // U+0125 -> h (LATIN SMALL LETTER H WITH CIRCUMFLEX)
387 { "\xC4\xA6", 2, "H", 1 }, // U+0126 -> H (LATIN CAPITAL LETTER H WITH STROKE)
388 { "\xC4\xA7", 2, "h", 1 }, // U+0127 -> h (LATIN SMALL LETTER H WITH STROKE)
389 { "\xC4\xA8", 2, "I", 1 }, // U+0128 -> I (LATIN CAPITAL LETTER I WITH TILDE)
390 { "\xC4\xA9", 2, "i", 1 }, // U+0129 -> i (LATIN SMALL LETTER I WITH TILDE)
391 { "\xC4\xAA", 2, "I", 1 }, // U+012A -> I (LATIN CAPITAL LETTER I WITH MACRON)
392 { "\xC4\xAB", 2, "i", 1 }, // U+012B -> i (LATIN SMALL LETTER I WITH MACRON)
393 { "\xC4\xAC", 2, "I", 1 }, // U+012C -> I (LATIN CAPITAL LETTER I WITH BREVE)
394 { "\xC4\xAD", 2, "i", 1 }, // U+012D -> i (LATIN SMALL LETTER I WITH BREVE)
395 { "\xC4\xAE", 2, "I", 1 }, // U+012E -> I (LATIN CAPITAL LETTER I WITH OGONEK)
396 { "\xC4\xAF", 2, "i", 1 }, // U+012F -> i (LATIN SMALL LETTER I WITH OGONEK)
397 { "\xC4\xB0", 2, "I", 1 }, // U+0130 -> I (LATIN CAPITAL LETTER I WITH DOT ABOVE)
398 { "\xC4\xB1", 2, "i", 1 }, // U+0131 -> i (LATIN SMALL LETTER DOTLESS I)
399 { "\xC4\xB2", 2, "Ij", 2 }, // U+0132 -> Ij (LATIN CAPITAL LIGATURE IJ)
400 { "\xC4\xB3", 2, "ij", 2 }, // U+0133 -> ij (LATIN SMALL LIGATURE IJ)
401 { "\xC4\xB4", 2, "J", 1 }, // U+0134 -> J (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
402 { "\xC4\xB5", 2, "j", 1 }, // U+0135 -> j (LATIN SMALL LETTER J WITH CIRCUMFLEX)
403 { "\xC4\xB6", 2, "K", 1 }, // U+0136 -> K (LATIN CAPITAL LETTER K WITH CEDILLA)
404 { "\xC4\xB7", 2, "k", 1 }, // U+0137 -> k (LATIN SMALL LETTER K WITH CEDILLA)
405 { "\xC4\xB8", 2, "k", 1 }, // U+0138 -> k (LATIN SMALL LETTER KRA)
406 { "\xC4\xB9", 2, "L", 1 }, // U+0139 -> L (LATIN CAPITAL LETTER L WITH ACUTE)
407 { "\xC4\xBA", 2, "l", 1 }, // U+013A -> l (LATIN SMALL LETTER L WITH ACUTE)
408 { "\xC4\xBB", 2, "L", 1 }, // U+013B -> L (LATIN CAPITAL LETTER L WITH CEDILLA)
409 { "\xC4\xBC", 2, "l", 1 }, // U+013C -> l (LATIN SMALL LETTER L WITH CEDILLA)
410 { "\xC4\xBD", 2, "L", 1 }, // U+013D -> L (LATIN CAPITAL LETTER L WITH CARON)
411 { "\xC4\xBE", 2, "l", 1 }, // U+013E -> l (LATIN SMALL LETTER L WITH CARON)
412 { "\xC4\xBF", 2, "L", 1 }, // U+013F -> L (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
413 { "\xC5\x80", 2, "l", 1 }, // U+0140 -> l (LATIN SMALL LETTER L WITH MIDDLE DOT)
414 { "\xC5\x81", 2, "L", 1 }, // U+0141 -> L (LATIN CAPITAL LETTER L WITH STROKE)
415 { "\xC5\x82", 2, "l", 1 }, // U+0142 -> l (LATIN SMALL LETTER L WITH STROKE)
416 { "\xC5\x83", 2, "N", 1 }, // U+0143 -> N (LATIN CAPITAL LETTER N WITH ACUTE)
417 { "\xC5\x84", 2, "n", 1 }, // U+0144 -> n (LATIN SMALL LETTER N WITH ACUTE)
418 { "\xC5\x85", 2, "N", 1 }, // U+0145 -> N (LATIN CAPITAL LETTER N WITH CEDILLA)
419 { "\xC5\x86", 2, "n", 1 }, // U+0146 -> n (LATIN SMALL LETTER N WITH CEDILLA)
420 { "\xC5\x87", 2, "N", 1 }, // U+0147 -> N (LATIN CAPITAL LETTER N WITH CARON)
421 { "\xC5\x88", 2, "n", 1 }, // U+0148 -> n (LATIN SMALL LETTER N WITH CARON)
422 { "\xC5\x89", 2, "'n", 2 }, // U+0149 -> 'n (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
423 { "\xC5\x8A", 2, "Ng", 2 }, // U+014A -> Ng (LATIN CAPITAL LETTER ENG)
424 { "\xC5\x8B", 2, "ng", 2 }, // U+014B -> ng (LATIN SMALL LETTER ENG)
425 { "\xC5\x8C", 2, "O", 1 }, // U+014C -> O (LATIN CAPITAL LETTER O WITH MACRON)
426 { "\xC5\x8D", 2, "o", 1 }, // U+014D -> o (LATIN SMALL LETTER O WITH MACRON)
427 { "\xC5\x8E", 2, "O", 1 }, // U+014E -> O (LATIN CAPITAL LETTER O WITH BREVE)
428 { "\xC5\x8F", 2, "o", 1 }, // U+014F -> o (LATIN SMALL LETTER O WITH BREVE)
429 { "\xC5\x90", 2, "O", 1 }, // U+0150 -> O (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
430 { "\xC5\x91", 2, "o", 1 }, // U+0151 -> o (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
431 { "\xC5\x92", 2, "Oe", 2 }, // U+0152 -> Oe (LATIN CAPITAL LIGATURE OE)
432 { "\xC5\x93", 2, "oe", 2 }, // U+0153 -> oe (LATIN SMALL LIGATURE OE)
433 { "\xC5\x94", 2, "R", 1 }, // U+0154 -> R (LATIN CAPITAL LETTER R WITH ACUTE)
434 { "\xC5\x95", 2, "r", 1 }, // U+0155 -> r (LATIN SMALL LETTER R WITH ACUTE)
435 { "\xC5\x96", 2, "R", 1 }, // U+0156 -> R (LATIN CAPITAL LETTER R WITH CEDILLA)
436 { "\xC5\x97", 2, "r", 1 }, // U+0157 -> r (LATIN SMALL LETTER R WITH CEDILLA)
437 { "\xC5\x98", 2, "R", 1 }, // U+0158 -> R (LATIN CAPITAL LETTER R WITH CARON)
438 { "\xC5\x99", 2, "r", 1 }, // U+0159 -> r (LATIN SMALL LETTER R WITH CARON)
439 { "\xC5\x9A", 2, "S", 1 }, // U+015A -> S (LATIN CAPITAL LETTER S WITH ACUTE)
440 { "\xC5\x9B", 2, "s", 1 }, // U+015B -> s (LATIN SMALL LETTER S WITH ACUTE)
441 { "\xC5\x9C", 2, "S", 1 }, // U+015C -> S (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
442 { "\xC5\x9D", 2, "s", 1 }, // U+015D -> s (LATIN SMALL LETTER S WITH CIRCUMFLEX)
443 { "\xC5\x9E", 2, "S", 1 }, // U+015E -> S (LATIN CAPITAL LETTER S WITH CEDILLA)
444 { "\xC5\x9F", 2, "s", 1 }, // U+015F -> s (LATIN SMALL LETTER S WITH CEDILLA)
445 { "\xC5\xA0", 2, "S", 1 }, // U+0160 -> S (LATIN CAPITAL LETTER S WITH CARON)
446 { "\xC5\xA1", 2, "s", 1 }, // U+0161 -> s (LATIN SMALL LETTER S WITH CARON)
447 { "\xC5\xA2", 2, "T", 1 }, // U+0162 -> T (LATIN CAPITAL LETTER T WITH CEDILLA)
448 { "\xC5\xA3", 2, "t", 1 }, // U+0163 -> t (LATIN SMALL LETTER T WITH CEDILLA)
449 { "\xC5\xA4", 2, "T", 1 }, // U+0164 -> T (LATIN CAPITAL LETTER T WITH CARON)
450 { "\xC5\xA5", 2, "t", 1 }, // U+0165 -> t (LATIN SMALL LETTER T WITH CARON)
451 { "\xC5\xA6", 2, "T", 1 }, // U+0166 -> T (LATIN CAPITAL LETTER T WITH STROKE)
452 { "\xC5\xA7", 2, "t", 1 }, // U+0167 -> t (LATIN SMALL LETTER T WITH STROKE)
453 { "\xC5\xA8", 2, "U", 1 }, // U+0168 -> U (LATIN CAPITAL LETTER U WITH TILDE)
454 { "\xC5\xA9", 2, "u", 1 }, // U+0169 -> u (LATIN SMALL LETTER U WITH TILDE)
455 { "\xC5\xAA", 2, "U", 1 }, // U+016A -> U (LATIN CAPITAL LETTER U WITH MACRON)
456 { "\xC5\xAB", 2, "u", 1 }, // U+016B -> u (LATIN SMALL LETTER U WITH MACRON)
457 { "\xC5\xAC", 2, "U", 1 }, // U+016C -> U (LATIN CAPITAL LETTER U WITH BREVE)
458 { "\xC5\xAD", 2, "u", 1 }, // U+016D -> u (LATIN SMALL LETTER U WITH BREVE)
459 { "\xC5\xAE", 2, "U", 1 }, // U+016E -> U (LATIN CAPITAL LETTER U WITH RING ABOVE)
460 { "\xC5\xAF", 2, "u", 1 }, // U+016F -> u (LATIN SMALL LETTER U WITH RING ABOVE)
461 { "\xC5\xB0", 2, "U", 1 }, // U+0170 -> U (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
462 { "\xC5\xB1", 2, "u", 1 }, // U+0171 -> u (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
463 { "\xC5\xB2", 2, "U", 1 }, // U+0172 -> U (LATIN CAPITAL LETTER U WITH OGONEK)
464 { "\xC5\xB3", 2, "u", 1 }, // U+0173 -> u (LATIN SMALL LETTER U WITH OGONEK)
465 { "\xC5\xB4", 2, "W", 1 }, // U+0174 -> W (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
466 { "\xC5\xB5", 2, "w", 1 }, // U+0175 -> w (LATIN SMALL LETTER W WITH CIRCUMFLEX)
467 { "\xC5\xB6", 2, "Y", 1 }, // U+0176 -> Y (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
468 { "\xC5\xB7", 2, "y", 1 }, // U+0177 -> y (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
469 { "\xC5\xB8", 2, "Y", 1 }, // U+0178 -> Y (LATIN CAPITAL LETTER Y WITH DIAERESIS)
470 { "\xC5\xB9", 2, "Z", 1 }, // U+0179 -> Z (LATIN CAPITAL LETTER Z WITH ACUTE)
471 { "\xC5\xBA", 2, "z", 1 }, // U+017A -> z (LATIN SMALL LETTER Z WITH ACUTE)
472 { "\xC5\xBB", 2, "Z", 1 }, // U+017B -> Z (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
473 { "\xC5\xBC", 2, "z", 1 }, // U+017C -> z (LATIN SMALL LETTER Z WITH DOT ABOVE)
474 { "\xC5\xBD", 2, "Z", 1 }, // U+017D -> Z (LATIN CAPITAL LETTER Z WITH CARON)
475 { "\xC5\xBE", 2, "z", 1 }, // U+017E -> z (LATIN SMALL LETTER Z WITH CARON)
476 { "\xC5\xBF", 2, "s", 1 }, // U+017F -> s (LATIN SMALL LETTER LONG S
478 // U+2000 ... U+206F (General Punctuation)
479 { "\xE2\x80\x90", 3, "-", 1 }, // U+2010 -> - (HYPHEN)
480 { "\xE2\x80\x91", 3, "-", 1 }, // U+2011 -> - (NON-BREAKING HYPHEN)
481 { "\xE2\x80\x92", 3, "--", 2 }, // U+2012 -> -- (FIGURE DASH)
482 { "\xE2\x80\x93", 3, "--", 2 }, // U+2013 -> -- (EN DASH)
483 { "\xE2\x80\x94", 3, "---", 3 }, // U+2014 -> --- (EM DASH)
484 { "\xE2\x80\x95", 3, "---", 3 }, // U+2015 -> --- (HORIZONTAL BAR)
485 { "\xE2\x80\x96", 3, "||", 2 }, // U+2016 -> || (DOUBLE VERTICAL LINE)
486 { "\xE2\x80\x97", 3, "_", 1 }, // U+2017 -> _ (DOUBLE LOW LINE)
487 { "\xE2\x80\x98", 3, "`", 1 }, // U+2018 -> ` (LEFT SINGLE QUOTATION MARK)
488 { "\xE2\x80\x99", 3, "'", 1 }, // U+2019 -> ' (RIGHT SINGLE QUOTATION MARK)
489 { "\xE2\x80\x9A", 3, ",", 1 }, // U+201A -> , (SINGLE LOW-9 QUOTATION MARK)
490 { "\xE2\x80\x9B", 3, "'", 1 }, // U+201B -> ' (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
491 { "\xE2\x80\x9C", 3, "\"", 1 }, // U+201C -> " (LEFT DOUBLE QUOTATION MARK)
492 { "\xE2\x80\x9D", 3, "\"", 1 }, // U+201D -> " (RIGHT DOUBLE QUOTATION MARK)
493 { "\xE2\x80\x9E", 3, ",,", 2 }, // U+201E -> ,, (DOUBLE LOW-9 QUOTATION MARK)
494 { "\xE2\x80\x9F", 3, "``", 2 }, // U+201F -> `` (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
495 { "\xE2\x80\xA0", 3, "+", 1 }, // U+2020 -> + (DAGGER)
496 { "\xE2\x80\xA1", 3, "+", 1 }, // U+2021 -> + (DOUBLE DAGGER)
497 { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7 (BULLET) -> (MIDDLE POINT)
498 { "\xE2\x80\xA3", 3, ".", 1 }, // U+2023 -> . (TRIANGULAR BULLET)
499 { "\xE2\x80\xA4", 3, ".", 1 }, // U+2024 -> . (ONE DOT LEADER)
500 { "\xE2\x80\xA5", 3, "..", 2 }, // U+2025 -> .. (TWO DOT LEADER)
501 { "\xE2\x80\xA6", 3, "...", 3 }, // U+2026 -> ... (HORIZONTAL ELLIPSIS)
502 { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7 (HYPHENATION POINT) -> (MIDDLE POINT)
503 { "\xE2\x80\xB0", 3, "%.", 2 }, // U+2030 -> %. (PER MILLE SIGN)
504 { "\xE2\x80\xB1", 3, "%..", 3 }, // U+2031 -> %.. (PER TEN THOUSAND SIGN)
505 { "\xE2\x80\xB2", 3, "'", 1 }, // U+2032 -> ` (PRIME)
506 { "\xE2\x80\xB3", 3, "''", 2 }, // U+2033 -> '' (DOUBLE PRIME)
507 { "\xE2\x80\xB4", 3, "'''", 3 }, // U+2034 -> ''' (TRIPLE PRIME)
508 { "\xE2\x80\xB5", 3, "`", 1 }, // U+2035 -> ` (REVERSED PRIME)
509 { "\xE2\x80\xB6", 3, "``", 2 }, // U+2036 -> `` (REVERSED DOUBLE PRIME)
510 { "\xE2\x80\xB7", 3, "```", 3 }, // U+2037 -> ``` (REVERSED TRIPLE PRIME)
511 { "\xE2\x80\xB8", 3, "^", 1 }, // U+2038 -> ^ (CARET)
512 { "\xE2\x80\xB9", 3, "<", 1 }, // U+2039 -> < (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
513 { "\xE2\x80\xBA", 3, ">", 1 }, // U+203A -> > (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
514 { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7 (REFERENCE MARK) -> (MULTIPLICATION SIGN)
515 { "\xE2\x80\xBC", 3, "!!", 2 }, // U+203C -> !! (DOUBLE EXCLAMATION MARK)
516 { "\xE2\x80\xBD", 3, "?", 1 }, // U+203D -> ? (INTERROBANG)
517 { "\xE2\x81\x82", 3, "*", 1 }, // U+2042 -> * (ASTERISM)
518 { "\xE2\x81\x83", 3, ".", 1 }, // U+2043 -> . (HYPHEN BULLET)
519 { "\xE2\x81\x84", 3, "/", 1 }, // U+2044 -> / (FRACTION SLASH)
520 { "\xE2\x81\x87", 3, "??", 2 }, // U+2047 -> ?? (DOUBLE QUESTION MARK)
521 { "\xE2\x81\x88", 3, "?!", 2 }, // U+2048 -> ?! (QUESTION EXCLAMATION MARK)
522 { "\xE2\x81\x89", 3, "!?", 2 }, // U+2049 -> !? (EXCLAMATION QUESTION MARK)
523 { "\xE2\x81\x8E", 3, "*", 1 }, // U+204E -> * (LOW ASTERISK)
524 { "\xE2\x81\x8F", 3, ";", 1 }, // U+204F -> ; (REVERSED SEMICOLON)
525 { "\xE2\x81\x91", 3, "*", 1 }, // U+2051 -> * (TWO ASTERISKS ALIGNED VERTICALLY)
526 { "\xE2\x81\x92", 3, "-", 1 }, // U+2052 -> - (COMMERCIAL MINUS SIGN)
527 { "\xE2\x81\x93", 3, "~", 1 }, // U+2053 -> ~ (SWUNG DASH)
528 { "\xE2\x81\x95", 3, "*", 1 }, // U+2055 -> * (FLOWER PUNCTUATION MARK)
529 { "\xE2\x81\x97", 3, "''''", 4 }, // U+2057 -> '''' (QUADRUPLE PRIME)
530 { "\xE2\x81\x9A", 3, ":", 1 }, // U+205A -> : (TWO DOT PUNCTUATION)
531 { "\xE2\x81\x9C", 3, "+", 1 }, // U+205C -> + (DOTTED CROSS)
533 // U+20A0 ... U+20CF (Currency Symbols)
534 { "\xE2\x82\xA0", 3, "ECU", 3 }, // U+20A0 -> ECU (EURO-CURRENCY SIGN)
535 { "\xE2\x82\xA1", 3, "CRC", 3 }, // U+20A1 -> CRC (COLON SIGN)
536 { "\xE2\x82\xA2", 3, "BRC", 3 }, // U+20A2 -> BRC (CRUZEIRO SIGN)
537 { "\xE2\x82\xA3", 3, "BEF", 3 }, // U+20A3 -> BEF (FRENCH FRANC SIGN)
538 { "\xE2\x82\xA4", 3, "ITL", 3 }, // U+20A4 -> ITL (LIRA SIGN)
539 { "\xE2\x82\xA6", 3, "NGN", 3 }, // U+20A6 -> NGN (NEIRA SIGN)
540 { "\xE2\x82\xA7", 3, "ESP", 3 }, // U+20A7 -> ESP (PESETA SIGN)
541 { "\xE2\x82\xA8", 3, "MVQ", 3 }, // U+20A8 -> MVQ (RUPEE SIGN)
542 { "\xE2\x82\xA9", 3, "KPW", 3 }, // U+20A9 -> KPW (WON SIGN)
543 { "\xE2\x82\xAA", 3, "ILS", 3 }, // U+20AA -> ILS (NEW SHEQEL SIGN)
544 { "\xE2\x82\xAB", 3, "VNC", 3 }, // U+20AB -> VNC (DONG SIGN)
545 { "\xE2\x82\xAC", 3, "EUR", 3 }, // U+20AC -> EUR (EURO SIGN)
546 { "\xE2\x82\xAD", 3, "LAK", 3 }, // U+20AD -> LAK (KIP SIGN)
547 { "\xE2\x82\xAE", 3, "MNT", 3 }, // U+20AE -> MNT (TUGRIK SIGN)
548 { "\xE2\x82\xAF", 3, "GRD", 3 }, // U+20AF -> GRD (DRACHMA SIGN)
549 { "\xE2\x82\xB0", 3, "Pf", 2 }, // U+20B0 -> Pf (GERMAN PENNY SIGN)
550 { "\xE2\x82\xB1", 3, "P", 1 }, // U+20B1 -> P (PESO SIGN)
551 { "\xE2\x82\xB2", 3, "PYG", 3 }, // U+20B2 -> PYG (GUARANI SIGN)
552 { "\xE2\x82\xB3", 3, "ARA", 3 }, // U+20B3 -> ARA (AUSTRAL SIGN)
553 { "\xE2\x82\xB4", 3, "UAH", 3 }, // U+20B4 -> UAH (HRYVNIA SIGN)
554 { "\xE2\x82\xB5", 3, "GHS", 3 }, // U+20B5 -> GHS (CEDI SIGN)
556 // U+2190 ... U+21FF (Arrows)
557 { "\xE2\x86\x90", 3, "<-", 2 }, // U+2190 -> <- (LEFTWARDS ARROW)
558 { "\xE2\x86\x92", 3, "->", 2 }, // U+2192 -> -> (RIGHTWARDS ARROW)
563 // start with no replacement string
566 // perform a binary search in the lookup table
567 if((rep
= bsearch(&key
, utf8map
, sizeof(utf8map
) / sizeof(utf8map
[0]), sizeof(utf8map
[0]), compareUTF8Replacements
)) != NULL
)
569 // if we found something, then copy this over to the result variables
/// matchCodesetAlias()

// Associates the official MIME name of a codeset with its alternative spellings.
struct CodesetAliases
{
  const char *MIMEname; // The official and correct MIME name for a codeset
  const char *Aliases;  // A space separated array with well-known aliases
};

// The table is terminated by an entry with a NULL MIMEname.
const struct CodesetAliases codesetAliases[] =
{
  { "Amiga-1251",   "Ami1251 Amiga1251" },
  { "AmigaPL",      "AmiPL Amiga-PL" },
  { "ISO-8859-1",   "ISO8859-1 8859-1" },
  { "ISO-8859-2",   "ISO8859-2 8859-2" },
  { "ISO-8859-3",   "ISO8859-3 8859-3" },
  { "ISO-8859-4",   "ISO8859-4 8859-4" },
  { "ISO-8859-5",   "ISO8859-5 8859-5" },
  { "ISO-8859-6",   "ISO8859-6 8859-6" },
  { "ISO-8859-7",   "ISO8859-7 8859-7" },
  { "ISO-8859-8",   "ISO8859-8 8859-8" },
  { "ISO-8859-9",   "ISO8859-9 8859-9" },
  { "ISO-8859-10",  "ISO8859-10 8859-10" },
  { "ISO-8859-11",  "ISO8859-11 8859-11" },
  { "ISO-8859-12",  "ISO8859-12 8859-12" },
  { "ISO-8859-13",  "ISO8859-13 8859-13" },
  { "ISO-8859-14",  "ISO8859-14 8859-14" },
  { "ISO-8859-15",  "ISO8859-15 8859-15" },
  { "ISO-8859-16",  "ISO8859-16 8859-16" },
  { "KOI8-R",       "KOI8R" },
  { "US-ASCII",     "ASCII" },
  { "UTF-8",        "UTF8 UTF" },
  { "UTF-16",       "UTF16" },
  { "UTF-32",       "UTF32" },
  { "windows-1250", "cp1250 windows1250" },
  { "windows-1251", "cp1251 windows1251" },
  { "windows-1252", "cp1252 windows1252" },
  { "windows-1253", "cp1253 windows1253" },
  { "windows-1254", "cp1254 windows1254" },
  { "windows-1255", "cp1255 windows1255" },
  { "windows-1256", "cp1256 windows1256" },
  { "windows-1257", "cp1257 windows1257" },
  { NULL,           NULL }
};

// Resolves a codeset name or one of its well-known aliases to the official
// MIME name.
//   search - the name to look up (compared case-insensitively); may be NULL
// Returns the MIME name from codesetAliases[] or NULL if `search` is NULL,
// empty or unknown. An alias only matches as a whole word, a mere prefix
// of an alias (e.g. "cp125") is not accepted.
static const char *matchCodesetAlias(const char *search)
{
  const char *result = NULL;

  ENTER();

  // a NULL or empty name can never denote a codeset
  if(search != NULL && search[0] != '\0')
  {
    size_t len = strlen(search);
    int i;

    for(i=0; result == NULL && codesetAliases[i].MIMEname != NULL; i++)
    {
      // search the MIMEname first
      if(stricmp(search, codesetAliases[i].MIMEname) == 0)
      {
        result = codesetAliases[i].MIMEname;
      }
      else
      {
        const char *s = codesetAliases[i].Aliases;

        // loop through space separated list of aliases
        while(s != NULL && *s != '\0')
        {
          // length of the current alias up to the next space or the end
          size_t aliasLen = strcspn(s, " ");

          // the lengths must be equal, otherwise strnicmp() would accept
          // any prefix of the alias as a match
          if(aliasLen == len && strnicmp(search, s, len) == 0)
          {
            result = codesetAliases[i].MIMEname;
            break;
          }

          // advance to the start of the next alias
          s += aliasLen;
          while(*s == ' ')
            s++;
        }
      }
    }
  }

  RETURN(result);
  return result;
}
672 /**************************************************************************/
// defaultCodeset(): determines the codeset to use when none was requested.
//   useSemaphore - if TRUE, CodesetsBase->libSem is obtained shared for the
//                  duration of the lookup and released again afterwards
// The global variable "codeset_default" is read with GetVar(); if it is empty
// or names a codeset that codesetsFind() does not know in
// CodesetsBase->codesets, CodesetsBase->systemCodeset is used instead.
// NOTE(review): the result of GetVar() itself is not checked here; the
// fallback hinges on buf[0] being '\0' when the variable does not exist -
// confirm that buf is cleared before the call.
675 static struct codeset
*defaultCodeset(BOOL useSemaphore
)
678 struct codeset
*codeset
;
682 if(useSemaphore
== TRUE
)
683 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
686 GetVar("codeset_default" ,buf
, sizeof(buf
), GVF_GLOBAL_ONLY
);
// fall back to the system codeset if the variable is empty or unknown
688 if(buf
[0] == '\0' || (codeset
= codesetsFind(&CodesetsBase
->codesets
, buf
)) == NULL
)
689 codeset
= CodesetsBase
->systemCodeset
;
691 if(useSemaphore
== TRUE
)
692 ReleaseSemaphore(&CodesetsBase
->libSem
);
699 /// codesetsCmpUnicode()
700 // The compare function
701 static int codesetsCmpUnicode(const void *a1
, const void *a2
)
703 struct single_convert
*arg1
= (struct single_convert
*)a1
;
704 struct single_convert
*arg2
= (struct single_convert
*)a2
;
706 return strcmp((char*)&arg1
->utf8
[1], (char*)&arg2
->utf8
[1]);
710 /// codesetsReadTable()
// keywords recognized at the start of a line of a charset file
712 #define ITEM_STANDARD "Standard"
713 #define ITEM_ALTSTANDARD "AltStandard"
714 #define ITEM_READONLY "ReadOnly"
715 #define ITEM_CHARACTERIZATION "Characterization"
717 // Reads a codeset table from the file `name` and adds the codeset to `csList`
//   csList - the list the new codeset is appended to
//   name   - path of the charset file to read
// The file is read line by line. Keyword lines set the name, alternative name,
// read-only flag and characterization of the codeset, all other lines may
// redefine the Unicode value of a single character code. The codeset is only
// added if it got a name and no codeset of that name is in `csList` already;
// otherwise everything allocated for it is freed again.
718 static BOOL
codesetsReadTable(struct codesetList
*csList
, STRPTR name
)
725 D(DBF_STARTUP
, "trying to read charset file '%s'...", name
);
727 if((fh
= Open(name
, MODE_OLDFILE
)) != (BPTR
)NULL
)
729 struct codeset
*codeset
;
731 if((codeset
= (struct codeset
*)allocArbitrateVecPooled(sizeof(*codeset
))) != NULL
)
// start from a cleared codeset, so all pointers are NULL until they are set
736 memset(codeset
, 0, sizeof(*codeset
));
// default to an identity mapping: code i <-> Unicode value i
738 for(i
= 0; i
<256; i
++)
740 codeset
->table
[i
].code
= i
;
741 codeset
->table
[i
].ucs4
= i
;
744 while(readLine(fh
, buf
, sizeof(buf
)) == TRUE
)
// NOTE(review): the mystrdup() results below are stored unchecked, and a
// keyword occurring twice overwrites the pointer of the first copy - confirm
// whether this is acceptable.
750 if((result
= getConfigItem(buf
, ITEM_STANDARD
)) != NULL
)
751 codeset
->name
= mystrdup(result
);
752 else if(codeset
->name
== NULL
) // a valid file starts with "Standard" and nothing else!!
754 else if((result
= getConfigItem(buf
, ITEM_ALTSTANDARD
)) != NULL
)
755 codeset
->alt_name
= mystrdup(result
);
756 else if((result
= getConfigItem(buf
, ITEM_READONLY
)) != NULL
)
// any value atoi() does not convert to 0 marks the codeset as read-only
757 codeset
->read_only
= (atoi(result
) == 0) ? 0 : 1;
758 else if((result
= getConfigItem(buf
, ITEM_CHARACTERIZATION
)) != NULL
)
// a characterization written as _("text") is reduced to the quoted text
760 if(result
[0] == '_' && result
[1] == '(' && result
[2] == '"')
762 char *end
= strchr(result
+ 3, '"');
765 codeset
->characterization
= mystrndup(result
+3, end
-(result
+3));
768 codeset
->characterization
= mystrdup(result
);
// a mapping line starts either with '=' or with a number
// NOTE(review): the second test is true if the line starts with '0' OR if its
// second character is 'x'; for detecting a "0x" prefix an && looks intended -
// confirm.
775 if(*p
== '=' || (fmt2
= ((*p
=='0') || (*(p
+1)=='x'))))
// the character code is given as a hexadecimal number
// NOTE(review): `i` indexes codeset->table[] below, which is filled for 256
// entries above - confirm that the value is range-checked before it is used.
780 i
= strtol(p
, &p
, 16);
// the Unicode value is given either as "U+<hex>" ...
786 if(strnicmp(p
, "U+", 2) == 0)
789 codeset
->table
[i
].ucs4
= strtol(p
, &p
, 16);
// ... or as a number whose base strtol() derives from its prefix
793 codeset
->table
[i
].ucs4
= strtol(p
, &p
, 0);
801 // check that there is not already a codeset with the same name in the list
802 if(codeset
->name
!= NULL
&& codesetsFind(csList
, codeset
->name
) == NULL
)
// convert the Unicode value of an entry to UTF8; the sequence is stored
// from utf8[1] on
806 UTF32 src
= codeset
->table
[i
].ucs4
;
807 UTF32
*src_ptr
= &src
;
808 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
810 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
// utf8[0] receives the distance dest_ptr was advanced, i.e. the number of
// bytes of the sequence
812 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)(&codeset
->table
[i
].utf8
[1]);
// table_sorted is a copy of the table ordered by the UTF8 sequences
815 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
816 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
817 D(DBF_STARTUP
, "adding external codeset '%s'", codeset
->name
);
818 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
// the codeset was not added, release everything allocated for it
825 if(codeset
->name
!= NULL
)
826 freeArbitrateVecPooled(codeset
->name
);
827 if(codeset
->alt_name
!= NULL
)
828 freeArbitrateVecPooled(codeset
->alt_name
);
829 if(codeset
->characterization
!= NULL
)
830 freeArbitrateVecPooled(codeset
->characterization
);
831 freeArbitrateVecPooled(codeset
);
842 /// codesetsScanDir()
// Scans the directory `dirPath` and passes every entry found there to
// codesetsReadTable(), which adds the codesets to `csList`.
//   csList  - the list the codesets are added to
//   dirPath - the directory to scan; NULL or an empty string is ignored
// AmigaOS4 walks the directory with ObtainDirContextTags()/ExamineDir(), all
// other systems use Lock() and ExAll().
// NOTE(review): the results of AddPart() and codesetsReadTable() are not
// evaluated, so a path that does not fit into filePath or a file that is no
// valid charset table is presumably skipped silently - confirm this is the
// intended best-effort behaviour.
843 static void codesetsScanDir(struct codesetList
*csList
, const char *dirPath
)
847 if(dirPath
!= NULL
&& dirPath
[0] != '\0')
849 #if defined(__amigaos4__)
// only the name and the type of the directory entries are requested
852 if((dirContext
= ObtainDirContextTags(EX_StringNameInput
, dirPath
,
853 EX_DataFields
, EXF_NAME
|EXF_TYPE
,
856 struct ExamineData
*exd
;
858 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
860 while((exd
= ExamineDir(dirContext
)) != NULL
)
// build the complete path of the entry from the directory and its name
866 strlcpy(filePath
, dirPath
, sizeof(filePath
));
867 AddPart(filePath
, exd
->Name
, sizeof(filePath
));
869 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
871 codesetsReadTable(csList
, filePath
);
875 ReleaseDirContext(dirContext
);
880 if((dirLock
= Lock(dirPath
, ACCESS_READ
)))
882 struct ExAllControl
*eac
;
884 D(DBF_STARTUP
, "scanning directory '%s' for codesets tables", dirPath
);
886 if((eac
= AllocDosObject(DOS_EXALLCONTROL
, NULL
)) != NULL
)
888 struct ExAllData
*ead
;
889 struct ExAllData
*eabuffer
;
// start with the first entry and do not filter any entries
892 eac
->eac_LastKey
= 0;
893 eac
->eac_MatchString
= NULL
;
894 eac
->eac_MatchFunc
= NULL
;
// the entries are fetched in chunks through a buffer of
// 10*sizeof(struct ExAllData) bytes
896 if((eabuffer
= allocVecPooled(CodesetsBase
->pool
, 10*sizeof(struct ExAllData
))) != NULL
)
902 more
= ExAll(dirLock
, eabuffer
, 10*sizeof(struct ExAllData
), ED_TYPE
, eac
);
// a failure other than "no more entries" is a real error
903 if(!more
&& IoErr() != ERROR_NO_MORE_ENTRIES
)
// this chunk did not deliver any entries
906 if(eac
->eac_Entries
== 0)
909 ead
= (struct ExAllData
*)eabuffer
;
912 // we only take that ead if it is a file (ed_Type < 0)
915 strlcpy(filePath
, dirPath
, sizeof(filePath
));
916 AddPart(filePath
, (char *)ead
->ed_Name
, sizeof(filePath
));
918 D(DBF_STARTUP
, "about to read codeset table '%s'", filePath
);
920 codesetsReadTable(csList
, filePath
);
928 freeVecPooled(CodesetsBase
->pool
, eabuffer
);
931 FreeDosObject(DOS_EXALLCONTROL
, eac
);
944 // Initializes and loads the codesets
945 BOOL
codesetsInit(struct codesetList
*csList
)
947 BOOL success
= FALSE
;
948 struct codeset
*codeset
;
951 #if defined(__amigaos4__)
957 NewList((struct List
*)csList
);
959 // to make the list of the supported codesets complete we also add fake
960 // 'UTF-8', 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
961 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
964 memset(codeset
, 0, sizeof(*codeset
));
965 codeset
->name
= mystrdup("UTF-8");
966 codeset
->alt_name
= mystrdup("UTF8");
967 codeset
->characterization
= mystrdup("Unicode");
968 codeset
->read_only
= 0;
969 D(DBF_STARTUP
, "adding internal codeset 'UTF-8'");
970 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
971 CodesetsBase
->utf8Codeset
= codeset
;
973 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
976 memset(codeset
, 0, sizeof(*codeset
));
977 codeset
->name
= mystrdup("UTF-16");
978 codeset
->alt_name
= mystrdup("UTF16");
979 codeset
->characterization
= mystrdup("16-bit Unicode");
980 codeset
->read_only
= 0;
981 D(DBF_STARTUP
, "adding internal codeset 'UTF-16'");
982 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
983 CodesetsBase
->utf16Codeset
= codeset
;
985 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
988 memset(codeset
, 0, sizeof(*codeset
));
989 codeset
->name
= mystrdup("UTF-32");
990 codeset
->alt_name
= mystrdup("UTF32");
991 codeset
->characterization
= mystrdup("32-bit Unicode");
992 codeset
->read_only
= 0;
993 D(DBF_STARTUP
, "adding internal codeset 'UTF-32'");
994 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
995 CodesetsBase
->utf32Codeset
= codeset
;
997 // on AmigaOS4 we can use diskfont.library to inquire charset information as
998 // it comes with a quite rich implementation of different charsets.
999 #if defined(__amigaos4__)
1000 D(DBF_STARTUP
, "OS4, asking diskfont.library for codesets");
1006 ULONG curMIB
= nextMIB
;
1008 nextMIB
= ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NEXTNUMBER
);
1012 mapTable
= (ULONG
*)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MAPTABLE
);
1013 mimename
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_MIMENAME
);
1014 ianaName
= (char *)ObtainCharsetInfo(DFCS_NUMBER
, curMIB
, DFCS_NAME
);
1015 if(mapTable
!= NULL
&& mimename
!= NULL
&& codesetsFind(csList
, mimename
) == NULL
)
1017 D(DBF_STARTUP
, "loading charset '%s' from diskfont.library...", mimename
);
1019 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1022 codeset
->name
= mystrdup(mimename
);
1023 codeset
->alt_name
= NULL
;
1024 codeset
->characterization
= mystrdup(ianaName
);
1025 codeset
->read_only
= 0;
1027 for(i
=0; i
<256; i
++)
1029 UTF32
*src_ptr
= &src
;
1030 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1034 codeset
->table
[i
].code
= i
;
1035 codeset
->table
[i
].ucs4
= src
;
1036 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1038 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1041 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1042 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1044 D(DBF_STARTUP
, "adding diskfont.library codeset '%s'", codeset
->name
);
1045 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1051 #if defined(__MORPHOS__)
1053 struct Library
*KeymapBase
;
1054 struct Library
*LocaleBase
;
1055 // assume success at first
1056 BOOL success
= TRUE
;
1058 D(DBF_STARTUP
, "MorphOS, asking keymap.library for codesets");
1059 if((KeymapBase
= OpenLibrary("keymap.library", 51)) != NULL
)
1061 if((LocaleBase
= OpenLibrary("locale.library", 51)) != NULL
)
1063 struct KeyMap
*keymap
= AskKeyMapDefault();
1064 // it doesn't matter if this call fails, as we don't depend on the system codesets
1065 CONST_STRPTR name
= GetKeyMapCodepage(keymap
);
1067 // legacy keymaps don't have codepage or Unicode mappings
1068 if(name
!= NULL
&& keymap
!= NULL
)
1070 D(DBF_STARTUP
, "loading charset '%s' from keymap.library...", name
);
1072 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) != NULL
)
1074 codeset
->name
= mystrdup(name
);
1075 codeset
->alt_name
= NULL
;
1076 codeset
->characterization
= mystrdup(name
); // No further information available
1077 codeset
->read_only
= 0;
1079 for(i
=0; i
<256; i
++)
1081 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1084 codeset
->table
[i
].code
= i
;
1085 codeset
->table
[i
].ucs4
= src
= ToUCS4(i
, keymap
);
1087 // here we use UTF8_Encode() instead of ConvertUCS4ToUTF8() because
1088 // of an internal bug in MorphOS 2.2.
1089 rc
= UTF8_Encode(src
, dest_ptr
);
1090 rc
= rc
> 0 ? rc
: 1;
1092 dest_ptr
[rc
] = '\0';
1093 codeset
->table
[i
].utf8
[0] = rc
;
1096 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1097 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1099 D(DBF_STARTUP
, "adding keymap.library codeset '%s'", codeset
->name
);
1100 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1104 // only failed memory allocations are treated as error
1109 CloseLibrary(LocaleBase
);
1112 CloseLibrary(KeymapBase
);
1115 if(success
== FALSE
)
1120 D(DBF_STARTUP
, "loading charsets from LIBS:Charsets...");
1122 // we try to walk through the LIBS:Charsets directory on our own and read in our
1123 // own charset tables
1124 codesetsScanDir(csList
, "LIBS:Charsets");
1127 // now we go and initialize our internally supported codesets but only if
1128 // we have not already loaded a charset with the same name
1130 D(DBF_STARTUP
, "initializing internal charsets...");
1132 // ISO-8859-1 + EURO
1133 if(codesetsFind(csList
, "ISO-8859-1 + Euro") == NULL
)
1135 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1138 codeset
->name
= mystrdup("ISO-8859-1 + Euro");
1139 codeset
->alt_name
= NULL
;
1140 codeset
->characterization
= mystrdup("West European (with EURO)");
1141 codeset
->read_only
= 1;
1143 for(i
= 0; i
<256; i
++)
1145 UTF32
*src_ptr
= &src
;
1146 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1149 src
= 0x20AC; // the EURO sign
1153 codeset
->table
[i
].code
= i
;
1154 codeset
->table
[i
].ucs4
= src
;
1155 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1157 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1159 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1160 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1162 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1163 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1167 if(codesetsFind(csList
, "ISO-8859-1") == NULL
)
1169 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1172 codeset
->name
= mystrdup("ISO-8859-1");
1173 codeset
->alt_name
= mystrdup("ISO8859-1");
1174 codeset
->characterization
= mystrdup("West European");
1175 codeset
->read_only
= 0;
1177 for(i
= 0; i
<256; i
++)
1179 UTF32
*src_ptr
= &src
;
1180 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1184 codeset
->table
[i
].code
= i
;
1185 codeset
->table
[i
].ucs4
= src
;
1186 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1188 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1190 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1191 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1193 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1194 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1198 if(codesetsFind(csList
, "ISO-8859-2") == NULL
)
1200 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1203 codeset
->name
= mystrdup("ISO-8859-2");
1204 codeset
->alt_name
= mystrdup("ISO8859-2");
1205 codeset
->characterization
= mystrdup("Central/East European");
1206 codeset
->read_only
= 0;
1208 for(i
= 0; i
<256; i
++)
1210 UTF32
*src_ptr
= &src
;
1211 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1216 src
= iso_8859_2_to_ucs4
[i
-0xa0];
1218 codeset
->table
[i
].code
= i
;
1219 codeset
->table
[i
].ucs4
= src
;
1220 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
,dest_ptr
+6, CSF_StrictConversion
);
1222 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1224 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1225 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1227 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1228 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1232 if(codesetsFind(csList
, "ISO-8859-3") == NULL
)
1234 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1237 codeset
->name
= mystrdup("ISO-8859-3");
1238 codeset
->alt_name
= mystrdup("ISO8859-3");
1239 codeset
->characterization
= mystrdup("South European");
1240 codeset
->read_only
= 0;
1242 for(i
= 0; i
<256; i
++)
1244 UTF32
*src_ptr
= &src
;
1245 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1250 src
= iso_8859_3_to_ucs4
[i
-0xa0];
1252 codeset
->table
[i
].code
= i
;
1253 codeset
->table
[i
].ucs4
= src
;
1254 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1256 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1258 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1259 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1261 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1262 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1266 if(codesetsFind(csList
, "ISO-8859-4") == NULL
)
1268 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1271 codeset
->name
= mystrdup("ISO-8859-4");
1272 codeset
->alt_name
= mystrdup("ISO8859-4");
1273 codeset
->characterization
= mystrdup("North European");
1274 codeset
->read_only
= 0;
1276 for(i
= 0; i
<256; i
++)
1278 UTF32
*src_ptr
= &src
;
1279 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1284 src
= iso_8859_4_to_ucs4
[i
-0xa0];
1286 codeset
->table
[i
].code
= i
;
1287 codeset
->table
[i
].ucs4
= src
;
1288 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1290 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1292 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1293 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1295 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1296 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1300 if(codesetsFind(csList
, "ISO-8859-5") == NULL
)
1302 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1305 codeset
->name
= mystrdup("ISO-8859-5");
1306 codeset
->alt_name
= mystrdup("ISO8859-5");
1307 codeset
->characterization
= mystrdup("Slavic languages");
1308 codeset
->read_only
= 0;
1310 for(i
= 0; i
<256; i
++)
1312 UTF32
*src_ptr
= &src
;
1313 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1318 src
= iso_8859_5_to_ucs4
[i
-0xa0];
1320 codeset
->table
[i
].code
= i
;
1321 codeset
->table
[i
].ucs4
= src
;
1322 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1324 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1326 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1327 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1329 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1330 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1334 if(codesetsFind(csList
, "ISO-8859-9") == NULL
)
1336 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1339 codeset
->name
= mystrdup("ISO-8859-9");
1340 codeset
->alt_name
= mystrdup("ISO8859-9");
1341 codeset
->characterization
= mystrdup("Turkish");
1342 codeset
->read_only
= 0;
1344 for(i
= 0; i
<256; i
++)
1346 UTF32
*src_ptr
= &src
;
1347 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1352 src
= iso_8859_9_to_ucs4
[i
-0xa0];
1354 codeset
->table
[i
].code
= i
;
1355 codeset
->table
[i
].ucs4
= src
;
1356 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1358 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1360 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1361 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1363 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1364 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1368 if(codesetsFind(csList
, "ISO-8859-15") == NULL
)
1370 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1373 codeset
->name
= mystrdup("ISO-8859-15");
1374 codeset
->alt_name
= mystrdup("ISO8859-15");
1375 codeset
->characterization
= mystrdup("West European II");
1376 codeset
->read_only
= 0;
1378 for(i
= 0; i
<256; i
++)
1380 UTF32
*src_ptr
= &src
;
1381 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1386 src
= iso_8859_15_to_ucs4
[i
-0xa0];
1388 codeset
->table
[i
].code
= i
;
1389 codeset
->table
[i
].ucs4
= src
;
1390 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1392 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1394 memcpy(codeset
->table_sorted
,codeset
->table
,sizeof (codeset
->table
));
1395 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1397 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1398 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1402 if(codesetsFind(csList
, "ISO-8859-16") == NULL
)
1404 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1407 codeset
->name
= mystrdup("ISO-8859-16");
1408 codeset
->alt_name
= mystrdup("ISO8869-16");
1409 codeset
->characterization
= mystrdup("South-Eastern European");
1410 codeset
->read_only
= 0;
1414 UTF32
*src_ptr
= &src
;
1415 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1420 src
= iso_8859_16_to_ucs4
[i
-0xa0];
1422 codeset
->table
[i
].code
= i
;
1423 codeset
->table
[i
].ucs4
= src
;
1424 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1426 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
- (IPTR
)&codeset
->table
[i
].utf8
[1];
1428 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1429 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1431 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1432 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1436 if(codesetsFind(csList
, "KOI8-R") == NULL
)
1438 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1441 codeset
->name
= mystrdup("KOI8-R");
1442 codeset
->alt_name
= mystrdup("KOI8R");
1443 codeset
->characterization
= mystrdup("Russian");
1444 codeset
->read_only
= 0;
1446 for(i
= 0; i
<256; i
++)
1448 UTF32
*src_ptr
= &src
;
1449 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1454 src
= koi8r_to_ucs4
[i
-0x80];
1456 codeset
->table
[i
].code
= i
;
1457 codeset
->table
[i
].ucs4
= src
;
1458 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1460 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1462 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1463 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1465 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1466 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1470 if(codesetsFind(csList
, "AmigaPL") == NULL
)
1472 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1475 codeset
->name
= mystrdup("AmigaPL");
1476 codeset
->alt_name
= mystrdup("AmiPL");
1477 codeset
->characterization
= mystrdup("Polish (Amiga)");
1478 codeset
->read_only
= 1;
1480 for(i
=0; i
<256; i
++)
1482 UTF32
*src_ptr
= &src
;
1483 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1488 src
= amigapl_to_ucs4
[i
-0xa0];
1490 codeset
->table
[i
].code
= i
;
1491 codeset
->table
[i
].ucs4
= src
;
1492 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
,src_ptr
+1,&dest_ptr
,dest_ptr
+6,CSF_StrictConversion
);
1494 codeset
->table
[i
].utf8
[0] = (IPTR
)dest_ptr
-(IPTR
)&codeset
->table
[i
].utf8
[1];
1496 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1497 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1499 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1500 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1504 if(codesetsFind(csList
, "Amiga-1251") == NULL
)
1506 if((codeset
= allocArbitrateVecPooled(sizeof(*codeset
))) == NULL
)
1509 codeset
->name
= mystrdup("Amiga-1251");
1510 codeset
->alt_name
= mystrdup("Ami1251");
1511 codeset
->characterization
= mystrdup("Cyrillic (Amiga)");
1512 codeset
->read_only
= 1;
1514 for(i
=0; i
<256; i
++)
1516 UTF32
*src_ptr
= &src
;
1517 UTF8
*dest_ptr
= &codeset
->table
[i
].utf8
[1];
1522 src
= amiga1251_to_ucs4
[i
-0xa0];
1524 codeset
->table
[i
].code
= i
;
1525 codeset
->table
[i
].ucs4
= src
;
1526 CodesetsConvertUTF32toUTF8((const UTF32
**)&src_ptr
, src_ptr
+1, &dest_ptr
, dest_ptr
+6, CSF_StrictConversion
);
1528 codeset
->table
[i
].utf8
[0] = (char*)dest_ptr
- (char*)&codeset
->table
[i
].utf8
[1];
1530 memcpy(codeset
->table_sorted
, codeset
->table
, sizeof(codeset
->table
));
1531 qsort(codeset
->table_sorted
, 256, sizeof(codeset
->table
[0]), codesetsCmpUnicode
);
1533 D(DBF_STARTUP
, "adding internal codeset '%s'", codeset
->name
);
1534 AddTail((struct List
*)csList
, (struct Node
*)&codeset
->node
);
1545 /// codesetsCleanup()
1546 // Cleanup the memory for the codesets of a list.
// Nodes are unlinked from csList one at a time with RemHead() until the
// list is empty. For every codeset the name, alt_name and characterization
// strings are released when they are non-NULL, and the codeset structure
// itself is released last.
1547 void codesetsCleanup(struct codesetList
*csList
)
1549 struct codeset
*code
;
// unlink the head node on every pass; the loop ends once RemHead() yields NULL
1553 while((code
= (struct codeset
*)RemHead((struct List
*)csList
)) != NULL
)
// the three strings are optional, so each one is checked before it is freed
1555 if(code
->name
!= NULL
)
1556 freeArbitrateVecPooled(code
->name
);
1557 if(code
->alt_name
!= NULL
)
1558 freeArbitrateVecPooled(code
->alt_name
);
1559 if(code
->characterization
!= NULL
)
1560 freeArbitrateVecPooled(code
->characterization
);
// finally release the codeset structure that owned the strings
1562 freeArbitrateVecPooled(code
);
1570 // Returns the given codeset.
// Searches csList for a codeset whose name or alt_name matches 'name'.
// The comparison is case-insensitive (stricmp). A NULL or empty name
// skips the search, leaving the result at its initial value of NULL.
1571 struct codeset
*codesetsFind(struct codesetList
*csList
, const char *name
)
1573 struct codeset
*res
= NULL
;
// only search if we were given a non-empty name
1577 if(name
!= NULL
&& name
[0] != '\0')
1580 const char *matchedName
;
// ask the alias matcher whether it knows this name
// NOTE(review): presumably a non-NULL result replaces 'name' for the
// comparison below - confirm
1582 if((matchedName
= matchCodesetAlias(name
)) != NULL
)
// walk all nodes of the list
1585 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
1587 struct codeset
*mstate
= (struct codeset
*)node
;
// a codeset matches on its primary name or, if it has one, on its
// alternative name
1589 if(stricmp(name
, mstate
->name
) == 0 ||
1590 (mstate
->alt_name
!= NULL
&& stricmp(name
, mstate
->alt_name
) == 0))
1604 /// checkTextAgainstSingleCodeset
1605 // check how well a text can be represented by a specific codeset
// text/textLen describe the bytes to inspect, codeset is the candidate.
// The error counter starts at textLen, i.e. at "every character is wrong".
// Read-only codesets and the UTF8/UTF16/UTF32 codesets are not inspected;
// for those only a warning is logged.
// NOTE(review): the return value is presumably the error counter - confirm
1606 static int checkTextAgainstSingleCodeset(CONST_STRPTR text
, ULONG textLen
, struct codeset
*codeset
)
// assume the worst case first
1608 int errors
= textLen
;
// only writable 8-bit codesets take part in the detection
1612 if(codeset
->read_only
== 0 &&
1613 codeset
!= CodesetsBase
->utf8Codeset
&&
1614 codeset
!= CodesetsBase
->utf16Codeset
&&
1615 codeset
!= CodesetsBase
->utf32Codeset
)
1617 CONST_STRPTR text_ptr
= text
;
1622 // the following identification/detection routine is NOT really smart.
1623 // we just see how each UTF8 string is the representation of each char
1624 // in our source text and then check if they are valid or not. As said,
1625 // not very smart, but we don't have anything better right now :(
1626 for(i
=0; i
< textLen
; i
++)
1628 unsigned char c
= *text_ptr
++;
// look up the conversion entry of this byte in the codeset's table
1632 struct single_convert
*f
= &codeset
->table
[c
];
// utf8[0] is the stored length of the UTF8 sequence and utf8[1] its first
// byte; an entry with either of them zero has no usable UTF8 representation
1634 if(f
->utf8
[0] == 0x00 || f
->utf8
[1] == 0x00)
1642 W(DBF_STARTUP
, "codeset '%s' is either read-only (%ld) or UTF8/16/32 (%ld)", codeset
->name
, codeset
->read_only
, codeset
== CodesetsBase
->utf8Codeset
|| codeset
== CodesetsBase
->utf16Codeset
|| codeset
== CodesetsBase
->utf32Codeset
);
1644 D(DBF_STARTUP
, "tried to identify text as '%s' text with %ld of %ld errors", codeset
->name
, errors
, textLen
);
1651 /// checkTextAgainstCodesetList
// Checks the text against every codeset of csList and stores the codeset
// with the fewest errors in *bestCodeset.
// *bestCodeset is set to NULL first and is only replaced by a codeset that
// yields strictly fewer errors than the best count so far, which starts at
// textLen. A list without any such codeset therefore leaves it NULL.
// NOTE(review): the return value is presumably bestErrors - confirm
1652 static int checkTextAgainstCodesetList(CONST_STRPTR text
, ULONG textLen
, struct codesetList
*csList
, struct codeset
**bestCodeset
)
// start with the worst possible result
1655 int bestErrors
= textLen
;
1659 *bestCodeset
= NULL
;
// walk all codesets of the list
1661 for(node
= GetHead((struct List
*)csList
); node
!= NULL
; node
= GetSucc(node
))
1663 struct codeset
*codeset
= (struct codeset
*)node
;
1666 errors
= checkTextAgainstSingleCodeset(text
, textLen
, codeset
);
// remember this codeset only if it is strictly better than the best so far
1667 if(errors
< bestErrors
)
1669 *bestCodeset
= codeset
;
1670 bestErrors
= errors
;
1682 /// codesetsFindBest()
1683 // Returns the best codeset for the given text
// attrs    - tag list; CSA_CodesetList tags supply additional codeset lists
// csFamily - CSV_CodesetFamily_Cyrillic enables the cyrillic analysis
// text     - the text to analyse, textLen bytes long
// errorPtr - optional, receives the error count of the best match
// The library semaphore is held shared while the lists are searched.
// Returns the best matching codeset; bestCodeset starts out as NULL.
1684 static struct codeset
*codesetsFindBest(struct TagItem
*attrs
, ULONG csFamily
, CONST_STRPTR text
, ULONG textLen
, int *errorPtr
)
1686 struct codeset
*bestCodeset
= NULL
;
// worst case: every character counts as an error
1687 int bestErrors
= textLen
;
1692 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1694 // in case the user specified the codeset family as a
1695 // cyrillic one we go and do our cyrillic specific analysis first
1696 if(csFamily
== CSV_CodesetFamily_Cyrillic
)
1698 #define NUM_CYRILLIC 3
1700 struct CodesetSearch
1706 struct CodesetSearch search
[NUM_CYRILLIC
];
// one score counter per candidate codeset
1709 int ctr
[NUM_CYRILLIC
];
1716 D(DBF_STARTUP
, "performing cyrillic analysis");
// the three candidates, each with its name and its statistics table
1718 search
[0].name
= "windows-1251";
1719 search
[0].data
= cp1251_data
;
1720 search
[1].name
= "IBM866";
1721 search
[1].data
= cp866_data
;
1722 search
[2].name
= "KOI8-R";
1723 search
[2].data
= koi8r_data
;
1725 memset(&ctr
, 0, sizeof(ctr
));
1727 tp
= (unsigned char *)text
;
1732 int mid
= max
= -466725766; // TODO: what's the magic behind this constant?
// score the text against each candidate
1735 for(n
=0; n
< NUM_CYRILLIC
; n
++)
1737 unsigned char la
= 0;
1738 unsigned char *tptr
= (unsigned char *)search
[n
].data
;
// lb is the current byte with bit 7 flipped, so bytes >= 0x80 map to 0..127
1744 unsigned char lb
= (*p
++) ^ 128;
// only pairs where neither value has bit 7 set contribute; the pair indexes
// the candidate's table and the signed entry is added to its score
1746 if(!((la
| lb
) & 128))
1747 ctr
[n
] += (signed char)tptr
[(la
<< 7) + lb
];
// thresholds on the best score and on its distance to 'mid'
// NOTE(review): presumably this decides that a clear winner exists - confirm
1762 if((max
>= 500) && ((max
-mid
) >= 1000))
1768 while((*p
) && (!gr
));
1770 if(gr
|| ((!(*p
)) && lr
))
1773 // if our analysis found something, we go and try
1774 // to find the corresponding codeset in our codeset list
1777 struct TagItem
*tstate
= attrs
;
1778 struct TagItem
*tag
;
// NOTE(review): the format string below lacks the closing quote after %s
1780 D(DBF_STARTUP
, "identified text as '%s", search
[Nmax
-1].name
);
1782 // now we walk through our taglist and check if the user
1784 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// only non-empty CSA_CodesetList tags carry a list to search
1786 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
1788 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
// look for the identified codeset by name in the supplied list
1790 if((bestCodeset
= codesetsFind(csList
, search
[Nmax
-1].name
)) != NULL
)
1795 // if we still haven't found the matching codeset
1796 // we search the internal list
1797 if(bestCodeset
== NULL
)
1798 bestCodeset
= codesetsFind(&CodesetsBase
->codesets
, search
[Nmax
-1].name
);
1806 // if we haven't found the best codeset (through the cyrillic analysis)
1807 // we go and do the dumb latin search in our codesetlist
1810 struct TagItem
*tstate
= attrs
;
1811 struct TagItem
*tag
;
1813 // check text against all codesets in all supplied lists of codesets
1814 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
1818 case CSA_CodesetList
:
1820 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
1821 struct codeset
*bestCodesetInList
;
1822 int bestErrorsInList
;
1824 D(DBF_STARTUP
, "checking against external codeset list");
1825 bestErrorsInList
= checkTextAgainstCodesetList(text
, textLen
, csList
, &bestCodesetInList
);
// take over the list's best codeset only if it beats the overall best
1826 if(bestErrorsInList
< bestErrors
&& bestCodesetInList
!= NULL
)
1828 bestCodeset
= bestCodesetInList
;
1829 bestErrors
= bestErrorsInList
;
1839 // we didn't find a "best" codeset in the supplied codesets lists so far,
1840 // so now we check against our internal list
1843 struct codeset
*bestCodesetInList
;
1844 int bestErrorsInList
;
1846 D(DBF_STARTUP
, "checking against internal codeset list");
1847 bestErrorsInList
= checkTextAgainstCodesetList(text
, textLen
, &CodesetsBase
->codesets
, &bestCodesetInList
);
// same rule as for the external lists: strictly better and non-NULL
1848 if(bestErrorsInList
< bestErrors
&& bestCodesetInList
!= NULL
)
1850 bestCodeset
= bestCodesetInList
;
1851 bestErrors
= bestErrorsInList
;
1856 ReleaseSemaphore(&CodesetsBase
->libSem
);
// report the error count of the best match if the caller asked for it
1858 if(errorPtr
!= NULL
)
1859 *errorPtr
= bestErrors
;
1861 RETURN(bestCodeset
);
1867 /**************************************************************************/
1869 /// CodesetsSupportedA()
// Builds an array with the names of all known codesets: those of the
// internal list plus those of every list passed via a CSA_CodesetList tag.
// The array entries point to the name strings of the codesets themselves,
// the names are not copied. The array is allocated with room for one entry
// more than there are codesets.
// NOTE(review): the extra entry is presumably a NULL terminator - confirm
// NOTE(review): attrs is marked UNUSED although the tag list is walked
1870 STRPTR
* LIBFUNC
CodesetsSupportedA(REG(a0
, UNUSED
struct TagItem
*attrs
))
1872 STRPTR
*array
= NULL
;
1873 struct TagItem
*tstate
= attrs
;
1874 struct TagItem
*tag
;
1879 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1881 // first we need to check how many codesets our supplied
// the internal list is always counted
1883 numCodesets
= countCodesets(&CodesetsBase
->codesets
);
1884 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
1888 case CSA_CodesetList
:
// add the codesets of each user supplied list
1890 numCodesets
+= countCodesets((struct codesetList
*)tag
->ti_Data
);
1896 // now that we know how many codesets we have in our lists we
1897 // can put their names into our string arrays
1900 if((array
= allocArbitrateVecPooled((numCodesets
+1)*sizeof(STRPTR
))) != NULL
)
1905 // first we walk through the internal codesets list and
1907 for(node
= GetHead((struct List
*)&CodesetsBase
->codesets
); node
!= NULL
; node
= GetSucc(node
))
1909 struct codeset
*code
= (struct codeset
*)node
;
// store a reference to the codeset's own name string
1911 array
[i
] = code
->name
;
1918 // then we also iterate through our private codesets list
// NOTE(review): tstate was advanced to the end of the tag list by the
// counting loop above; verify it is reset to attrs before this second walk
1919 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
1923 case CSA_CodesetList
:
1925 for(node
= GetHead((struct List
*)tag
->ti_Data
); node
!= NULL
; node
= GetSucc(node
))
1927 struct codeset
*code
= (struct codeset
*)node
;
1929 array
[i
] = code
->name
;
1941 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Releases an object with freeArbitrateVecPooled().
// obj   - the memory to release
// attrs - tag list, not used by this function
1949 void LIBFUNC
CodesetsFreeA(REG(a0
, APTR obj
), REG(a1
, UNUSED
struct TagItem
*attrs
))
1954 freeArbitrateVecPooled(obj
);
1960 /// CodesetsSetDefaultA()
// Makes the codeset called 'name' the default one by writing its name to
// the variable "codeset_default" with SetVar().
// Only the internal codeset list is searched. If the codeset is not found
// the variable is left untouched.
// attrs - tag list; CSA_Save (default FALSE) influences the SetVar() flags
1961 struct codeset
* LIBFUNC
CodesetsSetDefaultA(REG(a0
, STRPTR name
), REG(a1
, struct TagItem
*attrs
))
1963 struct codeset
*codeset
;
1967 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1969 if((codeset
= codesetsFind(&CodesetsBase
->codesets
, name
)) != NULL
)
// NOTE(review): GVF_SAVE_VAR is set unconditionally and CSA_Save only adds
// GVF_GLOBAL_ONLY. Going by the tag name one would expect CSA_Save to
// control GVF_SAVE_VAR instead - confirm the intended flag combination.
1973 flags
= GVF_SAVE_VAR
;
1974 if(GetTagData(CSA_Save
, FALSE
, attrs
))
1975 SET_FLAG(flags
, GVF_GLOBAL_ONLY
);
// store the primary name of the codeset, without a trailing NUL byte
1977 SetVar("codeset_default", codeset
->name
, strlen(codeset
->name
), flags
);
1980 ReleaseSemaphore(&CodesetsBase
->libSem
);
// Finds a codeset by name.
// The internal list is searched first, then every list supplied through a
// non-empty CSA_CodesetList tag. If nothing was found and the tag
// CSA_FallbackToDefault is TRUE (which is its default here) the default
// codeset is used instead.
// The library semaphore is held shared during the search.
1988 struct codeset
* LIBFUNC
CodesetsFindA(REG(a0
, STRPTR name
), REG(a1
, struct TagItem
*attrs
))
1990 struct codeset
*codeset
= NULL
;
1994 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
1996 // if no name pointer was supplied we have to return
1997 // the default codeset only.
2000 // we first walk through our internal list and check if we
2001 // can find the requested codeset
2002 codeset
= codesetsFind(&CodesetsBase
->codesets
, name
);
2006 struct TagItem
*tstate
= attrs
;
2007 struct TagItem
*tag
;
2009 // now we walk through our taglist and check if the user
2011 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
// only non-empty CSA_CodesetList tags carry a list to search
2013 if(tag
->ti_Tag
== CSA_CodesetList
&& tag
->ti_Data
!= 0)
2015 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
2017 if((codeset
= codesetsFind(csList
, name
)) != NULL
)
2024 // check if we found something or not.
// the fallback is enabled unless the caller explicitly disables it
2025 if(codeset
== NULL
&& GetTagData(CSA_FallbackToDefault
, TRUE
, attrs
))
2026 codeset
= defaultCodeset(FALSE
);
2028 ReleaseSemaphore(&CodesetsBase
->libSem
);
2035 /// CodesetsFindBestA()
// Finds the codeset which represents a given text best.
// Tags read from attrs:
//   CSA_Source            - the text to analyse
//   CSA_SourceLen         - its length, defaults to strlen() of the text
//   CSA_CodesetFamily     - defaults to CSV_CodesetFamily_Latin
//   CSA_ErrPtr            - optional int pointer receiving the error count
//   CSA_FallbackToDefault - defaults to FALSE here
// A missing or empty text skips the analysis, so the result stays NULL.
2036 struct codeset
* LIBFUNC
CodesetsFindBestA(REG(a0
, struct TagItem
*attrs
))
2038 struct codeset
*codeset
= NULL
;
2044 ObtainSemaphoreShared(&CodesetsBase
->libSem
);
2046 text
= (char *)GetTagData(CSA_Source
, 0, attrs
);
// NOTE(review): the default value is evaluated before GetTagData() is
// called, so strlen() runs on the text even when CSA_SourceLen is supplied;
// this requires a NUL terminated text in any case - confirm this is intended
2047 textLen
= GetTagData(CSA_SourceLen
, text
!= NULL
? strlen(text
) : 0, attrs
);
2049 if(text
!= NULL
&& textLen
!= 0)
2052 ULONG csFamily
= GetTagData(CSA_CodesetFamily
, CSV_CodesetFamily_Latin
, attrs
);
2053 int *errorPtr
= (int *)GetTagData(CSA_ErrPtr
, 0, attrs
);
// the error count is collected in a local variable first
2055 codeset
= codesetsFindBest(attrs
, csFamily
, text
, textLen
, &numErrors
);
// hand the error count over to the caller if requested
2057 if(errorPtr
!= NULL
)
2058 *errorPtr
= numErrors
;
2060 // if we still haven't got the codeset we fallback to the default
2061 if(codeset
== NULL
&& GetTagData(CSA_FallbackToDefault
, FALSE
, attrs
))
2062 codeset
= defaultCodeset(FALSE
);
2065 ReleaseSemaphore(&CodesetsBase
->libSem
);
2072 /// CodesetsUTF8Len()
2073 // Returns the number of characters a UTF8 string has. This is not
2074 // identical to the amount of memory required to hold the string.
2075 ULONG LIBFUNC
CodesetsUTF8Len(REG(a0
, UTF8
*str
))
// skip the trailing bytes of a multi-byte sequence; their number is looked
// up in trailingBytesForUTF8 by the value c
2087 str
+= trailingBytesForUTF8
[c
];
2096 /// CodesetsStrLenA()
// Works on the string 'str' in its source codeset. Three cases are visible:
// UTF16 and UTF32 sources are passed to the UTF16/UTF32 to UTF8 converters,
// for any other codeset the per-character UTF8 lengths from the codeset's
// conversion table are summed up.
// Tags read from attrs:
//   CSA_SourceCodeset - codeset of str, defaults to defaultCodeset(TRUE)
//   CSA_SourceLen     - length of str, defaults to the length measured here
// NOTE(review): the result is presumably the number of bytes the string
// needs as UTF8 - confirm
2097 ULONG LIBFUNC
CodesetsStrLenA(REG(a0
, STRPTR str
), REG(a1
, struct TagItem
*attrs
))
2105 struct codeset
*codeset
;
// use the default codeset if the caller did not specify one
2110 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2111 codeset
= defaultCodeset(TRUE
);
// measure the source string according to its character width
2113 if(codeset
== CodesetsBase
->utf32Codeset
)
2116 len
= utf32_strlen((UTF32
*)str
);
2118 else if(codeset
== CodesetsBase
->utf16Codeset
)
2121 len
= utf16_strlen((UTF16
*)str
);
// an explicitly supplied length overrides the measured one
2129 len
= GetTagData(CSA_SourceLen
, len
, attrs
);
2135 void *srcend
= src
+ len
;
// the converters are called with a NULL destination start and end
// NOTE(review): presumably this makes them only count the bytes needed and
// leave the count in dstlen - confirm
2136 UTF8
*dstlen
= NULL
;
// the unions supply differently typed views of the same pointers without
// type punning casts
2137 union TypeAliases srcAlias
;
2138 union TypeAliases dstAlias
;
2140 srcAlias
.strptr
= &src
;
2141 dstAlias
.utf8
= &dstlen
;
2146 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, NULL
, 0);
2150 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, NULL
, 0);
// 8-bit codesets: stop at the NUL byte or when len has reached zero
2161 while((c
= *src
++) != '\0' && len
!= 0)
// utf8[0] holds the length of the character's UTF8 sequence
2163 res
+= codeset
->table
[c
].utf8
[0];
2174 /// CodesetsUTF8ToStrA()
2175 // Converts an UTF8 string to a given charset. Return the number of bytes
2176 // written to dest excluding the NULL byte (which is always ensured by this
2177 // function; it means a NULL str will produce "" as dest; anyway you should
2178 // check NULL str to not waste your time!).
2179 STRPTR LIBFUNC
CodesetsUTF8ToStrA(REG(a0
, struct TagItem
*attrs
))
2190 if((src
= (UTF8
*)GetTagData(CSA_Source
, 0, attrs
)) != NULL
&&
2191 (srcLen
= GetTagData(CSA_SourceLen
, src
!= NULL
? strlen((char *)src
) : 0, attrs
)) > 0)
2193 struct convertMsg msg
;
2194 struct codeset
*codeset
;
2195 struct Hook
*destHook
;
2196 struct Hook
*mapForeignCharsHook
;
2198 STRPTR destIter
= NULL
;
2201 unsigned char *s
= src
;
2202 unsigned char *e
= (src
+srcLen
);
2203 int numConvErrors
= 0;
2204 int *numConvErrorsPtr
;
2205 BOOL mapForeignChars
;
2207 struct SignalSemaphore
*sem
= NULL
;
2211 // get some more optional attributes
2212 destHook
= (struct Hook
*)GetTagData(CSA_DestHook
, 0, attrs
);
2213 destLen
= GetTagData(CSA_DestLen
, 0, attrs
);
2214 numConvErrorsPtr
= (int *)GetTagData(CSA_ErrPtr
, 0, attrs
);
2215 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
2216 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, 0, attrs
);
2218 // get the destination codeset pointer
2219 if((codeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, 0, attrs
)) == NULL
)
2220 codeset
= defaultCodeset(TRUE
);
2221 if(codeset
== CodesetsBase
->utf32Codeset
)
2226 else if(codeset
== CodesetsBase
->utf16Codeset
)
2237 // first we make sure we allocate enough memory
2238 // for our destination buffer
2239 if(destHook
!= NULL
)
2241 if(destLen
< 16 || destLen
> sizeof(buf
))
2242 destLen
= sizeof(buf
);
2244 msg
.state
= CSV_Translating
;
2250 // in case the user wants us to dynamically generate the
2251 // destination buffer we do it right now
2252 if((dest
= (STRPTR
)GetTagData(CSA_Dest
, 0, attrs
)) == NULL
||
2253 GetTagData(CSA_AllocIfNeeded
, TRUE
, attrs
) != FALSE
)
2257 // calculate the destLen
2260 void *dstlen
= NULL
;
2261 union TypeAliases srcAlias
;
2262 union TypeAliases dstAlias
;
2264 srcAlias
.uchar
= &s
;
2265 dstAlias
.voidptr
= &dstlen
;
2270 CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, NULL
, 0);
2274 CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, NULL
, 0);
2283 unsigned char c
= *s
++;
2286 s
+= trailingBytesForUTF8
[c
];
2290 if(dest
== NULL
|| (destLen
< len
+1))
2292 if((pool
= (APTR
)GetTagData(CSA_Pool
, 0, attrs
)) != NULL
)
2294 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
2295 ObtainSemaphore(sem
);
2297 // allocate the destination buffer
2298 dest
= allocVecPooled(pool
, len
+char_size
);
2301 ReleaseSemaphore(sem
);
2304 dest
= allocArbitrateVecPooled(len
+char_size
);
2306 destLen
= len
+char_size
;
2319 // now we convert the src string to the
2320 // destination buffer.
2326 if(destHook
!= NULL
)
2328 ULONG r
= CSR_TargetExhausted
;
2330 dstend
= b
+ destLen
- char_size
;
2333 union TypeAliases srcAlias
;
2334 union TypeAliases dstAlias
;
2336 srcAlias
.uchar
= &s
;
2337 dstAlias
.schar
= &b
;
2342 r
= CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, dstend
, 0);
2346 r
= CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, dstend
, 0);
2352 if(r
!= CSR_TargetExhausted
)
2353 msg
.state
= CSV_End
;
2355 CallHookPkt(destHook
,&msg
,buf
);
2360 while(r
== CSR_TargetExhausted
);
2364 union TypeAliases srcAlias
;
2365 union TypeAliases dstAlias
;
2367 srcAlias
.uchar
= &s
;
2368 dstAlias
.strptr
= &destIter
;
2369 dstend
= destIter
+ destLen
- char_size
;
2373 CodesetsConvertUTF8toUTF16(srcAlias
.cutf8
, e
, dstAlias
.utf16
, dstend
, 0);
2377 CodesetsConvertUTF8toUTF32(srcAlias
.cutf8
, e
, dstAlias
.utf32
, dstend
, 0);
2387 if(destHook
== NULL
&& n
>= destLen
-1)
2390 // convert until we reach the end of the
2394 unsigned char c
= *s
;
2395 unsigned char d
= '?';
2396 const char *repstr
= NULL
;
2399 // check if the char is a >7bit char
2402 struct single_convert
*f
;
2403 int lenAdd
= trailingBytesForUTF8
[c
];
2404 int lenStr
= lenAdd
+1;
2405 unsigned char *src
= s
;
2409 // start each iteration with "no replacement found yet"
2413 // search in the UTF8 conversion table of the current charset if
2414 // we have a replacement character for the char sequence starting at s
2415 BIN_SEARCH(codeset
->table_sorted
, 0, 255, strncmp((char *)src
, (char *)codeset
->table_sorted
[m
].utf8
+1, lenStr
), f
);
2426 // the analysed char sequence (s) is not convertible to a
2427 // single visible char replacement, so we normally have to put
2428 // a ? sign as an "unknown char" sign at that very position.
2430 // For convenience we, however, allow users to replace these
2431 // UTF8 characters with char sequences that "look like" the
2433 if(mapForeignChars
== TRUE
)
2434 replen
= mapUTF8toASCII(&repstr
, src
, lenStr
);
2436 // call the hook only if the internal table yielded no suitable
2438 if(replen
== 0 && mapForeignCharsHook
!= NULL
)
2440 struct replaceMsg rmsg
;
2442 rmsg
.dst
= (char **)&repstr
;
2444 rmsg
.srclen
= lenStr
;
2445 replen
= CallHookPkt(mapForeignCharsHook
, &rmsg
, NULL
);
2450 D(DBF_UTF
, "got UTF8 replacement (%ld)", replen
);
2452 // stay in the loop as long as one replacement function delivers
2453 // further UTF8 replacement sequences
2454 src
= (unsigned char *)repstr
;
2455 // remember the length of the replaced string, as we might do another
2456 // iteration in the loop which might result in a further replacement
2459 else if(replen
== 0)
2461 D(DBF_UTF
, "found no ASCII replacement for UTF8 string (%ld)", replen
);
2465 D(DBF_UTF
, "got replacement string '%s' (%ld)", repstr
? repstr
: "<null>", replen
);
2470 if(repstr
== NULL
|| replen
== 0)
2484 if(destHook
!= NULL
)
2495 if(i
%(destLen
-1)==0)
2499 CallHookPkt(destHook
, &msg
, buf
);
2509 *b
++ = replen
> 0 ? *repstr
: d
;
2513 if(i
%(destLen
-1)==0)
2517 CallHookPkt(destHook
, &msg
, buf
);
2528 ULONG destPos
= destIter
-dest
;
2533 ObtainSemaphore(sem
);
2535 // allocate the destination buffer
2536 dest
= reallocVecPooled(pool
, dest
, destLen
, destLen
+replen
-1);
2539 ReleaseSemaphore(sem
);
2542 dest
= reallocArbitrateVecPooled(dest
, destLen
, destLen
+replen
-1);
2550 destIter
= dest
+destPos
;
2551 memcpy(destIter
, repstr
, replen
);
2553 // adjust our loop pointer and destination length
2555 destLen
+= replen
-1;
2557 else if(replen
== 1)
2558 *destIter
++ = *repstr
;
2569 if(destHook
!= NULL
)
2571 msg
.state
= CSV_End
;
2574 CallHookPkt(destHook
,&msg
,buf
);
2580 // let us write the number of conversion errors
2581 // to the proper variable pointer, if wanted
2582 if(numConvErrorsPtr
!= NULL
)
2583 *numConvErrorsPtr
= numConvErrors
;
2586 // put the final length of our destination buffer
2587 // into the destLenPtr
2588 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
2591 *destLenPtr
= destLen
-1;
2601 /// CodesetsUTF8CreateA()
2602 // Converts a string in a given source codeset to UTF-8 and returns the UTF-8 string.
2603 // If a destination hook is supplied, 0 is always returned.
2604 // If the source string is NULL, NULL is returned and the hook is not called.
//
// Tags read from attrs in this function:
//   CSA_SourceCodeset  codeset of the source; defaultCodeset(TRUE) if not supplied
//   CSA_Source         the source string
//   CSA_SourceLen      source length; defaults to the length computed from the string
//   CSA_DestHook       hook that is called with a convertMsg and the chunk buffer
//   CSA_DestLen        destination/chunk size (defaults to 0)
//   CSA_Dest           caller supplied destination buffer
//   CSA_AllocIfNeeded  defaults to TRUE
//   CSA_Pool           memory pool used for allocating the destination buffer
//   CSA_PoolSem        semaphore obtained around the pool allocation
//   CSA_DestLenPtr     ULONG pointer looked up at the end of the function
2605 UTF8
* LIBFUNC
CodesetsUTF8CreateA(REG(a0
, struct TagItem
*attrs
))
2609 struct codeset
*codeset
;
2610 ULONG fromLen
, *destLenPtr
;
// use the default codeset if the caller did not supply a source codeset
2619 if((codeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2620 codeset
= defaultCodeset(TRUE
);
// the source codeset is compared against the library's UTF-32 and UTF-16 codesets
2621 if(codeset
== CodesetsBase
->utf32Codeset
)
2623 else if(codeset
== CodesetsBase
->utf16Codeset
)
2628 from
= (UTF8
*)GetTagData(CSA_Source
, 0, attrs
);
// the source length is computed with utf32_strlen(), utf16_strlen() or strlen()
2634 fromLen
= utf32_strlen((UTF32
*)from
);
2638 fromLen
= utf16_strlen((UTF16
*)from
);
2642 fromLen
= strlen((char *)from
);
// an explicitly supplied CSA_SourceLen overrides the computed length
2648 fromLen
= GetTagData(CSA_SourceLen
, fromLen
, attrs
);
// nothing is converted for a NULL or empty source string
2650 if(from
!= NULL
&& fromLen
!= 0)
2652 struct convertMsg msg
;
2657 STRPTR src
, destPtr
= NULL
, b
= NULL
;
2660 hook
= (struct Hook
*)GetTagData(CSA_DestHook
, 0, attrs
);
2661 destLen
= GetTagData(CSA_DestLen
, 0, attrs
);
// a destLen below 16 or above sizeof(buf) is replaced by sizeof(buf)
2665 if(destLen
<16 || destLen
>sizeof(buf
))
2666 destLen
= sizeof(buf
);
2668 msg
.state
= CSV_Translating
;
// proceed if a destination buffer was supplied or CSA_AllocIfNeeded is set (default TRUE)
2674 if((dest
= (UTF8
*)GetTagData(CSA_Dest
, 0, attrs
)) != NULL
||
2675 GetTagData(CSA_AllocIfNeeded
, TRUE
, attrs
))
// NOTE(review): the converters below are called with a NULL destination end and
// dstAlias pointing at dstlen (initially NULL) - presumably a measuring pass that
// only determines the required output length; confirm against the converters.
2683 void *srcend
= src
+ fromLen
;
2684 UTF8
*dstlen
= NULL
;
2685 union TypeAliases srcAlias
;
2686 union TypeAliases dstAlias
;
2688 srcAlias
.strptr
= &src
;
2689 dstAlias
.utf8
= &dstlen
;
2694 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, NULL
, 0);
2698 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, NULL
, 0);
// table based codesets: add utf8[0] of each source character's table entry to len
// (presumably utf8[0] holds the sequence length, as the output loop further below
// starts reading the sequence at utf8[1] - TODO confirm)
2705 ULONG flen
= fromLen
;
2708 while((c
= *src
++) != '\0' && flen
!= 0)
2710 len
+= codeset
->table
[c
].utf8
[0];
2714 D(DBF_UTF
, "Calculated output UTF-8 buffer length: %lu", len
);
// no destination buffer, or one that is too small for len+1 bytes
2716 if(dest
== NULL
|| (destLen
<len
+1))
2719 struct SignalSemaphore
*sem
;
// allocate from the caller's pool if one was given, guarded by the optional semaphore
2721 if((pool
= (APTR
)GetTagData(CSA_Pool
, 0, attrs
)) != NULL
)
2723 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
2724 ObtainSemaphore(sem
);
2726 // allocate the destination buffer
2727 dest
= allocVecPooled(pool
,len
+1);
2730 ReleaseSemaphore(sem
);
// allocation without a caller supplied pool
2733 dest
= allocArbitrateVecPooled(len
+1);
2745 destPtr
= (STRPTR
)dest
;
2751 void *srcend
= src
+ fromLen
;
// hook based conversion: convert chunk-wise into b and hand every chunk to the hook,
// repeating for as long as the converter returns CSR_TargetExhausted
2756 ULONG r
= CSR_TargetExhausted
;
2757 union TypeAliases srcAlias
;
2758 union TypeAliases dstAlias
;
2760 srcAlias
.strptr
= &src
;
2761 dstAlias
.strptr
= &b
;
2762 dstend
= (UTF8
*)(b
+ destLen
- 1);
2768 r
= CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, dstend
, 0);
2772 r
= CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, dstend
, 0);
// any result other than "target exhausted" marks the final chunk
2776 if(r
!= CSR_TargetExhausted
)
2777 msg
.state
= CSV_End
;
2779 CallHookPkt(hook
,&msg
,buf
);
2784 while(r
== CSR_TargetExhausted
);
// buffer based conversion: convert directly into the destination buffer via destPtr
2788 union TypeAliases srcAlias
;
2789 union TypeAliases dstAlias
;
2791 srcAlias
.strptr
= &src
;
2792 dstAlias
.strptr
= &destPtr
;
2793 dstend
= (UTF8
*)(destPtr
+ destLen
);
2797 CodesetsConvertUTF16toUTF8(srcAlias
.cutf16
, srcend
, dstAlias
.utf8
, dstend
, 0);
2801 CodesetsConvertUTF32toUTF8(srcAlias
.cutf32
, srcend
, dstAlias
.utf8
, dstend
, 0);
// number of bytes written is the distance destPtr advanced from dest
2804 n
= destPtr
-(STRPTR
)dest
;
// table based codesets: stop after fromLen characters or at the first NUL character
2809 for(; fromLen
&& (c
= *src
); src
++, fromLen
--)
// walk the NUL terminated UTF-8 sequence of the character, starting at utf8[1]
2813 for(utf8_seq
= &codeset
->table
[c
].utf8
[1]; (c
= *utf8_seq
); utf8_seq
++)
// the hook is called whenever i is a multiple of destLen-1
2820 if(i
%(destLen
-1)==0)
2824 CallHookPkt(hook
,&msg
,buf
);
// final hook call with the state set to CSV_End
2845 msg
.state
= CSV_End
;
2848 CallHookPkt(hook
,&msg
,buf
);
// look up the optional pointer for reporting the destination length
2857 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
2865 /// CodesetsIsValidUTF8()
// GOOD_UCS(c) is true for values >= 160 that are outside the range 0xD800-0xDBFF
// (the low 10 bits are masked off before comparing against 0xd800) and that are
// none of 0xFEFF, 0xFFFE and 0xFFFF.
2866 #define GOOD_UCS(c) \
2867 ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2868 (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
// Takes the string s in register a0 and returns a BOOL.
// NOTE(review): judging by the function name the result tells whether s is valid
// UTF-8; how GOOD_UCS is applied to the parsed values should be confirmed.
2870 BOOL LIBFUNC
CodesetsIsValidUTF8(REG(a0
, STRPTR s
))
// parse one value at a time via parseUtf8() until it returns 0
2877 while((n
= parseUtf8(&t
)) != 0)
2891 /// CodesetsConvertStrA()
2892 // Converts a given string from a source codeset to a given destination
2893 // codeset and returns the converted string.
//
// The conversion is done in up to two passes: source -> UTF-8 through
// CodesetsUTF8CreateA() (skipped if the source already is UTF-8) and
// UTF-8 -> destination through CodesetsUTF8ToStrA() (skipped if the
// destination is UTF-8). If source and destination codeset are identical
// a plain copy of the source string is created instead.
//
// Tags read from attrs in this function:
//   CSA_Source               the string to convert
//   CSA_SourceCodeset        codeset of the source; defaultCodeset(TRUE) if not supplied
//   CSA_SourceLen            source length; defaults to the length computed from the string
//   CSA_DestCodeset          destination codeset; defaultCodeset(TRUE) if not supplied
//   CSA_MapForeignChars      defaults to FALSE, passed on to CodesetsUTF8ToStrA()
//   CSA_MapForeignCharsHook  passed on to CodesetsUTF8ToStrA()
//   CSA_DestLenPtr           optional ULONG pointer receiving the destination length
2894 STRPTR LIBFUNC
CodesetsConvertStrA(REG(a0
, struct TagItem
*attrs
))
2896 struct codeset
*srcCodeset
;
2897 STRPTR srcStr
= NULL
;
2898 STRPTR dstStr
= NULL
;
2905 // get the ptr to the src string we want to convert
2906 // from the source codeset to the dest codeset.
2907 srcStr
= (STRPTR
)GetTagData(CSA_Source
, 0, attrs
);
2909 // get the pointer to the codeset in which the src string is encoded
2910 if((srcCodeset
= (struct codeset
*)GetTagData(CSA_SourceCodeset
, 0, attrs
)) == NULL
)
2911 srcCodeset
= defaultCodeset(TRUE
);
// determine the source length and the size of a single character (used for the
// terminator when copying) depending on the encoding of the source string
2915 if(srcCodeset
== CodesetsBase
->utf32Codeset
)
2917 srcLen
= utf32_strlen((UTF32
*)srcStr
);
2918 charSize
= sizeof(UTF32
);
2920 else if(srcCodeset
== CodesetsBase
->utf16Codeset
)
2922 srcLen
= utf16_strlen((UTF16
*)srcStr
);
2923 charSize
= sizeof(UTF16
);
2927 srcLen
= strlen(srcStr
);
2928 charSize
= sizeof(char);
// an explicitly supplied CSA_SourceLen overrides the computed length
2933 srcLen
= GetTagData(CSA_SourceLen
, srcLen
, attrs
);
2935 if(srcStr
!= NULL
&& srcLen
> 0)
2937 struct codeset
*dstCodeset
;
2939 // get the pointer to the codeset in which the dst string should be encoded
2940 if((dstCodeset
= (struct codeset
*)GetTagData(CSA_DestCodeset
, 0, attrs
)) == NULL
)
2941 dstCodeset
= defaultCodeset(TRUE
);
// NOTE(review): this debug output dereferences srcCodeset and dstCodeset before
// they are checked against NULL in the following condition.
2943 D(DBF_UTF
, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset
->name
, dstCodeset
->name
);
2945 if(srcCodeset
!= NULL
&& dstCodeset
!= NULL
)
2947 // check that the user didn't supply the very same codeset,
2948 // as otherwise a conversion is not required.
2949 if(srcCodeset
!= dstCodeset
)
// utf8Create/strCreate are used below to decide which intermediate results have to be freed
2951 BOOL utf8Create
= FALSE
;
2952 BOOL strCreate
= FALSE
;
2954 ULONG utf8strLen
= 0;
2955 ULONG
*destLenPtr
= NULL
;
2956 BOOL mapForeignChars
;
2957 struct Hook
*mapForeignCharsHook
;
2959 mapForeignChars
= (BOOL
)GetTagData(CSA_MapForeignChars
, FALSE
, attrs
);
2960 mapForeignCharsHook
= (struct Hook
*)GetTagData(CSA_MapForeignCharsHook
, 0, attrs
);
2962 // if the source codeset is UTF-8 we don't have to use the UTF8Create()
2963 // function and can directly call the UTF8ToStr() function
2964 if(srcCodeset
!= CodesetsBase
->utf8Codeset
)
2966 struct TagItem tags
[] = { { CSA_SourceCodeset
, (IPTR
)srcCodeset
},
2967 { CSA_Source
, (IPTR
)srcStr
},
2968 { CSA_SourceLen
, srcLen
},
2969 { CSA_DestLenPtr
, (IPTR
)&utf8strLen
},
2972 utf8str
= CodesetsUTF8CreateA((struct TagItem
*)&tags
[0]);
// the source already is UTF-8, so it is used as the intermediate string as-is
2978 utf8str
= (UTF8
*)srcStr
;
2979 utf8strLen
= srcLen
;
2982 // in case the destination codeset is UTF-8 we don't have to actually
2983 // use the UTF8ToStr() function and can immediately return our
2985 if(utf8str
!= NULL
&& utf8strLen
> 0 && dstCodeset
!= CodesetsBase
->utf8Codeset
)
2987 struct TagItem tags
[] = { { CSA_DestCodeset
, (IPTR
)dstCodeset
},
2988 { CSA_Source
, (IPTR
)utf8str
},
2989 { CSA_SourceLen
, utf8strLen
},
2990 { CSA_DestLenPtr
, (IPTR
)&dstLen
},
2991 { CSA_MapForeignChars
, mapForeignChars
},
2992 { CSA_MapForeignCharsHook
, (IPTR
)mapForeignCharsHook
},
2995 dstStr
= CodesetsUTF8ToStrA((struct TagItem
*)&tags
[0]);
// the intermediate UTF-8 string is used as the result
3001 dstStr
= (STRPTR
)utf8str
;
3002 dstLen
= utf8strLen
;
3005 D(DBF_UTF
, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr
, srcLen
,
3010 // if everything was successful we can go and finalize everything
3011 if(dstStr
!= NULL
&& utf8str
!= NULL
)
3013 // as the conversion was a two way pass we have to either free the
3014 // memory of the utf8 string or not
3015 if(utf8Create
== TRUE
&& strCreate
== TRUE
)
3016 CodesetsFreeA(utf8str
, NULL
);
3018 // if the user wants to be informed about the length
3019 // of our destination string we store the length now in the supplied ptr.
3020 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
3021 *destLenPtr
= dstLen
;
3023 D(DBF_UTF
, "successfully converted string with len %ld", dstLen
);
3027 W(DBF_ALWAYS
, "an error occurred while trying to convert a string");
3029 // free all memory in case the conversion didn't work out
3030 if(utf8Create
== TRUE
&& utf8str
!= NULL
)
3031 CodesetsFreeA(utf8str
, NULL
);
3033 if(strCreate
== TRUE
&& dstStr
!= NULL
)
3034 CodesetsFreeA(dstStr
, NULL
);
3041 // we got the same source and destination codesets passed in
3042 // instead of failing silently we just create a copy of the source string
3043 ULONG
*destLenPtr
= NULL
;
3045 // allocate memory for the destination string, including a trailing NUL character
3046 if((dstStr
= allocArbitrateVecPooled(srcLen
+ charSize
)) != NULL
)
3048 // just copy the source string without any further modification
3049 // we must use memcpy() as the source string could be UTF16/32 encoded and
3050 // thus strcpy() would not do what we want.
3051 memcpy(dstStr
, srcStr
, srcLen
+ charSize
);
3053 D(DBF_UTF
, "successfully copied string with len %ld", dstLen
);
3056 W(DBF_ALWAYS
, "no memory for dest string");
3058 // if the user wants to be informed about the length
3059 // of our destination string we store the length now in the supplied ptr.
3060 if((destLenPtr
= (ULONG
*)GetTagData(CSA_DestLenPtr
, 0, attrs
)) != NULL
)
3061 *destLenPtr
= dstLen
;
3071 /// CodesetsFreeVecPooledA()
// Returns the memory block mem to the memory pool it belongs to by calling
// freeVecPooled(pool, mem). Nothing is freed if pool or mem is NULL.
//
//   pool   the memory pool
//   mem    the memory block to free
//   attrs  tag list; CSA_PoolSem may supply a semaphore which is obtained
//          before the block is freed
3072 void LIBFUNC
CodesetsFreeVecPooledA(REG(a0
, APTR pool
), REG(a1
, APTR mem
), REG(a2
, struct TagItem
*attrs
))
3076 if(pool
!= NULL
&& mem
!= NULL
)
3078 struct SignalSemaphore
*sem
;
// lock the pool if the caller supplied a semaphore for it
3080 if((sem
= (struct SignalSemaphore
*)GetTagData(CSA_PoolSem
, 0, attrs
)) != NULL
)
3081 ObtainSemaphore(sem
);
3083 freeVecPooled(pool
,mem
);
// unlock the pool again
3086 ReleaseSemaphore(sem
);
3093 /// CodesetsListCreateA()
// Allocates a new codesetList, initializes it as an empty list and fills it
// according to the tags in attrs:
//   CSA_CodesetDir     directory that is scanned via codesetsScanDir()
//   CSA_CodesetFile    single file that is read via codesetsReadTable()
//   CSA_SourceCodeset  codeset whose node is appended to the list
// If none of these tags was supplied, "PROGDIR:Charsets" is scanned instead.
3094 struct codesetList
* LIBFUNC
CodesetsListCreateA(REG(a0
, struct TagItem
*attrs
))
3096 struct codesetList
*csList
= NULL
;
3100 // no matter what, we create a codesets list we will return to the user
3101 if((csList
= allocArbitrateVecPooled(sizeof(struct codesetList
))) != NULL
)
// remains TRUE only as long as no directory, file or codeset tag has been processed
3103 BOOL scanProgDir
= TRUE
;
3104 struct TagItem
*tstate
= attrs
;
3105 struct TagItem
*tag
;
3107 // initialize the new private codeset list and put it into a separate list
3108 NewList((struct List
*)csList
);
3110 // first we get the path of the directory from which we go
3111 // and scan for charset tables from
3112 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3116 case CSA_CodesetDir
:
3118 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
3120 scanProgDir
= FALSE
;
3124 case CSA_CodesetFile
:
3126 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
3128 scanProgDir
= FALSE
;
3132 case CSA_SourceCodeset
:
3134 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
// append the supplied codeset to the new list
3136 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
3138 scanProgDir
= FALSE
;
3144 // in case the user also wants us to scan PROGDIR:
3146 if(scanProgDir
== TRUE
)
3147 codesetsScanDir(csList
, "PROGDIR:Charsets");
3155 /// CodesetsListDeleteA()
// Frees every codeset list passed in via a CSA_CodesetList tag.
// CSA_FreeCodesets (default TRUE) controls whether the codesets within a list
// are cleaned up via codesetsCleanup() before the list itself is freed.
// The BOOL result is initialized to FALSE.
3156 BOOL LIBFUNC
CodesetsListDeleteA(REG(a0
, struct TagItem
*attrs
))
3158 BOOL result
= FALSE
;
3159 struct TagItem
*tstate
= attrs
;
3160 struct TagItem
*tag
;
3165 // check if the caller wants us also to free the codesets
3166 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
3168 // now we iterate through our tagItems and see what the
3169 // user wants to remove from the list
3170 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3174 case CSA_CodesetList
:
3176 struct codesetList
*csList
= (struct codesetList
*)tag
->ti_Data
;
3180 // cleanup the codesets within the list
3181 if(freeCodesets
== TRUE
)
3182 codesetsCleanup(csList
);
3184 // then free the list itself
3185 freeArbitrateVecPooled(csList
);
3199 /// CodesetsListAddA()
// Adds codesets to the existing list csList according to the tags in attrs:
//   CSA_CodesetDir     directory that is scanned via codesetsScanDir()
//   CSA_CodesetFile    single file that is read via codesetsReadTable()
//   CSA_SourceCodeset  codeset whose node is appended to the list
// The BOOL result is initialized to FALSE.
3200 BOOL LIBFUNC
CodesetsListAddA(REG(a0
, struct codesetList
*csList
), REG(a1
, struct TagItem
*attrs
))
3202 BOOL result
= FALSE
;
3208 struct TagItem
*tstate
= attrs
;
3209 struct TagItem
*tag
;
3211 // now we iterate through our tagItems and see if the user
3212 // wants to scan a whole directory or just add a file.
3213 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3217 case CSA_CodesetDir
:
3219 codesetsScanDir(csList
, (STRPTR
)tag
->ti_Data
);
3224 case CSA_CodesetFile
:
3226 codesetsReadTable(csList
, (STRPTR
)tag
->ti_Data
);
3231 case CSA_SourceCodeset
:
3233 struct codeset
*cs
= (struct codeset
*)tag
->ti_Data
;
// append the supplied codeset to the list
3235 AddTail((struct List
*)csList
, (struct Node
*)&cs
->node
);
3248 /// CodesetsListRemoveA()
// Removes every codeset passed in via a CSA_SourceCodeset tag from the list
// it is linked into. A codeset that is found in the library's internal list
// (CodesetsBase->codesets) is not treated as an external node; the code warns
// about attempts to remove an internal codeset.
// CSA_FreeCodesets (default TRUE) controls whether the name, alt_name and
// characterization strings of a removed codeset and the codeset itself are
// freed as well. The BOOL result is initialized to FALSE.
3249 BOOL LIBFUNC
CodesetsListRemoveA(REG(a0
, struct TagItem
*attrs
))
3251 BOOL result
= FALSE
;
3252 struct TagItem
*tstate
= attrs
;
3253 struct TagItem
*tag
;
3258 // check if the caller wants us also to free the codesets
3259 freeCodesets
= (BOOL
)GetTagData(CSA_FreeCodesets
, TRUE
, attrs
);
3261 // now we iterate through our tagItems and see what the
3262 // user wants to remove from the list
3263 while((tag
= NextTagItem((APTR
)&tstate
)) != NULL
)
3267 case CSA_SourceCodeset
:
3269 struct codeset
*removeCS
= (struct codeset
*)tag
->ti_Data
;
3271 if(removeCS
!= NULL
)
// assume an external node until it is found in the internal list
3274 BOOL isExternalNode
= TRUE
;
// the internal list is walked while holding the library semaphore
3276 ObtainSemaphore(&CodesetsBase
->libSem
);
3278 // iterate over our internal list and check whether the given
3279 // node is part of that list
3280 for(node
= GetHead((struct List
*)&CodesetsBase
->codesets
); node
!= NULL
; node
= GetSucc(node
))
3282 if((struct codeset
*)node
== removeCS
)
3284 isExternalNode
= FALSE
;
3289 ReleaseSemaphore(&CodesetsBase
->libSem
);
3291 if(isExternalNode
== TRUE
)
// unlink the codeset from the list it is part of
3293 Remove((struct Node
*)removeCS
);
3295 // free all codesets data if requested
3296 if(freeCodesets
== TRUE
)
3298 if(removeCS
->name
!= NULL
)
3299 freeArbitrateVecPooled(removeCS
->name
);
3300 if(removeCS
->alt_name
!= NULL
)
3301 freeArbitrateVecPooled(removeCS
->alt_name
);
3302 if(removeCS
->characterization
!= NULL
)
3303 freeArbitrateVecPooled(removeCS
->characterization
);
3305 freeArbitrateVecPooled(removeCS
);
3311 W(DBF_ALWAYS
, "user tried to remove an internal codeset!");
3324 /**************************************************************************/