workbench/libs/codesets/src/codesets.c

   1 /***************************************************************************
   2
   3  codesets.library - Amiga shared library for handling different codesets
   4  Copyright (C) 2001-2005 by Alfonso [alfie] Ranieri <alforan@tin.it>.
   5  Copyright (C) 2005-2013 by codesets.library Open Source Team
   6
   7  This library is free software; you can redistribute it and/or
   8  modify it under the terms of the GNU Lesser General Public
   9  License as published by the Free Software Foundation; either
  10  version 2.1 of the License, or (at your option) any later version.
  11
  12  This library is distributed in the hope that it will be useful,
  13  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  Lesser General Public License for more details.
  16
  17  codesets.library project: http://sourceforge.net/projects/codesetslib/
  18
  19  Most of the code included in this file was relicensed from GPL to LGPL
  20  from the source code of SimpleMail (http://www.sf.net/projects/simplemail)
  21  with full permissions by its authors.
  22
  23  $Id$
  24
  25 ***************************************************************************/
  26
  27 #include "lib.h"
  28
  29 #include <clib/alib_protos.h>
  30
  31 #include <diskfont/glyph.h>
  32 #include <diskfont/diskfonttag.h>
  33 #include <proto/diskfont.h>
  34 #include <ctype.h>
  35 #include <limits.h>
  36
  37 #ifdef __MORPHOS__
  38 #include <proto/keymap.h>
  39 #include <proto/locale.h>
  40 #endif
  41
  42 #include "codesets_table.h"
  43 #include "convertUTF.h"
  44 #include "codepages.h"
  45
  46 #include "SDI_stdarg.h"
  47
  48 #include "debug.h"
  49
  50 /**************************************************************************/
  51
   52 // A union used for converting between differently typed pointer-to-pointer
   53 // values without a direct cast, which avoids the annoying "dereferencing type
      // punned pointer will break strict aliasing rules" warnings of GCC4+.
      // Every member is a pointer to a pointer; only the pointee type differs.
   54 union TypeAliases
   55 {
   56   void **voidptr;           // untyped view
   57   char **schar;             // plain char string
   58   unsigned char **uchar;    // unsigned char string
   59   STRPTR *strptr;           // STRPTR string
   60   UTF8 **utf8;              // writable UTF8 data
   61   const UTF8 **cutf8;       // read-only UTF8 data
   62   UTF16 **utf16;            // writable UTF16 data
   63   const UTF16 **cutf16;     // read-only UTF16 data
   64   UTF32 **utf32;            // writable UTF32 data
   65   const UTF32 **cutf32;     // read-only UTF32 data
   66 };
  67
  68 /// BIN_SEARCH()
  69 // search a sorted array in O(log n) e.g.
  70 // BIN_SEARCH(strings,0,sizeof(strings)/sizeof(strings[0]),strcmp(key,array[mid]),res);
  71 #define BIN_SEARCH(array,low,high,compare,result) \
  72   {\
  73     int l = low;\
  74     int h = high;\
  75     int m = (low+high)/2;\
  76     result = NULL;\
  77     while (l<=h)\
  78     {\
  79       int d = compare;\
  80       if (!d){ result = &array[m]; break; }\
  81       if (d < 0) h = m - 1;\
  82       else l = m + 1;\
  83       m = (l + h)/2;\
  84     }\
  85   }
  86
  87 ///
  88 /// mystrdup()
  89 static STRPTR mystrdup(const char *str)
  90 {
  91   STRPTR newStr = NULL;
  92
  93   ENTER();
  94
  95   if(str != NULL)
  96   {
  97     int len;
  98
  99     if((len = strlen(str)) > 0)
 100     {
 101       if((newStr = allocArbitrateVecPooled(len+1)) != NULL)
 102         strlcpy(newStr, str, len+1);
 103     }
 104   }
 105
 106   RETURN(newStr);
 107   return newStr;
 108 }
 109
 110 ///
 111 /// mystrndup()
 112 static STRPTR mystrndup(const char *str1, int n)
 113 {
 114   STRPTR dest;
 115
 116   ENTER();
 117
 118   if((dest = allocArbitrateVecPooled(n+1)) != NULL)
 119   {
 120     if(str1 != NULL)
 121       strlcpy(dest, str1, n+1);
 122     else
 123       dest[0] = '\0';
 124
 125     dest[n] = '\0';
 126   }
 127
 128   RETURN(dest);
 129   return dest;
 130 }
 131
 132 ///
 133 /// readLine()
 134 static BOOL readLine(BPTR fh, char *buf, ULONG size)
 135 {
 136   BOOL success = FALSE;
 137   char *c;
 138
 139   ENTER();
 140
 141   if((c = FGets(fh, buf, size)) != NULL)
 142   {
 143     // we succeeded in reading something
 144     success = TRUE;
 145
 146     // now find the end of the line and strip the LF/CR character
 147     for(; *c; c++)
 148     {
 149       if(*c == '\n' || *c == '\r')
 150       {
 151         *c = '\0';
 152         break;
 153       }
 154     }
 155   }
 156
 157   RETURN(success);
 158   return success;
 159 }
 160
 161 ///
 162 /// getConfigItem()
 163 static const char *getConfigItem(const char *buf, const char *item)
 164 {
 165   const char *configItem = NULL;
 166   int len;
 167
 168   ENTER();
 169
 170   len = strlen(item);
 171
 172   if(strnicmp(buf, item, len) == 0)
 173   {
 174     char c;
 175
 176     buf += len;
 177
 178     // skip spaces
 179     while((c = *buf) != '\0' && isspace(c))
 180       buf++;
 181
 182     if(*buf == '=')
 183     {
 184       buf++;
 185
 186       // skip spaces
 187       while((c = *buf) != '\0'  && isspace(c))
 188         buf++;
 189
 190       configItem = buf;
 191     }
 192   }
 193
 194   RETURN(configItem);
 195   return configItem;
 196 }
 197
 198 ///
 199 /// parseUtf8()
 200 static int parseUtf8(STRPTR *ps)
 201 {
 202   STRPTR s = *ps;
 203   int wc, n, i;
 204
 205   ENTER();
 206
 207   if(*s<0x80)
 208   {
 209     *ps = s+1;
 210
 211     RETURN(*s);
 212     return *s;
 213   }
 214
 215   if(*s<0xc2)
 216   {
 217     RETURN(-1);
 218     return -1;
 219   }
 220   else
 221   {
 222     if(*s<0xe0)
 223     {
 224       if((s[1] & 0xc0)!=0x80)
 225       {
 226         RETURN(-1);
 227         return -1;
 228       }
 229
 230       *ps = s+2;
 231
 232       RETURN(((s[0] & 0x1f)<<6) | (s[1] & 0x3f));
 233       return ((s[0] & 0x1f)<<6) | (s[1] & 0x3f);
 234     }
 235     else
 236     {
 237       if(*s<0xf0)
 238       {
 239         n = 3;
 240       }
 241       else
 242       {
 243         if(*s<0xf8)
 244         {
 245           n = 4;
 246         }
 247         else
 248         {
 249           if(*s<0xfc)
 250           {
 251             n = 5;
 252           }
 253           else
 254           {
 255             if(*s<0xfe)
 256             {
 257               n = 6;
 258             }
 259             else
 260             {
 261               RETURN(-1);
 262               return -1;
 263             }
 264           }
 265         }
 266       }
 267     }
 268   }
 269
 270   wc = *s++ & ((1<<(7-n))-1);
 271
 272   for(i = 1; i<n; i++)
 273   {
 274     if((*s & 0xc0) != 0x80)
 275     {
 276       RETURN(-1);
 277       return -1;
 278     }
 279
 280     wc = (wc << 6) | (*s++ & 0x3f);
 281   }
 282
 283   if(wc < (1 << (5 * n - 4)))
 284   {
 285     RETURN(-1);
 286     return -1;
 287   }
 288
 289   *ps = s;
 290
 291   RETURN(wc);
 292   return wc;
 293 }
 294
 295 ///
 296 /// countCodesets()
 297 static int countCodesets(struct codesetList *csList)
 298 {
 299   struct Node *node;
 300   int num = 0;
 301
 302   for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
 303     num++;
 304
 305   return num;
 306 }
 307
 308 ///
 309 /// mapUTF8toASCII()
  310 // in case some UTF8 sequences cannot be converted during CodesetsUTF8ToStrA(), this
  311 // function is used to replace these unknown sequences with lookalike characters that
  312 // still keep the text readable. For more replacements see
  313 // http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
 314 //
 315 // The conversion table in this function is partly borrowed from the awebcharset plugin
 316 // written by Frank Weber. See http://cvs.sunsite.dk/viewcvs.cgi/aweb/plugins/charset/awebcharset.c
 317 //
 318 struct UTF8Replacement
 319 {
 320   const char *utf8;     // the original UTF8 string we are going to replace
 321   const int utf8len;    // the length of the UTF8 string
 322   const char *rep;      // pointer to the replacement string
 323   const int replen;     // the length of the replacement string (minus for signalling an UTF8 string)
 324 };
 325
 326 static int compareUTF8Replacements(const void *p1, const void *p2)
 327 {
 328   struct UTF8Replacement *key = (struct UTF8Replacement *)p1;
 329   struct UTF8Replacement *rep = (struct UTF8Replacement *)p2;
 330   int cmp;
 331
 332   // compare the length first, after that compare the strings
 333   cmp = key->utf8len - rep->utf8len;
 334   if(cmp == 0)
 335     cmp = memcmp(key->utf8, rep->utf8, key->utf8len);
 336
 337   return cmp;
 338 }
 339
  // Looks up a replacement text for a single UTF8 sequence.
  //
  //   dst     - receives a pointer to the replacement text, or NULL if the
  //             sequence is not part of the table
  //   src     - the UTF8 sequence to look up
  //   utf8len - the number of bytes of that sequence
  //
  // Returns the replen value of the matching table row, or 0 if there is none.
  // The value is negative for rows whose replacement is an UTF8 string itself.
  340 static int mapUTF8toASCII(const char **dst, const unsigned char *src, const int utf8len)
  341 {
  342   int len = 0;
        // search key; only its utf8/utf8len members are read by the compare function
  343   struct UTF8Replacement key = { (char *)src, utf8len, NULL, 0 };
  344   struct UTF8Replacement *rep;
  345
        // NOTE: the rows must stay sorted by sequence length first and bytewise
        // second, because this is the order bsearch() relies on below
  346   static struct UTF8Replacement const utf8map[] =
  347   {
  348     // U+0100 ... U+017F (Latin Extended-A)
  349     { "\xC4\x80", 2, "A",         1 }, // U+0100 -> A       (LATIN CAPITAL LETTER A WITH MACRON)
  350     { "\xC4\x81", 2, "a",         1 }, // U+0101 -> a       (LATIN SMALL LETTER A WITH MACRON)
  351     { "\xC4\x82", 2, "A",         1 }, // U+0102 -> A       (LATIN CAPITAL LETTER A WITH BREVE)
  352     { "\xC4\x83", 2, "a",         1 }, // U+0103 -> a       (LATIN SMALL LETTER A WITH BREVE)
  353     { "\xC4\x84", 2, "A",         1 }, // U+0104 -> A       (LATIN CAPITAL LETTER A WITH OGONEK)
  354     { "\xC4\x85", 2, "a",         1 }, // U+0105 -> a       (LATIN SMALL LETTER A WITH OGONEK)
  355     { "\xC4\x86", 2, "C",         1 }, // U+0106 -> C       (LATIN CAPITAL LETTER C WITH ACUTE)
  356     { "\xC4\x87", 2, "c",         1 }, // U+0107 -> c       (LATIN SMALL LETTER C WITH ACUTE)
  357     { "\xC4\x88", 2, "C",         1 }, // U+0108 -> C       (LATIN CAPITAL LETTER C WITH CIRCUMFLEX)
  358     { "\xC4\x89", 2, "c",         1 }, // U+0109 -> c       (LATIN SMALL LETTER C WITH CIRCUMFLEX)
  359     { "\xC4\x8A", 2, "C",         1 }, // U+010A -> C       (LATIN CAPITAL LETTER C WITH DOT ABOVE)
  360     { "\xC4\x8B", 2, "c",         1 }, // U+010B -> c       (LATIN SMALL LETTER C WITH DOT ABOVE)
  361     { "\xC4\x8C", 2, "C",         1 }, // U+010C -> C       (LATIN CAPITAL LETTER C WITH CARON)
  362     { "\xC4\x8D", 2, "c",         1 }, // U+010D -> c       (LATIN SMALL LETTER C WITH CARON)
  363     { "\xC4\x8E", 2, "D",         1 }, // U+010E -> D       (LATIN CAPITAL LETTER D WITH CARON)
  364     { "\xC4\x8F", 2, "d",         1 }, // U+010F -> d       (LATIN SMALL LETTER D WITH CARON)
  365     { "\xC4\x90", 2, "D",         1 }, // U+0110 -> D       (LATIN CAPITAL LETTER D WITH STROKE)
  366     { "\xC4\x91", 2, "d",         1 }, // U+0111 -> d       (LATIN SMALL LETTER D WITH STROKE)
  367     { "\xC4\x92", 2, "E",         1 }, // U+0112 -> E       (LATIN CAPITAL LETTER E WITH MACRON)
  368     { "\xC4\x93", 2, "e",         1 }, // U+0113 -> e       (LATIN SMALL LETTER E WITH MACRON)
  369     { "\xC4\x94", 2, "E",         1 }, // U+0114 -> E       (LATIN CAPITAL LETTER E WITH BREVE)
  370     { "\xC4\x95", 2, "e",         1 }, // U+0115 -> e       (LATIN SMALL LETTER E WITH BREVE)
  371     { "\xC4\x96", 2, "E",         1 }, // U+0116 -> E       (LATIN CAPITAL LETTER E WITH DOT ABOVE)
  372     { "\xC4\x97", 2, "e",         1 }, // U+0117 -> e       (LATIN SMALL LETTER E WITH DOT ABOVE)
  373     { "\xC4\x98", 2, "E",         1 }, // U+0118 -> E       (LATIN CAPITAL LETTER E WITH OGONEK)
  374     { "\xC4\x99", 2, "e",         1 }, // U+0119 -> e       (LATIN SMALL LETTER E WITH OGONEK)
  375     { "\xC4\x9A", 2, "E",         1 }, // U+011A -> E       (LATIN CAPITAL LETTER E WITH CARON)
  376     { "\xC4\x9B", 2, "e",         1 }, // U+011B -> e       (LATIN SMALL LETTER E WITH CARON)
  377     { "\xC4\x9C", 2, "G",         1 }, // U+011C -> G       (LATIN CAPITAL LETTER G WITH CIRCUMFLEX)
  378     { "\xC4\x9D", 2, "g",         1 }, // U+011D -> g       (LATIN SMALL LETTER G WITH CIRCUMFLEX)
  379     { "\xC4\x9E", 2, "G",         1 }, // U+011E -> G       (LATIN CAPITAL LETTER G WITH BREVE)
  380     { "\xC4\x9F", 2, "g",         1 }, // U+011F -> g       (LATIN SMALL LETTER G WITH BREVE)
  381     { "\xC4\xA0", 2, "G",         1 }, // U+0120 -> G       (LATIN CAPITAL LETTER G WITH DOT ABOVE)
  382     { "\xC4\xA1", 2, "g",         1 }, // U+0121 -> g       (LATIN SMALL LETTER G WITH DOT ABOVE)
  383     { "\xC4\xA2", 2, "G",         1 }, // U+0122 -> G       (LATIN CAPITAL LETTER G WITH CEDILLA)
  384     { "\xC4\xA3", 2, "g",         1 }, // U+0123 -> g       (LATIN SMALL LETTER G WITH CEDILLA)
  385     { "\xC4\xA4", 2, "H",         1 }, // U+0124 -> H       (LATIN CAPITAL LETTER H WITH CIRCUMFLEX)
  386     { "\xC4\xA5", 2, "h",         1 }, // U+0125 -> h       (LATIN SMALL LETTER H WITH CIRCUMFLEX)
  387     { "\xC4\xA6", 2, "H",         1 }, // U+0126 -> H       (LATIN CAPITAL LETTER H WITH STROKE)
  388     { "\xC4\xA7", 2, "h",         1 }, // U+0127 -> h       (LATIN SMALL LETTER H WITH STROKE)
  389     { "\xC4\xA8", 2, "I",         1 }, // U+0128 -> I       (LATIN CAPITAL LETTER I WITH TILDE)
  390     { "\xC4\xA9", 2, "i",         1 }, // U+0129 -> i       (LATIN SMALL LETTER I WITH TILDE)
  391     { "\xC4\xAA", 2, "I",         1 }, // U+012A -> I       (LATIN CAPITAL LETTER I WITH MACRON)
  392     { "\xC4\xAB", 2, "i",         1 }, // U+012B -> i       (LATIN SMALL LETTER I WITH MACRON)
  393     { "\xC4\xAC", 2, "I",         1 }, // U+012C -> I       (LATIN CAPITAL LETTER I WITH BREVE)
  394     { "\xC4\xAD", 2, "i",         1 }, // U+012D -> i       (LATIN SMALL LETTER I WITH BREVE)
  395     { "\xC4\xAE", 2, "I",         1 }, // U+012E -> I       (LATIN CAPITAL LETTER I WITH OGONEK)
  396     { "\xC4\xAF", 2, "i",         1 }, // U+012F -> i       (LATIN SMALL LETTER I WITH OGONEK)
  397     { "\xC4\xB0", 2, "I",         1 }, // U+0130 -> I       (LATIN CAPITAL LETTER I WITH DOT ABOVE)
  398     { "\xC4\xB1", 2, "i",         1 }, // U+0131 -> i       (LATIN SMALL LETTER DOTLESS I)
  399     { "\xC4\xB2", 2, "Ij",        2 }, // U+0132 -> Ij      (LATIN CAPITAL LIGATURE IJ)
  400     { "\xC4\xB3", 2, "ij",        2 }, // U+0133 -> ij      (LATIN SMALL LIGATURE IJ)
  401     { "\xC4\xB4", 2, "J",         1 }, // U+0134 -> J       (LATIN CAPITAL LETTER J WITH CIRCUMFLEX)
  402     { "\xC4\xB5", 2, "j",         1 }, // U+0135 -> j       (LATIN SMALL LETTER J WITH CIRCUMFLEX)
  403     { "\xC4\xB6", 2, "K",         1 }, // U+0136 -> K       (LATIN CAPITAL LETTER K WITH CEDILLA)
  404     { "\xC4\xB7", 2, "k",         1 }, // U+0137 -> k       (LATIN SMALL LETTER K WITH CEDILLA)
  405     { "\xC4\xB8", 2, "k",         1 }, // U+0138 -> k       (LATIN SMALL LETTER KRA)
  406     { "\xC4\xB9", 2, "L",         1 }, // U+0139 -> L       (LATIN CAPITAL LETTER L WITH ACUTE)
  407     { "\xC4\xBA", 2, "l",         1 }, // U+013A -> l       (LATIN SMALL LETTER L WITH ACUTE)
  408     { "\xC4\xBB", 2, "L",         1 }, // U+013B -> L       (LATIN CAPITAL LETTER L WITH CEDILLA)
  409     { "\xC4\xBC", 2, "l",         1 }, // U+013C -> l       (LATIN SMALL LETTER L WITH CEDILLA)
  410     { "\xC4\xBD", 2, "L",         1 }, // U+013D -> L       (LATIN CAPITAL LETTER L WITH CARON)
  411     { "\xC4\xBE", 2, "l",         1 }, // U+013E -> l       (LATIN SMALL LETTER L WITH CARON)
  412     { "\xC4\xBF", 2, "L",         1 }, // U+013F -> L       (LATIN CAPITAL LETTER L WITH MIDDLE DOT)
  413     { "\xC5\x80", 2, "l",         1 }, // U+0140 -> l       (LATIN SMALL LETTER L WITH MIDDLE DOT)
  414     { "\xC5\x81", 2, "L",         1 }, // U+0141 -> L       (LATIN CAPITAL LETTER L WITH STROKE)
  415     { "\xC5\x82", 2, "l",         1 }, // U+0142 -> l       (LATIN SMALL LETTER L WITH STROKE)
  416     { "\xC5\x83", 2, "N",         1 }, // U+0143 -> N       (LATIN CAPITAL LETTER N WITH ACUTE)
  417     { "\xC5\x84", 2, "n",         1 }, // U+0144 -> n       (LATIN SMALL LETTER N WITH ACUTE)
  418     { "\xC5\x85", 2, "N",         1 }, // U+0145 -> N       (LATIN CAPITAL LETTER N WITH CEDILLA)
  419     { "\xC5\x86", 2, "n",         1 }, // U+0146 -> n       (LATIN SMALL LETTER N WITH CEDILLA)
  420     { "\xC5\x87", 2, "N",         1 }, // U+0147 -> N       (LATIN CAPITAL LETTER N WITH CARON)
  421     { "\xC5\x88", 2, "n",         1 }, // U+0148 -> n       (LATIN SMALL LETTER N WITH CARON)
  422     { "\xC5\x89", 2, "'n",        2 }, // U+0149 -> 'n      (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE)
  423     { "\xC5\x8A", 2, "Ng",        2 }, // U+014A -> Ng      (LATIN CAPITAL LETTER ENG)
  424     { "\xC5\x8B", 2, "ng",        2 }, // U+014B -> ng      (LATIN SMALL LETTER ENG)
  425     { "\xC5\x8C", 2, "O",         1 }, // U+014C -> O       (LATIN CAPITAL LETTER O WITH MACRON)
  426     { "\xC5\x8D", 2, "o",         1 }, // U+014D -> o       (LATIN SMALL LETTER O WITH MACRON)
  427     { "\xC5\x8E", 2, "O",         1 }, // U+014E -> O       (LATIN CAPITAL LETTER O WITH BREVE)
  428     { "\xC5\x8F", 2, "o",         1 }, // U+014F -> o       (LATIN SMALL LETTER O WITH BREVE)
  429     { "\xC5\x90", 2, "O",         1 }, // U+0150 -> O       (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE)
  430     { "\xC5\x91", 2, "o",         1 }, // U+0151 -> o       (LATIN SMALL LETTER O WITH DOUBLE ACUTE)
  431     { "\xC5\x92", 2, "Oe",        2 }, // U+0152 -> Oe      (LATIN CAPITAL LIGATURE OE)
  432     { "\xC5\x93", 2, "oe",        2 }, // U+0153 -> oe      (LATIN SMALL LIGATURE OE)
  433     { "\xC5\x94", 2, "R",         1 }, // U+0154 -> R       (LATIN CAPITAL LETTER R WITH ACUTE)
  434     { "\xC5\x95", 2, "r",         1 }, // U+0155 -> r       (LATIN SMALL LETTER R WITH ACUTE)
  435     { "\xC5\x96", 2, "R",         1 }, // U+0156 -> R       (LATIN CAPITAL LETTER R WITH CEDILLA)
  436     { "\xC5\x97", 2, "r",         1 }, // U+0157 -> r       (LATIN SMALL LETTER R WITH CEDILLA)
  437     { "\xC5\x98", 2, "R",         1 }, // U+0158 -> R       (LATIN CAPITAL LETTER R WITH CARON)
  438     { "\xC5\x99", 2, "r",         1 }, // U+0159 -> r       (LATIN SMALL LETTER R WITH CARON)
  439     { "\xC5\x9A", 2, "S",         1 }, // U+015A -> S       (LATIN CAPITAL LETTER S WITH ACUTE)
  440     { "\xC5\x9B", 2, "s",         1 }, // U+015B -> s       (LATIN SMALL LETTER S WITH ACUTE)
  441     { "\xC5\x9C", 2, "S",         1 }, // U+015C -> S       (LATIN CAPITAL LETTER S WITH CIRCUMFLEX)
  442     { "\xC5\x9D", 2, "s",         1 }, // U+015D -> s       (LATIN SMALL LETTER S WITH CIRCUMFLEX)
  443     { "\xC5\x9E", 2, "S",         1 }, // U+015E -> S       (LATIN CAPITAL LETTER S WITH CEDILLA)
  444     { "\xC5\x9F", 2, "s",         1 }, // U+015F -> s       (LATIN SMALL LETTER S WITH CEDILLA)
  445     { "\xC5\xA0", 2, "S",         1 }, // U+0160 -> S       (LATIN CAPITAL LETTER S WITH CARON)
  446     { "\xC5\xA1", 2, "s",         1 }, // U+0161 -> s       (LATIN SMALL LETTER S WITH CARON)
  447     { "\xC5\xA2", 2, "T",         1 }, // U+0162 -> T       (LATIN CAPITAL LETTER T WITH CEDILLA)
  448     { "\xC5\xA3", 2, "t",         1 }, // U+0163 -> t       (LATIN SMALL LETTER T WITH CEDILLA)
  449     { "\xC5\xA4", 2, "T",         1 }, // U+0164 -> T       (LATIN CAPITAL LETTER T WITH CARON)
  450     { "\xC5\xA5", 2, "t",         1 }, // U+0165 -> t       (LATIN SMALL LETTER T WITH CARON)
  451     { "\xC5\xA6", 2, "T",         1 }, // U+0166 -> T       (LATIN CAPITAL LETTER T WITH STROKE)
  452     { "\xC5\xA7", 2, "t",         1 }, // U+0167 -> t       (LATIN SMALL LETTER T WITH STROKE)
  453     { "\xC5\xA8", 2, "U",         1 }, // U+0168 -> U       (LATIN CAPITAL LETTER U WITH TILDE)
  454     { "\xC5\xA9", 2, "u",         1 }, // U+0169 -> u       (LATIN SMALL LETTER U WITH TILDE)
  455     { "\xC5\xAA", 2, "U",         1 }, // U+016A -> U       (LATIN CAPITAL LETTER U WITH MACRON)
  456     { "\xC5\xAB", 2, "u",         1 }, // U+016B -> u       (LATIN SMALL LETTER U WITH MACRON)
  457     { "\xC5\xAC", 2, "U",         1 }, // U+016C -> U       (LATIN CAPITAL LETTER U WITH BREVE)
  458     { "\xC5\xAD", 2, "u",         1 }, // U+016D -> u       (LATIN SMALL LETTER U WITH BREVE)
  459     { "\xC5\xAE", 2, "U",         1 }, // U+016E -> U       (LATIN CAPITAL LETTER U WITH RING ABOVE)
  460     { "\xC5\xAF", 2, "u",         1 }, // U+016F -> u       (LATIN SMALL LETTER U WITH RING ABOVE)
  461     { "\xC5\xB0", 2, "U",         1 }, // U+0170 -> U       (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE)
  462     { "\xC5\xB1", 2, "u",         1 }, // U+0171 -> u       (LATIN SMALL LETTER U WITH DOUBLE ACUTE)
  463     { "\xC5\xB2", 2, "U",         1 }, // U+0172 -> U       (LATIN CAPITAL LETTER U WITH OGONEK)
  464     { "\xC5\xB3", 2, "u",         1 }, // U+0173 -> u       (LATIN SMALL LETTER U WITH OGONEK)
  465     { "\xC5\xB4", 2, "W",         1 }, // U+0174 -> W       (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
  466     { "\xC5\xB5", 2, "w",         1 }, // U+0175 -> w       (LATIN SMALL LETTER W WITH CIRCUMFLEX)
  467     { "\xC5\xB6", 2, "Y",         1 }, // U+0176 -> Y       (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX)
  468     { "\xC5\xB7", 2, "y",         1 }, // U+0177 -> y       (LATIN SMALL LETTER Y WITH CIRCUMFLEX)
  469     { "\xC5\xB8", 2, "Y",         1 }, // U+0178 -> Y       (LATIN CAPITAL LETTER Y WITH DIAERESIS)
  470     { "\xC5\xB9", 2, "Z",         1 }, // U+0179 -> Z       (LATIN CAPITAL LETTER Z WITH ACUTE)
  471     { "\xC5\xBA", 2, "z",         1 }, // U+017A -> z       (LATIN SMALL LETTER Z WITH ACUTE)
  472     { "\xC5\xBB", 2, "Z",         1 }, // U+017B -> Z       (LATIN CAPITAL LETTER Z WITH DOT ABOVE)
  473     { "\xC5\xBC", 2, "z",         1 }, // U+017C -> z       (LATIN SMALL LETTER Z WITH DOT ABOVE)
  474     { "\xC5\xBD", 2, "Z",         1 }, // U+017D -> Z       (LATIN CAPITAL LETTER Z WITH CARON)
  475     { "\xC5\xBE", 2, "z",         1 }, // U+017E -> z       (LATIN SMALL LETTER Z WITH CARON)
  476     { "\xC5\xBF", 2, "s",         1 }, // U+017F -> s       (LATIN SMALL LETTER LONG S)
  477
  478     // U+2000 ... U+206F (General Punctuation)
  479     { "\xE2\x80\x90", 3, "-",         1 }, // U+2010 -> -       (HYPHEN)
  480     { "\xE2\x80\x91", 3, "-",         1 }, // U+2011 -> -       (NON-BREAKING HYPHEN)
  481     { "\xE2\x80\x92", 3, "--",        2 }, // U+2012 -> --      (FIGURE DASH)
  482     { "\xE2\x80\x93", 3, "--",        2 }, // U+2013 -> --      (EN DASH)
  483     { "\xE2\x80\x94", 3, "---",       3 }, // U+2014 -> ---     (EM DASH)
  484     { "\xE2\x80\x95", 3, "---",       3 }, // U+2015 -> ---     (HORIZONTAL BAR)
  485     { "\xE2\x80\x96", 3, "||",        2 }, // U+2016 -> ||      (DOUBLE VERTICAL LINE)
  486     { "\xE2\x80\x97", 3, "_",         1 }, // U+2017 -> _       (DOUBLE LOW LINE)
  487     { "\xE2\x80\x98", 3, "`",         1 }, // U+2018 -> `       (LEFT SINGLE QUOTATION MARK)
  488     { "\xE2\x80\x99", 3, "'",         1 }, // U+2019 -> '       (RIGHT SINGLE QUOTATION MARK)
  489     { "\xE2\x80\x9A", 3, ",",         1 }, // U+201A -> ,       (SINGLE LOW-9 QUOTATION MARK)
  490     { "\xE2\x80\x9B", 3, "'",         1 }, // U+201B -> '       (SINGLE HIGH-REVERSED-9 QUOTATION MARK)
  491     { "\xE2\x80\x9C", 3, "\"",        1 }, // U+201C -> "       (LEFT DOUBLE QUOTATION MARK)
  492     { "\xE2\x80\x9D", 3, "\"",        1 }, // U+201D -> "       (RIGHT DOUBLE QUOTATION MARK)
  493     { "\xE2\x80\x9E", 3, ",,",        2 }, // U+201E -> ,,      (DOUBLE LOW-9 QUOTATION MARK)
  494     { "\xE2\x80\x9F", 3, "``",        2 }, // U+201F -> ``      (DOUBLE HIGH-REVERSED-9 QUOTATION MARK)
  495     { "\xE2\x80\xA0", 3, "+",         1 }, // U+2020 -> +       (DAGGER)
  496     { "\xE2\x80\xA1", 3, "+",         1 }, // U+2021 -> +       (DOUBLE DAGGER)
  497     { "\xE2\x80\xA2", 3, "\xC2\xB7", -2 }, // U+2022 -> U+00B7  (BULLET) -> (MIDDLE POINT)
  498     { "\xE2\x80\xA3", 3, ".",         1 }, // U+2023 -> .       (TRIANGULAR BULLET)
  499     { "\xE2\x80\xA4", 3, ".",         1 }, // U+2024 -> .       (ONE DOT LEADER)
  500     { "\xE2\x80\xA5", 3, "..",        2 }, // U+2025 -> ..      (TWO DOT LEADER)
  501     { "\xE2\x80\xA6", 3, "...",       3 }, // U+2026 -> ...     (HORIZONTAL ELLIPSIS)
  502     { "\xE2\x80\xA7", 3, "\xC2\xB7", -2 }, // U+2027 -> U+00B7  (HYPHENATION POINT) -> (MIDDLE POINT)
  503     { "\xE2\x80\xB0", 3, "%.",        2 }, // U+2030 -> %.      (PER MILLE SIGN)
  504     { "\xE2\x80\xB1", 3, "%..",       3 }, // U+2031 -> %..     (PER TEN THOUSAND SIGN)
  505     { "\xE2\x80\xB2", 3, "'",         1 }, // U+2032 -> '       (PRIME)
  506     { "\xE2\x80\xB3", 3, "''",        2 }, // U+2033 -> ''      (DOUBLE PRIME)
  507     { "\xE2\x80\xB4", 3, "'''",       3 }, // U+2034 -> '''     (TRIPLE PRIME)
  508     { "\xE2\x80\xB5", 3, "`",         1 }, // U+2035 -> `       (REVERSED PRIME)
  509     { "\xE2\x80\xB6", 3, "``",        2 }, // U+2036 -> ``      (REVERSED DOUBLE PRIME)
  510     { "\xE2\x80\xB7", 3, "```",       3 }, // U+2037 -> ```     (REVERSED TRIPLE PRIME)
  511     { "\xE2\x80\xB8", 3, "^",         1 }, // U+2038 -> ^       (CARET)
  512     { "\xE2\x80\xB9", 3, "<",         1 }, // U+2039 -> <       (SINGLE LEFT-POINTING ANGLE QUOTATION MARK)
  513     { "\xE2\x80\xBA", 3, ">",         1 }, // U+203A -> >       (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK)
  514     { "\xE2\x80\xBB", 3, "\xC3\x97", -2 }, // U+203B -> U+00D7  (REFERENCE MARK) -> (MULTIPLICATION SIGN)
  515     { "\xE2\x80\xBC", 3, "!!",        2 }, // U+203C -> !!      (DOUBLE EXCLAMATION MARK)
  516     { "\xE2\x80\xBD", 3, "?",         1 }, // U+203D -> ?       (INTERROBANG)
  517     { "\xE2\x81\x82", 3, "*",         1 }, // U+2042 -> *       (ASTERISM)
  518     { "\xE2\x81\x83", 3, ".",         1 }, // U+2043 -> .       (HYPHEN BULLET)
  519     { "\xE2\x81\x84", 3, "/",         1 }, // U+2044 -> /       (FRACTION SLASH)
  520     { "\xE2\x81\x87", 3, "??",        2 }, // U+2047 -> ??      (DOUBLE QUESTION MARK)
  521     { "\xE2\x81\x88", 3, "?!",        2 }, // U+2048 -> ?!      (QUESTION EXCLAMATION MARK)
  522     { "\xE2\x81\x89", 3, "!?",        2 }, // U+2049 -> !?      (EXCLAMATION QUESTION MARK)
  523     { "\xE2\x81\x8E", 3, "*",         1 }, // U+204E -> *       (LOW ASTERISK)
  524     { "\xE2\x81\x8F", 3, ";",         1 }, // U+204F -> ;       (REVERSED SEMICOLON)
  525     { "\xE2\x81\x91", 3, "*",         1 }, // U+2051 -> *       (TWO ASTERISKS ALIGNED VERTICALLY)
  526     { "\xE2\x81\x92", 3, "-",         1 }, // U+2052 -> -       (COMMERCIAL MINUS SIGN)
  527     { "\xE2\x81\x93", 3, "~",         1 }, // U+2053 -> ~       (SWUNG DASH)
  528     { "\xE2\x81\x95", 3, "*",         1 }, // U+2055 -> *       (FLOWER PUNCTUATION MARK)
  529     { "\xE2\x81\x97", 3, "''''",      4 }, // U+2057 -> ''''    (QUADRUPLE PRIME)
  530     { "\xE2\x81\x9A", 3, ":",         1 }, // U+205A -> :       (TWO DOT PUNCTUATION)
  531     { "\xE2\x81\x9C", 3, "+",         1 }, // U+205C -> +       (DOTTED CROSS)
  532
  533     // U+20A0 ... U+20CF (Currency Symbols)
  534     { "\xE2\x82\xA0", 3, "ECU",       3 }, // U+20A0 -> ECU     (EURO-CURRENCY SIGN)
  535     { "\xE2\x82\xA1", 3, "CRC",       3 }, // U+20A1 -> CRC     (COLON SIGN)
  536     { "\xE2\x82\xA2", 3, "BRC",       3 }, // U+20A2 -> BRC     (CRUZEIRO SIGN)
  537     { "\xE2\x82\xA3", 3, "BEF",       3 }, // U+20A3 -> BEF     (FRENCH FRANC SIGN)
  538     { "\xE2\x82\xA4", 3, "ITL",       3 }, // U+20A4 -> ITL     (LIRA SIGN)
  539     { "\xE2\x82\xA6", 3, "NGN",       3 }, // U+20A6 -> NGN     (NAIRA SIGN)
  540     { "\xE2\x82\xA7", 3, "ESP",       3 }, // U+20A7 -> ESP     (PESETA SIGN)
  541     { "\xE2\x82\xA8", 3, "MVQ",       3 }, // U+20A8 -> MVQ     (RUPEE SIGN)
  542     { "\xE2\x82\xA9", 3, "KPW",       3 }, // U+20A9 -> KPW     (WON SIGN)
  543     { "\xE2\x82\xAA", 3, "ILS",       3 }, // U+20AA -> ILS     (NEW SHEQEL SIGN)
  544     { "\xE2\x82\xAB", 3, "VNC",       3 }, // U+20AB -> VNC     (DONG SIGN)
  545     { "\xE2\x82\xAC", 3, "EUR",       3 }, // U+20AC -> EUR     (EURO SIGN)
  546     { "\xE2\x82\xAD", 3, "LAK",       3 }, // U+20AD -> LAK     (KIP SIGN)
  547     { "\xE2\x82\xAE", 3, "MNT",       3 }, // U+20AE -> MNT     (TUGRIK SIGN)
  548     { "\xE2\x82\xAF", 3, "GRD",       3 }, // U+20AF -> GRD     (DRACHMA SIGN)
  549     { "\xE2\x82\xB0", 3, "Pf",        2 }, // U+20B0 -> Pf      (GERMAN PENNY SIGN)
  550     { "\xE2\x82\xB1", 3, "P",         1 }, // U+20B1 -> P       (PESO SIGN)
  551     { "\xE2\x82\xB2", 3, "PYG",       3 }, // U+20B2 -> PYG     (GUARANI SIGN)
  552     { "\xE2\x82\xB3", 3, "ARA",       3 }, // U+20B3 -> ARA     (AUSTRAL SIGN)
  553     { "\xE2\x82\xB4", 3, "UAH",       3 }, // U+20B4 -> UAH     (HRYVNIA SIGN)
  554     { "\xE2\x82\xB5", 3, "GHS",       3 }, // U+20B5 -> GHS     (CEDI SIGN)
  555
  556     // U+2190 ... U+21FF (Arrows)
  557     { "\xE2\x86\x90", 3, "<-",        2 }, // U+2190 -> <-      (LEFTWARDS ARROW)
  558     { "\xE2\x86\x92", 3, "->",        2 }, // U+2192 -> ->      (RIGHTWARDS ARROW)
  559   };
  560
  561   ENTER();
  562
  563   // start with no replacement string
  564   *dst = NULL;
  565
  566   // perform a binary search in the lookup table
  567   if((rep = bsearch(&key, utf8map, sizeof(utf8map) / sizeof(utf8map[0]), sizeof(utf8map[0]), compareUTF8Replacements)) != NULL)
  568   {
  569     // if we found something, then copy this over to the result variables
  570     *dst = rep->rep;
  571     len = rep->replen;
  572   }
  573
  574   RETURN(len);
  575   return len;
  576 }
 577
 578 ///
 579 /// matchCodesetAlias()
 580 //
 581 struct CodesetAliases
 582 {
 583   const char *MIMEname;   // The official and correct MIME name for a codeset
 584   const char *Aliases;    // A space separated array with well-known aliases
 585 };
 586
 587 const struct CodesetAliases codesetAliases[] =
 588 {
 589   // MIME name       Aliases
 590   { "Amiga-1251",   "Ami1251 Amiga1251"  },
 591   { "AmigaPL",      "AmiPL Amiga-PL"     },
 592   { "ISO-8859-1",   "ISO8859-1 8859-1" },
 593   { "ISO-8859-2",   "ISO8859-2 8859-2" },
 594   { "ISO-8859-3",   "ISO8859-3 8859-3" },
 595   { "ISO-8859-4",   "ISO8859-4 8859-4" },
 596   { "ISO-8859-5",   "ISO8859-5 8859-5" },
 597   { "ISO-8859-6",   "ISO8859-6 8859-6" },
 598   { "ISO-8859-7",   "ISO8859-7 8859-7" },
 599   { "ISO-8859-8",   "ISO8859-8 8859-8" },
 600   { "ISO-8859-9",   "ISO8859-9 8859-9" },
 601   { "ISO-8859-10",  "ISO8859-10 8859-10" },
 602   { "ISO-8859-11",  "ISO8859-11 8859-11" },
 603   { "ISO-8859-12",  "ISO8859-12 8859-12" },
 604   { "ISO-8859-13",  "ISO8859-13 8859-13" },
 605   { "ISO-8859-14",  "ISO8859-14 8859-14" },
 606   { "ISO-8859-15",  "ISO8859-15 8859-15" },
 607   { "ISO-8859-16",  "ISO8859-16 8859-16" },
 608   { "ISO-8859-10",  "ISO8859-10 8859-10" },
 609   { "KOI8-R",       "KOI8R" },
 610   { "US-ASCII",     "ASCII" },
 611   { "UTF-8",        "UTF8 UTF" },
 612   { "UTF-16",       "UTF16" },
 613   { "UTF-32",       "UTF32" },
 614   { "windows-1250", "cp1250 windows1250" },
 615   { "windows-1251", "cp1251 windows1251" },
 616   { "windows-1252", "cp1252 windows1252" },
 617   { "windows-1253", "cp1253 windows1253" },
 618   { "windows-1254", "cp1254 windows1254" },
 619   { "windows-1255", "cp1255 windows1255" },
 620   { "windows-1256", "cp1256 windows1256" },
 621   { "windows-1257", "cp1257 windows1257" },
 622   { NULL,           NULL,                }
 623 };
 624
 625 static const char *matchCodesetAlias(const char *search)
 626 {
 627   const char *result = NULL;
 628   size_t len = strlen(search);
 629   int i;
 630
 631   ENTER();
 632
 633   for(i=0; codesetAliases[i].MIMEname != NULL; i++)
 634   {
 635     BOOL found = FALSE;
 636
 637     // search the MIMEname first
 638     if(stricmp(search, codesetAliases[i].MIMEname) == 0)
 639       found = TRUE;
 640     else
 641     {
 642       const char *s = codesetAliases[i].Aliases;
 643
 644       // loop through space separated list of aliases
 645       while(s != NULL && *s != '\0')
 646       {
 647         if(strnicmp(search, s, len) == 0)
 648         {
 649           found = TRUE;
 650           break;
 651         }
 652
 653         if((s = strpbrk(s, " ")) != NULL)
 654           s++;
 655       }
 656     }
 657
 658     if(found == TRUE)
 659     {
 660       result = codesetAliases[i].MIMEname;
 661
 662       break;
 663     }
 664   }
 665
 666   RETURN(result);
 667   return result;
 668 }
 669
 670 ///
 671
 672 /**************************************************************************/
 673
 674 /// defaultCodeset()
 675 static struct codeset *defaultCodeset(BOOL useSemaphore)
 676 {
 677   char buf[256];
 678   struct codeset *codeset;
 679
 680   ENTER();
 681
 682   if(useSemaphore == TRUE)
 683     ObtainSemaphoreShared(&CodesetsBase->libSem);
 684
 685   buf[0] = '\0';
 686   GetVar("codeset_default" ,buf, sizeof(buf), GVF_GLOBAL_ONLY);
 687
 688   if(buf[0] == '\0' || (codeset = codesetsFind(&CodesetsBase->codesets, buf)) == NULL)
 689     codeset = CodesetsBase->systemCodeset;
 690
 691   if(useSemaphore == TRUE)
 692     ReleaseSemaphore(&CodesetsBase->libSem);
 693
 694   RETURN(codeset);
 695   return codeset;
 696 }
 697
 698 ///
 699 /// codesetsCmpUnicode()
 700 // The compare function
 701 static int codesetsCmpUnicode(const void *a1, const void *a2)
 702 {
 703   struct single_convert *arg1 = (struct single_convert *)a1;
 704   struct single_convert *arg2 = (struct single_convert *)a2;
 705
 706   return strcmp((char*)&arg1->utf8[1], (char*)&arg2->utf8[1]);
 707 }
 708
 709 ///
 710 /// codesetsReadTable()
 711
 712 #define ITEM_STANDARD           "Standard"
 713 #define ITEM_ALTSTANDARD        "AltStandard"
 714 #define ITEM_READONLY           "ReadOnly"
 715 #define ITEM_CHARACTERIZATION   "Characterization"
 716
 717 // Reads a coding table and adds it
 718 static BOOL codesetsReadTable(struct codesetList *csList, STRPTR name)
 719 {
 720   BPTR fh;
 721   BOOL res = FALSE;
 722
 723   ENTER();
 724
 725   D(DBF_STARTUP, "trying to read charset file '%s'...", name);
 726
 727   if((fh = Open(name, MODE_OLDFILE)) != (BPTR)NULL)
 728   {
 729     struct codeset *codeset;
 730
 731     if((codeset = (struct codeset *)allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
 732     {
 733       int i;
 734       char buf[512];
 735
 736       memset(codeset, 0, sizeof(*codeset));
 737
 738       for(i = 0; i<256; i++)
 739       {
 740         codeset->table[i].code = i;
 741         codeset->table[i].ucs4 = i;
 742       }
 743
 744       while(readLine(fh, buf, sizeof(buf)) == TRUE)
 745       {
 746         const char *result;
 747
 748         if(buf[0] != '#')
 749         {
 750           if((result = getConfigItem(buf, ITEM_STANDARD)) != NULL)
 751             codeset->name = mystrdup(result);
 752           else if(codeset->name == NULL) // a valid file starts with "Standard" and nothing else!!
 753             break;
 754           else if((result = getConfigItem(buf, ITEM_ALTSTANDARD)) != NULL)
 755             codeset->alt_name = mystrdup(result);
 756           else if((result = getConfigItem(buf, ITEM_READONLY)) != NULL)
 757             codeset->read_only = (atoi(result) == 0) ? 0 : 1;
 758           else if((result = getConfigItem(buf, ITEM_CHARACTERIZATION)) != NULL)
 759           {
 760             if(result[0] == '_' && result[1] == '(' && result[2] == '"')
 761             {
 762               char *end = strchr(result + 3, '"');
 763
 764               if(end != NULL)
 765                 codeset->characterization = mystrndup(result+3, end-(result+3));
 766             }
 767             else
 768               codeset->characterization = mystrdup(result);
 769           }
 770           else
 771           {
 772             char *p = buf;
 773             int fmt2 = 0;
 774
 775             if(*p == '=' || (fmt2 = ((*p=='0') || (*(p+1)=='x'))))
 776             {
 777               p++;
 778               p += fmt2;
 779
 780               i = strtol(p, &p, 16);
 781               if(i>0 && i<256)
 782               {
 783                 while(isspace(*p))
 784                   p++;
 785
 786                 if(strnicmp(p, "U+", 2) == 0)
 787                 {
 788                   p += 2;
 789                   codeset->table[i].ucs4 = strtol(p, &p, 16);
 790                 }
 791                 else if(*p != '#')
 792                 {
 793                   codeset->table[i].ucs4 = strtol(p, &p, 0);
 794                 }
 795               }
 796             }
 797           }
 798         }
 799       }
 800
 801       // check if there is not already codeset with the same name in here
 802       if(codeset->name != NULL && codesetsFind(csList, codeset->name) == NULL)
 803       {
 804         for(i=0; i<256; i++)
 805         {
 806           UTF32 src = codeset->table[i].ucs4;
 807           UTF32 *src_ptr = &src;
 808           UTF8 *dest_ptr = &codeset->table[i].utf8[1];
 809
 810           CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
 811           *dest_ptr = 0;
 812           codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)(&codeset->table[i].utf8[1]);
 813         }
 814
 815         memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
 816         qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
 817         D(DBF_STARTUP, "adding external codeset '%s'", codeset->name);
 818         AddTail((struct List *)csList, (struct Node *)&codeset->node);
 819
 820         res = TRUE;
 821       }
 822       else
 823       {
 824         // cleanup
 825         if(codeset->name != NULL)
 826           freeArbitrateVecPooled(codeset->name);
 827         if(codeset->alt_name != NULL)
 828           freeArbitrateVecPooled(codeset->alt_name);
 829         if(codeset->characterization != NULL)
 830           freeArbitrateVecPooled(codeset->characterization);
 831         freeArbitrateVecPooled(codeset);
 832       }
 833     }
 834
 835     Close(fh);
 836   }
 837
 838   RETURN(res);
 839   return res;
 840 }
 841 ///
 842 /// codesetsScanDir()
 843 static void codesetsScanDir(struct codesetList *csList, const char *dirPath)
 844 {
 845   ENTER();
 846
 847   if(dirPath != NULL && dirPath[0] != '\0')
 848   {
 849     #if defined(__amigaos4__)
 850     APTR dirContext;
 851
 852     if((dirContext = ObtainDirContextTags(EX_StringNameInput, dirPath,
 853                                           EX_DataFields,      EXF_NAME|EXF_TYPE,
 854                                           TAG_END)) != NULL)
 855     {
 856       struct ExamineData *exd;
 857
 858       D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
 859
 860       while((exd = ExamineDir(dirContext)) != NULL)
 861       {
 862         if(EXD_IS_FILE(exd))
 863         {
 864           char filePath[620];
 865
 866           strlcpy(filePath, dirPath, sizeof(filePath));
 867           AddPart(filePath, exd->Name, sizeof(filePath));
 868
 869           D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
 870
 871           codesetsReadTable(csList, filePath);
 872         }
 873       }
 874
 875       ReleaseDirContext(dirContext);
 876     }
 877     #else
 878     BPTR dirLock;
 879
 880     if((dirLock = Lock(dirPath, ACCESS_READ)))
 881     {
 882       struct ExAllControl *eac;
 883
 884       D(DBF_STARTUP, "scanning directory '%s' for codesets tables", dirPath);
 885
 886       if((eac = AllocDosObject(DOS_EXALLCONTROL, NULL)) != NULL)
 887       {
 888         struct ExAllData *ead;
 889         struct ExAllData *eabuffer;
 890         LONG more;
 891
 892         eac->eac_LastKey = 0;
 893         eac->eac_MatchString = NULL;
 894         eac->eac_MatchFunc = NULL;
 895
 896         if((eabuffer = allocVecPooled(CodesetsBase->pool, 10*sizeof(struct ExAllData))) != NULL)
 897         {
 898           char filePath[620];
 899
 900           do
 901           {
 902             more = ExAll(dirLock, eabuffer, 10*sizeof(struct ExAllData), ED_TYPE, eac);
 903             if(!more && IoErr() != ERROR_NO_MORE_ENTRIES)
 904               break;
 905
 906             if(eac->eac_Entries == 0)
 907               continue;
 908
 909             ead = (struct ExAllData *)eabuffer;
 910             do
 911             {
 912               // we only take that ead if it is a file (ed_Type < 0)
 913               if(ead->ed_Type < 0)
 914               {
 915                 strlcpy(filePath, dirPath, sizeof(filePath));
 916                 AddPart(filePath, (char *)ead->ed_Name, sizeof(filePath));
 917
 918                 D(DBF_STARTUP, "about to read codeset table '%s'", filePath);
 919
 920                 codesetsReadTable(csList, filePath);
 921               }
 922               ead = ead->ed_Next;
 923             }
 924             while(ead != NULL);
 925           }
 926           while(more);
 927
 928           freeVecPooled(CodesetsBase->pool, eabuffer);
 929         }
 930
 931         FreeDosObject(DOS_EXALLCONTROL, eac);
 932       }
 933
 934       UnLock(dirLock);
 935     }
 936     #endif
 937   }
 938
 939   LEAVE();
 940 }
 941
 942 ///
 943 /// codesetsInit()
 944 // Initialized and loads the codesets
 945 BOOL codesetsInit(struct codesetList *csList)
 946 {
 947   BOOL success = FALSE;
 948   struct codeset *codeset;
 949   UTF32 src;
 950   int i;
 951   #if defined(__amigaos4__)
 952   ULONG nextMIB = 3;
 953   #endif
 954
 955   ENTER();
 956
 957   NewList((struct List *)csList);
 958
 959   // to make the list of the supported codesets complete we also add fake
 960   // 'UTF-8', 'UTF-16' and 'UTF-32' only so that our users can query for those codesets as well.
 961   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 962     goto end;
 963
 964   memset(codeset, 0, sizeof(*codeset));
 965   codeset->name             = mystrdup("UTF-8");
 966   codeset->alt_name         = mystrdup("UTF8");
 967   codeset->characterization = mystrdup("Unicode");
 968   codeset->read_only        = 0;
 969   D(DBF_STARTUP, "adding internal codeset 'UTF-8'");
 970   AddTail((struct List *)csList, (struct Node *)&codeset->node);
 971   CodesetsBase->utf8Codeset = codeset;
 972
 973   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 974     goto end;
 975
 976   memset(codeset, 0, sizeof(*codeset));
 977   codeset->name             = mystrdup("UTF-16");
 978   codeset->alt_name         = mystrdup("UTF16");
 979   codeset->characterization = mystrdup("16-bit Unicode");
 980   codeset->read_only        = 0;
 981   D(DBF_STARTUP, "adding internal codeset 'UTF-16'");
 982   AddTail((struct List *)csList, (struct Node *)&codeset->node);
 983   CodesetsBase->utf16Codeset = codeset;
 984
 985   if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
 986     goto end;
 987
 988   memset(codeset, 0, sizeof(*codeset));
 989   codeset->name             = mystrdup("UTF-32");
 990   codeset->alt_name         = mystrdup("UTF32");
 991   codeset->characterization = mystrdup("32-bit Unicode");
 992   codeset->read_only        = 0;
 993   D(DBF_STARTUP, "adding internal codeset 'UTF-32'");
 994   AddTail((struct List *)csList, (struct Node *)&codeset->node);
 995   CodesetsBase->utf32Codeset = codeset;
 996
 997   // on AmigaOS4 we can use diskfont.library to inquire charset information as
 998   // it comes with a quite rich implementation of different charsets.
 999   #if defined(__amigaos4__)
1000   D(DBF_STARTUP, "OS4, asking diskfont.library for codesets");
1001   do
1002   {
1003     char *mimename;
1004     char *ianaName;
1005     ULONG *mapTable;
1006     ULONG curMIB = nextMIB;
1007
1008     nextMIB = ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NEXTNUMBER);
1009     if(nextMIB == 0)
1010       break;
1011
1012     mapTable = (ULONG *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MAPTABLE);
1013     mimename = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_MIMENAME);
1014     ianaName = (char *)ObtainCharsetInfo(DFCS_NUMBER, curMIB, DFCS_NAME);
1015     if(mapTable != NULL && mimename != NULL && codesetsFind(csList, mimename) == NULL)
1016     {
1017       D(DBF_STARTUP, "loading charset '%s' from diskfont.library...", mimename);
1018
1019       if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1020         goto end;
1021
1022       codeset->name             = mystrdup(mimename);
1023       codeset->alt_name         = NULL;
1024       codeset->characterization = mystrdup(ianaName);
1025       codeset->read_only        = 0;
1026
1027       for(i=0; i<256; i++)
1028       {
1029         UTF32 *src_ptr = &src;
1030         UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1031
1032         src = mapTable[i];
1033
1034         codeset->table[i].code = i;
1035         codeset->table[i].ucs4 = src;
1036         CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1037         *dest_ptr = 0;
1038         codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1039       }
1040
1041       memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1042       qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1043
1044       D(DBF_STARTUP, "adding diskfont.library codeset '%s'", codeset->name);
1045       AddTail((struct List *)csList, (struct Node *)&codeset->node);
1046     }
1047   }
1048   while(TRUE);
1049   #endif
1050
1051   #if defined(__MORPHOS__)
1052   {
1053     struct Library *KeymapBase;
1054     struct Library *LocaleBase;
1055     // assume success at first
1056     BOOL success = TRUE;
1057
1058     D(DBF_STARTUP, "MorphOS, asking keymap.library for codesets");
1059     if((KeymapBase = OpenLibrary("keymap.library", 51)) != NULL)
1060     {
1061       if((LocaleBase = OpenLibrary("locale.library", 51)) != NULL)
1062       {
1063         struct KeyMap *keymap = AskKeyMapDefault();
1064         // it doesn't matter if this call fails, as we don't depend on the system codesets
1065         CONST_STRPTR name = GetKeyMapCodepage(keymap);
1066
1067         // legacy keymaps dont have codepage or Unicode mappings
1068         if(name != NULL && keymap != NULL)
1069         {
1070           D(DBF_STARTUP, "loading charset '%s' from keymap.library...", name);
1071
1072           if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) != NULL)
1073           {
1074              codeset->name             = mystrdup(name);
1075              codeset->alt_name         = NULL;
1076              codeset->characterization = mystrdup(name);  // No further information available
1077              codeset->read_only        = 0;
1078
1079              for(i=0; i<256; i++)
1080              {
1081                UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1082                LONG rc;
1083
1084                codeset->table[i].code = i;
1085                codeset->table[i].ucs4 = src = ToUCS4(i, keymap);
1086
1087                // here we use UTF8_Encode() instead of ConvertUCS4ToUTF8() because
1088                // of an internal bug in MorphOS 2.2.
1089                rc = UTF8_Encode(src, dest_ptr);
1090                rc = rc > 0 ? rc : 1;
1091
1092                dest_ptr[rc] = '\0';
1093                codeset->table[i].utf8[0] = rc;
1094              }
1095
1096              memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1097              qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1098
1099              D(DBF_STARTUP, "adding keymap.library codeset '%s'", codeset->name);
1100              AddTail((struct List *)csList, (struct Node *)&codeset->node);
1101           }
1102           else
1103           {
1104             // only failed memory allocations are treated as error
1105             success = FALSE;
1106           }
1107         }
1108
1109         CloseLibrary(LocaleBase);
1110       }
1111
1112       CloseLibrary(KeymapBase);
1113     }
1114
1115     if(success == FALSE)
1116       goto end;
1117   }
1118   #endif
1119
1120   D(DBF_STARTUP, "loading charsets from LIBS:Charsets...");
1121
1122   // we try to walk to the LIBS:Charsets directory on our own and readin our
1123   // own charset tables
1124   codesetsScanDir(csList, "LIBS:Charsets");
1125
1126   //
1127   // now we go and initialize our internally supported codesets but only if
1128   // we have not already loaded a charset with the same name
1129   //
1130   D(DBF_STARTUP, "initializing internal charsets...");
1131
1132   // ISO-8859-1 + EURO
1133   if(codesetsFind(csList, "ISO-8859-1 + Euro") == NULL)
1134   {
1135     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1136       goto end;
1137
1138     codeset->name             = mystrdup("ISO-8859-1 + Euro");
1139     codeset->alt_name         = NULL;
1140     codeset->characterization = mystrdup("West European (with EURO)");
1141     codeset->read_only        = 1;
1142
1143     for(i = 0; i<256; i++)
1144     {
1145       UTF32 *src_ptr = &src;
1146       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1147
1148       if(i==164)
1149         src = 0x20AC; // the EURO sign
1150       else
1151         src = i;
1152
1153       codeset->table[i].code = i;
1154       codeset->table[i].ucs4 = src;
1155       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1156       *dest_ptr = 0;
1157       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1158     }
1159     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1160     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1161
1162     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1163     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1164   }
1165
1166   // ISO-8859-1
1167   if(codesetsFind(csList, "ISO-8859-1") == NULL)
1168   {
1169     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1170       goto end;
1171
1172     codeset->name             = mystrdup("ISO-8859-1");
1173     codeset->alt_name         = mystrdup("ISO8859-1");
1174     codeset->characterization = mystrdup("West European");
1175     codeset->read_only        = 0;
1176
1177     for(i = 0; i<256; i++)
1178     {
1179       UTF32 *src_ptr = &src;
1180       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1181
1182       src = i;
1183
1184       codeset->table[i].code = i;
1185       codeset->table[i].ucs4 = src;
1186       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1187       *dest_ptr = 0;
1188       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1189     }
1190     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1191     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1192
1193     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1194     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1195   }
1196
1197   // ISO-8859-2
1198   if(codesetsFind(csList, "ISO-8859-2") == NULL)
1199   {
1200     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1201       goto end;
1202
1203     codeset->name             = mystrdup("ISO-8859-2");
1204     codeset->alt_name         = mystrdup("ISO8859-2");
1205     codeset->characterization = mystrdup("Central/East European");
1206     codeset->read_only        = 0;
1207
1208     for(i = 0; i<256; i++)
1209     {
1210       UTF32 *src_ptr = &src;
1211       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1212
1213       if(i<0xa0)
1214         src = i;
1215       else
1216         src = iso_8859_2_to_ucs4[i-0xa0];
1217
1218       codeset->table[i].code = i;
1219       codeset->table[i].ucs4 = src;
1220       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr,dest_ptr+6, CSF_StrictConversion);
1221       *dest_ptr = 0;
1222       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1223     }
1224     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1225     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1226
1227     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1228     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1229   }
1230
1231   // ISO-8859-3
1232   if(codesetsFind(csList, "ISO-8859-3") == NULL)
1233   {
1234     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1235       goto end;
1236
1237     codeset->name             = mystrdup("ISO-8859-3");
1238     codeset->alt_name         = mystrdup("ISO8859-3");
1239     codeset->characterization = mystrdup("South European");
1240     codeset->read_only        = 0;
1241
1242     for(i = 0; i<256; i++)
1243     {
1244       UTF32 *src_ptr = &src;
1245       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1246
1247       if(i<0xa0)
1248         src = i;
1249       else
1250         src = iso_8859_3_to_ucs4[i-0xa0];
1251
1252       codeset->table[i].code = i;
1253       codeset->table[i].ucs4 = src;
1254       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1255       *dest_ptr = 0;
1256       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1257     }
1258     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1259     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1260
1261     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1262     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1263   }
1264
1265   // ISO-8859-4
1266   if(codesetsFind(csList, "ISO-8859-4") == NULL)
1267   {
1268     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1269       goto end;
1270
1271     codeset->name             = mystrdup("ISO-8859-4");
1272     codeset->alt_name         = mystrdup("ISO8859-4");
1273     codeset->characterization = mystrdup("North European");
1274     codeset->read_only        = 0;
1275
1276     for(i = 0; i<256; i++)
1277     {
1278       UTF32 *src_ptr = &src;
1279       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1280
1281       if(i<0xa0)
1282         src = i;
1283       else
1284         src = iso_8859_4_to_ucs4[i-0xa0];
1285
1286       codeset->table[i].code = i;
1287       codeset->table[i].ucs4 = src;
1288       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1289       *dest_ptr = 0;
1290       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1291     }
1292     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1293     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1294
1295     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1296     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1297   }
1298
1299   // ISO-8859-5
1300   if(codesetsFind(csList, "ISO-8859-5") == NULL)
1301   {
1302     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1303       goto end;
1304
1305     codeset->name             = mystrdup("ISO-8859-5");
1306     codeset->alt_name         = mystrdup("ISO8859-5");
1307     codeset->characterization = mystrdup("Slavic languages");
1308     codeset->read_only        = 0;
1309
1310     for(i = 0; i<256; i++)
1311     {
1312       UTF32 *src_ptr = &src;
1313       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1314
1315       if(i<0xa0)
1316         src = i;
1317       else
1318         src = iso_8859_5_to_ucs4[i-0xa0];
1319
1320       codeset->table[i].code = i;
1321       codeset->table[i].ucs4 = src;
1322       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1323       *dest_ptr = 0;
1324       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1325     }
1326     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1327     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1328
1329     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1330     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1331   }
1332
1333   // ISO-8859-9
1334   if(codesetsFind(csList, "ISO-8859-9") == NULL)
1335   {
1336     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1337       goto end;
1338
1339     codeset->name             = mystrdup("ISO-8859-9");
1340     codeset->alt_name         = mystrdup("ISO8859-9");
1341     codeset->characterization = mystrdup("Turkish");
1342     codeset->read_only        = 0;
1343
1344     for(i = 0; i<256; i++)
1345     {
1346       UTF32 *src_ptr = &src;
1347       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1348
1349       if(i<0xa0)
1350         src = i;
1351       else
1352         src = iso_8859_9_to_ucs4[i-0xa0];
1353
1354       codeset->table[i].code = i;
1355       codeset->table[i].ucs4 = src;
1356       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1357       *dest_ptr = 0;
1358       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1359     }
1360     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1361     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1362
1363     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1364     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1365   }
1366
1367   // ISO-8859-15
1368   if(codesetsFind(csList, "ISO-8859-15") == NULL)
1369   {
1370     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1371       goto end;
1372
1373     codeset->name             = mystrdup("ISO-8859-15");
1374     codeset->alt_name         = mystrdup("ISO8859-15");
1375     codeset->characterization = mystrdup("West European II");
1376     codeset->read_only        = 0;
1377
1378     for(i = 0; i<256; i++)
1379     {
1380       UTF32 *src_ptr = &src;
1381       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1382
1383       if(i<0xa0)
1384         src = i;
1385       else
1386         src = iso_8859_15_to_ucs4[i-0xa0];
1387
1388       codeset->table[i].code = i;
1389       codeset->table[i].ucs4 = src;
1390       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1391       *dest_ptr = 0;
1392       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1393     }
1394     memcpy(codeset->table_sorted,codeset->table,sizeof (codeset->table));
1395     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1396
1397     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1398     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1399   }
1400
1401   // ISO-8859-16
1402   if(codesetsFind(csList, "ISO-8859-16") == NULL)
1403   {
1404     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1405       goto end;
1406
1407     codeset->name             = mystrdup("ISO-8859-16");
1408     codeset->alt_name         = mystrdup("ISO8869-16");
1409     codeset->characterization = mystrdup("South-Eastern European");
1410     codeset->read_only        = 0;
1411
1412     for(i=0;i<256;i++)
1413     {
1414       UTF32 *src_ptr = &src;
1415       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1416
1417       if(i < 0xa0)
1418         src = i;
1419       else
1420         src = iso_8859_16_to_ucs4[i-0xa0];
1421
1422       codeset->table[i].code = i;
1423       codeset->table[i].ucs4 = src;
1424       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1425       *dest_ptr = 0;
1426       codeset->table[i].utf8[0] = (IPTR)dest_ptr - (IPTR)&codeset->table[i].utf8[1];
1427     }
1428     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1429     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1430
1431     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1432     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1433   }
1434
1435   // KOI8-R
1436   if(codesetsFind(csList, "KOI8-R") == NULL)
1437   {
1438     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1439       goto end;
1440
1441     codeset->name               = mystrdup("KOI8-R");
1442     codeset->alt_name           = mystrdup("KOI8R");
1443     codeset->characterization   = mystrdup("Russian");
1444     codeset->read_only          = 0;
1445
1446     for(i = 0; i<256; i++)
1447     {
1448       UTF32 *src_ptr = &src;
1449       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1450
1451       if(i<0x80)
1452         src = i;
1453       else
1454         src = koi8r_to_ucs4[i-0x80];
1455
1456       codeset->table[i].code = i;
1457       codeset->table[i].ucs4 = src;
1458       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1459       *dest_ptr = 0;
1460       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1461     }
1462     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1463     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1464
1465     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1466     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1467   }
1468
1469   // AmigaPL
1470   if(codesetsFind(csList, "AmigaPL") == NULL)
1471   {
1472     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1473       goto end;
1474
1475     codeset->name             = mystrdup("AmigaPL");
1476     codeset->alt_name         = mystrdup("AmiPL");
1477     codeset->characterization = mystrdup("Polish (Amiga)");
1478     codeset->read_only        = 1;
1479
1480     for(i=0; i<256; i++)
1481     {
1482       UTF32 *src_ptr = &src;
1483       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1484
1485       if(i<0xa0)
1486         src = i;
1487       else
1488         src = amigapl_to_ucs4[i-0xa0];
1489
1490       codeset->table[i].code = i;
1491       codeset->table[i].ucs4 = src;
1492       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr,src_ptr+1,&dest_ptr,dest_ptr+6,CSF_StrictConversion);
1493       *dest_ptr = 0;
1494       codeset->table[i].utf8[0] = (IPTR)dest_ptr-(IPTR)&codeset->table[i].utf8[1];
1495     }
1496     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1497     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1498
1499     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1500     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1501   }
1502
1503   // Amiga-1251
1504   if(codesetsFind(csList, "Amiga-1251") == NULL)
1505   {
1506     if((codeset = allocArbitrateVecPooled(sizeof(*codeset))) == NULL)
1507       goto end;
1508
1509     codeset->name             = mystrdup("Amiga-1251");
1510     codeset->alt_name         = mystrdup("Ami1251");
1511     codeset->characterization = mystrdup("Cyrillic (Amiga)");
1512     codeset->read_only        = 1;
1513
1514     for(i=0; i<256; i++)
1515     {
1516       UTF32 *src_ptr = &src;
1517       UTF8 *dest_ptr = &codeset->table[i].utf8[1];
1518
1519       if(i < 0xa0)
1520         src = i;
1521       else
1522         src = amiga1251_to_ucs4[i-0xa0];
1523
1524       codeset->table[i].code = i;
1525       codeset->table[i].ucs4 = src;
1526       CodesetsConvertUTF32toUTF8((const UTF32 **)&src_ptr, src_ptr+1, &dest_ptr, dest_ptr+6, CSF_StrictConversion);
1527       *dest_ptr = 0;
1528       codeset->table[i].utf8[0] = (char*)dest_ptr - (char*)&codeset->table[i].utf8[1];
1529     }
1530     memcpy(codeset->table_sorted, codeset->table, sizeof(codeset->table));
1531     qsort(codeset->table_sorted, 256, sizeof(codeset->table[0]), codesetsCmpUnicode);
1532
1533     D(DBF_STARTUP, "adding internal codeset '%s'", codeset->name);
1534     AddTail((struct List *)csList, (struct Node *)&codeset->node);
1535   }
1536
1537   success = TRUE;
1538
1539 end:
1540   RETURN(success);
1541   return success;
1542 }
1543
1544 ///
1545 /// codesetsCleanup()
1546 // Cleanup the memory for the codeset
1547 void codesetsCleanup(struct codesetList *csList)
1548 {
1549   struct codeset *code;
1550
1551   ENTER();
1552
1553   while((code = (struct codeset *)RemHead((struct List *)csList)) != NULL)
1554   {
1555     if(code->name != NULL)
1556       freeArbitrateVecPooled(code->name);
1557     if(code->alt_name != NULL)
1558       freeArbitrateVecPooled(code->alt_name);
1559     if(code->characterization != NULL)
1560       freeArbitrateVecPooled(code->characterization);
1561
1562     freeArbitrateVecPooled(code);
1563   }
1564
1565   LEAVE();
1566 }
1567
1568 ///
1569 /// codesetsFind()
1570 // Returns the given codeset.
1571 struct codeset *codesetsFind(struct codesetList *csList, const char *name)
1572 {
1573   struct codeset *res = NULL;
1574
1575   ENTER();
1576
1577   if(name != NULL && name[0] != '\0')
1578   {
1579     struct Node *node;
1580     const char *matchedName;
1581
1582     if((matchedName = matchCodesetAlias(name)) != NULL)
1583       name = matchedName;
1584
1585     for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1586     {
1587       struct codeset *mstate = (struct codeset *)node;
1588
1589       if(stricmp(name, mstate->name) == 0 ||
1590         (mstate->alt_name != NULL && stricmp(name, mstate->alt_name) == 0))
1591       {
1592         // break out
1593         res = mstate;
1594         break;
1595       }
1596     }
1597   }
1598
1599   RETURN(res);
1600   return res;
1601 }
1602
1603 ///
1604 /// checkTextAgainstSingleCodeset
1605 // check how good a text can be represented by a specific codeset
1606 static int checkTextAgainstSingleCodeset(CONST_STRPTR text, ULONG textLen, struct codeset *codeset)
1607 {
1608   int errors = textLen;
1609
1610   ENTER();
1611
1612   if(codeset->read_only == 0 &&
1613      codeset != CodesetsBase->utf8Codeset &&
1614      codeset != CodesetsBase->utf16Codeset &&
1615      codeset != CodesetsBase->utf32Codeset)
1616   {
1617     CONST_STRPTR text_ptr = text;
1618     ULONG i;
1619
1620     errors = 0;
1621
1622     // the following identification/detection routine is NOT really smart.
1623     // we just see how each UTF8 string is the representation of each char
1624     // in our source text and then check if they are valid or not. As said,
1625     // not very smart, but we don't have anything better right now :(
1626     for(i=0; i < textLen; i++)
1627     {
1628       unsigned char c = *text_ptr++;
1629
1630       if(c != '\0')
1631       {
1632         struct single_convert *f = &codeset->table[c];
1633
1634         if(f->utf8[0] == 0x00 || f->utf8[1] == 0x00)
1635           errors++;
1636       }
1637       else
1638         break;
1639     }
1640   }
1641   else
1642     W(DBF_STARTUP, "codeset '%s' is either read-only (%ld) or UTF8/16/32 (%ld)", codeset->name, codeset->read_only, codeset == CodesetsBase->utf8Codeset || codeset == CodesetsBase->utf16Codeset || codeset == CodesetsBase->utf32Codeset);
1643
1644   D(DBF_STARTUP, "tried to identify text as '%s' text with %ld of %ld errors", codeset->name, errors, textLen);
1645
1646   RETURN(errors);
1647   return errors;
1648 }
1649
1650 ///
1651 /// checkTextAgainstCodesetList
1652 static int checkTextAgainstCodesetList(CONST_STRPTR text, ULONG textLen, struct codesetList *csList, struct codeset **bestCodeset)
1653 {
1654   struct Node *node;
1655   int bestErrors = textLen;
1656
1657   ENTER();
1658
1659   *bestCodeset = NULL;
1660
1661   for(node = GetHead((struct List *)csList); node != NULL; node = GetSucc(node))
1662   {
1663     struct codeset *codeset = (struct codeset *)node;
1664     int errors;
1665
1666     errors = checkTextAgainstSingleCodeset(text, textLen, codeset);
1667     if(errors < bestErrors)
1668     {
1669       *bestCodeset = codeset;
1670       bestErrors = errors;
1671
1672       if(bestErrors == 0)
1673         break;
1674     }
1675   }
1676
1677   RETURN(bestErrors);
1678   return bestErrors;
1679 }
1680
1681 ///
1682 /// codesetsFindBest()
1683 // Returns the best codeset for the given text
1684 static struct codeset *codesetsFindBest(struct TagItem *attrs, ULONG csFamily, CONST_STRPTR text, ULONG textLen, int *errorPtr)
1685 {
1686   struct codeset *bestCodeset = NULL;
1687   int bestErrors = textLen;
1688   BOOL found = FALSE;
1689
1690   ENTER();
1691
1692   ObtainSemaphoreShared(&CodesetsBase->libSem);
1693
1694   // in case the user specified the codeset family as a
1695   // cyrillic one we go and do our cyrillic specific analysis first
1696   if(csFamily == CSV_CodesetFamily_Cyrillic)
1697   {
1698     #define NUM_CYRILLIC 3
1699
1700     struct CodesetSearch
1701     {
1702       const char *name;
1703       const char *data;
1704     };
1705
1706     struct CodesetSearch search[NUM_CYRILLIC];
1707     unsigned char *p;
1708     unsigned char *tp;
1709     int ctr[NUM_CYRILLIC];
1710     int Nmax;
1711     int NGlob = 1;
1712     int max;
1713     int gr = 0;
1714     int lr = 0;
1715
1716     D(DBF_STARTUP, "performing cyrillic analysis");
1717
1718     search[0].name = "windows-1251";
1719     search[0].data = cp1251_data;
1720     search[1].name = "IBM866";
1721     search[1].data = cp866_data;
1722     search[2].name = "KOI8-R";
1723     search[2].data = koi8r_data;
1724
1725     memset(&ctr, 0, sizeof(ctr));
1726
1727     tp = (unsigned char *)text;
1728
1729     do
1730     {
1731       int n;
1732       int mid = max = -466725766; // TODO: what's the magic behind this constant?
1733       Nmax = 0;
1734
1735       for(n=0; n < NUM_CYRILLIC; n++)
1736       {
1737         unsigned char la = 0;
1738         unsigned char *tptr = (unsigned char *)search[n].data;
1739
1740         p = tp;
1741
1742         do
1743         {
1744           unsigned char lb = (*p++) ^ 128;
1745
1746           if(!((la | lb) & 128))
1747             ctr[n] += (signed char)tptr[(la << 7) + lb];
1748
1749           la = lb;
1750         }
1751         while(*p);
1752
1753         if(max < ctr[n])
1754         {
1755           mid = max;
1756           max = ctr[n];
1757           Nmax = n+1;
1758         }
1759       }
1760
1761       tp = p;
1762       if((max >= 500) && ((max-mid) >= 1000))
1763       {
1764         lr = gr = 1;
1765         NGlob = Nmax;
1766       }
1767     }
1768     while((*p) && (!gr));
1769
1770     if(gr || ((!(*p)) && lr))
1771       Nmax = NGlob;
1772
1773     // if our analysis found something, we go and try
1774     // to find the corresponding codeset in out codeset list
1775     if(max != 0)
1776     {
1777       struct TagItem *tstate = attrs;
1778       struct TagItem *tag;
1779
1780       D(DBF_STARTUP, "identified text as '%s", search[Nmax-1].name);
1781
1782       // now we walk through our taglist and check if the user
1783       // supplied
1784       while((tag = NextTagItem((APTR)&tstate)) != NULL)
1785       {
1786         if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
1787         {
1788           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1789
1790           if((bestCodeset = codesetsFind(csList, search[Nmax-1].name)) != NULL)
1791             break;
1792         }
1793       }
1794
1795       // if we still haven't found the matching codeset
1796       // we search the internal list
1797       if(bestCodeset == NULL)
1798         bestCodeset = codesetsFind(&CodesetsBase->codesets, search[Nmax-1].name);
1799
1800       bestErrors = 0;
1801
1802       found = TRUE;
1803     }
1804   }
1805
1806   // if we haven't found the best codeset (through the cyrillic analysis)
1807   // we go and do the dumb latin search in our codesetlist
1808   if(found == FALSE)
1809   {
1810     struct TagItem *tstate = attrs;
1811     struct TagItem *tag;
1812
1813     // check text against all codesets in all supplied lists of codesets
1814     while((tag = NextTagItem((APTR)&tstate)) != NULL)
1815     {
1816       switch(tag->ti_Tag)
1817       {
1818         case CSA_CodesetList:
1819         {
1820           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
1821           struct codeset *bestCodesetInList;
1822           int bestErrorsInList;
1823
1824           D(DBF_STARTUP, "checking against external codeset list");
1825           bestErrorsInList = checkTextAgainstCodesetList(text, textLen, csList, &bestCodesetInList);
1826           if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1827           {
1828             bestCodeset = bestCodesetInList;
1829             bestErrors = bestErrorsInList;
1830
1831             if(bestErrors == 0)
1832               break;
1833           }
1834         }
1835         break;
1836       }
1837     }
1838
1839     // we didn't find a "best" codeset in the supplied codesets lists so far,
1840     // so now we check against our internal list
1841     if(bestErrors != 0)
1842     {
1843       struct codeset *bestCodesetInList;
1844       int bestErrorsInList;
1845
1846       D(DBF_STARTUP, "checking against internal codeset list");
1847       bestErrorsInList = checkTextAgainstCodesetList(text, textLen, &CodesetsBase->codesets, &bestCodesetInList);
1848       if(bestErrorsInList < bestErrors && bestCodesetInList != NULL)
1849       {
1850         bestCodeset = bestCodesetInList;
1851         bestErrors = bestErrorsInList;
1852       }
1853     }
1854   }
1855
1856   ReleaseSemaphore(&CodesetsBase->libSem);
1857
1858   if(errorPtr != NULL)
1859     *errorPtr = bestErrors;
1860
1861   RETURN(bestCodeset);
1862   return bestCodeset;
1863 }
1864
1865 ///
1866
1867 /**************************************************************************/
1868
1869 /// CodesetsSupportedA()
1870 STRPTR * LIBFUNC CodesetsSupportedA(REG(a0, UNUSED struct TagItem *attrs))
1871 {
1872   STRPTR *array = NULL;
1873   struct TagItem *tstate = attrs;
1874   struct TagItem *tag;
1875   int numCodesets;
1876
1877   ENTER();
1878
1879   ObtainSemaphoreShared(&CodesetsBase->libSem);
1880
1881   // first we need to check how many codesets our supplied
1882   // lists carry.
1883   numCodesets = countCodesets(&CodesetsBase->codesets);
1884   while((tag = NextTagItem((APTR)&tstate)) != NULL)
1885   {
1886     switch(tag->ti_Tag)
1887     {
1888       case CSA_CodesetList:
1889       {
1890         numCodesets += countCodesets((struct codesetList *)tag->ti_Data);
1891       }
1892       break;
1893     }
1894   }
1895
1896   // now that we know how many codesets we have in our lists we
1897   // can put their names into our string arrays
1898   if(numCodesets > 0)
1899   {
1900     if((array = allocArbitrateVecPooled((numCodesets+1)*sizeof(STRPTR))) != NULL)
1901     {
1902       struct Node *node;
1903       int i=0;
1904
1905       // first we walk through the internal codesets list and
1906       // add the names
1907       for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
1908       {
1909         struct codeset *code = (struct codeset *)node;
1910
1911         array[i] = code->name;
1912         i++;
1913       }
1914
1915       // reset the tstate
1916       tstate = attrs;
1917
1918       // then we also iterate through our private codesets list
1919       while((tag = NextTagItem((APTR)&tstate)) != NULL)
1920       {
1921         switch(tag->ti_Tag)
1922         {
1923           case CSA_CodesetList:
1924           {
1925             for(node = GetHead((struct List *)tag->ti_Data); node != NULL; node = GetSucc(node))
1926             {
1927               struct codeset *code = (struct codeset *)node;
1928
1929               array[i] = code->name;
1930               i++;
1931             }
1932           }
1933           break;
1934         }
1935       }
1936
1937       array[i] = NULL;
1938     }
1939   }
1940
1941   ReleaseSemaphore(&CodesetsBase->libSem);
1942
1943   RETURN(array);
1944   return array;
1945 }
1946
1947 ///
1948 /// CodesetsFreeA()
1949 void LIBFUNC CodesetsFreeA(REG(a0, APTR obj), REG(a1, UNUSED struct TagItem *attrs))
1950 {
1951   ENTER();
1952
1953   if(obj != NULL)
1954     freeArbitrateVecPooled(obj);
1955
1956   LEAVE();
1957 }
1958
1959 ///
1960 /// CodesetsSetDefaultA()
1961 struct codeset * LIBFUNC CodesetsSetDefaultA(REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
1962 {
1963   struct codeset *codeset;
1964
1965   ENTER();
1966
1967   ObtainSemaphoreShared(&CodesetsBase->libSem);
1968
1969   if((codeset = codesetsFind(&CodesetsBase->codesets, name)) != NULL)
1970   {
1971     ULONG flags;
1972
1973     flags = GVF_SAVE_VAR;
1974     if(GetTagData(CSA_Save, FALSE, attrs))
1975       SET_FLAG(flags, GVF_GLOBAL_ONLY);
1976
1977     SetVar("codeset_default", codeset->name, strlen(codeset->name), flags);
1978   }
1979
1980   ReleaseSemaphore(&CodesetsBase->libSem);
1981
1982   RETURN(codeset);
1983   return codeset;
1984 }
1985
1986 ///
1987 /// CodesetsFindA()
1988 struct codeset * LIBFUNC CodesetsFindA(REG(a0, STRPTR name), REG(a1, struct TagItem *attrs))
1989 {
1990   struct codeset *codeset = NULL;
1991
1992   ENTER();
1993
1994   ObtainSemaphoreShared(&CodesetsBase->libSem);
1995
1996   // if no name pointer was supplied we have to return
1997   // the default codeset only.
1998   if(name != NULL)
1999   {
2000     // we first walk through our internal list and check if we
2001     // can find the requested codeset
2002     codeset = codesetsFind(&CodesetsBase->codesets, name);
2003
2004     if(codeset == NULL)
2005     {
2006       struct TagItem *tstate = attrs;
2007       struct TagItem *tag;
2008
2009       // now we walk through our taglist and check if the user
2010       // supplied
2011       while((tag = NextTagItem((APTR)&tstate)) != NULL)
2012       {
2013         if(tag->ti_Tag == CSA_CodesetList && tag->ti_Data != 0)
2014         {
2015           struct codesetList *csList = (struct codesetList *)tag->ti_Data;
2016
2017           if((codeset = codesetsFind(csList, name)) != NULL)
2018             break;
2019         }
2020       }
2021     }
2022   }
2023
2024   // check if we found something or not.
2025   if(codeset == NULL && GetTagData(CSA_FallbackToDefault, TRUE, attrs))
2026     codeset = defaultCodeset(FALSE);
2027
2028   ReleaseSemaphore(&CodesetsBase->libSem);
2029
2030   RETURN(codeset);
2031   return codeset;
2032 }
2033
2034 ///
2035 /// CodesetsFindBestA()
2036 struct codeset * LIBFUNC CodesetsFindBestA(REG(a0, struct TagItem *attrs))
2037 {
2038   struct codeset *codeset = NULL;
2039   char *text;
2040   ULONG textLen;
2041
2042   ENTER();
2043
2044   ObtainSemaphoreShared(&CodesetsBase->libSem);
2045
2046   text = (char *)GetTagData(CSA_Source, 0, attrs);
2047   textLen = GetTagData(CSA_SourceLen, text != NULL ? strlen(text) : 0, attrs);
2048
2049   if(text != NULL && textLen != 0)
2050   {
2051     int numErrors = 0;
2052     ULONG csFamily = GetTagData(CSA_CodesetFamily, CSV_CodesetFamily_Latin, attrs);
2053     int *errorPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2054
2055     codeset = codesetsFindBest(attrs, csFamily, text, textLen, &numErrors);
2056
2057     if(errorPtr != NULL)
2058       *errorPtr = numErrors;
2059
2060     // if we still haven't got the codeset we fallback to the default
2061     if(codeset == NULL && GetTagData(CSA_FallbackToDefault, FALSE, attrs))
2062       codeset = defaultCodeset(FALSE);
2063   }
2064
2065   ReleaseSemaphore(&CodesetsBase->libSem);
2066
2067   RETURN(codeset);
2068   return codeset;
2069 }
2070
2071 ///
2072 /// CodesetsUTF8Len()
2073 // Returns the number of characters a utf8 string has. This is not
2074 // identically with the size of memory is required to hold the string.
2075 ULONG LIBFUNC CodesetsUTF8Len(REG(a0, UTF8 *str))
2076 {
2077   int len = 0;
2078   unsigned char c;
2079
2080   ENTER();
2081
2082   if(str != NULL)
2083   {
2084     while((c = *str++))
2085     {
2086       len++;
2087       str += trailingBytesForUTF8[c];
2088     }
2089   }
2090
2091   RETURN((ULONG)len);
2092   return (ULONG)len;
2093 }
2094
2095 ///
2096 /// CodesetsStrLenA()
2097 ULONG LIBFUNC CodesetsStrLenA(REG(a0, STRPTR str), REG(a1, struct TagItem *attrs))
2098 {
2099   ULONG res = 0;
2100
2101   ENTER();
2102
2103   if(str != NULL)
2104   {
2105     struct codeset *codeset;
2106     int            len;
2107     STRPTR         src;
2108     int            utf;
2109
2110     if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2111       codeset = defaultCodeset(TRUE);
2112
2113     if(codeset == CodesetsBase->utf32Codeset)
2114     {
2115       utf = 32;
2116       len = utf32_strlen((UTF32 *)str);
2117     }
2118     else if(codeset == CodesetsBase->utf16Codeset)
2119     {
2120       utf = 16;
2121       len = utf16_strlen((UTF16 *)str);
2122     }
2123     else
2124     {
2125       utf = 0;
2126       len = strlen(str);
2127     }
2128
2129     len = GetTagData(CSA_SourceLen, len, attrs);
2130
2131     src = str;
2132
2133     if(utf != 0)
2134     {
2135       void *srcend = src + len;
2136       UTF8 *dstlen = NULL;
2137       union TypeAliases srcAlias;
2138       union TypeAliases dstAlias;
2139
2140       srcAlias.strptr = &src;
2141       dstAlias.utf8 = &dstlen;
2142
2143       switch(utf)
2144       {
2145         case 16:
2146           CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2147         break;
2148
2149         case 32:
2150           CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2151         break;
2152       }
2153       res = (IPTR)dstlen;
2154     }
2155     else
2156     {
2157       UBYTE c;
2158
2159       res = 0;
2160
2161       while((c = *src++) != '\0' && len != 0)
2162       {
2163         res += codeset->table[c].utf8[0];
2164         len--;
2165       }
2166     }
2167   }
2168
2169   RETURN(res);
2170   return res;
2171 }
2172
2173 ///
2174 /// CodesetsUTF8ToStrA()
2175 // Converts an UTF8 string to a given charset. Return the number of bytes
2176 // written to dest excluding the NULL byte (which is always ensured by this
2177 // function; it means a NULL str will produce "" as dest; anyway you should
2178 // check NULL str to not waste your time!).
2179 STRPTR LIBFUNC CodesetsUTF8ToStrA(REG(a0, struct TagItem *attrs))
2180 {
2181   UTF8 *src;
2182   ULONG srcLen;
2183   ULONG destLen = 0;
2184   ULONG *destLenPtr;
2185   ULONG n = 0;
2186   STRPTR dest = NULL;
2187
2188   ENTER();
2189
2190   if((src = (UTF8 *)GetTagData(CSA_Source, 0, attrs)) != NULL &&
2191      (srcLen = GetTagData(CSA_SourceLen, src != NULL ? strlen((char *)src) : 0, attrs)) > 0)
2192   {
2193     struct convertMsg msg;
2194     struct codeset *codeset;
2195     struct Hook *destHook;
2196     struct Hook *mapForeignCharsHook;
2197     char buf[256];
2198     STRPTR destIter = NULL;
2199     char *b = NULL;
2200     int i = 0;
2201     unsigned char *s = src;
2202     unsigned char *e = (src+srcLen);
2203     int numConvErrors = 0;
2204     int *numConvErrorsPtr;
2205     BOOL mapForeignChars;
2206     APTR pool = NULL;
2207     struct SignalSemaphore *sem = NULL;
2208     int utf;
2209     ULONG char_size;
2210
2211     // get some more optional attributes
2212     destHook = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2213     destLen = GetTagData(CSA_DestLen, 0, attrs);
2214     numConvErrorsPtr = (int *)GetTagData(CSA_ErrPtr, 0, attrs);
2215     mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2216     mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
2217
2218     // get the destination codeset pointer
2219     if((codeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
2220       codeset = defaultCodeset(TRUE);
2221     if(codeset == CodesetsBase->utf32Codeset)
2222     {
2223       utf = 32;
2224       char_size = 4;
2225     }
2226     else if(codeset == CodesetsBase->utf16Codeset)
2227     {
2228       utf = 16;
2229       char_size = 2;
2230     }
2231     else
2232     {
2233       utf = 0;
2234       char_size = 1;
2235     }
2236
2237     // first we make sure we allocate enough memory
2238     // for our destination buffer
2239     if(destHook != NULL)
2240     {
2241       if(destLen < 16 || destLen > sizeof(buf))
2242         destLen = sizeof(buf);
2243
2244       msg.state = CSV_Translating;
2245       b = buf;
2246       i = 0;
2247     }
2248     else
2249     {
2250       // in case the user wants us to dynamically generate the
2251       // destination buffer we do it right now
2252       if((dest = (STRPTR)GetTagData(CSA_Dest, 0, attrs)) == NULL ||
2253          GetTagData(CSA_AllocIfNeeded, TRUE, attrs) != FALSE)
2254       {
2255         ULONG len = 0;
2256
2257         // calculate the destLen
2258         if(utf)
2259         {
2260           void *dstlen = NULL;
2261           union TypeAliases srcAlias;
2262           union TypeAliases dstAlias;
2263
2264           srcAlias.uchar = &s;
2265           dstAlias.voidptr = &dstlen;
2266
2267           switch(utf)
2268           {
2269             case 16:
2270               CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, NULL, 0);
2271             break;
2272
2273             case 32:
2274               CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, NULL, 0);
2275             break;
2276           }
2277           len = (IPTR)dstlen;
2278         }
2279         else
2280         {
2281           while(s < e)
2282           {
2283             unsigned char c = *s++;
2284
2285             len++;
2286             s += trailingBytesForUTF8[c];
2287           }
2288         }
2289
2290         if(dest == NULL || (destLen < len+1))
2291         {
2292           if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2293           {
2294             if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2295               ObtainSemaphore(sem);
2296
2297             // allocate the destination buffer
2298             dest = allocVecPooled(pool, len+char_size);
2299
2300             if(sem != NULL)
2301               ReleaseSemaphore(sem);
2302           }
2303           else
2304             dest = allocArbitrateVecPooled(len+char_size);
2305
2306           destLen = len+char_size;
2307         }
2308
2309         if(dest == NULL)
2310         {
2311           RETURN(NULL);
2312           return NULL;
2313         }
2314       }
2315
2316       destIter = dest;
2317     }
2318
2319     // now we convert the src string to the
2320     // destination buffer.
2321     s = src;
2322     if(utf != 0)
2323     {
2324       void *dstend;
2325
2326       if(destHook != NULL)
2327       {
2328         ULONG r = CSR_TargetExhausted;
2329
2330         dstend = b + destLen - char_size;
2331         do
2332         {
2333           union TypeAliases srcAlias;
2334           union TypeAliases dstAlias;
2335
2336           srcAlias.uchar = &s;
2337           dstAlias.schar = &b;
2338
2339           switch(utf)
2340           {
2341             case 16:
2342               r = CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2343             break;
2344
2345             case 32:
2346               r = CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2347             break;
2348           }
2349           b[0] = 0;
2350           if(char_size > 1)
2351             b[1] = 0;
2352           if(r != CSR_TargetExhausted)
2353             msg.state = CSV_End;
2354           msg.len = b-buf;
2355           CallHookPkt(destHook,&msg,buf);
2356
2357           b  = buf;
2358           n += msg.len;
2359         }
2360         while(r == CSR_TargetExhausted);
2361       }
2362       else
2363       {
2364         union TypeAliases srcAlias;
2365         union TypeAliases dstAlias;
2366
2367         srcAlias.uchar = &s;
2368         dstAlias.strptr = &destIter;
2369         dstend = destIter + destLen - char_size;
2370         switch(utf)
2371         {
2372           case 16:
2373             CodesetsConvertUTF8toUTF16(srcAlias.cutf8, e, dstAlias.utf16, dstend, 0);
2374           break;
2375
2376           case 32:
2377             CodesetsConvertUTF8toUTF32(srcAlias.cutf8, e, dstAlias.utf32, dstend, 0);
2378           break;
2379         }
2380         n = destIter-dest;
2381       }
2382     }
2383     else
2384     {
2385       for(;;n++)
2386       {
2387         if(destHook == NULL && n >= destLen-1)
2388           break;
2389
2390         // convert until we reach the end of the
2391         // source buffer.
2392         if(s < e)
2393         {
2394           unsigned char c = *s;
2395           unsigned char d = '?';
2396           const char *repstr = NULL;
2397           int replen = 0;
2398
2399           // check if the char is a >7bit char
2400           if(c > 127)
2401           {
2402             struct single_convert *f;
2403             int lenAdd = trailingBytesForUTF8[c];
2404             int lenStr = lenAdd+1;
2405             unsigned char *src = s;
2406
2407             do
2408             {
2409               // start each iteration with "no replacement found yet"
2410               repstr = NULL;
2411               replen = 0;
2412
2413               // search in the UTF8 conversion table of the current charset if
2414               // we have a replacement character for the char sequence starting at s
2415               BIN_SEARCH(codeset->table_sorted, 0, 255, strncmp((char *)src, (char *)codeset->table_sorted[m].utf8+1, lenStr), f);
2416
2417               if(f != NULL)
2418               {
2419                 d = f->code;
2420                 replen = -1;
2421
2422                 break;
2423               }
2424               else
2425               {
2426                 // the analysed char sequence (s) is not convertable to a
2427                 // single visible char replacement, so we normally have to put
2428                 // a ? sign as a "unknown char" sign at the very position.
2429                 //
2430                 // For convienence we, however, allow users to replace these
2431                 // UTF8 characters with char sequences that "looklike" the
2432                 // original char.
2433                 if(mapForeignChars == TRUE)
2434                   replen = mapUTF8toASCII(&repstr, src, lenStr);
2435
2436                 // call the hook only, if the internal table yielded no suitable
2437                 // replacement
2438                 if(replen == 0 && mapForeignCharsHook != NULL)
2439                 {
2440                   struct replaceMsg rmsg;
2441
2442                   rmsg.dst = (char **)&repstr;
2443                   rmsg.src = src;
2444                   rmsg.srclen = lenStr;
2445                   replen = CallHookPkt(mapForeignCharsHook, &rmsg, NULL);
2446                 }
2447
2448                 if(replen < 0)
2449                 {
2450                   D(DBF_UTF, "got UTF8 replacement (%ld)", replen);
2451
2452                   // stay in the loop as long as one replacement function delivers
2453                   // further UTF8 replacement sequences
2454                   src = (unsigned char *)repstr;
2455                   // remember the length of the replaced string, as we might do another
2456                   // iteration in the loop which might result in a further replacement
2457                   lenStr = -replen;
2458                 }
2459                 else if(replen == 0)
2460                 {
2461                   D(DBF_UTF, "found no ASCII replacement for UTF8 string (%ld)", replen);
2462                   repstr = NULL;
2463                 }
2464                 else
2465                   D(DBF_UTF, "got replacement string '%s' (%ld)", repstr ? repstr : "<null>", replen);
2466               }
2467             }
2468             while(replen < 0);
2469
2470             if(repstr == NULL || replen == 0)
2471             {
2472               if(replen >= 0)
2473               {
2474                 d = '?';
2475                 numConvErrors++;
2476               }
2477             }
2478
2479             s += lenAdd;
2480           }
2481           else
2482             d = c;
2483
2484           if(destHook != NULL)
2485           {
2486             if(replen > 1)
2487             {
2488               while(replen > 0)
2489               {
2490                 *b++ = *repstr;
2491                 repstr++;
2492                 i++;
2493                 replen--;
2494
2495                 if(i%(destLen-1)==0)
2496                 {
2497                   *b = '\0';
2498                   msg.len = i;
2499                   CallHookPkt(destHook, &msg, buf);
2500
2501                   b  = buf;
2502                   *b = '\0';
2503                   i  = 0;
2504                 }
2505               }
2506             }
2507             else
2508             {
2509               *b++ = replen > 0 ? *repstr : d;
2510               i++;
2511             }
2512
2513             if(i%(destLen-1)==0)
2514             {
2515               *b = '\0';
2516               msg.len = i;
2517               CallHookPkt(destHook, &msg, buf);
2518
2519               b  = buf;
2520               *b = '\0';
2521               i  = 0;
2522             }
2523           }
2524           else
2525           {
2526             if(replen > 1)
2527             {
2528               ULONG destPos = destIter-dest;
2529
2530               if(pool != NULL)
2531               {
2532                 if(sem != NULL)
2533                   ObtainSemaphore(sem);
2534
2535                 // allocate the destination buffer
2536                 dest = reallocVecPooled(pool, dest, destLen, destLen+replen-1);
2537
2538                 if(sem != NULL)
2539                   ReleaseSemaphore(sem);
2540               }
2541               else
2542                 dest = reallocArbitrateVecPooled(dest, destLen, destLen+replen-1);
2543
2544               if(dest == NULL)
2545               {
2546                 RETURN(NULL);
2547                 return NULL;
2548               }
2549
2550               destIter = dest+destPos;
2551               memcpy(destIter, repstr, replen);
2552
2553               // adjust our loop pointer and destination length
2554               destIter += replen;
2555               destLen += replen-1;
2556             }
2557             else if(replen == 1)
2558               *destIter++ = *repstr;
2559             else
2560               *destIter++ = d;
2561           }
2562
2563           s++;
2564         }
2565         else
2566           break;
2567       }
2568
2569       if(destHook != NULL)
2570       {
2571         msg.state = CSV_End;
2572         msg.len   = i;
2573         *b        = '\0';
2574         CallHookPkt(destHook,&msg,buf);
2575       }
2576       else
2577         *destIter = '\0';
2578     }
2579
2580     // let us write the number of conversion errors
2581     // to the proper variable pointer, if wanted
2582     if(numConvErrorsPtr != NULL)
2583       *numConvErrorsPtr = numConvErrors;
2584   }
2585
2586   // put the final length of our destination buffer
2587   // into the destLenPtr
2588   if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2589   {
2590     if(destLen > 0)
2591       *destLenPtr = destLen-1;
2592     else
2593       *destLenPtr = 0;
2594   }
2595
2596   RETURN(dest);
2597   return dest;
2598 }
2599
2600 ///
2601 /// CodesetsUTF8CreateA()
2602 // Converts a string and a charset to an UTF8. Returns the UTF8.
2603 // If a destination hook is supplied always return 0.
2604 // If from is NULL, it returns NULL and doesn't call the hook.
2605 UTF8 * LIBFUNC CodesetsUTF8CreateA(REG(a0, struct TagItem *attrs))
2606 {
2607   UTF8   *from;
2608   UTF8   *dest;
2609   struct codeset *codeset;
2610   ULONG  fromLen, *destLenPtr;
2611   ULONG  n;
2612   int    utf;
2613
2614   ENTER();
2615
2616   dest = NULL;
2617   n    = 0;
2618
2619   if((codeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2620     codeset = defaultCodeset(TRUE);
2621   if(codeset == CodesetsBase->utf32Codeset)
2622     utf = 32;
2623   else if(codeset == CodesetsBase->utf16Codeset)
2624     utf = 16;
2625   else
2626     utf = 0;
2627
2628   from = (UTF8 *)GetTagData(CSA_Source, 0, attrs);
2629   if(from != NULL)
2630   {
2631     switch(utf)
2632     {
2633       case 32:
2634         fromLen = utf32_strlen((UTF32 *)from);
2635       break;
2636
2637       case 16:
2638         fromLen = utf16_strlen((UTF16 *)from);
2639       break;
2640
2641       default:
2642         fromLen = strlen((char *)from);
2643       break;
2644     }
2645   }
2646   else
2647     fromLen = 0;
2648   fromLen = GetTagData(CSA_SourceLen, fromLen, attrs);
2649
2650   if(from != NULL && fromLen != 0)
2651   {
2652     struct convertMsg       msg;
2653     struct Hook    *hook;
2654     ULONG          destLen;
2655     int            i = 0;
2656     TEXT           buf[256];
2657     STRPTR         src, destPtr = NULL, b = NULL;
2658     ULONG          c;
2659
2660     hook    = (struct Hook *)GetTagData(CSA_DestHook, 0, attrs);
2661     destLen = GetTagData(CSA_DestLen, 0, attrs);
2662
2663     if(hook != NULL)
2664     {
2665       if(destLen<16 || destLen>sizeof(buf))
2666         destLen = sizeof(buf);
2667
2668       msg.state = CSV_Translating;
2669       b = buf;
2670       i = 0;
2671     }
2672     else
2673     {
2674       if((dest = (UTF8 *)GetTagData(CSA_Dest, 0, attrs)) != NULL ||
2675          GetTagData(CSA_AllocIfNeeded, TRUE, attrs))
2676       {
2677         ULONG len;
2678
2679         src = (STRPTR)from;
2680
2681         if(utf != 0)
2682         {
2683           void *srcend = src + fromLen;
2684           UTF8 *dstlen = NULL;
2685           union TypeAliases srcAlias;
2686           union TypeAliases dstAlias;
2687
2688           srcAlias.strptr = &src;
2689           dstAlias.utf8 = &dstlen;
2690
2691           switch(utf)
2692           {
2693             case 16:
2694               CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, NULL, 0);
2695             break;
2696
2697             case 32:
2698               CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, NULL, 0);
2699             break;
2700           }
2701           len = (IPTR)dstlen;
2702         }
2703         else
2704         {
2705           ULONG flen = fromLen;
2706
2707           len = 0;
2708           while((c = *src++) != '\0' && flen != 0)
2709           {
2710             len += codeset->table[c].utf8[0];
2711             flen--;
2712           }
2713         }
2714         D(DBF_UTF, "Calculated output UTF-8 buffer length: %lu", len);
2715
2716         if(dest == NULL || (destLen<len+1))
2717         {
2718           APTR                   pool;
2719           struct SignalSemaphore *sem;
2720
2721           if((pool = (APTR)GetTagData(CSA_Pool, 0, attrs)) != NULL)
2722           {
2723             if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
2724               ObtainSemaphore(sem);
2725
2726             // allocate the destination buffer
2727             dest = allocVecPooled(pool,len+1);
2728
2729             if(sem != NULL)
2730               ReleaseSemaphore(sem);
2731           }
2732           else
2733             dest = allocArbitrateVecPooled(len+1);
2734
2735           destLen  = len;
2736         }
2737
2738         if(dest == NULL)
2739         {
2740           RETURN(NULL);
2741           return NULL;
2742         }
2743       }
2744
2745       destPtr = (STRPTR)dest;
2746     }
2747
2748     src = (STRPTR)from;
2749     if(utf != 0)
2750     {
2751       void *srcend = src + fromLen;
2752       UTF8 *dstend;
2753
2754       if(hook != NULL)
2755       {
2756         ULONG r = CSR_TargetExhausted;
2757         union TypeAliases srcAlias;
2758         union TypeAliases dstAlias;
2759
2760         srcAlias.strptr = &src;
2761         dstAlias.strptr = &b;
2762         dstend = (UTF8 *)(b + destLen - 1);
2763         do
2764         {
2765           switch(utf)
2766           {
2767             case 16:
2768               r = CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2769             break;
2770
2771             case 32:
2772               r = CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2773             break;
2774           }
2775           *b = 0;
2776           if(r != CSR_TargetExhausted)
2777             msg.state = CSV_End;
2778           msg.len = b-buf;
2779           CallHookPkt(hook,&msg,buf);
2780
2781           b  = buf;
2782           n += msg.len;
2783         }
2784         while(r == CSR_TargetExhausted);
2785       }
2786       else
2787       {
2788         union TypeAliases srcAlias;
2789         union TypeAliases dstAlias;
2790
2791         srcAlias.strptr = &src;
2792         dstAlias.strptr = &destPtr;
2793         dstend = (UTF8 *)(destPtr + destLen);
2794         switch(utf)
2795         {
2796           case 16:
2797             CodesetsConvertUTF16toUTF8(srcAlias.cutf16, srcend, dstAlias.utf8, dstend, 0);
2798           break;
2799
2800           case 32:
2801             CodesetsConvertUTF32toUTF8(srcAlias.cutf32, srcend, dstAlias.utf8, dstend, 0);
2802           break;
2803         }
2804         n = destPtr-(STRPTR)dest;
2805       }
2806     }
2807     else
2808     {
2809       for(; fromLen && (c = *src); src++, fromLen--)
2810       {
2811         UTF8 *utf8_seq;
2812
2813         for(utf8_seq = &codeset->table[c].utf8[1]; (c = *utf8_seq); utf8_seq++)
2814         {
2815           if(hook != NULL)
2816           {
2817             *b++ = c;
2818             i++;
2819
2820             if(i%(destLen-1)==0)
2821             {
2822               *b = 0;
2823               msg.len = i;
2824               CallHookPkt(hook,&msg,buf);
2825
2826               b  = buf;
2827               *b = 0;
2828               i  = 0;
2829             }
2830           }
2831           else
2832           {
2833             if(n>=destLen)
2834               break;
2835
2836             *destPtr++ = c;
2837           }
2838
2839           n++;
2840         }
2841       }
2842
2843       if(hook != NULL)
2844       {
2845         msg.state = CSV_End;
2846         msg.len   = i;
2847         *b = 0;
2848         CallHookPkt(hook,&msg,buf);
2849       }
2850       else
2851       {
2852         *destPtr = 0;
2853       }
2854     }
2855   }
2856
2857   if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
2858     *destLenPtr = n;
2859
2860   RETURN(dest);
2861   return dest;
2862 }
2863
2864 ///
2865 /// CodesetsIsValidUTF8()
2866 #define GOOD_UCS(c) \
2867      ((c) >= 160 && ((c) & ~0x3ff) != 0xd800 && \
2868       (c) != 0xfeff && (c) != 0xfffe && (c) != 0xffff)
2869
2870 BOOL LIBFUNC CodesetsIsValidUTF8(REG(a0, STRPTR s))
2871 {
2872   STRPTR t = s;
2873   int n;
2874
2875   ENTER();
2876
2877   while((n = parseUtf8(&t)) != 0)
2878   {
2879     if(!GOOD_UCS(n))
2880     {
2881       RETURN(FALSE);
2882       return FALSE;
2883     }
2884   }
2885
2886   RETURN(TRUE);
2887   return TRUE;
2888 }
2889
2890 ///
2891 /// CodesetsConvertStrA()
2892 // Converts a given string from one source Codeset to a given destination
2893 // codeset and returns the convert string
2894 STRPTR LIBFUNC CodesetsConvertStrA(REG(a0, struct TagItem *attrs))
2895 {
2896   struct codeset *srcCodeset;
2897   STRPTR srcStr = NULL;
2898   STRPTR dstStr = NULL;
2899   ULONG srcLen = 0;
2900   ULONG dstLen = 0;
2901   ULONG charSize = 0;
2902
2903   ENTER();
2904
2905   // get the ptr to the src string we want to convert
2906   // from the source codeset to the dest codeset.
2907   srcStr = (STRPTR)GetTagData(CSA_Source, 0, attrs);
2908
2909   // get the pointer to the codeset in which the src string is encoded
2910   if((srcCodeset = (struct codeset *)GetTagData(CSA_SourceCodeset, 0, attrs)) == NULL)
2911     srcCodeset = defaultCodeset(TRUE);
2912
2913   if(srcStr != NULL)
2914   {
2915     if(srcCodeset == CodesetsBase->utf32Codeset)
2916     {
2917       srcLen = utf32_strlen((UTF32 *)srcStr);
2918       charSize = sizeof(UTF32);
2919     }
2920     else if(srcCodeset == CodesetsBase->utf16Codeset)
2921     {
2922       srcLen = utf16_strlen((UTF16 *)srcStr);
2923       charSize = sizeof(UTF16);
2924     }
2925     else
2926     {
2927       srcLen = strlen(srcStr);
2928       charSize = sizeof(char);
2929     }
2930   }
2931   else
2932     srcLen = 0;
2933   srcLen = GetTagData(CSA_SourceLen, srcLen, attrs);
2934
2935   if(srcStr != NULL && srcLen > 0)
2936   {
2937     struct codeset *dstCodeset;
2938
2939     // get the pointer to the codeset in which the dst string should be encoded
2940     if((dstCodeset = (struct codeset *)GetTagData(CSA_DestCodeset, 0, attrs)) == NULL)
2941       dstCodeset = defaultCodeset(TRUE);
2942
2943     D(DBF_UTF, "srcCodeset: '%s' dstCodeset: '%s'", srcCodeset->name, dstCodeset->name);
2944
2945     if(srcCodeset != NULL && dstCodeset != NULL)
2946     {
2947       // check that the user didn't supplied the very same codeset
2948       // or otherwise a conversion is not required.
2949       if(srcCodeset != dstCodeset)
2950       {
2951         BOOL utf8Create = FALSE;
2952         BOOL strCreate = FALSE;
2953         UTF8 *utf8str;
2954         ULONG utf8strLen = 0;
2955         ULONG *destLenPtr = NULL;
2956         BOOL mapForeignChars;
2957         struct Hook *mapForeignCharsHook;
2958
2959         mapForeignChars = (BOOL)GetTagData(CSA_MapForeignChars, FALSE, attrs);
2960         mapForeignCharsHook = (struct Hook *)GetTagData(CSA_MapForeignCharsHook, 0, attrs);
2961
2962         // if the source codeset is UTF-8 we don't have to use the UTF8Create()
2963         // function and can directly call the UTF8ToStr() function
2964         if(srcCodeset != CodesetsBase->utf8Codeset)
2965         {
2966           struct TagItem tags[] = { { CSA_SourceCodeset,  (IPTR)srcCodeset   },
2967                                     { CSA_Source,         (IPTR)srcStr       },
2968                                     { CSA_SourceLen,      srcLen             },
2969                                     { CSA_DestLenPtr,     (IPTR)&utf8strLen  },
2970                                     { TAG_DONE,           0                  } };
2971
2972           utf8str = CodesetsUTF8CreateA((struct TagItem *)&tags[0]);
2973
2974           utf8Create = TRUE;
2975         }
2976         else
2977         {
2978           utf8str = (UTF8 *)srcStr;
2979           utf8strLen = srcLen;
2980         }
2981
2982         // in case the destination codeset is UTF-8 we don't have to actually
2983         // use the UTF8ToStr() function and can immediately return our
2984         // UTF8 string
2985         if(utf8str != NULL && utf8strLen > 0 && dstCodeset != CodesetsBase->utf8Codeset)
2986         {
2987           struct TagItem tags[] = { { CSA_DestCodeset,          (IPTR)dstCodeset           },
2988                                     { CSA_Source,               (IPTR)utf8str              },
2989                                     { CSA_SourceLen,            utf8strLen                 },
2990                                     { CSA_DestLenPtr,           (IPTR)&dstLen              },
2991                                     { CSA_MapForeignChars,      mapForeignChars            },
2992                                     { CSA_MapForeignCharsHook,  (IPTR)mapForeignCharsHook  },
2993                                     { TAG_DONE,                 0                          } };
2994
2995           dstStr = CodesetsUTF8ToStrA((struct TagItem *)&tags[0]);
2996
2997           strCreate = TRUE;
2998         }
2999         else
3000         {
3001           dstStr = (STRPTR)utf8str;
3002           dstLen = utf8strLen;
3003         }
3004
3005         D(DBF_UTF, "srcStr: %lx srcLen: %ld dstStr: %lx dstLen: %ld utf8create: %ld strCreate: %ld", srcStr, srcLen,
3006                                                                                                      dstStr, dstLen,
3007                                                                                                      utf8Create,
3008                                                                                                      strCreate);
3009
3010         // if everything was successfull we can go and finalize everything
3011         if(dstStr != NULL && utf8str != NULL)
3012         {
3013           // as the conversion was a two way pass we have to either free the
3014           // memory of the utf8 string or not
3015           if(utf8Create == TRUE && strCreate == TRUE)
3016             CodesetsFreeA(utf8str, NULL);
3017
3018           // if the user wants to be informed abour the length
3019           // of our destination string we store the length now in the supplied ptr.
3020           if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3021             *destLenPtr = dstLen;
3022
3023           D(DBF_UTF, "successfully converted string with len %ld", dstLen);
3024         }
3025         else
3026         {
3027           W(DBF_ALWAYS, "an error occurred while trying to convert a string");
3028
3029           // free all memory in case the conversion didn't work out
3030           if(utf8Create == TRUE && utf8str != NULL)
3031             CodesetsFreeA(utf8str, NULL);
3032
3033           if(strCreate == TRUE && dstStr != NULL)
3034             CodesetsFreeA(dstStr, NULL);
3035
3036           dstStr = NULL;
3037         }
3038       }
3039       else
3040       {
3041         // we got the same source and destination codesets passed in
3042         // instead of failing silently we just create a copy of the source string
3043         ULONG *destLenPtr = NULL;
3044
3045         // allocate memory for the destination string, including a trailing NUL byte
3046         if((dstStr = allocArbitrateVecPooled(srcLen + charSize)) != NULL)
3047         {
3048           // just copy the source string without any further modification
3049           // we must use memcpy() as the source string could be UTF16/32 encoded and
3050           // thus strcpy() would not do what we want.
3051           memcpy(dstStr, srcStr, srcLen + charSize);
3052           dstLen = srcLen;
3053           D(DBF_UTF, "successfully copied string with len %ld", dstLen);
3054         }
3055         else
3056           W(DBF_ALWAYS, "no memory for dest string");
3057
3058         // if the user wants to be informed abour the length
3059         // of our destination string we store the length now in the supplied ptr.
3060         if((destLenPtr = (ULONG *)GetTagData(CSA_DestLenPtr, 0, attrs)) != NULL)
3061           *destLenPtr = dstLen;
3062       }
3063     }
3064   }
3065
3066   RETURN(dstStr);
3067   return dstStr;
3068 }
3069
3070 ///
3071 /// CodesetsFreeVecPooledA()
3072 void LIBFUNC CodesetsFreeVecPooledA(REG(a0, APTR pool), REG(a1, APTR mem), REG(a2, struct TagItem *attrs))
3073 {
3074   ENTER();
3075
3076   if(pool != NULL && mem != NULL)
3077   {
3078     struct SignalSemaphore *sem;
3079
3080     if((sem = (struct SignalSemaphore *)GetTagData(CSA_PoolSem, 0, attrs)) != NULL)
3081       ObtainSemaphore(sem);
3082
3083     freeVecPooled(pool,mem);
3084
3085     if(sem != NULL)
3086       ReleaseSemaphore(sem);
3087   }
3088
3089   LEAVE();
3090 }
3091
3092 ///
3093 /// CodesetsListCreateA()
3094 struct codesetList * LIBFUNC CodesetsListCreateA(REG(a0, struct TagItem *attrs))
3095 {
3096   struct codesetList *csList = NULL;
3097
3098   ENTER();
3099
3100   // no matter what, we create a codesets list we will return to the user
3101   if((csList = allocArbitrateVecPooled(sizeof(struct codesetList))) != NULL)
3102   {
3103     BOOL scanProgDir = TRUE;
3104     struct TagItem *tstate = attrs;
3105     struct TagItem *tag;
3106
3107     // initialize the new private codeset list and put it into a separate list
3108     NewList((struct List *)csList);
3109
3110     // first we get the path of the directory from which we go
3111     // and scan for charset tables from
3112     while((tag = NextTagItem((APTR)&tstate)) != NULL)
3113     {
3114       switch(tag->ti_Tag)
3115       {
3116         case CSA_CodesetDir:
3117         {
3118           codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3119
3120           scanProgDir = FALSE;
3121         }
3122         break;
3123
3124         case CSA_CodesetFile:
3125         {
3126           codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3127
3128           scanProgDir = FALSE;
3129         }
3130         break;
3131
3132         case CSA_SourceCodeset:
3133         {
3134           struct codeset *cs = (struct codeset *)tag->ti_Data;
3135
3136           AddTail((struct List *)csList, (struct Node *)&cs->node);
3137
3138           scanProgDir = FALSE;
3139         }
3140         break;
3141       }
3142     }
3143
3144     // in case the user also wants us to scan PROGDIR:
3145     // we do so
3146     if(scanProgDir == TRUE)
3147       codesetsScanDir(csList, "PROGDIR:Charsets");
3148   }
3149
3150   RETURN(csList);
3151   return csList;
3152 }
3153
3154 ///
3155 /// CodesetsListDeleteA()
3156 BOOL LIBFUNC CodesetsListDeleteA(REG(a0, struct TagItem *attrs))
3157 {
3158   BOOL result = FALSE;
3159   struct TagItem *tstate = attrs;
3160   struct TagItem *tag;
3161   BOOL freeCodesets;
3162
3163   ENTER();
3164
3165   // check if the caller wants us also to free the codesets
3166   freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3167
3168   // now we iterate through or tagItems and see what the
3169   // user wants to remove from the list
3170   while((tag = NextTagItem((APTR)&tstate)) != NULL)
3171   {
3172     switch(tag->ti_Tag)
3173     {
3174       case CSA_CodesetList:
3175       {
3176         struct codesetList *csList = (struct codesetList *)tag->ti_Data;
3177
3178         if(csList != NULL)
3179         {
3180           // cleanup the codesets within the list
3181           if(freeCodesets == TRUE)
3182             codesetsCleanup(csList);
3183
3184           // then free the list itself
3185           freeArbitrateVecPooled(csList);
3186
3187           result = TRUE;
3188         }
3189       }
3190       break;
3191     }
3192   }
3193
3194   RETURN(result);
3195   return result;
3196 }
3197
3198 ///
3199 /// CodesetsListAddA()
3200 BOOL LIBFUNC CodesetsListAddA(REG(a0, struct codesetList *csList), REG(a1, struct TagItem *attrs))
3201 {
3202   BOOL result = FALSE;
3203
3204   ENTER();
3205
3206   if(csList != NULL)
3207   {
3208     struct TagItem *tstate = attrs;
3209     struct TagItem *tag;
3210
3211     // now we iterate through or tagItems and see if the user
3212     // wants to scan a whole directory or just adds a file.
3213     while((tag = NextTagItem((APTR)&tstate)) != NULL)
3214     {
3215       switch(tag->ti_Tag)
3216       {
3217         case CSA_CodesetDir:
3218         {
3219           codesetsScanDir(csList, (STRPTR)tag->ti_Data);
3220           result = TRUE;
3221         }
3222         break;
3223
3224         case CSA_CodesetFile:
3225         {
3226           codesetsReadTable(csList, (STRPTR)tag->ti_Data);
3227           result = TRUE;
3228         }
3229         break;
3230
3231         case CSA_SourceCodeset:
3232         {
3233           struct codeset *cs = (struct codeset *)tag->ti_Data;
3234
3235           AddTail((struct List *)csList, (struct Node *)&cs->node);
3236           result = TRUE;
3237         }
3238         break;
3239       }
3240     }
3241   }
3242
3243   RETURN(result);
3244   return result;
3245 }
3246
3247 ///
3248 /// CodesetsListRemoveA()
3249 BOOL LIBFUNC CodesetsListRemoveA(REG(a0, struct TagItem *attrs))
3250 {
3251   BOOL result = FALSE;
3252   struct TagItem *tstate = attrs;
3253   struct TagItem *tag;
3254   BOOL freeCodesets;
3255
3256   ENTER();
3257
3258   // check if the caller wants us also to free the codesets
3259   freeCodesets = (BOOL)GetTagData(CSA_FreeCodesets, TRUE, attrs);
3260
3261   // now we iterate through or tagItems and see what the
3262   // user wants to remove from the list
3263   while((tag = NextTagItem((APTR)&tstate)) != NULL)
3264   {
3265     switch(tag->ti_Tag)
3266     {
3267       case CSA_SourceCodeset:
3268       {
3269         struct codeset *removeCS = (struct codeset *)tag->ti_Data;
3270
3271         if(removeCS != NULL)
3272         {
3273           struct Node *node;
3274           BOOL isExternalNode = TRUE;
3275
3276           ObtainSemaphore(&CodesetsBase->libSem);
3277
3278           // iterate over our internal list an check whether the given
3279           // node is part of that list
3280           for(node = GetHead((struct List *)&CodesetsBase->codesets); node != NULL; node = GetSucc(node))
3281           {
3282             if((struct codeset *)node == removeCS)
3283             {
3284               isExternalNode = FALSE;
3285               break;
3286             }
3287           }
3288
3289           ReleaseSemaphore(&CodesetsBase->libSem);
3290
3291           if(isExternalNode == TRUE)
3292           {
3293             Remove((struct Node *)removeCS);
3294
3295             // free all codesets data if requested
3296             if(freeCodesets == TRUE)
3297             {
3298               if(removeCS->name != NULL)
3299                 freeArbitrateVecPooled(removeCS->name);
3300               if(removeCS->alt_name != NULL)
3301                 freeArbitrateVecPooled(removeCS->alt_name);
3302               if(removeCS->characterization != NULL)
3303                 freeArbitrateVecPooled(removeCS->characterization);
3304
3305               freeArbitrateVecPooled(removeCS);
3306             }
3307
3308             result = TRUE;
3309           }
3310           else
3311             W(DBF_ALWAYS, "user tried to remove an internal codeset!");
3312         }
3313       }
3314       break;
3315     }
3316   }
3317
3318   RETURN(result);
3319   return result;
3320 }
3321
3322 ///
3323
3324 /**************************************************************************/