workbench/libs/codesetslib/src/convertUTF.c

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 /* ---------------------------------------------------------------------
  24
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  26     Author: Mark E. Davis, 1994.
  27     Rev History: Rick McGowan, fixes & updates May 2001.
  28     Sept 2001: fixed const & error conditions per
  29         mods suggested by S. Parent & A. Lillich.
  30     June 2002: Tim Dodd added detection and handling of incomplete
  31         source sequences, enhanced error detection, added casts
  32         to eliminate compiler warnings.
  33     July 2003: slight mods to back out aggressive FFFE detection.
  34     Jan 2004: updated switches in from-UTF8 conversions.
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  36
  37     See the header file "ConvertUTF.h" for complete documentation.
  38
  39 ------------------------------------------------------------------------ */
  40
  41 #include "lib.h"
  42 #include "convertUTF.h"
  43
  44 #ifndef __AROS__
  45 #include "SDI_lib.h"
  46 #endif /* __AROS__ */
  47
  48 #include "debug.h"
  49
  50 /***********************************************************************/
  51
  52 static const int halfShift = 10;    /* used for shifting by 10 bits */
  53
  54 static const UTF32 halfBase = 0x0010000UL;
  55 static const UTF32 halfMask = 0x3FFUL;
  56
  57 #define UNI_SUR_HIGH_START  (UTF32)0xD800
  58 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
  59 #define UNI_SUR_LOW_START   (UTF32)0xDC00
  60 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
  61
  62 /***********************************************************************/
  63
  64 #ifdef __AROS__
  65 AROS_LH5(ULONG, CodesetsConvertUTF32toUTF16,
  66     AROS_LHA(const UTF32 **, sourceStart, A0),
  67     AROS_LHA(const UTF32 *, sourceEnd, A1),
  68     AROS_LHA(UTF16 **, targetStart, A2),
  69     AROS_LHA(UTF16 *, targetEnd, A3),
  70     AROS_LHA(ULONG, flags, D0),
  71     struct LibraryHeader *, library, 5, Codesets
  72 )
  73 {
  74     AROS_LIBFUNC_INIT
  75 #else
  76 ULONG LIBFUNC
  77 CodesetsConvertUTF32toUTF16(REG(a0, const UTF32 ** sourceStart),
  78                             REG(a1, const UTF32 * sourceEnd),
  79                             REG(a2, UTF16 ** targetStart),
  80                             REG(a3, UTF16 * targetEnd),
  81                             REG(d0, ULONG flags))
  82 {
  83 #endif
  84   ULONG result = CSR_ConversionOK;
  85   const UTF32 *source = *sourceStart;
  86   UTF16 *target = *targetStart;
  87
  88   ENTER();
  89
  90   while(source < sourceEnd)
  91   {
  92     UTF32 ch;
  93
  94     if(target >= targetEnd)
  95     {
  96       result = CSR_TargetExhausted;
  97       break;
  98     }
  99
 100     ch = *source++;
 101     if(ch <= UNI_MAX_BMP)
 102     {
 103       /* Target is a character <= 0xFFFF */
 104       /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
 105       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 106       {
 107         if(flags == CSF_StrictConversion)
 108         {
 109           --source;   /* return to the illegal value itself */
 110           result = CSR_SourceIllegal;
 111           break;
 112         }
 113         else
 114         {
 115           *target++ = UNI_REPLACEMENT_CHAR;
 116         }
 117       }
 118       else
 119       {
 120         *target++ = (UTF16)ch; /* normal case */
 121       }
 122     }
 123     else if(ch > UNI_MAX_LEGAL_UTF32)
 124     {
 125       if(flags == CSF_StrictConversion)
 126       {
 127         result = CSR_SourceIllegal;
 128       }
 129       else
 130       {
 131         *target++ = UNI_REPLACEMENT_CHAR;
 132       }
 133     }
 134     else
 135     {
 136       /* target is a character in range 0xFFFF - 0x10FFFF. */
 137       if(target + 1 >= targetEnd)
 138       {
 139         --source;      /* Back up source pointer! */
 140         result = CSR_TargetExhausted;
 141         break;
 142       }
 143       ch -= halfBase;
 144       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 145       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 146     }
 147   }
 148
 149   *sourceStart = source;
 150   *targetStart = target;
 151
 152   RETURN(result);
 153   return result;
 154 #ifdef __AROS__
 155     AROS_LIBFUNC_EXIT
 156 #endif
 157 }
 158
 159 #ifndef __AROS__
 160 LIBSTUB(CodesetsConvertUTF32toUTF16, ULONG, REG(a0, const UTF32 ** sourceStart),
 161                                             REG(a1, const UTF32 * sourceEnd),
 162                                             REG(a2, UTF16 ** targetStart),
 163                                             REG(a3, UTF16 * targetEnd),
 164                                             REG(d0, ULONG flags))
 165 {
 166   #ifdef __MORPHOS__
 167   return CodesetsConvertUTF32toUTF16((const UTF32 **)REG_A0, (const UTF32 *)REG_A1, (UTF16 **)REG_A2, (UTF16 *)REG_A3, (ULONG)REG_D0);
 168   #else
 169   return CodesetsConvertUTF32toUTF16(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 170   #endif
 171 }
 172 #endif
 173
 174 /***********************************************************************/
 175
 176 #ifdef __AROS__
 177 AROS_LH5(ULONG, CodesetsConvertUTF16toUTF32,
 178     AROS_LHA(const  UTF16 **, sourceStart, A0),
 179     AROS_LHA(const UTF16 *, sourceEnd, A1),
 180     AROS_LHA(UTF32 **, targetStart, A2),
 181     AROS_LHA(UTF32 *, targetEnd, A3),
 182     AROS_LHA(ULONG, flags, D0),
 183     struct LibraryHeader *, library, 6, Codesets
 184 )
 185 {
 186     AROS_LIBFUNC_INIT
 187 #else
 188 ULONG LIBFUNC
 189 CodesetsConvertUTF16toUTF32(REG(a0, const UTF16 ** sourceStart),
 190                             REG(a1, const UTF16 * sourceEnd),
 191                             REG(a2, UTF32 ** targetStart),
 192                             REG(a3, UTF32 * targetEnd),
 193                             REG(d0, ULONG flags))
 194 {
 195 #endif
 196   ULONG result = CSR_ConversionOK;
 197   const UTF16 *source = *sourceStart;
 198   UTF32 *target = *targetStart;
 199   UTF32 ch=0, ch2=0;
 200
 201   ENTER();
 202
 203   while(source < sourceEnd)
 204   {
 205     const UTF16 *oldSource = source;    /*  In case we have to back up because of target overflow. */
 206
 207     ch = *source++;
 208     /* If we have a surrogate pair, convert to UTF32 first. */
 209     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 210     {
 211       /* If the 16 bits following the high surrogate are in the source buffer... */
 212       if(source < sourceEnd)
 213       {
 214         ch2 = *source;
 215
 216         /* If it's a low surrogate, convert to UTF32. */
 217         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 218         {
 219           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 220                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 221
 222           ++source;
 223         }
 224         else if(flags == CSF_StrictConversion)
 225         {
 226           /* it's an unpaired high surrogate */
 227           --source;   /* return to the illegal value itself */
 228           result = CSR_SourceIllegal;
 229
 230           break;
 231         }
 232       }
 233       else
 234       {
 235         /* We don't have the 16 bits following the high surrogate. */
 236         --source;       /* return to the high surrogate */
 237         result = CSR_SourceExhausted;
 238
 239         break;
 240       }
 241     }
 242     else if (flags == CSF_StrictConversion)
 243     {
 244       /* UTF-16 surrogate values are illegal in UTF-32 */
 245       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 246       {
 247         --source;       /* return to the illegal value itself */
 248         result = CSR_SourceIllegal;
 249
 250         break;
 251       }
 252     }
 253
 254     if(target >= targetEnd)
 255     {
 256       source = oldSource; /* Back up source pointer! */
 257       result = CSR_TargetExhausted;
 258
 259       break;
 260     }
 261     *target++ = ch;
 262   }
 263
 264   *sourceStart = source;
 265   *targetStart = target;
 266
 267   #if defined(DEBUG)
 268   if(result == CSR_SourceIllegal)
 269   {
 270     E(DBF_UTF, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
 271   }
 272   #endif
 273
 274   RETURN(result);
 275   return result;
 276 #ifdef __AROS__
 277     AROS_LIBFUNC_EXIT
 278 #endif
 279 }
 280
 281 #ifndef __AROS__
 282 LIBSTUB(CodesetsConvertUTF16toUTF32, ULONG, REG(a0, const UTF16 ** sourceStart),
 283                                             REG(a1, const UTF16 * sourceEnd),
 284                                             REG(a2, UTF32 ** targetStart),
 285                                             REG(a3, UTF32 * targetEnd),
 286                                             REG(d0, ULONG flags))
 287 {
 288   #ifdef __MORPHOS__
 289   return CodesetsConvertUTF16toUTF32((const UTF16 **)REG_A0, (const UTF16 *)REG_A1, (UTF32 **)REG_A2, (UTF32 *)REG_A3, (ULONG)REG_D0);
 290   #else
 291   return CodesetsConvertUTF16toUTF32(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 292   #endif
 293 }
 294 #endif
 295
 296 /***********************************************************************/
 297
 298 /*
 299  * Index into the table below with the first byte of a UTF-8 sequence to
 300  * get the number of trailing bytes that are supposed to follow it.
 301  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
 302  * left as-is for anyone who may want to do such conversion, which was
 303  * allowed in earlier algorithms.
 304  */
 305 const char trailingBytesForUTF8[256] = {
 306     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 307     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 308     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 309     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 310     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 311     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 312     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 313     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
 314 };
 315
 316 /*
 317  * Magic values subtracted from a buffer value during UTF8 conversion.
 318  * This table contains as many values as there might be trailing bytes
 319  * in a UTF-8 sequence.
 320  */
 321 static const UTF32 offsetsFromUTF8[6] = {
 322     0x00000000UL, 0x00003080UL, 0x000E2080UL,
 323     0x03C82080UL, 0xFA082080UL, 0x82082080UL
 324 };
 325
 326 /*
 327  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 328  * into the first byte, depending on how many bytes follow.  There are
 329  * as many entries in this table as there are UTF-8 sequence types.
 330  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
 331  * for *legal* UTF-8 will be 4 or fewer bytes total.
 332  */
 333 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 334
 335 /***********************************************************************/
 336
 337 /* The interface converts a whole buffer to avoid function-call overhead.
 338  * Constants have been gathered. Loops & conditionals have been removed as
 339  * much as possible for efficiency, in favor of drop-through switches.
 340  * (See "Note A" at the bottom of the file for equivalent code.)
 341  * If your compiler supports it, the "isLegalUTF8" call can be turned
 342  * into an inline function.
 343  */
 344
 345 /***********************************************************************/
 346
 347 #ifdef __AROS__
 348 AROS_LH5(ULONG, CodesetsConvertUTF16toUTF8,
 349     AROS_LHA(const UTF16 **, sourceStart, A0),
 350     AROS_LHA(const UTF16 *, sourceEnd, A1),
 351     AROS_LHA(UTF8 **, targetStart, A2),
 352     AROS_LHA(UTF8 *, targetEnd, A3),
 353     AROS_LHA(ULONG, flags, D0),
 354     struct LibraryHeader *, library, 7, Codesets
 355 )
 356 {
 357     AROS_LIBFUNC_INIT
 358 #else
 359 ULONG LIBFUNC
 360 CodesetsConvertUTF16toUTF8(REG(a0, const UTF16 ** sourceStart),
 361                            REG(a1, const UTF16 * sourceEnd),
 362                            REG(a2, UTF8 ** targetStart),
 363                            REG(a3, UTF8 * targetEnd),
 364                            REG(d0, ULONG flags))
 365 {
 366 #endif
 367   ULONG result = CSR_ConversionOK;
 368   const UTF16 *source = *sourceStart;
 369   UTF8 *target = *targetStart;
 370
 371   ENTER();
 372
 373   while(source < sourceEnd)
 374   {
 375     UTF32 ch;
 376     unsigned short bytesToWrite = 0;
 377     const UTF32 byteMask = 0xBF;
 378     const UTF32 byteMark = 0x80;
 379     const UTF16 *oldSource = source;    /* In case we have to back up because of target overflow. */
 380
 381     ch = *source++;
 382
 383     /* If we have a surrogate pair, convert to UTF32 first. */
 384     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 385     {
 386       /* If the 16 bits following the high surrogate are in the source buffer... */
 387       if(source < sourceEnd)
 388       {
 389         UTF32 ch2 = *source;
 390
 391         /* If it's a low surrogate, convert to UTF32. */
 392         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 393         {
 394           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 395                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 396
 397           ++source;
 398         }
 399         else if(flags == CSF_StrictConversion)
 400         {
 401           /* it's an unpaired high surrogate */
 402           --source;   /* return to the illegal value itself */
 403           result = CSR_SourceIllegal;
 404           break;
 405         }
 406       }
 407       else
 408       {
 409         /* We don't have the 16 bits following the high surrogate. */
 410         --source;       /* return to the high surrogate */
 411         result = CSR_SourceExhausted;
 412
 413         break;
 414       }
 415     }
 416     else if(flags == CSF_StrictConversion)
 417     {
 418       /* UTF-16 surrogate values are illegal in UTF-32 */
 419       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 420       {
 421         --source;       /* return to the illegal value itself */
 422         result = CSR_SourceIllegal;
 423
 424         break;
 425       }
 426     }
 427     /* Figure out how many bytes the result will require */
 428     if(ch < (UTF32) 0x80)
 429     {
 430       bytesToWrite = 1;
 431     }
 432     else if (ch < (UTF32) 0x800)
 433     {
 434       bytesToWrite = 2;
 435     }
 436     else if (ch < (UTF32) 0x10000)
 437     {
 438       bytesToWrite = 3;
 439     }
 440     else if (ch < (UTF32) 0x110000)
 441     {
 442       bytesToWrite = 4;
 443     }
 444     else
 445     {
 446       bytesToWrite = 3;
 447       ch = UNI_REPLACEMENT_CHAR;
 448     }
 449
 450     target += bytesToWrite;
 451     if(target > targetEnd)
 452     {
 453       source = oldSource; /* Back up source pointer! */
 454       target -= bytesToWrite;
 455       result = CSR_TargetExhausted;
 456
 457       break;
 458     }
 459     switch(bytesToWrite)
 460     {
 461       /* note: everything falls through. */
 462       case 4:
 463         *--target = (UTF8) ((ch | byteMark) & byteMask);
 464         ch >>= 6;
 465
 466       case 3:
 467         *--target = (UTF8) ((ch | byteMark) & byteMask);
 468         ch >>= 6;
 469
 470       case 2:
 471         *--target = (UTF8) ((ch | byteMark) & byteMask);
 472         ch >>= 6;
 473
 474       case 1:
 475         *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 476     }
 477
 478     target += bytesToWrite;
 479   }
 480
 481   *sourceStart = source;
 482   *targetStart = target;
 483
 484   RETURN(result);
 485   return result;
 486 #ifdef __AROS__
 487     AROS_LIBFUNC_EXIT
 488 #endif
 489 }
 490
 491 #ifndef __AROS__
 492 LIBSTUB(CodesetsConvertUTF16toUTF8, ULONG, REG(a0, const UTF16 ** sourceStart),
 493                                            REG(a1, const UTF16 * sourceEnd),
 494                                            REG(a2, UTF8 ** targetStart),
 495                                            REG(a3, UTF8 * targetEnd),
 496                                            REG(d0, ULONG flags))
 497 {
 498   #ifdef __MORPHOS__
 499   return CodesetsConvertUTF16toUTF8((const UTF16 **)REG_A0, (const UTF16 *)REG_A1, (UTF8 **)REG_A2, (UTF8 *)REG_A3, (ULONG)REG_D0);
 500   #else
 501   return CodesetsConvertUTF16toUTF8(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 502   #endif
 503 }
 504 #endif
 505
 506 /***********************************************************************/
 507
 508 /*
 509  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 510  * This must be called with the length pre-determined by the first byte.
 511  * If not calling this from ConvertUTF8to*, then the length can be set by:
 512  *  length = trailingBytesForUTF8[*source]+1;
 513  * and the sequence is illegal right away if there aren't that many bytes
 514  * available.
 515  * If presented with a length > 4, this returns FALSE.  The Unicode
 516  * definition of UTF-8 goes up to 4-byte sequences.
 517  */
 518
 519 #ifdef __AROS__
 520 AROS_LH2(BOOL, CodesetsIsLegalUTF8,
 521     AROS_LHA(const UTF8 *, source, A0),
 522     AROS_LHA(ULONG, length, D0),
 523     struct LibraryHeader *, library, 8, Codesets
 524 )
 525 {
 526     AROS_LIBFUNC_INIT
 527 #else
 528 BOOL LIBFUNC
 529 CodesetsIsLegalUTF8(REG(a0, const UTF8 * source),
 530                                 REG(d0, ULONG length))
 531 {
 532 #endif
 533   UTF8 a;
 534   const UTF8 *srcptr = source + length;
 535
 536   ENTER();
 537
 538   switch(length)
 539   {
 540     default:
 541       RETURN(FALSE);
 542       return FALSE;
 543
 544     /* Everything else falls through when "TRUE"... */
 545     case 4:
 546       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 547       {
 548         RETURN(FALSE);
 549         return FALSE;
 550       }
 551
 552     case 3:
 553       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 554       {
 555         RETURN(FALSE);
 556         return FALSE;
 557       }
 558
 559     case 2:
 560       if((a = (*--srcptr)) > 0xBF)
 561       {
 562         RETURN(FALSE);
 563         return FALSE;
 564       }
 565
 566       switch (*source)
 567       {
 568         /* no fall-through in this inner switch */
 569         case 0xE0:
 570           if(a < 0xA0)
 571           {
 572             RETURN(FALSE);
 573             return FALSE;
 574           }
 575         break;
 576
 577         case 0xED:
 578           if(a > 0x9F)
 579           {
 580             RETURN(FALSE);
 581             return FALSE;
 582           }
 583         break;
 584
 585         case 0xF0:
 586           if(a < 0x90)
 587           {
 588             RETURN(FALSE);
 589             return FALSE;
 590           }
 591           break;
 592
 593         case 0xF4:
 594           if(a > 0x8F)
 595           {
 596             RETURN(FALSE);
 597             return FALSE;
 598           }
 599         break;
 600
 601         default:
 602           if(a < 0x80)
 603           {
 604             RETURN(FALSE);
 605             return FALSE;
 606           }
 607       }
 608
 609     case 1:
 610       if(*source >= 0x80 && *source < 0xC2)
 611       {
 612         RETURN(FALSE);
 613         return FALSE;
 614       }
 615   }
 616
 617   if(*source > 0xF4)
 618   {
 619     RETURN(FALSE);
 620     return FALSE;
 621   }
 622
 623   RETURN(TRUE);
 624   return TRUE;
 625 #ifdef __AROS__
 626     AROS_LIBFUNC_EXIT
 627 #endif
 628 }
 629
 630 #ifndef __AROS__
 631 LIBSTUB(CodesetsIsLegalUTF8, BOOL, REG(a0, const UTF8 * source),
 632                                                 REG(d0, ULONG length))
 633 {
 634   #ifdef __MORPHOS__
 635   return CodesetsIsLegalUTF8((const UTF8 *)REG_A0,(ULONG)REG_D0);
 636   #else
 637   return CodesetsIsLegalUTF8(source, length);
 638   #endif
 639 }
 640 #endif
 641
 642 /***********************************************************************/
 643
 644 /*
 645  * Exported function to return whether a UTF-8 sequence is legal or not.
 646  * This is not used here; it's just exported.
 647  */
 648
 649 #ifdef __AROS__
 650 AROS_LH2(BOOL, CodesetsIsLegalUTF8Sequence,
 651     AROS_LHA(const UTF8 *, source, A0),
 652     AROS_LHA(const UTF8 *, sourceEnd, D1),
 653     struct LibraryHeader *, library, 9, Codesets
 654 )
 655 {
 656     AROS_LIBFUNC_INIT
 657 #else
 658 BOOL LIBFUNC
 659 CodesetsIsLegalUTF8Sequence(REG(a0, const UTF8 * source),
 660                             REG(a1, const UTF8 * sourceEnd))
 661 {
 662 #endif
 663   int length = trailingBytesForUTF8[*source] + 1;
 664   BOOL res = FALSE;
 665
 666   ENTER();
 667
 668   if(source + length > sourceEnd)
 669   {
 670     RETURN(FALSE);
 671     return FALSE;
 672   }
 673
 674   res = CodesetsIsLegalUTF8(source, length);
 675
 676   RETURN(res);
 677   return res;
 678 #ifdef __AROS__
 679     AROS_LIBFUNC_EXIT
 680 #endif
 681 }
 682
 683 #ifndef __AROS__
 684 LIBSTUB(CodesetsIsLegalUTF8Sequence, BOOL, REG(a0, const UTF8 * source),
 685                                             REG(a1, const UTF8 * sourceEnd))
 686 {
 687   #ifdef __MORPHOS__
 688   return CodesetsIsLegalUTF8Sequence((const UTF8 *)REG_A0,(const UTF8 *)REG_A1);
 689   #else
 690   return CodesetsIsLegalUTF8Sequence(source, sourceEnd);
 691   #endif
 692 }
 693 #endif
 694
 695 /***********************************************************************/
 696
 697 #ifdef __AROS__
 698 AROS_LH5(ULONG, CodesetsConvertUTF8toUTF16,
 699     AROS_LHA(const UTF8 **, sourceStart, A0),
 700     AROS_LHA(const UTF8 *, sourceEnd, A1),
 701     AROS_LHA(UTF16 **, targetStart, A2),
 702     AROS_LHA(UTF16 *, targetEnd, A3),
 703     AROS_LHA(ULONG, flags, D0),
 704     struct LibraryHeader *, library, 10, Codesets
 705 )
 706 {
 707     AROS_LIBFUNC_INIT
 708 #else
 709 ULONG LIBFUNC
 710 CodesetsConvertUTF8toUTF16(REG(a0, const UTF8 ** sourceStart),
 711                            REG(a1, const UTF8 * sourceEnd),
 712                            REG(a2, UTF16 ** targetStart),
 713                            REG(a3, UTF16 * targetEnd),
 714                            REG(d0, ULONG flags))
 715 {
 716 #endif
 717   ULONG result = CSR_ConversionOK;
 718   const UTF8 *source = *sourceStart;
 719   UTF16 *target = *targetStart;
 720
 721   ENTER();
 722
 723   while(source < sourceEnd)
 724   {
 725     UTF32 ch = 0;
 726     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 727
 728     if(source + extraBytesToRead >= sourceEnd)
 729     {
 730       result = CSR_SourceExhausted;
 731       break;
 732     }
 733
 734     /* Do this check whether lenient or strict */
 735     if(!CodesetsIsLegalUTF8 (source, extraBytesToRead + 1))
 736     {
 737       result = CSR_SourceIllegal;
 738       break;
 739     }
 740
 741     /*
 742      * The cases all fall through. See "Note A" below.
 743      */
 744     switch (extraBytesToRead)
 745     {
 746       case 5:
 747         ch += *source++;
 748         ch <<= 6;       /* remember, illegal UTF-8 */
 749
 750       case 4:
 751         ch += *source++;
 752         ch <<= 6;       /* remember, illegal UTF-8 */
 753
 754       case 3:
 755         ch += *source++;
 756         ch <<= 6;
 757
 758       case 2:
 759         ch += *source++;
 760         ch <<= 6;
 761
 762       case 1:
 763         ch += *source++;
 764         ch <<= 6;
 765
 766       case 0:
 767         ch += *source++;
 768     }
 769
 770     ch -= offsetsFromUTF8[extraBytesToRead];
 771
 772     if(target >= targetEnd)
 773     {
 774       source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 775       result = CSR_TargetExhausted;
 776
 777       break;
 778     }
 779
 780     if(ch <= UNI_MAX_BMP)
 781     {
 782       /* Target is a character <= 0xFFFF */
 783       /* UTF-16 surrogate values are illegal in UTF-32 */
 784       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 785       {
 786         if(flags == CSF_StrictConversion)
 787         {
 788           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 789           result = CSR_SourceIllegal;
 790
 791           break;
 792         }
 793         else
 794         {
 795           *target++ = UNI_REPLACEMENT_CHAR;
 796         }
 797       }
 798       else
 799       {
 800         *target++ = (UTF16) ch; /* normal case */
 801       }
 802     }
 803     else if(ch > UNI_MAX_UTF16)
 804     {
 805       if(flags == CSF_StrictConversion)
 806       {
 807         result = CSR_SourceIllegal;
 808         source -= (extraBytesToRead + 1);   /* return to the start */
 809
 810         break;          /* Bail out; shouldn't continue */
 811       }
 812       else
 813       {
 814         *target++ = UNI_REPLACEMENT_CHAR;
 815       }
 816     }
 817     else
 818     {
 819       /* target is a character in range 0xFFFF - 0x10FFFF. */
 820       if(target + 1 >= targetEnd)
 821       {
 822         source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 823         result = CSR_TargetExhausted;
 824
 825         break;
 826       }
 827
 828       ch -= halfBase;
 829       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 830       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 831     }
 832   }
 833
 834   *sourceStart = source;
 835   *targetStart = target;
 836
 837   RETURN(result);
 838   return result;
 839 #ifdef __AROS__
 840     AROS_LIBFUNC_EXIT
 841 #endif
 842 }
 843
 844 #ifndef __AROS__
 845 LIBSTUB(CodesetsConvertUTF8toUTF16, ULONG, REG(a0, const UTF8 ** sourceStart),
 846                                            REG(a1, const UTF8 * sourceEnd),
 847                                            REG(a2, UTF16 ** targetStart),
 848                                            REG(a3, UTF16 * targetEnd),
 849                                            REG(d0, ULONG flags))
 850 {
 851   #ifdef __MORPHOS__
 852   return CodesetsConvertUTF8toUTF16((const UTF8 **)REG_A0, (const UTF8 *)REG_A1, (UTF16 **)REG_A2, (UTF16 *)REG_A3, (ULONG)REG_D0);
 853   #else
 854   return CodesetsConvertUTF8toUTF16(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 855   #endif
 856 }
 857 #endif
 858
 859 /***********************************************************************/
 860
 861 #ifdef __AROS__
 862 AROS_LH5(ULONG, CodesetsConvertUTF32toUTF8,
 863     AROS_LHA(const UTF32 **, sourceStart, A0),
 864     AROS_LHA(const UTF32 *, sourceEnd, A1),
 865     AROS_LHA(UTF8 **, targetStart, A2),
 866     AROS_LHA(UTF8 *, targetEnd, A3),
 867     AROS_LHA(ULONG, flags, D0),
 868     struct LibraryHeader *, library, 11, Codesets
 869 )
 870 {
 871     AROS_LIBFUNC_INIT
 872 #else
 873 ULONG LIBFUNC
 874 CodesetsConvertUTF32toUTF8(REG(a0, const UTF32 ** sourceStart),
 875                            REG(a1, const UTF32 * sourceEnd),
 876                            REG(a2, UTF8 ** targetStart),
 877                            REG(a3, UTF8 * targetEnd),
 878                            REG(d0, ULONG flags))
 879 {
 880 #endif
 881   ULONG result = CSR_ConversionOK;
 882   const UTF32 *source = *sourceStart;
 883   UTF8 *target = *targetStart;
 884
 885   ENTER();
 886
 887   while(source < sourceEnd)
 888   {
 889     UTF32 ch;
 890     unsigned short bytesToWrite = 0;
 891     const UTF32 byteMask = 0xBF;
 892     const UTF32 byteMark = 0x80;
 893
 894     ch = *source++;
 895
 896     if(flags == CSF_StrictConversion)
 897     {
 898       /* UTF-16 surrogate values are illegal in UTF-32 */
 899       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 900       {
 901         --source;       /* return to the illegal value itself */
 902         result = CSR_SourceIllegal;
 903
 904         break;
 905       }
 906     }
 907
 908     /*
 909      * Figure out how many bytes the result will require. Turn any
 910      * illegally large UTF32 things (> Plane 17) into replacement chars.
 911     */
 912     if(ch < (UTF32) 0x80)
 913     {
 914       bytesToWrite = 1;
 915     }
 916     else if(ch < (UTF32) 0x800)
 917     {
 918       bytesToWrite = 2;
 919     }
 920     else if(ch < (UTF32) 0x10000)
 921     {
 922       bytesToWrite = 3;
 923     }
 924     else if(ch <= UNI_MAX_LEGAL_UTF32)
 925     {
 926       bytesToWrite = 4;
 927     }
 928     else
 929     {
 930       bytesToWrite = 3;
 931       ch = UNI_REPLACEMENT_CHAR;
 932       result = CSR_SourceIllegal;
 933     }
 934
 935     target += bytesToWrite;
 936     if(target > targetEnd)
 937     {
 938       --source;           /* Back up source pointer! */
 939       target -= bytesToWrite;
 940       result = CSR_TargetExhausted;
 941
 942       break;
 943     }
 944     switch(bytesToWrite)
 945     {
 946       /* note: everything falls through. */
 947       case 4:
 948         *--target = (UTF8) ((ch | byteMark) & byteMask);
 949         ch >>= 6;
 950
 951       case 3:
 952         *--target = (UTF8) ((ch | byteMark) & byteMask);
 953         ch >>= 6;
 954
 955       case 2:
 956         *--target = (UTF8) ((ch | byteMark) & byteMask);
 957         ch >>= 6;
 958
 959       case 1:
 960         *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 961     }
 962
 963     target += bytesToWrite;
 964   }
 965
 966   *sourceStart = source;
 967   *targetStart = target;
 968
 969   RETURN(result);
 970   return result;
 971 #ifdef __AROS__
 972     AROS_LIBFUNC_EXIT
 973 #endif
 974 }
 975
 976 #ifndef __AROS__
 977 LIBSTUB(CodesetsConvertUTF32toUTF8, ULONG, REG(a0, const UTF32 ** sourceStart),
 978                                            REG(a1, const UTF32 * sourceEnd),
 979                                            REG(a2, UTF8 ** targetStart),
 980                                            REG(a3, UTF8 * targetEnd),
 981                                            REG(d0, ULONG flags))
 982 {
 983   #ifdef __MORPHOS__
 984   return CodesetsConvertUTF32toUTF8((const UTF32 **)REG_A0, (const UTF32 *)REG_A1, (UTF8 **)REG_A2, (UTF8 *)REG_A3, (ULONG)REG_D0);
 985   #else
 986   return CodesetsConvertUTF32toUTF8(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 987   #endif
 988 }
 989 #endif
 990
 991 /***********************************************************************/
 992
 993 #ifdef __AROS__
 994 AROS_LH5(ULONG, CodesetsConvertUTF8toUTF32,
 995     AROS_LHA(const UTF8 **, sourceStart, A0),
 996     AROS_LHA(const UTF8 *, sourceEnd, A1),
 997     AROS_LHA(UTF32 **, targetStart, A2),
 998     AROS_LHA(UTF32 *, targetEnd, A3),
 999     AROS_LHA(ULONG, flags, D0),
1000     struct LibraryHeader *, library, 12, Codesets
1001 )
1002 {
1003     AROS_LIBFUNC_INIT
1004 #else
1005 ULONG LIBFUNC
1006 CodesetsConvertUTF8toUTF32(REG(a0, const UTF8 ** sourceStart),
1007                            REG(a1, const UTF8 * sourceEnd),
1008                            REG(a2, UTF32 ** targetStart),
1009                            REG(a3, UTF32 * targetEnd),
1010                            REG(d0, ULONG flags))
1011 {
1012 #endif
1013   ULONG result = CSR_ConversionOK;
1014   const UTF8 *source = *sourceStart;
1015   UTF32 *target = *targetStart;
1016
1017   ENTER();
1018
1019   while(source < sourceEnd)
1020   {
1021     UTF32 ch = 0;
1022     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
1023
1024     if(source + extraBytesToRead >= sourceEnd)
1025     {
1026       result = CSR_SourceExhausted;
1027       break;
1028     }
1029
1030     /* Do this check whether lenient or strict */
1031     if(!CodesetsIsLegalUTF8(source, extraBytesToRead + 1))
1032     {
1033       result = CSR_SourceIllegal;
1034       break;
1035     }
1036
1037     /*
1038      * The cases all fall through. See "Note A" below.
1039     */
1040     switch (extraBytesToRead)
1041     {
1042       case 5:
1043         ch += *source++;
1044         ch <<= 6;
1045
1046       case 4:
1047         ch += *source++;
1048         ch <<= 6;
1049
1050       case 3:
1051         ch += *source++;
1052         ch <<= 6;
1053
1054       case 2:
1055         ch += *source++;
1056         ch <<= 6;
1057
1058       case 1:
1059         ch += *source++;
1060         ch <<= 6;
1061
1062       case 0:
1063         ch += *source++;
1064     }
1065
1066     ch -= offsetsFromUTF8[extraBytesToRead];
1067
1068     if(target >= targetEnd)
1069     {
1070       source -= (extraBytesToRead + 1);   /* Back up the source pointer! */
1071       result = CSR_TargetExhausted;
1072
1073       break;
1074     }
1075
1076     if(ch <= UNI_MAX_LEGAL_UTF32)
1077     {
1078       /*
1079        * UTF-16 surrogate values are illegal in UTF-32, and anything
1080        * over Plane 17 (> 0x10FFFF) is illegal.
1081       */
1082       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
1083       {
1084         if(flags == CSF_StrictConversion)
1085         {
1086           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
1087           result = CSR_SourceIllegal;
1088
1089           break;
1090         }
1091         else
1092         {
1093           *target++ = UNI_REPLACEMENT_CHAR;
1094         }
1095       }
1096       else
1097       {
1098         *target++ = ch;
1099       }
1100     }
1101     else
1102     {
1103       /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
1104       result = CSR_SourceIllegal;
1105       *target++ = UNI_REPLACEMENT_CHAR;
1106     }
1107   }
1108
1109   *sourceStart = source;
1110   *targetStart = target;
1111
1112   RETURN(result);
1113   return result;
1114 #ifdef __AROS__
1115     AROS_LIBFUNC_EXIT
1116 #endif
1117 }
1118
1119 #ifndef __AROS__
1120 LIBSTUB(CodesetsConvertUTF8toUTF32, ULONG, REG(a0, const UTF8 ** sourceStart),
1121                                            REG(a1, const UTF8 * sourceEnd),
1122                                            REG(a2, UTF32 ** targetStart),
1123                                            REG(a3, UTF32 * targetEnd),
1124                                            REG(d0, ULONG flags))
1125 {
1126   #ifdef __MORPHOS__
1127   return CodesetsConvertUTF8toUTF32((const UTF8 **)REG_A0, (const UTF8 *)REG_A1, (UTF32 **)REG_A2, (UTF32 *)REG_A3, (ULONG)REG_D0);
1128   #else
1129   return CodesetsConvertUTF8toUTF32(sourceStart, sourceEnd, targetStart, targetEnd, flags);
1130   #endif
1131 }
1132 #endif
1133
/***********************************************************************

    Note A.
    The fall-through switches in the UTF-8 reading code save a
    temporary variable, some decrements and some conditionals. The
    switches are equivalent to the following loop:
    {
        int tmpBytesToRead = extraBytesToRead + 1;
        do {
            ch += *source++;
            --tmpBytesToRead;
            if (tmpBytesToRead) ch <<= 6;
        } while (tmpBytesToRead > 0);
    }
    In the UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

***********************************************************************/