workbench/libs/codesetslib/src/convertUTF.c

   1 /*
   2  * Copyright 2001-2004 Unicode, Inc.
   3  *
   4  * Disclaimer
   5  *
   6  * This source code is provided as is by Unicode, Inc. No claims are
   7  * made as to fitness for any particular purpose. No warranties of any
   8  * kind are expressed or implied. The recipient agrees to determine
   9  * applicability of information provided. If this file has been
  10  * purchased on magnetic or optical media from Unicode, Inc., the
  11  * sole remedy for any claim will be exchange of defective media
  12  * within 90 days of receipt.
  13  *
  14  * Limitations on Rights to Redistribute This Code
  15  *
  16  * Unicode, Inc. hereby grants the right to freely use the information
  17  * supplied in this file in the creation of products supporting the
  18  * Unicode Standard, and to make copies of this file in any form
  19  * for internal or external distribution as long as this notice
  20  * remains attached.
  21  */
  22
  23 /* ---------------------------------------------------------------------
  24
  25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
  26     Author: Mark E. Davis, 1994.
  27     Rev History: Rick McGowan, fixes & updates May 2001.
  28     Sept 2001: fixed const & error conditions per
  29         mods suggested by S. Parent & A. Lillich.
  30     June 2002: Tim Dodd added detection and handling of incomplete
  31         source sequences, enhanced error detection, added casts
  32         to eliminate compiler warnings.
  33     July 2003: slight mods to back out aggressive FFFE detection.
  34     Jan 2004: updated switches in from-UTF8 conversions.
  35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
  36
  37     See the header file "ConvertUTF.h" for complete documentation.
  38
  39 ------------------------------------------------------------------------ */
  40
  41 #include "lib.h"
  42 #include "convertUTF.h"
  43
  44 #include "SDI_lib.h"
  45
  46 #include "debug.h"
  47
  48 /***********************************************************************/
  49
/* Each half of a UTF-16 surrogate pair carries 10 payload bits. */
static const int halfShift = 10;    /* used for shifting by 10 bits */

/* Added back when joining a surrogate pair / subtracted before splitting one. */
static const UTF32 halfBase = 0x0010000UL;
/* Selects the low 10 bits, i.e. the payload of the low surrogate. */
static const UTF32 halfMask = 0x3FFUL;

/* Inclusive bounds of the high (lead) and low (trail) surrogate ranges. */
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
  59
  60 /***********************************************************************/
  61
  62 #ifdef __AROS__
  63 AROS_LH5(ULONG, CodesetsConvertUTF32toUTF16,
  64     AROS_LHA(const UTF32 **, sourceStart, A0),
  65     AROS_LHA(const UTF32 *, sourceEnd, A1),
  66     AROS_LHA(UTF16 **, targetStart, A2),
  67     AROS_LHA(UTF16 *, targetEnd, A3),
  68     AROS_LHA(ULONG, flags, D0),
  69     struct CodesetsBase *, library, 5, Codesets
  70 )
  71 {
  72     AROS_LIBFUNC_INIT
  73 #else
  74 ULONG LIBFUNC
  75 CodesetsConvertUTF32toUTF16(REG(a0, const UTF32 ** sourceStart),
  76                             REG(a1, const UTF32 * sourceEnd),
  77                             REG(a2, UTF16 ** targetStart),
  78                             REG(a3, UTF16 * targetEnd),
  79                             REG(d0, ULONG flags))
  80 {
  81 #endif
  82   ULONG result = CSR_ConversionOK;
  83   const UTF32 *source = *sourceStart;
  84   UTF16 *target = *targetStart;
  85
  86   ENTER();
  87
  88   while(source < sourceEnd)
  89   {
  90     UTF32 ch;
  91
  92     if(target >= targetEnd)
  93     {
  94       result = CSR_TargetExhausted;
  95       break;
  96     }
  97
  98     ch = *source++;
  99     if(ch <= UNI_MAX_BMP)
 100     {
 101       /* Target is a character <= 0xFFFF */
 102       /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
 103       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 104       {
 105         if(flags == CSF_StrictConversion)
 106         {
 107           --source;   /* return to the illegal value itself */
 108           result = CSR_SourceIllegal;
 109           break;
 110         }
 111         else
 112         {
 113           *target++ = UNI_REPLACEMENT_CHAR;
 114         }
 115       }
 116       else
 117       {
 118         *target++ = (UTF16)ch; /* normal case */
 119       }
 120     }
 121     else if(ch > UNI_MAX_LEGAL_UTF32)
 122     {
 123       if(flags == CSF_StrictConversion)
 124       {
 125         result = CSR_SourceIllegal;
 126       }
 127       else
 128       {
 129         *target++ = UNI_REPLACEMENT_CHAR;
 130       }
 131     }
 132     else
 133     {
 134       /* target is a character in range 0xFFFF - 0x10FFFF. */
 135       if(target + 1 >= targetEnd)
 136       {
 137         --source;      /* Back up source pointer! */
 138         result = CSR_TargetExhausted;
 139         break;
 140       }
 141       ch -= halfBase;
 142       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 143       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 144     }
 145   }
 146
 147   *sourceStart = source;
 148   *targetStart = target;
 149
 150   RETURN(result);
 151   return result;
 152 #ifdef __AROS__
 153     AROS_LIBFUNC_EXIT
 154 #endif
 155 }
 156
 157 #ifndef __AROS__
 158 LIBSTUB(CodesetsConvertUTF32toUTF16, ULONG, REG(a0, const UTF32 ** sourceStart),
 159                                             REG(a1, const UTF32 * sourceEnd),
 160                                             REG(a2, UTF16 ** targetStart),
 161                                             REG(a3, UTF16 * targetEnd),
 162                                             REG(d0, ULONG flags))
 163 {
 164   #ifdef __MORPHOS__
 165   return CodesetsConvertUTF32toUTF16((const UTF32 **)REG_A0, (const UTF32 *)REG_A1, (UTF16 **)REG_A2, (UTF16 *)REG_A3, (ULONG)REG_D0);
 166   #else
 167   return CodesetsConvertUTF32toUTF16(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 168   #endif
 169 }
 170 #endif
 171
 172 /***********************************************************************/
 173
 174 #ifdef __AROS__
 175 AROS_LH5(ULONG, CodesetsConvertUTF16toUTF32,
 176     AROS_LHA(const  UTF16 **, sourceStart, A0),
 177     AROS_LHA(const UTF16 *, sourceEnd, A1),
 178     AROS_LHA(UTF32 **, targetStart, A2),
 179     AROS_LHA(UTF32 *, targetEnd, A3),
 180     AROS_LHA(ULONG, flags, D0),
 181     struct CodesetsBase *, library, 6, Codesets
 182 )
 183 {
 184     AROS_LIBFUNC_INIT
 185 #else
 186 ULONG LIBFUNC
 187 CodesetsConvertUTF16toUTF32(REG(a0, const UTF16 ** sourceStart),
 188                             REG(a1, const UTF16 * sourceEnd),
 189                             REG(a2, UTF32 ** targetStart),
 190                             REG(a3, UTF32 * targetEnd),
 191                             REG(d0, ULONG flags))
 192 {
 193 #endif
 194   ULONG result = CSR_ConversionOK;
 195   const UTF16 *source = *sourceStart;
 196   UTF32 *target = *targetStart;
 197   UTF32 ch=0, ch2=0;
 198
 199   ENTER();
 200
 201   while(source < sourceEnd)
 202   {
 203     const UTF16 *oldSource = source;    /*  In case we have to back up because of target overflow. */
 204
 205     ch = *source++;
 206     /* If we have a surrogate pair, convert to UTF32 first. */
 207     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 208     {
 209       /* If the 16 bits following the high surrogate are in the source buffer... */
 210       if(source < sourceEnd)
 211       {
 212         ch2 = *source;
 213
 214         /* If it's a low surrogate, convert to UTF32. */
 215         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 216         {
 217           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 218                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 219
 220           ++source;
 221         }
 222         else if(flags == CSF_StrictConversion)
 223         {
 224           /* it's an unpaired high surrogate */
 225           --source;   /* return to the illegal value itself */
 226           result = CSR_SourceIllegal;
 227
 228           break;
 229         }
 230       }
 231       else
 232       {
 233         /* We don't have the 16 bits following the high surrogate. */
 234         --source;       /* return to the high surrogate */
 235         result = CSR_SourceExhausted;
 236
 237         break;
 238       }
 239     }
 240     else if (flags == CSF_StrictConversion)
 241     {
 242       /* UTF-16 surrogate values are illegal in UTF-32 */
 243       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 244       {
 245         --source;       /* return to the illegal value itself */
 246         result = CSR_SourceIllegal;
 247
 248         break;
 249       }
 250     }
 251
 252     if(target >= targetEnd)
 253     {
 254       source = oldSource; /* Back up source pointer! */
 255       result = CSR_TargetExhausted;
 256
 257       break;
 258     }
 259     *target++ = ch;
 260   }
 261
 262   *sourceStart = source;
 263   *targetStart = target;
 264
 265   #if defined(DEBUG)
 266   if(result == CSR_SourceIllegal)
 267   {
 268     E(DBF_UTF, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
 269   }
 270   #endif
 271
 272   RETURN(result);
 273   return result;
 274 #ifdef __AROS__
 275     AROS_LIBFUNC_EXIT
 276 #endif
 277 }
 278
 279 #ifndef __AROS__
 280 LIBSTUB(CodesetsConvertUTF16toUTF32, ULONG, REG(a0, const UTF16 ** sourceStart),
 281                                             REG(a1, const UTF16 * sourceEnd),
 282                                             REG(a2, UTF32 ** targetStart),
 283                                             REG(a3, UTF32 * targetEnd),
 284                                             REG(d0, ULONG flags))
 285 {
 286   #ifdef __MORPHOS__
 287   return CodesetsConvertUTF16toUTF32((const UTF16 **)REG_A0, (const UTF16 *)REG_A1, (UTF32 **)REG_A2, (UTF32 *)REG_A3, (ULONG)REG_D0);
 288   #else
 289   return CodesetsConvertUTF16toUTF32(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 290   #endif
 291 }
 292 #endif
 293
 294 /***********************************************************************/
 295
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes
 * (i.e. 5- or 6-byte sequences). The table is left as-is for anyone who
 * may want to do such conversion, which was allowed in earlier algorithms.
 * Continuation bytes (0x80-0xBF) map to 0 here; rejecting them as lead
 * bytes is left to CodesetsIsLegalUTF8().
 */
const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence. The decoders add up the raw bytes (shifting by 6
 * between them), so each entry is the accumulated value of the lead-byte
 * and continuation-byte marker bits for that sequence length, e.g.
 * 0x3080 == (0xC0 << 6) + 0x80 for one trailing byte.
 */
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 * The table is indexed by the total number of bytes; index 0 is unused.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 332
 333 /***********************************************************************/
 334
 335 /* The interface converts a whole buffer to avoid function-call overhead.
 336  * Constants have been gathered. Loops & conditionals have been removed as
 337  * much as possible for efficiency, in favor of drop-through switches.
 338  * (See "Note A" at the bottom of the file for equivalent code.)
 339  * If your compiler supports it, the "isLegalUTF8" call can be turned
 340  * into an inline function.
 341  */
 342
 343 /***********************************************************************/
 344
 345 #ifdef __AROS__
 346 AROS_LH5(ULONG, CodesetsConvertUTF16toUTF8,
 347     AROS_LHA(const UTF16 **, sourceStart, A0),
 348     AROS_LHA(const UTF16 *, sourceEnd, A1),
 349     AROS_LHA(UTF8 **, targetStart, A2),
 350     AROS_LHA(UTF8 *, targetEnd, A3),
 351     AROS_LHA(ULONG, flags, D0),
 352     struct CodesetsBase *, library, 7, Codesets
 353 )
 354 {
 355     AROS_LIBFUNC_INIT
 356 #else
 357 ULONG LIBFUNC
 358 CodesetsConvertUTF16toUTF8(REG(a0, const UTF16 ** sourceStart),
 359                            REG(a1, const UTF16 * sourceEnd),
 360                            REG(a2, UTF8 ** targetStart),
 361                            REG(a3, UTF8 * targetEnd),
 362                            REG(d0, ULONG flags))
 363 {
 364 #endif
 365   ULONG result = CSR_ConversionOK;
 366   const UTF16 *source = *sourceStart;
 367   UTF8 *target = *targetStart;
 368
 369   ENTER();
 370
 371   while(source < sourceEnd)
 372   {
 373     UTF32 ch;
 374     unsigned short bytesToWrite = 0;
 375     const UTF32 byteMask = 0xBF;
 376     const UTF32 byteMark = 0x80;
 377     const UTF16 *oldSource = source;    /* In case we have to back up because of target overflow. */
 378
 379     ch = *source++;
 380
 381     /* If we have a surrogate pair, convert to UTF32 first. */
 382     if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
 383     {
 384       /* If the 16 bits following the high surrogate are in the source buffer... */
 385       if(source < sourceEnd)
 386       {
 387         UTF32 ch2 = *source;
 388
 389         /* If it's a low surrogate, convert to UTF32. */
 390         if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
 391         {
 392           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
 393                 + (ch2 - UNI_SUR_LOW_START) + halfBase;
 394
 395           ++source;
 396         }
 397         else if(flags == CSF_StrictConversion)
 398         {
 399           /* it's an unpaired high surrogate */
 400           --source;   /* return to the illegal value itself */
 401           result = CSR_SourceIllegal;
 402           break;
 403         }
 404       }
 405       else
 406       {
 407         /* We don't have the 16 bits following the high surrogate. */
 408         --source;       /* return to the high surrogate */
 409         result = CSR_SourceExhausted;
 410
 411         break;
 412       }
 413     }
 414     else if(flags == CSF_StrictConversion)
 415     {
 416       /* UTF-16 surrogate values are illegal in UTF-32 */
 417       if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
 418       {
 419         --source;       /* return to the illegal value itself */
 420         result = CSR_SourceIllegal;
 421
 422         break;
 423       }
 424     }
 425     /* Figure out how many bytes the result will require */
 426     if(ch < (UTF32) 0x80)
 427     {
 428       bytesToWrite = 1;
 429     }
 430     else if (ch < (UTF32) 0x800)
 431     {
 432       bytesToWrite = 2;
 433     }
 434     else if (ch < (UTF32) 0x10000)
 435     {
 436       bytesToWrite = 3;
 437     }
 438     else if (ch < (UTF32) 0x110000)
 439     {
 440       bytesToWrite = 4;
 441     }
 442     else
 443     {
 444       bytesToWrite = 3;
 445       ch = UNI_REPLACEMENT_CHAR;
 446     }
 447
 448     target += bytesToWrite;
 449     if(target > targetEnd)
 450     {
 451       source = oldSource; /* Back up source pointer! */
 452       target -= bytesToWrite;
 453       result = CSR_TargetExhausted;
 454
 455       break;
 456     }
 457     switch(bytesToWrite)
 458     {
 459       /* note: everything falls through. */
 460       case 4:
 461         *--target = (UTF8) ((ch | byteMark) & byteMask);
 462         ch >>= 6;
 463
 464       case 3:
 465         *--target = (UTF8) ((ch | byteMark) & byteMask);
 466         ch >>= 6;
 467
 468       case 2:
 469         *--target = (UTF8) ((ch | byteMark) & byteMask);
 470         ch >>= 6;
 471
 472       case 1:
 473         *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 474     }
 475
 476     target += bytesToWrite;
 477   }
 478
 479   *sourceStart = source;
 480   *targetStart = target;
 481
 482   RETURN(result);
 483   return result;
 484 #ifdef __AROS__
 485     AROS_LIBFUNC_EXIT
 486 #endif
 487 }
 488
 489 #ifndef __AROS__
 490 LIBSTUB(CodesetsConvertUTF16toUTF8, ULONG, REG(a0, const UTF16 ** sourceStart),
 491                                            REG(a1, const UTF16 * sourceEnd),
 492                                            REG(a2, UTF8 ** targetStart),
 493                                            REG(a3, UTF8 * targetEnd),
 494                                            REG(d0, ULONG flags))
 495 {
 496   #ifdef __MORPHOS__
 497   return CodesetsConvertUTF16toUTF8((const UTF16 **)REG_A0, (const UTF16 *)REG_A1, (UTF8 **)REG_A2, (UTF8 *)REG_A3, (ULONG)REG_D0);
 498   #else
 499   return CodesetsConvertUTF16toUTF8(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 500   #endif
 501 }
 502 #endif
 503
 504 /***********************************************************************/
 505
 506 /*
 507  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 508  * This must be called with the length pre-determined by the first byte.
 509  * If not calling this from ConvertUTF8to*, then the length can be set by:
 510  *  length = trailingBytesForUTF8[*source]+1;
 511  * and the sequence is illegal right away if there aren't that many bytes
 512  * available.
 513  * If presented with a length > 4, this returns FALSE.  The Unicode
 514  * definition of UTF-8 goes up to 4-byte sequences.
 515  */
 516
 517 #ifdef __AROS__
 518 AROS_LH2(BOOL, CodesetsIsLegalUTF8,
 519     AROS_LHA(const UTF8 *, source, A0),
 520     AROS_LHA(ULONG, length, D0),
 521     struct CodesetsBase *, library, 8, Codesets
 522 )
 523 {
 524     AROS_LIBFUNC_INIT
 525 #else
 526 BOOL LIBFUNC
 527 CodesetsIsLegalUTF8(REG(a0, const UTF8 * source),
 528                                 REG(d0, ULONG length))
 529 {
 530 #endif
 531   UTF8 a;
 532   const UTF8 *srcptr = source + length;
 533
 534   ENTER();
 535
 536   switch(length)
 537   {
 538     default:
 539       RETURN(FALSE);
 540       return FALSE;
 541
 542     /* Everything else falls through when "TRUE"... */
 543     case 4:
 544       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 545       {
 546         RETURN(FALSE);
 547         return FALSE;
 548       }
 549
 550     case 3:
 551       if((a = (*--srcptr)) < 0x80 || a > 0xBF)
 552       {
 553         RETURN(FALSE);
 554         return FALSE;
 555       }
 556
 557     case 2:
 558       if((a = (*--srcptr)) > 0xBF)
 559       {
 560         RETURN(FALSE);
 561         return FALSE;
 562       }
 563
 564       switch (*source)
 565       {
 566         /* no fall-through in this inner switch */
 567         case 0xE0:
 568           if(a < 0xA0)
 569           {
 570             RETURN(FALSE);
 571             return FALSE;
 572           }
 573         break;
 574
 575         case 0xED:
 576           if(a > 0x9F)
 577           {
 578             RETURN(FALSE);
 579             return FALSE;
 580           }
 581         break;
 582
 583         case 0xF0:
 584           if(a < 0x90)
 585           {
 586             RETURN(FALSE);
 587             return FALSE;
 588           }
 589           break;
 590
 591         case 0xF4:
 592           if(a > 0x8F)
 593           {
 594             RETURN(FALSE);
 595             return FALSE;
 596           }
 597         break;
 598
 599         default:
 600           if(a < 0x80)
 601           {
 602             RETURN(FALSE);
 603             return FALSE;
 604           }
 605       }
 606
 607     case 1:
 608       if(*source >= 0x80 && *source < 0xC2)
 609       {
 610         RETURN(FALSE);
 611         return FALSE;
 612       }
 613   }
 614
 615   if(*source > 0xF4)
 616   {
 617     RETURN(FALSE);
 618     return FALSE;
 619   }
 620
 621   RETURN(TRUE);
 622   return TRUE;
 623 #ifdef __AROS__
 624     AROS_LIBFUNC_EXIT
 625 #endif
 626 }
 627
 628 #ifndef __AROS__
 629 LIBSTUB(CodesetsIsLegalUTF8, BOOL, REG(a0, const UTF8 * source),
 630                                                 REG(d0, ULONG length))
 631 {
 632   #ifdef __MORPHOS__
 633   return CodesetsIsLegalUTF8((const UTF8 *)REG_A0,(ULONG)REG_D0);
 634   #else
 635   return CodesetsIsLegalUTF8(source, length);
 636   #endif
 637 }
 638 #endif
 639
 640 /***********************************************************************/
 641
 642 /*
 643  * Exported function to return whether a UTF-8 sequence is legal or not.
 644  * This is not used here; it's just exported.
 645  */
 646
 647 #ifdef __AROS__
 648 AROS_LH2(BOOL, CodesetsIsLegalUTF8Sequence,
 649     AROS_LHA(const UTF8 *, source, A0),
 650     AROS_LHA(const UTF8 *, sourceEnd, D1),
 651     struct CodesetsBase *, library, 9, Codesets
 652 )
 653 {
 654     AROS_LIBFUNC_INIT
 655 #else
 656 BOOL LIBFUNC
 657 CodesetsIsLegalUTF8Sequence(REG(a0, const UTF8 * source),
 658                             REG(a1, const UTF8 * sourceEnd))
 659 {
 660 #endif
 661   int length = trailingBytesForUTF8[*source] + 1;
 662   BOOL res = FALSE;
 663
 664   ENTER();
 665
 666   if(source + length > sourceEnd)
 667   {
 668     RETURN(FALSE);
 669     return FALSE;
 670   }
 671
 672   res = CodesetsIsLegalUTF8(source, length);
 673
 674   RETURN(res);
 675   return res;
 676 #ifdef __AROS__
 677     AROS_LIBFUNC_EXIT
 678 #endif
 679 }
 680
 681 #ifndef __AROS__
 682 LIBSTUB(CodesetsIsLegalUTF8Sequence, BOOL, REG(a0, const UTF8 * source),
 683                                             REG(a1, const UTF8 * sourceEnd))
 684 {
 685   #ifdef __MORPHOS__
 686   return CodesetsIsLegalUTF8Sequence((const UTF8 *)REG_A0,(const UTF8 *)REG_A1);
 687   #else
 688   return CodesetsIsLegalUTF8Sequence(source, sourceEnd);
 689   #endif
 690 }
 691 #endif
 692
 693 /***********************************************************************/
 694
 695 #ifdef __AROS__
 696 AROS_LH5(ULONG, CodesetsConvertUTF8toUTF16,
 697     AROS_LHA(const UTF8 **, sourceStart, A0),
 698     AROS_LHA(const UTF8 *, sourceEnd, A1),
 699     AROS_LHA(UTF16 **, targetStart, A2),
 700     AROS_LHA(UTF16 *, targetEnd, A3),
 701     AROS_LHA(ULONG, flags, D0),
 702     struct CodesetsBase *, library, 10, Codesets
 703 )
 704 {
 705     AROS_LIBFUNC_INIT
 706 #else
 707 ULONG LIBFUNC
 708 CodesetsConvertUTF8toUTF16(REG(a0, const UTF8 ** sourceStart),
 709                            REG(a1, const UTF8 * sourceEnd),
 710                            REG(a2, UTF16 ** targetStart),
 711                            REG(a3, UTF16 * targetEnd),
 712                            REG(d0, ULONG flags))
 713 {
 714 #endif
 715   ULONG result = CSR_ConversionOK;
 716   const UTF8 *source = *sourceStart;
 717   UTF16 *target = *targetStart;
 718
 719   ENTER();
 720
 721   while(source < sourceEnd)
 722   {
 723     UTF32 ch = 0;
 724     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
 725
 726     if(source + extraBytesToRead >= sourceEnd)
 727     {
 728       result = CSR_SourceExhausted;
 729       break;
 730     }
 731
 732     /* Do this check whether lenient or strict */
 733     if(!CodesetsIsLegalUTF8 (source, extraBytesToRead + 1))
 734     {
 735       result = CSR_SourceIllegal;
 736       break;
 737     }
 738
 739     /*
 740      * The cases all fall through. See "Note A" below.
 741      */
 742     switch (extraBytesToRead)
 743     {
 744       case 5:
 745         ch += *source++;
 746         ch <<= 6;       /* remember, illegal UTF-8 */
 747
 748       case 4:
 749         ch += *source++;
 750         ch <<= 6;       /* remember, illegal UTF-8 */
 751
 752       case 3:
 753         ch += *source++;
 754         ch <<= 6;
 755
 756       case 2:
 757         ch += *source++;
 758         ch <<= 6;
 759
 760       case 1:
 761         ch += *source++;
 762         ch <<= 6;
 763
 764       case 0:
 765         ch += *source++;
 766     }
 767
 768     ch -= offsetsFromUTF8[extraBytesToRead];
 769
 770     if(target >= targetEnd)
 771     {
 772       source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 773       result = CSR_TargetExhausted;
 774
 775       break;
 776     }
 777
 778     if(ch <= UNI_MAX_BMP)
 779     {
 780       /* Target is a character <= 0xFFFF */
 781       /* UTF-16 surrogate values are illegal in UTF-32 */
 782       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 783       {
 784         if(flags == CSF_StrictConversion)
 785         {
 786           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
 787           result = CSR_SourceIllegal;
 788
 789           break;
 790         }
 791         else
 792         {
 793           *target++ = UNI_REPLACEMENT_CHAR;
 794         }
 795       }
 796       else
 797       {
 798         *target++ = (UTF16) ch; /* normal case */
 799       }
 800     }
 801     else if(ch > UNI_MAX_UTF16)
 802     {
 803       if(flags == CSF_StrictConversion)
 804       {
 805         result = CSR_SourceIllegal;
 806         source -= (extraBytesToRead + 1);   /* return to the start */
 807
 808         break;          /* Bail out; shouldn't continue */
 809       }
 810       else
 811       {
 812         *target++ = UNI_REPLACEMENT_CHAR;
 813       }
 814     }
 815     else
 816     {
 817       /* target is a character in range 0xFFFF - 0x10FFFF. */
 818       if(target + 1 >= targetEnd)
 819       {
 820         source -= (extraBytesToRead + 1);   /* Back up source pointer! */
 821         result = CSR_TargetExhausted;
 822
 823         break;
 824       }
 825
 826       ch -= halfBase;
 827       *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
 828       *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
 829     }
 830   }
 831
 832   *sourceStart = source;
 833   *targetStart = target;
 834
 835   RETURN(result);
 836   return result;
 837 #ifdef __AROS__
 838     AROS_LIBFUNC_EXIT
 839 #endif
 840 }
 841
 842 #ifndef __AROS__
 843 LIBSTUB(CodesetsConvertUTF8toUTF16, ULONG, REG(a0, const UTF8 ** sourceStart),
 844                                            REG(a1, const UTF8 * sourceEnd),
 845                                            REG(a2, UTF16 ** targetStart),
 846                                            REG(a3, UTF16 * targetEnd),
 847                                            REG(d0, ULONG flags))
 848 {
 849   #ifdef __MORPHOS__
 850   return CodesetsConvertUTF8toUTF16((const UTF8 **)REG_A0, (const UTF8 *)REG_A1, (UTF16 **)REG_A2, (UTF16 *)REG_A3, (ULONG)REG_D0);
 851   #else
 852   return CodesetsConvertUTF8toUTF16(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 853   #endif
 854 }
 855 #endif
 856
 857 /***********************************************************************/
 858
 859 #ifdef __AROS__
 860 AROS_LH5(ULONG, CodesetsConvertUTF32toUTF8,
 861     AROS_LHA(const UTF32 **, sourceStart, A0),
 862     AROS_LHA(const UTF32 *, sourceEnd, A1),
 863     AROS_LHA(UTF8 **, targetStart, A2),
 864     AROS_LHA(UTF8 *, targetEnd, A3),
 865     AROS_LHA(ULONG, flags, D0),
 866     struct CodesetsBase *, library, 11, Codesets
 867 )
 868 {
 869     AROS_LIBFUNC_INIT
 870 #else
 871 ULONG LIBFUNC
 872 CodesetsConvertUTF32toUTF8(REG(a0, const UTF32 ** sourceStart),
 873                            REG(a1, const UTF32 * sourceEnd),
 874                            REG(a2, UTF8 ** targetStart),
 875                            REG(a3, UTF8 * targetEnd),
 876                            REG(d0, ULONG flags))
 877 {
 878 #endif
 879   ULONG result = CSR_ConversionOK;
 880   const UTF32 *source = *sourceStart;
 881   UTF8 *target = *targetStart;
 882
 883   ENTER();
 884
 885   while(source < sourceEnd)
 886   {
 887     UTF32 ch;
 888     unsigned short bytesToWrite = 0;
 889     const UTF32 byteMask = 0xBF;
 890     const UTF32 byteMark = 0x80;
 891
 892     ch = *source++;
 893
 894     if(flags == CSF_StrictConversion)
 895     {
 896       /* UTF-16 surrogate values are illegal in UTF-32 */
 897       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
 898       {
 899         --source;       /* return to the illegal value itself */
 900         result = CSR_SourceIllegal;
 901
 902         break;
 903       }
 904     }
 905
 906     /*
 907      * Figure out how many bytes the result will require. Turn any
 908      * illegally large UTF32 things (> Plane 17) into replacement chars.
 909     */
 910     if(ch < (UTF32) 0x80)
 911     {
 912       bytesToWrite = 1;
 913     }
 914     else if(ch < (UTF32) 0x800)
 915     {
 916       bytesToWrite = 2;
 917     }
 918     else if(ch < (UTF32) 0x10000)
 919     {
 920       bytesToWrite = 3;
 921     }
 922     else if(ch <= UNI_MAX_LEGAL_UTF32)
 923     {
 924       bytesToWrite = 4;
 925     }
 926     else
 927     {
 928       bytesToWrite = 3;
 929       ch = UNI_REPLACEMENT_CHAR;
 930       result = CSR_SourceIllegal;
 931     }
 932
 933     target += bytesToWrite;
 934     if(target > targetEnd)
 935     {
 936       --source;           /* Back up source pointer! */
 937       target -= bytesToWrite;
 938       result = CSR_TargetExhausted;
 939
 940       break;
 941     }
 942     switch(bytesToWrite)
 943     {
 944       /* note: everything falls through. */
 945       case 4:
 946         *--target = (UTF8) ((ch | byteMark) & byteMask);
 947         ch >>= 6;
 948
 949       case 3:
 950         *--target = (UTF8) ((ch | byteMark) & byteMask);
 951         ch >>= 6;
 952
 953       case 2:
 954         *--target = (UTF8) ((ch | byteMark) & byteMask);
 955         ch >>= 6;
 956
 957       case 1:
 958         *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
 959     }
 960
 961     target += bytesToWrite;
 962   }
 963
 964   *sourceStart = source;
 965   *targetStart = target;
 966
 967   RETURN(result);
 968   return result;
 969 #ifdef __AROS__
 970     AROS_LIBFUNC_EXIT
 971 #endif
 972 }
 973
 974 #ifndef __AROS__
 975 LIBSTUB(CodesetsConvertUTF32toUTF8, ULONG, REG(a0, const UTF32 ** sourceStart),
 976                                            REG(a1, const UTF32 * sourceEnd),
 977                                            REG(a2, UTF8 ** targetStart),
 978                                            REG(a3, UTF8 * targetEnd),
 979                                            REG(d0, ULONG flags))
 980 {
 981   #ifdef __MORPHOS__
 982   return CodesetsConvertUTF32toUTF8((const UTF32 **)REG_A0, (const UTF32 *)REG_A1, (UTF8 **)REG_A2, (UTF8 *)REG_A3, (ULONG)REG_D0);
 983   #else
 984   return CodesetsConvertUTF32toUTF8(sourceStart, sourceEnd, targetStart, targetEnd, flags);
 985   #endif
 986 }
 987 #endif
 988
 989 /***********************************************************************/
 990
 991 #ifdef __AROS__
 992 AROS_LH5(ULONG, CodesetsConvertUTF8toUTF32,
 993     AROS_LHA(const UTF8 **, sourceStart, A0),
 994     AROS_LHA(const UTF8 *, sourceEnd, A1),
 995     AROS_LHA(UTF32 **, targetStart, A2),
 996     AROS_LHA(UTF32 *, targetEnd, A3),
 997     AROS_LHA(ULONG, flags, D0),
 998     struct CodesetsBase *, library, 12, Codesets
 999 )
1000 {
1001     AROS_LIBFUNC_INIT
1002 #else
1003 ULONG LIBFUNC
1004 CodesetsConvertUTF8toUTF32(REG(a0, const UTF8 ** sourceStart),
1005                            REG(a1, const UTF8 * sourceEnd),
1006                            REG(a2, UTF32 ** targetStart),
1007                            REG(a3, UTF32 * targetEnd),
1008                            REG(d0, ULONG flags))
1009 {
1010 #endif
1011   ULONG result = CSR_ConversionOK;
1012   const UTF8 *source = *sourceStart;
1013   UTF32 *target = *targetStart;
1014
1015   ENTER();
1016
1017   while(source < sourceEnd)
1018   {
1019     UTF32 ch = 0;
1020     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
1021
1022     if(source + extraBytesToRead >= sourceEnd)
1023     {
1024       result = CSR_SourceExhausted;
1025       break;
1026     }
1027
1028     /* Do this check whether lenient or strict */
1029     if(!CodesetsIsLegalUTF8(source, extraBytesToRead + 1))
1030     {
1031       result = CSR_SourceIllegal;
1032       break;
1033     }
1034
1035     /*
1036      * The cases all fall through. See "Note A" below.
1037     */
1038     switch (extraBytesToRead)
1039     {
1040       case 5:
1041         ch += *source++;
1042         ch <<= 6;
1043
1044       case 4:
1045         ch += *source++;
1046         ch <<= 6;
1047
1048       case 3:
1049         ch += *source++;
1050         ch <<= 6;
1051
1052       case 2:
1053         ch += *source++;
1054         ch <<= 6;
1055
1056       case 1:
1057         ch += *source++;
1058         ch <<= 6;
1059
1060       case 0:
1061         ch += *source++;
1062     }
1063
1064     ch -= offsetsFromUTF8[extraBytesToRead];
1065
1066     if(target >= targetEnd)
1067     {
1068       source -= (extraBytesToRead + 1);   /* Back up the source pointer! */
1069       result = CSR_TargetExhausted;
1070
1071       break;
1072     }
1073
1074     if(ch <= UNI_MAX_LEGAL_UTF32)
1075     {
1076       /*
1077        * UTF-16 surrogate values are illegal in UTF-32, and anything
1078        * over Plane 17 (> 0x10FFFF) is illegal.
1079       */
1080       if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
1081       {
1082         if(flags == CSF_StrictConversion)
1083         {
1084           source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
1085           result = CSR_SourceIllegal;
1086
1087           break;
1088         }
1089         else
1090         {
1091           *target++ = UNI_REPLACEMENT_CHAR;
1092         }
1093       }
1094       else
1095       {
1096         *target++ = ch;
1097       }
1098     }
1099     else
1100     {
1101       /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
1102       result = CSR_SourceIllegal;
1103       *target++ = UNI_REPLACEMENT_CHAR;
1104     }
1105   }
1106
1107   *sourceStart = source;
1108   *targetStart = target;
1109
1110   RETURN(result);
1111   return result;
1112 #ifdef __AROS__
1113     AROS_LIBFUNC_EXIT
1114 #endif
1115 }
1116
1117 #ifndef __AROS__
1118 LIBSTUB(CodesetsConvertUTF8toUTF32, ULONG, REG(a0, const UTF8 ** sourceStart),
1119                                            REG(a1, const UTF8 * sourceEnd),
1120                                            REG(a2, UTF32 ** targetStart),
1121                                            REG(a3, UTF32 * targetEnd),
1122                                            REG(d0, ULONG flags))
1123 {
1124   #ifdef __MORPHOS__
1125   return CodesetsConvertUTF8toUTF32((const UTF8 **)REG_A0, (const UTF8 *)REG_A1, (UTF32 **)REG_A2, (UTF32 *)REG_A3, (ULONG)REG_D0);
1126   #else
1127   return CodesetsConvertUTF8toUTF32(sourceStart, sourceEnd, targetStart, targetEnd, flags);
1128   #endif
1129 }
1130 #endif
1131
1132 /***********************************************************************
1133
1134     Note A.
1135     The fall-through switches in UTF-8 reading code save a
1136     temp variable, some decrements & conditionals.  The switches
1137     are equivalent to the following loop:
1138     {
1139         int tmpBytesToRead = extraBytesToRead+1;
1140         do {
1141         ch += *source++;
1142         --tmpBytesToRead;
1143         if (tmpBytesToRead) ch <<= 6;
1144         } while (tmpBytesToRead > 0);
1145     }
1146     In UTF-8 writing code, the switches on "bytesToWrite" are
1147     similarly unrolled loops.
1148
1149 ***********************************************************************/