src/fl_utf.c

   1 /*
   2  * "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $"
   3  *
   4  * This is the utf.c file from fltk2 adapted for use in my fltk1.1 port
   5  */
   6 /* Copyright 2006-2011 by Bill Spitzak and others.
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Library General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Library General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Library General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  21  * USA.
  22  *
  23  * Please report all bugs and problems on the following page:
  24  *
  25  *     http://www.fltk.org/str.php
  26  */
  27
  28 /* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */
  29
  30 #include <FL/fl_utf8.h>
  31 #include <string.h>
  32 #include <stdlib.h>
  33
  34 /** \addtogroup fl_unicode
  35     @{
  36 */
  37
  38
  39 #if 0
  40   /**
  41    \defgroup fl_unichar Unicode Character Functions
  42    Global Functions Handling Single Unicode Characters
  43    @{ */
  44
  45   /**
  46    Converts a Unicode character into a utf-8 sequence.
  47    \param[in] uc Unicode character
  48    \param[out] text utf-8 sequence will be written here; if this pointer is
  49    \c NULL, only the length of the utf-8 sequence is calculated
  50    \return length of the sequence in bytes
  51    */
  52   /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
  53
  54   /** @} */
  55
  56   /**
  57    \defgroup fl_utf8 Unicode String Functions
  58    Global Functions Handling Unicode Text
  59    @{ */
  60
  61   /**
  62    Calculate the size of a utf-8 sequence for a Unicode character.
  63    \param[in] uc Unicode character
  64    \return length of the sequence in bytes
  65    */
  66   /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
  67
  68   /** @} */
  69 #endif /* 0 */
  70
  71 /*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
  72    they are instead turned into the Unicode REPLACEMENT CHARACTER, of
  73    value 0xfffd.
  74    If this is on fl_utf8decode() will correctly map most (perhaps all)
  75    human-readable text that is in ISO-8859-1. This may allow you
  76    to completely ignore character sets in your code because virtually
  77    everything is either ISO-8859-1 or UTF-8.
  78 */
  79 #define ERRORS_TO_ISO8859_1 1
  80
  81 /*!Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
  82    Unicode index for Microsoft's CP1252 character set. You should
  83    also set ERRORS_TO_ISO8859_1. With this a huge amount of more
  84    available text (such as all web pages) are correctly converted
  85    to Unicode.
  86 */
  87 #define ERRORS_TO_CP1252 1
  88
  89 /*!A number of Unicode code points are in fact illegal and should not
  90    be produced by a UTF-8 converter. Turning this on will replace the
  91    bytes in those encodings with errors. If you do this then converting
  92    arbitrary 16-bit data to UTF-8 and then back is not an identity,
  93    which will probably break a lot of software.
  94 */
  95 #define STRICT_RFC3629 0
  96
  97 #if ERRORS_TO_CP1252
  98 /* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
  99  * to Unicode:
 100  */
 101 static unsigned short cp1252[32] = {
 102   0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
 103   0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
 104   0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
 105   0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
 106 };
 107 #endif
 108
 109 /*! Decode a single UTF-8 encoded character starting at \e p. The
 110     resulting Unicode value (in the range 0-0x10ffff) is returned,
 111     and \e len is set to the number of bytes in the UTF-8 encoding
 112     (adding \e len to \e p will point at the next character).
 113
 114     If \p p points at an illegal UTF-8 encoding, including one that
 115     would go past \e end, or where a code uses more bytes than
 116     necessary, then *(unsigned char*)p is translated as though it is
 117     in the Microsoft CP1252 character set and \e len is set to 1.
 118     Treating errors this way allows this to decode almost any
 119     ISO-8859-1 or CP1252 text that has been mistakenly placed where
 120     UTF-8 is expected, and has proven very useful.
 121
 122     If you want errors to be converted to error characters (as the
 123     standards recommend), adding a test to see if the length is
 124     unexpectedly 1 will work:
 125
 126     \code
 127     if (*p & 0x80) {              // what should be a multibyte encoding
 128       code = fl_utf8decode(p,end,&len);
 129       if (len<2) code = 0xFFFD;   // Turn errors into REPLACEMENT CHARACTER
 130     } else {                      // handle the 1-byte utf8 encoding:
 131       code = *p;
 132       len = 1;
 133     }
 134     \endcode
 135
 136     Direct testing for the 1-byte case (as shown above) will also
 137     speed up the scanning of strings where the majority of characters
 138     are ASCII.
 139 */
 140 unsigned fl_utf8decode(const char* p, const char* end, int* len)
 141 {
 142   unsigned char c = *(unsigned char*)p;
 143   if (c < 0x80) {
 144     if (len) *len = 1;
 145     return c;
 146 #if ERRORS_TO_CP1252
 147   } else if (c < 0xa0) {
 148     if (len) *len = 1;
 149     return cp1252[c-0x80];
 150 #endif
 151   } else if (c < 0xc2) {
 152     goto FAIL;
 153   }
 154   if ( (end && p+1 >= end) || (p[1]&0xc0) != 0x80) goto FAIL;
 155   if (c < 0xe0) {
 156     if (len) *len = 2;
 157     return
 158       ((p[0] & 0x1f) << 6) +
 159       ((p[1] & 0x3f));
 160   } else if (c == 0xe0) {
 161     if (((unsigned char*)p)[1] < 0xa0) goto FAIL;
 162     goto UTF8_3;
 163 #if STRICT_RFC3629
 164   } else if (c == 0xed) {
 165     /* RFC 3629 says surrogate chars are illegal. */
 166     if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
 167     goto UTF8_3;
 168   } else if (c == 0xef) {
 169     /* 0xfffe and 0xffff are also illegal characters */
 170     if (((unsigned char*)p)[1]==0xbf &&
 171         ((unsigned char*)p)[2]>=0xbe) goto FAIL;
 172     goto UTF8_3;
 173 #endif
 174   } else if (c < 0xf0) {
 175   UTF8_3:
 176     if ( (end && p+2 >= end) || (p[2]&0xc0) != 0x80) goto FAIL;
 177     if (len) *len = 3;
 178     return
 179       ((p[0] & 0x0f) << 12) +
 180       ((p[1] & 0x3f) << 6) +
 181       ((p[2] & 0x3f));
 182   } else if (c == 0xf0) {
 183     if (((unsigned char*)p)[1] < 0x90) goto FAIL;
 184     goto UTF8_4;
 185   } else if (c < 0xf4) {
 186   UTF8_4:
 187     if ( (end && p+3 >= end) || (p[2]&0xc0) != 0x80 || (p[3]&0xc0) != 0x80) goto FAIL;
 188     if (len) *len = 4;
 189 #if STRICT_RFC3629
 190     /* RFC 3629 says all codes ending in fffe or ffff are illegal: */
 191     if ((p[1]&0xf)==0xf &&
 192         ((unsigned char*)p)[2] == 0xbf &&
 193         ((unsigned char*)p)[3] >= 0xbe) goto FAIL;
 194 #endif
 195     return
 196       ((p[0] & 0x07) << 18) +
 197       ((p[1] & 0x3f) << 12) +
 198       ((p[2] & 0x3f) << 6) +
 199       ((p[3] & 0x3f));
 200   } else if (c == 0xf4) {
 201     if (((unsigned char*)p)[1] > 0x8f) goto FAIL; /* after 0x10ffff */
 202     goto UTF8_4;
 203   } else {
 204   FAIL:
 205     if (len) *len = 1;
 206 #if ERRORS_TO_ISO8859_1
 207     return c;
 208 #else
 209     return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
 210 #endif
 211   }
 212 }
 213
 214 /*! Move \p p forward until it points to the start of a UTF-8
 215   character. If it already points at the start of one then it
 216   is returned unchanged. Any UTF-8 errors are treated as though each
 217   byte of the error is an individual character.
 218
 219   \e start is the start of the string and is used to limit the
 220   backwards search for the start of a utf8 character.
 221
 222   \e end is the end of the string and is assumed to be a break
 223   between characters. It is assumed to be greater than p.
 224
 225   This function is for moving a pointer that was jumped to the
 226   middle of a string, such as when doing a binary search for
 227   a position. You should use either this or fl_utf8back() depending
 228   on which direction your algorithm can handle the pointer
 229   moving. Do not use this to scan strings, use fl_utf8decode()
 230   instead.
 231 */
 232 const char* fl_utf8fwd(const char* p, const char* start, const char* end)
 233 {
 234   const char* a;
 235   int len;
 236   /* if we are not pointing at a continuation character, we are done: */
 237   if ((*p&0xc0) != 0x80) return p;
 238   /* search backwards for a 0xc0 starting the character: */
 239   for (a = p-1; ; --a) {
 240     if (a < start) return p;
 241     if (!(a[0]&0x80)) return p;
 242     if ((a[0]&0x40)) break;
 243   }
 244   fl_utf8decode(a,end,&len);
 245   a += len;
 246   if (a > p) return a;
 247   return p;
 248 }
 249
 250 /*! Move \p p backward until it points to the start of a UTF-8
 251   character. If it already points at the start of one then it
 252   is returned unchanged. Any UTF-8 errors are treated as though each
 253   byte of the error is an individual character.
 254
 255   \e start is the start of the string and is used to limit the
 256   backwards search for the start of a UTF-8 character.
 257
 258   \e end is the end of the string and is assumed to be a break
 259   between characters. It is assumed to be greater than p.
 260
 261   If you wish to decrement a UTF-8 pointer, pass p-1 to this.
 262 */
 263 const char* fl_utf8back(const char* p, const char* start, const char* end)
 264 {
 265   const char* a;
 266   int len;
 267   /* if we are not pointing at a continuation character, we are done: */
 268   if ((*p&0xc0) != 0x80) return p;
 269   /* search backwards for a 0xc0 starting the character: */
 270   for (a = p-1; ; --a) {
 271     if (a < start) return p;
 272     if (!(a[0]&0x80)) return p;
 273     if ((a[0]&0x40)) break;
 274   }
 275   fl_utf8decode(a,end,&len);
 276   if (a+len > p) return a;
 277   return p;
 278 }
 279
 280 /*! Returns number of bytes that fl_utf8encode() will use to encode the
 281   character \p ucs. */
 282 int fl_utf8bytes(unsigned ucs) {
 283   if (ucs < 0x000080U) {
 284     return 1;
 285   } else if (ucs < 0x000800U) {
 286     return 2;
 287   } else if (ucs < 0x010000U) {
 288     return 3;
 289   } else if (ucs <= 0x10ffffU) {
 290     return 4;
 291   } else {
 292     return 3; /* length of the illegal character encoding */
 293   }
 294 }
 295
 296 /*! Write the UTF-8 encoding of \e ucs into \e buf and return the
 297     number of bytes written. Up to 4 bytes may be written. If you know
 298     that \p ucs is less than 0x10000 then at most 3 bytes will be written.
 299     If you wish to speed this up, remember that anything less than 0x80
 300     is written as a single byte.
 301
 302     If ucs is greater than 0x10ffff this is an illegal character
 303     according to RFC 3629. These are converted as though they are
 304     0xFFFD (REPLACEMENT CHARACTER).
 305
 306     RFC 3629 also says many other values for \p ucs are illegal (in
 307     the range 0xd800 to 0xdfff, or ending with 0xfffe or
 308     0xffff). However I encode these as though they are legal, so that
 309     fl_utf8encode/fl_utf8decode will be the identity for all codes between 0
 310     and 0x10ffff.
 311 */
 312 int fl_utf8encode(unsigned ucs, char* buf) {
 313   if (ucs < 0x000080U) {
 314     buf[0] = ucs;
 315     return 1;
 316   } else if (ucs < 0x000800U) {
 317     buf[0] = 0xc0 | (ucs >> 6);
 318     buf[1] = 0x80 | (ucs & 0x3F);
 319     return 2;
 320   } else if (ucs < 0x010000U) {
 321     buf[0] = 0xe0 | (ucs >> 12);
 322     buf[1] = 0x80 | ((ucs >> 6) & 0x3F);
 323     buf[2] = 0x80 | (ucs & 0x3F);
 324     return 3;
 325   } else if (ucs <= 0x0010ffffU) {
 326     buf[0] = 0xf0 | (ucs >> 18);
 327     buf[1] = 0x80 | ((ucs >> 12) & 0x3F);
 328     buf[2] = 0x80 | ((ucs >> 6) & 0x3F);
 329     buf[3] = 0x80 | (ucs & 0x3F);
 330     return 4;
 331   } else {
 332     /* encode 0xfffd: */
 333     buf[0] = 0xefU;
 334     buf[1] = 0xbfU;
 335     buf[2] = 0xbdU;
 336     return 3;
 337   }
 338 }
 339
 340 /*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
 341     characters. These are used by some system calls, especially on Windows.
 342
 343     \p ucs is the value to convert.
 344
 345     \p dst points at an array to write, and \p dstlen is the number of
 346     locations in this array. At most \p dstlen words will be
 347     written, and a 0 terminating word will be added if \p dstlen is
 348     large enough. Thus this function will never overwrite the buffer
 349     and will attempt to return a zero-terminated string if space permits.
 350     If \p dstlen is zero then \p dst can be set to NULL and no data
 351     is written, but the length is returned.
 352
 353     The return value is the number of 16-bit words that \e would be written
 354     to \p dst if it is large enough, not counting any terminating
 355     zero.
 356
 357     If the return value is greater than \p dstlen it indicates truncation,
 358     you should then allocate a new array of size return+1 and call this again.
 359
 360     Unicode characters in the range 0x10000 to 0x10ffff are converted to
 361     "surrogate pairs" which take two words each (in UTF-16 encoding).
 362     Typically, setting \p dstlen to 2 will ensure that any valid Unicode
 363     value can be converted, and setting \p dstlen to 3 or more will allow
 364     a NULL terminated sequence to be returned.
 365 */
 366 unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen)
 367 {
 368   /* The rule for direct conversion from UCS to UTF16 is:
 369    * - if UCS >  0x0010FFFF then UCS is invalid
 370    * - if UCS >= 0xD800 && UCS <= 0xDFFF UCS is invalid
 371    * - if UCS <= 0x0000FFFF then U16 = UCS, len = 1
 372    * - else
 373    * -- U16[0] = ((UCS - 0x00010000) >> 10) & 0x3FF + 0xD800
 374    * -- U16[1] = (UCS & 0x3FF) + 0xDC00
 375    * -- len = 2;
 376    */
 377   unsigned count;        /* Count of converted UTF16 cells */
 378   unsigned short u16[4]; /* Alternate buffer if dst is not set */
 379   unsigned short *out;   /* points to the active buffer */
 380   /* Ensure we have a valid buffer to write to */
 381   if((!dstlen) || (!dst)) {
 382     out = u16;
 383   } else {
 384     out = dst;
 385   }
 386   /* Convert from UCS to UTF16 */
 387   if((ucs > 0x0010FFFF) || /* UCS is too large */
 388   ((ucs > 0xD7FF) && (ucs < 0xE000))) { /* UCS in invalid range */
 389     out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
 390     count = 1;
 391   } else if(ucs < 0x00010000) {
 392     out[0] = (unsigned short)ucs;
 393     count = 1;
 394   } else if(dstlen < 2) { /* dst is too small for the result */
 395     out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
 396     count = 2;
 397   } else {
 398     out[0] = (((ucs - 0x00010000) >> 10) & 0x3FF) + 0xD800;
 399     out[1] = (ucs & 0x3FF) + 0xDC00;
 400     count = 2;
 401   }
 402   /* NULL terminate the output, if there is space */
 403   if(count < dstlen) { out[count] = 0; }
 404   return count;
 405 } /* fl_ucs_to_Utf16 */
 406
 407 /*! Convert a UTF-8 sequence into an array of 16-bit characters. These
 408     are used by some system calls, especially on Windows.
 409
 410     \p src points at the UTF-8, and \p srclen is the number of bytes to
 411     convert.
 412
 413     \p dst points at an array to write, and \p dstlen is the number of
 414     locations in this array. At most \p dstlen-1 words will be
 415     written there, plus a 0 terminating word. Thus this function
 416     will never overwrite the buffer and will always return a
 417     zero-terminated string. If \p dstlen is zero then \p dst can be
 418     null and no data is written, but the length is returned.
 419
 420     The return value is the number of 16-bit words that \e would be written
 421     to \p dst if it were long enough, not counting the terminating
 422     zero. If the return value is greater or equal to \p dstlen it
 423     indicates truncation, you can then allocate a new array of size
 424     return+1 and call this again.
 425
 426     Errors in the UTF-8 are converted as though each byte in the
 427     erroneous string is in the Microsoft CP1252 encoding. This allows
 428     ISO-8859-1 text mistakenly identified as UTF-8 to be printed
 429     correctly.
 430
 431     Unicode characters in the range 0x10000 to 0x10ffff are converted to
 432     "surrogate pairs" which take two words each (this is called UTF-16
 433     encoding).
 434 */
 435 unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
 436                   unsigned short* dst, unsigned dstlen)
 437 {
 438   const char* p = src;
 439   const char* e = src+srclen;
 440   unsigned count = 0;
 441   if (dstlen) for (;;) {
 442     if (p >= e) {dst[count] = 0; return count;}
 443     if (!(*p & 0x80)) { /* ascii */
 444       dst[count] = *p++;
 445     } else {
 446       int len; unsigned ucs = fl_utf8decode(p,e,&len);
 447       p += len;
 448       if (ucs < 0x10000) {
 449         dst[count] = ucs;
 450       } else {
 451         /* make a surrogate pair: */
 452         if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
 453         dst[count] = (((ucs-0x10000u)>>10)&0x3ff) | 0xd800;
 454         dst[++count] = (ucs&0x3ff) | 0xdc00;
 455       }
 456     }
 457     if (++count == dstlen) {dst[count-1] = 0; break;}
 458   }
 459   /* we filled dst, measure the rest: */
 460   while (p < e) {
 461     if (!(*p & 0x80)) p++;
 462     else {
 463       int len; unsigned ucs = fl_utf8decode(p,e,&len);
 464       p += len;
 465       if (ucs >= 0x10000) ++count;
 466     }
 467     ++count;
 468   }
 469   return count;
 470 }
 471
 472
 473 /**
 474   Converts a UTF-8 string into a wide character string.
 475
 476   This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
 477   on Windows where it is equivalent to fl_utf8toUtf16 and returns
 478   UTF-16.
 479
 480   \p src points at the UTF-8, and \p srclen is the number of bytes to
 481   convert.
 482
 483   \p dst points at an array to write, and \p dstlen is the number of
 484   locations in this array. At most \p dstlen-1 wchar_t will be
 485   written there, plus a 0 terminating wchar_t.
 486
 487   The return value is the number of wchar_t that \e would be written
 488   to \p dst if it were long enough, not counting the terminating
 489   zero. If the return value is greater or equal to \p dstlen it
 490   indicates truncation, you can then allocate a new array of size
 491   return+1 and call this again.
 492
 493   Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
 494   and most other systems. Where wchar_t is 16 bits, Unicode
 495   characters in the range 0x10000 to 0x10ffff are converted to
 496   "surrogate pairs" which take two words each (this is called UTF-16
 497   encoding). If wchar_t is 32 bits this rather nasty problem is
 498   avoided.
 499
 500   Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
 501   layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
 502   */
 503 unsigned fl_utf8towc(const char* src, unsigned srclen,
 504                   wchar_t* dst, unsigned dstlen)
 505 {
 506 #if defined(WIN32) || defined(__CYGWIN__)
 507   return fl_utf8toUtf16(src, srclen, (unsigned short*)dst, dstlen);
 508 #else
 509   const char* p = src;
 510   const char* e = src+srclen;
 511   unsigned count = 0;
 512   if (dstlen) for (;;) {
 513     if (p >= e) {
 514       dst[count] = 0;
 515       return count;
 516     }
 517     if (!(*p & 0x80)) { /* ascii */
 518       dst[count] = *p++;
 519     } else {
 520       int len; unsigned ucs = fl_utf8decode(p,e,&len);
 521       p += len;
 522       dst[count] = (wchar_t)ucs;
 523     }
 524     if (++count == dstlen) {dst[count-1] = 0; break;}
 525   }
 526   /* we filled dst, measure the rest: */
 527   while (p < e) {
 528     if (!(*p & 0x80)) p++;
 529     else {
 530       int len; fl_utf8decode(p,e,&len);
 531       p += len;
 532     }
 533     ++count;
 534   }
 535   return count;
 536 #endif
 537 }
 538
 539 /*! Convert a UTF-8 sequence into an array of 1-byte characters.
 540
 541     If the UTF-8 decodes to a character greater than 0xff then it is
 542     replaced with '?'.
 543
 544     Errors in the UTF-8 are converted as individual bytes, same as
 545     fl_utf8decode() does. This allows ISO-8859-1 text mistakenly identified
 546     as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
 547
 548     \p src points at the UTF-8, and \p srclen is the number of bytes to
 549     convert.
 550
 551     Up to \p dstlen bytes are written to \p dst, including a null
 552     terminator. The return value is the number of bytes that would be
 553     written, not counting the null terminator. If greater or equal to
 554     \p dstlen then if you malloc a new array of size n+1 you will have
 555     the space needed for the entire string. If \p dstlen is zero then
 556     nothing is written and this call just measures the storage space
 557     needed.
 558 */
 559 unsigned fl_utf8toa(const char* src, unsigned srclen,
 560                  char* dst, unsigned dstlen)
 561 {
 562   const char* p = src;
 563   const char* e = src+srclen;
 564   unsigned count = 0;
 565   if (dstlen) for (;;) {
 566     unsigned char c;
 567     if (p >= e) {dst[count] = 0; return count;}
 568     c = *(unsigned char*)p;
 569     if (c < 0xC2) { /* ascii or bad code */
 570       dst[count] = c;
 571       p++;
 572     } else {
 573       int len; unsigned ucs = fl_utf8decode(p,e,&len);
 574       p += len;
 575       if (ucs < 0x100) dst[count] = ucs;
 576       else dst[count] = '?';
 577     }
 578     if (++count >= dstlen) {dst[count-1] = 0; break;}
 579   }
 580   /* we filled dst, measure the rest: */
 581   while (p < e) {
 582     if (!(*p & 0x80)) p++;
 583     else {
 584       int len;
 585       fl_utf8decode(p,e,&len);
 586       p += len;
 587     }
 588     ++count;
 589   }
 590   return count;
 591 }
 592
 593 /*! Turn "wide characters" as returned by some system calls
 594     (especially on Windows) into UTF-8.
 595
 596     Up to \p dstlen bytes are written to \p dst, including a null
 597     terminator. The return value is the number of bytes that would be
 598     written, not counting the null terminator. If greater or equal to
 599     \p dstlen then if you malloc a new array of size n+1 you will have
 600     the space needed for the entire string. If \p dstlen is zero then
 601     nothing is written and this call just measures the storage space
 602     needed.
 603
 604     \p srclen is the number of words in \p src to convert. On Windows
 605     this is not necessarily the number of characters, due to there
 606     possibly being "surrogate pairs" in the UTF-16 encoding used.
 607     On Unix wchar_t is 32 bits and each location is a character.
 608
 609     On Unix if a \p src word is greater than 0x10ffff then this is an
 610     illegal character according to RFC 3629. These are converted as
 611     though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
 612     range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
 613     illegal according to RFC 3629. However I encode these as though
 614     they are legal, so that fl_utf8towc will return the original data.
 615
 616     On Windows "surrogate pairs" are converted to a single character
 617     and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
 618     pairs are converted as though they are individual characters.
 619 */
 620 unsigned fl_utf8fromwc(char* dst, unsigned dstlen,
 621                     const wchar_t* src, unsigned srclen) {
 622   unsigned i = 0;
 623   unsigned count = 0;
 624   if (dstlen) for (;;) {
 625     unsigned ucs;
 626     if (i >= srclen) {dst[count] = 0; return count;}
 627     ucs = src[i++];
 628     if (ucs < 0x80U) {
 629       dst[count++] = ucs;
 630       if (count >= dstlen) {dst[count-1] = 0; break;}
 631     } else if (ucs < 0x800U) { /* 2 bytes */
 632       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
 633       dst[count++] = 0xc0 | (ucs >> 6);
 634       dst[count++] = 0x80 | (ucs & 0x3F);
 635 #if defined(WIN32) || defined(__CYGWIN__)
 636     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
 637                src[i] >= 0xdc00 && src[i] <= 0xdfff) {
 638       /* surrogate pair */
 639       unsigned ucs2 = src[i++];
 640       ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
 641       /* all surrogate pairs turn into 4-byte utf8 */
 642 #else
 643     } else if (ucs >= 0x10000) {
 644       if (ucs > 0x10ffff) {
 645         ucs = 0xfffd;
 646         goto J1;
 647       }
 648 #endif
 649       if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
 650       dst[count++] = 0xf0 | (ucs >> 18);
 651       dst[count++] = 0x80 | ((ucs >> 12) & 0x3F);
 652       dst[count++] = 0x80 | ((ucs >> 6) & 0x3F);
 653       dst[count++] = 0x80 | (ucs & 0x3F);
 654     } else {
 655 #if !(defined(WIN32) || defined(__CYGWIN__))
 656     J1:
 657 #endif
 658       /* all others are 3 bytes: */
 659       if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
 660       dst[count++] = 0xe0 | (ucs >> 12);
 661       dst[count++] = 0x80 | ((ucs >> 6) & 0x3F);
 662       dst[count++] = 0x80 | (ucs & 0x3F);
 663     }
 664   }
 665   /* we filled dst, measure the rest: */
 666   while (i < srclen) {
 667     unsigned ucs = src[i++];
 668     if (ucs < 0x80U) {
 669       count++;
 670     } else if (ucs < 0x800U) { /* 2 bytes */
 671       count += 2;
 672 #if defined(WIN32) || defined(__CYGWIN__)
 673     } else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 &&
 674                src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) {
 675       /* surrogate pair */
 676       ++i;
 677 #else
 678     } else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
 679 #endif
 680       count += 4;
 681     } else {
 682       count += 3;
 683     }
 684   }
 685   return count;
 686 }
 687
 688 /*! Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
 689
 690     It is possible this should convert Microsoft's CP1252 to UTF-8
 691     instead. This would translate the codes in the range 0x80-0x9f
 692     to different characters. Currently it does not do this.
 693
 694     Up to \p dstlen bytes are written to \p dst, including a null
 695     terminator. The return value is the number of bytes that would be
 696     written, not counting the null terminator. If greater or equal to
 697     \p dstlen then if you malloc a new array of size n+1 you will have
 698     the space needed for the entire string. If \p dstlen is zero then
 699     nothing is written and this call just measures the storage space
 700     needed.
 701
 702     \p srclen is the number of bytes in \p src to convert.
 703
 704     If the return value equals \p srclen then this indicates that
 705     no conversion is necessary, as only ASCII characters are in the
 706     string.
 707 */
 708 unsigned fl_utf8froma(char* dst, unsigned dstlen,
 709                    const char* src, unsigned srclen) {
 710   const char* p = src;
 711   const char* e = src+srclen;
 712   unsigned count = 0;
 713   if (dstlen) for (;;) {
 714     unsigned char ucs;
 715     if (p >= e) {dst[count] = 0; return count;}
 716     ucs = *(unsigned char*)p++;
 717     if (ucs < 0x80U) {
 718       dst[count++] = ucs;
 719       if (count >= dstlen) {dst[count-1] = 0; break;}
 720     } else { /* 2 bytes (note that CP1252 translate could make 3 bytes!) */
 721       if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
 722       dst[count++] = 0xc0 | (ucs >> 6);
 723       dst[count++] = 0x80 | (ucs & 0x3F);
 724     }
 725   }
 726   /* we filled dst, measure the rest: */
 727   while (p < e) {
 728     unsigned char ucs = *(unsigned char*)p++;
 729     if (ucs < 0x80U) {
 730       count++;
 731     } else {
 732       count += 2;
 733     }
 734   }
 735   return count;
 736 }
 737
 738 #ifdef WIN32
 739 # include <windows.h>
 740 #endif
 741
 742 /*! Return true if the "locale" seems to indicate that UTF-8 encoding
 743     is used. If true the fl_utf8to_mb and fl_utf8from_mb don't do anything
 744     useful.
 745
 746     <i>It is highly recommended that you change your system so this
 747     does return true.</i> On Windows this is done by setting the
 748     "codepage" to CP_UTF8.  On Unix this is done by setting $LC_CTYPE
 749     to a string containing the letters "utf" or "UTF" in it, or by
 750     deleting all $LC* and $LANG environment variables. In the future
 751     it is likely that all non-Asian Unix systems will return true,
 752     due to the compatibility of UTF-8 with ISO-8859-1.
 753 */
 754 int fl_utf8locale(void) {
 755   static int ret = 2;
 756   if (ret == 2) {
 757 #ifdef WIN32
 758     ret = GetACP() == CP_UTF8;
 759 #else
 760     char* s;
 761     ret = 1; /* assume UTF-8 if no locale */
 762     if (((s = getenv("LC_CTYPE")) && *s) ||
 763         ((s = getenv("LC_ALL"))   && *s) ||
 764         ((s = getenv("LANG"))     && *s)) {
 765       ret = (strstr(s,"utf") || strstr(s,"UTF"));
 766     }
 767 #endif
 768   }
 769   return ret;
 770 }
 771
 /*! Convert the UTF-8 used by FLTK to the locale-specific multibyte
     encoding used for filenames (and sometimes for data in files).
     FLTK does not do this implicitly: callers have to convert
     filenames themselves wherever it is needed, on both Unix and
     Windows.

     At most \p dstlen bytes are written to \p dst, including the null
     terminator. The return value is the number of bytes the complete
     result needs, not counting the null terminator. If it is greater
     or equal to \p dstlen the result did not fit; allocating an array
     of size n+1 and calling again yields the entire string. If
     \p dstlen is zero nothing is written and the call only measures
     the storage space needed.

     When the converted text does not fit, \p dst receives as many
     whole characters as fit (POSIX) or an empty string (Windows); it
     is null-terminated in both cases.

     If fl_utf8locale() returns true, or if the conversion fails or
     runs out of memory, the data is not changed: \p src is copied to
     \p dst when \p srclen < \p dstlen (otherwise nothing is written)
     and \p srclen is returned.

     \param[in]  src    UTF-8 text
     \param[in]  srclen number of bytes in \p src
     \param[out] dst    receives the multibyte text
     \param[in]  dstlen size of \p dst in bytes, may be zero
     \return number of bytes needed, not counting the null terminator
 */
 unsigned fl_utf8to_mb(const char* src, unsigned srclen,
                   char* dst, unsigned dstlen)
 {
   if (!fl_utf8locale()) {
     /* Widen first: both platform converters start from wchar_t. */
     wchar_t lbuf[1024];
     wchar_t* buf = lbuf;
     unsigned length = fl_utf8towc(src, srclen, buf, 1024);
     if (length >= 1024) {
       /* did not fit the stack buffer, redo the conversion on the heap */
       buf = (wchar_t*)malloc(((size_t)length+1)*sizeof(wchar_t));
       if (buf) fl_utf8towc(src, srclen, buf, length+1);
     }
     /* buf is NULL only if malloc failed: use the identity transform */
     if (buf) {
 #ifdef WIN32
       unsigned ret = 0;
       /* Offer only dstlen-1 bytes so the terminator written below always
        * stays inside dst. A size of 0 would switch the call into
        * "measure" mode, hence the dstlen > 1 test.
        */
       if (dstlen > 1)
         ret =
           WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen-1, 0, 0);
       /* with an explicit source length the output is not terminated */
       if (dstlen) dst[ret] = 0;
       /* 0 means "did not fit" or "not attempted": get the actual length */
       if (ret == 0 && length)
         ret =
           WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
       if (buf != lbuf) free((void*)buf);
       return ret;
 #else
       size_t n;
       if (dstlen) {
         /* keep one byte in reserve for the terminator */
         n = wcstombs(dst, buf, dstlen-1);
         if (n != (size_t)-1) {
           dst[n] = 0;
           /* wcstombs() stops before a character that does not fit, so
            * the output may be truncated whenever fewer than MB_CUR_MAX
            * bytes were left over: measure the real length in that case.
            */
           if (n + MB_CUR_MAX >= dstlen) n = wcstombs(0, buf, 0);
         }
       } else {
         n = wcstombs(0, buf, 0);
       }
       if (buf != lbuf) free((void*)buf);
       if (n != (size_t)-1) return (unsigned)n;
       /* on any errors we return the UTF-8 as raw text...*/
 #endif
     }
   }
   /* identity transform: */
   if (srclen < dstlen) {
     memcpy(dst, src, srclen);
     dst[srclen] = 0;
   } else {
     /* Buffer insufficient or buffer query: nothing is written */
   }
   return srclen;
 }
 843
 /*! Convert a filename from the locale-specific multibyte encoding
     to UTF-8 as used by FLTK.

     At most \p dstlen bytes are written to \p dst, including the null
     terminator. The return value is the number of bytes the complete
     result needs, not counting the null terminator. If it is greater
     or equal to \p dstlen the result did not fit; allocating an array
     of size n+1 and calling again yields the entire string. If
     \p dstlen is zero nothing is written and the call only measures
     the storage space needed.

     If fl_utf8locale() returns true, or if the conversion fails or
     runs out of memory, the data is not changed: \p src is copied to
     \p dst when \p srclen < \p dstlen (otherwise nothing is written)
     and \p srclen is returned.

     You may also want to check if fl_utf8test() returns non-zero, so
     that the filesystem can store filenames in UTF-8 encoding
     regardless of the locale.

     \note Outside of Windows the conversion is done with mbstowcs(),
     which reads \p src up to its null terminator; \p srclen is only
     used by the identity transform there.

     \param[out] dst    receives the UTF-8 text
     \param[in]  dstlen size of \p dst in bytes, may be zero
     \param[in]  src    text in the locale's multibyte encoding
     \param[in]  srclen number of bytes in \p src
     \return number of bytes needed, not counting the null terminator
 */
 unsigned fl_utf8from_mb(char* dst, unsigned dstlen,
                     const char* src, unsigned srclen)
 {
   if (!fl_utf8locale()) {
     wchar_t lbuf[1024];
     wchar_t* buf = lbuf;
 #ifdef WIN32
     unsigned length;
     length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
     if ((length == 0)&&(GetLastError()==ERROR_INSUFFICIENT_BUFFER)) {
       /* too long for the stack buffer: measure, then convert on the heap */
       length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
       buf = length ? (wchar_t*)malloc((size_t)length*sizeof(wchar_t)) : 0;
       if (buf) MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
     }
     /* buf is NULL only if the heap buffer could not be obtained */
     if (buf) {
       unsigned ret = fl_utf8fromwc(dst, dstlen, buf, length);
       if (buf != lbuf) free((void*)buf);
       return ret;
     }
 #else
     /* number of wide characters, excluding the terminator;
      * (size_t)-1 flags a failed conversion
      */
     size_t count = mbstowcs(buf, src, 1024);
     if (count != (size_t)-1 && count >= 1024) {
       /* stack buffer was filled completely: measure, then use the heap */
       count = mbstowcs(0, src, 0);
       if (count != (size_t)-1) {
         buf = (wchar_t*)malloc((count+1)*sizeof(wchar_t));
         if (buf) mbstowcs(buf, src, count+1);
         else count = (size_t)-1;
       }
     }
     if (count != (size_t)-1) {
       /* pass the character count only, the terminator is not text */
       unsigned ret = fl_utf8fromwc(dst, dstlen, buf, (unsigned)count);
       if (buf != lbuf) free((void*)buf);
       return ret;
     }
 #endif
     /* errors in conversion return the input unchanged */
   }
   /* identity transform: */
   if (srclen < dstlen) {
     memcpy(dst, src, srclen);
     dst[srclen] = 0;
   } else {
     /* Buffer insufficient or buffer query: nothing is written */
   }
   return srclen;
 }
 907
 /*! Looks at the first \p srclen bytes of \p src and reports whether
     they form valid UTF-8, and if so how long the longest encoded
     character is.
     - Returns 0 as soon as an illegal UTF-8 sequence is found, using
       the same rules as fl_utf8decode(). Note that some UCS values
       considered illegal by RFC 3629, such as 0xffff, are considered
       legal by this.
     - Returns 1 if there are only single-byte characters (i.e. no byte
       has the high bit set). This is legal UTF-8, but also indicates
       plain ASCII. 1 is also returned if \p srclen is zero.
     - Returns 2 if there are only characters less than 0x800.
     - Returns 3 if there are only characters less than 0x10000.
     - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.

     Because there are many illegal sequences in UTF-8, it is almost
     impossible for a string in another encoding to be confused with
     UTF-8. This is very useful for transitioning Unix to UTF-8
     filenames: simply test each filename with this function to decide
     whether it is UTF-8 or in the locale encoding. The hope is that
     this allows a clean transition to a locale-less encoding.

     \param[in] src    bytes to examine
     \param[in] srclen number of bytes to examine
     \return 0 for invalid UTF-8, otherwise the verdict 1..4 as above
 */
 int fl_utf8test(const char* src, unsigned srclen) {
   const char* const end = src + srclen;
   const char* cur = src;
   int verdict = 1;
   while (cur < end) {
     int seqlen;
     if (!(*cur & 0x80)) {
       /* plain ASCII byte */
       cur++;
       continue;
     }
     /* only the length of the sequence is of interest here */
     fl_utf8decode(cur, end, &seqlen);
     if (seqlen < 2) return 0;
     if (verdict < seqlen) verdict = seqlen;
     cur += seqlen;
   }
   return verdict;
 }
 944
 945 /* forward declare mk_wcwidth() as static so the name is not visible.
 946  */
 947  static int mk_wcwidth(unsigned int ucs);
 948
 /* include the C source directly so its contents are only visible here
  */
 951 #include "xutf8/mk_wcwidth.c"
 952
 /** Adapter that exposes Markus Kuhn's wcwidth() implementation to FLTK.
     \param [in] ucs Unicode character value
     \returns width of character in columns

     See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for Markus Kuhn's
     original implementation of wcwidth() and wcswidth()
     (defined in IEEE Std 1003.1-2001) for Unicode.

     \b WARNING: this function returns widths for "raw" Unicode characters.
     It makes no attempt to map C1 control characters (0x80 to 0x9F) to
     CP1252, and C0/C1 control characters and DEL will return -1.
     You are advised to use fl_width(const char* src) instead.
  */
 int fl_wcwidth_(unsigned int ucs) {
   const int columns = mk_wcwidth(ucs);
   return columns;
 }
 969
 /** Column width of the character at the start of a UTF-8 byte sequence.
     Decodes one character with fl_utf8decode() and hands the result to
     fl_wcwidth_(unsigned int ucs).
     \param[in] src pointer to start of UTF-8 byte sequence
     \returns width of character in columns

     Depending on build options, this function may map C1 control
     characters (0x80 to 0x9f) to CP1252, and return the width of
     that character instead. This is not the same behaviour as
     fl_wcwidth_(unsigned int ucs) .

     Note that other control characters and DEL will still return -1,
     so if you want different behaviour, you need to test for those
     characters before calling fl_wcwidth(), and handle them separately.
  */
 int fl_wcwidth(const char* src) {
   /* fl_utf8decode() requires a length output; the value is not used */
   int consumed = 0;
   /* NOTE(review): if fl_utf8len() can report a non-positive length for
    * a byte that cannot start a sequence, this end pointer would not lie
    * past src - confirm against fl_utf8len().
    */
   const char* const end = src + fl_utf8len(*src);
   return fl_wcwidth_(fl_utf8decode(src, end, &consumed));
 }
 990
 991 /** @} */
 992
 993 /*
 994  * End of "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $".
 995  */