themes: Workaround for bug where a background color of RGB 0,0,0 in Black color schem...
[ntk.git] / src / fl_utf.c
blobdbdcd502e6df163e53b163ccf727c538f5ace504
1 /*
2 * "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $"
4 * This is the utf.c file from fltk2 adapted for use in my fltk1.1 port
5 */
6 /* Copyright 2006-2011 by Bill Spitzak and others.
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
18 * You should have received a copy of the GNU Library General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * USA.
23 * Please report all bugs and problems on the following page:
25 * http://www.fltk.org/str.php
28 /* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */
30 #include <FL/fl_utf8.h>
31 #include <string.h>
32 #include <stdlib.h>
34 /** \addtogroup fl_unicode
39 #if 0
40 /**
41 \defgroup fl_unichar Unicode Character Functions
42 Global Functions Handling Single Unicode Characters
43 @{ */
45 /**
46 Converts a Unicode character into a utf-8 sequence.
47 \param[in] uc Unicode character
48 \param[out] text utf-8 sequence will be written here; if this pointer is
49 \c NULL, only the length of the utf-8 sequence is calculated
50 \return length of the sequence in bytes
52 /* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
54 /** @} */
56 /**
57 \defgroup fl_utf8 Unicode String Functions
58 Global Functions Handling Unicode Text
59 @{ */
61 /**
62 Calculate the size of a utf-8 sequence for a Unicode character.
63 \param[in] uc Unicode character
64 \return length of the sequence in bytes
66 /* FL_EXPORT int fl_utf8_size(unsigned int uc); */
68 /** @} */
69 #endif /* 0 */
/*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is set to zero
  they are instead turned into the Unicode REPLACEMENT CHARACTER, of
  value 0xfffd.
  If this is on fl_utf8decode() will correctly map most (perhaps all)
  human-readable text that is in ISO-8859-1. This may allow you
  to completely ignore character sets in your code because virtually
  everything is either ISO-8859-1 or UTF-8.
*/
#define ERRORS_TO_ISO8859_1 1

/*!Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
  Unicode index for Microsoft's CP1252 character set. You should
  also set ERRORS_TO_ISO8859_1. With this a huge amount of more
  available text (such as all web pages) is correctly converted
  to Unicode.
*/
#define ERRORS_TO_CP1252 1

/*!A number of Unicode code points are in fact illegal and should not
  be produced by a UTF-8 converter. Turning this on will replace the
  bytes in those encodings with errors. If you do this then converting
  arbitrary 16-bit data to UTF-8 and then back is not an identity,
  which will probably break a lot of software.
*/
#define STRICT_RFC3629 0

#if ERRORS_TO_CP1252
/* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
 * to Unicode. Indexed by (byte - 0x80). The table is only ever read,
 * so it is const.
 */
static const unsigned short cp1252[32] = {
  0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
  0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
  0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
  0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
};
#endif
/*! Decode a single UTF-8 encoded character starting at \e p. The
    resulting Unicode value (in the range 0-0x10ffff) is returned,
    and \e len is set to the number of bytes in the UTF-8 encoding
    (adding \e len to \e p will point at the next character).

    If \p p points at an illegal UTF-8 encoding, including one that
    would go past \e end, or where a code uses more bytes than
    necessary, then \e len is set to 1 and the single byte
    *(unsigned char*)p is translated on its own: bytes 0x80-0x9f map
    through the CP1252 table when ERRORS_TO_CP1252 is set, and other
    bad bytes are returned unchanged when ERRORS_TO_ISO8859_1 is set
    (otherwise 0xfffd, the REPLACEMENT CHARACTER, is returned).
    Treating errors this way allows this to decode almost any
    ISO-8859-1 or CP1252 text that has been mistakenly placed where
    UTF-8 is expected.

    If you want errors to be converted to error characters (as the
    standards recommend), adding a test to see if the length is
    unexpectedly 1 will work:

    \code
    if (*p & 0x80) {              // what should be a multibyte encoding
      code = fl_utf8decode(p,end,&len);
      if (len<2) code = 0xFFFD;   // Turn errors into REPLACEMENT CHARACTER
    } else {                      // handle the 1-byte utf8 encoding:
      code = *p;
      len = 1;
    }
    \endcode

    Direct testing for the 1-byte case (as shown above) will also
    speed up the scanning of strings where the majority of characters
    are ASCII.

    \param[in]  p    first byte of the character; must be readable
    \param[in]  end  one past the last readable byte, or NULL if the
                     text is terminated by a non-continuation byte
                     (for example a NUL)
    \param[out] len  receives the number of bytes consumed; may be NULL
    \return the decoded code point, or the error translation above
*/
unsigned fl_utf8decode(const char* p, const char* end, int* len)
{
  const unsigned char* s = (const unsigned char*)p;
  const unsigned char c = s[0];
  int need;   /* total number of bytes the lead byte announces */
  int i;

  if (c < 0x80) {
    if (len) *len = 1;
    return c;
  }
#if ERRORS_TO_CP1252
  if (c < 0xa0) {
    if (len) *len = 1;
    return cp1252[c-0x80];
  }
#endif
  /* 0x80-0xbf are stray continuation bytes, 0xc0/0xc1 can only start
     overlong encodings, and 0xf5-0xff would encode past 0x10ffff. */
  if (c < 0xc2 || c > 0xf4) goto FAIL;
  need = (c < 0xe0) ? 2 : (c < 0xf0) ? 3 : 4;

  /* Never look at a byte at or beyond end. */
  if (end && end - p < need) goto FAIL;

  /* Bytes are examined strictly in order, stopping at the first bad one,
     so that with end == NULL nothing past a terminator is ever read. */
  if ((s[1] & 0xc0) != 0x80) goto FAIL;
  switch (c) {
  case 0xe0: if (s[1] < 0xa0) goto FAIL; break; /* overlong 3-byte form */
  case 0xf0: if (s[1] < 0x90) goto FAIL; break; /* overlong 4-byte form */
  case 0xf4: if (s[1] > 0x8f) goto FAIL; break; /* after 0x10ffff */
#if STRICT_RFC3629
  case 0xed: if (s[1] >= 0xa0) goto FAIL; break; /* surrogates are illegal */
#endif
  default: break;
  }
  for (i = 2; i < need; i++) {
    if ((s[i] & 0xc0) != 0x80) goto FAIL;
  }
#if STRICT_RFC3629
  /* RFC 3629 says all codes ending in fffe or ffff are illegal. These
     tests run only after every byte has been bounds-checked above. */
  if (need == 3 && c == 0xef && s[1] == 0xbf && s[2] >= 0xbe) goto FAIL;
  if (need == 4 && (s[1]&0xf) == 0xf && s[2] == 0xbf && s[3] >= 0xbe) goto FAIL;
#endif

  if (len) *len = need;
  if (need == 2) {
    return ((unsigned)(c & 0x1f) << 6) +
           (unsigned)(s[1] & 0x3f);
  }
  if (need == 3) {
    return ((unsigned)(c & 0x0f) << 12) +
           ((unsigned)(s[1] & 0x3f) << 6) +
           (unsigned)(s[2] & 0x3f);
  }
  return ((unsigned)(c & 0x07) << 18) +
         ((unsigned)(s[1] & 0x3f) << 12) +
         ((unsigned)(s[2] & 0x3f) << 6) +
         (unsigned)(s[3] & 0x3f);

FAIL:
  if (len) *len = 1;
#if ERRORS_TO_ISO8859_1
  return c;
#else
  return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
#endif
}
/*! Move \p p forward until it points to the start of a UTF-8
  character. If it already points at the start of one then it
  is returned unchanged. Any UTF-8 errors are treated as though each
  byte of the error is an individual character.

  \e start is the start of the string and is used to limit the
  backwards search for the start of a UTF-8 character.

  \e end is the end of the string and is assumed to be a break
  between characters. It is assumed to be greater than p.

  This function is for moving a pointer that was jumped to the
  middle of a string, such as when doing a binary search for
  a position. You should use either this or fl_utf8back() depending
  on which direction your algorithm can handle the pointer
  moving. Do not use this to scan strings, use fl_utf8decode()
  instead.
*/
const char* fl_utf8fwd(const char* p, const char* start, const char* end)
{
  const char* lead = p;
  int nbytes;

  /* Anything other than a continuation byte already starts a character. */
  if ((*p & 0xc0) != 0x80) return p;

  /* Walk backwards looking for the lead byte (top two bits set). */
  for (;;) {
    if (lead <= start) return p;          /* ran off the front */
    --lead;
    if ((lead[0] & 0x80) == 0) return p;  /* ASCII: p is a stray byte */
    if ((lead[0] & 0x40) != 0) break;     /* found a lead byte */
  }

  /* If the character starting at lead extends past p, step over it. */
  fl_utf8decode(lead, end, &nbytes);
  lead += nbytes;
  return (lead > p) ? lead : p;
}
/*! Move \p p backward until it points to the start of a UTF-8
  character. If it already points at the start of one then it
  is returned unchanged. Any UTF-8 errors are treated as though each
  byte of the error is an individual character.

  \e start is the start of the string and is used to limit the
  backwards search for the start of a UTF-8 character.

  \e end is the end of the string and is assumed to be a break
  between characters. It is assumed to be greater than p.

  If you wish to decrement a UTF-8 pointer, pass p-1 to this.
*/
const char* fl_utf8back(const char* p, const char* start, const char* end)
{
  const char* lead = p;
  int nbytes;

  /* Anything other than a continuation byte already starts a character. */
  if ((*p & 0xc0) != 0x80) return p;

  /* Walk backwards looking for the lead byte (top two bits set). */
  for (;;) {
    if (lead <= start) return p;          /* ran off the front */
    --lead;
    if ((lead[0] & 0x80) == 0) return p;  /* ASCII: p is a stray byte */
    if ((lead[0] & 0x40) != 0) break;     /* found a lead byte */
  }

  /* p belongs to the character at lead only if that character reaches p. */
  fl_utf8decode(lead, end, &nbytes);
  return (lead + nbytes > p) ? lead : p;
}
/*! Returns the number of bytes that fl_utf8encode() will use to encode
  the character \p ucs. Values above 0x10ffff report 3, the length of
  the encoding of the illegal-character replacement. */
int fl_utf8bytes(unsigned ucs) {
  if (ucs > 0x10ffffU)   return 3; /* length of the illegal character encoding */
  if (ucs >= 0x010000U)  return 4;
  if (ucs >= 0x000800U)  return 3;
  if (ucs >= 0x000080U)  return 2;
  return 1;
}
/*! Write the UTF-8 encoding of \e ucs into \e buf and return the
  number of bytes written. Up to 4 bytes may be written. If you know
  that \p ucs is less than 0x10000 then at most 3 bytes will be written.
  If you wish to speed this up, remember that anything less than 0x80
  is written as a single byte.

  If ucs is greater than 0x10ffff this is an illegal character
  according to RFC 3629. These are converted as though they are
  0xFFFD (REPLACEMENT CHARACTER).

  RFC 3629 also says many other values for \p ucs are illegal (in
  the range 0xd800 to 0xdfff, or ending with 0xfffe or
  0xffff). However these are encoded as though they are legal, so that
  fl_utf8encode/fl_utf8decode will be the identity for all codes
  between 0 and 0x10ffff.
*/
int fl_utf8encode(unsigned ucs, char* buf) {
  /* Lead-byte marker for 2, 3 and 4 byte sequences. */
  static const unsigned char lead_mark[5] = { 0, 0, 0xc0, 0xe0, 0xf0 };
  int n;
  int k;

  if (ucs > 0x0010ffffU) ucs = 0xfffdU;  /* illegal: encode 0xfffd */

  if (ucs < 0x000080U) {
    buf[0] = (char)ucs;
    return 1;
  }
  n = (ucs < 0x000800U) ? 2 : (ucs < 0x010000U) ? 3 : 4;

  /* Fill the continuation bytes from the end, six bits at a time. */
  for (k = n - 1; k > 0; --k) {
    buf[k] = (char)(0x80 | (ucs & 0x3F));
    ucs >>= 6;
  }
  buf[0] = (char)(lead_mark[n] | ucs);
  return n;
}
/*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
  characters. These are used by some system calls, especially on Windows.

  \p ucs is the value to convert.

  \p dst points at an array to write, and \p dstlen is the number of
  locations in this array. At most \p dstlen words will be
  written, and a 0 terminating word will be added if \p dstlen is
  large enough. Thus this function will never overwrite the buffer
  and will attempt to return a zero-terminated string if space permits.
  If \p dstlen is zero then \p dst can be set to NULL and no data
  is written, but the length is returned.

  The return value is the number of 16-bit words that \e would be written
  to \p dst if it is large enough, not counting any terminating
  zero.

  If the return value is greater than \p dstlen it indicates truncation,
  you should then allocate a new array of size return+1 and call this again.

  Unicode characters in the range 0x10000 to 0x10ffff are converted to
  "surrogate pairs" which take two words each (in UTF-16 encoding).
  Values above 0x10ffff or inside 0xd800-0xdfff are invalid and are
  replaced by 0xFFFD. Typically, setting \p dstlen to 2 will ensure that
  any valid Unicode value can be converted, and setting \p dstlen to 3
  or more will allow a zero-terminated sequence to be returned.
*/
unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen)
{
  unsigned short scratch[4];  /* stand-in target when the caller gave none */
  unsigned short *out = (dst && dstlen) ? dst : scratch;
  unsigned count = 1;         /* number of UTF-16 cells produced */
  const int invalid = (ucs > 0x0010FFFF) ||            /* too large */
                      (ucs >= 0xD800 && ucs <= 0xDFFF); /* surrogate range */

  if (invalid) {
    out[0] = 0xFFFD;          /* REPLACEMENT CHARACTER */
  } else if (ucs <= 0x0000FFFF) {
    out[0] = (unsigned short)ucs;
  } else {
    count = 2;
    if (dstlen >= 2) {
      out[0] = (((ucs - 0x00010000) >> 10) & 0x3FF) + 0xD800;
      out[1] = (ucs & 0x3FF) + 0xDC00;
    } else {
      out[0] = 0xFFFD;        /* no room for the pair */
    }
  }

  /* Zero-terminate the output, if there is space. */
  if (count < dstlen) out[count] = 0;
  return count;
} /* fl_ucs_to_Utf16 */
/*! Convert a UTF-8 sequence into an array of 16-bit characters. These
  are used by some system calls, especially on Windows.

  \p src points at the UTF-8, and \p srclen is the number of bytes to
  convert.

  \p dst points at an array to write, and \p dstlen is the number of
  locations in this array. At most \p dstlen-1 words will be
  written there, plus a 0 terminating word. Thus this function
  will never overwrite the buffer and will always return a
  zero-terminated string. If \p dstlen is zero then \p dst can be
  null and no data is written, but the length is returned.

  The return value is the number of 16-bit words that \e would be written
  to \p dst if it were long enough, not counting the terminating
  zero. If the return value is greater or equal to \p dstlen it
  indicates truncation, you can then allocate a new array of size
  return+1 and call this again.

  Errors in the UTF-8 are converted the way fl_utf8decode() converts
  them, one byte at a time. This allows ISO-8859-1 text mistakenly
  identified as UTF-8 to be printed correctly.

  Unicode characters in the range 0x10000 to 0x10ffff are converted to
  "surrogate pairs" which take two words each (this is called UTF-16
  encoding).
*/
unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
                        unsigned short* dst, unsigned dstlen)
{
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned count = 0;

  if (dstlen) {
    for (;;) {
      if (cur >= stop) {
        dst[count] = 0;
        return count;
      }
      if ((*cur & 0x80) == 0) {           /* ascii */
        dst[count] = *cur++;
      } else {
        int nbytes;
        unsigned ucs = fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
        if (ucs >= 0x10000) {
          /* make a surrogate pair, if both halves and the 0 still fit: */
          if (count + 2 >= dstlen) {
            dst[count] = 0;
            count += 2;
            break;
          }
          dst[count++] = (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800;
          dst[count]   = (ucs & 0x3ff) | 0xdc00;
        } else {
          dst[count] = ucs;
        }
      }
      ++count;
      if (count == dstlen) {
        /* the last word written gives way to the terminator */
        dst[count - 1] = 0;
        break;
      }
    }
  }

  /* dst is full (or absent): only measure what is left. */
  while (cur < stop) {
    if ((*cur & 0x80) == 0) {
      ++cur;
    } else {
      int nbytes;
      unsigned ucs = fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
      if (ucs >= 0x10000) ++count;        /* second half of the pair */
    }
    ++count;
  }
  return count;
}
/**
  Converts a UTF-8 string into a wide character string.

  This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
  on Windows where it is equivalent to fl_utf8toUtf16 and returns
  UTF-16.

  \p src points at the UTF-8, and \p srclen is the number of bytes to
  convert.

  \p dst points at an array to write, and \p dstlen is the number of
  locations in this array. At most \p dstlen-1 wchar_t will be
  written there, plus a 0 terminating wchar_t.

  The return value is the number of wchar_t that \e would be written
  to \p dst if it were long enough, not counting the terminating
  zero. If the return value is greater or equal to \p dstlen it
  indicates truncation, you can then allocate a new array of size
  return+1 and call this again.

  Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
  and most other systems. Where wchar_t is 16 bits, Unicode
  characters in the range 0x10000 to 0x10ffff are converted to
  "surrogate pairs" which take two words each (this is called UTF-16
  encoding). If wchar_t is 32 bits this rather nasty problem is
  avoided.

  Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
  layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
*/
unsigned fl_utf8towc(const char* src, unsigned srclen,
                     wchar_t* dst, unsigned dstlen)
{
#if defined(WIN32) || defined(__CYGWIN__)
  return fl_utf8toUtf16(src, srclen, (unsigned short*)dst, dstlen);
#else
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned count = 0;

  if (dstlen) {
    for (;;) {
      if (cur >= stop) {
        dst[count] = 0;
        return count;
      }
      if ((*cur & 0x80) == 0) {           /* ascii */
        dst[count] = *cur++;
      } else {
        int nbytes;
        unsigned ucs = fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
        dst[count] = (wchar_t)ucs;
      }
      ++count;
      if (count == dstlen) {
        /* the last character written gives way to the terminator */
        dst[count - 1] = 0;
        break;
      }
    }
  }

  /* dst is full (or absent): only measure what is left. */
  while (cur < stop) {
    if ((*cur & 0x80) == 0) {
      ++cur;
    } else {
      int nbytes;
      fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
    }
    ++count;
  }
  return count;
#endif
}
/*! Convert a UTF-8 sequence into an array of 1-byte characters.

  If the UTF-8 decodes to a character greater than 0xff then it is
  replaced with '?'.

  Errors in the UTF-8 are converted as individual bytes. Bytes below
  0xC2 are copied through unchanged; everything else goes through
  fl_utf8decode(). This allows ISO-8859-1 text mistakenly identified
  as UTF-8 to be printed correctly (and possibly CP1252 on Windows).

  \p src points at the UTF-8, and \p srclen is the number of bytes to
  convert.

  Up to \p dstlen bytes are written to \p dst, including a null
  terminator. The return value is the number of bytes that would be
  written, not counting the null terminator. If greater or equal to
  \p dstlen then if you malloc a new array of size n+1 you will have
  the space needed for the entire string. If \p dstlen is zero then
  nothing is written and this call just measures the storage space
  needed.
*/
unsigned fl_utf8toa(const char* src, unsigned srclen,
                    char* dst, unsigned dstlen)
{
  const char* cur = src;
  const char* const stop = src + srclen;
  unsigned count = 0;

  if (dstlen) {
    for (;;) {
      unsigned char lead;
      if (cur >= stop) {
        dst[count] = 0;
        return count;
      }
      lead = *(unsigned char*)cur;
      if (lead >= 0xC2) {
        int nbytes;
        unsigned ucs = fl_utf8decode(cur, stop, &nbytes);
        cur += nbytes;
        if (ucs >= 0x100) dst[count] = '?';
        else dst[count] = ucs;
      } else {                            /* ascii or bad code */
        dst[count] = lead;
        ++cur;
      }
      ++count;
      if (count >= dstlen) {
        /* the last byte written gives way to the terminator */
        dst[count - 1] = 0;
        break;
      }
    }
  }

  /* dst is full (or absent): only measure what is left. */
  while (cur < stop) {
    if ((*cur & 0x80) == 0) {
      ++cur;
    } else {
      int nbytes;
      fl_utf8decode(cur, stop, &nbytes);
      cur += nbytes;
    }
    ++count;
  }
  return count;
}
/* Fetch the next code point from src, advancing *i past every word
   consumed. Where wchar_t holds UTF-16 (Windows, Cygwin) a high surrogate
   immediately followed by a low surrogate is combined into one code
   point; an unpaired half is returned as-is. Anything above 0x10ffff is
   replaced with 0xfffd (REPLACEMENT CHARACTER). */
static unsigned fl_wc_next_ucs_(const wchar_t* src, unsigned srclen,
                                unsigned* i)
{
  unsigned ucs = src[(*i)++];
#if defined(WIN32) || defined(__CYGWIN__)
  if (ucs >= 0xd800 && ucs <= 0xdbff && *i < srclen &&
      src[*i] >= 0xdc00 && src[*i] <= 0xdfff) {
    /* surrogate pair */
    unsigned ucs2 = src[(*i)++];
    ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
  }
#else
  (void)srclen;
#endif
  if (ucs > 0x10ffff) ucs = 0xfffd;
  return ucs;
}

/* Number of UTF-8 bytes needed for a code point in 0..0x10ffff. */
static unsigned fl_wc_utf8_len_(unsigned ucs)
{
  if (ucs < 0x80U)    return 1;
  if (ucs < 0x800U)   return 2;
  if (ucs < 0x10000U) return 3;
  return 4;
}

/*! Turn "wide characters" as returned by some system calls
  (especially on Windows) into UTF-8.

  Up to \p dstlen bytes are written to \p dst, including a null
  terminator. The return value is the number of bytes that would be
  written, not counting the null terminator. If greater or equal to
  \p dstlen then if you malloc a new array of size n+1 you will have
  the space needed for the entire string. If \p dstlen is zero then
  nothing is written and this call just measures the storage space
  needed.

  \p srclen is the number of words in \p src to convert. On Windows
  this is not necessarily the number of characters, due to there
  possibly being "surrogate pairs" in the UTF-16 encoding used.
  On Unix wchar_t is 32 bits and each location is a character.

  On Unix if a \p src word is greater than 0x10ffff then this is an
  illegal character according to RFC 3629. These are converted as
  though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
  range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
  illegal according to RFC 3629. However these are encoded as though
  they are legal, so that fl_utf8towc will return the original data.

  On Windows "surrogate pairs" are converted to a single character
  and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
  pairs are converted as though they are individual characters.

  The converting pass and the measuring pass read \p src through the
  same helper, so the returned length always agrees with what a large
  enough buffer would receive.
*/
unsigned fl_utf8fromwc(char* dst, unsigned dstlen,
                       const wchar_t* src, unsigned srclen) {
  unsigned i = 0;
  unsigned count = 0;
  if (dstlen) for (;;) {
    unsigned ucs;
    unsigned n;
    if (i >= srclen) {dst[count] = 0; return count;}
    ucs = fl_wc_next_ucs_(src, srclen, &i);
    n = fl_wc_utf8_len_(ucs);
    /* A sequence is only stored if it fits together with the terminator;
       otherwise terminate here, count it, and switch to measuring. */
    if (count+n >= dstlen) {dst[count] = 0; count += n; break;}
    switch (n) {
    case 1:
      dst[count++] = (char)ucs;
      break;
    case 2:
      dst[count++] = (char)(0xc0 | (ucs >> 6));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    case 3:
      dst[count++] = (char)(0xe0 | (ucs >> 12));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    default:
      dst[count++] = (char)(0xf0 | (ucs >> 18));
      dst[count++] = (char)(0x80 | ((ucs >> 12) & 0x3F));
      dst[count++] = (char)(0x80 | ((ucs >> 6) & 0x3F));
      dst[count++] = (char)(0x80 | (ucs & 0x3F));
      break;
    }
  }
  /* we filled dst, measure the rest: */
  while (i < srclen) {
    count += fl_wc_utf8_len_(fl_wc_next_ucs_(src, srclen, &i));
  }
  return count;
}
/*! Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.

  It is possible this should convert Microsoft's CP1252 to UTF-8
  instead. This would translate the codes in the range 0x80-0x9f
  to different characters. Currently it does not do this.

  Up to \p dstlen bytes are written to \p dst, including a null
  terminator. The return value is the number of bytes that would be
  written, not counting the null terminator. If greater or equal to
  \p dstlen then if you malloc a new array of size n+1 you will have
  the space needed for the entire string. If \p dstlen is zero then
  nothing is written and this call just measures the storage space
  needed.

  \p srclen is the number of bytes in \p src to convert.

  If the return value equals \p srclen then this indicates that
  no conversion is necessary, as only ASCII characters are in the
  string.
*/
unsigned fl_utf8froma(char* dst, unsigned dstlen,
                      const char* src, unsigned srclen) {
  const unsigned char* in = (const unsigned char*)src;
  const unsigned char* const stop = in + srclen;
  unsigned count = 0;

  if (dstlen) {
    for (;;) {
      unsigned char ch;
      if (in >= stop) {
        dst[count] = 0;
        return count;
      }
      ch = *in++;
      if (ch >= 0x80U) {
        /* 2 bytes (note that a CP1252 translation could make 3 bytes!) */
        if (count + 2 >= dstlen) {
          dst[count] = 0;
          count += 2;
          break;
        }
        dst[count++] = 0xc0 | (ch >> 6);
        dst[count++] = 0x80 | (ch & 0x3F);
      } else {
        dst[count++] = ch;
        if (count >= dstlen) {
          /* the byte just written gives way to the terminator */
          dst[count - 1] = 0;
          break;
        }
      }
    }
  }

  /* dst is full (or absent): only measure what is left. */
  for (; in < stop; ++in) {
    count += (*in < 0x80U) ? 1 : 2;
  }
  return count;
}
738 #ifdef WIN32
739 # include <windows.h>
740 #endif
/*! Return true if the "locale" seems to indicate that UTF-8 encoding
  is used. If true the fl_utf8to_mb and fl_utf8from_mb don't do anything
  useful.

  <i>It is highly recommended that you change your system so this
  does return true.</i> On Windows this is done by setting the
  "codepage" to CP_UTF8. On Unix this is done by setting $LC_ALL,
  $LC_CTYPE or $LANG to a string containing the letters "utf" or "UTF",
  or by deleting all of these environment variables.

  On Unix the variables are consulted in POSIX precedence order:
  $LC_ALL overrides $LC_CTYPE, which overrides $LANG. The first one
  that is set to a non-empty string decides the result. If none is
  set, UTF-8 is assumed.

  The answer is computed on the first call and cached for the lifetime
  of the process; later changes to the environment are not noticed.
*/
int fl_utf8locale(void) {
  static int ret = 2;  /* 2 means "not determined yet" */
  if (ret == 2) {
#ifdef WIN32
    ret = GetACP() == CP_UTF8;
#else
    const char* s;
    ret = 1; /* assume UTF-8 if no locale */
    if (((s = getenv("LC_ALL")) && *s) ||
        ((s = getenv("LC_CTYPE")) && *s) ||
        ((s = getenv("LANG")) && *s)) {
      ret = (strstr(s,"utf") || strstr(s,"UTF"));
    }
#endif
  }
  return ret;
}
/*! Convert the UTF-8 used by FLTK to the locale-specific encoding
  used for filenames (and sometimes used for data in files).
  Unfortunately you will have to do this as needed for filenames.

  Up to \p dstlen bytes are written to \p dst, including a null
  terminator. The return value is the number of bytes that would be
  written, not counting the null terminator. If greater or equal to
  \p dstlen then if you malloc a new array of size n+1 you will have
  the space needed for the entire string. If \p dstlen is zero then
  nothing is written and this call just measures the storage space
  needed.

  If fl_utf8locale() returns true then this does not change the data.
  The data is also passed through unchanged if the conversion fails
  or the temporary wide-character buffer cannot be allocated.
*/
unsigned fl_utf8to_mb(const char* src, unsigned srclen,
                      char* dst, unsigned dstlen)
{
  if (!fl_utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = fl_utf8towc(src, srclen, buf, 1024);
    int have_wide = 1;
    if (length >= 1024) {
      /* Did not fit in the stack buffer: redo the conversion on the heap. */
      buf = (wchar_t*)malloc(((size_t)length + 1) * sizeof(wchar_t));
      if (buf) {
        fl_utf8towc(src, srclen, buf, length+1);
      } else {
        buf = lbuf;
        have_wide = 0;   /* out of memory: fall back to the raw bytes */
      }
    }
    if (have_wide) {
#ifdef WIN32
      unsigned ret = 0;
      if (dstlen) {
        /* WideCharToMultiByte does not null-terminate when given an
         * explicit source length. It can also fill all dstlen bytes, so
         * the terminator is clamped to stay inside dst.
         */
        ret =
          WideCharToMultiByte(GetACP(), 0, buf, length, dst, dstlen, 0, 0);
        dst[ret < dstlen ? ret : dstlen-1] = 0;
      }
      /* If measuring, if dst was (nearly) filled, or if the call failed
       * (it returns 0 when dst is too small), get the actual length:
       */
      if (dstlen==0 || ret==0 || ret >= dstlen-1)
        ret =
          WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
      if (buf != lbuf) free((void*)buf);
      return ret;
#else
      size_t ret;
      if (dstlen) {
        ret = wcstombs(dst, buf, dstlen);
        if (ret != (size_t)-1 && ret >= dstlen-1) {
          /* wcstombs writes no terminator when it fills dst completely */
          dst[dstlen-1] = 0;
          ret = wcstombs(0,buf,0);
        }
      } else {
        ret = wcstombs(0,buf,0);
      }
      if (buf != lbuf) free((void*)buf);
      if (ret != (size_t)-1) return (unsigned)ret;
      /* on any errors we return the UTF-8 as raw text...*/
#endif
    }
  }
  /* identity transform: */
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else {
    /* Buffer insufficient or buffer query */
  }
  return srclen;
}
/*! Convert a filename from the locale-specific multibyte encoding
  used by Windows to UTF-8 as used by FLTK.

  Up to \p dstlen bytes are written to \p dst, including a null
  terminator. The return value is the number of bytes that would be
  written, not counting the null terminator. If greater or equal to
  \p dstlen then if you malloc a new array of size n+1 you will have
  the space needed for the entire string. If \p dstlen is zero then
  nothing is written and this call just measures the storage space
  needed.

  When fl_utf8locale() returns true this does not change the data.
  The data is also passed through unchanged if the conversion fails
  or the temporary wide-character buffer cannot be allocated.
  You may also want to check if fl_utf8test() returns non-zero, so that
  the filesystem can store filenames in UTF-8 encoding regardless of
  the locale.

  \note On non-Windows builds the conversion uses mbstowcs(), which
  reads \p src up to its terminating null and does not look at
  \p srclen; \p src must therefore be null-terminated there.
*/
unsigned fl_utf8from_mb(char* dst, unsigned dstlen,
                        const char* src, unsigned srclen)
{
  if (!fl_utf8locale()) {
    wchar_t lbuf[1024];
    wchar_t* buf = lbuf;
    unsigned length = 0;   /* number of wide characters in buf */
    int have_wide = 1;
#ifdef WIN32
    length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
    if ((length == 0)&&(GetLastError()==ERROR_INSUFFICIENT_BUFFER)) {
      length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
      buf = (wchar_t*)malloc((size_t)length * sizeof(wchar_t));
      if (buf) {
        MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
      } else {
        buf = lbuf;
        have_wide = 0;     /* out of memory: fall back to the raw bytes */
      }
    }
#else
    size_t n = mbstowcs(buf, src, 1024);
    if (n != (size_t)-1 && n >= 1024) {
      /* Did not fit: measure the whole string, then convert on the heap.
         n stays the character count without the terminator, so the
         terminator is not converted and counted as data. */
      n = mbstowcs(0, src, 0);
      if (n != (size_t)-1) {
        buf = (wchar_t*)malloc((n + 1) * sizeof(wchar_t));
        if (buf) {
          mbstowcs(buf, src, n + 1);
        } else {
          buf = lbuf;
          n = (size_t)-1;  /* out of memory: fall back to the raw bytes */
        }
      }
    }
    if (n == (size_t)-1) have_wide = 0;
    else length = (unsigned)n;
#endif
    if (have_wide) {
      unsigned ret = fl_utf8fromwc(dst, dstlen, buf, length);
      if (buf != lbuf) free((void*)buf);
      return ret;
    }
    /* errors in conversion return the input unchanged */
  }
  /* identity transform: */
  if (srclen < dstlen) {
    memcpy(dst, src, srclen);
    dst[srclen] = 0;
  } else {
    /* Buffer insufficient or buffer query */
  }
  return srclen;
}
/*! Examines the first \p srclen bytes in \p src and returns a verdict
  on whether it is UTF-8 or not.
  - Returns 0 if there are any illegal UTF-8 sequences, using the
    same rules as fl_utf8decode(). Note that some UCS values considered
    illegal by RFC 3629, such as 0xffff, are considered legal by this.
  - Returns 1 if there are only single-byte characters (ie no bytes
    have the high bit set). This is legal UTF-8, but also indicates
    plain ASCII. It also returns 1 if \p srclen is zero.
  - Returns 2 if there are only characters less than 0x800.
  - Returns 3 if there are only characters less than 0x10000.
  - Returns 4 if there are characters in the 0x10000 to 0x10ffff range.

  Because there are many illegal sequences in UTF-8, it is almost
  impossible for a string in another encoding to be confused with
  UTF-8. This is very useful for transitioning Unix to UTF-8
  filenames, you can simply test each filename with this to decide
  if it is UTF-8 or in the locale encoding.
*/
int fl_utf8test(const char* src, unsigned srclen) {
  int widest = 1;   /* longest encoding seen so far, in bytes */
  const char* cur = src;
  const char* const stop = src + srclen;

  while (cur < stop) {
    int nbytes;
    if ((*cur & 0x80) == 0) {   /* plain ASCII byte */
      ++cur;
      continue;
    }
    fl_utf8decode(cur, stop, &nbytes);
    if (nbytes < 2) return 0;   /* a high-bit byte that is not valid UTF-8 */
    if (widest < nbytes) widest = nbytes;
    cur += nbytes;
  }
  return widest;
}
945 /* forward declare mk_wcwidth() as static so the name is not visible.
947 static int mk_wcwidth(unsigned int ucs);
949 /* include the c source directly so its contents are only visible here
951 #include "xutf8/mk_wcwidth.c"
/** Wrapper to adapt Markus Kuhn's implementation of wcwidth() for FLTK.
  \param [in] ucs Unicode character value
  \returns width of character in columns

  See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for Markus Kuhn's
  original implementation of wcwidth() and wcswidth()
  (defined in IEEE Std 1002.1-2001) for Unicode.

  \b WARNING: this function returns widths for "raw" Unicode characters.
  It does not even try to map C1 control characters (0x80 to 0x9F) to
  CP1252, and C0/C1 control characters and DEL will return -1.
  You are advised to use fl_width(const char* src) instead.
*/
int fl_wcwidth_(unsigned int ucs) {
  const int columns = mk_wcwidth(ucs);
  return columns;
}
/** Extended wrapper around the fl_wcwidth_(unsigned int ucs) function.
  \param[in] src pointer to start of UTF-8 byte sequence
  \returns width of character in columns

  The first character of \p src is decoded with fl_utf8decode(), limited
  to the length that fl_utf8len() reports for its first byte, and the
  width of the decoded value is returned.

  Depending on build options, this function may map C1 control
  characters (0x80 to 0x9f) to CP1252, and return the width of
  that character instead. This is not the same behaviour as
  fl_wcwidth_(unsigned int ucs) .

  Note that other control characters and DEL will still return -1,
  so if you want different behaviour, you need to test for those
  characters before calling fl_wcwidth(), and handle them separately.
*/
int fl_wcwidth(const char* src) {
  int consumed = 0;   /* filled in by the decoder, otherwise unused */
  const int seqlen = fl_utf8len(*src);
  const unsigned int ucs = fl_utf8decode(src, src+seqlen, &consumed);
  return fl_wcwidth_(ucs);
}
991 /** @} */
994 * End of "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $".