Corrections to SVN properties.
[AROS.git] / workbench / libs / codesets / src / convertUTF.c
blob217deb0cbc50720eebb99b044346b90fbf2cc2d4
1 /*
2 * Copyright 2001-2004 Unicode, Inc.
4 * Disclaimer
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
14 * Limitations on Rights to Redistribute This Code
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
23 /* ---------------------------------------------------------------------
25 Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26 Author: Mark E. Davis, 1994.
27 Rev History: Rick McGowan, fixes & updates May 2001.
28 Sept 2001: fixed const & error conditions per
29 mods suggested by S. Parent & A. Lillich.
30 June 2002: Tim Dodd added detection and handling of incomplete
31 source sequences, enhanced error detection, added casts
32 to eliminate compiler warnings.
33 July 2003: slight mods to back out aggressive FFFE detection.
34 Jan 2004: updated switches in from-UTF8 conversions.
35 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
37 See the header file "ConvertUTF.h" for complete documentation.
39 ------------------------------------------------------------------------ */
41 #include "lib.h"
42 #include "convertUTF.h"
44 #include "SDI_lib.h"
46 #include "debug.h"
48 /***********************************************************************/
50 static const int halfShift = 10; /* used for shifting by 10 bits */
52 static const UTF32 halfBase = 0x0010000UL;
53 static const UTF32 halfMask = 0x3FFUL;
55 #define UNI_SUR_HIGH_START (UTF32)0xD800
56 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
57 #define UNI_SUR_LOW_START (UTF32)0xDC00
58 #define UNI_SUR_LOW_END (UTF32)0xDFFF
60 /***********************************************************************/
62 ULONG LIBFUNC
63 CodesetsConvertUTF32toUTF16(REG(a0, const UTF32 ** sourceStart),
64 REG(a1, const UTF32 * sourceEnd),
65 REG(a2, UTF16 ** targetStart),
66 REG(a3, UTF16 * targetEnd),
67 REG(d0, ULONG flags))
69 ULONG result = CSR_ConversionOK;
70 const UTF32 *source = *sourceStart;
71 UTF16 *target = *targetStart;
73 ENTER();
75 while(source < sourceEnd)
77 UTF32 ch;
79 if(target >= targetEnd)
81 result = CSR_TargetExhausted;
82 break;
85 ch = *source++;
86 if(ch <= UNI_MAX_BMP)
88 /* Target is a character <= 0xFFFF */
89 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
90 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
92 if(flags == CSF_StrictConversion)
94 --source; /* return to the illegal value itself */
95 result = CSR_SourceIllegal;
96 break;
98 else
100 *target++ = UNI_REPLACEMENT_CHAR;
103 else
105 *target++ = (UTF16)ch; /* normal case */
108 else if(ch > UNI_MAX_LEGAL_UTF32)
110 if(flags == CSF_StrictConversion)
112 result = CSR_SourceIllegal;
114 else
116 *target++ = UNI_REPLACEMENT_CHAR;
119 else
121 /* target is a character in range 0xFFFF - 0x10FFFF. */
122 if(target + 1 >= targetEnd)
124 --source; /* Back up source pointer! */
125 result = CSR_TargetExhausted;
126 break;
128 ch -= halfBase;
129 *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
130 *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
134 *sourceStart = source;
135 *targetStart = target;
137 RETURN(result);
138 return result;
141 /***********************************************************************/
143 ULONG LIBFUNC
144 CodesetsConvertUTF16toUTF32(REG(a0, const UTF16 ** sourceStart),
145 REG(a1, const UTF16 * sourceEnd),
146 REG(a2, UTF32 ** targetStart),
147 REG(a3, UTF32 * targetEnd),
148 REG(d0, ULONG flags))
150 ULONG result = CSR_ConversionOK;
151 const UTF16 *source = *sourceStart;
152 UTF32 *target = *targetStart;
153 UTF32 ch=0, ch2=0;
155 ENTER();
157 while(source < sourceEnd)
159 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
161 ch = *source++;
162 /* If we have a surrogate pair, convert to UTF32 first. */
163 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
165 /* If the 16 bits following the high surrogate are in the source buffer... */
166 if(source < sourceEnd)
168 ch2 = *source;
170 /* If it's a low surrogate, convert to UTF32. */
171 if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
173 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
174 + (ch2 - UNI_SUR_LOW_START) + halfBase;
176 ++source;
178 else if(flags == CSF_StrictConversion)
180 /* it's an unpaired high surrogate */
181 --source; /* return to the illegal value itself */
182 result = CSR_SourceIllegal;
184 break;
187 else
189 /* We don't have the 16 bits following the high surrogate. */
190 --source; /* return to the high surrogate */
191 result = CSR_SourceExhausted;
193 break;
196 else if (flags == CSF_StrictConversion)
198 /* UTF-16 surrogate values are illegal in UTF-32 */
199 if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
201 --source; /* return to the illegal value itself */
202 result = CSR_SourceIllegal;
204 break;
208 if(target >= targetEnd)
210 source = oldSource; /* Back up source pointer! */
211 result = CSR_TargetExhausted;
213 break;
215 *target++ = ch;
218 *sourceStart = source;
219 *targetStart = target;
221 #if defined(DEBUG)
222 if(result == CSR_SourceIllegal)
224 E(DBF_UTF, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x", ch, ch2);
226 #endif
228 RETURN(result);
229 return result;
232 /***********************************************************************/
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion,
 * which was allowed in earlier algorithms.
 */
const char trailingBytesForUTF8[256] = {
  /* 0x00 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xA0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xC0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xE0 - 0xFF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
253 * Magic values subtracted from a buffer value during UTF8 conversion.
254 * This table contains as many values as there might be trailing bytes
255 * in a UTF-8 sequence.
257 static const UTF32 offsetsFromUTF8[6] = {
258 0x00000000UL, 0x00003080UL, 0x000E2080UL,
259 0x03C82080UL, 0xFA082080UL, 0x82082080UL
263 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
264 * into the first byte, depending on how many bytes follow. There are
265 * as many entries in this table as there are UTF-8 sequence types.
266 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
267 * for *legal* UTF-8 will be 4 or fewer bytes total.
269 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
271 /***********************************************************************/
/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "CodesetsIsLegalUTF8" call can be
 * turned into an inline function.
 */
281 /***********************************************************************/
283 ULONG LIBFUNC
284 CodesetsConvertUTF16toUTF8(REG(a0, const UTF16 ** sourceStart),
285 REG(a1, const UTF16 * sourceEnd),
286 REG(a2, UTF8 ** targetStart),
287 REG(a3, UTF8 * targetEnd),
288 REG(d0, ULONG flags))
290 ULONG result = CSR_ConversionOK;
291 const UTF16 *source = *sourceStart;
292 UTF8 *target = *targetStart;
293 UTF8 *start = target;
295 ENTER();
297 while(source < sourceEnd)
299 UTF32 ch;
300 unsigned short bytesToWrite = 0;
301 const UTF32 byteMask = 0xBF;
302 const UTF32 byteMark = 0x80;
303 const UTF16 *oldSource = source; /* In case we have to back up because of target overflow. */
305 ch = *source++;
307 /* If we have a surrogate pair, convert to UTF32 first. */
308 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
310 /* If the 16 bits following the high surrogate are in the source buffer... */
311 if(source < sourceEnd)
313 UTF32 ch2 = *source;
315 /* If it's a low surrogate, convert to UTF32. */
316 if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
318 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
319 + (ch2 - UNI_SUR_LOW_START) + halfBase;
321 ++source;
323 else if(flags == CSF_StrictConversion)
325 /* it's an unpaired high surrogate */
326 --source; /* return to the illegal value itself */
327 result = CSR_SourceIllegal;
328 break;
331 else
333 /* We don't have the 16 bits following the high surrogate. */
334 --source; /* return to the high surrogate */
335 result = CSR_SourceExhausted;
337 break;
340 else if(flags == CSF_StrictConversion)
342 /* UTF-16 surrogate values are illegal in UTF-32 */
343 if(ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
345 --source; /* return to the illegal value itself */
346 result = CSR_SourceIllegal;
348 break;
351 /* Figure out how many bytes the result will require */
352 if(ch < (UTF32) 0x80)
354 bytesToWrite = 1;
356 else if (ch < (UTF32) 0x800)
358 bytesToWrite = 2;
360 else if (ch < (UTF32) 0x10000)
362 bytesToWrite = 3;
364 else if (ch < (UTF32) 0x110000)
366 bytesToWrite = 4;
368 else
370 bytesToWrite = 3;
371 ch = UNI_REPLACEMENT_CHAR;
374 target += bytesToWrite;
375 if(start)
377 if(target > targetEnd)
379 source = oldSource; /* Back up source pointer! */
380 target -= bytesToWrite;
381 result = CSR_TargetExhausted;
383 break;
385 switch(bytesToWrite)
387 /* note: everything falls through. */
388 case 4:
389 *--target = (UTF8) ((ch | byteMark) & byteMask);
390 ch >>= 6;
392 case 3:
393 *--target = (UTF8) ((ch | byteMark) & byteMask);
394 ch >>= 6;
396 case 2:
397 *--target = (UTF8) ((ch | byteMark) & byteMask);
398 ch >>= 6;
400 case 1:
401 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
404 target += bytesToWrite;
408 *sourceStart = source;
409 *targetStart = target;
411 RETURN(result);
412 return result;
415 /***********************************************************************/
418 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
419 * This must be called with the length pre-determined by the first byte.
420 * If not calling this from ConvertUTF8to*, then the length can be set by:
421 * length = trailingBytesForUTF8[*source]+1;
422 * and the sequence is illegal right away if there aren't that many bytes
423 * available.
424 * If presented with a length > 4, this returns FALSE. The Unicode
425 * definition of UTF-8 goes up to 4-byte sequences.
428 BOOL LIBFUNC
429 CodesetsIsLegalUTF8(REG(a0, const UTF8 * source),
430 REG(d0, ULONG length))
432 UTF8 a;
433 const UTF8 *srcptr = source + length;
435 ENTER();
437 switch(length)
439 default:
440 RETURN(FALSE);
441 return FALSE;
443 /* Everything else falls through when "TRUE"... */
444 case 4:
445 if((a = (*--srcptr)) < 0x80 || a > 0xBF)
447 RETURN(FALSE);
448 return FALSE;
451 case 3:
452 if((a = (*--srcptr)) < 0x80 || a > 0xBF)
454 RETURN(FALSE);
455 return FALSE;
458 case 2:
459 if((a = (*--srcptr)) > 0xBF)
461 RETURN(FALSE);
462 return FALSE;
465 switch (*source)
467 /* no fall-through in this inner switch */
468 case 0xE0:
469 if(a < 0xA0)
471 RETURN(FALSE);
472 return FALSE;
474 break;
476 case 0xED:
477 if(a > 0x9F)
479 RETURN(FALSE);
480 return FALSE;
482 break;
484 case 0xF0:
485 if(a < 0x90)
487 RETURN(FALSE);
488 return FALSE;
490 break;
492 case 0xF4:
493 if(a > 0x8F)
495 RETURN(FALSE);
496 return FALSE;
498 break;
500 default:
501 if(a < 0x80)
503 RETURN(FALSE);
504 return FALSE;
508 case 1:
509 if(*source >= 0x80 && *source < 0xC2)
511 RETURN(FALSE);
512 return FALSE;
516 if(*source > 0xF4)
518 RETURN(FALSE);
519 return FALSE;
522 RETURN(TRUE);
523 return TRUE;
526 /***********************************************************************/
529 * Exported function to return whether a UTF-8 sequence is legal or not.
530 * This is not used here; it's just exported.
533 BOOL LIBFUNC
534 CodesetsIsLegalUTF8Sequence(REG(a0, const UTF8 * source),
535 REG(a1, const UTF8 * sourceEnd))
537 int length = trailingBytesForUTF8[*source] + 1;
538 BOOL res = FALSE;
540 ENTER();
542 if(source + length > sourceEnd)
544 RETURN(FALSE);
545 return FALSE;
548 res = CodesetsIsLegalUTF8(source, length);
550 RETURN(res);
551 return res;
554 /***********************************************************************/
556 ULONG LIBFUNC
557 CodesetsConvertUTF8toUTF16(REG(a0, const UTF8 ** sourceStart),
558 REG(a1, const UTF8 * sourceEnd),
559 REG(a2, UTF16 ** targetStart),
560 REG(a3, UTF16 * targetEnd),
561 REG(d0, ULONG flags))
563 ULONG result = CSR_ConversionOK;
564 const UTF8 *source = *sourceStart;
565 UTF16 *target = *targetStart;
566 UTF16 *start = target;
568 ENTER();
570 while(source < sourceEnd)
572 UTF32 ch = 0;
573 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
575 if(source + extraBytesToRead >= sourceEnd)
577 result = CSR_SourceExhausted;
578 break;
581 /* Do this check whether lenient or strict */
582 if(!CodesetsIsLegalUTF8 (source, extraBytesToRead + 1))
584 result = CSR_SourceIllegal;
585 break;
589 * The cases all fall through. See "Note A" below.
591 switch (extraBytesToRead)
593 case 5:
594 ch += *source++;
595 ch <<= 6; /* remember, illegal UTF-8 */
597 case 4:
598 ch += *source++;
599 ch <<= 6; /* remember, illegal UTF-8 */
601 case 3:
602 ch += *source++;
603 ch <<= 6;
605 case 2:
606 ch += *source++;
607 ch <<= 6;
609 case 1:
610 ch += *source++;
611 ch <<= 6;
613 case 0:
614 ch += *source++;
617 ch -= offsetsFromUTF8[extraBytesToRead];
619 if(start && (target >= targetEnd))
621 source -= (extraBytesToRead + 1); /* Back up source pointer! */
622 result = CSR_TargetExhausted;
624 break;
627 if(ch <= UNI_MAX_BMP)
629 /* Target is a character <= 0xFFFF */
630 /* UTF-16 surrogate values are illegal in UTF-32 */
631 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
633 if(flags == CSF_StrictConversion)
635 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
636 result = CSR_SourceIllegal;
638 break;
640 else
641 ch = UNI_REPLACEMENT_CHAR;
643 if(start)
644 *target = (UTF16) ch; /* normal case */
645 target++;
647 else if(ch > UNI_MAX_UTF16)
649 if(flags == CSF_StrictConversion)
651 result = CSR_SourceIllegal;
652 source -= (extraBytesToRead + 1); /* return to the start */
654 break; /* Bail out; shouldn't continue */
656 if(start)
657 *target = UNI_REPLACEMENT_CHAR;
658 target++;
660 else
662 /* target is a character in range 0xFFFF - 0x10FFFF. */
663 if(start)
665 if(target + 1 >= targetEnd)
667 source -= (extraBytesToRead + 1); /* Back up source pointer! */
668 result = CSR_TargetExhausted;
670 break;
673 ch -= halfBase;
674 target[0] = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
675 target[1] = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
677 target += 2;
681 *sourceStart = source;
682 *targetStart = target;
684 RETURN(result);
685 return result;
688 /***********************************************************************/
690 ULONG LIBFUNC
691 CodesetsConvertUTF32toUTF8(REG(a0, const UTF32 ** sourceStart),
692 REG(a1, const UTF32 * sourceEnd),
693 REG(a2, UTF8 ** targetStart),
694 REG(a3, UTF8 * targetEnd),
695 REG(d0, ULONG flags))
697 ULONG result = CSR_ConversionOK;
698 const UTF32 *source = *sourceStart;
699 UTF8 *target = *targetStart;
700 UTF8 *start = target;
702 ENTER();
704 while(source < sourceEnd)
706 UTF32 ch;
707 unsigned short bytesToWrite = 0;
708 const UTF32 byteMask = 0xBF;
709 const UTF32 byteMark = 0x80;
711 ch = *source++;
713 if(flags == CSF_StrictConversion)
715 /* UTF-16 surrogate values are illegal in UTF-32 */
716 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
718 --source; /* return to the illegal value itself */
719 result = CSR_SourceIllegal;
721 break;
726 * Figure out how many bytes the result will require. Turn any
727 * illegally large UTF32 things (> Plane 17) into replacement chars.
729 if(ch < (UTF32) 0x80)
731 bytesToWrite = 1;
733 else if(ch < (UTF32) 0x800)
735 bytesToWrite = 2;
737 else if(ch < (UTF32) 0x10000)
739 bytesToWrite = 3;
741 else if(ch <= UNI_MAX_LEGAL_UTF32)
743 bytesToWrite = 4;
745 else
747 bytesToWrite = 3;
748 ch = UNI_REPLACEMENT_CHAR;
749 result = CSR_SourceIllegal;
752 target += bytesToWrite;
753 if(start)
755 if(target > targetEnd)
757 --source; /* Back up source pointer! */
758 target -= bytesToWrite;
759 result = CSR_TargetExhausted;
761 break;
763 switch(bytesToWrite)
765 /* note: everything falls through. */
766 case 4:
767 *--target = (UTF8) ((ch | byteMark) & byteMask);
768 ch >>= 6;
770 case 3:
771 *--target = (UTF8) ((ch | byteMark) & byteMask);
772 ch >>= 6;
774 case 2:
775 *--target = (UTF8) ((ch | byteMark) & byteMask);
776 ch >>= 6;
778 case 1:
779 *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
782 target += bytesToWrite;
786 *sourceStart = source;
787 *targetStart = target;
789 RETURN(result);
790 return result;
793 /***********************************************************************/
795 ULONG LIBFUNC
796 CodesetsConvertUTF8toUTF32(REG(a0, const UTF8 ** sourceStart),
797 REG(a1, const UTF8 * sourceEnd),
798 REG(a2, UTF32 ** targetStart),
799 REG(a3, UTF32 * targetEnd),
800 REG(d0, ULONG flags))
802 ULONG result = CSR_ConversionOK;
803 const UTF8 *source = *sourceStart;
804 UTF32 *target = *targetStart;
805 UTF32 *start = target;
807 ENTER();
809 while(source < sourceEnd)
811 UTF32 ch = 0;
812 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
814 if(source + extraBytesToRead >= sourceEnd)
816 result = CSR_SourceExhausted;
817 break;
820 /* Do this check whether lenient or strict */
821 if(!CodesetsIsLegalUTF8(source, extraBytesToRead + 1))
823 result = CSR_SourceIllegal;
824 break;
828 * The cases all fall through. See "Note A" below.
830 switch (extraBytesToRead)
832 case 5:
833 ch += *source++;
834 ch <<= 6;
836 case 4:
837 ch += *source++;
838 ch <<= 6;
840 case 3:
841 ch += *source++;
842 ch <<= 6;
844 case 2:
845 ch += *source++;
846 ch <<= 6;
848 case 1:
849 ch += *source++;
850 ch <<= 6;
852 case 0:
853 ch += *source++;
856 ch -= offsetsFromUTF8[extraBytesToRead];
858 if(start)
860 if(target >= targetEnd)
862 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
863 result = CSR_TargetExhausted;
865 break;
868 if(ch <= UNI_MAX_LEGAL_UTF32)
871 * UTF-16 surrogate values are illegal in UTF-32, and anything
872 * over Plane 17 (> 0x10FFFF) is illegal.
874 if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
876 if(flags == CSF_StrictConversion)
878 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
879 result = CSR_SourceIllegal;
881 break;
883 else
885 *target++ = UNI_REPLACEMENT_CHAR;
888 else
890 *target++ = ch;
893 else
895 /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
896 result = CSR_SourceIllegal;
897 *target++ = UNI_REPLACEMENT_CHAR;
900 else
901 target++;
904 *sourceStart = source;
905 *targetStart = target;
907 RETURN(result);
908 return result;
/***********************************************************************

  Note A.
  The fall-through switches in UTF-8 reading code save a
  temp variable, some decrements & conditionals. The switches
  are equivalent to the following loop:

    int tmpBytesToRead = extraBytesToRead+1;
    do {
      ch += *source++;
      --tmpBytesToRead;
      if (tmpBytesToRead) ch <<= 6;
    } while (tmpBytesToRead > 0);

  In UTF-8 writing code, the switches on "bytesToWrite" are
  similarly unrolled loops.

 ***********************************************************************/