/*
 * The authors of this software are Rob Pike and Ken Thompson.
 * Copyright (c) 2002 by Lucent Technologies.
 * Portions Copyright 2009 The Go Authors. All rights reserved.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
/*
 * This code is copied, with slight editing due to type differences,
 * from a subset of ../lib9/utf/rune.c [which no longer exists]
 */
31 t1
= ((1 << (bit1
+ 1)) - 1) ^ 0xFF /* 0000 0000 */
32 tx
= ((1 << (bitx
+ 1)) - 1) ^ 0xFF /* 1000 0000 */
33 t2
= ((1 << (bit2
+ 1)) - 1) ^ 0xFF /* 1100 0000 */
34 t3
= ((1 << (bit3
+ 1)) - 1) ^ 0xFF /* 1110 0000 */
35 t4
= ((1 << (bit4
+ 1)) - 1) ^ 0xFF /* 1111 0000 */
36 t5
= ((1 << (bit5
+ 1)) - 1) ^ 0xFF /* 1111 1000 */
38 rune1
= (1 << (bit1
+ 0*bitx
)) - 1 /* 0000 0000 0111 1111 */
39 rune2
= (1 << (bit2
+ 1*bitx
)) - 1 /* 0000 0111 1111 1111 */
40 rune3
= (1 << (bit3
+ 2*bitx
)) - 1 /* 1111 1111 1111 1111 */
41 rune4
= (1 << (bit4
+ 3*bitx
)) - 1 /* 0001 1111 1111 1111 1111 1111 */
43 maskx
= (1 << bitx
) - 1 /* 0011 1111 */
44 testx
= maskx
^ 0xFF /* 1100 0000 */
54 runemax
= 0x10FFFF /* maximum rune value */
/*
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
 * This is a slower but "safe" version of the old chartorune
 * that works on strings that are not necessarily null-terminated.
 *
 * If you know for sure that your string is null-terminated,
 * chartorune will be a bit faster.
 *
 * It is guaranteed never to read more than "length" bytes
 * starting at the incoming pointer. This is to avoid
 * possible access violations. If the string appears to be
 * well-formed but incomplete (i.e., to get the whole Rune
 * we'd need to read past str+length) then we'll set the Rune
 * to Bad and return 0.
 *
 * Note that if we have decoding problems for other
 * reasons, we return 1 instead of 0.
 *
 * (The wording above describes the C original, which took a
 * pointer and a length; this version takes the string s.)
 */
75 func charntorune(s
string) (rune
, int) {
76 /* When we're not allowed to read anything */
82 * one character sequence (7-bit value)
90 // If we can't read more than one character we must stop
96 * two character sequence (11-bit value)
100 if (c1
& testx
) != 0 {
107 l
:= ((rune(c
) << bitx
) |
rune(c1
)) & rune2
114 // If we can't read more than two characters we must stop
120 * three character sequence (16-bit value)
121 * 0800-FFFF => t3 tx tx
124 if (c2
& testx
) != 0 {
128 l
:= ((((rune(c
) << bitx
) |
rune(c1
)) << bitx
) |
rune(c2
)) & rune3
132 if surrogateMin
<= l
&& l
<= surrogateMax
{
143 * four character sequence (21-bit value)
144 * 10000-1FFFFF => t4 tx tx tx
147 if (c3
& testx
) != 0 {
151 l
:= ((((((rune(c
) << bitx
) |
rune(c1
)) << bitx
) |
rune(c2
)) << bitx
) |
rune(c3
)) & rune4
152 if l
<= rune3 || l
> runemax
{
158 // Support for 5-byte or longer UTF-8 would go here, but
159 // since we don't have that, we'll just return bad.
// runetochar converts r to bytes and writes the result to str.
// It returns the number of bytes generated.
165 func runetochar(str
[]byte, r rune
) int {
166 /* runes are signed, so convert to unsigned for range check. */
169 * one character sequence
170 * 00000-0007F => 00-7F
177 * two character sequence
181 str
[0] = byte(t2 |
(c
>> (1 * bitx
)))
182 str
[1] = byte(tx |
(c
& maskx
))
187 * If the rune is out of range or a surrogate half, convert it to the error rune.
188 * Do this test here because the error rune encodes to three bytes.
189 * Doing it earlier would duplicate work, since an out of range
190 * rune wouldn't have fit in one or two bytes.
195 if surrogateMin
<= c
&& c
<= surrogateMax
{
200 * three character sequence
201 * 0800-FFFF => t3 tx tx
204 str
[0] = byte(t3 |
(c
>> (2 * bitx
)))
205 str
[1] = byte(tx |
((c
>> (1 * bitx
)) & maskx
))
206 str
[2] = byte(tx |
(c
& maskx
))
211 * four character sequence (21-bit value)
212 * 10000-1FFFFF => t4 tx tx tx
214 str
[0] = byte(t4 |
(c
>> (3 * bitx
)))
215 str
[1] = byte(tx |
((c
>> (2 * bitx
)) & maskx
))
216 str
[2] = byte(tx |
((c
>> (1 * bitx
)) & maskx
))
217 str
[3] = byte(tx |
(c
& maskx
))