1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 import _
"unsafe" // For go:linkname.
9 // For gccgo, use go:linkname to rename compiler-called functions to
10 // themselves, so that the compiler will export them.
12 //go:linkname decoderune runtime.decoderune
14 // Numbers fundamental to the encoding.
16 runeError
= '\uFFFD' // the "error" Rune or "Unicode replacement character"
17 runeSelf
= 0x80 // characters below Runeself are represented as themselves in a single byte.
18 maxRune
= '\U0010FFFF' // Maximum valid Unicode code point.
21 // Code points in the surrogate range are not valid for UTF-8.
28 t1
= 0x00 // 0000 0000
29 tx
= 0x80 // 1000 0000
30 t2
= 0xC0 // 1100 0000
31 t3
= 0xE0 // 1110 0000
32 t4
= 0xF0 // 1111 0000
33 t5
= 0xF8 // 1111 1000
35 maskx
= 0x3F // 0011 1111
36 mask2
= 0x1F // 0001 1111
37 mask3
= 0x0F // 0000 1111
38 mask4
= 0x07 // 0000 0111
44 // The default lowest and highest continuation byte.
45 locb
= 0x80 // 1000 0000
46 hicb
= 0xBF // 1011 1111
49 // decoderune returns the non-ASCII rune at the start of
50 // s[k:] and the index after the rune in s.
52 // decoderune assumes that the caller has already checked that
53 // the rune to be decoded is a non-ASCII rune.
55 // If the string appears to be incomplete or decoding problems
56 // are encountered, (runeError, k + 1) is returned to ensure
57 // progress when decoderune is used to iterate over a string.
58 func decoderune(s
string, k
int) (r rune
, pos
int) {
62 return runeError
, k
+ 1
68 case t2
<= s
[0] && s
[0] < t3
:
69 // 0080-07FF two byte sequence
70 if len(s
) > 1 && (locb
<= s
[1] && s
[1] <= hicb
) {
71 r
= rune(s
[0]&mask2
)<<6 |
rune(s
[1]&maskx
)
77 case t3
<= s
[0] && s
[0] < t4
:
78 // 0800-FFFF three byte sequence
79 if len(s
) > 2 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) {
80 r
= rune(s
[0]&mask3
)<<12 |
rune(s
[1]&maskx
)<<6 |
rune(s
[2]&maskx
)
82 if rune2Max
< r
&& !(surrogateMin
<= r
&& r
<= surrogateMax
) {
86 case t4
<= s
[0] && s
[0] < t5
:
87 // 10000-1FFFFF four byte sequence
88 if len(s
) > 3 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) && (locb
<= s
[3] && s
[3] <= hicb
) {
89 r
= rune(s
[0]&mask4
)<<18 |
rune(s
[1]&maskx
)<<12 |
rune(s
[2]&maskx
)<<6 |
rune(s
[3]&maskx
)
91 if rune3Max
< r
&& r
<= maxRune
{
97 return runeError
, k
+ 1
100 // encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune.
101 // It returns the number of bytes written.
102 func encoderune(p
[]byte, r rune
) int {
103 // Negative values are erroneous. Making it unsigned addresses the problem.
104 switch i
:= uint32(r
); {
109 _
= p
[1] // eliminate bounds checks
110 p
[0] = t2 |
byte(r
>>6)
111 p
[1] = tx |
byte(r
)&maskx
113 case i
> maxRune
, surrogateMin
<= i
&& i
<= surrogateMax
:
117 _
= p
[2] // eliminate bounds checks
118 p
[0] = t3 |
byte(r
>>12)
119 p
[1] = tx |
byte(r
>>6)&maskx
120 p
[2] = tx |
byte(r
)&maskx
123 _
= p
[3] // eliminate bounds checks
124 p
[0] = t4 |
byte(r
>>18)
125 p
[1] = tx |
byte(r
>>12)&maskx
126 p
[2] = tx |
byte(r
>>6)&maskx
127 p
[3] = tx |
byte(r
)&maskx