1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 import _
"unsafe" // For go:linkname.
9 // For gccgo, use go:linkname to rename compiler-called functions to
10 // themselves, so that the compiler will export them.
12 //go:linkname decoderune runtime.decoderune
// Numbers fundamental to the encoding.
const (
	runeError = '\uFFFD'     // the "error" rune, also known as the Unicode replacement character
	runeSelf  = 0x80         // characters below runeSelf are represented as themselves in a single byte
	maxRune   = '\U0010FFFF' // maximum valid Unicode code point
)

// Code points in the surrogate range are not valid for UTF-8.
// decoderune and encoderune use these bounds to reject surrogates.
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)

const (
	// Tag bits. A lead byte b starts a two-, three- or four-byte sequence
	// when t2 <= b < t3, t3 <= b < t4 or t4 <= b < t5 respectively; tx is
	// the tag OR-ed into every continuation byte when encoding.
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	// Payload masks. maskx selects the data bits of a continuation byte;
	// mask2, mask3 and mask4 select the data bits of the lead byte of a
	// two-, three- and four-byte sequence.
	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	// Largest code points that fit in two bytes (5+6 payload bits) and in
	// three bytes (4+6+6 payload bits). decoderune compares against them
	// to reject overlong three- and four-byte encodings.
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1

	// The default lowest and highest continuation byte.
	locb = 0x80 // 1000 0000
	hicb = 0xBF // 1011 1111
)
// countrunes returns the number of runes in s.
//
// Counting uses the language's range-over-string iteration, so every byte
// that does not start a valid UTF-8 sequence is counted as one rune.
func countrunes(s string) int {
	n := 0
	for range s {
		n++
	}
	return n
}
58 // decoderune returns the non-ASCII rune at the start of
59 // s[k:] and the index after the rune in s.
61 // decoderune assumes that caller has checked that
62 // the to be decoded rune is a non-ASCII rune.
64 // If the string appears to be incomplete or decoding problems
65 // are encountered (runeerror, k + 1) is returned to ensure
66 // progress when decoderune is used to iterate over a string.
67 func decoderune(s
string, k
int) (r rune
, pos
int) {
71 return runeError
, k
+ 1
77 case t2
<= s
[0] && s
[0] < t3
:
78 // 0080-07FF two byte sequence
79 if len(s
) > 1 && (locb
<= s
[1] && s
[1] <= hicb
) {
80 r
= rune(s
[0]&mask2
)<<6 |
rune(s
[1]&maskx
)
86 case t3
<= s
[0] && s
[0] < t4
:
87 // 0800-FFFF three byte sequence
88 if len(s
) > 2 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) {
89 r
= rune(s
[0]&mask3
)<<12 |
rune(s
[1]&maskx
)<<6 |
rune(s
[2]&maskx
)
91 if rune2Max
< r
&& !(surrogateMin
<= r
&& r
<= surrogateMax
) {
95 case t4
<= s
[0] && s
[0] < t5
:
96 // 10000-1FFFFF four byte sequence
97 if len(s
) > 3 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) && (locb
<= s
[3] && s
[3] <= hicb
) {
98 r
= rune(s
[0]&mask4
)<<18 |
rune(s
[1]&maskx
)<<12 |
rune(s
[2]&maskx
)<<6 |
rune(s
[3]&maskx
)
100 if rune3Max
< r
&& r
<= maxRune
{
106 return runeError
, k
+ 1
109 // encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune.
110 // It returns the number of bytes written.
111 func encoderune(p
[]byte, r rune
) int {
112 // Negative values are erroneous. Making it unsigned addresses the problem.
113 switch i
:= uint32(r
); {
118 _
= p
[1] // eliminate bounds checks
119 p
[0] = t2 |
byte(r
>>6)
120 p
[1] = tx |
byte(r
)&maskx
122 case i
> maxRune
, surrogateMin
<= i
&& i
<= surrogateMax
:
126 _
= p
[2] // eliminate bounds checks
127 p
[0] = t3 |
byte(r
>>12)
128 p
[1] = tx |
byte(r
>>6)&maskx
129 p
[2] = tx |
byte(r
)&maskx
132 _
= p
[3] // eliminate bounds checks
133 p
[0] = t4 |
byte(r
>>18)
134 p
[1] = tx |
byte(r
>>12)&maskx
135 p
[2] = tx |
byte(r
>>6)&maskx
136 p
[3] = tx |
byte(r
)&maskx