// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
import _ "unsafe" // For go:linkname.
// For gccgo, use go:linkname to export compiler-called functions.
//
//go:linkname decoderune
// Numbers fundamental to the encoding.
const (
	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	runeSelf  = 0x80         // characters below runeSelf are represented as themselves in a single byte.
	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
)

// Code points in the surrogate range are not valid for UTF-8.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range
)

// Bit patterns used by the encoder and decoder: tN is the tag carried by the
// lead byte of an N-byte sequence (tx tags a continuation byte), maskN
// selects the payload bits of that byte, and runeNMax is the largest code
// point that fits in an N-byte sequence.
const (
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	rune1Max = 1<<7 - 1  // 0x7F
	rune2Max = 1<<11 - 1 // 0x7FF
	rune3Max = 1<<16 - 1 // 0xFFFF

	// The default lowest and highest continuation byte.
	locb = 0x80 // 1000 0000
	hicb = 0xBF // 1011 1111
)
// countrunes returns the number of runes in s.
//
// The count comes from ranging over the string, so every byte that is not
// part of a valid UTF-8 sequence counts as one rune: the range loop yields
// U+FFFD for it and advances a single byte.
func countrunes(s string) int {
	n := 0
	for range s {
		n++
	}
	return n
}
57 // decoderune returns the non-ASCII rune at the start of
58 // s[k:] and the index after the rune in s.
60 // decoderune assumes that caller has checked that
61 // the to be decoded rune is a non-ASCII rune.
63 // If the string appears to be incomplete or decoding problems
64 // are encountered (runeerror, k + 1) is returned to ensure
65 // progress when decoderune is used to iterate over a string.
66 func decoderune(s
string, k
int) (r rune
, pos
int) {
70 return runeError
, k
+ 1
76 case t2
<= s
[0] && s
[0] < t3
:
77 // 0080-07FF two byte sequence
78 if len(s
) > 1 && (locb
<= s
[1] && s
[1] <= hicb
) {
79 r
= rune(s
[0]&mask2
)<<6 |
rune(s
[1]&maskx
)
85 case t3
<= s
[0] && s
[0] < t4
:
86 // 0800-FFFF three byte sequence
87 if len(s
) > 2 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) {
88 r
= rune(s
[0]&mask3
)<<12 |
rune(s
[1]&maskx
)<<6 |
rune(s
[2]&maskx
)
90 if rune2Max
< r
&& !(surrogateMin
<= r
&& r
<= surrogateMax
) {
94 case t4
<= s
[0] && s
[0] < t5
:
95 // 10000-1FFFFF four byte sequence
96 if len(s
) > 3 && (locb
<= s
[1] && s
[1] <= hicb
) && (locb
<= s
[2] && s
[2] <= hicb
) && (locb
<= s
[3] && s
[3] <= hicb
) {
97 r
= rune(s
[0]&mask4
)<<18 |
rune(s
[1]&maskx
)<<12 |
rune(s
[2]&maskx
)<<6 |
rune(s
[3]&maskx
)
99 if rune3Max
< r
&& r
<= maxRune
{
105 return runeError
, k
+ 1
108 // encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune.
109 // It returns the number of bytes written.
110 func encoderune(p
[]byte, r rune
) int {
111 // Negative values are erroneous. Making it unsigned addresses the problem.
112 switch i
:= uint32(r
); {
117 _
= p
[1] // eliminate bounds checks
118 p
[0] = t2 |
byte(r
>>6)
119 p
[1] = tx |
byte(r
)&maskx
121 case i
> maxRune
, surrogateMin
<= i
&& i
<= surrogateMax
:
125 _
= p
[2] // eliminate bounds checks
126 p
[0] = t3 |
byte(r
>>12)
127 p
[1] = tx |
byte(r
>>6)&maskx
128 p
[2] = tx |
byte(r
)&maskx
131 _
= p
[3] // eliminate bounds checks
132 p
[0] = t4 |
byte(r
>>18)
133 p
[1] = tx |
byte(r
>>12)&maskx
134 p
[2] = tx |
byte(r
>>6)&maskx
135 p
[3] = tx |
byte(r
)&maskx