1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Functions and constants to support text encoded in UTF-8.
6 // This package calls a Unicode character a rune for brevity.
9 import "unicode" // only needed for a couple of constants
// Numbers fundamental to the encoding.
const (
	RuneError = unicode.ReplacementChar // the "error" rune or "replacement character".
	RuneSelf  = 0x80                    // characters below RuneSelf are represented as themselves in a single byte.
	UTFMax    = 4                       // maximum number of bytes of a UTF-8 encoded Unicode character.
)

// Bit patterns used by the encoder and decoder. The _T* values are OR'ed
// into encoded bytes as their fixed high bits; the _Mask* values are AND'ed
// with a byte to keep only its payload bits. Each comment shows the value
// in binary.
const (
	_T1 = 0x00 // 0000 0000
	_Tx = 0x80 // 1000 0000
	_T2 = 0xC0 // 1100 0000
	_T3 = 0xE0 // 1110 0000
	_T4 = 0xF0 // 1111 0000
	_T5 = 0xF8 // 1111 1000

	_Maskx = 0x3F // 0011 1111
	_Mask2 = 0x1F // 0001 1111
	_Mask3 = 0x0F // 0000 1111
	_Mask4 = 0x07 // 0000 0111
)
37 func decodeRuneInternal(p
[]byte) (rune
, size
int, short
bool) {
40 return RuneError
, 0, true
44 // 1-byte, 7-bit sequence?
46 return int(c0
), 1, false
49 // unexpected continuation byte?
51 return RuneError
, 1, false
54 // need first continuation byte
56 return RuneError
, 1, true
59 if c1
< _Tx || _T2
<= c1
{
60 return RuneError
, 1, false
63 // 2-byte, 11-bit sequence?
65 rune
= int(c0
&_Mask2
)<<6 |
int(c1
&_Maskx
)
66 if rune
<= _Rune1Max
{
67 return RuneError
, 1, false
72 // need second continuation byte
74 return RuneError
, 1, true
77 if c2
< _Tx || _T2
<= c2
{
78 return RuneError
, 1, false
81 // 3-byte, 16-bit sequence?
83 rune
= int(c0
&_Mask3
)<<12 |
int(c1
&_Maskx
)<<6 |
int(c2
&_Maskx
)
84 if rune
<= _Rune2Max
{
85 return RuneError
, 1, false
90 // need third continuation byte
92 return RuneError
, 1, true
95 if c3
< _Tx || _T2
<= c3
{
96 return RuneError
, 1, false
99 // 4-byte, 21-bit sequence?
101 rune
= int(c0
&_Mask4
)<<18 |
int(c1
&_Maskx
)<<12 |
int(c2
&_Maskx
)<<6 |
int(c3
&_Maskx
)
102 if rune
<= _Rune3Max
{
103 return RuneError
, 1, false
105 return rune
, 4, false
109 return RuneError
, 1, false
112 func decodeRuneInStringInternal(s
string) (rune
, size
int, short
bool) {
115 return RuneError
, 0, true
119 // 1-byte, 7-bit sequence?
121 return int(c0
), 1, false
124 // unexpected continuation byte?
126 return RuneError
, 1, false
129 // need first continuation byte
131 return RuneError
, 1, true
134 if c1
< _Tx || _T2
<= c1
{
135 return RuneError
, 1, false
138 // 2-byte, 11-bit sequence?
140 rune
= int(c0
&_Mask2
)<<6 |
int(c1
&_Maskx
)
141 if rune
<= _Rune1Max
{
142 return RuneError
, 1, false
144 return rune
, 2, false
147 // need second continuation byte
149 return RuneError
, 1, true
152 if c2
< _Tx || _T2
<= c2
{
153 return RuneError
, 1, false
156 // 3-byte, 16-bit sequence?
158 rune
= int(c0
&_Mask3
)<<12 |
int(c1
&_Maskx
)<<6 |
int(c2
&_Maskx
)
159 if rune
<= _Rune2Max
{
160 return RuneError
, 1, false
162 return rune
, 3, false
165 // need third continuation byte
167 return RuneError
, 1, true
170 if c3
< _Tx || _T2
<= c3
{
171 return RuneError
, 1, false
174 // 4-byte, 21-bit sequence?
176 rune
= int(c0
&_Mask4
)<<18 |
int(c1
&_Maskx
)<<12 |
int(c2
&_Maskx
)<<6 |
int(c3
&_Maskx
)
177 if rune
<= _Rune3Max
{
178 return RuneError
, 1, false
180 return rune
, 4, false
184 return RuneError
, 1, false
187 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
188 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
189 func FullRune(p
[]byte) bool {
190 _
, _
, short
:= decodeRuneInternal(p
)
194 // FullRuneInString is like FullRune but its input is a string.
195 func FullRuneInString(s
string) bool {
196 _
, _
, short
:= decodeRuneInStringInternal(s
)
200 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
201 func DecodeRune(p
[]byte) (rune
, size
int) {
202 rune
, size
, _
= decodeRuneInternal(p
)
206 // DecodeRuneInString is like DecodeRune but its input is a string.
207 func DecodeRuneInString(s
string) (rune
, size
int) {
208 rune
, size
, _
= decodeRuneInStringInternal(s
)
212 // DecodeLastRune unpacks the last UTF-8 encoding in p
213 // and returns the rune and its width in bytes.
214 func DecodeLastRune(p
[]byte) (rune
, size
int) {
224 // guard against O(n^2) behavior when traversing
225 // backwards through strings with long sequences of
231 for start
--; start
>= lim
; start
-- {
232 if RuneStart(p
[start
]) {
239 rune
, size
= DecodeRune(p
[start
:end
])
240 if start
+size
!= end
{
246 // DecodeLastRuneInString is like DecodeLastRune but its input is a string.
247 func DecodeLastRuneInString(s
string) (rune
, size
int) {
257 // guard against O(n^2) behavior when traversing
258 // backwards through strings with long sequences of
264 for start
--; start
>= lim
; start
-- {
265 if RuneStart(s
[start
]) {
272 rune
, size
= DecodeRuneInString(s
[start
:end
])
273 if start
+size
!= end
{
279 // RuneLen returns the number of bytes required to encode the rune.
280 func RuneLen(rune
int) int {
282 case rune
<= _Rune1Max
:
284 case rune
<= _Rune2Max
:
286 case rune
<= _Rune3Max
:
288 case rune
<= _Rune4Max
:
294 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
295 // It returns the number of bytes written.
296 func EncodeRune(rune
int, p
[]byte) int {
297 // Negative values are erroneous. Making it unsigned addresses the problem.
306 p
[0] = _T2 |
byte(r
>>6)
307 p
[1] = _Tx |
byte(r
)&_Maskx
311 if r
> unicode
.MaxRune
{
316 p
[0] = _T3 |
byte(r
>>12)
317 p
[1] = _Tx |
byte(r
>>6)&_Maskx
318 p
[2] = _Tx |
byte(r
)&_Maskx
322 p
[0] = _T4 |
byte(r
>>18)
323 p
[1] = _Tx |
byte(r
>>12)&_Maskx
324 p
[2] = _Tx |
byte(r
>>6)&_Maskx
325 p
[3] = _Tx |
byte(r
)&_Maskx
329 // RuneCount returns the number of runes in p. Erroneous and short
330 // encodings are treated as single runes of width 1 byte.
331 func RuneCount(p
[]byte) int {
334 for n
= 0; i
< len(p
); n
++ {
338 _
, size
:= DecodeRune(p
[i
:])
345 // RuneCountInString is like RuneCount but its input is a string.
346 func RuneCountInString(s
string) (n
int) {
// RuneStart reports whether the byte could be the first byte of
// an encoded rune. Second and subsequent bytes always have the top
// two bits set to 10.
func RuneStart(b byte) bool {
	// 0xC0 isolates the top two bits; 0x80 is the 10xxxxxx pattern that
	// marks a continuation byte. Anything else may begin a rune.
	return b&0xC0 != 0x80
}