fix for merge conflict
[official-gcc.git] / libgo / go / utf8 / utf8.go
blobdfcdef9613b0196fca7692703100dd9cf344854b
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf8 implements functions and constants to support text encoded in UTF-8.
// This package calls a Unicode character a rune for brevity.
7 package utf8
9 import "unicode" // only needed for a couple of constants
// Numbers fundamental to the encoding.
const (
	RuneError = unicode.ReplacementChar // the "error" rune or "replacement character".
	RuneSelf  = 0x80                    // characters below RuneSelf are represented as themselves in a single byte.
	UTFMax    = 4                       // maximum number of bytes of a UTF-8 encoded Unicode character.
)
// Bit patterns and value limits shared by the encoder and the decoders.
const (
	// Tag bits carried in the high bits of an encoded byte.
	_T1 = 0x00 // 0000 0000
	_Tx = 0x80 // 1000 0000: continuation byte
	_T2 = 0xC0 // 1100 0000: lead byte of a 2-byte sequence
	_T3 = 0xE0 // 1110 0000: lead byte of a 3-byte sequence
	_T4 = 0xF0 // 1111 0000: lead byte of a 4-byte sequence
	_T5 = 0xF8 // 1111 1000: bytes at or above this are never accepted as a lead byte

	// Masks selecting the payload bits of an encoded byte.
	_Maskx = 0x3F // 0011 1111: continuation byte
	_Mask2 = 0x1F // 0001 1111: lead byte of a 2-byte sequence
	_Mask3 = 0x0F // 0000 1111: lead byte of a 3-byte sequence
	_Mask4 = 0x07 // 0000 0111: lead byte of a 4-byte sequence

	// Largest value that fits in the payload bits of 1, 2, 3 and 4 bytes.
	_Rune1Max = 1<<7 - 1
	_Rune2Max = 1<<11 - 1
	_Rune3Max = 1<<16 - 1
	_Rune4Max = 1<<21 - 1
)
37 func decodeRuneInternal(p []byte) (rune, size int, short bool) {
38 n := len(p)
39 if n < 1 {
40 return RuneError, 0, true
42 c0 := p[0]
44 // 1-byte, 7-bit sequence?
45 if c0 < _Tx {
46 return int(c0), 1, false
49 // unexpected continuation byte?
50 if c0 < _T2 {
51 return RuneError, 1, false
54 // need first continuation byte
55 if n < 2 {
56 return RuneError, 1, true
58 c1 := p[1]
59 if c1 < _Tx || _T2 <= c1 {
60 return RuneError, 1, false
63 // 2-byte, 11-bit sequence?
64 if c0 < _T3 {
65 rune = int(c0&_Mask2)<<6 | int(c1&_Maskx)
66 if rune <= _Rune1Max {
67 return RuneError, 1, false
69 return rune, 2, false
72 // need second continuation byte
73 if n < 3 {
74 return RuneError, 1, true
76 c2 := p[2]
77 if c2 < _Tx || _T2 <= c2 {
78 return RuneError, 1, false
81 // 3-byte, 16-bit sequence?
82 if c0 < _T4 {
83 rune = int(c0&_Mask3)<<12 | int(c1&_Maskx)<<6 | int(c2&_Maskx)
84 if rune <= _Rune2Max {
85 return RuneError, 1, false
87 return rune, 3, false
90 // need third continuation byte
91 if n < 4 {
92 return RuneError, 1, true
94 c3 := p[3]
95 if c3 < _Tx || _T2 <= c3 {
96 return RuneError, 1, false
99 // 4-byte, 21-bit sequence?
100 if c0 < _T5 {
101 rune = int(c0&_Mask4)<<18 | int(c1&_Maskx)<<12 | int(c2&_Maskx)<<6 | int(c3&_Maskx)
102 if rune <= _Rune3Max {
103 return RuneError, 1, false
105 return rune, 4, false
108 // error
109 return RuneError, 1, false
112 func decodeRuneInStringInternal(s string) (rune, size int, short bool) {
113 n := len(s)
114 if n < 1 {
115 return RuneError, 0, true
117 c0 := s[0]
119 // 1-byte, 7-bit sequence?
120 if c0 < _Tx {
121 return int(c0), 1, false
124 // unexpected continuation byte?
125 if c0 < _T2 {
126 return RuneError, 1, false
129 // need first continuation byte
130 if n < 2 {
131 return RuneError, 1, true
133 c1 := s[1]
134 if c1 < _Tx || _T2 <= c1 {
135 return RuneError, 1, false
138 // 2-byte, 11-bit sequence?
139 if c0 < _T3 {
140 rune = int(c0&_Mask2)<<6 | int(c1&_Maskx)
141 if rune <= _Rune1Max {
142 return RuneError, 1, false
144 return rune, 2, false
147 // need second continuation byte
148 if n < 3 {
149 return RuneError, 1, true
151 c2 := s[2]
152 if c2 < _Tx || _T2 <= c2 {
153 return RuneError, 1, false
156 // 3-byte, 16-bit sequence?
157 if c0 < _T4 {
158 rune = int(c0&_Mask3)<<12 | int(c1&_Maskx)<<6 | int(c2&_Maskx)
159 if rune <= _Rune2Max {
160 return RuneError, 1, false
162 return rune, 3, false
165 // need third continuation byte
166 if n < 4 {
167 return RuneError, 1, true
169 c3 := s[3]
170 if c3 < _Tx || _T2 <= c3 {
171 return RuneError, 1, false
174 // 4-byte, 21-bit sequence?
175 if c0 < _T5 {
176 rune = int(c0&_Mask4)<<18 | int(c1&_Maskx)<<12 | int(c2&_Maskx)<<6 | int(c3&_Maskx)
177 if rune <= _Rune3Max {
178 return RuneError, 1, false
180 return rune, 4, false
183 // error
184 return RuneError, 1, false
187 // FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
188 // An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
189 func FullRune(p []byte) bool {
190 _, _, short := decodeRuneInternal(p)
191 return !short
194 // FullRuneInString is like FullRune but its input is a string.
195 func FullRuneInString(s string) bool {
196 _, _, short := decodeRuneInStringInternal(s)
197 return !short
200 // DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
201 func DecodeRune(p []byte) (rune, size int) {
202 rune, size, _ = decodeRuneInternal(p)
203 return
206 // DecodeRuneInString is like DecodeRune but its input is a string.
207 func DecodeRuneInString(s string) (rune, size int) {
208 rune, size, _ = decodeRuneInStringInternal(s)
209 return
212 // DecodeLastRune unpacks the last UTF-8 encoding in p
213 // and returns the rune and its width in bytes.
214 func DecodeLastRune(p []byte) (rune, size int) {
215 end := len(p)
216 if end == 0 {
217 return RuneError, 0
219 start := end - 1
220 rune = int(p[start])
221 if rune < RuneSelf {
222 return rune, 1
224 // guard against O(n^2) behavior when traversing
225 // backwards through strings with long sequences of
226 // invalid UTF-8.
227 lim := end - UTFMax
228 if lim < 0 {
229 lim = 0
231 for start--; start >= lim; start-- {
232 if RuneStart(p[start]) {
233 break
236 if start < 0 {
237 start = 0
239 rune, size = DecodeRune(p[start:end])
240 if start+size != end {
241 return RuneError, 1
243 return rune, size
246 // DecodeLastRuneInString is like DecodeLastRune but its input is a string.
247 func DecodeLastRuneInString(s string) (rune, size int) {
248 end := len(s)
249 if end == 0 {
250 return RuneError, 0
252 start := end - 1
253 rune = int(s[start])
254 if rune < RuneSelf {
255 return rune, 1
257 // guard against O(n^2) behavior when traversing
258 // backwards through strings with long sequences of
259 // invalid UTF-8.
260 lim := end - UTFMax
261 if lim < 0 {
262 lim = 0
264 for start--; start >= lim; start-- {
265 if RuneStart(s[start]) {
266 break
269 if start < 0 {
270 start = 0
272 rune, size = DecodeRuneInString(s[start:end])
273 if start+size != end {
274 return RuneError, 1
276 return rune, size
279 // RuneLen returns the number of bytes required to encode the rune.
280 func RuneLen(rune int) int {
281 switch {
282 case rune <= _Rune1Max:
283 return 1
284 case rune <= _Rune2Max:
285 return 2
286 case rune <= _Rune3Max:
287 return 3
288 case rune <= _Rune4Max:
289 return 4
291 return -1
294 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
295 // It returns the number of bytes written.
296 func EncodeRune(rune int, p []byte) int {
297 // Negative values are erroneous. Making it unsigned addresses the problem.
298 r := uint(rune)
300 if r <= _Rune1Max {
301 p[0] = byte(r)
302 return 1
305 if r <= _Rune2Max {
306 p[0] = _T2 | byte(r>>6)
307 p[1] = _Tx | byte(r)&_Maskx
308 return 2
311 if r > unicode.MaxRune {
312 r = RuneError
315 if r <= _Rune3Max {
316 p[0] = _T3 | byte(r>>12)
317 p[1] = _Tx | byte(r>>6)&_Maskx
318 p[2] = _Tx | byte(r)&_Maskx
319 return 3
322 p[0] = _T4 | byte(r>>18)
323 p[1] = _Tx | byte(r>>12)&_Maskx
324 p[2] = _Tx | byte(r>>6)&_Maskx
325 p[3] = _Tx | byte(r)&_Maskx
326 return 4
329 // RuneCount returns the number of runes in p. Erroneous and short
330 // encodings are treated as single runes of width 1 byte.
331 func RuneCount(p []byte) int {
332 i := 0
333 var n int
334 for n = 0; i < len(p); n++ {
335 if p[i] < RuneSelf {
337 } else {
338 _, size := DecodeRune(p[i:])
339 i += size
342 return n
// RuneCountInString is like RuneCount but its input is a string.
// It counts one rune per iteration of a range loop over s.
func RuneCountInString(s string) (n int) {
	for _ = range s {
		n++
	}
	return
}
// RuneStart reports whether the byte could be the first byte of
// an encoded rune. Second and subsequent bytes always have the top
// two bits set to 10, so any byte without that pattern is reported
// as a possible start.
func RuneStart(b byte) bool { return b&0xC0 != 0x80 }