c++: Some cp-tree.def comment fixes
[official-gcc.git] / libgo / go / strconv / quote.go
blobd2814b92da795b77ceb1a4757748cf7d25ca6ea2
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //go:generate go run makeisprint.go -output isprint.go
7 package strconv
9 import (
10 "unicode/utf8"
// Hexadecimal digit tables, indexed by a 4-bit value, used when emitting
// \x, \u and \U escape sequences.
const (
	lowerhex = "0123456789abcdef"
	upperhex = "0123456789ABCDEF"
)
18 // contains reports whether the string contains the byte c.
19 func contains(s string, c byte) bool {
20 return index(s, c) != -1
23 func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
24 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
27 func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
28 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
31 func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
32 // Often called with big strings, so preallocate. If there's quoting,
33 // this is conservative but still helps a lot.
34 if cap(buf)-len(buf) < len(s) {
35 nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
36 copy(nBuf, buf)
37 buf = nBuf
39 buf = append(buf, quote)
40 for width := 0; len(s) > 0; s = s[width:] {
41 r := rune(s[0])
42 width = 1
43 if r >= utf8.RuneSelf {
44 r, width = utf8.DecodeRuneInString(s)
46 if width == 1 && r == utf8.RuneError {
47 buf = append(buf, `\x`...)
48 buf = append(buf, lowerhex[s[0]>>4])
49 buf = append(buf, lowerhex[s[0]&0xF])
50 continue
52 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
54 buf = append(buf, quote)
55 return buf
58 func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
59 buf = append(buf, quote)
60 if !utf8.ValidRune(r) {
61 r = utf8.RuneError
63 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
64 buf = append(buf, quote)
65 return buf
68 func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
69 var runeTmp [utf8.UTFMax]byte
70 if r == rune(quote) || r == '\\' { // always backslashed
71 buf = append(buf, '\\')
72 buf = append(buf, byte(r))
73 return buf
75 if ASCIIonly {
76 if r < utf8.RuneSelf && IsPrint(r) {
77 buf = append(buf, byte(r))
78 return buf
80 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
81 n := utf8.EncodeRune(runeTmp[:], r)
82 buf = append(buf, runeTmp[:n]...)
83 return buf
85 switch r {
86 case '\a':
87 buf = append(buf, `\a`...)
88 case '\b':
89 buf = append(buf, `\b`...)
90 case '\f':
91 buf = append(buf, `\f`...)
92 case '\n':
93 buf = append(buf, `\n`...)
94 case '\r':
95 buf = append(buf, `\r`...)
96 case '\t':
97 buf = append(buf, `\t`...)
98 case '\v':
99 buf = append(buf, `\v`...)
100 default:
101 switch {
102 case r < ' ':
103 buf = append(buf, `\x`...)
104 buf = append(buf, lowerhex[byte(r)>>4])
105 buf = append(buf, lowerhex[byte(r)&0xF])
106 case !utf8.ValidRune(r):
107 r = 0xFFFD
108 fallthrough
109 case r < 0x10000:
110 buf = append(buf, `\u`...)
111 for s := 12; s >= 0; s -= 4 {
112 buf = append(buf, lowerhex[r>>uint(s)&0xF])
114 default:
115 buf = append(buf, `\U`...)
116 for s := 28; s >= 0; s -= 4 {
117 buf = append(buf, lowerhex[r>>uint(s)&0xF])
121 return buf
124 // Quote returns a double-quoted Go string literal representing s. The
125 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
126 // control characters and non-printable characters as defined by
127 // IsPrint.
128 func Quote(s string) string {
129 return quoteWith(s, '"', false, false)
132 // AppendQuote appends a double-quoted Go string literal representing s,
133 // as generated by Quote, to dst and returns the extended buffer.
134 func AppendQuote(dst []byte, s string) []byte {
135 return appendQuotedWith(dst, s, '"', false, false)
138 // QuoteToASCII returns a double-quoted Go string literal representing s.
139 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
140 // non-ASCII characters and non-printable characters as defined by IsPrint.
141 func QuoteToASCII(s string) string {
142 return quoteWith(s, '"', true, false)
145 // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
146 // as generated by QuoteToASCII, to dst and returns the extended buffer.
147 func AppendQuoteToASCII(dst []byte, s string) []byte {
148 return appendQuotedWith(dst, s, '"', true, false)
151 // QuoteToGraphic returns a double-quoted Go string literal representing s.
152 // The returned string leaves Unicode graphic characters, as defined by
153 // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
154 // for non-graphic characters.
155 func QuoteToGraphic(s string) string {
156 return quoteWith(s, '"', false, true)
159 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
160 // as generated by QuoteToGraphic, to dst and returns the extended buffer.
161 func AppendQuoteToGraphic(dst []byte, s string) []byte {
162 return appendQuotedWith(dst, s, '"', false, true)
165 // QuoteRune returns a single-quoted Go character literal representing the
166 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
167 // for control characters and non-printable characters as defined by IsPrint.
168 func QuoteRune(r rune) string {
169 return quoteRuneWith(r, '\'', false, false)
172 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
173 // as generated by QuoteRune, to dst and returns the extended buffer.
174 func AppendQuoteRune(dst []byte, r rune) []byte {
175 return appendQuotedRuneWith(dst, r, '\'', false, false)
178 // QuoteRuneToASCII returns a single-quoted Go character literal representing
179 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
180 // \u0100) for non-ASCII characters and non-printable characters as defined
181 // by IsPrint.
182 func QuoteRuneToASCII(r rune) string {
183 return quoteRuneWith(r, '\'', true, false)
186 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
187 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
188 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
189 return appendQuotedRuneWith(dst, r, '\'', true, false)
192 // QuoteRuneToGraphic returns a single-quoted Go character literal representing
193 // the rune. If the rune is not a Unicode graphic character,
194 // as defined by IsGraphic, the returned string will use a Go escape sequence
195 // (\t, \n, \xFF, \u0100).
196 func QuoteRuneToGraphic(r rune) string {
197 return quoteRuneWith(r, '\'', false, true)
200 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
201 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
202 func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
203 return appendQuotedRuneWith(dst, r, '\'', false, true)
// CanBackquote reports whether the string s can be represented
// unchanged as a single-line backquoted string without control
// characters other than tab.
//
// It returns false if s contains a byte-order mark (U+FEFF), invalid
// UTF-8, a backquote, DEL (U+007F), or any control character below
// space other than tab.
func CanBackquote(s string) bool {
	for len(s) > 0 {
		r, wid := utf8.DecodeRuneInString(s)
		s = s[wid:]
		if wid > 1 {
			if r == '\ufeff' {
				return false // BOMs are invisible and should not be quoted.
			}
			continue // All other multibyte runes are correctly encoded and assumed printable.
		}
		// A single-byte RuneError is an invalid UTF-8 byte.
		if r == utf8.RuneError {
			return false
		}
		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
			return false
		}
	}
	return true
}
// unhex converts the ASCII hexadecimal digit b (either case) to its
// numeric value. ok is false, and v is zero, if b is not a hex digit.
func unhex(b byte) (v rune, ok bool) {
	c := rune(b)
	switch {
	case '0' <= c && c <= '9':
		return c - '0', true
	case 'a' <= c && c <= 'f':
		return c - 'a' + 10, true
	case 'A' <= c && c <= 'F':
		return c - 'A' + 10, true
	}
	return
}
242 // UnquoteChar decodes the first character or byte in the escaped string
243 // or character literal represented by the string s.
244 // It returns four values:
246 // 1) value, the decoded Unicode code point or byte value;
247 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
248 // 3) tail, the remainder of the string after the character; and
249 // 4) an error that will be nil if the character is syntactically valid.
251 // The second argument, quote, specifies the type of literal being parsed
252 // and therefore which escaped quote character is permitted.
253 // If set to a single quote, it permits the sequence \' and disallows unescaped '.
254 // If set to a double quote, it permits \" and disallows unescaped ".
255 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
256 func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
257 // easy cases
258 if len(s) == 0 {
259 err = ErrSyntax
260 return
262 switch c := s[0]; {
263 case c == quote && (quote == '\'' || quote == '"'):
264 err = ErrSyntax
265 return
266 case c >= utf8.RuneSelf:
267 r, size := utf8.DecodeRuneInString(s)
268 return r, true, s[size:], nil
269 case c != '\\':
270 return rune(s[0]), false, s[1:], nil
273 // hard case: c is backslash
274 if len(s) <= 1 {
275 err = ErrSyntax
276 return
278 c := s[1]
279 s = s[2:]
281 switch c {
282 case 'a':
283 value = '\a'
284 case 'b':
285 value = '\b'
286 case 'f':
287 value = '\f'
288 case 'n':
289 value = '\n'
290 case 'r':
291 value = '\r'
292 case 't':
293 value = '\t'
294 case 'v':
295 value = '\v'
296 case 'x', 'u', 'U':
297 n := 0
298 switch c {
299 case 'x':
300 n = 2
301 case 'u':
302 n = 4
303 case 'U':
304 n = 8
306 var v rune
307 if len(s) < n {
308 err = ErrSyntax
309 return
311 for j := 0; j < n; j++ {
312 x, ok := unhex(s[j])
313 if !ok {
314 err = ErrSyntax
315 return
317 v = v<<4 | x
319 s = s[n:]
320 if c == 'x' {
321 // single-byte string, possibly not UTF-8
322 value = v
323 break
325 if !utf8.ValidRune(v) {
326 err = ErrSyntax
327 return
329 value = v
330 multibyte = true
331 case '0', '1', '2', '3', '4', '5', '6', '7':
332 v := rune(c) - '0'
333 if len(s) < 2 {
334 err = ErrSyntax
335 return
337 for j := 0; j < 2; j++ { // one digit already; two more
338 x := rune(s[j]) - '0'
339 if x < 0 || x > 7 {
340 err = ErrSyntax
341 return
343 v = (v << 3) | x
345 s = s[2:]
346 if v > 255 {
347 err = ErrSyntax
348 return
350 value = v
351 case '\\':
352 value = '\\'
353 case '\'', '"':
354 if c != quote {
355 err = ErrSyntax
356 return
358 value = rune(c)
359 default:
360 err = ErrSyntax
361 return
363 tail = s
364 return
367 // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
368 // If s does not start with a valid quoted string, QuotedPrefix returns an error.
369 func QuotedPrefix(s string) (string, error) {
370 out, _, err := unquote(s, false)
371 return out, err
374 // Unquote interprets s as a single-quoted, double-quoted,
375 // or backquoted Go string literal, returning the string value
376 // that s quotes. (If s is single-quoted, it would be a Go
377 // character literal; Unquote returns the corresponding
378 // one-character string.)
379 func Unquote(s string) (string, error) {
380 out, rem, err := unquote(s, true)
381 if len(rem) > 0 {
382 return "", ErrSyntax
384 return out, err
387 // unquote parses a quoted string at the start of the input,
388 // returning the parsed prefix, the remaining suffix, and any parse errors.
389 // If unescape is true, the parsed prefix is unescaped,
390 // otherwise the input prefix is provided verbatim.
391 func unquote(in string, unescape bool) (out, rem string, err error) {
392 // Determine the quote form and optimistically find the terminating quote.
393 if len(in) < 2 {
394 return "", in, ErrSyntax
396 quote := in[0]
397 end := index(in[1:], quote)
398 if end < 0 {
399 return "", in, ErrSyntax
401 end += 2 // position after terminating quote; may be wrong if escape sequences are present
403 switch quote {
404 case '`':
405 switch {
406 case !unescape:
407 out = in[:end] // include quotes
408 case !contains(in[:end], '\r'):
409 out = in[len("`") : end-len("`")] // exclude quotes
410 default:
411 // Carriage return characters ('\r') inside raw string literals
412 // are discarded from the raw string value.
413 buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
414 for i := len("`"); i < end-len("`"); i++ {
415 if in[i] != '\r' {
416 buf = append(buf, in[i])
419 out = string(buf)
421 // NOTE: Prior implementations did not verify that raw strings consist
422 // of valid UTF-8 characters and we continue to not verify it as such.
423 // The Go specification does not explicitly require valid UTF-8,
424 // but only mention that it is implicitly valid for Go source code
425 // (which must be valid UTF-8).
426 return out, in[end:], nil
427 case '"', '\'':
428 // Handle quoted strings without any escape sequences.
429 if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
430 var valid bool
431 switch quote {
432 case '"':
433 valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
434 case '\'':
435 r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
436 valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
438 if valid {
439 out = in[:end]
440 if unescape {
441 out = out[1 : end-1] // exclude quotes
443 return out, in[end:], nil
447 // Handle quoted strings with escape sequences.
448 var buf []byte
449 in0 := in
450 in = in[1:] // skip starting quote
451 if unescape {
452 buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
454 for len(in) > 0 && in[0] != quote {
455 // Process the next character,
456 // rejecting any unescaped newline characters which are invalid.
457 r, multibyte, rem, err := UnquoteChar(in, quote)
458 if in[0] == '\n' || err != nil {
459 return "", in0, ErrSyntax
461 in = rem
463 // Append the character if unescaping the input.
464 if unescape {
465 if r < utf8.RuneSelf || !multibyte {
466 buf = append(buf, byte(r))
467 } else {
468 var arr [utf8.UTFMax]byte
469 n := utf8.EncodeRune(arr[:], r)
470 buf = append(buf, arr[:n]...)
474 // Single quoted strings must be a single character.
475 if quote == '\'' {
476 break
480 // Verify that the string ends with a terminating quote.
481 if !(len(in) > 0 && in[0] == quote) {
482 return "", in0, ErrSyntax
484 in = in[1:] // skip terminating quote
486 if unescape {
487 return string(buf), in, nil
489 return in0[:len(in0)-len(in)], in, nil
490 default:
491 return "", in, ErrSyntax
// bsearch16 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch16 returns len(a).
// The result is only meaningful if a is sorted in ascending order.
func bsearch16(a []uint16, x uint16) int {
	i, j := 0, len(a)
	for i < j {
		h := i + (j-i)>>1 // midpoint of [i, j) without overflowing i+j
		if a[h] < x {
			i = h + 1
		} else {
			j = h
		}
	}
	return i
}
// bsearch32 returns the smallest i such that a[i] >= x.
// If there is no such i, bsearch32 returns len(a).
// The result is only meaningful if a is sorted in ascending order.
func bsearch32(a []uint32, x uint32) int {
	i, j := 0, len(a)
	for i < j {
		h := i + (j-i)>>1 // midpoint of [i, j) without overflowing i+j
		if a[h] < x {
			i = h + 1
		} else {
			j = h
		}
	}
	return i
}
525 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
526 // to give the same answer. It allows this package not to depend on unicode,
527 // and therefore not pull in all the Unicode tables. If the linker were better
528 // at tossing unused tables, we could get rid of this implementation.
529 // That would be nice.
531 // IsPrint reports whether the rune is defined as printable by Go, with
532 // the same definition as unicode.IsPrint: letters, numbers, punctuation,
533 // symbols and ASCII space.
534 func IsPrint(r rune) bool {
535 // Fast check for Latin-1
536 if r <= 0xFF {
537 if 0x20 <= r && r <= 0x7E {
538 // All the ASCII is printable from space through DEL-1.
539 return true
541 if 0xA1 <= r && r <= 0xFF {
542 // Similarly for ¡ through ÿ...
543 return r != 0xAD // ...except for the bizarre soft hyphen.
545 return false
548 // Same algorithm, either on uint16 or uint32 value.
549 // First, find first i such that isPrint[i] >= x.
550 // This is the index of either the start or end of a pair that might span x.
551 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
552 // If we find x in a range, make sure x is not in isNotPrint list.
554 if 0 <= r && r < 1<<16 {
555 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
556 i := bsearch16(isPrint, rr)
557 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
558 return false
560 j := bsearch16(isNotPrint, rr)
561 return j >= len(isNotPrint) || isNotPrint[j] != rr
564 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
565 i := bsearch32(isPrint, rr)
566 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
567 return false
569 if r >= 0x20000 {
570 return true
572 r -= 0x10000
573 j := bsearch16(isNotPrint, uint16(r))
574 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
577 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
578 // characters include letters, marks, numbers, punctuation, symbols, and
579 // spaces, from categories L, M, N, P, S, and Zs.
580 func IsGraphic(r rune) bool {
581 if IsPrint(r) {
582 return true
584 return isInGraphicList(r)
587 // isInGraphicList reports whether the rune is in the isGraphic list. This separation
588 // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
589 // Should be called only if IsPrint fails.
590 func isInGraphicList(r rune) bool {
591 // We know r must fit in 16 bits - see makeisprint.go.
592 if r > 0xFFFF {
593 return false
595 rr := uint16(r)
596 i := bsearch16(isGraphic, rr)
597 return i < len(isGraphic) && rr == isGraphic[i]