1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //go:generate go run makeisprint.go -output isprint.go
// lowerhex lists the lowercase hexadecimal digits. The escaping code in this
// file indexes it with a 4-bit value to obtain the matching digit byte.
14 lowerhex
= "0123456789abcdef"
// upperhex lists the uppercase hexadecimal digits.
15 upperhex
= "0123456789ABCDEF"
18 // contains reports whether the string contains the byte c.
19 func contains(s
string, c
byte) bool {
20 return index(s
, c
) != -1
23 func quoteWith(s
string, quote
byte, ASCIIonly
, graphicOnly
bool) string {
24 return string(appendQuotedWith(make([]byte, 0, 3*len(s
)/2), s
, quote
, ASCIIonly
, graphicOnly
))
27 func quoteRuneWith(r rune
, quote
byte, ASCIIonly
, graphicOnly
bool) string {
28 return string(appendQuotedRuneWith(nil, r
, quote
, ASCIIonly
, graphicOnly
))
// appendQuotedWith appends s to buf as a literal delimited by quote, escaping
// one decoded rune at a time via appendEscapedRune. ASCIIonly and graphicOnly
// are passed through to appendEscapedRune unchanged.
// NOTE(review): several lines of this function (including its return) are
// not visible in this view of the file.
31 func appendQuotedWith(buf
[]byte, s
string, quote
byte, ASCIIonly
, graphicOnly
bool) []byte {
32 // Often called with big strings, so preallocate. If there's quoting,
33 // this is conservative but still helps a lot.
// The replacement capacity covers the current contents, s, and two extra
// bytes (presumably the opening and closing quote).
34 if cap(buf
)-len(buf
) < len(s
) {
35 nBuf
:= make([]byte, len(buf
), len(buf
)+1+len(s
)+1)
// Opening quote.
39 buf
= append(buf
, quote
)
// Each iteration consumes width bytes from the front of s.
40 for width
:= 0; len(s
) > 0; s
= s
[width
:] {
// Only a byte at or above utf8.RuneSelf needs a full UTF-8 decode.
43 if r
>= utf8
.RuneSelf
{
44 r
, width
= utf8
.DecodeRuneInString(s
)
// A single byte that decodes to RuneError is written as a backslash-x
// escape holding that byte as two lowercase hex digits.
46 if width
== 1 && r
== utf8
.RuneError
{
47 buf
= append(buf
, `\x`...)
48 buf
= append(buf
, lowerhex
[s
[0]>>4])
49 buf
= append(buf
, lowerhex
[s
[0]&0xF])
// Every other rune goes through the shared escaper.
52 buf
= appendEscapedRune(buf
, r
, quote
, ASCIIonly
, graphicOnly
)
// Closing quote.
54 buf
= append(buf
, quote
)
// appendQuotedRuneWith appends quote, the escaped form of r produced by
// appendEscapedRune, and a second quote to buf. ASCIIonly and graphicOnly are
// passed through to appendEscapedRune unchanged.
// NOTE(review): the body of the !utf8.ValidRune branch and the return are not
// visible here; presumably an invalid r is replaced with utf8.RuneError
// before escaping - confirm.
58 func appendQuotedRuneWith(buf
[]byte, r rune
, quote
byte, ASCIIonly
, graphicOnly
bool) []byte {
59 buf
= append(buf
, quote
)
60 if !utf8
.ValidRune(r
) {
63 buf
= appendEscapedRune(buf
, r
, quote
, ASCIIonly
, graphicOnly
)
64 buf
= append(buf
, quote
)
// appendEscapedRune appends to buf the form of r used inside a literal
// delimited by quote. From the visible code:
//   - r equal to quote, or a backslash, is written with a leading backslash;
//   - a rune below utf8.RuneSelf that IsPrint accepts is appended as one byte;
//   - in the else-if branch, a rune accepted by IsPrint, or by
//     isInGraphicList when graphicOnly is set, is appended in UTF-8;
//   - the remaining forms are the backslash escapes a, b, f, n, r, t and v,
//     a backslash-x escape with two hex digits, a backslash-u escape with
//     four, and a backslash-U escape with eight, all using lowerhex.
//
// NOTE(review): the switch header, most case labels, and the condition that
// guards the first printable branch are on lines not visible here; ASCIIonly
// presumably selects between the two printable branches - confirm.
68 func appendEscapedRune(buf
[]byte, r rune
, quote
byte, ASCIIonly
, graphicOnly
bool) []byte {
// Scratch space large enough for any single UTF-8 encoded rune.
69 var runeTmp
[utf8
.UTFMax
]byte
70 if r
== rune(quote
) || r
== '\\' { // always backslashed
71 buf
= append(buf
, '\\')
72 buf
= append(buf
, byte(r
))
76 if r
< utf8
.RuneSelf
&& IsPrint(r
) {
77 buf
= append(buf
, byte(r
))
// && binds tighter than ||: isInGraphicList is consulted only when IsPrint
// fails and graphicOnly is set.
80 } else if IsPrint(r
) || graphicOnly
&& isInGraphicList(r
) {
81 n
:= utf8
.EncodeRune(runeTmp
[:], r
)
82 buf
= append(buf
, runeTmp
[:n
]...)
87 buf
= append(buf
, `\a`...)
89 buf
= append(buf
, `\b`...)
91 buf
= append(buf
, `\f`...)
93 buf
= append(buf
, `\n`...)
95 buf
= append(buf
, `\r`...)
97 buf
= append(buf
, `\t`...)
99 buf
= append(buf
, `\v`...)
// Two hex digits: high nibble then low nibble of byte(r).
103 buf
= append(buf
, `\x`...)
104 buf
= append(buf
, lowerhex
[byte(r
)>>4])
105 buf
= append(buf
, lowerhex
[byte(r
)&0xF])
106 case !utf8
.ValidRune(r
):
// Four hex digits, most significant nibble first (shifts 12, 8, 4, 0).
110 buf
= append(buf
, `\u`...)
111 for s
:= 12; s
>= 0; s
-= 4 {
112 buf
= append(buf
, lowerhex
[r
>>uint(s
)&0xF])
// Eight hex digits, most significant nibble first (shifts 28 down to 0).
115 buf
= append(buf
, `\U`...)
116 for s
:= 28; s
>= 0; s
-= 4 {
117 buf
= append(buf
, lowerhex
[r
>>uint(s
)&0xF])
124 // Quote returns a double-quoted Go string literal representing s. The
125 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
126 // control characters and non-printable characters as defined by
128 func Quote(s
string) string {
129 return quoteWith(s
, '"', false, false)
132 // AppendQuote appends a double-quoted Go string literal representing s,
133 // as generated by Quote, to dst and returns the extended buffer.
134 func AppendQuote(dst
[]byte, s
string) []byte {
135 return appendQuotedWith(dst
, s
, '"', false, false)
138 // QuoteToASCII returns a double-quoted Go string literal representing s.
139 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
140 // non-ASCII characters and non-printable characters as defined by IsPrint.
141 func QuoteToASCII(s
string) string {
142 return quoteWith(s
, '"', true, false)
145 // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
146 // as generated by QuoteToASCII, to dst and returns the extended buffer.
147 func AppendQuoteToASCII(dst
[]byte, s
string) []byte {
148 return appendQuotedWith(dst
, s
, '"', true, false)
151 // QuoteToGraphic returns a double-quoted Go string literal representing s.
152 // The returned string leaves Unicode graphic characters, as defined by
153 // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
154 // for non-graphic characters.
155 func QuoteToGraphic(s
string) string {
156 return quoteWith(s
, '"', false, true)
159 // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
160 // as generated by QuoteToGraphic, to dst and returns the extended buffer.
161 func AppendQuoteToGraphic(dst
[]byte, s
string) []byte {
162 return appendQuotedWith(dst
, s
, '"', false, true)
165 // QuoteRune returns a single-quoted Go character literal representing the
166 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
167 // for control characters and non-printable characters as defined by IsPrint.
168 func QuoteRune(r rune
) string {
169 return quoteRuneWith(r
, '\'', false, false)
172 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
173 // as generated by QuoteRune, to dst and returns the extended buffer.
174 func AppendQuoteRune(dst
[]byte, r rune
) []byte {
175 return appendQuotedRuneWith(dst
, r
, '\'', false, false)
178 // QuoteRuneToASCII returns a single-quoted Go character literal representing
179 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
180 // \u0100) for non-ASCII characters and non-printable characters as defined
182 func QuoteRuneToASCII(r rune
) string {
183 return quoteRuneWith(r
, '\'', true, false)
186 // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
187 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
188 func AppendQuoteRuneToASCII(dst
[]byte, r rune
) []byte {
189 return appendQuotedRuneWith(dst
, r
, '\'', true, false)
192 // QuoteRuneToGraphic returns a single-quoted Go character literal representing
193 // the rune. If the rune is not a Unicode graphic character,
194 // as defined by IsGraphic, the returned string will use a Go escape sequence
195 // (\t, \n, \xFF, \u0100).
196 func QuoteRuneToGraphic(r rune
) string {
197 return quoteRuneWith(r
, '\'', false, true)
200 // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
201 // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
202 func AppendQuoteRuneToGraphic(dst
[]byte, r rune
) []byte {
203 return appendQuotedRuneWith(dst
, r
, '\'', false, true)
206 // CanBackquote reports whether the string s can be represented
207 // unchanged as a single-line backquoted string without control
208 // characters other than tab.
// NOTE(review): the loop header and the bodies of several branches are not
// visible in this view; comments below describe only the visible conditions.
209 func CanBackquote(s
string) bool {
// Decode the rune at the front of s together with its byte width.
211 r
, wid
:= utf8
.DecodeRuneInString(s
)
215 return false // BOMs are invisible and should not be quoted.
217 continue // All other multibyte runes are correctly encoded and assumed printable.
219 if r
== utf8
.RuneError
{
// Matches ASCII control characters other than tab, the backquote itself,
// and DEL (U+007F).
222 if (r
< ' ' && r
!= '\t') || r
== '`' || r
== '\u007F' {
// unhex converts a hexadecimal digit to its numeric value v. The letter
// ranges a-f and A-F yield 10 through 15 with ok set to true.
// NOTE(review): the switch header, the result of the 0-9 branch, and the
// fallthrough return are not visible here; presumably digits map to 0-9 and
// any other byte yields ok == false - confirm.
229 func unhex(b
byte) (v rune
, ok
bool) {
232 case '0' <= c
&& c
<= '9':
234 case 'a' <= c
&& c
<= 'f':
235 return c
- 'a' + 10, true
236 case 'A' <= c
&& c
<= 'F':
237 return c
- 'A' + 10, true
242 // UnquoteChar decodes the first character or byte in the escaped string
243 // or character literal represented by the string s.
244 // It returns four values:
246 // 1) value, the decoded Unicode code point or byte value;
247 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
248 // 3) tail, the remainder of the string after the character; and
249 // 4) an error that will be nil if the character is syntactically valid.
251 // The second argument, quote, specifies the type of literal being parsed
252 // and therefore which escaped quote character is permitted.
253 // If set to a single quote, it permits the sequence \' and disallows unescaped '.
254 // If set to a double quote, it permits \" and disallows unescaped ".
255 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
// NOTE(review): large parts of the escape handling are on lines not visible
// in this view; the comments added below cover only the visible statements.
256 func UnquoteChar(s
string, quote
byte) (value rune
, multibyte
bool, tail
string, err error
) {
// The first byte is an unescaped occurrence of the active quote character.
263 case c
== quote
&& (quote
== '\'' || quote
== '"'):
// Non-ASCII lead byte: decode one full UTF-8 sequence and report it with
// multibyte set to true.
266 case c
>= utf8
.RuneSelf
:
267 r
, size
:= utf8
.DecodeRuneInString(s
)
268 return r
, true, s
[size
:], nil
// Single byte returned unchanged, with multibyte false and a one-byte advance.
270 return rune(s
[0]), false, s
[1:], nil
273 // hard case: c is backslash
311 for j
:= 0; j
< n
; j
++ {
321 // single-byte string, possibly not UTF-8
325 if !utf8
.ValidRune(v
) {
// Octal escape: the leading digit has been consumed already.
331 case '0', '1', '2', '3', '4', '5', '6', '7':
337 for j
:= 0; j
< 2; j
++ { // one digit already; two more
338 x
:= rune(s
[j
]) - '0'
367 // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
368 // If s does not start with a valid quoted string, QuotedPrefix returns an error.
369 func QuotedPrefix(s
string) (string, error
) {
// unescape is false, so unquote provides the input prefix verbatim; the
// remaining suffix is discarded.
370 out
, _
, err
:= unquote(s
, false)
374 // Unquote interprets s as a single-quoted, double-quoted,
375 // or backquoted Go string literal, returning the string value
376 // that s quotes. (If s is single-quoted, it would be a Go
377 // character literal; Unquote returns the corresponding
378 // one-character string.)
379 func Unquote(s
string) (string, error
) {
// unescape is true, so out is the unescaped value; rem is whatever follows
// the literal in s.
// NOTE(review): the handling of rem and the return are not visible here.
380 out
, rem
, err
:= unquote(s
, true)
387 // unquote parses a quoted string at the start of the input,
388 // returning the parsed prefix, the remaining suffix, and any parse errors.
389 // If unescape is true, the parsed prefix is unescaped,
390 // otherwise the input prefix is provided verbatim.
// NOTE(review): many control-flow lines of this function are not visible in
// this view; the comments added below cover only the visible statements.
391 func unquote(in
string, unescape
bool) (out
, rem
string, err error
) {
392 // Determine the quote form and optimistically find the terminating quote.
394 return "", in
, ErrSyntax
// The search starts after the opening quote, so end is relative to in[1:].
397 end
:= index(in
[1:], quote
)
399 return "", in
, ErrSyntax
401 end
+= 2 // position after terminating quote; may be wrong if escape sequences are present
407 out
= in
[:end
] // include quotes
408 case !contains(in
[:end
], '\r'):
409 out
= in
[len("`") : end
-len("`")] // exclude quotes
411 // Carriage return characters ('\r') inside raw string literals
412 // are discarded from the raw string value.
// The capacity excludes both quotes and one carriage return.
413 buf
:= make([]byte, 0, end
-len("`")-len("\r")-len("`"))
414 for i
:= len("`"); i
< end
-len("`"); i
++ {
416 buf
= append(buf
, in
[i
])
421 // NOTE: Prior implementations did not verify that raw strings consist
422 // of valid UTF-8 characters and we continue to not verify it as such.
423 // The Go specification does not explicitly require valid UTF-8,
424 // but only mentions that it is implicitly valid for Go source code
425 // (which must be valid UTF-8).
426 return out
, in
[end
:], nil
428 // Handle quoted strings without any escape sequences.
429 if !contains(in
[:end
], '\\') && !contains(in
[:end
], '\n') {
// Validate the text between the two quote bytes.
433 valid
= utf8
.ValidString(in
[len(`"`) : end
-len(`"`)])
// The quoted text must be exactly one rune that decoded successfully.
435 r
, n
:= utf8
.DecodeRuneInString(in
[len("'") : end
-len("'")])
436 valid
= len("'")+n
+len("'") == end
&& (r
!= utf8
.RuneError || n
!= 1)
441 out
= out
[1 : end
-1] // exclude quotes
443 return out
, in
[end
:], nil
447 // Handle quoted strings with escape sequences.
450 in
= in
[1:] // skip starting quote
452 buf
= make([]byte, 0, 3*end
/2) // try to avoid more allocations
454 for len(in
) > 0 && in
[0] != quote
{
455 // Process the next character,
456 // rejecting any unescaped newline characters which are invalid.
457 r
, multibyte
, rem
, err
:= UnquoteChar(in
, quote
)
458 if in
[0] == '\n' || err
!= nil {
459 return "", in0
, ErrSyntax
463 // Append the character if unescaping the input.
// A value below utf8.RuneSelf, or one not flagged multibyte, is appended as a
// single raw byte; the EncodeRune lines that follow append r as UTF-8.
465 if r
< utf8
.RuneSelf ||
!multibyte
{
466 buf
= append(buf
, byte(r
))
468 var arr
[utf8
.UTFMax
]byte
469 n
:= utf8
.EncodeRune(arr
[:], r
)
470 buf
= append(buf
, arr
[:n
]...)
474 // Single quoted strings must be a single character.
480 // Verify that the string ends with a terminating quote.
481 if !(len(in
) > 0 && in
[0] == quote
) {
482 return "", in0
, ErrSyntax
484 in
= in
[1:] // skip terminating quote
487 return string(buf
), in
, nil
// The verbatim prefix is the part of in0 consumed so far.
489 return in0
[:len(in0
)-len(in
)], in
, nil
491 return "", in
, ErrSyntax
495 // bsearch16 returns the smallest i such that a[i] >= x.
496 // If there is no such i, bsearch16 returns len(a).
// NOTE(review): the body is not visible in this view; the name suggests a
// binary search, which would require a to be sorted ascending - confirm.
497 func bsearch16(a
[]uint16, x
uint16) int {
510 // bsearch32 returns the smallest i such that a[i] >= x.
511 // If there is no such i, bsearch32 returns len(a).
// NOTE(review): the body is not visible in this view; the name suggests a
// binary search, which would require a to be sorted ascending - confirm.
512 func bsearch32(a
[]uint32, x
uint32) int {
525 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
526 // to give the same answer. It allows this package not to depend on unicode,
527 // and therefore not pull in all the Unicode tables. If the linker were better
528 // at tossing unused tables, we could get rid of this implementation.
529 // That would be nice.
531 // IsPrint reports whether the rune is defined as printable by Go, with
532 // the same definition as unicode.IsPrint: letters, numbers, punctuation,
533 // symbols and ASCII space.
// NOTE(review): several branch bodies and returns are not visible in this
// view; the comments added below cover only the visible statements.
534 func IsPrint(r rune
) bool {
535 // Fast check for Latin-1
537 if 0x20 <= r
&& r
<= 0x7E {
538 // All the ASCII is printable from space through DEL-1.
541 if 0xA1 <= r
&& r
<= 0xFF {
542 // Similarly for ¡ through ÿ...
543 return r
!= 0xAD // ...except for the bizarre soft hyphen.
548 // Same algorithm, either on uint16 or uint32 value.
549 // First, find first i such that isPrint[i] >= x.
550 // This is the index of either the start or end of a pair that might span x.
551 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
552 // If we find x in a range, make sure x is not in isNotPrint list.
// Values that fit in 16 bits use the 16-bit tables.
554 if 0 <= r
&& r
< 1<<16 {
555 rr
, isPrint
, isNotPrint
:= uint16(r
), isPrint16
, isNotPrint16
556 i
:= bsearch16(isPrint
, rr
)
// True when rr lies outside the candidate pair found by the search.
557 if i
>= len(isPrint
) || rr
< isPrint
[i
&^1] || isPrint
[i|
1] < rr
{
// rr is printable unless it appears in the exception list.
560 j
:= bsearch16(isNotPrint
, rr
)
561 return j
>= len(isNotPrint
) || isNotPrint
[j
] != rr
// Same lookup against the 32-bit range table.
564 rr
, isPrint
, isNotPrint
:= uint32(r
), isPrint32
, isNotPrint32
565 i
:= bsearch32(isPrint
, rr
)
566 if i
>= len(isPrint
) || rr
< isPrint
[i
&^1] || isPrint
[i|
1] < rr
{
// The exception list paired with the 32-bit table is searched with bsearch16.
// NOTE(review): an adjustment to r on a line not visible here presumably
// makes the uint16 conversion lossless - confirm.
573 j
:= bsearch16(isNotPrint
, uint16(r
))
574 return j
>= len(isNotPrint
) || isNotPrint
[j
] != uint16(r
)
577 // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
578 // characters include letters, marks, numbers, punctuation, symbols, and
579 // spaces, from categories L, M, N, P, S, and Zs.
580 func IsGraphic(r rune
) bool {
// NOTE(review): the lines before this return are not visible in this view;
// since isInGraphicList is documented as valid only after IsPrint fails,
// IsPrint is presumably consulted first - confirm.
584 return isInGraphicList(r
)
587 // isInGraphicList reports whether the rune is in the isGraphic list. This separation
588 // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
589 // Should be called only if IsPrint fails.
590 func isInGraphicList(r rune
) bool {
591 // We know r must fit in 16 bits - see makeisprint.go.
// NOTE(review): the lines that derive rr from r are not visible in this view.
// Exact-match lookup: the result is true only when the search lands on an
// entry equal to rr.
596 i
:= bsearch16(isGraphic
, rr
)
597 return i
< len(isGraphic
) && rr
== isGraphic
[i
]