libgo/go/runtime/utf8.go

   1 // Copyright 2016 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package runtime
   6
   7 import _ "unsafe" // For go:linkname.
   8
   9 // For gccgo, use go:linkname to rename compiler-called functions to
  10 // themselves, so that the compiler will export them.
  11 //
  12 //go:linkname decoderune runtime.decoderune
  13
  14 // Numbers fundamental to the encoding.
  15 const (
  16         runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
  17         runeSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
  18         maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
  19 )
  20
  21 // Code points in the surrogate range are not valid for UTF-8.
  22 const (
  23         surrogateMin = 0xD800
  24         surrogateMax = 0xDFFF
  25 )
  26
  27 const (
  28         t1 = 0x00 // 0000 0000
  29         tx = 0x80 // 1000 0000
  30         t2 = 0xC0 // 1100 0000
  31         t3 = 0xE0 // 1110 0000
  32         t4 = 0xF0 // 1111 0000
  33         t5 = 0xF8 // 1111 1000
  34
  35         maskx = 0x3F // 0011 1111
  36         mask2 = 0x1F // 0001 1111
  37         mask3 = 0x0F // 0000 1111
  38         mask4 = 0x07 // 0000 0111
  39
  40         rune1Max = 1<<7 - 1
  41         rune2Max = 1<<11 - 1
  42         rune3Max = 1<<16 - 1
  43
  44         // The default lowest and highest continuation byte.
  45         locb = 0x80 // 1000 0000
  46         hicb = 0xBF // 1011 1111
  47 )
  48
  49 // decoderune returns the non-ASCII rune at the start of
  50 // s[k:] and the index after the rune in s.
  51 //
  52 // decoderune assumes that caller has checked that
  53 // the to be decoded rune is a non-ASCII rune.
  54 //
  55 // If the string appears to be incomplete or decoding problems
  56 // are encountered (runeerror, k + 1) is returned to ensure
  57 // progress when decoderune is used to iterate over a string.
  58 func decoderune(s string, k int) (r rune, pos int) {
  59         pos = k
  60
  61         if k >= len(s) {
  62                 return runeError, k + 1
  63         }
  64
  65         s = s[k:]
  66
  67         switch {
  68         case t2 <= s[0] && s[0] < t3:
  69                 // 0080-07FF two byte sequence
  70                 if len(s) > 1 && (locb <= s[1] && s[1] <= hicb) {
  71                         r = rune(s[0]&mask2)<<6 | rune(s[1]&maskx)
  72                         pos += 2
  73                         if rune1Max < r {
  74                                 return
  75                         }
  76                 }
  77         case t3 <= s[0] && s[0] < t4:
  78                 // 0800-FFFF three byte sequence
  79                 if len(s) > 2 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) {
  80                         r = rune(s[0]&mask3)<<12 | rune(s[1]&maskx)<<6 | rune(s[2]&maskx)
  81                         pos += 3
  82                         if rune2Max < r && !(surrogateMin <= r && r <= surrogateMax) {
  83                                 return
  84                         }
  85                 }
  86         case t4 <= s[0] && s[0] < t5:
  87                 // 10000-1FFFFF four byte sequence
  88                 if len(s) > 3 && (locb <= s[1] && s[1] <= hicb) && (locb <= s[2] && s[2] <= hicb) && (locb <= s[3] && s[3] <= hicb) {
  89                         r = rune(s[0]&mask4)<<18 | rune(s[1]&maskx)<<12 | rune(s[2]&maskx)<<6 | rune(s[3]&maskx)
  90                         pos += 4
  91                         if rune3Max < r && r <= maxRune {
  92                                 return
  93                         }
  94                 }
  95         }
  96
  97         return runeError, k + 1
  98 }
  99
 100 // encoderune writes into p (which must be large enough) the UTF-8 encoding of the rune.
 101 // It returns the number of bytes written.
 102 func encoderune(p []byte, r rune) int {
 103         // Negative values are erroneous. Making it unsigned addresses the problem.
 104         switch i := uint32(r); {
 105         case i <= rune1Max:
 106                 p[0] = byte(r)
 107                 return 1
 108         case i <= rune2Max:
 109                 _ = p[1] // eliminate bounds checks
 110                 p[0] = t2 | byte(r>>6)
 111                 p[1] = tx | byte(r)&maskx
 112                 return 2
 113         case i > maxRune, surrogateMin <= i && i <= surrogateMax:
 114                 r = runeError
 115                 fallthrough
 116         case i <= rune3Max:
 117                 _ = p[2] // eliminate bounds checks
 118                 p[0] = t3 | byte(r>>12)
 119                 p[1] = tx | byte(r>>6)&maskx
 120                 p[2] = tx | byte(r)&maskx
 121                 return 3
 122         default:
 123                 _ = p[3] // eliminate bounds checks
 124                 p[0] = t4 | byte(r>>18)
 125                 p[1] = tx | byte(r>>12)&maskx
 126                 p[2] = tx | byte(r>>6)&maskx
 127                 p[3] = tx | byte(r)&maskx
 128                 return 4
 129         }
 130 }