libgo/go/mime/encodedword.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package mime
   6
   7 import (
   8         "bytes"
   9         "encoding/base64"
  10         "errors"
  11         "fmt"
  12         "io"
  13         "strings"
  14         "unicode"
  15         "unicode/utf8"
  16 )
  17
  18 // A WordEncoder is an RFC 2047 encoded-word encoder.
  19 type WordEncoder byte
  20
  21 const (
  22         // BEncoding represents Base64 encoding scheme as defined by RFC 2045.
  23         BEncoding = WordEncoder('b')
  24         // QEncoding represents the Q-encoding scheme as defined by RFC 2047.
  25         QEncoding = WordEncoder('q')
  26 )
  27
  28 var (
  29         errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
  30 )
  31
  32 // Encode returns the encoded-word form of s. If s is ASCII without special
  33 // characters, it is returned unchanged. The provided charset is the IANA
  34 // charset name of s. It is case insensitive.
  35 func (e WordEncoder) Encode(charset, s string) string {
  36         if !needsEncoding(s) {
  37                 return s
  38         }
  39         return e.encodeWord(charset, s)
  40 }
  41
  42 func needsEncoding(s string) bool {
  43         for _, b := range s {
  44                 if (b < ' ' || b > '~') && b != '\t' {
  45                         return true
  46                 }
  47         }
  48         return false
  49 }
  50
  51 // encodeWord encodes a string into an encoded-word.
  52 func (e WordEncoder) encodeWord(charset, s string) string {
  53         var buf strings.Builder
  54         // Could use a hint like len(s)*3, but that's not enough for cases
  55         // with word splits and too much for simpler inputs.
  56         // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
  57         buf.Grow(48)
  58
  59         e.openWord(&buf, charset)
  60         if e == BEncoding {
  61                 e.bEncode(&buf, charset, s)
  62         } else {
  63                 e.qEncode(&buf, charset, s)
  64         }
  65         closeWord(&buf)
  66
  67         return buf.String()
  68 }
  69
  70 const (
  71         // The maximum length of an encoded-word is 75 characters.
  72         // See RFC 2047, section 2.
  73         maxEncodedWordLen = 75
  74         // maxContentLen is how much content can be encoded, ignoring the header and
  75         // 2-byte footer.
  76         maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
  77 )
  78
  79 var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
  80
  81 // bEncode encodes s using base64 encoding and writes it to buf.
  82 func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
  83         w := base64.NewEncoder(base64.StdEncoding, buf)
  84         // If the charset is not UTF-8 or if the content is short, do not bother
  85         // splitting the encoded-word.
  86         if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
  87                 io.WriteString(w, s)
  88                 w.Close()
  89                 return
  90         }
  91
  92         var currentLen, last, runeLen int
  93         for i := 0; i < len(s); i += runeLen {
  94                 // Multi-byte characters must not be split across encoded-words.
  95                 // See RFC 2047, section 5.3.
  96                 _, runeLen = utf8.DecodeRuneInString(s[i:])
  97
  98                 if currentLen+runeLen <= maxBase64Len {
  99                         currentLen += runeLen
 100                 } else {
 101                         io.WriteString(w, s[last:i])
 102                         w.Close()
 103                         e.splitWord(buf, charset)
 104                         last = i
 105                         currentLen = runeLen
 106                 }
 107         }
 108         io.WriteString(w, s[last:])
 109         w.Close()
 110 }
 111
 112 // qEncode encodes s using Q encoding and writes it to buf. It splits the
 113 // encoded-words when necessary.
 114 func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
 115         // We only split encoded-words when the charset is UTF-8.
 116         if !isUTF8(charset) {
 117                 writeQString(buf, s)
 118                 return
 119         }
 120
 121         var currentLen, runeLen int
 122         for i := 0; i < len(s); i += runeLen {
 123                 b := s[i]
 124                 // Multi-byte characters must not be split across encoded-words.
 125                 // See RFC 2047, section 5.3.
 126                 var encLen int
 127                 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
 128                         runeLen, encLen = 1, 1
 129                 } else {
 130                         _, runeLen = utf8.DecodeRuneInString(s[i:])
 131                         encLen = 3 * runeLen
 132                 }
 133
 134                 if currentLen+encLen > maxContentLen {
 135                         e.splitWord(buf, charset)
 136                         currentLen = 0
 137                 }
 138                 writeQString(buf, s[i:i+runeLen])
 139                 currentLen += encLen
 140         }
 141 }
 142
 143 // writeQString encodes s using Q encoding and writes it to buf.
 144 func writeQString(buf *strings.Builder, s string) {
 145         for i := 0; i < len(s); i++ {
 146                 switch b := s[i]; {
 147                 case b == ' ':
 148                         buf.WriteByte('_')
 149                 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
 150                         buf.WriteByte(b)
 151                 default:
 152                         buf.WriteByte('=')
 153                         buf.WriteByte(upperhex[b>>4])
 154                         buf.WriteByte(upperhex[b&0x0f])
 155                 }
 156         }
 157 }
 158
 159 // openWord writes the beginning of an encoded-word into buf.
 160 func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
 161         buf.WriteString("=?")
 162         buf.WriteString(charset)
 163         buf.WriteByte('?')
 164         buf.WriteByte(byte(e))
 165         buf.WriteByte('?')
 166 }
 167
 168 // closeWord writes the end of an encoded-word into buf.
 169 func closeWord(buf *strings.Builder) {
 170         buf.WriteString("?=")
 171 }
 172
 173 // splitWord closes the current encoded-word and opens a new one.
 174 func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
 175         closeWord(buf)
 176         buf.WriteByte(' ')
 177         e.openWord(buf, charset)
 178 }
 179
 180 func isUTF8(charset string) bool {
 181         return strings.EqualFold(charset, "UTF-8")
 182 }
 183
// upperhex holds the uppercase hexadecimal digits; indexing it with a 4-bit
// value yields the corresponding digit character.
const upperhex = "0123456789ABCDEF"
 185
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name passed to CharsetReader is always lower-case.
	// The utf-8, iso-8859-1 and us-ascii charsets are handled by default,
	// without consulting CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
 196
 197 // Decode decodes an RFC 2047 encoded-word.
 198 func (d *WordDecoder) Decode(word string) (string, error) {
 199         // See https://tools.ietf.org/html/rfc2047#section-2 for details.
 200         // Our decoder is permissive, we accept empty encoded-text.
 201         if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
 202                 return "", errInvalidWord
 203         }
 204         word = word[2 : len(word)-2]
 205
 206         // split delimits the first 2 fields
 207         split := strings.IndexByte(word, '?')
 208
 209         // split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
 210         charset := word[:split]
 211         if len(charset) == 0 {
 212                 return "", errInvalidWord
 213         }
 214         if len(word) < split+3 {
 215                 return "", errInvalidWord
 216         }
 217         encoding := word[split+1]
 218         // the field after split must only be one byte
 219         if word[split+2] != '?' {
 220                 return "", errInvalidWord
 221         }
 222         text := word[split+3:]
 223
 224         content, err := decode(encoding, text)
 225         if err != nil {
 226                 return "", err
 227         }
 228
 229         var buf strings.Builder
 230
 231         if err := d.convert(&buf, charset, content); err != nil {
 232                 return "", err
 233         }
 234
 235         return buf.String(), nil
 236 }
 237
 238 // DecodeHeader decodes all encoded-words of the given string. It returns an
 239 // error if and only if CharsetReader of d returns an error.
 240 func (d *WordDecoder) DecodeHeader(header string) (string, error) {
 241         // If there is no encoded-word, returns before creating a buffer.
 242         i := strings.Index(header, "=?")
 243         if i == -1 {
 244                 return header, nil
 245         }
 246
 247         var buf strings.Builder
 248
 249         buf.WriteString(header[:i])
 250         header = header[i:]
 251
 252         betweenWords := false
 253         for {
 254                 start := strings.Index(header, "=?")
 255                 if start == -1 {
 256                         break
 257                 }
 258                 cur := start + len("=?")
 259
 260                 i := strings.Index(header[cur:], "?")
 261                 if i == -1 {
 262                         break
 263                 }
 264                 charset := header[cur : cur+i]
 265                 cur += i + len("?")
 266
 267                 if len(header) < cur+len("Q??=") {
 268                         break
 269                 }
 270                 encoding := header[cur]
 271                 cur++
 272
 273                 if header[cur] != '?' {
 274                         break
 275                 }
 276                 cur++
 277
 278                 j := strings.Index(header[cur:], "?=")
 279                 if j == -1 {
 280                         break
 281                 }
 282                 text := header[cur : cur+j]
 283                 end := cur + j + len("?=")
 284
 285                 content, err := decode(encoding, text)
 286                 if err != nil {
 287                         betweenWords = false
 288                         buf.WriteString(header[:start+2])
 289                         header = header[start+2:]
 290                         continue
 291                 }
 292
 293                 // Write characters before the encoded-word. White-space and newline
 294                 // characters separating two encoded-words must be deleted.
 295                 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
 296                         buf.WriteString(header[:start])
 297                 }
 298
 299                 if err := d.convert(&buf, charset, content); err != nil {
 300                         return "", err
 301                 }
 302
 303                 header = header[end:]
 304                 betweenWords = true
 305         }
 306
 307         if len(header) > 0 {
 308                 buf.WriteString(header)
 309         }
 310
 311         return buf.String(), nil
 312 }
 313
 314 func decode(encoding byte, text string) ([]byte, error) {
 315         switch encoding {
 316         case 'B', 'b':
 317                 return base64.StdEncoding.DecodeString(text)
 318         case 'Q', 'q':
 319                 return qDecode(text)
 320         default:
 321                 return nil, errInvalidWord
 322         }
 323 }
 324
 325 func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
 326         switch {
 327         case strings.EqualFold("utf-8", charset):
 328                 buf.Write(content)
 329         case strings.EqualFold("iso-8859-1", charset):
 330                 for _, c := range content {
 331                         buf.WriteRune(rune(c))
 332                 }
 333         case strings.EqualFold("us-ascii", charset):
 334                 for _, c := range content {
 335                         if c >= utf8.RuneSelf {
 336                                 buf.WriteRune(unicode.ReplacementChar)
 337                         } else {
 338                                 buf.WriteByte(c)
 339                         }
 340                 }
 341         default:
 342                 if d.CharsetReader == nil {
 343                         return fmt.Errorf("mime: unhandled charset %q", charset)
 344                 }
 345                 r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
 346                 if err != nil {
 347                         return err
 348                 }
 349                 if _, err = io.Copy(buf, r); err != nil {
 350                         return err
 351                 }
 352         }
 353         return nil
 354 }
 355
 356 // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
 357 // one byte of non-whitespace.
 358 func hasNonWhitespace(s string) bool {
 359         for _, b := range s {
 360                 switch b {
 361                 // Encoded-words can only be separated by linear white spaces which does
 362                 // not include vertical tabs (\v).
 363                 case ' ', '\t', '\n', '\r':
 364                 default:
 365                         return true
 366                 }
 367         }
 368         return false
 369 }
 370
 371 // qDecode decodes a Q encoded string.
 372 func qDecode(s string) ([]byte, error) {
 373         dec := make([]byte, len(s))
 374         n := 0
 375         for i := 0; i < len(s); i++ {
 376                 switch c := s[i]; {
 377                 case c == '_':
 378                         dec[n] = ' '
 379                 case c == '=':
 380                         if i+2 >= len(s) {
 381                                 return nil, errInvalidWord
 382                         }
 383                         b, err := readHexByte(s[i+1], s[i+2])
 384                         if err != nil {
 385                                 return nil, err
 386                         }
 387                         dec[n] = b
 388                         i += 2
 389                 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
 390                         dec[n] = c
 391                 default:
 392                         return nil, errInvalidWord
 393                 }
 394                 n++
 395         }
 396
 397         return dec[:n], nil
 398 }
 399
 400 // readHexByte returns the byte from its quoted-printable representation.
 401 func readHexByte(a, b byte) (byte, error) {
 402         var hb, lb byte
 403         var err error
 404         if hb, err = fromHex(a); err != nil {
 405                 return 0, err
 406         }
 407         if lb, err = fromHex(b); err != nil {
 408                 return 0, err
 409         }
 410         return hb<<4 | lb, nil
 411 }
 412
 413 func fromHex(b byte) (byte, error) {
 414         switch {
 415         case b >= '0' && b <= '9':
 416                 return b - '0', nil
 417         case b >= 'A' && b <= 'F':
 418                 return b - 'A' + 10, nil
 419         // Accept badly encoded bytes.
 420         case b >= 'a' && b <= 'f':
 421                 return b - 'a' + 10, nil
 422         }
 423         return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
 424 }