libgo/go/mime/encodedword.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package mime
   6
   7 import (
   8         "bytes"
   9         "encoding/base64"
  10         "errors"
  11         "fmt"
  12         "io"
  13         "strings"
  14         "sync"
  15         "unicode"
  16         "unicode/utf8"
  17 )
  18
  19 // A WordEncoder is an RFC 2047 encoded-word encoder.
  20 type WordEncoder byte
  21
  22 const (
  23         // BEncoding represents Base64 encoding scheme as defined by RFC 2045.
  24         BEncoding = WordEncoder('b')
  25         // QEncoding represents the Q-encoding scheme as defined by RFC 2047.
  26         QEncoding = WordEncoder('q')
  27 )
  28
  29 var (
  30         errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
  31 )
  32
  33 // Encode returns the encoded-word form of s. If s is ASCII without special
  34 // characters, it is returned unchanged. The provided charset is the IANA
  35 // charset name of s. It is case insensitive.
  36 func (e WordEncoder) Encode(charset, s string) string {
  37         if !needsEncoding(s) {
  38                 return s
  39         }
  40         return e.encodeWord(charset, s)
  41 }
  42
  43 func needsEncoding(s string) bool {
  44         for _, b := range s {
  45                 if (b < ' ' || b > '~') && b != '\t' {
  46                         return true
  47                 }
  48         }
  49         return false
  50 }
  51
  52 // encodeWord encodes a string into an encoded-word.
  53 func (e WordEncoder) encodeWord(charset, s string) string {
  54         buf := getBuffer()
  55         defer putBuffer(buf)
  56
  57         e.openWord(buf, charset)
  58         if e == BEncoding {
  59                 e.bEncode(buf, charset, s)
  60         } else {
  61                 e.qEncode(buf, charset, s)
  62         }
  63         closeWord(buf)
  64
  65         return buf.String()
  66 }
  67
  68 const (
  69         // The maximum length of an encoded-word is 75 characters.
  70         // See RFC 2047, section 2.
  71         maxEncodedWordLen = 75
  72         // maxContentLen is how much content can be encoded, ignoring the header and
  73         // 2-byte footer.
  74         maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
  75 )
  76
  77 var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
  78
  79 // bEncode encodes s using base64 encoding and writes it to buf.
  80 func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
  81         w := base64.NewEncoder(base64.StdEncoding, buf)
  82         // If the charset is not UTF-8 or if the content is short, do not bother
  83         // splitting the encoded-word.
  84         if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
  85                 io.WriteString(w, s)
  86                 w.Close()
  87                 return
  88         }
  89
  90         var currentLen, last, runeLen int
  91         for i := 0; i < len(s); i += runeLen {
  92                 // Multi-byte characters must not be split across encoded-words.
  93                 // See RFC 2047, section 5.3.
  94                 _, runeLen = utf8.DecodeRuneInString(s[i:])
  95
  96                 if currentLen+runeLen <= maxBase64Len {
  97                         currentLen += runeLen
  98                 } else {
  99                         io.WriteString(w, s[last:i])
 100                         w.Close()
 101                         e.splitWord(buf, charset)
 102                         last = i
 103                         currentLen = runeLen
 104                 }
 105         }
 106         io.WriteString(w, s[last:])
 107         w.Close()
 108 }
 109
 110 // qEncode encodes s using Q encoding and writes it to buf. It splits the
 111 // encoded-words when necessary.
 112 func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
 113         // We only split encoded-words when the charset is UTF-8.
 114         if !isUTF8(charset) {
 115                 writeQString(buf, s)
 116                 return
 117         }
 118
 119         var currentLen, runeLen int
 120         for i := 0; i < len(s); i += runeLen {
 121                 b := s[i]
 122                 // Multi-byte characters must not be split across encoded-words.
 123                 // See RFC 2047, section 5.3.
 124                 var encLen int
 125                 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
 126                         runeLen, encLen = 1, 1
 127                 } else {
 128                         _, runeLen = utf8.DecodeRuneInString(s[i:])
 129                         encLen = 3 * runeLen
 130                 }
 131
 132                 if currentLen+encLen > maxContentLen {
 133                         e.splitWord(buf, charset)
 134                         currentLen = 0
 135                 }
 136                 writeQString(buf, s[i:i+runeLen])
 137                 currentLen += encLen
 138         }
 139 }
 140
 141 // writeQString encodes s using Q encoding and writes it to buf.
 142 func writeQString(buf *bytes.Buffer, s string) {
 143         for i := 0; i < len(s); i++ {
 144                 switch b := s[i]; {
 145                 case b == ' ':
 146                         buf.WriteByte('_')
 147                 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
 148                         buf.WriteByte(b)
 149                 default:
 150                         buf.WriteByte('=')
 151                         buf.WriteByte(upperhex[b>>4])
 152                         buf.WriteByte(upperhex[b&0x0f])
 153                 }
 154         }
 155 }
 156
 157 // openWord writes the beginning of an encoded-word into buf.
 158 func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
 159         buf.WriteString("=?")
 160         buf.WriteString(charset)
 161         buf.WriteByte('?')
 162         buf.WriteByte(byte(e))
 163         buf.WriteByte('?')
 164 }
 165
 166 // closeWord writes the end of an encoded-word into buf.
 167 func closeWord(buf *bytes.Buffer) {
 168         buf.WriteString("?=")
 169 }
 170
 171 // splitWord closes the current encoded-word and opens a new one.
 172 func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
 173         closeWord(buf)
 174         buf.WriteByte(' ')
 175         e.openWord(buf, charset)
 176 }
 177
 178 func isUTF8(charset string) bool {
 179         return strings.EqualFold(charset, "UTF-8")
 180 }
 181
// upperhex lists the upper-case hexadecimal digits; writeQString indexes it
// with a 4-bit value to emit the two digits of an "=XX" escape.
const upperhex = "0123456789ABCDEF"
 183
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
//
// With a nil CharsetReader, only the utf-8, iso-8859-1 and us-ascii charsets
// can be decoded; any other charset yields an error.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name passed to CharsetReader is always lower-case.
	// utf-8, iso-8859-1 and us-ascii charsets are handled by default and
	// never reach CharsetReader.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
 194
 195 // Decode decodes an RFC 2047 encoded-word.
 196 func (d *WordDecoder) Decode(word string) (string, error) {
 197         // See https://tools.ietf.org/html/rfc2047#section-2 for details.
 198         // Our decoder is permissive, we accept empty encoded-text.
 199         if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
 200                 return "", errInvalidWord
 201         }
 202         word = word[2 : len(word)-2]
 203
 204         // split delimits the first 2 fields
 205         split := strings.IndexByte(word, '?')
 206
 207         // split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
 208         charset := word[:split]
 209         if len(charset) == 0 {
 210                 return "", errInvalidWord
 211         }
 212         if len(word) < split+3 {
 213                 return "", errInvalidWord
 214         }
 215         encoding := word[split+1]
 216         // the field after split must only be one byte
 217         if word[split+2] != '?' {
 218                 return "", errInvalidWord
 219         }
 220         text := word[split+3:]
 221
 222         content, err := decode(encoding, text)
 223         if err != nil {
 224                 return "", err
 225         }
 226
 227         buf := getBuffer()
 228         defer putBuffer(buf)
 229
 230         if err := d.convert(buf, charset, content); err != nil {
 231                 return "", err
 232         }
 233
 234         return buf.String(), nil
 235 }
 236
 237 // DecodeHeader decodes all encoded-words of the given string. It returns an
 238 // error if and only if CharsetReader of d returns an error.
 239 func (d *WordDecoder) DecodeHeader(header string) (string, error) {
 240         // If there is no encoded-word, returns before creating a buffer.
 241         i := strings.Index(header, "=?")
 242         if i == -1 {
 243                 return header, nil
 244         }
 245
 246         buf := getBuffer()
 247         defer putBuffer(buf)
 248
 249         buf.WriteString(header[:i])
 250         header = header[i:]
 251
 252         betweenWords := false
 253         for {
 254                 start := strings.Index(header, "=?")
 255                 if start == -1 {
 256                         break
 257                 }
 258                 cur := start + len("=?")
 259
 260                 i := strings.Index(header[cur:], "?")
 261                 if i == -1 {
 262                         break
 263                 }
 264                 charset := header[cur : cur+i]
 265                 cur += i + len("?")
 266
 267                 if len(header) < cur+len("Q??=") {
 268                         break
 269                 }
 270                 encoding := header[cur]
 271                 cur++
 272
 273                 if header[cur] != '?' {
 274                         break
 275                 }
 276                 cur++
 277
 278                 j := strings.Index(header[cur:], "?=")
 279                 if j == -1 {
 280                         break
 281                 }
 282                 text := header[cur : cur+j]
 283                 end := cur + j + len("?=")
 284
 285                 content, err := decode(encoding, text)
 286                 if err != nil {
 287                         betweenWords = false
 288                         buf.WriteString(header[:start+2])
 289                         header = header[start+2:]
 290                         continue
 291                 }
 292
 293                 // Write characters before the encoded-word. White-space and newline
 294                 // characters separating two encoded-words must be deleted.
 295                 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
 296                         buf.WriteString(header[:start])
 297                 }
 298
 299                 if err := d.convert(buf, charset, content); err != nil {
 300                         return "", err
 301                 }
 302
 303                 header = header[end:]
 304                 betweenWords = true
 305         }
 306
 307         if len(header) > 0 {
 308                 buf.WriteString(header)
 309         }
 310
 311         return buf.String(), nil
 312 }
 313
 314 func decode(encoding byte, text string) ([]byte, error) {
 315         switch encoding {
 316         case 'B', 'b':
 317                 return base64.StdEncoding.DecodeString(text)
 318         case 'Q', 'q':
 319                 return qDecode(text)
 320         default:
 321                 return nil, errInvalidWord
 322         }
 323 }
 324
 325 func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error {
 326         switch {
 327         case strings.EqualFold("utf-8", charset):
 328                 buf.Write(content)
 329         case strings.EqualFold("iso-8859-1", charset):
 330                 for _, c := range content {
 331                         buf.WriteRune(rune(c))
 332                 }
 333         case strings.EqualFold("us-ascii", charset):
 334                 for _, c := range content {
 335                         if c >= utf8.RuneSelf {
 336                                 buf.WriteRune(unicode.ReplacementChar)
 337                         } else {
 338                                 buf.WriteByte(c)
 339                         }
 340                 }
 341         default:
 342                 if d.CharsetReader == nil {
 343                         return fmt.Errorf("mime: unhandled charset %q", charset)
 344                 }
 345                 r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
 346                 if err != nil {
 347                         return err
 348                 }
 349                 if _, err = buf.ReadFrom(r); err != nil {
 350                         return err
 351                 }
 352         }
 353         return nil
 354 }
 355
 356 // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
 357 // one byte of non-whitespace.
 358 func hasNonWhitespace(s string) bool {
 359         for _, b := range s {
 360                 switch b {
 361                 // Encoded-words can only be separated by linear white spaces which does
 362                 // not include vertical tabs (\v).
 363                 case ' ', '\t', '\n', '\r':
 364                 default:
 365                         return true
 366                 }
 367         }
 368         return false
 369 }
 370
 371 // qDecode decodes a Q encoded string.
 372 func qDecode(s string) ([]byte, error) {
 373         dec := make([]byte, len(s))
 374         n := 0
 375         for i := 0; i < len(s); i++ {
 376                 switch c := s[i]; {
 377                 case c == '_':
 378                         dec[n] = ' '
 379                 case c == '=':
 380                         if i+2 >= len(s) {
 381                                 return nil, errInvalidWord
 382                         }
 383                         b, err := readHexByte(s[i+1], s[i+2])
 384                         if err != nil {
 385                                 return nil, err
 386                         }
 387                         dec[n] = b
 388                         i += 2
 389                 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
 390                         dec[n] = c
 391                 default:
 392                         return nil, errInvalidWord
 393                 }
 394                 n++
 395         }
 396
 397         return dec[:n], nil
 398 }
 399
 400 // readHexByte returns the byte from its quoted-printable representation.
 401 func readHexByte(a, b byte) (byte, error) {
 402         var hb, lb byte
 403         var err error
 404         if hb, err = fromHex(a); err != nil {
 405                 return 0, err
 406         }
 407         if lb, err = fromHex(b); err != nil {
 408                 return 0, err
 409         }
 410         return hb<<4 | lb, nil
 411 }
 412
 413 func fromHex(b byte) (byte, error) {
 414         switch {
 415         case b >= '0' && b <= '9':
 416                 return b - '0', nil
 417         case b >= 'A' && b <= 'F':
 418                 return b - 'A' + 10, nil
 419         // Accept badly encoded bytes.
 420         case b >= 'a' && b <= 'f':
 421                 return b - 'a' + 10, nil
 422         }
 423         return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
 424 }
 425
 426 var bufPool = sync.Pool{
 427         New: func() interface{} {
 428                 return new(bytes.Buffer)
 429         },
 430 }
 431
 432 func getBuffer() *bytes.Buffer {
 433         return bufPool.Get().(*bytes.Buffer)
 434 }
 435
 436 func putBuffer(buf *bytes.Buffer) {
 437         if buf.Len() > 1024 {
 438                 return
 439         }
 440         buf.Reset()
 441         bufPool.Put(buf)
 442 }