rfc2047.myr

   1 use std
   2
   3 use "base64"
   4 use "util"
   5
pkg mime =
        /*
         * Decode RFC 2047 encoded-words in a header value, e.g.
         * =?utf-8?q?Welcome_to_=22foobar=22_ML?=
         *
         * Only the "utf-8" charset with the "q" or "b" encodings is
         * decoded. If the input is malformed, uses anything else, or
         * fails the final validity check, the result is instead a copy
         * of the input with each run of whitespace folded to one space.
         *
         * Either way the result comes from std.sbfin(), so it is a
         * freshly built buffer (presumably for the caller to free).
         */
        const utf8_from_encodedword : (encoded : byte[:] -> byte[:])
;;
  10
/*
 * States of the scanner in utf8_from_encodedword. An encoded-word
 * has the shape =?charset?encoding?text?= and the states follow it
 * from left to right.
 */
type rfc2047_state = union
        /* outside any encoded-word; bytes are copied, whitespace folded */
        `Boring_ASCII
        /* the previous byte was the leading '='; a '?' must follow */
        `Saw_Opening_Equals
        /* between the first and second '?': collecting the charset */
        `Reading_Charset
        /* between the second and third '?': collecting the encoding */
        `Reading_Encoding
        /* the '?' that ends the text was seen; a '=' must follow */
        `Saw_Closing_Question_Mark
        /* after the third '?': collecting the encoded text */
        `In_Text
        /* the closing "?=" was seen; whitespace must follow */
        `Just_Finished_Text
;;
  20
  21 const ascii_lc = { b : byte
  22         if b > 0x40 && b < 0x5B
  23                 -> b | 0x20
  24         else
  25                 -> b
  26         ;;
  27 }
  28
  29 const caseieq = { s : byte[:], t : byte[:]
  30         if s.len != t.len
  31                 -> false
  32         ;;
  33         for var j : std.size = 0; j < s.len; ++j
  34                 if ascii_lc(s[j]) != ascii_lc(t[j])
  35                         -> false
  36                 ;;
  37         ;;
  38
  39         -> true
  40 }
  41
  42 const atox = { b : byte
  43         if b >= ('0' : byte) && b <= ('9' : byte)
  44                 -> `std.Ok (b - ('0' : byte))
  45         ;;
  46         if b >= ('A' : byte) && b <= ('F' : byte)
  47                 -> `std.Ok (b - ('A' : byte) + 0xA)
  48         ;;
  49         if b >= ('a' : byte) && b <= ('f' : byte)
  50                 -> `std.Ok (b - ('a' : byte) + 0xA)
  51         ;;
  52
  53         -> `std.Err void
  54 }
  55
  56 const decode_q = { s : byte[:]
  57         var dec : std.strbuf# = std.mksb()
  58         for var j : std.size = 0; j < s.len; ++j
  59                 match (s[j] : char)
  60                 | '?': goto err
  61                 | ' ': goto err
  62                 | '\t': goto err
  63                 | '_': std.sbputb(dec, (' ' : byte))
  64                 | '=':
  65                         if j + 2 >= s.len
  66                                 goto err
  67                         ;;
  68
  69                         var b : byte = 0
  70                         match atox(s[j+1])
  71                         | `std.Ok n: b += (16 * n)
  72                         | `std.Err void: goto err
  73                         ;;
  74                         match atox(s[j+2])
  75                         | `std.Ok n: b += n
  76                         | `std.Err void: goto err
  77                         ;;
  78                         std.sbputb(dec, b)
  79                         j = j + 2
  80                 | _: std.sbputb(dec, s[j])
  81                 ;;
  82         ;;
  83
  84         -> `std.Ok std.sbfin(dec)
  85
  86 :err
  87         std.sbfree(dec)
  88         -> `std.Err void
  89 }
  90
  91 const token_safe_byte = { b : byte
  92         /*
  93          * Any CHAR except SPACE, CTLs, and especials
  94          *
  95          * especials = "(" / ")" / "<" / ">" / "@" / "," / ";" /
  96          * ":" / " <"> / "/" / "[" / "]" / "?" / "." / "="
  97          */
  98
  99         if b <= (' ' : byte) || b > ('~' : byte)
 100                 -> false
 101         ;;
 102         match (b : char)
 103         | '(': -> false
 104         | ')': -> false
 105         | '<': -> false
 106         | '>': -> false
 107         | '@': -> false
 108         | ',': -> false
 109         | ';': -> false
 110         | ':': -> false
 111         | '"': -> false
 112         | '/': -> false
 113         | '[': -> false
 114         | ']': -> false
 115         | '?': -> false
 116         | '.': -> false
 117         | '=': -> false
 118         | _: -> true
 119         ;;
 120 }
 121
 122 const text_safe_byte = { b : byte
 123         /*
 124          * 1*<Any printable ASCII character other than "?"
 125          *    or SPACE>
 126          * ; (but see "Use of encoded-words in message
 127          * ; headers", section 5)
 128          *
 129          * Section 5 adds context to where encoded-words are allowed.
 130          * We over-enforce that in the state machine, so we don't
 131          * have to worry about it here.  I think.
 132          */
 133         -> b > 0x20 && b < 0x7f && (b : char) != '?'
 134 }
 135
 136 const utf8_from_encodedword = { encoded : byte[:]
 137         var decoded : std.strbuf# = std.mksb()
 138         var charset : byte[:] = [][:]
 139         var encoding : byte[:] = [][:]
 140         var s : rfc2047_state = `Boring_ASCII
 141         var b : byte = 0
 142
 143         var last_was_whitespace : bool = true
 144         var this_word_start : std.size = 0
 145         var cs_start : std.size = 0
 146         var e_start : std.size = 0
 147         var text_start : std.size = 0
 148         var decode_word : (s : byte[:] -> std.result(byte[:], void))
 149
 150         var err : std.strbuf# = std.mksb()
 151
 152         for var j : std.size = 0; j < encoded.len; ++j
 153                 b = encoded[j]
 154                 match s
 155                 | `Boring_ASCII:
 156                         var c : char = (b : char)
 157                         match c
 158                         | '=':
 159                                 if !last_was_whitespace
 160                                         goto not_encodedword
 161                                 ;;
 162                                 s = `Saw_Opening_Equals
 163                         | _:
 164                                 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
 165                                         if last_was_whitespace
 166                                                 continue
 167                                         ;;
 168                                         std.sbputb(decoded, 0x20)
 169                                         last_was_whitespace = true
 170                                 else
 171                                         std.sbputb(decoded, b)
 172                                         last_was_whitespace = false
 173                                 ;;
 174                         ;;
 175                 | `Just_Finished_Text:
 176                         /*
 177                          * RFC 2047, section 5 requires that in
 178                          * some contexts, encoded text be separated
 179                          * from surrounding ASCII by linear whitespace.
 180                          * That "in some contexts" is hard, so we
 181                          * enforce it everywhere. This state is for
 182                          * requiring that, right after the ?=, we
 183                          * do not have a non-whitespace character
 184                          */
 185                         var c : char = (b : char)
 186                         if c != ' ' && c != '\t' && c != '\r' && c != '\n'
 187                                 goto not_encodedword
 188                         ;;
 189                         if c == '\r'
 190                                 continue
 191                         elif c == '\n' && j < encoded.len - 1
 192                                 /*
 193                                  * Between =?...?= atoms, "\n "
 194                                  * should be discarded, not folded
 195                                  * to " "
 196                                  */
 197                                 if encoded[j + 1] == 0x20 || encoded[j + 1] == 0x09
 198                                         j = j + 1
 199                                         s = `Boring_ASCII
 200                                         last_was_whitespace = true
 201                                         continue
 202                                 ;;
 203                         ;;
 204                         std.sbputb(decoded, b)
 205                         last_was_whitespace = true
 206                         s = `Boring_ASCII
 207                 | `Saw_Opening_Equals:
 208                         this_word_start = j
 209                         match (b : char)
 210                         | '?':
 211                                 s = `Reading_Charset
 212                                 if j + 1 >= encoded.len
 213                                         goto not_encodedword
 214                                 ;;
 215                                 cs_start = j + 1
 216                         | _:
 217                                 goto not_encodedword
 218                         ;;
 219                 | `Reading_Charset:
 220                         if j - this_word_start > 75
 221                                 goto not_encodedword
 222                         ;;
 223                         match (b : char)
 224                         | '?':
 225                                 s = `Reading_Encoding
 226                                 charset = encoded[cs_start:j]
 227                                 if j + 1 >= encoded.len
 228                                         goto not_encodedword
 229                                 ;;
 230                                 e_start = j + 1
 231                         | _:
 232                                 if !token_safe_byte(b)
 233                                         goto not_encodedword
 234                                 ;;
 235                         ;;
 236                 | `Reading_Encoding:
 237                         if j - this_word_start > 75
 238                                 goto not_encodedword
 239                         ;;
 240                         match (b : char)
 241                         | '?':
 242                                 encoding = encoded[e_start:j]
 243                                 /*
 244                                    TODO: RFC 2231 means we should
 245                                    strip trailing *FOO from this.
 246                                  */
 247                                 if !caseieq(charset, "utf-8")
 248                                         goto unimplemented
 249                                 ;;
 250
 251                                 if caseieq(encoding, "q")
 252                                         decode_word = decode_q
 253                                 elif caseieq(encoding, "b")
 254                                         decode_word = utf8_from_base64
 255                                 else
 256                                         goto unimplemented
 257                                 ;;
 258                                 if j + 0 >= encoded.len
 259                                         goto not_encodedword
 260                                 ;;
 261                                 text_start = j + 1
 262                                 s = `In_Text
 263                         | _:
 264                                 if !token_safe_byte(b)
 265                                         goto not_encodedword
 266                                 ;;
 267                         ;;
 268                 | `In_Text:
 269                         if j - this_word_start > 75
 270                                 goto not_encodedword
 271                         ;;
 272                         match (b : char)
 273                         | '?':
 274                                 match decode_word(encoded[text_start:j])
 275                                 | `std.Ok dec:
 276                                         std.sbputs(decoded, dec)
 277                                         std.slfree(dec)
 278                                 | `std.Err void:
 279                                         goto not_encodedword
 280                                 ;;
 281                                 s = `Saw_Closing_Question_Mark
 282                         | _:
 283                                 if !text_safe_byte(b)
 284                                         goto not_encodedword
 285                                 ;;
 286                         ;;
 287                 | `Saw_Closing_Question_Mark:
 288                         if j - this_word_start > 75
 289                                 goto not_encodedword
 290                         ;;
 291                         match (b : char)
 292                         | '=':
 293                                 s = `Just_Finished_Text
 294                         | _:
 295                                 goto not_encodedword
 296                         ;;
 297                 ;;
 298         ;;
 299
 300         /*
 301          * Because Q and B encoding work on a byte-level, there's
 302          * a chance that what we have isn't valid UTF-8. That would
 303          * be a shame.
 304          *
 305          * TODO: we die if tabs are in the subject here. Is that bad?
 306          */
 307         if !util.non_ctrl_utf8(std.sbpeek(decoded))
 308                 goto not_encodedword
 309         ;;
 310
 311         match s
 312         | `Boring_ASCII: goto done
 313         | `Just_Finished_Text: goto done
 314         | _: goto not_encodedword
 315         ;;
 316
 317 :unimplemented
 318 :not_encodedword
 319         std.sbfree(decoded)
 320         -> only_fold_whitespace(encoded)
 321
 322 :done
 323         -> std.sbfin(decoded)
 324 }
 325
 326 const only_fold_whitespace = { s : byte[:]
 327         var r : std.strbuf# = std.mksb()
 328         var last_was_whitespace : bool = false
 329         for b : s
 330                 var c : char = (b : char)
 331                 if c == '\n' || c == '\r' || c == '\t' || c == ' '
 332                         if last_was_whitespace
 333                                 continue
 334                         ;;
 335
 336                         std.sbputc(r, ' ')
 337                         last_was_whitespace = true
 338                 else
 339                         std.sbputb(r, b)
 340                         last_was_whitespace = false
 341                 ;;
 342         ;;
 343
 344         -> std.sbfin(r)
 345 }