message.myr

   1 use std
   2 use bio
   3 use date
   4
   5 use "rfc2047"
   6 use "util"
   7
   8 /*
   9    This is based on RFC 5322 (for handling headers) and RFCs 2045,
  10    2046 (for the structure of MIME). The parameters are decoded as
  11    per RFC 2231.
  12  */
  13
  14 pkg mime =
  15         type message = struct
  16                 /*
  17                    Master raw data; all sub-objects are slices of
  18                    this. The goal is that, to free a message, freeing
  19                    raw should deal with all the contents
  20                  */
  21                 raw : byte[:]
  22                 contents : entity
  23         ;;
  24
  25         type mimebody = union
  26                 /* Perhaps plaintext, or base64 encoded application/pdf */
  27                 `Single byte[:]
  28
  29                 /* Text and attachment, plain/HTML, &c */
  30                 `Multipart entity[:]
  31         ;;
  32
  33         type entity = struct
  34                 /* A slice of a message.raw somewhere up above */
  35                 raw : byte[:]
  36
  37                 /* NOT sourced from raw, must be freed separately */
  38                 headers : std.htab(byte[:], byte[:][:])#
  39
  40                 /* Potentially recursive definition */
  41                 body : mimebody
  42
  43                 /*
  44                    Perhaps 'filename' from Content-Disposition, or
  45                    'name' from Content-Type. Must be freed separately.
  46                  */
  47                 name : std.option(byte[:])
  48
  49                 /*
  50                    Memory for the following resides in the headers'
  51                    values, or else is static (so don't free it).
  52                  */
  53
  54                 /* Only the "type/subtype" bit */
  55                 contenttype : byte[:]
  56         ;;
  57
  58         /*
  59            Given raw, assume it represents a message, or the beginning
  60            of one, and extract the headers into key/value.
  61
  62            The values of the output are multiple in case a message
  63            has duplicate fields (e.g. "Subject: foo" and "Subject:
  64            bar"). Behavior is unspecified (RFC 2822, 4.5).
  65          */
  66         const get_headers : (raw : byte[:] -> std.result(std.htab(byte[:], byte[:][:])#, byte[:]))
  67
  68         /*
  69            Given raw, like "foo: bar; a=\"b\"; c*=utf-8''whatever",
  70            parse the params as per RFC 2231
  71          */
  72         const get_params : (raw : byte[:] -> std.result(std.htab(byte[:], byte[:])#, byte[:]))
  73
  74         /*
  75            Given raw, parse it into a MIME message. This takes
  76            ownership of raw if successful.
  77          */
  78         const get_message : (raw : byte[:] -> std.result(message#, byte[:]))
  79
  80         /*
  81            Free all memory used by m, including the raw data used
  82            to construct it.
  83          */
  84         const free_message : (m : message# -> void)
  85
  86         /* Parse the God-forsaken date format */
  87         const parse_rfc5322_date : (s : byte[:] -> std.result(date.instant, byte[:]))
  88 ;;
  89
  90 const get_headers = {raw : byte[:]
  91         var h : std.htab(byte[:], byte[:][:])# = std.mkht()
  92         var err : std.strbuf# = std.mksb()
  93         var cline : std.strbuf# = std.mksb()
  94         var split : byte[:][:] = std.strsplit(raw, "\r\n")
  95         for l : split
  96                 if l.len > 0 && is_WSP(l[0])
  97                         /*
  98                            This is a wrapped header, but we can't
  99                            unwrap it now, because it might be a RFC
 100                            2047 encodedword.  For those, CRLF+' '
 101                            should be deleted, but for bog standard
 102                            RFC 5322, CRLF+' ' should be treated as
 103                            ' '.
 104                          */
 105                         std.sbfmt(cline, "\n{}", l)
 106                 elif l.len == 0
 107                         if cline.len > 0
 108                                 match handle_header(h, std.sbpeek(cline))
 109                                 | `std.Ok void:
 110                                 | `std.Err e:
 111                                         std.sbfmt(err, "malformed headers: {}", e)
 112                                         std.slfree(e)
 113                                         break
 114                                 ;;
 115                         ;;
 116
 117                         /* Done with headers */
 118                         std.sbtrim(cline, 0)
 119                         break
 120                 elif l.len > 0 && all_WSP(l)
 121                         std.sbfmt(err, "all-whitespace line in headers")
 122                         break
 123                 else
 124                         if std.sbpeek(cline).len > 0
 125                                 match handle_header(h, std.sbpeek(cline))
 126                                 | `std.Ok void:
 127                                 | `std.Err e:
 128                                         std.sbfmt(err, "malformed headers: {}", e)
 129                                         std.slfree(e)
 130                                         break
 131                                 ;;
 132                         ;;
 133
 134                         std.sbtrim(cline, 0)
 135                         std.sbfmt(cline, "{}", l)
 136                 ;;
 137         ;;
 138         std.slfree(split)
 139         split = [][:]
 140
 141         /*
 142            Perhaps cline just contains whitespace right now --
 143            nothing useful. This is probably the case if we're parsing
 144            a full message, because we hit the "Done with headers"
 145            case above. Perhaps not, though.
 146          */
 147         if !all_WSP(std.sbpeek(cline))
 148                 match handle_header(h, std.sbpeek(cline))
 149                 | `std.Ok void:
 150                 | `std.Err e:
 151                         std.sbfmt(err, "malformed headers: {}", e)
 152                         std.slfree(e)
 153                 ;;
 154         ;;
 155         std.sbfree(cline)
 156
 157         /*
 158            Now we've slurped all the raw data in, but perhaps it
 159            needs to be decoded.
 160          */
 161         decode_all(h)
 162
 163 :done
 164         match err.len
 165         | 0:
 166                 std.sbfree(err)
 167                 -> `std.Ok (h)
 168         | _:
 169                 free_all_headers(h)
 170                 std.htfree(h)
 171
 172                 -> `std.Err std.sbfin(err)
 173         ;;
 174 }
 175
 176 const is_WSP_or_nl = { b : byte
 177         var c : uint8 = (b : uint8)
 178         -> c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20
 179 }
 180
 181 const is_WSP = { b : byte
 182         var c : uint8 = (b : uint8)
 183         -> c == 0x09 || c == 0x20
 184 }
 185
 186 const all_WSP = { l : byte[:]
 187         for b : l
 188                 if (b : uint8) != 0x20 && (b : uint8) != 0x09
 189                         -> false
 190                 ;;
 191         ;;
 192
 193         -> true
 194 }
 195
 196 const handle_header = { h : std.htab(byte[:], byte[:][:])#, l : byte[:]
 197         var err : std.strbuf# = std.mksb()
 198
 199         match std.strfind(l, ":")
 200         | `std.None:
 201                 std.sbfmt(err, "\"{}\" has no ':'", l)
 202                 -> `std.Err std.sbfin(err)
 203         | `std.Some j:
 204                 var k : byte[:] = str_to_lower(l[0:j])
 205                 var v : byte[:] = [][:]
 206
 207                 for j++; j < l.len; j++
 208                         if !is_WSP(l[j])
 209                                 v = std.sldup(l[j:])
 210                                 break
 211                         ;;
 212                 ;;
 213
 214                 match std.htget(h, k)
 215                 | `std.None:
 216                         std.htput(h, k, std.sldup([ v ][:]))
 217                 | `std.Some t:
 218                         std.slpush(&t, v)
 219                         std.htput(h, k, t)
 220                 ;;
 221         ;;
 222
 223         -> `std.Ok void
 224 }
 225
 226 /*
 227    The XXX_all functions are workarounds for a current awkwardness:
 228    generic functions can't use generic iterators that don't use the
 229    same genericity that the enclosing function uses. If you're
 230    reading this, Ori has probably fixed it by now.
 231  */
 232 const decode_all = {h : std.htab(byte[:], byte[:][:])#
 233         /*
 234            This might need to become more complicated; encodedwords
 235            are only allowed to appear in certain locations.
 236          */
 237         for (k, v) : std.byhtkeyvals(h)
 238                 for var j = 0; j < v.len; ++j
 239                         var new = utf8_from_encodedword(v[j])
 240                         std.slfree(v[j])
 241                         v[j] = new
 242                 ;;
 243         ;;
 244 }
 245
 246 const free_all_headers = {h : std.htab(byte[:], byte[:][:])#
 247         for (k, v) : std.byhtkeyvals(h)
 248                 for vv : v
 249                         std.slfree(vv)
 250                 ;;
 251                 std.slfree(k)
 252         ;;
 253 }
 254
 255 const free_all_params = {h : std.htab(byte[:], byte[:])#
 256         for (k, v) : std.byhtkeyvals(h)
 257                 std.slfree(v)
 258                 std.slfree(k)
 259         ;;
 260 }
 261
 262 /* */
 263 const parse_rfc5322_date = { s : byte[:]
 264         /* First, fold whitespace */
 265         var s2 : byte[:] = std.slalloc(0)
 266         var last_was_ws : bool = false
 267         var free_this : byte[:] = s2
 268         for b : s
 269                 if b < 0x20 || b > 0x7e
 270                         std.slfree(free_this)
 271                         -> `std.Err std.fmt("unparsable date \"{}\"", s)
 272                 ;;
 273
 274                 match (b : char)
 275                 | ' ':
 276                         if last_was_ws
 277                                 continue
 278                         ;;
 279                         std.slpush(&s2, (' ' : byte))
 280                         last_was_ws = true
 281                 | '\t':
 282                         if last_was_ws
 283                                 continue
 284                         ;;
 285                         std.slpush(&s2, (' ' : byte))
 286                         last_was_ws = true
 287                 | '\r':
 288                         if last_was_ws
 289                                 continue
 290                         ;;
 291                         std.slpush(&s2, (' ' : byte))
 292                         last_was_ws = true
 293                 | '\n':
 294                         if last_was_ws
 295                                 continue
 296                         ;;
 297                         std.slpush(&s2, (' ' : byte))
 298                         last_was_ws = true
 299                 | _:
 300                         std.slpush(&s2, b)
 301                         last_was_ws = false
 302                 ;;
 303         ;;
 304         free_this = s2
 305
 306         /* skip [ day-of-week "," ] */
 307         if s2.len > 5 && s2[3] == (',' : byte)
 308                 s2 = s2[5:]
 309         ;;
 310
 311         /* try with seconds */
 312         match date.parsefmt("%e %b %Y %H:%M:%S %z", s2)
 313         | `std.Err e:
 314         | `std.Ok i:
 315                 std.slfree(free_this)
 316                 -> `std.Ok i
 317         ;;
 318
 319         /* try without seconds */
 320         match date.parsefmt("%e %b %Y %H:%M %z", s2)
 321         | `std.Err e:
 322         | `std.Ok i:
 323                 std.slfree(free_this)
 324                 -> `std.Ok i
 325         ;;
 326
 327         /* perhaps the time zone is in obsolete format? */
 328         if s2.len > 3
 329                 for (a, b) : [
 330                         ("EDT", "-0400"),
 331                         ("EST", "-0500"),
 332                         ("CDT", "-0500"),
 333                         ("CST", "-0600"),
 334                         ("MDT", "-0600"),
 335                         ("MST", "-0700"),
 336                         ("PDT", "-0700"),
 337                         ("PST", "-0800"),
 338                 ][:]
 339                         if std.eq(a, s2[s2.len - a.len:])
 340                                 var t = std.fmt("{} {}", s2[0:s2.len - a.len], b)
 341                                 match date.parsefmt("%e %b %Y %H:%M:%S %Z", t)
 342                                 | `std.Err e: std.slfree(t)
 343                                 | `std.Ok i:
 344                                         std.slfree(t)
 345                                         std.slfree(free_this)
 346                                         -> `std.Ok i
 347                                 ;;
 348                         ;;
 349                 ;;
 350         ;;
 351
 352         std.slfree(free_this)
 353         -> `std.Err std.fmt("unparsable date \"{}\"", s)
 354 }
 355
 356 const str_to_lower = { s : byte[:]
 357         var l : std.strbuf# = std.mksb()
 358         for b : s
 359                 var c : char = (b : char)
 360                 if c >= 'A' && c <= 'Z'
 361                         std.sbputb(l, b - ('A' : byte) + ('a' : byte))
 362                 else
 363                         std.sbputb(l, b)
 364                 ;;
 365         ;;
 366
 367         -> std.sbfin(l)
 368 }
 369
 370 /* */
 371 const get_message = {raw : byte[:]
 372         var err : std.strbuf# = std.mksb()
 373
 374         var ent : entity
 375         match get_entity(raw, false)
 376         | `std.Ok e: ent = e
 377         | `std.Err e:
 378                 std.sbfmt(err, "{}", e)
 379                 std.slfree(e)
 380                 goto done
 381         ;;
 382
 383 :done
 384         match err.len
 385         | 0:
 386                 std.sbfree(err)
 387                 -> `std.Ok std.mk([.raw = raw, .contents = ent])
 388         | _: -> `std.Err std.sbfin(err)
 389         ;;
 390 }
 391
 392 const get_entity = {raw : byte[:], in_digest : bool
 393         var err : std.strbuf# = std.mksb()
 394         var body : byte[:] = raw
 395         var boundary : byte[:] = [][:]
 396         var params : std.htab(byte[:], byte[:])# = std.mkht()
 397         var result : entity = [
 398                 .raw = [][:],
 399                 .headers = std.mkht(),
 400                 .body = `Single [][:],
 401                 .name = `std.None,
 402                 .contenttype = [][:],
 403         ]
 404         var child_in_digest : bool = false
 405         var children : entity[:] = [][:]
 406
 407         result.raw = raw
 408
 409         /* First, figure out the headers for the whole message. */
 410         match get_headers(raw)
 411         | `std.Ok h:
 412                 std.htfree(result.headers)
 413                 result.headers = h
 414         | `std.Err e:
 415                 std.sbfmt(err, "malformed message: {}", e)
 416                 goto done
 417         ;;
 418
 419         /*
 420            Special case: the content type will tell us whether to
 421            expect something mixed or not, and perhaps a nice name
 422          */
 423         match std.htget(result.headers, "content-type")
 424         | `std.None:
 425                 /*
 426                    Default as per RFC 2045, section 5.2, but also
 427                    RFC 2046, section 5.1.1
 428                  */
 429                 if in_digest
 430                         result.contenttype = "message/rfc822"
 431                 else
 432                         result.contenttype = "text/plain"
 433                 ;;
 434         | `std.Some s:
 435                 /* s[0] is something like "foo/bar baz=quux; charset=utf-13" */
 436                 result.contenttype = s[0]
 437                 for var j = 0; j < result.contenttype.len; ++j
 438                         if is_WSP(result.contenttype[j]) || result.contenttype[j] == (';' : byte)
 439                                 result.contenttype = result.contenttype[0:j]
 440                                 break
 441                         ;;
 442                 ;;
 443
 444                 if std.eq(result.contenttype, "multipart/digest")
 445                         child_in_digest = true
 446                 ;;
 447
 448                 match get_params(s[0])
 449                 | `std.Err e:
 450                         std.sbfmt(err, "bad header “{}”: {}", s[0], e)
 451                         std.slfree(e)
 452                         goto done
 453                 | `std.Ok h:
 454                         std.htfree(params)
 455                         params = h
 456                         match std.htget(params, "name")
 457                         | `std.None:
 458                         | `std.Some n: result.name = `std.Some std.sldup(n)
 459                         ;;
 460                 ;;
 461         ;;
 462
 463         /*
 464            Special case: the content-disposition may have a filename,
 465            which is even better than a name
 466          */
 467         match std.htget(result.headers, "content-disposition")
 468         | `std.None:
 469         | `std.Some s:
 470                 match get_params(s[0])
 471                 | `std.Err e:
 472                         std.sbfmt(err, "bad header “{}”: {}", s[0], e)
 473                         std.slfree(e)
 474                         goto done
 475                 | `std.Ok h:
 476                         match std.htget(h, "filename")
 477                         | `std.None:
 478                         | `std.Some new_name:
 479                                 match result.name
 480                                 | `std.None:
 481                                 | `std.Some old_name: std.slfree(old_name)
 482                                 ;;
 483                                 result.name = `std.Some std.sldup(new_name)
 484                         ;;
 485                         free_all_params(h)
 486                         std.htfree(h)
 487                 ;;
 488         ;;
 489
 490         /*
 491            TODO: this only really works if content-transer-encoding
 492            is inline. Fix that.
 493          */
 494
 495         /* The header bit ends at the first CRLFCRLF */
 496         match std.strfind(raw, "\r\n\r\n")
 497         | `std.None: body = [][:]
 498         | `std.Some j: body = raw[j + 4:]
 499         ;;
 500
 501         if startswith(result.contenttype, "multipart/")
 502                 match std.htget(params, "boundary")
 503                 | `std.None:
 504                         std.sbfmt(err, "multipart type, but no boundary")
 505                         goto done
 506                 | `std.Some b:
 507                         boundary = std.fmt("\r\n--{}", b)
 508                         var start : std.size = 0
 509                         var end : std.size = 0
 510
 511                         /* Skip preamble; see RFC 2046 section 5.1.1 */
 512                         match std.strfind(body, boundary)
 513                         | `std.None:
 514                                 std.sbfmt(err, "multipart type, boundary not present")
 515                                 goto done
 516                         | `std.Some j: start = j
 517                         ;;
 518
 519                         /* Loop through all sub-parts (RFC 2046 for all this) */
 520                         while true
 521                                 /*
 522                                    We have found a boundary: it's
 523                                    at body[start]. We want to jump
 524                                    to the end of the boundary, then
 525                                    eat all linear whitespace. If
 526                                    it is followed by CRLF, then we
 527                                    start a new segment. If it is
 528                                    followed by "--", we're done
 529                                    with the whole thing. Otherwise,
 530                                    error (the boundary delimiter
 531                                    has appeared in the body).
 532                                  */
 533                                 start = start + boundary.len
 534                                 while start < body.len && is_WSP(body[start])
 535                                         start++
 536                                 ;;
 537
 538                                 if start + 2 > body.len
 539                                         std.sbfmt(err, "multipart boundary ends abruptly")
 540                                         goto done
 541                                 ;;
 542
 543                                 match ((body[start] : char), (body[start+1] : char))
 544                                 | ('-', '-'):
 545                                         /*
 546                                            This is the distinguished
 547                                            delimiter. We're done.
 548                                          */
 549                                         break
 550                                 | ('\r', '\n'):
 551                                         /* There is more to come. */
 552                                         start = start + 2
 553                                         match std.strfind(body[start:], boundary)
 554                                         | `std.None:
 555                                                 std.sbfmt(err, "unterminated multipart")
 556                                                 goto done
 557                                         | `std.Some j: end = start + j
 558                                         ;;
 559
 560                                         /* Now body[start:end] is something worthy of parsing */
 561                                         match get_entity(body[start:end], child_in_digest)
 562                                         | `std.Ok ent:
 563                                                 std.slpush(&children, ent)
 564                                                 start = end
 565                                         | `std.Err e:
 566                                                 std.sbfmt(err, "malformed body part: {}", e)
 567                                                 std.slfree(e)
 568                                                 goto done
 569                                         ;;
 570                                 | (_, _):
 571                                         std.sbfmt(err, "multipart boundary has appeared in body")
 572                                         goto done
 573                                 ;;
 574                         ;;
 575                 ;;
 576                 result.body = `Multipart children
 577         elif startswith(result.contenttype, "message")
 578                 /*
 579                    Having never seen this in the wild, I'm not sure
 580                    how I want it handled. For now, let's just slurp
 581                    it raw.
 582                  */
 583                 result.body = `Single body
 584         else
 585                 result.body = `Single body
 586         ;;
 587
 588 :done
 589         std.slfree(boundary)
 590         boundary = [][:]
 591         free_all_params(params)
 592         std.htfree(params)
 593         match err.len
 594         | 0:
 595                 std.sbfree(err)
 596                 -> `std.Ok result
 597         | _:
 598                 free_all_headers(result.headers)
 599                 std.htfree(result.headers)
 600                 -> `std.Err std.sbfin(err)
 601         ;;
 602 }
 603
 604 const startswith = {s : byte[:], prefix : byte[:]
 605         if s.len < prefix.len
 606                 -> false
 607         ;;
 608
 609         -> std.eq(s[:prefix.len], prefix)
 610 }
 611
 612 type rfc2231_state = union
 613         `Just_saw_semicolon
 614         `Reading_attribute
 615         `Just_saw_asterisk
 616         `Reading_section
 617         `Just_saw_equals
 618         `Reading_boring_value
 619         `Reading_encoded_value
 620         `Reading_quoted_value
 621         `Finished_a_param
 622 ;;
 623
 624 /* */
 625 const get_params = {raw : byte[:]
 626         var err : std.strbuf# = std.mksb()
 627         var params : std.htab(byte[:], byte[:])# = std.mkht()
 628         var keys_with_continuations : byte[:][:] = [][:]
 629         var keys_needing_decoding : byte[:][:] = [][:]
 630         var j : std.size = 0
 631
 632         /*
 633            Our state machine isn't completely pure, we need a few
 634            variables to guide the transitions.
 635          */
 636         var state : rfc2231_state = `Just_saw_semicolon
 637         var is_sectioned : bool = false
 638         var is_initial_section : bool = false
 639         var is_extended : bool = false
 640         var attr_start : std.size = 0
 641         var attr : byte[:] = [][:]
 642         var attr_sans_asterisk : byte[:] = [][:]
 643         var section_start : std.size = 0
 644         var value_start : std.size = 0
 645         var quoted_buf : std.strbuf# = std.mksb()
 646
 647         match std.strfind(raw, ";")
 648         | `std.None:
 649                 raw = [][:]
 650                 goto done
 651         | `std.Some k: j = k
 652         ;;
 653
 654         /* Let's tack an extra ";" onto raw just to make cleaning out params easier */
 655         raw = std.fmt("{};", raw)
 656
 657         while j + 1 < raw.len
 658                 j++
 659                 var c : char = (raw[j] : char)
 660                 match state
 661                 | `Just_saw_semicolon:
 662                         if is_WSP_or_nl(raw[j])
 663                                 continue
 664                         elif is_attribute_char(raw[j])
 665                                 state = `Reading_attribute
 666                                 attr_start = j
 667                         else
 668                                 std.sbfmt(err, "illegal byte in attribute")
 669                                 break
 670                         ;;
 671                 | `Reading_attribute:
 672                         if c == '*'
 673                                 state = `Just_saw_asterisk
 674                                 attr_sans_asterisk = raw[attr_start:j]
 675                         elif c == '='
 676                                 attr = raw[attr_start:j]
 677                                 attr_sans_asterisk = raw[attr_start:j]
 678                                 state = `Just_saw_equals
 679                         elif is_attribute_char(raw[j])
 680                                 continue
 681                         else
 682                                 std.sbfmt(err, "illegal byte in attribute")
 683                                 break
 684                         ;;
 685                 | `Just_saw_asterisk:
 686                         if c == '='
 687                                 attr = raw[attr_start:j]
 688                                 is_extended = true
 689                                 state = `Just_saw_equals
 690                         elif raw[j] >= ('0' : byte) && raw[j] <= ('9' : byte)
 691                                 is_sectioned = true
 692                                 section_start = j
 693                                 state = `Reading_section
 694                         else
 695                                 std.sbfmt(err, "illegal byte in attribute after '*'")
 696                                 break
 697                         ;;
 698                 | `Reading_section:
 699                         if c == '='
 700                                 is_initial_section = std.eq(raw[section_start:j], "0")
 701                                 attr = raw[attr_start:j]
 702                                 state = `Just_saw_equals
 703                         elif raw[j] >= ('0' : byte) && raw[j] <= ('9' : byte)
 704                                 continue
 705                         else
 706                                 std.sbfmt(err, "illegal byte in attribute after '*'")
 707                                 break
 708                         ;;
 709                 | `Just_saw_equals:
 710                         if is_extended && (!is_sectioned || is_initial_section)
 711                                 match std.strfind(raw[j:], "'")
 712                                 | `std.None:
 713                                         std.sbfmt(err, "unterminated charset")
 714                                         break
 715                                 | `std.Some k:
 716                                         k += j
 717                                         if !std.eq(raw[j:k], "utf-8") && !std.eq(raw[j:k], "us-ascii")
 718                                                 std.sbfmt(err, "unsupported charset {}", raw[j:k])
 719                                                 break
 720                                         ;;
 721                                         j = k + 1
 722                                 ;;
 723
 724                                 match std.strfind(raw[j:], "'")
 725                                 | `std.None:
 726                                         std.sbfmt(err, "unterminated language")
 727                                         break
 728                                 | `std.Some k:
 729                                         /* Completely ignore language. */
 730                                         j = j + k
 731                                         state = `Reading_encoded_value
 732                                         value_start = j + 1
 733                                 ;;
 734
 735                         else
 736                                 if c == '"'
 737                                         state = `Reading_quoted_value
 738                                         value_start = j + 1
 739                                 else
 740                                         state = `Reading_boring_value
 741                                         value_start = j
 742                                 ;;
 743                         ;;
 744                 | `Reading_boring_value:
 745                         if c == ';'
 746                                 var klower : byte[:] = str_to_lower(attr)
 747                                 if std.hthas(params, klower)
 748                                         std.sbfmt(err, "duplicate attribute “{}”", klower)
 749                                         std.slfree(klower)
 750                                         break
 751                                 ;;
 752                                 std.htput(params, klower, std.sldup(raw[value_start:j]))
 753                                 if is_extended
 754                                         ensure_in(&keys_needing_decoding, attr)
 755                                 ;;
 756                                 if is_sectioned
 757                                         var q : byte[:] = [][:]
 758                                         if is_extended
 759                                                 q = std.fmt("{}*", attr_sans_asterisk)
 760                                         else
 761                                                 q = std.sldup(attr_sans_asterisk)
 762                                         ;;
 763                                         ensure_in(&keys_with_continuations, q)
 764                                         std.slfree(q)
 765                                 ;;
 766                                 state = `Finished_a_param
 767                                 j--
 768                         elif is_token_char(raw[j])
 769                                 continue
 770                         else
 771                                 std.sbfmt(err, "illegal character in param value")
 772                                 break
 773                         ;;
 774                 | `Reading_encoded_value:
 775                         if c == ';'
 776                                 var klower : byte[:] = str_to_lower(attr)
 777                                 if std.hthas(params, klower)
 778                                         std.sbfmt(err, "duplicate attribute “{}”", klower)
 779                                         std.slfree(klower)
 780                                         break
 781                                 ;;
 782                                 std.htput(params, klower, std.sldup(raw[value_start:j]))
 783                                 ensure_in(&keys_needing_decoding, attr)
 784                                 if is_sectioned
 785                                         var q : byte[:] = std.fmt("{}*", attr_sans_asterisk)
 786                                         ensure_in(&keys_with_continuations, q)
 787                                         std.slfree(q)
 788                                 ;;
 789                                 state = `Finished_a_param
 790                                 j--
 791                         elif c == '%'
 792                                 if j + 2 >= raw.len
 793                                         std.sbfmt(err, "extended octet ends prematurely")
 794                                         break
 795                                 ;;
 796
 797                                 if !is_octet_char(raw[j+1]) || !is_octet_char(raw[j+2])
 798                                         std.sbfmt(err, "illegal byte in extended octet")
 799                                         break
 800                                 ;;
 801                                 j = j + 2
 802                         elif is_attribute_char(raw[j])
 803                                 /*
 804                                    I find it odd that this is
 805                                    "attribute char" instead of
 806                                    "token". RFC 2231, section 7,
 807                                    "extended-other-values"
 808                                  */
 809                                 continue
 810                         else
 811                                 std.sbfmt(err, "illegal byte in extended parameter")
 812                                 break
 813                         ;;
 814                 | `Reading_quoted_value:
 815                         if c == '"'
 816                                 var klower : byte[:] = str_to_lower(attr)
 817                                 if std.hthas(params, klower)
 818                                         std.sbfmt(err, "duplicate attribute “{}”", klower)
 819                                         std.slfree(klower)
 820                                         break
 821                                 ;;
 822                                 std.htput(params, klower, std.sbfin(quoted_buf))
 823                                 quoted_buf = std.mksb()
 824                                 if is_extended
 825                                         ensure_in(&keys_needing_decoding, attr)
 826                                 ;;
 827                                 if is_sectioned
 828                                         var q : byte[:] = [][:]
 829                                         if is_extended
 830                                                 q = std.fmt("{}*", attr_sans_asterisk)
 831                                         else
 832                                                 q = std.sldup(attr_sans_asterisk)
 833                                         ;;
 834                                         ensure_in(&keys_with_continuations, q)
 835                                         std.slfree(q)
 836                                 ;;
 837                                 state = `Finished_a_param
 838                         elif c == '\\'
 839                                 if j + 1 >= raw.len
 840                                         std.sbfmt(err, "quoted pair ends abruptly")
 841                                         break
 842                                 ;;
 843                                 std.sbputb(quoted_buf, raw[j+1])
 844                                 j++
 845                         else
 846                                 std.sbputb(quoted_buf, raw[j])
 847                         ;;
 848                 | `Finished_a_param:
 849                         if c != ';'
 850                                 std.sbfmt(err, "expected ‘;’ after parameter")
 851                                 break
 852                         ;;
 853
 854                         /* Reset everything */
 855                         state = `Just_saw_semicolon
 856                         is_sectioned = false
 857                         is_initial_section = false
 858                         is_extended = false
 859                         attr_start = 0
 860                         attr = [][:]
 861                         attr_sans_asterisk = [][:]
 862                         section_start = 0
 863                         value_start = 0
 864                         std.sbtrim(quoted_buf, 0)
 865                 ;;
 866         ;;
 867
 868         if err.len > 0
 869                 goto done
 870         ;;
 871
 872         /*
 873            We now need to decode and join things a bit carefully.
 874
 875            First, Because params don't follow any order and only
 876            the *0 section carries decoding information, we needed
 877            to store them all before decoding any, and we want to
 878            join before decoding.
 879
 880            (This does not contradict the remarks of RFC 2231, section
 881            4, because concatenating quoted strings and encoded
 882            strings will produce a result that decodes correctly.)
 883
 884            Second, since the "*N" comes before the "*" in the param
 885            name, we have to be a bit awkward about joining.
 886          */
 887         for k : keys_with_continuations
 888                 var ksa : byte[:] = k
 889                 is_extended = false
 890                 if ksa[ksa.len - 1] == ('*' : byte)
 891                         ksa = ksa[:ksa.len - 1]
 892                         is_extended = true
 893                 ;;
 894
 895                 var n : int = 0
 896                 var sb : std.strbuf# = std.mksb()
 897                 while true
 898                         var k2 : byte[:] = [][:]
 899                         if is_extended
 900                                 k2 = std.fmt("{}*{}*", ksa, n)
 901                         else
 902                                 k2 = std.fmt("{}*{}", ksa, n)
 903                         ;;
 904
 905                         match std.htget(params, k2)
 906                         | `std.None:
 907                                 if std.hthas(params, k)
 908                                         std.sbfmt(err, "duplicate attribute “{}”", k)
 909                                         goto done
 910                                 ;;
 911                                 std.htput(params, k, std.sbfin(sb))
 912                                 break
 913                         | `std.Some s:
 914                                 std.sbfmt(sb, "{}", s)
 915
 916                         ;;
 917                 ;;
 918         ;;
 919         std.slfree(keys_with_continuations)
 920         keys_with_continuations = [][:]
 921
 922         /* Now we've joined everything, so we can decode it */
 923         for k : keys_needing_decoding
 924                 if k.len < 2 || k[k.len - 1] != ('*' : byte)
 925                         /* Impossible */
 926                         continue
 927                 ;;
 928
 929                 /* TODO: handle more than utf-8 here */
 930                 var val : byte[:] = [][:]
 931                 match std.htget(params, k)
 932                 | `std.None: continue
 933                 | `std.Some s:
 934                         match utf8_from_octet(s)
 935                         | `std.Ok u: val = u
 936                         | `std.Err void:
 937                                 std.sbfmt(err, "invalid utf-8 “{}”", s)
 938                                 goto done
 939                         ;;
 940                 ;;
 941
 942                 var ksa : byte[:] = str_to_lower(k[:k.len - 1])
 943                 if std.hthas(params, ksa)
 944                         std.sbfmt(err, "duplicate attribute “{}”", ksa)
 945                         goto done
 946                 ;;
 947                 std.htput(params, ksa, val)
 948         ;;
 949         std.slfree(keys_needing_decoding)
 950         keys_needing_decoding = [][:]
 951
 952         /* Now we've decoded everything, so we can remove all the intermediate keys */
 953         for (k, v) : std.byhtkeyvals(params)
 954                 match std.strfind(k, "*")
 955                 | `std.None:
 956                 | `std.Some _:
 957                         std.htdel(params, k)
 958                         std.slfree(k)
 959                         std.slfree(v)
 960                 ;;
 961         ;;
 962
 963 :done
 964         std.slfree(keys_with_continuations)
 965         std.slfree(keys_needing_decoding)
 966         std.sbfree(quoted_buf)
 967
 968         /* TODO: remove the slfill. It's just salting the earth to make sure I sldup()d things right */
 969         std.slfill(raw, ('Z' : byte))
 970         std.slfree(raw)
 971
 972         match err.len
 973         | 0:
 974                 std.sbfree(err)
 975                 -> `std.Ok params
 976         | _:
 977                 free_all_params(params)
 978                 std.htfree(params)
 979                 -> `std.Err std.sbfin(err)
 980         ;;
 981 }
 982
 983 const ensure_in = {list : byte[:][:]#, value : byte[:]
 984         var lc_val : byte[:] = str_to_lower(value)
 985         for v : list#
 986                 if std.eq(v, lc_val)
 987                         std.slfree(lc_val)
 988                         -> void
 989                 ;;
 990         ;;
 991
 992         std.slpush(list, lc_val)
 993 }
 994
 995 /* See RFC 2231, section 7, and RFC 2045 section 5.1 */
 996 const is_attribute_char = {b : byte
 997         /* CTRL, SPACE, and and non-US-ASCII */
 998         if b <= 0x20 || b > 0x7e
 999                 -> false
1000         ;;
1001
1002         /* ":" to "@" */
1003         if b >= 0x3a && b <= 0x40
1004                 -> false
1005         ;;
1006
1007         /* "[" to "]" */
1008         if b >= 0x5b && b <= 0x5d
1009                 -> false
1010         ;;
1011
1012         /* "'" to ")" */
1013         if b >= 0x27 && b <= 0x29
1014                 -> false
1015         ;;
1016
1017         /* "/" and "," */
1018         if b == 0x2f || b == 0x2c
1019                 -> false
1020         ;;
1021
1022         -> true
1023 }
1024
/*
   Is b acceptable inside a "token"? See RFC 2045, section 5.1: any
   US-ASCII character except controls, SPACE, and the tspecials
        ( ) < > @ , ; : \ " / [ ] ? =
   DEL and bytes above 0x7e are rejected as well.
 */
const is_token_char = {b : byte
        /* CTRL, SPACE, DEL, and non-US-ASCII */
        if b <= 0x20 || b > 0x7e
                -> false
        ;;

        /* The double quote is a tspecial; it may only open a quoted-string */
        if b == 0x22
                -> false
        ;;

        /* ":" to "@" */
        if b >= 0x3a && b <= 0x40
                -> false
        ;;

        /* "[" to "]" */
        if b >= 0x5b && b <= 0x5d
                -> false
        ;;

        /* "(" or ")" or "/" or "," */
        if b == 0x28 || b == 0x29 || b == 0x2f || b == 0x2c
                -> false
        ;;

        -> true
}
1048
/*
   Is b one of the hex digits allowed after '%' in an extended
   octet? Only '0'-'9' and upper case 'A'-'F' are accepted; lower
   case 'a'-'f' is not.
 */
const is_octet_char = {b : byte
        var decimal : bool = b >= ('0' : byte) && b <= ('9' : byte)
        var upper_hex : bool = b >= ('A' : byte) && b <= ('F' : byte)

        -> decimal || upper_hex
}
1062
/*
   Undo the "%XX" escaping of an extended parameter value.

   Every '%' must be followed by two hex digits ('0'-'9' or upper
   case 'A'-'F'), which are replaced by the byte they spell out; all
   other bytes are copied through untouched. The decoded bytes must
   then pass util.non_ctrl_utf8() (presumably: well-formed UTF-8
   without control characters -- confirm against util).

   Returns `std.Ok with a freshly allocated slice that the caller
   must free, or `std.Err void on a truncated or malformed escape or
   when the util check fails. The working buffer is released on
   every error path, so nothing is leaked in either case.
 */
const utf8_from_octet = {s : byte[:]
        var sb : std.strbuf# = std.mksb()
        for var j = 0; j < s.len; ++j
                match (s[j] : char)
                | '%':
                        var b : byte = 0

                        /* Both hex digits have to be inside s */
                        if j + 2 >= s.len
                                std.sbfree(sb)
                                -> `std.Err void
                        ;;

                        var b1 : byte = s[j + 1]
                        var b2 : byte = s[j + 2]

                        /* High nibble */
                        if b1 >= ('0' : byte) && b1 <= ('9' : byte)
                                b += (b1 - ('0' : byte)) * 0x10
                        elif b1 >= ('A' : byte) && b1 <= ('F' : byte)
                                b += (b1 - ('A' : byte) + 0x0a) * 0x10
                        else
                                std.sbfree(sb)
                                -> `std.Err void
                        ;;

                        /* Low nibble */
                        if b2 >= ('0' : byte) && b2 <= ('9' : byte)
                                b += (b2 - ('0' : byte))
                        elif b2 >= ('A' : byte) && b2 <= ('F' : byte)
                                b += (b2 - ('A' : byte) + 0x0a)
                        else
                                std.sbfree(sb)
                                -> `std.Err void
                        ;;
                        std.sbputb(sb, b)

                        /* Step over the two digits just consumed */
                        j = j + 2
                | _: std.sbputb(sb, s[j])
                ;;
        ;;

        if !util.non_ctrl_utf8(std.sbpeek(sb))
                std.sbfree(sb)
                -> `std.Err void
        ;;

        -> `std.Ok std.sbfin(sb)
}
1104
/*
   Release what the message behind m owns: first the separately
   allocated pieces of its entity tree, then the master raw buffer
   that the entities' byte slices point into. The message struct
   itself is not freed here.
 */
const free_message = {m : message#
        var root : entity = m.contents

        free_entity(root)
        std.slfree(m.raw)
}
1110
/*
   Free the separately allocated parts of an entity: its header
   table (keys, value lists, and each value), its name if it has
   one, and, for a multipart body, every sub-entity and the slice
   holding them.

   e.raw is deliberately left alone; it is a slice of the raw buffer
   of the message that contains this entity.
 */
const free_entity = {e : entity
        /* Header names and values were allocated on their own */
        for (hname, hvals) : std.byhtkeyvals(e.headers)
                for hv : hvals
                        std.slfree(hv)
                ;;
                std.slfree(hvals)
                std.slfree(hname)
        ;;
        std.htfree(e.headers)

        /* The name is an sldup()d copy, so it goes too */
        match e.name
        | `std.None:
        | `std.Some owned:
                std.slfree(owned)
                /* NOTE(review): e is a parameter here; presumably this only clears the local copy */
                e.name = `std.None
        ;;

        /* Recurse into the parts; a single body is a slice of raw */
        match e.body
        | `Single(_):
        | `Multipart(parts):
                for part : parts
                        free_entity(part)
                ;;
                std.slfree(parts)
        ;;
}