9 This is based on RFC 5322 (for handling headers) and RFCs 2045,
10 2046 (for the structure of MIME). The parameters are decoded as
17 Master raw data; all sub-objects are slices of
18 this. The goal is that, to free a message, freeing
19 raw should deal with all the contents
26 /* Perhaps plaintext, or base64 encoded application/pdf */
29 /* Text and attachment, plain/HTML, &c */
34 /* A slice of a message.raw somewhere up above */
37 /* NOT sourced from raw, must be freed separately */
38 headers : std.htab(byte[:], byte[:][:])#
40 /* Potentially recursive definition */
44 Perhaps 'filename' from Content-Disposition, or
45 'name' from Content-Type. Must be freed separately.
47 name : std.option(byte[:])
50 Memory for the following resides in the headers'
51 values, or else is static (so don't free it).
54 /* Only the "type/subtype" bit */
59 Given raw, assume it represents a message, or the beginning
60 of one, and extract the headers into key/value.
62 The values of the output are multiple in case a message
63 has duplicate fields (e.g. "Subject: foo" and "Subject:
64 bar"). Behavior is unspecified (RFC 2822, 4.5).
66 const get_headers : (raw : byte[:] -> std.result(std.htab(byte[:], byte[:][:])#, byte[:]))
69 Given raw, like "foo: bar; a=\"b\"; c*=utf-8''whatever",
70 parse the params as per RFC 2231
72 const get_params : (raw : byte[:] -> std.result(std.htab(byte[:], byte[:])#, byte[:]))
75 Given raw, parse it into a MIME message. This takes
76 ownership of raw if successful.
78 const get_message : (raw : byte[:] -> std.result(message#, byte[:]))
81 Free all memory used by m, including the raw data used
84 const free_message : (m : message# -> void)
86 /* Parse the God-forsaken date format */
87 const parse_rfc5322_date : (s : byte[:] -> std.result(date.instant, byte[:]))
90 const get_headers = {raw : byte[:]
91 var h : std.htab(byte[:], byte[:][:])# = std.mkht()
92 var err : std.strbuf# = std.mksb()
93 var cline : std.strbuf# = std.mksb()
94 var split : byte[:][:] = std.strsplit(raw, "\r\n")
96 if l.len > 0 && is_WSP(l[0])
98 This is a wrapped header, but we can't
99 unwrap it now, because it might be a RFC
100 2047 encodedword. For those, CRLF+' '
101 should be deleted, but for bog standard
102 RFC 5322, CRLF+' ' should be treated as
105 std.sbfmt(cline, "\n{}", l)
108 match handle_header(h, std.sbpeek(cline))
111 std.sbfmt(err, "malformed headers: {}", e)
117 /* Done with headers */
120 elif l.len > 0 && all_WSP(l)
121 std.sbfmt(err, "all-whitespace line in headers")
124 if std.sbpeek(cline).len > 0
125 match handle_header(h, std.sbpeek(cline))
128 std.sbfmt(err, "malformed headers: {}", e)
135 std.sbfmt(cline, "{}", l)
142 Perhaps cline just contains whitespace right now --
143 nothing useful. This is probably the case if we're parsing
144 a full message, because we hit the "Done with headers"
145 case above. Perhaps not, though.
147 if !all_WSP(std.sbpeek(cline))
148 match handle_header(h, std.sbpeek(cline))
151 std.sbfmt(err, "malformed headers: {}", e)
158 Now we've slurped all the raw data in, but perhaps it
172 -> `std.Err std.sbfin(err)
/* True when b is TAB (0x09), LF (0x0a), CR (0x0d), or SPACE (0x20). */
176 const is_WSP_or_nl = { b : byte
177 var c : uint8 = (b : uint8)
178 -> c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20
/* True when b is TAB (0x09) or SPACE (0x20); CR and LF do not count. */
181 const is_WSP = { b : byte
182 var c : uint8 = (b : uint8)
183 -> c == 0x09 || c == 0x20
186 const all_WSP = { l : byte[:]
188 if (b : uint8) != 0x20 && (b : uint8) != 0x09
196 const handle_header = { h : std.htab(byte[:], byte[:][:])#, l : byte[:]
197 var err : std.strbuf# = std.mksb()
199 match std.strfind(l, ":")
201 std.sbfmt(err, "\"{}\" has no ':'", l)
202 -> `std.Err std.sbfin(err)
204 var k : byte[:] = str_to_lower(l[0:j])
205 var v : byte[:] = [][:]
207 for j++; j < l.len; j++
214 match std.htget(h, k)
216 std.htput(h, k, std.sldup([ v ][:]))
227 The XXX_all functions are workarounds for a current awkwardness:
228 generic functions can't use generic iterators that don't use the
229 same genericity that the enclosing function uses. If you're
230 reading this, Ori has probably fixed it by now.
232 const decode_all = {h : std.htab(byte[:], byte[:][:])#
234 This might need to become more complicated; encodedwords
235 are only allowed to appear in certain locations.
237 for (k, v) : std.byhtkeyvals(h)
238 for var j = 0; j < v.len; ++j
239 var new = utf8_from_encodedword(v[j])
246 const free_all_headers = {h : std.htab(byte[:], byte[:][:])#
247 for (k, v) : std.byhtkeyvals(h)
255 const free_all_params = {h : std.htab(byte[:], byte[:])#
256 for (k, v) : std.byhtkeyvals(h)
263 const parse_rfc5322_date = { s : byte[:]
264 /* First, fold whitespace */
265 var s2 : byte[:] = std.slalloc(0)
266 var last_was_ws : bool = false
267 var free_this : byte[:] = s2
269 if b < 0x20 || b > 0x7e
270 std.slfree(free_this)
271 -> `std.Err std.fmt("unparsable date \"{}\"", s)
279 std.slpush(&s2, (' ' : byte))
285 std.slpush(&s2, (' ' : byte))
291 std.slpush(&s2, (' ' : byte))
297 std.slpush(&s2, (' ' : byte))
306 /* skip [ day-of-week "," ] */
307 if s2.len > 5 && s2[3] == (',' : byte)
311 /* try with seconds */
312 match date.parsefmt("%e %b %Y %H:%M:%S %z", s2)
315 std.slfree(free_this)
319 /* try without seconds */
320 match date.parsefmt("%e %b %Y %H:%M %z", s2)
323 std.slfree(free_this)
327 /* perhaps the time zone is in obsolete format? */
339 if std.eq(a, s2[s2.len - a.len:])
340 var t = std.fmt("{} {}", s2[0:s2.len - a.len], b)
341 match date.parsefmt("%e %b %Y %H:%M:%S %Z", t)
342 | `std.Err e: std.slfree(t)
345 std.slfree(free_this)
352 std.slfree(free_this)
353 -> `std.Err std.fmt("unparsable date \"{}\"", s)
356 const str_to_lower = { s : byte[:]
357 var l : std.strbuf# = std.mksb()
359 var c : char = (b : char)
360 if c >= 'A' && c <= 'Z'
361 std.sbputb(l, b - ('A' : byte) + ('a' : byte))
371 const get_message = {raw : byte[:]
372 var err : std.strbuf# = std.mksb()
375 match get_entity(raw, false)
378 std.sbfmt(err, "{}", e)
387 -> `std.Ok std.mk([.raw = raw, .contents = ent])
388 | _: -> `std.Err std.sbfin(err)
392 const get_entity = {raw : byte[:], in_digest : bool
393 var err : std.strbuf# = std.mksb()
394 var body : byte[:] = raw
395 var boundary : byte[:] = [][:]
396 var params : std.htab(byte[:], byte[:])# = std.mkht()
397 var result : entity = [
399 .headers = std.mkht(),
400 .body = `Single [][:],
402 .contenttype = [][:],
404 var child_in_digest : bool = false
405 var children : entity[:] = [][:]
409 /* First, figure out the headers for the whole message. */
410 match get_headers(raw)
412 std.htfree(result.headers)
415 std.sbfmt(err, "malformed message: {}", e)
420 Special case: the content type will tell us whether to
421 expect something mixed or not, and perhaps a nice name
423 match std.htget(result.headers, "content-type")
426 Default as per RFC 2045, section 5.2, but also
427 RFC 2046, section 5.1.1
430 result.contenttype = "message/rfc822"
432 result.contenttype = "text/plain"
435 /* s[0] is something like "foo/bar baz=quux; charset=utf-13" */
436 result.contenttype = s[0]
437 for var j = 0; j < result.contenttype.len; ++j
438 if is_WSP(result.contenttype[j]) || result.contenttype[j] == (';' : byte)
439 result.contenttype = result.contenttype[0:j]
444 if std.eq(result.contenttype, "multipart/digest")
445 child_in_digest = true
448 match get_params(s[0])
450 std.sbfmt(err, "bad header “{}”: {}", s[0], e)
456 match std.htget(params, "name")
458 | `std.Some n: result.name = `std.Some std.sldup(n)
464 Special case: the content-disposition may have a filename,
465 which is even better than a name
467 match std.htget(result.headers, "content-disposition")
470 match get_params(s[0])
472 std.sbfmt(err, "bad header “{}”: {}", s[0], e)
476 match std.htget(h, "filename")
478 | `std.Some new_name:
481 | `std.Some old_name: std.slfree(old_name)
483 result.name = `std.Some std.sldup(new_name)
491 TODO: this only really works if content-transfer-encoding
495 /* The header bit ends at the first CRLFCRLF */
496 match std.strfind(raw, "\r\n\r\n")
497 | `std.None: body = [][:]
498 | `std.Some j: body = raw[j + 4:]
501 if startswith(result.contenttype, "multipart/")
502 match std.htget(params, "boundary")
504 std.sbfmt(err, "multipart type, but no boundary")
507 boundary = std.fmt("\r\n--{}", b)
508 var start : std.size = 0
509 var end : std.size = 0
511 /* Skip preamble; see RFC 2046 section 5.1.1 */
512 match std.strfind(body, boundary)
514 std.sbfmt(err, "multipart type, boundary not present")
516 | `std.Some j: start = j
519 /* Loop through all sub-parts (RFC 2046 for all this) */
522 We have found a boundary: it's
523 at body[start]. We want to jump
524 to the end of the boundary, then
525 eat all linear whitespace. If
526 it is followed by CRLF, then we
527 start a new segment. If it is
528 followed by "--", we're done
529 with the whole thing. Otherwise,
530 error (the boundary delimiter
531 has appeared in the body).
533 start = start + boundary.len
534 while start < body.len && is_WSP(body[start])
538 if start + 2 > body.len
539 std.sbfmt(err, "multipart boundary ends abruptly")
543 match ((body[start] : char), (body[start+1] : char))
546 This is the distinguished
547 delimiter. We're done.
551 /* There is more to come. */
553 match std.strfind(body[start:], boundary)
555 std.sbfmt(err, "unterminated multipart")
557 | `std.Some j: end = start + j
560 /* Now body[start:end] is something worthy of parsing */
561 match get_entity(body[start:end], child_in_digest)
563 std.slpush(&children, ent)
566 std.sbfmt(err, "malformed body part: {}", e)
571 std.sbfmt(err, "multipart boundary has appeared in body")
576 result.body = `Multipart children
577 elif startswith(result.contenttype, "message")
579 Having never seen this in the wild, I'm not sure
580 how I want it handled. For now, let's just slurp
583 result.body = `Single body
585 result.body = `Single body
591 free_all_params(params)
598 free_all_headers(result.headers)
599 std.htfree(result.headers)
600 -> `std.Err std.sbfin(err)
604 const startswith = {s : byte[:], prefix : byte[:]
605 if s.len < prefix.len
609 -> std.eq(s[:prefix.len], prefix)
612 type rfc2231_state = union
618 `Reading_boring_value
619 `Reading_encoded_value
620 `Reading_quoted_value
625 const get_params = {raw : byte[:]
626 var err : std.strbuf# = std.mksb()
627 var params : std.htab(byte[:], byte[:])# = std.mkht()
628 var keys_with_continuations : byte[:][:] = [][:]
629 var keys_needing_decoding : byte[:][:] = [][:]
633 Our state machine isn't completely pure, we need a few
634 variables to guide the transitions.
636 var state : rfc2231_state = `Just_saw_semicolon
637 var is_sectioned : bool = false
638 var is_initial_section : bool = false
639 var is_extended : bool = false
640 var attr_start : std.size = 0
641 var attr : byte[:] = [][:]
642 var attr_sans_asterisk : byte[:] = [][:]
643 var section_start : std.size = 0
644 var value_start : std.size = 0
645 var quoted_buf : std.strbuf# = std.mksb()
647 match std.strfind(raw, ";")
654 /* Let's tack an extra ";" onto raw just to make cleaning out params easier */
655 raw = std.fmt("{};", raw)
657 while j + 1 < raw.len
659 var c : char = (raw[j] : char)
661 | `Just_saw_semicolon:
662 if is_WSP_or_nl(raw[j])
664 elif is_attribute_char(raw[j])
665 state = `Reading_attribute
668 std.sbfmt(err, "illegal byte in attribute")
671 | `Reading_attribute:
673 state = `Just_saw_asterisk
674 attr_sans_asterisk = raw[attr_start:j]
676 attr = raw[attr_start:j]
677 attr_sans_asterisk = raw[attr_start:j]
678 state = `Just_saw_equals
679 elif is_attribute_char(raw[j])
682 std.sbfmt(err, "illegal byte in attribute")
685 | `Just_saw_asterisk:
687 attr = raw[attr_start:j]
689 state = `Just_saw_equals
690 elif raw[j] >= ('0' : byte) && raw[j] <= ('9' : byte)
693 state = `Reading_section
695 std.sbfmt(err, "illegal byte in attribute after '*'")
700 is_initial_section = std.eq(raw[section_start:j], "0")
701 attr = raw[attr_start:j]
702 state = `Just_saw_equals
703 elif raw[j] >= ('0' : byte) && raw[j] <= ('9' : byte)
706 std.sbfmt(err, "illegal byte in attribute after '*'")
710 if is_extended && (!is_sectioned || is_initial_section)
711 match std.strfind(raw[j:], "'")
713 std.sbfmt(err, "unterminated charset")
717 if !std.eq(raw[j:k], "utf-8") && !std.eq(raw[j:k], "us-ascii")
718 std.sbfmt(err, "unsupported charset {}", raw[j:k])
724 match std.strfind(raw[j:], "'")
726 std.sbfmt(err, "unterminated language")
729 /* Completely ignore language. */
731 state = `Reading_encoded_value
737 state = `Reading_quoted_value
740 state = `Reading_boring_value
744 | `Reading_boring_value:
746 var klower : byte[:] = str_to_lower(attr)
747 if std.hthas(params, klower)
748 std.sbfmt(err, "duplicate attribute “{}”", klower)
752 std.htput(params, klower, std.sldup(raw[value_start:j]))
754 ensure_in(&keys_needing_decoding, attr)
757 var q : byte[:] = [][:]
759 q = std.fmt("{}*", attr_sans_asterisk)
761 q = std.sldup(attr_sans_asterisk)
763 ensure_in(&keys_with_continuations, q)
766 state = `Finished_a_param
768 elif is_token_char(raw[j])
771 std.sbfmt(err, "illegal character in param value")
774 | `Reading_encoded_value:
776 var klower : byte[:] = str_to_lower(attr)
777 if std.hthas(params, klower)
778 std.sbfmt(err, "duplicate attribute “{}”", klower)
782 std.htput(params, klower, std.sldup(raw[value_start:j]))
783 ensure_in(&keys_needing_decoding, attr)
785 var q : byte[:] = std.fmt("{}*", attr_sans_asterisk)
786 ensure_in(&keys_with_continuations, q)
789 state = `Finished_a_param
793 std.sbfmt(err, "extended octet ends prematurely")
797 if !is_octet_char(raw[j+1]) || !is_octet_char(raw[j+2])
798 std.sbfmt(err, "illegal byte in extended octet")
802 elif is_attribute_char(raw[j])
804 I find it odd that this is
805 "attribute char" instead of
806 "token". RFC 2231, section 7,
807 "extended-other-values"
811 std.sbfmt(err, "illegal byte in extended parameter")
814 | `Reading_quoted_value:
816 var klower : byte[:] = str_to_lower(attr)
817 if std.hthas(params, klower)
818 std.sbfmt(err, "duplicate attribute “{}”", klower)
822 std.htput(params, klower, std.sbfin(quoted_buf))
823 quoted_buf = std.mksb()
825 ensure_in(&keys_needing_decoding, attr)
828 var q : byte[:] = [][:]
830 q = std.fmt("{}*", attr_sans_asterisk)
832 q = std.sldup(attr_sans_asterisk)
834 ensure_in(&keys_with_continuations, q)
837 state = `Finished_a_param
840 std.sbfmt(err, "quoted pair ends abruptly")
843 std.sbputb(quoted_buf, raw[j+1])
846 std.sbputb(quoted_buf, raw[j])
850 std.sbfmt(err, "expected ‘;’ after parameter")
854 /* Reset everything */
855 state = `Just_saw_semicolon
857 is_initial_section = false
861 attr_sans_asterisk = [][:]
864 std.sbtrim(quoted_buf, 0)
873 We now need to decode and join things a bit carefully.
875 First, because params don't follow any order and only
876 the *0 section carries decoding information, we needed
877 to store them all before decoding any, and we want to
878 join before decoding.
880 (This does not contradict the remarks of RFC 2231, section
881 4, because concatenating quoted strings and encoded
882 strings will produce a result that decodes correctly.)
884 Second, since the "*N" comes before the "*" in the param
885 name, we have to be a bit awkward about joining.
887 for k : keys_with_continuations
888 var ksa : byte[:] = k
890 if ksa[ksa.len - 1] == ('*' : byte)
891 ksa = ksa[:ksa.len - 1]
896 var sb : std.strbuf# = std.mksb()
898 var k2 : byte[:] = [][:]
900 k2 = std.fmt("{}*{}*", ksa, n)
902 k2 = std.fmt("{}*{}", ksa, n)
905 match std.htget(params, k2)
907 if std.hthas(params, k)
908 std.sbfmt(err, "duplicate attribute “{}”", k)
911 std.htput(params, k, std.sbfin(sb))
914 std.sbfmt(sb, "{}", s)
919 std.slfree(keys_with_continuations)
920 keys_with_continuations = [][:]
922 /* Now we've joined everything, so we can decode it */
923 for k : keys_needing_decoding
924 if k.len < 2 || k[k.len - 1] != ('*' : byte)
929 /* TODO: handle more than utf-8 here */
930 var val : byte[:] = [][:]
931 match std.htget(params, k)
932 | `std.None: continue
934 match utf8_from_octet(s)
937 std.sbfmt(err, "invalid utf-8 “{}”", s)
942 var ksa : byte[:] = str_to_lower(k[:k.len - 1])
943 if std.hthas(params, ksa)
944 std.sbfmt(err, "duplicate attribute “{}”", ksa)
947 std.htput(params, ksa, val)
949 std.slfree(keys_needing_decoding)
950 keys_needing_decoding = [][:]
952 /* Now we've decoded everything, so we can remove all the intermediate keys */
953 for (k, v) : std.byhtkeyvals(params)
954 match std.strfind(k, "*")
964 std.slfree(keys_with_continuations)
965 std.slfree(keys_needing_decoding)
966 std.sbfree(quoted_buf)
968 /* TODO: remove the slfill. It's just salting the earth to make sure I sldup()d things right */
969 std.slfill(raw, ('Z' : byte))
977 free_all_params(params)
979 -> `std.Err std.sbfin(err)
983 const ensure_in = {list : byte[:][:]#, value : byte[:]
984 var lc_val : byte[:] = str_to_lower(value)
992 std.slpush(list, lc_val)
995 /* See RFC 2231, section 7, and RFC 2045 section 5.1 */
996 const is_attribute_char = {b : byte
997 /* CTRL, SPACE, and non-US-ASCII */
998 if b <= 0x20 || b > 0x7e
1003 if b >= 0x3a && b <= 0x40
1008 if b >= 0x5b && b <= 0x5d
1013 if b >= 0x27 && b <= 0x29
1018 if b == 0x2f || b == 0x2c
1025 const is_token_char = {b : byte
1026 /* CTRL, SPACE, and non-US-ASCII */
1027 if b <= 0x20 || b > 0x7e
1032 if b >= 0x3a && b <= 0x40
1037 if b >= 0x5b && b <= 0x5d
1041 /* "(" or ")" or "/" or "," */
1042 if b == 0x28 || b == 0x29 || b == 0x2f || b == 0x2c
1049 const is_octet_char = {b : byte
1051 if b >= 0x30 && b <= 0x39
1056 if b >= 0x41 && b <= 0x46
1063 const utf8_from_octet = {s : byte[:]
1064 var sb : std.strbuf# = std.mksb()
1065 for var j = 0; j < s.len; ++j
1074 var b1 : byte = s[j + 1]
1075 var b2 : byte = s[j + 2]
1077 if b1 >= ('0' : byte) && b1 <= ('9' : byte)
1078 b += (b1 - ('0' : byte)) * 0x10
1079 elif b1 >= ('A' : byte) && b1 <= ('F' : byte)
1080 b += (b1 - ('A' : byte) + 0x0a) * 0x10
1085 if b2 >= ('0' : byte) && b2 <= ('9' : byte)
1086 b += (b2 - ('0' : byte))
1087 elif b2 >= ('A' : byte) && b2 <= ('F' : byte)
1088 b += (b2 - ('A' : byte) + 0x0a)
1094 | _: std.sbputb(sb, s[j])
1098 if !util.non_ctrl_utf8(std.sbpeek(sb))
1102 -> `std.Ok std.sbfin(sb)
1106 const free_message = {m : message#
1107 free_entity(m.contents)
1111 const free_entity = {e : entity
1113 Don't free raw, it belongs to the message containing
1116 for (k, v) : std.byhtkeyvals(e.headers)
1123 std.htfree(e.headers)
1125 /* Name was sldup()d, must be freed */
1135 | `Single(_): /* No need, a subset of raw */