get_message() passes two tests
[libmime.git] / rfc2047.myr
blob299c5981df9fbf6ec9b8a41336071bd74f9af4bd
1 use std
3 use "base64"
4 use "util"
pkg mime =
        /*
         * Decode the RFC 2047 encoded-words in a header value, e.g.
         *     =?utf-8?q?Welcome_to_=22foobar=22_ML?=
         *
         * The returned slice is produced by std.sbfin. Input that
         * cannot be decoded is returned with only its whitespace
         * folded.
         * NOTE(review): the caller presumably releases the result
         * with std.slfree -- confirm against the call sites.
         */
        const utf8_from_encodedword : (encoded : byte[:] -> byte[:])
;;
/*
 * States of the scanner in utf8_from_encodedword. An encoded-word
 * has the shape =?charset?encoding?text?= and the states follow
 * that shape from left to right.
 */
type rfc2047_state = union
        /* Outside any encoded-word; bytes are copied, whitespace folded */
        `Boring_ASCII
        /* The previous byte was the opening '='; a '?' must follow */
        `Saw_Opening_Equals
        /* Collecting the charset, up to the next '?' */
        `Reading_Charset
        /* Collecting the encoding, up to the next '?' */
        `Reading_Encoding
        /* The '?' that ends the encoded text was seen; '=' must follow */
        `Saw_Closing_Question_Mark
        /* Collecting the encoded text, up to the next '?' */
        `In_Text
        /* A complete encoded-word was consumed; whitespace must follow */
        `Just_Finished_Text
;;
/*
 * Fold an ASCII capital letter to lower case. Every other byte,
 * including non-ASCII bytes, is returned unchanged.
 */
const ascii_lc = { b : byte
        if b >= ('A' : byte) && b <= ('Z' : byte)
                /* upper and lower case differ only in bit 0x20 */
                -> b | 0x20
        ;;
        -> b
}
/*
 * Compare two byte strings for equality, ignoring the case of
 * ASCII letters. Strings of different length are never equal.
 */
const caseieq = { s : byte[:], t : byte[:]
        var k : std.size = 0

        if s.len != t.len
                -> false
        ;;

        while k < s.len
                if ascii_lc(s[k]) != ascii_lc(t[k])
                        -> false
                ;;
                k = k + 1
        ;;

        -> true
}
/*
 * Convert one ASCII hex digit (0-9, a-f, A-F) to its value.
 * Returns `std.Ok with the value 0..15, or `std.Err void when
 * the byte is not a hex digit.
 */
const atox = { b : byte
        if b >= ('0' : byte) && b <= ('9' : byte)
                -> `std.Ok (b - ('0' : byte))
        ;;

        /*
         * Setting bit 0x20 maps 'A'..'F' onto 'a'..'f'. No other
         * byte lands in that range, so one comparison covers both
         * cases.
         */
        var lower : byte = b | 0x20
        if lower >= ('a' : byte) && lower <= ('f' : byte)
                -> `std.Ok (lower - ('a' : byte) + 0xA)
        ;;

        -> `std.Err void
}
/*
 * Decode the text part of a Q-encoded word.
 *
 *   '_'   becomes a space
 *   '=XY' becomes the byte with hex value XY
 *   '?', ' ' and '\t' are rejected
 *   every other byte is copied as is
 *
 * Returns `std.Ok with a slice obtained from std.sbfin, or
 * `std.Err void if the input is malformed. The scratch buffer is
 * freed on the error path.
 */
const decode_q = { s : byte[:]
        var out : std.strbuf# = std.mksb()
        var k : std.size = 0

        while k < s.len
                match (s[k] : char)
                | '?':  goto malformed
                | ' ':  goto malformed
                | '\t': goto malformed
                | '_':
                        std.sbputb(out, (' ' : byte))
                        k = k + 1
                | '=':
                        /* both hex digits must lie inside the slice */
                        if k + 2 >= s.len
                                goto malformed
                        ;;
                        match (atox(s[k + 1]), atox(s[k + 2]))
                        | (`std.Ok hi, `std.Ok lo):
                                std.sbputb(out, 16 * hi + lo)
                        | _:
                                goto malformed
                        ;;
                        /* step over '=' and the two digits */
                        k = k + 3
                | _:
                        std.sbputb(out, s[k])
                        k = k + 1
                ;;
        ;;

        -> `std.Ok std.sbfin(out)

:malformed
        std.sbfree(out)
        -> `std.Err void
}
/*
 * Report whether b may appear in an RFC 2047 token (the charset
 * and encoding fields):
 *
 *   Any CHAR except SPACE, CTLs, and especials
 *
 *   especials = "(" / ")" / "<" / ">" / "@" / "," / ";" /
 *               ":" / <"> / "/" / "[" / "]" / "?" / "." / "="
 */
const token_safe_byte = { b : byte
        /* SPACE and below are out, and so is everything past '~' */
        if b <= 0x20 || b >= 0x7f
                -> false
        ;;

        for e : "()<>@,;:\"/[]?.="
                if b == e
                        -> false
                ;;
        ;;

        -> true
}
/*
 * Report whether b may appear in the encoded-text of an RFC 2047
 * encoded-word:
 *
 *   1*<Any printable ASCII character other than "?" or SPACE>
 *   ; (but see "Use of encoded-words in message headers",
 *   ; section 5)
 *
 * Section 5 restricts where encoded-words may appear. The state
 * machine in utf8_from_encodedword over-enforces that, so it is
 * not checked again here.
 */
const text_safe_byte = { b : byte
        if b == ('?' : byte)
                -> false
        ;;
        -> b >= 0x21 && b <= 0x7e
}
/*
 * Decode the RFC 2047 encoded-words (=?charset?encoding?text?=)
 * in `encoded` and return the resulting bytes.
 *
 * Only the utf-8 charset with the Q or B encoding is decoded.
 * Runs of whitespace in the plain parts are folded to a single
 * space, and leading whitespace is dropped.
 *
 * If the input is malformed, uses another charset or encoding, or
 * the decoded output fails util.non_ctrl_utf8, nothing is decoded:
 * the whole input is returned through only_fold_whitespace.
 *
 * Either way the returned slice comes from std.sbfin.
 */
const utf8_from_encodedword = { encoded : byte[:]
        var decoded : std.strbuf# = std.mksb()
        var charset : byte[:] = [][:]
        var encoding : byte[:] = [][:]
        var s : rfc2047_state = `Boring_ASCII
        var b : byte = 0

        /* starts out true so that an encoded-word may open the input */
        var last_was_whitespace : bool = true
        var this_word_start : std.size = 0
        var cs_start : std.size = 0
        var e_start : std.size = 0
        var text_start : std.size = 0
        var decode_word : (s : byte[:] -> std.result(byte[:], void))

        for var j : std.size = 0; j < encoded.len; ++j
                b = encoded[j]
                match s
                | `Boring_ASCII:
                        var c : char = (b : char)
                        match c
                        | '=':
                                /* an encoded-word must follow whitespace */
                                if !last_was_whitespace
                                        goto not_encodedword
                                ;;
                                s = `Saw_Opening_Equals
                        | _:
                                if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
                                        /* fold a run of whitespace to one space */
                                        if last_was_whitespace
                                                continue
                                        ;;
                                        std.sbputb(decoded, 0x20)
                                        last_was_whitespace = true
                                else
                                        std.sbputb(decoded, b)
                                        last_was_whitespace = false
                                ;;
                        ;;
                | `Just_Finished_Text:
                        /*
                         * RFC 2047, section 5 requires that in
                         * some contexts, encoded text be separated
                         * from surrounding ASCII by linear whitespace.
                         * That "in some contexts" is hard, so we
                         * enforce it everywhere. This state is for
                         * requiring that, right after the ?=, we
                         * do not have a non-whitespace character
                         */
                        var c : char = (b : char)
                        if c != ' ' && c != '\t' && c != '\r' && c != '\n'
                                goto not_encodedword
                        ;;
                        if c == '\r'
                                /* stay in this state and look at what follows */
                                continue
                        elif c == '\n' && j + 1 < encoded.len
                                /*
                                 * Between =?...?= atoms, "\n "
                                 * should be discarded, not folded
                                 * to " "
                                 */
                                if encoded[j + 1] == 0x20 || encoded[j + 1] == 0x09
                                        j = j + 1
                                        s = `Boring_ASCII
                                        last_was_whitespace = true
                                        continue
                                ;;
                        ;;
                        /*
                         * Emit a plain space, never the raw tab or
                         * newline, so that whitespace is folded the
                         * same way as in `Boring_ASCII.
                         */
                        std.sbputb(decoded, 0x20)
                        last_was_whitespace = true
                        s = `Boring_ASCII
                | `Saw_Opening_Equals:
                        /*
                         * j is the index of the byte after '=', so
                         * the length checks in the later states are
                         * measured from here.
                         */
                        this_word_start = j
                        match (b : char)
                        | '?':
                                s = `Reading_Charset
                                if j + 1 >= encoded.len
                                        goto not_encodedword
                                ;;
                                cs_start = j + 1
                        | _:
                                goto not_encodedword
                        ;;
                | `Reading_Charset:
                        /* RFC 2047 caps an encoded-word at 75 characters */
                        if j - this_word_start > 75
                                goto not_encodedword
                        ;;
                        match (b : char)
                        | '?':
                                s = `Reading_Encoding
                                charset = encoded[cs_start:j]
                                if j + 1 >= encoded.len
                                        goto not_encodedword
                                ;;
                                e_start = j + 1
                        | _:
                                if !token_safe_byte(b)
                                        goto not_encodedword
                                ;;
                        ;;
                | `Reading_Encoding:
                        if j - this_word_start > 75
                                goto not_encodedword
                        ;;
                        match (b : char)
                        | '?':
                                encoding = encoded[e_start:j]
                                /*
                                   TODO: RFC 2231 means we should
                                   strip trailing *FOO from this.
                                 */
                                if !caseieq(charset, "utf-8")
                                        goto unimplemented
                                ;;

                                if caseieq(encoding, "q")
                                        decode_word = decode_q
                                elif caseieq(encoding, "b")
                                        decode_word = utf8_from_base64
                                else
                                        goto unimplemented
                                ;;
                                /* nothing left to hold the text and "?=" */
                                if j + 1 >= encoded.len
                                        goto not_encodedword
                                ;;
                                text_start = j + 1
                                s = `In_Text
                        | _:
                                if !token_safe_byte(b)
                                        goto not_encodedword
                                ;;
                        ;;
                | `In_Text:
                        if j - this_word_start > 75
                                goto not_encodedword
                        ;;
                        match (b : char)
                        | '?':
                                match decode_word(encoded[text_start:j])
                                | `std.Ok dec:
                                        /* copy into the output, then drop the temporary */
                                        std.sbputs(decoded, dec)
                                        std.slfree(dec)
                                | `std.Err void:
                                        goto not_encodedword
                                ;;
                                s = `Saw_Closing_Question_Mark
                        | _:
                                if !text_safe_byte(b)
                                        goto not_encodedword
                                ;;
                        ;;
                | `Saw_Closing_Question_Mark:
                        if j - this_word_start > 75
                                goto not_encodedword
                        ;;
                        match (b : char)
                        | '=':
                                s = `Just_Finished_Text
                        | _:
                                goto not_encodedword
                        ;;
                ;;
        ;;

        /*
         * Because Q and B encoding work on a byte-level, there's
         * a chance that what we have isn't valid UTF-8. That would
         * be a shame.
         *
         * TODO: we die if tabs are in the subject here. Is that bad?
         */
        if !util.non_ctrl_utf8(std.sbpeek(decoded))
                goto not_encodedword
        ;;

        /* the input must not stop in the middle of an encoded-word */
        match s
        | `Boring_ASCII: goto done
        | `Just_Finished_Text: goto done
        | _: goto not_encodedword
        ;;

:unimplemented
:not_encodedword
        /* give up on decoding: discard partial output, fold whitespace only */
        std.sbfree(decoded)
        -> only_fold_whitespace(encoded)

:done
        -> std.sbfin(decoded)
}
/*
 * Copy s, replacing every run of space, tab, CR and LF with a
 * single space. All other bytes are copied unchanged. The result
 * is a slice obtained from std.sbfin.
 */
const only_fold_whitespace = { s : byte[:]
        var out : std.strbuf# = std.mksb()
        var in_run : bool = false

        for b : s
                var ws : bool = (b == 0x20 || b == 0x09 || b == 0x0a || b == 0x0d)
                if !ws
                        std.sbputb(out, b)
                elif !in_run
                        /* first whitespace byte of a run: emit one space */
                        std.sbputb(out, 0x20)
                ;;
                in_run = ws
        ;;

        -> std.sbfin(out)
}