added licenses
[mime-lua.git] / mime.lua
blobb2cc122180170433bfc9c67d29f1d309c17b1c05
1 #!/usr/bin/lua
2 --
3 -- Copyright (c) 2009 Mauro Iazzi
4 --
5 -- Permission is hereby granted, free of charge, to any person
6 -- obtaining a copy of this software and associated documentation
7 -- files (the "Software"), to deal in the Software without
8 -- restriction, including without limitation the rights to use,
9 -- copy, modify, merge, publish, distribute, sublicense, and/or sell
10 -- copies of the Software, and to permit persons to whom the
11 -- Software is furnished to do so, subject to the following
12 -- conditions:
14 -- The above copyright notice and this permission notice shall be
15 -- included in all copies or substantial portions of the Software.
17 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 -- OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 -- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 -- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 -- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 -- OTHER DEALINGS IN THE SOFTWARE.
25 --/
26 #!/usr/bin/lua
28 require 'lpeg'
30 -- MIME implementation in Lua
32 -- XXX: at section 3.4
35 local string = string
36 local type = type
37 local tostring = tostring
38 local select = select
39 local setfenv = setfenv
40 local CRLF = '\r\n'
41 local lpeg = lpeg
42 local print = print
-- Creates a named substitution capture for the given pattern.
--
-- The text matched by `patt`, with the substitutions performed by the
-- captures nested in it, is captured as a single string (lpeg.Cs) and
-- wrapped in a group capture tagged with `name` (lpeg.Cg), so that an
-- enclosing table capture stores it under that key.
--
-- patt: the pattern to capture
-- name: key for the group capture, e.g. "box"; when omitted the group is
--       anonymous
-- returns: the capturing lpeg pattern
lpeg.Csg = function (patt, name)
  -- the name has to be forwarded to Cg, otherwise the capture is never
  -- stored under the key the caller asked for
  return lpeg.Cg(lpeg.Cs(patt), name)
end
-- Collects both the substitution capture of the given pattern and the
-- table of its named captures.
--
-- The pattern is first tried as a look-ahead wrapped in lpeg.Cs and then
-- matched again wrapped in lpeg.Ct. The resulting table holds the first
-- capture at index 1 and a copy of every non-numeric key of the second.
--
-- NOTE(review): this relies on the `#` predicate handing the captures of
-- Cs(patt) to the function below. Current LPeg documentation states that
-- the predicate produces no captures -- confirm against the LPeg version
-- in use.
lpeg.Cst = function (patt)
  local function merge (subst, captures)
    local merged = { subst }
    for key, value in pairs(captures) do
      if type(key) ~= 'number' then
        merged[key] = value
      end
    end
    return merged
  end

  local lookahead = #lpeg.Cs(patt)
  local collected = lpeg.Ct(patt)
  return (lookahead * collected) / merge
end
63 -- rfc2822 Section 2.1:
65 -- A message consists of header fields (collectively called "the header
66 -- of the message") followed, optionally, by a body. The header is a
67 -- sequence of lines of characters with special syntax as defined in
68 -- this standard. The body is simply a sequence of characters that
69 -- follows the header and is separated from the header by an empty line
70 -- (i.e., a line with nothing preceding the CRLF).
-- Splits a raw message into its header and its body.
--
-- c: the complete message text
-- returns: the text before the first empty line and the text after it
-- raises: an assertion error when the message contains no empty line
local split_content = function (c)
  -- NOTE: nothing guarantees that the first CRLF CRLF sequence is the
  -- header/body separator, but so far no header admits a blank line
  local separator = CRLF .. CRLF
  local header, body = string.match(c, '^(.-)' .. separator .. '(.*)$')
  local both_found = type(header) == 'string' and type(body) == 'string'
  assert(both_found)
  return header, body
end
81 -- rfc2822 Section 2.2.3
83 -- The process of moving from this folded multiple-line representation
84 -- of a header field to its single line representation is called
85 -- "unfolding". Unfolding is accomplished by simply removing any CRLF
86 -- that is immediately followed by WSP. Each header field should be
87 -- treated in its unfolded form for further syntactic and semantic
88 -- evaluation.
-- Unfolds a header: every CRLF that is immediately followed by a space or
-- a tab is removed, while the whitespace character itself is kept
-- unchanged (a folding tab stays a tab).
--
-- h: the folded header text
-- returns: the unfolded text, followed by the number of removed CRLFs
--          (both results of string.gsub are passed through)
local unfold_header = function (h)
  -- capture the WSP and put it back, so that only the CRLF disappears
  return string.gsub(h, CRLF .. '([ \t])', '%1')
end
94 -- rfc2822 Section 2.1
96 -- At the most basic level, a message is a series of characters. A
97 -- message that is conformant with this standard is comprised of
98 -- characters with values in the range 1 through 127 and interpreted as
99 -- US-ASCII characters [ASCII]. For brevity, this document sometimes
100 -- refers to this range of characters as simply "US-ASCII characters".
103 -- rfc2822 Section 2.1.1
105 -- There are two limits that this standard places on the number of
106 -- characters in a line. Each line of characters MUST be no more than
107 -- 998 characters, and SHOULD be no more than 78 characters, excluding
108 -- the CRLF.
-- Placeholder for the character-range and line-length checks described in
-- RFC 2822 Sections 2.1 and 2.1.1.
--
-- c: the message content (currently ignored)
-- returns: an error and a warning value; both are always nil for now
local check_content = function (c)
  -- TODO: implement in a portable way
  return nil, nil
end
116 -- rfc2822 Section 2.3
118 -- The body of a message is simply lines of US-ASCII characters. The
119 -- only two limitations on the body are as follows:
121 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
122 -- independently in the body.
124 -- - Lines of characters in the body MUST be limited to 998 characters,
126 -- and SHOULD be limited to 78 characters, excluding the CRLF.
-- Placeholder for the body checks described in RFC 2822 Section 2.3.
--
-- b: the message body (currently ignored)
-- returns: an error and a warning value; both are always nil for now
local check_body = function (b)
  -- TODO: implement
  return nil, nil
end
134 -- some core values (not implemented literally)
135 -- see RFC 2234 Section 6.1
-- Core rules used by the grammar tables below, keyed by rule name.
local core_values = {
  ALPHA = lpeg.R("AZ", "az"),  -- letters
  BIT   = lpeg.S("01"),        -- "0" or "1"
  CHAR  = lpeg.R("\1\127"),    -- any 7-bit character except NUL
  CR    = lpeg.P("\r"),        -- carriage return
  CRLF  = lpeg.P("\r\n"),      -- line terminator
  DIGIT = lpeg.R("09"),        -- decimal digit
  LF    = lpeg.P("\n"),        -- line feed
  WSP   = lpeg.S(" \t"),       -- space or horizontal tab
}
147 -- lexical tokens used in the specification
148 -- TODO: could be optimized
149 -- check for non-obfuscated optimizations
150 -- TODO: write in a less obfuscated way
151 -- check if parenthesis can be removed
-- Builds the lexical token rules of RFC 2822 Section 3.2.
--
-- The environment of the function is switched to the lpeg table, so that
-- P, R, V, C, Cs, ... below resolve to the lpeg constructors.
--
-- returns: a table mapping rule names to (open) lpeg patterns
local lex_tokens = function ()
  setfenv(1, lpeg)
  return {
    -- control characters without whitespaces
    -- (%d1-8 / %d11 / %d12 / %d14-31 / %d127)
    ["NO-WS-CTL"] = R("\1\8") +
      P("\11") +
      P("\12") +
      R("\14\31") +
      P("\127"), -- RFC 2822 Section 3.2.1
    -- a character in a text
    ["text"] = R("\1\9") +
      P("\11") +
      P("\12") +
      R("\14\127"), -- RFC 2822 Section 3.2.1
    -- a special character
    ["specials"] = P("(") + P(")") +
      P("<") + P(">") +
      P("[") + P("]") +
      P(":") + P(";") +
      P("@") + P("\\") +
      P(",") + P(".") +
      P("\""), -- RFC 2822 Section 3.2.1
    -- a quoted pair should return only the second character
    ["quoted-pair"] = (P("\\")/'' * V("text")) + V("obs-qp"), -- RFC 2822 Section 3.2.2
    -- a folding white space (a whitespace that can include a CRLF)
    -- should be substituted by a single whitespace
    ["FWS"] = (((V("WSP")^0 * V("CRLF"))^-1 * V("WSP")^1) + V("obs-FWS")) / " ", -- RFC 2822 Section 3.2.3
    -- a text character allowed inside a comment
    ["ctext"] = V("NO-WS-CTL") +
      R("\33\39") +
      R("\42\91") +
      R("\93\126"), -- RFC 2822 Section 3.2.3
    -- the content of a comment (comments can nest)
    ["ccontent"] = V("ctext") + V("quoted-pair") + V("comment"), -- RFC 2822 Section 3.2.3
    -- an actual comment
    -- should be substituted by a single whitespace
    ["comment"] = (P("(") * (V("FWS")^-1 * V("ccontent"))^0 * V("FWS")^-1 * P(")")) / " ", -- RFC 2822 Section 3.2.3
    -- a comment or a folding white space
    -- should be substituted by a single whitespace
    --
    -- Folding white spaces should not be placed in a way that
    -- creates lines containing only whitespaces.
    -- This requirement is not necessarily enforced by this grammar
    ["CFWS"] = ( (V("FWS") + (V("comment")*V("FWS")^-1)) * (V("comment")*V("FWS")^-1)^0 ) / " ", -- RFC 2822 Section 3.2.3
    -- character that can appear in an atom
    ["atext"] = V("ALPHA") + V("DIGIT") +
      P("!") + P("#") +
      P("$") + P("%") +
      P("&") + P("'") +
      P("*") + P("+") +
      P("-") + P("/") +
      P("=") + P("?") +
      P("^") + P("_") +
      P("`") + P("{") +
      P("|") + P("}") +
      P("~"), -- RFC 2822 Section 3.2.4
    -- an atom is equal to the content only discarding comments and whitespace
    ["atom"] = V("CFWS")^-1 * C(V("atext")^1) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
    -- an atom with dots is only the content, discarding CFWSs
    ["dot-atom"] = Cs((V("CFWS")^-1/'') * C(V("dot-atom-text")) * (V("CFWS")^-1/'')), -- RFC 2822 Section 3.2.4
    -- the content of an atom text with dots
    ["dot-atom-text"] = V("atext")^1 * (P(".") * V("atext")^1)^0, -- RFC 2822 Section 3.2.4
    -- character that can appear in a quoted string
    ["qtext"] = V("NO-WS-CTL") +
      P("\33") +
      R("\35\91") +
      R("\93\126"), -- RFC 2822 Section 3.2.5
    -- character or quoted pair (both can appear in a quoted string)
    -- it is equivalent to the character itself or to the result of the
    -- quoted pair
    ["qcontent"] = C(V("qtext")) + V("quoted-pair"), -- RFC 2822 Section 3.2.5
    -- a quoted string is equal to its content
    ["quoted-string"] = V("CFWS")^-1 *
      P("\"") * (V("FWS")^-1 * V("qcontent"))^0 * V("FWS")^-1 * P("\"") * V("CFWS")^-1, -- RFC 2822 Section 3.2.5
    -- unstructured patterns for unspecified headers
    -- what should these be equal to?
    --
    -- a generic word
    ["word"] = V("atom") + V("quoted-string"), -- RFC 2822 Section 3.2.6
    -- a generic phrase
    ["phrase"] = V("word")^1 + V("obs-phrase"), -- RFC 2822 Section 3.2.6
    -- a character for unstructured text
    ["utext"] = V("NO-WS-CTL") + R("\33\126") + V("obs-utext"), -- RFC 2822 Section 3.2.6
    -- an unstructured text
    ["unstructured"] = (V("FWS")^-1 * V("utext"))^0 * V("FWS")^-1, -- RFC 2822 Section 3.2.6
  }
end
-- Debugging helpers.
--
-- `my` is a global that this file does not define; it is only looked into
-- when it exists, so that loading the module does not fail without it.
local show = my and my.show

-- Prints the type and the value of every argument, one per line.
local deb = function (...)
  for i = 1, select('#', ...) do
    local v = select(i, ...)
    print(type(v), v)
  end
end
-- Builds the date and time rules of RFC 2822 Section 3.3.
--
-- The environment of the function is switched to the lpeg table, so that
-- P, V, C, Cg, Cst, ... resolve to fields of lpeg. The named captures are
-- "weekday", "day", "month", "year", "hour", "minute", "second" and "zone".
--
-- Dates and times should be valid; this grammar does not enforce it yet.
--
-- returns: a table mapping rule names to (open) lpeg patterns
local date_time = function ()
  setfenv(1, lpeg)

  -- building blocks shared by several rules
  local digit = V("DIGIT")
  local two_digits = digit * digit
  local fws = V("FWS")
  local weekdays = P("Mon") + P("Tue") + P("Wed") + P("Thu") +
    P("Fri") + P("Sat") + P("Sun")
  local months = C("Jan") + C("Feb") + C("Mar") + C("Apr") +
    C("May") + C("Jun") + C("Jul") + C("Aug") +
    C("Sep") + C("Oct") + C("Nov") + C("Dec")

  -- every rule below comes from RFC 2822 Section 3.3
  return {
    ["date-time"] = Cst((V("day-of-week") * P(","))^-1 * V("date") * fws * V("time") * V("CFWS")^-1),
    ["day-of-week"] = (fws^-1 * V("day-name")) + V("obs-day-of-week"),
    ["day-name"] = Cg(weekdays, "weekday"),
    ["date"] = V("day") * V("month") * V("year"),
    ["year"] = Cg(C(digit^4), "year") + V("obs-year"),
    ["month"] = (fws * V("month-name") * fws) + V("obs-month"),
    ["month-name"] = Cg(months, "month"),
    ["day"] = (fws^-1 * Cg(C(digit * digit^-1), "day")) + V("obs-day"),
    ["time"] = V("time-of-day") * fws * V("zone"),
    ["time-of-day"] = V("hour") * P(":") * V("minute") * (P(":") * V("second"))^-1,
    ["hour"] = Cg(C(two_digits), "hour") + V("obs-hour"),
    ["minute"] = Cg(two_digits, "minute") + V("obs-minute"),
    ["second"] = Cg(two_digits, "second") + V("obs-second"),
    ["zone"] = Cg((P("+") + P("-")) * two_digits * two_digits, "zone") + V("obs-zone"),
  }
end
-- Builds the address rules of RFC 2822 Section 3.4.
--
-- The environment of the function is switched to the lpeg table, so that
-- P, R, V, Cg, Csg, Cst, ... resolve to fields of lpeg.
--
-- returns: a table mapping rule names to (open) lpeg patterns
local address = function ()
  setfenv(1, lpeg)
  return {
    -- address specification
    ["address"] = V"mailbox" + V"group", -- RFC 2822 Section 3.4
    ["mailbox"] = V"name-addr" + V"addr-spec", -- RFC 2822 Section 3.4
    ["name-addr"] = V"display-name"^-1 * V"angle-addr", -- RFC 2822 Section 3.4
    ["angle-addr"] = (V"CFWS"^-1 * P"<" * V"addr-spec" * P">" * V"CFWS"^-1) + V"obs-angle-addr", -- RFC 2822 Section 3.4
    -- the list of a group is optional: display-name ":" [mailbox-list / CFWS] ";" [CFWS]
    -- (an empty group such as "name:;" is valid)
    ["group"] = V"display-name" * P":" * (V"mailbox-list" + V"CFWS")^-1 * P";" * V"CFWS"^-1, -- RFC 2822 Section 3.4
    ["display-name"] = V"phrase", -- RFC 2822 Section 3.4
    ["mailbox-list"] = (V"mailbox" * (P"," * V"mailbox")^0) + V"obs-mailbox-list", -- RFC 2822 Section 3.4
    ["address-list"] = (V"address" * (P"," * V"address")^0) + V"obs-address-list", -- RFC 2822 Section 3.4
    -- address specification (name@host.domain)
    ["addr-spec"] = Cst(V"local-part" * P"@" * V"domain"), -- RFC 2822 Section 3.4.1
    ["local-part"] = Csg(V"dot-atom" + V"quoted-string" + V"obs-local-part", "box"), -- RFC 2822 Section 3.4.1
    ["domain"] = Cg(V"dot-atom" + V"domain-literal" + V"obs-domain", "domain"), -- RFC 2822 Section 3.4.1
    ["domain-literal"] = V"CFWS"^-1 * P"[" * (V"FWS"^-1 * V"dcontent")^0 * V"FWS"^-1 * P"]" * V"CFWS"^-1, -- RFC 2822 Section 3.4.1
    ["dcontent"] = V"dtext" + V"quoted-pair", -- RFC 2822 Section 3.4.1
    ["dtext"] = V"NO-WS-CTL" + R"\33\90" + R"\94\126", -- RFC 2822 Section 3.4.1
  }
end
-- Builds the overall message rules of RFC 2822 Section 3.5.
--
-- A message consists of header fields, optionally followed by a message
-- body. Lines in a message MUST be a maximum of 998 characters excluding
-- the CRLF, but it is RECOMMENDED that lines be limited to 78 characters
-- excluding the CRLF.
--
-- The environment of the function is switched to the lpeg table, so that
-- V resolves to lpeg.V.
--
-- returns: a table mapping rule names to (open) lpeg patterns
local overall_message = function ()
  setfenv(1, lpeg)

  local crlf = V("CRLF")
  -- one line of body text; only the 998 character limit is expressed,
  -- this grammar does not enforce the 78 chars limit yet
  local line = V("text")^-998

  return {
    ["message"] = (V("fields") + V("obs-fields")) * (crlf * V("body"))^-1, -- RFC 2822 Section 3.5
    ["body"] = (line * crlf)^0 * line, -- RFC 2822 Section 3.5
  }
end
-- Builds the header field rules of RFC 2822 Section 3.6.
--
-- The environment of the function is switched to the lpeg table, so that
-- P and V resolve to lpeg.P and lpeg.V.
--
-- NOTE(review): the field names are matched literally, hence
-- case-sensitively (e.g. "Message-Id:" does not match "Message-ID:").
--
-- returns: a table mapping rule names to (open) lpeg patterns
local fields = function ()
  setfenv(1, lpeg)
  return {
    -- spec for all fields in the header
    --
    -- RFC 2822 Section 3.6
    --
    -- the "fields" rule is incomplete: trace fields and other fields
    -- (e.g. the subject) are still missing
    ["fields"] =
      (V"orig-date" + V"from" + V"sender" + V"reply-to" + V"to" + V"cc" + V"bcc"
      + V"message-id" + V"in-reply-to" + V"references")^0, -- RFC 2822 Section 3.6
    -- origination date field: the time at which the author decided
    -- the message was ready to be sent (e.g. hit the button)
    ["orig-date"] = P"Date:" * V"date-time" * V"CRLF", -- RFC 2822 Section 3.6.1
    -- the author of the message
    ["from"] = P"From:" * V"mailbox-list" * V"CRLF", -- RFC 2822 Section 3.6.2
    -- the actual agent that sent the message
    ["sender"] = P"Sender:" * V"mailbox" * V"CRLF", -- RFC 2822 Section 3.6.2
    -- address which should be replied to
    -- see Section 3.6.3 for forming replies
    ["reply-to"] = P"Reply-To:" * V"address-list" * V"CRLF", -- RFC 2822 Section 3.6.2
    -- The "To:" field contains the address(es) of the primary recipient(s)
    -- of the message.
    ["to"] = P"To:" * V"address-list" * V"CRLF", -- RFC 2822 Section 3.6.3
    -- The "Cc:" field contains the addresses of others who are to receive
    -- the message, though the content of the message may not be directed
    -- at them.
    ["cc"] = P"Cc:" * V"address-list" * V"CRLF", -- RFC 2822 Section 3.6.3
    -- The "Bcc:" field contains addresses of recipients of the message
    -- whose addresses are not to be revealed to other recipients of the
    -- message.
    -- The field may be empty: "Bcc:" (address-list / [CFWS]) CRLF
    ["bcc"] = P"Bcc:" * (V"address-list" + V"CFWS"^-1) * V"CRLF", -- RFC 2822 Section 3.6.3
    ["message-id"] = P"Message-Id:" * V"msg-id" * V"CRLF", -- RFC 2822 Section 3.6.4
    ["in-reply-to"] = P"In-Reply-To:" * V"msg-id"^1 * V"CRLF", -- RFC 2822 Section 3.6.4
    ["references"] = P"References:" * V"msg-id"^1 * V"CRLF", -- RFC 2822 Section 3.6.4
    ["msg-id"] = V"CFWS"^-1 * P"<" * V"id-left" * P"@" * V"id-right" * P">" * V"CFWS"^-1, -- RFC 2822 Section 3.6.4
    ["id-left"] = V"dot-atom-text" + V"no-fold-quote" + V"obs-id-left", -- RFC 2822 Section 3.6.4
    ["id-right"] = V"dot-atom-text" + V"no-fold-literal" + V"obs-id-right", -- RFC 2822 Section 3.6.4
    ["no-fold-quote"] = P"\"" * (V"qtext" + V"quoted-pair")^0 * P"\"", -- RFC 2822 Section 3.6.4
    ["no-fold-literal"] = P"[" * (V"dtext" + V"quoted-pair")^0 * P"]", -- RFC 2822 Section 3.6.4
  }
end
-- Strict handling of the obsolete syntax: every "obs-" rule referenced by
-- the grammar tables is defined as a pattern that never matches.
local obs_strict = {}
for _, rule in ipairs {
  "obs-FWS", "obs-qp", "obs-phrase", "obs-utext",
  "obs-day-of-week", "obs-year", "obs-month", "obs-day",
  "obs-hour", "obs-minute", "obs-second", "obs-zone",
  "obs-angle-addr", "obs-mailbox-list", "obs-address-list",
  "obs-local-part", "obs-domain", "obs-fields",
  "obs-id-left", "obs-id-right",
} do
  obs_strict[rule] = lpeg.P(false)
end
-- An empty table that answers every missing key with a pattern that never
-- matches. Note that this local shadows the standard `os` library for the
-- rest of the file.
local os = setmetatable({}, {
  __index = function ()
    return lpeg.P(false)
  end,
})
-- Merges several rule sets into a single new table.
--
-- Arguments are handled from left to right according to their type:
--   table   - every key/value pair is copied; later arguments overwrite
--             keys set by earlier ones
--   string  - stored at index 1 of the result
--   boolean - ignored (TODO: check no overwrite)
-- Any other type raises an error naming the position and the type of the
-- offending argument.
--
-- returns: the merged table
local join_set = function (...)
  local merged = {}
  for index = 1, select('#', ...) do
    local item = select(index, ...)
    local kind = type(item)
    if kind == 'table' then
      for key, value in pairs(item) do
        merged[key] = value
      end
    elseif kind == 'string' then
      merged[1] = item
    elseif kind ~= 'boolean' then
      error('join_set: bad argument number '..index..' of type '..kind)
    end
  end
  return merged
end
-- The module value: all rule sets merged into one table, with the strict
-- (never matching) definitions of the obsolete rules.
return join_set(
  core_values,
  lex_tokens(),
  date_time(),
  address(),
  overall_message(),
  fields(),
  obs_strict
)