3 -- Copyright (c) 2009 Mauro Iazzi
5 -- Permission is hereby granted, free of charge, to any person
6 -- obtaining a copy of this software and associated documentation
7 -- files (the "Software"), to deal in the Software without
8 -- restriction, including without limitation the rights to use,
9 -- copy, modify, merge, publish, distribute, sublicense, and/or sell
10 -- copies of the Software, and to permit persons to whom the
11 -- Software is furnished to do so, subject to the following
14 -- The above copyright notice and this permission notice shall be
15 -- included in all copies or substantial portions of the Software.
17 -- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 -- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 -- OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 -- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 -- HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 -- WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 -- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 -- OTHER DEALINGS IN THE SOFTWARE.
29 -- MIME implementation in Lua
31 -- XXX: at section 3.4
36 local tostring = tostring
38 local setfenv
= setfenv
-- Build a pattern that yields both a named group capture and a
-- substitution capture for the given pattern.
--   patt: the LPeg pattern to wrap
--   name: the name attached to the group capture
-- The group capture is wrapped in an and-predicate (`#`), so that part
-- consumes no input; the substitution capture then matches `patt` itself.
-- The function is stored on the `lpeg` table.
-- NOTE(review): and-predicates stopped keeping captures in LPeg 0.10, so
-- the named group may be lost on newer LPeg releases -- confirm the
-- targeted LPeg version.
function lpeg.Csg(patt, name)
	local named_lookahead = #lpeg.Cg(patt, name)
	local substituted = lpeg.Cs(patt)
	return named_lookahead * substituted
end
47 -- this collects both the table of named captures and the
48 -- substitution capture of the given pattern
49 lpeg
.Cst
= function (patt
)
50 return (#lpeg
.Cs(patt
) * lpeg
.Ct(patt
)) / function (s
, t
)
53 for k
, v
in pairs(t
) do
54 if type(k
)~='number' then
62 -- rfc2822 Section 2.1:
64 -- A message consists of header fields (collectively called "the header
65 -- of the message") followed, optionally, by a body. The header is a
66 -- sequence of lines of characters with special syntax as defined in
67 -- this standard. The body is simply a sequence of characters that
68 -- follows the header and is separated from the header by an empty line
69 -- (i.e., a line with nothing preceding the CRLF).
71 local split_content
= function (c
)
72 -- NOTE: there is no guarantee that the CRLF separating header
-- and body is the first one. However, no header admits a blank line.
75 local h
, b
= string.match(c
, '^(.-)'..CRLF
..CRLF
..'(.*)$')
76 assert(type(h
)=='string' and type(b
)=='string')
-- rfc2822 Section 2.2.3
--
-- The process of moving from this folded multiple-line representation
-- of a header field to its single line representation is called
-- "unfolding". Unfolding is accomplished by simply removing any CRLF
-- that is immediately followed by WSP. Each header field should be
-- treated in its unfolded form for further syntactic and semantic
-- evaluation.

-- Unfold a header.
--   h: the folded header text
-- Returns both results of string.gsub: the unfolded text, followed by the
-- number of folds that were removed.
-- Only the CRLF is dropped; the WSP character that follows it is captured
-- and put back unchanged, so a fold made with a TAB keeps its TAB.
local unfold_header = function (h)
	return string.gsub(h, CRLF..'([ \t])', '%1')
end
93 -- rfc2822 Section 2.1
95 -- At the most basic level, a message is a series of characters. A
96 -- message that is conformant with this standard is comprised of
97 -- characters with values in the range 1 through 127 and interpreted as
98 -- US-ASCII characters [ASCII]. For brevity, this document sometimes
99 -- refers to this range of characters as simply "US-ASCII characters".
102 -- rfc2822 Section 2.1.1
104 -- There are two limits that this standard places on the number of
105 -- characters in a line. Each line of characters MUST be no more than
-- 998 characters, and SHOULD be no more than 78 characters, excluding
-- the CRLF.
109 local check_content
= function (c
)
111 -- TODO: implement in a portable way
115 -- rfc2822 Section 2.3
117 -- The body of a message is simply lines of US-ASCII characters. The
118 -- only two limitations on the body are as follows:
120 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
121 -- independently in the body.
123 -- - Lines of characters in the body MUST be limited to 998 characters,
125 -- and SHOULD be limited to 78 characters, excluding the CRLF.
127 local check_body
= function (b
)
133 -- some core values (not implemented literally)
134 -- see RFC 2234 Section 6.1
135 local core_values
= {
136 ["ALPHA"] = lpeg
.R("AZ", "az"),
137 ["BIT"] = lpeg
.P("0") + lpeg
.P("1"),
138 ["CHAR"] = lpeg
.R("\01\127"),
139 ["CR"] = lpeg
.P("\13"),
140 ["CRLF"] = lpeg
.P("\13\10"),
141 ["DIGIT"] = lpeg
.R("09"),
142 ["LF"] = lpeg
.P("\10"),
143 ["WSP"] = lpeg
.S("\32\09"),
146 -- lexical tokens used in the specification
147 -- TODO: could be optimized
148 -- check for non-obfuscated optimizations
149 -- TODO: write in a less obfuscated way
-- check if parentheses can be removed
151 local lex_tokens
= function ()
154 -- control characters without whitespaces
155 ["NO-WS-CTL"] = R("\1\8") +
159 P("\127"), -- RFC 2822 Section 3.2.1
160 -- a character in a text
161 ["text"] = R("\1\9") +
164 R("\14\127"), -- RFC 2822 Section 3.2.1
165 -- a special character
166 ["specials"] = P("(") + P(")") +
172 P("\""), -- RFC 2822 Section 3.2.1
173 -- a quoted pair should return only the second character
174 ["quoted-pair"] = (P("\\")/'' * V("text")) + V("obs-qp"), -- RFC 2822 Section 3.2.2
175 -- a folding white space (a whitespace that can include a CRLF)
176 -- should be substituted by a single whitespace
177 ["FWS"] = (((V("WSP")^
0 * V("CRLF"))^
-1 * V("WSP")^
1) + V("obs-FWS")) / " ", -- RFC 2822 Section 3.2.3
178 -- a text character allowed inside a comment
179 ["ctext"] = V("NO-WS-CTL") +
182 R("\93\126"), -- RFC 2822 Section 3.2.3
183 -- the content of a comment (comments can nest)
184 ["ccontent"] = V("ctext") + V("quoted-pair") + V("comment"), -- RFC 2822 Section 3.2.3
186 -- should be substituted by a single whitespace
187 ["comment"] = (P("(") * (V("FWS")^
-1 * V("ccontent"))^
0 * V("FWS")^
-1 * P(")")) / " ", -- RFC 2822 Section 3.2.3
188 -- a comment or a folding white space
189 -- should be substituted by a single whitespace
191 -- Folding white spaces should not be placed in a way that
192 -- creates lines containing only whitespaces.
-- This requirement is not necessarily enforced by this grammar
194 ["CFWS"] = ( (V("FWS") + (V("comment")*V("FWS")^
-1)) * (V("comment")*V("FWS")^
-1)^
0 ) / " ", -- RFC 2822 Section 3.2.3
195 -- character that can appear in an atom
196 ["atext"] = V("ALPHA") + V("DIGIT") +
206 P("~"), -- RFC 2822 Section 3.2.4
207 -- an atom is equal to the content only discarding comments and whitespace
208 ["atom"] = V("CFWS")^
-1 * C(V("atext")^
1) * V("CFWS")^
-1, -- RFC 2822 Section 3.2.4
209 -- an atom with dots is only the content, discarding CFWSs
210 ["dot-atom"] = V("CFWS")^
-1 * C(V("dot-atom-text")) * V("CFWS")^
-1, -- RFC 2822 Section 3.2.4
211 -- the content of an atom text with dots
212 ["dot-atom-text"] = V("atext")^
1 * (P(".") * V("atext")^
1)^
0, -- RFC 2822 Section 3.2.4
213 -- character that can appear in a quoted string
214 ["qtext"] = V("NO-WS-CTL") +
217 R("\93\126"), -- RFC 2822 Section 3.2.5
218 -- character or quoted pair (both can appear in a quoted string)
219 -- it is equivalent to the character itself or to the result of the
221 ["qcontent"] = C(V("qtext")) + V("quoted-pair"), -- RFC 2822 Section 3.2.5
222 -- a quoted string is equal to its content
223 ["quoted-string"] = V("CFWS")^
-1 *
224 P("\"") * (V("FWS")^
-1 * V("qcontent"))^
0 * V("FWS")^
-1 * P("\"") * V("CFWS")^
-1, -- RFC 2822 Section 3.2.5
225 -- unstructured patterns for unspecified headers
226 -- what should these be equal to?
229 ["word"] = V("atom") + V("quoted-string"), -- RFC 2822 Section 3.2.6
231 ["phrase"] = Cs(V("word")^
1) + V("obs-phrase"), -- RFC 2822 Section 3.2.6
232 -- a character for unstructured text
233 ["utext"] = V("NO-WS-CTL") + R("\33\126") + V("obs-utext"), -- RFC 2822 Section 3.2.6
234 -- an unstructured text
235 ["unstructured"] = (V("FWS")^
-1 * V("utext"))^
0 * V("FWS")^
-1, -- RFC 2822 Section 3.2.6
-- Debug helper: print the type and the value of every argument, one
-- argument per output line. The argument count comes from
-- select('#', ...), so nil arguments are printed as well.
local deb = function (...)
	local argc = select('#', ...)
	local idx = 1
	while idx <= argc do
		local value = select(idx, ...)
		print(type(value), value)
		idx = idx + 1
	end
end
241 local date_time
= function ()
244 -- date and time specification
245 -- dates and times should be valid
246 -- this grammar does not enforce this yet
247 ["date-time"] = Cst(( V
"day-of-week" * P
"," )^
-1 * V
"date" * V
"FWS" * V
"time" * V
"CFWS"^
-1), -- RFC 2822 Section 3.3
248 ["day-of-week"] = ( V
"FWS"^
-1 * V
"day-name" ) + V
"obs-day-of-week", -- RFC 2822 Section 3.3
249 ["day-name"] = Cg(P
"Mon" + P
"Tue" + P
"Wed" + P
"Thu" + P
"Fri" + P
"Sat" + P
"Sun", "weekday"), -- RFC 2822 Section 3.3
250 ["date"] = V
"day" * V
"month" * V
"year", -- RFC 2822 Section 3.3
251 ["year"] = Cg(C(V
"DIGIT"^
4), "year") + V
"obs-year", -- RFC 2822 Section 3.3
252 ["month"] = (V
"FWS" * V
"month-name" * V
"FWS") + V
"obs-month", -- RFC 2822 Section 3.3
253 ["month-name"] = Cg(C
"Jan" + C
"Feb" + C
"Mar" + C
"Apr" + C
"May" + C
"Jun" + C
"Jul" + C
"Aug" + C
"Sep" + C
"Oct" + C
"Nov" + C
"Dec", "month"), -- RFC 2822 Section 3.3
254 ["day"] = (V
"FWS"^
-1 * Cg(C(V
"DIGIT" * V
"DIGIT"^
-1), "day")) + V
"obs-day", -- RFC 2822 Section 3.3
255 ["time"] = V
"time-of-day" * V
"FWS" * V
"zone", -- RFC 2822 Section 3.3
256 ["time-of-day"] = V
"hour" * P
":" * V
"minute" * (P
":" * V
"second")^
-1, -- RFC 2822 Section 3.3
257 ["hour"] = Cg(C(V
"DIGIT" * V
"DIGIT"), "hour") + V
"obs-hour", -- RFC 2822 Section 3.3
258 ["minute"] = Cg((V
"DIGIT" * V
"DIGIT"), "minute") + V
"obs-minute", -- RFC 2822 Section 3.3
259 ["second"] = Cg((V
"DIGIT" * V
"DIGIT"), "second") + V
"obs-second", -- RFC 2822 Section 3.3
260 ["zone"] = Cg( (P
"+" + P
"-") * V
"DIGIT" * V
"DIGIT" * V
"DIGIT" * V
"DIGIT", "zone" ) + V
"obs-zone", -- RFC 2822 Section 3.3
264 local address
= function ()
267 -- address specification
-- addresses should be valid
-- this grammar does not enforce this yet
270 ["address"] = V
"mailbox" + V
"group", -- RFC 2822 Section 3.4
271 ["mailbox"] = V
"name-addr" + V
"addr-spec", -- RFC 2822 Section 3.4
272 ["name-addr"] = V
"display-name"^
-1 * V
"angle-addr", -- RFC 2822 Section 3.4
273 ["angle-addr"] = (V
"CFWS"^
-1 * P
"<" * V
"addr-spec" * P
">" * V
"CFWS"^
-1) + V
"obs-angle-addr", -- RFC 2822 Section 3.4
274 ["group"] = V
"display-name" * P
":" * (V
"mailbox-list" + V
"CFWS") * P
";" * V
"CFWS"^
-1, -- RFC 2822 Section 3.4
275 ["display-name"] = V
"phrase", -- RFC 2822 Section 3.4
276 ["mailbox-list"] = (V
"mailbox" * (P
"," * V
"mailbox")^
0) + V
"obs-mailbox-list", -- RFC 2822 Section 3.4
277 ["address-list"] = (V
"address" * (P
"," * V
"address")^
0) + V
"obs-address-list", -- RFC 2822 Section 3.4
278 -- address specification (name@host.domain)
279 ["addr-spec"] = Cst(V
"local-part" * P
"@" * V
"domain"), -- RFC 2822 Section 3.4.1
280 ["local-part"] = Csg(V
"dot-atom" + V
"quoted-string" + V
"obs-local-part", "box"), -- RFC 2822 Section 3.4.1
281 ["domain"] = Cg(V
"dot-atom" + V
"domain-literal" + V
"obs-domain", "domain"), -- RFC 2822 Section 3.4.1
282 ["domain-literal"] = Cs(V
"CFWS"^
-1 * P
"[" * (V
"FWS"^
-1 * V
"dcontent")^
0 * V
"FWS"^
-1 * P
"]" * V
"CFWS"^
-1), -- RFC 2822 Section 3.4.1
283 ["dcontent"] = V
"dtext" + V
"quoted-pair", -- RFC 2822 Section 3.4.1
284 ["dtext"] = V
"NO-WS-CTL" + R
"\33\90" + R
"\94\126", -- RFC 2822 Section 3.4.1
288 local overall_message
= function ()
291 -- overall message specification
-- RFC 2822 Section 3.5:
295 -- A message consists of header fields, optionally followed by a message
296 -- body. Lines in a message MUST be a maximum of 998 characters
297 -- excluding the CRLF, but it is RECOMMENDED that lines be limited to 78
298 -- characters excluding the CRLF.
300 -- this grammar does not enforce the 78 chars limit yet
301 ["message"] = (V
"fields" + V
"obs-fields") * (V
"CRLF" * V
"body")^
-1, -- RFC 2822 Section 3.5
302 ["body"] = (V
"text"^
-998 * V
"CRLF")^
0 * V
"text"^
-998, -- RFC 2822 Section 3.5
306 local fields
= function ()
309 -- spec for all fields in the header
-- RFC 2822 Section 3.6
313 --the "fields" rule is incomplete
["fields"] = -- trace fields missing
	-- Ordered choice over the header fields defined in this grammar,
	-- repeated zero or more times. "references" replaces a second run of
	-- "to" / "cc" / "bcc" alternatives: those could never be selected,
	-- because the identical alternatives earlier in the choice are tried
	-- first at the same position, and the "references" rule was otherwise
	-- unreachable from here.
	(V"orig-date" + V"from" + V"sender" + V"reply-to" + V"to" + V"cc" + V"bcc"
	+ V"message-id" + V"in-reply-to" + V"references")^0, -- RFC 2822 Section 3.6
317 -- origination date field: the time at which the author decided
318 -- the message was ready to be sent (e.g. hit the button)
319 ["orig-date"] = P
"Date:" * V
"date-time" * V
"CRLF", -- RFC 2822 Section 3.6.1
320 -- the author of the message
321 ["from"] = P
"From:" * V
"mailbox-list" * V
"CRLF", -- RFC 2822 Section 3.6.2
322 -- the actual agent that sent the message
323 ["sender"] = P
"Sender:" * V
"mailbox" * V
"CRLF", -- RFC 2822 Section 3.6.2
324 -- address which should be replied to
325 -- see Section 3.6.3 for forming replies
326 ["reply-to"] = P
"Reply-To:" * V
"address-list" * V
"CRLF", -- RFC 2822 Section 3.6.2
-- The "To:" field contains the address(es) of the primary recipient(s)
-- of the message.
329 ["to"] = P
"To:" * V
"address-list" * V
"CRLF", -- RFC 2822 Section 3.6.3
330 -- The "Cc:" field contains the addresses of others who are to receive
-- the message, though the content of the message may not be directed
-- at them.
333 ["cc"] = P
"Cc:" * V
"address-list" * V
"CRLF", -- RFC 2822 Section 3.6.3
334 -- The "Bcc:" field contains addresses of recipients of the message
-- whose addresses are not to be revealed to other recipients of the
-- message.
337 ["bcc"] = P
"Bcc:" * (V
"address-list" + V
"CFWS") * V
"CRLF", -- RFC 2822 Section 3.6.3
-- message identifier field. RFC 2822 writes the field name as
-- "Message-ID:", so that spelling is accepted in addition to the
-- "Message-Id:" form that was matched before.
["message-id"] = (P"Message-ID:" + P"Message-Id:") * V"msg-id" * V"CRLF", -- RFC 2822 Section 3.6.4
339 ["in-reply-to"] = P
"In-Reply-To:" * V
"msg-id"^
1 * V
"CRLF", -- RFC 2822 Section 3.6.4
340 ["references"] = P
"References:" * V
"msg-id"^
1 * V
"CRLF", -- RFC 2822 Section 3.6.4
341 ["msg-id"] = V
"CFWS"^
-1 * P
"<" * V
"id-left" * P
"@" * V
"id-right" * P
">" * V
"CFWS"^
-1, -- RFC 2822 Section 3.6.4
342 ["id-left"] = V
"dot-atom-text" + V
"no-fold-quote" + V
"obs-id-left", -- RFC 2822 Section 3.6.4
343 ["id-right"] = V
"dot-atom-text" + V
"no-fold-literal" + V
"obs-id-right", -- RFC 2822 Section 3.6.4
344 ["no-fold-quote"] = P
"\"" * (V
"qtext" + V
"quoted-pair")^
0 * P
"\"", -- RFC 2822 Section 3.6.4
345 ["no-fold-literal"] = P
"[" * (V
"dtext" + V
"quoted-pair")^
0 * P
"]", -- RFC 2822 Section 3.6.4
350 ["obs-FWS"] = lpeg
.P(false),
351 ["obs-qp"] = lpeg
.P(false),
352 ["obs-phrase"] = lpeg
.P(false),
353 ["obs-utext"] = lpeg
.P(false),
354 ["obs-day-of-week"] = lpeg
.P(false),
355 ["obs-year"] = lpeg
.P(false),
356 ["obs-month"] = lpeg
.P(false),
357 ["obs-day"] = lpeg
.P(false),
358 ["obs-hour"] = lpeg
.P(false),
359 ["obs-minute"] = lpeg
.P(false),
360 ["obs-second"] = lpeg
.P(false),
361 ["obs-zone"] = lpeg
.P(false),
362 ["obs-angle-addr"] = lpeg
.P(false),
363 ["obs-mailbox-list"] = lpeg
.P(false),
364 ["obs-address-list"] = lpeg
.P(false),
365 ["obs-local-part"] = lpeg
.P(false),
366 ["obs-domain"] = lpeg
.P(false),
367 ["obs-fields"] = lpeg
.P(false),
368 ["obs-id-left"] = lpeg
.P(false),
369 ["obs-id-right"] = lpeg
.P(false),
-- Empty table whose lookups never miss: any key that is not present
-- yields a fresh pattern that always fails (lpeg.P(false)).
-- NOTE: this local is named `os`, so it shadows the standard `os` library
-- for the rest of its scope.
local os = setmetatable({}, {
	__index = function ()
		return lpeg.P(false)
	end,
})
374 local join_set
= function (...)
375 local n
= select('#', ...)
378 local t
= select(i
, ...)
379 if type(t
)=='table' then
380 for k
, v
in pairs(t
) do
383 elseif type(t
)=='string' then
385 elseif type(t
)=='boolean' then
386 -- TODO: check no overwrite
388 error('join_set: bad argument number '..i
..' of type '..type(t
))
-- Combine every rule set into `gr` through join_set. The rule-producing
-- functions are called here, in the order shown; core_values and
-- obs_strict are passed as they are.
local gr = join_set(
	core_values,
	lex_tokens(),
	date_time(),
	address(),
	overall_message(),
	fields(),
	obs_strict
)