changed some named captures
[mime-lua.git] / mime.lua
blobc3daa269a097579bb92be26c41e6d0331e22bc5d
1 #!/usr/bin/lua
3 require 'lpeg'
5 -- MIME implementation in Lua
7 -- XXX: at section 3.4
10 local string = string
11 local type = type
12 local tostring = tostring
13 local select = select
14 local setfenv = setfenv
15 local CRLF = '\r\n'
16 local lpeg = lpeg
17 local print = print
--- Concatenate the string form of every argument, in order.
-- Used below as an LPeg function capture to glue captured fragments
-- into one string.
-- The argument count comes from select('#', ...) so that nil arguments
-- (including trailing ones) are kept; each contributes the text "nil".
-- @param ... any values; each one is converted with tostring
-- @return a single string ('' when called without arguments)
local join = function (...)
  local n = select('#', ...)
  local parts = {...}
  for i = 1, n do
    parts[i] = tostring(parts[i])
  end
  -- explicit bounds: do not rely on the length of a table that had holes
  return table.concat(parts, '', 1, n)
end
27 -- rfc2822 Section 2.1:
29 -- A message consists of header fields (collectively called "the header
30 -- of the message") followed, optionally, by a body. The header is a
31 -- sequence of lines of characters with special syntax as defined in
32 -- this standard. The body is simply a sequence of characters that
33 -- follows the header and is separated from the header by an empty line
34 -- (i.e., a line with nothing preceding the CRLF).
--- Split a raw message into its header section and its body.
-- The separator is the first empty line, i.e. a CRLF with nothing in
-- front of it on its line.
-- NOTE: there is no guarantee that the CRLF separating header and body
-- is the first one. However no headers admit a blank line so far.
-- @param c the complete message text (CRLF line endings)
-- @return h the header section, without the CRLF that ends its last line
-- @return b the body ('' when the message has no body)
local split_content = function (c)
  assert(type(c) == 'string', 'split_content: message must be a string')
  -- A message that starts with the empty line has an empty header; test
  -- this first so a CRLF CRLF inside the body is not taken as separator.
  if string.sub(c, 1, #CRLF) == CRLF then
    return '', string.sub(c, #CRLF + 1)
  end
  local h, b = string.match(c, '^(.-)'..CRLF..CRLF..'(.*)$')
  if h == nil then
    -- No empty line at all: the body is optional, so everything is header.
    -- Drop the final CRLF to match the shape returned in the normal case.
    -- (plain assignment keeps only the first result of gsub)
    h = string.gsub(c, CRLF..'$', '')
    b = ''
  end
  return h, b
end
45 -- rfc2822 Section 2.2.3
47 -- The process of moving from this folded multiple-line representation
48 -- of a header field to its single line representation is called
49 -- "unfolding". Unfolding is accomplished by simply removing any CRLF
50 -- that is immediately followed by WSP. Each header field should be
51 -- treated in its unfolded form for further syntactic and semantic
52 -- evaluation.
--- Unfold a folded header (RFC 2822 Section 2.2.3).
-- Removes every CRLF that is immediately followed by a space or a tab.
-- The whitespace character itself is kept unchanged, as the RFC only
-- asks for the CRLF to be removed.
-- @param h header text, possibly folded over several lines
-- @return the unfolded header text (a single value)
local unfold_header = function (h)
  -- '%1' puts back the captured WSP; the outer parentheses discard the
  -- substitution count that string.gsub returns as a second value.
  return (string.gsub(h, CRLF..'([ \t])', '%1'))
end
58 -- rfc2822 Section 2.1
60 -- At the most basic level, a message is a series of characters. A
61 -- message that is conformant with this standard is comprised of
62 -- characters with values in the range 1 through 127 and interpreted as
63 -- US-ASCII characters [ASCII]. For brevity, this document sometimes
64 -- refers to this range of characters as simply "US-ASCII characters".
67 -- rfc2822 Section 2.1.1
69 -- There are two limits that this standard places on the number of
70 -- characters in a line. Each line of characters MUST be no more than
71 -- 998 characters, and SHOULD be no more than 78 characters, excluding
72 -- the CRLF.
--- Check a whole message against the character and line-length rules
-- of RFC 2822 Sections 2.1 and 2.1.1.
-- Placeholder: nothing is checked yet, so both results are always nil.
-- @param c the message content (currently ignored)
-- @return err  always nil for now
-- @return warn always nil for now
local check_content = function (c)
  -- TODO: implement in a portable way
  return nil, nil
end
80 -- rfc2822 Section 2.3
82 -- The body of a message is simply lines of US-ASCII characters. The
83 -- only two limitations on the body are as follows:
85 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
86 -- independently in the body.
88 -- - Lines of characters in the body MUST be limited to 998 characters,
90 -- and SHOULD be limited to 78 characters, excluding the CRLF.
--- Check a message body against the rules of RFC 2822 Section 2.3.
-- Placeholder: nothing is checked yet, so both results are always nil.
-- @param b the message body (currently ignored)
-- @return err  always nil for now
-- @return warn always nil for now
local check_body = function (b)
  -- TODO: implement
  return nil, nil
end
98 -- some core values (not implemented literally)
99 -- see RFC 2234 Section 6.1
-- Core ABNF rules, keyed by rule name, each expressed as an LPeg pattern.
-- Only the rules listed here are defined.
local core_values
do
  local P, R, S = lpeg.P, lpeg.R, lpeg.S
  core_values = {
    ALPHA = R("AZ", "az"),      -- upper and lower case letters
    BIT   = P("0") + P("1"),
    CHAR  = R("\01\127"),       -- any 7-bit character except NUL
    CR    = P("\13"),
    CRLF  = P("\13\10"),
    DIGIT = R("09"),
    LF    = P("\10"),
    WSP   = S("\32\09"),        -- space or horizontal tab
  }
end
111 -- lexical tokens used in the specification
112 -- TODO: could be optimized
113 -- check for non-obfuscated optimizations
114 -- TODO: write in a less obfuscated way
115 -- check if parenthesis can be removed
--- Build the lexical-token rules of RFC 2822 Section 3.2.
-- Returns a table mapping rule names to LPeg patterns. Rules refer to
-- each other (and to core and obs-* rules defined elsewhere) through
-- V("name"), so the table is only usable once merged into a full grammar.
-- setfenv(1, lpeg) makes the lpeg table this function's environment, so
-- P, R, V and C below are the lpeg functions; ordinary globals are not
-- reachable after that call, while `join` is reached as an upvalue.
-- @return table of rule name -> pattern
local lex_tokens = function ()
  setfenv(1, lpeg)
  return {
    -- control characters without whitespaces
    -- (%d1-8 / %d11 / %d12 / %d14-31 / %d127)
    ["NO-WS-CTL"] = R("\1\8") +
                    P("\11") +
                    P("\12") +
                    R("\14\31") +
                    P("\127"), -- RFC 2822 Section 3.2.1
    -- a character in a text
    ["text"] = R("\1\9") +
               P("\11") +
               P("\12") +
               R("\14\127"), -- RFC 2822 Section 3.2.1
    -- a special character
    ["specials"] = P("(") + P(")") +
                   P("<") + P(">") +
                   P("[") + P("]") +
                   P(":") + P(";") +
                   P("@") + P("\\") +
                   P(",") + P(".") +
                   P("\""), -- RFC 2822 Section 3.2.1
    -- a quoted pair should return only the second character
    ["quoted-pair"] = (P("\\") * C(V("text"))) + V("obs-qp"), -- RFC 2822 Section 3.2.2
    -- a folding white space (a whitespace that can include a CRLF)
    -- should be substituted by a single whitespace
    ["FWS"] = (((V("WSP")^0 * V("CRLF"))^-1 * V("WSP")^1) + V("obs-FWS")) / " ", -- RFC 2822 Section 3.2.3
    -- a text character allowed inside a comment
    ["ctext"] = V("NO-WS-CTL") +
                R("\33\39") +
                R("\42\91") +
                R("\93\126"), -- RFC 2822 Section 3.2.3
    -- the content of a comment (comments can nest)
    ["ccontent"] = V("ctext") + V("quoted-pair") + V("comment"), -- RFC 2822 Section 3.2.3
    -- an actual comment
    -- should be substituted by a single whitespace
    ["comment"] = (P("(") * (V("FWS")^-1 * V("ccontent"))^0 * V("FWS")^-1 * P(")")) / " ", -- RFC 2822 Section 3.2.3
    -- a comment or a folding white space
    -- should be substituted by a single whitespace
    --
    -- Folding white spaces should not be placed in a way that
    -- creates lines containing only whitespaces.
    -- This requirement is not necessarily enforced by this grammar
    ["CFWS"] = ( (V("FWS") + (V("comment")*V("FWS")^-1)) * (V("comment")*V("FWS")^-1)^0 ) / " ", -- RFC 2822 Section 3.2.3
    -- character that can appear in an atom
    ["atext"] = V("ALPHA") + V("DIGIT") +
                P("!") + P("#") +
                P("$") + P("%") +
                P("&") + P("'") +
                P("*") + P("+") +
                P("-") + P("/") +
                P("=") + P("?") +
                P("^") + P("_") +
                P("`") + P("{") +
                P("|") + P("}") +
                P("~"), -- RFC 2822 Section 3.2.4
    -- an atom is equal to the content only discarding comments and whitespace
    ["atom"] = V("CFWS")^-1 * C(V("atext")^1) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
    -- an atom with dots is only the content
    ["dot-atom"] = V("CFWS")^-1 * C(V("dot-atom-text")) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
    -- the content of an atom text with dots
    ["dot-atom-text"] = V("atext")^1 * (P(".") * V("atext")^1)^0, -- RFC 2822 Section 3.2.4
    -- character that can appear in a quoted string
    ["qtext"] = V("NO-WS-CTL") +
                P("\33") +
                R("\35\91") +
                R("\93\126"), -- RFC 2822 Section 3.2.5
    -- character or quoted pair (both can appear in a quoted string)
    -- it is equivalent to the character itself or to the result of the
    -- quoted pair
    ["qcontent"] = C(V("qtext")) + V("quoted-pair"), -- RFC 2822 Section 3.2.5
    -- a quoted string is equal to its content
    -- `*` and `/` have the same precedence and associate to the left, so
    -- the content needs its own parentheses: otherwise `/ join` would
    -- also cover the leading CFWS and the opening quote.
    ["quoted-string"] = V("CFWS")^-1 *
                        P("\"") *
                        (((V("FWS")^-1 * V("qcontent"))^0 * V("FWS")^-1) / join) *
                        P("\"") * V("CFWS")^-1, -- RFC 2822 Section 3.2.5
    -- unstructured patterns for unspecified headers
    -- what should these be equal to?
    --
    -- a generic word
    ["word"] = V("atom") + V("quoted-string"), -- RFC 2822 Section 3.2.6
    -- a generic phrase
    ["phrase"] = V("word")^1 + V("obs-phrase"), -- RFC 2822 Section 3.2.6
    -- a character for unstructured text
    ["utext"] = V("NO-WS-CTL") + R("\33\126") + V("obs-utext"), -- RFC 2822 Section 3.2.6
    -- an unstructured text
    ["unstructured"] = (V("FWS")^-1 * V("utext"))^0 * V("FWS")^-1, -- RFC 2822 Section 3.2.6
  }
end
-- Optional helper taken from a global `my` table. That table is not
-- defined in this file (presumably a personal debugging library --
-- confirm), so guard the lookup: `show` is simply nil when `my` is
-- absent instead of the module failing to load.
local show = my and my.show

--- Debug helper: print the type and the value of every argument, one
-- argument per line. nil arguments are printed too.
-- @param ... any values
local deb = function (...)
  for i = 1, select('#', ...) do
    local v = select(i, ...)
    print(type(v), v)
  end
end
--- Build the date and time rules of RFC 2822 Section 3.3.
-- Returns a table mapping rule names to LPeg patterns; the rules refer
-- to lexical and obs-* rules defined elsewhere through V("name").
-- The pieces of a date are wrapped in named groups: weekday, day, month,
-- year, hour, minute, second and zone.
-- Dates and times should be valid; this grammar does not enforce it yet.
-- @return table of rule name -> pattern
local date_time = function ()
  -- from here on P, V, C, Cg and Cs are looked up in the lpeg table
  setfenv(1, lpeg)

  -- building blocks shared by several rules
  local digit = V("DIGIT")
  local fws = V("FWS")
  local two_digits = digit * digit
  local sign = P("+") + P("-")

  local weekdays = P("Mon") + P("Tue") + P("Wed") + P("Thu") +
                   P("Fri") + P("Sat") + P("Sun")
  local months = C("Jan") + C("Feb") + C("Mar") + C("Apr") +
                 C("May") + C("Jun") + C("Jul") + C("Aug") +
                 C("Sep") + C("Oct") + C("Nov") + C("Dec")

  return {
    -- RFC 2822 Section 3.3 (all rules below)
    ["date-time"] = Cs(
      (V("day-of-week") * P(","))^-1 *
      V("date") * fws * V("time") * V("CFWS")^-1
    ),
    ["day-of-week"] = (fws^-1 * V("day-name")) + V("obs-day-of-week"),
    ["day-name"] = Cg(weekdays, "weekday"),
    ["date"] = V("day") * V("month") * V("year"),
    -- four digits or more
    ["year"] = Cg(C(digit^4), "year") + V("obs-year"),
    ["month"] = (fws * V("month-name") * fws) + V("obs-month"),
    ["month-name"] = Cg(months, "month"),
    -- one or two digits
    ["day"] = (fws^-1 * Cg(C(digit * digit^-1), "day")) + V("obs-day"),
    ["time"] = V("time-of-day") * fws * V("zone"),
    ["time-of-day"] = V("hour") * P(":") * V("minute") * (P(":") * V("second"))^-1,
    ["hour"] = Cg(C(two_digits), "hour") + V("obs-hour"),
    ["minute"] = Cg(two_digits, "minute") + V("obs-minute"),
    ["second"] = Cg(two_digits, "second") + V("obs-second"),
    -- sign followed by exactly four digits
    ["zone"] = Cg(sign * digit * digit * digit * digit, "zone") + V("obs-zone"),
  }
end
--- Build the address rules of RFC 2822 Sections 3.4 and 3.4.1.
-- Returns a table mapping rule names to LPeg patterns; the rules refer
-- to lexical and obs-* rules defined elsewhere through V"name".
-- setfenv(1, lpeg) makes P, R and V resolve to the lpeg functions.
-- @return table of rule name -> pattern
local address = function ()
  setfenv(1, lpeg)
  return {
    -- address specification
    ["address"] = V"mailbox" + V"group", -- RFC 2822 Section 3.4
    ["mailbox"] = V"name-addr" + V"addr-spec", -- RFC 2822 Section 3.4
    ["name-addr"] = V"display-name"^-1 * V"angle-addr", -- RFC 2822 Section 3.4
    ["angle-addr"] = (V"CFWS"^-1 * P"<" * V"addr-spec" * P">" * V"CFWS"^-1) + V"obs-angle-addr", -- RFC 2822 Section 3.4
    -- the member list is optional: display-name ":" [mailbox-list / CFWS] ";"
    -- so that an empty group such as "name:;" is accepted
    ["group"] = V"display-name" * P":" * (V"mailbox-list" + V"CFWS")^-1 * P";" * V"CFWS"^-1, -- RFC 2822 Section 3.4
    ["display-name"] = V"phrase", -- RFC 2822 Section 3.4
    ["mailbox-list"] = (V"mailbox" * (P"," * V"mailbox")^0) + V"obs-mailbox-list", -- RFC 2822 Section 3.4
    ["address-list"] = (V"address" * (P"," * V"address")^0) + V"obs-address-list", -- RFC 2822 Section 3.4
    -- address specification (name@host.domain)
    ["addr-spec"] = V"local-part" * P"@" * V"domain", -- RFC 2822 Section 3.4.1
    ["local-part"] = V"dot-atom" + V"quoted-string" + V"obs-local-part", -- RFC 2822 Section 3.4.1
    ["domain"] = V"dot-atom" + V"domain-literal" + V"obs-domain", -- RFC 2822 Section 3.4.1
    ["domain-literal"] = V"CFWS"^-1 * P"[" * (V"FWS"^-1 * V"dcontent")^0 * V"FWS"^-1 * P"]" * V"CFWS"^-1, -- RFC 2822 Section 3.4.1
    ["dcontent"] = V"dtext" + V"quoted-pair", -- RFC 2822 Section 3.4.1
    ["dtext"] = V"NO-WS-CTL" + R"\33\90" + R"\94\126", -- RFC 2822 Section 3.4.1
  }
end
-- Strict mode for the obsolete syntax: every obs-* rule referenced by
-- the grammar is bound to lpeg.P(false), a pattern that never matches.
local obs_strict = {}
for _, rule in ipairs({
  "obs-FWS",
  "obs-qp",
  "obs-phrase",
  "obs-utext",
  "obs-day-of-week",
  "obs-year",
  "obs-month",
  "obs-day",
  "obs-hour",
  "obs-minute",
  "obs-second",
  "obs-zone",
  "obs-angle-addr",
  "obs-mailbox-list",
  "obs-address-list",
  "obs-local-part",
  "obs-domain",
}) do
  obs_strict[rule] = lpeg.P(false)
end
-- Table that answers any key with a never-matching pattern.
-- NOTE(review): this local hides the standard `os` library for the rest
-- of the file.
local os = setmetatable({}, {
  __index = function ()
    return lpeg.P(false)
  end,
})
--- Merge several rule sets into one new table.
-- Arguments are processed left to right:
--   * table   - every key/value pair is copied; later arguments
--               overwrite earlier ones on equal keys
--   * string  - stored at index 1 of the result
--   * boolean - accepted and ignored
-- Any other type raises an error naming the argument position and type.
-- @param ... tables, strings or booleans
-- @return a new table with the merged content
local join_set = function (...)
  local merged = {}
  for idx = 1, select('#', ...) do
    local arg = select(idx, ...)
    local kind = type(arg)
    if kind == 'table' then
      for key, value in pairs(arg) do
        merged[key] = value
      end
    elseif kind == 'string' then
      merged[1] = arg
    elseif kind ~= 'boolean' then
      error('join_set: bad argument number '..idx..' of type '..kind)
    end
    -- booleans fall through untouched
    -- TODO: check no overwrite
  end
  return merged
end
-- Module result: one table holding every rule. The sets are merged in
-- this order, so a later set wins when two of them define the same name.
return join_set(
  core_values,
  lex_tokens(),
  date_time(),
  address(),
  obs_strict
)