debugging functions and fixes
[mime-lua.git] / mime.lua
blobcbcb47853212c8fc20aaf92a8d99c722114fd190
1 #!/usr/bin/lua
3 require 'lpeg'
5 -- MIME implementation in Lua
7 -- XXX: at section 3.2.5
10 local string = string
11 local type = type
12 local tostring = tostring
13 local select = select
14 local setfenv = setfenv
15 local CRLF = '\r\n'
16 local lpeg = lpeg
-- Concatenate every argument, converted with tostring, into one string.
--
-- Used as an LPeg function capture, so it may receive one argument per
-- captured character; the pieces are collected and joined once with
-- table.concat instead of growing a string with `..` on every step.
--
-- @param ... any values (nil included; it is rendered as "nil")
-- @return a single string; the empty string when called without arguments
local join = function (...)
  -- select('#', ...) gives the true count even with nil arguments
  local n = select('#', ...)
  local parts = { ... }
  for i = 1, n do
    -- converting in place also fills any nil hole, so concat sees a sequence
    parts[i] = tostring(parts[i])
  end
  return table.concat(parts, '', 1, n)
end
26 -- rfc2822 Section 2.1:
28 -- A message consists of header fields (collectively called "the header
29 -- of the message") followed, optionally, by a body. The header is a
30 -- sequence of lines of characters with special syntax as defined in
31 -- this standard. The body is simply a sequence of characters that
32 -- follows the header and is separated from the header by an empty line
33 -- (i.e., a line with nothing preceding the CRLF).
-- Split a raw message into its header and its body.
--
-- The header ends at the first empty line, i.e. the first CRLF CRLF pair:
-- the first CRLF terminates the last header line, the second one is the
-- empty separator line. Neither is part of the returned header.
--
-- The body is optional in a message, so content without an empty line is
-- treated as header-only: the whole content (minus one trailing CRLF, if
-- any) is the header and the body is the empty string.
--
-- @param c the complete message as a string
-- @return header string, body string (both always strings)
-- Raises an error when c is not a string.
local split_content = function (c)
  assert(type(c) == 'string', 'split_content: message must be a string')
  -- NOTE: there is no guarantee that the CRLF separating header
  -- and body is the first one. However no headers admit a blank
  -- line so far
  local first, last = string.find(c, CRLF..CRLF, 1, true) -- plain search
  if first then
    return string.sub(c, 1, first - 1), string.sub(c, last + 1)
  end
  -- header-only message: drop the CRLF that terminates the last header line
  local h = c
  if string.sub(h, -2) == CRLF then
    h = string.sub(h, 1, -3)
  end
  return h, ''
end
44 -- rfc2822 Section 2.2.3
46 -- The process of moving from this folded multiple-line representation
47 -- of a header field to its single line representation is called
48 -- "unfolding". Unfolding is accomplished by simply removing any CRLF
49 -- that is immediately followed by WSP. Each header field should be
50 -- treated in its unfolded form for further syntactic and semantic
51 -- evaluation.
-- Unfold a header: remove every CRLF that is immediately followed by
-- white space (space or TAB). The white-space character itself is kept,
-- so a line folded with a TAB still contains that TAB after unfolding.
--
-- @param h header text, possibly folded over several lines
-- @return the unfolded header (exactly one value)
local unfold_header = function (h)
  -- the outer parentheses discard gsub's second result (the match count)
  return (string.gsub(h, CRLF..'([ \t])', '%1'))
end
57 -- rfc2822 Section 2.1
59 -- At the most basic level, a message is a series of characters. A
60 -- message that is conformant with this standard is comprised of
61 -- characters with values in the range 1 through 127 and interpreted as
62 -- US-ASCII characters [ASCII]. For brevity, this document sometimes
63 -- refers to this range of characters as simply "US-ASCII characters".
66 -- rfc2822 Section 2.1.1
68 -- There are two limits that this standard places on the number of
69 -- characters in a line. Each line of characters MUST be no more than
70 -- 998 characters, and SHOULD be no more than 78 characters, excluding
71 -- the CRLF.
-- Placeholder for the whole-message checks (character range and line
-- length limits). Nothing is inspected yet: the argument is ignored and
-- both results are always nil.
--
-- @param c message content (currently unused)
-- @return err, warn -- both nil until the check is implemented
local function check_content(c)
  -- TODO: implement in a portable way
  return nil, nil
end
79 -- rfc2822 Section 2.3
81 -- The body of a message is simply lines of US-ASCII characters. The
82 -- only two limitations on the body are as follows:
84 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
85 -- independently in the body.
87 -- - Lines of characters in the body MUST be limited to 998 characters,
89 -- and SHOULD be limited to 78 characters, excluding the CRLF.
-- Placeholder for the body checks (bare CR / LF and line length limits).
-- Nothing is inspected yet: the argument is ignored and both results are
-- always nil.
--
-- @param b message body (currently unused)
-- @return err, warn -- both nil until the check is implemented
local function check_body(b)
  -- TODO: implement
  return nil, nil
end
97 -- some core values (not implemented literally)
98 -- see RFC 2234 Section 6.1
-- Subset of the ABNF core rules, keyed by rule name, as LPeg patterns.
local core_values = {
  ALPHA = lpeg.R("AZ", "az"), -- A-Z / a-z
  BIT   = lpeg.S("01"),       -- "0" / "1"
  CHAR  = lpeg.R("\1\127"),   -- any 7-bit character except NUL
  CR    = lpeg.P("\r"),       -- carriage return
  CRLF  = lpeg.P("\r\n"),     -- line terminator
  DIGIT = lpeg.R("09"),       -- 0-9
  LF    = lpeg.P("\n"),       -- line feed
  WSP   = lpeg.S(" \t"),      -- space or horizontal tab
}
-- lexical tokens used in the specification
-- TODO: could be optimized
--       (look for optimizations that do not obfuscate the grammar)
-- TODO: write in a less obfuscated way
--       (check whether some parentheses can be removed)
-- Build the RFC 2822 lexical-token rules (Sections 3.2.1 to 3.2.6).
--
-- Returns a new table mapping rule names to LPeg patterns. The rules
-- reference each other, the core rules (ALPHA, DIGIT, WSP, CRLF) and the
-- obs-* rules through lpeg.V, so the table must be merged with tables
-- defining those names before it can be used as a grammar.
--
-- The LPeg constructors are bound to locals instead of switching the
-- function environment with setfenv.
local lex_tokens = function ()
  local P, R, C, V = lpeg.P, lpeg.R, lpeg.C, lpeg.V
  return {
    -- control characters without whitespaces
    -- (%d1-8 / %d11 / %d12 / %d14-31 / %d127)
    ["NO-WS-CTL"] = R("\1\8") +
                    P("\11") +
                    P("\12") +
                    R("\14\31") +
                    P("\127"), -- RFC 2822 Section 3.2.1
    -- a character in a text
    ["text"] = R("\1\9") +
               P("\11") +
               P("\12") +
               R("\14\127"), -- RFC 2822 Section 3.2.1
    -- a special character
    ["specials"] = P("(") + P(")") +
                   P("<") + P(">") +
                   P("[") + P("]") +
                   P(":") + P(";") +
                   P("@") + P("\\") +
                   P(",") + P(".") +
                   P("\""), -- RFC 2822 Section 3.2.1
    -- a quoted pair should return only the second character
    ["quoted-pair"] = (P("\\") * C(V("text"))) + V("obs-qp"), -- RFC 2822 Section 3.2.2
    -- a folding white space (a whitespace that can include a CRLF)
    -- should be substituted by a single whitespace
    ["FWS"] = (((V("WSP")^0 * V("CRLF"))^-1 * V("WSP")^1) + V("obs-FWS")) / " ", -- RFC 2822 Section 3.2.3
    -- a text character allowed inside a comment
    ["ctext"] = V("NO-WS-CTL") +
                R("\33\39") +
                R("\42\91") +
                R("\93\126"), -- RFC 2822 Section 3.2.3
    -- the content of a comment (comments can nest)
    ["ccontent"] = V("ctext") + V("quoted-pair") + V("comment"), -- RFC 2822 Section 3.2.3
    -- an actual comment
    -- should be substituted by a single whitespace
    ["comment"] = (P("(") * (V("FWS")^-1 * V("ccontent"))^0 * V("FWS")^-1 * P(")")) / " ", -- RFC 2822 Section 3.2.3
    -- a comment or a folding white space
    -- should be substituted by a single whitespace
    --
    -- Either one or more comments (each optionally preceded by FWS, with
    -- optional trailing FWS), or a folding white space on its own.
    --
    -- Folding white spaces should not be placed in a way that
    -- creates lines containing only whitespaces.
    -- This requirement is not necessarily enforced by this grammar
    ["CFWS"] = (((V("FWS")^-1 * V("comment"))^1 * V("FWS")^-1) + V("FWS")) / " ", -- RFC 2822 Section 3.2.3
    -- character that can appear in an atom
    ["atext"] = V("ALPHA") + V("DIGIT") +
                P("!") + P("#") +
                P("$") + P("%") +
                P("&") + P("'") +
                P("*") + P("+") +
                P("-") + P("/") +
                P("=") + P("?") +
                P("^") + P("_") +
                P("`") + P("{") +
                P("|") + P("}") +
                P("~"), -- RFC 2822 Section 3.2.4
    -- an atom is equal to the content only discarding comments and whitespace
    ["atom"] = V("CFWS")^-1 * C(V("atext")^1) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
    -- an atom with dots is only the content
    ["dot-atom"] = V("CFWS")^-1 * C(V("dot-atom-text")) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
    -- the content of an atom text with dots
    ["dot-atom-text"] = V("atext")^1 * (P(".") * V("atext")^1)^0, -- RFC 2822 Section 3.2.4
    -- character that can appear in a quoted string
    ["qtext"] = V("NO-WS-CTL") +
                P("\33") +
                R("\35\91") +
                R("\93\126"), -- RFC 2822 Section 3.2.5
    -- character or quoted pair (both can appear in a quoted string)
    -- it is equivalent to the character itself or to the result of the
    -- quoted pair
    ["qcontent"] = C(V("qtext")) + V("quoted-pair"), -- RFC 2822 Section 3.2.5
    -- a quoted string is equal to its content
    -- `/` and `*` share one precedence level, so the join capture is
    -- parenthesized: it must see only what lies between the two quotes,
    -- not the leading CFWS capture or the opening quote itself.
    ["quoted-string"] = V("CFWS")^-1 *
                        P("\"") *
                        (((V("FWS")^-1 * V("qcontent"))^0 * V("FWS")^-1) / join) *
                        P("\"") *
                        V("CFWS")^-1, -- RFC 2822 Section 3.2.5
    ["word"] = V("atom") + V("quoted-string"), -- RFC 2822 Section 3.2.6
    -- what should these be equal to?
    ["phrase"] = V("word")^1 + V("obs-phrase"), -- RFC 2822 Section 3.2.6
    ["utext"] = V("NO-WS-CTL") + R("\33\126") + V("obs-utext"), -- RFC 2822 Section 3.2.6
    ["unstructured"] = (V("FWS")^-1 * V("utext"))^0 * V("FWS")^-1, -- RFC 2822 Section 3.2.6
  }
end
-- Strict definitions of the obs-* rules: every one of them is a pattern
-- that never matches (lpeg.P(false)).
local obs_strict = {}
for _, rule in ipairs{ "obs-FWS", "obs-qp", "obs-phrase", "obs-utext" } do
  obs_strict[rule] = lpeg.P(false)
end
-- Merge any number of arguments into one new table, left to right.
--
--   table   : every key/value pair is copied; later arguments overwrite
--             keys set by earlier ones
--   string  : stored at index 1 of the result
--   boolean : accepted and ignored for now
--
-- Any other argument type (nil included) raises an error naming the
-- position and type of the offending argument.
--
-- @param ... tables, strings and booleans
-- @return the merged table
local join_set = function (...)
  local merged = {}
  local args = { ... }
  for index = 1, select('#', ...) do
    local item = args[index]
    local kind = type(item)
    if kind == 'table' then
      for name, value in pairs(item) do
        merged[name] = value
      end
    elseif kind == 'string' then
      merged[1] = item
    elseif kind ~= 'boolean' then
      error('join_set: bad argument number '..index..' of type '..kind)
    end
    -- booleans fall through untouched
    -- TODO: check no overwrite
  end
  return merged
end
-- Merge the core rules, the lexical tokens and the strict obs-* rules
-- (later tables win on duplicate names) and return the result as the
-- value of this chunk.
local grammar = join_set(core_values, lex_tokens(), obs_strict)
return grammar