more letters are needed
[mime-lua.git] / mime.lua
bloba9ff1440005b5603d2b0b3d0d11d51ae2fe16a80
1 #!/usr/bin/lua
-- Keep the LPeg module in a local instead of relying on require() to
-- create a global named "lpeg": only old, module()-based LPeg builds do
-- that, while the value returned by require() is always the module table.
local lpeg = require 'lpeg'

-- MIME implementation in Lua

-- XXX: at section 3.2.3

-- cache the globals used below in locals
local string = string
local type = type
-- line terminator used by RFC 2822 messages
local CRLF = '\r\n'
14 -- rfc2822 Section 2.1:
16 -- A message consists of header fields (collectively called "the header
17 -- of the message") followed, optionally, by a body. The header is a
18 -- sequence of lines of characters with special syntax as defined in
19 -- this standard. The body is simply a sequence of characters that
20 -- follows the header and is separated from the header by an empty line
21 -- (i.e., a line with nothing preceding the CRLF).
-- Split a raw message into its header section and its body.
--
-- The header ends at the first empty line, i.e. at the first pair of
-- consecutive CRLFs; that separator is part of neither returned value.
-- The body is optional (rfc2822 Section 2.1), so a message without a
-- separator is treated as header-only: the whole content, minus one
-- trailing CRLF if present, is the header and the body is ''.
--
-- c: the complete message, as a string
-- returns: the header section, then the body (both strings)
-- raises: an error when c is not a string
local split_content = function (c)
    if type(c) ~= 'string' then
        error('split_content: expected a string, got '..type(c), 2)
    end
    -- NOTE: there is no guarantee that the CRLF separating header
    -- and body is the first one. However no headers admit a blank
    -- line so far
    local h, b = string.match(c, '^(.-)'..CRLF..CRLF..'(.*)$')
    if h == nil then
        -- no empty line: header-only message. The parentheses drop the
        -- substitution count that gsub returns as a second value.
        h = (string.gsub(c, CRLF..'$', ''))
        b = ''
    end
    return h, b
end
33 -- rfc2822 Section 2.2.3
35 -- The process of moving from this folded multiple-line representation
36 -- of a header field to its single line representation is called
37 -- "unfolding". Unfolding is accomplished by simply removing any CRLF
38 -- that is immediately followed by WSP. Each header field should be
39 -- treated in its unfolded form for further syntactic and semantic
40 -- evaluation.
-- Unfold a header: remove every CRLF that is immediately followed by
-- WSP, keeping the WSP character itself. WSP is either a space or a
-- horizontal tab, so both kinds of continuation line are unfolded.
--
-- h: the (possibly folded) header text
-- returns: the unfolded text, then the number of CRLFs removed (the two
--          values produced by string.gsub)
local unfold_header = function (h)
    return string.gsub(h, CRLF..'([ \t])', '%1')
end
46 -- rfc2822 Section 2.1
48 -- At the most basic level, a message is a series of characters. A
49 -- message that is conformant with this standard is comprised of
50 -- characters with values in the range 1 through 127 and interpreted as
51 -- US-ASCII characters [ASCII]. For brevity, this document sometimes
52 -- refers to this range of characters as simply "US-ASCII characters".
55 -- rfc2822 Section 2.1.1
57 -- There are two limits that this standard places on the number of
58 -- characters in a line. Each line of characters MUST be no more than
59 -- 998 characters, and SHOULD be no more than 78 characters, excluding
60 -- the CRLF.
-- Placeholder for the whole-message checks (character range and line
-- length limits quoted above). Nothing is verified yet: both results
-- are always nil.
--
-- c: the message content (currently unused)
-- returns: err, warn -- both nil for now
local function check_content(c)
    -- TODO: implement in a portable way
    local err, warn = nil, nil
    return err, warn
end
68 -- rfc2822 Section 2.3
70 -- The body of a message is simply lines of US-ASCII characters. The
71 -- only two limitations on the body are as follows:
73 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
74 -- independently in the body.
76 -- - Lines of characters in the body MUST be limited to 998 characters,
78 -- and SHOULD be limited to 78 characters, excluding the CRLF.
-- Placeholder for the body checks (lone CR/LF and line length limits
-- quoted above). Nothing is verified yet: both results are always nil.
--
-- b: the message body (currently unused)
-- returns: err, warn -- both nil for now
local function check_body(b)
    -- TODO: implement
    local err, warn = nil, nil
    return err, warn
end
86 -- some core values (not implemented literally)
87 -- see RFC 2234 Section 6.1
-- Core ABNF rules, keyed by the rule name that lpeg.V() references.
-- NOTE(review): the lexical tokens below also reference V("WSP"), which
-- is not defined in this table -- confirm it is provided elsewhere.
local core_values = {
    ["ALPHA"] = lpeg.R("AZ", "az"),        -- A-Z / a-z
    ["BIT"] = lpeg.P("0") + lpeg.P("1"),
    ["CHAR"] = lpeg.R("\01\127"),          -- any US-ASCII character but NUL
    ["CR"] = lpeg.P("\13"),                -- carriage return
    -- was spelled "DIGI", a name nothing refers to: atext looks this
    -- rule up as V("DIGIT")
    ["DIGIT"] = lpeg.R("09"),              -- 0-9
    ["CRLF"] = lpeg.P("\13\10"),           -- line terminator
    ["LF"] = lpeg.P("\10"),                -- line feed
}
98 -- lexical tokens used in the specification
99 -- TODO: could be optimized
100 -- check for non-obfuscated optimizations
101 -- TODO: write in a less obfuscated way
102 -- check if parenthesis can be removed
103 local lex_tokens = {
-- control characters that are not white space (RFC 2822 Section 3.2.1):
--   NO-WS-CTL = %d1-8 / %d11 / %d12 / %d14-31 / %d127
-- i.e. characters 1-31 and 127 except HTAB (9), LF (10) and CR (13)
["NO-WS-CTL"] = lpeg.R("\1\8") +
    lpeg.P("\11") +
    lpeg.P("\12") +
    lpeg.R("\14\31") + -- was "\14\13": an inverted range, which matches nothing
    lpeg.P("\127"),
-- one character of text: characters 1-127 except LF (10) and CR (13)
["text"] = lpeg.R("\1\9", "\14\127") + lpeg.S("\11\12"), -- RFC 2822 Section 3.2.1
-- one of the special characters: ( ) < > [ ] : ; @ \ , . "
["specials"] = lpeg.S("()<>[]:;@\\,.\""), -- RFC 2822 Section 3.2.1
-- a backslash followed by one text character; in this form only the
-- character after the backslash is captured. Anything else is left to
-- the "obs-qp" rule.
["quoted-pair"] = lpeg.P"\\" * lpeg.C(lpeg.V"text") + lpeg.V"obs-qp", -- RFC 2822 Section 3.2.2
-- folding white space: at least one WSP, optionally preceded by a line
-- break (itself optionally preceded by WSP), or the "obs-FWS" rule.
-- Whatever was matched, the value produced is a single space.
["FWS"] = ((lpeg.V"WSP"^0 * lpeg.V"CRLF")^-1 * lpeg.V"WSP"^1 + lpeg.V"obs-FWS") / " ", -- RFC 2822 Section 3.2.3
-- one character allowed directly inside a comment: NO-WS-CTL or a
-- printable character other than "(" (40), ")" (41) and "\" (92)
["ctext"] = lpeg.V("NO-WS-CTL") + lpeg.R("\33\39", "\42\91", "\93\126"), -- RFC 2822 Section 3.2.3
-- one item of comment content: a plain character, a quoted pair or a
-- nested comment, tried in that order
["ccontent"] = lpeg.V"ctext" + lpeg.V"quoted-pair" + lpeg.V"comment", -- RFC 2822 Section 3.2.3
-- a parenthesized comment: "(", any number of content items each
-- optionally preceded by FWS, optional trailing FWS, then ")".
-- The value produced for the whole comment is a single space.
["comment"] = (
    lpeg.P"(" *
    (lpeg.V"FWS"^-1 * lpeg.V"ccontent")^0 *
    lpeg.V"FWS"^-1 *
    lpeg.P")"
) / " ", -- RFC 2822 Section 3.2.3
-- a comment or a folding white space
-- should be substituted by a single whitespace
--
-- Folding white spaces should not be placed in a way that
-- creates lines containing only whitespaces.
-- This requirement is not enforced by this grammar
--
-- Written as "one or more ([FWS] comment), then optional FWS; or a
-- lone FWS". This accepts the same strings as the RFC 2822 rule
--   CFWS = *([FWS] comment) (([FWS] comment) / FWS)
-- but works with LPeg's possessive repetition: with the literal
-- transcription the leading "*" swallows every comment, so the
-- mandatory final comment could never match.
-- The lone-FWS alternative uses lpeg.V (the rule); it used to be
-- lpeg.P("FWS"), which matches the literal text "FWS".
["CFWS"] = ((lpeg.V("FWS")^-1 * lpeg.V("comment"))^1 * lpeg.V("FWS")^-1 + lpeg.V("FWS")) / " ", -- RFC 2822 Section 3.2.3
-- one character allowed in an atom: a letter, a digit or one of
-- ! # $ % & ' * + - / = ? ^ _ ` { | } ~
["atext"] = lpeg.V("ALPHA") + lpeg.V("DIGIT") + lpeg.S("!#$%&'*+-/=?^_`{|}~"), -- RFC 2822 Section 3.2.4
-- an atom: a run of one or more atext characters, which is captured,
-- with optional CFWS on either side
["atom"] = lpeg.V"CFWS"^-1 * lpeg.C(lpeg.V"atext"^1) * lpeg.V"CFWS"^-1, -- RFC 2822 Section 3.2.4
-- a dotted atom: the dot-atom-text is captured, with optional CFWS on
-- either side
["dot-atom"] = lpeg.V"CFWS"^-1 * lpeg.C(lpeg.V"dot-atom-text") * lpeg.V"CFWS"^-1, -- RFC 2822 Section 3.2.4
-- the text of a dotted atom: runs of atext characters joined by single
-- dots (no leading, trailing or doubled dot)
["dot-atom-text"] = lpeg.V"atext"^1 * (lpeg.P"." * lpeg.V"atext"^1)^0, -- RFC 2822 Section 3.2.4