-- MIME implementation in Lua
-- XXX: at section 3.2.3
-- rfc2822 Section 2.1:
--
-- A message consists of header fields (collectively called "the header
-- of the message") followed, optionally, by a body. The header is a
-- sequence of lines of characters with special syntax as defined in
-- this standard. The body is simply a sequence of characters that
-- follows the header and is separated from the header by an empty line
-- (i.e., a line with nothing preceding the CRLF).
-- Split a raw message into its header and its body.
--
-- The split point is the first empty line, i.e. the first place where two
-- CRLF sequences follow each other (rfc2822 Section 2.1).
--
-- c: the complete message, as a string
-- returns: two strings, the header (without the separating empty line) and
--          the body (empty when nothing follows the separator)
-- raises: an error when c is not a string or contains no empty line
local split_content = function (c)
  assert(type(c) == 'string', 'split_content: content must be a string')
  -- NOTE: the first CRLF CRLF pair is taken as the header/body separator.
  -- Nothing guarantees that this pair is the real separator; the split
  -- relies on no header field admitting an empty line.
  local h, b = string.match(c, '^(.-)' .. CRLF .. CRLF .. '(.*)$')
  assert(type(h) == 'string' and type(b) == 'string',
         'split_content: no empty line separating header and body')
  return h, b
end
-- rfc2822 Section 2.2.3
--
-- The process of moving from this folded multiple-line representation
-- of a header field to its single line representation is called
-- "unfolding". Unfolding is accomplished by simply removing any CRLF
-- that is immediately followed by WSP. Each header field should be
-- treated in its unfolded form for further syntactic and semantic
-- evaluation.
-- Unfold a header: remove every CRLF that is immediately followed by WSP.
--
-- WSP is a space or a horizontal tab (rfc2822 Section 2.2.2); the
-- whitespace character itself is kept, only the CRLF in front of it goes.
--
-- h: the (possibly folded) header text, as a string
-- returns: the unfolded text, followed by the number of CRLFs removed
--          (both results of string.gsub are passed through)
local unfold_header = function (h)
  -- capture the WSP character so that it can be put back unchanged
  return string.gsub(h, CRLF .. '([ \t])', '%1')
end
-- rfc2822 Section 2.1
--
-- At the most basic level, a message is a series of characters. A
-- message that is conformant with this standard is comprised of
-- characters with values in the range 1 through 127 and interpreted as
-- US-ASCII characters [ASCII]. For brevity, this document sometimes
-- refers to this range of characters as simply "US-ASCII characters".
-- rfc2822 Section 2.1.1
--
-- There are two limits that this standard places on the number of
-- characters in a line. Each line of characters MUST be no more than
-- 998 characters, and SHOULD be no more than 78 characters, excluding
-- the CRLF.
-- NOTE(review): only the signature of check_content and a TODO are visible
-- here; the body is missing from this text. Presumably it is meant to
-- enforce the character-range and line-length limits quoted above --
-- confirm against the complete file before relying on it.
62 local check_content
= function (c
)
64 -- TODO: implement in a portable way
-- rfc2822 Section 2.3
--
-- The body of a message is simply lines of US-ASCII characters. The
-- only two limitations on the body are as follows:
--
-- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
--   independently in the body.
--
-- - Lines of characters in the body MUST be limited to 998 characters,
--   and SHOULD be limited to 78 characters, excluding the CRLF.
-- NOTE(review): only the signature of check_body is visible here; the body
-- is missing from this text. Presumably it is meant to enforce the two body
-- limitations quoted above -- confirm against the complete file.
80 local check_body
= function (b
)
-- some core values (not implemented literally)
-- see RFC 2234 Section 6.1
-- ALPHA: one ASCII letter, lower or upper case
["ALPHA"] = lpeg.R("az", "AZ"),
-- BIT: the character "0" or the character "1"
["BIT"] = lpeg.S("01"),
-- CHAR: any 7-bit character except NUL (codes 1 through 127)
["CHAR"] = lpeg.R("\1\127"),
-- CR: carriage return (code 13)
["CR"] = lpeg.P("\r"),
-- DIGIT: one decimal digit.
-- The key is spelled "DIGIT" (RFC 2234 Section 6.1) because the "atext"
-- rule refers to it as lpeg.V("DIGIT"); under the former spelling "DIGI"
-- that reference pointed at a rule that did not exist.
["DIGIT"] = lpeg.R("09"),
-- CRLF: carriage return immediately followed by line feed
["CRLF"] = lpeg.P("\r\n"),
-- LF: line feed (code 10)
["LF"] = lpeg.P("\n"),
-- lexical tokens used in the specification
-- TODO: could be optimized
--       check for non-obfuscated optimizations
-- TODO: write in a less obfuscated way
--       check if parentheses can be removed
-- NO-WS-CTL: control characters without whitespaces
-- (RFC 2822 Section 3.2.1):
--   %d1-8 / %d11 / %d12 / %d14-31 / %d127
-- Codes 9 (HTAB), 10 (LF) and 13 (CR) are deliberately left out.
["NO-WS-CTL"] = lpeg.R("\1\8", "\14\31") + lpeg.S("\11\12\127"),
-- text: a character in a text, i.e. any character with a code from 1 to
-- 127 except LF (10) and CR (13) (RFC 2822 Section 3.2.1):
--   %d1-9 / %d11 / %d12 / %d14-127
["text"] = lpeg.R("\1\9", "\11\12", "\14\127"),
-- specials: one of the special characters
--   ( ) < > [ ] : ; @ \ , . "
-- (RFC 2822 Section 3.2.1)
["specials"] = lpeg.S("()<>[]:;@\\,.\""),
-- quoted-pair: a backslash followed by one "text" character, or the
-- obsolete form "obs-qp" (RFC 2822 Section 3.2.2).
-- In the first form only the character after the backslash is captured,
-- so a quoted pair yields just its second character.
["quoted-pair"] = lpeg.P("\\") * lpeg.C(lpeg.V("text"))
  + lpeg.V("obs-qp"),
-- FWS: folding white space, i.e. white space that may include one CRLF
-- (RFC 2822 Section 3.2.3):
--   an optional (any WSP, then CRLF) followed by at least one WSP,
--   or the obsolete form "obs-FWS".
-- The match produces the capture value " ", so the whole run stands for a
-- single whitespace.
["FWS"] = (
    (lpeg.V("WSP")^0 * lpeg.V("CRLF"))^-1 * lpeg.V("WSP")^1
    + lpeg.V("obs-FWS")
  ) / " ",
-- ctext: a text character allowed inside a comment, i.e. anything
-- printable except "(", ")" and "\", plus the non-whitespace controls
-- (RFC 2822 Section 3.2.3):
--   NO-WS-CTL / %d33-39 / %d42-91 / %d93-126
["ctext"] = lpeg.V("NO-WS-CTL")
  + lpeg.R("\33\39", "\42\91", "\93\126"),
-- ccontent: one item of comment content -- a ctext character, a quoted
-- pair, or a whole nested comment (RFC 2822 Section 3.2.3)
["ccontent"] =
  lpeg.V("ctext")
  + lpeg.V("quoted-pair")
  + lpeg.V("comment"),
-- comment: "(" , any number of ccontent items each optionally preceded by
-- FWS, an optional trailing FWS, then ")" (RFC 2822 Section 3.2.3).
-- The match produces the capture value " ", so the whole comment stands
-- for a single whitespace.
["comment"] = (
    lpeg.P("(")
    * (lpeg.V("FWS")^-1 * lpeg.V("ccontent"))^0
    * lpeg.V("FWS")^-1
    * lpeg.P(")")
  ) / " ",
-- CFWS: a comment or a folding white space (RFC 2822 Section 3.2.3).
-- The match produces the capture value " ", so the whole run stands for a
-- single whitespace.
--
-- Folding white spaces should not be placed in a way that creates lines
-- containing only whitespaces. This requirement is not enforced by this
-- grammar.
--
-- The RFC 2822 rule is  *([FWS] comment) (([FWS] comment) / FWS).
-- LPeg repetitions never give back what they matched, so writing the rule
-- literally lets the leading loop swallow the last comment and the rule
-- then fails on input that ends in a comment. The form used here accepts
-- the same language without that problem (it is the RFC 5322 wording):
--   (1*([FWS] comment) [FWS]) / FWS
-- The trailing alternative refers to the FWS rule (lpeg.V), not to the
-- literal text "FWS".
["CFWS"] = (
    (lpeg.V("FWS")^-1 * lpeg.V("comment"))^1 * lpeg.V("FWS")^-1
    + lpeg.V("FWS")
  ) / " ",
-- atext: a character that can appear in an atom -- a letter, a digit, or
-- one of  ! # $ % & ' * + - / = ? ^ _ ` { | } ~
-- (RFC 2822 Section 3.2.4)
["atext"] = lpeg.V("ALPHA") + lpeg.V("DIGIT")
  + lpeg.S("!#$%&'*+-/=?^_`{|}~"),
-- atom: a run of one or more atext characters, optionally surrounded by
-- CFWS (RFC 2822 Section 3.2.4).
-- Only the atext run sits inside the lpeg.C capture; the surrounding
-- comments and white space are matched outside of it.
["atom"] =
  lpeg.V("CFWS")^-1
  * lpeg.C(lpeg.V("atext")^1)
  * lpeg.V("CFWS")^-1,
-- dot-atom: a dot-atom-text optionally surrounded by CFWS
-- (RFC 2822 Section 3.2.4).
-- Only the dot-atom-text sits inside the lpeg.C capture.
["dot-atom"] =
  lpeg.V("CFWS")^-1
  * lpeg.C(lpeg.V("dot-atom-text"))
  * lpeg.V("CFWS")^-1,
-- dot-atom-text: one or more atext characters, followed by any number of
-- groups made of a "." and one or more atext characters
-- (RFC 2822 Section 3.2.4)
["dot-atom-text"] =
  lpeg.V("atext")^1
  * (lpeg.P(".") * lpeg.V("atext")^1)^0,