mime.lua

   1 #!/usr/bin/lua
   2
   3 require 'lpeg'
   4
   5 -- MIME implementation in Lua
   6
   7 -- XXX: at section 3.2.5
   8
   9
  10 local string = string
  11 local type = type
  12 local tostring = tostring
  13 local select = select
  14 local setfenv = setfenv
  15 local CRLF = '\r\n'
  16 local lpeg = lpeg
  17
  18 local join = function (...)
  19         local ret = ''
  20         for i = 1, select('#', ...) do
  21                 ret = ret .. tostring(select(i, ...))
  22         end
  23         return ret
  24 end
  25
  26 -- rfc2822 Section 2.1:
  27 --
  28 -- A message consists of header fields (collectively called "the header
  29 -- of the message") followed, optionally, by a body.  The header is a
  30 -- sequence of lines of characters with special syntax as defined in
  31 -- this standard. The body is simply a sequence of characters that
  32 -- follows the header and is separated from the header by an empty line
  33 -- (i.e., a line with nothing preceding the CRLF).
  34 --
  35 local split_content = function (c)
  36         -- NOTE: there is no guarantee that the CRLF separating header
  37         -- and body is the first one. However no headers admit a blank
  38         -- line so far
  39         local h, b = string.match(c, '^(.-)'..CRLF..CRLF..'(.*)$')
  40         assert(type(h)=='string' and type(b)=='string')
  41         return h, b
  42 end
  43
  44 -- rfc2822 Section 2.2.3
  45 --
  46 -- The process of moving from this folded multiple-line representation
  47 -- of a header field to its single line representation is called
  48 -- "unfolding". Unfolding is accomplished by simply removing any CRLF
  49 -- that is immediately followed by WSP.  Each header field should be
  50 -- treated in its unfolded form for further syntactic and semantic
  51 -- evaluation.
  52 --
  53 local unfold_header = function (h)
  54         return string.gsub(h, CRLF..'[ \t]', ' ')
  55 end
  56
  57 -- rfc2822 Section 2.1
  58 --
  59 -- At the most basic level, a message is a series of characters.  A
  60 -- message that is conformant with this standard is comprised of
  61 -- characters with values in the range 1 through 127 and interpreted as
  62 -- US-ASCII characters [ASCII].  For brevity, this document sometimes
  63 -- refers to this range of characters as simply "US-ASCII characters".
  64 --
  65
  66 -- rfc2822 Section 2.1.1
  67 --
  68 -- There are two limits that this standard places on the number of
  69 -- characters in a line. Each line of characters MUST be no more than
  70 -- 998 characters, and SHOULD be no more than 78 characters, excluding
  71 -- the CRLF.
  72 --
  73 local check_content = function (c)
  74         local err, warn
  75         -- TODO: implement in a portable way
  76         return err, warn
  77 end
  78
  79 -- rfc2822 Section 2.3
  80 --
  81 -- The body of a message is simply lines of US-ASCII characters.  The
  82 -- only two limitations on the body are as follows:
  83 --
  84 -- - CR and LF MUST only occur together as CRLF; they MUST NOT appear
  85 -- independently in the body.
  86 --
   87 -- - Lines of characters in the body MUST be limited to 998 characters,
   88 -- and SHOULD be limited to 78 characters, excluding the CRLF.
  90 --
  91 local check_body = function (b)
  92         local err, warn
  93         -- TODO: implement
  94         return err, warn
  95 end
  96
  97 -- some core values (not implemented literally)
  98 -- see RFC 2234 Section 6.1
  99 local core_values = {
 100         ["ALPHA"] = lpeg.R("AZ", "az"),
 101         ["BIT"] = lpeg.P("0") + lpeg.P("1"),
 102         ["CHAR"] = lpeg.R("\01\127"),
 103         ["CR"] = lpeg.P("\13"),
 104         ["CRLF"] = lpeg.P("\13\10"),
 105         ["DIGIT"] = lpeg.R("09"),
 106         ["LF"] = lpeg.P("\10"),
 107         ["WSP"] = lpeg.S("\32\09"),
 108 }
 109
 110 -- lexical tokens used in the specification
 111 -- TODO: could be optimized
 112 -- check for non-obfuscated optimizations
 113 -- TODO: write in a less obfuscated way
 114 -- check if parenthesis can be removed
 115 local lex_tokens = function ()
 116         setfenv(1, lpeg)
 117         return {
 118         -- control characters without whitespaces
 119         ["NO-WS-CTL"] = R("\1\8") +
 120         P("\11") +
 121         P("\12") +
 122         R("\14\13") +
 123         P("\127"), -- RFC 2822 Section 3.2.1
 124         -- a character in a text
 125         ["text"] =      R("\1\9") +
 126         P("\11") +
 127         P("\12") +
 128         R("\14\127"), -- RFC 2822 Section 3.2.1
 129         -- a special character
 130         ["specials"] =  P("(") + P(")") +
 131         P("<") + P(">") +
 132         P("[") + P("]") +
 133         P(":") + P(";") +
 134         P("@") + P("\\") +
 135         P(",") + P(".") +
 136         P("\""), -- RFC 2822 Section 3.2.1
 137         -- a quoted pair should return only the second character
 138         ["quoted-pair"] = (P("\\") * C(V("text"))) + V("obs-qp"), -- RFC 2822 Section 3.2.2
 139         -- a folding white space (a whitespace that can include a CRLF)
 140         -- should be substituted by a single whitespace
 141         ["FWS"] = (((V("WSP")^0 * V("CRLF"))^-1 * V("WSP")^1) + V("obs-FWS")) / " ", -- RFC 2822 Section 3.2.3
 142         -- a text character allowed inside a comment
 143         ["ctext"] =     V("NO-WS-CTL") +
 144         R("\33\39") +
 145         R("\42\91") +
 146         R("\93\126"), -- RFC 2822 Section 3.2.3
 147         -- the content of a comment (comments can nest)
 148         ["ccontent"] =  V("ctext") + V("quoted-pair") + V("comment"), -- RFC 2822 Section 3.2.3
 149         -- an actual comment
 150         -- should be substituted by a single whitespace
 151         ["comment"] =   (P("(") * (V("FWS")^-1 * V("ccontent"))^0 * V("FWS")^-1 * P(")")) / " ", -- RFC 2822 Section 3.2.3
 152         -- a comment or a folding white space
 153         -- should be substituted by a single whitespace
 154         --
 155         -- Folding white spaces should not be placed in a way that
 156         -- creates lines containing only whitespaces.
 157         -- This requirement Is not necessarily enforced by this grammar
 158         ["CFWS"] =      ((V("FWS")^-1 * V("comment")) * (V("FWS")^-1 * V("comment"))^0 * V("FWS")^-1) / " ", -- RFC 2822 Section 3.2.3
 159         -- character that can appear in an atom
 160         ["atext"] =     V("ALPHA") + V("DIGIT") +
 161         P("!") + P("#") +
 162         P("$") + P("%") +
 163         P("&") + P("'") +
 164         P("*") + P("+") +
 165         P("-") + P("/") +
 166         P("=") + P("?") +
 167         P("^") + P("_") +
 168         P("`") + P("{") +
 169         P("|") + P("}") +
 170         P("~"), -- RFC 2822 Section 3.2.4
 171         -- an atom is equal to the content only discarding comments and whitespace
 172         ["atom"] =      V("CFWS")^-1 * C(V("atext")^1) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
 173         -- an atom with dots is only the content
 174         ["dot-atom"] =  V("CFWS")^-1 * C(V("dot-atom-text")) * V("CFWS")^-1, -- RFC 2822 Section 3.2.4
 175         -- the content of an atom text with dots
 176         ["dot-atom-text"] =     V("atext")^1 * (P(".") * V("atext")^1)^0, -- RFC 2822 Section 3.2.4
 177         -- character that can appear in a quoted string
 178         ["qtext"] =     V("NO-WS-CTL") +
 179         P("\33") +
 180         R("\35\91") +
 181         R("\93\126"), -- RFC 2822 Section 3.2.5
 182         -- character or quoted pair (both can appear in a quoted string)
 183         -- it is equivalent to the character itself or to the result of the
 184         -- quoted pair
 185         ["qcontent"] =  C(V("qtext")) + V("quoted-pair"),  -- RFC 2822 Section 3.2.5
 186         -- a quoted string is equal to its content
 187         ["quoted-string"] =     V("CFWS")^-1 *
 188         P("\"") * ((V("FWS")^-1 * V("qcontent"))^0 * V("FWS")^-1)/join * P("\"") * V("CFWS")^-1, -- RFC 2822 Section 3.2.5
 189         ["word"] =      V("atom") + V("quoted-string"), -- RFC 2822 Section 3.2.6
 190         -- what should these be equal to?
 191         ["phrase"] =    V("word")^1 + V("obs-phrase"), -- RFC 2822 Section 3.2.6
 192         ["utext"] =     V("NO-WS-CTL") + R("\33\126") + V("obs-utext"), -- RFC 2822 Section 3.2.6
 193         ["unstructured"] =      (V("FWS")^-1 * V("utext"))^0 * V("FWS")^-1, -- RFC 2822 Section 3.2.6
 194 }
 195 end
 196
 197 local obs_strict = {
 198         ["obs-FWS"] = lpeg.P(false),
 199         ["obs-qp"] = lpeg.P(false),
 200         ["obs-phrase"] = lpeg.P(false),
 201         ["obs-utext"] = lpeg.P(false),
 202 }
 203
 204 local join_set = function (...)
 205         local n = select('#', ...)
 206         local ret = {}
 207         for i = 1, n do
 208                 local t = select(i, ...)
 209                 if type(t)=='table' then
 210                         for k, v in pairs(t) do
 211                                 ret[k] = v
 212                         end
 213                 elseif type(t)=='string' then
 214                         ret[1] = t
 215                 elseif type(t)=='boolean' then
 216                         -- TODO: check no overwrite
 217                 else
 218                         error('join_set: bad argument number '..i..' of type '..type(t))
 219                 end
 220         end
 221         return ret
 222 end
 223
 224 local gr = join_set(core_values, lex_tokens(), obs_strict)
 225
 226 return gr
 227