CRLF
[ghsmtp.git] / Mailbox.cpp
blob1a3f88288c186d5444caf690f52f0d598ff696b6
#include "Mailbox.hpp"

#include <algorithm>
#include <iterator>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/abnf.hpp>

#include <glog/logging.h>

using namespace tao::pegtl;
using namespace tao::pegtl::abnf;
19 namespace RFC3629 {
20 // clang-format off
22 // 4. Syntax of UTF-8 Byte Sequences
24 struct UTF8_tail : range<'\x80', '\xBF'> {};
26 struct UTF8_1 : range<0x00, 0x7F> {};
28 struct UTF8_2 : seq<range<'\xC2', '\xDF'>, UTF8_tail> {};
30 struct UTF8_3 : sor<seq<one<'\xE0'>, range<'\xA0', '\xBF'>, UTF8_tail>,
31 seq<range<'\xE1', '\xEC'>, rep<2, UTF8_tail>>,
32 seq<one<'\xED'>, range<'\x80', '\x9F'>, UTF8_tail>,
33 seq<range<'\xEE', '\xEF'>, rep<2, UTF8_tail>>> {};
35 struct UTF8_4 : sor<seq<one<'\xF0'>, range<'\x90', '\xBF'>, rep<2, UTF8_tail>>,
36 seq<range<'\xF1', '\xF3'>, rep<3, UTF8_tail>>,
37 seq<one<'\xF4'>, range<'\x80', '\x8F'>, rep<2, UTF8_tail>>> {};
39 struct non_ascii : sor<UTF8_2, UTF8_3, UTF8_4> {};
41 } // namespace RFC3629
43 namespace Chars {
44 struct VUCHAR : sor<VCHAR, RFC3629::non_ascii> {};
46 // excluded from atext: "(),.@[]"
47 struct atext : sor<ALPHA, DIGIT,
48 one<'!', '#',
49 '$', '%',
50 '&', '\'',
51 '*', '+',
52 '-', '/',
53 '=', '?',
54 '^', '_',
55 '`', '{',
56 '|', '}',
57 '~'>,
58 RFC3629::non_ascii> {};
60 } // namespace Chars
62 namespace RFC5321 {
63 // <https://tools.ietf.org/html/rfc5321>
65 using dot = one<'.'>;
66 using colon = one<':'>;
68 struct u_let_dig : sor<ALPHA, DIGIT, RFC3629::non_ascii> {};
70 struct u_ldh_tail : star<sor<seq<plus<one<'-'>>, u_let_dig>, u_let_dig>> {};
72 struct u_label : seq<u_let_dig, u_ldh_tail> {};
74 struct let_dig : sor<ALPHA, DIGIT> {};
76 struct ldh_tail : star<sor<seq<plus<one<'-'>>, let_dig>, let_dig>> {};
78 struct ldh_str : seq<let_dig, ldh_tail> {};
80 struct label : ldh_str {};
82 struct sub_domain : sor<label, u_label> {};
84 struct domain : list<sub_domain, dot> {};
86 struct dec_octet : sor<seq<string<'2','5'>, range<'0','5'>>,
87 seq<one<'2'>, range<'0','4'>, DIGIT>,
88 seq<range<'0', '1'>, rep<2, DIGIT>>,
89 rep_min_max<1, 2, DIGIT>> {};
91 struct IPv4_address_literal : seq<dec_octet, dot, dec_octet, dot, dec_octet, dot, dec_octet> {};
93 struct h16 : rep_min_max<1, 4, HEXDIG> {};
95 struct ls32 : sor<seq<h16, colon, h16>, IPv4_address_literal> {};
97 struct dcolon : two<':'> {};
99 struct IPv6address : sor<seq< rep<6, h16, colon>, ls32>,
100 seq< dcolon, rep<5, h16, colon>, ls32>,
101 seq<opt<h16 >, dcolon, rep<4, h16, colon>, ls32>,
102 seq<opt<h16, opt< colon, h16>>, dcolon, rep<3, h16, colon>, ls32>,
103 seq<opt<h16, rep_opt<2, colon, h16>>, dcolon, rep<2, h16, colon>, ls32>,
104 seq<opt<h16, rep_opt<3, colon, h16>>, dcolon, h16, colon, ls32>,
105 seq<opt<h16, rep_opt<4, colon, h16>>, dcolon, ls32>,
106 seq<opt<h16, rep_opt<5, colon, h16>>, dcolon, h16>,
107 seq<opt<h16, rep_opt<6, colon, h16>>, dcolon >> {};
109 struct IPv6_address_literal : seq<TAO_PEGTL_ISTRING("IPv6:"), IPv6address> {};
111 struct dcontent : ranges<33, 90, 94, 126> {};
113 struct standardized_tag : ldh_str {};
115 struct general_address_literal : seq<standardized_tag, colon, plus<dcontent>> {};
117 // 4.1.3. Address Literals
118 struct address_literal : seq<one<'['>,
119 sor<IPv4_address_literal,
120 IPv6_address_literal,
121 general_address_literal>,
122 one<']'>> {};
125 struct qtextSMTP : sor<ranges<32, 33, 35, 91, 93, 126>, RFC3629::non_ascii> {};
126 struct graphic : range<32, 126> {};
127 struct quoted_pairSMTP : seq<one<'\\'>, graphic> {};
128 struct qcontentSMTP : sor<qtextSMTP, quoted_pairSMTP> {};
130 struct atom : plus<Chars::atext> {};
131 struct dot_string : list<atom, dot> {};
132 struct quoted_string : seq<one<'"'>, star<qcontentSMTP>, one<'"'>> {};
133 struct local_part : sor<dot_string, quoted_string> {};
134 struct non_local_part : sor<domain, address_literal> {};
135 struct mailbox : seq<local_part, one<'@'>, non_local_part> {};
136 struct mailbox_only : seq<mailbox, eof> {};
138 // clang-format on
139 // Actions
// Returns a std::string_view over the range [in.begin(), in.end()).
//
// Nothing is copied: the view aliases the memory in.begin() points to,
// so it is only usable for as long as that memory stays alive.
template <typename Input>
static std::string_view make_view(Input const& in)
{
  auto const first = in.begin();
  // std::distance() yields a signed value; convert it explicitly to the
  // unsigned size type the string_view constructor takes.
  auto const length = std::distance(first, in.end());
  return std::string_view(first,
                          static_cast<std::string_view::size_type>(length));
}
147 template <typename Rule>
148 struct action : nothing<Rule> {
151 template <>
152 struct action<dot_string> {
153 template <typename Input>
154 static void apply(Input const& in, Mailbox::parse_results& results)
156 results.local_type = Mailbox::local_types::dot_string;
160 template <>
161 struct action<quoted_string> {
162 template <typename Input>
163 static void apply(Input const& in, Mailbox::parse_results& results)
165 results.local_type = Mailbox::local_types::quoted_string;
169 template <>
170 struct action<domain> {
171 template <typename Input>
172 static void apply(Input const& in, Mailbox::parse_results& results)
174 results.domain_type = Mailbox::domain_types::domain;
178 template <>
179 struct action<IPv4_address_literal> {
180 template <typename Input>
181 static void apply(Input const& in, Mailbox::parse_results& results)
183 results.domain_type = Mailbox::domain_types::address_literal;
187 template <>
188 struct action<IPv6_address_literal> {
189 template <typename Input>
190 static void apply(Input const& in, Mailbox::parse_results& results)
192 results.domain_type = Mailbox::domain_types::address_literal;
196 template <>
197 struct action<standardized_tag> {
198 template <typename Input>
199 static void apply(Input const& in, Mailbox::parse_results& results)
201 results.standardized_tag = make_view(in);
205 template <>
206 struct action<general_address_literal> {
207 template <typename Input>
208 static void apply(Input const& in, Mailbox::parse_results& results)
210 results.domain_type = Mailbox::domain_types::general_address_literal;
214 template <>
215 struct action<local_part> {
216 template <typename Input>
217 static void apply(Input const& in, Mailbox::parse_results& results)
219 results.local = make_view(in);
223 template <>
224 struct action<non_local_part> {
225 template <typename Input>
226 static void apply(Input const& in, Mailbox::parse_results& results)
228 results.domain = make_view(in);
231 } // namespace RFC5321
233 std::optional<Mailbox::parse_results> Mailbox::parse(std::string_view mailbox)
235 if (mailbox.empty())
236 return {};
238 parse_results results;
239 memory_input<> mbx_in(mailbox, "mailbox");
240 if (tao::pegtl::parse<RFC5321::mailbox_only, RFC5321::action>(mbx_in,
241 results)) {
242 return results;
244 return {};
247 Mailbox::Mailbox(std::string_view mailbox)
249 if (mailbox.empty()) {
250 LOG(ERROR) << "empty mailbox string";
251 throw std::invalid_argument("empty mailbox string");
254 parse_results results;
255 memory_input<> mbx_in(mailbox, "mailbox");
256 if (!tao::pegtl::parse<RFC5321::mailbox_only, RFC5321::action>(mbx_in,
257 results)) {
258 LOG(ERROR) << "invalid mailbox syntax «" << mailbox << "»";
259 throw std::invalid_argument("invalid mailbox syntax");
262 if (results.domain_type == domain_types::general_address_literal) {
263 LOG(ERROR) << "general address literal in mailbox «" << mailbox << "»";
264 LOG(ERROR) << "unknown tag «" << results.standardized_tag << "»";
265 throw std::invalid_argument("general address literal in mailbox");
268 // "Impossible" errors; if the parse succeeded, the types must not
269 // be unknown.
270 CHECK(results.local_type != local_types::unknown);
271 CHECK(results.domain_type != domain_types::unknown);
273 // RFC-5321 4.5.3.1. Size Limits and Minimums
275 // “To the maximum extent possible, implementation techniques that
276 // impose no limits on the length of these objects should be used.”
278 // In practice, long local-parts are used and work fine. DNS imposes
279 // length limits, so we check those.
281 if (results.domain.length() > 255) { // Section 4.5.3.1.2.
282 // Also RFC 2181 section 11. Name syntax
283 LOG(ERROR) << "domain > 255 octets in «" << mailbox << "»";
284 throw std::invalid_argument("mailbox domain too long");
287 std::string dom{results.domain.begin(), results.domain.end()};
288 std::vector<boost::iterator_range<std::string::iterator>> labels;
289 boost::algorithm::split(labels, dom, boost::algorithm::is_any_of("."));
291 // Checks for DNS style domains, not address literals.
292 if (results.domain_type == domain_types::domain) {
293 if (labels.size() < 2) {
294 LOG(ERROR) << "domain must have at least two labels «" << mailbox << "»";
295 throw std::invalid_argument("mailbox domain not fully qualified");
298 if (labels[labels.size() - 1].size() < 2) {
299 LOG(ERROR) << "single octet TLD in «" << mailbox << "»";
300 throw std::invalid_argument("mailbox TLD must be two or more octets");
303 for (auto label : labels) {
304 if (label.size() > 63) {
305 LOG(ERROR) << "label > 63 octets in «" << mailbox << "»";
306 throw std::invalid_argument(
307 "mailbox domain label greater than 63 octets");
312 set_local(results.local);
313 set_domain(results.domain);
316 size_t Mailbox::length(domain_encoding enc) const
318 if (enc == domain_encoding::ascii) {
319 for (auto ch : local_part_) {
320 if (!isascii(static_cast<unsigned char>(ch))) {
321 LOG(WARNING) << "non ascii chars in local part:" << local_part_;
322 // throw std::range_error("non ascii chars in local part of mailbox");
326 auto const& d
327 = (enc == domain_encoding::utf8) ? domain().utf8() : domain().ascii();
328 return local_part_.length() + (d.length() ? (d.length() + 1) : 0);
331 std::string Mailbox::as_string(domain_encoding enc) const
333 if (enc == domain_encoding::ascii) {
334 for (auto ch : local_part_) {
335 if (!isascii(static_cast<unsigned char>(ch))) {
336 LOG(WARNING) << "non ascii chars in local part:" << local_part_;
337 // throw std::range_error("non ascii chars in local part of mailbox");
341 std::string s;
342 s.reserve(length(enc));
343 s = local_part();
344 auto const& d
345 = (enc == domain_encoding::utf8) ? domain().utf8() : domain().ascii();
346 if (!d.empty()) {
347 s += '@';
348 s += d;
350 return s;