with comments, less is more
[uri.git] / uri.cpp
bloba8b4100397145c7031f594eea56947b7926fefd2
#define BUILDING_DLL
#include "uri.hpp"

#include <cstdlib>
#include <iostream>

#include <glog/logging.h>

#include <fmt/format.h>
#include <fmt/ostream.h>

#include <idn2.h>
#include <uninorm.h>

#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/split.hpp>

#include <tao/pegtl.hpp>
#include <tao/pegtl/contrib/abnf.hpp>
// #include <tao/pegtl/contrib/tracer.hpp>
21 using namespace tao::pegtl;
22 using namespace tao::pegtl::abnf;
24 namespace uri {
25 class category_impl : public std::error_category {
26 public:
27 category_impl() = default;
28 virtual ~category_impl() {}
29 virtual char const* name() const noexcept;
30 virtual std::string message(int ev) const;
33 char const* category_impl::name() const noexcept
35 static const char name[] = "uri_error";
36 return name;
39 std::string category_impl::message(int ev) const
41 switch (static_cast<error>(ev)) {
42 case error::invalid_syntax:
43 return "unable to parse URI";
45 return "unknown URI error";
48 const std::error_category& category()
50 static category_impl category;
51 return category;
54 std::error_code make_error_code(error e)
56 return std::error_code(static_cast<int>(e), category());
59 syntax_error::syntax_error()
60 : std::system_error(make_error_code(error::invalid_syntax))
64 syntax_error::~syntax_error() noexcept {}
65 } // namespace uri
67 // clang-format off
68 namespace uri_internal {
70 // Rules are from <https://tools.ietf.org/html/rfc3986#appendix-A>
72 // The order is the rules is mostly reversed here, since we need to
73 // define them before use.
75 // UTF-8 is from RFC-3987
77 struct UTF8_tail : range<'\x80', '\xBF'> {};
79 struct UTF8_1 : range<'\x00', '\x7F'> {};
81 struct UTF8_2 : seq<range<'\xC2', '\xDF'>, UTF8_tail> {};
83 struct UTF8_3 : sor<seq<one<'\xE0'>, range<'\xA0', '\xBF'>, UTF8_tail>,
84 seq<range<'\xE1', '\xEC'>, rep<2, UTF8_tail>>,
85 seq<one<'\xED'>, range<'\x80', '\x9F'>, UTF8_tail>,
86 seq<range<'\xEE', '\xEF'>, rep<2, UTF8_tail>>> {};
88 struct UTF8_4 : sor<seq<one<'\xF0'>, range<'\x90', '\xBF'>, rep<2, UTF8_tail>>,
89 seq<range<'\xF1', '\xF3'>, rep<3, UTF8_tail>>,
90 seq<one<'\xF4'>, range<'\x80', '\x8F'>, rep<2, UTF8_tail>>> {};
92 struct UTF8_non_ascii : sor<UTF8_2, UTF8_3, UTF8_4> {};
94 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
95 // / "*" / "+" / "," / ";" / "="
96 struct sub_delims : one<'!', '$', '&', '\'', '(', ')',
97 '*', '+', ',', ';', '='> {};
99 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
100 struct gen_delims : one<':', '/', '?', '#', '[', ']', '@'> {};
102 // reserved = gen-delims / sub-delims
103 struct reserved : sor<gen_delims, sub_delims> {};
105 // Allowing UTF-8 in the unreserved rule isn't strictly RFC-3987 since we
106 // make no attempt to limit the code points to exaclude the private use
107 // areas. See <https://tools.ietf.org/html/rfc3987>
109 // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
110 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
111 struct unreserved : sor<ALPHA, DIGIT, one<'-', '.', '_', '~'>, UTF8_non_ascii> {};
113 // pct-encoded = "%" HEXDIG HEXDIG
114 struct pct_encoded : seq<one<'%'>, HEXDIG, HEXDIG> {};
116 // pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
117 struct pchar : sor<unreserved, pct_encoded, sub_delims, one<':', '@'>> {};
119 // fragment = *( pchar / "/" / "?" )
120 struct fragment : star<sor<pchar, one<'/', '?'>>> {};
122 // query = *( pchar / "/" / "?" )
123 struct query : star<sor<pchar, one<'/', '?'>>> {};
125 // segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
126 // ; non-zero-length segment without any colon ":"
127 struct segment_nz_nc : plus<sor<unreserved, pct_encoded, sub_delims, one<'@'>>> {};
129 // segment-nz = 1*pchar
130 struct segment_nz : plus<pchar> {};
132 // segment = *pchar
133 struct segment : star<pchar> {};
135 // Updated by Errata ID: 2033
136 // path-empty = ""
137 struct path_empty : success {};
139 // path-rootless = segment-nz *( "/" segment )
140 struct path_rootless : seq<segment_nz, star<seq<one<'/'>, segment>>> {};
142 // path-noscheme = segment-nz-nc *( "/" segment )
143 struct path_noscheme : seq<segment_nz_nc, star<seq<one<'/'>, segment>>> {};
145 // path-absolute = "/" [ segment-nz *( "/" segment ) ]
146 struct path_absolute : seq<one<'/'>, opt<seq<segment_nz, star<seq<one<'/'>, segment>>>>> {};
148 // path-abempty = *( "/" segment )
149 struct path_abempty : star<seq<one<'/'>, segment>> {};
151 // path = path-abempty ; begins with "/" or is empty
152 // / path-absolute ; begins with "/" but not "//"
153 // / path-noscheme ; begins with a non-colon segment
154 // / path-rootless ; begins with a segment
155 // / path-empty ; zero characters
156 // struct path : sor<path_abempty,
157 // path_absolute,
158 // path_noscheme,
159 // path_rootless,
160 // path_empty> {};
162 /////////////////////////////////////////////////////////////////////////////
164 // The definition of reg-name is where I stray from the (very loose)
165 // grammar of RFC-3986 and apply the stricter rules of RFC-1123 plus
166 // the UTF-8 of RFC-3987.
168 // We allow a very limited set of percent encoded characters in the
169 // reg_name part: just letter, digit, hyphen, and dot. If you want
170 // Unicode in your host part, use UTF-8 or punycode: you can't percent
171 // encode it.
173 struct pct_let_dig : seq<one<'%'>,
174 sor<// ALPHA x41 -> x5A
175 seq<one<'4'>, range<'1','9'>>,
176 seq<one<'4'>, range<'A','F'>>,
177 seq<one<'4'>, range<'a','f'>>,
178 seq<one<'5'>, range<'0','9'>>,
179 seq<one<'5'>, one<'A'>>,
180 seq<one<'5'>, one<'a'>>,
181 // DIGIT x30 -> x39
182 seq<one<'3'>, range<'0','9'>>
184 > {};
186 struct u_let_dig : sor<ALPHA, DIGIT, UTF8_non_ascii, pct_let_dig> {};
188 struct dash : sor<one<'-'>, TAOCPP_PEGTL_ISTRING("%2D")> {};
190 struct u_ldh_tail : star<sor<seq<plus<dash>, u_let_dig>, u_let_dig>> {};
192 struct u_label : seq<u_let_dig, u_ldh_tail> {};
194 struct dot : sor<one<'.'>, TAOCPP_PEGTL_ISTRING("%2E")> {};
196 // An Internet (RFC-1123) style hostname:
197 struct reg_name : list_tail<u_label, dot> {};
199 // All that is required for 3986 (as updated by Errata ID: 4942) is the following:
201 // reg-name = *( unreserved / pct-encoded / "-" / "." )
202 //struct reg_name : star<sor<unreserved, pct_encoded, one<'-'>, one<'.'>>> {};
204 /////////////////////////////////////////////////////////////////////////////
206 // dec-octet = DIGIT ; 0-9
207 // / %x31-39 DIGIT ; 10-99
208 // / "1" 2DIGIT ; 100-199
209 // / "2" %x30-34 DIGIT ; 200-249
210 // / "25" %x30-35 ; 250-255
211 struct dec_octet : sor<seq<string<'2','5'>, range<'0','5'>>,
212 seq<one<'2'>, range<'0','4'>, DIGIT>,
213 seq<one<'1'>, DIGIT, DIGIT>,
214 seq<range<'1','9'>, DIGIT>,
215 DIGIT> {};
217 // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
218 struct IPv4address : seq<dec_octet, one<'.'>, dec_octet, one<'.'>, dec_octet, one<'.'>, dec_octet> {};
219 struct IPv4address_eof : seq<IPv4address, eof> {};
221 // h16 = 1*4HEXDIG
222 // ; 16 bits of address represented in hexadecimal
223 struct h16 : rep_min_max<1, 4, HEXDIG> {};
225 // ls32 = ( h16 ":" h16 ) / IPv4address
226 // ; least-significant 32 bits of address
227 struct ls32 : sor<seq<h16, one<':'>, h16>, IPv4address> {};
229 // IPv6address = 6( h16 ":" ) ls32
230 // / "::" 5( h16 ":" ) ls32
231 // / [ h16 ] "::" 4( h16 ":" ) ls32
232 // / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
233 // / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
234 // / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
235 // / [ *4( h16 ":" ) h16 ] "::" ls32
236 // / [ *5( h16 ":" ) h16 ] "::" h16
237 // / [ *6( h16 ":" ) h16 ] "::"
239 struct IPv6address : sor<seq< rep<6, h16, one<':'>>, ls32>,
240 seq< two<':'>, rep<5, h16, one<':'>>, ls32>,
241 seq<opt<h16 >, two<':'>, rep<4, h16, one<':'>>, ls32>,
242 seq<opt<h16, opt< one<':'>, h16>>, two<':'>, rep<3, h16, one<':'>>, ls32>,
243 seq<opt<h16, rep_opt<2, one<':'>, h16>>, two<':'>, rep<2, h16, one<':'>>, ls32>,
244 seq<opt<h16, rep_opt<3, one<':'>, h16>>, two<':'>, h16, one<':'>, ls32>,
245 seq<opt<h16, rep_opt<4, one<':'>, h16>>, two<':'>, ls32>,
246 seq<opt<h16, rep_opt<5, one<':'>, h16>>, two<':'>, h16>,
247 seq<opt<h16, rep_opt<6, one<':'>, h16>>, two<':'>>> {};
249 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
250 struct IPvFuture : seq<one<'v'>, plus<HEXDIG>, one<'.'>, plus<sor<unreserved, sub_delims, one<':'>>>> {};
252 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
253 //struct IP_literal : seq<one<'['>, sor<IPv6address, IPvFuture>, one<']'>> {};
255 // RFC 6874 replaced the above rule with:
257 // ZoneID = 1*( unreserved / pct-encoded )
258 struct ZoneID : plus<sor<unreserved, pct_encoded>> {};
260 // IPv6addrz = IPv6address "%25" ZoneID
261 struct IPv6addrz : seq<IPv6address, one<'%'>, ZoneID> {};
263 // IP-literal = "[" ( IPv6address / IPv6addrz / IPvFuture ) "]"
264 struct IP_literal : seq<one<'['>, sor<IPv6addrz, IPv6address, IPvFuture>, one<']'>> {};
266 struct IP_literal_eof: seq<IP_literal, eof> {};
268 // port = *DIGIT
269 struct port : star<DIGIT> {};
271 // host = IP-literal / IPv4address / reg-name
272 struct host : sor<IP_literal, IPv4address, reg_name> {};
274 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
275 struct userinfo : star<sor<unreserved, pct_encoded, sub_delims, one<':'>>> {};
277 // Use userinfo_at rule to trigger setting userinfo field only after '@' char is found.
278 struct userinfo_at : seq<userinfo, one<'@'>> {};
280 // authority = [ userinfo "@" ] host [ ":" port ]
281 struct authority : seq<opt<userinfo_at>, host, opt<seq<one<':'>, port>>> {};
283 // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
284 struct scheme : seq<ALPHA, star<sor<ALPHA, DIGIT, one<'+', '-', '.'>>>> {};
286 // Use scheme_colon rule to trigger setting scheme field only after ':' char is found.
287 struct scheme_colon : seq<scheme, one<':'>> {};
289 // relative-part = "//" authority path-abempty
290 // / path-absolute
291 // / path-noscheme
292 // / path-abempty ; this was added in Errata ID: 5428
293 // / path-empty
294 struct relative_part : sor<seq<two<'/'>, authority, path_abempty>,
295 path_absolute,
296 path_noscheme,
297 path_abempty,
298 path_empty> {};
300 // relative-ref = relative-part [ "?" query ] [ "#" fragment ]
301 struct relative_ref : seq<relative_part, opt<seq<one<'?'>, query>>, opt<seq<one<'#'>, fragment>>> {};
302 struct relative_ref_eof : seq<relative_ref, eof> {};
304 // hier-part = "//" authority path-abempty
305 // / path-absolute
306 // / path-rootless
307 // / path-empty
308 struct hier_part : sor<seq<two<'/'>, authority, path_abempty>,
309 path_absolute,
310 path_rootless,
311 path_empty> {};
313 // absolute-URI = scheme ":" hier-part [ "?" query ]
314 struct absolute_URI : seq<scheme_colon, hier_part, opt<seq<one<'?'>, query>>> {};
315 struct absolute_URI_eof : seq<absolute_URI, eof> {};
317 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
318 struct URI : seq<scheme_colon, hier_part, opt<seq<one<'?'>, query>>, opt<seq<one<'#'>, fragment>>> {};
319 struct URI_eof : seq<URI, eof> {};
321 // URI-reference = URI / relative-ref
322 struct URI_reference : sor<URI, relative_ref> {};
323 struct URI_reference_eof : seq<URI_reference, eof> {};
325 struct path_segment : seq<opt<one<'/'>>, seq<star<not_at<one<'/'>>, not_at<eof>, any>>> {};
327 // clang-format on
329 template <typename Rule> struct action : nothing<Rule> {
332 template <> struct action<scheme_colon> {
333 template <typename Input>
334 static void apply(Input const& in, uri::components& parts)
336 auto sc = in.string();
337 CHECK((size(sc) >= 1) && (sc.back() == ':'));
338 sc.pop_back();
339 parts.scheme = sc;
343 template <> struct action<authority> {
344 template <typename Input>
345 static void apply(Input const& in, uri::components& parts)
347 parts.authority = in.string();
351 template <> struct action<path_abempty> {
352 template <typename Input>
353 static void apply(Input const& in, uri::components& parts)
355 parts.path = in.string();
359 template <> struct action<path_empty> {
360 template <typename Input>
361 static void apply(Input const& in, uri::components& parts)
363 parts.path = std::string{};
367 template <> struct action<path_absolute> {
368 template <typename Input>
369 static void apply(Input const& in, uri::components& parts)
371 parts.path = in.string();
375 template <> struct action<path_rootless> {
376 template <typename Input>
377 static void apply(Input const& in, uri::components& parts)
379 parts.path = in.string();
383 template <> struct action<path_noscheme> {
384 template <typename Input>
385 static void apply(Input const& in, uri::components& parts)
387 parts.path = in.string();
391 template <> struct action<query> {
392 template <typename Input>
393 static void apply(Input const& in, uri::components& parts)
395 parts.query = in.string();
399 template <> struct action<fragment> {
400 template <typename Input>
401 static void apply(Input const& in, uri::components& parts)
403 parts.fragment = in.string();
407 // The _at rule gives us userinfo + '@', so remove the at.
409 template <> struct action<userinfo_at> {
410 template <typename Input>
411 static void apply(Input const& in, uri::components& parts)
413 auto ui = in.string();
414 CHECK((size(ui) >= 1) && (ui.back() == '@'));
415 ui.pop_back();
416 parts.userinfo = ui;
420 template <> struct action<host> {
421 template <typename Input>
422 static void apply(Input const& in, uri::components& parts)
424 parts.host = in.string();
428 template <> struct action<port> {
429 template <typename Input>
430 static void apply(Input const& in, uri::components& parts)
432 parts.port = in.string();
436 template <> struct action<path_segment> {
437 template <typename Input>
438 static void apply(Input const& in, std::string& path_seg)
440 path_seg = in.string();
443 } // namespace uri_internal
445 namespace uri {
446 DLL_PUBLIC bool parse_generic(std::string_view uri, components& parts)
448 auto in{memory_input<>{uri.data(), uri.size(), "uri"}};
449 if (tao::pegtl::parse<uri_internal::URI_eof, uri_internal::action>(in,
450 parts)) {
451 return true;
453 return false;
456 DLL_PUBLIC bool parse_relative_ref(std::string_view uri, components& parts)
458 auto in{memory_input<>{uri.data(), uri.size(), "uri"}};
459 if (tao::pegtl::parse<uri_internal::relative_ref_eof, uri_internal::action>(
460 in, parts)) {
461 return true;
463 return false;
466 DLL_PUBLIC bool parse_reference(std::string_view uri, components& parts)
468 auto in{memory_input<>{uri.data(), uri.size(), "uri"}};
469 if (tao::pegtl::parse<uri_internal::URI_reference_eof, uri_internal::action>(
470 in, parts)) {
471 return true;
473 return false;
476 DLL_PUBLIC bool parse_absolute(std::string_view uri, components& parts)
478 auto in{memory_input<>{uri.data(), uri.size(), "uri"}};
479 if (tao::pegtl::parse<uri_internal::absolute_URI_eof, uri_internal::action>(
480 in, parts)) {
481 return true;
483 return false;
486 std::string to_string(uri const& uri_in) { return to_string(uri_in.parts()); }
488 std::string to_string(components const& uri)
490 std::ostringstream os;
491 os << uri;
492 return os.str();
495 namespace {
496 // clang-format off
// True for the RFC 3986 "unreserved" characters:
// ALPHA / DIGIT / "-" / "." / "_" / "~".
// The range tests assume an ASCII compatible execution character set.
bool constexpr isunreserved(unsigned char in)
{
  bool const digit = (in >= '0') && (in <= '9');
  bool const lower = (in >= 'a') && (in <= 'z');
  bool const upper = (in >= 'A') && (in <= 'Z');
  bool const mark  = (in == '-') || (in == '.') || (in == '_') || (in == '~');
  return digit || lower || upper || mark;
}
// True for the characters 0-9, a-f and A-F.
// The range tests assume an ASCII compatible execution character set.
bool constexpr ishexdigit(unsigned char in)
{
  if ((in >= '0') && (in <= '9'))
    return true;
  if ((in >= 'a') && (in <= 'f'))
    return true;
  return (in >= 'A') && (in <= 'F');
}
// Converts one hexadecimal digit to its value (0-15).
// The caller is expected to pass a hex digit: every character that is
// not 0-9 or a-f is treated as if it were an upper case digit.
unsigned char constexpr hexdigit2bin(unsigned char in)
{
  if ((in >= '0') && (in <= '9'))
    return (in - '0');
  if ((in >= 'a') && (in <= 'f'))
    return 10 + (in - 'a');
  return 10 + (in - 'A');
}
552 // clang-format on
554 std::string normalize_pct_encoded(std::string_view string)
556 fmt::memory_buffer out;
558 for (auto s = begin(string); s < end(string); ++s) {
559 auto ch = *s;
560 if (ch == '%') {
561 if ((s + 3 <= end(string)) && ishexdigit(s[1]) && ishexdigit(s[2])) {
562 auto pct_ch = 0x10 * hexdigit2bin(s[1]) + hexdigit2bin(s[2]);
563 if (isunreserved(pct_ch)) {
564 fmt::format_to(out, "{}", char(pct_ch));
566 else {
567 fmt::format_to(out, "%{:02X}", pct_ch);
569 s += 2;
570 continue;
573 fmt::format_to(out, "{}", ch);
576 return fmt::to_string(out);
// True when `str` begins with `prefix`; an empty prefix always matches.
bool starts_with(std::string_view str, std::string_view prefix)
{
  return (prefix.size() <= str.size())
         && (str.substr(0, prefix.size()) == prefix);
}
// True when `str` ends with `suffix`; an empty suffix always matches.
bool ends_with(std::string_view str, std::string_view suffix)
{
  if (suffix.size() > str.size())
    return false;
  return str.substr(str.size() - suffix.size()) == suffix;
}
// Returns `path` up to and including its right-most "/", or an empty
// string when `path` contains no "/" at all.  In the words of RFC 3986:
//
//   ...excluding any characters after the right-most "/" in the base
//   URI path, or excluding the entire base URI path if it does not
//   contain any "/" characters.
std::string all_but_the_last(std::string_view path)
{
  auto const slash = path.rfind('/');
  return (slash == std::string_view::npos)
             ? std::string{}
             : std::string(path.substr(0, slash + 1));
}
607 // <https://tools.ietf.org/html/rfc3986#section-5.2.3>
609 // 5.2.3. Merge Paths
611 std::string merge(components const& base_parts, components const& ref_parts)
614 // Updated by Errata ID: 4789
616 // o If the base URI has a defined authority component and an empty
617 // path, or if the base URI's path is ending with "/..", then return
618 // a string consisting of base's path concatenated with "/" and then
619 // concatenated with the reference's path; otherwise,
621 if ((base_parts.authority && base_parts.path->empty())
622 || ends_with(*base_parts.path, "/..")) {
623 return "/" + *ref_parts.path;
626 // o return a string consisting of the reference's path component
627 // appended to all but the last segment of the base URI's path…
629 return all_but_the_last(*base_parts.path) + *ref_parts.path;
// <https://tools.ietf.org/html/rfc3986#section-5.2.4>

// 5.2.4.  Remove Dot Segments
//
// Returns `input` with its "." and ".." segments interpreted and
// removed.  The letters in the comments below refer to the steps of
// the algorithm in the RFC.
std::string remove_dot_segments(std::string input)
{
  std::string output;
  output.reserve(input.length());

  // Removes the last segment and its preceding "/" (if any) from the
  // output buffer.  A buffer holding a single segment without any "/"
  // is emptied.
  auto const drop_last_segment = [&output] {
    auto const slash = output.rfind('/');
    if (slash == std::string::npos)
      output.clear();
    else
      output.erase(slash);
  };

  // The part of the input buffer not yet consumed.  A view is used so
  // that consuming a prefix does not shift the whole string.
  std::string_view rest{input};

  auto const has_prefix = [&rest](std::string_view prefix) {
    return rest.substr(0, prefix.size()) == prefix;
  };

  while (!rest.empty()) {
    // A. Drop a leading "../" or "./".
    if (has_prefix("../")) {
      rest.remove_prefix(3);
      continue;
    }
    if (has_prefix("./")) {
      rest.remove_prefix(2);
      continue;
    }

    // B. Replace a leading "/./" or a complete "/." with "/".
    if (has_prefix("/./")) {
      rest.remove_prefix(2);
      continue;
    }
    if (rest == "/.") {
      rest.remove_suffix(1);
      continue;
    }

    // C. Replace a leading "/../" or a complete "/.." with "/" and
    //    remove the last segment from the output.
    if (has_prefix("/../")) {
      rest.remove_prefix(3);
      drop_last_segment();
      continue;
    }
    if (rest == "/..") {
      rest.remove_suffix(2);
      drop_last_segment();
      continue;
    }

    // D. An input of only "." or ".." is dropped.
    if ((rest == ".") || (rest == "..")) {
      break;
    }

    // E. Move the first path segment, including a leading "/" (if any)
    //    and everything up to but not including the next "/", to the
    //    output.  Searching from index 1 skips that leading "/".
    auto const segment = rest.substr(0, rest.find('/', 1));
    output.append(segment.data(), segment.size());
    rest.remove_prefix(segment.size());
  }

  return output;
}
// Longest host name, in bytes, that nfkc() will accept.
constexpr size_t max_length{255};
// Returns `a` without its final character when that character is a
// '.'; at most one dot is removed.
std::string_view remove_trailing_dot(std::string_view a)
{
  bool const dotted = !a.empty() && (a.back() == '.');
  return dotted ? a.substr(0, a.size() - 1) : a;
}
722 // Normalization Form KC (NFKC) Compatibility Decomposition, followed
723 // by Canonical Composition, see <http://unicode.org/reports/tr15/>
725 std::string nfkc(std::string_view str)
727 size_t length = max_length;
728 char bfr[max_length];
729 if (str.length() > max_length) {
730 throw std::runtime_error("hostname too long");
732 auto udata = reinterpret_cast<uint8_t const*>(str.data());
733 auto ubfr = reinterpret_cast<uint8_t*>(bfr);
734 if (u8_normalize(UNINORM_NFKC, udata, str.size(), ubfr, &length) == nullptr) {
735 throw std::runtime_error("u8_normalize failure");
737 return std::string{bfr, length};
740 bool is_IPv4address(std::string_view x)
742 auto in{memory_input<>{x.data(), x.size(), "maybe-IPv4address"}};
743 if (tao::pegtl::parse<uri_internal::IPv4address_eof, uri_internal::action>(
744 in)) {
745 return true;
747 return false;
750 bool is_IP_literal(std::string_view x)
752 auto in{memory_input<>{x.data(), x.size(), "maybe-IP_literal"}};
753 if (tao::pegtl::parse<uri_internal::IP_literal_eof, uri_internal::action>(
754 in)) {
755 return true;
757 return false;
760 std::string normalize_host(std::string_view host)
762 host = remove_trailing_dot(host);
764 auto norm_host = normalize_pct_encoded(host);
766 norm_host = nfkc(norm_host);
768 char* ptr = nullptr;
769 auto code = idn2_to_ascii_8z(norm_host.data(), &ptr, IDN2_TRANSITIONAL);
770 if (code != IDN2_OK) {
771 throw std::runtime_error(idn2_strerror(code));
773 norm_host = ptr;
774 idn2_free(ptr);
776 // At this point, we have a (normalized) ascii norm_host. Continue
777 // on to get the UTF-8 version.
779 //#ifdef PREFER_UNICODE_HOSTNAME
780 ptr = nullptr;
781 code = idn2_to_unicode_8z8z(norm_host.c_str(), &ptr, IDN2_TRANSITIONAL);
782 if (code != IDN2_OK) {
783 throw std::runtime_error(idn2_strerror(code));
785 norm_host = ptr;
786 idn2_free(ptr);
787 //#endif
789 return norm_host;
791 } // namespace
793 DLL_PUBLIC std::string normalize(components uri)
795 // Normalize the scheme.
796 if (uri.scheme) {
797 std::transform(begin(*uri.scheme), end(*uri.scheme), begin(*uri.scheme),
798 [](unsigned char c) { return std::tolower(c); });
801 // Normalize the host name.
802 if (uri.host) {
803 if (!(is_IPv4address(*uri.host) || is_IP_literal(*uri.host))) {
804 uri.host = normalize_host(*uri.host);
808 // we'll want to remove default port numbers
810 // Rebuild authority from user@host:port triple.
811 std::stringstream authstream;
812 if (uri.userinfo)
813 authstream << *uri.userinfo << '@';
815 if (uri.host)
816 authstream << *uri.host;
818 if (uri.port)
819 authstream << ':' << *uri.port;
821 if (uri.userinfo || uri.host || uri.port) {
822 uri.authority = authstream.str();
825 // Normalize the path.
826 if (uri.path) {
827 uri.path = remove_dot_segments(normalize_pct_encoded(*uri.path));
830 return to_string(uri);
833 DLL_PUBLIC uri resolve_ref(absolute const& base, reference const& ref)
835 // 5.2. Relative Resolution
837 if (ref.empty()) {
838 return base;
841 components const& base_parts = base.parts();
842 components const& ref_parts = ref.parts();
844 components target_parts;
846 // if defined(R.scheme) then
848 if (ref_parts.scheme) {
850 // T.scheme = R.scheme;
851 target_parts.scheme = *ref_parts.scheme;
853 // T.authority = R.authority;
854 if (ref_parts.authority) {
855 target_parts.authority = *ref_parts.authority;
858 if (ref_parts.path) {
859 target_parts.path = remove_dot_segments(*ref_parts.path);
862 if (ref_parts.query) {
863 target_parts.query = *ref_parts.query;
866 else {
867 if (ref_parts.authority) {
868 target_parts.authority = *ref_parts.authority;
869 if (ref_parts.path) {
870 target_parts.path = remove_dot_segments(*ref_parts.path);
872 target_parts.query = ref_parts.query;
874 else {
876 if (ref_parts.path == "") {
877 target_parts.path = base_parts.path;
878 if (ref_parts.query) {
879 target_parts.query = ref_parts.query;
881 else {
882 target_parts.query = base_parts.query;
885 else {
886 if (starts_with(*ref_parts.path, "/")) {
887 if (ref_parts.path) {
888 target_parts.path = remove_dot_segments(*ref_parts.path);
891 else {
892 // T.path = merge(Base.path, R.path);
893 // T.path = remove_dot_segments(T.path);
894 target_parts.path = remove_dot_segments(merge(base_parts, ref_parts));
897 // T.query = R.query;
898 target_parts.query = ref_parts.query;
901 // T.authority = Base.authority;
902 target_parts.authority = base_parts.authority;
905 // T.scheme = Base.scheme;
906 target_parts.scheme = base_parts.scheme;
909 // T.fragment = R.fragment;
910 if (ref_parts.fragment) {
911 target_parts.fragment = *ref_parts.fragment;
914 return generic(target_parts);
917 } // namespace uri
919 // <https://tools.ietf.org/html/rfc3986#section-5.3>
921 // 5.3. Component Recomposition
923 DLL_PUBLIC std::ostream& operator<<(std::ostream& os,
924 uri::components const& uri)
926 if (uri.scheme) {
927 os << *uri.scheme << ':';
930 // The individual parts take precedence over the single authority.
932 if (uri.userinfo || uri.host || uri.port) {
933 os << "//";
935 if (uri.userinfo)
936 os << *uri.userinfo << '@';
938 // Host is never undefined, but perhaps zero length.
939 if (uri.host)
940 os << *uri.host;
942 if (uri.port)
943 os << ':' << *uri.userinfo;
945 else if (uri.authority) {
946 os << "//" << *uri.authority;
949 if (uri.path) {
950 os << *uri.path;
953 if (uri.query) {
954 os << '?' << *uri.query;
957 if (uri.fragment) {
958 os << '#' << *uri.fragment;
961 return os;
964 DLL_PUBLIC std::ostream& operator<<(std::ostream& os, uri::uri const& uri_in)
966 return os << uri_in.parts();