LoongArch: Add support to annotate tablejump
[official-gcc.git] / libphobos / src / std / uri.d
blobbf7cbc06438a3ac9d99c77b604ae4f85a7db6fb6
1 // Written in the D programming language.
/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Copyright: Copyright The D Language Foundation 2000 - 2009.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/uri.d)
 */
19 /* Copyright The D Language Foundation 2000 - 2009.
20 * Distributed under the Boost Software License, Version 1.0.
21 * (See accompanying file LICENSE_1_0.txt or copy at
22 * http://www.boost.org/LICENSE_1_0.txt)
24 module std.uri;
26 //debug=uri; // uncomment to turn on debugging writefln's
27 debug(uri) import std.stdio;
28 import std.traits : isSomeChar;
/**
 * Exception type raised by this module when a URI cannot be encoded or
 * decoded.
 */
class URIException : Exception
{
    import std.exception : basicExceptionCtors;

    // Pull in the constructors provided by std.exception.basicExceptionCtors
    // instead of spelling them out by hand.
    mixin basicExceptionCtors;
}
///
@safe unittest
{
    import std.exception : assertThrown;

    // 0xAB is a lone UTF-8 continuation byte, so decoding must fail.
    assertThrown!URIException(decode("%ab"));
}
// Character-class bits stored in `uri_flags`; they are OR-ed together to
// describe which characters a given operation leaves untouched.
private enum
{
    URI_Alpha    = 1 << 0, // ASCII letters
    URI_Reserved = 1 << 1, // ;/?:@&=+$,
    URI_Mark     = 1 << 2, // -_.!~*'()
    URI_Digit    = 1 << 3, // 0-9
    URI_Hash     = 1 << 4, // '#'
}
// Upper-case hexadecimal digits, indexed by nibble value.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Character-class bits (URI_Alpha, URI_Digit, ...) for every ASCII
// character, indexed by character code. Built once at compile time.
private immutable ubyte[128] uri_flags =
(() {
    ubyte[128] table;

    table['#'] |= URI_Hash;

    // Letters: fill the upper- and lower-case entries side by side.
    foreach (offset; 0 .. 26)
    {
        table['A' + offset] |= URI_Alpha;
        table['a' + offset] |= URI_Alpha;
    }

    foreach (offset; 0 .. 10)
        table['0' + offset] |= URI_Digit;

    foreach (char c; ";/?:@&=+$,")
        table[c] |= URI_Reserved;

    foreach (char c; "-_.!~*'()")
        table[c] |= URI_Mark;

    return table;
})();
/*
 * Percent-encodes `str`.
 *
 * Params:
 *  str          = the text to encode, as UTF-32 code points
 *  unescapedSet = OR-ed URI_* class bits; ASCII characters belonging to one
 *                 of these classes are copied through unchanged
 *
 * Returns:
 *  A newly allocated string in which every other code point is replaced by
 *  the `%XX` escapes of its UTF-8 octets.
 *
 * Throws:
 *  URIException if `str` contains a value above U+10FFFF, which is not a
 *  Unicode code point and would produce escapes that `URI_Decode` rejects.
 */
private string URI_Encode(dstring str, uint unescapedSet) @safe pure
{
    // Short results are assembled on the stack; the storage only moves to
    // the GC heap once this buffer overflows.
    char[50] buffer = void;
    char[] result = buffer[];
    size_t used = 0;

    foreach (dchar c; str)
    {
        immutable bool verbatim =
            c < uri_flags.length && (uri_flags[c] & unescapedSet) != 0;

        char[4] octets;
        size_t count = 0;

        if (!verbatim)
        {
            // Transform the code point into its UTF-8 octets.
            if (c <= 0x7F)
            {
                octets[0] = cast(char) c;
                count = 1;
            }
            else if (c <= 0x7FF)
            {
                octets[0] = cast(char)(0xC0 | (c >> 6));
                octets[1] = cast(char)(0x80 | (c & 0x3F));
                count = 2;
            }
            else if (c <= 0xFFFF)
            {
                octets[0] = cast(char)(0xE0 | (c >> 12));
                octets[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                octets[2] = cast(char)(0x80 | (c & 0x3F));
                count = 3;
            }
            else if (c <= 0x10FFFF)
            {
                octets[0] = cast(char)(0xF0 | (c >> 18));
                octets[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
                octets[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                octets[3] = cast(char)(0x80 | (c & 0x3F));
                count = 4;
            }
            else
            {
                throw new URIException("Undefined UTF-32 code point");
            }
        }

        // One char for a verbatim copy, three ("%XX") per escaped octet.
        immutable size_t needed = verbatim ? 1 : 3 * count;
        if (used + needed > result.length)
        {
            auto grown = new char[2 * (used + needed)];
            grown[0 .. used] = result[0 .. used];
            result = grown;
        }

        if (verbatim)
        {
            result[used++] = cast(char) c;
        }
        else
        {
            foreach (octet; octets[0 .. count])
            {
                result[used]     = '%';
                result[used + 1] = hex2ascii[octet >> 4];
                result[used + 2] = hex2ascii[octet & 15];
                used += 3;
            }
        }
    }

    return result[0 .. used].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Encode("", 0) == "");
    assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");
    dstring a;
    a ~= cast(dchar) 0xFFFFFFFF;
    assertThrown(URI_Encode(a, 0));

    // Long enough to force the stack buffer to spill onto the heap.
    dstring sixty;
    foreach (_; 0 .. 60)
        sixty ~= cast(dchar) 'a';
    assert(URI_Encode(sixty, 0).length == 3 * 60);

    // U+10FFFF is the last code point; anything above it is rejected.
    dstring last;
    last ~= cast(dchar) 0x10FFFF;
    assert(URI_Encode(last, 0) == "%F4%8F%BF%BF");
    dstring beyond;
    beyond ~= cast(dchar) 0x110000;
    assertThrown!URIException(URI_Encode(beyond, 0));
}
// Converts one hexadecimal digit character to its numeric value.
// The caller is expected to have checked the character with isHexDigit;
// no validation is done here.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
/*
 * Decodes the `%XX` escape sequences in `uri`.
 *
 * Params:
 *  uri         = the encoded text
 *  reservedSet = OR-ed URI_* class bits; an escape sequence that decodes to
 *                an ASCII character in one of these classes is copied to the
 *                output in its original escaped form
 *
 * Returns:
 *  The decoded text as UTF-32.
 *
 * Throws:
 *  URIException if an escape sequence is truncated, is not hexadecimal, or
 *  does not form a valid UTF-8 encoding of a Unicode code point. Overlong
 *  forms and surrogate code points count as invalid.
 */
private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // Every element written below consumes at least one input element, so
    // `len` output slots are always sufficient.
    auto R = new dchar[len];
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = uri[k];
        if (C != '%')
        {
            R[Rlen++] = C;
            continue;
        }

        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            // n = number of leading 1 bits in B = length of the sequence
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 octets need three characters ("%XX") each.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            foreach (_; 1 .. n)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");

            // Reject overlong forms: each sequence length has a smallest
            // value that actually requires that many octets.
            immutable uint smallest = n == 2 ? 0x80 : n == 3 ? 0x800 : 0x1_0000;
            if (V < smallest)
                throw new URIException("Overlong UTF-8 encoding");

            // Surrogates are not code points that UTF-8 may carry.
            if (V >= 0xD800 && V <= 0xDFFF)
                throw new URIException("Surrogate code point in UTF-8 sequence");

            C = cast(dchar) V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet) != 0)
        {
            // Keep the escape sequence exactly as it was written.
            foreach (i; start .. k + 1)
                R[Rlen++] = uri[i];
        }
        else
        {
            R[Rlen++] = C;
        }
    }
    assert(Rlen <= len); // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Decode("", 0) == "");
    assertThrown!URIException(URI_Decode("%", 0));
    assertThrown!URIException(URI_Decode("%xx", 0));
    assertThrown!URIException(URI_Decode("%FF", 0));
    assertThrown!URIException(URI_Decode("%C0", 0));
    assertThrown!URIException(URI_Decode("%C0000000", 0));
    assertThrown!URIException(URI_Decode("%C0%xx0000", 0));
    assertThrown!URIException(URI_Decode("%C0%C00000", 0));
    assertThrown!URIException(URI_Decode("%F7%BF%BF%BF", 0));
    assert(URI_Decode("%23", URI_Hash) == "%23");

    // Overlong encodings are rejected...
    assertThrown!URIException(URI_Decode("%C0%80", 0));
    assertThrown!URIException(URI_Decode("%C1%BF", 0));
    assertThrown!URIException(URI_Decode("%E0%80%AF", 0));
    assertThrown!URIException(URI_Decode("%F0%80%80%80", 0));
    // ...as are surrogates...
    assertThrown!URIException(URI_Decode("%ED%A0%80", 0));
    // ...while the boundary values are still accepted.
    assert(URI_Decode("%C2%80", 0) == "\u0080"d);
    assert(URI_Decode("%F4%8F%BF%BF", 0) == "\U0010FFFF"d);
}
/*************************************
 * Decodes the URI string encodedURI and returns the result as a UTF-8
 * string.
 * Escape sequences that stand for reserved URI characters or for the '#'
 * character are kept in their escaped form.
 */
string decode(Char)(scope const(Char)[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURI, URI_Reserved | URI_Hash))
        encode(utf8, codePoint);
    return utf8;
}

///
@safe unittest
{
    assert(decode("foo%20bar") == "foo bar");
    assert(decode("%3C%3E.@.%E2%84%A2") == "<>.@.™");
    assert(decode("foo&/") == "foo&/");
    assert(decode("!@#$&*(") == "!@#$&*(");
}
/*******************************
 * Decodes the URI string encodedURIComponent and returns the result as a
 * UTF-8 string. Every escape sequence is decoded, including those that
 * stand for reserved characters.
 */
string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURIComponent, 0))
        encode(utf8, codePoint);
    return utf8;
}

///
@safe unittest
{
    assert(decodeComponent("foo%2F%26") == "foo/&");
    assert(decodeComponent("dl%C3%A4ng%20r%C3%B6cks") == "dläng röcks");
    assert(decodeComponent("!%40%23%24%25%5E%26*(") == "!@#$%^&*(");
}
/*****************************
 * Encodes the string uri into a URI and returns that URI. Characters that
 * are not valid URI characters are escaped; the '#' character is left as
 * it is.
 */
string encode(Char)(scope const(Char)[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Everything a complete URI may legitimately contain passes through.
    enum uint passThrough =
        URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uri), passThrough);
}

///
@safe unittest
{
    assert(encode("foo bar") == "foo%20bar");
    assert(encode("<>.@.™") == "%3C%3E.@.%E2%84%A2");
    assert(encode("foo/#?a=1&b=2") == "foo/#?a=1&b=2");
    assert(encode("dlang+rocks!") == "dlang+rocks!");
    assert(encode("!@#$%^&*(") == "!@#$%25%5E&*(");
}
/********************************
 * Encodes the string uriComponent into a URI component and returns it.
 * Every character other than a letter, a digit, or one of -_.!~*'() is
 * escaped.
 */
string encodeComponent(Char)(scope const(Char)[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Reserved characters and '#' are deliberately absent from this set.
    enum uint passThrough = URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uriComponent), passThrough);
}

///
@safe unittest
{
    assert(encodeComponent("!@#$%^&*(") == "!%40%23%24%25%5E%26*(");
    assert(encodeComponent("<>.@.™") == "%3C%3E.%40.%E2%84%A2");
    assert(encodeComponent("foo/&") == "foo%2F%26");
    assert(encodeComponent("dläng röcks") == "dl%C3%A4ng%20r%C3%B6cks");
    assert(encodeComponent("dlang+rocks!") == "dlang%2Brocks!");
}
/* Encode an associative array using www-form-urlencoding.
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string of `key=value` pairs joined by '&', with every key and value
 *      passed through encodeComponent. Empty input gives an empty string.
 */
package string urlEncode(scope string[string] values) @safe pure
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool needSeparator = false;
    foreach (key, value; values)
    {
        if (needSeparator)
            enc.put('&');
        needSeparator = true;

        enc.put(encodeComponent(key));
        enc.put('=');
        enc.put(encodeComponent(value));
    }
    return enc.data;
}

@safe pure unittest
{
    string[string] empty;
    assert(urlEncode(empty) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");

    // Associative arrays have no defined iteration order, so accept both.
    immutable both = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(both == "name1=value1&name2=value2" || both == "name2=value2&name1=value1");
}
/***************************
 * Does string s[] start with a URL?
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */
ptrdiff_t uriLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    /* Must start with one of (compared case-insensitively):
     *  http://
     *  https://
     */
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    if (s.length <= 4)
        return -1;

    ptrdiff_t i;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        i = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        i = 8;
    else
        return -1;

    // Consume URL characters, remembering where the most recent '.' was.
    ptrdiff_t lastdot;
    scan: for (; i < s.length; i++)
    {
        immutable c = s[i];
        if (isAlphaNum(c))
            continue;

        switch (c)
        {
            case '.':
                lastdot = i;
                break;

            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;

            default:
                break scan;
        }
    }

    // Without a single dot this is not treated as a URL.
    return lastdot ? i : -1;
}

///
@safe pure unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}

@safe pure nothrow @nogc unittest
{
    assert(uriLength("") == -1);
    assert(uriLength("https://www") == -1);
}
/***************************
 * Does string s[] start with an email address?
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    // The address has to open with a letter.
    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    // Part before the '@': letters, digits, '-', '_' and '.'.
    ptrdiff_t i = 1;
    for (;; i++)
    {
        if (i == s.length)
            return -1;          // ran out of input before finding '@'

        immutable c = s[i];
        if (c == '@')
            break;
        if (!isAlphaNum(c) && c != '-' && c != '_' && c != '.')
            return -1;
    }
    i++;                        // step over the '@'

    // Part after the '@': stops at the first character that cannot belong
    // to it, remembering the position of the last '.'.
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        immutable c = s[i];
        if (c == '.')
            lastdot = i;
        else if (!isAlphaNum(c) && c != '-' && c != '_')
            break;
    }

    if (!lastdot)
        return -1;

    // Only a final label of two or three characters is accepted.
    immutable tail = i - lastdot;
    return (tail == 3 || tail == 4) ? i : -1;
}

///
@safe pure unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
}
@safe pure unittest
{
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    // Plain round trip through encode and decode.
    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);

    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    // Multi-byte sequences survive decode followed by encode.
    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    // A very large input that needs no escaping comes back unchanged.
    auto big = new char[10_000_000];
    big[] = 'A';
    result = encodeComponent(big);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    // The API accepts every character width, mutable or immutable, and
    // leaves its argument untouched.
    import std.meta : AliasSeq;
    static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {{
        import std.conv : to;

        StringType plain = source.to!StringType;
        string escapedFromPlain = encode(plain);
        assert(plain == source.to!StringType); // `plain` wasn't changed
        assert(escapedFromPlain == target);
        assert(plain == decode(escapedFromPlain).to!StringType);

        StringType escaped = target.to!StringType;
        string plainFromEscaped = decode(escaped);
        assert(escaped == target.to!StringType); // `escaped` wasn't changed
        assert(plainFromEscaped == source);
        assert(escaped == encode(plainFromEscaped).to!StringType);
    }}
}
@safe pure nothrow @nogc unittest
{
    // None of these may be recognised as an email address.
    static immutable string[] rejected = [
        "",
        "@",
        "abcd",
        "blah@blub",
        "blah@blub.",
        "blah@blub.domain",
    ];

    foreach (candidate; rejected)
        assert(emailLength(candidate) == -1);
}