For obj-c stage-final re-use the checksum from the previous stage
[official-gcc.git] / libphobos / src / std / uri.d
blobfcc902c8236ea24510e713f6ea3188ef3b034dcb
1 // Written in the D programming language.
3 /**
4 * Encode and decode Uniform Resource Identifiers (URIs).
5 * URIs are used in internet transfer protocols.
6 * Valid URI characters consist of letters, digits,
7 * and the characters $(B ;/?:@&=+$,-_.!~*'())
8 * Reserved URI characters are $(B ;/?:@&=+$,)
9 * Escape sequences consist of $(B %) followed by two hex digits.
11 * See_Also:
12 * $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13 * $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14 * Copyright: Copyright Digital Mars 2000 - 2009.
15 * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16 * Authors: $(HTTP digitalmars.com, Walter Bright)
17 * Source: $(PHOBOSSRC std/_uri.d)
19 /* Copyright Digital Mars 2000 - 2009.
20 * Distributed under the Boost Software License, Version 1.0.
21 * (See accompanying file LICENSE_1_0.txt or copy at
22 * http://www.boost.org/LICENSE_1_0.txt)
24 module std.uri;
26 //debug=uri; // uncomment to turn on debugging writefln's
27 debug(uri) import std.stdio;
28 import std.traits : isSomeChar;
/** This Exception is thrown if something goes wrong when encoding or
decoding a URI.
*/
class URIException : Exception
{
    /// Creates the exception from a message and an optional chained `next`.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Same as above, with the chained exception given before the location.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }
}
// Character-class bits stored in `uri_flags`; a set of classes is the
// bitwise OR of these values.
private enum
{
    URI_Alpha    = 1 << 0,  // letters
    URI_Reserved = 1 << 1,  // ;/?:@&=+$,
    URI_Mark     = 1 << 2,  // -_.!~*'()
    URI_Digit    = 1 << 3,  // decimal digits
    URI_Hash     = 1 << 4,  // '#'
}
// Upper-case hexadecimal digit for each nibble value 0 .. 15.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// URI_* class bits for every 7-bit character, indexed by character.
// The table is computed at compile time by the function literal below.
private immutable ubyte[128] uri_flags = (() {
    ubyte[128] table;

    // Letters: upper and lower case are marked in the same pass.
    foreach (offset; 0 .. 26)
    {
        table['A' + offset] |= URI_Alpha;
        table['a' + offset] |= URI_Alpha;
    }

    foreach (immutable char digit; "0123456789")
        table[digit] |= URI_Digit;

    foreach (immutable char reserved; ";/?:@&=+$,")
        table[reserved] |= URI_Reserved;

    foreach (immutable char mark; "-_.!~*'()")
        table[mark] |= URI_Mark;

    table['#'] |= URI_Hash;

    return table;
})();
/* Percent-encode a UTF-32 string.
 *
 * Params:
 *  str          = the text to encode, one code point per element
 *  unescapedSet = bitwise OR of URI_* flags; a character whose `uri_flags`
 *                 entry shares a bit with this set is copied through
 *                 unchanged
 *
 * Returns:
 *  A newly allocated string in which every other character is replaced by
 *  the `%XX` escapes (upper-case hex) of its UTF-8 octets.
 *
 * Throws:
 *  URIException if a code point is larger than 0x1FFFFF.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    import std.array : appender;

    // The result is accumulated in a growable, bounds-checked buffer rather
    // than through alloca and raw pointers.
    auto result = appender!string();

    // Every input character produces at least one output character.
    result.reserve(str.length);

    foreach (immutable dchar c; str)
    {
        // Characters above 0x7F are never in the table, so they are always escaped.
        if (c < uri_flags.length && (uri_flags[c] & unescapedSet))
        {
            result.put(cast(char) c);
            continue;
        }

        // Transform the code point into its UTF-8 octets.
        char[4] octets = void;
        size_t count;

        if (c <= 0x7F)
        {
            octets[0] = cast(char) c;
            count = 1;
        }
        else if (c <= 0x7FF)
        {
            octets[0] = cast(char)(0xC0 | (c >> 6));
            octets[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            octets[0] = cast(char)(0xE0 | (c >> 12));
            octets[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x1FFFFF)
        {
            octets[0] = cast(char)(0xF0 | (c >> 18));
            octets[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            octets[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            throw new URIException("Undefined UTF-32 code point");
        }

        // Emit each octet as '%' followed by two upper-case hex digits.
        foreach (immutable char octet; octets[0 .. count])
        {
            result.put('%');
            result.put(hex2ascii[octet >> 4]);
            result.put(hex2ascii[octet & 15]);
        }
    }

    return result.data;
}
// Numeric value of a hexadecimal digit character.
// The caller must pass a valid hex digit ('0'-'9', 'A'-'F' or 'a'-'f');
// no validation is done here.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
/* Decode the `%XX` escape sequences of a URI into UTF-32.
 *
 * Params:
 *  uri         = the encoded text
 *  reservedSet = bitwise OR of URI_* flags; an escape sequence that decodes
 *                to a character whose `uri_flags` entry shares a bit with
 *                this set is copied to the result verbatim (still escaped)
 *
 * Returns:
 *  The decoded text, one code point per element.
 *
 * Throws:
 *  URIException on a truncated or malformed escape sequence, on an invalid
 *  UTF-8 lead or continuation octet, or on a decoded value above 0x10FFFF.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // Decoding never produces more elements than the input has code units,
    // so a buffer of `len` elements is always large enough. A bounds-checked
    // slice is used, which also makes empty input unproblematic.
    auto R = new dchar[len];
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = uri[k];
        if (C != '%')
        {
            R[Rlen++] = C;
            continue;
        }

        // Remember where the escape starts in case it has to be kept verbatim.
        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            // Single octet: plain 7-bit character.
            C = B;
        }
        else
        {
            // Count the leading 1 bits of the first octet: n is the number
            // of octets in the UTF-8 sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining (n - 1) octets need three input units each.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = cast(dchar) V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // Keep the escape sequence exactly as written: R ~= uri[start .. k + 1]
            foreach (unit; uri[start .. k + 1])
                R[Rlen++] = unit;
        }
        else
        {
            R[Rlen++] = C;
        }
    }
    assert(Rlen <= len);    // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}
/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 */

string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // Escapes for these character classes are left in their %XX form.
    enum uint keepEscaped = URI_Reserved | URI_Hash;

    // Re-encode the decoded code points as UTF-8.
    char[] r;
    foreach (immutable dchar codePoint; URI_Decode(encodedURI, keepEscaped))
        encode(r, codePoint);
    return r;
}
/*******************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 * escape sequences are decoded.
 */

string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // An empty reserved set: no escape sequence is kept in its %XX form.
    enum uint keepEscaped = 0;

    // Re-encode the decoded code points as UTF-8.
    char[] r;
    foreach (immutable dchar codePoint; URI_Decode(encodedURIComponent, keepEscaped))
        encode(r, codePoint);
    return r;
}
/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 */

string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Every character class that may appear unescaped in a complete URI.
    enum uint unescaped = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uri), unescaped);
}
/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 */

string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Only letters, digits and mark characters pass through unescaped.
    enum uint unescaped = URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uriComponent), unescaped);
}
/* Encode associative array using www-form-urlencoding
 *
 * Each entry becomes `key=value`, with key and value passed through
 * encodeComponent; entries are joined with '&' in the array's iteration order.
 *
 * Params:
 *  values = an associative array containing the values to be encoded.
 *
 * Returns:
 *  A string encoded using www-form-urlencoding.
 */
package string urlEncode(in string[string] values)
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    enc.reserve(values.length * 128);

    // Empty before the first pair, "&" before every later one.
    string separator = "";
    foreach (key, value; values)
    {
        enc.put(separator);
        enc.put(encodeComponent(key));
        enc.put('=');
        enc.put(encodeComponent(value));
        separator = "&";
    }
    return enc.data;
}
@system unittest
{
    // @system because urlEncode -> encodeComponent -> URI_Encode,
    // and URI_Encode is not annotated @safe
    string[string] none;
    assert(urlEncode(none) == "");

    assert(urlEncode(["name1" : "value1"]) == "name1=value1");

    // Two entries may come out in either order.
    auto both = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(both == "name1=value1&name2=value2" || both == "name2=value2&name1=value1");
}
/***************************
 * Does string s[] start with a URL?
 *
 * The string must begin with "http://" or "https://" (compared without
 * regard to case) and contain at least one '.' after that prefix.
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */

ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    /* Must start with one of:
     *  http://
     *  https://
     * and at least one more character has to follow the prefix.
     */
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Advance over the characters accepted in a URL, noting the last '.'.
    ptrdiff_t lastDot = 0;
    scan: for (; pos < s.length; pos++)
    {
        immutable c = s[pos];
        switch (c)
        {
            case '.':
                lastDot = pos;
                break;

            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;

            default:
                if (!isAlphaNum(c))
                    break scan;     // first character that is not part of the URL
                break;
        }
    }

    // No '.' after the prefix: not treated as a URL.
    return lastDot ? pos : -1;
}
@safe unittest
{
    // A URL followed by other text: only the URL part is measured.
    string withUrl = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(withUrl) == 49);

    // Text that does not start with a URL.
    string withoutUrl = "no uri here";
    assert(uriLength(withoutUrl) == -1);

    assert(uriLength("issue 14924") < 0);
}
/***************************
 * Does string s[] start with an email address?
 *
 * The address must start with an ASCII letter, continue with letters,
 * digits, '-', '_' or '.' up to an '@', and be followed by a domain made of
 * letters, digits, '-', '_' and '.' whose part after the last '.' is two or
 * three characters long.
 * Returns:
 *  -1   it does not (this includes an empty s[])
 *  len  it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    // An empty string holds no address; checking the length first also
    // keeps s[0] below from being an out-of-bounds access.
    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    ptrdiff_t i;

    // Scan the local part up to and including the '@'.
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return -1;      // ran out of input before finding '@'
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // A '.' is required, and 2 or 3 characters must follow the last one.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}
@safe unittest
{
    // An address followed by other text: only the address is measured.
    string withAddress = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(withAddress) == 32);

    // Text that does not start with an address.
    string withoutAddress = "no email address here";
    assert(emailLength(withoutAddress) == -1);

    assert(emailLength("issue 14924") < 0);
}
547 @system unittest
549 //@system because of encode -> URI_Encode
550 debug(uri) writeln("uri.encodeURI.unittest");
552 string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
553 string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
555 auto result = encode(source);
556 debug(uri) writefln("result = '%s'", result);
557 assert(result == target);
558 result = decode(target);
559 debug(uri) writefln("result = '%s'", result);
560 assert(result == source);
562 result = encode(decode("%E3%81%82%E3%81%82"));
563 assert(result == "%E3%81%82%E3%81%82");
565 result = encodeComponent("c++");
566 assert(result == "c%2B%2B");
568 auto str = new char[10_000_000];
569 str[] = 'A';
570 result = encodeComponent(str);
571 foreach (char c; result)
572 assert(c == 'A');
574 result = decode("%41%42%43");
575 debug(uri) writeln(result);
577 import std.meta : AliasSeq;
578 foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
580 import std.conv : to;
581 StringType decoded1 = source.to!StringType;
582 string encoded1 = encode(decoded1);
583 assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
584 assert(encoded1 == target);
585 assert(decoded1 == decode(encoded1).to!StringType);
587 StringType encoded2 = target.to!StringType;
588 string decoded2 = decode(encoded2);
589 assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
590 assert(decoded2 == source);
591 assert(encoded2 == encode(decoded2).to!StringType);