libphobos/src/std/uri.d

   1 // Written in the D programming language.
   2
   3 /**
   4  * Encode and decode Uniform Resource Identifiers (URIs).
   5  * URIs are used in internet transfer protocols.
   6  * Valid URI characters consist of letters, digits,
   7  * and the characters $(B ;/?:@&amp;=+$,-_.!~*'())
   8  * Reserved URI characters are $(B ;/?:@&amp;=+$,)
   9  * Escape sequences consist of $(B %) followed by two hex digits.
  10  *
  11  * See_Also:
  12  *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
  13  *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
  14  * Copyright: Copyright The D Language Foundation 2000 - 2009.
  15  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  16  * Authors:   $(HTTP digitalmars.com, Walter Bright)
  17  * Source:    $(PHOBOSSRC std/uri.d)
  18  */
  19 /*          Copyright The D Language Foundation 2000 - 2009.
  20  * Distributed under the Boost Software License, Version 1.0.
  21  *    (See accompanying file LICENSE_1_0.txt or copy at
  22  *          http://www.boost.org/LICENSE_1_0.txt)
  23  */
  24 module std.uri;
  25
  26 //debug=uri;        // uncomment to turn on debugging writefln's
  27 debug(uri) import std.stdio;
  28 import std.traits : isSomeChar;
  29
  30 /** This Exception is thrown if something goes wrong when encoding or
  31 decoding a URI.
  32 */
  33 class URIException : Exception
  34 {
  35     import std.exception : basicExceptionCtors;
  36     mixin basicExceptionCtors;
  37 }
  38
  39 ///
  40 @safe unittest
  41 {
  42     import std.exception : assertThrown;
  43     assertThrown!URIException("%ab".decode);
  44 }
  45
  46 private enum
  47 {
  48     URI_Alpha = 1,
  49     URI_Reserved = 2,
  50     URI_Mark = 4,
  51     URI_Digit = 8,
  52     URI_Hash = 0x10,        // '#'
  53 }
  54
  55 private immutable char[16] hex2ascii = "0123456789ABCDEF";
  56
  57 private immutable ubyte[128] uri_flags =      // indexed by character
  58     ({
  59         ubyte[128] uflags;
  60
  61         // Compile time initialize
  62         uflags['#'] |= URI_Hash;
  63
  64         foreach (c; 'A' .. 'Z' + 1)
  65         {
  66             uflags[c] |= URI_Alpha;
  67             uflags[c + 0x20] |= URI_Alpha;   // lowercase letters
  68         }
  69         foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
  70         foreach (c; ";/?:@&=+$,")   uflags[c] |= URI_Reserved;
  71         foreach (c; "-_.!~*'()")    uflags[c] |= URI_Mark;
  72         return uflags;
  73     })();
  74
  75 private string URI_Encode(dstring str, uint unescapedSet) @safe pure
  76 {
  77     uint j;
  78     uint k;
  79     dchar V;
  80     dchar C;
  81
  82     // result buffer
  83     char[50] buffer = void;
  84     char[] R;
  85     uint Rlen;
  86     uint Rsize; // alloc'd size
  87
  88     immutable len = str.length;
  89
  90     R = buffer[];
  91     Rsize = buffer.length;
  92     Rlen = 0;
  93
  94     for (k = 0; k != len; k++)
  95     {
  96         C = str[k];
  97         // if (C in unescapedSet)
  98         if (C < uri_flags.length && uri_flags[C] & unescapedSet)
  99         {
 100             if (Rlen == Rsize)
 101             {
 102                 char[] R2;
 103
 104                 Rsize *= 2;
 105                 R2 = new char[Rsize];
 106                 R2[0 .. Rlen] = R[0 .. Rlen];
 107                 R = R2;
 108             }
 109             R[Rlen] = cast(char) C;
 110             Rlen++;
 111         }
 112         else
 113         {
 114             char[6] Octet;
 115             uint L;
 116
 117             V = C;
 118
 119             // Transform V into octets
 120             if (V <= 0x7F)
 121             {
 122                 Octet[0] = cast(char) V;
 123                 L = 1;
 124             }
 125             else if (V <= 0x7FF)
 126             {
 127                 Octet[0] = cast(char)(0xC0 | (V >> 6));
 128                 Octet[1] = cast(char)(0x80 | (V & 0x3F));
 129                 L = 2;
 130             }
 131             else if (V <= 0xFFFF)
 132             {
 133                 Octet[0] = cast(char)(0xE0 | (V >> 12));
 134                 Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
 135                 Octet[2] = cast(char)(0x80 | (V & 0x3F));
 136                 L = 3;
 137             }
 138             else if (V <= 0x1FFFFF)
 139             {
 140                 Octet[0] = cast(char)(0xF0 | (V >> 18));
 141                 Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
 142                 Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
 143                 Octet[3] = cast(char)(0x80 | (V & 0x3F));
 144                 L = 4;
 145             }
 146             else
 147             {
 148                 throw new URIException("Undefined UTF-32 code point");
 149             }
 150
 151             if (Rlen + L * 3 > Rsize)
 152             {
 153                 char[] R2;
 154
 155                 Rsize = 2 * (Rlen + L * 3);
 156                 R2 = new char[Rsize];
 157                 R2[0 .. Rlen] = R[0 .. Rlen];
 158                 R = R2;
 159             }
 160
 161             for (j = 0; j < L; j++)
 162             {
 163                 R[Rlen] = '%';
 164                 R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
 165                 R[Rlen + 2] = hex2ascii[Octet[j] & 15];
 166
 167                 Rlen += 3;
 168             }
 169         }
 170     }
 171
 172     return R[0 .. Rlen].idup;
 173 }
 174
 175 @safe pure unittest
 176 {
 177     import std.exception : assertThrown;
 178
 179     assert(URI_Encode("", 0) == "");
 180     assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");
 181     dstring a;
 182     a ~= cast(dchar) 0xFFFFFFFF;
 183     assertThrown(URI_Encode(a, 0));
 184     assert(URI_Encode("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0).length == 3 * 60);
 185 }
 186
 187 private uint ascii2hex(dchar c) @nogc @safe pure nothrow
 188 {
 189     return (c <= '9') ? c - '0' :
 190         (c <= 'F') ? c - 'A' + 10 :
 191         c - 'a' + 10;
 192 }
 193
 194 private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
 195 if (isSomeChar!Char)
 196 {
 197     import std.ascii : isHexDigit;
 198
 199     uint j;
 200     uint k;
 201     uint V;
 202     dchar C;
 203
 204     uint Rlen;
 205     immutable len = uri.length;
 206     auto s = uri;
 207
 208     auto Rsize = len;
 209     dchar[] R = new dchar[Rsize];
 210     Rlen = 0;
 211
 212     for (k = 0; k != len; k++)
 213     {
 214         char B;
 215         uint start;
 216
 217         C = s[k];
 218         if (C != '%')
 219         {
 220             R[Rlen] = C;
 221             Rlen++;
 222             continue;
 223         }
 224         start = k;
 225         if (k + 2 >= len)
 226             throw new URIException("Unexpected end of URI");
 227         if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
 228             throw new URIException("Expected two hexadecimal digits after '%'");
 229         B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
 230         k += 2;
 231         if ((B & 0x80) == 0)
 232         {
 233             C = B;
 234         }
 235         else
 236         {
 237             uint n;
 238
 239             for (n = 1; ; n++)
 240             {
 241                 if (n > 4)
 242                     throw new URIException("UTF-32 code point size too large");
 243                 if (((B << n) & 0x80) == 0)
 244                 {
 245                     if (n == 1)
 246                         throw new URIException("UTF-32 code point size too small");
 247                     break;
 248                 }
 249             }
 250
 251             // Pick off (7 - n) significant bits of B from first byte of octet
 252             V = B & ((1 << (7 - n)) - 1);   // (!!!)
 253
 254             if (k + (3 * (n - 1)) >= len)
 255                 throw new URIException("UTF-32 unaligned String");
 256             for (j = 1; j != n; j++)
 257             {
 258                 k++;
 259                 if (s[k] != '%')
 260                     throw new URIException("Expected: '%'");
 261                 if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
 262                     throw new URIException("Expected two hexadecimal digits after '%'");
 263                 B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
 264                 if ((B & 0xC0) != 0x80)
 265                     throw new URIException("Incorrect UTF-32 multi-byte sequence");
 266                 k += 2;
 267                 V = (V << 6) | (B & 0x3F);
 268             }
 269             if (V > 0x10FFFF)
 270                 throw new URIException("Unknown UTF-32 code point");
 271             C = V;
 272         }
 273         if (C < uri_flags.length && uri_flags[C] & reservedSet)
 274         {
 275             // R ~= s[start .. k + 1];
 276             immutable width = (k + 1) - start;
 277             for (int ii = 0; ii < width; ii++)
 278                 R[Rlen + ii] = s[start + ii];
 279             Rlen += width;
 280         }
 281         else
 282         {
 283             R[Rlen] = C;
 284             Rlen++;
 285         }
 286     }
 287     assert(Rlen <= Rsize);  // enforce our preallocation size guarantee
 288
 289     // Copy array on stack to array in memory
 290     return R[0 .. Rlen].idup;
 291 }
 292
 293 @safe pure unittest
 294 {
 295     import std.exception : assertThrown;
 296
 297     assert(URI_Decode("", 0) == "");
 298     assertThrown!URIException(URI_Decode("%", 0));
 299     assertThrown!URIException(URI_Decode("%xx", 0));
 300     assertThrown!URIException(URI_Decode("%FF", 0));
 301     assertThrown!URIException(URI_Decode("%C0", 0));
 302     assertThrown!URIException(URI_Decode("%C0000000", 0));
 303     assertThrown!URIException(URI_Decode("%C0%xx0000", 0));
 304     assertThrown!URIException(URI_Decode("%C0%C00000", 0));
 305     assertThrown!URIException(URI_Decode("%F7%BF%BF%BF", 0));
 306     assert(URI_Decode("%23", URI_Hash) == "%23");
 307 }
 308
 309 /*************************************
 310  * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 311  * Escape sequences that resolve to reserved URI characters are not replaced.
 312  * Escape sequences that resolve to the '#' character are not replaced.
 313  */
 314 string decode(Char)(scope const(Char)[] encodedURI)
 315 if (isSomeChar!Char)
 316 {
 317     import std.algorithm.iteration : each;
 318     import std.utf : encode;
 319     auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
 320     char[] r;
 321     s.each!(c => encode(r, c));
 322     return r;
 323 }
 324
 325 ///
 326 @safe unittest
 327 {
 328     assert("foo%20bar".decode == "foo bar");
 329     assert("%3C%3E.@.%E2%84%A2".decode == "<>.@.™");
 330     assert("foo&/".decode == "foo&/");
 331     assert("!@#$&*(".decode == "!@#$&*(");
 332 }
 333
 334 /*******************************
 335  * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 336  * escape sequences are decoded.
 337  */
 338 string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
 339 if (isSomeChar!Char)
 340 {
 341     import std.algorithm.iteration : each;
 342     import std.utf : encode;
 343     auto s = URI_Decode(encodedURIComponent, 0);
 344     char[] r;
 345     s.each!(c => encode(r, c));
 346     return r;
 347 }
 348
 349 ///
 350 @safe unittest
 351 {
 352     assert("foo%2F%26".decodeComponent == "foo/&");
 353     assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent == "dläng röcks");
 354     assert("!%40%23%24%25%5E%26*(".decodeComponent == "!@#$%^&*(");
 355 }
 356
 357 /*****************************
 358  * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 359  * not a valid URI character is escaped. The '#' character is not escaped.
 360  */
 361 string encode(Char)(scope const(Char)[] uri)
 362 if (isSomeChar!Char)
 363 {
 364     import std.utf : toUTF32;
 365     auto s = toUTF32(uri);
 366     return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
 367 }
 368
 369 ///
 370 @safe unittest
 371 {
 372     assert("foo bar".encode == "foo%20bar");
 373     assert("<>.@.™".encode == "%3C%3E.@.%E2%84%A2");
 374     assert("foo/#?a=1&b=2".encode == "foo/#?a=1&b=2");
 375     assert("dlang+rocks!".encode == "dlang+rocks!");
 376     assert("!@#$%^&*(".encode == "!@#$%25%5E&*(");
 377 }
 378
 379 /********************************
 380  * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 381  * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 382  */
 383 string encodeComponent(Char)(scope const(Char)[] uriComponent)
 384 if (isSomeChar!Char)
 385 {
 386     import std.utf : toUTF32;
 387     auto s = toUTF32(uriComponent);
 388     return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
 389 }
 390
 391 ///
 392 @safe unittest
 393 {
 394     assert("!@#$%^&*(".encodeComponent == "!%40%23%24%25%5E%26*(");
 395     assert("<>.@.™".encodeComponent == "%3C%3E.%40.%E2%84%A2");
 396     assert("foo/&".encodeComponent == "foo%2F%26");
 397     assert("dläng röcks".encodeComponent == "dl%C3%A4ng%20r%C3%B6cks");
 398     assert("dlang+rocks!".encodeComponent == "dlang%2Brocks!");
 399 }
 400
 401 /* Encode associative array using www-form-urlencoding
 402  *
 403  * Params:
 404  *      values = an associative array containing the values to be encoded.
 405  *
 406  * Returns:
 407  *      A string encoded using www-form-urlencoding.
 408  */
 409 package string urlEncode(scope string[string] values) @safe pure
 410 {
 411     if (values.length == 0)
 412         return "";
 413
 414     import std.array : Appender;
 415     import std.format.write : formattedWrite;
 416
 417     Appender!string enc;
 418     enc.reserve(values.length * 128);
 419
 420     bool first = true;
 421     foreach (k, v; values)
 422     {
 423         if (!first)
 424             enc.put('&');
 425         formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
 426         first = false;
 427     }
 428     return enc.data;
 429 }
 430
 431 @safe pure unittest
 432 {
 433     // @system because urlEncode -> encodeComponent -> URI_Encode
 434     // URI_Encode uses alloca and pointer slicing
 435     string[string] a;
 436     assert(urlEncode(a) == "");
 437     assert(urlEncode(["name1" : "value1"]) == "name1=value1");
 438     auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
 439     assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
 440 }
 441
 442 /***************************
 443  * Does string s[] start with a URL?
 444  * Returns:
 445  *  -1   it does not
 446  *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 447  */
 448
 449 ptrdiff_t uriLength(Char)(scope const(Char)[] s)
 450 if (isSomeChar!Char)
 451 {
 452     /* Must start with one of:
 453      *  http://
 454      *  https://
 455      *  www.
 456      */
 457     import std.ascii : isAlphaNum;
 458     import std.uni : icmp;
 459
 460     ptrdiff_t i;
 461
 462     if (s.length <= 4)
 463         return -1;
 464
 465     if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
 466     {
 467         i = 7;
 468     }
 469     else
 470     {
 471         if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
 472             i = 8;
 473         else
 474             return -1;
 475     }
 476
 477     ptrdiff_t lastdot;
 478     for (; i < s.length; i++)
 479     {
 480         auto c = s[i];
 481         if (isAlphaNum(c))
 482             continue;
 483         if (c == '-' || c == '_' || c == '?' ||
 484                 c == '=' || c == '%' || c == '&' ||
 485                 c == '/' || c == '+' || c == '#' ||
 486                 c == '~' || c == '$')
 487             continue;
 488         if (c == '.')
 489         {
 490             lastdot = i;
 491             continue;
 492         }
 493         break;
 494     }
 495     if (!lastdot)
 496         return -1;
 497
 498     return i;
 499 }
 500
 501 ///
 502 @safe pure unittest
 503 {
 504     string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
 505     assert(uriLength(s1) == 49);
 506     string s2 = "no uri here";
 507     assert(uriLength(s2) == -1);
 508     assert(uriLength("issue 14924") < 0);
 509 }
 510
 511 @safe pure nothrow @nogc unittest
 512 {
 513     assert(uriLength("") == -1);
 514     assert(uriLength("https://www") == -1);
 515 }
 516
 517 /***************************
 518  * Does string s[] start with an email address?
 519  * Returns:
 520  *  -1    it does not
 521  *  len   it does, and s[0 .. i] is the slice of s[] that is that email address
 522  * References:
 523  *  RFC2822
 524  */
 525 ptrdiff_t emailLength(Char)(scope const(Char)[] s)
 526 if (isSomeChar!Char)
 527 {
 528     import std.ascii : isAlpha, isAlphaNum;
 529
 530     ptrdiff_t i;
 531
 532     if (s.length == 0)
 533         return -1;
 534
 535     if (!isAlpha(s[0]))
 536         return -1;
 537
 538     for (i = 1; 1; i++)
 539     {
 540         if (i == s.length)
 541             return -1;
 542         auto c = s[i];
 543         if (isAlphaNum(c))
 544             continue;
 545         if (c == '-' || c == '_' || c == '.')
 546             continue;
 547         if (c != '@')
 548             return -1;
 549         i++;
 550         break;
 551     }
 552
 553     /* Now do the part past the '@'
 554      */
 555     ptrdiff_t lastdot;
 556     for (; i < s.length; i++)
 557     {
 558         auto c = s[i];
 559         if (isAlphaNum(c))
 560             continue;
 561         if (c == '-' || c == '_')
 562             continue;
 563         if (c == '.')
 564         {
 565             lastdot = i;
 566             continue;
 567         }
 568         break;
 569     }
 570     if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
 571         return -1;
 572
 573     return i;
 574 }
 575
 576 ///
 577 @safe pure unittest
 578 {
 579     string s1 = "my.e-mail@www.example-domain.com with garbage added";
 580     assert(emailLength(s1) == 32);
 581     string s2 = "no email address here";
 582     assert(emailLength(s2) == -1);
 583     assert(emailLength("issue 14924") < 0);
 584 }
 585
 586 @safe pure unittest
 587 {
 588     //@system because of encode -> URI_Encode
 589     debug(uri) writeln("uri.encodeURI.unittest");
 590
 591     string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
 592     string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
 593
 594     auto result = encode(source);
 595     debug(uri) writefln("result = '%s'", result);
 596     assert(result == target);
 597     result = decode(target);
 598     debug(uri) writefln("result = '%s'", result);
 599     assert(result == source);
 600
 601     result = encode(decode("%E3%81%82%E3%81%82"));
 602     assert(result == "%E3%81%82%E3%81%82");
 603
 604     result = encodeComponent("c++");
 605     assert(result == "c%2B%2B");
 606
 607     auto str = new char[10_000_000];
 608     str[] = 'A';
 609     result = encodeComponent(str);
 610     foreach (char c; result)
 611         assert(c == 'A');
 612
 613     result = decode("%41%42%43");
 614     debug(uri) writeln(result);
 615
 616     import std.meta : AliasSeq;
 617     static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
 618     {{
 619         import std.conv : to;
 620         StringType decoded1 = source.to!StringType;
 621         string encoded1 = encode(decoded1);
 622         assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
 623         assert(encoded1 == target);
 624         assert(decoded1 == decode(encoded1).to!StringType);
 625
 626         StringType encoded2 = target.to!StringType;
 627         string decoded2 = decode(encoded2);
 628         assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
 629         assert(decoded2 == source);
 630         assert(encoded2 == encode(decoded2).to!StringType);
 631     }}
 632 }
 633
 634 @safe pure nothrow @nogc unittest
 635 {
 636     assert(emailLength("") == -1);
 637     assert(emailLength("@") == -1);
 638     assert(emailLength("abcd") == -1);
 639     assert(emailLength("blah@blub") == -1);
 640     assert(emailLength("blah@blub.") == -1);
 641     assert(emailLength("blah@blub.domain") == -1);
 642 }