libphobos/src/std/uri.d

   1 // Written in the D programming language.
   2
   3 /**
   4  * Encode and decode Uniform Resource Identifiers (URIs).
   5  * URIs are used in internet transfer protocols.
   6  * Valid URI characters consist of letters, digits,
   7  * and the characters $(B ;/?:@&amp;=+$,-_.!~*'())
   8  * Reserved URI characters are $(B ;/?:@&amp;=+$,)
   9  * Escape sequences consist of $(B %) followed by two hex digits.
  10  *
  11  * See_Also:
  12  *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
  13  *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
  14  * Copyright: Copyright Digital Mars 2000 - 2009.
  15  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  16  * Authors:   $(HTTP digitalmars.com, Walter Bright)
  17  * Source:    $(PHOBOSSRC std/_uri.d)
  18  */
  19 /*          Copyright Digital Mars 2000 - 2009.
  20  * Distributed under the Boost Software License, Version 1.0.
  21  *    (See accompanying file LICENSE_1_0.txt or copy at
  22  *          http://www.boost.org/LICENSE_1_0.txt)
  23  */
  24 module std.uri;
  25
  26 //debug=uri;        // uncomment to turn on debugging writefln's
  27 debug(uri) import std.stdio;
  28 import std.traits : isSomeChar;
  29
  30 /** This Exception is thrown if something goes wrong when encoding or
  31 decoding a URI.
  32 */
  33 class URIException : Exception
  34 {
  35     import std.exception : basicExceptionCtors;
  36     mixin basicExceptionCtors;
  37 }
  38
  39 private enum
  40 {
  41     URI_Alpha = 1,
  42     URI_Reserved = 2,
  43     URI_Mark = 4,
  44     URI_Digit = 8,
  45     URI_Hash = 0x10,        // '#'
  46 }
  47
  48 private immutable char[16] hex2ascii = "0123456789ABCDEF";
  49
  50 private immutable ubyte[128] uri_flags =      // indexed by character
  51     ({
  52         ubyte[128] uflags;
  53
  54         // Compile time initialize
  55         uflags['#'] |= URI_Hash;
  56
  57         foreach (c; 'A' .. 'Z' + 1)
  58         {
  59             uflags[c] |= URI_Alpha;
  60             uflags[c + 0x20] |= URI_Alpha;   // lowercase letters
  61         }
  62         foreach (c; '0' .. '9' + 1) uflags[c] |= URI_Digit;
  63         foreach (c; ";/?:@&=+$,")   uflags[c] |= URI_Reserved;
  64         foreach (c; "-_.!~*'()")    uflags[c] |= URI_Mark;
  65         return uflags;
  66     })();
  67
  68 private string URI_Encode(dstring str, uint unescapedSet)
  69 {
  70     import core.exception : OutOfMemoryError;
  71     import core.stdc.stdlib : alloca;
  72
  73     uint j;
  74     uint k;
  75     dchar V;
  76     dchar C;
  77
  78     // result buffer
  79     char[50] buffer = void;
  80     char* R;
  81     uint Rlen;
  82     uint Rsize; // alloc'd size
  83
  84     immutable len = str.length;
  85
  86     R = buffer.ptr;
  87     Rsize = buffer.length;
  88     Rlen = 0;
  89
  90     for (k = 0; k != len; k++)
  91     {
  92         C = str[k];
  93         // if (C in unescapedSet)
  94         if (C < uri_flags.length && uri_flags[C] & unescapedSet)
  95         {
  96             if (Rlen == Rsize)
  97             {
  98                 char* R2;
  99
 100                 Rsize *= 2;
 101                 if (Rsize > 1024)
 102                 {
 103                     R2 = (new char[Rsize]).ptr;
 104                 }
 105                 else
 106                 {
 107                     R2 = cast(char *) alloca(Rsize * char.sizeof);
 108                     if (!R2)
 109                         throw new OutOfMemoryError("Alloca failure");
 110                 }
 111                 R2[0 .. Rlen] = R[0 .. Rlen];
 112                 R = R2;
 113             }
 114             R[Rlen] = cast(char) C;
 115             Rlen++;
 116         }
 117         else
 118         {
 119             char[6] Octet;
 120             uint L;
 121
 122             V = C;
 123
 124             // Transform V into octets
 125             if (V <= 0x7F)
 126             {
 127                 Octet[0] = cast(char) V;
 128                 L = 1;
 129             }
 130             else if (V <= 0x7FF)
 131             {
 132                 Octet[0] = cast(char)(0xC0 | (V >> 6));
 133                 Octet[1] = cast(char)(0x80 | (V & 0x3F));
 134                 L = 2;
 135             }
 136             else if (V <= 0xFFFF)
 137             {
 138                 Octet[0] = cast(char)(0xE0 | (V >> 12));
 139                 Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
 140                 Octet[2] = cast(char)(0x80 | (V & 0x3F));
 141                 L = 3;
 142             }
 143             else if (V <= 0x1FFFFF)
 144             {
 145                 Octet[0] = cast(char)(0xF0 | (V >> 18));
 146                 Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
 147                 Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
 148                 Octet[3] = cast(char)(0x80 | (V & 0x3F));
 149                 L = 4;
 150             }
 151             else
 152             {
 153                 throw new URIException("Undefined UTF-32 code point");
 154             }
 155
 156             if (Rlen + L * 3 > Rsize)
 157             {
 158                 char *R2;
 159
 160                 Rsize = 2 * (Rlen + L * 3);
 161                 if (Rsize > 1024)
 162                 {
 163                     R2 = (new char[Rsize]).ptr;
 164                 }
 165                 else
 166                 {
 167                     R2 = cast(char *) alloca(Rsize * char.sizeof);
 168                     if (!R2)
 169                         throw new OutOfMemoryError("Alloca failure");
 170                 }
 171                 R2[0 .. Rlen] = R[0 .. Rlen];
 172                 R = R2;
 173             }
 174
 175             for (j = 0; j < L; j++)
 176             {
 177                 R[Rlen] = '%';
 178                 R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
 179                 R[Rlen + 2] = hex2ascii[Octet[j] & 15];
 180
 181                 Rlen += 3;
 182             }
 183         }
 184     }
 185
 186     return R[0 .. Rlen].idup;
 187 }
 188
 189 private uint ascii2hex(dchar c) @nogc @safe pure nothrow
 190 {
 191     return (c <= '9') ? c - '0' :
 192         (c <= 'F') ? c - 'A' + 10 :
 193         c - 'a' + 10;
 194 }
 195
 196 private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
 197 if (isSomeChar!Char)
 198 {
 199     import core.exception : OutOfMemoryError;
 200     import core.stdc.stdlib : alloca;
 201     import std.ascii : isHexDigit;
 202
 203     uint j;
 204     uint k;
 205     uint V;
 206     dchar C;
 207
 208     // Result array, allocated on stack
 209     dchar* R;
 210     uint Rlen;
 211
 212     immutable len = uri.length;
 213     auto s = uri.ptr;
 214
 215     // Preallocate result buffer R guaranteed to be large enough for result
 216     auto Rsize = len;
 217     if (Rsize > 1024 / dchar.sizeof)
 218     {
 219         R = (new dchar[Rsize]).ptr;
 220     }
 221     else
 222     {
 223         R = cast(dchar *) alloca(Rsize * dchar.sizeof);
 224         if (!R)
 225             throw new OutOfMemoryError("Alloca failure");
 226     }
 227     Rlen = 0;
 228
 229     for (k = 0; k != len; k++)
 230     {
 231         char B;
 232         uint start;
 233
 234         C = s[k];
 235         if (C != '%')
 236         {
 237             R[Rlen] = C;
 238             Rlen++;
 239             continue;
 240         }
 241         start = k;
 242         if (k + 2 >= len)
 243             throw new URIException("Unexpected end of URI");
 244         if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
 245             throw new URIException("Expected two hexadecimal digits after '%'");
 246         B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
 247         k += 2;
 248         if ((B & 0x80) == 0)
 249         {
 250             C = B;
 251         }
 252         else
 253         {
 254             uint n;
 255
 256             for (n = 1; ; n++)
 257             {
 258                 if (n > 4)
 259                     throw new URIException("UTF-32 code point size too large");
 260                 if (((B << n) & 0x80) == 0)
 261                 {
 262                     if (n == 1)
 263                         throw new URIException("UTF-32 code point size too small");
 264                     break;
 265                 }
 266             }
 267
 268             // Pick off (7 - n) significant bits of B from first byte of octet
 269             V = B & ((1 << (7 - n)) - 1);   // (!!!)
 270
 271             if (k + (3 * (n - 1)) >= len)
 272                 throw new URIException("UTF-32 unaligned String");
 273             for (j = 1; j != n; j++)
 274             {
 275                 k++;
 276                 if (s[k] != '%')
 277                     throw new URIException("Expected: '%'");
 278                 if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
 279                     throw new URIException("Expected two hexadecimal digits after '%'");
 280                 B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
 281                 if ((B & 0xC0) != 0x80)
 282                     throw new URIException("Incorrect UTF-32 multi-byte sequence");
 283                 k += 2;
 284                 V = (V << 6) | (B & 0x3F);
 285             }
 286             if (V > 0x10FFFF)
 287                 throw new URIException("Unknown UTF-32 code point");
 288             C = V;
 289         }
 290         if (C < uri_flags.length && uri_flags[C] & reservedSet)
 291         {
 292             // R ~= s[start .. k + 1];
 293             immutable width = (k + 1) - start;
 294             for (int ii = 0; ii < width; ii++)
 295                 R[Rlen + ii] = s[start + ii];
 296             Rlen += width;
 297         }
 298         else
 299         {
 300             R[Rlen] = C;
 301             Rlen++;
 302         }
 303     }
 304     assert(Rlen <= Rsize);  // enforce our preallocation size guarantee
 305
 306     // Copy array on stack to array in memory
 307     return R[0 .. Rlen].idup;
 308 }
 309
 310 /*************************************
 311  * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 312  * Escape sequences that resolve to reserved URI characters are not replaced.
 313  * Escape sequences that resolve to the '#' character are not replaced.
 314  */
 315
 316 string decode(Char)(in Char[] encodedURI)
 317 if (isSomeChar!Char)
 318 {
 319     import std.algorithm.iteration : each;
 320     import std.utf : encode;
 321     auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
 322     char[] r;
 323     s.each!(c => encode(r, c));
 324     return r;
 325 }
 326
 327 /*******************************
 328  * Decodes the URI string encodedURI into a UTF-8 string and returns it. All
 329  * escape sequences are decoded.
 330  */
 331
 332 string decodeComponent(Char)(in Char[] encodedURIComponent)
 333 if (isSomeChar!Char)
 334 {
 335     import std.algorithm.iteration : each;
 336     import std.utf : encode;
 337     auto s = URI_Decode(encodedURIComponent, 0);
 338     char[] r;
 339     s.each!(c => encode(r, c));
 340     return r;
 341 }
 342
 343 /*****************************
 344  * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 345  * not a valid URI character is escaped. The '#' character is not escaped.
 346  */
 347
 348 string encode(Char)(in Char[] uri)
 349 if (isSomeChar!Char)
 350 {
 351     import std.utf : toUTF32;
 352     auto s = toUTF32(uri);
 353     return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
 354 }
 355
 356 /********************************
 357  * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 358  * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 359  */
 360
 361 string encodeComponent(Char)(in Char[] uriComponent)
 362 if (isSomeChar!Char)
 363 {
 364     import std.utf : toUTF32;
 365     auto s = toUTF32(uriComponent);
 366     return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
 367 }
 368
 369 /* Encode associative array using www-form-urlencoding
 370  *
 371  * Params:
 372  *      values = an associative array containing the values to be encoded.
 373  *
 374  * Returns:
 375  *      A string encoded using www-form-urlencoding.
 376  */
 377 package string urlEncode(in string[string] values)
 378 {
 379     if (values.length == 0)
 380         return "";
 381
 382     import std.array : Appender;
 383     import std.format : formattedWrite;
 384
 385     Appender!string enc;
 386     enc.reserve(values.length * 128);
 387
 388     bool first = true;
 389     foreach (k, v; values)
 390     {
 391         if (!first)
 392             enc.put('&');
 393         formattedWrite(enc, "%s=%s", encodeComponent(k), encodeComponent(v));
 394         first = false;
 395     }
 396     return enc.data;
 397 }
 398
 399 @system unittest
 400 {
 401     // @system because urlEncode -> encodeComponent -> URI_Encode
 402     // URI_Encode uses alloca and pointer slicing
 403     string[string] a;
 404     assert(urlEncode(a) == "");
 405     assert(urlEncode(["name1" : "value1"]) == "name1=value1");
 406     auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
 407     assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
 408 }
 409
 410 /***************************
 411  * Does string s[] start with a URL?
 412  * Returns:
 413  *  -1   it does not
 414  *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 415  */
 416
 417 ptrdiff_t uriLength(Char)(in Char[] s)
 418 if (isSomeChar!Char)
 419 {
 420     /* Must start with one of:
 421      *  http://
 422      *  https://
 423      *  www.
 424      */
 425     import std.ascii : isAlphaNum;
 426     import std.uni : icmp;
 427
 428     ptrdiff_t i;
 429
 430     if (s.length <= 4)
 431         return -1;
 432
 433     if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
 434     {
 435         i = 7;
 436     }
 437     else
 438     {
 439         if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
 440             i = 8;
 441         else
 442             return -1;
 443     }
 444
 445     ptrdiff_t lastdot;
 446     for (; i < s.length; i++)
 447     {
 448         auto c = s[i];
 449         if (isAlphaNum(c))
 450             continue;
 451         if (c == '-' || c == '_' || c == '?' ||
 452                 c == '=' || c == '%' || c == '&' ||
 453                 c == '/' || c == '+' || c == '#' ||
 454                 c == '~' || c == '$')
 455             continue;
 456         if (c == '.')
 457         {
 458             lastdot = i;
 459             continue;
 460         }
 461         break;
 462     }
 463     if (!lastdot)
 464         return -1;
 465
 466     return i;
 467 }
 468
 469 ///
 470 @safe unittest
 471 {
 472     string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
 473     assert(uriLength(s1) == 49);
 474     string s2 = "no uri here";
 475     assert(uriLength(s2) == -1);
 476     assert(uriLength("issue 14924") < 0);
 477 }
 478
 479
 480 /***************************
 481  * Does string s[] start with an email address?
 482  * Returns:
 483  *  -1    it does not
 484  *  len   it does, and s[0 .. i] is the slice of s[] that is that email address
 485  * References:
 486  *  RFC2822
 487  */
 488 ptrdiff_t emailLength(Char)(in Char[] s)
 489 if (isSomeChar!Char)
 490 {
 491     import std.ascii : isAlpha, isAlphaNum;
 492
 493     ptrdiff_t i;
 494
 495     if (!isAlpha(s[0]))
 496         return -1;
 497
 498     for (i = 1; 1; i++)
 499     {
 500         if (i == s.length)
 501             return -1;
 502         auto c = s[i];
 503         if (isAlphaNum(c))
 504             continue;
 505         if (c == '-' || c == '_' || c == '.')
 506             continue;
 507         if (c != '@')
 508             return -1;
 509         i++;
 510         break;
 511     }
 512
 513     /* Now do the part past the '@'
 514      */
 515     ptrdiff_t lastdot;
 516     for (; i < s.length; i++)
 517     {
 518         auto c = s[i];
 519         if (isAlphaNum(c))
 520             continue;
 521         if (c == '-' || c == '_')
 522             continue;
 523         if (c == '.')
 524         {
 525             lastdot = i;
 526             continue;
 527         }
 528         break;
 529     }
 530     if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
 531         return -1;
 532
 533     return i;
 534 }
 535
 536 ///
 537 @safe unittest
 538 {
 539     string s1 = "my.e-mail@www.example-domain.com with garbage added";
 540     assert(emailLength(s1) == 32);
 541     string s2 = "no email address here";
 542     assert(emailLength(s2) == -1);
 543     assert(emailLength("issue 14924") < 0);
 544 }
 545
 546
 547 @system unittest
 548 {
 549     //@system because of encode -> URI_Encode
 550     debug(uri) writeln("uri.encodeURI.unittest");
 551
 552     string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
 553     string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
 554
 555     auto result = encode(source);
 556     debug(uri) writefln("result = '%s'", result);
 557     assert(result == target);
 558     result = decode(target);
 559     debug(uri) writefln("result = '%s'", result);
 560     assert(result == source);
 561
 562     result = encode(decode("%E3%81%82%E3%81%82"));
 563     assert(result == "%E3%81%82%E3%81%82");
 564
 565     result = encodeComponent("c++");
 566     assert(result == "c%2B%2B");
 567
 568     auto str = new char[10_000_000];
 569     str[] = 'A';
 570     result = encodeComponent(str);
 571     foreach (char c; result)
 572         assert(c == 'A');
 573
 574     result = decode("%41%42%43");
 575     debug(uri) writeln(result);
 576
 577     import std.meta : AliasSeq;
 578     foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
 579     {
 580         import std.conv : to;
 581         StringType decoded1 = source.to!StringType;
 582         string encoded1 = encode(decoded1);
 583         assert(decoded1 == source.to!StringType); // check that `decoded1` wasn't changed
 584         assert(encoded1 == target);
 585         assert(decoded1 == decode(encoded1).to!StringType);
 586
 587         StringType encoded2 = target.to!StringType;
 588         string decoded2 = decode(encoded2);
 589         assert(encoded2 == target.to!StringType); // check that `encoded2` wasn't changed
 590         assert(decoded2 == source);
 591         assert(encoded2 == encode(decoded2).to!StringType);
 592     }
 593 }