1 // Written in the D programming language.
4 * Encode and decode Uniform Resource Identifiers (URIs).
5 * URIs are used in internet transfer protocols.
6 * Valid URI characters consist of letters, digits,
7 * and the characters $(B ;/?:@&=+$,-_.!~*'())
8 * Reserved URI characters are $(B ;/?:@&=+$,)
9 * Escape sequences consist of $(B %) followed by two hex digits.
12 * $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13 * $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14 * Copyright: Copyright The D Language Foundation 2000 - 2009.
15 * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16 * Authors: $(HTTP digitalmars.com, Walter Bright)
17 * Source: $(PHOBOSSRC std/uri.d)
19 /* Copyright The D Language Foundation 2000 - 2009.
20 * Distributed under the Boost Software License, Version 1.0.
21 * (See accompanying file LICENSE_1_0.txt or copy at
22 * http://www.boost.org/LICENSE_1_0.txt)
26 //debug=uri; // uncomment to turn on debugging writefln's
27 debug(uri
) import std
.stdio
;
28 import std
.traits
: isSomeChar
;
30 /** This Exception is thrown if something goes wrong when encoding or
33 class URIException
: Exception
35 import std
.exception
: basicExceptionCtors
;
36 mixin basicExceptionCtors
;
42 import std
.exception
: assertThrown
;
43 assertThrown
!URIException("%ab".decode
);
52 URI_Hash
= 0x10, // '#'
// Lookup table from a 4-bit value (0 .. 15) to its upper-case hexadecimal digit;
// URI_Encode indexes it with the high and low nibble of each octet it escapes.
55 private immutable char[16] hex2ascii
= "0123456789ABCDEF";
57 private immutable ubyte[128] uri_flags
= // indexed by character
61 // Compile time initialize
62 uflags
['#'] |
= URI_Hash
;
64 foreach (c
; 'A' .. 'Z' + 1)
66 uflags
[c
] |
= URI_Alpha
;
67 uflags
[c
+ 0x20] |
= URI_Alpha
; // lowercase letters
69 foreach (c
; '0' .. '9' + 1) uflags
[c
] |
= URI_Digit
;
70 foreach (c
; ";/?:@&=+$,") uflags
[c
] |
= URI_Reserved
;
71 foreach (c
; "-_.!~*'()") uflags
[c
] |
= URI_Mark
;
75 private string
URI_Encode(dstring
str, uint unescapedSet
) @safe pure
83 char[50] buffer
= void;
86 uint Rsize
; // alloc'd size
88 immutable len
= str.length
;
91 Rsize
= buffer
.length
;
94 for (k
= 0; k
!= len
; k
++)
97 // if (C in unescapedSet)
98 if (C
< uri_flags
.length
&& uri_flags
[C
] & unescapedSet
)
105 R2
= new char[Rsize
];
106 R2
[0 .. Rlen
] = R
[0 .. Rlen
];
109 R
[Rlen
] = cast(char) C
;
119 // Transform V into octets
122 Octet
[0] = cast(char) V
;
127 Octet
[0] = cast(char)(0xC0 |
(V
>> 6));
128 Octet
[1] = cast(char)(0x80 |
(V
& 0x3F));
131 else if (V
<= 0xFFFF)
133 Octet
[0] = cast(char)(0xE0 |
(V
>> 12));
134 Octet
[1] = cast(char)(0x80 |
((V
>> 6) & 0x3F));
135 Octet
[2] = cast(char)(0x80 |
(V
& 0x3F));
138 else if (V
<= 0x1FFFFF)
140 Octet
[0] = cast(char)(0xF0 |
(V
>> 18));
141 Octet
[1] = cast(char)(0x80 |
((V
>> 12) & 0x3F));
142 Octet
[2] = cast(char)(0x80 |
((V
>> 6) & 0x3F));
143 Octet
[3] = cast(char)(0x80 |
(V
& 0x3F));
148 throw new URIException("Undefined UTF-32 code point");
151 if (Rlen
+ L
* 3 > Rsize
)
155 Rsize
= 2 * (Rlen
+ L
* 3);
156 R2
= new char[Rsize
];
157 R2
[0 .. Rlen
] = R
[0 .. Rlen
];
161 for (j
= 0; j
< L
; j
++)
164 R
[Rlen
+ 1] = hex2ascii
[Octet
[j
] >> 4];
165 R
[Rlen
+ 2] = hex2ascii
[Octet
[j
] & 15];
172 return R
[0 .. Rlen
].idup
;
177 import std
.exception
: assertThrown
;
179 assert(URI_Encode("", 0) == "");
180 assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");
182 a
~= cast(dchar) 0xFFFFFFFF;
183 assertThrown(URI_Encode(a
, 0));
184 assert(URI_Encode("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0).length
== 3 * 60);
187 private uint ascii2hex(dchar c
) @nogc @safe pure nothrow
189 return (c
<= '9') ? c
- '0' :
190 (c
<= 'F') ? c
- 'A' + 10 :
194 private dstring
URI_Decode(Char
)(scope const(Char
)[] uri
, uint reservedSet
)
197 import std
.ascii
: isHexDigit
;
205 immutable len
= uri
.length
;
209 dchar[] R
= new dchar[Rsize
];
212 for (k
= 0; k
!= len
; k
++)
226 throw new URIException("Unexpected end of URI");
227 if (!isHexDigit(s
[k
+ 1]) ||
!isHexDigit(s
[k
+ 2]))
228 throw new URIException("Expected two hexadecimal digits after '%'");
229 B
= cast(char)((ascii2hex(s
[k
+ 1]) << 4) + ascii2hex(s
[k
+ 2]));
242 throw new URIException("UTF-32 code point size too large");
243 if (((B
<< n
) & 0x80) == 0)
246 throw new URIException("UTF-32 code point size too small");
251 // Pick off (7 - n) significant bits of B from first byte of octet
252 V
= B
& ((1 << (7 - n
)) - 1); // (!!!)
254 if (k
+ (3 * (n
- 1)) >= len
)
255 throw new URIException("UTF-32 unaligned String");
256 for (j
= 1; j
!= n
; j
++)
260 throw new URIException("Expected: '%'");
261 if (!isHexDigit(s
[k
+ 1]) ||
!isHexDigit(s
[k
+ 2]))
262 throw new URIException("Expected two hexadecimal digits after '%'");
263 B
= cast(char)((ascii2hex(s
[k
+ 1]) << 4) + ascii2hex(s
[k
+ 2]));
264 if ((B
& 0xC0) != 0x80)
265 throw new URIException("Incorrect UTF-32 multi-byte sequence");
267 V
= (V
<< 6) |
(B
& 0x3F);
270 throw new URIException("Unknown UTF-32 code point");
273 if (C
< uri_flags
.length
&& uri_flags
[C
] & reservedSet
)
275 // R ~= s[start .. k + 1];
276 immutable width
= (k
+ 1) - start
;
277 for (int ii
= 0; ii
< width
; ii
++)
278 R
[Rlen
+ ii
] = s
[start
+ ii
];
287 assert(Rlen
<= Rsize
); // enforce our preallocation size guarantee
289 // Return an immutable copy of the decoded characters R[0 .. Rlen]
290 return R
[0 .. Rlen
].idup
;
295 import std
.exception
: assertThrown
;
297 assert(URI_Decode("", 0) == "");
298 assertThrown
!URIException(URI_Decode("%", 0));
299 assertThrown
!URIException(URI_Decode("%xx", 0));
300 assertThrown
!URIException(URI_Decode("%FF", 0));
301 assertThrown
!URIException(URI_Decode("%C0", 0));
302 assertThrown
!URIException(URI_Decode("%C0000000", 0));
303 assertThrown
!URIException(URI_Decode("%C0%xx0000", 0));
304 assertThrown
!URIException(URI_Decode("%C0%C00000", 0));
305 assertThrown
!URIException(URI_Decode("%F7%BF%BF%BF", 0));
306 assert(URI_Decode("%23", URI_Hash
) == "%23");
309 /*************************************
310 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
311 * Escape sequences that resolve to reserved URI characters are not replaced.
312 * Escape sequences that resolve to the '#' character are not replaced.
314 string
decode(Char
)(scope const(Char
)[] encodedURI
)
317 import std
.algorithm
.iteration
: each
;
318 import std
.utf
: encode
;
319 auto s
= URI_Decode(encodedURI
, URI_Reserved | URI_Hash
);
321 s
.each
!(c
=> encode(r
, c
));
328 assert("foo%20bar".decode
== "foo bar");
329 assert("%3C%3E.@.%E2%84%A2".decode
== "<>.@.™");
330 assert("foo&/".decode
== "foo&/");
331 assert("!@#$&*(".decode
== "!@#$&*(");
334 /*******************************
335 * Decodes the URI component string encodedURIComponent into a UTF-8 string and
336 * returns it. All escape sequences are decoded.
338 string
decodeComponent(Char
)(scope const(Char
)[] encodedURIComponent
)
341 import std
.algorithm
.iteration
: each
;
342 import std
.utf
: encode
;
343 auto s
= URI_Decode(encodedURIComponent
, 0);
345 s
.each
!(c
=> encode(r
, c
));
352 assert("foo%2F%26".decodeComponent
== "foo/&");
353 assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent
== "dläng röcks");
354 assert("!%40%23%24%25%5E%26*(".decodeComponent
== "!@#$%^&*(");
357 /*****************************
358 * Encodes the string uri (UTF-8, UTF-16 or UTF-32) into a URI and returns that URI. Any character
359 * not a valid URI character is escaped. The '#' character is not escaped.
361 string
encode(Char
)(scope const(Char
)[] uri
)
364 import std
.utf
: toUTF32
;
365 auto s
= toUTF32(uri
);
366 return URI_Encode(s
, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark
);
372 assert("foo bar".encode
== "foo%20bar");
373 assert("<>.@.™".encode
== "%3C%3E.@.%E2%84%A2");
374 assert("foo/#?a=1&b=2".encode
== "foo/#?a=1&b=2");
375 assert("dlang+rocks!".encode
== "dlang+rocks!");
376 assert("!@#$%^&*(".encode
== "!@#$%25%5E&*(");
379 /********************************
380 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
381 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
383 string
encodeComponent(Char
)(scope const(Char
)[] uriComponent
)
386 import std
.utf
: toUTF32
;
387 auto s
= toUTF32(uriComponent
);
388 return URI_Encode(s
, URI_Alpha | URI_Digit | URI_Mark
);
394 assert("!@#$%^&*(".encodeComponent
== "!%40%23%24%25%5E%26*(");
395 assert("<>.@.™".encodeComponent
== "%3C%3E.%40.%E2%84%A2");
396 assert("foo/&".encodeComponent
== "foo%2F%26");
397 assert("dläng röcks".encodeComponent
== "dl%C3%A4ng%20r%C3%B6cks");
398 assert("dlang+rocks!".encodeComponent
== "dlang%2Brocks!");
401 /* Encode associative array using www-form-urlencoding
404 * values = an associative array containing the values to be encoded.
407 * A string encoded using www-form-urlencoding.
409 package string
urlEncode(scope string
[string
] values
) @safe pure
411 if (values
.length
== 0)
414 import std
.array
: Appender
;
415 import std
.format
.write
: formattedWrite
;
418 enc
.reserve(values
.length
* 128);
421 foreach (k
, v
; values
)
425 formattedWrite(enc
, "%s=%s", encodeComponent(k
), encodeComponent(v
));
433 // Formerly @system because urlEncode -> encodeComponent -> URI_Encode and
434 // URI_Encode used alloca and pointer slicing; URI_Encode is now declared @safe pure.
436 assert(urlEncode(a
) == "");
437 assert(urlEncode(["name1" : "value1"]) == "name1=value1");
438 auto enc
= urlEncode(["name1" : "value1", "name2" : "value2"]);
439 assert(enc
== "name1=value1&name2=value2" || enc
== "name2=value2&name1=value1");
442 /***************************
443 * Does string s[] start with a URL?
446 * len it does, and s[0 .. len] is the slice of s[] that is that URL
449 ptrdiff_t
uriLength(Char
)(scope const(Char
)[] s
)
452 /* Must start with one of:
457 import std
.ascii
: isAlphaNum
;
458 import std
.uni
: icmp
;
465 if (s
.length
> 7 && icmp(s
[0 .. 7], "http://") == 0)
471 if (s
.length
> 8 && icmp(s
[0 .. 8], "https://") == 0)
478 for (; i
< s
.length
; i
++)
483 if (c
== '-' || c
== '_' || c
== '?' ||
484 c
== '=' || c
== '%' || c
== '&' ||
485 c
== '/' || c
== '+' || c
== '#' ||
486 c
== '~' || c
== '$')
504 string s1
= "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
505 assert(uriLength(s1
) == 49);
506 string s2
= "no uri here";
507 assert(uriLength(s2
) == -1);
508 assert(uriLength("issue 14924") < 0);
511 @safe pure nothrow @nogc unittest
513 assert(uriLength("") == -1);
514 assert(uriLength("https://www") == -1);
517 /***************************
518 * Does string s[] start with an email address?
521 * len it does, and s[0 .. len] is the slice of s[] that is that email address
525 ptrdiff_t
emailLength(Char
)(scope const(Char
)[] s
)
528 import std
.ascii
: isAlpha
, isAlphaNum
;
545 if (c
== '-' || c
== '_' || c
== '.')
553 /* Now do the part past the '@'
556 for (; i
< s
.length
; i
++)
561 if (c
== '-' || c
== '_')
570 if (!lastdot ||
(i
- lastdot
!= 3 && i
- lastdot
!= 4))
579 string s1
= "my.e-mail@www.example-domain.com with garbage added";
580 assert(emailLength(s1
) == 32);
581 string s2
= "no email address here";
582 assert(emailLength(s2
) == -1);
583 assert(emailLength("issue 14924") < 0);
588 // Formerly @system because of encode -> URI_Encode; URI_Encode is now declared @safe pure.
589 debug(uri
) writeln("uri.encodeURI.unittest");
591 string source
= "http://www.digitalmars.com/~fred/fred's RX.html#foo";
592 string target
= "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
594 auto result
= encode(source
);
595 debug(uri
) writefln("result = '%s'", result
);
596 assert(result
== target
);
597 result
= decode(target
);
598 debug(uri
) writefln("result = '%s'", result
);
599 assert(result
== source
);
601 result
= encode(decode("%E3%81%82%E3%81%82"));
602 assert(result
== "%E3%81%82%E3%81%82");
604 result
= encodeComponent("c++");
605 assert(result
== "c%2B%2B");
607 auto str = new char[10_000_000];
609 result
= encodeComponent(str);
610 foreach (char c
; result
)
613 result
= decode("%41%42%43");
614 debug(uri
) writeln(result
);
616 import std
.meta
: AliasSeq
;
617 static foreach (StringType
; AliasSeq
!(char[], wchar[], dchar[], string
, wstring
, dstring
))
619 import std
.conv
: to
;
620 StringType decoded1
= source
.to
!StringType
;
621 string encoded1
= encode(decoded1
);
622 assert(decoded1
== source
.to
!StringType
); // check that `decoded1` wasn't changed
623 assert(encoded1
== target
);
624 assert(decoded1
== decode(encoded1
).to
!StringType
);
626 StringType encoded2
= target
.to
!StringType
;
627 string decoded2
= decode(encoded2
);
628 assert(encoded2
== target
.to
!StringType
); // check that `encoded2` wasn't changed
629 assert(decoded2
== source
);
630 assert(encoded2
== encode(decoded2
).to
!StringType
);
634 @safe pure nothrow @nogc unittest
636 assert(emailLength("") == -1);
637 assert(emailLength("@") == -1);
638 assert(emailLength("abcd") == -1);
639 assert(emailLength("blah@blub") == -1);
640 assert(emailLength("blah@blub.") == -1);
641 assert(emailLength("blah@blub.domain") == -1);