1 /* URL parser and translator; implementation of RFC 2396. */
15 #include <sys/types.h>
17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
20 #ifdef HAVE_SYS_SOCKET_H
21 #include <sys/socket.h>
23 #ifdef HAVE_NETINET_IN_H
24 #include <netinet/in.h>
26 #ifdef HAVE_ARPA_INET_H
27 #include <arpa/inet.h>
32 #include "main/object.h"
33 #include "protocol/protocol.h"
34 #include "protocol/uri.h"
35 #include "util/conv.h"
36 #include "util/error.h"
37 #include "util/file.h"
38 #include "util/hash.h"
39 #include "util/memory.h"
40 #include "util/string.h"
44 end_of_dir(unsigned char c
)
46 /* This used to check for c == ';' as well. But section 3.3
47 * of RFC 2396 explicitly says that parameters in a path
48 * segment "are not significant to the parsing of relative
50 return c
== POST_CHAR
|| c
== '#' || c
== '?';
54 is_uri_dir_sep(const struct uri
*uri
, unsigned char pos
)
56 return (uri
->protocol
== PROTOCOL_FILE
? dir_sep(pos
) : pos
== '/');
61 is_in_domain(unsigned char *domain
, unsigned char *server
, int server_len
)
63 int domain_len
= strlen(domain
);
66 if (domain_len
> server_len
)
69 if (domain_len
== server_len
)
70 return !strncasecmp(domain
, server
, server_len
);
72 len
= server_len
- domain_len
;
73 if (server
[len
- 1] != '.')
76 return !strncasecmp(domain
, server
+ len
, domain_len
);
80 is_ip_address(const unsigned char *address
, int addresslen
)
82 /* The @address has well defined limits so it would be a shame to
84 unsigned char buffer
[IP_ADDRESS_BUFFER_SIZE
];
86 if (addresslen
>= sizeof(buffer
))
89 safe_strncpy(buffer
, address
, addresslen
+ 1);
94 struct sockaddr_in6 addr6
;
96 if (inet_pton(AF_INET6
, buffer
, &addr6
.sin6_addr
) > 0)
99 #endif /* CONFIG_IPV6 */
101 struct in_addr addr4
;
103 if (inet_pton(AF_INET
, buffer
, &addr4
) > 0)
109 /* FIXME: Is this ever the case? */
111 #endif /* HAVE_INET_PTON */
116 end_with_known_tld(const unsigned char *s
, int slen
)
119 static const unsigned char *const tld
[] =
120 { "com", "edu", "net",
122 "int", "biz", "arpa",
125 "name", "pro", NULL
};
127 if (!slen
) return -1;
128 if (slen
< 0) slen
= strlen(s
);
130 for (i
= 0; tld
[i
]; i
++) {
131 int tldlen
= strlen(tld
[i
]);
132 int pos
= slen
- tldlen
;
134 if (pos
>= 0 && !strncasecmp(&s
[pos
], tld
[i
], tldlen
))
141 /* XXX: this function writes to @name. */
143 check_whether_file_exists(unsigned char *name
)
145 /* Check POST_CHAR etc ... */
146 static const unsigned char chars
[] = POST_CHAR_S
"#?";
148 int namelen
= strlen(name
);
150 if (file_exists(name
))
153 for (i
= 0; i
< sizeof(chars
) - 1; i
++) {
154 unsigned char *pos
= memchr(name
, chars
[i
], namelen
);
160 exists
= file_exists(name
);
171 /* Encodes URIs without encoding stuff like fragments and query separators. */
173 encode_file_uri_string(struct string
*string
, unsigned char *uristring
)
175 int filenamelen
= check_whether_file_exists(uristring
);
177 encode_uri_string(string
, uristring
, filenamelen
, 0);
182 get_protocol_length(const unsigned char *url
)
184 unsigned char *end
= (unsigned char *) url
;
186 /* Seek the end of the protocol name if any. */
188 * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
189 * (but per its recommendations we accept "upalpha" too) */
190 while (isalnum(*end
) || *end
== '+' || *end
== '-' || *end
== '.')
193 /* Now we make something to support our "IP version in protocol scheme
194 * name" hack and silently chop off the last digit if it's there. The
195 * IETF's not gonna notice I hope or it'd be going after us hard. */
196 if (end
!= url
&& isdigit(end
[-1]))
199 /* Also return 0 if there's no protocol name (@end == @url). */
200 return (*end
== ':' || isdigit(*end
)) ? end
- url
: 0;
204 parse_uri(struct uri
*uri
, unsigned char *uristring
)
206 unsigned char *prefix_end
, *host_end
;
208 unsigned char *lbracket
, *rbracket
;
211 assertm(uristring
!= NULL
, "No uri to parse.");
212 memset(uri
, 0, sizeof(*uri
));
214 /* Nothing to do for an empty url. */
215 if_assert_failed
return 0;
216 if (!*uristring
) return URI_ERRNO_EMPTY
;
218 uri
->string
= uristring
;
219 uri
->protocollen
= get_protocol_length(uristring
);
222 if (!uri
->protocollen
) return URI_ERRNO_INVALID_PROTOCOL
;
224 /* Figure out whether the protocol is known */
225 uri
->protocol
= get_protocol(struri(uri
), uri
->protocollen
);
227 prefix_end
= uristring
+ uri
->protocollen
; /* ':' */
229 /* Check if there's a digit after the protocol name. */
230 if (isdigit(*prefix_end
)) {
231 uri
->ip_family
= uristring
[uri
->protocollen
] - '0';
234 if (*prefix_end
!= ':')
235 return URI_ERRNO_INVALID_PROTOCOL
;
240 if (prefix_end
[0] == '/' && prefix_end
[1] == '/') {
241 if (prefix_end
[2] == '/'
242 && get_protocol_need_slash_after_host(uri
->protocol
))
243 return URI_ERRNO_TOO_MANY_SLASHES
;
247 } else if (get_protocol_need_slashes(uri
->protocol
)) {
248 return URI_ERRNO_NO_SLASHES
;
251 if (get_protocol_free_syntax(uri
->protocol
)) {
252 uri
->data
= prefix_end
;
253 uri
->datalen
= strlen(prefix_end
);
256 } else if (uri
->protocol
== PROTOCOL_FILE
) {
257 int datalen
= strcspn(prefix_end
, "#" POST_CHAR_S
);
258 unsigned char *frag_or_post
= prefix_end
+ datalen
;
260 /* Extract the fragment part. */
262 if (*frag_or_post
== '#') {
263 uri
->fragment
= frag_or_post
+ 1;
264 uri
->fragmentlen
= strcspn(uri
->fragment
, POST_CHAR_S
);
265 frag_or_post
= uri
->fragment
+ uri
->fragmentlen
;
267 if (*frag_or_post
== POST_CHAR
) {
268 uri
->post
= frag_or_post
+ 1;
271 datalen
= strlen(prefix_end
);
274 /* A bit of a special case, but using the "normal" host
275 * parsing seems a bit scary at this point. (see bug 107). */
276 if (datalen
> 9 && !strncasecmp(prefix_end
, "localhost/", 10)) {
281 uri
->data
= prefix_end
;
282 uri
->datalen
= datalen
;
290 /* Get brackets enclosing IPv6 address */
291 lbracket
= strchr(prefix_end
, '[');
293 rbracket
= strchr(lbracket
, ']');
294 /* [address] is handled only inside of hostname part (surprisingly). */
295 if (rbracket
&& rbracket
< prefix_end
+ strcspn(prefix_end
, "/"))
298 lbracket
= rbracket
= NULL
;
304 /* Possibly skip auth part */
305 host_end
= prefix_end
+ strcspn(prefix_end
, "@");
307 if (prefix_end
+ strcspn(prefix_end
, "/") > host_end
308 && *host_end
) { /* we have auth info here */
309 unsigned char *user_end
;
311 /* Allow '@' in the password component */
312 while (strcspn(host_end
+ 1, "@") < strcspn(host_end
+ 1, "/?"))
313 host_end
= host_end
+ 1 + strcspn(host_end
+ 1, "@");
315 user_end
= strchr(prefix_end
, ':');
317 if (!user_end
|| user_end
> host_end
) {
318 uri
->user
= prefix_end
;
319 uri
->userlen
= host_end
- prefix_end
;
321 uri
->user
= prefix_end
;
322 uri
->userlen
= user_end
- prefix_end
;
323 uri
->password
= user_end
+ 1;
324 uri
->passwordlen
= host_end
- user_end
- 1;
326 prefix_end
= host_end
+ 1;
331 host_end
= rbracket
+ strcspn(rbracket
, ":/?");
334 host_end
= prefix_end
+ strcspn(prefix_end
, ":/?");
338 int addrlen
= rbracket
- lbracket
- 1;
340 /* Check for valid length.
341 * addrlen >= sizeof(hostbuf) is theoretically impossible,
342 * but I keep the test just in case. Safer, imho --Zas */
343 assertm(addrlen
>= 0 && addrlen
< NI_MAXHOST
,
344 "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
345 "Problems are likely to be encountered. Please report "
346 "this, it is a security bug!", addrlen
, uristring
);
347 if_assert_failed
return URI_ERRNO_IPV6_SECURITY
;
349 uri
->host
= lbracket
+ 1;
350 uri
->hostlen
= addrlen
;
354 uri
->host
= prefix_end
;
355 uri
->hostlen
= host_end
- prefix_end
;
357 /* Trim trailing '.'s */
358 if (uri
->hostlen
&& uri
->host
[uri
->hostlen
- 1] == '.')
359 return URI_ERRNO_TRAILING_DOTS
;
362 if (*host_end
== ':') { /* we have port here */
363 unsigned char *port_end
= host_end
+ 1 + strcspn(host_end
+ 1, "/");
367 uri
->port
= host_end
;
368 uri
->portlen
= port_end
- host_end
;
370 if (uri
->portlen
== 0)
371 return URI_ERRNO_NO_PORT_COLON
;
373 /* We only use 8 bits for portlen so better check */
374 if (uri
->portlen
!= port_end
- host_end
)
375 return URI_ERRNO_INVALID_PORT
;
377 /* test if port is number */
378 /* TODO: possibly lookup for the service otherwise? --pasky */
379 for (; host_end
< port_end
; host_end
++)
380 if (!isdigit(*host_end
))
381 return URI_ERRNO_INVALID_PORT
;
383 /* Check that the port value is valid, so that an error message
384 * about invalid URL syntax can be shown if it is not. */
385 if (uri
->port
&& uri
->portlen
) {
389 n
= strtol(uri
->port
, NULL
, 10);
390 if (errno
|| !uri_port_is_valid(n
))
391 return URI_ERRNO_INVALID_PORT
;
395 if (*host_end
== '/') {
398 } else if (get_protocol_need_slash_after_host(uri
->protocol
)) {
399 /* The need for slash after the host component depends on the
400 * need for a host component. -- The dangerous mind of Jonah */
402 return URI_ERRNO_NO_HOST
;
404 return URI_ERRNO_NO_HOST_SLASH
;
407 /* Look for #fragment or POST_CHAR */
408 prefix_end
= host_end
+ strcspn(host_end
, "#" POST_CHAR_S
);
409 uri
->data
= host_end
;
410 uri
->datalen
= prefix_end
- host_end
;
412 if (*prefix_end
== '#') {
413 uri
->fragment
= prefix_end
+ 1;
414 uri
->fragmentlen
= strcspn(uri
->fragment
, POST_CHAR_S
);
415 prefix_end
= uri
->fragment
+ uri
->fragmentlen
;
418 if (*prefix_end
== POST_CHAR
) {
419 uri
->post
= prefix_end
+ 1;
426 get_uri_port(const struct uri
*uri
)
428 if (uri
->port
&& uri
->portlen
) {
429 const unsigned char *end
= uri
->port
;
430 int port
= strtol(uri
->port
, (char **) &end
, 10);
432 if (end
!= uri
->port
) {
433 assert(uri_port_is_valid(port
));
438 return get_protocol_port(uri
->protocol
);
441 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
444 compare_component(const unsigned char *a
, int alen
,
445 const unsigned char *b
, int blen
)
447 /* Check that the length and the strings are both set or unset */
448 if (alen
!= blen
|| !!a
!= !!b
) return 0;
450 /* Both are unset so that will make a perfect match */
451 if (!a
|| !alen
) return 1;
453 /* Let the higher forces decide */
454 return !memcmp(a
, b
, blen
);
457 #define wants(x) (components & (x))
460 compare_uri(const struct uri
*a
, const struct uri
*b
,
461 enum uri_component components
)
463 if (a
== b
) return 1;
464 if (!components
) return 0;
466 assertm(can_compare_uri_components(components
),
467 "compare_uri() is a work in progress. Component unsupported");
469 return (!wants(URI_PROTOCOL
) || a
->protocol
== b
->protocol
)
470 && (!wants(URI_IP_FAMILY
) || a
->ip_family
== b
->ip_family
)
472 || compare_component(a
->user
, a
->userlen
, b
->user
, b
->userlen
))
473 && (!wants(URI_PASSWORD
)
474 || compare_component(a
->password
, a
->passwordlen
, b
->password
, b
->passwordlen
))
476 || compare_component(a
->host
, a
->hostlen
, b
->host
, b
->hostlen
))
478 || compare_component(a
->port
, a
->portlen
, b
->port
, b
->portlen
))
480 || compare_component(a
->data
, a
->datalen
, b
->data
, b
->datalen
))
481 && (!wants(URI_FRAGMENT
)
482 || compare_component(a
->fragment
, a
->fragmentlen
, b
->fragment
, b
->fragmentlen
))
484 || compare_component(a
->post
, a
->post
? strlen(a
->post
) : 0, b
->post
, b
->post
? strlen(b
->post
) : 0));
488 /* We might need something more intelligent than this Swiss army knife. */
490 add_uri_to_string(struct string
*string
, const struct uri
*uri
,
491 enum uri_component components
)
493 /* Custom or unknown keep the URI untouched. */
494 if (uri
->protocol
== PROTOCOL_UNKNOWN
)
495 return add_to_string(string
, struri(uri
));
497 if (wants(URI_PROTOCOL
)) {
498 add_bytes_to_string(string
, uri
->string
, uri
->protocollen
);
499 if (wants(URI_IP_FAMILY
) && uri
->ip_family
)
500 add_long_to_string(string
, uri
->ip_family
);
501 add_char_to_string(string
, ':');
502 if (get_protocol_need_slashes(uri
->protocol
))
503 add_to_string(string
, "//");
506 if (wants(URI_USER
) && uri
->userlen
) {
507 add_bytes_to_string(string
, uri
->user
, uri
->userlen
);
509 if (wants(URI_PASSWORD
) && uri
->passwordlen
) {
510 add_char_to_string(string
, ':');
511 add_bytes_to_string(string
, uri
->password
,
515 add_char_to_string(string
, '@');
517 } else if (wants(URI_PASSWORD
) && uri
->passwordlen
) {
518 add_bytes_to_string(string
, uri
->password
, uri
->passwordlen
);
521 if (wants(URI_HOST
) && uri
->hostlen
) {
525 /* Rationale for wants(URI_PORT): The [notation] was invented
526 * so that you can have an IPv6 address and a port together. So
527 * we want to use it when that happens, otherwise we need not
528 * bother (that happens only when we want it for DNS anyway).
529 * I insist on the implied elegance of this way, but YMMV. ;-)
531 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, '[');
534 /* Support for the GNU International Domain Name library.
536 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
538 * Now it is probably not perfect because idna_to_ascii_lz()
539 * will be using a ``zero terminated input string encoded in
540 * the current locale's character set''. Anyway I don't know
541 * how to convert anything to UTF-8 or Unicode. --jonas */
542 if (wants(URI_IDN
)) {
543 unsigned char *host
= memacpy(uri
->host
, uri
->hostlen
);
547 int code
= idna_to_ascii_lz(host
, &idname
, 0);
549 /* FIXME: Return NULL if it coughed? --jonas */
550 if (code
== IDNA_SUCCESS
) {
551 add_to_string(string
, idname
);
562 add_bytes_to_string(string
, uri
->host
, uri
->hostlen
);
565 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, ']');
569 if (wants(URI_PORT
) || wants(URI_DEFAULT_PORT
)) {
571 add_char_to_string(string
, ':');
572 add_bytes_to_string(string
, uri
->port
, uri
->portlen
);
574 } else if (wants(URI_DEFAULT_PORT
)
575 && uri
->protocol
!= PROTOCOL_USER
) {
576 /* For user protocols we don't know a default port.
577 * Should user protocols ports be configurable? */
578 int port
= get_protocol_port(uri
->protocol
);
580 add_char_to_string(string
, ':');
581 add_long_to_string(string
, port
);
585 /* Only add slash if we need to separate */
586 if ((wants(URI_DATA
) || wants(URI_POST
) || components
== URI_HTTP_REFERRER_HOST
)
587 && wants(~(URI_DATA
| URI_PORT
))
588 && get_protocol_need_slash_after_host(uri
->protocol
))
589 add_char_to_string(string
, '/');
591 if (wants(URI_DATA
) && uri
->datalen
)
592 add_bytes_to_string(string
, uri
->data
, uri
->datalen
);
594 /* We can not test uri->datalen here since we need to always
596 if (wants(URI_PATH
) || wants(URI_FILENAME
)) {
597 const unsigned char *filename
= uri
->data
;
598 const unsigned char *pos
;
600 assertm(!wants(URI_FILENAME
) || components
== URI_FILENAME
,
601 "URI_FILENAME should be used alone %d", components
);
603 if (wants(URI_PATH
) && !is_uri_dir_sep(uri
, *filename
)) {
604 #ifdef CONFIG_OS_WIN32
605 if (uri
->protocol
!= PROTOCOL_FILE
)
607 /* FIXME: Add correct separator */
608 add_char_to_string(string
, '/');
611 if (!uri
->datalen
) return string
;
613 for (pos
= filename
; *pos
&& !end_of_dir(*pos
); pos
++)
614 if (wants(URI_FILENAME
) && is_uri_dir_sep(uri
, *pos
))
617 return add_bytes_to_string(string
, filename
, pos
- filename
);
620 if (wants(URI_QUERY
) && uri
->datalen
) {
621 const unsigned char *query
= memchr(uri
->data
, '?', uri
->datalen
);
623 assertm(URI_QUERY
== components
,
624 "URI_QUERY should be used alone %d", components
);
626 if (!query
) return string
;
629 /* Check fragment and POST_CHAR */
630 return add_bytes_to_string(string
, query
, strcspn(query
, "#" POST_CHAR_S
));
633 if (wants(URI_FRAGMENT
) && uri
->fragmentlen
) {
634 add_char_to_string(string
, '#');
635 add_bytes_to_string(string
, uri
->fragment
, uri
->fragmentlen
);
638 if (wants(URI_POST
) && uri
->post
) {
639 add_char_to_string(string
, POST_CHAR
);
640 add_to_string(string
, uri
->post
);
642 } else if (wants(URI_POST_INFO
) && uri
->post
) {
643 if (!strncmp(uri
->post
, "text/plain", 10)) {
644 add_to_string(string
, " (PLAIN TEXT DATA)");
646 } else if (!strncmp(uri
->post
, "multipart/form-data;", 20)) {
647 add_to_string(string
, " (MULTIPART FORM DATA)");
650 add_to_string(string
, " (POST DATA)");
661 get_uri_string(const struct uri
*uri
, enum uri_component components
)
663 struct string string
;
665 if (init_string(&string
)
666 && add_uri_to_string(&string
, uri
, components
))
667 return string
.source
;
669 done_string(&string
);
675 add_string_uri_to_string(struct string
*string
, unsigned char *uristring
,
676 enum uri_component components
)
680 if (parse_uri(&uri
, uristring
) != URI_ERRNO_OK
)
683 return add_uri_to_string(string
, &uri
, components
);
687 #define normalize_uri_reparse(str) normalize_uri(NULL, str)
688 #define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri))
691 normalize_uri(struct uri
*uri
, unsigned char *uristring
)
693 unsigned char *parse_string
= uristring
;
694 unsigned char *src
, *dest
, *path
;
695 int need_slash
= 0, keep_dslash
= 1;
696 int parse
= (uri
== NULL
);
697 struct uri uri_struct
;
699 if (!uri
) uri
= &uri_struct
;
701 /* We need to get the real (proxied) URI but lowercase relevant URI
702 * parts along the way. */
704 if (parse
&& parse_uri(uri
, parse_string
) != URI_ERRNO_OK
)
709 /* This is maybe not the right place, but both join_urls() and
710 * get_translated_uri() (through translate_url()) call this
711 * function and then it already works on and modifies an
713 convert_to_lowercase(uri
->string
, uri
->protocollen
);
714 if (uri
->hostlen
) convert_to_lowercase(uri
->host
, uri
->hostlen
);
717 parse_string
= uri
->data
;
718 } while (uri
->protocol
== PROTOCOL_PROXY
);
720 if (get_protocol_free_syntax(uri
->protocol
))
723 if (uri
->protocol
!= PROTOCOL_UNKNOWN
) {
724 need_slash
= get_protocol_need_slash_after_host(uri
->protocol
);
725 keep_dslash
= get_protocol_keep_double_slashes(uri
->protocol
);
728 path
= uri
->data
- need_slash
;
731 /* This loop mangles the URI string by removing ".." and "." segments.
732 * However it must not alter "//" without reason; see bug 744. */
734 /* If the following pieces are the LAST parts of URL, we remove
735 * them as well. See RFC 2396 section 5.2 for details. */
737 if (end_of_dir(src
[0])) {
738 /* URL data contains no more path. */
739 memmove(dest
, src
, strlen(src
) + 1);
743 if (!is_uri_dir_sep(uri
, src
[0])) {
744 /* This is to reduce indentation */
746 } else if (src
[1] == '.') {
748 /* /. - skip the dot */
753 } else if (is_uri_dir_sep(uri
, src
[2])) {
754 /* /./ - strip that.. */
758 } else if (src
[2] == '.'
759 && (is_uri_dir_sep(uri
, src
[3]) || !src
[3])) {
760 /* /../ or /.. - skip it and preceding element.
762 * <path> "/foo/bar" <dest> ...
763 * <src> ("/../" or "/..\0") ...
765 * Remove "bar" and the directory
766 * separator that precedes it. The
767 * separator will be added back in the
768 * next iteration unless another ".."
769 * follows, in which case it will be
770 * added later. "bar" may be empty. */
772 while (dest
> path
) {
774 if (is_uri_dir_sep(uri
, *dest
)) break;
777 /* <path> "/foo" <dest> "/bar" ...
778 * <src> ("/../" or "/..\0") ... */
780 /* /.. - add ending slash and stop */
790 } else if (is_uri_dir_sep(uri
, src
[1]) && !keep_dslash
) {
791 /* // - ignore first '/'. */
796 /* We don't want to access memory past the NUL char. */
804 /* A 'file' scheme URI comes in and a bastardized URI comes out, which consists
805 * of just the complete path to the file/directory, which the dumb 'file' protocol
806 * backend can understand. No host parts etc.; that is what this function is
807 * supposed to chew. */
809 transform_file_url(struct uri
*uri
, const unsigned char *cwd
)
811 unsigned char *path
= uri
->data
;
813 assert(uri
->protocol
== PROTOCOL_FILE
&& uri
->data
);
815 /* Sort out the host part. We currently support only host "localhost"
816 * (plus empty host part will be assumed to be "localhost" as well).
817 * As our extensions, '.' will reference to the cwd on localhost
818 * (originally, when the first thing after file:// wasn't "localhost/",
819 * we assumed the cwd as well, and pretended that there's no host part
820 * at all) and '..' to the directory parent to cwd. Another extension
821 * is that if this is a DOS-like system, the first char in two-char
822 * host part is uppercase letter and the second char is a colon, it is
823 * assumed to be a local disk specification. */
824 /* TODO: Use FTP for non-localhost hosts. --pasky */
826 /* For URL "file://", we open the current directory. Some other
827 * browsers instead open root directory, but AFAIK the standard does
828 * not specify that and this was the original behaviour and it is more
829 * consistent with our file://./ notation. */
831 /* Who would name their file/dir '...' ? */
832 if (*path
== '.' || !*path
) {
835 if (!init_string(&dir
))
838 encode_uri_string(&dir
, cwd
, -1, 0);
840 /* Either we will end up with '//' and translate_directories()
841 * will shorten it or the '/' will mark the inserted cwd as a
843 if (*path
== '.') *path
= '/';
845 /* Insert the current working directory. */
846 /* The offset is 7 == sizeof("file://") - 1. */
847 insert_in_string(&struri(uri
), 7, dir
.source
, dir
.length
);
854 if (isasciialpha(path
[0]) && path
[1] == ':' && dir_sep(path
[2]))
858 for (; *path
&& !dir_sep(*path
); path
++);
860 /* FIXME: We will in fact assume localhost even for non-local hosts,
861 * until we will support the FTP transformation. --pasky */
863 memmove(uri
->data
, path
, strlen(path
) + 1);
867 static unsigned char *translate_url(unsigned char *url
, unsigned char *cwd
);
870 join_urls(struct uri
*base
, unsigned char *rel
)
872 unsigned char *uristring
, *path
;
878 /* TODO: Support for ';' ? (see the RFC) --pasky */
880 /* For '#', '?' and '//' we could use get_uri_string() but it might be
881 * too expensive since it uses granular allocation scheme. I wouldn't
882 * personally mind tho' because it would be cleaner. --jonas */
884 /* Strip fragment and post part from the base URI and append
885 * the fragment string in @rel. */
886 length
= base
->fragment
887 ? base
->fragment
- struri(base
) - 1
888 : get_real_uri_length(base
);
890 } else if (rel
[0] == '?') {
891 /* Strip query, fragment and post part from the base URI and
892 * append the query string in @rel. */
893 length
= base
->fragment
? base
->fragment
- struri(base
) - 1
894 : get_real_uri_length(base
);
896 uristring
= memchr(base
->data
, '?', base
->datalen
);
897 if (uristring
) length
= uristring
- struri(base
);
899 } else if (rel
[0] == '/' && rel
[1] == '/') {
900 if (!get_protocol_need_slashes(base
->protocol
))
903 /* Get `<protocol>:' from the base URI and append the `//' part
905 length
= base
->protocollen
+ 1;
907 /* We need to sanitize the relative part and add stuff like
912 /* If one of the tests above set @length to something useful */
914 uristring
= memacpy(struri(base
), length
);
915 if (!uristring
) return NULL
;
917 add_to_strn(&uristring
, rel
);
920 unsigned char *translated
;
922 translated
= translate_url(uristring
, NULL
);
926 return normalize_uri_reparse(uristring
);
929 /* Check if there is some protocol name to go for */
930 length
= get_protocol_length(rel
);
932 switch (get_protocol(rel
, length
)) {
933 case PROTOCOL_UNKNOWN
:
935 /* Mysteriously proxy URIs are breaking here ... */
939 /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
940 * to translate_url(). */
942 uristring
= translate_url(rel
, NULL
);
943 if (uristring
) return uristring
;
947 assertm(base
->data
!= NULL
, "bad base url");
948 if_assert_failed
return NULL
;
952 /* Either the path is blank but there is a slash char before it, or
953 * the path is not blank but doesn't start with a slash (if we just
954 * stuck with is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding
955 * crap, it should be enough, but I'm not sure and I don't want to
956 * break anything --pasky). */
957 /* We skip the first char of the URL ('/') in parse_url() (ARGH). This
958 * is the reason for all this bug-bearing magic. */
960 if (!is_uri_dir_sep(base
, *path
)) path
--;
962 if (is_uri_dir_sep(base
, path
[-1])) path
--;
965 if (!is_uri_dir_sep(base
, rel
[0])) {
966 unsigned char *path_end
;
968 /* The URL is relative. */
971 /* There's no path in the URL, but we're going to add
972 * something there, and that something doesn't start with
973 * a slash. So we need to insert a slash after the base
974 * URL. Clever, eh? ;) */
978 for (path_end
= path
; *path_end
; path_end
++) {
979 if (end_of_dir(*path_end
)) break;
980 /* Modify the path pointer, so that it'll always point
981 * above the last '/' in the URL; later, we'll copy the
982 * URL only _TO_ this point, and anything after last
983 * slash will be substituted by 'rel'. */
984 if (is_uri_dir_sep(base
, *path_end
))
989 length
= path
- struri(base
);
990 uristring
= mem_alloc(length
+ strlen(rel
) + add_slash
+ 1);
991 if (!uristring
) return NULL
;
993 memcpy(uristring
, struri(base
), length
);
994 if (add_slash
) uristring
[length
] = '/';
995 strcpy(uristring
+ length
+ add_slash
, rel
);
997 return normalize_uri_reparse(uristring
);
1001 /* Tries to figure out what protocol @newurl might be specifying by checking if
1002 * it exists as a file locally or by checking parts of the host name. */
1003 static enum protocol
1004 find_uri_protocol(unsigned char *newurl
)
1008 /* First see if it is a file so filenames that look like hostnames
1009 * won't confuse us below. */
1010 if (check_whether_file_exists(newurl
) >= 0) return PROTOCOL_FILE
;
1012 /* Yes, it would be simpler to test for an IPv6 address first,
1013 * but it would result in a confusing mix of ifdefs ;-). */
1014 /* FIXME: Ideas for improving protocol detection
1016 * - Handle common hostnames. It could be part of the protocol backend
1017 * structure. [ www -> http, irc -> irc, news -> nntp, ... ]
1019 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
1022 ch
= newurl
+ strcspn(newurl
, ".:/@");
1024 || (*ch
== ':' && *newurl
!= '[' && strchr(newurl
, '@'))
1025 || !strncasecmp(newurl
, "ftp.", 4)) {
1026 /* Contains user/password/ftp-hostname */
1027 return PROTOCOL_FTP
;
1030 } else if (*newurl
== '[' && *ch
== ':') {
1031 /* Candidate for IPv6 address */
1032 unsigned char *bracket2
, *colon2
;
1035 bracket2
= strchr(ch
, ']');
1036 colon2
= strchr(ch
, ':');
1037 if (bracket2
&& colon2
&& bracket2
> colon2
)
1038 return PROTOCOL_HTTP
;
1041 } else if (*newurl
!= '.' && *ch
== '.') {
1042 /* Contains domain name? */
1043 unsigned char *host_end
, *domain
;
1044 unsigned char *ipscan
;
1046 /* Process the hostname */
1047 for (domain
= ch
+ 1;
1048 *(host_end
= domain
+ strcspn(domain
, ".:/?")) == '.';
1049 domain
= host_end
+ 1);
1052 for (ipscan
= ch
; isdigit(*ipscan
) || *ipscan
== '.';
1055 if (!*ipscan
|| *ipscan
== ':' || *ipscan
== '/')
1056 return PROTOCOL_HTTP
;
1058 /* Is it a two-letter or known TLD? */
1059 if (host_end
- domain
== 2
1060 || end_with_known_tld(domain
, host_end
- domain
) >= 0)
1061 return PROTOCOL_HTTP
;
1064 return PROTOCOL_UNKNOWN
;
1068 #define MAX_TRANSLATION_ATTEMPTS 32
1070 /* Returns a URI string that can be used internally, adding the protocol prefix,
1071 * missing slashes etc. */
1072 static unsigned char *
1073 translate_url(unsigned char *url
, unsigned char *cwd
)
1075 unsigned char *newurl
;
1077 enum uri_errno uri_errno
, prev_errno
= URI_ERRNO_EMPTY
;
1080 /* Strip starting spaces */
1081 while (*url
== ' ') url
++;
1082 if (!*url
) return NULL
;
1084 newurl
= expand_tilde(url
); /* XXX: Post data copy. */
1085 if (!newurl
) return NULL
;
1088 /* Yay a goto loop. If we get some URI parse error and try to
1089 * fix it we go back to here and try again. */
1090 /* Ordinary parse */
1091 uri_errno
= parse_uri(&uri
, newurl
);
1093 /* Bail out if the same error occurs twice */
1094 if (uri_errno
== prev_errno
|| retries
++ > MAX_TRANSLATION_ATTEMPTS
) {
1095 if (retries
> MAX_TRANSLATION_ATTEMPTS
) {
1096 ERROR("Maximum number of parsing attempts exceeded "
1103 prev_errno
= uri_errno
;
1105 switch (uri_errno
) {
1107 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1108 * interpreted as the protocol name. */
1109 if (uri
.protocol
== PROTOCOL_UNKNOWN
) {
1110 enum protocol protocol
= find_uri_protocol(newurl
);
1112 /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1114 if (protocol
!= PROTOCOL_UNKNOWN
) {
1117 if (!init_string(&str
)) return NULL
;
1121 add_to_string(&str
, "ftp://");
1122 encode_uri_string(&str
, newurl
, -1, 0);
1126 add_to_string(&str
, "http://");
1127 add_to_string(&str
, newurl
);
1130 case PROTOCOL_UNKNOWN
:
1135 add_to_string(&str
, "file://");
1136 if (!dir_sep(*newurl
))
1137 add_to_string(&str
, "./");
1139 add_to_string(&str
, newurl
);
1143 newurl
= str
.source
;
1145 /* Work around the infinite loop prevention */
1146 prev_errno
= URI_ERRNO_EMPTY
;
1151 /* If file:// URI is transformed we need to reparse. */
1152 if (uri
.protocol
== PROTOCOL_FILE
&& cwd
&& *cwd
1153 && transform_file_url(&uri
, cwd
))
1154 return normalize_uri_reparse(struri(&uri
));
1156 /* Translate the proxied URI too if proxy:// */
1157 if (uri
.protocol
== PROTOCOL_PROXY
) {
1158 unsigned char *data
= translate_url(uri
.data
, cwd
);
1159 int pos
= uri
.data
- struri(&uri
);
1162 struri(&uri
)[pos
] = 0;
1163 insert_in_string(&struri(&uri
), pos
, data
, strlen(data
));
1165 return normalize_uri_reparse(struri(&uri
));
1168 return normalize_uri_noparse(&uri
);
1170 case URI_ERRNO_TOO_MANY_SLASHES
:
1172 unsigned char *from
, *to
;
1174 assert(uri
.string
[uri
.protocollen
] == ':'
1175 && uri
.string
[uri
.protocollen
+ 1] == '/'
1176 && uri
.string
[uri
.protocollen
+ 2] == '/');
1178 from
= to
= uri
.string
+ uri
.protocollen
+ 3;
1179 while (*from
== '/') from
++;
1182 memmove(to
, from
, strlen(from
) + 1);
1185 case URI_ERRNO_NO_SLASHES
:
1187 /* Try prefix:some.url -> prefix://some.url.. */
1190 /* Check if only one '/' is needed. */
1191 if (uri
.string
[uri
.protocollen
+ 1] == '/')
1194 insert_in_string(&newurl
, uri
.protocollen
+ 1, "//", slashes
);
1197 case URI_ERRNO_TRAILING_DOTS
:
1199 /* Trim trailing '.'s */
1200 unsigned char *from
= uri
.host
+ uri
.hostlen
;
1201 unsigned char *to
= from
;
1203 assert(uri
.host
< to
&& to
[-1] == '.' && *from
!= '.');
1205 while (uri
.host
< to
&& to
[-1] == '.') to
--;
1208 memmove(to
, from
, strlen(from
) + 1);
1211 case URI_ERRNO_NO_PORT_COLON
:
1212 assert(uri
.portlen
== 0
1213 && uri
.string
< uri
.port
1214 && uri
.port
[-1] == ':');
1216 memmove(uri
.port
- 1, uri
.port
, strlen(uri
.port
) + 1);
1219 case URI_ERRNO_NO_HOST_SLASH
:
1221 int offset
= uri
.port
1222 ? uri
.port
+ uri
.portlen
- struri(&uri
)
1223 : uri
.host
+ uri
.hostlen
- struri(&uri
) + uri
.ipv6
/* ']' */;
1225 assertm(uri
.host
!= NULL
, "uri.host not set after no host slash error");
1226 insert_in_string(&newurl
, offset
, "/", 1);
1229 case URI_ERRNO_INVALID_PROTOCOL
:
1231 /* No protocol name */
1232 enum protocol protocol
= find_uri_protocol(newurl
);
1235 if (!init_string(&str
)) return NULL
;
1239 add_to_string(&str
, "ftp://");
1240 encode_uri_string(&str
, newurl
, -1, 0);
1244 add_to_string(&str
, "http://");
1245 add_to_string(&str
, newurl
);
1248 case PROTOCOL_UNKNOWN
:
1249 /* We default to file:// even though we already
1250 * tested whether the file exists, since it will give
1251 * a "No such file or directory" error, which
1252 * might better hint to the user that there was
1253 * a problem figuring out the URI. */
1256 add_to_string(&str
, "file://");
1257 if (!dir_sep(*newurl
))
1258 add_to_string(&str
, "./");
1260 encode_file_uri_string(&str
, newurl
);
1264 newurl
= str
.source
;
1268 case URI_ERRNO_EMPTY
:
1269 case URI_ERRNO_IPV6_SECURITY
:
1270 case URI_ERRNO_NO_HOST
:
1271 case URI_ERRNO_INVALID_PORT
:
1272 case URI_ERRNO_INVALID_PORT_RANGE
:
1273 /* None of these can be handled properly. */
1283 get_composed_uri(struct uri
*uri
, enum uri_component components
)
1285 unsigned char *string
;
1288 if_assert_failed
return NULL
;
1290 string
= get_uri_string(uri
, components
);
1291 if (!string
) return NULL
;
1293 uri
= get_uri(string
, 0);
1300 get_translated_uri(unsigned char *uristring
, unsigned char *cwd
)
1304 uristring
= translate_url(uristring
, cwd
);
1305 if (!uristring
) return NULL
;
1307 uri
= get_uri(uristring
, 0);
1308 mem_free(uristring
);
1315 get_extension_from_uri(struct uri
*uri
)
1317 unsigned char *extension
= NULL
;
1319 unsigned char *pos
= uri
->data
;
1323 for (; *pos
&& !end_of_dir(*pos
); pos
++) {
1324 if (!afterslash
&& !extension
&& *pos
== '.') {
1326 } else if (is_uri_dir_sep(uri
, *pos
)) {
1334 if (extension
&& extension
< pos
)
1335 return memacpy(extension
, pos
- extension
);
1340 /* URI encoding, escaping unallowed characters. */
1342 safe_char(unsigned char c
)
1344 /* RFC 2396, Page 8, Section 2.3 ;-) */
1345 return isident(c
) || c
== '.' || c
== '!' || c
== '~'
1346 || c
== '*' || c
== '\''|| c
== '(' || c
== ')';
1350 encode_uri_string(struct string
*string
, const unsigned char *name
, int namelen
,
1351 int convert_slashes
)
1354 const unsigned char *end
;
1359 if (namelen
< 0) namelen
= strlen(name
);
1361 for (end
= name
+ namelen
; name
< end
; name
++) {
1363 /* This is probably correct only for query part of URI..? */
1364 if (*name
== ' ') add_char_to_string(data
, len
, '+');
1367 if (safe_char(*name
) || (!convert_slashes
&& *name
== '/')) {
1368 add_char_to_string(string
, *name
);
1371 n
[1] = hx((((int) *name
) & 0xF0) >> 4);
1372 n
[2] = hx(((int) *name
) & 0xF);
1373 add_bytes_to_string(string
, n
, sizeof(n
) - 1);
1379 encode_win32_uri_string(struct string
*string
, unsigned char *name
, int namelen
)
1387 if (namelen
< 0) namelen
= strlen(name
);
1389 for (end
= name
+ namelen
; name
< end
; name
++) {
1390 if (safe_char(*name
) || *name
== ':' || *name
== '\\') {
1391 add_char_to_string(string
, *name
);
1394 n
[1] = hx((((int) *name
) & 0xF0) >> 4);
1395 n
[2] = hx(((int) *name
) & 0xF);
1396 add_bytes_to_string(string
, n
, sizeof(n
) - 1);
1401 /* This function is evil, it modifies its parameter. */
1402 /* XXX: but decoded string is _never_ longer than encoded string so it's an
1403 * efficient way to do that, imho. --Zas */
1405 decode_uri(unsigned char *src
)
1407 unsigned char *dst
= src
;
1414 int x1
= unhx(*src
);
1417 int x2
= unhx(*(src
+ 1));
1420 x1
= (x1
<< 4) + x2
;
1421 if (x1
!= 0) { /* don't allow %00 */
1422 c
= (unsigned char) x1
;
1429 } else if (c
== '+') {
1430 /* As the comment in encode_uri_string suggests, '+'
1431 * should only be decoded in the query part of a URI
1432 * (should that be 'URL'?). I'm not bold enough to
1433 * disable this code, tho. -- Miciah */
1439 } while (c
!= '\0');
1443 decode_uri_string(struct string
*string
)
1445 decode_uri(string
->source
);
1446 string
->length
= strlen(string
->source
);
1450 decode_uri_for_display(unsigned char *src
)
1455 if (!isprint(*src
) || iscntrl(*src
))
1460 decode_uri_string_for_display(struct string
*string
)
1462 decode_uri_for_display(string
->source
);
1463 string
->length
= strlen(string
->source
);
1469 #define URI_LIST_GRANULARITY 0x3
1471 #define realloc_uri_list(list) \
1472 mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1473 URI_LIST_GRANULARITY)
1476 add_to_uri_list(struct uri_list
*list
, struct uri
*uri
)
1478 if (!realloc_uri_list(list
))
1481 list
->uris
[list
->size
++] = get_uri_reference(uri
);
1487 free_uri_list(struct uri_list
*list
)
1492 if (!list
->uris
) return;
1494 foreach_uri (uri
, index
, list
) {
1498 mem_free_set(&list
->uris
, NULL
);
1504 struct uri_cache_entry
{
1506 unsigned char string
[1];
1511 struct object object
;
1514 static struct uri_cache uri_cache
;
1518 check_uri_sanity(struct uri
*uri
)
1522 for (pos
= 0; pos
< uri
->protocollen
; pos
++)
1523 if (isupper(uri
->string
[pos
])) goto error
;
1526 for (pos
= 0; pos
< uri
->hostlen
; pos
++)
1527 if (isupper(uri
->host
[pos
])) goto error
;
1530 INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri
));
1533 #define check_uri_sanity(uri)
1536 static inline struct uri_cache_entry
*
1537 get_uri_cache_entry(unsigned char *string
, int length
)
1539 struct uri_cache_entry
*entry
;
1540 struct hash_item
*item
;
1542 assert(string
&& length
> 0);
1543 if_assert_failed
return NULL
;
1545 item
= get_hash_item(uri_cache
.map
, string
, length
);
1546 if (item
) return item
->value
;
1548 /* Setup a new entry */
1550 entry
= mem_calloc(1, sizeof(*entry
) + length
);
1551 if (!entry
) return NULL
;
1553 object_nolock(&entry
->uri
, "uri");
1554 memcpy(&entry
->string
, string
, length
);
1555 string
= entry
->string
;
1557 if (parse_uri(&entry
->uri
, string
) != URI_ERRNO_OK
1558 || !add_hash_item(uri_cache
.map
, string
, length
, entry
)) {
1563 object_lock(&uri_cache
);
1569 get_uri(unsigned char *string
, enum uri_component components
)
1571 struct uri_cache_entry
*entry
;
1578 if (parse_uri(&uri
, string
) != URI_ERRNO_OK
)
1581 return get_composed_uri(&uri
, components
);
1584 if (!is_object_used(&uri_cache
)) {
1585 uri_cache
.map
= init_hash8();
1586 if (!uri_cache
.map
) return NULL
;
1587 object_nolock(&uri_cache
, "uri_cache");
1590 entry
= get_uri_cache_entry(string
, strlen(string
));
1592 if (!is_object_used(&uri_cache
))
1593 free_hash(&uri_cache
.map
);
1597 check_uri_sanity(&entry
->uri
);
1598 object_nolock(&entry
->uri
, "uri");
1599 object_lock(&entry
->uri
);
1605 done_uri(struct uri
*uri
)
1607 unsigned char *string
= struri(uri
);
1608 int length
= strlen(string
);
1609 struct hash_item
*item
;
1610 struct uri_cache_entry
*entry
;
1612 assert(is_object_used(&uri_cache
));
1615 if (is_object_used(uri
)) return;
1617 item
= get_hash_item(uri_cache
.map
, string
, length
);
1618 entry
= item
? item
->value
: NULL
;
1620 assertm(entry
!= NULL
, "Releasing unknown URI [%s]", string
);
1621 del_hash_item(uri_cache
.map
, item
);
1624 /* Last URI frees the cache */
1625 object_unlock(&uri_cache
);
1626 if (!is_object_used(&uri_cache
))
1627 free_hash(&uri_cache
.map
);