Local file browsing works under Windows. It is done in a lame way.
[elinks.git] / src / protocol / uri.c
blob076f9cf4f10f3fe81cc2c772dd7a37cb7984698e
1 /* URL parser and translator; implementation of RFC 2396. */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <ctype.h>
8 #include <errno.h>
9 #ifdef HAVE_IDNA_H
10 #include <idna.h>
11 #endif
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <sys/types.h>
16 #ifdef HAVE_NETDB_H
17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
18 #endif
20 #ifdef HAVE_SYS_SOCKET_H
21 #include <sys/socket.h>
22 #endif
23 #ifdef HAVE_NETINET_IN_H
24 #include <netinet/in.h>
25 #endif
26 #ifdef HAVE_ARPA_INET_H
27 #include <arpa/inet.h>
28 #endif
30 #include "elinks.h"
32 #include "main/object.h"
33 #include "protocol/protocol.h"
34 #include "protocol/uri.h"
35 #include "util/conv.h"
36 #include "util/error.h"
37 #include "util/file.h"
38 #include "util/hash.h"
39 #include "util/memory.h"
40 #include "util/string.h"
43 static inline int
44 end_of_dir(unsigned char c)
46 return c == POST_CHAR || c == '#' || c == ';' || c == '?';
49 static inline int
50 is_uri_dir_sep(struct uri *uri, unsigned char pos)
52 return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
56 int
57 is_ip_address(unsigned char *address, int addresslen)
59 /* The @address has well defined limits so it would be a shame to
60 * allocate it. */
61 unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
63 if (addresslen >= sizeof(buffer))
64 return 0;
66 safe_strncpy(buffer, address, addresslen + 1);
68 #ifdef HAVE_INET_PTON
69 #ifdef CONFIG_IPV6
71 struct sockaddr_in6 addr6;
73 if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
74 return 1;
76 #endif /* CONFIG_IPV6 */
78 struct in_addr addr4;
80 if (inet_pton(AF_INET, buffer, &addr4) > 0)
81 return 1;
84 return 0;
85 #else
86 /* FIXME: Is this ever the case? */
87 return 0;
88 #endif /* HAVE_INET_PTON */
/* Looks for one of the built-in top level domain names at the very end of @s.
 *
 * @slen is the number of bytes of @s to consider; a negative value means that
 * @s is NUL terminated and should be measured with strlen().
 *
 * Returns the offset in @s at which the matching name starts or -1 when @slen
 * is zero or no name matches. The comparison ignores case and only looks at
 * the tail of @s, it does not require a '.' in front of the match. */
int
end_with_known_tld(unsigned char *s, int slen)
{
	static const unsigned char *tld[] =
	{ "com", "edu", "net",
	  "org", "gov", "mil",
	  "int", "biz", "arpa",
	  "aero", "coop",
	  "info", "museum",
	  "name", "pro", NULL };
	const unsigned char **entry;

	if (slen == 0) return -1;
	if (slen < 0) slen = strlen(s);

	for (entry = tld; *entry; entry++) {
		int entrylen = strlen(*entry);
		int offset = slen - entrylen;

		/* The candidate name is longer than the string itself. */
		if (offset < 0) continue;

		if (strncasecmp(&s[offset], *entry, entrylen) == 0)
			return offset;
	}

	return -1;
}
118 /* XXX: this function writes to @name. */
119 static int
120 check_whether_file_exists(unsigned char *name)
122 /* Check POST_CHAR etc ... */
123 static const unsigned char chars[] = POST_CHAR_S "#?";
124 int i;
125 int namelen = strlen(name);
127 if (file_exists(name))
128 return namelen;
130 for (i = 0; i < sizeof(chars) - 1; i++) {
131 unsigned char *pos = memchr(name, chars[i], namelen);
132 int exists;
134 if (!pos) continue;
136 *pos = 0;
137 exists = file_exists(name);
138 *pos = chars[i];
140 if (exists) {
141 return pos - name;
145 return -1;
148 static int
149 check_uri_file(unsigned char *name)
151 /* Check POST_CHAR etc ... */
152 static const unsigned char chars[] = POST_CHAR_S "#?";
154 return strcspn(name, chars);
/* Encodes URIs without encoding stuff like fragments and query separators.
 *
 * check_whether_file_exists() tells how many leading bytes of @uristring name
 * an existing file. Only that part is handed to encode_uri_string(); whatever
 * follows it (fragment, query or post data) is appended to @string untouched.
 *
 * When no existing file is found the length passed on is -1.
 * NOTE(review): presumably encode_uri_string() then encodes all of
 * @uristring -- confirm against its definition. */
static void
encode_file_uri_string(struct string *string, unsigned char *uristring)
{
	int filenamelen = check_whether_file_exists(uristring);

	encode_uri_string(string, uristring, filenamelen, 0);

	/* Do not lose the part after the file name. If the whole string was
	 * the file name this appends an empty string. */
	if (filenamelen > 0)
		add_to_string(string, uristring + filenamelen);
}
/* Returns the length of the protocol (scheme) name that @url starts with or 0
 * if @url does not start with one.
 *
 * A trailing digit is not counted as part of the name: it is the "IP version
 * in protocol scheme name" hack, so for "http4:" the length is 4. */
static inline int
get_protocol_length(const unsigned char *url)
{
	const unsigned char *pos = url;

	/* Seek the end of the protocol name if any. */
	/* RFC1738:
	 *	scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
	 * (but per its recommendations we accept "upalpha" too) */
	for (;; pos++) {
		if (isalnum(*pos)) continue;
		if (*pos == '+' || *pos == '-' || *pos == '.') continue;
		break;
	}

	/* Silently chop off the last digit if there is one so that it can be
	 * picked up as the IP family later on. */
	if (pos > url && isdigit(pos[-1]))
		pos--;

	/* The name has to be followed by ':' or by the chopped off digit. */
	if (*pos != ':' && !isdigit(*pos))
		return 0;

	/* This is also 0 if there is no protocol name (@pos == @url). */
	return pos - url;
}
/* Splits @uristring into its components and records them in @uri.
 *
 * @uri is zeroed first. Every component stored afterwards is a pointer into
 * @uristring (usually paired with a length); nothing is copied, so @uri is
 * only usable while @uristring stays around.
 *
 * Returns URI_ERRNO_OK on success, otherwise the uri_errno value describing
 * the first problem that was found: empty string, missing or invalid
 * protocol, wrong number of slashes, trailing dot in the host name, bad
 * port, missing host or missing slash after the host. Components located
 * before the problem was detected are left set in @uri. */
189 enum uri_errno
190 parse_uri(struct uri *uri, unsigned char *uristring)
192 unsigned char *prefix_end, *host_end;
193 #ifdef CONFIG_IPV6
194 unsigned char *lbracket, *rbracket;
195 #endif
197 assertm(uristring, "No uri to parse.");
198 memset(uri, 0, sizeof(*uri));
200 /* Nothing to do for an empty url. */
201 if_assert_failed return 0;
202 if (!*uristring) return URI_ERRNO_EMPTY;
204 uri->string = uristring;
205 uri->protocollen = get_protocol_length(uristring);
207 /* Invalid */
208 if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
210 /* Figure out whether the protocol is known */
211 uri->protocol = get_protocol(struri(uri), uri->protocollen);
213 prefix_end = uristring + uri->protocollen; /* ':' */
215 /* Check if there's a digit after the protocol name. */
/* get_protocol_length() leaves a trailing digit out of the protocol name;
 * it is stored as the IP family here. */
216 if (isdigit(*prefix_end)) {
217 uri->ip_family = uristring[uri->protocollen] - '0';
218 prefix_end++;
220 if (*prefix_end != ':')
221 return URI_ERRNO_INVALID_PROTOCOL;
222 prefix_end++;
224 /* Skip slashes */
226 if (prefix_end[0] == '/' && prefix_end[1] == '/') {
227 if (prefix_end[2] == '/'
228 && get_protocol_need_slash_after_host(uri->protocol))
229 return URI_ERRNO_TOO_MANY_SLASHES;
231 prefix_end += 2;
233 } else if (get_protocol_need_slashes(uri->protocol)) {
234 return URI_ERRNO_NO_SLASHES;
/* Free syntax protocols get everything after the prefix as data. */
237 if (get_protocol_free_syntax(uri->protocol)) {
238 uri->data = prefix_end;
239 uri->datalen = strlen(prefix_end);
240 return URI_ERRNO_OK;
/* File URIs have no host, port or auth part. check_uri_file() gives the
 * offset of the first POST_CHAR, '#' or '?'. Only when that character is a
 * '#' is a fragment split off; in every other case the whole remainder
 * (including any '?' or POST_CHAR part) is kept as data. */
242 } else if (uri->protocol == PROTOCOL_FILE) {
243 int datalen = check_uri_file(prefix_end);
245 /* Extract the fragment part. */
246 if (datalen >= 0 && prefix_end[datalen] == '#') {
247 uri->fragment = prefix_end + datalen + 1;
248 uri->fragmentlen = strlen(uri->fragment);
249 } else {
250 datalen = strlen(prefix_end);
253 uri->data = prefix_end;
254 uri->datalen = datalen;
256 return URI_ERRNO_OK;
259 /* Isolate host */
261 #ifdef CONFIG_IPV6
262 /* Get brackets enclosing IPv6 address */
263 lbracket = strchr(prefix_end, '[');
264 if (lbracket) {
265 rbracket = strchr(lbracket, ']');
266 /* [address] is handled only inside of hostname part (surprisingly). */
267 if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
268 uri->ipv6 = 1;
269 else
270 lbracket = rbracket = NULL;
271 } else {
272 rbracket = NULL;
274 #endif
276 /* Possibly skip auth part */
/* An '@' that shows up before the first '/' means that user (and maybe
 * password) information precedes the host. */
277 host_end = prefix_end + strcspn(prefix_end, "@");
279 if (prefix_end + strcspn(prefix_end, "/") > host_end
280 && *host_end) { /* we have auth info here */
281 unsigned char *user_end;
283 /* Allow '@' in the password component */
284 while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
285 host_end = host_end + 1 + strcspn(host_end + 1, "@");
287 user_end = strchr(prefix_end, ':');
289 if (!user_end || user_end > host_end) {
290 uri->user = prefix_end;
291 uri->userlen = host_end - prefix_end;
292 } else {
293 uri->user = prefix_end;
294 uri->userlen = user_end - prefix_end;
295 uri->password = user_end + 1;
296 uri->passwordlen = host_end - user_end - 1;
298 prefix_end = host_end + 1;
/* Find the end of the host. For a bracketed IPv6 address the search starts
 * at the closing bracket so that the ':'s inside the address are skipped. */
301 #ifdef CONFIG_IPV6
302 if (uri->ipv6)
303 host_end = rbracket + strcspn(rbracket, ":/?");
304 else
305 #endif
306 host_end = prefix_end + strcspn(prefix_end, ":/?");
308 #ifdef CONFIG_IPV6
309 if (uri->ipv6) {
310 int addrlen = rbracket - lbracket - 1;
312 /* Check for valid length.
313 * addrlen >= sizeof(hostbuf) is theorically impossible
314 * but i keep the test in case of... Safer, imho --Zas */
315 assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
316 "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
317 "Problems are likely to be encountered. Please report "
318 "this, it is a security bug!", addrlen, uristring);
319 if_assert_failed return URI_ERRNO_IPV6_SECURITY;
321 uri->host = lbracket + 1;
322 uri->hostlen = addrlen;
323 } else
324 #endif
326 uri->host = prefix_end;
327 uri->hostlen = host_end - prefix_end;
329 /* Trim trailing '.'s */
/* Nothing is trimmed here; the error only reports the trailing dot. */
330 if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
331 return URI_ERRNO_TRAILING_DOTS;
334 if (*host_end == ':') { /* we have port here */
335 unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");
337 host_end++;
339 uri->port = host_end;
340 uri->portlen = port_end - host_end;
342 if (uri->portlen == 0)
343 return URI_ERRNO_NO_PORT_COLON;
345 /* We only use 8 bits for portlen so better check */
346 if (uri->portlen != port_end - host_end)
347 return URI_ERRNO_INVALID_PORT;
349 /* test if port is number */
350 /* TODO: possibly lookup for the service otherwise? --pasky */
/* This loop also advances @host_end to @port_end, which the slash check
 * further down relies on. */
351 for (; host_end < port_end; host_end++)
352 if (!isdigit(*host_end))
353 return URI_ERRNO_INVALID_PORT;
355 /* Check valid port value, and let show an error message
356 * about invalid url syntax. */
357 if (uri->port && uri->portlen) {
358 int n;
360 errno = 0;
361 n = strtol(uri->port, NULL, 10);
362 if (errno || !uri_port_is_valid(n))
363 return URI_ERRNO_INVALID_PORT;
/* @host_end is now at the first character after the host and port. */
367 if (*host_end == '/') {
368 host_end++;
370 } else if (get_protocol_need_slash_after_host(uri->protocol)) {
371 /* The need for slash after the host component depends on the
372 * need for a host component. -- The dangerous mind of Jonah */
373 if (!uri->hostlen)
374 return URI_ERRNO_NO_HOST;
376 return URI_ERRNO_NO_HOST_SLASH;
379 /* Look for #fragment or POST_CHAR */
380 prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
381 uri->data = host_end;
382 uri->datalen = prefix_end - host_end;
384 if (*prefix_end == '#') {
385 uri->fragment = prefix_end + 1;
386 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
387 prefix_end = uri->fragment + uri->fragmentlen;
/* The post data runs to the end of the string; no length is stored. */
390 if (*prefix_end == POST_CHAR) {
391 uri->post = prefix_end + 1;
394 return URI_ERRNO_OK;
398 get_uri_port(struct uri *uri)
400 if (uri->port && uri->portlen) {
401 unsigned char *end = uri->port;
402 int port = strtol(uri->port, (char **) &end, 10);
404 if (end != uri->port) {
405 assert(uri_port_is_valid(port));
406 return port;
410 return get_protocol_port(uri->protocol);
413 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
/* Compares two URI components given as pointer and length pairs.
 *
 * Returns 1 if they match and 0 if not. Components only match when their
 * lengths are equal and either both pointers are set or both are NULL. Two
 * unset or two empty components always match, otherwise the bytes decide. */
static inline int
compare_component(unsigned char *a, int alen, unsigned char *b, int blen)
{
	int a_set = (a != NULL);
	int b_set = (b != NULL);

	/* Check that the length and the strings are both set or unset */
	if (alen != blen) return 0;
	if (a_set != b_set) return 0;

	/* Nothing to look at so that will make a perfect match */
	if (!a_set || alen == 0) return 1;

	/* Let the higher forces decide */
	return memcmp(a, b, blen) == 0;
}
428 #define wants(x) (components & (x))
431 compare_uri(struct uri *a, struct uri *b, enum uri_component components)
433 if (a == b) return 1;
434 if (!components) return 0;
436 assertm(can_compare_uri_components(components),
437 "compare_uri() is a work in progress. Component unsupported");
439 return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
440 && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
441 && (!wants(URI_USER)
442 || compare_component(a->user, a->userlen, b->user, b->userlen))
443 && (!wants(URI_PASSWORD)
444 || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
445 && (!wants(URI_HOST)
446 || compare_component(a->host, a->hostlen, b->host, b->hostlen))
447 && (!wants(URI_PORT)
448 || compare_component(a->port, a->portlen, b->port, b->portlen))
449 && (!wants(URI_DATA)
450 || compare_component(a->data, a->datalen, b->data, b->datalen))
451 && (!wants(URI_FRAGMENT)
452 || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
453 && (!wants(URI_POST)
454 || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
458 /* We might need something more intelligent than this Swiss army knife. */
/* Appends the parts of @uri selected by the @components bit mask to @string.
 *
 * The parts are added in URI order: protocol (plus IP family digit, ':' and
 * "//" where the protocol needs slashes), user[:password]@, host, port, the
 * slash after the host, data, fragment and post data.
 *
 * Special cases visible below:
 * - For PROTOCOL_UNKNOWN the complete URI string is appended no matter what
 *   @components asks for.
 * - URI_PATH / URI_FILENAME and URI_QUERY each return right after adding
 *   their part; the assertions require URI_FILENAME and URI_QUERY to be the
 *   only requested component.
 *
 * Returns @string on the normal path; the early-return paths hand back the
 * return value of the add_*_to_string() call they end with. */
459 struct string *
460 add_uri_to_string(struct string *string, struct uri *uri,
461 enum uri_component components)
463 /* Custom or unknown keep the URI untouched. */
464 if (uri->protocol == PROTOCOL_UNKNOWN)
465 return add_to_string(string, struri(uri));
467 if (wants(URI_PROTOCOL)) {
468 add_bytes_to_string(string, uri->string, uri->protocollen);
469 if (wants(URI_IP_FAMILY) && uri->ip_family)
470 add_long_to_string(string, uri->ip_family);
471 add_char_to_string(string, ':');
472 if (get_protocol_need_slashes(uri->protocol))
473 add_to_string(string, "//");
/* The password is only ever added together with the user name. */
476 if (wants(URI_USER) && uri->userlen) {
477 add_bytes_to_string(string, uri->user, uri->userlen);
479 if (wants(URI_PASSWORD) && uri->passwordlen) {
480 add_char_to_string(string, ':');
481 add_bytes_to_string(string, uri->password,
482 uri->passwordlen);
485 add_char_to_string(string, '@');
488 if (wants(URI_HOST) && uri->hostlen) {
/* Cleared when the IDN conversion below already added the host. */
489 int add_host = 1;
491 #ifdef CONFIG_IPV6
492 /* Rationale for wants(URI_PORT): The [notation] was invented
493 * so that you can have an IPv6 addy and a port together. So
494 * we want to use it when that happens, otherwise we need not
495 * bother (that happens only when we want it for DNS anyway).
496 * I insist on an implied elegancy of this way, but YMMV. ;-)
497 * --pasky */
498 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
499 #endif
500 #ifdef CONFIG_IDN
501 /* Support for the GNU International Domain Name library.
503 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
505 * Now it is probably not perfect because idna_to_ascii_lz()
506 * will be using a ``zero terminated input string encoded in
507 * the current locale's character set''. Anyway I don't know
508 * how to convert anything to UTF-8 or Unicode. --jonas */
509 if (wants(URI_IDN)) {
510 unsigned char *host = memacpy(uri->host, uri->hostlen);
512 if (host) {
513 char *idname;
514 int code = idna_to_ascii_lz(host, &idname, 0);
516 /* FIXME: Return NULL if it coughed? --jonas */
/* If the conversion fails the unconverted host is added below instead. */
517 if (code == IDNA_SUCCESS) {
518 add_to_string(string, idname);
519 free(idname);
520 add_host = 0;
523 mem_free(host);
527 #endif
528 if (add_host)
529 add_bytes_to_string(string, uri->host, uri->hostlen);
531 #ifdef CONFIG_IPV6
532 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
533 #endif
/* An explicit port in the URI always wins; the protocol default is only
 * added when URI_DEFAULT_PORT is requested and no explicit port exists. */
536 if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
537 if (uri->portlen) {
538 add_char_to_string(string, ':');
539 add_bytes_to_string(string, uri->port, uri->portlen);
541 } else if (wants(URI_DEFAULT_PORT)
542 && uri->protocol != PROTOCOL_USER) {
543 /* For user protocols we don't know a default port.
544 * Should user protocols ports be configurable? */
545 int port = get_protocol_port(uri->protocol);
547 add_char_to_string(string, ':');
548 add_long_to_string(string, port);
552 /* Only add slash if we need to separate */
553 if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
554 && wants(~(URI_DATA | URI_PORT))
555 && get_protocol_need_slash_after_host(uri->protocol))
556 add_char_to_string(string, '/');
558 if (wants(URI_DATA) && uri->datalen)
559 add_bytes_to_string(string, uri->data, uri->datalen);
561 /* We can not test uri->datalen here since we need to always
562 * add '/'. */
563 if (wants(URI_PATH) || wants(URI_FILENAME)) {
564 unsigned char *filename = uri->data;
565 unsigned char *pos;
567 assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
568 "URI_FILENAME should be used alone %d", components);
570 if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
571 #ifdef CONFIG_OS_WIN32
572 if (uri->protocol != PROTOCOL_FILE)
573 #endif
574 /* FIXME: Add correct separator */
575 add_char_to_string(string, '/');
578 if (!uri->datalen) return string;
/* Walk up to the first end_of_dir() character. With URI_FILENAME @filename
 * is moved past every directory separator seen, so it ends up at the start
 * of the last path element. */
580 for (pos = filename; *pos && !end_of_dir(*pos); pos++)
581 if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
582 filename = pos + 1;
584 return add_bytes_to_string(string, filename, pos - filename);
587 if (wants(URI_QUERY) && uri->datalen) {
588 unsigned char *query = memchr(uri->data, '?', uri->datalen);
590 assertm(URI_QUERY == components,
591 "URI_QUERY should be used alone %d", components);
593 if (!query) return string;
595 query++;
596 /* Check fragment and POST_CHAR */
597 return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
600 if (wants(URI_FRAGMENT) && uri->fragmentlen) {
601 add_char_to_string(string, '#');
602 add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
/* URI_POST adds the raw post data; URI_POST_INFO only adds a short
 * description chosen from the start of the post data. */
605 if (wants(URI_POST) && uri->post) {
606 add_char_to_string(string, POST_CHAR);
607 add_to_string(string, uri->post);
609 } else if (wants(URI_POST_INFO) && uri->post) {
610 if (!strncmp(uri->post, "text/plain", 10)) {
611 add_to_string(string, " (PLAIN TEXT DATA)");
613 } else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
614 add_to_string(string, " (MULTIPART FORM DATA)");
616 } else {
617 add_to_string(string, " (POST DATA)");
622 return string;
625 #undef wants
627 unsigned char *
628 get_uri_string(struct uri *uri, enum uri_component components)
630 struct string string;
632 if (init_string(&string)
633 && add_uri_to_string(&string, uri, components))
634 return string.source;
636 done_string(&string);
637 return NULL;
641 struct string *
642 add_string_uri_to_string(struct string *string, unsigned char *uristring,
643 enum uri_component components)
645 struct uri uri;
647 if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
648 return NULL;
650 return add_uri_to_string(string, &uri, components);
654 #define normalize_uri_reparse(str) normalize_uri(NULL, str)
655 #define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri))
/* Normalizes @uristring in place and returns @uristring.
 *
 * The protocol name and the host are converted to lowercase. For protocols
 * without free syntax the path is then cleaned up: "/./" and "//" are
 * collapsed and "/../" removes the preceding path element. The clean up stops
 * at the first end_of_dir() character; everything from there on is moved
 * down unchanged.
 *
 * If @uri is NULL, @uristring is parsed into a local struct uri first (see
 * normalize_uri_reparse()). Otherwise @uri is used as is for the first pass
 * (see normalize_uri_noparse()), so it has to describe @uristring already.
 * For PROTOCOL_PROXY the data part is parsed again until the real URI is
 * reached. Whenever parsing fails @uristring is returned as it is at that
 * point. */
657 unsigned char *
658 normalize_uri(struct uri *uri, unsigned char *uristring)
660 unsigned char *parse_string = uristring;
661 unsigned char *src, *dest, *path;
662 int need_slash = 0;
663 int parse = (uri == NULL);
664 struct uri uri_struct;
666 if (!uri) uri = &uri_struct;
668 /* We need to get the real (proxied) URI but lowercase relevant URI
669 * parts along the way. */
670 do {
671 if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
672 return uristring;
674 assert(uri->data);
676 /* This is a maybe not the right place but both join_urls() and
677 * get_translated_uri() through translate_url() calls this
678 * function and then it already works on and modifies an
679 * allocated copy. */
680 convert_to_lowercase(uri->string, uri->protocollen);
681 if (uri->hostlen) convert_to_lowercase(uri->host, uri->hostlen);
/* Any further pass works on the proxied URI and always has to parse it. */
683 parse = 1;
684 parse_string = uri->data;
685 } while (uri->protocol == PROTOCOL_PROXY);
687 if (get_protocol_free_syntax(uri->protocol))
688 return uristring;
690 if (uri->protocol != PROTOCOL_UNKNOWN)
691 need_slash = get_protocol_need_slash_after_host(uri->protocol);
693 /* We want to start at the first slash to also reduce URIs like
694 * http://host//index.html to http://host/index.html */
695 path = uri->data - need_slash;
/* @src reads ahead while @dest trails behind and marks where the cleaned
 * up path is written; both walk the same buffer. */
696 dest = src = path;
698 /* This loop mangles the URI string by removing directory elevators and
699 * other cruft. Example: /.././etc////..//usr/ -> /usr/ */
700 while (*dest) {
701 /* If the following pieces are the LAST parts of URL, we remove
702 * them as well. See RFC 1808 for details. */
704 if (end_of_dir(src[0])) {
705 /* URL data contains no more path. */
706 memmove(dest, src, strlen(src) + 1);
707 break;
710 if (!is_uri_dir_sep(uri, src[0])) {
711 /* This is to reduce indentation */
713 } else if (src[1] == '.') {
714 if (!src[2]) {
715 /* /. - skip the dot */
716 *dest++ = *src;
717 *dest = 0;
718 break;
720 } else if (is_uri_dir_sep(uri, src[2])) {
721 /* /./ - strip that.. */
722 src += 2;
723 continue;
725 } else if (src[2] == '.'
726 && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
727 /* /../ or /.. - skip it and preceding element. */
729 /* First back out the last incrementation of
730 * @dest (dest++) to get the position that was
731 * last asigned to. */
732 if (dest > path) dest--;
734 /* @dest might be pointing to a dir separator
735 * so we decrement before any testing. */
/* @dest never moves below @path, so "/.." cannot climb above the start of
 * the path. */
736 while (dest > path) {
737 dest--;
738 if (is_uri_dir_sep(uri, *dest)) break;
741 if (!src[3]) {
742 /* /.. - add ending slash and stop */
743 *dest++ = *src;
744 *dest = 0;
745 break;
748 src += 3;
749 continue;
752 } else if (is_uri_dir_sep(uri, src[1])) {
753 /* // - ignore first '/'. */
754 src += 1;
755 continue;
758 /* We don't want to access memory past the NUL char. */
759 *dest = *src++;
760 if (*dest) dest++;
763 return uristring;
766 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
767 * of just the complete path to file/directory, which the dumb 'file' protocol
768 * backend can understand. No host parts etc, that is what this function is
769 * supposed to chew. */
770 static struct uri *
771 transform_file_url(struct uri *uri, unsigned char *cwd)
773 unsigned char *path = uri->data;
775 assert(uri->protocol == PROTOCOL_FILE && uri->data);
777 /* Sort out the host part. We currently support only host "localhost"
778 * (plus empty host part will be assumed to be "localhost" as well).
779 * As our extensions, '.' will reference to the cwd on localhost
780 * (originally, when the first thing after file:// wasn't "localhost/",
781 * we assumed the cwd as well, and pretended that there's no host part
782 * at all) and '..' to the directory parent to cwd. Another extension
783 * is that if this is a DOS-like system, the first char in two-char
784 * host part is uppercase letter and the second char is a colon, it is
785 * assumed to be a local disk specification. */
786 /* TODO: Use FTP for non-localhost hosts. --pasky */
788 /* For URL "file://", we open the current directory. Some other
789 * browsers instead open root directory, but AFAIK the standard does
790 * not specify that and this was the original behaviour and it is more
791 * consistent with our file://./ notation. */
793 /* Who would name their file/dir '...' ? */
794 if (*path == '.' || !*path) {
795 struct string dir;
797 if (!init_string(&dir))
798 return NULL;
800 encode_uri_string(&dir, cwd, -1, 0);
802 /* Either we will end up with '//' and translate_directories()
803 * will shorten it or the '/' will mark the inserted cwd as a
804 * directory. */
805 if (*path == '.') *path = '/';
807 /* Insert the current working directory. */
808 /* The offset is 7 == sizeof("file://") - 1. */
809 insert_in_string(&struri(uri), 7, dir.source, dir.length);
811 done_string(&dir);
812 return uri;
815 #ifdef DOS_FS
816 if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
817 return NULL;
818 #endif
820 for (; *path && !dir_sep(*path); path++);
822 /* FIXME: We will in fact assume localhost even for non-local hosts,
823 * until we will support the FTP transformation. --pasky */
825 memmove(uri->data, path, strlen(path) + 1);
826 return uri;
829 static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);
/* Resolves the reference @rel against the parsed URI @base and returns the
 * resulting URI string or NULL on failure.
 *
 * The cases are tried in this order:
 * 1. @rel starts with '#': the base without fragment and post data, plus @rel.
 * 2. @rel starts with '?': the base cut at its query (or at its fragment or
 *    end if it has no query), plus @rel.
 * 3. @rel starts with "//": "<protocol>:" of the base plus @rel, passed
 *    through translate_url(). NULL if the base protocol uses no slashes.
 * 4. @rel starts with a known protocol name: translate_url(@rel) if that
 *    succeeds.
 * 5. Everything else is treated as a path: an absolute path replaces the
 *    path of the base, a relative one replaces what follows its last
 *    directory separator.
 *
 * The result is built in memory from memacpy() or mem_alloc(), or comes from
 * translate_url(); presumably the caller has to free it -- TODO confirm. */
831 unsigned char *
832 join_urls(struct uri *base, unsigned char *rel)
834 unsigned char *uristring, *path;
835 int add_slash = 0;
836 int translate = 0;
837 int length = 0;
839 /* See RFC 1808 */
840 /* TODO: Support for ';' ? (see the RFC) --pasky */
842 /* For '#', '?' and '//' we could use get_uri_string() but it might be
843 * too expensive since it uses granular allocation scheme. I wouldn't
844 * personally mind tho' because it would be cleaner. --jonas */
/* In the three branches below @length is the number of leading bytes of
 * the base URI to keep in front of @rel. */
845 if (rel[0] == '#') {
846 /* Strip fragment and post part from the base URI and append
847 * the fragment string in @rel. */
848 length = base->fragment
849 ? base->fragment - struri(base) - 1
850 : get_real_uri_length(base);
852 } else if (rel[0] == '?') {
853 /* Strip query, fragment and post part from the base URI and
854 * append the query string in @rel. */
855 length = base->fragment ? base->fragment - struri(base) - 1
856 : get_real_uri_length(base);
/* @uristring is only used as a scratch pointer to the '?' here. */
858 uristring = memchr(base->data, '?', base->datalen);
859 if (uristring) length = uristring - struri(base);
861 } else if (rel[0] == '/' && rel[1] == '/') {
862 if (!get_protocol_need_slashes(base->protocol))
863 return NULL;
865 /* Get `<protocol>:' from the base URI and append the `//' part
866 * from @rel. */
867 length = base->protocollen + 1;
869 /* We need to sanitize the relative part and add stuff like
870 * host slash. */
871 translate = 1;
874 /* If one of the tests above set @length to something useful */
875 if (length) {
876 uristring = memacpy(struri(base), length);
877 if (!uristring) return NULL;
879 add_to_strn(&uristring, rel);
881 if (translate) {
882 unsigned char *translated;
884 translated = translate_url(uristring, NULL);
885 mem_free(uristring);
886 return translated;
888 return normalize_uri_reparse(uristring);
891 /* Check if there is some protocol name to go for */
892 length = get_protocol_length(rel);
893 if (length) {
894 switch (get_protocol(rel, length)) {
895 case PROTOCOL_UNKNOWN:
896 case PROTOCOL_PROXY:
897 /* Mysteriously proxy URIs are breaking here ... */
898 break;
900 case PROTOCOL_FILE:
901 /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
902 * to translate_url(). */
903 default:
/* If the translation fails @rel is handled as a path below. */
904 uristring = translate_url(rel, NULL);
905 if (uristring) return uristring;
909 assertm(base->data, "bad base url");
910 if_assert_failed return NULL;
912 path = base->data;
914 /* Either is path blank, but we've slash char before, or path is not
915 * blank, but doesn't start by a slash (if we'd just stay along with
916 * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
917 * should be enough, but I'm not sure and I don't want to break
918 * anything --pasky). */
919 /* We skip first char of URL ('/') in parse_url() (ARGH). This
920 * is reason of all this bug-bearing magic.. */
/* NOTE(review): both branches step back to the byte in front of
 * base->data, which relies on the data not being at the very start of the
 * URI string. */
921 if (*path) {
922 if (!is_uri_dir_sep(base, *path)) path--;
923 } else {
924 if (is_uri_dir_sep(base, path[-1])) path--;
927 if (!is_uri_dir_sep(base, rel[0])) {
928 unsigned char *path_end;
930 /* The URL is relative. */
932 if (!*path) {
933 /* There's no path in the URL, but we're going to add
934 * something there, and the something doesn't start by
935 * a slash. So we need to insert a slash after the base
936 * URL. Clever, eh? ;) */
937 add_slash = 1;
940 for (path_end = path; *path_end; path_end++) {
941 if (end_of_dir(*path_end)) break;
942 /* Modify the path pointer, so that it'll always point
943 * above the last '/' in the URL; later, we'll copy the
944 * URL only _TO_ this point, and anything after last
945 * slash will be substituted by 'rel'. */
946 if (is_uri_dir_sep(base, *path_end))
947 path = path_end + 1;
/* Copy the base up to @path, optionally add the slash, then append @rel
 * including its terminating NUL. */
951 length = path - struri(base);
952 uristring = mem_alloc(length + strlen(rel) + add_slash + 1);
953 if (!uristring) return NULL;
955 memcpy(uristring, struri(base), length);
956 if (add_slash) uristring[length] = '/';
957 strcpy(uristring + length + add_slash, rel);
959 return normalize_uri_reparse(uristring);
963 /* Tries to figure out what protocol @newurl might be specifying by checking if
964 * it exists as a file locally or by checking parts of the host name. */
/* Returns PROTOCOL_FILE, PROTOCOL_FTP, PROTOCOL_HTTP or, when none of the
 * heuristics below applies, PROTOCOL_UNKNOWN.
 *
 * Note that check_whether_file_exists() temporarily writes to @newurl while
 * testing, so @newurl must be writable. */
965 static enum protocol
966 find_uri_protocol(unsigned char *newurl)
968 unsigned char *ch;
970 /* First see if it is a file so filenames that look like hostnames
971 * won't confuse us below. */
972 if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
974 /* Yes, it would be simpler to make test for IPv6 address first,
975 * but it would result in confusing mix of ifdefs ;-). */
976 /* FIXME: Ideas for improve protocol detection
978 * - Handle common hostnames. It could be part of the protocol backend
979 * structure. [ www -> http, irc -> irc, news -> nntp, ... ]
981 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
/* @ch is the first '.', ':', '/' or '@' in @newurl, or its terminating NUL
 * if there is none. */
984 ch = newurl + strcspn(newurl, ".:/@");
985 if (*ch == '@'
986 || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
987 || !strncasecmp(newurl, "ftp.", 4)) {
988 /* Contains user/password/ftp-hostname */
989 return PROTOCOL_FTP;
991 #ifdef CONFIG_IPV6
992 } else if (*newurl == '[' && *ch == ':') {
993 /* Candidate for IPv6 address */
994 unsigned char *bracket2, *colon2;
/* A second ':' in front of the closing ']' is required. */
996 ch++;
997 bracket2 = strchr(ch, ']');
998 colon2 = strchr(ch, ':');
999 if (bracket2 && colon2 && bracket2 > colon2)
1000 return PROTOCOL_HTTP;
1001 #endif
1003 } else if (*newurl != '.' && *ch == '.') {
1004 /* Contains domain name? */
1005 unsigned char *host_end, *domain;
1006 unsigned char *ipscan;
1008 /* Process the hostname */
/* The loop has an empty body: when it is done @domain is at the start of
 * the last dot separated label and @host_end at its end. */
1009 for (domain = ch + 1;
1010 *(host_end = domain + strcspn(domain, ".:/?")) == '.';
1011 domain = host_end + 1);
1013 /* It's IP? */
/* NOTE(review): the scan starts at @ch (the first '.'), so the characters
 * in front of it are not checked for being digits -- confirm that this is
 * intended. */
1014 for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1015 ipscan++);
1017 if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1018 return PROTOCOL_HTTP;
1020 /* It's two-letter or known TLD? */
1021 if (host_end - domain == 2
1022 || end_with_known_tld(domain, host_end - domain) >= 0)
1023 return PROTOCOL_HTTP;
1026 return PROTOCOL_UNKNOWN;
1030 #define MAX_TRANSLATION_ATTEMPTS 32
1032 /* Returns an URI string that can be used internally. Adding protocol prefix,
1033 * missing slashes etc. */
1034 static unsigned char *
1035 translate_url(unsigned char *url, unsigned char *cwd)
1037 unsigned char *newurl;
1038 struct uri uri;
1039 enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1040 int retries = 0;
1042 /* Strip starting spaces */
1043 while (*url == ' ') url++;
1044 if (!*url) return NULL;
1046 newurl = expand_tilde(url); /* XXX: Post data copy. */
1047 if (!newurl) return NULL;
1049 parse_uri:
1050 /* Yay a goto loop. If we get some URI parse error and try to
1051 * fix it we go back to here and try again. */
1052 /* Ordinary parse */
1053 uri_errno = parse_uri(&uri, newurl);
1055 /* Bail out if the same error occurs twice */
1056 if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1057 if (retries > MAX_TRANSLATION_ATTEMPTS) {
1058 ERROR("Maximum number of parsing attempts exceeded "
1059 "for %s.", url);
1061 mem_free(newurl);
1062 return NULL;
1065 prev_errno = uri_errno;
1067 switch (uri_errno) {
1068 case URI_ERRNO_OK:
1069 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1070 * interpreted as the protocol name. */
1071 if (uri.protocol == PROTOCOL_UNKNOWN) {
1072 enum protocol protocol = find_uri_protocol(newurl);
1074 /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1075 * case. */
1076 if (protocol != PROTOCOL_UNKNOWN) {
1077 struct string str;
1079 if (!init_string(&str)) return NULL;
1081 switch (protocol) {
1082 case PROTOCOL_FTP:
1083 add_to_string(&str, "ftp://");
1084 encode_uri_string(&str, newurl, -1, 0);
1085 break;
1087 case PROTOCOL_HTTP:
1088 add_to_string(&str, "http://");
1089 add_to_string(&str, newurl);
1090 break;
1092 case PROTOCOL_UNKNOWN:
1093 break;
1095 case PROTOCOL_FILE:
1096 default:
1097 add_to_string(&str, "file://");
1098 if (!dir_sep(*newurl))
1099 add_to_string(&str, "./");
1101 add_to_string(&str, newurl);
1104 mem_free(newurl);
1105 newurl = str.source;
1107 /* Work around the infinite loop prevention */
1108 prev_errno = URI_ERRNO_EMPTY;
1109 goto parse_uri;
1113 /* If file:// URI is transformed we need to reparse. */
1114 if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1115 && transform_file_url(&uri, cwd))
1116 return normalize_uri_reparse(struri(&uri));
1118 /* Translate the proxied URI too if proxy:// */
1119 if (uri.protocol == PROTOCOL_PROXY) {
1120 unsigned char *data = translate_url(uri.data, cwd);
1121 int pos = uri.data - struri(&uri);
1123 if (!data) break;
1124 struri(&uri)[pos] = 0;
1125 insert_in_string(&struri(&uri), pos, data, strlen(data));
1126 mem_free(data);
1127 return normalize_uri_reparse(struri(&uri));
1130 return normalize_uri_noparse(&uri);
1132 case URI_ERRNO_TOO_MANY_SLASHES:
1134 unsigned char *from, *to;
1136 assert(uri.string[uri.protocollen] == ':'
1137 && uri.string[uri.protocollen + 1] == '/'
1138 && uri.string[uri.protocollen + 2] == '/');
1140 from = to = uri.string + uri.protocollen + 3;
1141 while (*from == '/') from++;
1143 assert(to < from);
1144 memmove(to, from, strlen(from) + 1);
1145 goto parse_uri;
1147 case URI_ERRNO_NO_SLASHES:
1149 /* Try prefix:some.url -> prefix://some.url.. */
1150 int slashes = 2;
1152 /* Check if only one '/' is needed. */
1153 if (uri.string[uri.protocollen + 1] == '/')
1154 slashes--;
1156 insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1157 goto parse_uri;
1159 case URI_ERRNO_TRAILING_DOTS:
1161 /* Trim trailing '.'s */
1162 unsigned char *from = uri.host + uri.hostlen;
1163 unsigned char *to = from;
1165 assert(uri.host < to && to[-1] == '.' && *from != '.');
1167 while (uri.host < to && to[-1] == '.') to--;
1169 assert(to < from);
1170 memmove(to, from, strlen(from) + 1);
1171 goto parse_uri;
1173 case URI_ERRNO_NO_PORT_COLON:
1174 assert(uri.portlen == 0
1175 && uri.string < uri.port
1176 && uri.port[-1] == ':');
1178 memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1179 goto parse_uri;
1181 case URI_ERRNO_NO_HOST_SLASH:
1183 int offset = uri.port
1184 ? uri.port + uri.portlen - struri(&uri)
1185 : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1187 assertm(uri.host, "uri.host not set after no host slash error");
1188 insert_in_string(&newurl, offset, "/", 1);
1189 goto parse_uri;
1191 case URI_ERRNO_INVALID_PROTOCOL:
1193 /* No protocol name */
1194 enum protocol protocol = find_uri_protocol(newurl);
1195 struct string str;
1197 if (!init_string(&str)) return NULL;
1199 switch (protocol) {
1200 case PROTOCOL_FTP:
1201 add_to_string(&str, "ftp://");
1202 encode_uri_string(&str, newurl, -1, 0);
1203 break;
1205 case PROTOCOL_HTTP:
1206 add_to_string(&str, "http://");
1207 add_to_string(&str, newurl);
1208 break;
1210 case PROTOCOL_UNKNOWN:
1211 /* We default to file:// even though we already
1212 * tested if the file existed since it will give
1213 * a "No such file or directory" error. which
1214 * might better hint the user that there was
1215 * problem figuring out the URI. */
1216 case PROTOCOL_FILE:
1217 default:
1218 add_to_string(&str, "file://");
1219 if (!dir_sep(*newurl))
1220 add_to_string(&str, "./");
1222 encode_file_uri_string(&str, newurl);
1225 mem_free(newurl);
1226 newurl = str.source;
1228 goto parse_uri;
1230 case URI_ERRNO_EMPTY:
1231 case URI_ERRNO_IPV6_SECURITY:
1232 case URI_ERRNO_NO_HOST:
1233 case URI_ERRNO_INVALID_PORT:
1234 case URI_ERRNO_INVALID_PORT_RANGE:
1235 /* None of these can be handled properly. */
1236 break;
1239 mem_free(newurl);
1240 return NULL;
1244 struct uri *
1245 get_composed_uri(struct uri *uri, enum uri_component components)
1247 unsigned char *string;
1249 assert(uri);
1250 if_assert_failed return NULL;
1252 string = get_uri_string(uri, components);
1253 if (!string) return NULL;
1255 uri = get_uri(string, 0);
1256 mem_free(string);
1258 return uri;
1261 struct uri *
1262 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1264 struct uri *uri;
1266 uristring = translate_url(uristring, cwd);
1267 if (!uristring) return NULL;
1269 uri = get_uri(uristring, 0);
1270 mem_free(uristring);
1272 return uri;
1276 unsigned char *
1277 get_extension_from_uri(struct uri *uri)
1279 unsigned char *extension = NULL;
1280 int afterslash = 1;
1281 unsigned char *pos = uri->data;
1283 assert(pos);
1285 for (; *pos && !end_of_dir(*pos); pos++) {
1286 if (!afterslash && !extension && *pos == '.') {
1287 extension = pos;
1288 } else if (is_uri_dir_sep(uri, *pos)) {
1289 extension = NULL;
1290 afterslash = 1;
1291 } else {
1292 afterslash = 0;
1296 if (extension && extension < pos)
1297 return memacpy(extension, pos - extension);
1299 return NULL;
/* URI encoding, escaping disallowed characters. */

/* Tell whether @c may be copied into an encoded URI without escaping:
 * anything isident() accepts, plus the marks . ! ~ * ' ( )
 * Returns 1 if so, 0 otherwise. */
static inline int
safe_char(unsigned char c)
{
	/* RFC 2396, Page 8, Section 2.3 ;-) */
	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;

	default:
		return isident(c) ? 1 : 0;
	}
}
/* Append @name to @string, replacing every byte that is not safe_char()
 * with its %XX escape.
 *
 * @namelen is the number of bytes of @name to encode; a negative value
 * means "use strlen(@name)".
 * If @convert_slashes is zero, '/' is copied through unescaped; otherwise
 * it is escaped like any other unsafe byte. */
void
encode_uri_string(struct string *string, unsigned char *name, int namelen,
		  int convert_slashes)
{
	unsigned char escape[4] = { '%', 0, 0, '\0' };
	unsigned char *stop;

	if (namelen < 0) namelen = strlen(name);
	stop = name + namelen;

	while (name < stop) {
		unsigned char ch = *name++;
		int verbatim = safe_char(ch)
			       || (!convert_slashes && ch == '/');

		/* A space gets hex-escaped like everything else; turning it
		 * into '+' is probably correct only for the query part of
		 * a URI..? */
		if (verbatim) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Hex it. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
/* Variant of the URI encoder that additionally lets ':' and '\\' through
 * unescaped (presumably so that Windows drive letters and path separators
 * survive -- confirm against the callers).
 *
 * Appends @name to @string, replacing every other byte that is not
 * safe_char() with its %XX escape. A negative @namelen means
 * "use strlen(@name)". */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
	unsigned char escape[4] = { '%', 0, 0, '\0' };
	unsigned char *stop;

	if (namelen < 0) namelen = strlen(name);
	stop = name + namelen;

	while (name < stop) {
		unsigned char ch = *name++;
		int verbatim = safe_char(ch) || ch == ':' || ch == '\\';

		if (verbatim) {
			add_char_to_string(string, ch);
			continue;
		}

		/* Hex it. */
		escape[1] = hx((((int) ch) & 0xF0) >> 4);
		escape[2] = hx(((int) ch) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
/* Decode %XX escapes in @src in place.
 *
 * This function is evil, it modifies its parameter. */
/* XXX: but decoded string is _never_ longer than encoded string so it's an
 * efficient way to do that, imho. --Zas */
/* A '%' is only decoded when it is followed by two characters for which
 * unhx() returns a non-negative value, and the resulting byte is not zero
 * (%00 is not allowed); otherwise the '%' is copied through literally.
 * '+' is deliberately left alone: it should only be decoded in the query
 * part of a URI. */
void
decode_uri(unsigned char *src)
{
	unsigned char *out = src;

	for (;;) {
		unsigned char ch = *src++;

		if (ch == '%') {
			int hi = unhx(src[0]);
			/* Only look at the second digit when the first one
			 * is valid, so we never read past the terminator. */
			int lo = (hi >= 0) ? unhx(src[1]) : -1;

			if (hi >= 0 && lo >= 0) {
				int value = (hi << 4) + lo;

				if (value != 0) { /* don't allow %00 */
					ch = (unsigned char) value;
					src += 2;
				}
			}
		}

		*out++ = ch;

		/* The terminator has been copied too; we are done. */
		if (ch == '\0')
			break;
	}
}
1404 void
1405 decode_uri_string(struct string *string)
1407 decode_uri(string->source);
1408 string->length = strlen(string->source);
/* Decode %XX escapes in @src in place, then overwrite every byte that is
 * not printable, or that is a control character, with '*' so the result
 * can be shown to the user. */
void
decode_uri_for_display(unsigned char *src)
{
	unsigned char *p;

	decode_uri(src);

	for (p = src; *p != '\0'; p++) {
		if (isprint(*p) && !iscntrl(*p))
			continue;

		*p = '*';
	}
}
1421 void
1422 decode_uri_string_for_display(struct string *string)
1424 decode_uri_for_display(string->source);
1425 string->length = strlen(string->source);
/* URI list */

/* Granularity argument passed to mem_align_alloc() when the list grows.
 * NOTE(review): presumably a power-of-two-minus-one mask (grow in steps of
 * four entries) -- confirm against mem_align_alloc(). */
#define URI_LIST_GRANULARITY 0x3

/* Make sure @uri_list_ has room for one more entry than its current size.
 * Evaluates to the result of mem_align_alloc(), which the caller tests for
 * failure. Note that the argument is evaluated several times. */
#define realloc_uri_list(uri_list_) \
	mem_align_alloc(&(uri_list_)->uris, (uri_list_)->size, \
			(uri_list_)->size + 1, URI_LIST_GRANULARITY)
1437 struct uri *
1438 add_to_uri_list(struct uri_list *list, struct uri *uri)
1440 if (!realloc_uri_list(list))
1441 return NULL;
1443 list->uris[list->size++] = get_uri_reference(uri);
1445 return uri;
1448 void
1449 free_uri_list(struct uri_list *list)
1451 struct uri *uri;
1452 int index;
1454 if (!list->uris) return;
1456 foreach_uri (uri, index, list) {
1457 done_uri(uri);
1460 mem_free_set(&list->uris, NULL);
1461 list->size = 0;
1464 /* URI cache */
1466 struct uri_cache_entry {
1467 struct uri uri;
1468 unsigned char string[1];
1471 struct uri_cache {
1472 struct hash *map;
1473 struct object object;
1476 static struct uri_cache uri_cache;
#ifdef CONFIG_DEBUG
/* Debug-only check: raise an INTERNAL() error if the protocol or the host
 * part of @uri contains an uppercase letter. */
static inline void
check_uri_sanity(struct uri *uri)
{
	int has_upper = 0;
	int i;

	/* Protocol part: the first protocollen bytes of the URI string. */
	for (i = 0; i < uri->protocollen && !has_upper; i++)
		has_upper = (isupper(uri->string[i]) != 0);

	/* Host part; skipped entirely once a problem has been found. */
	for (i = 0; i < uri->hostlen && !has_upper; i++)
		has_upper = (isupper(uri->host[i]) != 0);

	if (has_upper)
		INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
}
#else
#define check_uri_sanity(uri)
#endif
1498 static inline struct uri_cache_entry *
1499 get_uri_cache_entry(unsigned char *string, int length)
1501 struct uri_cache_entry *entry;
1502 struct hash_item *item;
1504 assert(string && length > 0);
1505 if_assert_failed return NULL;
1507 item = get_hash_item(uri_cache.map, string, length);
1508 if (item) return item->value;
1510 /* Setup a new entry */
1512 entry = mem_calloc(1, sizeof(*entry) + length);
1513 if (!entry) return NULL;
1515 object_nolock(&entry->uri, "uri");
1516 memcpy(&entry->string, string, length);
1517 string = entry->string;
1519 if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1520 || !add_hash_item(uri_cache.map, string, length, entry)) {
1521 mem_free(entry);
1522 return NULL;
1525 object_lock(&uri_cache);
1527 return entry;
1530 struct uri *
1531 get_uri(unsigned char *string, enum uri_component components)
1533 struct uri_cache_entry *entry;
1535 assert(string);
1537 if (components) {
1538 struct uri uri;
1540 if (parse_uri(&uri, string) != URI_ERRNO_OK)
1541 return NULL;
1543 return get_composed_uri(&uri, components);
1546 if (!is_object_used(&uri_cache)) {
1547 uri_cache.map = init_hash8();
1548 if (!uri_cache.map) return NULL;
1549 object_nolock(&uri_cache, "uri_cache");
1552 entry = get_uri_cache_entry(string, strlen(string));
1553 if (!entry) {
1554 if (!is_object_used(&uri_cache))
1555 free_hash(&uri_cache.map);
1556 return NULL;
1559 check_uri_sanity(&entry->uri);
1560 object_nolock(&entry->uri, "uri");
1561 object_lock(&entry->uri);
1563 return &entry->uri;
1566 void
1567 done_uri(struct uri *uri)
1569 unsigned char *string = struri(uri);
1570 int length = strlen(string);
1571 struct hash_item *item;
1572 struct uri_cache_entry *entry;
1574 assert(is_object_used(&uri_cache));
1576 object_unlock(uri);
1577 if (is_object_used(uri)) return;
1579 item = get_hash_item(uri_cache.map, string, length);
1580 entry = item ? item->value : NULL;
1582 assertm(entry, "Releasing unknown URI [%s]", string);
1583 del_hash_item(uri_cache.map, item);
1584 mem_free(entry);
1586 /* Last URI frees the cache */
1587 object_unlock(&uri_cache);
1588 if (!is_object_used(&uri_cache))
1589 free_hash(&uri_cache.map);