1 /* URL parser and translator; implementation of RFC 2396. */
15 #include <sys/types.h>
17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
20 #ifdef HAVE_SYS_SOCKET_H
21 #include <sys/socket.h>
23 #ifdef HAVE_NETINET_IN_H
24 #include <netinet/in.h>
26 #ifdef HAVE_ARPA_INET_H
27 #include <arpa/inet.h>
32 #include "main/object.h"
33 #include "protocol/protocol.h"
34 #include "protocol/uri.h"
35 #include "util/conv.h"
36 #include "util/error.h"
37 #include "util/file.h"
38 #include "util/hash.h"
39 #include "util/memory.h"
40 #include "util/string.h"
44 end_of_dir(unsigned char c
)
46 return c
== POST_CHAR
|| c
== '#' || c
== ';' || c
== '?';
50 is_uri_dir_sep(struct uri
*uri
, unsigned char pos
)
52 return (uri
->protocol
== PROTOCOL_FILE
? dir_sep(pos
) : pos
== '/');
57 is_ip_address(unsigned char *address
, int addresslen
)
59 /* The @address has well defined limits so it would be a shame to
61 unsigned char buffer
[IP_ADDRESS_BUFFER_SIZE
];
63 if (addresslen
>= sizeof(buffer
))
66 safe_strncpy(buffer
, address
, addresslen
+ 1);
71 struct sockaddr_in6 addr6
;
73 if (inet_pton(AF_INET6
, buffer
, &addr6
.sin6_addr
) > 0)
76 #endif /* CONFIG_IPV6 */
80 if (inet_pton(AF_INET
, buffer
, &addr4
) > 0)
86 /* FIXME: Is this ever the case? */
88 #endif /* HAVE_INET_PTON */
93 end_with_known_tld(unsigned char *s
, int slen
)
96 static const unsigned char *tld
[] =
97 { "com", "edu", "net",
102 "name", "pro", NULL
};
104 if (!slen
) return -1;
105 if (slen
< 0) slen
= strlen(s
);
107 for (i
= 0; tld
[i
]; i
++) {
108 int tldlen
= strlen(tld
[i
]);
109 int pos
= slen
- tldlen
;
111 if (pos
>= 0 && !strncasecmp(&s
[pos
], tld
[i
], tldlen
))
118 /* XXX: this function writes to @name. */
120 check_whether_file_exists(unsigned char *name
)
122 /* Check POST_CHAR etc ... */
123 static const unsigned char chars
[] = POST_CHAR_S
"#?";
125 int namelen
= strlen(name
);
127 if (file_exists(name
))
130 for (i
= 0; i
< sizeof(chars
) - 1; i
++) {
131 unsigned char *pos
= memchr(name
, chars
[i
], namelen
);
137 exists
= file_exists(name
);
149 check_uri_file(unsigned char *name
)
151 /* Check POST_CHAR etc ... */
152 static const unsigned char chars
[] = POST_CHAR_S
"#?";
154 return strcspn(name
, chars
);
/* Appends @uristring to @string in URI-encoded form. Only as many leading
 * bytes as check_whether_file_exists() reports are handed to
 * encode_uri_string(), so fragment and query separators are not encoded.
 * Slashes are kept as they are (convert_slashes == 0). */
static void
encode_file_uri_string(struct string *string, unsigned char *uristring)
{
	encode_uri_string(string, uristring,
			  check_whether_file_exists(uristring), 0);
}
/* Returns the length of the protocol (scheme) name at the start of @url.
 *
 * The name is the longest leading run of alphanumerics, '+', '-' and '.'.
 * A single trailing digit is not counted: it is the "IP version in protocol
 * scheme name" hack (e.g. "http4:"). The length is only reported when the
 * name is followed by ':' or by such a digit; otherwise, and when there is
 * no name at all, 0 is returned. @url is only read, never modified. */
int
get_protocol_length(const unsigned char *url)
{
	const unsigned char *end = url;

	/* Seek the end of the protocol name if any. */
	/* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
	 * (but we accept "upalpha" too) */
	while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
		end++;

	/* Now we make something to support our "IP version in protocol scheme
	 * name" hack and silently chop off the last digit if it's there. */
	if (end != url && isdigit(end[-1]))
		end--;

	/* Also return 0 if there's no protocol name (@end == @url). */
	if (*end != ':' && !isdigit(*end))
		return 0;

	return (int) (end - url);
}
190 parse_uri(struct uri
*uri
, unsigned char *uristring
)
192 unsigned char *prefix_end
, *host_end
;
194 unsigned char *lbracket
, *rbracket
;
197 assertm(uristring
, "No uri to parse.");
198 memset(uri
, 0, sizeof(*uri
));
200 /* Nothing to do for an empty url. */
201 if_assert_failed
return 0;
202 if (!*uristring
) return URI_ERRNO_EMPTY
;
204 uri
->string
= uristring
;
205 uri
->protocollen
= get_protocol_length(uristring
);
208 if (!uri
->protocollen
) return URI_ERRNO_INVALID_PROTOCOL
;
210 /* Figure out whether the protocol is known */
211 uri
->protocol
= get_protocol(struri(uri
), uri
->protocollen
);
213 prefix_end
= uristring
+ uri
->protocollen
; /* ':' */
215 /* Check if there's a digit after the protocol name. */
216 if (isdigit(*prefix_end
)) {
217 uri
->ip_family
= uristring
[uri
->protocollen
] - '0';
220 if (*prefix_end
!= ':')
221 return URI_ERRNO_INVALID_PROTOCOL
;
226 if (prefix_end
[0] == '/' && prefix_end
[1] == '/') {
227 if (prefix_end
[2] == '/'
228 && get_protocol_need_slash_after_host(uri
->protocol
))
229 return URI_ERRNO_TOO_MANY_SLASHES
;
233 } else if (get_protocol_need_slashes(uri
->protocol
)) {
234 return URI_ERRNO_NO_SLASHES
;
237 if (get_protocol_free_syntax(uri
->protocol
)) {
238 uri
->data
= prefix_end
;
239 uri
->datalen
= strlen(prefix_end
);
242 } else if (uri
->protocol
== PROTOCOL_FILE
) {
243 int datalen
= check_uri_file(prefix_end
);
245 /* Extract the fragment part. */
246 if (datalen
>= 0 && prefix_end
[datalen
] == '#') {
247 uri
->fragment
= prefix_end
+ datalen
+ 1;
248 uri
->fragmentlen
= strlen(uri
->fragment
);
250 datalen
= strlen(prefix_end
);
253 uri
->data
= prefix_end
;
254 uri
->datalen
= datalen
;
262 /* Get brackets enclosing IPv6 address */
263 lbracket
= strchr(prefix_end
, '[');
265 rbracket
= strchr(lbracket
, ']');
266 /* [address] is handled only inside of hostname part (surprisingly). */
267 if (rbracket
&& rbracket
< prefix_end
+ strcspn(prefix_end
, "/"))
270 lbracket
= rbracket
= NULL
;
276 /* Possibly skip auth part */
277 host_end
= prefix_end
+ strcspn(prefix_end
, "@");
279 if (prefix_end
+ strcspn(prefix_end
, "/") > host_end
280 && *host_end
) { /* we have auth info here */
281 unsigned char *user_end
;
283 /* Allow '@' in the password component */
284 while (strcspn(host_end
+ 1, "@") < strcspn(host_end
+ 1, "/?"))
285 host_end
= host_end
+ 1 + strcspn(host_end
+ 1, "@");
287 user_end
= strchr(prefix_end
, ':');
289 if (!user_end
|| user_end
> host_end
) {
290 uri
->user
= prefix_end
;
291 uri
->userlen
= host_end
- prefix_end
;
293 uri
->user
= prefix_end
;
294 uri
->userlen
= user_end
- prefix_end
;
295 uri
->password
= user_end
+ 1;
296 uri
->passwordlen
= host_end
- user_end
- 1;
298 prefix_end
= host_end
+ 1;
303 host_end
= rbracket
+ strcspn(rbracket
, ":/?");
306 host_end
= prefix_end
+ strcspn(prefix_end
, ":/?");
310 int addrlen
= rbracket
- lbracket
- 1;
312 /* Check for valid length.
313 * addrlen >= sizeof(hostbuf) is theoretically impossible
314 * but i keep the test in case of... Safer, imho --Zas */
315 assertm(addrlen
>= 0 && addrlen
< NI_MAXHOST
,
316 "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
317 "Problems are likely to be encountered. Please report "
318 "this, it is a security bug!", addrlen
, uristring
);
319 if_assert_failed
return URI_ERRNO_IPV6_SECURITY
;
321 uri
->host
= lbracket
+ 1;
322 uri
->hostlen
= addrlen
;
326 uri
->host
= prefix_end
;
327 uri
->hostlen
= host_end
- prefix_end
;
329 /* Trim trailing '.'s */
330 if (uri
->hostlen
&& uri
->host
[uri
->hostlen
- 1] == '.')
331 return URI_ERRNO_TRAILING_DOTS
;
334 if (*host_end
== ':') { /* we have port here */
335 unsigned char *port_end
= host_end
+ 1 + strcspn(host_end
+ 1, "/");
339 uri
->port
= host_end
;
340 uri
->portlen
= port_end
- host_end
;
342 if (uri
->portlen
== 0)
343 return URI_ERRNO_NO_PORT_COLON
;
345 /* We only use 8 bits for portlen so better check */
346 if (uri
->portlen
!= port_end
- host_end
)
347 return URI_ERRNO_INVALID_PORT
;
349 /* test if port is number */
350 /* TODO: possibly lookup for the service otherwise? --pasky */
351 for (; host_end
< port_end
; host_end
++)
352 if (!isdigit(*host_end
))
353 return URI_ERRNO_INVALID_PORT
;
355 /* Check valid port value, and let show an error message
356 * about invalid url syntax. */
357 if (uri
->port
&& uri
->portlen
) {
361 n
= strtol(uri
->port
, NULL
, 10);
362 if (errno
|| !uri_port_is_valid(n
))
363 return URI_ERRNO_INVALID_PORT
;
367 if (*host_end
== '/') {
370 } else if (get_protocol_need_slash_after_host(uri
->protocol
)) {
371 /* The need for slash after the host component depends on the
372 * need for a host component. -- The dangerous mind of Jonah */
374 return URI_ERRNO_NO_HOST
;
376 return URI_ERRNO_NO_HOST_SLASH
;
379 /* Look for #fragment or POST_CHAR */
380 prefix_end
= host_end
+ strcspn(host_end
, "#" POST_CHAR_S
);
381 uri
->data
= host_end
;
382 uri
->datalen
= prefix_end
- host_end
;
384 if (*prefix_end
== '#') {
385 uri
->fragment
= prefix_end
+ 1;
386 uri
->fragmentlen
= strcspn(uri
->fragment
, POST_CHAR_S
);
387 prefix_end
= uri
->fragment
+ uri
->fragmentlen
;
390 if (*prefix_end
== POST_CHAR
) {
391 uri
->post
= prefix_end
+ 1;
398 get_uri_port(struct uri
*uri
)
400 if (uri
->port
&& uri
->portlen
) {
401 unsigned char *end
= uri
->port
;
402 int port
= strtol(uri
->port
, (char **) &end
, 10);
404 if (end
!= uri
->port
) {
405 assert(uri_port_is_valid(port
));
410 return get_protocol_port(uri
->protocol
);
413 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
/* Compare two URI components given as pointer + length pairs.
 *
 * Returns 1 when the components match, 0 otherwise:
 *  - different lengths, or only one of @a and @b being NULL, is a mismatch;
 *  - two unset or empty components match;
 *  - otherwise the first @alen bytes are compared.
 * Neither buffer is modified. */
static inline int
compare_component(const unsigned char *a, int alen,
		  const unsigned char *b, int blen)
{
	/* Check that the length and the strings are both set or unset */
	if (alen != blen || !a != !b) return 0;

	/* Both are unset so that will make a perfect match. A negative length
	 * is handled here as well: handed to memcmp() it would be converted
	 * to a huge size_t byte count. */
	if (!a || alen <= 0) return 1;

	/* Let the higher forces decide */
	return !memcmp(a, b, (size_t) alen);
}
428 #define wants(x) (components & (x))
431 compare_uri(struct uri
*a
, struct uri
*b
, enum uri_component components
)
433 if (a
== b
) return 1;
434 if (!components
) return 0;
436 assertm(can_compare_uri_components(components
),
437 "compare_uri() is a work in progress. Component unsupported");
439 return (!wants(URI_PROTOCOL
) || a
->protocol
== b
->protocol
)
440 && (!wants(URI_IP_FAMILY
) || a
->ip_family
== b
->ip_family
)
442 || compare_component(a
->user
, a
->userlen
, b
->user
, b
->userlen
))
443 && (!wants(URI_PASSWORD
)
444 || compare_component(a
->password
, a
->passwordlen
, b
->password
, b
->passwordlen
))
446 || compare_component(a
->host
, a
->hostlen
, b
->host
, b
->hostlen
))
448 || compare_component(a
->port
, a
->portlen
, b
->port
, b
->portlen
))
450 || compare_component(a
->data
, a
->datalen
, b
->data
, b
->datalen
))
451 && (!wants(URI_FRAGMENT
)
452 || compare_component(a
->fragment
, a
->fragmentlen
, b
->fragment
, b
->fragmentlen
))
454 || compare_component(a
->post
, a
->post
? strlen(a
->post
) : 0, b
->post
, b
->post
? strlen(b
->post
) : 0));
458 /* We might need something more intelligent than this Swiss army knife. */
460 add_uri_to_string(struct string
*string
, struct uri
*uri
,
461 enum uri_component components
)
463 /* Custom or unknown keep the URI untouched. */
464 if (uri
->protocol
== PROTOCOL_UNKNOWN
)
465 return add_to_string(string
, struri(uri
));
467 if (wants(URI_PROTOCOL
)) {
468 add_bytes_to_string(string
, uri
->string
, uri
->protocollen
);
469 if (wants(URI_IP_FAMILY
) && uri
->ip_family
)
470 add_long_to_string(string
, uri
->ip_family
);
471 add_char_to_string(string
, ':');
472 if (get_protocol_need_slashes(uri
->protocol
))
473 add_to_string(string
, "//");
476 if (wants(URI_USER
) && uri
->userlen
) {
477 add_bytes_to_string(string
, uri
->user
, uri
->userlen
);
479 if (wants(URI_PASSWORD
) && uri
->passwordlen
) {
480 add_char_to_string(string
, ':');
481 add_bytes_to_string(string
, uri
->password
,
485 add_char_to_string(string
, '@');
488 if (wants(URI_HOST
) && uri
->hostlen
) {
492 /* Rationale for wants(URI_PORT): The [notation] was invented
493 * so that you can have an IPv6 addy and a port together. So
494 * we want to use it when that happens, otherwise we need not
495 * bother (that happens only when we want it for DNS anyway).
496 * I insist on an implied elegancy of this way, but YMMV. ;-)
498 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, '[');
501 /* Support for the GNU International Domain Name library.
503 * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
505 * Now it is probably not perfect because idna_to_ascii_lz()
506 * will be using a ``zero terminated input string encoded in
507 * the current locale's character set''. Anyway I don't know
508 * how to convert anything to UTF-8 or Unicode. --jonas */
509 if (wants(URI_IDN
)) {
510 unsigned char *host
= memacpy(uri
->host
, uri
->hostlen
);
514 int code
= idna_to_ascii_lz(host
, &idname
, 0);
516 /* FIXME: Return NULL if it coughed? --jonas */
517 if (code
== IDNA_SUCCESS
) {
518 add_to_string(string
, idname
);
529 add_bytes_to_string(string
, uri
->host
, uri
->hostlen
);
532 if (uri
->ipv6
&& wants(URI_PORT
)) add_char_to_string(string
, ']');
536 if (wants(URI_PORT
) || wants(URI_DEFAULT_PORT
)) {
538 add_char_to_string(string
, ':');
539 add_bytes_to_string(string
, uri
->port
, uri
->portlen
);
541 } else if (wants(URI_DEFAULT_PORT
)
542 && uri
->protocol
!= PROTOCOL_USER
) {
543 /* For user protocols we don't know a default port.
544 * Should user protocols ports be configurable? */
545 int port
= get_protocol_port(uri
->protocol
);
547 add_char_to_string(string
, ':');
548 add_long_to_string(string
, port
);
552 /* Only add slash if we need to separate */
553 if ((wants(URI_DATA
) || wants(URI_POST
) || components
== URI_HTTP_REFERRER_HOST
)
554 && wants(~(URI_DATA
| URI_PORT
))
555 && get_protocol_need_slash_after_host(uri
->protocol
))
556 add_char_to_string(string
, '/');
558 if (wants(URI_DATA
) && uri
->datalen
)
559 add_bytes_to_string(string
, uri
->data
, uri
->datalen
);
561 /* We can not test uri->datalen here since we need to always
563 if (wants(URI_PATH
) || wants(URI_FILENAME
)) {
564 unsigned char *filename
= uri
->data
;
567 assertm(!wants(URI_FILENAME
) || components
== URI_FILENAME
,
568 "URI_FILENAME should be used alone %d", components
);
570 if (wants(URI_PATH
) && !is_uri_dir_sep(uri
, *filename
)) {
571 #ifdef CONFIG_OS_WIN32
572 if (uri
->protocol
!= PROTOCOL_FILE
)
574 /* FIXME: Add correct separator */
575 add_char_to_string(string
, '/');
578 if (!uri
->datalen
) return string
;
580 for (pos
= filename
; *pos
&& !end_of_dir(*pos
); pos
++)
581 if (wants(URI_FILENAME
) && is_uri_dir_sep(uri
, *pos
))
584 return add_bytes_to_string(string
, filename
, pos
- filename
);
587 if (wants(URI_QUERY
) && uri
->datalen
) {
588 unsigned char *query
= memchr(uri
->data
, '?', uri
->datalen
);
590 assertm(URI_QUERY
== components
,
591 "URI_QUERY should be used alone %d", components
);
593 if (!query
) return string
;
596 /* Check fragment and POST_CHAR */
597 return add_bytes_to_string(string
, query
, strcspn(query
, "#" POST_CHAR_S
));
600 if (wants(URI_FRAGMENT
) && uri
->fragmentlen
) {
601 add_char_to_string(string
, '#');
602 add_bytes_to_string(string
, uri
->fragment
, uri
->fragmentlen
);
605 if (wants(URI_POST
) && uri
->post
) {
606 add_char_to_string(string
, POST_CHAR
);
607 add_to_string(string
, uri
->post
);
609 } else if (wants(URI_POST_INFO
) && uri
->post
) {
610 if (!strncmp(uri
->post
, "text/plain", 10)) {
611 add_to_string(string
, " (PLAIN TEXT DATA)");
613 } else if (!strncmp(uri
->post
, "multipart/form-data;", 20)) {
614 add_to_string(string
, " (MULTIPART FORM DATA)");
617 add_to_string(string
, " (POST DATA)");
628 get_uri_string(struct uri
*uri
, enum uri_component components
)
630 struct string string
;
632 if (init_string(&string
)
633 && add_uri_to_string(&string
, uri
, components
))
634 return string
.source
;
636 done_string(&string
);
642 add_string_uri_to_string(struct string
*string
, unsigned char *uristring
,
643 enum uri_component components
)
647 if (parse_uri(&uri
, uristring
) != URI_ERRNO_OK
)
650 return add_uri_to_string(string
, &uri
, components
);
654 #define normalize_uri_reparse(str) normalize_uri(NULL, str)
655 #define normalize_uri_noparse(uri) normalize_uri(uri, struri(uri))
658 normalize_uri(struct uri
*uri
, unsigned char *uristring
)
660 unsigned char *parse_string
= uristring
;
661 unsigned char *src
, *dest
, *path
;
663 int parse
= (uri
== NULL
);
664 struct uri uri_struct
;
666 if (!uri
) uri
= &uri_struct
;
668 /* We need to get the real (proxied) URI but lowercase relevant URI
669 * parts along the way. */
671 if (parse
&& parse_uri(uri
, parse_string
) != URI_ERRNO_OK
)
676 /* This is maybe not the right place, but both join_urls() and
677 * get_translated_uri() through translate_url() calls this
678 * function and then it already works on and modifies an
680 convert_to_lowercase(uri
->string
, uri
->protocollen
);
681 if (uri
->hostlen
) convert_to_lowercase(uri
->host
, uri
->hostlen
);
684 parse_string
= uri
->data
;
685 } while (uri
->protocol
== PROTOCOL_PROXY
);
687 if (get_protocol_free_syntax(uri
->protocol
))
690 if (uri
->protocol
!= PROTOCOL_UNKNOWN
)
691 need_slash
= get_protocol_need_slash_after_host(uri
->protocol
);
693 /* We want to start at the first slash to also reduce URIs like
694 * http://host//index.html to http://host/index.html */
695 path
= uri
->data
- need_slash
;
698 /* This loop mangles the URI string by removing directory elevators and
699 * other cruft. Example: /.././etc////..//usr/ -> /usr/ */
701 /* If the following pieces are the LAST parts of URL, we remove
702 * them as well. See RFC 1808 for details. */
704 if (end_of_dir(src
[0])) {
705 /* URL data contains no more path. */
706 memmove(dest
, src
, strlen(src
) + 1);
710 if (!is_uri_dir_sep(uri
, src
[0])) {
711 /* This is to reduce indentation */
713 } else if (src
[1] == '.') {
715 /* /. - skip the dot */
720 } else if (is_uri_dir_sep(uri
, src
[2])) {
721 /* /./ - strip that.. */
725 } else if (src
[2] == '.'
726 && (is_uri_dir_sep(uri
, src
[3]) || !src
[3])) {
727 /* /../ or /.. - skip it and preceding element. */
729 /* First back out the last incrementation of
730 * @dest (dest++) to get the position that was
731 * last assigned to. */
732 if (dest
> path
) dest
--;
734 /* @dest might be pointing to a dir separator
735 * so we decrement before any testing. */
736 while (dest
> path
) {
738 if (is_uri_dir_sep(uri
, *dest
)) break;
742 /* /.. - add ending slash and stop */
752 } else if (is_uri_dir_sep(uri
, src
[1])) {
753 /* // - ignore first '/'. */
758 /* We don't want to access memory past the NUL char. */
766 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
767 * of just the complete path to file/directory, which the dumb 'file' protocol
768 * backend can understand. No host parts etc, that is what this function is
769 * supposed to chew. */
771 transform_file_url(struct uri
*uri
, unsigned char *cwd
)
773 unsigned char *path
= uri
->data
;
775 assert(uri
->protocol
== PROTOCOL_FILE
&& uri
->data
);
777 /* Sort out the host part. We currently support only host "localhost"
778 * (plus empty host part will be assumed to be "localhost" as well).
779 * As our extensions, '.' will reference to the cwd on localhost
780 * (originally, when the first thing after file:// wasn't "localhost/",
781 * we assumed the cwd as well, and pretended that there's no host part
782 * at all) and '..' to the directory parent to cwd. Another extension
783 * is that if this is a DOS-like system, the first char in two-char
784 * host part is uppercase letter and the second char is a colon, it is
785 * assumed to be a local disk specification. */
786 /* TODO: Use FTP for non-localhost hosts. --pasky */
788 /* For URL "file://", we open the current directory. Some other
789 * browsers instead open root directory, but AFAIK the standard does
790 * not specify that and this was the original behaviour and it is more
791 * consistent with our file://./ notation. */
793 /* Who would name their file/dir '...' ? */
794 if (*path
== '.' || !*path
) {
797 if (!init_string(&dir
))
800 encode_uri_string(&dir
, cwd
, -1, 0);
802 /* Either we will end up with '//' and translate_directories()
803 * will shorten it or the '/' will mark the inserted cwd as a
805 if (*path
== '.') *path
= '/';
807 /* Insert the current working directory. */
808 /* The offset is 7 == sizeof("file://") - 1. */
809 insert_in_string(&struri(uri
), 7, dir
.source
, dir
.length
);
816 if (isasciialpha(path
[0]) && path
[1] == ':' && dir_sep(path
[2]))
820 for (; *path
&& !dir_sep(*path
); path
++);
822 /* FIXME: We will in fact assume localhost even for non-local hosts,
823 * until we will support the FTP transformation. --pasky */
825 memmove(uri
->data
, path
, strlen(path
) + 1);
829 static unsigned char *translate_url(unsigned char *url
, unsigned char *cwd
);
832 join_urls(struct uri
*base
, unsigned char *rel
)
834 unsigned char *uristring
, *path
;
840 /* TODO: Support for ';' ? (see the RFC) --pasky */
842 /* For '#', '?' and '//' we could use get_uri_string() but it might be
843 * too expensive since it uses granular allocation scheme. I wouldn't
844 * personally mind tho' because it would be cleaner. --jonas */
846 /* Strip fragment and post part from the base URI and append
847 * the fragment string in @rel. */
848 length
= base
->fragment
849 ? base
->fragment
- struri(base
) - 1
850 : get_real_uri_length(base
);
852 } else if (rel
[0] == '?') {
853 /* Strip query, fragment and post part from the base URI and
854 * append the query string in @rel. */
855 length
= base
->fragment
? base
->fragment
- struri(base
) - 1
856 : get_real_uri_length(base
);
858 uristring
= memchr(base
->data
, '?', base
->datalen
);
859 if (uristring
) length
= uristring
- struri(base
);
861 } else if (rel
[0] == '/' && rel
[1] == '/') {
862 if (!get_protocol_need_slashes(base
->protocol
))
865 /* Get `<protocol>:' from the base URI and append the `//' part
867 length
= base
->protocollen
+ 1;
869 /* We need to sanitize the relative part and add stuff like
874 /* If one of the tests above set @length to something useful */
876 uristring
= memacpy(struri(base
), length
);
877 if (!uristring
) return NULL
;
879 add_to_strn(&uristring
, rel
);
882 unsigned char *translated
;
884 translated
= translate_url(uristring
, NULL
);
888 return normalize_uri_reparse(uristring
);
891 /* Check if there is some protocol name to go for */
892 length
= get_protocol_length(rel
);
894 switch (get_protocol(rel
, length
)) {
895 case PROTOCOL_UNKNOWN
:
897 /* Mysteriously proxy URIs are breaking here ... */
901 /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
902 * to translate_url(). */
904 uristring
= translate_url(rel
, NULL
);
905 if (uristring
) return uristring
;
909 assertm(base
->data
, "bad base url");
910 if_assert_failed
return NULL
;
914 /* Either is path blank, but we've slash char before, or path is not
915 * blank, but doesn't start by a slash (if we'd just stay along with
916 * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
917 * should be enough, but I'm not sure and I don't want to break
918 * anything --pasky). */
919 /* We skip first char of URL ('/') in parse_url() (ARGH). This
920 * is reason of all this bug-bearing magic.. */
922 if (!is_uri_dir_sep(base
, *path
)) path
--;
924 if (is_uri_dir_sep(base
, path
[-1])) path
--;
927 if (!is_uri_dir_sep(base
, rel
[0])) {
928 unsigned char *path_end
;
930 /* The URL is relative. */
933 /* There's no path in the URL, but we're going to add
934 * something there, and the something doesn't start by
935 * a slash. So we need to insert a slash after the base
936 * URL. Clever, eh? ;) */
940 for (path_end
= path
; *path_end
; path_end
++) {
941 if (end_of_dir(*path_end
)) break;
942 /* Modify the path pointer, so that it'll always point
943 * above the last '/' in the URL; later, we'll copy the
944 * URL only _TO_ this point, and anything after last
945 * slash will be substituted by 'rel'. */
946 if (is_uri_dir_sep(base
, *path_end
))
951 length
= path
- struri(base
);
952 uristring
= mem_alloc(length
+ strlen(rel
) + add_slash
+ 1);
953 if (!uristring
) return NULL
;
955 memcpy(uristring
, struri(base
), length
);
956 if (add_slash
) uristring
[length
] = '/';
957 strcpy(uristring
+ length
+ add_slash
, rel
);
959 return normalize_uri_reparse(uristring
);
963 /* Tries to figure out what protocol @newurl might be specifying by checking if
964 * it exists as a file locally or by checking parts of the host name. */
966 find_uri_protocol(unsigned char *newurl
)
970 /* First see if it is a file so filenames that look like hostnames
971 * won't confuse us below. */
972 if (check_whether_file_exists(newurl
) >= 0) return PROTOCOL_FILE
;
974 /* Yes, it would be simpler to make test for IPv6 address first,
975 * but it would result in confusing mix of ifdefs ;-). */
976 /* FIXME: Ideas for improving protocol detection
978 * - Handle common hostnames. It could be part of the protocol backend
979 * structure. [ www -> http, irc -> irc, news -> nntp, ... ]
981 * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
984 ch
= newurl
+ strcspn(newurl
, ".:/@");
986 || (*ch
== ':' && *newurl
!= '[' && strchr(newurl
, '@'))
987 || !strncasecmp(newurl
, "ftp.", 4)) {
988 /* Contains user/password/ftp-hostname */
992 } else if (*newurl
== '[' && *ch
== ':') {
993 /* Candidate for IPv6 address */
994 unsigned char *bracket2
, *colon2
;
997 bracket2
= strchr(ch
, ']');
998 colon2
= strchr(ch
, ':');
999 if (bracket2
&& colon2
&& bracket2
> colon2
)
1000 return PROTOCOL_HTTP
;
1003 } else if (*newurl
!= '.' && *ch
== '.') {
1004 /* Contains domain name? */
1005 unsigned char *host_end
, *domain
;
1006 unsigned char *ipscan
;
1008 /* Process the hostname */
1009 for (domain
= ch
+ 1;
1010 *(host_end
= domain
+ strcspn(domain
, ".:/?")) == '.';
1011 domain
= host_end
+ 1);
1014 for (ipscan
= ch
; isdigit(*ipscan
) || *ipscan
== '.';
1017 if (!*ipscan
|| *ipscan
== ':' || *ipscan
== '/')
1018 return PROTOCOL_HTTP
;
1020 /* It's two-letter or known TLD? */
1021 if (host_end
- domain
== 2
1022 || end_with_known_tld(domain
, host_end
- domain
) >= 0)
1023 return PROTOCOL_HTTP
;
1026 return PROTOCOL_UNKNOWN
;
1030 #define MAX_TRANSLATION_ATTEMPTS 32
1032 /* Returns a URI string that can be used internally. Adding protocol prefix,
1033 * missing slashes etc. */
1034 static unsigned char *
1035 translate_url(unsigned char *url
, unsigned char *cwd
)
1037 unsigned char *newurl
;
1039 enum uri_errno uri_errno
, prev_errno
= URI_ERRNO_EMPTY
;
1042 /* Strip starting spaces */
1043 while (*url
== ' ') url
++;
1044 if (!*url
) return NULL
;
1046 newurl
= expand_tilde(url
); /* XXX: Post data copy. */
1047 if (!newurl
) return NULL
;
1050 /* Yay a goto loop. If we get some URI parse error and try to
1051 * fix it we go back to here and try again. */
1052 /* Ordinary parse */
1053 uri_errno
= parse_uri(&uri
, newurl
);
1055 /* Bail out if the same error occurs twice */
1056 if (uri_errno
== prev_errno
|| retries
++ > MAX_TRANSLATION_ATTEMPTS
) {
1057 if (retries
> MAX_TRANSLATION_ATTEMPTS
) {
1058 ERROR("Maximum number of parsing attempts exceeded "
1065 prev_errno
= uri_errno
;
1067 switch (uri_errno
) {
1069 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1070 * interpreted as the protocol name. */
1071 if (uri
.protocol
== PROTOCOL_UNKNOWN
) {
1072 enum protocol protocol
= find_uri_protocol(newurl
);
1074 /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1076 if (protocol
!= PROTOCOL_UNKNOWN
) {
1079 if (!init_string(&str
)) return NULL
;
1083 add_to_string(&str
, "ftp://");
1084 encode_uri_string(&str
, newurl
, -1, 0);
1088 add_to_string(&str
, "http://");
1089 add_to_string(&str
, newurl
);
1092 case PROTOCOL_UNKNOWN
:
1097 add_to_string(&str
, "file://");
1098 if (!dir_sep(*newurl
))
1099 add_to_string(&str
, "./");
1101 add_to_string(&str
, newurl
);
1105 newurl
= str
.source
;
1107 /* Work around the infinite loop prevention */
1108 prev_errno
= URI_ERRNO_EMPTY
;
1113 /* If file:// URI is transformed we need to reparse. */
1114 if (uri
.protocol
== PROTOCOL_FILE
&& cwd
&& *cwd
1115 && transform_file_url(&uri
, cwd
))
1116 return normalize_uri_reparse(struri(&uri
));
1118 /* Translate the proxied URI too if proxy:// */
1119 if (uri
.protocol
== PROTOCOL_PROXY
) {
1120 unsigned char *data
= translate_url(uri
.data
, cwd
);
1121 int pos
= uri
.data
- struri(&uri
);
1124 struri(&uri
)[pos
] = 0;
1125 insert_in_string(&struri(&uri
), pos
, data
, strlen(data
));
1127 return normalize_uri_reparse(struri(&uri
));
1130 return normalize_uri_noparse(&uri
);
1132 case URI_ERRNO_TOO_MANY_SLASHES
:
1134 unsigned char *from
, *to
;
1136 assert(uri
.string
[uri
.protocollen
] == ':'
1137 && uri
.string
[uri
.protocollen
+ 1] == '/'
1138 && uri
.string
[uri
.protocollen
+ 2] == '/');
1140 from
= to
= uri
.string
+ uri
.protocollen
+ 3;
1141 while (*from
== '/') from
++;
1144 memmove(to
, from
, strlen(from
) + 1);
1147 case URI_ERRNO_NO_SLASHES
:
1149 /* Try prefix:some.url -> prefix://some.url.. */
1152 /* Check if only one '/' is needed. */
1153 if (uri
.string
[uri
.protocollen
+ 1] == '/')
1156 insert_in_string(&newurl
, uri
.protocollen
+ 1, "//", slashes
);
1159 case URI_ERRNO_TRAILING_DOTS
:
1161 /* Trim trailing '.'s */
1162 unsigned char *from
= uri
.host
+ uri
.hostlen
;
1163 unsigned char *to
= from
;
1165 assert(uri
.host
< to
&& to
[-1] == '.' && *from
!= '.');
1167 while (uri
.host
< to
&& to
[-1] == '.') to
--;
1170 memmove(to
, from
, strlen(from
) + 1);
1173 case URI_ERRNO_NO_PORT_COLON
:
1174 assert(uri
.portlen
== 0
1175 && uri
.string
< uri
.port
1176 && uri
.port
[-1] == ':');
1178 memmove(uri
.port
- 1, uri
.port
, strlen(uri
.port
) + 1);
1181 case URI_ERRNO_NO_HOST_SLASH
:
1183 int offset
= uri
.port
1184 ? uri
.port
+ uri
.portlen
- struri(&uri
)
1185 : uri
.host
+ uri
.hostlen
- struri(&uri
) + uri
.ipv6
/* ']' */;
1187 assertm(uri
.host
, "uri.host not set after no host slash error");
1188 insert_in_string(&newurl
, offset
, "/", 1);
1191 case URI_ERRNO_INVALID_PROTOCOL
:
1193 /* No protocol name */
1194 enum protocol protocol
= find_uri_protocol(newurl
);
1197 if (!init_string(&str
)) return NULL
;
1201 add_to_string(&str
, "ftp://");
1202 encode_uri_string(&str
, newurl
, -1, 0);
1206 add_to_string(&str
, "http://");
1207 add_to_string(&str
, newurl
);
1210 case PROTOCOL_UNKNOWN
:
1211 /* We default to file:// even though we already
1212 * tested if the file existed since it will give
1213 * a "No such file or directory" error. which
1214 * might better hint the user that there was
1215 * problem figuring out the URI. */
1218 add_to_string(&str
, "file://");
1219 if (!dir_sep(*newurl
))
1220 add_to_string(&str
, "./");
1222 encode_file_uri_string(&str
, newurl
);
1226 newurl
= str
.source
;
1230 case URI_ERRNO_EMPTY
:
1231 case URI_ERRNO_IPV6_SECURITY
:
1232 case URI_ERRNO_NO_HOST
:
1233 case URI_ERRNO_INVALID_PORT
:
1234 case URI_ERRNO_INVALID_PORT_RANGE
:
1235 /* None of these can be handled properly. */
1245 get_composed_uri(struct uri
*uri
, enum uri_component components
)
1247 unsigned char *string
;
1250 if_assert_failed
return NULL
;
1252 string
= get_uri_string(uri
, components
);
1253 if (!string
) return NULL
;
1255 uri
= get_uri(string
, 0);
1262 get_translated_uri(unsigned char *uristring
, unsigned char *cwd
)
1266 uristring
= translate_url(uristring
, cwd
);
1267 if (!uristring
) return NULL
;
1269 uri
= get_uri(uristring
, 0);
1270 mem_free(uristring
);
1277 get_extension_from_uri(struct uri
*uri
)
1279 unsigned char *extension
= NULL
;
1281 unsigned char *pos
= uri
->data
;
1285 for (; *pos
&& !end_of_dir(*pos
); pos
++) {
1286 if (!afterslash
&& !extension
&& *pos
== '.') {
1288 } else if (is_uri_dir_sep(uri
, *pos
)) {
1296 if (extension
&& extension
< pos
)
1297 return memacpy(extension
, pos
- extension
);
1302 /* URI encoding, escaping unallowed characters. */
/* Tell whether @c may be copied into a URI without being escaped: anything
 * isident() accepts plus the punctuation marks listed below.
 * Returns 1 if so, 0 otherwise. */
static inline int
safe_char(unsigned char c)
{
	/* RFC 2396, Page 8, Section 2.3 ;-) */
	if (isident(c))
		return 1;

	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;
	default:
		return 0;
	}
}
1312 encode_uri_string(struct string
*string
, unsigned char *name
, int namelen
,
1313 int convert_slashes
)
1321 if (namelen
< 0) namelen
= strlen(name
);
1323 for (end
= name
+ namelen
; name
< end
; name
++) {
1325 /* This is probably correct only for query part of URI..? */
1326 if (*name
== ' ') add_char_to_string(data
, len
, '+');
1329 if (safe_char(*name
) || (!convert_slashes
&& *name
== '/')) {
1330 add_char_to_string(string
, *name
);
1333 n
[1] = hx((((int) *name
) & 0xF0) >> 4);
1334 n
[2] = hx(((int) *name
) & 0xF);
1335 add_bytes_to_string(string
, n
, sizeof(n
) - 1);
1341 encode_win32_uri_string(struct string
*string
, unsigned char *name
, int namelen
)
1349 if (namelen
< 0) namelen
= strlen(name
);
1351 for (end
= name
+ namelen
; name
< end
; name
++) {
1352 if (safe_char(*name
) || *name
== ':' || *name
== '\\') {
1353 add_char_to_string(string
, *name
);
1356 n
[1] = hx((((int) *name
) & 0xF0) >> 4);
1357 n
[2] = hx(((int) *name
) & 0xF);
1358 add_bytes_to_string(string
, n
, sizeof(n
) - 1);
1363 /* This function is evil, it modifies its parameter. */
1364 /* XXX: but decoded string is _never_ longer than encoded string so it's an
1365 * efficient way to do that, imho. --Zas */
1367 decode_uri(unsigned char *src
)
1369 unsigned char *dst
= src
;
1376 int x1
= unhx(*src
);
1379 int x2
= unhx(*(src
+ 1));
1382 x1
= (x1
<< 4) + x2
;
1383 if (x1
!= 0) { /* don't allow %00 */
1384 c
= (unsigned char) x1
;
1391 } else if (c
== '+') {
1392 /* As the comment in encode_uri_string suggests, '+'
1393 * should only be decoded in the query part of a URI
1394 * (should that be 'URL'?). I'm not bold enough to
1395 * disable this code, tho. -- Miciah */
1401 } while (c
!= '\0');
1405 decode_uri_string(struct string
*string
)
1407 decode_uri(string
->source
);
1408 string
->length
= strlen(string
->source
);
1412 decode_uri_for_display(unsigned char *src
)
1417 if (!isprint(*src
) || iscntrl(*src
))
1422 decode_uri_string_for_display(struct string
*string
)
1424 decode_uri_for_display(string
->source
);
1425 string
->length
= strlen(string
->source
);
1431 #define URI_LIST_GRANULARITY 0x3
1433 #define realloc_uri_list(list) \
1434 mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1435 URI_LIST_GRANULARITY)
1438 add_to_uri_list(struct uri_list
*list
, struct uri
*uri
)
1440 if (!realloc_uri_list(list
))
1443 list
->uris
[list
->size
++] = get_uri_reference(uri
);
1449 free_uri_list(struct uri_list
*list
)
1454 if (!list
->uris
) return;
1456 foreach_uri (uri
, index
, list
) {
1460 mem_free_set(&list
->uris
, NULL
);
1466 struct uri_cache_entry
{
1468 unsigned char string
[1];
1473 struct object object
;
1476 static struct uri_cache uri_cache
;
1480 check_uri_sanity(struct uri
*uri
)
1484 for (pos
= 0; pos
< uri
->protocollen
; pos
++)
1485 if (isupper(uri
->string
[pos
])) goto error
;
1488 for (pos
= 0; pos
< uri
->hostlen
; pos
++)
1489 if (isupper(uri
->host
[pos
])) goto error
;
1492 INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri
));
1495 #define check_uri_sanity(uri)
1498 static inline struct uri_cache_entry
*
1499 get_uri_cache_entry(unsigned char *string
, int length
)
1501 struct uri_cache_entry
*entry
;
1502 struct hash_item
*item
;
1504 assert(string
&& length
> 0);
1505 if_assert_failed
return NULL
;
1507 item
= get_hash_item(uri_cache
.map
, string
, length
);
1508 if (item
) return item
->value
;
1510 /* Setup a new entry */
1512 entry
= mem_calloc(1, sizeof(*entry
) + length
);
1513 if (!entry
) return NULL
;
1515 object_nolock(&entry
->uri
, "uri");
1516 memcpy(&entry
->string
, string
, length
);
1517 string
= entry
->string
;
1519 if (parse_uri(&entry
->uri
, string
) != URI_ERRNO_OK
1520 || !add_hash_item(uri_cache
.map
, string
, length
, entry
)) {
1525 object_lock(&uri_cache
);
1531 get_uri(unsigned char *string
, enum uri_component components
)
1533 struct uri_cache_entry
*entry
;
1540 if (parse_uri(&uri
, string
) != URI_ERRNO_OK
)
1543 return get_composed_uri(&uri
, components
);
1546 if (!is_object_used(&uri_cache
)) {
1547 uri_cache
.map
= init_hash8();
1548 if (!uri_cache
.map
) return NULL
;
1549 object_nolock(&uri_cache
, "uri_cache");
1552 entry
= get_uri_cache_entry(string
, strlen(string
));
1554 if (!is_object_used(&uri_cache
))
1555 free_hash(&uri_cache
.map
);
1559 check_uri_sanity(&entry
->uri
);
1560 object_nolock(&entry
->uri
, "uri");
1561 object_lock(&entry
->uri
);
1567 done_uri(struct uri
*uri
)
1569 unsigned char *string
= struri(uri
);
1570 int length
= strlen(string
);
1571 struct hash_item
*item
;
1572 struct uri_cache_entry
*entry
;
1574 assert(is_object_used(&uri_cache
));
1577 if (is_object_used(uri
)) return;
1579 item
= get_hash_item(uri_cache
.map
, string
, length
);
1580 entry
= item
? item
->value
: NULL
;
1582 assertm(entry
, "Releasing unknown URI [%s]", string
);
1583 del_hash_item(uri_cache
.map
, item
);
1586 /* Last URI frees the cache */
1587 object_unlock(&uri_cache
);
1588 if (!is_object_used(&uri_cache
))
1589 free_hash(&uri_cache
.map
);