src/protocol/uri.c

   1 /* URL parser and translator; implementation of RFC 2396. */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #ifdef HAVE_IDNA_H
  10 #include <idna.h>
  11 #endif
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/types.h>
  16 #ifdef HAVE_NETDB_H
  17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
  18 #endif
  19
  20 #ifdef HAVE_SYS_SOCKET_H
  21 #include <sys/socket.h>
  22 #endif
  23 #ifdef HAVE_NETINET_IN_H
  24 #include <netinet/in.h>
  25 #endif
  26 #ifdef HAVE_ARPA_INET_H
  27 #include <arpa/inet.h>
  28 #endif
  29
  30 #include "elinks.h"
  31
  32 #include "main/object.h"
  33 #include "protocol/protocol.h"
  34 #include "protocol/uri.h"
  35 #include "util/conv.h"
  36 #include "util/error.h"
  37 #include "util/file.h"
  38 #include "util/hash.h"
  39 #include "util/memory.h"
  40 #include "util/string.h"
  41
  42
  43 static inline int
  44 end_of_dir(unsigned char c)
  45 {
  46         /* This used to check for c == ';' as well.  But section 3.3
  47          * of RFC 2396 explicitly says that parameters in a path
  48          * segment "are not significant to the parsing of relative
  49          * references."  */
  50         return c == POST_CHAR || c == '#' || c == '?';
  51 }
  52
  53 static inline int
  54 is_uri_dir_sep(const struct uri *uri, unsigned char pos)
  55 {
  56         return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
  57 }
  58
  59
  60 int
  61 is_in_domain(unsigned char *domain, unsigned char *server, int server_len)
  62 {
  63         int domain_len = strlen(domain);
  64         int len;
  65
  66         if (domain_len > server_len)
  67                 return 0;
  68
  69         if (domain_len == server_len)
  70                 return !c_strncasecmp(domain, server, server_len);
  71
  72         len = server_len - domain_len;
  73         if (server[len - 1] != '.')
  74                 return 0;
  75
  76         return !c_strncasecmp(domain, server + len, domain_len);
  77 }
  78
  79 int
  80 is_ip_address(const unsigned char *address, int addresslen)
  81 {
  82         /* The @address has well defined limits so it would be a shame to
  83          * allocate it. */
  84         unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
  85
  86         if (addresslen >= sizeof(buffer))
  87                 return 0;
  88
  89         safe_strncpy(buffer, address, addresslen + 1);
  90
  91 #ifdef HAVE_INET_PTON
  92 #ifdef CONFIG_IPV6
  93         {
  94                 struct sockaddr_in6 addr6;
  95
  96                 if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
  97                         return 1;
  98         }
  99 #endif /* CONFIG_IPV6 */
 100         {
 101                 struct in_addr addr4;
 102
 103                 if (inet_pton(AF_INET, buffer, &addr4) > 0)
 104                         return 1;
 105         }
 106
 107         return 0;
 108 #else
 109         /* FIXME: Is this ever the case? */
 110         return 0;
 111 #endif /* HAVE_INET_PTON */
 112 }
 113
 114
 115 int
 116 end_with_known_tld(const unsigned char *s, int slen)
 117 {
 118         int i;
 119         static const unsigned char *const tld[] =
 120         { "com", "edu", "net",
 121           "org", "gov", "mil",
 122           "int", "biz", "arpa",
 123           "aero", "coop",
 124           "info", "museum",
 125           "name", "pro", NULL };
 126
 127         if (!slen) return -1;
 128         if (slen < 0) slen = strlen(s);
 129
 130         for (i = 0; tld[i]; i++) {
 131                 int tldlen = strlen(tld[i]);
 132                 int pos = slen - tldlen;
 133
 134                 if (pos >= 0 && !c_strncasecmp(&s[pos], tld[i], tldlen))
 135                         return pos;
 136         }
 137
 138         return -1;
 139 }
 140
 141 /* XXX: this function writes to @name. */
 142 static int
 143 check_whether_file_exists(unsigned char *name)
 144 {
 145         /* Check POST_CHAR etc ... */
 146         static const unsigned char chars[] = POST_CHAR_S "#?";
 147         int i;
 148         int namelen = strlen(name);
 149
 150         if (file_exists(name))
 151                 return namelen;
 152
 153         for (i = 0; i < sizeof(chars) - 1; i++) {
 154                 unsigned char *pos = memchr(name, chars[i], namelen);
 155                 int exists;
 156
 157                 if (!pos) continue;
 158
 159                 *pos = 0;
 160                 exists = file_exists(name);
 161                 *pos = chars[i];
 162
 163                 if (exists) {
 164                         return pos - name;
 165                 }
 166         }
 167
 168         return -1;
 169 }
 170
 171 /* Encodes URIs without encoding stuff like fragments and query separators. */
 172 static void
 173 encode_file_uri_string(struct string *string, unsigned char *uristring)
 174 {
 175         int filenamelen = check_whether_file_exists(uristring);
 176
 177         encode_uri_string(string, uristring, filenamelen, 0);
 178         if (filenamelen > 0) add_to_string(string, uristring + filenamelen);
 179 }
 180
 181
 182 static inline int
 183 get_protocol_length(const unsigned char *url)
 184 {
 185         unsigned char *end = (unsigned char *) url;
 186
 187         /* Seek the end of the protocol name if any. */
 188         /* RFC1738:
 189          * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
 190          * (but per its recommendations we accept "upalpha" too) */
 191         while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
 192                 end++;
 193
 194         /* Now we make something to support our "IP version in protocol scheme
 195          * name" hack and silently chop off the last digit if it's there. The
 196          * IETF's not gonna notice I hope or it'd be going after us hard. */
 197         if (end != url && isdigit(end[-1]))
 198                 end--;
 199
 200         /* Also return 0 if there's no protocol name (@end == @url). */
 201         return (*end == ':' || isdigit(*end)) ? end - url : 0;
 202 }
 203
 204 enum uri_errno
 205 parse_uri(struct uri *uri, unsigned char *uristring)
 206 {
 207         unsigned char *prefix_end, *host_end;
 208 #ifdef CONFIG_IPV6
 209         unsigned char *lbracket, *rbracket;
 210 #endif
 211
 212         assertm(uristring != NULL, "No uri to parse.");
 213         memset(uri, 0, sizeof(*uri));
 214
 215         /* Nothing to do for an empty url. */
 216         if_assert_failed return 0;
 217         if (!*uristring) return URI_ERRNO_EMPTY;
 218
 219         uri->string = uristring;
 220         uri->protocollen = get_protocol_length(uristring);
 221
 222         /* Invalid */
 223         if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
 224
 225         /* Figure out whether the protocol is known */
 226         uri->protocol = get_protocol(struri(uri), uri->protocollen);
 227
 228         prefix_end = uristring + uri->protocollen; /* ':' */
 229
 230         /* Check if there's a digit after the protocol name. */
 231         if (isdigit(*prefix_end)) {
 232                 uri->ip_family = uristring[uri->protocollen] - '0';
 233                 prefix_end++;
 234         }
 235         if (*prefix_end != ':')
 236                 return URI_ERRNO_INVALID_PROTOCOL;
 237         prefix_end++;
 238
 239         /* Skip slashes */
 240
 241         if (prefix_end[0] == '/' && prefix_end[1] == '/') {
 242                 if (prefix_end[2] == '/'
 243                     && get_protocol_need_slash_after_host(uri->protocol))
 244                         return URI_ERRNO_TOO_MANY_SLASHES;
 245
 246                 prefix_end += 2;
 247
 248         } else if (get_protocol_need_slashes(uri->protocol)) {
 249                 return URI_ERRNO_NO_SLASHES;
 250         }
 251
 252         if (get_protocol_free_syntax(uri->protocol)) {
 253                 uri->data = prefix_end;
 254                 uri->datalen = strlen(prefix_end);
 255                 return URI_ERRNO_OK;
 256
 257         } else if (uri->protocol == PROTOCOL_FILE) {
 258                 int datalen = strcspn(prefix_end, "#" POST_CHAR_S);
 259                 unsigned char *frag_or_post = prefix_end + datalen;
 260
 261                 /* Extract the fragment part. */
 262                 if (datalen >= 0) {
 263                         if (*frag_or_post == '#') {
 264                                 uri->fragment = frag_or_post + 1;
 265                                 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
 266                                 frag_or_post = uri->fragment + uri->fragmentlen;
 267                         }
 268                         if (*frag_or_post == POST_CHAR) {
 269                                 uri->post = frag_or_post + 1;
 270                         }
 271                 } else {
 272                         datalen = strlen(prefix_end);
 273                 }
 274
 275                 /* A bit of a special case, but using the "normal" host
 276                  * parsing seems a bit scary at this point. (see bug 107). */
 277                 if (datalen > 9 && !c_strncasecmp(prefix_end, "localhost/", 10)) {
 278                         prefix_end += 9;
 279                         datalen -= 9;
 280                 }
 281
 282                 uri->data = prefix_end;
 283                 uri->datalen = datalen;
 284
 285                 return URI_ERRNO_OK;
 286         }
 287
 288         /* Isolate host */
 289
 290 #ifdef CONFIG_IPV6
 291         /* Get brackets enclosing IPv6 address */
 292         lbracket = strchr(prefix_end, '[');
 293         if (lbracket) {
 294                 rbracket = strchr(lbracket, ']');
 295                 /* [address] is handled only inside of hostname part (surprisingly). */
 296                 if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
 297                         uri->ipv6 = 1;
 298                 else
 299                         lbracket = rbracket = NULL;
 300         } else {
 301                 rbracket = NULL;
 302         }
 303 #endif
 304
 305         /* Possibly skip auth part */
 306         host_end = prefix_end + strcspn(prefix_end, "@");
 307
 308         if (prefix_end + strcspn(prefix_end, "/") > host_end
 309             && *host_end) { /* we have auth info here */
 310                 unsigned char *user_end;
 311
 312                 /* Allow '@' in the password component */
 313                 while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
 314                         host_end = host_end + 1 + strcspn(host_end + 1, "@");
 315
 316                 user_end = strchr(prefix_end, ':');
 317
 318                 if (!user_end || user_end > host_end) {
 319                         uri->user = prefix_end;
 320                         uri->userlen = host_end - prefix_end;
 321                 } else {
 322                         uri->user = prefix_end;
 323                         uri->userlen = user_end - prefix_end;
 324                         uri->password = user_end + 1;
 325                         uri->passwordlen = host_end - user_end - 1;
 326                 }
 327                 prefix_end = host_end + 1;
 328         }
 329
 330 #ifdef CONFIG_IPV6
 331         if (uri->ipv6)
 332                 host_end = rbracket + strcspn(rbracket, ":/?");
 333         else
 334 #endif
 335                 host_end = prefix_end + strcspn(prefix_end, ":/?");
 336
 337 #ifdef CONFIG_IPV6
 338         if (uri->ipv6) {
 339                 int addrlen = rbracket - lbracket - 1;
 340
 341                 /* Check for valid length.
 342                  * addrlen >= sizeof(hostbuf) is theorically impossible
 343                  * but i keep the test in case of... Safer, imho --Zas */
 344                 assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
 345                         "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
 346                         "Problems are likely to be encountered. Please report "
 347                         "this, it is a security bug!", addrlen, uristring);
 348                 if_assert_failed return URI_ERRNO_IPV6_SECURITY;
 349
 350                 uri->host = lbracket + 1;
 351                 uri->hostlen = addrlen;
 352         } else
 353 #endif
 354         {
 355                 uri->host = prefix_end;
 356                 uri->hostlen = host_end - prefix_end;
 357
 358                 /* Trim trailing '.'s */
 359                 if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
 360                         return URI_ERRNO_TRAILING_DOTS;
 361         }
 362
 363         if (*host_end == ':') { /* we have port here */
 364                 unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");
 365
 366                 host_end++;
 367
 368                 uri->port = host_end;
 369                 uri->portlen = port_end - host_end;
 370
 371                 if (uri->portlen == 0)
 372                         return URI_ERRNO_NO_PORT_COLON;
 373
 374                 /* We only use 8 bits for portlen so better check */
 375                 if (uri->portlen != port_end - host_end)
 376                         return URI_ERRNO_INVALID_PORT;
 377
 378                 /* test if port is number */
 379                 /* TODO: possibly lookup for the service otherwise? --pasky */
 380                 for (; host_end < port_end; host_end++)
 381                         if (!isdigit(*host_end))
 382                                 return URI_ERRNO_INVALID_PORT;
 383
 384                 /* Check valid port value, and let show an error message
 385                  * about invalid url syntax. */
 386                 if (uri->port && uri->portlen) {
 387                         int n;
 388
 389                         errno = 0;
 390                         n = strtol(uri->port, NULL, 10);
 391                         if (errno || !uri_port_is_valid(n))
 392                                 return URI_ERRNO_INVALID_PORT;
 393                 }
 394         }
 395
 396         if (*host_end == '/') {
 397                 host_end++;
 398
 399         } else if (get_protocol_need_slash_after_host(uri->protocol)) {
 400                 /* The need for slash after the host component depends on the
 401                  * need for a host component. -- The dangerous mind of Jonah */
 402                 if (!uri->hostlen)
 403                         return URI_ERRNO_NO_HOST;
 404
 405                 return URI_ERRNO_NO_HOST_SLASH;
 406         }
 407
 408         /* Look for #fragment or POST_CHAR */
 409         prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
 410         uri->data = host_end;
 411         uri->datalen = prefix_end - host_end;
 412
 413         if (*prefix_end == '#') {
 414                 uri->fragment = prefix_end + 1;
 415                 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
 416                 prefix_end = uri->fragment + uri->fragmentlen;
 417         }
 418
 419         if (*prefix_end == POST_CHAR) {
 420                 uri->post = prefix_end + 1;
 421         }
 422
 423         return URI_ERRNO_OK;
 424 }
 425
 426 int
 427 get_uri_port(const struct uri *uri)
 428 {
 429         if (uri->port && uri->portlen) {
 430                 const unsigned char *end = uri->port;
 431                 int port = strtol(uri->port, (char **) &end, 10);
 432
 433                 if (end != uri->port) {
 434                         assert(uri_port_is_valid(port));
 435                         return port;
 436                 }
 437         }
 438
 439         return get_protocol_port(uri->protocol);
 440 }
 441
 442 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
 443
 444 static inline int
 445 compare_component(const unsigned char *a, int alen,
 446                   const unsigned char *b, int blen)
 447 {
 448         /* Check that the length and the strings are both set or unset */
 449         if (alen != blen || !!a != !!b) return 0;
 450
 451         /* Both are unset so that will make a perfect match */
 452         if (!a || !alen) return 1;
 453
 454         /* Let the higher forces decide */
 455         return !memcmp(a, b, blen);
 456 }
 457
 458 #define wants(x) (components & (x))
 459
 460 int
 461 compare_uri(const struct uri *a, const struct uri *b,
 462             enum uri_component components)
 463 {
 464         if (a == b) return 1;
 465         if (!components) return 0;
 466
 467         assertm(can_compare_uri_components(components),
 468                 "compare_uri() is a work in progress. Component unsupported");
 469
 470         return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
 471                 && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
 472                 && (!wants(URI_USER)
 473                     || compare_component(a->user, a->userlen, b->user, b->userlen))
 474                 && (!wants(URI_PASSWORD)
 475                     || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
 476                 && (!wants(URI_HOST)
 477                     || compare_component(a->host, a->hostlen, b->host, b->hostlen))
 478                 && (!wants(URI_PORT)
 479                     || compare_component(a->port, a->portlen, b->port, b->portlen))
 480                 && (!wants(URI_DATA)
 481                     || compare_component(a->data, a->datalen, b->data, b->datalen))
 482                 && (!wants(URI_FRAGMENT)
 483                     || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
 484                 && (!wants(URI_POST)
 485                     || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
 486 }
 487
 488
 489 /* We might need something more intelligent than this Swiss army knife. */
 490 struct string *
 491 add_uri_to_string(struct string *string, const struct uri *uri,
 492                   enum uri_component components)
 493 {
 494         /* Custom or unknown keep the URI untouched. */
 495         if (uri->protocol == PROTOCOL_UNKNOWN)
 496                 return add_to_string(string, struri(uri));
 497
 498         if (wants(URI_PROTOCOL)) {
 499                 add_bytes_to_string(string, uri->string, uri->protocollen);
 500                 if (wants(URI_IP_FAMILY) && uri->ip_family)
 501                         add_long_to_string(string, uri->ip_family);
 502                 add_char_to_string(string, ':');
 503                 if (get_protocol_need_slashes(uri->protocol))
 504                         add_to_string(string, "//");
 505         }
 506
 507         if (wants(URI_USER) && uri->userlen) {
 508                 add_bytes_to_string(string, uri->user, uri->userlen);
 509
 510                 if (wants(URI_PASSWORD) && uri->passwordlen) {
 511                         add_char_to_string(string, ':');
 512                         add_bytes_to_string(string, uri->password,
 513                                                     uri->passwordlen);
 514                 }
 515
 516                 add_char_to_string(string, '@');
 517
 518         } else if (wants(URI_PASSWORD) && uri->passwordlen) {
 519                 add_bytes_to_string(string, uri->password, uri->passwordlen);
 520         }
 521
 522         if (wants(URI_HOST) && uri->hostlen) {
 523                 int add_host = 1;
 524
 525 #ifdef CONFIG_IPV6
 526                 /* Rationale for wants(URI_PORT): The [notation] was invented
 527                  * so that you can have an IPv6 addy and a port together. So
 528                  * we want to use it when that happens, otherwise we need not
 529                  * bother (that happens only when we want it for DNS anyway).
 530                  * I insist on an implied elegancy of this way, but YMMV. ;-)
 531                  * --pasky */
 532                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
 533 #endif
 534 #ifdef CONFIG_IDN
 535                 /* Support for the GNU International Domain Name library.
 536                  *
 537                  * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
 538                  *
 539                  * Now it is probably not perfect because idna_to_ascii_lz()
 540                  * will be using a ``zero terminated input string encoded in
 541                  * the current locale's character set''. Anyway I don't know
 542                  * how to convert anything to UTF-8 or Unicode. --jonas */
 543                 if (wants(URI_IDN)) {
 544                         unsigned char *host = memacpy(uri->host, uri->hostlen);
 545
 546                         if (host) {
 547                                 char *idname;
 548                                 int code = idna_to_ascii_lz(host, &idname, 0);
 549
 550                                 /* FIXME: Return NULL if it coughed? --jonas */
 551                                 if (code == IDNA_SUCCESS) {
 552                                         add_to_string(string, idname);
 553                                         free(idname);
 554                                         add_host = 0;
 555                                 }
 556
 557                                 mem_free(host);
 558                         }
 559                 }
 560
 561 #endif
 562                 if (add_host)
 563                         add_bytes_to_string(string, uri->host, uri->hostlen);
 564
 565 #ifdef CONFIG_IPV6
 566                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
 567 #endif
 568         }
 569
 570         if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
 571                 if (uri->portlen) {
 572                         add_char_to_string(string, ':');
 573                         add_bytes_to_string(string, uri->port, uri->portlen);
 574
 575                 } else if (wants(URI_DEFAULT_PORT)
 576                            && uri->protocol != PROTOCOL_USER) {
 577                         /* For user protocols we don't know a default port.
 578                          * Should user protocols ports be configurable? */
 579                         int port = get_protocol_port(uri->protocol);
 580
 581                         add_char_to_string(string, ':');
 582                         add_long_to_string(string, port);
 583                 }
 584         }
 585
 586         /* Only add slash if we need to separate */
 587         if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
 588             && wants(~(URI_DATA | URI_PORT))
 589             && get_protocol_need_slash_after_host(uri->protocol))
 590                 add_char_to_string(string, '/');
 591
 592         if (wants(URI_DATA) && uri->datalen)
 593                 add_bytes_to_string(string, uri->data, uri->datalen);
 594
 595         /* We can not test uri->datalen here since we need to always
 596          * add '/'. */
 597         if (wants(URI_PATH) || wants(URI_FILENAME)) {
 598                 const unsigned char *filename = uri->data;
 599                 const unsigned char *pos;
 600
 601                 assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
 602                         "URI_FILENAME should be used alone %d", components);
 603
 604                 if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
 605 #ifdef CONFIG_OS_WIN32
 606                         if (uri->protocol != PROTOCOL_FILE)
 607 #endif
 608                         /* FIXME: Add correct separator */
 609                         add_char_to_string(string, '/');
 610                 }
 611
 612                 if (!uri->datalen) return string;
 613
 614                 for (pos = filename; *pos && !end_of_dir(*pos); pos++)
 615                         if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
 616                                 filename = pos + 1;
 617
 618                 return add_bytes_to_string(string, filename, pos - filename);
 619         }
 620
 621         if (wants(URI_QUERY) && uri->datalen) {
 622                 const unsigned char *query = memchr(uri->data, '?', uri->datalen);
 623
 624                 assertm(URI_QUERY == components,
 625                         "URI_QUERY should be used alone %d", components);
 626
 627                 if (!query) return string;
 628
 629                 query++;
 630                 /* Check fragment and POST_CHAR */
 631                 return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
 632         }
 633
 634         if (wants(URI_FRAGMENT) && uri->fragmentlen) {
 635                 add_char_to_string(string, '#');
 636                 add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
 637         }
 638
 639         if (wants(URI_POST) && uri->post) {
 640                 add_char_to_string(string, POST_CHAR);
 641                 add_to_string(string, uri->post);
 642
 643         } else if (wants(URI_POST_INFO) && uri->post) {
 644                 if (!strncmp(uri->post, "text/plain", 10)) {
 645                         add_to_string(string, " (PLAIN TEXT DATA)");
 646
 647                 } else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
 648                         add_to_string(string, " (MULTIPART FORM DATA)");
 649
 650                 } else {
 651                         add_to_string(string, " (POST DATA)");
 652                 }
 653
 654         }
 655
 656         return string;
 657 }
 658
 659 #undef wants
 660
 661 unsigned char *
 662 get_uri_string(const struct uri *uri, enum uri_component components)
 663 {
 664         struct string string;
 665
 666         if (init_string(&string)
 667             && add_uri_to_string(&string, uri, components))
 668                 return string.source;
 669
 670         done_string(&string);
 671         return NULL;
 672 }
 673
 674
 675 struct string *
 676 add_string_uri_to_string(struct string *string, unsigned char *uristring,
 677                          enum uri_component components)
 678 {
 679         struct uri uri;
 680
 681         if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
 682                 return NULL;
 683
 684         return add_uri_to_string(string, &uri, components);
 685 }
 686
 687
 688 #define normalize_uri_reparse(str)      normalize_uri(NULL, str)
 689 #define normalize_uri_noparse(uri)      normalize_uri(uri, struri(uri))
 690
 691 unsigned char *
 692 normalize_uri(struct uri *uri, unsigned char *uristring)
 693 {
 694         unsigned char *parse_string = uristring;
 695         unsigned char *src, *dest, *path;
 696         int need_slash = 0, keep_dslash = 1;
 697         int parse = (uri == NULL);
 698         struct uri uri_struct;
 699
 700         if (!uri) uri = &uri_struct;
 701
 702         /* We need to get the real (proxied) URI but lowercase relevant URI
 703          * parts along the way. */
 704         do {
 705                 if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
 706                         return uristring;
 707
 708                 assert(uri->data);
 709
 710                 /* This is a maybe not the right place but both join_urls() and
 711                  * get_translated_uri() through translate_url() calls this
 712                  * function and then it already works on and modifies an
 713                  * allocated copy. */
 714                 convert_to_lowercase_locale_indep(uri->string, uri->protocollen);
 715                 if (uri->hostlen) convert_to_lowercase_locale_indep(uri->host, uri->hostlen);
 716
 717                 parse = 1;
 718                 parse_string = uri->data;
 719         } while (uri->protocol == PROTOCOL_PROXY);
 720
 721         if (get_protocol_free_syntax(uri->protocol))
 722                 return uristring;
 723
 724         if (uri->protocol != PROTOCOL_UNKNOWN) {
 725                 need_slash = get_protocol_need_slash_after_host(uri->protocol);
 726                 keep_dslash = get_protocol_keep_double_slashes(uri->protocol);
 727         }
 728
 729         path = uri->data - need_slash;
 730         dest = src = path;
 731
 732         /* This loop mangles the URI string by removing ".." and "." segments.
 733          * However it must not alter "//" without reason; see bug 744.  */
 734         while (*dest) {
 735                 /* If the following pieces are the LAST parts of URL, we remove
 736                  * them as well. See RFC 2396 section 5.2 for details. */
 737
 738                 if (end_of_dir(src[0])) {
 739                         /* URL data contains no more path. */
 740                         memmove(dest, src, strlen(src) + 1);
 741                         break;
 742                 }
 743
 744                 if (!is_uri_dir_sep(uri, src[0])) {
 745                         /* This is to reduce indentation */
 746
 747                 } else if (src[1] == '.') {
 748                         if (!src[2]) {
 749                                 /* /. - skip the dot */
 750                                 *dest++ = *src;
 751                                 *dest = 0;
 752                                 break;
 753
 754                         } else if (is_uri_dir_sep(uri, src[2])) {
 755                                 /* /./ - strip that.. */
 756                                 src += 2;
 757                                 continue;
 758
 759                         } else if (src[2] == '.'
 760                                    && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
 761                                 /* /../ or /.. - skip it and preceding element.
 762                                  *
 763                                  * <path> "/foo/bar" <dest> ...
 764                                  * <src> ("/../" or "/..\0") ...
 765                                  *
 766                                  * Remove "bar" and the directory
 767                                  * separator that precedes it.  The
 768                                  * separator will be added back in the
 769                                  * next iteration unless another ".."
 770                                  * follows, in which case it will be
 771                                  * added later.  "bar" may be empty.  */
 772
 773                                 while (dest > path) {
 774                                         dest--;
 775                                         if (is_uri_dir_sep(uri, *dest)) break;
 776                                 }
 777
 778                                 /* <path> "/foo" <dest> "/bar" ...
 779                                  * <src> ("/../" or "/..\0") ... */
 780                                 if (!src[3]) {
 781                                         /* /.. - add ending slash and stop */
 782                                         *dest++ = *src;
 783                                         *dest = 0;
 784                                         break;
 785                                 }
 786
 787                                 src += 3;
 788                                 continue;
 789                         }
 790
 791                 } else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) {
 792                         /* // - ignore first '/'. */
 793                         src += 1;
 794                         continue;
 795                 }
 796
 797                 /* We don't want to access memory past the NUL char. */
 798                 *dest = *src++;
 799                 if (*dest) dest++;
 800         }
 801
 802         return uristring;
 803 }
 804
 805 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
 806  * of just the complete path to file/directory, which the dumb 'file' protocol
 807  * backend can understand. No host parts etc, that is what this function is
 808  * supposed to chew. */
 809 static struct uri *
 810 transform_file_url(struct uri *uri, const unsigned char *cwd)
 811 {
 812         unsigned char *path = uri->data;
 813
 814         assert(uri->protocol == PROTOCOL_FILE && uri->data);
 815
 816         /* Sort out the host part. We currently support only host "localhost"
 817          * (plus empty host part will be assumed to be "localhost" as well).
 818          * As our extensions, '.' will reference to the cwd on localhost
 819          * (originally, when the first thing after file:// wasn't "localhost/",
 820          * we assumed the cwd as well, and pretended that there's no host part
 821          * at all) and '..' to the directory parent to cwd. Another extension
 822          * is that if this is a DOS-like system, the first char in two-char
 823          * host part is uppercase letter and the second char is a colon, it is
 824          * assumed to be a local disk specification. */
 825         /* TODO: Use FTP for non-localhost hosts. --pasky */
 826
 827         /* For URL "file://", we open the current directory. Some other
 828          * browsers instead open root directory, but AFAIK the standard does
 829          * not specify that and this was the original behaviour and it is more
 830          * consistent with our file://./ notation. */
 831
 832         /* Who would name their file/dir '...' ? */
 833         if (*path == '.' || !*path) {
 834                 struct string dir;
 835
 836                 if (!init_string(&dir))
 837                         return NULL;
 838
 839                 encode_uri_string(&dir, cwd, -1, 0);
 840
 841                 /* Either we will end up with '//' and translate_directories()
 842                  * will shorten it or the '/' will mark the inserted cwd as a
 843                  * directory. */
 844                 if (*path == '.') *path = '/';
 845
 846                 /* Insert the current working directory. */
 847                 /* The offset is 7 == sizeof("file://") - 1. */
 848                 insert_in_string(&struri(uri), 7, dir.source, dir.length);
 849
 850                 done_string(&dir);
 851                 return uri;
 852         }
 853
 854 #ifdef DOS_FS
 855         if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
 856                 return NULL;
 857 #endif
 858
 859         for (; *path && !dir_sep(*path); path++);
 860
 861         /* FIXME: We will in fact assume localhost even for non-local hosts,
 862          * until we will support the FTP transformation. --pasky */
 863
 864         memmove(uri->data, path, strlen(path) + 1);
 865         return uri;
 866 }
 867
 868 static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);
 869
/* Resolves the (possibly relative) reference @rel against the parsed URI
 * @base and returns the resulting URI string, or NULL on failure.
 *
 * The cases are tried in this order:
 *  - @rel starts with '#': @base up to its fragment, followed by @rel.
 *  - @rel starts with '?': @base up to its query (or fragment), then @rel.
 *  - @rel starts with "//": "<protocol>:" of @base followed by @rel, run
 *    through translate_url(); NULL if the base protocol takes no slashes.
 *  - @rel carries a known protocol name: translate_url() on @rel alone.
 *  - otherwise @rel replaces either the whole path of @base (when it starts
 *    with a directory separator) or everything after the last separator.
 *
 * The result comes from translate_url() or normalize_uri_reparse();
 * presumably it is an allocated string the caller must free -- confirm
 * against the callers. */
unsigned char *
join_urls(struct uri *base, unsigned char *rel)
{
        unsigned char *uristring, *path;
        int add_slash = 0;
        int translate = 0;
        int length = 0;
        int rel_len;

        /* See RFC 1808 */
        /* TODO: Support for ';' ? (see the RFC) --pasky */

        /* For '#', '?' and '//' we could use get_uri_string() but it might be
         * too expensive since it uses granular allocation scheme. I wouldn't
         * personally mind tho' because it would be cleaner. --jonas */
        if (rel[0] == '#') {
                /* Strip fragment and post part from the base URI and append
                 * the fragment string in @rel. */
                /* The -1 steps back over the '#' that precedes the fragment
                 * in the base string. */
                length  = base->fragment
                        ? base->fragment - struri(base) - 1
                        : get_real_uri_length(base);

        } else if (rel[0] == '?') {
                /* Strip query, fragment and post part from the base URI and
                 * append the query string in @rel. */
                length  = base->fragment ? base->fragment - struri(base) - 1
                                         : get_real_uri_length(base);

                /* If the base already has a query, cut in front of its '?'
                 * instead. @uristring is only a scratch pointer here. */
                uristring = memchr(base->data, '?', base->datalen);
                if (uristring) length = uristring - struri(base);

        } else if (rel[0] == '/' && rel[1] == '/') {
                if (!get_protocol_need_slashes(base->protocol))
                        return NULL;

                /* Get `<protocol>:' from the base URI and append the `//' part
                 * from @rel. */
                length = base->protocollen + 1;

                /* We need to sanitize the relative part and add stuff like
                 * host slash. */
                translate = 1;
        }

        /* If one of the tests above set @length to something useful */
        if (length) {
                /* Copy the kept prefix of the base and append @rel to it. */
                uristring = memacpy(struri(base), length);
                if (!uristring) return NULL;

                add_to_strn(&uristring, rel);

                if (translate) {
                        unsigned char *translated;

                        translated = translate_url(uristring, NULL);
                        mem_free(uristring);
                        return translated;
                }
                return normalize_uri_reparse(uristring);
        }

        /* Check if there is some protocol name to go for */
        length = get_protocol_length(rel);
        if (length) {
                switch (get_protocol(rel, length)) {
                case PROTOCOL_UNKNOWN:
                case PROTOCOL_PROXY:
                        /* Mysteriously proxy URIs are breaking here ... */
                        break;

                case PROTOCOL_FILE:
                        /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
                         * to translate_url(). */
                default:
                        /* @rel names a known protocol: treat it as a URI of
                         * its own. If the translation fails we fall through
                         * and handle @rel as a relative path after all. */
                        uristring = translate_url(rel, NULL);
                        if (uristring) return uristring;
                }
        }

        assertm(base->data != NULL, "bad base url");
        if_assert_failed return NULL;

        path = base->data;

        /* Either is path blank, but we've slash char before, or path is not
         * blank, but doesn't start by a slash (if we'd just stay along with
         * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
         * should be enough, but I'm not sure and I don't want to break
         * anything --pasky). */
        /* We skip first char of URL ('/') in parse_url() (ARGH). This
         * is reason of all this bug-bearing magic.. */
        if (*path) {
                if (!is_uri_dir_sep(base, *path)) path--;
        } else {
                if (is_uri_dir_sep(base, path[-1])) path--;
        }

        if (!is_uri_dir_sep(base, rel[0])) {
                unsigned char *path_end;

                /* The URL is relative. */

                if (!*path) {
                        /* There's no path in the URL, but we're going to add
                         * something there, and the something doesn't start by
                         * a slash. So we need to insert a slash after the base
                         * URL. Clever, eh? ;) */
                        add_slash = 1;
                }

                /* Scan the base path up to the query, fragment or post
                 * part, whichever comes first. */
                for (path_end = path; *path_end; path_end++) {
                        if (end_of_dir(*path_end)) break;
                        /* Modify the path pointer, so that it'll always point
                         * above the last '/' in the URL; later, we'll copy the
                         * URL only _TO_ this point, and anything after last
                         * slash will be substituted by 'rel'. */
                        if (is_uri_dir_sep(base, *path_end))
                                path = path_end + 1;
                }
        }

        /* Result is base[0..length) + optional '/' + @rel + NUL. */
        length = path - struri(base);
        rel_len = strlen(rel);
        uristring = mem_alloc(length + rel_len + add_slash + 1);
        if (!uristring) return NULL;

        memcpy(uristring, struri(base), length);
        if (add_slash) uristring[length] = '/';
        strlcpy(uristring + length + add_slash, rel, rel_len + 1);

        return normalize_uri_reparse(uristring);
}
1002
1003
1004 /* Tries to figure out what protocol @newurl might be specifying by checking if
1005  * it exists as a file locally or by checking parts of the host name. */
1006 static enum protocol
1007 find_uri_protocol(unsigned char *newurl)
1008 {
1009         unsigned char *ch;
1010
1011         /* First see if it is a file so filenames that look like hostnames
1012          * won't confuse us below. */
1013         if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
1014
1015         /* Yes, it would be simpler to make test for IPv6 address first,
1016          * but it would result in confusing mix of ifdefs ;-). */
1017         /* FIXME: Ideas for improve protocol detection
1018          *
1019          * - Handle common hostnames. It could be part of the protocol backend
1020          *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
1021          *
1022          * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
1023          */
1024
1025         ch = newurl + strcspn(newurl, ".:/@");
1026         if (*ch == '@'
1027             || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
1028             || !c_strncasecmp(newurl, "ftp.", 4)) {
1029                 /* Contains user/password/ftp-hostname */
1030                 return PROTOCOL_FTP;
1031
1032 #ifdef CONFIG_IPV6
1033         } else if (*newurl == '[' && *ch == ':') {
1034                 /* Candidate for IPv6 address */
1035                 unsigned char *bracket2, *colon2;
1036
1037                 ch++;
1038                 bracket2 = strchr(ch, ']');
1039                 colon2 = strchr(ch, ':');
1040                 if (bracket2 && colon2 && bracket2 > colon2)
1041                         return PROTOCOL_HTTP;
1042 #endif
1043
1044         } else if (*newurl != '.' && *ch == '.') {
1045                 /* Contains domain name? */
1046                 unsigned char *host_end, *domain;
1047                 unsigned char *ipscan;
1048
1049                 /* Process the hostname */
1050                 for (domain = ch + 1;
1051                         *(host_end = domain + strcspn(domain, ".:/?")) == '.';
1052                         domain = host_end + 1);
1053
1054                 /* It's IP? */
1055                 for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1056                         ipscan++);
1057
1058                 if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1059                         return PROTOCOL_HTTP;
1060
1061                 /* It's two-letter or known TLD? */
1062                 if (host_end - domain == 2
1063                     || end_with_known_tld(domain, host_end - domain) >= 0)
1064                         return PROTOCOL_HTTP;
1065         }
1066
1067         return PROTOCOL_UNKNOWN;
1068 }
1069
1070
1071 #define MAX_TRANSLATION_ATTEMPTS        32
1072
1073 /* Returns an URI string that can be used internally. Adding protocol prefix,
1074  * missing slashes etc. */
1075 static unsigned char *
1076 translate_url(unsigned char *url, unsigned char *cwd)
1077 {
1078         unsigned char *newurl;
1079         struct uri uri;
1080         enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1081         int retries = 0;
1082
1083         /* Strip starting spaces */
1084         while (*url == ' ') url++;
1085         if (!*url) return NULL;
1086
1087         newurl = expand_tilde(url); /* XXX: Post data copy. */
1088         if (!newurl) return NULL;
1089
1090 parse_uri:
1091         /* Yay a goto loop. If we get some URI parse error and try to
1092          * fix it we go back to here and try again. */
1093         /* Ordinary parse */
1094         uri_errno = parse_uri(&uri, newurl);
1095
1096         /* Bail out if the same error occurs twice */
1097         if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1098                 if (retries > MAX_TRANSLATION_ATTEMPTS) {
1099                         ERROR("Maximum number of parsing attempts exceeded "
1100                               "for %s.", url);
1101                 }
1102                 mem_free(newurl);
1103                 return NULL;
1104         }
1105
1106         prev_errno = uri_errno;
1107
1108         switch (uri_errno) {
1109         case URI_ERRNO_OK:
1110                 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1111                  * interpreted as the protocol name. */
1112                 if (uri.protocol == PROTOCOL_UNKNOWN) {
1113                         enum protocol protocol = find_uri_protocol(newurl);
1114
1115                         /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1116                          * case. */
1117                         if (protocol != PROTOCOL_UNKNOWN) {
1118                                 struct string str;
1119
1120                                 if (!init_string(&str)) return NULL;
1121
1122                                 switch (protocol) {
1123                                 case PROTOCOL_FTP:
1124                                         add_to_string(&str, "ftp://");
1125                                         encode_uri_string(&str, newurl, -1, 0);
1126                                         break;
1127
1128                                 case PROTOCOL_HTTP:
1129                                         add_to_string(&str, "http://");
1130                                         add_to_string(&str, newurl);
1131                                         break;
1132
1133                                 case PROTOCOL_UNKNOWN:
1134                                         break;
1135
1136                                 case PROTOCOL_FILE:
1137                                 default:
1138                                         add_to_string(&str, "file://");
1139                                         if (!dir_sep(*newurl))
1140                                                 add_to_string(&str, "./");
1141
1142                                         add_to_string(&str, newurl);
1143                                 }
1144
1145                                 mem_free(newurl);
1146                                 newurl = str.source;
1147
1148                                 /* Work around the infinite loop prevention */
1149                                 prev_errno = URI_ERRNO_EMPTY;
1150                                 goto parse_uri;
1151                         }
1152                 }
1153
1154                 /* If file:// URI is transformed we need to reparse. */
1155                 if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1156                     && transform_file_url(&uri, cwd))
1157                         return normalize_uri_reparse(struri(&uri));
1158
1159                 /* Translate the proxied URI too if proxy:// */
1160                 if (uri.protocol == PROTOCOL_PROXY) {
1161                         unsigned char *data = translate_url(uri.data, cwd);
1162                         int pos = uri.data - struri(&uri);
1163
1164                         if (!data) break;
1165                         struri(&uri)[pos] = 0;
1166                         insert_in_string(&struri(&uri), pos, data, strlen(data));
1167                         mem_free(data);
1168                         return normalize_uri_reparse(struri(&uri));
1169                 }
1170
1171                 return normalize_uri_noparse(&uri);
1172
1173         case URI_ERRNO_TOO_MANY_SLASHES:
1174         {
1175                 unsigned char *from, *to;
1176
1177                 assert(uri.string[uri.protocollen] == ':'
1178                        && uri.string[uri.protocollen + 1] == '/'
1179                        && uri.string[uri.protocollen + 2] == '/');
1180
1181                 from = to = uri.string + uri.protocollen + 3;
1182                 while (*from == '/') from++;
1183
1184                 assert(to < from);
1185                 memmove(to, from, strlen(from) + 1);
1186                 goto parse_uri;
1187         }
1188         case URI_ERRNO_NO_SLASHES:
1189         {
1190                 /* Try prefix:some.url -> prefix://some.url.. */
1191                 int slashes = 2;
1192
1193                 /* Check if only one '/' is needed. */
1194                 if (uri.string[uri.protocollen + 1] == '/')
1195                         slashes--;
1196
1197                 insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1198                 goto parse_uri;
1199         }
1200         case URI_ERRNO_TRAILING_DOTS:
1201         {
1202                 /* Trim trailing '.'s */
1203                 unsigned char *from = uri.host + uri.hostlen;
1204                 unsigned char *to = from;
1205
1206                 assert(uri.host < to && to[-1] == '.' && *from != '.');
1207
1208                 while (uri.host < to && to[-1] == '.') to--;
1209
1210                 assert(to < from);
1211                 memmove(to, from, strlen(from) + 1);
1212                 goto parse_uri;
1213         }
1214         case URI_ERRNO_NO_PORT_COLON:
1215                 assert(uri.portlen == 0
1216                        && uri.string < uri.port
1217                        && uri.port[-1] == ':');
1218
1219                 memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1220                 goto parse_uri;
1221
1222         case URI_ERRNO_NO_HOST_SLASH:
1223         {
1224                 int offset = uri.port
1225                            ? uri.port + uri.portlen - struri(&uri)
1226                            : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1227
1228                 assertm(uri.host != NULL, "uri.host not set after no host slash error");
1229                 insert_in_string(&newurl, offset, "/", 1);
1230                 goto parse_uri;
1231         }
1232         case URI_ERRNO_INVALID_PROTOCOL:
1233         {
1234                 /* No protocol name */
1235                 enum protocol protocol = find_uri_protocol(newurl);
1236                 struct string str;
1237
1238                 if (!init_string(&str)) return NULL;
1239
1240                 switch (protocol) {
1241                         case PROTOCOL_FTP:
1242                                 add_to_string(&str, "ftp://");
1243                                 encode_uri_string(&str, newurl, -1, 0);
1244                                 break;
1245
1246                         case PROTOCOL_HTTP:
1247                                 add_to_string(&str, "http://");
1248                                 add_to_string(&str, newurl);
1249                                 break;
1250
1251                         case PROTOCOL_UNKNOWN:
1252                                 /* We default to file:// even though we already
1253                                  * tested if the file existed since it will give
1254                                  * a "No such file or directory" error.  which
1255                                  * might better hint the user that there was
1256                                  * problem figuring out the URI. */
1257                         case PROTOCOL_FILE:
1258                         default:
1259                                 add_to_string(&str, "file://");
1260                                 if (!dir_sep(*newurl))
1261                                         add_to_string(&str, "./");
1262
1263                                 encode_file_uri_string(&str, newurl);
1264                 }
1265
1266                 mem_free(newurl);
1267                 newurl = str.source;
1268
1269                 goto parse_uri;
1270         }
1271         case URI_ERRNO_EMPTY:
1272         case URI_ERRNO_IPV6_SECURITY:
1273         case URI_ERRNO_NO_HOST:
1274         case URI_ERRNO_INVALID_PORT:
1275         case URI_ERRNO_INVALID_PORT_RANGE:
1276                 /* None of these can be handled properly. */
1277                 break;
1278         }
1279
1280         mem_free(newurl);
1281         return NULL;
1282 }
1283
1284
1285 struct uri *
1286 get_composed_uri(struct uri *uri, enum uri_component components)
1287 {
1288         unsigned char *string;
1289
1290         assert(uri);
1291         if_assert_failed return NULL;
1292
1293         string = get_uri_string(uri, components);
1294         if (!string) return NULL;
1295
1296         uri = get_uri(string, 0);
1297         mem_free(string);
1298
1299         return uri;
1300 }
1301
1302 struct uri *
1303 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1304 {
1305         struct uri *uri;
1306
1307         uristring = translate_url(uristring, cwd);
1308         if (!uristring) return NULL;
1309
1310         uri = get_uri(uristring, 0);
1311         mem_free(uristring);
1312
1313         return uri;
1314 }
1315
1316
1317 unsigned char *
1318 get_extension_from_uri(struct uri *uri)
1319 {
1320         unsigned char *extension = NULL;
1321         int afterslash = 1;
1322         unsigned char *pos = uri->data;
1323
1324         assert(pos);
1325
1326         for (; *pos && !end_of_dir(*pos); pos++) {
1327                 if (!afterslash && !extension && *pos == '.') {
1328                         extension = pos;
1329                 } else if (is_uri_dir_sep(uri, *pos)) {
1330                         extension = NULL;
1331                         afterslash = 1;
1332                 } else {
1333                         afterslash = 0;
1334                 }
1335         }
1336
1337         if (extension && extension < pos)
1338                 return memacpy(extension, pos - extension);
1339
1340         return NULL;
1341 }
1342
/* URI encoding, escaping unallowed characters. */

/* Tells whether @c may appear unescaped in an encoded URI: identifier
 * characters as defined by isident() plus the punctuation marks listed in
 * RFC 2396, Page 8, Section 2.3 ;-)
 *
 * Returns 1 for such characters and 0 for everything else. */
static inline int
safe_char(unsigned char c)
{
        if (isident(c))
                return 1;

        switch (c) {
        case '.':
        case '!':
        case '~':
        case '*':
        case '\'':
        case '(':
        case ')':
                return 1;
        default:
                return 0;
        }
}
1351
/* Appends the URI-encoded form of @name to @string.
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 *          "up to the terminating NUL".
 * @convert_slashes when non-zero makes '/' get escaped as well; otherwise
 *          slashes are copied through unchanged.
 *
 * Characters accepted by safe_char() are copied as they are, all others are
 * written as '%' followed by two hex digits produced by hx(). */
void
encode_uri_string(struct string *string, const unsigned char *name, int namelen,
                  int convert_slashes)
{
        unsigned char escape[4] = { '%', '\0', '\0', '\0' };
        const unsigned char *stop;

        if (namelen < 0)
                namelen = strlen(name);

        stop = name + namelen;

        while (name < stop) {
                const unsigned char ch = *name++;

                /* Spaces are not turned into '+' here; that would probably
                 * be correct only for the query part of a URI. */
                if (safe_char(ch) || (ch == '/' && !convert_slashes)) {
                        add_char_to_string(string, ch);
                        continue;
                }

                /* Hex it: high nibble first, then the low one. */
                escape[1] = hx((((int) ch) & 0xF0) >> 4);
                escape[2] = hx(((int) ch) & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1380
/* Appends the URI-encoded form of @name to @string, leaving the characters
 * ':' and '\\' unescaped in addition to those accepted by safe_char().
 * Unlike in encode_uri_string(), '/' is always escaped here.
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 *          "up to the terminating NUL".
 *
 * Every other character is written as '%' followed by two hex digits
 * produced by hx(). */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
        unsigned char escape[4] = { '%', '\0', '\0', '\0' };
        unsigned char *stop;

        if (namelen < 0)
                namelen = strlen(name);

        stop = name + namelen;

        while (name < stop) {
                const unsigned char ch = *name++;

                if (safe_char(ch) || ch == ':' || ch == '\\') {
                        add_char_to_string(string, ch);
                        continue;
                }

                /* Hex it: high nibble first, then the low one. */
                escape[1] = hx((((int) ch) & 0xF0) >> 4);
                escape[2] = hx(((int) ch) & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1403
/* Decodes the %XX escapes of the NUL-terminated string @src in place.
 *
 * This function is evil, it modifies its parameter.
 * XXX: but the decoded string is _never_ longer than the encoded one, so it
 * is an efficient way to do that, imho. --Zas
 *
 * A '%' that is not followed by two hex digits is copied literally, and so
 * is "%00", so that decoding can never introduce a NUL into the middle of
 * the string. */
void
decode_uri(unsigned char *src)
{
        unsigned char *out = src;

        for (;;) {
                unsigned char ch = *src++;

                if (ch == '%') {
                        /* The second digit is only looked at when the first
                         * one is valid, so we never read past the NUL. */
                        int high = unhx(src[0]);
                        int low = (high >= 0) ? unhx(src[1]) : -1;

                        if (low >= 0) {
                                int value = (high << 4) + low;

                                if (value != 0) { /* don't allow %00 */
                                        ch = (unsigned char) value;
                                        src += 2;
                                }
                        }
                }

                /* '+' is deliberately not decoded to ' ': that would only be
                 * right in the query part of a URI. */

                *out++ = ch;
                if (ch == '\0')
                        break;
        }
}
1444
1445 void
1446 decode_uri_string(struct string *string)
1447 {
1448         decode_uri(string->source);
1449         string->length = strlen(string->source);
1450 }
1451
/* Decodes the %XX escapes of @src in place with decode_uri() and then
 * replaces every character that is not printable, or that is a control
 * character, with '*' so the result is safe to show to the user. */
void
decode_uri_for_display(unsigned char *src)
{
        unsigned char *ch;

        decode_uri(src);

        ch = src;
        while (*ch) {
                if (iscntrl(*ch) || !isprint(*ch))
                        *ch = '*';
                ch++;
        }
}
1461
1462 void
1463 decode_uri_string_for_display(struct string *string)
1464 {
1465         decode_uri_for_display(string->source);
1466         string->length = strlen(string->source);
1467 }
1468
1469
1470 /* URI list */
1471
1472 #define URI_LIST_GRANULARITY 0x3
1473
1474 #define realloc_uri_list(list) \
1475         mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1476                         URI_LIST_GRANULARITY)
1477
1478 struct uri *
1479 add_to_uri_list(struct uri_list *list, struct uri *uri)
1480 {
1481         if (!realloc_uri_list(list))
1482                 return NULL;
1483
1484         list->uris[list->size++] = get_uri_reference(uri);
1485
1486         return uri;
1487 };
1488
1489 void
1490 free_uri_list(struct uri_list *list)
1491 {
1492         struct uri *uri;
1493         int index;
1494
1495         if (!list->uris) return;
1496
1497         foreach_uri (uri, index, list) {
1498                 done_uri(uri);
1499         }
1500
1501         mem_free_set(&list->uris, NULL);
1502         list->size = 0;
1503 }
1504
1505 /* URI cache */
1506
/* One entry of the URI cache: a parsed URI together with the string it was
 * parsed from, both living in a single allocation. */
struct uri_cache_entry {
        /* The parsed URI; get_uri() hands out pointers to this member. */
        struct uri uri;
        /* Copy of the URI string. get_uri_cache_entry() allocates
         * sizeof(struct uri_cache_entry) + length zeroed bytes and copies
         * length bytes here, so the one declared element is what leaves
         * room for the terminating NUL. */
        unsigned char string[1];
};
1511
/* The URI cache: maps URI strings to their struct uri_cache_entry. */
struct uri_cache {
        /* Hash from URI string to entry; created by get_uri() on first use
         * and freed again once the cache object is no longer used. */
        struct hash *map;
        /* Usage tracking for the cache itself: locked once for each entry
         * added in get_uri_cache_entry() and unlocked once for each entry
         * removed in done_uri(). */
        struct object object;
};

/* The one and only cache instance. */
static struct uri_cache uri_cache;
1518
#ifdef CONFIG_DEBUG
/* Debug-only check that neither the protocol nor the host part of @uri
 * contains uppercase letters; reports an internal error if one is found.
 * Without CONFIG_DEBUG this expands to nothing. */
static inline void
check_uri_sanity(struct uri *uri)
{
        int found_upper = 0;
        int i;

        /* The protocol name sits at the very start of the URI string. */
        for (i = 0; !found_upper && i < uri->protocollen; i++)
                found_upper = c_isupper(uri->string[i]);

        /* An empty host part simply makes this loop a no-op. */
        for (i = 0; !found_upper && i < uri->hostlen; i++)
                found_upper = c_isupper(uri->host[i]);

        if (found_upper) {
                INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
        }
}
#else
#define check_uri_sanity(uri)
#endif
1538
1539 static inline struct uri_cache_entry *
1540 get_uri_cache_entry(unsigned char *string, int length)
1541 {
1542         struct uri_cache_entry *entry;
1543         struct hash_item *item;
1544
1545         assert(string && length > 0);
1546         if_assert_failed return NULL;
1547
1548         item = get_hash_item(uri_cache.map, string, length);
1549         if (item) return item->value;
1550
1551         /* Setup a new entry */
1552
1553         entry = mem_calloc(1, sizeof(*entry) + length);
1554         if (!entry) return NULL;
1555
1556         object_nolock(&entry->uri, "uri");
1557         memcpy(&entry->string, string, length);
1558         string = entry->string;
1559
1560         if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1561             || !add_hash_item(uri_cache.map, string, length, entry)) {
1562                 mem_free(entry);
1563                 return NULL;
1564         }
1565
1566         object_lock(&uri_cache);
1567
1568         return entry;
1569 }
1570
1571 struct uri *
1572 get_uri(unsigned char *string, enum uri_component components)
1573 {
1574         struct uri_cache_entry *entry;
1575
1576         assert(string);
1577
1578         if (components) {
1579                 struct uri uri;
1580
1581                 if (parse_uri(&uri, string) != URI_ERRNO_OK)
1582                         return NULL;
1583
1584                 return get_composed_uri(&uri, components);
1585         }
1586
1587         if (!is_object_used(&uri_cache)) {
1588                 uri_cache.map = init_hash8();
1589                 if (!uri_cache.map) return NULL;
1590                 object_nolock(&uri_cache, "uri_cache");
1591         }
1592
1593         entry = get_uri_cache_entry(string, strlen(string));
1594         if (!entry) {
1595                 if (!is_object_used(&uri_cache))
1596                         free_hash(&uri_cache.map);
1597                 return NULL;
1598         }
1599
1600         check_uri_sanity(&entry->uri);
1601         object_nolock(&entry->uri, "uri");
1602         object_lock(&entry->uri);
1603
1604         return &entry->uri;
1605 }
1606
1607 void
1608 done_uri(struct uri *uri)
1609 {
1610         unsigned char *string = struri(uri);
1611         int length = strlen(string);
1612         struct hash_item *item;
1613         struct uri_cache_entry *entry;
1614
1615         assert(is_object_used(&uri_cache));
1616
1617         object_unlock(uri);
1618         if (is_object_used(uri)) return;
1619
1620         item = get_hash_item(uri_cache.map, string, length);
1621         entry = item ? item->value : NULL;
1622
1623         assertm(entry != NULL, "Releasing unknown URI [%s]", string);
1624         del_hash_item(uri_cache.map, item);
1625         mem_free(entry);
1626
1627         /* Last URI frees the cache */
1628         object_unlock(&uri_cache);
1629         if (!is_object_used(&uri_cache))
1630                 free_hash(&uri_cache.map);
1631 }