src/protocol/uri.c

   1 /* URL parser and translator; implementation of RFC 2396. */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #ifdef HAVE_IDNA_H
  10 #include <idna.h>
  11 #endif
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/types.h>
  16 #ifdef HAVE_NETDB_H
  17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
  18 #endif
  19
  20 #ifdef HAVE_SYS_SOCKET_H
  21 #include <sys/socket.h>
  22 #endif
  23 #ifdef HAVE_NETINET_IN_H
  24 #include <netinet/in.h>
  25 #endif
  26 #ifdef HAVE_ARPA_INET_H
  27 #include <arpa/inet.h>
  28 #endif
  29
  30 #include "elinks.h"
  31
  32 #include "main/object.h"
  33 #include "protocol/protocol.h"
  34 #include "protocol/uri.h"
  35 #include "util/conv.h"
  36 #include "util/error.h"
  37 #include "util/file.h"
  38 #include "util/hash.h"
  39 #include "util/memory.h"
  40 #include "util/string.h"
  41
  42
  43 static inline int
  44 end_of_dir(unsigned char c)
  45 {
  46         /* This used to check for c == ';' as well.  But section 3.3
  47          * of RFC 2396 explicitly says that parameters in a path
  48          * segment "are not significant to the parsing of relative
  49          * references."  */
  50         return c == POST_CHAR || c == '#' || c == '?';
  51 }
  52
  53 static inline int
  54 is_uri_dir_sep(const struct uri *uri, unsigned char pos)
  55 {
  56         return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
  57 }
  58
  59
  60 int
  61 is_in_domain(unsigned char *domain, unsigned char *server, int server_len)
  62 {
  63         int domain_len = strlen(domain);
  64         int len;
  65
  66         if (domain_len > server_len)
  67                 return 0;
  68
  69         if (domain_len == server_len)
  70                 return !strncasecmp(domain, server, server_len);
  71
  72         len = server_len - domain_len;
  73         if (server[len - 1] != '.')
  74                 return 0;
  75
  76         return !strncasecmp(domain, server + len, domain_len);
  77 }
  78
  79 int
  80 is_ip_address(const unsigned char *address, int addresslen)
  81 {
  82         /* The @address has well defined limits so it would be a shame to
  83          * allocate it. */
  84         unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
  85
  86         if (addresslen >= sizeof(buffer))
  87                 return 0;
  88
  89         safe_strncpy(buffer, address, addresslen + 1);
  90
  91 #ifdef HAVE_INET_PTON
  92 #ifdef CONFIG_IPV6
  93         {
  94                 struct sockaddr_in6 addr6;
  95
  96                 if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
  97                         return 1;
  98         }
  99 #endif /* CONFIG_IPV6 */
 100         {
 101                 struct in_addr addr4;
 102
 103                 if (inet_pton(AF_INET, buffer, &addr4) > 0)
 104                         return 1;
 105         }
 106
 107         return 0;
 108 #else
 109         /* FIXME: Is this ever the case? */
 110         return 0;
 111 #endif /* HAVE_INET_PTON */
 112 }
 113
 114
 115 int
 116 end_with_known_tld(const unsigned char *s, int slen)
 117 {
 118         int i;
 119         static const unsigned char *const tld[] =
 120         { "com", "edu", "net",
 121           "org", "gov", "mil",
 122           "int", "biz", "arpa",
 123           "aero", "coop",
 124           "info", "museum",
 125           "name", "pro", NULL };
 126
 127         if (!slen) return -1;
 128         if (slen < 0) slen = strlen(s);
 129
 130         for (i = 0; tld[i]; i++) {
 131                 int tldlen = strlen(tld[i]);
 132                 int pos = slen - tldlen;
 133
 134                 if (pos >= 0 && !strncasecmp(&s[pos], tld[i], tldlen))
 135                         return pos;
 136         }
 137
 138         return -1;
 139 }
 140
 141 /* XXX: this function writes to @name. */
 142 static int
 143 check_whether_file_exists(unsigned char *name)
 144 {
 145         /* Check POST_CHAR etc ... */
 146         static const unsigned char chars[] = POST_CHAR_S "#?";
 147         int i;
 148         int namelen = strlen(name);
 149
 150         if (file_exists(name))
 151                 return namelen;
 152
 153         for (i = 0; i < sizeof(chars) - 1; i++) {
 154                 unsigned char *pos = memchr(name, chars[i], namelen);
 155                 int exists;
 156
 157                 if (!pos) continue;
 158
 159                 *pos = 0;
 160                 exists = file_exists(name);
 161                 *pos = chars[i];
 162
 163                 if (exists) {
 164                         return pos - name;
 165                 }
 166         }
 167
 168         return -1;
 169 }
 170
 171 /* Encodes URIs without encoding stuff like fragments and query separators. */
 172 static void
 173 encode_file_uri_string(struct string *string, unsigned char *uristring)
 174 {
 175         int filenamelen = check_whether_file_exists(uristring);
 176
 177         encode_uri_string(string, uristring, filenamelen, 0);
 178 }
 179
 180
 181 static inline int
 182 get_protocol_length(const unsigned char *url)
 183 {
 184         unsigned char *end = (unsigned char *) url;
 185
 186         /* Seek the end of the protocol name if any. */
 187         /* RFC1738:
 188          * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
 189          * (but per its recommendations we accept "upalpha" too) */
 190         while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
 191                 end++;
 192
 193         /* Now we make something to support our "IP version in protocol scheme
 194          * name" hack and silently chop off the last digit if it's there. The
 195          * IETF's not gonna notice I hope or it'd be going after us hard. */
 196         if (end != url && isdigit(end[-1]))
 197                 end--;
 198
 199         /* Also return 0 if there's no protocol name (@end == @url). */
 200         return (*end == ':' || isdigit(*end)) ? end - url : 0;
 201 }
 202
 203 enum uri_errno
 204 parse_uri(struct uri *uri, unsigned char *uristring)
 205 {
 206         unsigned char *prefix_end, *host_end;
 207 #ifdef CONFIG_IPV6
 208         unsigned char *lbracket, *rbracket;
 209 #endif
 210
 211         assertm(uristring != NULL, "No uri to parse.");
 212         memset(uri, 0, sizeof(*uri));
 213
 214         /* Nothing to do for an empty url. */
 215         if_assert_failed return 0;
 216         if (!*uristring) return URI_ERRNO_EMPTY;
 217
 218         uri->string = uristring;
 219         uri->protocollen = get_protocol_length(uristring);
 220
 221         /* Invalid */
 222         if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
 223
 224         /* Figure out whether the protocol is known */
 225         uri->protocol = get_protocol(struri(uri), uri->protocollen);
 226
 227         prefix_end = uristring + uri->protocollen; /* ':' */
 228
 229         /* Check if there's a digit after the protocol name. */
 230         if (isdigit(*prefix_end)) {
 231                 uri->ip_family = uristring[uri->protocollen] - '0';
 232                 prefix_end++;
 233         }
 234         if (*prefix_end != ':')
 235                 return URI_ERRNO_INVALID_PROTOCOL;
 236         prefix_end++;
 237
 238         /* Skip slashes */
 239
 240         if (prefix_end[0] == '/' && prefix_end[1] == '/') {
 241                 if (prefix_end[2] == '/'
 242                     && get_protocol_need_slash_after_host(uri->protocol))
 243                         return URI_ERRNO_TOO_MANY_SLASHES;
 244
 245                 prefix_end += 2;
 246
 247         } else if (get_protocol_need_slashes(uri->protocol)) {
 248                 return URI_ERRNO_NO_SLASHES;
 249         }
 250
 251         if (get_protocol_free_syntax(uri->protocol)) {
 252                 uri->data = prefix_end;
 253                 uri->datalen = strlen(prefix_end);
 254                 return URI_ERRNO_OK;
 255
 256         } else if (uri->protocol == PROTOCOL_FILE) {
 257                 int datalen = strcspn(prefix_end, "#" POST_CHAR_S);
 258                 unsigned char *frag_or_post = prefix_end + datalen;
 259
 260                 /* Extract the fragment part. */
 261                 if (datalen >= 0) {
 262                         if (*frag_or_post == '#') {
 263                                 uri->fragment = frag_or_post + 1;
 264                                 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
 265                                 frag_or_post = uri->fragment + uri->fragmentlen;
 266                         }
 267                         if (*frag_or_post == POST_CHAR) {
 268                                 uri->post = frag_or_post + 1;
 269                         }
 270                 } else {
 271                         datalen = strlen(prefix_end);
 272                 }
 273
 274                 /* A bit of a special case, but using the "normal" host
 275                  * parsing seems a bit scary at this point. (see bug 107). */
 276                 if (datalen > 9 && !strncasecmp(prefix_end, "localhost/", 10)) {
 277                         prefix_end += 9;
 278                         datalen -= 9;
 279                 }
 280
 281                 uri->data = prefix_end;
 282                 uri->datalen = datalen;
 283
 284                 return URI_ERRNO_OK;
 285         }
 286
 287         /* Isolate host */
 288
 289 #ifdef CONFIG_IPV6
 290         /* Get brackets enclosing IPv6 address */
 291         lbracket = strchr(prefix_end, '[');
 292         if (lbracket) {
 293                 rbracket = strchr(lbracket, ']');
 294                 /* [address] is handled only inside of hostname part (surprisingly). */
 295                 if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
 296                         uri->ipv6 = 1;
 297                 else
 298                         lbracket = rbracket = NULL;
 299         } else {
 300                 rbracket = NULL;
 301         }
 302 #endif
 303
 304         /* Possibly skip auth part */
 305         host_end = prefix_end + strcspn(prefix_end, "@");
 306
 307         if (prefix_end + strcspn(prefix_end, "/") > host_end
 308             && *host_end) { /* we have auth info here */
 309                 unsigned char *user_end;
 310
 311                 /* Allow '@' in the password component */
 312                 while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
 313                         host_end = host_end + 1 + strcspn(host_end + 1, "@");
 314
 315                 user_end = strchr(prefix_end, ':');
 316
 317                 if (!user_end || user_end > host_end) {
 318                         uri->user = prefix_end;
 319                         uri->userlen = host_end - prefix_end;
 320                 } else {
 321                         uri->user = prefix_end;
 322                         uri->userlen = user_end - prefix_end;
 323                         uri->password = user_end + 1;
 324                         uri->passwordlen = host_end - user_end - 1;
 325                 }
 326                 prefix_end = host_end + 1;
 327         }
 328
 329 #ifdef CONFIG_IPV6
 330         if (uri->ipv6)
 331                 host_end = rbracket + strcspn(rbracket, ":/?");
 332         else
 333 #endif
 334                 host_end = prefix_end + strcspn(prefix_end, ":/?");
 335
 336 #ifdef CONFIG_IPV6
 337         if (uri->ipv6) {
 338                 int addrlen = rbracket - lbracket - 1;
 339
 340                 /* Check for valid length.
 341                  * addrlen >= sizeof(hostbuf) is theorically impossible
 342                  * but i keep the test in case of... Safer, imho --Zas */
 343                 assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
 344                         "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
 345                         "Problems are likely to be encountered. Please report "
 346                         "this, it is a security bug!", addrlen, uristring);
 347                 if_assert_failed return URI_ERRNO_IPV6_SECURITY;
 348
 349                 uri->host = lbracket + 1;
 350                 uri->hostlen = addrlen;
 351         } else
 352 #endif
 353         {
 354                 uri->host = prefix_end;
 355                 uri->hostlen = host_end - prefix_end;
 356
 357                 /* Trim trailing '.'s */
 358                 if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
 359                         return URI_ERRNO_TRAILING_DOTS;
 360         }
 361
 362         if (*host_end == ':') { /* we have port here */
 363                 unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");
 364
 365                 host_end++;
 366
 367                 uri->port = host_end;
 368                 uri->portlen = port_end - host_end;
 369
 370                 if (uri->portlen == 0)
 371                         return URI_ERRNO_NO_PORT_COLON;
 372
 373                 /* We only use 8 bits for portlen so better check */
 374                 if (uri->portlen != port_end - host_end)
 375                         return URI_ERRNO_INVALID_PORT;
 376
 377                 /* test if port is number */
 378                 /* TODO: possibly lookup for the service otherwise? --pasky */
 379                 for (; host_end < port_end; host_end++)
 380                         if (!isdigit(*host_end))
 381                                 return URI_ERRNO_INVALID_PORT;
 382
 383                 /* Check valid port value, and let show an error message
 384                  * about invalid url syntax. */
 385                 if (uri->port && uri->portlen) {
 386                         int n;
 387
 388                         errno = 0;
 389                         n = strtol(uri->port, NULL, 10);
 390                         if (errno || !uri_port_is_valid(n))
 391                                 return URI_ERRNO_INVALID_PORT;
 392                 }
 393         }
 394
 395         if (*host_end == '/') {
 396                 host_end++;
 397
 398         } else if (get_protocol_need_slash_after_host(uri->protocol)) {
 399                 /* The need for slash after the host component depends on the
 400                  * need for a host component. -- The dangerous mind of Jonah */
 401                 if (!uri->hostlen)
 402                         return URI_ERRNO_NO_HOST;
 403
 404                 return URI_ERRNO_NO_HOST_SLASH;
 405         }
 406
 407         /* Look for #fragment or POST_CHAR */
 408         prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
 409         uri->data = host_end;
 410         uri->datalen = prefix_end - host_end;
 411
 412         if (*prefix_end == '#') {
 413                 uri->fragment = prefix_end + 1;
 414                 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
 415                 prefix_end = uri->fragment + uri->fragmentlen;
 416         }
 417
 418         if (*prefix_end == POST_CHAR) {
 419                 uri->post = prefix_end + 1;
 420         }
 421
 422         return URI_ERRNO_OK;
 423 }
 424
 425 int
 426 get_uri_port(const struct uri *uri)
 427 {
 428         if (uri->port && uri->portlen) {
 429                 const unsigned char *end = uri->port;
 430                 int port = strtol(uri->port, (char **) &end, 10);
 431
 432                 if (end != uri->port) {
 433                         assert(uri_port_is_valid(port));
 434                         return port;
 435                 }
 436         }
 437
 438         return get_protocol_port(uri->protocol);
 439 }
 440
 441 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
 442
 443 static inline int
 444 compare_component(const unsigned char *a, int alen,
 445                   const unsigned char *b, int blen)
 446 {
 447         /* Check that the length and the strings are both set or unset */
 448         if (alen != blen || !!a != !!b) return 0;
 449
 450         /* Both are unset so that will make a perfect match */
 451         if (!a || !alen) return 1;
 452
 453         /* Let the higher forces decide */
 454         return !memcmp(a, b, blen);
 455 }
 456
 457 #define wants(x) (components & (x))
 458
 459 int
 460 compare_uri(const struct uri *a, const struct uri *b,
 461             enum uri_component components)
 462 {
 463         if (a == b) return 1;
 464         if (!components) return 0;
 465
 466         assertm(can_compare_uri_components(components),
 467                 "compare_uri() is a work in progress. Component unsupported");
 468
 469         return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
 470                 && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
 471                 && (!wants(URI_USER)
 472                     || compare_component(a->user, a->userlen, b->user, b->userlen))
 473                 && (!wants(URI_PASSWORD)
 474                     || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
 475                 && (!wants(URI_HOST)
 476                     || compare_component(a->host, a->hostlen, b->host, b->hostlen))
 477                 && (!wants(URI_PORT)
 478                     || compare_component(a->port, a->portlen, b->port, b->portlen))
 479                 && (!wants(URI_DATA)
 480                     || compare_component(a->data, a->datalen, b->data, b->datalen))
 481                 && (!wants(URI_FRAGMENT)
 482                     || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
 483                 && (!wants(URI_POST)
 484                     || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
 485 }
 486
 487
 488 /* We might need something more intelligent than this Swiss army knife. */
 489 struct string *
 490 add_uri_to_string(struct string *string, const struct uri *uri,
 491                   enum uri_component components)
 492 {
 493         /* Custom or unknown keep the URI untouched. */
 494         if (uri->protocol == PROTOCOL_UNKNOWN)
 495                 return add_to_string(string, struri(uri));
 496
 497         if (wants(URI_PROTOCOL)) {
 498                 add_bytes_to_string(string, uri->string, uri->protocollen);
 499                 if (wants(URI_IP_FAMILY) && uri->ip_family)
 500                         add_long_to_string(string, uri->ip_family);
 501                 add_char_to_string(string, ':');
 502                 if (get_protocol_need_slashes(uri->protocol))
 503                         add_to_string(string, "//");
 504         }
 505
 506         if (wants(URI_USER) && uri->userlen) {
 507                 add_bytes_to_string(string, uri->user, uri->userlen);
 508
 509                 if (wants(URI_PASSWORD) && uri->passwordlen) {
 510                         add_char_to_string(string, ':');
 511                         add_bytes_to_string(string, uri->password,
 512                                                     uri->passwordlen);
 513                 }
 514
 515                 add_char_to_string(string, '@');
 516
 517         } else if (wants(URI_PASSWORD) && uri->passwordlen) {
 518                 add_bytes_to_string(string, uri->password, uri->passwordlen);
 519         }
 520
 521         if (wants(URI_HOST) && uri->hostlen) {
 522                 int add_host = 1;
 523
 524 #ifdef CONFIG_IPV6
 525                 /* Rationale for wants(URI_PORT): The [notation] was invented
 526                  * so that you can have an IPv6 addy and a port together. So
 527                  * we want to use it when that happens, otherwise we need not
 528                  * bother (that happens only when we want it for DNS anyway).
 529                  * I insist on an implied elegancy of this way, but YMMV. ;-)
 530                  * --pasky */
 531                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
 532 #endif
 533 #ifdef CONFIG_IDN
 534                 /* Support for the GNU International Domain Name library.
 535                  *
 536                  * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
 537                  *
 538                  * Now it is probably not perfect because idna_to_ascii_lz()
 539                  * will be using a ``zero terminated input string encoded in
 540                  * the current locale's character set''. Anyway I don't know
 541                  * how to convert anything to UTF-8 or Unicode. --jonas */
 542                 if (wants(URI_IDN)) {
 543                         unsigned char *host = memacpy(uri->host, uri->hostlen);
 544
 545                         if (host) {
 546                                 char *idname;
 547                                 int code = idna_to_ascii_lz(host, &idname, 0);
 548
 549                                 /* FIXME: Return NULL if it coughed? --jonas */
 550                                 if (code == IDNA_SUCCESS) {
 551                                         add_to_string(string, idname);
 552                                         free(idname);
 553                                         add_host = 0;
 554                                 }
 555
 556                                 mem_free(host);
 557                         }
 558                 }
 559
 560 #endif
 561                 if (add_host)
 562                         add_bytes_to_string(string, uri->host, uri->hostlen);
 563
 564 #ifdef CONFIG_IPV6
 565                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
 566 #endif
 567         }
 568
 569         if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
 570                 if (uri->portlen) {
 571                         add_char_to_string(string, ':');
 572                         add_bytes_to_string(string, uri->port, uri->portlen);
 573
 574                 } else if (wants(URI_DEFAULT_PORT)
 575                            && uri->protocol != PROTOCOL_USER) {
 576                         /* For user protocols we don't know a default port.
 577                          * Should user protocols ports be configurable? */
 578                         int port = get_protocol_port(uri->protocol);
 579
 580                         add_char_to_string(string, ':');
 581                         add_long_to_string(string, port);
 582                 }
 583         }
 584
 585         /* Only add slash if we need to separate */
 586         if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
 587             && wants(~(URI_DATA | URI_PORT))
 588             && get_protocol_need_slash_after_host(uri->protocol))
 589                 add_char_to_string(string, '/');
 590
 591         if (wants(URI_DATA) && uri->datalen)
 592                 add_bytes_to_string(string, uri->data, uri->datalen);
 593
 594         /* We can not test uri->datalen here since we need to always
 595          * add '/'. */
 596         if (wants(URI_PATH) || wants(URI_FILENAME)) {
 597                 const unsigned char *filename = uri->data;
 598                 const unsigned char *pos;
 599
 600                 assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
 601                         "URI_FILENAME should be used alone %d", components);
 602
 603                 if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
 604 #ifdef CONFIG_OS_WIN32
 605                         if (uri->protocol != PROTOCOL_FILE)
 606 #endif
 607                         /* FIXME: Add correct separator */
 608                         add_char_to_string(string, '/');
 609                 }
 610
 611                 if (!uri->datalen) return string;
 612
 613                 for (pos = filename; *pos && !end_of_dir(*pos); pos++)
 614                         if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
 615                                 filename = pos + 1;
 616
 617                 return add_bytes_to_string(string, filename, pos - filename);
 618         }
 619
 620         if (wants(URI_QUERY) && uri->datalen) {
 621                 const unsigned char *query = memchr(uri->data, '?', uri->datalen);
 622
 623                 assertm(URI_QUERY == components,
 624                         "URI_QUERY should be used alone %d", components);
 625
 626                 if (!query) return string;
 627
 628                 query++;
 629                 /* Check fragment and POST_CHAR */
 630                 return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
 631         }
 632
 633         if (wants(URI_FRAGMENT) && uri->fragmentlen) {
 634                 add_char_to_string(string, '#');
 635                 add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
 636         }
 637
 638         if (wants(URI_POST) && uri->post) {
 639                 add_char_to_string(string, POST_CHAR);
 640                 add_to_string(string, uri->post);
 641
 642         } else if (wants(URI_POST_INFO) && uri->post) {
 643                 if (!strncmp(uri->post, "text/plain", 10)) {
 644                         add_to_string(string, " (PLAIN TEXT DATA)");
 645
 646                 } else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
 647                         add_to_string(string, " (MULTIPART FORM DATA)");
 648
 649                 } else {
 650                         add_to_string(string, " (POST DATA)");
 651                 }
 652
 653         }
 654
 655         return string;
 656 }
 657
 658 #undef wants
 659
 660 unsigned char *
 661 get_uri_string(const struct uri *uri, enum uri_component components)
 662 {
 663         struct string string;
 664
 665         if (init_string(&string)
 666             && add_uri_to_string(&string, uri, components))
 667                 return string.source;
 668
 669         done_string(&string);
 670         return NULL;
 671 }
 672
 673
 674 struct string *
 675 add_string_uri_to_string(struct string *string, unsigned char *uristring,
 676                          enum uri_component components)
 677 {
 678         struct uri uri;
 679
 680         if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
 681                 return NULL;
 682
 683         return add_uri_to_string(string, &uri, components);
 684 }
 685
 686
 687 #define normalize_uri_reparse(str)      normalize_uri(NULL, str)
 688 #define normalize_uri_noparse(uri)      normalize_uri(uri, struri(uri))
 689
 690 unsigned char *
 691 normalize_uri(struct uri *uri, unsigned char *uristring)
 692 {
 693         unsigned char *parse_string = uristring;
 694         unsigned char *src, *dest, *path;
 695         int need_slash = 0, keep_dslash = 1;
 696         int parse = (uri == NULL);
 697         struct uri uri_struct;
 698
 699         if (!uri) uri = &uri_struct;
 700
 701         /* We need to get the real (proxied) URI but lowercase relevant URI
 702          * parts along the way. */
 703         do {
 704                 if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
 705                         return uristring;
 706
 707                 assert(uri->data);
 708
 709                 /* This is a maybe not the right place but both join_urls() and
 710                  * get_translated_uri() through translate_url() calls this
 711                  * function and then it already works on and modifies an
 712                  * allocated copy. */
 713                 convert_to_lowercase(uri->string, uri->protocollen);
 714                 if (uri->hostlen) convert_to_lowercase(uri->host, uri->hostlen);
 715
 716                 parse = 1;
 717                 parse_string = uri->data;
 718         } while (uri->protocol == PROTOCOL_PROXY);
 719
 720         if (get_protocol_free_syntax(uri->protocol))
 721                 return uristring;
 722
 723         if (uri->protocol != PROTOCOL_UNKNOWN) {
 724                 need_slash = get_protocol_need_slash_after_host(uri->protocol);
 725                 keep_dslash = get_protocol_keep_double_slashes(uri->protocol);
 726         }
 727
 728         path = uri->data - need_slash;
 729         dest = src = path;
 730
 731         /* This loop mangles the URI string by removing ".." and "." segments.
 732          * However it must not alter "//" without reason; see bug 744.  */
 733         while (*dest) {
 734                 /* If the following pieces are the LAST parts of URL, we remove
 735                  * them as well. See RFC 2396 section 5.2 for details. */
 736
 737                 if (end_of_dir(src[0])) {
 738                         /* URL data contains no more path. */
 739                         memmove(dest, src, strlen(src) + 1);
 740                         break;
 741                 }
 742
 743                 if (!is_uri_dir_sep(uri, src[0])) {
 744                         /* This is to reduce indentation */
 745
 746                 } else if (src[1] == '.') {
 747                         if (!src[2]) {
 748                                 /* /. - skip the dot */
 749                                 *dest++ = *src;
 750                                 *dest = 0;
 751                                 break;
 752
 753                         } else if (is_uri_dir_sep(uri, src[2])) {
 754                                 /* /./ - strip that.. */
 755                                 src += 2;
 756                                 continue;
 757
 758                         } else if (src[2] == '.'
 759                                    && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
 760                                 /* /../ or /.. - skip it and preceding element.
 761                                  *
 762                                  * <path> "/foo/bar" <dest> ...
 763                                  * <src> ("/../" or "/..\0") ...
 764                                  *
 765                                  * Remove "bar" and the directory
 766                                  * separator that precedes it.  The
 767                                  * separator will be added back in the
 768                                  * next iteration unless another ".."
 769                                  * follows, in which case it will be
 770                                  * added later.  "bar" may be empty.  */
 771
 772                                 while (dest > path) {
 773                                         dest--;
 774                                         if (is_uri_dir_sep(uri, *dest)) break;
 775                                 }
 776
 777                                 /* <path> "/foo" <dest> "/bar" ...
 778                                  * <src> ("/../" or "/..\0") ... */
 779                                 if (!src[3]) {
 780                                         /* /.. - add ending slash and stop */
 781                                         *dest++ = *src;
 782                                         *dest = 0;
 783                                         break;
 784                                 }
 785
 786                                 src += 3;
 787                                 continue;
 788                         }
 789
 790                 } else if (is_uri_dir_sep(uri, src[1]) && !keep_dslash) {
 791                         /* // - ignore first '/'. */
 792                         src += 1;
 793                         continue;
 794                 }
 795
 796                 /* We don't want to access memory past the NUL char. */
 797                 *dest = *src++;
 798                 if (*dest) dest++;
 799         }
 800
 801         return uristring;
 802 }
 803
 804 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
 805  * of just the complete path to file/directory, which the dumb 'file' protocol
 806  * backend can understand. No host parts etc, that is what this function is
 807  * supposed to chew. */
 808 static struct uri *
 809 transform_file_url(struct uri *uri, const unsigned char *cwd)
 810 {
 811         unsigned char *path = uri->data;
 812
 813         assert(uri->protocol == PROTOCOL_FILE && uri->data);
 814
 815         /* Sort out the host part. We currently support only host "localhost"
 816          * (plus empty host part will be assumed to be "localhost" as well).
 817          * As our extensions, '.' will reference to the cwd on localhost
 818          * (originally, when the first thing after file:// wasn't "localhost/",
 819          * we assumed the cwd as well, and pretended that there's no host part
 820          * at all) and '..' to the directory parent to cwd. Another extension
 821          * is that if this is a DOS-like system, the first char in two-char
 822          * host part is uppercase letter and the second char is a colon, it is
 823          * assumed to be a local disk specification. */
 824         /* TODO: Use FTP for non-localhost hosts. --pasky */
 825
 826         /* For URL "file://", we open the current directory. Some other
 827          * browsers instead open root directory, but AFAIK the standard does
 828          * not specify that and this was the original behaviour and it is more
 829          * consistent with our file://./ notation. */
 830
 831         /* Who would name their file/dir '...' ? */
 832         if (*path == '.' || !*path) {
 833                 struct string dir;
 834
 835                 if (!init_string(&dir))
 836                         return NULL;
 837
 838                 encode_uri_string(&dir, cwd, -1, 0);
 839
 840                 /* Either we will end up with '//' and translate_directories()
 841                  * will shorten it or the '/' will mark the inserted cwd as a
 842                  * directory. */
 843                 if (*path == '.') *path = '/';
 844
 845                 /* Insert the current working directory. */
 846                 /* The offset is 7 == sizeof("file://") - 1. */
 847                 insert_in_string(&struri(uri), 7, dir.source, dir.length);
 848
 849                 done_string(&dir);
 850                 return uri;
 851         }
 852
 853 #ifdef DOS_FS
 854         if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
 855                 return NULL;
 856 #endif
 857
 858         for (; *path && !dir_sep(*path); path++);
 859
 860         /* FIXME: We will in fact assume localhost even for non-local hosts,
 861          * until we will support the FTP transformation. --pasky */
 862
 863         memmove(uri->data, path, strlen(path) + 1);
 864         return uri;
 865 }
 866
 867 static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);
 868
 869 unsigned char *
 870 join_urls(struct uri *base, unsigned char *rel)
 871 {
 872         unsigned char *uristring, *path;
 873         int add_slash = 0;
 874         int translate = 0;
 875         int length = 0;
 876
 877         /* See RFC 1808 */
 878         /* TODO: Support for ';' ? (see the RFC) --pasky */
 879
 880         /* For '#', '?' and '//' we could use get_uri_string() but it might be
 881          * too expensive since it uses granular allocation scheme. I wouldn't
 882          * personally mind tho' because it would be cleaner. --jonas */
 883         if (rel[0] == '#') {
 884                 /* Strip fragment and post part from the base URI and append
 885                  * the fragment string in @rel. */
 886                 length  = base->fragment
 887                         ? base->fragment - struri(base) - 1
 888                         : get_real_uri_length(base);
 889
 890         } else if (rel[0] == '?') {
 891                 /* Strip query, fragment and post part from the base URI and
 892                  * append the query string in @rel. */
 893                 length  = base->fragment ? base->fragment - struri(base) - 1
 894                                          : get_real_uri_length(base);
 895
 896                 uristring = memchr(base->data, '?', base->datalen);
 897                 if (uristring) length = uristring - struri(base);
 898
 899         } else if (rel[0] == '/' && rel[1] == '/') {
 900                 if (!get_protocol_need_slashes(base->protocol))
 901                         return NULL;
 902
 903                 /* Get `<protocol>:' from the base URI and append the `//' part
 904                  * from @rel. */
 905                 length = base->protocollen + 1;
 906
 907                 /* We need to sanitize the relative part and add stuff like
 908                  * host slash. */
 909                 translate = 1;
 910         }
 911
 912         /* If one of the tests above set @length to something useful */
 913         if (length) {
 914                 uristring = memacpy(struri(base), length);
 915                 if (!uristring) return NULL;
 916
 917                 add_to_strn(&uristring, rel);
 918
 919                 if (translate) {
 920                         unsigned char *translated;
 921
 922                         translated = translate_url(uristring, NULL);
 923                         mem_free(uristring);
 924                         return translated;
 925                 }
 926                 return normalize_uri_reparse(uristring);
 927         }
 928
 929         /* Check if there is some protocol name to go for */
 930         length = get_protocol_length(rel);
 931         if (length) {
 932                 switch (get_protocol(rel, length)) {
 933                 case PROTOCOL_UNKNOWN:
 934                 case PROTOCOL_PROXY:
 935                         /* Mysteriously proxy URIs are breaking here ... */
 936                         break;
 937
 938                 case PROTOCOL_FILE:
 939                         /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
 940                          * to translate_url(). */
 941                 default:
 942                         uristring = translate_url(rel, NULL);
 943                         if (uristring) return uristring;
 944                 }
 945         }
 946
 947         assertm(base->data != NULL, "bad base url");
 948         if_assert_failed return NULL;
 949
 950         path = base->data;
 951
 952         /* Either is path blank, but we've slash char before, or path is not
 953          * blank, but doesn't start by a slash (if we'd just stay along with
 954          * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
 955          * should be enough, but I'm not sure and I don't want to break
 956          * anything --pasky). */
 957         /* We skip first char of URL ('/') in parse_url() (ARGH). This
 958          * is reason of all this bug-bearing magic.. */
 959         if (*path) {
 960                 if (!is_uri_dir_sep(base, *path)) path--;
 961         } else {
 962                 if (is_uri_dir_sep(base, path[-1])) path--;
 963         }
 964
 965         if (!is_uri_dir_sep(base, rel[0])) {
 966                 unsigned char *path_end;
 967
 968                 /* The URL is relative. */
 969
 970                 if (!*path) {
 971                         /* There's no path in the URL, but we're going to add
 972                          * something there, and the something doesn't start by
 973                          * a slash. So we need to insert a slash after the base
 974                          * URL. Clever, eh? ;) */
 975                         add_slash = 1;
 976                 }
 977
 978                 for (path_end = path; *path_end; path_end++) {
 979                         if (end_of_dir(*path_end)) break;
 980                         /* Modify the path pointer, so that it'll always point
 981                          * above the last '/' in the URL; later, we'll copy the
 982                          * URL only _TO_ this point, and anything after last
 983                          * slash will be substituted by 'rel'. */
 984                         if (is_uri_dir_sep(base, *path_end))
 985                                 path = path_end + 1;
 986                 }
 987         }
 988
 989         length = path - struri(base);
 990         uristring = mem_alloc(length + strlen(rel) + add_slash + 1);
 991         if (!uristring) return NULL;
 992
 993         memcpy(uristring, struri(base), length);
 994         if (add_slash) uristring[length] = '/';
 995         strcpy(uristring + length + add_slash, rel);
 996
 997         return normalize_uri_reparse(uristring);
 998 }
 999
1000
1001 /* Tries to figure out what protocol @newurl might be specifying by checking if
1002  * it exists as a file locally or by checking parts of the host name. */
1003 static enum protocol
1004 find_uri_protocol(unsigned char *newurl)
1005 {
1006         unsigned char *ch;
1007
1008         /* First see if it is a file so filenames that look like hostnames
1009          * won't confuse us below. */
1010         if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
1011
1012         /* Yes, it would be simpler to make test for IPv6 address first,
1013          * but it would result in confusing mix of ifdefs ;-). */
1014         /* FIXME: Ideas for improve protocol detection
1015          *
1016          * - Handle common hostnames. It could be part of the protocol backend
1017          *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
1018          *
1019          * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
1020          */
1021
1022         ch = newurl + strcspn(newurl, ".:/@");
1023         if (*ch == '@'
1024             || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
1025             || !strncasecmp(newurl, "ftp.", 4)) {
1026                 /* Contains user/password/ftp-hostname */
1027                 return PROTOCOL_FTP;
1028
1029 #ifdef CONFIG_IPV6
1030         } else if (*newurl == '[' && *ch == ':') {
1031                 /* Candidate for IPv6 address */
1032                 unsigned char *bracket2, *colon2;
1033
1034                 ch++;
1035                 bracket2 = strchr(ch, ']');
1036                 colon2 = strchr(ch, ':');
1037                 if (bracket2 && colon2 && bracket2 > colon2)
1038                         return PROTOCOL_HTTP;
1039 #endif
1040
1041         } else if (*newurl != '.' && *ch == '.') {
1042                 /* Contains domain name? */
1043                 unsigned char *host_end, *domain;
1044                 unsigned char *ipscan;
1045
1046                 /* Process the hostname */
1047                 for (domain = ch + 1;
1048                         *(host_end = domain + strcspn(domain, ".:/?")) == '.';
1049                         domain = host_end + 1);
1050
1051                 /* It's IP? */
1052                 for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1053                         ipscan++);
1054
1055                 if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1056                         return PROTOCOL_HTTP;
1057
1058                 /* It's two-letter or known TLD? */
1059                 if (host_end - domain == 2
1060                     || end_with_known_tld(domain, host_end - domain) >= 0)
1061                         return PROTOCOL_HTTP;
1062         }
1063
1064         return PROTOCOL_UNKNOWN;
1065 }
1066
1067
1068 #define MAX_TRANSLATION_ATTEMPTS        32
1069
1070 /* Returns an URI string that can be used internally. Adding protocol prefix,
1071  * missing slashes etc. */
1072 static unsigned char *
1073 translate_url(unsigned char *url, unsigned char *cwd)
1074 {
1075         unsigned char *newurl;
1076         struct uri uri;
1077         enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1078         int retries = 0;
1079
1080         /* Strip starting spaces */
1081         while (*url == ' ') url++;
1082         if (!*url) return NULL;
1083
1084         newurl = expand_tilde(url); /* XXX: Post data copy. */
1085         if (!newurl) return NULL;
1086
1087 parse_uri:
1088         /* Yay a goto loop. If we get some URI parse error and try to
1089          * fix it we go back to here and try again. */
1090         /* Ordinary parse */
1091         uri_errno = parse_uri(&uri, newurl);
1092
1093         /* Bail out if the same error occurs twice */
1094         if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1095                 if (retries > MAX_TRANSLATION_ATTEMPTS) {
1096                         ERROR("Maximum number of parsing attempts exceeded "
1097                               "for %s.", url);
1098                 }
1099                 mem_free(newurl);
1100                 return NULL;
1101         }
1102
1103         prev_errno = uri_errno;
1104
1105         switch (uri_errno) {
1106         case URI_ERRNO_OK:
1107                 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1108                  * interpreted as the protocol name. */
1109                 if (uri.protocol == PROTOCOL_UNKNOWN) {
1110                         enum protocol protocol = find_uri_protocol(newurl);
1111
1112                         /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1113                          * case. */
1114                         if (protocol != PROTOCOL_UNKNOWN) {
1115                                 struct string str;
1116
1117                                 if (!init_string(&str)) return NULL;
1118
1119                                 switch (protocol) {
1120                                 case PROTOCOL_FTP:
1121                                         add_to_string(&str, "ftp://");
1122                                         encode_uri_string(&str, newurl, -1, 0);
1123                                         break;
1124
1125                                 case PROTOCOL_HTTP:
1126                                         add_to_string(&str, "http://");
1127                                         add_to_string(&str, newurl);
1128                                         break;
1129
1130                                 case PROTOCOL_UNKNOWN:
1131                                         break;
1132
1133                                 case PROTOCOL_FILE:
1134                                 default:
1135                                         add_to_string(&str, "file://");
1136                                         if (!dir_sep(*newurl))
1137                                                 add_to_string(&str, "./");
1138
1139                                         add_to_string(&str, newurl);
1140                                 }
1141
1142                                 mem_free(newurl);
1143                                 newurl = str.source;
1144
1145                                 /* Work around the infinite loop prevention */
1146                                 prev_errno = URI_ERRNO_EMPTY;
1147                                 goto parse_uri;
1148                         }
1149                 }
1150
1151                 /* If file:// URI is transformed we need to reparse. */
1152                 if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1153                     && transform_file_url(&uri, cwd))
1154                         return normalize_uri_reparse(struri(&uri));
1155
1156                 /* Translate the proxied URI too if proxy:// */
1157                 if (uri.protocol == PROTOCOL_PROXY) {
1158                         unsigned char *data = translate_url(uri.data, cwd);
1159                         int pos = uri.data - struri(&uri);
1160
1161                         if (!data) break;
1162                         struri(&uri)[pos] = 0;
1163                         insert_in_string(&struri(&uri), pos, data, strlen(data));
1164                         mem_free(data);
1165                         return normalize_uri_reparse(struri(&uri));
1166                 }
1167
1168                 return normalize_uri_noparse(&uri);
1169
1170         case URI_ERRNO_TOO_MANY_SLASHES:
1171         {
1172                 unsigned char *from, *to;
1173
1174                 assert(uri.string[uri.protocollen] == ':'
1175                        && uri.string[uri.protocollen + 1] == '/'
1176                        && uri.string[uri.protocollen + 2] == '/');
1177
1178                 from = to = uri.string + uri.protocollen + 3;
1179                 while (*from == '/') from++;
1180
1181                 assert(to < from);
1182                 memmove(to, from, strlen(from) + 1);
1183                 goto parse_uri;
1184         }
1185         case URI_ERRNO_NO_SLASHES:
1186         {
1187                 /* Try prefix:some.url -> prefix://some.url.. */
1188                 int slashes = 2;
1189
1190                 /* Check if only one '/' is needed. */
1191                 if (uri.string[uri.protocollen + 1] == '/')
1192                         slashes--;
1193
1194                 insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1195                 goto parse_uri;
1196         }
1197         case URI_ERRNO_TRAILING_DOTS:
1198         {
1199                 /* Trim trailing '.'s */
1200                 unsigned char *from = uri.host + uri.hostlen;
1201                 unsigned char *to = from;
1202
1203                 assert(uri.host < to && to[-1] == '.' && *from != '.');
1204
1205                 while (uri.host < to && to[-1] == '.') to--;
1206
1207                 assert(to < from);
1208                 memmove(to, from, strlen(from) + 1);
1209                 goto parse_uri;
1210         }
1211         case URI_ERRNO_NO_PORT_COLON:
1212                 assert(uri.portlen == 0
1213                        && uri.string < uri.port
1214                        && uri.port[-1] == ':');
1215
1216                 memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1217                 goto parse_uri;
1218
1219         case URI_ERRNO_NO_HOST_SLASH:
1220         {
1221                 int offset = uri.port
1222                            ? uri.port + uri.portlen - struri(&uri)
1223                            : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1224
1225                 assertm(uri.host != NULL, "uri.host not set after no host slash error");
1226                 insert_in_string(&newurl, offset, "/", 1);
1227                 goto parse_uri;
1228         }
1229         case URI_ERRNO_INVALID_PROTOCOL:
1230         {
1231                 /* No protocol name */
1232                 enum protocol protocol = find_uri_protocol(newurl);
1233                 struct string str;
1234
1235                 if (!init_string(&str)) return NULL;
1236
1237                 switch (protocol) {
1238                         case PROTOCOL_FTP:
1239                                 add_to_string(&str, "ftp://");
1240                                 encode_uri_string(&str, newurl, -1, 0);
1241                                 break;
1242
1243                         case PROTOCOL_HTTP:
1244                                 add_to_string(&str, "http://");
1245                                 add_to_string(&str, newurl);
1246                                 break;
1247
1248                         case PROTOCOL_UNKNOWN:
1249                                 /* We default to file:// even though we already
1250                                  * tested if the file existed since it will give
1251                                  * a "No such file or directory" error.  which
1252                                  * might better hint the user that there was
1253                                  * problem figuring out the URI. */
1254                         case PROTOCOL_FILE:
1255                         default:
1256                                 add_to_string(&str, "file://");
1257                                 if (!dir_sep(*newurl))
1258                                         add_to_string(&str, "./");
1259
1260                                 encode_file_uri_string(&str, newurl);
1261                 }
1262
1263                 mem_free(newurl);
1264                 newurl = str.source;
1265
1266                 goto parse_uri;
1267         }
1268         case URI_ERRNO_EMPTY:
1269         case URI_ERRNO_IPV6_SECURITY:
1270         case URI_ERRNO_NO_HOST:
1271         case URI_ERRNO_INVALID_PORT:
1272         case URI_ERRNO_INVALID_PORT_RANGE:
1273                 /* None of these can be handled properly. */
1274                 break;
1275         }
1276
1277         mem_free(newurl);
1278         return NULL;
1279 }
1280
1281
1282 struct uri *
1283 get_composed_uri(struct uri *uri, enum uri_component components)
1284 {
1285         unsigned char *string;
1286
1287         assert(uri);
1288         if_assert_failed return NULL;
1289
1290         string = get_uri_string(uri, components);
1291         if (!string) return NULL;
1292
1293         uri = get_uri(string, 0);
1294         mem_free(string);
1295
1296         return uri;
1297 }
1298
1299 struct uri *
1300 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1301 {
1302         struct uri *uri;
1303
1304         uristring = translate_url(uristring, cwd);
1305         if (!uristring) return NULL;
1306
1307         uri = get_uri(uristring, 0);
1308         mem_free(uristring);
1309
1310         return uri;
1311 }
1312
1313
1314 unsigned char *
1315 get_extension_from_uri(struct uri *uri)
1316 {
1317         unsigned char *extension = NULL;
1318         int afterslash = 1;
1319         unsigned char *pos = uri->data;
1320
1321         assert(pos);
1322
1323         for (; *pos && !end_of_dir(*pos); pos++) {
1324                 if (!afterslash && !extension && *pos == '.') {
1325                         extension = pos;
1326                 } else if (is_uri_dir_sep(uri, *pos)) {
1327                         extension = NULL;
1328                         afterslash = 1;
1329                 } else {
1330                         afterslash = 0;
1331                 }
1332         }
1333
1334         if (extension && extension < pos)
1335                 return memacpy(extension, pos - extension);
1336
1337         return NULL;
1338 }
1339
/* URI encoding, escaping unallowed characters. */

/* Returns 1 if @c may be copied into an encoded URI unescaped, 0 otherwise.
 * That holds for everything isident() accepts plus the marks listed in
 * RFC 2396, Page 8, Section 2.3. */
static inline int
safe_char(unsigned char c)
{
	switch (c) {
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 1;

	default:
		return isident(c) ? 1 : 0;
	}
}
1348
/* Appends the URI-encoded form of @name to @string.
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 * "up to the terminating NUL". Bytes accepted by safe_char() are copied as
 * they are, and so is '/' unless @convert_slashes is set. Every other byte
 * becomes '%' followed by two digits produced by hx(). A space is escaped
 * like any other byte; it is not turned into '+'. */
void
encode_uri_string(struct string *string, const unsigned char *name, int namelen,
		  int convert_slashes)
{
	unsigned char escape[4] = { '%', '\0', '\0', '\0' };
	int i;

	if (namelen < 0) namelen = strlen(name);

	for (i = 0; i < namelen; i++) {
		const unsigned char c = name[i];

		if (safe_char(c) || (c == '/' && !convert_slashes)) {
			add_char_to_string(string, c);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = hx((((int) c) & 0xF0) >> 4);
		escape[2] = hx(((int) c) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
1377
/* Appends the URI-encoded form of @name to @string, leaving ':' and '\\'
 * unescaped in addition to the bytes accepted by safe_char(). Note that '/'
 * is escaped here.
 *
 * @namelen is the number of bytes of @name to encode; a negative value means
 * "up to the terminating NUL". Escaped bytes become '%' followed by two
 * digits produced by hx(). */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
	unsigned char escape[4] = { '%', '\0', '\0', '\0' };
	int i;

	if (namelen < 0) namelen = strlen(name);

	for (i = 0; i < namelen; i++) {
		const unsigned char c = name[i];

		if (safe_char(c) || c == ':' || c == '\\') {
			add_char_to_string(string, c);
			continue;
		}

		/* Hex it: high nibble first, then low nibble. */
		escape[1] = hx((((int) c) & 0xF0) >> 4);
		escape[2] = hx(((int) c) & 0xF);
		add_bytes_to_string(string, escape, sizeof(escape) - 1);
	}
}
1400
/* Decodes the %XX escapes of the NUL-terminated string @src in place.
 *
 * This function is evil, it modifies its parameter. The decoded string is
 * never longer than the encoded one though, so decoding in place is an
 * efficient way to do it. --Zas
 *
 * A '%' that is not followed by two characters accepted by unhx() is copied
 * unchanged, and so is "%00", which must not be allowed to produce a NUL.
 * '+' is not decoded to a space. */
void
decode_uri(unsigned char *src)
{
	unsigned char *in = src;
	unsigned char *out = src;

	for (;;) {
		unsigned char c = *in++;

		if (c == '%') {
			/* The second digit is looked at only when the first
			 * one is valid, so nothing past the terminating NUL
			 * is read. */
			int high = unhx(*in);
			int low = (high >= 0) ? unhx(*(in + 1)) : -1;

			if (low >= 0) {
				int value = (high << 4) + low;

				if (value != 0) { /* don't allow %00 */
					c = (unsigned char) value;
					in += 2;
				}
			}
		}

		*out++ = c;

		/* The terminating NUL has been copied as well. */
		if (c == '\0')
			return;
	}
}
1441
1442 void
1443 decode_uri_string(struct string *string)
1444 {
1445         decode_uri(string->source);
1446         string->length = strlen(string->source);
1447 }
1448
/* Decodes @src in place with decode_uri() and then replaces every byte that
 * is a control character or not printable (per iscntrl()/isprint()) with
 * '*'. */
void
decode_uri_for_display(unsigned char *src)
{
	unsigned char *cur;

	decode_uri(src);

	for (cur = src; *cur; cur++) {
		if (iscntrl(*cur) || !isprint(*cur))
			*cur = '*';
	}
}
1458
1459 void
1460 decode_uri_string_for_display(struct string *string)
1461 {
1462         decode_uri_for_display(string->source);
1463         string->length = strlen(string->source);
1464 }
1465
1466
1467 /* URI list */
1468
1469 #define URI_LIST_GRANULARITY 0x3
1470
1471 #define realloc_uri_list(list) \
1472         mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1473                         URI_LIST_GRANULARITY)
1474
1475 struct uri *
1476 add_to_uri_list(struct uri_list *list, struct uri *uri)
1477 {
1478         if (!realloc_uri_list(list))
1479                 return NULL;
1480
1481         list->uris[list->size++] = get_uri_reference(uri);
1482
1483         return uri;
1484 };
1485
1486 void
1487 free_uri_list(struct uri_list *list)
1488 {
1489         struct uri *uri;
1490         int index;
1491
1492         if (!list->uris) return;
1493
1494         foreach_uri (uri, index, list) {
1495                 done_uri(uri);
1496         }
1497
1498         mem_free_set(&list->uris, NULL);
1499         list->size = 0;
1500 }
1501
1502 /* URI cache */
1503
/* One cached URI: the parsed URI together with the storage for the string it
 * was parsed from. */
struct uri_cache_entry {
	struct uri uri;
	/* Declared with a single element; get_uri_cache_entry() allocates
	 * the entry with as many extra bytes as the URI string is long, so
	 * the string and its terminating NUL fit here. */
	unsigned char string[1];
};

/* The URI cache itself. */
struct uri_cache {
	/* Maps URI strings to their struct uri_cache_entry. */
	struct hash *map;
	/* Locked once for every entry added by get_uri_cache_entry() and
	 * unlocked again in done_uri(); when it is no longer in use the map
	 * is freed. */
	struct object object;
};

/* The single cache instance used by get_uri() and done_uri(). */
static struct uri_cache uri_cache;
1515
#ifdef CONFIG_DEBUG
/* Debug check: reports an internal error if the protocol or the host part of
 * @uri contains an uppercase letter. Without CONFIG_DEBUG the check expands
 * to nothing. */
static inline void
check_uri_sanity(struct uri *uri)
{
	int found_upper = 0;
	int i;

	for (i = 0; !found_upper && i < uri->protocollen; i++)
		found_upper = isupper(uri->string[i]);

	/* The host is looked at only when it is not empty. */
	for (i = 0; !found_upper && i < uri->hostlen; i++)
		found_upper = isupper(uri->host[i]);

	if (found_upper)
		INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
}
#else
#define check_uri_sanity(uri)
#endif
1535
1536 static inline struct uri_cache_entry *
1537 get_uri_cache_entry(unsigned char *string, int length)
1538 {
1539         struct uri_cache_entry *entry;
1540         struct hash_item *item;
1541
1542         assert(string && length > 0);
1543         if_assert_failed return NULL;
1544
1545         item = get_hash_item(uri_cache.map, string, length);
1546         if (item) return item->value;
1547
1548         /* Setup a new entry */
1549
1550         entry = mem_calloc(1, sizeof(*entry) + length);
1551         if (!entry) return NULL;
1552
1553         object_nolock(&entry->uri, "uri");
1554         memcpy(&entry->string, string, length);
1555         string = entry->string;
1556
1557         if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1558             || !add_hash_item(uri_cache.map, string, length, entry)) {
1559                 mem_free(entry);
1560                 return NULL;
1561         }
1562
1563         object_lock(&uri_cache);
1564
1565         return entry;
1566 }
1567
1568 struct uri *
1569 get_uri(unsigned char *string, enum uri_component components)
1570 {
1571         struct uri_cache_entry *entry;
1572
1573         assert(string);
1574
1575         if (components) {
1576                 struct uri uri;
1577
1578                 if (parse_uri(&uri, string) != URI_ERRNO_OK)
1579                         return NULL;
1580
1581                 return get_composed_uri(&uri, components);
1582         }
1583
1584         if (!is_object_used(&uri_cache)) {
1585                 uri_cache.map = init_hash8();
1586                 if (!uri_cache.map) return NULL;
1587                 object_nolock(&uri_cache, "uri_cache");
1588         }
1589
1590         entry = get_uri_cache_entry(string, strlen(string));
1591         if (!entry) {
1592                 if (!is_object_used(&uri_cache))
1593                         free_hash(&uri_cache.map);
1594                 return NULL;
1595         }
1596
1597         check_uri_sanity(&entry->uri);
1598         object_nolock(&entry->uri, "uri");
1599         object_lock(&entry->uri);
1600
1601         return &entry->uri;
1602 }
1603
1604 void
1605 done_uri(struct uri *uri)
1606 {
1607         unsigned char *string = struri(uri);
1608         int length = strlen(string);
1609         struct hash_item *item;
1610         struct uri_cache_entry *entry;
1611
1612         assert(is_object_used(&uri_cache));
1613
1614         object_unlock(uri);
1615         if (is_object_used(uri)) return;
1616
1617         item = get_hash_item(uri_cache.map, string, length);
1618         entry = item ? item->value : NULL;
1619
1620         assertm(entry != NULL, "Releasing unknown URI [%s]", string);
1621         del_hash_item(uri_cache.map, item);
1622         mem_free(entry);
1623
1624         /* Last URI frees the cache */
1625         object_unlock(&uri_cache);
1626         if (!is_object_used(&uri_cache))
1627                 free_hash(&uri_cache.map);
1628 }