src/protocol/uri.c

   1 /* URL parser and translator; implementation of RFC 2396. */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <ctype.h>
   8 #include <errno.h>
   9 #ifdef HAVE_IDNA_H
  10 #include <idna.h>
  11 #endif
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <string.h>
  15 #include <sys/types.h>
  16 #ifdef HAVE_NETDB_H
  17 #include <netdb.h> /* OS/2 needs this after sys/types.h */
  18 #endif
  19
  20 #ifdef HAVE_SYS_SOCKET_H
  21 #include <sys/socket.h>
  22 #endif
  23 #ifdef HAVE_NETINET_IN_H
  24 #include <netinet/in.h>
  25 #endif
  26 #ifdef HAVE_ARPA_INET_H
  27 #include <arpa/inet.h>
  28 #endif
  29
  30 #include "elinks.h"
  31
  32 #include "main/object.h"
  33 #include "protocol/protocol.h"
  34 #include "protocol/uri.h"
  35 #include "util/conv.h"
  36 #include "util/error.h"
  37 #include "util/file.h"
  38 #include "util/hash.h"
  39 #include "util/memory.h"
  40 #include "util/string.h"
  41
  42
  43 static inline int
  44 end_of_dir(unsigned char c)
  45 {
  46         return c == POST_CHAR || c == '#' || c == ';' || c == '?';
  47 }
  48
  49 static inline int
  50 is_uri_dir_sep(struct uri *uri, unsigned char pos)
  51 {
  52         return (uri->protocol == PROTOCOL_FILE ? dir_sep(pos) : pos == '/');
  53 }
  54
  55
  56 int
  57 is_ip_address(unsigned char *address, int addresslen)
  58 {
  59         /* The @address has well defined limits so it would be a shame to
  60          * allocate it. */
  61         unsigned char buffer[IP_ADDRESS_BUFFER_SIZE];
  62
  63         if (addresslen >= sizeof(buffer))
  64                 return 0;
  65
  66         safe_strncpy(buffer, address, addresslen + 1);
  67
  68 #ifdef HAVE_INET_PTON
  69 #ifdef CONFIG_IPV6
  70         {
  71                 struct sockaddr_in6 addr6;
  72
  73                 if (inet_pton(AF_INET6, buffer, &addr6.sin6_addr) > 0)
  74                         return 1;
  75         }
  76 #endif /* CONFIG_IPV6 */
  77         {
  78                 struct in_addr addr4;
  79
  80                 if (inet_pton(AF_INET, buffer, &addr4) > 0)
  81                         return 1;
  82         }
  83
  84         return 0;
  85 #else
  86         /* FIXME: Is this ever the case? */
  87         return 0;
  88 #endif /* HAVE_INET_PTON */
  89 }
  90
  91
  92 int
  93 end_with_known_tld(unsigned char *s, int slen)
  94 {
  95         int i;
  96         static const unsigned char *tld[] =
  97         { "com", "edu", "net",
  98           "org", "gov", "mil",
  99           "int", "biz", "arpa",
 100           "aero", "coop",
 101           "info", "museum",
 102           "name", "pro", NULL };
 103
 104         if (!slen) return -1;
 105         if (slen < 0) slen = strlen(s);
 106
 107         for (i = 0; tld[i]; i++) {
 108                 int tldlen = strlen(tld[i]);
 109                 int pos = slen - tldlen;
 110
 111                 if (pos >= 0 && !strncasecmp(&s[pos], tld[i], tldlen))
 112                         return pos;
 113         }
 114
 115         return -1;
 116 }
 117
 118 /* XXX: this function writes to @name. */
 119 static int
 120 check_whether_file_exists(unsigned char *name)
 121 {
 122         /* Check POST_CHAR etc ... */
 123         static const unsigned char chars[] = POST_CHAR_S "#?";
 124         int i;
 125         int namelen = strlen(name);
 126
 127         if (file_exists(name))
 128                 return namelen;
 129
 130         for (i = 0; i < sizeof(chars) - 1; i++) {
 131                 unsigned char *pos = memchr(name, chars[i], namelen);
 132                 int exists;
 133
 134                 if (!pos) continue;
 135
 136                 *pos = 0;
 137                 exists = file_exists(name);
 138                 *pos = chars[i];
 139
 140                 if (exists) {
 141                         return pos - name;
 142                 }
 143         }
 144
 145         return -1;
 146 }
 147
 148 static int
 149 check_uri_file(unsigned char *name)
 150 {
 151         /* Check POST_CHAR etc ... */
 152         static const unsigned char chars[] = POST_CHAR_S "#?";
 153
 154         return strcspn(name, chars);
 155 }
 156
 157 /* Encodes URIs without encoding stuff like fragments and query separators. */
 158 static void
 159 encode_file_uri_string(struct string *string, unsigned char *uristring)
 160 {
 161         int filenamelen = check_whether_file_exists(uristring);
 162
 163         encode_uri_string(string, uristring, filenamelen, 0);
 164 }
 165
 166
 167 static inline int
 168 get_protocol_length(const unsigned char *url)
 169 {
 170         unsigned char *end = (unsigned char *) url;
 171
 172         /* Seek the end of the protocol name if any. */
 173         /* RFC1738:
 174          * scheme  = 1*[ lowalpha | digit | "+" | "-" | "." ]
 175          * (but per its recommendations we accept "upalpha" too) */
 176         while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
 177                 end++;
 178
 179         /* Now we make something to support our "IP version in protocol scheme
 180          * name" hack and silently chop off the last digit if it's there. The
 181          * IETF's not gonna notice I hope or it'd be going after us hard. */
 182         if (end != url && isdigit(end[-1]))
 183                 end--;
 184
 185         /* Also return 0 if there's no protocol name (@end == @url). */
 186         return (*end == ':' || isdigit(*end)) ? end - url : 0;
 187 }
 188
 189 enum uri_errno
 190 parse_uri(struct uri *uri, unsigned char *uristring)
 191 {
 192         unsigned char *prefix_end, *host_end;
 193 #ifdef CONFIG_IPV6
 194         unsigned char *lbracket, *rbracket;
 195 #endif
 196
 197         assertm(uristring, "No uri to parse.");
 198         memset(uri, 0, sizeof(*uri));
 199
 200         /* Nothing to do for an empty url. */
 201         if_assert_failed return 0;
 202         if (!*uristring) return URI_ERRNO_EMPTY;
 203
 204         uri->string = uristring;
 205         uri->protocollen = get_protocol_length(uristring);
 206
 207         /* Invalid */
 208         if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;
 209
 210         /* Figure out whether the protocol is known */
 211         uri->protocol = get_protocol(struri(uri), uri->protocollen);
 212
 213         prefix_end = uristring + uri->protocollen; /* ':' */
 214
 215         /* Check if there's a digit after the protocol name. */
 216         if (isdigit(*prefix_end)) {
 217                 uri->ip_family = uristring[uri->protocollen] - '0';
 218                 prefix_end++;
 219         }
 220         if (*prefix_end != ':')
 221                 return URI_ERRNO_INVALID_PROTOCOL;
 222         prefix_end++;
 223
 224         /* Skip slashes */
 225
 226         if (prefix_end[0] == '/' && prefix_end[1] == '/') {
 227                 if (prefix_end[2] == '/'
 228                     && get_protocol_need_slash_after_host(uri->protocol))
 229                         return URI_ERRNO_TOO_MANY_SLASHES;
 230
 231                 prefix_end += 2;
 232
 233         } else if (get_protocol_need_slashes(uri->protocol)) {
 234                 return URI_ERRNO_NO_SLASHES;
 235         }
 236
 237         if (get_protocol_free_syntax(uri->protocol)) {
 238                 uri->data = prefix_end;
 239                 uri->datalen = strlen(prefix_end);
 240                 return URI_ERRNO_OK;
 241
 242         } else if (uri->protocol == PROTOCOL_FILE) {
 243                 int datalen = check_uri_file(prefix_end);
 244
 245                 /* Extract the fragment part. */
 246                 if (datalen >= 0 && prefix_end[datalen] == '#') {
 247                         uri->fragment = prefix_end + datalen + 1;
 248                         uri->fragmentlen = strlen(uri->fragment);
 249                 } else {
 250                         datalen = strlen(prefix_end);
 251                 }
 252
 253                 uri->data = prefix_end;
 254                 uri->datalen = datalen;
 255
 256                 return URI_ERRNO_OK;
 257         }
 258
 259         /* Isolate host */
 260
 261 #ifdef CONFIG_IPV6
 262         /* Get brackets enclosing IPv6 address */
 263         lbracket = strchr(prefix_end, '[');
 264         if (lbracket) {
 265                 rbracket = strchr(lbracket, ']');
 266                 /* [address] is handled only inside of hostname part (surprisingly). */
 267                 if (rbracket && rbracket < prefix_end + strcspn(prefix_end, "/"))
 268                         uri->ipv6 = 1;
 269                 else
 270                         lbracket = rbracket = NULL;
 271         } else {
 272                 rbracket = NULL;
 273         }
 274 #endif
 275
 276         /* Possibly skip auth part */
 277         host_end = prefix_end + strcspn(prefix_end, "@");
 278
 279         if (prefix_end + strcspn(prefix_end, "/") > host_end
 280             && *host_end) { /* we have auth info here */
 281                 unsigned char *user_end;
 282
 283                 /* Allow '@' in the password component */
 284                 while (strcspn(host_end + 1, "@") < strcspn(host_end + 1, "/?"))
 285                         host_end = host_end + 1 + strcspn(host_end + 1, "@");
 286
 287                 user_end = strchr(prefix_end, ':');
 288
 289                 if (!user_end || user_end > host_end) {
 290                         uri->user = prefix_end;
 291                         uri->userlen = host_end - prefix_end;
 292                 } else {
 293                         uri->user = prefix_end;
 294                         uri->userlen = user_end - prefix_end;
 295                         uri->password = user_end + 1;
 296                         uri->passwordlen = host_end - user_end - 1;
 297                 }
 298                 prefix_end = host_end + 1;
 299         }
 300
 301 #ifdef CONFIG_IPV6
 302         if (uri->ipv6)
 303                 host_end = rbracket + strcspn(rbracket, ":/?");
 304         else
 305 #endif
 306                 host_end = prefix_end + strcspn(prefix_end, ":/?");
 307
 308 #ifdef CONFIG_IPV6
 309         if (uri->ipv6) {
 310                 int addrlen = rbracket - lbracket - 1;
 311
 312                 /* Check for valid length.
 313                  * addrlen >= sizeof(hostbuf) is theorically impossible
 314                  * but i keep the test in case of... Safer, imho --Zas */
 315                 assertm(addrlen >= 0 && addrlen < NI_MAXHOST,
 316                         "parse_uri(): addrlen value is bad (%d) for URL '%s'. "
 317                         "Problems are likely to be encountered. Please report "
 318                         "this, it is a security bug!", addrlen, uristring);
 319                 if_assert_failed return URI_ERRNO_IPV6_SECURITY;
 320
 321                 uri->host = lbracket + 1;
 322                 uri->hostlen = addrlen;
 323         } else
 324 #endif
 325         {
 326                 uri->host = prefix_end;
 327                 uri->hostlen = host_end - prefix_end;
 328
 329                 /* Trim trailing '.'s */
 330                 if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
 331                         return URI_ERRNO_TRAILING_DOTS;
 332         }
 333
 334         if (*host_end == ':') { /* we have port here */
 335                 unsigned char *port_end = host_end + 1 + strcspn(host_end + 1, "/");
 336
 337                 host_end++;
 338
 339                 uri->port = host_end;
 340                 uri->portlen = port_end - host_end;
 341
 342                 if (uri->portlen == 0)
 343                         return URI_ERRNO_NO_PORT_COLON;
 344
 345                 /* We only use 8 bits for portlen so better check */
 346                 if (uri->portlen != port_end - host_end)
 347                         return URI_ERRNO_INVALID_PORT;
 348
 349                 /* test if port is number */
 350                 /* TODO: possibly lookup for the service otherwise? --pasky */
 351                 for (; host_end < port_end; host_end++)
 352                         if (!isdigit(*host_end))
 353                                 return URI_ERRNO_INVALID_PORT;
 354
 355                 /* Check valid port value, and let show an error message
 356                  * about invalid url syntax. */
 357                 if (uri->port && uri->portlen) {
 358                         int n;
 359
 360                         errno = 0;
 361                         n = strtol(uri->port, NULL, 10);
 362                         if (errno || !uri_port_is_valid(n))
 363                                 return URI_ERRNO_INVALID_PORT;
 364                 }
 365         }
 366
 367         if (*host_end == '/') {
 368                 host_end++;
 369
 370         } else if (get_protocol_need_slash_after_host(uri->protocol)) {
 371                 /* The need for slash after the host component depends on the
 372                  * need for a host component. -- The dangerous mind of Jonah */
 373                 if (!uri->hostlen)
 374                         return URI_ERRNO_NO_HOST;
 375
 376                 return URI_ERRNO_NO_HOST_SLASH;
 377         }
 378
 379         /* Look for #fragment or POST_CHAR */
 380         prefix_end = host_end + strcspn(host_end, "#" POST_CHAR_S);
 381         uri->data = host_end;
 382         uri->datalen = prefix_end - host_end;
 383
 384         if (*prefix_end == '#') {
 385                 uri->fragment = prefix_end + 1;
 386                 uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
 387                 prefix_end = uri->fragment + uri->fragmentlen;
 388         }
 389
 390         if (*prefix_end == POST_CHAR) {
 391                 uri->post = prefix_end + 1;
 392         }
 393
 394         return URI_ERRNO_OK;
 395 }
 396
 397 int
 398 get_uri_port(struct uri *uri)
 399 {
 400         if (uri->port && uri->portlen) {
 401                 unsigned char *end = uri->port;
 402                 int port = strtol(uri->port, (char **) &end, 10);
 403
 404                 if (end != uri->port) {
 405                         assert(uri_port_is_valid(port));
 406                         return port;
 407                 }
 408         }
 409
 410         return get_protocol_port(uri->protocol);
 411 }
 412
 413 #define can_compare_uri_components(comp) !(((comp) & (URI_SPECIAL | URI_IDN)))
 414
 415 static inline int
 416 compare_component(unsigned char *a, int alen, unsigned char *b, int blen)
 417 {
 418         /* Check that the length and the strings are both set or unset */
 419         if (alen != blen || !!a != !!b) return 0;
 420
 421         /* Both are unset so that will make a perfect match */
 422         if (!a || !alen) return 1;
 423
 424         /* Let the higher forces decide */
 425         return !memcmp(a, b, blen);
 426 }
 427
 428 #define wants(x) (components & (x))
 429
 430 int
 431 compare_uri(struct uri *a, struct uri *b, enum uri_component components)
 432 {
 433         if (a == b) return 1;
 434         if (!components) return 0;
 435
 436         assertm(can_compare_uri_components(components),
 437                 "compare_uri() is a work in progress. Component unsupported");
 438
 439         return (!wants(URI_PROTOCOL) || a->protocol == b->protocol)
 440                 && (!wants(URI_IP_FAMILY) || a->ip_family == b->ip_family)
 441                 && (!wants(URI_USER)
 442                     || compare_component(a->user, a->userlen, b->user, b->userlen))
 443                 && (!wants(URI_PASSWORD)
 444                     || compare_component(a->password, a->passwordlen, b->password, b->passwordlen))
 445                 && (!wants(URI_HOST)
 446                     || compare_component(a->host, a->hostlen, b->host, b->hostlen))
 447                 && (!wants(URI_PORT)
 448                     || compare_component(a->port, a->portlen, b->port, b->portlen))
 449                 && (!wants(URI_DATA)
 450                     || compare_component(a->data, a->datalen, b->data, b->datalen))
 451                 && (!wants(URI_FRAGMENT)
 452                     || compare_component(a->fragment, a->fragmentlen, b->fragment, b->fragmentlen))
 453                 && (!wants(URI_POST)
 454                     || compare_component(a->post, a->post ? strlen(a->post) : 0, b->post, b->post ? strlen(b->post) : 0));
 455 }
 456
 457
 458 /* We might need something more intelligent than this Swiss army knife. */
 459 struct string *
 460 add_uri_to_string(struct string *string, struct uri *uri,
 461                   enum uri_component components)
 462 {
 463         /* Custom or unknown keep the URI untouched. */
 464         if (uri->protocol == PROTOCOL_UNKNOWN)
 465                 return add_to_string(string, struri(uri));
 466
 467         if (wants(URI_PROTOCOL)) {
 468                 add_bytes_to_string(string, uri->string, uri->protocollen);
 469                 if (wants(URI_IP_FAMILY) && uri->ip_family)
 470                         add_long_to_string(string, uri->ip_family);
 471                 add_char_to_string(string, ':');
 472                 if (get_protocol_need_slashes(uri->protocol))
 473                         add_to_string(string, "//");
 474         }
 475
 476         if (wants(URI_USER) && uri->userlen) {
 477                 add_bytes_to_string(string, uri->user, uri->userlen);
 478
 479                 if (wants(URI_PASSWORD) && uri->passwordlen) {
 480                         add_char_to_string(string, ':');
 481                         add_bytes_to_string(string, uri->password,
 482                                                     uri->passwordlen);
 483                 }
 484
 485                 add_char_to_string(string, '@');
 486         }
 487
 488         if (wants(URI_HOST) && uri->hostlen) {
 489                 int add_host = 1;
 490
 491 #ifdef CONFIG_IPV6
 492                 /* Rationale for wants(URI_PORT): The [notation] was invented
 493                  * so that you can have an IPv6 addy and a port together. So
 494                  * we want to use it when that happens, otherwise we need not
 495                  * bother (that happens only when we want it for DNS anyway).
 496                  * I insist on an implied elegancy of this way, but YMMV. ;-)
 497                  * --pasky */
 498                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, '[');
 499 #endif
 500 #ifdef CONFIG_IDN
 501                 /* Support for the GNU International Domain Name library.
 502                  *
 503                  * http://www.gnu.org/software/libidn/manual/html_node/IDNA-Functions.html
 504                  *
 505                  * Now it is probably not perfect because idna_to_ascii_lz()
 506                  * will be using a ``zero terminated input string encoded in
 507                  * the current locale's character set''. Anyway I don't know
 508                  * how to convert anything to UTF-8 or Unicode. --jonas */
 509                 if (wants(URI_IDN)) {
 510                         unsigned char *host = memacpy(uri->host, uri->hostlen);
 511
 512                         if (host) {
 513                                 char *idname;
 514                                 int code = idna_to_ascii_lz(host, &idname, 0);
 515
 516                                 /* FIXME: Return NULL if it coughed? --jonas */
 517                                 if (code == IDNA_SUCCESS) {
 518                                         add_to_string(string, idname);
 519                                         free(idname);
 520                                         add_host = 0;
 521                                 }
 522
 523                                 mem_free(host);
 524                         }
 525                 }
 526
 527 #endif
 528                 if (add_host)
 529                         add_bytes_to_string(string, uri->host, uri->hostlen);
 530
 531 #ifdef CONFIG_IPV6
 532                 if (uri->ipv6 && wants(URI_PORT)) add_char_to_string(string, ']');
 533 #endif
 534         }
 535
 536         if (wants(URI_PORT) || wants(URI_DEFAULT_PORT)) {
 537                 if (uri->portlen) {
 538                         add_char_to_string(string, ':');
 539                         add_bytes_to_string(string, uri->port, uri->portlen);
 540
 541                 } else if (wants(URI_DEFAULT_PORT)
 542                            && uri->protocol != PROTOCOL_USER) {
 543                         /* For user protocols we don't know a default port.
 544                          * Should user protocols ports be configurable? */
 545                         int port = get_protocol_port(uri->protocol);
 546
 547                         add_char_to_string(string, ':');
 548                         add_long_to_string(string, port);
 549                 }
 550         }
 551
 552         /* Only add slash if we need to separate */
 553         if ((wants(URI_DATA) || wants(URI_POST) || components == URI_HTTP_REFERRER_HOST)
 554             && wants(~(URI_DATA | URI_PORT))
 555             && get_protocol_need_slash_after_host(uri->protocol))
 556                 add_char_to_string(string, '/');
 557
 558         if (wants(URI_DATA) && uri->datalen)
 559                 add_bytes_to_string(string, uri->data, uri->datalen);
 560
 561         /* We can not test uri->datalen here since we need to always
 562          * add '/'. */
 563         if (wants(URI_PATH) || wants(URI_FILENAME)) {
 564                 unsigned char *filename = uri->data;
 565                 unsigned char *pos;
 566
 567                 assertm(!wants(URI_FILENAME) || components == URI_FILENAME,
 568                         "URI_FILENAME should be used alone %d", components);
 569
 570                 if (wants(URI_PATH) && !is_uri_dir_sep(uri, *filename)) {
 571 #ifdef CONFIG_OS_WIN32
 572                         if (uri->protocol != PROTOCOL_FILE)
 573 #endif
 574                         /* FIXME: Add correct separator */
 575                         add_char_to_string(string, '/');
 576                 }
 577
 578                 if (!uri->datalen) return string;
 579
 580                 for (pos = filename; *pos && !end_of_dir(*pos); pos++)
 581                         if (wants(URI_FILENAME) && is_uri_dir_sep(uri, *pos))
 582                                 filename = pos + 1;
 583
 584                 return add_bytes_to_string(string, filename, pos - filename);
 585         }
 586
 587         if (wants(URI_QUERY) && uri->datalen) {
 588                 unsigned char *query = memchr(uri->data, '?', uri->datalen);
 589
 590                 assertm(URI_QUERY == components,
 591                         "URI_QUERY should be used alone %d", components);
 592
 593                 if (!query) return string;
 594
 595                 query++;
 596                 /* Check fragment and POST_CHAR */
 597                 return add_bytes_to_string(string, query, strcspn(query, "#" POST_CHAR_S));
 598         }
 599
 600         if (wants(URI_FRAGMENT) && uri->fragmentlen) {
 601                 add_char_to_string(string, '#');
 602                 add_bytes_to_string(string, uri->fragment, uri->fragmentlen);
 603         }
 604
 605         if (wants(URI_POST) && uri->post) {
 606                 add_char_to_string(string, POST_CHAR);
 607                 add_to_string(string, uri->post);
 608
 609         } else if (wants(URI_POST_INFO) && uri->post) {
 610                 if (!strncmp(uri->post, "text/plain", 10)) {
 611                         add_to_string(string, " (PLAIN TEXT DATA)");
 612
 613                 } else if (!strncmp(uri->post, "multipart/form-data;", 20)) {
 614                         add_to_string(string, " (MULTIPART FORM DATA)");
 615
 616                 } else {
 617                         add_to_string(string, " (POST DATA)");
 618                 }
 619
 620         }
 621
 622         return string;
 623 }
 624
 625 #undef wants
 626
 627 unsigned char *
 628 get_uri_string(struct uri *uri, enum uri_component components)
 629 {
 630         struct string string;
 631
 632         if (init_string(&string)
 633             && add_uri_to_string(&string, uri, components))
 634                 return string.source;
 635
 636         done_string(&string);
 637         return NULL;
 638 }
 639
 640
 641 struct string *
 642 add_string_uri_to_string(struct string *string, unsigned char *uristring,
 643                          enum uri_component components)
 644 {
 645         struct uri uri;
 646
 647         if (parse_uri(&uri, uristring) != URI_ERRNO_OK)
 648                 return NULL;
 649
 650         return add_uri_to_string(string, &uri, components);
 651 }
 652
 653
 654 #define normalize_uri_reparse(str)      normalize_uri(NULL, str)
 655 #define normalize_uri_noparse(uri)      normalize_uri(uri, struri(uri))
 656
 657 unsigned char *
 658 normalize_uri(struct uri *uri, unsigned char *uristring)
 659 {
 660         unsigned char *parse_string = uristring;
 661         unsigned char *src, *dest, *path;
 662         int need_slash = 0;
 663         int parse = (uri == NULL);
 664         struct uri uri_struct;
 665
 666         if (!uri) uri = &uri_struct;
 667
 668         /* We need to get the real (proxied) URI but lowercase relevant URI
 669          * parts along the way. */
 670         do {
 671                 if (parse && parse_uri(uri, parse_string) != URI_ERRNO_OK)
 672                         return uristring;
 673
 674                 assert(uri->data);
 675
 676                 /* This is a maybe not the right place but both join_urls() and
 677                  * get_translated_uri() through translate_url() calls this
 678                  * function and then it already works on and modifies an
 679                  * allocated copy. */
 680                 convert_to_lowercase(uri->string, uri->protocollen);
 681                 if (uri->hostlen) convert_to_lowercase(uri->host, uri->hostlen);
 682
 683                 parse = 1;
 684                 parse_string = uri->data;
 685         } while (uri->protocol == PROTOCOL_PROXY);
 686
 687         if (get_protocol_free_syntax(uri->protocol))
 688                 return uristring;
 689
 690         if (uri->protocol != PROTOCOL_UNKNOWN)
 691                 need_slash = get_protocol_need_slash_after_host(uri->protocol);
 692
 693         /* We want to start at the first slash to also reduce URIs like
 694          * http://host//index.html to http://host/index.html */
 695         path = uri->data - need_slash;
 696         dest = src = path;
 697
 698         /* This loop mangles the URI string by removing directory elevators and
 699          * other cruft. Example: /.././etc////..//usr/ -> /usr/ */
 700         while (*dest) {
 701                 /* If the following pieces are the LAST parts of URL, we remove
 702                  * them as well. See RFC 1808 for details. */
 703
 704                 if (end_of_dir(src[0])) {
 705                         /* URL data contains no more path. */
 706                         memmove(dest, src, strlen(src) + 1);
 707                         break;
 708                 }
 709
 710                 if (!is_uri_dir_sep(uri, src[0])) {
 711                         /* This is to reduce indentation */
 712
 713                 } else if (src[1] == '.') {
 714                         if (!src[2]) {
 715                                 /* /. - skip the dot */
 716                                 *dest++ = *src;
 717                                 *dest = 0;
 718                                 break;
 719
 720                         } else if (is_uri_dir_sep(uri, src[2])) {
 721                                 /* /./ - strip that.. */
 722                                 src += 2;
 723                                 continue;
 724
 725                         } else if (src[2] == '.'
 726                                    && (is_uri_dir_sep(uri, src[3]) || !src[3])) {
 727                                 /* /../ or /.. - skip it and preceding element. */
 728
 729                                 /* First back out the last incrementation of
 730                                  * @dest (dest++) to get the position that was
 731                                  * last asigned to. */
 732                                 if (dest > path) dest--;
 733
 734                                 /* @dest might be pointing to a dir separator
 735                                  * so we decrement before any testing. */
 736                                 while (dest > path) {
 737                                         dest--;
 738                                         if (is_uri_dir_sep(uri, *dest)) break;
 739                                 }
 740
 741                                 if (!src[3]) {
 742                                         /* /.. - add ending slash and stop */
 743                                         *dest++ = *src;
 744                                         *dest = 0;
 745                                         break;
 746                                 }
 747
 748                                 src += 3;
 749                                 continue;
 750                         }
 751
 752                 } else if (is_uri_dir_sep(uri, src[1])) {
 753                         /* // - ignore first '/'. */
 754                         src += 1;
 755                         continue;
 756                 }
 757
 758                 /* We don't want to access memory past the NUL char. */
 759                 *dest = *src++;
 760                 if (*dest) dest++;
 761         }
 762
 763         return uristring;
 764 }
 765
 766 /* The 'file' scheme URI comes in and bastardized URI comes out which consists
 767  * of just the complete path to file/directory, which the dumb 'file' protocol
 768  * backend can understand. No host parts etc, that is what this function is
 769  * supposed to chew. */
 770 static struct uri *
 771 transform_file_url(struct uri *uri, unsigned char *cwd)
 772 {
 773         unsigned char *path = uri->data;
 774
 775         assert(uri->protocol == PROTOCOL_FILE && uri->data);
 776
 777         /* Sort out the host part. We currently support only host "localhost"
 778          * (plus empty host part will be assumed to be "localhost" as well).
 779          * As our extensions, '.' will reference to the cwd on localhost
 780          * (originally, when the first thing after file:// wasn't "localhost/",
 781          * we assumed the cwd as well, and pretended that there's no host part
 782          * at all) and '..' to the directory parent to cwd. Another extension
 783          * is that if this is a DOS-like system, the first char in two-char
 784          * host part is uppercase letter and the second char is a colon, it is
 785          * assumed to be a local disk specification. */
 786         /* TODO: Use FTP for non-localhost hosts. --pasky */
 787
 788         /* For URL "file://", we open the current directory. Some other
 789          * browsers instead open root directory, but AFAIK the standard does
 790          * not specify that and this was the original behaviour and it is more
 791          * consistent with our file://./ notation. */
 792
 793         /* Who would name their file/dir '...' ? */
 794         if (*path == '.' || !*path) {
 795                 struct string dir;
 796
 797                 if (!init_string(&dir))
 798                         return NULL;
 799
 800                 encode_uri_string(&dir, cwd, -1, 0);
 801
 802                 /* Either we will end up with '//' and translate_directories()
 803                  * will shorten it or the '/' will mark the inserted cwd as a
 804                  * directory. */
 805                 if (*path == '.') *path = '/';
 806
 807                 /* Insert the current working directory. */
 808                 /* The offset is 7 == sizeof("file://") - 1. */
 809                 insert_in_string(&struri(uri), 7, dir.source, dir.length);
 810
 811                 done_string(&dir);
 812                 return uri;
 813         }
 814
 815 #ifdef DOS_FS
 816         if (isasciialpha(path[0]) && path[1] == ':' && dir_sep(path[2]))
 817                 return NULL;
 818 #endif
 819
 820         for (; *path && !dir_sep(*path); path++);
 821
 822         /* FIXME: We will in fact assume localhost even for non-local hosts,
 823          * until we will support the FTP transformation. --pasky */
 824
 825         memmove(uri->data, path, strlen(path) + 1);
 826         return uri;
 827 }
 828
 829 static unsigned char *translate_url(unsigned char *url, unsigned char *cwd);
 830
 831 unsigned char *
 832 join_urls(struct uri *base, unsigned char *rel)
 833 {
 834         unsigned char *uristring, *path;
 835         int add_slash = 0;
 836         int translate = 0;
 837         int length = 0;
 838
 839         /* See RFC 1808 */
 840         /* TODO: Support for ';' ? (see the RFC) --pasky */
 841
 842         /* For '#', '?' and '//' we could use get_uri_string() but it might be
 843          * too expensive since it uses granular allocation scheme. I wouldn't
 844          * personally mind tho' because it would be cleaner. --jonas */
 845         if (rel[0] == '#') {
 846                 /* Strip fragment and post part from the base URI and append
 847                  * the fragment string in @rel. */
 848                 length  = base->fragment
 849                         ? base->fragment - struri(base) - 1
 850                         : get_real_uri_length(base);
 851
 852         } else if (rel[0] == '?') {
 853                 /* Strip query, fragment and post part from the base URI and
 854                  * append the query string in @rel. */
 855                 length  = base->fragment ? base->fragment - struri(base) - 1
 856                                          : get_real_uri_length(base);
 857
 858                 uristring = memchr(base->data, '?', base->datalen);
 859                 if (uristring) length = uristring - struri(base);
 860
 861         } else if (rel[0] == '/' && rel[1] == '/') {
 862                 if (!get_protocol_need_slashes(base->protocol))
 863                         return NULL;
 864
 865                 /* Get `<protocol>:' from the base URI and append the `//' part
 866                  * from @rel. */
 867                 length = base->protocollen + 1;
 868
 869                 /* We need to sanitize the relative part and add stuff like
 870                  * host slash. */
 871                 translate = 1;
 872         }
 873
 874         /* If one of the tests above set @length to something useful */
 875         if (length) {
 876                 uristring = memacpy(struri(base), length);
 877                 if (!uristring) return NULL;
 878
 879                 add_to_strn(&uristring, rel);
 880
 881                 if (translate) {
 882                         unsigned char *translated;
 883
 884                         translated = translate_url(uristring, NULL);
 885                         mem_free(uristring);
 886                         return translated;
 887                 }
 888                 return normalize_uri_reparse(uristring);
 889         }
 890
 891         /* Check if there is some protocol name to go for */
 892         length = get_protocol_length(rel);
 893         if (length) {
 894                 switch (get_protocol(rel, length)) {
 895                 case PROTOCOL_UNKNOWN:
 896                 case PROTOCOL_PROXY:
 897                         /* Mysteriously proxy URIs are breaking here ... */
 898                         break;
 899
 900                 case PROTOCOL_FILE:
 901                         /* FIXME: Use get_uri_string(base, URI_PATH) as cwd arg
 902                          * to translate_url(). */
 903                 default:
 904                         uristring = translate_url(rel, NULL);
 905                         if (uristring) return uristring;
 906                 }
 907         }
 908
 909         assertm(base->data, "bad base url");
 910         if_assert_failed return NULL;
 911
 912         path = base->data;
 913
 914         /* Either is path blank, but we've slash char before, or path is not
 915          * blank, but doesn't start by a slash (if we'd just stay along with
 916          * is_uri_dir_sep(&uri, path[-1]) w/o all the surrounding crap, it
 917          * should be enough, but I'm not sure and I don't want to break
 918          * anything --pasky). */
 919         /* We skip first char of URL ('/') in parse_url() (ARGH). This
 920          * is reason of all this bug-bearing magic.. */
 921         if (*path) {
 922                 if (!is_uri_dir_sep(base, *path)) path--;
 923         } else {
 924                 if (is_uri_dir_sep(base, path[-1])) path--;
 925         }
 926
 927         if (!is_uri_dir_sep(base, rel[0])) {
 928                 unsigned char *path_end;
 929
 930                 /* The URL is relative. */
 931
 932                 if (!*path) {
 933                         /* There's no path in the URL, but we're going to add
 934                          * something there, and the something doesn't start by
 935                          * a slash. So we need to insert a slash after the base
 936                          * URL. Clever, eh? ;) */
 937                         add_slash = 1;
 938                 }
 939
 940                 for (path_end = path; *path_end; path_end++) {
 941                         if (end_of_dir(*path_end)) break;
 942                         /* Modify the path pointer, so that it'll always point
 943                          * above the last '/' in the URL; later, we'll copy the
 944                          * URL only _TO_ this point, and anything after last
 945                          * slash will be substituted by 'rel'. */
 946                         if (is_uri_dir_sep(base, *path_end))
 947                                 path = path_end + 1;
 948                 }
 949         }
 950
 951         length = path - struri(base);
 952         uristring = mem_alloc(length + strlen(rel) + add_slash + 1);
 953         if (!uristring) return NULL;
 954
 955         memcpy(uristring, struri(base), length);
 956         if (add_slash) uristring[length] = '/';
 957         strcpy(uristring + length + add_slash, rel);
 958
 959         return normalize_uri_reparse(uristring);
 960 }
 961
 962
 963 /* Tries to figure out what protocol @newurl might be specifying by checking if
 964  * it exists as a file locally or by checking parts of the host name. */
 965 static enum protocol
 966 find_uri_protocol(unsigned char *newurl)
 967 {
 968         unsigned char *ch;
 969
 970         /* First see if it is a file so filenames that look like hostnames
 971          * won't confuse us below. */
 972         if (check_whether_file_exists(newurl) >= 0) return PROTOCOL_FILE;
 973
 974         /* Yes, it would be simpler to make test for IPv6 address first,
 975          * but it would result in confusing mix of ifdefs ;-). */
 976         /* FIXME: Ideas for improve protocol detection
 977          *
 978          * - Handle common hostnames. It could be part of the protocol backend
 979          *   structure. [ www -> http, irc -> irc, news -> nntp, ... ]
 980          *
 981          * - Resolve using port number. [ 119 -> nntp, 443 -> https, ... ]
 982          */
 983
 984         ch = newurl + strcspn(newurl, ".:/@");
 985         if (*ch == '@'
 986             || (*ch == ':' && *newurl != '[' && strchr(newurl, '@'))
 987             || !strncasecmp(newurl, "ftp.", 4)) {
 988                 /* Contains user/password/ftp-hostname */
 989                 return PROTOCOL_FTP;
 990
 991 #ifdef CONFIG_IPV6
 992         } else if (*newurl == '[' && *ch == ':') {
 993                 /* Candidate for IPv6 address */
 994                 unsigned char *bracket2, *colon2;
 995
 996                 ch++;
 997                 bracket2 = strchr(ch, ']');
 998                 colon2 = strchr(ch, ':');
 999                 if (bracket2 && colon2 && bracket2 > colon2)
1000                         return PROTOCOL_HTTP;
1001 #endif
1002
1003         } else if (*newurl != '.' && *ch == '.') {
1004                 /* Contains domain name? */
1005                 unsigned char *host_end, *domain;
1006                 unsigned char *ipscan;
1007
1008                 /* Process the hostname */
1009                 for (domain = ch + 1;
1010                         *(host_end = domain + strcspn(domain, ".:/?")) == '.';
1011                         domain = host_end + 1);
1012
1013                 /* It's IP? */
1014                 for (ipscan = ch; isdigit(*ipscan) || *ipscan == '.';
1015                         ipscan++);
1016
1017                 if (!*ipscan || *ipscan == ':' || *ipscan == '/')
1018                         return PROTOCOL_HTTP;
1019
1020                 /* It's two-letter or known TLD? */
1021                 if (host_end - domain == 2
1022                     || end_with_known_tld(domain, host_end - domain) >= 0)
1023                         return PROTOCOL_HTTP;
1024         }
1025
1026         return PROTOCOL_UNKNOWN;
1027 }
1028
1029
1030 #define MAX_TRANSLATION_ATTEMPTS        32
1031
1032 /* Returns an URI string that can be used internally. Adding protocol prefix,
1033  * missing slashes etc. */
1034 static unsigned char *
1035 translate_url(unsigned char *url, unsigned char *cwd)
1036 {
1037         unsigned char *newurl;
1038         struct uri uri;
1039         enum uri_errno uri_errno, prev_errno = URI_ERRNO_EMPTY;
1040         int retries = 0;
1041
1042         /* Strip starting spaces */
1043         while (*url == ' ') url++;
1044         if (!*url) return NULL;
1045
1046         newurl = expand_tilde(url); /* XXX: Post data copy. */
1047         if (!newurl) return NULL;
1048
1049 parse_uri:
1050         /* Yay a goto loop. If we get some URI parse error and try to
1051          * fix it we go back to here and try again. */
1052         /* Ordinary parse */
1053         uri_errno = parse_uri(&uri, newurl);
1054
1055         /* Bail out if the same error occurs twice */
1056         if (uri_errno == prev_errno || retries++ > MAX_TRANSLATION_ATTEMPTS) {
1057                 if (retries > MAX_TRANSLATION_ATTEMPTS) {
1058                         ERROR("Maximum number of parsing attempts exceeded "
1059                               "for %s.", url);
1060                 }
1061                 mem_free(newurl);
1062                 return NULL;
1063         }
1064
1065         prev_errno = uri_errno;
1066
1067         switch (uri_errno) {
1068         case URI_ERRNO_OK:
1069                 /* Fix translation of 1.2.3.4:5 so IP address part won't be
1070                  * interpreted as the protocol name. */
1071                 if (uri.protocol == PROTOCOL_UNKNOWN) {
1072                         enum protocol protocol = find_uri_protocol(newurl);
1073
1074                         /* Code duplication with the URI_ERRNO_INVALID_PROTOCOL
1075                          * case. */
1076                         if (protocol != PROTOCOL_UNKNOWN) {
1077                                 struct string str;
1078
1079                                 if (!init_string(&str)) return NULL;
1080
1081                                 switch (protocol) {
1082                                 case PROTOCOL_FTP:
1083                                         add_to_string(&str, "ftp://");
1084                                         encode_uri_string(&str, newurl, -1, 0);
1085                                         break;
1086
1087                                 case PROTOCOL_HTTP:
1088                                         add_to_string(&str, "http://");
1089                                         add_to_string(&str, newurl);
1090                                         break;
1091
1092                                 case PROTOCOL_UNKNOWN:
1093                                         break;
1094
1095                                 case PROTOCOL_FILE:
1096                                 default:
1097                                         add_to_string(&str, "file://");
1098                                         if (!dir_sep(*newurl))
1099                                                 add_to_string(&str, "./");
1100
1101                                         add_to_string(&str, newurl);
1102                                 }
1103
1104                                 mem_free(newurl);
1105                                 newurl = str.source;
1106
1107                                 /* Work around the infinite loop prevention */
1108                                 prev_errno = URI_ERRNO_EMPTY;
1109                                 goto parse_uri;
1110                         }
1111                 }
1112
1113                 /* If file:// URI is transformed we need to reparse. */
1114                 if (uri.protocol == PROTOCOL_FILE && cwd && *cwd
1115                     && transform_file_url(&uri, cwd))
1116                         return normalize_uri_reparse(struri(&uri));
1117
1118                 /* Translate the proxied URI too if proxy:// */
1119                 if (uri.protocol == PROTOCOL_PROXY) {
1120                         unsigned char *data = translate_url(uri.data, cwd);
1121                         int pos = uri.data - struri(&uri);
1122
1123                         if (!data) break;
1124                         struri(&uri)[pos] = 0;
1125                         insert_in_string(&struri(&uri), pos, data, strlen(data));
1126                         mem_free(data);
1127                         return normalize_uri_reparse(struri(&uri));
1128                 }
1129
1130                 return normalize_uri_noparse(&uri);
1131
1132         case URI_ERRNO_TOO_MANY_SLASHES:
1133         {
1134                 unsigned char *from, *to;
1135
1136                 assert(uri.string[uri.protocollen] == ':'
1137                        && uri.string[uri.protocollen + 1] == '/'
1138                        && uri.string[uri.protocollen + 2] == '/');
1139
1140                 from = to = uri.string + uri.protocollen + 3;
1141                 while (*from == '/') from++;
1142
1143                 assert(to < from);
1144                 memmove(to, from, strlen(from) + 1);
1145                 goto parse_uri;
1146         }
1147         case URI_ERRNO_NO_SLASHES:
1148         {
1149                 /* Try prefix:some.url -> prefix://some.url.. */
1150                 int slashes = 2;
1151
1152                 /* Check if only one '/' is needed. */
1153                 if (uri.string[uri.protocollen + 1] == '/')
1154                         slashes--;
1155
1156                 insert_in_string(&newurl, uri.protocollen + 1, "//", slashes);
1157                 goto parse_uri;
1158         }
1159         case URI_ERRNO_TRAILING_DOTS:
1160         {
1161                 /* Trim trailing '.'s */
1162                 unsigned char *from = uri.host + uri.hostlen;
1163                 unsigned char *to = from;
1164
1165                 assert(uri.host < to && to[-1] == '.' && *from != '.');
1166
1167                 while (uri.host < to && to[-1] == '.') to--;
1168
1169                 assert(to < from);
1170                 memmove(to, from, strlen(from) + 1);
1171                 goto parse_uri;
1172         }
1173         case URI_ERRNO_NO_PORT_COLON:
1174                 assert(uri.portlen == 0
1175                        && uri.string < uri.port
1176                        && uri.port[-1] == ':');
1177
1178                 memmove(uri.port - 1, uri.port, strlen(uri.port) + 1);
1179                 goto parse_uri;
1180
1181         case URI_ERRNO_NO_HOST_SLASH:
1182         {
1183                 int offset = uri.port
1184                            ? uri.port + uri.portlen - struri(&uri)
1185                            : uri.host + uri.hostlen - struri(&uri) + uri.ipv6 /* ']' */;
1186
1187                 assertm(uri.host, "uri.host not set after no host slash error");
1188                 insert_in_string(&newurl, offset, "/", 1);
1189                 goto parse_uri;
1190         }
1191         case URI_ERRNO_INVALID_PROTOCOL:
1192         {
1193                 /* No protocol name */
1194                 enum protocol protocol = find_uri_protocol(newurl);
1195                 struct string str;
1196
1197                 if (!init_string(&str)) return NULL;
1198
1199                 switch (protocol) {
1200                         case PROTOCOL_FTP:
1201                                 add_to_string(&str, "ftp://");
1202                                 encode_uri_string(&str, newurl, -1, 0);
1203                                 break;
1204
1205                         case PROTOCOL_HTTP:
1206                                 add_to_string(&str, "http://");
1207                                 add_to_string(&str, newurl);
1208                                 break;
1209
1210                         case PROTOCOL_UNKNOWN:
1211                                 /* We default to file:// even though we already
1212                                  * tested if the file existed since it will give
1213                                  * a "No such file or directory" error.  which
1214                                  * might better hint the user that there was
1215                                  * problem figuring out the URI. */
1216                         case PROTOCOL_FILE:
1217                         default:
1218                                 add_to_string(&str, "file://");
1219                                 if (!dir_sep(*newurl))
1220                                         add_to_string(&str, "./");
1221
1222                                 encode_file_uri_string(&str, newurl);
1223                 }
1224
1225                 mem_free(newurl);
1226                 newurl = str.source;
1227
1228                 goto parse_uri;
1229         }
1230         case URI_ERRNO_EMPTY:
1231         case URI_ERRNO_IPV6_SECURITY:
1232         case URI_ERRNO_NO_HOST:
1233         case URI_ERRNO_INVALID_PORT:
1234         case URI_ERRNO_INVALID_PORT_RANGE:
1235                 /* None of these can be handled properly. */
1236                 break;
1237         }
1238
1239         mem_free(newurl);
1240         return NULL;
1241 }
1242
1243
1244 struct uri *
1245 get_composed_uri(struct uri *uri, enum uri_component components)
1246 {
1247         unsigned char *string;
1248
1249         assert(uri);
1250         if_assert_failed return NULL;
1251
1252         string = get_uri_string(uri, components);
1253         if (!string) return NULL;
1254
1255         uri = get_uri(string, 0);
1256         mem_free(string);
1257
1258         return uri;
1259 }
1260
1261 struct uri *
1262 get_translated_uri(unsigned char *uristring, unsigned char *cwd)
1263 {
1264         struct uri *uri;
1265
1266         uristring = translate_url(uristring, cwd);
1267         if (!uristring) return NULL;
1268
1269         uri = get_uri(uristring, 0);
1270         mem_free(uristring);
1271
1272         return uri;
1273 }
1274
1275
1276 unsigned char *
1277 get_extension_from_uri(struct uri *uri)
1278 {
1279         unsigned char *extension = NULL;
1280         int afterslash = 1;
1281         unsigned char *pos = uri->data;
1282
1283         assert(pos);
1284
1285         for (; *pos && !end_of_dir(*pos); pos++) {
1286                 if (!afterslash && !extension && *pos == '.') {
1287                         extension = pos;
1288                 } else if (is_uri_dir_sep(uri, *pos)) {
1289                         extension = NULL;
1290                         afterslash = 1;
1291                 } else {
1292                         afterslash = 0;
1293                 }
1294         }
1295
1296         if (extension && extension < pos)
1297                 return memacpy(extension, pos - extension);
1298
1299         return NULL;
1300 }
1301
/* URI encoding, escaping unallowed characters. */

/* Tells whether @c may be put into an URI without being escaped: either
 * isident() accepts it or it is one of the marks listed in RFC 2396,
 * Page 8, Section 2.3 ;-) */
static inline int
safe_char(unsigned char c)
{
        switch (c) {
        case '.':
        case '!':
        case '~':
        case '*':
        case '\'':
        case '(':
        case ')':
                return 1;

        default:
                return isident(c) != 0;
        }
}
1310
/* Appends the first @namelen bytes of @name to @string, replacing every byte
 * that safe_char() rejects with a %XX escape. A negative @namelen means the
 * whole NUL terminated @name. A '/' is copied verbatim unless
 * @convert_slashes is set. */
void
encode_uri_string(struct string *string, unsigned char *name, int namelen,
                  int convert_slashes)
{
        unsigned char escape[4] = { '%', '\0', '\0', '\0' };
        int i;

        if (namelen < 0) namelen = strlen(name);

        for (i = 0; i < namelen; i++) {
                unsigned char c = name[i];

                /* Note that ' ' is deliberately not turned into '+'; that
                 * would probably be correct only for the query part of an
                 * URI. */
                if (safe_char(c) || (c == '/' && !convert_slashes)) {
                        add_char_to_string(string, c);
                        continue;
                }

                /* Hex it: high nibble first, then the low one. */
                escape[1] = hx((c & 0xF0) >> 4);
                escape[2] = hx(c & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1339
/* Variant of the URI encoder that copies ':' and '\\' verbatim in addition
 * to the bytes safe_char() accepts; every other byte, '/' included, becomes
 * a %XX escape. A negative @namelen means the whole NUL terminated @name. */
void
encode_win32_uri_string(struct string *string, unsigned char *name, int namelen)
{
        unsigned char escape[4] = { '%', '\0', '\0', '\0' };
        int i;

        if (namelen < 0) namelen = strlen(name);

        for (i = 0; i < namelen; i++) {
                unsigned char c = name[i];

                if (c == ':' || c == '\\' || safe_char(c)) {
                        add_char_to_string(string, c);
                        continue;
                }

                /* Hex it: high nibble first, then the low one. */
                escape[1] = hx((c & 0xF0) >> 4);
                escape[2] = hx(c & 0xF);
                add_bytes_to_string(string, escape, sizeof(escape) - 1);
        }
}
1362
/* Decodes the %XX escapes in @src in place. */
/* This function is evil, it modifies its parameter. */
/* XXX: but decoded string is _never_ longer than encoded string so it's an
 * efficient way to do that, imho. --Zas */
void
decode_uri(unsigned char *src)
{
        unsigned char *out = src;

        for (;;) {
                unsigned char ch = *src++;

                if (ch == '%') {
                        /* The second digit is looked at only if the first
                         * one is valid, so we never read behind the NUL. */
                        int hi = unhx(*src);
                        int lo = (hi >= 0) ? unhx(src[1]) : -1;

                        if (lo >= 0) {
                                int value = (hi << 4) + lo;

                                /* don't allow %00; it stays undecoded */
                                if (value != 0) {
                                        ch = (unsigned char) value;
                                        src += 2;
                                }
                        }
                }

                /* Note that '+' is deliberately not decoded to ' '; as
                 * with encoding, that would be appropriate only in the
                 * query part of a URI. */

                *out++ = ch;
                if (ch == '\0') break;
        }
}
1403
1404 void
1405 decode_uri_string(struct string *string)
1406 {
1407         decode_uri(string->source);
1408         string->length = strlen(string->source);
1409 }
1410
/* Decodes @src in place with decode_uri() and then replaces every byte that
 * is not printable or is a control char with '*'. */
void
decode_uri_for_display(unsigned char *src)
{
        unsigned char *cur;

        decode_uri(src);

        for (cur = src; *cur != '\0'; cur++) {
                if (isprint(*cur) && !iscntrl(*cur))
                        continue;

                *cur = '*';
        }
}
1420
1421 void
1422 decode_uri_string_for_display(struct string *string)
1423 {
1424         decode_uri_for_display(string->source);
1425         string->length = strlen(string->source);
1426 }
1427
1428
1429 /* URI list */
1430
1431 #define URI_LIST_GRANULARITY 0x3
1432
1433 #define realloc_uri_list(list) \
1434         mem_align_alloc(&(list)->uris, (list)->size, (list)->size + 1, \
1435                         URI_LIST_GRANULARITY)
1436
1437 struct uri *
1438 add_to_uri_list(struct uri_list *list, struct uri *uri)
1439 {
1440         if (!realloc_uri_list(list))
1441                 return NULL;
1442
1443         list->uris[list->size++] = get_uri_reference(uri);
1444
1445         return uri;
1446 };
1447
1448 void
1449 free_uri_list(struct uri_list *list)
1450 {
1451         struct uri *uri;
1452         int index;
1453
1454         if (!list->uris) return;
1455
1456         foreach_uri (uri, index, list) {
1457                 done_uri(uri);
1458         }
1459
1460         mem_free_set(&list->uris, NULL);
1461         list->size = 0;
1462 }
1463
1464 /* URI cache */
1465
1466 struct uri_cache_entry {
1467         struct uri uri;
1468         unsigned char string[1];
1469 };
1470
1471 struct uri_cache {
1472         struct hash *map;
1473         struct object object;
1474 };
1475
1476 static struct uri_cache uri_cache;
1477
#ifdef CONFIG_DEBUG
/* Debug-only check: reports an internal error if the protocol or the host
 * part of @uri contains an uppercase letter. */
static inline void
check_uri_sanity(struct uri *uri)
{
        int found_upper = 0;
        int i;

        for (i = 0; !found_upper && i < uri->protocollen; i++)
                found_upper = isupper(uri->string[i]);

        /* An empty host makes this loop a no-op. */
        for (i = 0; !found_upper && i < uri->hostlen; i++)
                found_upper = isupper(uri->host[i]);

        if (found_upper)
                INTERNAL("Uppercase letters detected in protocol or host part (%s).", struri(uri));
}
#else
#define check_uri_sanity(uri)
#endif
1497
1498 static inline struct uri_cache_entry *
1499 get_uri_cache_entry(unsigned char *string, int length)
1500 {
1501         struct uri_cache_entry *entry;
1502         struct hash_item *item;
1503
1504         assert(string && length > 0);
1505         if_assert_failed return NULL;
1506
1507         item = get_hash_item(uri_cache.map, string, length);
1508         if (item) return item->value;
1509
1510         /* Setup a new entry */
1511
1512         entry = mem_calloc(1, sizeof(*entry) + length);
1513         if (!entry) return NULL;
1514
1515         object_nolock(&entry->uri, "uri");
1516         memcpy(&entry->string, string, length);
1517         string = entry->string;
1518
1519         if (parse_uri(&entry->uri, string) != URI_ERRNO_OK
1520             || !add_hash_item(uri_cache.map, string, length, entry)) {
1521                 mem_free(entry);
1522                 return NULL;
1523         }
1524
1525         object_lock(&uri_cache);
1526
1527         return entry;
1528 }
1529
1530 struct uri *
1531 get_uri(unsigned char *string, enum uri_component components)
1532 {
1533         struct uri_cache_entry *entry;
1534
1535         assert(string);
1536
1537         if (components) {
1538                 struct uri uri;
1539
1540                 if (parse_uri(&uri, string) != URI_ERRNO_OK)
1541                         return NULL;
1542
1543                 return get_composed_uri(&uri, components);
1544         }
1545
1546         if (!is_object_used(&uri_cache)) {
1547                 uri_cache.map = init_hash8();
1548                 if (!uri_cache.map) return NULL;
1549                 object_nolock(&uri_cache, "uri_cache");
1550         }
1551
1552         entry = get_uri_cache_entry(string, strlen(string));
1553         if (!entry) {
1554                 if (!is_object_used(&uri_cache))
1555                         free_hash(&uri_cache.map);
1556                 return NULL;
1557         }
1558
1559         check_uri_sanity(&entry->uri);
1560         object_nolock(&entry->uri, "uri");
1561         object_lock(&entry->uri);
1562
1563         return &entry->uri;
1564 }
1565
1566 void
1567 done_uri(struct uri *uri)
1568 {
1569         unsigned char *string = struri(uri);
1570         int length = strlen(string);
1571         struct hash_item *item;
1572         struct uri_cache_entry *entry;
1573
1574         assert(is_object_used(&uri_cache));
1575
1576         object_unlock(uri);
1577         if (is_object_used(uri)) return;
1578
1579         item = get_hash_item(uri_cache.map, string, length);
1580         entry = item ? item->value : NULL;
1581
1582         assertm(entry, "Releasing unknown URI [%s]", string);
1583         del_hash_item(uri_cache.map, item);
1584         mem_free(entry);
1585
1586         /* Last URI frees the cache */
1587         object_unlock(&uri_cache);
1588         if (!is_object_used(&uri_cache))
1589                 free_hash(&uri_cache.map);
1590 }