util/uri.c

   1 /**
   2  * uri.c: set of generic URI related routines
   3  *
   4  * Reference: RFCs 3986, 2732 and 2373
   5  *
   6  * Copyright (C) 1998-2003 Daniel Veillard.  All Rights Reserved.
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  21  * DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  22  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  23  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24  *
  25  * Except as contained in this notice, the name of Daniel Veillard shall not
  26  * be used in advertising or otherwise to promote the sale, use or other
  27  * dealings in this Software without prior written authorization from him.
  28  *
  29  * daniel@veillard.com
  30  *
  31  **
  32  *
  33  * Copyright (C) 2007, 2009-2010 Red Hat, Inc.
  34  *
  35  * This library is free software; you can redistribute it and/or
  36  * modify it under the terms of the GNU Lesser General Public
  37  * License as published by the Free Software Foundation; either
  38  * version 2.1 of the License, or (at your option) any later version.
  39  *
  40  * This library is distributed in the hope that it will be useful,
  41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  43  * Lesser General Public License for more details.
  44  *
  45  * You should have received a copy of the GNU Lesser General Public
  46  * License along with this library; if not, write to the Free Software
  47  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  48  *
  49  * Authors:
  50  *    Richard W.M. Jones <rjones@redhat.com>
  51  *
  52  */
  53
  54 #include <glib.h>
  55 #include <string.h>
  56 #include <stdio.h>
  57
  58 #include "qemu/uri.h"
  59
  60 static void uri_clean(URI *uri);
  61
  62 /*
  63  * Old rule from 2396 used in legacy handling code
  64  * alpha    = lowalpha | upalpha
  65  */
  66 #define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x))
  67
  68
  69 /*
  70  * lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" |
  71  *            "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" |
  72  *            "u" | "v" | "w" | "x" | "y" | "z"
  73  */
  74
  75 #define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z'))
  76
  77 /*
  78  * upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" |
  79  *           "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" |
  80  *           "U" | "V" | "W" | "X" | "Y" | "Z"
  81  */
  82 #define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z'))
  83
  84 #ifdef IS_DIGIT
  85 #undef IS_DIGIT
  86 #endif
  87 /*
  88  * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
  89  */
  90 #define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9'))
  91
  92 /*
  93  * alphanum = alpha | digit
  94  */
  95
  96 #define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x))
  97
  98 /*
  99  * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
 100  */
 101
 102 #define IS_MARK(x) (((x) == '-') || ((x) == '_') || ((x) == '.') ||     \
 103     ((x) == '!') || ((x) == '~') || ((x) == '*') || ((x) == '\'') ||    \
 104     ((x) == '(') || ((x) == ')'))
 105
 106 /*
 107  * unwise = "{" | "}" | "|" | "\" | "^" | "`"
 108  */
 109
 110 #define IS_UNWISE(p)                                                    \
 111       (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) ||         \
 112        ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) ||        \
 113        ((*(p) == ']')) || ((*(p) == '`')))
 114 /*
 115  * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," |
 116  *            "[" | "]"
 117  */
 118
 119 #define IS_RESERVED(x) (((x) == ';') || ((x) == '/') || ((x) == '?') || \
 120         ((x) == ':') || ((x) == '@') || ((x) == '&') || ((x) == '=') || \
 121         ((x) == '+') || ((x) == '$') || ((x) == ',') || ((x) == '[') || \
 122         ((x) == ']'))
 123
 124 /*
 125  * unreserved = alphanum | mark
 126  */
 127
 128 #define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x))
 129
 130 /*
 131  * Skip to next pointer char, handle escaped sequences
 132  */
 133
 134 #define NEXT(p) ((*p == '%')? p += 3 : p++)
 135
 136 /*
 137  * Productions from the spec.
 138  *
 139  *    authority     = server | reg_name
 140  *    reg_name      = 1*( unreserved | escaped | "$" | "," |
 141  *                        ";" | ":" | "@" | "&" | "=" | "+" )
 142  *
 143  * path          = [ abs_path | opaque_part ]
 144  */
 145
 146
 147 /************************************************************************
 148  *                                                                      *
 149  *                         RFC 3986 parser                              *
 150  *                                                                      *
 151  ************************************************************************/
 152
 153 #define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9'))
 154 #define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) ||               \
 155                       ((*(p) >= 'A') && (*(p) <= 'Z')))
 156 #define ISA_HEXDIG(p)                                                   \
 157        (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) ||             \
 158         ((*(p) >= 'A') && (*(p) <= 'F')))
 159
 160 /*
 161  *    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
 162  *                     / "*" / "+" / "," / ";" / "="
 163  */
 164 #define ISA_SUB_DELIM(p)                                                \
 165       (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) ||         \
 166        ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) ||         \
 167        ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) ||         \
 168        ((*(p) == '=')) || ((*(p) == '\'')))
 169
 170 /*
 171  *    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 172  */
 173 #define ISA_GEN_DELIM(p)                                                \
 174       (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) ||         \
 175        ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) ||         \
 176        ((*(p) == '@')))
 177
 178 /*
 179  *    reserved      = gen-delims / sub-delims
 180  */
 181 #define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p)))
 182
 183 /*
 184  *    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
 185  */
 186 #define ISA_UNRESERVED(p)                                               \
 187       ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) ||           \
 188        ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~')))
 189
 190 /*
 191  *    pct-encoded   = "%" HEXDIG HEXDIG
 192  */
 193 #define ISA_PCT_ENCODED(p)                                              \
 194      ((*(p) == '%') && (ISA_HEXDIG(p + 1)) && (ISA_HEXDIG(p + 2)))
 195
 196 /*
 197  *    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
 198  */
 199 #define ISA_PCHAR(p)                                                    \
 200      (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) ||    \
 201       ((*(p) == ':')) || ((*(p) == '@')))
 202
 203 /**
 204  * rfc3986_parse_scheme:
 205  * @uri:  pointer to an URI structure
 206  * @str:  pointer to the string to analyze
 207  *
 208  * Parse an URI scheme
 209  *
 210  * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 211  *
 212  * Returns 0 or the error code
 213  */
 214 static int
 215 rfc3986_parse_scheme(URI *uri, const char **str) {
 216     const char *cur;
 217
 218     if (str == NULL)
 219         return(-1);
 220
 221     cur = *str;
 222     if (!ISA_ALPHA(cur))
 223         return(2);
 224     cur++;
 225     while (ISA_ALPHA(cur) || ISA_DIGIT(cur) ||
 226            (*cur == '+') || (*cur == '-') || (*cur == '.')) cur++;
 227     if (uri != NULL) {
 228         g_free(uri->scheme);
 229         uri->scheme = g_strndup(*str, cur - *str);
 230     }
 231     *str = cur;
 232     return(0);
 233 }
 234
 235 /**
 236  * rfc3986_parse_fragment:
 237  * @uri:  pointer to an URI structure
 238  * @str:  pointer to the string to analyze
 239  *
 240  * Parse the query part of an URI
 241  *
 242  * fragment      = *( pchar / "/" / "?" )
 243  * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']'
 244  *       in the fragment identifier but this is used very broadly for
 245  *       xpointer scheme selection, so we are allowing it here to not break
 246  *       for example all the DocBook processing chains.
 247  *
 248  * Returns 0 or the error code
 249  */
 250 static int
 251 rfc3986_parse_fragment(URI *uri, const char **str)
 252 {
 253     const char *cur;
 254
 255     if (str == NULL)
 256         return (-1);
 257
 258     cur = *str;
 259
 260     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
 261            (*cur == '[') || (*cur == ']') ||
 262            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
 263         NEXT(cur);
 264     if (uri != NULL) {
 265         g_free(uri->fragment);
 266         if (uri->cleanup & 2)
 267             uri->fragment = g_strndup(*str, cur - *str);
 268         else
 269             uri->fragment = uri_string_unescape(*str, cur - *str, NULL);
 270     }
 271     *str = cur;
 272     return (0);
 273 }
 274
 275 /**
 276  * rfc3986_parse_query:
 277  * @uri:  pointer to an URI structure
 278  * @str:  pointer to the string to analyze
 279  *
 280  * Parse the query part of an URI
 281  *
 282  * query = *uric
 283  *
 284  * Returns 0 or the error code
 285  */
 286 static int
 287 rfc3986_parse_query(URI *uri, const char **str)
 288 {
 289     const char *cur;
 290
 291     if (str == NULL)
 292         return (-1);
 293
 294     cur = *str;
 295
 296     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
 297            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
 298         NEXT(cur);
 299     if (uri != NULL) {
 300         g_free(uri->query);
 301         uri->query = g_strndup (*str, cur - *str);
 302     }
 303     *str = cur;
 304     return (0);
 305 }
 306
 307 /**
 308  * rfc3986_parse_port:
 309  * @uri:  pointer to an URI structure
 310  * @str:  the string to analyze
 311  *
 312  * Parse a port  part and fills in the appropriate fields
 313  * of the @uri structure
 314  *
 315  * port          = *DIGIT
 316  *
 317  * Returns 0 or the error code
 318  */
 319 static int
 320 rfc3986_parse_port(URI *uri, const char **str)
 321 {
 322     const char *cur = *str;
 323     int port = 0;
 324
 325     if (ISA_DIGIT(cur)) {
 326         while (ISA_DIGIT(cur)) {
 327             port = port * 10 + (*cur - '0');
 328             if (port > 65535) {
 329                 return 1;
 330             }
 331             cur++;
 332         }
 333         if (uri) {
 334             uri->port = port;
 335         }
 336         *str = cur;
 337         return 0;
 338     }
 339     return 1;
 340 }
 341
 342 /**
 343  * rfc3986_parse_user_info:
 344  * @uri:  pointer to an URI structure
 345  * @str:  the string to analyze
 346  *
 347  * Parse an user informations part and fills in the appropriate fields
 348  * of the @uri structure
 349  *
 350  * userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
 351  *
 352  * Returns 0 or the error code
 353  */
 354 static int
 355 rfc3986_parse_user_info(URI *uri, const char **str)
 356 {
 357     const char *cur;
 358
 359     cur = *str;
 360     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) ||
 361            ISA_SUB_DELIM(cur) || (*cur == ':'))
 362         NEXT(cur);
 363     if (*cur == '@') {
 364         if (uri != NULL) {
 365             g_free(uri->user);
 366             if (uri->cleanup & 2)
 367                 uri->user = g_strndup(*str, cur - *str);
 368             else
 369                 uri->user = uri_string_unescape(*str, cur - *str, NULL);
 370         }
 371         *str = cur;
 372         return(0);
 373     }
 374     return(1);
 375 }
 376
 377 /**
 378  * rfc3986_parse_dec_octet:
 379  * @str:  the string to analyze
 380  *
 381  *    dec-octet     = DIGIT                 ; 0-9
 382  *                  / %x31-39 DIGIT         ; 10-99
 383  *                  / "1" 2DIGIT            ; 100-199
 384  *                  / "2" %x30-34 DIGIT     ; 200-249
 385  *                  / "25" %x30-35          ; 250-255
 386  *
 387  * Skip a dec-octet.
 388  *
 389  * Returns 0 if found and skipped, 1 otherwise
 390  */
 391 static int
 392 rfc3986_parse_dec_octet(const char **str) {
 393     const char *cur = *str;
 394
 395     if (!(ISA_DIGIT(cur)))
 396         return(1);
 397     if (!ISA_DIGIT(cur+1))
 398         cur++;
 399     else if ((*cur != '0') && (ISA_DIGIT(cur + 1)) && (!ISA_DIGIT(cur+2)))
 400         cur += 2;
 401     else if ((*cur == '1') && (ISA_DIGIT(cur + 1)) && (ISA_DIGIT(cur + 2)))
 402         cur += 3;
 403     else if ((*cur == '2') && (*(cur + 1) >= '0') &&
 404              (*(cur + 1) <= '4') && (ISA_DIGIT(cur + 2)))
 405         cur += 3;
 406     else if ((*cur == '2') && (*(cur + 1) == '5') &&
 407              (*(cur + 2) >= '0') && (*(cur + 1) <= '5'))
 408         cur += 3;
 409     else
 410         return(1);
 411     *str = cur;
 412     return(0);
 413 }
 414 /**
 415  * rfc3986_parse_host:
 416  * @uri:  pointer to an URI structure
 417  * @str:  the string to analyze
 418  *
 419  * Parse an host part and fills in the appropriate fields
 420  * of the @uri structure
 421  *
 422  * host          = IP-literal / IPv4address / reg-name
 423  * IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
 424  * IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
 425  * reg-name      = *( unreserved / pct-encoded / sub-delims )
 426  *
 427  * Returns 0 or the error code
 428  */
 429 static int
 430 rfc3986_parse_host(URI *uri, const char **str)
 431 {
 432     const char *cur = *str;
 433     const char *host;
 434
 435     host = cur;
 436     /*
 437      * IPv6 and future addressing scheme are enclosed between brackets
 438      */
 439     if (*cur == '[') {
 440         cur++;
 441         while ((*cur != ']') && (*cur != 0))
 442             cur++;
 443         if (*cur != ']')
 444             return(1);
 445         cur++;
 446         goto found;
 447     }
 448     /*
 449      * try to parse an IPv4
 450      */
 451     if (ISA_DIGIT(cur)) {
 452         if (rfc3986_parse_dec_octet(&cur) != 0)
 453             goto not_ipv4;
 454         if (*cur != '.')
 455             goto not_ipv4;
 456         cur++;
 457         if (rfc3986_parse_dec_octet(&cur) != 0)
 458             goto not_ipv4;
 459         if (*cur != '.')
 460             goto not_ipv4;
 461         if (rfc3986_parse_dec_octet(&cur) != 0)
 462             goto not_ipv4;
 463         if (*cur != '.')
 464             goto not_ipv4;
 465         if (rfc3986_parse_dec_octet(&cur) != 0)
 466             goto not_ipv4;
 467         goto found;
 468 not_ipv4:
 469         cur = *str;
 470     }
 471     /*
 472      * then this should be a hostname which can be empty
 473      */
 474     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur))
 475         NEXT(cur);
 476 found:
 477     if (uri != NULL) {
 478         g_free(uri->authority);
 479         uri->authority = NULL;
 480         g_free(uri->server);
 481         if (cur != host) {
 482             if (uri->cleanup & 2)
 483                 uri->server = g_strndup(host, cur - host);
 484             else
 485                 uri->server = uri_string_unescape(host, cur - host, NULL);
 486         } else
 487             uri->server = NULL;
 488     }
 489     *str = cur;
 490     return(0);
 491 }
 492
 493 /**
 494  * rfc3986_parse_authority:
 495  * @uri:  pointer to an URI structure
 496  * @str:  the string to analyze
 497  *
 498  * Parse an authority part and fills in the appropriate fields
 499  * of the @uri structure
 500  *
 501  * authority     = [ userinfo "@" ] host [ ":" port ]
 502  *
 503  * Returns 0 or the error code
 504  */
 505 static int
 506 rfc3986_parse_authority(URI *uri, const char **str)
 507 {
 508     const char *cur;
 509     int ret;
 510
 511     cur = *str;
 512     /*
 513      * try to parse an userinfo and check for the trailing @
 514      */
 515     ret = rfc3986_parse_user_info(uri, &cur);
 516     if ((ret != 0) || (*cur != '@'))
 517         cur = *str;
 518     else
 519         cur++;
 520     ret = rfc3986_parse_host(uri, &cur);
 521     if (ret != 0) return(ret);
 522     if (*cur == ':') {
 523         cur++;
 524         ret = rfc3986_parse_port(uri, &cur);
 525         if (ret != 0) return(ret);
 526     }
 527     *str = cur;
 528     return(0);
 529 }
 530
 531 /**
 532  * rfc3986_parse_segment:
 533  * @str:  the string to analyze
 534  * @forbid: an optional forbidden character
 535  * @empty: allow an empty segment
 536  *
 537  * Parse a segment and fills in the appropriate fields
 538  * of the @uri structure
 539  *
 540  * segment       = *pchar
 541  * segment-nz    = 1*pchar
 542  * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
 543  *               ; non-zero-length segment without any colon ":"
 544  *
 545  * Returns 0 or the error code
 546  */
 547 static int
 548 rfc3986_parse_segment(const char **str, char forbid, int empty)
 549 {
 550     const char *cur;
 551
 552     cur = *str;
 553     if (!ISA_PCHAR(cur)) {
 554         if (empty)
 555             return(0);
 556         return(1);
 557     }
 558     while (ISA_PCHAR(cur) && (*cur != forbid))
 559         NEXT(cur);
 560     *str = cur;
 561     return (0);
 562 }
 563
 564 /**
 565  * rfc3986_parse_path_ab_empty:
 566  * @uri:  pointer to an URI structure
 567  * @str:  the string to analyze
 568  *
 569  * Parse an path absolute or empty and fills in the appropriate fields
 570  * of the @uri structure
 571  *
 572  * path-abempty  = *( "/" segment )
 573  *
 574  * Returns 0 or the error code
 575  */
 576 static int
 577 rfc3986_parse_path_ab_empty(URI *uri, const char **str)
 578 {
 579     const char *cur;
 580     int ret;
 581
 582     cur = *str;
 583
 584     while (*cur == '/') {
 585         cur++;
 586         ret = rfc3986_parse_segment(&cur, 0, 1);
 587         if (ret != 0) return(ret);
 588     }
 589     if (uri != NULL) {
 590         g_free(uri->path);
 591         if (*str != cur) {
 592             if (uri->cleanup & 2)
 593                 uri->path = g_strndup(*str, cur - *str);
 594             else
 595                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 596         } else {
 597             uri->path = NULL;
 598         }
 599     }
 600     *str = cur;
 601     return (0);
 602 }
 603
 604 /**
 605  * rfc3986_parse_path_absolute:
 606  * @uri:  pointer to an URI structure
 607  * @str:  the string to analyze
 608  *
 609  * Parse an path absolute and fills in the appropriate fields
 610  * of the @uri structure
 611  *
 612  * path-absolute = "/" [ segment-nz *( "/" segment ) ]
 613  *
 614  * Returns 0 or the error code
 615  */
 616 static int
 617 rfc3986_parse_path_absolute(URI *uri, const char **str)
 618 {
 619     const char *cur;
 620     int ret;
 621
 622     cur = *str;
 623
 624     if (*cur != '/')
 625         return(1);
 626     cur++;
 627     ret = rfc3986_parse_segment(&cur, 0, 0);
 628     if (ret == 0) {
 629         while (*cur == '/') {
 630             cur++;
 631             ret = rfc3986_parse_segment(&cur, 0, 1);
 632             if (ret != 0) return(ret);
 633         }
 634     }
 635     if (uri != NULL) {
 636         g_free(uri->path);
 637         if (cur != *str) {
 638             if (uri->cleanup & 2)
 639                 uri->path = g_strndup(*str, cur - *str);
 640             else
 641                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 642         } else {
 643             uri->path = NULL;
 644         }
 645     }
 646     *str = cur;
 647     return (0);
 648 }
 649
 650 /**
 651  * rfc3986_parse_path_rootless:
 652  * @uri:  pointer to an URI structure
 653  * @str:  the string to analyze
 654  *
 655  * Parse an path without root and fills in the appropriate fields
 656  * of the @uri structure
 657  *
 658  * path-rootless = segment-nz *( "/" segment )
 659  *
 660  * Returns 0 or the error code
 661  */
 662 static int
 663 rfc3986_parse_path_rootless(URI *uri, const char **str)
 664 {
 665     const char *cur;
 666     int ret;
 667
 668     cur = *str;
 669
 670     ret = rfc3986_parse_segment(&cur, 0, 0);
 671     if (ret != 0) return(ret);
 672     while (*cur == '/') {
 673         cur++;
 674         ret = rfc3986_parse_segment(&cur, 0, 1);
 675         if (ret != 0) return(ret);
 676     }
 677     if (uri != NULL) {
 678         g_free(uri->path);
 679         if (cur != *str) {
 680             if (uri->cleanup & 2)
 681                 uri->path = g_strndup(*str, cur - *str);
 682             else
 683                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 684         } else {
 685             uri->path = NULL;
 686         }
 687     }
 688     *str = cur;
 689     return (0);
 690 }
 691
 692 /**
 693  * rfc3986_parse_path_no_scheme:
 694  * @uri:  pointer to an URI structure
 695  * @str:  the string to analyze
 696  *
 697  * Parse an path which is not a scheme and fills in the appropriate fields
 698  * of the @uri structure
 699  *
 700  * path-noscheme = segment-nz-nc *( "/" segment )
 701  *
 702  * Returns 0 or the error code
 703  */
 704 static int
 705 rfc3986_parse_path_no_scheme(URI *uri, const char **str)
 706 {
 707     const char *cur;
 708     int ret;
 709
 710     cur = *str;
 711
 712     ret = rfc3986_parse_segment(&cur, ':', 0);
 713     if (ret != 0) return(ret);
 714     while (*cur == '/') {
 715         cur++;
 716         ret = rfc3986_parse_segment(&cur, 0, 1);
 717         if (ret != 0) return(ret);
 718     }
 719     if (uri != NULL) {
 720         g_free(uri->path);
 721         if (cur != *str) {
 722             if (uri->cleanup & 2)
 723                 uri->path = g_strndup(*str, cur - *str);
 724             else
 725                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 726         } else {
 727             uri->path = NULL;
 728         }
 729     }
 730     *str = cur;
 731     return (0);
 732 }
 733
 734 /**
 735  * rfc3986_parse_hier_part:
 736  * @uri:  pointer to an URI structure
 737  * @str:  the string to analyze
 738  *
 739  * Parse an hierarchical part and fills in the appropriate fields
 740  * of the @uri structure
 741  *
 742  * hier-part     = "//" authority path-abempty
 743  *                / path-absolute
 744  *                / path-rootless
 745  *                / path-empty
 746  *
 747  * Returns 0 or the error code
 748  */
 749 static int
 750 rfc3986_parse_hier_part(URI *uri, const char **str)
 751 {
 752     const char *cur;
 753     int ret;
 754
 755     cur = *str;
 756
 757     if ((*cur == '/') && (*(cur + 1) == '/')) {
 758         cur += 2;
 759         ret = rfc3986_parse_authority(uri, &cur);
 760         if (ret != 0) return(ret);
 761         ret = rfc3986_parse_path_ab_empty(uri, &cur);
 762         if (ret != 0) return(ret);
 763         *str = cur;
 764         return(0);
 765     } else if (*cur == '/') {
 766         ret = rfc3986_parse_path_absolute(uri, &cur);
 767         if (ret != 0) return(ret);
 768     } else if (ISA_PCHAR(cur)) {
 769         ret = rfc3986_parse_path_rootless(uri, &cur);
 770         if (ret != 0) return(ret);
 771     } else {
 772         /* path-empty is effectively empty */
 773         if (uri != NULL) {
 774             g_free(uri->path);
 775             uri->path = NULL;
 776         }
 777     }
 778     *str = cur;
 779     return (0);
 780 }
 781
 782 /**
 783  * rfc3986_parse_relative_ref:
 784  * @uri:  pointer to an URI structure
 785  * @str:  the string to analyze
 786  *
 787  * Parse an URI string and fills in the appropriate fields
 788  * of the @uri structure
 789  *
 790  * relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 791  * relative-part = "//" authority path-abempty
 792  *               / path-absolute
 793  *               / path-noscheme
 794  *               / path-empty
 795  *
 796  * Returns 0 or the error code
 797  */
 798 static int
 799 rfc3986_parse_relative_ref(URI *uri, const char *str) {
 800     int ret;
 801
 802     if ((*str == '/') && (*(str + 1) == '/')) {
 803         str += 2;
 804         ret = rfc3986_parse_authority(uri, &str);
 805         if (ret != 0) return(ret);
 806         ret = rfc3986_parse_path_ab_empty(uri, &str);
 807         if (ret != 0) return(ret);
 808     } else if (*str == '/') {
 809         ret = rfc3986_parse_path_absolute(uri, &str);
 810         if (ret != 0) return(ret);
 811     } else if (ISA_PCHAR(str)) {
 812         ret = rfc3986_parse_path_no_scheme(uri, &str);
 813         if (ret != 0) return(ret);
 814     } else {
 815         /* path-empty is effectively empty */
 816         if (uri != NULL) {
 817             g_free(uri->path);
 818             uri->path = NULL;
 819         }
 820     }
 821
 822     if (*str == '?') {
 823         str++;
 824         ret = rfc3986_parse_query(uri, &str);
 825         if (ret != 0) return(ret);
 826     }
 827     if (*str == '#') {
 828         str++;
 829         ret = rfc3986_parse_fragment(uri, &str);
 830         if (ret != 0) return(ret);
 831     }
 832     if (*str != 0) {
 833         uri_clean(uri);
 834         return(1);
 835     }
 836     return(0);
 837 }
 838
 839
 840 /**
 841  * rfc3986_parse:
 842  * @uri:  pointer to an URI structure
 843  * @str:  the string to analyze
 844  *
 845  * Parse an URI string and fills in the appropriate fields
 846  * of the @uri structure
 847  *
 848  * scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 849  *
 850  * Returns 0 or the error code
 851  */
 852 static int
 853 rfc3986_parse(URI *uri, const char *str) {
 854     int ret;
 855
 856     ret = rfc3986_parse_scheme(uri, &str);
 857     if (ret != 0) return(ret);
 858     if (*str != ':') {
 859         return(1);
 860     }
 861     str++;
 862     ret = rfc3986_parse_hier_part(uri, &str);
 863     if (ret != 0) return(ret);
 864     if (*str == '?') {
 865         str++;
 866         ret = rfc3986_parse_query(uri, &str);
 867         if (ret != 0) return(ret);
 868     }
 869     if (*str == '#') {
 870         str++;
 871         ret = rfc3986_parse_fragment(uri, &str);
 872         if (ret != 0) return(ret);
 873     }
 874     if (*str != 0) {
 875         uri_clean(uri);
 876         return(1);
 877     }
 878     return(0);
 879 }
 880
 881 /**
 882  * rfc3986_parse_uri_reference:
 883  * @uri:  pointer to an URI structure
 884  * @str:  the string to analyze
 885  *
 886  * Parse an URI reference string and fills in the appropriate fields
 887  * of the @uri structure
 888  *
 889  * URI-reference = URI / relative-ref
 890  *
 891  * Returns 0 or the error code
 892  */
 893 static int
 894 rfc3986_parse_uri_reference(URI *uri, const char *str) {
 895     int ret;
 896
 897     if (str == NULL)
 898         return(-1);
 899     uri_clean(uri);
 900
 901     /*
 902      * Try first to parse absolute refs, then fallback to relative if
 903      * it fails.
 904      */
 905     ret = rfc3986_parse(uri, str);
 906     if (ret != 0) {
 907         uri_clean(uri);
 908         ret = rfc3986_parse_relative_ref(uri, str);
 909         if (ret != 0) {
 910             uri_clean(uri);
 911             return(ret);
 912         }
 913     }
 914     return(0);
 915 }
 916
 917 /**
 918  * uri_parse:
 919  * @str:  the URI string to analyze
 920  *
 921  * Parse an URI based on RFC 3986
 922  *
 923  * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 924  *
 925  * Returns a newly built URI or NULL in case of error
 926  */
 927 URI *
 928 uri_parse(const char *str) {
 929     URI *uri;
 930     int ret;
 931
 932     if (str == NULL)
 933         return(NULL);
 934     uri = uri_new();
 935     ret = rfc3986_parse_uri_reference(uri, str);
 936     if (ret) {
 937         uri_free(uri);
 938         return(NULL);
 939     }
 940     return(uri);
 941 }
 942
 943 /**
 944  * uri_parse_into:
 945  * @uri:  pointer to an URI structure
 946  * @str:  the string to analyze
 947  *
 948  * Parse an URI reference string based on RFC 3986 and fills in the
 949  * appropriate fields of the @uri structure
 950  *
 951  * URI-reference = URI / relative-ref
 952  *
 953  * Returns 0 or the error code
 954  */
 955 int
 956 uri_parse_into(URI *uri, const char *str) {
 957     return(rfc3986_parse_uri_reference(uri, str));
 958 }
 959
 960 /**
 961  * uri_parse_raw:
 962  * @str:  the URI string to analyze
 963  * @raw:  if 1 unescaping of URI pieces are disabled
 964  *
 965  * Parse an URI but allows to keep intact the original fragments.
 966  *
 967  * URI-reference = URI / relative-ref
 968  *
 969  * Returns a newly built URI or NULL in case of error
 970  */
 971 URI *
 972 uri_parse_raw(const char *str, int raw) {
 973     URI *uri;
 974     int ret;
 975
 976     if (str == NULL)
 977         return(NULL);
 978     uri = uri_new();
 979     if (raw) {
 980         uri->cleanup |= 2;
 981     }
 982     ret = uri_parse_into(uri, str);
 983     if (ret) {
 984         uri_free(uri);
 985         return(NULL);
 986     }
 987     return(uri);
 988 }
 989
 990 /************************************************************************
 991  *                                                                      *
 992  *                      Generic URI structure functions                 *
 993  *                                                                      *
 994  ************************************************************************/
 995
 996 /**
 997  * uri_new:
 998  *
 999  * Simply creates an empty URI
1000  *
1001  * Returns the new structure or NULL in case of error
1002  */
1003 URI *
1004 uri_new(void) {
1005     URI *ret;
1006
1007     ret = g_new0(URI, 1);
1008     return(ret);
1009 }
1010
/**
 * realloc2n:
 * @ret:  the buffer to grow
 * @max:  in/out capacity of the buffer, not counting the extra byte
 *
 * Helper used while saving an URI as a string: doubles the capacity
 * stored in *@max and reallocates @ret to that size plus one byte.
 * No upper bound is enforced on the resulting size.
 *
 * Returns the reallocated buffer
 */
static char *
realloc2n(char *ret, int *max)
{
    int doubled = *max * 2;
    char *grown = g_realloc(ret, (doubled + 1));

    *max = doubled;
    return grown;
}
1027
1028 /**
1029  * uri_to_string:
1030  * @uri:  pointer to an URI
1031  *
1032  * Save the URI as an escaped string
1033  *
1034  * Returns a new string (to be deallocated by caller)
1035  */
1036 char *
1037 uri_to_string(URI *uri) {
1038     char *ret = NULL;
1039     char *temp;
1040     const char *p;
1041     int len;
1042     int max;
1043
1044     if (uri == NULL) return(NULL);
1045
1046
1047     max = 80;
1048     ret = g_malloc(max + 1);
1049     len = 0;
1050
1051     if (uri->scheme != NULL) {
1052         p = uri->scheme;
1053         while (*p != 0) {
1054             if (len >= max) {
1055                 temp = realloc2n(ret, &max);
1056                 ret = temp;
1057             }
1058             ret[len++] = *p++;
1059         }
1060         if (len >= max) {
1061             temp = realloc2n(ret, &max);
1062             ret = temp;
1063         }
1064         ret[len++] = ':';
1065     }
1066     if (uri->opaque != NULL) {
1067         p = uri->opaque;
1068         while (*p != 0) {
1069             if (len + 3 >= max) {
1070                 temp = realloc2n(ret, &max);
1071                 ret = temp;
1072             }
1073             if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p)))
1074                 ret[len++] = *p++;
1075             else {
1076                 int val = *(unsigned char *)p++;
1077                 int hi = val / 0x10, lo = val % 0x10;
1078                 ret[len++] = '%';
1079                 ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1080                 ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1081             }
1082         }
1083     } else {
1084         if (uri->server != NULL) {
1085             if (len + 3 >= max) {
1086                 temp = realloc2n(ret, &max);
1087                 ret = temp;
1088             }
1089             ret[len++] = '/';
1090             ret[len++] = '/';
1091             if (uri->user != NULL) {
1092                 p = uri->user;
1093                 while (*p != 0) {
1094                     if (len + 3 >= max) {
1095                         temp = realloc2n(ret, &max);
1096                         ret = temp;
1097                     }
1098                     if ((IS_UNRESERVED(*(p))) ||
1099                         ((*(p) == ';')) || ((*(p) == ':')) ||
1100                         ((*(p) == '&')) || ((*(p) == '=')) ||
1101                         ((*(p) == '+')) || ((*(p) == '$')) ||
1102                         ((*(p) == ',')))
1103                         ret[len++] = *p++;
1104                     else {
1105                         int val = *(unsigned char *)p++;
1106                         int hi = val / 0x10, lo = val % 0x10;
1107                         ret[len++] = '%';
1108                         ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1109                         ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1110                     }
1111                 }
1112                 if (len + 3 >= max) {
1113                     temp = realloc2n(ret, &max);
1114                     ret = temp;
1115                 }
1116                 ret[len++] = '@';
1117             }
1118             p = uri->server;
1119             while (*p != 0) {
1120                 if (len >= max) {
1121                     temp = realloc2n(ret, &max);
1122                     ret = temp;
1123                 }
1124                 ret[len++] = *p++;
1125             }
1126             if (uri->port > 0) {
1127                 if (len + 10 >= max) {
1128                     temp = realloc2n(ret, &max);
1129                     ret = temp;
1130                 }
1131                 len += snprintf(&ret[len], max - len, ":%d", uri->port);
1132             }
1133         } else if (uri->authority != NULL) {
1134             if (len + 3 >= max) {
1135                 temp = realloc2n(ret, &max);
1136                 ret = temp;
1137             }
1138             ret[len++] = '/';
1139             ret[len++] = '/';
1140             p = uri->authority;
1141             while (*p != 0) {
1142                 if (len + 3 >= max) {
1143                     temp = realloc2n(ret, &max);
1144                     ret = temp;
1145                 }
1146                 if ((IS_UNRESERVED(*(p))) ||
1147                     ((*(p) == '$')) || ((*(p) == ',')) || ((*(p) == ';')) ||
1148                     ((*(p) == ':')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1149                     ((*(p) == '=')) || ((*(p) == '+')))
1150                     ret[len++] = *p++;
1151                 else {
1152                     int val = *(unsigned char *)p++;
1153                     int hi = val / 0x10, lo = val % 0x10;
1154                     ret[len++] = '%';
1155                     ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1156                     ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1157                 }
1158             }
1159         } else if (uri->scheme != NULL) {
1160             if (len + 3 >= max) {
1161                 temp = realloc2n(ret, &max);
1162                 ret = temp;
1163             }
1164             ret[len++] = '/';
1165             ret[len++] = '/';
1166         }
1167         if (uri->path != NULL) {
1168             p = uri->path;
1169             /*
1170              * the colon in file:///d: should not be escaped or
1171              * Windows accesses fail later.
1172              */
1173             if ((uri->scheme != NULL) &&
1174                 (p[0] == '/') &&
1175                 (((p[1] >= 'a') && (p[1] <= 'z')) ||
1176                  ((p[1] >= 'A') && (p[1] <= 'Z'))) &&
1177                 (p[2] == ':') &&
1178                 (!strcmp(uri->scheme, "file"))) {
1179                 if (len + 3 >= max) {
1180                     temp = realloc2n(ret, &max);
1181                     ret = temp;
1182                 }
1183                 ret[len++] = *p++;
1184                 ret[len++] = *p++;
1185                 ret[len++] = *p++;
1186             }
1187             while (*p != 0) {
1188                 if (len + 3 >= max) {
1189                     temp = realloc2n(ret, &max);
1190                     ret = temp;
1191                 }
1192                 if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) ||
1193                     ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1194                     ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) ||
1195                     ((*(p) == ',')))
1196                     ret[len++] = *p++;
1197                 else {
1198                     int val = *(unsigned char *)p++;
1199                     int hi = val / 0x10, lo = val % 0x10;
1200                     ret[len++] = '%';
1201                     ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1202                     ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1203                 }
1204             }
1205         }
1206         if (uri->query != NULL) {
1207             if (len + 1 >= max) {
1208                 temp = realloc2n(ret, &max);
1209                 ret = temp;
1210             }
1211             ret[len++] = '?';
1212             p = uri->query;
1213             while (*p != 0) {
1214                 if (len + 1 >= max) {
1215                     temp = realloc2n(ret, &max);
1216                     ret = temp;
1217                 }
1218                 ret[len++] = *p++;
1219             }
1220         }
1221     }
1222     if (uri->fragment != NULL) {
1223         if (len + 3 >= max) {
1224             temp = realloc2n(ret, &max);
1225             ret = temp;
1226         }
1227         ret[len++] = '#';
1228         p = uri->fragment;
1229         while (*p != 0) {
1230             if (len + 3 >= max) {
1231                 temp = realloc2n(ret, &max);
1232                 ret = temp;
1233             }
1234             if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p))))
1235                 ret[len++] = *p++;
1236             else {
1237                 int val = *(unsigned char *)p++;
1238                 int hi = val / 0x10, lo = val % 0x10;
1239                 ret[len++] = '%';
1240                 ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1241                 ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1242             }
1243         }
1244     }
1245     if (len >= max) {
1246         temp = realloc2n(ret, &max);
1247         ret = temp;
1248     }
1249     ret[len] = 0;
1250     return(ret);
1251 }
1252
1253 /**
1254  * uri_clean:
1255  * @uri:  pointer to an URI
1256  *
1257  * Make sure the URI struct is free of content
1258  */
1259 static void
1260 uri_clean(URI *uri) {
1261     if (uri == NULL) return;
1262
1263     g_free(uri->scheme);
1264     uri->scheme = NULL;
1265     g_free(uri->server);
1266     uri->server = NULL;
1267     g_free(uri->user);
1268     uri->user = NULL;
1269     g_free(uri->path);
1270     uri->path = NULL;
1271     g_free(uri->fragment);
1272     uri->fragment = NULL;
1273     g_free(uri->opaque);
1274     uri->opaque = NULL;
1275     g_free(uri->authority);
1276     uri->authority = NULL;
1277     g_free(uri->query);
1278     uri->query = NULL;
1279 }
1280
1281 /**
1282  * uri_free:
1283  * @uri:  pointer to an URI
1284  *
1285  * Free up the URI struct
1286  */
1287 void
1288 uri_free(URI *uri) {
1289     uri_clean(uri);
1290     g_free(uri);
1291 }
1292
1293 /************************************************************************
1294  *                                                                      *
1295  *                      Helper functions                                *
1296  *                                                                      *
1297  ************************************************************************/
1298
/**
 * normalize_uri_path:
 * @path:  pointer to the path string
 *
 * Applies the normalization steps of RFC 2396 Section 5.2, steps 6.c
 * through 6.g, to a path string: "." segments and duplicate slashes
 * are dropped, "<segment>/.." pairs are collapsed, and ".." segments
 * left at the start of an absolute path are discarded.
 *
 * The string is rewritten in place; it never grows and nothing is
 * allocated.
 *
 * Returns 0 on success, -1 when @path is NULL
 */
static int
normalize_uri_path(char *path) {
    char *rd;   /* read position */
    char *wr;   /* write position of the first pass */

    if (path == NULL) {
        return -1;
    }

    /* Leading slashes are kept untouched: start at the first segment. */
    for (rd = path; *rd == '/'; rd++) {
    }
    if (*rd == '\0') {
        return 0;
    }
    wr = rd;

    /*
     * First pass, cases (c) and (d): drop "./" segments and a trailing
     * ".", and squeeze runs of slashes into a single one.  At the top
     * of the loop rd always sits on the first character of a segment.
     */
    while (*rd != '\0') {
        /* c) a complete "." segment followed by a slash is removed,
         *    together with any extra slashes right behind it. */
        if ((rd[0] == '.') && (rd[1] == '/')) {
            rd += 2;
            while (*rd == '/') {
                rd++;
            }
            continue;
        }

        /* d) a "." that is the final segment is removed. */
        if ((rd[0] == '.') && (rd[1] == '\0')) {
            break;
        }

        /* Any other segment is kept as it is. */
        while ((*rd != '/') && (*rd != '\0')) {
            *wr++ = *rd++;
        }
        if (*rd == '\0') {
            break;
        }

        /* Normalize "//": move to the last slash of the run and copy
         * only that one. */
        while ((rd[0] == '/') && (rd[1] == '/')) {
            rd++;
        }
        *wr++ = *rd++;
    }
    *wr = '\0';

    /* Start over at the first segment for the second pass. */
    for (rd = path; *rd == '/'; rd++) {
    }
    if (*rd == '\0') {
        return 0;
    }

    /*
     * Second pass, cases (e) and (f):
     *
     * e) every "<segment>/../", where <segment> is a complete segment
     *    other than "..", is removed, leftmost match first, until no
     *    match remains;
     * f) a trailing "<segment>/.." of the same kind is removed.
     *
     * The string is compacted as soon as a match is found, so a single
     * position pointer is enough.
     */
    for (;;) {
        char *next;   /* end of the current segment, then start of the next */
        char *prev;   /* used to walk back to the previous segment */

        /* rd is on the first character of the segment to examine. */
        next = rd;
        while ((*next != '/') && (*next != '\0')) {
            next++;
        }

        /* At least two segments are needed for (e) and (f). */
        if (*next == '\0') {
            break;
        }
        next++;

        /* Keep this segment when it is ".." itself or when the
         * following segment is not "..". */
        if (((rd[0] == '.') && (rd[1] == '.') && (next == rd + 3)) ||
            !((next[0] == '.') && (next[1] == '.') &&
              ((next[2] == '/') || (next[2] == '\0')))) {
            rd = next;
            continue;
        }

        /* f) the ".." is the last segment: cut the string right here. */
        if (next[2] == '\0') {
            *rd = '\0';
            break;
        }

        /* e) remove "<segment>/../" by sliding the tail down; source
         * and destination overlap, hence memmove(). */
        next += 3;
        memmove(rd, next, strlen(next) + 1);

        /* Step back over the slash(es) in front of the current
         * position; reaching the start of the buffer means there is no
         * previous segment, so carry on from here. */
        prev = rd;
        while (prev > path) {
            prev--;
            if (*prev != '/') {
                break;
            }
        }
        if (prev == path) {
            continue;
        }

        /*
         * prev is on the last character of the previous segment: rewind
         * to its first character and re-examine it.  This is what lets
         * "foo/bar/../.." collapse completely, since after "bar/.." is
         * gone "foo/.." has to be considered as well.
         */
        rd = prev;
        while ((rd > path) && (rd[-1] != '/')) {
            rd--;
        }
    }
    /* wr still marks the end of the first-pass string; the second pass
     * only ever shortens the string, so this store lands at or beyond
     * the current terminator. */
    wr[0] = '\0';

    /*
     * g) ".." segments still leading the path are an error according
     *    to the RFC, which lets implementations keep them, remove them
     *    or refuse the reference.
     *
     * They are removed here, and only for paths starting with '/'.
     */
    if (path[0] == '/') {
        rd = path;
        while ((rd[0] == '/') && (rd[1] == '.') && (rd[2] == '.') &&
               ((rd[3] == '/') || (rd[3] == '\0'))) {
            rd += 3;
        }

        if (rd != path) {
            memmove(path, rd, strlen(rd) + 1);
        }
    }

    return 0;
}
1487
/* Returns 1 when @c is one of 0-9, a-f or A-F, and 0 otherwise. */
static int is_hex(char c) {
    const int is_digit = (c >= '0') && (c <= '9');
    const int is_lower = (c >= 'a') && (c <= 'f');
    const int is_upper = (c >= 'A') && (c <= 'F');

    return (is_digit || is_lower || is_upper) ? 1 : 0;
}
1495
1496
/*
 * unescape_hex_nibble:
 *
 * Numeric value of the hexadecimal digit @c.  Only called on
 * characters that already passed is_hex().
 */
static int
unescape_hex_nibble(char c) {
    if ((c >= '0') && (c <= '9')) {
        return c - '0';
    }
    if ((c >= 'a') && (c <= 'f')) {
        return (c - 'a') + 10;
    }
    return (c - 'A') + 10;
}

/**
 * uri_string_unescape:
 * @str:  the string to unescape
 * @len:   the length in bytes to unescape (or <= 0 to indicate full string)
 * @target:  optional destination buffer
 *
 * Replaces every "%XX" sequence (two hexadecimal digits) found in the
 * first @len bytes of @str by the single byte it encodes; all other
 * bytes, including a '%' not followed by two hexadecimal digits, are
 * copied unchanged.  No check is made that the string is a valid URI.
 * The result is never longer than the input and is always
 * NUL-terminated.
 *
 * When @target is NULL a buffer of @len + 1 bytes is allocated;
 * otherwise the result is written to @target.
 *
 * Returns the unescaped string (@target, or the newly allocated
 * buffer), or NULL if @str is NULL
 */
char *
uri_string_unescape(const char *str, int len, char *target) {
    char *result, *wr;
    const char *rd;

    if (str == NULL) {
        return NULL;
    }
    if (len <= 0) {
        len = strlen(str);
    }
    if (len < 0) {
        return NULL;
    }

    result = (target != NULL) ? target : g_malloc(len + 1);

    rd = str;
    wr = result;
    while (len > 0) {
        /* A full escape needs three bytes of remaining input. */
        if ((len > 2) && (rd[0] == '%') &&
            is_hex(rd[1]) && is_hex(rd[2])) {
            *wr++ = unescape_hex_nibble(rd[1]) * 16 +
                    unescape_hex_nibble(rd[2]);
            rd += 3;
            len -= 3;
        } else {
            *wr++ = *rd++;
            len--;
        }
    }
    *wr = 0;
    return result;
}
1554
/**
 * uri_string_escape:
 * @str:  string to escape
 * @list: exception list string of chars not to escape (may be NULL)
 *
 * Percent-escapes a string.  Unreserved characters, '@' and the
 * characters listed in @list are copied unchanged; every other byte is
 * written as %XX with upper-case hexadecimal digits.
 *
 * Returns a new escaped string (to be freed by the caller), or NULL if
 * @str is NULL.
 */
char *
uri_string_escape(const char *str, const char *list) {
    static const char hexdigits[] = "0123456789ABCDEF";
    char *ret;
    const char *in;
    int len, out;

    if (str == NULL) {
        return NULL;
    }
    if (str[0] == 0) {
        return g_strdup(str);
    }
    len = strlen(str);
    if (!(len > 0)) {
        return NULL;
    }

    /* Start with some slack; the buffer is doubled on demand below. */
    len += 20;
    ret = g_malloc(len);
    out = 0;

    for (in = str; *in != 0; in++) {
        const char ch = *in;

        /* Keep room for one "%XX" triplet plus the terminating NUL. */
        if (len - out <= 3) {
            ret = realloc2n(ret, &len);
        }

        if ((ch != '@') && (!IS_UNRESERVED(ch)) &&
            ((list == NULL) || (strchr(list, ch) == NULL))) {
            /*
             * Take the nibbles from an unsigned value: shifting a plain
             * char that is negative (bytes >= 0x80 where char is
             * signed) yields a wrong high digit.
             */
            const unsigned char val = (unsigned char)ch;

            ret[out++] = '%';
            ret[out++] = hexdigits[val >> 4];
            ret[out++] = hexdigits[val & 0xF];
        } else {
            ret[out++] = ch;
        }
    }
    ret[out] = 0;
    return ret;
}
1613
1614 /************************************************************************
1615  *                                                                      *
1616  *                      Public functions                                *
1617  *                                                                      *
1618  ************************************************************************/
1619
1620 /**
1621  * uri_resolve:
1622  * @URI:  the URI instance found in the document
1623  * @base:  the base value
1624  *
1625  * Computes he final URI of the reference done by checking that
1626  * the given URI is valid, and building the final URI using the
1627  * base URI. This is processed according to section 5.2 of the
1628  * RFC 2396
1629  *
1630  * 5.2. Resolving Relative References to Absolute Form
1631  *
1632  * Returns a new URI string (to be freed by the caller) or NULL in case
1633  *         of error.
1634  */
1635 char *
1636 uri_resolve(const char *uri, const char *base) {
1637     char *val = NULL;
1638     int ret, len, indx, cur, out;
1639     URI *ref = NULL;
1640     URI *bas = NULL;
1641     URI *res = NULL;
1642
1643     /*
1644      * 1) The URI reference is parsed into the potential four components and
1645      *    fragment identifier, as described in Section 4.3.
1646      *
1647      *    NOTE that a completely empty URI is treated by modern browsers
1648      *    as a reference to "." rather than as a synonym for the current
1649      *    URI.  Should we do that here?
1650      */
1651     if (uri == NULL)
1652         ret = -1;
1653     else {
1654         if (*uri) {
1655             ref = uri_new();
1656             ret = uri_parse_into(ref, uri);
1657         }
1658         else
1659             ret = 0;
1660     }
1661     if (ret != 0)
1662         goto done;
1663     if ((ref != NULL) && (ref->scheme != NULL)) {
1664         /*
1665          * The URI is absolute don't modify.
1666          */
1667         val = g_strdup(uri);
1668         goto done;
1669     }
1670     if (base == NULL)
1671         ret = -1;
1672     else {
1673         bas = uri_new();
1674         ret = uri_parse_into(bas, base);
1675     }
1676     if (ret != 0) {
1677         if (ref)
1678             val = uri_to_string(ref);
1679         goto done;
1680     }
1681     if (ref == NULL) {
1682         /*
1683          * the base fragment must be ignored
1684          */
1685         g_free(bas->fragment);
1686         bas->fragment = NULL;
1687         val = uri_to_string(bas);
1688         goto done;
1689     }
1690
1691     /*
1692      * 2) If the path component is empty and the scheme, authority, and
1693      *    query components are undefined, then it is a reference to the
1694      *    current document and we are done.  Otherwise, the reference URI's
1695      *    query and fragment components are defined as found (or not found)
1696      *    within the URI reference and not inherited from the base URI.
1697      *
1698      *    NOTE that in modern browsers, the parsing differs from the above
1699      *    in the following aspect:  the query component is allowed to be
1700      *    defined while still treating this as a reference to the current
1701      *    document.
1702      */
1703     res = uri_new();
1704     if ((ref->scheme == NULL) && (ref->path == NULL) &&
1705         ((ref->authority == NULL) && (ref->server == NULL))) {
1706         res->scheme = g_strdup(bas->scheme);
1707         if (bas->authority != NULL)
1708             res->authority = g_strdup(bas->authority);
1709         else if (bas->server != NULL) {
1710             res->server = g_strdup(bas->server);
1711             res->user = g_strdup(bas->user);
1712             res->port = bas->port;
1713         }
1714         res->path = g_strdup(bas->path);
1715         if (ref->query != NULL) {
1716             res->query = g_strdup (ref->query);
1717         } else {
1718             res->query = g_strdup(bas->query);
1719         }
1720         res->fragment = g_strdup(ref->fragment);
1721         goto step_7;
1722     }
1723
1724     /*
1725      * 3) If the scheme component is defined, indicating that the reference
1726      *    starts with a scheme name, then the reference is interpreted as an
1727      *    absolute URI and we are done.  Otherwise, the reference URI's
1728      *    scheme is inherited from the base URI's scheme component.
1729      */
1730     if (ref->scheme != NULL) {
1731         val = uri_to_string(ref);
1732         goto done;
1733     }
1734     res->scheme = g_strdup(bas->scheme);
1735
1736     res->query = g_strdup(ref->query);
1737     res->fragment = g_strdup(ref->fragment);
1738
1739     /*
1740      * 4) If the authority component is defined, then the reference is a
1741      *    network-path and we skip to step 7.  Otherwise, the reference
1742      *    URI's authority is inherited from the base URI's authority
1743      *    component, which will also be undefined if the URI scheme does not
1744      *    use an authority component.
1745      */
1746     if ((ref->authority != NULL) || (ref->server != NULL)) {
1747         if (ref->authority != NULL)
1748             res->authority = g_strdup(ref->authority);
1749         else {
1750             res->server = g_strdup(ref->server);
1751             res->user = g_strdup(ref->user);
1752             res->port = ref->port;
1753         }
1754         res->path = g_strdup(ref->path);
1755         goto step_7;
1756     }
1757     if (bas->authority != NULL)
1758         res->authority = g_strdup(bas->authority);
1759     else if (bas->server != NULL) {
1760         res->server = g_strdup(bas->server);
1761         res->user = g_strdup(bas->user);
1762         res->port = bas->port;
1763     }
1764
1765     /*
1766      * 5) If the path component begins with a slash character ("/"), then
1767      *    the reference is an absolute-path and we skip to step 7.
1768      */
1769     if ((ref->path != NULL) && (ref->path[0] == '/')) {
1770         res->path = g_strdup(ref->path);
1771         goto step_7;
1772     }
1773
1774
1775     /*
1776      * 6) If this step is reached, then we are resolving a relative-path
1777      *    reference.  The relative path needs to be merged with the base
1778      *    URI's path.  Although there are many ways to do this, we will
1779      *    describe a simple method using a separate string buffer.
1780      *
1781      * Allocate a buffer large enough for the result string.
1782      */
1783     len = 2; /* extra / and 0 */
1784     if (ref->path != NULL)
1785         len += strlen(ref->path);
1786     if (bas->path != NULL)
1787         len += strlen(bas->path);
1788     res->path = g_malloc(len);
1789     res->path[0] = 0;
1790
1791     /*
1792      * a) All but the last segment of the base URI's path component is
1793      *    copied to the buffer.  In other words, any characters after the
1794      *    last (right-most) slash character, if any, are excluded.
1795      */
1796     cur = 0;
1797     out = 0;
1798     if (bas->path != NULL) {
1799         while (bas->path[cur] != 0) {
1800             while ((bas->path[cur] != 0) && (bas->path[cur] != '/'))
1801                 cur++;
1802             if (bas->path[cur] == 0)
1803                 break;
1804
1805             cur++;
1806             while (out < cur) {
1807                 res->path[out] = bas->path[out];
1808                 out++;
1809             }
1810         }
1811     }
1812     res->path[out] = 0;
1813
1814     /*
1815      * b) The reference's path component is appended to the buffer
1816      *    string.
1817      */
1818     if (ref->path != NULL && ref->path[0] != 0) {
1819         indx = 0;
1820         /*
1821          * Ensure the path includes a '/'
1822          */
1823         if ((out == 0) && (bas->server != NULL))
1824             res->path[out++] = '/';
1825         while (ref->path[indx] != 0) {
1826             res->path[out++] = ref->path[indx++];
1827         }
1828     }
1829     res->path[out] = 0;
1830
1831     /*
1832      * Steps c) to h) are really path normalization steps
1833      */
1834     normalize_uri_path(res->path);
1835
1836 step_7:
1837
1838     /*
1839      * 7) The resulting URI components, including any inherited from the
1840      *    base URI, are recombined to give the absolute form of the URI
1841      *    reference.
1842      */
1843     val = uri_to_string(res);
1844
1845 done:
1846     if (ref != NULL)
1847         uri_free(ref);
1848     if (bas != NULL)
1849         uri_free(bas);
1850     if (res != NULL)
1851         uri_free(res);
1852     return(val);
1853 }
1854
1855 /**
1856  * uri_resolve_relative:
1857  * @URI:  the URI reference under consideration
1858  * @base:  the base value
1859  *
1860  * Expresses the URI of the reference in terms relative to the
1861  * base.  Some examples of this operation include:
1862  *     base = "http://site1.com/docs/book1.html"
1863  *        URI input                        URI returned
1864  *     docs/pic1.gif                    pic1.gif
1865  *     docs/img/pic1.gif                img/pic1.gif
1866  *     img/pic1.gif                     ../img/pic1.gif
1867  *     http://site1.com/docs/pic1.gif   pic1.gif
1868  *     http://site2.com/docs/pic1.gif   http://site2.com/docs/pic1.gif
1869  *
1870  *     base = "docs/book1.html"
1871  *        URI input                        URI returned
1872  *     docs/pic1.gif                    pic1.gif
1873  *     docs/img/pic1.gif                img/pic1.gif
1874  *     img/pic1.gif                     ../img/pic1.gif
1875  *     http://site1.com/docs/pic1.gif   http://site1.com/docs/pic1.gif
1876  *
1877  *
1878  * Note: if the URI reference is really weird or complicated, it may be
1879  *       worthwhile to first convert it into a "nice" one by calling
1880  *       uri_resolve (using 'base') before calling this routine,
1881  *       since this routine (for reasonable efficiency) assumes URI has
1882  *       already been through some validation.
1883  *
1884  * Returns a new URI string (to be freed by the caller) or NULL in case
1885  * error.
1886  */
1887 char *
1888 uri_resolve_relative (const char *uri, const char * base)
1889 {
1890     char *val = NULL;
1891     int ret;
1892     int ix;
1893     int pos = 0;
1894     int nbslash = 0;
1895     int len;
1896     URI *ref = NULL;
1897     URI *bas = NULL;
1898     char *bptr, *uptr, *vptr;
1899     int remove_path = 0;
1900
1901     if ((uri == NULL) || (*uri == 0))
1902         return NULL;
1903
1904     /*
1905      * First parse URI into a standard form
1906      */
1907     ref = uri_new ();
1908     /* If URI not already in "relative" form */
1909     if (uri[0] != '.') {
1910         ret = uri_parse_into (ref, uri);
1911         if (ret != 0)
1912             goto done;          /* Error in URI, return NULL */
1913     } else
1914         ref->path = g_strdup(uri);
1915
1916     /*
1917      * Next parse base into the same standard form
1918      */
1919     if ((base == NULL) || (*base == 0)) {
1920         val = g_strdup (uri);
1921         goto done;
1922     }
1923     bas = uri_new ();
1924     if (base[0] != '.') {
1925         ret = uri_parse_into (bas, base);
1926         if (ret != 0)
1927             goto done;          /* Error in base, return NULL */
1928     } else
1929         bas->path = g_strdup(base);
1930
1931     /*
1932      * If the scheme / server on the URI differs from the base,
1933      * just return the URI
1934      */
1935     if ((ref->scheme != NULL) &&
1936         ((bas->scheme == NULL) ||
1937          (strcmp (bas->scheme, ref->scheme)) ||
1938          (strcmp (bas->server, ref->server)))) {
1939         val = g_strdup (uri);
1940         goto done;
1941     }
1942     if (bas->path == ref->path ||
1943         (bas->path && ref->path && !strcmp(bas->path, ref->path))) {
1944         val = g_strdup("");
1945         goto done;
1946     }
1947     if (bas->path == NULL) {
1948         val = g_strdup(ref->path);
1949         goto done;
1950     }
1951     if (ref->path == NULL) {
1952         ref->path = (char *) "/";
1953         remove_path = 1;
1954     }
1955
1956     /*
1957      * At this point (at last!) we can compare the two paths
1958      *
1959      * First we take care of the special case where either of the
1960      * two path components may be missing (bug 316224)
1961      */
1962     if (bas->path == NULL) {
1963         if (ref->path != NULL) {
1964             uptr = ref->path;
1965             if (*uptr == '/')
1966                 uptr++;
1967             /* exception characters from uri_to_string */
1968             val = uri_string_escape(uptr, "/;&=+$,");
1969         }
1970         goto done;
1971     }
1972     bptr = bas->path;
1973     if (ref->path == NULL) {
1974         for (ix = 0; bptr[ix] != 0; ix++) {
1975             if (bptr[ix] == '/')
1976                 nbslash++;
1977         }
1978         uptr = NULL;
1979         len = 1;        /* this is for a string terminator only */
1980     } else {
1981     /*
1982      * Next we compare the two strings and find where they first differ
1983      */
1984         if ((ref->path[pos] == '.') && (ref->path[pos+1] == '/'))
1985             pos += 2;
1986         if ((*bptr == '.') && (bptr[1] == '/'))
1987             bptr += 2;
1988         else if ((*bptr == '/') && (ref->path[pos] != '/'))
1989             bptr++;
1990         while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0))
1991             pos++;
1992
1993         if (bptr[pos] == ref->path[pos]) {
1994             val = g_strdup("");
1995             goto done;          /* (I can't imagine why anyone would do this) */
1996         }
1997
1998         /*
1999          * In URI, "back up" to the last '/' encountered.  This will be the
2000          * beginning of the "unique" suffix of URI
2001          */
2002         ix = pos;
2003         if ((ref->path[ix] == '/') && (ix > 0))
2004             ix--;
2005         else if ((ref->path[ix] == 0) && (ix > 1) && (ref->path[ix - 1] == '/'))
2006             ix -= 2;
2007         for (; ix > 0; ix--) {
2008             if (ref->path[ix] == '/')
2009                 break;
2010         }
2011         if (ix == 0) {
2012             uptr = ref->path;
2013         } else {
2014             ix++;
2015             uptr = &ref->path[ix];
2016         }
2017
2018         /*
2019          * In base, count the number of '/' from the differing point
2020          */
2021         if (bptr[pos] != ref->path[pos]) {/* check for trivial URI == base */
2022             for (; bptr[ix] != 0; ix++) {
2023                 if (bptr[ix] == '/')
2024                     nbslash++;
2025             }
2026         }
2027         len = strlen (uptr) + 1;
2028     }
2029
2030     if (nbslash == 0) {
2031         if (uptr != NULL)
2032             /* exception characters from uri_to_string */
2033             val = uri_string_escape(uptr, "/;&=+$,");
2034         goto done;
2035     }
2036
2037     /*
2038      * Allocate just enough space for the returned string -
2039      * length of the remainder of the URI, plus enough space
2040      * for the "../" groups, plus one for the terminator
2041      */
2042     val = g_malloc (len + 3 * nbslash);
2043     vptr = val;
2044     /*
2045      * Put in as many "../" as needed
2046      */
2047     for (; nbslash>0; nbslash--) {
2048         *vptr++ = '.';
2049         *vptr++ = '.';
2050         *vptr++ = '/';
2051     }
2052     /*
2053      * Finish up with the end of the URI
2054      */
2055     if (uptr != NULL) {
2056         if ((vptr > val) && (len > 0) &&
2057             (uptr[0] == '/') && (vptr[-1] == '/')) {
2058             memcpy (vptr, uptr + 1, len - 1);
2059             vptr[len - 2] = 0;
2060         } else {
2061             memcpy (vptr, uptr, len);
2062             vptr[len - 1] = 0;
2063         }
2064     } else {
2065         vptr[len - 1] = 0;
2066     }
2067
2068     /* escape the freshly-built path */
2069     vptr = val;
2070         /* exception characters from uri_to_string */
2071     val = uri_string_escape(vptr, "/;&=+$,");
2072     g_free(vptr);
2073
2074 done:
2075     /*
2076      * Free the working variables
2077      */
2078     if (remove_path != 0)
2079         ref->path = NULL;
2080     if (ref != NULL)
2081         uri_free (ref);
2082     if (bas != NULL)
2083         uri_free (bas);
2084
2085     return val;
2086 }
2087
2088 /*
2089  * Utility functions to help parse and assemble query strings.
2090  */
2091
2092 struct QueryParams *
2093 query_params_new (int init_alloc)
2094 {
2095     struct QueryParams *ps;
2096
2097     if (init_alloc <= 0) init_alloc = 1;
2098
2099     ps = g_new(QueryParams, 1);
2100     ps->n = 0;
2101     ps->alloc = init_alloc;
2102     ps->p = g_new(QueryParam, ps->alloc);
2103
2104     return ps;
2105 }
2106
2107 /* Ensure there is space to store at least one more parameter
2108  * at the end of the set.
2109  */
2110 static int
2111 query_params_append (struct QueryParams *ps,
2112                const char *name, const char *value)
2113 {
2114     if (ps->n >= ps->alloc) {
2115         ps->p = g_renew(QueryParam, ps->p, ps->alloc * 2);
2116         ps->alloc *= 2;
2117     }
2118
2119     ps->p[ps->n].name = g_strdup(name);
2120     ps->p[ps->n].value = g_strdup(value);
2121     ps->p[ps->n].ignore = 0;
2122     ps->n++;
2123
2124     return 0;
2125 }
2126
2127 void
2128 query_params_free (struct QueryParams *ps)
2129 {
2130     int i;
2131
2132     for (i = 0; i < ps->n; ++i) {
2133         g_free (ps->p[i].name);
2134         g_free (ps->p[i].value);
2135     }
2136     g_free (ps->p);
2137     g_free (ps);
2138 }
2139
2140 struct QueryParams *
2141 query_params_parse (const char *query)
2142 {
2143     struct QueryParams *ps;
2144     const char *end, *eq;
2145
2146     ps = query_params_new (0);
2147     if (!query || query[0] == '\0') return ps;
2148
2149     while (*query) {
2150         char *name = NULL, *value = NULL;
2151
2152         /* Find the next separator, or end of the string. */
2153         end = strchr (query, '&');
2154         if (!end)
2155             end = strchr (query, ';');
2156         if (!end)
2157             end = query + strlen (query);
2158
2159         /* Find the first '=' character between here and end. */
2160         eq = strchr (query, '=');
2161         if (eq && eq >= end) eq = NULL;
2162
2163         /* Empty section (eg. "&&"). */
2164         if (end == query)
2165             goto next;
2166
2167         /* If there is no '=' character, then we have just "name"
2168          * and consistent with CGI.pm we assume value is "".
2169          */
2170         else if (!eq) {
2171             name = uri_string_unescape (query, end - query, NULL);
2172             value = NULL;
2173         }
2174         /* Or if we have "name=" here (works around annoying
2175          * problem when calling uri_string_unescape with len = 0).
2176          */
2177         else if (eq+1 == end) {
2178             name = uri_string_unescape (query, eq - query, NULL);
2179             value = g_new0(char, 1);
2180         }
2181         /* If the '=' character is at the beginning then we have
2182          * "=value" and consistent with CGI.pm we _ignore_ this.
2183          */
2184         else if (query == eq)
2185             goto next;
2186
2187         /* Otherwise it's "name=value". */
2188         else {
2189             name = uri_string_unescape (query, eq - query, NULL);
2190             value = uri_string_unescape (eq+1, end - (eq+1), NULL);
2191         }
2192
2193         /* Append to the parameter set. */
2194         query_params_append (ps, name, value);
2195         g_free(name);
2196         g_free(value);
2197
2198     next:
2199         query = end;
2200         if (*query) query ++; /* skip '&' separator */
2201     }
2202
2203     return ps;
2204 }