util/uri.c

   1 /**
   2  * uri.c: set of generic URI related routines
   3  *
   4  * Reference: RFCs 3986, 2732 and 2373
   5  *
   6  * Copyright (C) 1998-2003 Daniel Veillard.  All Rights Reserved.
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  21  * DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  22  * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  23  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24  *
  25  * Except as contained in this notice, the name of Daniel Veillard shall not
  26  * be used in advertising or otherwise to promote the sale, use or other
  27  * dealings in this Software without prior written authorization from him.
  28  *
  29  * daniel@veillard.com
  30  *
  31  **
  32  *
  33  * Copyright (C) 2007, 2009-2010 Red Hat, Inc.
  34  *
  35  * This library is free software; you can redistribute it and/or
  36  * modify it under the terms of the GNU Lesser General Public
  37  * License as published by the Free Software Foundation; either
  38  * version 2.1 of the License, or (at your option) any later version.
  39  *
  40  * This library is distributed in the hope that it will be useful,
  41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  43  * Lesser General Public License for more details.
  44  *
  45  * You should have received a copy of the GNU Lesser General Public
  46  * License along with this library; if not, write to the Free Software
  47  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
  48  *
  49  * Authors:
  50  *    Richard W.M. Jones <rjones@redhat.com>
  51  *
  52  */
  53
  54 #include "qemu/osdep.h"
  55 #include <glib.h>
  56
  57 #include "qemu/uri.h"
  58
  59 static void uri_clean(URI *uri);
  60
  61 /*
  62  * Old rule from 2396 used in legacy handling code
  63  * alpha    = lowalpha | upalpha
  64  */
  65 #define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x))
  66
  67
  68 /*
  69  * lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" |
  70  *            "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" |
  71  *            "u" | "v" | "w" | "x" | "y" | "z"
  72  */
  73
  74 #define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z'))
  75
  76 /*
  77  * upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" |
  78  *           "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" |
  79  *           "U" | "V" | "W" | "X" | "Y" | "Z"
  80  */
  81 #define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z'))
  82
  83 #ifdef IS_DIGIT
  84 #undef IS_DIGIT
  85 #endif
  86 /*
  87  * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
  88  */
  89 #define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9'))
  90
  91 /*
  92  * alphanum = alpha | digit
  93  */
  94
  95 #define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x))
  96
  97 /*
  98  * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
  99  */
 100
 101 #define IS_MARK(x) (((x) == '-') || ((x) == '_') || ((x) == '.') ||     \
 102     ((x) == '!') || ((x) == '~') || ((x) == '*') || ((x) == '\'') ||    \
 103     ((x) == '(') || ((x) == ')'))
 104
 105 /*
 106  * unwise = "{" | "}" | "|" | "\" | "^" | "`"
 107  */
 108
 109 #define IS_UNWISE(p)                                                    \
 110       (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) ||         \
 111        ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) ||        \
 112        ((*(p) == ']')) || ((*(p) == '`')))
 113 /*
 114  * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," |
 115  *            "[" | "]"
 116  */
 117
 118 #define IS_RESERVED(x) (((x) == ';') || ((x) == '/') || ((x) == '?') || \
 119         ((x) == ':') || ((x) == '@') || ((x) == '&') || ((x) == '=') || \
 120         ((x) == '+') || ((x) == '$') || ((x) == ',') || ((x) == '[') || \
 121         ((x) == ']'))
 122
 123 /*
 124  * unreserved = alphanum | mark
 125  */
 126
 127 #define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x))
 128
 129 /*
 130  * Skip to next pointer char, handle escaped sequences
 131  */
 132
 133 #define NEXT(p) ((*p == '%')? p += 3 : p++)
 134
 135 /*
 136  * Productions from the spec.
 137  *
 138  *    authority     = server | reg_name
 139  *    reg_name      = 1*( unreserved | escaped | "$" | "," |
 140  *                        ";" | ":" | "@" | "&" | "=" | "+" )
 141  *
 142  * path          = [ abs_path | opaque_part ]
 143  */
 144
 145
 146 /************************************************************************
 147  *                                                                      *
 148  *                         RFC 3986 parser                              *
 149  *                                                                      *
 150  ************************************************************************/
 151
 152 #define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9'))
 153 #define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) ||               \
 154                       ((*(p) >= 'A') && (*(p) <= 'Z')))
 155 #define ISA_HEXDIG(p)                                                   \
 156        (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) ||             \
 157         ((*(p) >= 'A') && (*(p) <= 'F')))
 158
 159 /*
 160  *    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
 161  *                     / "*" / "+" / "," / ";" / "="
 162  */
 163 #define ISA_SUB_DELIM(p)                                                \
 164       (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) ||         \
 165        ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) ||         \
 166        ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) ||         \
 167        ((*(p) == '=')) || ((*(p) == '\'')))
 168
 169 /*
 170  *    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 171  */
 172 #define ISA_GEN_DELIM(p)                                                \
 173       (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) ||         \
 174        ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) ||         \
 175        ((*(p) == '@')))
 176
 177 /*
 178  *    reserved      = gen-delims / sub-delims
 179  */
 180 #define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p)))
 181
 182 /*
 183  *    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
 184  */
 185 #define ISA_UNRESERVED(p)                                               \
 186       ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) ||           \
 187        ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~')))
 188
 189 /*
 190  *    pct-encoded   = "%" HEXDIG HEXDIG
 191  */
 192 #define ISA_PCT_ENCODED(p)                                              \
 193      ((*(p) == '%') && (ISA_HEXDIG(p + 1)) && (ISA_HEXDIG(p + 2)))
 194
 195 /*
 196  *    pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
 197  */
 198 #define ISA_PCHAR(p)                                                    \
 199      (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) ||    \
 200       ((*(p) == ':')) || ((*(p) == '@')))
 201
 202 /**
 203  * rfc3986_parse_scheme:
 204  * @uri:  pointer to an URI structure
 205  * @str:  pointer to the string to analyze
 206  *
 207  * Parse an URI scheme
 208  *
 209  * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 210  *
 211  * Returns 0 or the error code
 212  */
 213 static int
 214 rfc3986_parse_scheme(URI *uri, const char **str) {
 215     const char *cur;
 216
 217     if (str == NULL)
 218         return(-1);
 219
 220     cur = *str;
 221     if (!ISA_ALPHA(cur))
 222         return(2);
 223     cur++;
 224     while (ISA_ALPHA(cur) || ISA_DIGIT(cur) ||
 225            (*cur == '+') || (*cur == '-') || (*cur == '.')) cur++;
 226     if (uri != NULL) {
 227         g_free(uri->scheme);
 228         uri->scheme = g_strndup(*str, cur - *str);
 229     }
 230     *str = cur;
 231     return(0);
 232 }
 233
 234 /**
 235  * rfc3986_parse_fragment:
 236  * @uri:  pointer to an URI structure
 237  * @str:  pointer to the string to analyze
 238  *
 239  * Parse the query part of an URI
 240  *
 241  * fragment      = *( pchar / "/" / "?" )
 242  * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']'
 243  *       in the fragment identifier but this is used very broadly for
 244  *       xpointer scheme selection, so we are allowing it here to not break
 245  *       for example all the DocBook processing chains.
 246  *
 247  * Returns 0 or the error code
 248  */
 249 static int
 250 rfc3986_parse_fragment(URI *uri, const char **str)
 251 {
 252     const char *cur;
 253
 254     if (str == NULL)
 255         return (-1);
 256
 257     cur = *str;
 258
 259     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
 260            (*cur == '[') || (*cur == ']') ||
 261            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
 262         NEXT(cur);
 263     if (uri != NULL) {
 264         g_free(uri->fragment);
 265         if (uri->cleanup & 2)
 266             uri->fragment = g_strndup(*str, cur - *str);
 267         else
 268             uri->fragment = uri_string_unescape(*str, cur - *str, NULL);
 269     }
 270     *str = cur;
 271     return (0);
 272 }
 273
 274 /**
 275  * rfc3986_parse_query:
 276  * @uri:  pointer to an URI structure
 277  * @str:  pointer to the string to analyze
 278  *
 279  * Parse the query part of an URI
 280  *
 281  * query = *uric
 282  *
 283  * Returns 0 or the error code
 284  */
 285 static int
 286 rfc3986_parse_query(URI *uri, const char **str)
 287 {
 288     const char *cur;
 289
 290     if (str == NULL)
 291         return (-1);
 292
 293     cur = *str;
 294
 295     while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
 296            ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur))))
 297         NEXT(cur);
 298     if (uri != NULL) {
 299         g_free(uri->query);
 300         uri->query = g_strndup (*str, cur - *str);
 301     }
 302     *str = cur;
 303     return (0);
 304 }
 305
 306 /**
 307  * rfc3986_parse_port:
 308  * @uri:  pointer to an URI structure
 309  * @str:  the string to analyze
 310  *
 311  * Parse a port  part and fills in the appropriate fields
 312  * of the @uri structure
 313  *
 314  * port          = *DIGIT
 315  *
 316  * Returns 0 or the error code
 317  */
 318 static int
 319 rfc3986_parse_port(URI *uri, const char **str)
 320 {
 321     const char *cur = *str;
 322     int port = 0;
 323
 324     if (ISA_DIGIT(cur)) {
 325         while (ISA_DIGIT(cur)) {
 326             port = port * 10 + (*cur - '0');
 327             if (port > 65535) {
 328                 return 1;
 329             }
 330             cur++;
 331         }
 332         if (uri) {
 333             uri->port = port;
 334         }
 335         *str = cur;
 336         return 0;
 337     }
 338     return 1;
 339 }
 340
 341 /**
 342  * rfc3986_parse_user_info:
 343  * @uri:  pointer to an URI structure
 344  * @str:  the string to analyze
 345  *
 346  * Parse an user informations part and fills in the appropriate fields
 347  * of the @uri structure
 348  *
 349  * userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
 350  *
 351  * Returns 0 or the error code
 352  */
 353 static int
 354 rfc3986_parse_user_info(URI *uri, const char **str)
 355 {
 356     const char *cur;
 357
 358     cur = *str;
 359     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) ||
 360            ISA_SUB_DELIM(cur) || (*cur == ':'))
 361         NEXT(cur);
 362     if (*cur == '@') {
 363         if (uri != NULL) {
 364             g_free(uri->user);
 365             if (uri->cleanup & 2)
 366                 uri->user = g_strndup(*str, cur - *str);
 367             else
 368                 uri->user = uri_string_unescape(*str, cur - *str, NULL);
 369         }
 370         *str = cur;
 371         return(0);
 372     }
 373     return(1);
 374 }
 375
 376 /**
 377  * rfc3986_parse_dec_octet:
 378  * @str:  the string to analyze
 379  *
 380  *    dec-octet     = DIGIT                 ; 0-9
 381  *                  / %x31-39 DIGIT         ; 10-99
 382  *                  / "1" 2DIGIT            ; 100-199
 383  *                  / "2" %x30-34 DIGIT     ; 200-249
 384  *                  / "25" %x30-35          ; 250-255
 385  *
 386  * Skip a dec-octet.
 387  *
 388  * Returns 0 if found and skipped, 1 otherwise
 389  */
 390 static int
 391 rfc3986_parse_dec_octet(const char **str) {
 392     const char *cur = *str;
 393
 394     if (!(ISA_DIGIT(cur)))
 395         return(1);
 396     if (!ISA_DIGIT(cur+1))
 397         cur++;
 398     else if ((*cur != '0') && (ISA_DIGIT(cur + 1)) && (!ISA_DIGIT(cur+2)))
 399         cur += 2;
 400     else if ((*cur == '1') && (ISA_DIGIT(cur + 1)) && (ISA_DIGIT(cur + 2)))
 401         cur += 3;
 402     else if ((*cur == '2') && (*(cur + 1) >= '0') &&
 403              (*(cur + 1) <= '4') && (ISA_DIGIT(cur + 2)))
 404         cur += 3;
 405     else if ((*cur == '2') && (*(cur + 1) == '5') &&
 406              (*(cur + 2) >= '0') && (*(cur + 1) <= '5'))
 407         cur += 3;
 408     else
 409         return(1);
 410     *str = cur;
 411     return(0);
 412 }
 413 /**
 414  * rfc3986_parse_host:
 415  * @uri:  pointer to an URI structure
 416  * @str:  the string to analyze
 417  *
 418  * Parse an host part and fills in the appropriate fields
 419  * of the @uri structure
 420  *
 421  * host          = IP-literal / IPv4address / reg-name
 422  * IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
 423  * IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
 424  * reg-name      = *( unreserved / pct-encoded / sub-delims )
 425  *
 426  * Returns 0 or the error code
 427  */
 428 static int
 429 rfc3986_parse_host(URI *uri, const char **str)
 430 {
 431     const char *cur = *str;
 432     const char *host;
 433
 434     host = cur;
 435     /*
 436      * IPv6 and future addressing scheme are enclosed between brackets
 437      */
 438     if (*cur == '[') {
 439         cur++;
 440         while ((*cur != ']') && (*cur != 0))
 441             cur++;
 442         if (*cur != ']')
 443             return(1);
 444         cur++;
 445         goto found;
 446     }
 447     /*
 448      * try to parse an IPv4
 449      */
 450     if (ISA_DIGIT(cur)) {
 451         if (rfc3986_parse_dec_octet(&cur) != 0)
 452             goto not_ipv4;
 453         if (*cur != '.')
 454             goto not_ipv4;
 455         cur++;
 456         if (rfc3986_parse_dec_octet(&cur) != 0)
 457             goto not_ipv4;
 458         if (*cur != '.')
 459             goto not_ipv4;
 460         if (rfc3986_parse_dec_octet(&cur) != 0)
 461             goto not_ipv4;
 462         if (*cur != '.')
 463             goto not_ipv4;
 464         if (rfc3986_parse_dec_octet(&cur) != 0)
 465             goto not_ipv4;
 466         goto found;
 467 not_ipv4:
 468         cur = *str;
 469     }
 470     /*
 471      * then this should be a hostname which can be empty
 472      */
 473     while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur))
 474         NEXT(cur);
 475 found:
 476     if (uri != NULL) {
 477         g_free(uri->authority);
 478         uri->authority = NULL;
 479         g_free(uri->server);
 480         if (cur != host) {
 481             if (uri->cleanup & 2)
 482                 uri->server = g_strndup(host, cur - host);
 483             else
 484                 uri->server = uri_string_unescape(host, cur - host, NULL);
 485         } else
 486             uri->server = NULL;
 487     }
 488     *str = cur;
 489     return(0);
 490 }
 491
 492 /**
 493  * rfc3986_parse_authority:
 494  * @uri:  pointer to an URI structure
 495  * @str:  the string to analyze
 496  *
 497  * Parse an authority part and fills in the appropriate fields
 498  * of the @uri structure
 499  *
 500  * authority     = [ userinfo "@" ] host [ ":" port ]
 501  *
 502  * Returns 0 or the error code
 503  */
 504 static int
 505 rfc3986_parse_authority(URI *uri, const char **str)
 506 {
 507     const char *cur;
 508     int ret;
 509
 510     cur = *str;
 511     /*
 512      * try to parse an userinfo and check for the trailing @
 513      */
 514     ret = rfc3986_parse_user_info(uri, &cur);
 515     if ((ret != 0) || (*cur != '@'))
 516         cur = *str;
 517     else
 518         cur++;
 519     ret = rfc3986_parse_host(uri, &cur);
 520     if (ret != 0) return(ret);
 521     if (*cur == ':') {
 522         cur++;
 523         ret = rfc3986_parse_port(uri, &cur);
 524         if (ret != 0) return(ret);
 525     }
 526     *str = cur;
 527     return(0);
 528 }
 529
 530 /**
 531  * rfc3986_parse_segment:
 532  * @str:  the string to analyze
 533  * @forbid: an optional forbidden character
 534  * @empty: allow an empty segment
 535  *
 536  * Parse a segment and fills in the appropriate fields
 537  * of the @uri structure
 538  *
 539  * segment       = *pchar
 540  * segment-nz    = 1*pchar
 541  * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
 542  *               ; non-zero-length segment without any colon ":"
 543  *
 544  * Returns 0 or the error code
 545  */
 546 static int
 547 rfc3986_parse_segment(const char **str, char forbid, int empty)
 548 {
 549     const char *cur;
 550
 551     cur = *str;
 552     if (!ISA_PCHAR(cur)) {
 553         if (empty)
 554             return(0);
 555         return(1);
 556     }
 557     while (ISA_PCHAR(cur) && (*cur != forbid))
 558         NEXT(cur);
 559     *str = cur;
 560     return (0);
 561 }
 562
 563 /**
 564  * rfc3986_parse_path_ab_empty:
 565  * @uri:  pointer to an URI structure
 566  * @str:  the string to analyze
 567  *
 568  * Parse an path absolute or empty and fills in the appropriate fields
 569  * of the @uri structure
 570  *
 571  * path-abempty  = *( "/" segment )
 572  *
 573  * Returns 0 or the error code
 574  */
 575 static int
 576 rfc3986_parse_path_ab_empty(URI *uri, const char **str)
 577 {
 578     const char *cur;
 579     int ret;
 580
 581     cur = *str;
 582
 583     while (*cur == '/') {
 584         cur++;
 585         ret = rfc3986_parse_segment(&cur, 0, 1);
 586         if (ret != 0) return(ret);
 587     }
 588     if (uri != NULL) {
 589         g_free(uri->path);
 590         if (*str != cur) {
 591             if (uri->cleanup & 2)
 592                 uri->path = g_strndup(*str, cur - *str);
 593             else
 594                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 595         } else {
 596             uri->path = NULL;
 597         }
 598     }
 599     *str = cur;
 600     return (0);
 601 }
 602
 603 /**
 604  * rfc3986_parse_path_absolute:
 605  * @uri:  pointer to an URI structure
 606  * @str:  the string to analyze
 607  *
 608  * Parse an path absolute and fills in the appropriate fields
 609  * of the @uri structure
 610  *
 611  * path-absolute = "/" [ segment-nz *( "/" segment ) ]
 612  *
 613  * Returns 0 or the error code
 614  */
 615 static int
 616 rfc3986_parse_path_absolute(URI *uri, const char **str)
 617 {
 618     const char *cur;
 619     int ret;
 620
 621     cur = *str;
 622
 623     if (*cur != '/')
 624         return(1);
 625     cur++;
 626     ret = rfc3986_parse_segment(&cur, 0, 0);
 627     if (ret == 0) {
 628         while (*cur == '/') {
 629             cur++;
 630             ret = rfc3986_parse_segment(&cur, 0, 1);
 631             if (ret != 0) return(ret);
 632         }
 633     }
 634     if (uri != NULL) {
 635         g_free(uri->path);
 636         if (cur != *str) {
 637             if (uri->cleanup & 2)
 638                 uri->path = g_strndup(*str, cur - *str);
 639             else
 640                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 641         } else {
 642             uri->path = NULL;
 643         }
 644     }
 645     *str = cur;
 646     return (0);
 647 }
 648
 649 /**
 650  * rfc3986_parse_path_rootless:
 651  * @uri:  pointer to an URI structure
 652  * @str:  the string to analyze
 653  *
 654  * Parse an path without root and fills in the appropriate fields
 655  * of the @uri structure
 656  *
 657  * path-rootless = segment-nz *( "/" segment )
 658  *
 659  * Returns 0 or the error code
 660  */
 661 static int
 662 rfc3986_parse_path_rootless(URI *uri, const char **str)
 663 {
 664     const char *cur;
 665     int ret;
 666
 667     cur = *str;
 668
 669     ret = rfc3986_parse_segment(&cur, 0, 0);
 670     if (ret != 0) return(ret);
 671     while (*cur == '/') {
 672         cur++;
 673         ret = rfc3986_parse_segment(&cur, 0, 1);
 674         if (ret != 0) return(ret);
 675     }
 676     if (uri != NULL) {
 677         g_free(uri->path);
 678         if (cur != *str) {
 679             if (uri->cleanup & 2)
 680                 uri->path = g_strndup(*str, cur - *str);
 681             else
 682                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 683         } else {
 684             uri->path = NULL;
 685         }
 686     }
 687     *str = cur;
 688     return (0);
 689 }
 690
 691 /**
 692  * rfc3986_parse_path_no_scheme:
 693  * @uri:  pointer to an URI structure
 694  * @str:  the string to analyze
 695  *
 696  * Parse an path which is not a scheme and fills in the appropriate fields
 697  * of the @uri structure
 698  *
 699  * path-noscheme = segment-nz-nc *( "/" segment )
 700  *
 701  * Returns 0 or the error code
 702  */
 703 static int
 704 rfc3986_parse_path_no_scheme(URI *uri, const char **str)
 705 {
 706     const char *cur;
 707     int ret;
 708
 709     cur = *str;
 710
 711     ret = rfc3986_parse_segment(&cur, ':', 0);
 712     if (ret != 0) return(ret);
 713     while (*cur == '/') {
 714         cur++;
 715         ret = rfc3986_parse_segment(&cur, 0, 1);
 716         if (ret != 0) return(ret);
 717     }
 718     if (uri != NULL) {
 719         g_free(uri->path);
 720         if (cur != *str) {
 721             if (uri->cleanup & 2)
 722                 uri->path = g_strndup(*str, cur - *str);
 723             else
 724                 uri->path = uri_string_unescape(*str, cur - *str, NULL);
 725         } else {
 726             uri->path = NULL;
 727         }
 728     }
 729     *str = cur;
 730     return (0);
 731 }
 732
 733 /**
 734  * rfc3986_parse_hier_part:
 735  * @uri:  pointer to an URI structure
 736  * @str:  the string to analyze
 737  *
 738  * Parse an hierarchical part and fills in the appropriate fields
 739  * of the @uri structure
 740  *
 741  * hier-part     = "//" authority path-abempty
 742  *                / path-absolute
 743  *                / path-rootless
 744  *                / path-empty
 745  *
 746  * Returns 0 or the error code
 747  */
 748 static int
 749 rfc3986_parse_hier_part(URI *uri, const char **str)
 750 {
 751     const char *cur;
 752     int ret;
 753
 754     cur = *str;
 755
 756     if ((*cur == '/') && (*(cur + 1) == '/')) {
 757         cur += 2;
 758         ret = rfc3986_parse_authority(uri, &cur);
 759         if (ret != 0) return(ret);
 760         ret = rfc3986_parse_path_ab_empty(uri, &cur);
 761         if (ret != 0) return(ret);
 762         *str = cur;
 763         return(0);
 764     } else if (*cur == '/') {
 765         ret = rfc3986_parse_path_absolute(uri, &cur);
 766         if (ret != 0) return(ret);
 767     } else if (ISA_PCHAR(cur)) {
 768         ret = rfc3986_parse_path_rootless(uri, &cur);
 769         if (ret != 0) return(ret);
 770     } else {
 771         /* path-empty is effectively empty */
 772         if (uri != NULL) {
 773             g_free(uri->path);
 774             uri->path = NULL;
 775         }
 776     }
 777     *str = cur;
 778     return (0);
 779 }
 780
 781 /**
 782  * rfc3986_parse_relative_ref:
 783  * @uri:  pointer to an URI structure
 784  * @str:  the string to analyze
 785  *
 786  * Parse an URI string and fills in the appropriate fields
 787  * of the @uri structure
 788  *
 789  * relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
 790  * relative-part = "//" authority path-abempty
 791  *               / path-absolute
 792  *               / path-noscheme
 793  *               / path-empty
 794  *
 795  * Returns 0 or the error code
 796  */
 797 static int
 798 rfc3986_parse_relative_ref(URI *uri, const char *str) {
 799     int ret;
 800
 801     if ((*str == '/') && (*(str + 1) == '/')) {
 802         str += 2;
 803         ret = rfc3986_parse_authority(uri, &str);
 804         if (ret != 0) return(ret);
 805         ret = rfc3986_parse_path_ab_empty(uri, &str);
 806         if (ret != 0) return(ret);
 807     } else if (*str == '/') {
 808         ret = rfc3986_parse_path_absolute(uri, &str);
 809         if (ret != 0) return(ret);
 810     } else if (ISA_PCHAR(str)) {
 811         ret = rfc3986_parse_path_no_scheme(uri, &str);
 812         if (ret != 0) return(ret);
 813     } else {
 814         /* path-empty is effectively empty */
 815         if (uri != NULL) {
 816             g_free(uri->path);
 817             uri->path = NULL;
 818         }
 819     }
 820
 821     if (*str == '?') {
 822         str++;
 823         ret = rfc3986_parse_query(uri, &str);
 824         if (ret != 0) return(ret);
 825     }
 826     if (*str == '#') {
 827         str++;
 828         ret = rfc3986_parse_fragment(uri, &str);
 829         if (ret != 0) return(ret);
 830     }
 831     if (*str != 0) {
 832         uri_clean(uri);
 833         return(1);
 834     }
 835     return(0);
 836 }
 837
 838
 839 /**
 840  * rfc3986_parse:
 841  * @uri:  pointer to an URI structure
 842  * @str:  the string to analyze
 843  *
 844  * Parse an URI string and fills in the appropriate fields
 845  * of the @uri structure
 846  *
 847  * scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 848  *
 849  * Returns 0 or the error code
 850  */
 851 static int
 852 rfc3986_parse(URI *uri, const char *str) {
 853     int ret;
 854
 855     ret = rfc3986_parse_scheme(uri, &str);
 856     if (ret != 0) return(ret);
 857     if (*str != ':') {
 858         return(1);
 859     }
 860     str++;
 861     ret = rfc3986_parse_hier_part(uri, &str);
 862     if (ret != 0) return(ret);
 863     if (*str == '?') {
 864         str++;
 865         ret = rfc3986_parse_query(uri, &str);
 866         if (ret != 0) return(ret);
 867     }
 868     if (*str == '#') {
 869         str++;
 870         ret = rfc3986_parse_fragment(uri, &str);
 871         if (ret != 0) return(ret);
 872     }
 873     if (*str != 0) {
 874         uri_clean(uri);
 875         return(1);
 876     }
 877     return(0);
 878 }
 879
 880 /**
 881  * rfc3986_parse_uri_reference:
 882  * @uri:  pointer to an URI structure
 883  * @str:  the string to analyze
 884  *
 885  * Parse an URI reference string and fills in the appropriate fields
 886  * of the @uri structure
 887  *
 888  * URI-reference = URI / relative-ref
 889  *
 890  * Returns 0 or the error code
 891  */
 892 static int
 893 rfc3986_parse_uri_reference(URI *uri, const char *str) {
 894     int ret;
 895
 896     if (str == NULL)
 897         return(-1);
 898     uri_clean(uri);
 899
 900     /*
 901      * Try first to parse absolute refs, then fallback to relative if
 902      * it fails.
 903      */
 904     ret = rfc3986_parse(uri, str);
 905     if (ret != 0) {
 906         uri_clean(uri);
 907         ret = rfc3986_parse_relative_ref(uri, str);
 908         if (ret != 0) {
 909             uri_clean(uri);
 910             return(ret);
 911         }
 912     }
 913     return(0);
 914 }
 915
 916 /**
 917  * uri_parse:
 918  * @str:  the URI string to analyze
 919  *
 920  * Parse an URI based on RFC 3986
 921  *
 922  * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 923  *
 924  * Returns a newly built URI or NULL in case of error
 925  */
 926 URI *
 927 uri_parse(const char *str) {
 928     URI *uri;
 929     int ret;
 930
 931     if (str == NULL)
 932         return(NULL);
 933     uri = uri_new();
 934     ret = rfc3986_parse_uri_reference(uri, str);
 935     if (ret) {
 936         uri_free(uri);
 937         return(NULL);
 938     }
 939     return(uri);
 940 }
 941
 942 /**
 943  * uri_parse_into:
 944  * @uri:  pointer to an URI structure
 945  * @str:  the string to analyze
 946  *
 947  * Parse an URI reference string based on RFC 3986 and fills in the
 948  * appropriate fields of the @uri structure
 949  *
 950  * URI-reference = URI / relative-ref
 951  *
 952  * Returns 0 or the error code
 953  */
 954 int
 955 uri_parse_into(URI *uri, const char *str) {
 956     return(rfc3986_parse_uri_reference(uri, str));
 957 }
 958
 959 /**
 960  * uri_parse_raw:
 961  * @str:  the URI string to analyze
 962  * @raw:  if 1 unescaping of URI pieces are disabled
 963  *
 964  * Parse an URI but allows to keep intact the original fragments.
 965  *
 966  * URI-reference = URI / relative-ref
 967  *
 968  * Returns a newly built URI or NULL in case of error
 969  */
 970 URI *
 971 uri_parse_raw(const char *str, int raw) {
 972     URI *uri;
 973     int ret;
 974
 975     if (str == NULL)
 976         return(NULL);
 977     uri = uri_new();
 978     if (raw) {
 979         uri->cleanup |= 2;
 980     }
 981     ret = uri_parse_into(uri, str);
 982     if (ret) {
 983         uri_free(uri);
 984         return(NULL);
 985     }
 986     return(uri);
 987 }
 988
 989 /************************************************************************
 990  *                                                                      *
 991  *                      Generic URI structure functions                 *
 992  *                                                                      *
 993  ************************************************************************/
 994
 995 /**
 996  * uri_new:
 997  *
 998  * Simply creates an empty URI
 999  *
1000  * Returns the new structure or NULL in case of error
1001  */
1002 URI *
1003 uri_new(void) {
1004     URI *ret;
1005
1006     ret = g_new0(URI, 1);
1007     return(ret);
1008 }
1009
/**
 * realloc2n:
 * @ret:  the buffer to grow
 * @max:  in/out usable capacity of the buffer, not counting the extra
 *        byte that is always allocated on top of it
 *
 * Double the capacity of the output buffer used when saving an URI.
 * *@max is updated to the new capacity.
 *
 * Returns the reallocated buffer
 */
static char *
realloc2n(char *ret, int *max)
{
    int doubled = *max * 2;
    char *grown = g_realloc(ret, doubled + 1);

    *max = doubled;
    return grown;
}
1026
1027 /**
1028  * uri_to_string:
1029  * @uri:  pointer to an URI
1030  *
1031  * Save the URI as an escaped string
1032  *
1033  * Returns a new string (to be deallocated by caller)
1034  */
1035 char *
1036 uri_to_string(URI *uri) {
1037     char *ret = NULL;
1038     char *temp;
1039     const char *p;
1040     int len;
1041     int max;
1042
1043     if (uri == NULL) return(NULL);
1044
1045
1046     max = 80;
1047     ret = g_malloc(max + 1);
1048     len = 0;
1049
1050     if (uri->scheme != NULL) {
1051         p = uri->scheme;
1052         while (*p != 0) {
1053             if (len >= max) {
1054                 temp = realloc2n(ret, &max);
1055                 ret = temp;
1056             }
1057             ret[len++] = *p++;
1058         }
1059         if (len >= max) {
1060             temp = realloc2n(ret, &max);
1061             ret = temp;
1062         }
1063         ret[len++] = ':';
1064     }
1065     if (uri->opaque != NULL) {
1066         p = uri->opaque;
1067         while (*p != 0) {
1068             if (len + 3 >= max) {
1069                 temp = realloc2n(ret, &max);
1070                 ret = temp;
1071             }
1072             if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p)))
1073                 ret[len++] = *p++;
1074             else {
1075                 int val = *(unsigned char *)p++;
1076                 int hi = val / 0x10, lo = val % 0x10;
1077                 ret[len++] = '%';
1078                 ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1079                 ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1080             }
1081         }
1082     } else {
1083         if (uri->server != NULL) {
1084             if (len + 3 >= max) {
1085                 temp = realloc2n(ret, &max);
1086                 ret = temp;
1087             }
1088             ret[len++] = '/';
1089             ret[len++] = '/';
1090             if (uri->user != NULL) {
1091                 p = uri->user;
1092                 while (*p != 0) {
1093                     if (len + 3 >= max) {
1094                         temp = realloc2n(ret, &max);
1095                         ret = temp;
1096                     }
1097                     if ((IS_UNRESERVED(*(p))) ||
1098                         ((*(p) == ';')) || ((*(p) == ':')) ||
1099                         ((*(p) == '&')) || ((*(p) == '=')) ||
1100                         ((*(p) == '+')) || ((*(p) == '$')) ||
1101                         ((*(p) == ',')))
1102                         ret[len++] = *p++;
1103                     else {
1104                         int val = *(unsigned char *)p++;
1105                         int hi = val / 0x10, lo = val % 0x10;
1106                         ret[len++] = '%';
1107                         ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1108                         ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1109                     }
1110                 }
1111                 if (len + 3 >= max) {
1112                     temp = realloc2n(ret, &max);
1113                     ret = temp;
1114                 }
1115                 ret[len++] = '@';
1116             }
1117             p = uri->server;
1118             while (*p != 0) {
1119                 if (len >= max) {
1120                     temp = realloc2n(ret, &max);
1121                     ret = temp;
1122                 }
1123                 ret[len++] = *p++;
1124             }
1125             if (uri->port > 0) {
1126                 if (len + 10 >= max) {
1127                     temp = realloc2n(ret, &max);
1128                     ret = temp;
1129                 }
1130                 len += snprintf(&ret[len], max - len, ":%d", uri->port);
1131             }
1132         } else if (uri->authority != NULL) {
1133             if (len + 3 >= max) {
1134                 temp = realloc2n(ret, &max);
1135                 ret = temp;
1136             }
1137             ret[len++] = '/';
1138             ret[len++] = '/';
1139             p = uri->authority;
1140             while (*p != 0) {
1141                 if (len + 3 >= max) {
1142                     temp = realloc2n(ret, &max);
1143                     ret = temp;
1144                 }
1145                 if ((IS_UNRESERVED(*(p))) ||
1146                     ((*(p) == '$')) || ((*(p) == ',')) || ((*(p) == ';')) ||
1147                     ((*(p) == ':')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1148                     ((*(p) == '=')) || ((*(p) == '+')))
1149                     ret[len++] = *p++;
1150                 else {
1151                     int val = *(unsigned char *)p++;
1152                     int hi = val / 0x10, lo = val % 0x10;
1153                     ret[len++] = '%';
1154                     ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1155                     ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1156                 }
1157             }
1158         } else if (uri->scheme != NULL) {
1159             if (len + 3 >= max) {
1160                 temp = realloc2n(ret, &max);
1161                 ret = temp;
1162             }
1163             ret[len++] = '/';
1164             ret[len++] = '/';
1165         }
1166         if (uri->path != NULL) {
1167             p = uri->path;
1168             /*
1169              * the colon in file:///d: should not be escaped or
1170              * Windows accesses fail later.
1171              */
1172             if ((uri->scheme != NULL) &&
1173                 (p[0] == '/') &&
1174                 (((p[1] >= 'a') && (p[1] <= 'z')) ||
1175                  ((p[1] >= 'A') && (p[1] <= 'Z'))) &&
1176                 (p[2] == ':') &&
1177                 (!strcmp(uri->scheme, "file"))) {
1178                 if (len + 3 >= max) {
1179                     temp = realloc2n(ret, &max);
1180                     ret = temp;
1181                 }
1182                 ret[len++] = *p++;
1183                 ret[len++] = *p++;
1184                 ret[len++] = *p++;
1185             }
1186             while (*p != 0) {
1187                 if (len + 3 >= max) {
1188                     temp = realloc2n(ret, &max);
1189                     ret = temp;
1190                 }
1191                 if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) ||
1192                     ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) ||
1193                     ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) ||
1194                     ((*(p) == ',')))
1195                     ret[len++] = *p++;
1196                 else {
1197                     int val = *(unsigned char *)p++;
1198                     int hi = val / 0x10, lo = val % 0x10;
1199                     ret[len++] = '%';
1200                     ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1201                     ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1202                 }
1203             }
1204         }
1205         if (uri->query != NULL) {
1206             if (len + 1 >= max) {
1207                 temp = realloc2n(ret, &max);
1208                 ret = temp;
1209             }
1210             ret[len++] = '?';
1211             p = uri->query;
1212             while (*p != 0) {
1213                 if (len + 1 >= max) {
1214                     temp = realloc2n(ret, &max);
1215                     ret = temp;
1216                 }
1217                 ret[len++] = *p++;
1218             }
1219         }
1220     }
1221     if (uri->fragment != NULL) {
1222         if (len + 3 >= max) {
1223             temp = realloc2n(ret, &max);
1224             ret = temp;
1225         }
1226         ret[len++] = '#';
1227         p = uri->fragment;
1228         while (*p != 0) {
1229             if (len + 3 >= max) {
1230                 temp = realloc2n(ret, &max);
1231                 ret = temp;
1232             }
1233             if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p))))
1234                 ret[len++] = *p++;
1235             else {
1236                 int val = *(unsigned char *)p++;
1237                 int hi = val / 0x10, lo = val % 0x10;
1238                 ret[len++] = '%';
1239                 ret[len++] = hi + (hi > 9? 'A'-10 : '0');
1240                 ret[len++] = lo + (lo > 9? 'A'-10 : '0');
1241             }
1242         }
1243     }
1244     if (len >= max) {
1245         temp = realloc2n(ret, &max);
1246         ret = temp;
1247     }
1248     ret[len] = 0;
1249     return(ret);
1250 }
1251
1252 /**
1253  * uri_clean:
1254  * @uri:  pointer to an URI
1255  *
1256  * Make sure the URI struct is free of content
1257  */
1258 static void
1259 uri_clean(URI *uri) {
1260     if (uri == NULL) return;
1261
1262     g_free(uri->scheme);
1263     uri->scheme = NULL;
1264     g_free(uri->server);
1265     uri->server = NULL;
1266     g_free(uri->user);
1267     uri->user = NULL;
1268     g_free(uri->path);
1269     uri->path = NULL;
1270     g_free(uri->fragment);
1271     uri->fragment = NULL;
1272     g_free(uri->opaque);
1273     uri->opaque = NULL;
1274     g_free(uri->authority);
1275     uri->authority = NULL;
1276     g_free(uri->query);
1277     uri->query = NULL;
1278 }
1279
1280 /**
1281  * uri_free:
1282  * @uri:  pointer to an URI
1283  *
1284  * Free up the URI struct
1285  */
1286 void
1287 uri_free(URI *uri) {
1288     uri_clean(uri);
1289     g_free(uri);
1290 }
1291
1292 /************************************************************************
1293  *                                                                      *
1294  *                      Helper functions                                *
1295  *                                                                      *
1296  ************************************************************************/
1297
/**
 * normalize_uri_path:
 * @path:  pointer to the path string
 *
 * Normalizes @path in place following RFC 2396 Section 5.2, steps 6.c
 * through 6.g: "." segments and doubled slashes are dropped, every
 * "<segment>/.." pair is collapsed, and ".." segments left at the front of
 * an absolute path are discarded.  The string can only get shorter, so no
 * allocation is performed.
 *
 * Returns 0 on success, -1 if @path is NULL
 */
static int
normalize_uri_path(char *path)
{
    char *rd;   /* read cursor */
    char *wr;   /* write cursor */

    if (path == NULL) {
        return -1;
    }

    /* Leading slashes are kept untouched; start at the first real segment. */
    rd = path;
    while (*rd == '/') {
        rd++;
    }
    if (*rd == '\0') {
        return 0;
    }
    wr = rd;

    /*
     * Pass 1, steps (c) and (d): drop "./" segments, a trailing "." segment,
     * and runs of repeated slashes.  "rd" is at the start of a segment each
     * time the loop condition is evaluated.
     */
    while (*rd != '\0') {
        if (rd[0] == '.' && rd[1] == '/') {
            /* (c) skip the "./" and any further slashes behind it */
            rd += 2;
            while (*rd == '/') {
                rd++;
            }
            continue;
        }
        if (rd[0] == '.' && rd[1] == '\0') {
            /* (d) a final "." segment is simply not copied */
            break;
        }

        /* Ordinary segment: copy its characters. */
        while (*rd != '/' && *rd != '\0') {
            *wr++ = *rd++;
        }
        if (*rd == '\0') {
            break;
        }

        /* Collapse "//" down to the last slash, then copy that slash. */
        while (rd[0] == '/' && rd[1] == '/') {
            rd++;
        }
        *wr++ = *rd++;
    }
    *wr = '\0';

    /* Restart from the first segment for the ".." handling. */
    rd = path;
    while (*rd == '/') {
        rd++;
    }
    if (*rd == '\0') {
        return 0;
    }

    /*
     * Pass 2, steps (e) and (f): remove "<segment>/../" and a trailing
     * "<segment>/..", where <segment> is not itself "..".  The removal must
     * be iterative (leftmost match first), so the string is compacted each
     * time and the cursor steps back one segment to re-examine it.
     */
    for (;;) {
        char *next;

        /* Locate the end of the segment at "rd". */
        next = rd;
        while (*next != '/' && *next != '\0') {
            next++;
        }

        /* A last segment has no follower, so nothing can be removed. */
        if (*next == '\0') {
            break;
        }
        next++;     /* first character of the following segment */

        /*
         * Keep this segment if it is ".." itself, or if the following
         * segment is anything other than "..".
         */
        if ((rd[0] == '.' && rd[1] == '.' && next == rd + 3) ||
            !(next[0] == '.' && next[1] == '.' &&
              (next[2] == '/' || next[2] == '\0'))) {
            rd = next;
            continue;
        }

        /* (f) "<segment>/.." at the very end: cut the string here. */
        if (next[2] == '\0') {
            *rd = '\0';
            break;
        }

        /*
         * (e) Slide the remainder over "<segment>/../".  Source and
         * destination overlap, hence memmove() rather than strcpy().
         */
        memmove(rd, next + 3, strlen(next + 3) + 1);

        /* Step back over the slash(es) in front of the removed text. */
        next = rd;
        while ((next > path) && ((--next)[0] == '/')) {
            ;
        }
        if (next == path) {
            /* No earlier segment: carry on from the current position. */
            continue;
        }

        /*
         * "next" is on the last character of the previous segment; rewind
         * to its first character so that inputs such as "foo/bar/../.."
         * collapse completely.
         */
        rd = next;
        while ((rd > path) && (rd[-1] != '/')) {
            rd--;
        }
    }

    /*
     * Step (g): an absolute path that still begins with ".." segments is
     * in error; those segments are discarded from the result.
     */
    if (path[0] == '/') {
        rd = path;
        while (rd[0] == '/' && rd[1] == '.' && rd[2] == '.' &&
               (rd[3] == '/' || rd[3] == '\0')) {
            rd += 3;
        }

        if (rd != path) {
            wr = path;
            while (*rd != '\0') {
                *wr++ = *rd++;
            }
            *wr = '\0';
        }
    }

    return 0;
}
1486
/* Returns 1 if @c is an ASCII hexadecimal digit (0-9, a-f, A-F), else 0. */
static int is_hex(char c)
{
    int decimal = (c >= '0') && (c <= '9');
    int lower = (c >= 'a') && (c <= 'f');
    int upper = (c >= 'A') && (c <= 'F');

    return (decimal || lower || upper) ? 1 : 0;
}
1494
1495
/**
 * uri_string_unescape:
 * @str:  the string to unescape
 * @len:   the length in bytes to unescape (or <= 0 to indicate full string)
 * @target:  optional destination buffer
 *
 * Replaces every "%XX" sequence (two hexadecimal digits) by the byte it
 * encodes and copies all other bytes unchanged; no check is made that the
 * input is a valid URI.  The output is never longer than the input.  When
 * @target is NULL a buffer of @len + 1 bytes is obtained from g_malloc(),
 * otherwise the result is written to @target.  The result is always
 * NUL-terminated.
 *
 * Returns @target or the newly allocated buffer; NULL if @str is NULL or
 * the computed length is negative
 */
char *
uri_string_unescape(const char *str, int len, char *target)
{
    const char *src;
    char *result;
    char *dst;

    if (str == NULL) {
        return NULL;
    }
    if (len <= 0) {
        len = strlen(str);
    }
    if (len < 0) {
        return NULL;
    }

    result = target;
    if (result == NULL) {
        result = g_malloc(len + 1);
    }

    src = str;
    dst = result;
    while (len > 0) {
        if ((len > 2) && (src[0] == '%') &&
            is_hex(src[1]) && is_hex(src[2])) {
            int value = 0;
            int i;

            /* Fold the two hex digits, high nibble first. */
            for (i = 1; i <= 2; i++) {
                char digit = src[i];
                int nibble;

                if ((digit >= '0') && (digit <= '9')) {
                    nibble = digit - '0';
                } else if ((digit >= 'a') && (digit <= 'f')) {
                    nibble = (digit - 'a') + 10;
                } else {
                    /* is_hex() leaves only 'A'..'F' here */
                    nibble = (digit - 'A') + 10;
                }
                value = value * 16 + nibble;
            }
            *dst++ = value;
            src += 3;
            len -= 3;
        } else {
            *dst++ = *src++;
            len--;
        }
    }
    *dst = 0;
    return result;
}
1553
/**
 * uri_string_escape:
 * @str:  string to escape
 * @list: exception list string of chars not to escape (NULL is treated as
 *        an empty list)
 *
 * Returns a copy of @str in which every byte is replaced by a "%XX" escape
 * (uppercase hexadecimal), except for unreserved characters, '@', and the
 * characters contained in @list, which are copied unchanged.
 *
 * Returns a new escaped string or NULL in case of error.
 */
char *
uri_string_escape(const char *str, const char *list)
{
    static const char hex[] = "0123456789ABCDEF";
    const unsigned char *in;
    char *ret;
    int len, out;

    if (str == NULL) {
        return NULL;
    }
    if (str[0] == 0) {
        return g_strdup(str);
    }
    len = strlen(str);
    if (!(len > 0)) {
        return NULL;
    }
    if (list == NULL) {
        list = "";
    }

    len += 20;
    ret = g_malloc(len);
    out = 0;

    /*
     * Walk the input as unsigned bytes: with a plain (possibly signed) char,
     * "ch >> 4" is negative for bytes >= 0x80 and produces a wrong high
     * hex digit.
     */
    for (in = (const unsigned char *)str; *in != 0; in++) {
        unsigned char ch = *in;

        /* keep room for one escape triplet plus the final terminator */
        if (len - out <= 3) {
            ret = realloc2n(ret, &len);
        }

        if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!strchr(list, ch))) {
            ret[out++] = '%';
            ret[out++] = hex[ch >> 4];
            ret[out++] = hex[ch & 0xF];
        } else {
            ret[out++] = ch;
        }
    }
    ret[out] = 0;
    return ret;
}
1612
1613 /************************************************************************
1614  *                                                                      *
1615  *                      Public functions                                *
1616  *                                                                      *
1617  ************************************************************************/
1618
1619 /**
1620  * uri_resolve:
1621  * @URI:  the URI instance found in the document
1622  * @base:  the base value
1623  *
1624  * Computes he final URI of the reference done by checking that
1625  * the given URI is valid, and building the final URI using the
1626  * base URI. This is processed according to section 5.2 of the
1627  * RFC 2396
1628  *
1629  * 5.2. Resolving Relative References to Absolute Form
1630  *
1631  * Returns a new URI string (to be freed by the caller) or NULL in case
1632  *         of error.
1633  */
1634 char *
1635 uri_resolve(const char *uri, const char *base) {
1636     char *val = NULL;
1637     int ret, len, indx, cur, out;
1638     URI *ref = NULL;
1639     URI *bas = NULL;
1640     URI *res = NULL;
1641
1642     /*
1643      * 1) The URI reference is parsed into the potential four components and
1644      *    fragment identifier, as described in Section 4.3.
1645      *
1646      *    NOTE that a completely empty URI is treated by modern browsers
1647      *    as a reference to "." rather than as a synonym for the current
1648      *    URI.  Should we do that here?
1649      */
1650     if (uri == NULL)
1651         ret = -1;
1652     else {
1653         if (*uri) {
1654             ref = uri_new();
1655             ret = uri_parse_into(ref, uri);
1656         }
1657         else
1658             ret = 0;
1659     }
1660     if (ret != 0)
1661         goto done;
1662     if ((ref != NULL) && (ref->scheme != NULL)) {
1663         /*
1664          * The URI is absolute don't modify.
1665          */
1666         val = g_strdup(uri);
1667         goto done;
1668     }
1669     if (base == NULL)
1670         ret = -1;
1671     else {
1672         bas = uri_new();
1673         ret = uri_parse_into(bas, base);
1674     }
1675     if (ret != 0) {
1676         if (ref)
1677             val = uri_to_string(ref);
1678         goto done;
1679     }
1680     if (ref == NULL) {
1681         /*
1682          * the base fragment must be ignored
1683          */
1684         g_free(bas->fragment);
1685         bas->fragment = NULL;
1686         val = uri_to_string(bas);
1687         goto done;
1688     }
1689
1690     /*
1691      * 2) If the path component is empty and the scheme, authority, and
1692      *    query components are undefined, then it is a reference to the
1693      *    current document and we are done.  Otherwise, the reference URI's
1694      *    query and fragment components are defined as found (or not found)
1695      *    within the URI reference and not inherited from the base URI.
1696      *
1697      *    NOTE that in modern browsers, the parsing differs from the above
1698      *    in the following aspect:  the query component is allowed to be
1699      *    defined while still treating this as a reference to the current
1700      *    document.
1701      */
1702     res = uri_new();
1703     if ((ref->scheme == NULL) && (ref->path == NULL) &&
1704         ((ref->authority == NULL) && (ref->server == NULL))) {
1705         res->scheme = g_strdup(bas->scheme);
1706         if (bas->authority != NULL)
1707             res->authority = g_strdup(bas->authority);
1708         else if (bas->server != NULL) {
1709             res->server = g_strdup(bas->server);
1710             res->user = g_strdup(bas->user);
1711             res->port = bas->port;
1712         }
1713         res->path = g_strdup(bas->path);
1714         if (ref->query != NULL) {
1715             res->query = g_strdup (ref->query);
1716         } else {
1717             res->query = g_strdup(bas->query);
1718         }
1719         res->fragment = g_strdup(ref->fragment);
1720         goto step_7;
1721     }
1722
1723     /*
1724      * 3) If the scheme component is defined, indicating that the reference
1725      *    starts with a scheme name, then the reference is interpreted as an
1726      *    absolute URI and we are done.  Otherwise, the reference URI's
1727      *    scheme is inherited from the base URI's scheme component.
1728      */
1729     if (ref->scheme != NULL) {
1730         val = uri_to_string(ref);
1731         goto done;
1732     }
1733     res->scheme = g_strdup(bas->scheme);
1734
1735     res->query = g_strdup(ref->query);
1736     res->fragment = g_strdup(ref->fragment);
1737
1738     /*
1739      * 4) If the authority component is defined, then the reference is a
1740      *    network-path and we skip to step 7.  Otherwise, the reference
1741      *    URI's authority is inherited from the base URI's authority
1742      *    component, which will also be undefined if the URI scheme does not
1743      *    use an authority component.
1744      */
1745     if ((ref->authority != NULL) || (ref->server != NULL)) {
1746         if (ref->authority != NULL)
1747             res->authority = g_strdup(ref->authority);
1748         else {
1749             res->server = g_strdup(ref->server);
1750             res->user = g_strdup(ref->user);
1751             res->port = ref->port;
1752         }
1753         res->path = g_strdup(ref->path);
1754         goto step_7;
1755     }
1756     if (bas->authority != NULL)
1757         res->authority = g_strdup(bas->authority);
1758     else if (bas->server != NULL) {
1759         res->server = g_strdup(bas->server);
1760         res->user = g_strdup(bas->user);
1761         res->port = bas->port;
1762     }
1763
1764     /*
1765      * 5) If the path component begins with a slash character ("/"), then
1766      *    the reference is an absolute-path and we skip to step 7.
1767      */
1768     if ((ref->path != NULL) && (ref->path[0] == '/')) {
1769         res->path = g_strdup(ref->path);
1770         goto step_7;
1771     }
1772
1773
1774     /*
1775      * 6) If this step is reached, then we are resolving a relative-path
1776      *    reference.  The relative path needs to be merged with the base
1777      *    URI's path.  Although there are many ways to do this, we will
1778      *    describe a simple method using a separate string buffer.
1779      *
1780      * Allocate a buffer large enough for the result string.
1781      */
1782     len = 2; /* extra / and 0 */
1783     if (ref->path != NULL)
1784         len += strlen(ref->path);
1785     if (bas->path != NULL)
1786         len += strlen(bas->path);
1787     res->path = g_malloc(len);
1788     res->path[0] = 0;
1789
1790     /*
1791      * a) All but the last segment of the base URI's path component is
1792      *    copied to the buffer.  In other words, any characters after the
1793      *    last (right-most) slash character, if any, are excluded.
1794      */
1795     cur = 0;
1796     out = 0;
1797     if (bas->path != NULL) {
1798         while (bas->path[cur] != 0) {
1799             while ((bas->path[cur] != 0) && (bas->path[cur] != '/'))
1800                 cur++;
1801             if (bas->path[cur] == 0)
1802                 break;
1803
1804             cur++;
1805             while (out < cur) {
1806                 res->path[out] = bas->path[out];
1807                 out++;
1808             }
1809         }
1810     }
1811     res->path[out] = 0;
1812
1813     /*
1814      * b) The reference's path component is appended to the buffer
1815      *    string.
1816      */
1817     if (ref->path != NULL && ref->path[0] != 0) {
1818         indx = 0;
1819         /*
1820          * Ensure the path includes a '/'
1821          */
1822         if ((out == 0) && (bas->server != NULL))
1823             res->path[out++] = '/';
1824         while (ref->path[indx] != 0) {
1825             res->path[out++] = ref->path[indx++];
1826         }
1827     }
1828     res->path[out] = 0;
1829
1830     /*
1831      * Steps c) to h) are really path normalization steps
1832      */
1833     normalize_uri_path(res->path);
1834
1835 step_7:
1836
1837     /*
1838      * 7) The resulting URI components, including any inherited from the
1839      *    base URI, are recombined to give the absolute form of the URI
1840      *    reference.
1841      */
1842     val = uri_to_string(res);
1843
1844 done:
1845     if (ref != NULL)
1846         uri_free(ref);
1847     if (bas != NULL)
1848         uri_free(bas);
1849     if (res != NULL)
1850         uri_free(res);
1851     return(val);
1852 }
1853
1854 /**
1855  * uri_resolve_relative:
1856  * @URI:  the URI reference under consideration
1857  * @base:  the base value
1858  *
1859  * Expresses the URI of the reference in terms relative to the
1860  * base.  Some examples of this operation include:
1861  *     base = "http://site1.com/docs/book1.html"
1862  *        URI input                        URI returned
1863  *     docs/pic1.gif                    pic1.gif
1864  *     docs/img/pic1.gif                img/pic1.gif
1865  *     img/pic1.gif                     ../img/pic1.gif
1866  *     http://site1.com/docs/pic1.gif   pic1.gif
1867  *     http://site2.com/docs/pic1.gif   http://site2.com/docs/pic1.gif
1868  *
1869  *     base = "docs/book1.html"
1870  *        URI input                        URI returned
1871  *     docs/pic1.gif                    pic1.gif
1872  *     docs/img/pic1.gif                img/pic1.gif
1873  *     img/pic1.gif                     ../img/pic1.gif
1874  *     http://site1.com/docs/pic1.gif   http://site1.com/docs/pic1.gif
1875  *
1876  *
1877  * Note: if the URI reference is really weird or complicated, it may be
1878  *       worthwhile to first convert it into a "nice" one by calling
1879  *       uri_resolve (using 'base') before calling this routine,
1880  *       since this routine (for reasonable efficiency) assumes URI has
1881  *       already been through some validation.
1882  *
 * Returns a new URI string (to be freed by the caller) or NULL in case
 * of error.
1885  */
1886 char *
1887 uri_resolve_relative (const char *uri, const char * base)
1888 {
1889     char *val = NULL;
1890     int ret;
1891     int ix;
1892     int pos = 0;
1893     int nbslash = 0;
1894     int len;
1895     URI *ref = NULL;
1896     URI *bas = NULL;
1897     char *bptr, *uptr, *vptr;
1898     int remove_path = 0;
1899
1900     if ((uri == NULL) || (*uri == 0))
1901         return NULL;
1902
1903     /*
1904      * First parse URI into a standard form
1905      */
1906     ref = uri_new ();
1907     /* If URI not already in "relative" form */
1908     if (uri[0] != '.') {
1909         ret = uri_parse_into (ref, uri);
1910         if (ret != 0)
1911             goto done;          /* Error in URI, return NULL */
1912     } else
1913         ref->path = g_strdup(uri);
1914
1915     /*
1916      * Next parse base into the same standard form
1917      */
1918     if ((base == NULL) || (*base == 0)) {
1919         val = g_strdup (uri);
1920         goto done;
1921     }
1922     bas = uri_new ();
1923     if (base[0] != '.') {
1924         ret = uri_parse_into (bas, base);
1925         if (ret != 0)
1926             goto done;          /* Error in base, return NULL */
1927     } else
1928         bas->path = g_strdup(base);
1929
1930     /*
1931      * If the scheme / server on the URI differs from the base,
1932      * just return the URI
1933      */
1934     if ((ref->scheme != NULL) &&
1935         ((bas->scheme == NULL) ||
1936          (strcmp (bas->scheme, ref->scheme)) ||
1937          (strcmp (bas->server, ref->server)))) {
1938         val = g_strdup (uri);
1939         goto done;
1940     }
1941     if (bas->path == ref->path ||
1942         (bas->path && ref->path && !strcmp(bas->path, ref->path))) {
1943         val = g_strdup("");
1944         goto done;
1945     }
1946     if (bas->path == NULL) {
1947         val = g_strdup(ref->path);
1948         goto done;
1949     }
1950     if (ref->path == NULL) {
1951         ref->path = (char *) "/";
1952         remove_path = 1;
1953     }
1954
1955     /*
1956      * At this point (at last!) we can compare the two paths
1957      *
1958      * First we take care of the special case where either of the
1959      * two path components may be missing (bug 316224)
1960      */
1961     if (bas->path == NULL) {
1962         if (ref->path != NULL) {
1963             uptr = ref->path;
1964             if (*uptr == '/')
1965                 uptr++;
1966             /* exception characters from uri_to_string */
1967             val = uri_string_escape(uptr, "/;&=+$,");
1968         }
1969         goto done;
1970     }
1971     bptr = bas->path;
1972     if (ref->path == NULL) {
1973         for (ix = 0; bptr[ix] != 0; ix++) {
1974             if (bptr[ix] == '/')
1975                 nbslash++;
1976         }
1977         uptr = NULL;
1978         len = 1;        /* this is for a string terminator only */
1979     } else {
1980     /*
1981      * Next we compare the two strings and find where they first differ
1982      */
1983         if ((ref->path[pos] == '.') && (ref->path[pos+1] == '/'))
1984             pos += 2;
1985         if ((*bptr == '.') && (bptr[1] == '/'))
1986             bptr += 2;
1987         else if ((*bptr == '/') && (ref->path[pos] != '/'))
1988             bptr++;
1989         while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0))
1990             pos++;
1991
1992         if (bptr[pos] == ref->path[pos]) {
1993             val = g_strdup("");
1994             goto done;          /* (I can't imagine why anyone would do this) */
1995         }
1996
1997         /*
1998          * In URI, "back up" to the last '/' encountered.  This will be the
1999          * beginning of the "unique" suffix of URI
2000          */
2001         ix = pos;
2002         if ((ref->path[ix] == '/') && (ix > 0))
2003             ix--;
2004         else if ((ref->path[ix] == 0) && (ix > 1) && (ref->path[ix - 1] == '/'))
2005             ix -= 2;
2006         for (; ix > 0; ix--) {
2007             if (ref->path[ix] == '/')
2008                 break;
2009         }
2010         if (ix == 0) {
2011             uptr = ref->path;
2012         } else {
2013             ix++;
2014             uptr = &ref->path[ix];
2015         }
2016
2017         /*
2018          * In base, count the number of '/' from the differing point
2019          */
2020         if (bptr[pos] != ref->path[pos]) {/* check for trivial URI == base */
2021             for (; bptr[ix] != 0; ix++) {
2022                 if (bptr[ix] == '/')
2023                     nbslash++;
2024             }
2025         }
2026         len = strlen (uptr) + 1;
2027     }
2028
2029     if (nbslash == 0) {
2030         if (uptr != NULL)
2031             /* exception characters from uri_to_string */
2032             val = uri_string_escape(uptr, "/;&=+$,");
2033         goto done;
2034     }
2035
2036     /*
2037      * Allocate just enough space for the returned string -
2038      * length of the remainder of the URI, plus enough space
2039      * for the "../" groups, plus one for the terminator
2040      */
2041     val = g_malloc (len + 3 * nbslash);
2042     vptr = val;
2043     /*
2044      * Put in as many "../" as needed
2045      */
2046     for (; nbslash>0; nbslash--) {
2047         *vptr++ = '.';
2048         *vptr++ = '.';
2049         *vptr++ = '/';
2050     }
2051     /*
2052      * Finish up with the end of the URI
2053      */
2054     if (uptr != NULL) {
2055         if ((vptr > val) && (len > 0) &&
2056             (uptr[0] == '/') && (vptr[-1] == '/')) {
2057             memcpy (vptr, uptr + 1, len - 1);
2058             vptr[len - 2] = 0;
2059         } else {
2060             memcpy (vptr, uptr, len);
2061             vptr[len - 1] = 0;
2062         }
2063     } else {
2064         vptr[len - 1] = 0;
2065     }
2066
2067     /* escape the freshly-built path */
2068     vptr = val;
2069         /* exception characters from uri_to_string */
2070     val = uri_string_escape(vptr, "/;&=+$,");
2071     g_free(vptr);
2072
2073 done:
2074     /*
2075      * Free the working variables
2076      */
2077     if (remove_path != 0)
2078         ref->path = NULL;
2079     if (ref != NULL)
2080         uri_free (ref);
2081     if (bas != NULL)
2082         uri_free (bas);
2083
2084     return val;
2085 }
2086
2087 /*
2088  * Utility functions to help parse and assemble query strings.
2089  */
2090
2091 struct QueryParams *
2092 query_params_new (int init_alloc)
2093 {
2094     struct QueryParams *ps;
2095
2096     if (init_alloc <= 0) init_alloc = 1;
2097
2098     ps = g_new(QueryParams, 1);
2099     ps->n = 0;
2100     ps->alloc = init_alloc;
2101     ps->p = g_new(QueryParam, ps->alloc);
2102
2103     return ps;
2104 }
2105
2106 /* Ensure there is space to store at least one more parameter
2107  * at the end of the set.
2108  */
2109 static int
2110 query_params_append (struct QueryParams *ps,
2111                const char *name, const char *value)
2112 {
2113     if (ps->n >= ps->alloc) {
2114         ps->p = g_renew(QueryParam, ps->p, ps->alloc * 2);
2115         ps->alloc *= 2;
2116     }
2117
2118     ps->p[ps->n].name = g_strdup(name);
2119     ps->p[ps->n].value = g_strdup(value);
2120     ps->p[ps->n].ignore = 0;
2121     ps->n++;
2122
2123     return 0;
2124 }
2125
2126 void
2127 query_params_free (struct QueryParams *ps)
2128 {
2129     int i;
2130
2131     for (i = 0; i < ps->n; ++i) {
2132         g_free (ps->p[i].name);
2133         g_free (ps->p[i].value);
2134     }
2135     g_free (ps->p);
2136     g_free (ps);
2137 }
2138
/*
 * Split a query string into a newly allocated parameter set.
 *
 * Each segment is of the form "name", "name=" or "name=value"; names
 * and values are passed through uri_string_unescape().  Empty segments
 * and segments with an empty name ("=value", "=") are skipped.
 *
 * A NULL or empty query yields an empty set.  The caller releases the
 * result with query_params_free().
 */
struct QueryParams *
query_params_parse (const char *query)
{
    struct QueryParams *ps;
    const char *end, *eq;

    ps = query_params_new (0);
    if (!query || query[0] == '\0')
        return ps;

    while (*query) {
        char *name = NULL, *value = NULL;

        /* Find the end of this segment: the next '&' if there is one,
         * otherwise the next ';', otherwise the end of the string.
         * (A ';' is therefore only a separator once no '&' remains.)
         */
        end = strchr (query, '&');
        if (!end)
            end = strchr (query, ';');
        if (!end)
            end = query + strlen (query);

        /* Empty section (eg. "&&"). */
        if (end == query)
            goto next;

        /* Find the first '=' character inside this segment only. */
        eq = memchr (query, '=', end - query);

        /* If the '=' character is at the beginning then we have
         * "=value" (or a lone "=") and consistent with CGI.pm we
         * _ignore_ this.  This must be tested before the "name="
         * case below: otherwise a lone "=" would reach
         * uri_string_unescape with len = 0.
         */
        if (eq == query)
            goto next;

        if (!eq) {
            /* No '=' character: just "name"; the value is left NULL. */
            name = uri_string_unescape (query, end - query, NULL);
            value = NULL;
        } else if (eq + 1 == end) {
            /* "name=": store an empty value directly (works around
             * annoying problem when calling uri_string_unescape with
             * len = 0).
             */
            name = uri_string_unescape (query, eq - query, NULL);
            value = g_strdup ("");
        } else {
            /* Otherwise it's "name=value". */
            name = uri_string_unescape (query, eq - query, NULL);
            value = uri_string_unescape (eq + 1, end - (eq + 1), NULL);
        }

        /* Append to the parameter set (which takes its own copies). */
        query_params_append (ps, name, value);
        g_free (name);
        g_free (value);

    next:
        query = end;
        if (*query)
            query++;            /* skip the separator */
    }

    return ps;
}