glib/gurifuncs.c

   1 /* GIO - GLib Input, Output and Streaming Library
   2  *
   3  * Copyright (C) 2006-2007 Red Hat, Inc.
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2.1 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General
  16  * Public License along with this library; if not, see <http://www.gnu.org/licenses/>.
  17  *
  18  * Author: Alexander Larsson <alexl@redhat.com>
  19  */
  20
  21 #include "config.h"
  22
  23 #include "gurifuncs.h"
  24
  25 #include <glib/gstrfuncs.h>
  26 #include <glib/gmessages.h>
  27 #include <glib/gstring.h>
  28 #include <glib/gmem.h>
  29
  30 #include <string.h>
  31
  32 #include "config.h"
  33
  34 /**
  35  * SECTION:gurifuncs
  36  * @title: URI Functions
  37  * @short_description: manipulating URIs
  38  *
  39  * Functions for manipulating Universal Resource Identifiers (URIs) as
  40  * defined by
  41  * [RFC 3986](http://www.ietf.org/rfc/rfc3986.txt).
  42  * It is highly recommended that you have read and
  43  * understand RFC 3986 for understanding this API.
  44  */
  45
  46 static int
  47 unescape_character (const char *scanner)
  48 {
  49   int first_digit;
  50   int second_digit;
  51
  52   first_digit = g_ascii_xdigit_value (*scanner++);
  53   if (first_digit < 0)
  54     return -1;
  55
  56   second_digit = g_ascii_xdigit_value (*scanner++);
  57   if (second_digit < 0)
  58     return -1;
  59
  60   return (first_digit << 4) | second_digit;
  61 }
  62
  63 /**
  64  * g_uri_unescape_segment:
  65  * @escaped_string: (nullable): A string, may be %NULL
  66  * @escaped_string_end: (nullable): Pointer to end of @escaped_string, may be %NULL
  67  * @illegal_characters: (nullable): An optional string of illegal characters not to be allowed, may be %NULL
  68  *
  69  * Unescapes a segment of an escaped string.
  70  *
  71  * If any of the characters in @illegal_characters or the character zero appears
  72  * as an escaped character in @escaped_string then that is an error and %NULL
  73  * will be returned. This is useful it you want to avoid for instance having a
  74  * slash being expanded in an escaped path element, which might confuse pathname
  75  * handling.
  76  *
  77  * Returns: an unescaped version of @escaped_string or %NULL on error.
  78  * The returned string should be freed when no longer needed.  As a
  79  * special case if %NULL is given for @escaped_string, this function
  80  * will return %NULL.
  81  *
  82  * Since: 2.16
  83  **/
  84 char *
  85 g_uri_unescape_segment (const char *escaped_string,
  86                         const char *escaped_string_end,
  87                         const char *illegal_characters)
  88 {
  89   const char *in;
  90   char *out, *result;
  91   gint character;
  92
  93   if (escaped_string == NULL)
  94     return NULL;
  95
  96   if (escaped_string_end == NULL)
  97     escaped_string_end = escaped_string + strlen (escaped_string);
  98
  99   result = g_malloc (escaped_string_end - escaped_string + 1);
 100
 101   out = result;
 102   for (in = escaped_string; in < escaped_string_end; in++)
 103     {
 104       character = *in;
 105
 106       if (*in == '%')
 107         {
 108           in++;
 109
 110           if (escaped_string_end - in < 2)
 111             {
 112               /* Invalid escaped char (to short) */
 113               g_free (result);
 114               return NULL;
 115             }
 116
 117           character = unescape_character (in);
 118
 119           /* Check for an illegal character. We consider '\0' illegal here. */
 120           if (character <= 0 ||
 121               (illegal_characters != NULL &&
 122                strchr (illegal_characters, (char)character) != NULL))
 123             {
 124               g_free (result);
 125               return NULL;
 126             }
 127
 128           in++; /* The other char will be eaten in the loop header */
 129         }
 130       *out++ = (char)character;
 131     }
 132
 133   *out = '\0';
 134
 135   return result;
 136 }
 137
 138 /**
 139  * g_uri_unescape_string:
 140  * @escaped_string: an escaped string to be unescaped.
 141  * @illegal_characters: (nullable): a string of illegal characters not to be
 142  *      allowed, or %NULL.
 143  *
 144  * Unescapes a whole escaped string.
 145  *
 146  * If any of the characters in @illegal_characters or the character zero appears
 147  * as an escaped character in @escaped_string then that is an error and %NULL
 148  * will be returned. This is useful it you want to avoid for instance having a
 149  * slash being expanded in an escaped path element, which might confuse pathname
 150  * handling.
 151  *
 152  * Returns: an unescaped version of @escaped_string. The returned string
 153  * should be freed when no longer needed.
 154  *
 155  * Since: 2.16
 156  **/
 157 char *
 158 g_uri_unescape_string (const char *escaped_string,
 159                        const char *illegal_characters)
 160 {
 161   return g_uri_unescape_segment (escaped_string, NULL, illegal_characters);
 162 }
 163
 164 /**
 165  * g_uri_parse_scheme:
 166  * @uri: a valid URI.
 167  *
 168  * Gets the scheme portion of a URI string. RFC 3986 decodes the scheme as:
 169  * |[
 170  * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 171  * ]|
 172  * Common schemes include "file", "http", "svn+ssh", etc.
 173  *
 174  * Returns: The "Scheme" component of the URI, or %NULL on error.
 175  * The returned string should be freed when no longer needed.
 176  *
 177  * Since: 2.16
 178  **/
 179 char *
 180 g_uri_parse_scheme (const char  *uri)
 181 {
 182   const char *p;
 183   char c;
 184
 185   g_return_val_if_fail (uri != NULL, NULL);
 186
 187   /* From RFC 3986 Decodes:
 188    * URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 189    */
 190
 191   p = uri;
 192
 193   /* Decode scheme:
 194      scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 195   */
 196
 197   if (!g_ascii_isalpha (*p))
 198     return NULL;
 199
 200   while (1)
 201     {
 202       c = *p++;
 203
 204       if (c == ':')
 205         break;
 206
 207       if (!(g_ascii_isalnum(c) ||
 208             c == '+' ||
 209             c == '-' ||
 210             c == '.'))
 211         return NULL;
 212     }
 213
 214   return g_strndup (uri, p - uri - 1);
 215 }
 216
 217 /**
 218  * g_uri_escape_string:
 219  * @unescaped: the unescaped input string.
 220  * @reserved_chars_allowed: (nullable): a string of reserved characters that
 221  *      are allowed to be used, or %NULL.
 222  * @allow_utf8: %TRUE if the result can include UTF-8 characters.
 223  *
 224  * Escapes a string for use in a URI.
 225  *
 226  * Normally all characters that are not "unreserved" (i.e. ASCII alphanumerical
 227  * characters plus dash, dot, underscore and tilde) are escaped.
 228  * But if you specify characters in @reserved_chars_allowed they are not
 229  * escaped. This is useful for the "reserved" characters in the URI
 230  * specification, since those are allowed unescaped in some portions of
 231  * a URI.
 232  *
 233  * Returns: an escaped version of @unescaped. The returned string should be
 234  * freed when no longer needed.
 235  *
 236  * Since: 2.16
 237  **/
 238 char *
 239 g_uri_escape_string (const char *unescaped,
 240                      const char  *reserved_chars_allowed,
 241                      gboolean     allow_utf8)
 242 {
 243   GString *s;
 244
 245   g_return_val_if_fail (unescaped != NULL, NULL);
 246
 247   s = g_string_sized_new (strlen (unescaped) + 10);
 248
 249   g_string_append_uri_escaped (s, unescaped, reserved_chars_allowed, allow_utf8);
 250
 251   return g_string_free (s, FALSE);
 252 }