src/protocol/header.c

   1 /* Parser of HTTP headers */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <string.h>
   8
   9 #include "elinks.h"
  10
  11 #include "protocol/header.h"
  12 #include "util/conv.h"
  13 #include "util/error.h"
  14 #include "util/memory.h"
  15 #include "util/string.h"
  16
  17 /*
  18 * RFC 2616                        HTTP/1.1                       June 1999
  19 *
  20 *
  21 *        OCTET          = <any 8-bit sequence of data>
  22 *        CHAR           = <any US-ASCII character (octets 0 - 127)>
  23 *        UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
  24 *        LOALPHA        = <any US-ASCII lowercase letter "a".."z">
  25 *        ALPHA          = UPALPHA | LOALPHA
  26 *        DIGIT          = <any US-ASCII digit "0".."9">
  27 *        CTL            = <any US-ASCII control character
  28 *                         (octets 0 - 31) and DEL (127)>
  29 *        CR             = <US-ASCII CR, carriage return (13)>
  30 *        LF             = <US-ASCII LF, linefeed (10)>
  31 *        SP             = <US-ASCII SP, space (32)>
  32 *        HT             = <US-ASCII HT, horizontal-tab (9)>
  33 *        <">            = <US-ASCII double-quote mark (34)>
  34 *
  35 *    HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all
  36 *    protocol elements except the entity-body (see appendix 19.3 for
  37 *    tolerant applications). The end-of-line marker within an entity-body
  38 *    is defined by its associated media type, as described in section 3.7.
  39 *
  40 *        CRLF           = CR LF
  41 *
  42 *    HTTP/1.1 header field values can be folded onto multiple lines if the
  43 *    continuation line begins with a space or horizontal tab. All linear
  44 *    white space, including folding, has the same semantics as SP. A
  45 *    recipient MAY replace any linear white space with a single SP before
  46 *    interpreting the field value or forwarding the message downstream.
  47 *
  48 *        LWS            = [CRLF] 1*( SP | HT )
  49 *
  50 *    The TEXT rule is only used for descriptive field contents and values
  51 *    that are not intended to be interpreted by the message parser. Words
  52 *    of *TEXT MAY contain characters from character sets other than ISO-
  53 *    8859-1 [22] only when encoded according to the rules of RFC 2047
  54 *    [14].
  55 *
  56 *        TEXT           = <any OCTET except CTLs,
  57 *                         but including LWS>
  58 *
  59 *    A CRLF is allowed in the definition of TEXT only as part of a header
  60 *    field continuation. It is expected that the folding LWS will be
  61 *    replaced with a single SP before interpretation of the TEXT value.
  62 *
  63 *    Hexadecimal numeric characters are used in several protocol elements.
  64 *
  65 *        HEX            = "A" | "B" | "C" | "D" | "E" | "F"
  66 *                       | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT
  67 *
  68 *    Many HTTP/1.1 header field values consist of words separated by LWS
  69 *    or special characters. These special characters MUST be in a quoted
  70 *    string to be used within a parameter value (as defined in section
  71 *    3.6).
  72 *
  73 *        token          = 1*<any CHAR except CTLs or separators>
  74 *        separators     = "(" | ")" | "<" | ">" | "@"
  75 *                       | "," | ";" | ":" | "\" | <">
  76 *                       | "/" | "[" | "]" | "?" | "="
  77 *                       | "{" | "}" | SP | HT
  78 *
  79 *    Comments can be included in some HTTP header fields by surrounding
  80 *    the comment text with parentheses. Comments are only allowed in
  81 *    fields containing "comment" as part of their field value definition.
  82 *    In all other fields, parentheses are considered part of the field
  83 *    value.
  84 *
  85 *        comment        = "(" *( ctext | quoted-pair | comment ) ")"
  86 *        ctext          = <any TEXT excluding "(" and ")">
  87 *
  88 *    A string of text is parsed as a single word if it is quoted using
  89 *    double-quote marks.
  90 *
  91 *        quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
  92 *        qdtext         = <any TEXT except <">>
  93 *
  94 *    The backslash character ("\") MAY be used as a single-character
  95 *    quoting mechanism only within quoted-string and comment constructs.
  96 *
  97 *        quoted-pair    = "\" CHAR
  98 */
  99
 100 /* FIXME: bug 549
 101  *
 102  * HTTP/1.1 header continuation lines are not honoured.
 103  * DEL char is accepted in TEXT part.
 104  * HT char is not accepted in TEXT part.
  105  * A lone LF does not mark the end of a line; CRLF is the correct terminator.
 106  * CR or LF are invalid in header line.
 107  *
 108  * Mozilla, IE, NS tolerate header value separator different from ':'
 109  * Examples:
 110  * name: value
 111  * name value
 112  * name :value
 113  * name=value
 114  */
 115
 116 #define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
 117
 118 /** Searches for a message-header with the specified field-name.
 119  *
 120  * @param[in] head
 121  *   Where to start searching in the message received from the server.
 122  *   This function actually ignores the line to which @a head points,
 123  *   and starts searching from the next line.  Therefore, when parsing
 124  *   an HTTP message, @a head should initially point to the start-line,
 125  *   e.g. "HTTP/1.1 200 OK".  Alternatively, if the caller has already
 126  *   found a message-header and wants to know if there are any more
 127  *   message-headers with the same field-name, then @a head can be the
 128  *   pointer that a previous call stored in *@a ptr.
 129  * @param[in] item
 130  *   The field-name for which this function searches.
 131  * @param[out] ptr
 132  *   If @a ptr is not NULL, and this function finds a message-header,
 133  *   then this function stores in *@a ptr the address at which the
 134  *   field-content begins; the caller may pass that as @a head in a
 135  *   later call.  Otherwise, this function does not modify *@a ptr.
 136  * @returns
 137  *   NULL if not found or out of memory.  Otherwise, a copy of the
 138  *   field-content of the message-header; the caller must eventually
 139  *   mem_free() it.
 140  *
 141  * The terms message-header, field-name, start-line, and field-content
 142  * are defined in RFC 2616 sections 4.1 and 4.2.  */
 143 unsigned char *
 144 parse_header(unsigned char *head, const unsigned char *item, unsigned char **ptr)
 145 {
 146         unsigned char *pos = head;
 147
 148         if (!pos) return NULL;
 149
 150         while (*pos) {
 151                 unsigned char *end, *value;
 152                 const unsigned char *itempos;
 153                 int len;
 154
 155                 /* Go for a newline. */
 156                 while (*pos && *pos != ASCII_LF) pos++;
 157                 if (!*pos) break;
 158                 pos++; /* Start of line now. */
 159
 160                 /* Does item match header line ? */
 161                 for (itempos = item; *itempos && *pos; itempos++, pos++)
 162                         if (c_toupper(*itempos) != c_toupper(*pos))
 163                                 break;
 164
 165                 if (!*pos) break; /* Nothing left to parse. */
 166                 if (*itempos) continue; /* Do not match. */
 167
 168                 /* Be tolerant: we accept headers with
 169                  * weird syntax, since most browsers does it
 170                  * anyway, ie:
 171                  * name value
 172                  * name :value
 173                  * name = value
 174                  * name[TAB]:[TAB]value */
 175
 176                 end = pos;
 177
 178                 /* Skip leading whitespaces if any. */
 179                 while (LWS(*pos)) pos++;
 180                 if (!*pos) break; /* Nothing left to parse. */
 181
 182                 /* Eat ':' or '=' if any. */
 183                 if (*pos == ':' || *pos == '=') pos++;
 184                 if (!*pos) break; /* Nothing left to parse. */
 185
 186                 /* Skip whitespaces after separator if any. */
 187                 while (LWS(*pos)) pos++;
 188                 if (!*pos) break; /* Nothing left to parse. */
 189
 190                 if (pos == end) continue; /* Not an exact match (substring). */
 191
 192                 /* Find the end of line/string.
 193                  * We fail on control chars and DEL char. */
 194                 end = pos;
 195                 while (*end != ASCII_DEL && (*end > ' ' || LWS(*end))) end++;
 196                 if (!*end) break; /* No end of line, nothing left to parse. */
 197
 198                 /* Ignore line if we encountered an unexpected char. */
 199                 if (*end != ASCII_CR && *end != ASCII_LF) continue;
 200
 201                 /* Strip trailing whitespaces. */
 202                 while (end > pos && LWS(end[-1])) end--;
 203
 204                 len = end - pos;
 205                 assert(len >= 0);
 206                 if_assert_failed break;
 207
 208                 if (!len) continue;     /* Empty value. */
 209
 210                 value = memacpy(pos, len);
 211                 if (!value) break; /* Allocation failure, stop here. */
 212
 213                 if (ptr) *ptr = pos;
 214                 return value;
 215         }
 216
 217         return NULL;
 218 }
 219
 220 /* Extract the value of name part of the value of attribute content.
 221  * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1"
 222  * will store in *@ret an allocated string containing "iso-8859-1".
 223  * It supposes that separator is ';' and ignore first element in the
 224  * list. (ie. '1' is ignored in "1; URL=xxx")
 225  * The return value is one of:
 226  *
 227  * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
 228  * - HEADER_PARAM_NOT_FOUND: the parameter is not there.  *@ret is now NULL.
 229  * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
 230  *
 231  * If @ret is NULL, then this function doesn't actually access *@ret,
 232  * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY.  Some callers may
 233  * rely on this. */
 234 enum parse_header_param
 235 parse_header_param(unsigned char *str, unsigned char *name, unsigned char **ret, int content_disposition)
 236 {
 237         unsigned char *p = str;
 238         int namelen, plen = 0;
 239
 240         if (ret) *ret = NULL;   /* default in case of early return */
 241
 242         assert(str && name && *name);
 243         if_assert_failed return HEADER_PARAM_NOT_FOUND;
 244
 245         /* Returns now if string @str is empty. */
 246         if (!*p) return HEADER_PARAM_NOT_FOUND;
 247
 248         namelen = strlen(name);
 249         do {
 250                 if (!content_disposition) {
 251                         p = strchr((const char *)p, ';');
 252                         if (!p) return HEADER_PARAM_NOT_FOUND;
 253                 }
 254
 255                 while (*p && (*p == ';' || *p <= ' ')) p++;
 256                 if (strlen(p) < namelen) return HEADER_PARAM_NOT_FOUND;
 257         } while (c_strncasecmp(p, name, namelen));
 258
 259         p += namelen;
 260
 261         while (*p && (*p <= ' ' || *p == '=')) p++;
 262         if (!*p) {
 263                 if (ret) {
 264                         *ret = stracpy("");
 265                         if (!*ret)
 266                                 return HEADER_PARAM_OUT_OF_MEMORY;
 267                 }
 268                 return HEADER_PARAM_FOUND;
 269         }
 270
 271         while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++;
 272
 273         /* Trim ending spaces */
 274         while (plen > 0 && LWS(p[plen - 1])) plen--;
 275
 276         /* XXX: Drop enclosing single quotes if there's some.
 277          *
 278          * Some websites like newsnow.co.uk are using single quotes around url
 279          * in URL field in meta tag content attribute like this:
 280          * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
 281          *
 282          * This is an attempt to handle that, but it may break something else.
 283          * We drop all pair of enclosing quotes found (eg. '''url''' => url).
 284          * Please report any issue related to this. --Zas */
 285         while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') {
 286                 p++;
 287                 plen -= 2;
 288         }
 289
 290         if (ret) {
 291                 *ret = memacpy(p, plen);
 292                 if (!*ret)
 293                         return HEADER_PARAM_OUT_OF_MEMORY;
 294         }
 295         return HEADER_PARAM_FOUND;
 296 }
 297
 298 /* Parse string param="value", return value as new string or NULL if any
 299  * error. */
 300 unsigned char *
 301 get_header_param(unsigned char *e, unsigned char *name)
 302 {
 303         unsigned char *n, *start;
 304
 305 again:
 306         while (*e && c_toupper(*e++) != c_toupper(*name));
 307         if (!*e) return NULL;
 308
 309         n = name + 1;
 310         while (*n && c_toupper(*e) == c_toupper(*n)) e++, n++;
 311         if (*n) goto again;
 312
 313         skip_space(e);
 314         if (*e++ != '=') return NULL;
 315
 316         skip_space(e);
 317         start = e;
 318
 319         if (!isquote(*e)) {
 320                 skip_nonspace(e);
 321         } else {
 322                 unsigned char uu = *e++;
 323
 324                 start++;
 325                 while (*e != uu) {
 326                         if (!*e) return NULL;
 327                         e++;
 328                 }
 329         }
 330
 331         while (start < e && *start == ' ') start++;
 332         while (start < e && *(e - 1) == ' ') e--;
 333         if (start == e) return NULL;
 334
 335         n = mem_alloc(e - start + 1);
 336         if (n) {
 337                 int i = 0;
 338
 339                 while (start < e) {
 340                         n[i++] = (*start < ' ') ? '.' : *start;
 341                         start++;
 342                 }
 343                 n[i] = '\0';
 344         }
 345
 346         return n;
 347 }