src/protocol/header.c

   1 /* Parser of HTTP headers */
   2
   3 #ifdef HAVE_CONFIG_H
   4 #include "config.h"
   5 #endif
   6
   7 #include <string.h>
   8
   9 #include "elinks.h"
  10
  11 #include "protocol/header.h"
  12 #include "util/conv.h"
  13 #include "util/error.h"
  14 #include "util/memory.h"
  15 #include "util/string.h"
  16
  17 /*
  18 * RFC 2616                        HTTP/1.1                       June 1999
  19 *
  20 *
  21 *        OCTET          = <any 8-bit sequence of data>
  22 *        CHAR           = <any US-ASCII character (octets 0 - 127)>
  23 *        UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
  24 *        LOALPHA        = <any US-ASCII lowercase letter "a".."z">
  25 *        ALPHA          = UPALPHA | LOALPHA
  26 *        DIGIT          = <any US-ASCII digit "0".."9">
  27 *        CTL            = <any US-ASCII control character
  28 *                         (octets 0 - 31) and DEL (127)>
  29 *        CR             = <US-ASCII CR, carriage return (13)>
  30 *        LF             = <US-ASCII LF, linefeed (10)>
  31 *        SP             = <US-ASCII SP, space (32)>
  32 *        HT             = <US-ASCII HT, horizontal-tab (9)>
  33 *        <">            = <US-ASCII double-quote mark (34)>
  34 *
  35 *    HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all
  36 *    protocol elements except the entity-body (see appendix 19.3 for
  37 *    tolerant applications). The end-of-line marker within an entity-body
  38 *    is defined by its associated media type, as described in section 3.7.
  39 *
  40 *        CRLF           = CR LF
  41 *
  42 *    HTTP/1.1 header field values can be folded onto multiple lines if the
  43 *    continuation line begins with a space or horizontal tab. All linear
  44 *    white space, including folding, has the same semantics as SP. A
  45 *    recipient MAY replace any linear white space with a single SP before
  46 *    interpreting the field value or forwarding the message downstream.
  47 *
  48 *        LWS            = [CRLF] 1*( SP | HT )
  49 *
  50 *    The TEXT rule is only used for descriptive field contents and values
  51 *    that are not intended to be interpreted by the message parser. Words
  52 *    of *TEXT MAY contain characters from character sets other than ISO-
  53 *    8859-1 [22] only when encoded according to the rules of RFC 2047
  54 *    [14].
  55 *
  56 *        TEXT           = <any OCTET except CTLs,
  57 *                         but including LWS>
  58 *
  59 *    A CRLF is allowed in the definition of TEXT only as part of a header
  60 *    field continuation. It is expected that the folding LWS will be
  61 *    replaced with a single SP before interpretation of the TEXT value.
  62 *
  63 *    Hexadecimal numeric characters are used in several protocol elements.
  64 *
  65 *        HEX            = "A" | "B" | "C" | "D" | "E" | "F"
  66 *                       | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT
  67 *
  68 *    Many HTTP/1.1 header field values consist of words separated by LWS
  69 *    or special characters. These special characters MUST be in a quoted
  70 *    string to be used within a parameter value (as defined in section
  71 *    3.6).
  72 *
  73 *        token          = 1*<any CHAR except CTLs or separators>
  74 *        separators     = "(" | ")" | "<" | ">" | "@"
  75 *                       | "," | ";" | ":" | "\" | <">
  76 *                       | "/" | "[" | "]" | "?" | "="
  77 *                       | "{" | "}" | SP | HT
  78 *
  79 *    Comments can be included in some HTTP header fields by surrounding
  80 *    the comment text with parentheses. Comments are only allowed in
  81 *    fields containing "comment" as part of their field value definition.
  82 *    In all other fields, parentheses are considered part of the field
  83 *    value.
  84 *
  85 *        comment        = "(" *( ctext | quoted-pair | comment ) ")"
  86 *        ctext          = <any TEXT excluding "(" and ")">
  87 *
  88 *    A string of text is parsed as a single word if it is quoted using
  89 *    double-quote marks.
  90 *
  91 *        quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
  92 *        qdtext         = <any TEXT except <">>
  93 *
  94 *    The backslash character ("\") MAY be used as a single-character
  95 *    quoting mechanism only within quoted-string and comment constructs.
  96 *
  97 *        quoted-pair    = "\" CHAR
  98 */
  99
 100 /* FIXME: bug 549
 101  *
 102  * HTTP/1.1 header continuation lines are not honoured.
 103  * DEL char is accepted in TEXT part.
 104  * HT char is not accepted in TEXT part.
  105  * LF alone does not mark the end of a line; CRLF is the correct termination.
 106  * CR or LF are invalid in header line.
 107  *
 108  * Mozilla, IE, NS tolerate header value separator different from ':'
 109  * Examples:
 110  * name: value
 111  * name value
 112  * name :value
 113  * name=value
 114  */
 115
 116 #define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
 117
 118 unsigned char *
 119 parse_header(unsigned char *head, unsigned char *item, unsigned char **ptr)
 120 {
 121         unsigned char *pos = head;
 122
 123         if (!pos) return NULL;
 124
 125         while (*pos) {
 126                 unsigned char *end, *itempos, *value;
 127                 int len;
 128
 129                 /* Go for a newline. */
 130                 while (*pos && *pos != ASCII_LF) pos++;
 131                 if (!*pos) break;
 132                 pos++; /* Start of line now. */
 133
 134                 /* Does item match header line ? */
 135                 for (itempos = item; *itempos && *pos; itempos++, pos++)
 136                         if (toupper(*itempos) != toupper(*pos))
 137                                 break;
 138
 139                 if (!*pos) break; /* Nothing left to parse. */
 140                 if (*itempos) continue; /* Do not match. */
 141
 142                 /* Be tolerant: we accept headers with
 143                  * weird syntax, since most browsers does it
 144                  * anyway, ie:
 145                  * name value
 146                  * name :value
 147                  * name = value
 148                  * name[TAB]:[TAB]value */
 149
 150                 end = pos;
 151
 152                 /* Skip leading whitespaces if any. */
 153                 while (LWS(*pos)) pos++;
 154                 if (!*pos) break; /* Nothing left to parse. */
 155
 156                 /* Eat ':' or '=' if any. */
 157                 if (*pos == ':' || *pos == '=') pos++;
 158                 if (!*pos) break; /* Nothing left to parse. */
 159
 160                 /* Skip whitespaces after separator if any. */
 161                 while (LWS(*pos)) pos++;
 162                 if (!*pos) break; /* Nothing left to parse. */
 163
 164                 if (pos == end) continue; /* Not an exact match (substring). */
 165
 166                 /* Find the end of line/string.
 167                  * We fail on control chars and DEL char. */
 168                 end = pos;
 169                 while (*end != ASCII_DEL && (*end > ' ' || LWS(*end))) end++;
 170                 if (!*end) break; /* No end of line, nothing left to parse. */
 171
 172                 /* Ignore line if we encountered an unexpected char. */
 173                 if (*end != ASCII_CR && *end != ASCII_LF) continue;
 174
 175                 /* Strip trailing whitespaces. */
 176                 while (end > pos && LWS(end[-1])) end--;
 177
 178                 len = end - pos;
 179                 assert(len >= 0);
 180                 if_assert_failed break;
 181
 182                 if (!len) continue;     /* Empty value. */
 183
 184                 value = memacpy(pos, len);
 185                 if (!value) break; /* Allocation failure, stop here. */
 186
 187                 if (ptr) *ptr = pos;
 188                 return value;
 189         }
 190
 191         return NULL;
 192 }
 193
 194 /* Extract the value of name part of the value of attribute content.
 195  * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1"
 196  * will store in *@ret an allocated string containing "iso-8859-1".
 197  * It supposes that separator is ';' and ignore first element in the
 198  * list. (ie. '1' is ignored in "1; URL=xxx")
 199  * The return value is one of:
 200  *
 201  * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
 202  * - HEADER_PARAM_NOT_FOUND: the parameter is not there.  *@ret is now NULL.
 203  * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
 204  *
 205  * If @ret is NULL, then this function doesn't actually access *@ret,
 206  * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY.  Some callers may
 207  * rely on this. */
 208 enum parse_header_param
 209 parse_header_param(unsigned char *str, unsigned char *name, unsigned char **ret)
 210 {
 211         unsigned char *p = str;
 212         int namelen, plen = 0;
 213
 214         if (ret) *ret = NULL;   /* default in case of early return */
 215
 216         assert(str && name && *name);
 217         if_assert_failed return HEADER_PARAM_NOT_FOUND;
 218
 219         /* Returns now if string @str is empty. */
 220         if (!*p) return HEADER_PARAM_NOT_FOUND;
 221
 222         namelen = strlen(name);
 223         do {
 224                 p = strchr(p, ';');
 225                 if (!p) return HEADER_PARAM_NOT_FOUND;
 226
 227                 while (*p && (*p == ';' || *p <= ' ')) p++;
 228                 if (strlen(p) < namelen) return HEADER_PARAM_NOT_FOUND;
 229         } while (strncasecmp(p, name, namelen));
 230
 231         p += namelen;
 232
 233         while (*p && (*p <= ' ' || *p == '=')) p++;
 234         if (!*p) {
 235                 if (ret) {
 236                         *ret = stracpy("");
 237                         if (!*ret)
 238                                 return HEADER_PARAM_OUT_OF_MEMORY;
 239                 }
 240                 return HEADER_PARAM_FOUND;
 241         }
 242
 243         while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++;
 244
 245         /* Trim ending spaces */
 246         while (plen > 0 && LWS(p[plen - 1])) plen--;
 247
 248         /* XXX: Drop enclosing single quotes if there's some.
 249          *
 250          * Some websites like newsnow.co.uk are using single quotes around url
 251          * in URL field in meta tag content attribute like this:
 252          * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
 253          *
 254          * This is an attempt to handle that, but it may break something else.
 255          * We drop all pair of enclosing quotes found (eg. '''url''' => url).
 256          * Please report any issue related to this. --Zas */
 257         while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') {
 258                 p++;
 259                 plen -= 2;
 260         }
 261
 262         if (ret) {
 263                 *ret = memacpy(p, plen);
 264                 if (!*ret)
 265                         return HEADER_PARAM_OUT_OF_MEMORY;
 266         }
 267         return HEADER_PARAM_FOUND;
 268 }
 269
 270 /* Parse string param="value", return value as new string or NULL if any
 271  * error. */
 272 unsigned char *
 273 get_header_param(unsigned char *e, unsigned char *name)
 274 {
 275         unsigned char *n, *start;
 276
 277 again:
 278         while (*e && toupper(*e++) != toupper(*name));
 279         if (!*e) return NULL;
 280
 281         n = name + 1;
 282         while (*n && toupper(*e) == toupper(*n)) e++, n++;
 283         if (*n) goto again;
 284
 285         skip_space(e);
 286         if (*e++ != '=') return NULL;
 287
 288         skip_space(e);
 289         start = e;
 290
 291         if (!isquote(*e)) {
 292                 skip_nonspace(e);
 293         } else {
 294                 unsigned char uu = *e++;
 295
 296                 start++;
 297                 while (*e != uu) {
 298                         if (!*e) return NULL;
 299                         e++;
 300                 }
 301         }
 302
 303         while (start < e && *start == ' ') start++;
 304         while (start < e && *(e - 1) == ' ') e--;
 305         if (start == e) return NULL;
 306
 307         n = mem_alloc(e - start + 1);
 308         if (n) {
 309                 int i = 0;
 310
 311                 while (start < e) {
 312                         n[i++] = (*start < ' ') ? '.' : *start;
 313                         start++;
 314                 }
 315                 n[i] = '\0';
 316         }
 317
 318         return n;
 319 }