/* Parser of HTTP headers */
#include <string.h>

#include "protocol/header.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/memory.h"
#include "util/string.h"
/*
 * RFC 2616 HTTP/1.1 June 1999
 *
 * Basic Rules (section 2.2)
 *
 * OCTET          = <any 8-bit sequence of data>
 * CHAR           = <any US-ASCII character (octets 0 - 127)>
 * UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
 * LOALPHA        = <any US-ASCII lowercase letter "a".."z">
 * ALPHA          = UPALPHA | LOALPHA
 * DIGIT          = <any US-ASCII digit "0".."9">
 * CTL            = <any US-ASCII control character
 *                  (octets 0 - 31) and DEL (127)>
 * CR             = <US-ASCII CR, carriage return (13)>
 * LF             = <US-ASCII LF, linefeed (10)>
 * SP             = <US-ASCII SP, space (32)>
 * HT             = <US-ASCII HT, horizontal-tab (9)>
 * <">            = <US-ASCII double-quote mark (34)>
 *
 * HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all
 * protocol elements except the entity-body (see appendix 19.3 for
 * tolerant applications). The end-of-line marker within an entity-body
 * is defined by its associated media type, as described in section 3.7.
 *
 * CRLF           = CR LF
 *
 * HTTP/1.1 header field values can be folded onto multiple lines if the
 * continuation line begins with a space or horizontal tab. All linear
 * white space, including folding, has the same semantics as SP. A
 * recipient MAY replace any linear white space with a single SP before
 * interpreting the field value or forwarding the message downstream.
 *
 * LWS            = [CRLF] 1*( SP | HT )
 *
 * The TEXT rule is only used for descriptive field contents and values
 * that are not intended to be interpreted by the message parser. Words
 * of *TEXT MAY contain characters from character sets other than ISO-
 * 8859-1 [22] only when encoded according to the rules of RFC 2047
 * [14].
 *
 * TEXT           = <any OCTET except CTLs,
 *                  but including LWS>
 *
 * A CRLF is allowed in the definition of TEXT only as part of a header
 * field continuation. It is expected that the folding LWS will be
 * replaced with a single SP before interpretation of the TEXT value.
 *
 * Hexadecimal numeric characters are used in several protocol elements.
 *
 * HEX            = "A" | "B" | "C" | "D" | "E" | "F"
 *                | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT
 *
 * Many HTTP/1.1 header field values consist of words separated by LWS
 * or special characters. These special characters MUST be in a quoted
 * string to be used within a parameter value (as defined in section
 * 3.6).
 *
 * token          = 1*<any CHAR except CTLs or separators>
 * separators     = "(" | ")" | "<" | ">" | "@"
 *                | "," | ";" | ":" | "\" | <">
 *                | "/" | "[" | "]" | "?" | "="
 *                | "{" | "}" | SP | HT
 *
 * Comments can be included in some HTTP header fields by surrounding
 * the comment text with parentheses. Comments are only allowed in
 * fields containing "comment" as part of their field value definition.
 * In all other fields, parentheses are considered part of the field
 * value.
 *
 * comment        = "(" *( ctext | quoted-pair | comment ) ")"
 * ctext          = <any TEXT excluding "(" and ")">
 *
 * A string of text is parsed as a single word if it is quoted using
 * double-quote marks.
 *
 * quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
 * qdtext         = <any TEXT except <">>
 *
 * The backslash character ("\") MAY be used as a single-character
 * quoting mechanism only within quoted-string and comment constructs.
 *
 * quoted-pair    = "\" CHAR
 */
/* Notes:
 *
 * HTTP/1.1 header continuation lines are not honoured.
 * DEL char is accepted in TEXT part.
 * HT char is not accepted in TEXT part.
 * LF alone does not mark the end of a line; CRLF is the correct termination.
 * CR or LF are invalid in a header line.
 *
 * Mozilla, IE, NS tolerate a header value separator other than ':'.
 */
/* True if @c is white space allowed inside a header line: a space or
 * ASCII_TAB.  The argument is evaluated twice, so it must not have side
 * effects. */
#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
118 /** Searches for a message-header with the specified field-name.
121 * Where to start searching in the message received from the server.
122 * This function actually ignores the line to which @a head points,
123 * and starts searching from the next line. Therefore, when parsing
124 * an HTTP message, @a head should initially point to the start-line,
125 * e.g. "HTTP/1.1 200 OK". Alternatively, if the caller has already
126 * found a message-header and wants to know if there are any more
127 * message-headers with the same field-name, then @a head can be the
128 * pointer that a previous call stored in *@a ptr.
130 * The field-name for which this function searches.
132 * If @a ptr is not NULL, and this function finds a message-header,
133 * then this function stores in *@a ptr the address at which the
134 * field-content begins; the caller may pass that as @a head in a
135 * later call. Otherwise, this function does not modify *@a ptr.
137 * NULL if not found or out of memory. Otherwise, a copy of the
138 * field-content of the message-header; the caller must eventually
141 * The terms message-header, field-name, start-line, and field-content
142 * are defined in RFC 2616 sections 4.1 and 4.2. */
144 parse_header(unsigned char *head
, const unsigned char *item
, unsigned char **ptr
)
146 unsigned char *pos
= head
;
148 if (!pos
) return NULL
;
151 unsigned char *end
, *value
;
152 const unsigned char *itempos
;
155 /* Go for a newline. */
156 while (*pos
&& *pos
!= ASCII_LF
) pos
++;
158 pos
++; /* Start of line now. */
160 /* Does item match header line ? */
161 for (itempos
= item
; *itempos
&& *pos
; itempos
++, pos
++)
162 if (c_toupper(*itempos
) != c_toupper(*pos
))
165 if (!*pos
) break; /* Nothing left to parse. */
166 if (*itempos
) continue; /* Do not match. */
168 /* Be tolerant: we accept headers with
169 * weird syntax, since most browsers does it
174 * name[TAB]:[TAB]value */
178 /* Skip leading whitespaces if any. */
179 while (LWS(*pos
)) pos
++;
180 if (!*pos
) break; /* Nothing left to parse. */
182 /* Eat ':' or '=' if any. */
183 if (*pos
== ':' || *pos
== '=') pos
++;
184 if (!*pos
) break; /* Nothing left to parse. */
186 /* Skip whitespaces after separator if any. */
187 while (LWS(*pos
)) pos
++;
188 if (!*pos
) break; /* Nothing left to parse. */
190 if (pos
== end
) continue; /* Not an exact match (substring). */
192 /* Find the end of line/string.
193 * We fail on control chars and DEL char. */
195 while (*end
!= ASCII_DEL
&& (*end
> ' ' || LWS(*end
))) end
++;
196 if (!*end
) break; /* No end of line, nothing left to parse. */
198 /* Ignore line if we encountered an unexpected char. */
199 if (*end
!= ASCII_CR
&& *end
!= ASCII_LF
) continue;
201 /* Strip trailing whitespaces. */
202 while (end
> pos
&& LWS(end
[-1])) end
--;
206 if_assert_failed
break;
208 if (!len
) continue; /* Empty value. */
210 value
= memacpy(pos
, len
);
211 if (!value
) break; /* Allocation failure, stop here. */
220 /* Extract the value of name part of the value of attribute content.
221 * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1"
222 * will store in *@ret an allocated string containing "iso-8859-1".
223 * It supposes that separator is ';' and ignore first element in the
224 * list. (ie. '1' is ignored in "1; URL=xxx")
225 * The return value is one of:
227 * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
228 * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL.
229 * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
231 * If @ret is NULL, then this function doesn't actually access *@ret,
232 * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may
234 enum parse_header_param
235 parse_header_param(unsigned char *str
, unsigned char *name
, unsigned char **ret
, int content_disposition
)
237 unsigned char *p
= str
;
238 int namelen
, plen
= 0;
240 if (ret
) *ret
= NULL
; /* default in case of early return */
242 assert(str
&& name
&& *name
);
243 if_assert_failed
return HEADER_PARAM_NOT_FOUND
;
245 /* Returns now if string @str is empty. */
246 if (!*p
) return HEADER_PARAM_NOT_FOUND
;
248 namelen
= strlen(name
);
250 if (!content_disposition
) {
251 p
= strchr((const char *)p
, ';');
252 if (!p
) return HEADER_PARAM_NOT_FOUND
;
255 while (*p
&& (*p
== ';' || *p
<= ' ')) p
++;
256 if (strlen(p
) < namelen
) return HEADER_PARAM_NOT_FOUND
;
257 } while (c_strncasecmp(p
, name
, namelen
));
261 while (*p
&& (*p
<= ' ' || *p
== '=')) p
++;
266 return HEADER_PARAM_OUT_OF_MEMORY
;
268 return HEADER_PARAM_FOUND
;
271 while ((p
[plen
] > ' ' || LWS(p
[plen
])) && p
[plen
] != ';') plen
++;
273 /* Trim ending spaces */
274 while (plen
> 0 && LWS(p
[plen
- 1])) plen
--;
276 /* XXX: Drop enclosing single quotes if there's some.
278 * Some websites like newsnow.co.uk are using single quotes around url
279 * in URL field in meta tag content attribute like this:
280 * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
282 * This is an attempt to handle that, but it may break something else.
283 * We drop all pair of enclosing quotes found (eg. '''url''' => url).
284 * Please report any issue related to this. --Zas */
285 while (plen
> 1 && *p
== '\'' && p
[plen
- 1] == '\'') {
291 *ret
= memacpy(p
, plen
);
293 return HEADER_PARAM_OUT_OF_MEMORY
;
295 return HEADER_PARAM_FOUND
;
/* Parse string param="value", return value as new string or NULL if any
 * error occurs. */
301 get_header_param(unsigned char *e
, unsigned char *name
)
303 unsigned char *n
, *start
;
306 while (*e
&& c_toupper(*e
++) != c_toupper(*name
));
307 if (!*e
) return NULL
;
310 while (*n
&& c_toupper(*e
) == c_toupper(*n
)) e
++, n
++;
314 if (*e
++ != '=') return NULL
;
322 unsigned char uu
= *e
++;
326 if (!*e
) return NULL
;
331 while (start
< e
&& *start
== ' ') start
++;
332 while (start
< e
&& *(e
- 1) == ' ') e
--;
333 if (start
== e
) return NULL
;
335 n
= mem_alloc(e
- start
+ 1);
340 n
[i
++] = (*start
< ' ') ? '.' : *start
;