move-link-prev(next)-line: Typo with cut-n-paste. s/line/last/.
[elinks.git] / src / protocol / header.c
blobae5272a20852ed4a2228db29613e0f734ab4ac81
1 /* Parser of HTTP headers */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <string.h>
9 #include "elinks.h"
11 #include "protocol/header.h"
12 #include "util/conv.h"
13 #include "util/error.h"
14 #include "util/memory.h"
15 #include "util/string.h"
18 * RFC 2616 HTTP/1.1 June 1999
21 * OCTET = <any 8-bit sequence of data>
22 * CHAR = <any US-ASCII character (octets 0 - 127)>
23 * UPALPHA = <any US-ASCII uppercase letter "A".."Z">
24 * LOALPHA = <any US-ASCII lowercase letter "a".."z">
25 * ALPHA = UPALPHA | LOALPHA
26 * DIGIT = <any US-ASCII digit "0".."9">
27 * CTL = <any US-ASCII control character
28 * (octets 0 - 31) and DEL (127)>
29 * CR = <US-ASCII CR, carriage return (13)>
30 * LF = <US-ASCII LF, linefeed (10)>
31 * SP = <US-ASCII SP, space (32)>
32 * HT = <US-ASCII HT, horizontal-tab (9)>
33 * <"> = <US-ASCII double-quote mark (34)>
35 * HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all
36 * protocol elements except the entity-body (see appendix 19.3 for
37 * tolerant applications). The end-of-line marker within an entity-body
38 * is defined by its associated media type, as described in section 3.7.
40 * CRLF = CR LF
42 * HTTP/1.1 header field values can be folded onto multiple lines if the
43 * continuation line begins with a space or horizontal tab. All linear
44 * white space, including folding, has the same semantics as SP. A
45 * recipient MAY replace any linear white space with a single SP before
46 * interpreting the field value or forwarding the message downstream.
48 * LWS = [CRLF] 1*( SP | HT )
50 * The TEXT rule is only used for descriptive field contents and values
51 * that are not intended to be interpreted by the message parser. Words
52 * of *TEXT MAY contain characters from character sets other than ISO-
53 * 8859-1 [22] only when encoded according to the rules of RFC 2047
54 * [14].
56 * TEXT = <any OCTET except CTLs,
57 * but including LWS>
59 * A CRLF is allowed in the definition of TEXT only as part of a header
60 * field continuation. It is expected that the folding LWS will be
61 * replaced with a single SP before interpretation of the TEXT value.
63 * Hexadecimal numeric characters are used in several protocol elements.
65 * HEX = "A" | "B" | "C" | "D" | "E" | "F"
66 * | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT
68 * Many HTTP/1.1 header field values consist of words separated by LWS
69 * or special characters. These special characters MUST be in a quoted
70 * string to be used within a parameter value (as defined in section
71 * 3.6).
73 * token = 1*<any CHAR except CTLs or separators>
74 * separators = "(" | ")" | "<" | ">" | "@"
75 * | "," | ";" | ":" | "\" | <">
76 * | "/" | "[" | "]" | "?" | "="
77 * | "{" | "}" | SP | HT
79 * Comments can be included in some HTTP header fields by surrounding
80 * the comment text with parentheses. Comments are only allowed in
81 * fields containing "comment" as part of their field value definition.
82 * In all other fields, parentheses are considered part of the field
83 * value.
85 * comment = "(" *( ctext | quoted-pair | comment ) ")"
86 * ctext = <any TEXT excluding "(" and ")">
88 * A string of text is parsed as a single word if it is quoted using
89 * double-quote marks.
91 * quoted-string = ( <"> *(qdtext | quoted-pair ) <"> )
92 * qdtext = <any TEXT except <">>
94 * The backslash character ("\") MAY be used as a single-character
95 * quoting mechanism only within quoted-string and comment constructs.
97 * quoted-pair = "\" CHAR
100 /* FIXME: bug 549
102 * HTTP/1.1 header continuation lines are not honoured.
103 * DEL char is accepted in TEXT part.
104 * HT char is not accepted in TEXT part.
105 * LF alone does not mark the end of a line; CRLF is the correct termination.
106 * CR or LF are invalid in header line.
108 * Mozilla, IE, NS tolerate header value separator different from ':'
109 * Examples:
110 * name: value
111 * name value
112 * name :value
113 * name=value
/* LWS(c): non-zero when @c is a space or ASCII_TAB, i.e. the "SP | HT"
 * part of the RFC 2616 LWS rule quoted above. The optional CRLF folding
 * prefix of that rule is not handled by this macro.
 * Note: @c may be evaluated twice, so do not pass an expression with
 * side effects. */
116 #define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
118 unsigned char *
119 parse_header(unsigned char *head, unsigned char *item, unsigned char **ptr)
121 unsigned char *pos = head;
123 if (!pos) return NULL;
125 while (*pos) {
126 unsigned char *end, *itempos, *value;
127 int len;
129 /* Go for a newline. */
130 while (*pos && *pos != ASCII_LF) pos++;
131 if (!*pos) break;
132 pos++; /* Start of line now. */
134 /* Does item match header line ? */
135 for (itempos = item; *itempos && *pos; itempos++, pos++)
136 if (toupper(*itempos) != toupper(*pos))
137 break;
139 if (!*pos) break; /* Nothing left to parse. */
140 if (*itempos) continue; /* Do not match. */
142 /* Be tolerant: we accept headers with
143 * weird syntax, since most browsers does it
144 * anyway, ie:
145 * name value
146 * name :value
147 * name = value
148 * name[TAB]:[TAB]value */
150 end = pos;
152 /* Skip leading whitespaces if any. */
153 while (LWS(*pos)) pos++;
154 if (!*pos) break; /* Nothing left to parse. */
156 /* Eat ':' or '=' if any. */
157 if (*pos == ':' || *pos == '=') pos++;
158 if (!*pos) break; /* Nothing left to parse. */
160 /* Skip whitespaces after separator if any. */
161 while (LWS(*pos)) pos++;
162 if (!*pos) break; /* Nothing left to parse. */
164 if (pos == end) continue; /* Not an exact match (substring). */
166 /* Find the end of line/string.
167 * We fail on control chars and DEL char. */
168 end = pos;
169 while (*end != ASCII_DEL && (*end > ' ' || LWS(*end))) end++;
170 if (!*end) break; /* No end of line, nothing left to parse. */
172 /* Ignore line if we encountered an unexpected char. */
173 if (*end != ASCII_CR && *end != ASCII_LF) continue;
175 /* Strip trailing whitespaces. */
176 while (end > pos && LWS(end[-1])) end--;
178 len = end - pos;
179 assert(len >= 0);
180 if_assert_failed break;
182 if (!len) continue; /* Empty value. */
184 value = memacpy(pos, len);
185 if (!value) break; /* Allocation failure, stop here. */
187 if (ptr) *ptr = pos;
188 return value;
191 return NULL;
194 /* Extract the value of name part of the value of attribute content.
195 * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1"
196 * will store in *@ret an allocated string containing "iso-8859-1".
197 * It supposes that separator is ';' and ignore first element in the
198 * list. (ie. '1' is ignored in "1; URL=xxx")
199 * The return value is one of:
201 * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
202 * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL.
203 * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
205 * If @ret is NULL, then this function doesn't actually access *@ret,
206 * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may
207 * rely on this. */
208 enum parse_header_param
209 parse_header_param(unsigned char *str, unsigned char *name, unsigned char **ret)
211 unsigned char *p = str;
212 int namelen, plen = 0;
214 if (ret) *ret = NULL; /* default in case of early return */
216 assert(str && name && *name);
217 if_assert_failed return HEADER_PARAM_NOT_FOUND;
219 /* Returns now if string @str is empty. */
220 if (!*p) return HEADER_PARAM_NOT_FOUND;
222 namelen = strlen(name);
223 do {
224 p = strchr(p, ';');
225 if (!p) return HEADER_PARAM_NOT_FOUND;
227 while (*p && (*p == ';' || *p <= ' ')) p++;
228 if (strlen(p) < namelen) return HEADER_PARAM_NOT_FOUND;
229 } while (strncasecmp(p, name, namelen));
231 p += namelen;
233 while (*p && (*p <= ' ' || *p == '=')) p++;
234 if (!*p) {
235 if (ret) {
236 *ret = stracpy("");
237 if (!*ret)
238 return HEADER_PARAM_OUT_OF_MEMORY;
240 return HEADER_PARAM_FOUND;
243 while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++;
245 /* Trim ending spaces */
246 while (plen > 0 && LWS(p[plen - 1])) plen--;
248 /* XXX: Drop enclosing single quotes if there's some.
250 * Some websites like newsnow.co.uk are using single quotes around url
251 * in URL field in meta tag content attribute like this:
252 * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
254 * This is an attempt to handle that, but it may break something else.
255 * We drop all pair of enclosing quotes found (eg. '''url''' => url).
256 * Please report any issue related to this. --Zas */
257 while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') {
258 p++;
259 plen -= 2;
262 if (ret) {
263 *ret = memacpy(p, plen);
264 if (!*ret)
265 return HEADER_PARAM_OUT_OF_MEMORY;
267 return HEADER_PARAM_FOUND;
/* Find name=value (the value may be enclosed in quotes) in @e and return
 * the value as a newly allocated string, or NULL on any error.
 *
 * @name is matched case-insensitively at any position of @e. Once it is
 * found, it must be followed (after optional whitespace) by '=',
 * otherwise NULL is returned. An unquoted value runs up to the next
 * whitespace; a quoted one runs up to the matching quote, and a missing
 * closing quote is an error. Surrounding spaces are trimmed and an empty
 * value yields NULL. Characters below ' ' are replaced with '.' in the
 * returned copy. */
unsigned char *
get_header_param(unsigned char *e, unsigned char *name)
{
	unsigned char *tail, *first, *out, *dst;

	for (;;) {
		/* Look for the first letter of @name; e ends up just
		 * past the matching character. */
		while (*e) {
			unsigned char c = *e++;

			if (toupper(c) == toupper(*name))
				break;
		}
		if (!*e) return NULL;

		/* Compare the remaining letters of @name. */
		tail = name + 1;
		while (*tail && toupper(*e) == toupper(*tail)) {
			e++;
			tail++;
		}

		/* Whole name matched; otherwise resume the search from
		 * the current position. */
		if (!*tail) break;
	}

	skip_space(e);
	if (*e++ != '=') return NULL;

	skip_space(e);
	first = e;

	if (isquote(*e)) {
		/* Quoted value: it starts after the opening quote and
		 * ends at the next occurrence of the same quote char. */
		unsigned char quote = *e;

		e++;
		first++;
		for (; *e != quote; e++) {
			if (!*e) return NULL;
		}
	} else {
		/* Bare value: it ends at the first whitespace. */
		skip_nonspace(e);
	}

	/* Trim spaces on both sides of [first, e). */
	while (first < e && *first == ' ') first++;
	while (first < e && e[-1] == ' ') e--;
	if (first == e) return NULL;

	out = mem_alloc(e - first + 1);
	if (!out) return NULL;

	/* Copy the value, replacing control characters with dots. */
	dst = out;
	for (; first < e; first++)
		*dst++ = (*first < ' ') ? '.' : *first;
	*dst = '\0';

	return out;
}