/* [doc] NEWS
 * [lighttpd.git] / src / burl.c
 * blob b62a5cd58c67fe3e3609bdc101e82920a3d5a9a1 */
1 #include "first.h"
2 #include "burl.h"
4 #include <string.h>
6 #include "buffer.h"
7 #include "base64.h"
/* uppercase hexadecimal digits, indexed by nibble value (0..15) */
static const char hex_chars_uc[] = "0123456789ABCDEF";

/* Lookup table indexed by byte value; a non-zero entry means the byte is
 * percent-encoded by the normalization routines in this file.
 * Marked: everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * The size is given explicitly so that indexing with any unsigned char value
 * stays in bounds even if an initializer row were ever lost (missing entries
 * would then be zero-filled instead of shortening the array). */
static const char encoded_chars_http_uri_reqd[256] = {
  /*
  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00 - 0F control chars */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10 - 1F */
  1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2F space " # % */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 30 - 3F < > */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 50 - 5F [ \ ] ^ */
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6F ` */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* 70 - 7F { | } DEL */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 - 8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 90 - 9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* A0 - AF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* B0 - BF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0 - CF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0 - DF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* E0 - EF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* F0 - FF */
};
/* Convert hex digit character c into its nibble value, stored in n.
 * Evaluates to non-zero when c is one of [0-9A-Fa-f] (n then holds 0..15)
 * and to zero otherwise (n then holds an out-of-range intermediate value).
 * c (char) and n (nibble) MUST be unsigned integer types.
 * c may be evaluated twice, so it must not have side effects. */
#define li_cton(c,n) \
  ( ((n) = (c) - '0') <= 9 \
    || ( ((n) = ((c) & 0xdf) - 'A') <= 5 && ((n) += 10) ) )
/* Evaluates to non-zero if byte b can never appear in valid UTF-8:
 *   0xC0, 0xC1  lead bytes of overlong encodings of 7-bit ASCII
 *   0xF5..0xFF  not valid in UTF-8
 * Other overlong multi-byte encodings are not detected.
 * https://en.wikipedia.org/wiki/UTF-8
 * b (byte) MUST be unsigned integer type; b may be evaluated twice. */
#define li_utf8_invalid_byte(b) ((((b) | 0x1) == 0xC1) || (b) > 0xF4)
/* Returns 1 if c is alphanumeric (per light_isalnum()) or one of - . _ ~
 * and 0 otherwise. */
static int burl_is_unreserved (const int c)
{
    if (light_isalnum(c)) return 1;

    switch (c) {
      case '-':
      case '.':
      case '_':
      case '~':
        return 1;
      default:
        return 0;
    }
}
52 static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
54 int j = i;
55 const int used = (int)buffer_string_length(b);
56 const unsigned char * const s = (unsigned char *)b->ptr;
57 unsigned char * const p =
58 (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
59 unsigned int n1, n2;
60 memcpy(p, s, (size_t)i);
61 for (; i < used; ++i, ++j) {
62 if (!encoded_chars_http_uri_reqd[s[i]]) {
63 if (s[i] == '?' && -1 == qs) qs = j;
64 p[j] = s[i];
66 else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
67 const unsigned int x = (n1 << 4) | n2;
68 if (burl_is_unreserved(x)) {
69 p[j] = x;
71 else {
72 p[j] = '%';
73 p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
74 p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
75 if (li_utf8_invalid_byte(x)) qs = -2;
77 i+=2;
79 else if (s[i] == '#') break; /* ignore fragment */
80 else {
81 p[j] = '%';
82 p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
83 p[++j] = hex_chars_uc[s[i] & 0xF];
84 if (li_utf8_invalid_byte(s[i])) qs = -2;
87 buffer_commit(t, (size_t)j);
88 buffer_copy_buffer(b, t);
89 return qs;
93 static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
95 const unsigned char * const s = (unsigned char *)b->ptr;
96 const int used = (int)buffer_string_length(b);
97 unsigned int n1, n2, x;
98 int qs = -1;
100 for (int i = 0; i < used; ++i) {
101 if (!encoded_chars_http_uri_reqd[s[i]]) {
102 if (s[i] == '?' && -1 == qs) qs = i;
104 else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
105 && !burl_is_unreserved((x = (n1 << 4) | n2))) {
106 if (li_utf8_invalid_byte(x)) qs = -2;
107 if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
108 if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
109 i+=2;
111 else if (s[i] == '#') { /* ignore fragment */
112 buffer_string_set_length(b, (size_t)i);
113 break;
115 else {
116 qs = burl_normalize_basic_unreserved_fix(b, t, i, qs);
117 break;
121 return qs;
125 static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
127 int j = i;
128 const int used = (int)buffer_string_length(b);
129 const unsigned char * const s = (unsigned char *)b->ptr;
130 unsigned char * const p =
131 (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
132 unsigned int n1, n2;
133 memcpy(p, s, (size_t)i);
134 for (; i < used; ++i, ++j) {
135 if (!encoded_chars_http_uri_reqd[s[i]]) {
136 if (s[i] == '?' && -1 == qs) qs = j;
137 p[j] = s[i];
139 else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
140 const unsigned int x = (n1 << 4) | n2;
141 if (!encoded_chars_http_uri_reqd[x]
142 && (qs < 0 ? (x!='/'&&x!='?') : (x!='&'&&x!='='&&x!=';'))) {
143 p[j] = x;
145 else {
146 p[j] = '%';
147 p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
148 p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
149 if (li_utf8_invalid_byte(x)) qs = -2;
151 i+=2;
153 else if (s[i] == '#') break; /* ignore fragment */
154 else {
155 p[j] = '%';
156 p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
157 p[++j] = hex_chars_uc[s[i] & 0xF];
158 if (li_utf8_invalid_byte(s[i])) qs = -2;
161 buffer_commit(t, (size_t)j);
162 buffer_copy_buffer(b, t);
163 return qs;
167 static int burl_normalize_basic_required (buffer *b, buffer *t)
169 const unsigned char * const s = (unsigned char *)b->ptr;
170 const int used = (int)buffer_string_length(b);
171 unsigned int n1, n2, x;
172 int qs = -1;
174 for (int i = 0; i < used; ++i) {
175 if (!encoded_chars_http_uri_reqd[s[i]]) {
176 if (s[i] == '?' && -1 == qs) qs = i;
178 else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
179 && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
180 ||(qs < 0 ? (x=='/'||x=='?') : (x=='&'||x=='='||x==';')))){
181 if (li_utf8_invalid_byte(x)) qs = -2;
182 if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
183 if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
184 i+=2;
186 else if (s[i] == '#') { /* ignore fragment */
187 buffer_string_set_length(b, (size_t)i);
188 break;
190 else {
191 qs = burl_normalize_basic_required_fix(b, t, i, qs);
192 break;
196 return qs;
200 static int burl_contains_ctrls (const buffer *b)
202 const char * const s = b->ptr;
203 const int used = (int)buffer_string_length(b);
204 for (int i = 0; i < used; ++i) {
205 if (s[i] == '%' && (s[i+1] < '2' || (s[i+1] == '7' && s[i+2] == 'F')))
206 return 1;
208 return 0;
212 static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
214 char * const s = b->ptr;
215 const int used = (int)buffer_string_length(b);
216 int j = i;
217 for (; i < used; ++i, ++j) {
218 s[j] = s[i];
219 if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') {
220 s[j] = '+';
221 i+=2;
224 buffer_string_set_length(b, j);
228 static void burl_normalize_qs20_to_plus (buffer *b, int qs)
230 const char * const s = b->ptr;
231 const int used = qs < 0 ? 0 : (int)buffer_string_length(b);
232 int i;
233 if (qs < 0) return;
234 for (i = qs+1; i < used; ++i) {
235 if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') break;
237 if (i != used) burl_normalize_qs20_to_plus_fix(b, i);
241 static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
243 char * const s = b->ptr;
244 const int blen = (int)buffer_string_length(b);
245 const int used = qs < 0 ? blen : qs;
246 int j = i;
247 for (; i < used; ++i, ++j) {
248 s[j] = s[i];
249 if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
250 s[j] = '/';
251 i+=2;
254 if (qs >= 0) {
255 const int qslen = blen - qs;
256 memmove(s+j, s+qs, (size_t)qslen);
257 qs = j;
258 j += qslen;
260 buffer_string_set_length(b, j);
261 return qs;
265 static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
267 /*("%2F" must already have been uppercased during normalization)*/
268 const char * const s = b->ptr;
269 const int used = qs < 0 ? (int)buffer_string_length(b) : qs;
270 for (int i = 0; i < used; ++i) {
271 if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
272 return (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
273 ? burl_normalize_2F_to_slash_fix(b, qs, i)
274 : -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
277 return qs;
281 static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
283 const unsigned char * const s = (unsigned char *)b->ptr;
284 const int used = (int)buffer_string_length(b);
285 int path_simplify = 0;
286 for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
287 if (s[i] == '.' && (s[i+1] != '.' || ++i)
288 && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) {
289 path_simplify = 1;
290 break;
292 while (i < len && s[i] != '/') ++i;
293 if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
294 path_simplify = 1;
295 break;
299 if (path_simplify) {
300 if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
301 if (qs >= 0) {
302 buffer_copy_string_len(t, b->ptr+qs, used - qs);
303 buffer_string_set_length(b, qs);
306 buffer_path_simplify(b, b);
308 if (qs >= 0) {
309 qs = (int)buffer_string_length(b);
310 buffer_append_string_len(b, CONST_BUF_LEN(t));
314 return qs;
318 int burl_normalize (buffer *b, buffer *t, int flags)
320 int qs;
322 #if defined(__WIN32) || defined(__CYGWIN__)
323 /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
324 * convert to '/' for consistency before percent-encoding
325 * normalization which will convert '\\' to "%5C" in the URL.
326 * (Clients still should not be sending '\\' unencoded in requests.) */
327 if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
328 for (char *p = b->ptr; *p != '?' && *p != '\0'; ++p) {
329 if (*p == '\\') *p = '/';
332 #endif
334 qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
335 ? burl_normalize_basic_required(b, t)
336 : burl_normalize_basic_unreserved(b, t);
337 if (-2 == qs) return -2;
339 if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) {
340 if (burl_contains_ctrls(b)) return -2;
343 if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
344 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
345 qs = burl_normalize_2F_to_slash(b, qs, flags);
346 if (-2 == qs) return -2;
349 if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
350 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
351 qs = burl_normalize_path(b, t, qs, flags);
352 if (-2 == qs) return -2;
355 if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) {
356 if (qs >= 0) burl_normalize_qs20_to_plus(b, qs);
359 return qs;
363 static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
365 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
366 * unless already percent-encoded (does not double-encode) */
367 /* Note: not checking for invalid UTF-8 */
368 char * const p = buffer_string_prepare_append(b, len*3);
369 unsigned int n1, n2;
370 int j = 0;
371 for (unsigned int i = 0; i < len; ++i, ++j) {
372 if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
373 const unsigned int x = (n1 << 4) | n2;
374 if (burl_is_unreserved((int)x)) {
375 p[j] = (char)x;
377 else { /* leave UTF-8, control chars, and required chars encoded */
378 p[j] = '%';
379 p[++j] = str[i+1];
380 p[++j] = str[i+2];
382 i+=2;
384 else if (burl_is_unreserved(str[i])) {
385 p[j] = str[i];
387 else {
388 p[j] = '%';
389 p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
390 p[++j] = hex_chars_uc[str[i] & 0xF];
393 buffer_commit(b, j);
397 static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
399 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ plus /
400 * unless already percent-encoded (does not double-encode) */
401 /* Note: not checking for invalid UTF-8 */
402 char * const p = buffer_string_prepare_append(b, len*3);
403 unsigned int n1, n2;
404 int j = 0;
405 for (unsigned int i = 0; i < len; ++i, ++j) {
406 if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
407 const unsigned int x = (n1 << 4) | n2;
408 if (burl_is_unreserved((int)x)) {
409 p[j] = (char)x;
411 else { /* leave UTF-8, control chars, and required chars encoded */
412 p[j] = '%';
413 p[++j] = str[i+1];
414 p[++j] = str[i+2];
416 i+=2;
418 else if (burl_is_unreserved(str[i]) || str[i] == '/') {
419 p[j] = str[i];
421 else {
422 p[j] = '%';
423 p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
424 p[++j] = hex_chars_uc[str[i] & 0xF];
427 buffer_commit(b, j);
431 static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
433 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
434 * Note: double-encodes any existing '%') */
435 /* Note: not checking for invalid UTF-8 */
436 char * const p = buffer_string_prepare_append(b, len*3);
437 int j = 0;
438 for (unsigned int i = 0; i < len; ++i, ++j) {
439 if (burl_is_unreserved(str[i])) {
440 p[j] = str[i];
442 else {
443 p[j] = '%';
444 p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
445 p[++j] = hex_chars_uc[str[i] & 0xF];
448 buffer_commit(b, j);
452 static void burl_offset_tolower (buffer * const b, const size_t off)
454 /*(skips over all percent-encodings, including encoding of alpha chars)*/
455 for (char *p = b->ptr+off; p[0]; ++p) {
456 if (p[0] >= 'A' && p[0] <= 'Z') p[0] |= 0x20;
457 else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
458 p+=2;
463 static void burl_offset_toupper (buffer * const b, const size_t off)
465 /*(skips over all percent-encodings, including encoding of alpha chars)*/
466 for (char *p = b->ptr+off; p[0]; ++p) {
467 if (p[0] >= 'a' && p[0] <= 'z') p[0] &= 0xdf;
468 else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
469 p+=2;
474 void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
476 size_t off = 0;
478 if (0 == len) return;
480 if (0 == flags) {
481 buffer_append_string_len(b, str, len);
482 return;
485 if (flags & (BURL_TOUPPER|BURL_TOLOWER)) off = buffer_string_length(b);
487 if (flags & BURL_ENCODE_NONE) {
488 buffer_append_string_len(b, str, len);
490 else if (flags & BURL_ENCODE_ALL) {
491 burl_append_encode_all(b, str, len);
493 else if (flags & BURL_ENCODE_NDE) {
494 burl_append_encode_nde(b, str, len);
496 else if (flags & BURL_ENCODE_PSNDE) {
497 burl_append_encode_psnde(b, str, len);
499 else if (flags & BURL_ENCODE_B64U) {
500 const unsigned char *s = (const unsigned char *)str;
501 buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
503 else if (flags & BURL_DECODE_B64U) {
504 buffer_append_base64_decode(b, str, len, BASE64_URL);
507 /* note: not normalizing str, which could come from arbitrary header,
508 * so it is possible that alpha chars are percent-encoded upper/lowercase */
509 if (flags & (BURL_TOLOWER|BURL_TOUPPER)) {
510 (flags & BURL_TOLOWER)
511 ? burl_offset_tolower(b, off) /*(flags & BURL_TOLOWER)*/
512 : burl_offset_toupper(b, off); /*(flags & BURL_TOUPPER)*/