/* Uppercase hexadecimal digits, indexed by nibble value (0x0 - 0xF).
 * Used to emit the two hex digits of a percent-encoded byte. */
static const char hex_chars_uc[] = "0123456789ABCDEF";
/* Lookup table indexed by byte value (0x00 - 0xFF):
 *   1 = byte is flagged as requiring percent-encoding
 *   0 = byte may be kept as-is
 * Flagged is everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * (note that '%' and '#' are flagged; callers special-case them). */
static const char encoded_chars_http_uri_reqd[] = {
  /*
  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  00 -  0F control chars */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  10 -  1F */
  1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  20 -  2F space " # % */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,  /*  30 -  3F < > */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  40 -  4F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,  /*  50 -  5F [ \ ] ^ */
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  60 -  6F ` */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,  /*  70 -  7F { | } DEL */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  80 -  8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  90 -  9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  A0 -  AF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  B0 -  BF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  C0 -  CF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  D0 -  DF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  E0 -  EF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1   /*  F0 -  FF */
};
/* li_cton(c,n): convert hex-digit character c to its nibble value.
 * Evaluates to non-zero if c is one of 0-9, A-F, a-f, in which case n holds
 * the value 0-15.  Evaluates to zero otherwise, and n is left clobbered.
 * The test relies on unsigned wrap-around: a character below '0' (or below
 * 'A' after case folding with 0xdf) wraps to a large value and fails the
 * <= comparison.
 * Note: c is evaluated twice and n is assigned; pass side-effect-free lvalues.
 * c (char) and n (nibble) MUST be unsigned integer types */
#define li_cton(c,n) \
  (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
/* li_utf8_invalid_byte(b): non-zero if byte value b can never appear in
 * well-formed UTF-8: 0xC0 and 0xC1 (matched by (b|1) == 0xC1) and
 * 0xF5 - 0xFF.
 * b (byte) MUST be unsigned integer type
 * https://en.wikipedia.org/wiki/UTF-8
 * reject overlong encodings of 7-bit ASCII and invalid UTF-8
 * (but does not detect other overlong multi-byte encodings) */
#define li_utf8_invalid_byte(b) ((b) >= 0xF5 || ((b)|0x1) == 0xC1)
/* Returns non-zero if c is alphanumeric (per light_isalnum, a helper defined
 * outside this file section) or one of '-', '.', '_', '~'; zero otherwise.
 * This is the set of characters the encoders below leave un-escaped. */
static int burl_is_unreserved (const int c)
{
    return (light_isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~');
}
/* burl_normalize_basic_unreserved_fix: rewriting pass of the "unreserved"
 * normalization.  Writes a rewritten copy of b into scratch buffer t,
 * starting at offset i (the first i bytes are copied verbatim), then copies
 * t back over b.
 *   b  - URL buffer; read through b->ptr and finally overwritten from t
 *   t  - scratch buffer; room for i+(used-i)*3+1 bytes is requested
 *        (presumably so each remaining byte can grow into a 3-byte escape)
 *   i  - input offset at which rewriting starts
 *   qs - query-string marker carried in from the caller: set to the output
 *        offset j of the first '?' while it is still -1, and set to -2 when
 *        li_utf8_invalid_byte() flags a byte
 * NOTE(review): the listing numbers embedded in this block skip (52 -> 55,
 * 58 -> 60, 63 -> 66, 68 -> 73, 75 -> 79, 79 -> 82, 84 -> 87), so some
 * original lines are not visible here -- including the declarations of j,
 * n1 and n2, the branch taken when burl_is_unreserved(x) is true, and any
 * return statement.  Confirm against the complete file. */
52 static int burl_normalize_basic_unreserved_fix (buffer
*b
, buffer
*t
, int i
, int qs
)
55 const int used
= (int)buffer_string_length(b
);
56 const unsigned char * const s
= (unsigned char *)b
->ptr
;
57 unsigned char * const p
=
58 (unsigned char *)buffer_string_prepare_copy(t
,i
+(used
-i
)*3+1);
/* copy the first i bytes of b to the scratch area unchanged */
60 memcpy(p
, s
, (size_t)i
);
61 for (; i
< used
; ++i
, ++j
) {
/* byte not flagged in the table: remember first '?' by its OUTPUT offset */
62 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
63 if (s
[i
] == '?' && -1 == qs
) qs
= j
;
/* '%' followed by two hex digits: x is the decoded byte value */
66 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)) {
67 const unsigned int x
= (n1
<< 4) | n2
;
68 if (burl_is_unreserved(x
)) {
/* emit both hex digits in uppercase via hex_chars_uc */
73 p
[++j
] = hex_chars_uc
[n1
]; /*(s[i+1] & 0xdf)*/
74 p
[++j
] = hex_chars_uc
[n2
]; /*(s[i+2] & 0xdf)*/
75 if (li_utf8_invalid_byte(x
)) qs
= -2;
79 else if (s
[i
] == '#') break; /* ignore fragment */
/* emit the raw byte's high and low nibble as uppercase hex digits */
82 p
[++j
] = hex_chars_uc
[(s
[i
] >> 4) & 0xF];
83 p
[++j
] = hex_chars_uc
[s
[i
] & 0xF];
84 if (li_utf8_invalid_byte(s
[i
])) qs
= -2;
/* hand the j written bytes to t, then (presumably) copy t into b */
87 buffer_commit(t
, (size_t)j
);
88 buffer_copy_buffer(b
, t
);
/* burl_normalize_basic_unreserved: in-place scan of b for the "unreserved"
 * normalization; t is only passed on to the _fix helper.
 * Visible handling of each byte s[i]:
 *   - byte not flagged in encoded_chars_http_uri_reqd: left alone; the index
 *     of the first '?' is stored in qs while qs is still -1
 *   - "%XX" whose decoded value x is NOT unreserved: hex digits are
 *     uppercased in place (clearing bit 0x20); qs becomes -2 if x is an
 *     invalid UTF-8 byte
 *   - '#': b is truncated at i, dropping the fragment
 *   - after the '#' branch comes a call to
 *     burl_normalize_basic_unreserved_fix(b, t, i, qs), presumably the final
 *     else for every other byte (including "%XX" that decodes to an
 *     unreserved character, which fails the condition above)
 * NOTE(review): listing numbers skip (93 -> 95, 97 -> 100, 102 -> 104,
 * 108 -> 111, 112 -> 116), so the declaration/initial value of qs, the loop
 * exits and the return statement are not visible here. */
93 static int burl_normalize_basic_unreserved (buffer
*b
, buffer
*t
)
95 const unsigned char * const s
= (unsigned char *)b
->ptr
;
96 const int used
= (int)buffer_string_length(b
);
97 unsigned int n1
, n2
, x
;
100 for (int i
= 0; i
< used
; ++i
) {
101 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
102 if (s
[i
] == '?' && -1 == qs
) qs
= i
;
/* x is assigned inside the condition so the body can test it */
104 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)
105 && !burl_is_unreserved((x
= (n1
<< 4) | n2
))) {
106 if (li_utf8_invalid_byte(x
)) qs
= -2;
107 if (s
[i
+1] >= 'a') b
->ptr
[i
+1] &= 0xdf; /* uppercase hex */
108 if (s
[i
+2] >= 'a') b
->ptr
[i
+2] &= 0xdf; /* uppercase hex */
111 else if (s
[i
] == '#') { /* ignore fragment */
112 buffer_string_set_length(b
, (size_t)i
);
116 qs
= burl_normalize_basic_unreserved_fix(b
, t
, i
, qs
);
/* burl_normalize_basic_required_fix: rewriting pass of the "required"
 * normalization.  Same shape as burl_normalize_basic_unreserved_fix: the
 * first i bytes of b are copied verbatim into scratch buffer t, the rest is
 * rewritten byte by byte, and t is then copied back over b.
 * A "%XX" escape is singled out when its decoded value x is not flagged in
 * encoded_chars_http_uri_reqd AND is not a delimiter for its position:
 *   - while qs < 0 (no '?' seen yet): x is neither '/' nor '?'
 *   - otherwise (inside the query): x is none of '&', '=', ';'
 * (presumably that case is decoded; the statements are not visible).  The
 * visible statements re-emit hex digits in uppercase and set qs to -2 on an
 * invalid UTF-8 byte.
 *   qs - set to output offset j of the first '?' while it is still -1
 * NOTE(review): listing numbers skip (125 -> 128, 131 -> 133, 136 -> 139,
 * 142 -> 147, 149 -> 153, 153 -> 156, 158 -> 161), so the declarations of
 * j, n1 and n2, the body of the decode branch, and any return statement are
 * not visible here. */
125 static int burl_normalize_basic_required_fix (buffer
*b
, buffer
*t
, int i
, int qs
)
128 const int used
= (int)buffer_string_length(b
);
129 const unsigned char * const s
= (unsigned char *)b
->ptr
;
130 unsigned char * const p
=
131 (unsigned char *)buffer_string_prepare_copy(t
,i
+(used
-i
)*3+1);
/* copy the first i bytes of b to the scratch area unchanged */
133 memcpy(p
, s
, (size_t)i
);
134 for (; i
< used
; ++i
, ++j
) {
135 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
136 if (s
[i
] == '?' && -1 == qs
) qs
= j
;
139 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)) {
140 const unsigned int x
= (n1
<< 4) | n2
;
/* delimiter set depends on whether a '?' has been seen (qs >= 0) */
141 if (!encoded_chars_http_uri_reqd
[x
]
142 && (qs
< 0 ? (x
!='/'&&x
!='?') : (x
!='&'&&x
!='='&&x
!=';'))) {
/* emit both hex digits in uppercase via hex_chars_uc */
147 p
[++j
] = hex_chars_uc
[n1
]; /*(s[i+1] & 0xdf)*/
148 p
[++j
] = hex_chars_uc
[n2
]; /*(s[i+2] & 0xdf)*/
149 if (li_utf8_invalid_byte(x
)) qs
= -2;
153 else if (s
[i
] == '#') break; /* ignore fragment */
/* emit the raw byte's high and low nibble as uppercase hex digits */
156 p
[++j
] = hex_chars_uc
[(s
[i
] >> 4) & 0xF];
157 p
[++j
] = hex_chars_uc
[s
[i
] & 0xF];
158 if (li_utf8_invalid_byte(s
[i
])) qs
= -2;
/* hand the j written bytes to t, then (presumably) copy t into b */
161 buffer_commit(t
, (size_t)j
);
162 buffer_copy_buffer(b
, t
);
/* burl_normalize_basic_required: in-place scan of b for the "required"
 * normalization; t is only passed on to the _fix helper.
 * Visible handling of each byte s[i]:
 *   - byte not flagged in encoded_chars_http_uri_reqd: left alone; the index
 *     of the first '?' is stored in qs while qs is still -1
 *   - "%XX" that must stay encoded -- decoded value x is flagged in the
 *     table, or is '/' or '?' before the query (qs < 0), or is '&', '=' or
 *     ';' inside the query: hex digits are uppercased in place; qs becomes
 *     -2 if x is an invalid UTF-8 byte
 *   - '#': b is truncated at i, dropping the fragment
 *   - after the '#' branch comes a call to
 *     burl_normalize_basic_required_fix(b, t, i, qs), presumably the final
 *     else for every other byte
 * NOTE(review): listing numbers skip (167 -> 169, 171 -> 174, 176 -> 178,
 * 183 -> 186, 187 -> 191), so the declaration/initial value of qs, the loop
 * exits and the return statement are not visible here. */
167 static int burl_normalize_basic_required (buffer
*b
, buffer
*t
)
169 const unsigned char * const s
= (unsigned char *)b
->ptr
;
170 const int used
= (int)buffer_string_length(b
);
171 unsigned int n1
, n2
, x
;
174 for (int i
= 0; i
< used
; ++i
) {
175 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
176 if (s
[i
] == '?' && -1 == qs
) qs
= i
;
/* x is assigned inside the condition so the later tests can use it */
178 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)
179 && (encoded_chars_http_uri_reqd
[(x
= (n1
<< 4) | n2
)]
180 ||(qs
< 0 ? (x
=='/'||x
=='?') : (x
=='&'||x
=='='||x
==';')))){
181 if (li_utf8_invalid_byte(x
)) qs
= -2;
182 if (s
[i
+1] >= 'a') b
->ptr
[i
+1] &= 0xdf; /* uppercase hex */
183 if (s
[i
+2] >= 'a') b
->ptr
[i
+2] &= 0xdf; /* uppercase hex */
186 else if (s
[i
] == '#') { /* ignore fragment */
187 buffer_string_set_length(b
, (size_t)i
);
191 qs
= burl_normalize_basic_required_fix(b
, t
, i
, qs
);
/* burl_contains_ctrls: scans b for a percent-escape of a control character.
 * The visible test matches '%' followed by a first hex digit below '2'
 * (i.e. %0x and %1x, bytes 0x00-0x1F) or by the exact text "7F" (DEL).
 * Only an uppercase 'F' is compared; burl_normalize() calls this after the
 * basic normalization pass, which uppercases hex digits.
 * NOTE(review): the listing ends at original line 205, so the value returned
 * on a match and on no match is not visible here; the caller
 * (burl_normalize) treats a non-zero result as "reject". */
200 static int burl_contains_ctrls (const buffer
*b
)
202 const char * const s
= b
->ptr
;
203 const int used
= (int)buffer_string_length(b
);
204 for (int i
= 0; i
< used
; ++i
) {
205 if (s
[i
] == '%' && (s
[i
+1] < '2' || (s
[i
+1] == '7' && s
[i
+2] == 'F')))
/* burl_normalize_qs20_to_plus_fix: in-place compaction of b starting at
 * offset i, where the caller found the first "%20".  The loop advances a
 * read index i and a write index j, tests for "%20" at i, and finally
 * shortens b to j bytes.
 * NOTE(review): listing numbers skip (212 -> 214, 215 -> 217, 217 -> 219,
 * 219 -> 224), so the declaration of j, the per-byte copy and the statements
 * executed on a "%20" match (presumably storing '+' and skipping two bytes,
 * going by the function name) are not visible here. */
212 static void burl_normalize_qs20_to_plus_fix (buffer
*b
, int i
)
214 char * const s
= b
->ptr
;
215 const int used
= (int)buffer_string_length(b
);
217 for (; i
< used
; ++i
, ++j
) {
219 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == '0') {
/* j is the new length after compaction */
224 buffer_string_set_length(b
, j
);
/* burl_normalize_qs20_to_plus: looks for "%20" in the query string of b.
 *   qs - offset of '?' in b; when negative, used is 0 so the scan loop
 *        does not run
 * The scan starts just past the '?' (qs+1) and stops at the first "%20";
 * if one was found (i != used), burl_normalize_qs20_to_plus_fix(b, i) is
 * called to rewrite from that position on.
 * NOTE(review): listing numbers skip (231 -> 234), so the declaration of i
 * and any early return are not visible here. */
228 static void burl_normalize_qs20_to_plus (buffer
*b
, int qs
)
230 const char * const s
= b
->ptr
;
231 const int used
= qs
< 0 ? 0 : (int)buffer_string_length(b
);
234 for (i
= qs
+1; i
< used
; ++i
) {
235 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == '0') break;
237 if (i
!= used
) burl_normalize_qs20_to_plus_fix(b
, i
);
/* burl_normalize_2F_to_slash_fix: in-place compaction of the path part of b
 * starting at offset i, where the caller found the first "%2F".
 *   qs - offset of '?' in b, or negative if there is no query string; the
 *        loop covers [i, qs) when qs >= 0, else [i, length of b)
 * After the loop the visible code moves the blen-qs bytes starting at s+qs
 * down to s+j (the query string following the compacted path) and sets the
 * length of b to j.
 * NOTE(review): listing numbers skip (241 -> 243, 245 -> 247, 247 -> 249,
 * 249 -> 255, 255 -> 258), so the declaration of j, the per-byte copy, the
 * statements executed on a "%2F" match, the guard around memmove (s+qs is
 * only meaningful when qs >= 0), the adjustment of j after the move, and the
 * return statement are not visible here.  The caller stores the returned
 * value back into qs, so it should be the NEW offset of '?' after
 * compaction -- verify against the complete file. */
241 static int burl_normalize_2F_to_slash_fix (buffer
*b
, int qs
, int i
)
243 char * const s
= b
->ptr
;
244 const int blen
= (int)buffer_string_length(b
);
245 const int used
= qs
< 0 ? blen
: qs
;
247 for (; i
< used
; ++i
, ++j
) {
249 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == 'F') {
/* regions may overlap, hence memmove rather than memcpy */
255 memmove(s
+j
, s
+qs
, blen
- qs
);
258 buffer_string_set_length(b
, j
);
/* burl_normalize_2F_to_slash: scans the path part of b (up to qs, or the
 * whole buffer when qs < 0) for the first "%2F".
 * On a match it returns
 *   - the result of burl_normalize_2F_to_slash_fix(b, qs, i) when flags has
 *     HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE set, or
 *   - -2 otherwise (the ..._PATH_2F_REJECT case).
 * NOTE(review): the listing ends at original line 272, so the value returned
 * when no "%2F" is found is not visible here (presumably qs unchanged). */
263 static int burl_normalize_2F_to_slash (buffer
*b
, int qs
, int flags
)
265 /*("%2F" must already have been uppercased during normalization)*/
266 const char * const s
= b
->ptr
;
267 const int used
= qs
< 0 ? (int)buffer_string_length(b
) : qs
;
268 for (int i
= 0; i
< used
; ++i
) {
269 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == 'F') {
270 return (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
)
271 ? burl_normalize_2F_to_slash_fix(b
, qs
, i
)
272 : -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
/* burl_normalize_path: detects "." / ".." path segments and "//" in the path
 * part of b (up to qs, or the whole buffer when qs < 0) and, if needed,
 * simplifies the path.
 * Visible steps:
 *   - scan: at the start of a segment, a "." or ".." followed by '/', '?'
 *     or NUL is detected; otherwise i advances to the next '/', and a "//"
 *     there is detected
 *   - return -2 when flags has
 *     HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT set
 *   - save the query string (from offset qs) in scratch buffer t, truncate b
 *     to the path, run buffer_path_simplify(b, b), set qs to the new path
 *     length and append the saved query string again
 * NOTE(review): listing numbers skip (286 -> 290, 291 -> 298, 298 -> 300,
 * 301 -> 304, 304 -> 307), so the assignments to path_simplify, the
 * conditions guarding the reject/simplify steps (presumably path_simplify
 * and qs >= 0) and the return statement are not visible here. */
279 static int burl_normalize_path (buffer
*b
, buffer
*t
, int qs
, int flags
)
281 const unsigned char * const s
= (unsigned char *)b
->ptr
;
282 const int used
= (int)buffer_string_length(b
);
283 int path_simplify
= 0;
284 for (int i
= 0, len
= qs
< 0 ? used
: qs
; i
< len
; ++i
) {
/* "(s[i+1] != '.' || ++i)": for ".." step onto the second dot; ++i is
 * non-zero here, so the && chain continues with the char after the dots */
285 if (s
[i
] == '.' && (s
[i
+1] != '.' || ++i
)
286 && (s
[i
+1] == '/' || s
[i
+1] == '?' || s
[i
+1] == '\0')) {
/* skip to the next '/' (or to len) */
290 do { ++i
; } while (i
< len
&& s
[i
] != '/');
291 if (s
[i
] == '/' && s
[i
+1] == '/') { /*(s[len] != '/')*/
298 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT
) return -2;
/* park the query string in t and cut b back to the path */
300 buffer_copy_string_len(t
, b
->ptr
+qs
, used
- qs
);
301 buffer_string_set_length(b
, qs
);
304 buffer_path_simplify(b
, b
);
/* the path may have shrunk: '?' now sits at the new end of the path */
307 qs
= (int)buffer_string_length(b
);
308 buffer_append_string_len(b
, CONST_BUF_LEN(t
));
/* burl_normalize: normalizes the URL in b in place, using t as scratch.
 * Steps, each selected by bits in flags (HTTP_PARSEOPT_URL_NORMALIZE_*):
 *   1. (Windows/Cygwin builds only) PATH_BACKSLASH_TRANS: '\\' -> '/' in the
 *      part before '?'
 *   2. basic percent-encoding normalization: the "required" variant when
 *      REQUIRED is set, else the "unreserved" variant; yields qs
 *   3. CTRLS_REJECT: reject if burl_contains_ctrls(b)
 *   4. PATH_2F_DECODE / PATH_2F_REJECT: burl_normalize_2F_to_slash()
 *   5. PATH_DOTSEG_REMOVE / PATH_DOTSEG_REJECT: burl_normalize_path()
 *   6. QUERY_20_PLUS: burl_normalize_qs20_to_plus() when qs >= 0
 * Returns -2 as soon as any step reports -2 or control characters are found.
 * NOTE(review): listing numbers skip (316 -> 320, 327 -> 332, 354 -> end),
 * so the declaration of qs, the matching #endif and the final return
 * statement are not visible here. */
316 int burl_normalize (buffer
*b
, buffer
*t
, int flags
)
320 #if defined(__WIN32) || defined(__CYGWIN__)
321 /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
322 * convert to '/' for consistency before percent-encoding
323 * normalization which will convert '\\' to "%5C" in the URL.
324 * (Clients still should not be sending '\\' unencoded in requests.) */
325 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS
) {
326 for (char *p
= b
->ptr
; *p
!= '?' && *p
!= '\0'; ++p
) {
327 if (*p
== '\\') *p
= '/';
332 qs
= (flags
& HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED
)
333 ? burl_normalize_basic_required(b
, t
)
334 : burl_normalize_basic_unreserved(b
, t
);
335 if (-2 == qs
) return -2;
337 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT
) {
338 if (burl_contains_ctrls(b
)) return -2;
341 if (flags
& (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
342 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT
)) {
343 qs
= burl_normalize_2F_to_slash(b
, qs
, flags
);
344 if (-2 == qs
) return -2;
347 if (flags
& (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
348 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT
)) {
349 qs
= burl_normalize_path(b
, t
, qs
, flags
);
350 if (-2 == qs
) return -2;
353 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS
) {
354 if (qs
>= 0) burl_normalize_qs20_to_plus(b
, qs
);
/* burl_append_encode_nde: appends str[0..len) to b, percent-encoding as it
 * goes ("nde" presumably = no double encoding).
 *   b   - destination buffer; len*3 bytes are requested up front, the worst
 *         case of every input byte becoming a 3-byte escape
 *   str - input bytes; str[i+1] and str[i+2] are read after a '%', so the
 *         input is presumably NUL-terminated -- TODO confirm with callers
 *   len - number of input bytes
 * Visible per-byte dispatch: an existing "%XX" escape is examined first
 * (split on whether its decoded value is unreserved), then unreserved bytes,
 * then everything else is emitted as two uppercase hex digits.
 * NOTE(review): listing numbers skip (361 -> 363, 366 -> 369, 372 -> 375,
 * 375 -> 382, 382 -> 387, 388 -> end), so the declarations of j, n1 and n2,
 * the bodies of the first three branches and the final commit of the
 * appended length are not visible here.  li_cton documents that its c
 * argument must be unsigned, while str is plain char here. */
361 static void burl_append_encode_nde (buffer
* const b
, const char * const str
, const size_t len
)
363 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
364 * unless already percent-encoded (does not double-encode) */
365 /* Note: not checking for invalid UTF-8 */
366 char * const p
= buffer_string_prepare_append(b
, len
*3);
369 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
370 if (str
[i
]=='%' && li_cton(str
[i
+1], n1
) && li_cton(str
[i
+2], n2
)) {
371 const unsigned int x
= (n1
<< 4) | n2
;
372 if (burl_is_unreserved((int)x
)) {
375 else { /* leave UTF-8, control chars, and required chars encoded */
382 else if (burl_is_unreserved(str
[i
])) {
/* high nibble, then low nibble, as uppercase hex digits */
387 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
388 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* burl_append_encode_psnde: same as burl_append_encode_nde, except that '/'
 * is also passed through un-encoded ("ps" presumably = path segments).
 *   b   - destination buffer; len*3 bytes are requested up front
 *   str - input bytes; str[i+1] and str[i+2] are read after a '%', so the
 *         input is presumably NUL-terminated -- TODO confirm with callers
 *   len - number of input bytes
 * NOTE(review): listing numbers skip (395 -> 397, 400 -> 403, 406 -> 409,
 * 409 -> 416, 416 -> 421, 422 -> end), so the declarations of j, n1 and n2,
 * the bodies of the first three branches and the final commit of the
 * appended length are not visible here. */
395 static void burl_append_encode_psnde (buffer
* const b
, const char * const str
, const size_t len
)
397 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ plus /
398 * unless already percent-encoded (does not double-encode) */
399 /* Note: not checking for invalid UTF-8 */
400 char * const p
= buffer_string_prepare_append(b
, len
*3);
403 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
404 if (str
[i
]=='%' && li_cton(str
[i
+1], n1
) && li_cton(str
[i
+2], n2
)) {
405 const unsigned int x
= (n1
<< 4) | n2
;
406 if (burl_is_unreserved((int)x
)) {
409 else { /* leave UTF-8, control chars, and required chars encoded */
416 else if (burl_is_unreserved(str
[i
]) || str
[i
] == '/') {
/* high nibble, then low nibble, as uppercase hex digits */
421 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
422 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* burl_append_encode_all: appends str[0..len) to b, percent-encoding every
 * byte that is not unreserved.  Unlike the _nde/_psnde variants there is no
 * special case for '%', so existing escapes are encoded again.
 *   b   - destination buffer; len*3 bytes are requested up front
 *   str - input bytes
 *   len - number of input bytes
 * NOTE(review): listing numbers skip (429 -> 431, 434 -> 436, 437 -> 442,
 * 443 -> end), so the declaration of j, the unreserved pass-through branch
 * and the final commit of the appended length are not visible here. */
429 static void burl_append_encode_all (buffer
* const b
, const char * const str
, const size_t len
)
431 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
432 * Note: double-encodes any existing '%') */
433 /* Note: not checking for invalid UTF-8 */
434 char * const p
= buffer_string_prepare_append(b
, len
*3);
436 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
437 if (burl_is_unreserved(str
[i
])) {
/* high nibble, then low nibble, as uppercase hex digits */
442 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
443 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* burl_offset_tolower: lowercases ASCII 'A'-'Z' in b from byte offset off up
 * to the terminating NUL (by setting bit 0x20).  A '%' followed by two hex
 * digits is recognised separately so that the escape is not case-changed.
 * NOTE(review): the listing ends at original line 455, so the statement run
 * for a "%XX" match (presumably advancing p past the two hex digits) is not
 * visible here. */
450 static void burl_offset_tolower (buffer
* const b
, const size_t off
)
452 /*(skips over all percent-encodings, including encoding of alpha chars)*/
453 for (char *p
= b
->ptr
+off
; p
[0]; ++p
) {
454 if (p
[0] >= 'A' && p
[0] <= 'Z') p
[0] |= 0x20;
455 else if (p
[0]=='%' && light_isxdigit(p
[1]) && light_isxdigit(p
[2]))
/* burl_offset_toupper: uppercases ASCII 'a'-'z' in b from byte offset off up
 * to the terminating NUL (by clearing bit 0x20 with & 0xdf).  A '%' followed
 * by two hex digits is recognised separately so that the escape is not
 * case-changed.
 * NOTE(review): the listing ends at original line 466, so the statement run
 * for a "%XX" match (presumably advancing p past the two hex digits) is not
 * visible here. */
461 static void burl_offset_toupper (buffer
* const b
, const size_t off
)
463 /*(skips over all percent-encodings, including encoding of alpha chars)*/
464 for (char *p
= b
->ptr
+off
; p
[0]; ++p
) {
465 if (p
[0] >= 'a' && p
[0] <= 'z') p
[0] &= 0xdf;
466 else if (p
[0]=='%' && light_isxdigit(p
[1]) && light_isxdigit(p
[2]))
/* burl_append: appends str[0..len) to b, transformed according to flags.
 *   b     - destination buffer
 *   str   - input bytes
 *   len   - number of input bytes; nothing is done when len is 0
 *   flags - BURL_* bits.  Exactly one transformation is applied, the first
 *           match in this order: BURL_ENCODE_NONE (plain append),
 *           BURL_ENCODE_ALL, BURL_ENCODE_NDE, BURL_ENCODE_PSNDE,
 *           BURL_ENCODE_B64U (URL-safe base64 without padding),
 *           BURL_DECODE_B64U.
 *           BURL_TOLOWER / BURL_TOUPPER then case-folds only the newly
 *           appended part (from the length b had before appending);
 *           BURL_TOLOWER wins if both are set.
 * NOTE(review): listing numbers skip (472 -> 476, 476 -> 479, 479 -> 483),
 * so the declaration of off and the condition guarding the first plain
 * append at original line 479 (presumably "no flags set", followed by an
 * early return) are not visible here. */
472 void burl_append (buffer
* const b
, const char * const str
, const size_t len
, const int flags
)
476 if (0 == len
) return;
479 buffer_append_string_len(b
, str
, len
);
/* remember where the appended data will start, for case folding below */
483 if (flags
& (BURL_TOUPPER
|BURL_TOLOWER
)) off
= buffer_string_length(b
);
485 if (flags
& BURL_ENCODE_NONE
) {
486 buffer_append_string_len(b
, str
, len
);
488 else if (flags
& BURL_ENCODE_ALL
) {
489 burl_append_encode_all(b
, str
, len
);
491 else if (flags
& BURL_ENCODE_NDE
) {
492 burl_append_encode_nde(b
, str
, len
);
494 else if (flags
& BURL_ENCODE_PSNDE
) {
495 burl_append_encode_psnde(b
, str
, len
);
497 else if (flags
& BURL_ENCODE_B64U
) {
498 const unsigned char *s
= (const unsigned char *)str
;
499 buffer_append_base64_encode_no_padding(b
, s
, len
, BASE64_URL
);
501 else if (flags
& BURL_DECODE_B64U
) {
502 buffer_append_base64_decode(b
, str
, len
, BASE64_URL
);
505 /* note: not normalizing str, which could come from arbitrary header,
506 * so it is possible that alpha chars are percent-encoded upper/lowercase */
507 if (flags
& (BURL_TOLOWER
|BURL_TOUPPER
)) {
508 (flags
& BURL_TOLOWER
)
509 ? burl_offset_tolower(b
, off
) /*(flags & BURL_TOLOWER)*/
510 : burl_offset_toupper(b
, off
); /*(flags & BURL_TOUPPER)*/