/* Uppercase hexadecimal digits, indexed by nibble value 0-15.  Used below to
 * emit the two hex digits of a percent-encoded byte. */
9 static const char hex_chars_uc
[] = "0123456789ABCDEF";
/* Lookup table indexed by byte value 0x00-0xFF.
 *   0 = byte may appear unencoded in a URL (the set listed in the comment below)
 *   1 = byte is not copied through as-is by the normalizers in this file
 *       ('%' and '#' are also marked 1 here and get dedicated handling there)
 * Each row covers 16 consecutive byte values; the trailing comment on a row
 * names the range and the printable characters that are marked 1 in it. */
11 /* everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~ */
12 static const char encoded_chars_http_uri_reqd
[] = {
14 0 1 2 3 4 5 6 7 8 9 A B C D E F
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00 - 0F control chars */
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10 - 1F */
18 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2F space " # % */
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 30 - 3F < > */
20 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4F */
21 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 50 - 5F [ \ ] ^ */
22 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6F ` */
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* 70 - 7F { | } DEL */
24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 - 8F */
25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 90 - 9F */
26 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* A0 - AF */
27 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* B0 - BF */
28 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0 - CF */
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0 - DF */
30 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* E0 - EF */
31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* F0 - FF */
35 /* c (char) and n (nibble) MUST be unsigned integer types */
/* li_cton(c,n): convert hex-digit character c to its numeric value in n.
 * The expression evaluates to 1 when c is a hex digit (0-9, A-F, a-f) and to
 * 0 otherwise.  It relies on unsigned wrap-around: a character below '0'
 * (or below 'A' after case folding) wraps to a large value and fails the
 * "<= 9" / "<= 5" range test.  (c)&0xdf clears bit 0x20 so that a-f fold
 * onto A-F.  n is overwritten even when c is not a hex digit, and both
 * arguments are evaluated more than once, so avoid side effects in them. */
36 #define li_cton(c,n) \
37 (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
39 /* b (byte) MUST be unsigned integer type
40 * https://en.wikipedia.org/wiki/UTF-8
41 * reject overlong encodings of 7-bit ASCII and invalid UTF-8
42 * (but does not detect other overlong multi-byte encodings) */
/* Non-zero for byte values 0xF5-0xFF and for 0xC0 / 0xC1
 * (((b)|0x1) == 0xC1 matches exactly those two lead bytes).
 * b is evaluated twice. */
43 #define li_utf8_invalid_byte(b) ((b) >= 0xF5 || ((b)|0x1) == 0xC1)
/* Returns non-zero if c is an "unreserved" URL character: alphanumeric
 * (as decided by light_isalnum()) or one of '-', '.', '_', '~'.
 * Returns 0 otherwise. */
46 static int burl_is_unreserved (const int c
)
48 return (light_isalnum(c
) || c
== '-' || c
== '.' || c
== '_' || c
== '~');
/* Rewriting half of burl_normalize_basic_unreserved(): rebuilds the URL from
 * offset i onward into scratch buffer t, then copies t back over b.
 *
 *   b   URL buffer; replaced with the rewritten URL via buffer_copy_buffer()
 *   t   scratch buffer; prepared for i+(used-i)*3+1 bytes, i.e. room for
 *       every remaining input byte to expand to three output bytes
 *   i   input offset of the first byte to rewrite; the i bytes before it are
 *       copied unchanged
 *   qs  -1 while no '?' has been seen, otherwise the offset of the first '?';
 *       set to -2 when an invalid UTF-8 byte value is encountered
 *
 * Return value: presumably the updated qs (the caller stores the result in
 * its own qs) -- confirm against the full source. */
52 static int burl_normalize_basic_unreserved_fix (buffer
*b
, buffer
*t
, int i
, int qs
)
55 const int used
= (int)buffer_string_length(b
);
56 const unsigned char * const s
= (unsigned char *)b
->ptr
;
57 unsigned char * const p
=
58 (unsigned char *)buffer_string_prepare_copy(t
,i
+(used
-i
)*3+1);
/* bytes before i need no change */
60 memcpy(p
, s
, (size_t)i
);
/* i indexes the input (s), j indexes the output (p) */
61 for (; i
< used
; ++i
, ++j
) {
/* case 1: byte is allowed unencoded */
62 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
/* remember the first '?' as an OUTPUT offset (j), since output may differ in
 * length from input */
63 if (s
[i
] == '?' && -1 == qs
) qs
= j
;
/* case 2: "%XX" with two valid hex digits; n1/n2 receive the nibble values */
66 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)) {
/* x is the byte value the percent-encoding stands for */
67 const unsigned int x
= (n1
<< 4) | n2
;
68 if (burl_is_unreserved(x
)) {
/* hex digits are emitted from the uppercase table, normalizing their case */
73 p
[++j
] = hex_chars_uc
[n1
]; /*(s[i+1] & 0xdf)*/
74 p
[++j
] = hex_chars_uc
[n2
]; /*(s[i+2] & 0xdf)*/
75 if (li_utf8_invalid_byte(x
)) qs
= -2;
/* case 3: start of fragment ends the scan */
79 else if (s
[i
] == '#') break; /* ignore fragment */
/* case 4: any other byte: emit high nibble then low nibble as uppercase hex */
82 p
[++j
] = hex_chars_uc
[(s
[i
] >> 4) & 0xF];
83 p
[++j
] = hex_chars_uc
[s
[i
] & 0xF];
84 if (li_utf8_invalid_byte(s
[i
])) qs
= -2;
/* j is the number of bytes written to t; publish it, then replace b with t */
87 buffer_commit(t
, (size_t)j
);
88 buffer_copy_buffer(b
, t
);
/* Basic percent-encoding normalization ("unreserved" policy).
 *
 * Scans b in place.  Percent-encodings whose decoded byte is NOT unreserved
 * stay encoded and only have their hex digits uppercased in b.  A '#'
 * truncates b at that offset (fragment dropped).  Any other byte that is not
 * allowed unencoded -- including a "%XX" that decodes to an unreserved
 * character -- cannot be handled in place, so the work is handed to
 * burl_normalize_basic_unreserved_fix() starting at the current offset.
 *
 *   b  URL buffer, modified in place
 *   t  scratch buffer, only passed on to the _fix() helper
 *
 * qs tracks the offset of the first '?' (compared against -1 for "none yet")
 * and is set to -2 on an invalid UTF-8 byte value.
 * Return value: presumably qs -- confirm against the full source. */
93 static int burl_normalize_basic_unreserved (buffer
*b
, buffer
*t
)
95 const unsigned char * const s
= (unsigned char *)b
->ptr
;
96 const int used
= (int)buffer_string_length(b
);
97 unsigned int n1
, n2
, x
;
100 for (int i
= 0; i
< used
; ++i
) {
/* byte allowed unencoded: only the first '?' is of interest */
101 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
102 if (s
[i
] == '?' && -1 == qs
) qs
= i
;
/* "%XX" that must stay encoded (decoded byte x is not unreserved) */
104 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)
105 && !burl_is_unreserved((x
= (n1
<< 4) | n2
))) {
106 if (li_utf8_invalid_byte(x
)) qs
= -2;
/* a hex digit >= 'a' is lowercase; clearing bit 0x20 uppercases it in b */
107 if (s
[i
+1] >= 'a') b
->ptr
[i
+1] &= 0xdf; /* uppercase hex */
108 if (s
[i
+2] >= 'a') b
->ptr
[i
+2] &= 0xdf; /* uppercase hex */
111 else if (s
[i
] == '#') { /* ignore fragment */
112 buffer_string_set_length(b
, (size_t)i
);
/* everything else needs bytes added or removed: switch to the copying helper,
 * resuming at offset i */
116 qs
= burl_normalize_basic_unreserved_fix(b
, t
, i
, qs
);
/* Rewriting half of burl_normalize_basic_required(): rebuilds the URL from
 * offset i onward into scratch buffer t, then copies t back over b.
 *
 *   b   URL buffer; replaced with the rewritten URL via buffer_copy_buffer()
 *   t   scratch buffer; prepared for i+(used-i)*3+1 bytes, i.e. room for
 *       every remaining input byte to expand to three output bytes
 *   i   input offset of the first byte to rewrite; the i bytes before it are
 *       copied unchanged
 *   qs  -1 while no '?' has been seen, otherwise the offset of the first '?';
 *       set to -2 when an invalid UTF-8 byte value is encountered
 *
 * Return value: presumably the updated qs (the caller stores the result in
 * its own qs) -- confirm against the full source. */
125 static int burl_normalize_basic_required_fix (buffer
*b
, buffer
*t
, int i
, int qs
)
128 const int used
= (int)buffer_string_length(b
);
129 const unsigned char * const s
= (unsigned char *)b
->ptr
;
130 unsigned char * const p
=
131 (unsigned char *)buffer_string_prepare_copy(t
,i
+(used
-i
)*3+1);
/* bytes before i need no change */
133 memcpy(p
, s
, (size_t)i
);
/* i indexes the input (s), j indexes the output (p) */
134 for (; i
< used
; ++i
, ++j
) {
/* case 1: byte is allowed unencoded */
135 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
/* remember the first '?' as an OUTPUT offset (j) */
136 if (s
[i
] == '?' && -1 == qs
) qs
= j
;
/* case 2: "%XX" with two valid hex digits */
139 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)) {
140 const unsigned int x
= (n1
<< 4) | n2
;
/* x does not need to be encoded AND is not a delimiter of the current part:
 * before the query string (qs < 0) the delimiters are '/' and '?';
 * inside the query string they are '&', '=' and ';' */
141 if (!encoded_chars_http_uri_reqd
[x
]
142 && (qs
< 0 ? (x
!='/'&&x
!='?') : (x
!='&'&&x
!='='&&x
!=';'))) {
/* hex digits are emitted from the uppercase table, normalizing their case */
147 p
[++j
] = hex_chars_uc
[n1
]; /*(s[i+1] & 0xdf)*/
148 p
[++j
] = hex_chars_uc
[n2
]; /*(s[i+2] & 0xdf)*/
149 if (li_utf8_invalid_byte(x
)) qs
= -2;
/* case 3: start of fragment ends the scan */
153 else if (s
[i
] == '#') break; /* ignore fragment */
/* case 4: any other byte: emit high nibble then low nibble as uppercase hex */
156 p
[++j
] = hex_chars_uc
[(s
[i
] >> 4) & 0xF];
157 p
[++j
] = hex_chars_uc
[s
[i
] & 0xF];
158 if (li_utf8_invalid_byte(s
[i
])) qs
= -2;
/* j is the number of bytes written to t; publish it, then replace b with t */
161 buffer_commit(t
, (size_t)j
);
162 buffer_copy_buffer(b
, t
);
/* Basic percent-encoding normalization ("required" policy).
 *
 * Scans b in place.  A "%XX" stays encoded -- with only its hex digits
 * uppercased in b -- when the decoded byte x is one that must be encoded
 * (per encoded_chars_http_uri_reqd) or is a delimiter of the current part
 * ('/' or '?' before the query string; '&', '=' or ';' within it).  A '#'
 * truncates b at that offset (fragment dropped).  Anything else that is not
 * allowed unencoded is handed to burl_normalize_basic_required_fix()
 * starting at the current offset.
 *
 *   b  URL buffer, modified in place
 *   t  scratch buffer, only passed on to the _fix() helper
 *
 * qs tracks the offset of the first '?' (compared against -1 for "none yet")
 * and is set to -2 on an invalid UTF-8 byte value.
 * Return value: presumably qs -- confirm against the full source. */
167 static int burl_normalize_basic_required (buffer
*b
, buffer
*t
)
169 const unsigned char * const s
= (unsigned char *)b
->ptr
;
170 const int used
= (int)buffer_string_length(b
);
171 unsigned int n1
, n2
, x
;
174 for (int i
= 0; i
< used
; ++i
) {
/* byte allowed unencoded: only the first '?' is of interest */
175 if (!encoded_chars_http_uri_reqd
[s
[i
]]) {
176 if (s
[i
] == '?' && -1 == qs
) qs
= i
;
/* "%XX" that must stay encoded: decoded byte requires encoding, or it is a
 * delimiter for the part of the URL currently being scanned */
178 else if (s
[i
]=='%' && li_cton(s
[i
+1], n1
) && li_cton(s
[i
+2], n2
)
179 && (encoded_chars_http_uri_reqd
[(x
= (n1
<< 4) | n2
)]
180 ||(qs
< 0 ? (x
=='/'||x
=='?') : (x
=='&'||x
=='='||x
==';')))){
181 if (li_utf8_invalid_byte(x
)) qs
= -2;
/* a hex digit >= 'a' is lowercase; clearing bit 0x20 uppercases it in b */
182 if (s
[i
+1] >= 'a') b
->ptr
[i
+1] &= 0xdf; /* uppercase hex */
183 if (s
[i
+2] >= 'a') b
->ptr
[i
+2] &= 0xdf; /* uppercase hex */
186 else if (s
[i
] == '#') { /* ignore fragment */
187 buffer_string_set_length(b
, (size_t)i
);
/* everything else needs bytes added or removed: switch to the copying helper,
 * resuming at offset i */
191 qs
= burl_normalize_basic_required_fix(b
, t
, i
, qs
);
/* Scans b for percent-encoded control characters.
 * A match is '%' followed by a first hex digit below '2' (i.e. %00-%1F), or
 * the exact sequence "%7F" (DEL).  The comparison against uppercase 'F'
 * matches only after hex digits have been uppercased; burl_normalize() runs
 * the basic normalization before calling this function.
 * The caller treats a non-zero result as "reject this URL"; the return
 * statements themselves are not part of the lines shown here. */
200 static int burl_contains_ctrls (const buffer
*b
)
202 const char * const s
= b
->ptr
;
203 const int used
= (int)buffer_string_length(b
);
204 for (int i
= 0; i
< used
; ++i
) {
205 if (s
[i
] == '%' && (s
[i
+1] < '2' || (s
[i
+1] == '7' && s
[i
+2] == 'F')))
/* In-place rewrite of b starting at offset i, invoked once the caller has
 * found a "%20" in the query string.  i is the read offset and j the write
 * offset within the same buffer; each "%20" found is presumably collapsed to
 * a single '+' (per the function name -- the replacement statements are not
 * part of the lines shown here).  The string length is finally set to j. */
212 static void burl_normalize_qs20_to_plus_fix (buffer
*b
, int i
)
214 char * const s
= b
->ptr
;
215 const int used
= (int)buffer_string_length(b
);
217 for (; i
< used
; ++i
, ++j
) {
219 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == '0') {
/* j is the new, possibly shorter, string length */
224 buffer_string_set_length(b
, j
);
/* Looks for the first "%20" in the query string of b and, if there is one,
 * lets burl_normalize_qs20_to_plus_fix() rewrite b from that offset.
 *
 *   b   URL buffer
 *   qs  offset of the '?' that starts the query string; when qs < 0 the
 *       scan limit is 0, so nothing is examined
 *
 * The scan starts at qs+1, i.e. just past the '?'. */
228 static void burl_normalize_qs20_to_plus (buffer
*b
, int qs
)
230 const char * const s
= b
->ptr
;
231 const int used
= qs
< 0 ? 0 : (int)buffer_string_length(b
);
234 for (i
= qs
+1; i
< used
; ++i
) {
235 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == '0') break;
/* i == used means the loop ran to the end without finding "%20" */
237 if (i
!= used
) burl_normalize_qs20_to_plus_fix(b
, i
);
/* In-place rewrite of the path part of b starting at offset i, invoked once
 * the caller has found a "%2F" in the path.
 *
 *   b   URL buffer, modified in place
 *   qs  offset of the query string, or negative if there is none
 *   i   offset of the first "%2F"
 *
 * Only the path is scanned: up to qs, or up to the end of b when qs < 0.
 * i is the read offset and j the write offset within the same buffer; each
 * "%2F" is presumably collapsed to a single '/' (per the function name --
 * the replacement statements are not part of the lines shown here).
 * The query string, blen - qs bytes long, is then moved down to follow the
 * shortened path, and the string length is set to j.
 * Return value: presumably the new query-string offset (the caller stores
 * the result in its qs) -- confirm against the full source. */
241 static int burl_normalize_2F_to_slash_fix (buffer
*b
, int qs
, int i
)
243 char * const s
= b
->ptr
;
244 const int blen
= (int)buffer_string_length(b
);
245 const int used
= qs
< 0 ? blen
: qs
;
247 for (; i
< used
; ++i
, ++j
) {
249 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == 'F') {
/* source and destination lie in the same buffer and may overlap, hence
 * memmove() rather than memcpy() */
255 const int qslen
= blen
- qs
;
256 memmove(s
+j
, s
+qs
, (size_t)qslen
);
260 buffer_string_set_length(b
, j
);
/* Applies the "%2F in path" policy selected by flags.
 *
 *   b      URL buffer
 *   qs     offset of the query string, or negative if there is none; only
 *          bytes before it (or all of b when qs < 0) are scanned
 *   flags  HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE: rewrite via
 *          burl_normalize_2F_to_slash_fix() and return its result;
 *          otherwise (the ..._PATH_2F_REJECT case): return -2
 *
 * The decision is made at the first "%2F" found.  The value returned when
 * the path contains no "%2F" is not part of the lines shown here
 * (presumably qs unchanged -- confirm against the full source). */
265 static int burl_normalize_2F_to_slash (buffer
*b
, int qs
, int flags
)
267 /*("%2F" must already have been uppercased during normalization)*/
268 const char * const s
= b
->ptr
;
269 const int used
= qs
< 0 ? (int)buffer_string_length(b
) : qs
;
270 for (int i
= 0; i
< used
; ++i
) {
271 if (s
[i
] == '%' && s
[i
+1] == '2' && s
[i
+2] == 'F') {
272 return (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
)
273 ? burl_normalize_2F_to_slash_fix(b
, qs
, i
)
274 : -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
/* Detects "." / ".." path segments and "//" in the path part of b and either
 * rejects the URL or simplifies the path, depending on flags.
 *
 *   b      URL buffer
 *   t      scratch buffer; holds the query string while the path is
 *          simplified
 *   qs     offset of the query string, or negative if there is none; only
 *          bytes before it (or all of b when qs < 0) are scanned
 *   flags  HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT makes the function
 *          return -2
 *
 * Simplification, as far as visible here: the query string is copied to t,
 * b is truncated to the path, buffer_path_simplify(b, b) is applied, qs is
 * set to the new path length and the saved query string is appended again.
 * path_simplify starts at 0; the statements that set it, the conditions
 * guarding the steps above and the final return are not part of the lines
 * shown here (presumably: simplify only when something was detected, return
 * qs) -- confirm against the full source. */
281 static int burl_normalize_path (buffer
*b
, buffer
*t
, int qs
, int flags
)
283 const unsigned char * const s
= (unsigned char *)b
->ptr
;
284 const int used
= (int)buffer_string_length(b
);
285 int path_simplify
= 0;
286 for (int i
= 0, len
= qs
< 0 ? used
: qs
; i
< len
; ++i
) {
/* dot-segment test: a '.', optionally followed by a second '.', whose next
 * character is '/', '?' or end of string.  The "|| ++i" steps over the second
 * '.' as a side effect (and is always non-zero), so the s[i+1] checks that
 * follow look at the character after "." or after ".." respectively. */
287 if (s
[i
] == '.' && (s
[i
+1] != '.' || ++i
)
288 && (s
[i
+1] == '/' || s
[i
+1] == '?' || s
[i
+1] == '\0')) {
/* advance to the next '/' (or the end of the path) */
292 while (i
< len
&& s
[i
] != '/') ++i
;
/* empty path segment: two consecutive slashes */
293 if (s
[i
] == '/' && s
[i
+1] == '/') { /*(s[len] != '/')*/
300 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT
) return -2;
/* set the query string aside in t and cut b down to the path */
302 buffer_copy_string_len(t
, b
->ptr
+qs
, used
- qs
);
303 buffer_string_set_length(b
, qs
);
306 buffer_path_simplify(b
, b
);
/* the path may have changed length: recompute qs, then re-attach the saved
 * query string */
309 qs
= (int)buffer_string_length(b
);
310 buffer_append_string_len(b
, CONST_BUF_LEN(t
));
/* Normalizes the URL held in b according to flags.
 *
 *   b      URL buffer, modified in place
 *   t      scratch buffer used by the individual steps
 *   flags  bitmask of HTTP_PARSEOPT_URL_NORMALIZE_* options
 *
 * Steps, in order:
 *   1. (Windows/Cygwin builds, ..._PATH_BACKSLASH_TRANS) '\\' -> '/' in the
 *      part of b before the first '?'
 *   2. percent-encoding normalization: ..._REQUIRED selects
 *      burl_normalize_basic_required(), otherwise
 *      burl_normalize_basic_unreserved(); result stored in qs
 *   3. ..._CTRLS_REJECT: reject percent-encoded control characters
 *   4. ..._PATH_2F_DECODE / ..._PATH_2F_REJECT: handle "%2F" in the path
 *   5. ..._PATH_DOTSEG_REMOVE / ..._PATH_DOTSEG_REJECT: handle "." / ".."
 *      segments and "//"
 *   6. ..._QUERY_20_PLUS: "%20" handling in the query string, only when a
 *      query string exists (qs >= 0)
 *
 * Returns -2 as soon as any step rejects the URL.  The value returned on
 * success is not part of the lines shown here (presumably qs -- confirm
 * against the full source). */
318 int burl_normalize (buffer
*b
, buffer
*t
, int flags
)
/* NOTE(review): __WIN32 is a GCC/MinGW spelling; the macro predefined by
 * Windows compilers in general is _WIN32 -- confirm which toolchains need to
 * take this branch. */
322 #if defined(__WIN32) || defined(__CYGWIN__)
323 /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
324 * convert to '/' for consistency before percent-encoding
325 * normalization which will convert '\\' to "%5C" in the URL.
326 * (Clients still should not be sending '\\' unencoded in requests.) */
327 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS
) {
/* stops at the first '?' so that the query string is left untouched */
328 for (char *p
= b
->ptr
; *p
!= '?' && *p
!= '\0'; ++p
) {
329 if (*p
== '\\') *p
= '/';
334 qs
= (flags
& HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED
)
335 ? burl_normalize_basic_required(b
, t
)
336 : burl_normalize_basic_unreserved(b
, t
);
/* -2 from any step means "reject" */
337 if (-2 == qs
) return -2;
339 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT
) {
340 if (burl_contains_ctrls(b
)) return -2;
343 if (flags
& (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
344 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT
)) {
345 qs
= burl_normalize_2F_to_slash(b
, qs
, flags
);
346 if (-2 == qs
) return -2;
349 if (flags
& (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
350 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT
)) {
351 qs
= burl_normalize_path(b
, t
, qs
, flags
);
352 if (-2 == qs
) return -2;
355 if (flags
& HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS
) {
356 if (qs
>= 0) burl_normalize_qs20_to_plus(b
, qs
);
/* Appends str (len bytes) to b, percent-encoded, without double-encoding
 * sequences that are already valid "%XX" ("nde" presumably stands for
 * "no double encode").
 *
 *   b    destination buffer; len*3 bytes are reserved up front, the worst
 *        case of three output bytes per input byte
 *   str  input bytes
 *   len  number of input bytes
 *
 * NOTE(review): the "%XX" test reads str[i+1] and str[i+2] without checking
 * them against len, so a '%' in the last two positions looks past the end of
 * the input.  That is only safe if str is NUL-terminated -- confirm callers. */
363 static void burl_append_encode_nde (buffer
* const b
, const char * const str
, const size_t len
)
365 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
366 * unless already percent-encoded (does not double-encode) */
367 /* Note: not checking for invalid UTF-8 */
368 char * const p
= buffer_string_prepare_append(b
, len
*3);
/* i indexes the input (str), j indexes the output (p) */
371 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
/* existing "%XX" with two valid hex digits; x is the byte it stands for */
372 if (str
[i
]=='%' && li_cton(str
[i
+1], n1
) && li_cton(str
[i
+2], n2
)) {
373 const unsigned int x
= (n1
<< 4) | n2
;
374 if (burl_is_unreserved((int)x
)) {
377 else { /* leave UTF-8, control chars, and required chars encoded */
384 else if (burl_is_unreserved(str
[i
])) {
/* any other byte: emit high nibble then low nibble as uppercase hex; the
 * & 0xF masks keep the table index in range even where char is signed */
389 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
390 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* Same as burl_append_encode_nde(), except that '/' is also passed through
 * unencoded ("ps" presumably stands for "path separator").
 *
 *   b    destination buffer; len*3 bytes are reserved up front, the worst
 *        case of three output bytes per input byte
 *   str  input bytes
 *   len  number of input bytes
 *
 * NOTE(review): the "%XX" test reads str[i+1] and str[i+2] without checking
 * them against len, so a '%' in the last two positions looks past the end of
 * the input.  That is only safe if str is NUL-terminated -- confirm callers. */
397 static void burl_append_encode_psnde (buffer
* const b
, const char * const str
, const size_t len
)
399 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ plus /
400 * unless already percent-encoded (does not double-encode) */
401 /* Note: not checking for invalid UTF-8 */
402 char * const p
= buffer_string_prepare_append(b
, len
*3);
/* i indexes the input (str), j indexes the output (p) */
405 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
/* existing "%XX" with two valid hex digits; x is the byte it stands for */
406 if (str
[i
]=='%' && li_cton(str
[i
+1], n1
) && li_cton(str
[i
+2], n2
)) {
407 const unsigned int x
= (n1
<< 4) | n2
;
408 if (burl_is_unreserved((int)x
)) {
411 else { /* leave UTF-8, control chars, and required chars encoded */
/* the only difference from the _nde variant: '/' counts as pass-through */
418 else if (burl_is_unreserved(str
[i
]) || str
[i
] == '/') {
/* any other byte: emit high nibble then low nibble as uppercase hex; the
 * & 0xF masks keep the table index in range even where char is signed */
423 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
424 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* Appends str (len bytes) to b, percent-encoding every byte that is not an
 * unreserved character.  There is no special treatment of '%', so input
 * that is already percent-encoded gets encoded a second time.
 *
 *   b    destination buffer; len*3 bytes are reserved up front, the worst
 *        case of three output bytes per input byte
 *   str  input bytes
 *   len  number of input bytes */
431 static void burl_append_encode_all (buffer
* const b
, const char * const str
, const size_t len
)
433 /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
434 * Note: double-encodes any existing '%') */
435 /* Note: not checking for invalid UTF-8 */
436 char * const p
= buffer_string_prepare_append(b
, len
*3);
/* i indexes the input (str), j indexes the output (p) */
438 for (unsigned int i
= 0; i
< len
; ++i
, ++j
) {
439 if (burl_is_unreserved(str
[i
])) {
/* any other byte: emit high nibble then low nibble as uppercase hex; the
 * & 0xF masks keep the table index in range even where char is signed */
444 p
[++j
] = hex_chars_uc
[(str
[i
] >> 4) & 0xF];
445 p
[++j
] = hex_chars_uc
[str
[i
] & 0xF];
/* Lowercases ASCII letters A-Z in b, starting at byte offset off and running
 * to the terminating NUL.  Setting bit 0x20 maps 'A'-'Z' to 'a'-'z'.
 * A '%' followed by two hex digits is recognized separately so that the hex
 * digits of a percent-encoding are not case-converted (see the comment on
 * the loop; the skip statement itself is not part of the lines shown here). */
452 static void burl_offset_tolower (buffer
* const b
, const size_t off
)
454 /*(skips over all percent-encodings, including encoding of alpha chars)*/
455 for (char *p
= b
->ptr
+off
; p
[0]; ++p
) {
456 if (p
[0] >= 'A' && p
[0] <= 'Z') p
[0] |= 0x20;
457 else if (p
[0]=='%' && light_isxdigit(p
[1]) && light_isxdigit(p
[2]))
/* Uppercases ASCII letters a-z in b, starting at byte offset off and running
 * to the terminating NUL.  Clearing bit 0x20 (& 0xdf) maps 'a'-'z' to
 * 'A'-'Z'.
 * A '%' followed by two hex digits is recognized separately so that the hex
 * digits of a percent-encoding are not case-converted (see the comment on
 * the loop; the skip statement itself is not part of the lines shown here). */
463 static void burl_offset_toupper (buffer
* const b
, const size_t off
)
465 /*(skips over all percent-encodings, including encoding of alpha chars)*/
466 for (char *p
= b
->ptr
+off
; p
[0]; ++p
) {
467 if (p
[0] >= 'a' && p
[0] <= 'z') p
[0] &= 0xdf;
468 else if (p
[0]=='%' && light_isxdigit(p
[1]) && light_isxdigit(p
[2]))
/* Appends str (len bytes) to b, transformed as selected by flags.
 *
 *   b      destination buffer
 *   str    input bytes
 *   len    number of input bytes; nothing is done when it is 0
 *   flags  bitmask of BURL_* options
 *
 * Exactly one transformation is applied; the flags are tested in this order
 * and the first one set wins:
 *   BURL_ENCODE_NONE   append verbatim
 *   BURL_ENCODE_ALL    burl_append_encode_all()
 *   BURL_ENCODE_NDE    burl_append_encode_nde()
 *   BURL_ENCODE_PSNDE  burl_append_encode_psnde()
 *   BURL_ENCODE_B64U   base64url encode without padding
 *   BURL_DECODE_B64U   base64url decode
 * Afterwards BURL_TOLOWER / BURL_TOUPPER case-convert only the newly
 * appended part; BURL_TOLOWER takes precedence when both are set. */
474 void burl_append (buffer
* const b
, const char * const str
, const size_t len
, const int flags
)
478 if (0 == len
) return;
/* NOTE(review): the condition guarding this verbatim append is not part of
 * the lines shown here (presumably the "no flags set" fast path) -- confirm
 * against the full source. */
481 buffer_append_string_len(b
, str
, len
);
/* remember where the appended data will start, so that case conversion
 * below leaves the existing contents of b alone */
485 if (flags
& (BURL_TOUPPER
|BURL_TOLOWER
)) off
= buffer_string_length(b
);
487 if (flags
& BURL_ENCODE_NONE
) {
488 buffer_append_string_len(b
, str
, len
);
490 else if (flags
& BURL_ENCODE_ALL
) {
491 burl_append_encode_all(b
, str
, len
);
493 else if (flags
& BURL_ENCODE_NDE
) {
494 burl_append_encode_nde(b
, str
, len
);
496 else if (flags
& BURL_ENCODE_PSNDE
) {
497 burl_append_encode_psnde(b
, str
, len
);
499 else if (flags
& BURL_ENCODE_B64U
) {
/* the base64 encoder is passed an unsigned char pointer to the same bytes */
500 const unsigned char *s
= (const unsigned char *)str
;
501 buffer_append_base64_encode_no_padding(b
, s
, len
, BASE64_URL
);
503 else if (flags
& BURL_DECODE_B64U
) {
504 buffer_append_base64_decode(b
, str
, len
, BASE64_URL
);
507 /* note: not normalizing str, which could come from arbitrary header,
508 * so it is possible that alpha chars are percent-encoded upper/lowercase */
509 if (flags
& (BURL_TOLOWER
|BURL_TOUPPER
)) {
510 (flags
& BURL_TOLOWER
)
511 ? burl_offset_tolower(b
, off
) /*(flags & BURL_TOLOWER)*/
512 : burl_offset_toupper(b
, off
); /*(flags & BURL_TOUPPER)*/