src/burl.c

   1 #include "first.h"
   2 #include "burl.h"
   3
   4 #include <string.h>
   5
   6 #include "buffer.h"
   7 #include "base64.h"
   8
/* uppercase hexadecimal digits, indexed by nibble value (0-15);
 * used below to emit the two digits of a "%XX" percent-escape */
static const char hex_chars_uc[] = "0123456789ABCDEF";
  10
/* lookup table indexed by byte value (0x00 - 0xFF):
 *   1 = byte is not allowed to appear literally in the normalized URL
 *   0 = byte may appear literally
 * entries are 1 for everything except:
 *   ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * (note: '%' and '#' are marked 1; the normalization loops below give them
 *  dedicated handling before falling back to percent-encoding) */
static const char encoded_chars_http_uri_reqd[] = {
  /*
  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  00 -  0F control chars */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  10 -  1F */
  1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  20 -  2F space " # % */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,  /*  30 -  3F < > */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  40 -  4F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,  /*  50 -  5F [ \ ] ^ */
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  60 -  6F ` */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,  /*  70 -  7F { | } DEL */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  80 -  8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  90 -  9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  A0 -  AF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  B0 -  BF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  C0 -  CF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  D0 -  DF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  E0 -  EF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  F0 -  FF */
};
  33
  34
/* li_cton(c,n): convert hex digit char c to its nibble value in n
 * - evaluates to non-zero if c is a hex digit (0-9, A-F, a-f), and n then
 *   holds the digit value (0-15)
 * - evaluates to 0 otherwise (n is overwritten with a meaningless value)
 * c (char) and n (nibble) MUST be unsigned integer types
 * (the "<= 9" and "<= 5" range checks depend on values below '0' or 'A'
 *  wrapping around to large unsigned numbers when stored in n)
 * ((c)&0xdf folds lowercase a-f onto uppercase A-F)
 * note: c may be evaluated twice; do not pass an expression with side effects */
#define li_cton(c,n) \
  (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
  38
/* b (byte) MUST be unsigned integer type
 * https://en.wikipedia.org/wiki/UTF-8
 * reject overlong encodings of 7-bit ASCII and invalid UTF-8
 * (but does not detect other overlong multi-byte encodings)
 * evaluates to non-zero if b is 0xC0 or 0xC1 (the ((b)|0x1) == 0xC1 test)
 * or if b is 0xF5 or greater
 * note: b is evaluated twice */
#define li_utf8_invalid_byte(b) ((b) >= 0xF5 || ((b)|0x1) == 0xC1)
  44
  45
  46 static int burl_is_unreserved (const int c)
  47 {
  48     return (light_isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~');
  49 }
  50
  51
  52 static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
  53 {
  54     int j = i;
  55     const int used = (int)buffer_string_length(b);
  56     const unsigned char * const s = (unsigned char *)b->ptr;
  57     unsigned char * const p =
  58       (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
  59     unsigned int n1, n2;
  60     memcpy(p, s, (size_t)i);
  61     for (; i < used; ++i, ++j) {
  62         if (!encoded_chars_http_uri_reqd[s[i]]) {
  63             if (s[i] == '?' && -1 == qs) qs = j;
  64             p[j] = s[i];
  65         }
  66         else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
  67             const unsigned int x = (n1 << 4) | n2;
  68             if (burl_is_unreserved(x)) {
  69                 p[j] = x;
  70             }
  71             else {
  72                 p[j]   = '%';
  73                 p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
  74                 p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
  75                 if (li_utf8_invalid_byte(x)) qs = -2;
  76             }
  77             i+=2;
  78         }
  79         else if (s[i] == '#') break; /* ignore fragment */
  80         else {
  81             p[j]   = '%';
  82             p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
  83             p[++j] = hex_chars_uc[s[i] & 0xF];
  84             if (li_utf8_invalid_byte(s[i])) qs = -2;
  85         }
  86     }
  87     buffer_commit(t, (size_t)j);
  88     buffer_copy_buffer(b, t);
  89     return qs;
  90 }
  91
  92
  93 static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
  94 {
  95     const unsigned char * const s = (unsigned char *)b->ptr;
  96     const int used = (int)buffer_string_length(b);
  97     unsigned int n1, n2, x;
  98     int qs = -1;
  99
 100     for (int i = 0; i < used; ++i) {
 101         if (!encoded_chars_http_uri_reqd[s[i]]) {
 102             if (s[i] == '?' && -1 == qs) qs = i;
 103         }
 104         else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
 105                  && !burl_is_unreserved((x = (n1 << 4) | n2))) {
 106             if (li_utf8_invalid_byte(x)) qs = -2;
 107             if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
 108             if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
 109             i+=2;
 110         }
 111         else if (s[i] == '#') { /* ignore fragment */
 112             buffer_string_set_length(b, (size_t)i);
 113             break;
 114         }
 115         else {
 116             qs = burl_normalize_basic_unreserved_fix(b, t, i, qs);
 117             break;
 118         }
 119     }
 120
 121     return qs;
 122 }
 123
 124
 125 static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
 126 {
 127     int j = i;
 128     const int used = (int)buffer_string_length(b);
 129     const unsigned char * const s = (unsigned char *)b->ptr;
 130     unsigned char * const p =
 131       (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
 132     unsigned int n1, n2;
 133     memcpy(p, s, (size_t)i);
 134     for (; i < used; ++i, ++j) {
 135         if (!encoded_chars_http_uri_reqd[s[i]]) {
 136             if (s[i] == '?' && -1 == qs) qs = j;
 137             p[j] = s[i];
 138         }
 139         else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
 140             const unsigned int x = (n1 << 4) | n2;
 141             if (!encoded_chars_http_uri_reqd[x]
 142                 && (qs < 0 ? (x!='/'&&x!='?') : (x!='&'&&x!='='&&x!=';'))) {
 143                 p[j] = x;
 144             }
 145             else {
 146                 p[j]   = '%';
 147                 p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
 148                 p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
 149                 if (li_utf8_invalid_byte(x)) qs = -2;
 150             }
 151             i+=2;
 152         }
 153         else if (s[i] == '#') break; /* ignore fragment */
 154         else {
 155             p[j]   = '%';
 156             p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
 157             p[++j] = hex_chars_uc[s[i] & 0xF];
 158             if (li_utf8_invalid_byte(s[i])) qs = -2;
 159         }
 160     }
 161     buffer_commit(t, (size_t)j);
 162     buffer_copy_buffer(b, t);
 163     return qs;
 164 }
 165
 166
 167 static int burl_normalize_basic_required (buffer *b, buffer *t)
 168 {
 169     const unsigned char * const s = (unsigned char *)b->ptr;
 170     const int used = (int)buffer_string_length(b);
 171     unsigned int n1, n2, x;
 172     int qs = -1;
 173
 174     for (int i = 0; i < used; ++i) {
 175         if (!encoded_chars_http_uri_reqd[s[i]]) {
 176             if (s[i] == '?' && -1 == qs) qs = i;
 177         }
 178         else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
 179                  && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
 180                      ||(qs < 0 ? (x=='/'||x=='?') : (x=='&'||x=='='||x==';')))){
 181             if (li_utf8_invalid_byte(x)) qs = -2;
 182             if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
 183             if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
 184             i+=2;
 185         }
 186         else if (s[i] == '#') { /* ignore fragment */
 187             buffer_string_set_length(b, (size_t)i);
 188             break;
 189         }
 190         else {
 191             qs = burl_normalize_basic_required_fix(b, t, i, qs);
 192             break;
 193         }
 194     }
 195
 196     return qs;
 197 }
 198
 199
 200 static int burl_contains_ctrls (const buffer *b)
 201 {
 202     const char * const s = b->ptr;
 203     const int used = (int)buffer_string_length(b);
 204     for (int i = 0; i < used; ++i) {
 205         if (s[i] == '%' && (s[i+1] < '2' || (s[i+1] == '7' && s[i+2] == 'F')))
 206             return 1;
 207     }
 208     return 0;
 209 }
 210
 211
 212 static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
 213 {
 214     char * const s = b->ptr;
 215     const int used = (int)buffer_string_length(b);
 216     int j = i;
 217     for (; i < used; ++i, ++j) {
 218         s[j] = s[i];
 219         if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') {
 220             s[j] = '+';
 221             i+=2;
 222         }
 223     }
 224     buffer_string_set_length(b, j);
 225 }
 226
 227
 228 static void burl_normalize_qs20_to_plus (buffer *b, int qs)
 229 {
 230     const char * const s = b->ptr;
 231     const int used = qs < 0 ? 0 : (int)buffer_string_length(b);
 232     int i;
 233     if (qs < 0) return;
 234     for (i = qs+1; i < used; ++i) {
 235         if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') break;
 236     }
 237     if (i != used) burl_normalize_qs20_to_plus_fix(b, i);
 238 }
 239
 240
 241 static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
 242 {
 243     char * const s = b->ptr;
 244     const int blen = (int)buffer_string_length(b);
 245     const int used = qs < 0 ? blen : qs;
 246     int j = i;
 247     for (; i < used; ++i, ++j) {
 248         s[j] = s[i];
 249         if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
 250             s[j] = '/';
 251             i+=2;
 252         }
 253     }
 254     if (qs >= 0) {
 255         const int qslen = blen - qs;
 256         memmove(s+j, s+qs, (size_t)qslen);
 257         qs = j;
 258         j += qslen;
 259     }
 260     buffer_string_set_length(b, j);
 261     return qs;
 262 }
 263
 264
 265 static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
 266 {
 267     /*("%2F" must already have been uppercased during normalization)*/
 268     const char * const s = b->ptr;
 269     const int used = qs < 0 ? (int)buffer_string_length(b) : qs;
 270     for (int i = 0; i < used; ++i) {
 271         if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
 272             return (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
 273               ? burl_normalize_2F_to_slash_fix(b, qs, i)
 274               : -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
 275         }
 276     }
 277     return qs;
 278 }
 279
 280
 281 static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
 282 {
 283     const unsigned char * const s = (unsigned char *)b->ptr;
 284     const int used = (int)buffer_string_length(b);
 285     int path_simplify = 0;
 286     for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
 287         if (s[i] == '.' && (s[i+1] != '.' || ++i)
 288             && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) {
 289             path_simplify = 1;
 290             break;
 291         }
 292         while (i < len && s[i] != '/') ++i;
 293         if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
 294             path_simplify = 1;
 295             break;
 296         }
 297     }
 298
 299     if (path_simplify) {
 300         if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
 301         if (qs >= 0) {
 302             buffer_copy_string_len(t, b->ptr+qs, used - qs);
 303             buffer_string_set_length(b, qs);
 304         }
 305
 306         buffer_path_simplify(b, b);
 307
 308         if (qs >= 0) {
 309             qs = (int)buffer_string_length(b);
 310             buffer_append_string_len(b, CONST_BUF_LEN(t));
 311         }
 312     }
 313
 314     return qs;
 315 }
 316
 317
 318 int burl_normalize (buffer *b, buffer *t, int flags)
 319 {
 320     int qs;
 321
 322   #if defined(__WIN32) || defined(__CYGWIN__)
 323     /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
 324      * convert to '/' for consistency before percent-encoding
 325      * normalization which will convert '\\' to "%5C" in the URL.
 326      * (Clients still should not be sending '\\' unencoded in requests.) */
 327     if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
 328         for (char *p = b->ptr; *p != '?' && *p != '\0'; ++p) {
 329             if (*p == '\\') *p = '/';
 330         }
 331     }
 332   #endif
 333
 334     qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
 335       ? burl_normalize_basic_required(b, t)
 336       : burl_normalize_basic_unreserved(b, t);
 337     if (-2 == qs) return -2;
 338
 339     if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) {
 340         if (burl_contains_ctrls(b)) return -2;
 341     }
 342
 343     if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
 344                 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
 345         qs = burl_normalize_2F_to_slash(b, qs, flags);
 346         if (-2 == qs) return -2;
 347     }
 348
 349     if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
 350                 |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
 351         qs = burl_normalize_path(b, t, qs, flags);
 352         if (-2 == qs) return -2;
 353     }
 354
 355     if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) {
 356         if (qs >= 0) burl_normalize_qs20_to_plus(b, qs);
 357     }
 358
 359     return qs;
 360 }
 361
 362
 363 static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
 364 {
 365     /* percent-encodes everything except unreserved  - . 0-9 A-Z _ a-z ~
 366      * unless already percent-encoded (does not double-encode) */
 367     /* Note: not checking for invalid UTF-8 */
 368     char * const p = buffer_string_prepare_append(b, len*3);
 369     unsigned int n1, n2;
 370     int j = 0;
 371     for (unsigned int i = 0; i < len; ++i, ++j) {
 372         if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
 373             const unsigned int x = (n1 << 4) | n2;
 374             if (burl_is_unreserved((int)x)) {
 375                 p[j] = (char)x;
 376             }
 377             else { /* leave UTF-8, control chars, and required chars encoded */
 378                 p[j]   = '%';
 379                 p[++j] = str[i+1];
 380                 p[++j] = str[i+2];
 381             }
 382             i+=2;
 383         }
 384         else if (burl_is_unreserved(str[i])) {
 385             p[j] = str[i];
 386         }
 387         else {
 388             p[j]   = '%';
 389             p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
 390             p[++j] = hex_chars_uc[str[i] & 0xF];
 391         }
 392     }
 393     buffer_commit(b, j);
 394 }
 395
 396
 397 static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
 398 {
 399     /* percent-encodes everything except unreserved  - . 0-9 A-Z _ a-z ~ plus /
 400      * unless already percent-encoded (does not double-encode) */
 401     /* Note: not checking for invalid UTF-8 */
 402     char * const p = buffer_string_prepare_append(b, len*3);
 403     unsigned int n1, n2;
 404     int j = 0;
 405     for (unsigned int i = 0; i < len; ++i, ++j) {
 406         if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
 407             const unsigned int x = (n1 << 4) | n2;
 408             if (burl_is_unreserved((int)x)) {
 409                 p[j] = (char)x;
 410             }
 411             else { /* leave UTF-8, control chars, and required chars encoded */
 412                 p[j]   = '%';
 413                 p[++j] = str[i+1];
 414                 p[++j] = str[i+2];
 415             }
 416             i+=2;
 417         }
 418         else if (burl_is_unreserved(str[i]) || str[i] == '/') {
 419             p[j] = str[i];
 420         }
 421         else {
 422             p[j]   = '%';
 423             p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
 424             p[++j] = hex_chars_uc[str[i] & 0xF];
 425         }
 426     }
 427     buffer_commit(b, j);
 428 }
 429
 430
 431 static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
 432 {
 433     /* percent-encodes everything except unreserved  - . 0-9 A-Z _ a-z ~
 434      * Note: double-encodes any existing '%') */
 435     /* Note: not checking for invalid UTF-8 */
 436     char * const p = buffer_string_prepare_append(b, len*3);
 437     int j = 0;
 438     for (unsigned int i = 0; i < len; ++i, ++j) {
 439         if (burl_is_unreserved(str[i])) {
 440             p[j] = str[i];
 441         }
 442         else {
 443             p[j]   = '%';
 444             p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
 445             p[++j] = hex_chars_uc[str[i] & 0xF];
 446         }
 447     }
 448     buffer_commit(b, j);
 449 }
 450
 451
 452 static void burl_offset_tolower (buffer * const b, const size_t off)
 453 {
 454     /*(skips over all percent-encodings, including encoding of alpha chars)*/
 455     for (char *p = b->ptr+off; p[0]; ++p) {
 456         if (p[0] >= 'A' && p[0] <= 'Z') p[0] |= 0x20;
 457         else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
 458             p+=2;
 459     }
 460 }
 461
 462
 463 static void burl_offset_toupper (buffer * const b, const size_t off)
 464 {
 465     /*(skips over all percent-encodings, including encoding of alpha chars)*/
 466     for (char *p = b->ptr+off; p[0]; ++p) {
 467         if (p[0] >= 'a' && p[0] <= 'z') p[0] &= 0xdf;
 468         else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
 469             p+=2;
 470     }
 471 }
 472
 473
 474 void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
 475 {
 476     size_t off = 0;
 477
 478     if (0 == len) return;
 479
 480     if (0 == flags) {
 481         buffer_append_string_len(b, str, len);
 482         return;
 483     }
 484
 485     if (flags & (BURL_TOUPPER|BURL_TOLOWER)) off = buffer_string_length(b);
 486
 487     if (flags & BURL_ENCODE_NONE) {
 488         buffer_append_string_len(b, str, len);
 489     }
 490     else if (flags & BURL_ENCODE_ALL) {
 491         burl_append_encode_all(b, str, len);
 492     }
 493     else if (flags & BURL_ENCODE_NDE) {
 494         burl_append_encode_nde(b, str, len);
 495     }
 496     else if (flags & BURL_ENCODE_PSNDE) {
 497         burl_append_encode_psnde(b, str, len);
 498     }
 499     else if (flags & BURL_ENCODE_B64U) {
 500         const unsigned char *s = (const unsigned char *)str;
 501         buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
 502     }
 503     else if (flags & BURL_DECODE_B64U) {
 504         buffer_append_base64_decode(b, str, len, BASE64_URL);
 505     }
 506
 507     /* note: not normalizing str, which could come from arbitrary header,
 508      * so it is possible that alpha chars are percent-encoded upper/lowercase */
 509     if (flags & (BURL_TOLOWER|BURL_TOUPPER)) {
 510         (flags & BURL_TOLOWER)
 511           ? burl_offset_tolower(b, off)  /*(flags & BURL_TOLOWER)*/
 512           : burl_offset_toupper(b, off); /*(flags & BURL_TOUPPER)*/
 513     }
 514 }