xapian-applications/omega/urldecode.h

/** @file urldecode.h
   2  * @brief URL decoding as described by RFC3986.
   3  */
   4 /* Copyright (C) 2011,2012,2015 Olly Betts
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #ifndef OMEGA_INCLUDED_URLDECODE_H
  26 #define OMEGA_INCLUDED_URLDECODE_H
  27
  28 #include <algorithm>
  29 #include <cstdio>
  30 #include <cstring>
  31 #include <string>
  32 #include "stringutils.h"
  33
  34 struct CGIParameterHandler {
  35     void operator()(const std::string&, const std::string&) const;
  36 };
  37
  38 template<typename I>
  39 inline void
  40 url_decode(const CGIParameterHandler & handle_parameter, I begin, I end)
  41 {
  42     bool seen_equals = false;
  43     std::string var, val;
  44     while (begin != end) {
  45         unsigned char ch = *begin;
  46         ++begin;
  47 process_ch:
  48         if (ch == '&') {
  49             if (!seen_equals)
  50                 swap(var, val);
  51             if (!var.empty())
  52                 handle_parameter(var, val);
  53             var.resize(0);
  54             val.resize(0);
  55             seen_equals = false;
  56             continue;
  57         }
  58
  59         switch (ch) {
  60             case '%': {
  61                 if (begin == end)
  62                     break;
  63                 unsigned char hex1 = *begin;
  64                 ++begin;
  65                 if (begin == end || !C_isxdigit(hex1)) {
  66                     val += ch;
  67                     ch = hex1;
  68                     if (begin == end)
  69                         break;
  70                     goto process_ch;
  71                 }
  72                 unsigned char newch = hex_digit(hex1);
  73                 unsigned char hex2 = *begin;
  74                 ++begin;
  75                 if (!C_isxdigit(hex2)) {
  76                     val += ch;
  77                     val += hex1;
  78                     ch = hex2;
  79                     if (begin == end)
  80                         break;
  81                     goto process_ch;
  82                 }
  83                 ch = (newch << 4) | hex_digit(hex2);
  84                 break;
  85             }
  86             case '+':
  87                 ch = ' ';
  88                 break;
  89             case '=':
  90                 if (seen_equals)
  91                     break;
  92                 seen_equals = true;
  93                 swap(var, val);
  94                 continue;
  95         }
  96         val += ch;
  97     }
  98     if (!seen_equals)
  99         swap(var, val);
 100     if (!var.empty())
 101         handle_parameter(var, val);
 102 }
 103
 104 class CStringItor {
 105     const char * p;
 106
 107     void operator++(int);
 108
 109   public:
 110     CStringItor() : p(NULL) { }
 111
 112     explicit CStringItor(const char * p_) : p(p_) {
 113         if (!*p) p = NULL;
 114     }
 115
 116     unsigned char operator*() const { return *p; }
 117
 118     CStringItor & operator++() {
 119         if (!*++p) p = NULL;
 120         return *this;
 121     }
 122
 123     friend bool operator==(const CStringItor& a, const CStringItor& b);
 124     friend bool operator!=(const CStringItor& a, const CStringItor& b);
 125 };
 126
 127 inline bool
 128 operator==(const CStringItor& a, const CStringItor& b)
 129 {
 130     return a.p == b.p;
 131 }
 132
 133 inline bool
 134 operator!=(const CStringItor& a, const CStringItor& b)
 135 {
 136     return !(a == b);
 137 }
 138
 139 class StdinItor {
 140     size_t count;
 141
 142     mutable int current;
 143
 144     void operator++(int);
 145
 146   public:
 147     StdinItor() : current(EOF) { }
 148
 149     explicit StdinItor(size_t count_) : count(count_), current(256) { }
 150
 151     unsigned char operator*() const {
 152         if (current == 256)
 153             current = std::getchar();
 154         return current;
 155     }
 156
 157     StdinItor & operator++() {
 158         if (count--)
 159             current = std::getchar();
 160         else
 161             current = EOF;
 162         return *this;
 163     }
 164
 165     friend bool operator==(const StdinItor& a, const StdinItor& b);
 166     friend bool operator!=(const StdinItor& a, const StdinItor& b);
 167 };
 168
 169 inline bool
 170 operator==(const StdinItor& a, const StdinItor& b)
 171 {
 172     return a.current == b.current;
 173 }
 174
 175 inline bool
 176 operator!=(const StdinItor& a, const StdinItor& b)
 177 {
 178     return !(a == b);
 179 }
 180
// First group is RFC3986 reserved "gen-delims", except []@: (which are safe
// to decode if they occur after the "authority").
 183 //
 184 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
 185 // actually safe to decode in practice) and &+= (which are OK to decode if they
 186 // aren't in the "query" part).
 187 //
 188 // We also need to leave an encoded "%" alone.  We should probably leave an
 189 // encoded "/" alone too (though we shouldn't encounter one in a database
 190 // created by omindex, unless it was in the base URL specified by the user).
 191 //
 192 // This prettifying is aimed at URLs produced by omindex, so we don't currently
 193 // try to decode the query or fragment parts of the URL at all.  We can probably
 194 // safely decode the query in a similar way, but also leaving &+= alone.
 195
 196 enum {
 197     // Always unsafe.
 198     UNSAFE,
 199     // Always safe.
 200     OK,
 201     // Always safe (and 8, 9, a, b, A or B).
 202     OK89AB,
 203     // Safe after a '/'.
 204     INPATH,
 205     // Start of a 2 byte UTF-8 sequence.
 206     SEQ2,
 207     // Start of a 3 byte UTF-8 sequence.
 208     SEQ3,
 209     // Start of a 4 byte UTF-8 sequence.
 210     SEQ4
 211 };
 212
// Lookup table indexed by byte value (0-255) giving the classification
// (UNSAFE, OK, OK89AB, INPATH, SEQ2, SEQ3 or SEQ4) which url_prettify() uses
// to decide whether a %-escape encoding that byte may be decoded.  The
// comment above each row names the eight byte values the row covers.
static const char url_chars[256] = {
    // 0x00-0x07
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x08-0x0f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x10-0x17
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x18-0x1f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // ' '      !       "       #       $       %       &       '
    OK,         OK,     OK,     UNSAFE, OK,     UNSAFE, OK,     OK,
    // (        )       *       +       ,       -       .       /
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0        1       2       3       4       5       6       7
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // 8        9       :       ;       <       =       >       ?
    OK89AB,     OK89AB, INPATH, OK,     OK,     OK,     OK,     UNSAFE,
    // @        A       B       C       D       E       F       G
    INPATH,     OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // H        I       J       K       L       M       N       O
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // P        Q       R       S       T       U       V       W
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // X        Y       Z       [       \       ]       ^       _
    OK,         OK,     OK,     INPATH, OK,     INPATH, OK,     OK,
    // `        a       b       c       d       e       f       g
    OK,         OK89AB, OK89AB, OK,     OK,     OK,     OK,     OK,
    // h        i       j       k       l       m       n       o
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // p        q       r       s       t       u       v       w
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     OK,
    // x        y       z       {       |       }       ~       0x7f
    OK,         OK,     OK,     OK,     OK,     OK,     OK,     UNSAFE,
    // 0x80     0x81    0x82    0x83    0x84    0x85    0x86    0x87
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x88     0x89    0x8a    0x8b    0x8c    0x8d    0x8e    0x8f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x90     0x91    0x92    0x93    0x94    0x95    0x96    0x97
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x98     0x99    0x9a    0x9b    0x9c    0x9d    0x9e    0x9f
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa0     0xa1    0xa2    0xa3    0xa4    0xa5    0xa6    0xa7
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa8     0xa9    0xaa    0xab    0xac    0xad    0xae    0xaf
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb0     0xb1    0xb2    0xb3    0xb4    0xb5    0xb6    0xb7
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb8     0xb9    0xba    0xbb    0xbc    0xbd    0xbe    0xbf
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xc0     0xc1    0xc2    0xc3    0xc4    0xc5    0xc6    0xc7
    UNSAFE,     UNSAFE, SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xc8     0xc9    0xca    0xcb    0xcc    0xcd    0xce    0xcf
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd0     0xd1    0xd2    0xd3    0xd4    0xd5    0xd6    0xd7
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xd8     0xd9    0xda    0xdb    0xdc    0xdd    0xde    0xdf
    SEQ2,       SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,   SEQ2,
    // 0xe0     0xe1    0xe2    0xe3    0xe4    0xe5    0xe6    0xe7
    SEQ3,       SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xe8     0xe9    0xea    0xeb    0xec    0xed    0xee    0xef
    SEQ3,       SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,   SEQ3,
    // 0xf0     0xf1    0xf2    0xf3    0xf4    0xf5    0xf6    0xf7
    SEQ4,       SEQ4,   SEQ4,   SEQ4,   SEQ4,   UNSAFE, UNSAFE, UNSAFE,
    // 0xf8     0xf9    0xfa    0xfb    0xfc    0xfd    0xfe    0xff
    UNSAFE,     UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE
};
 279
 280 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
 281 // and a hex digit.
 282 inline bool
 283 encoded_ucont(const std::string & s, size_t i)
 284 {
 285     return s[i] == '%' &&
 286         url_chars[static_cast<unsigned char>(s[i + 1])] == OK89AB &&
 287         C_isxdigit(s[i + 2]);
 288 }
 289
 290 /** Prettify a URL.
 291  *
 292  *  Undo RFC3986 escaping which doesn't affect semantics in practice, to make
 293  *  a prettier version of a URL to show the user, but which should still work
 294  *  if copied and pasted.
 295  */
 296 inline void
 297 url_prettify(std::string & url)
 298 {
 299     size_t pcent = url.find('%');
 300     // Fast path for URLs without a '%' in.
 301     if (pcent == std::string::npos)
 302         return;
 303
 304     if (url.size() < 3)
 305         return;
 306
 307     // Don't try to decode the query or fragment, and don't try to decode if
 308     // there aren't 2 characters after the '%'.
 309     size_t pretty_limit = std::min(url.find_first_of("?#"), url.size() - 2);
 310     if (pcent >= pretty_limit)
 311         return;
 312
 313     size_t slash = std::string::npos;
 314     size_t start = 0;
 315     std::string in;
 316     swap(in, url);
 317     url.reserve(in.size());
 318     while (true) {
 319         // We've checked there are at least two bytes after the '%' already.
 320         if (C_isxdigit(in[pcent + 1]) && C_isxdigit(in[pcent + 2])) {
 321             int ch = (hex_digit(in[pcent + 1]) << 4);
 322             ch |= hex_digit(in[pcent + 2]);
 323             bool safe = true;
 324             switch (url_chars[ch]) {
 325                 case UNSAFE:
 326                     safe = false;
 327                     break;
 328                 case SEQ2:
 329                     if (in.size() - (pcent + 2) < 3 ||
 330                         !encoded_ucont(in, pcent + 3)) {
 331                         safe = false;
 332                         break;
 333                     }
 334                     url.append(in, start, pcent - start);
 335                     url += char(ch);
 336                     pcent += 3;
 337                     ch = (hex_digit(in[pcent + 1]) << 4);
 338                     ch |= hex_digit(in[pcent + 2]);
 339                     start = pcent;
 340                     break;
 341                 case SEQ3:
 342                     if (in.size() - (pcent + 2) < 3 * 2 ||
 343                         !encoded_ucont(in, pcent + 3) ||
 344                         !encoded_ucont(in, pcent + 6) ||
 345                         (ch == 0xe0 && in[pcent + 4] <= '9')) {
 346                         safe = false;
 347                         break;
 348                     }
 349                     url.append(in, start, pcent - start);
 350                     url += char(ch);
 351                     pcent += 3;
 352                     ch = (hex_digit(in[pcent + 1]) << 4);
 353                     ch |= hex_digit(in[pcent + 2]);
 354                     url += char(ch);
 355                     pcent += 3;
 356                     ch = (hex_digit(in[pcent + 1]) << 4);
 357                     ch |= hex_digit(in[pcent + 2]);
 358                     start = pcent;
 359                     break;
 360                 case SEQ4:
 361                     if (in.size() - (pcent + 2) < 3 * 3 ||
 362                         !encoded_ucont(in, pcent + 3) ||
 363                         !encoded_ucont(in, pcent + 6) ||
 364                         !encoded_ucont(in, pcent + 9) ||
 365                         (ch == 0xf0 && in[pcent + 4] == '8') ||
 366                         (ch == 0xf4 && in[pcent + 4] >= '9')) {
 367                         safe = false;
 368                         break;
 369                     }
 370                     url.append(in, start, pcent - start);
 371                     url += char(ch);
 372                     pcent += 3;
 373                     ch = (hex_digit(in[pcent + 1]) << 4);
 374                     ch |= hex_digit(in[pcent + 2]);
 375                     url += char(ch);
 376                     pcent += 3;
 377                     ch = (hex_digit(in[pcent + 1]) << 4);
 378                     ch |= hex_digit(in[pcent + 2]);
 379                     url += char(ch);
 380                     pcent += 3;
 381                     ch = (hex_digit(in[pcent + 1]) << 4);
 382                     ch |= hex_digit(in[pcent + 2]);
 383                     start = pcent;
 384                     break;
 385                 case INPATH:
 386                     // ':' is safe to decode if there is a single '/' earlier in
 387                     // the URL.
 388                     if (slash == std::string::npos) {
 389                         // Lazily set slash to the position of the first single '/'.
 390                         const char * d = in.data();
 391                         slash = 0;
 392                         while (true) {
 393                             const void* s = std::memchr(d + slash, '/',
 394                                                         pretty_limit - slash);
 395                             if (s == NULL) {
 396                                 slash = in.size();
 397                                 break;
 398                             }
 399                             slash = reinterpret_cast<const char *>(s) - d;
 400                             if (slash == in.size() - 1 || d[slash + 1] != '/')
 401                                 break;
 402                             ++slash;
 403                             while (++slash < in.size() - 1 && d[slash] == '/') { }
 404                         }
 405                     }
 406                     safe = (pcent > slash);
 407                     break;
 408             }
 409
 410             if (safe) {
 411                 url.append(in, start, pcent - start);
 412                 url += char(ch);
 413                 pcent += 3;
 414                 start = pcent;
 415             } else {
 416                 pcent += 3;
 417             }
 418         } else {
 419             ++pcent;
 420         }
 421         pcent = in.find('%', pcent);
 422
 423         if (pcent >= pretty_limit) {
 424             url.append(in, start, std::string::npos);
 425             return;
 426         }
 427     }
 428 }
 429
 430 #endif // OMEGA_INCLUDED_URLDECODE_H