Eliminate dead assignment
[xapian.git] / xapian-applications / omega / urldecode.h
bloba992e1ab31528444f8ec5eaa729d8bf5547d66d8
1 /* @file urldecode.h
2 * @brief URL decoding as described by RFC3986.
3 */
4 /* Copyright (C) 2011,2012,2015 Olly Betts
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
25 #ifndef OMEGA_INCLUDED_URLDECODE_H
26 #define OMEGA_INCLUDED_URLDECODE_H
28 #include <algorithm>
29 #include <cstdio>
30 #include <cstring>
31 #include <string>
32 #include "stringutils.h"
/** Callback functor which receives each decoded CGI parameter.
 *
 *  url_decode() invokes this once for every parameter with a non-empty name.
 *  Only the declaration lives here; the definition is provided elsewhere.
 */
struct CGIParameterHandler {
    /** Handle one decoded parameter.
     *
     *  @param var  The decoded parameter name.
     *  @param val  The decoded parameter value (may be empty).
     */
    void operator()(const std::string& var, const std::string& val) const;
};
38 template<typename I>
39 inline void
40 url_decode(const CGIParameterHandler & handle_parameter, I begin, I end)
42 bool seen_equals = false;
43 std::string var, val;
44 while (begin != end) {
45 unsigned char ch = *begin;
46 ++begin;
47 process_ch:
48 if (ch == '&') {
49 if (!seen_equals)
50 swap(var, val);
51 if (!var.empty())
52 handle_parameter(var, val);
53 var.resize(0);
54 val.resize(0);
55 seen_equals = false;
56 continue;
59 switch (ch) {
60 case '%': {
61 if (begin == end)
62 break;
63 unsigned char hex1 = *begin;
64 ++begin;
65 if (begin == end || !C_isxdigit(hex1)) {
66 val += ch;
67 ch = hex1;
68 if (begin == end)
69 break;
70 goto process_ch;
72 unsigned char newch = hex_digit(hex1);
73 unsigned char hex2 = *begin;
74 ++begin;
75 if (!C_isxdigit(hex2)) {
76 val += ch;
77 val += hex1;
78 ch = hex2;
79 if (begin == end)
80 break;
81 goto process_ch;
83 ch = (newch << 4) | hex_digit(hex2);
84 break;
86 case '+':
87 ch = ' ';
88 break;
89 case '=':
90 if (seen_equals)
91 break;
92 seen_equals = true;
93 swap(var, val);
94 continue;
96 val += ch;
98 if (!seen_equals)
99 swap(var, val);
100 if (!var.empty())
101 handle_parameter(var, val);
/** Minimal input iterator over a nul-terminated C string.
 *
 *  The end of the string is represented by a NULL internal pointer, so a
 *  default-constructed CStringItor serves as the "end" iterator for any
 *  string.
 */
class CStringItor {
    /// Current position, or NULL once the terminating nul has been reached.
    const char * p;

    /// Post-increment is deliberately not provided.
    void operator++(int);

  public:
    /// Construct an "end" iterator.
    CStringItor() : p(NULL) { }

    /** Construct an iterator positioned at the start of a C string.
     *
     *  @param p_  The string to iterate over.  An empty string (or a NULL
     *             pointer) gives an iterator which is already at the end.
     */
    explicit CStringItor(const char * p_) : p(p_) {
        if (p && !*p) p = NULL;
    }

    /// Return the current byte.  Must not be called on an "end" iterator.
    unsigned char operator*() const { return *p; }

    /// Advance to the next byte, becoming "end" on reaching the nul.
    CStringItor & operator++() {
        if (!*++p) p = NULL;
        return *this;
    }

    friend bool operator==(const CStringItor& a, const CStringItor& b);
    friend bool operator!=(const CStringItor& a, const CStringItor& b);
};

/// Two iterators are equal if they point at the same position.
inline bool
operator==(const CStringItor& a, const CStringItor& b)
{
    return a.p == b.p;
}

inline bool
operator!=(const CStringItor& a, const CStringItor& b)
{
    return !(a == b);
}
/** Minimal input iterator which reads a limited number of bytes from stdin.
 *
 *  A default-constructed StdinItor serves as the "end" iterator.  An iterator
 *  constructed with a byte count reaches the end once that many bytes have
 *  been consumed, or earlier if stdin hits end-of-file.
 *
 *  Bytes are read lazily: a byte is only fetched from stdin when the iterator
 *  is dereferenced, compared or advanced, so no more than the requested
 *  number of bytes are ever read.
 */
class StdinItor {
    /// Marker stored in current when the next byte hasn't been read yet.
    enum { PENDING = 256 };

    /// Number of bytes this iterator may still yield (including the current
    /// one).
    size_t count;

    /// The current byte value, EOF once at the end, or PENDING.
    mutable int current;

    /// Post-increment is deliberately not provided.
    void operator++(int);

    /// Make sure the current byte has been read, and return it (or EOF).
    int peek() const {
        if (current == PENDING)
            current = std::getchar();
        return current;
    }

  public:
    /// Construct an "end" iterator.
    StdinItor() : count(0), current(EOF) { }

    /** Construct an iterator which yields up to @a count_ bytes from stdin.
     *
     *  @param count_  Maximum number of bytes to read.  If this is zero the
     *                 iterator is already at the end and stdin isn't touched.
     */
    explicit StdinItor(size_t count_)
        : count(count_), current(count_ ? int(PENDING) : EOF) { }

    /// Return the current byte.
    unsigned char operator*() const {
        return peek();
    }

    /// Consume the current byte and move on to the next one (if any).
    StdinItor & operator++() {
        // If the current byte was never looked at it still has to be
        // consumed from stdin.
        peek();
        if (count > 1 && current != EOF) {
            --count;
            current = PENDING;
        } else {
            // Either the byte budget is used up or stdin ran dry.
            count = 0;
            current = EOF;
        }
        return *this;
    }

    friend bool operator==(const StdinItor& a, const StdinItor& b);
    friend bool operator!=(const StdinItor& a, const StdinItor& b);
};

/** Compare the current bytes of two iterators.
 *
 *  This is intended for comparing against the "end" iterator.  The pending
 *  byte (if any) is read first so that end-of-file on stdin is noticed before
 *  the caller dereferences the iterator.
 */
inline bool
operator==(const StdinItor& a, const StdinItor& b)
{
    return a.peek() == b.peek();
}

inline bool
operator!=(const StdinItor& a, const StdinItor& b)
{
    return !(a == b);
}
// First group is RFC3986 reserved "gen-delims", except []@: (which are safe
// to decode if they occur after the "authority").
184 // Second group is RFC3986 reserved "sub-delims", except !$'()*,; (which are
185 // actually safe to decode in practice) and &+= (which are OK to decode if they
186 // aren't in the "query" part).
188 // We also need to leave an encoded "%" alone. We should probably leave an
189 // encoded "/" alone too (though we shouldn't encounter one in a database
190 // created by omindex, unless it was in the base URL specified by the user).
192 // This prettifying is aimed at URLs produced by omindex, so we don't currently
193 // try to decode the query or fragment parts of the URL at all. We can probably
194 // safely decode the query in a similar way, but also leaving &+= alone.
/// Classification of a byte value for the purposes of url_prettify().
enum {
    // Always unsafe.
    UNSAFE,
    // Always safe.
    OK,
    // Always safe (and 8, 9, a, b, A or B).
    OK89AB,
    // Safe after a '/'.
    INPATH,
    // Start of a 2 byte UTF-8 sequence.
    SEQ2,
    // Start of a 3 byte UTF-8 sequence.
    SEQ3,
    // Start of a 4 byte UTF-8 sequence.
    SEQ4
};

/** Lookup table giving the classification of each possible byte value.
 *
 *  Indexed by the byte which a "%XX" escape decodes to; each entry is one of
 *  the enumerators above.
 */
static const char url_chars[256] = {
    // 0x00-0x07
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x08-0x0f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x10-0x17
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x18-0x1f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // ' ' ! " # $ % & '
    OK, OK, OK, UNSAFE, OK, UNSAFE, OK, OK,
    // ( ) * + , - . /
    OK, OK, OK, OK, OK, OK, OK, UNSAFE,
    // 0 1 2 3 4 5 6 7
    OK, OK, OK, OK, OK, OK, OK, OK,
    // 8 9 : ; < = > ?
    OK89AB, OK89AB, INPATH, OK, OK, OK, OK, UNSAFE,
    // @ A B C D E F G
    INPATH, OK89AB, OK89AB, OK, OK, OK, OK, OK,
    // H I J K L M N O
    OK, OK, OK, OK, OK, OK, OK, OK,
    // P Q R S T U V W
    OK, OK, OK, OK, OK, OK, OK, OK,
    // X Y Z [ \ ] ^ _
    OK, OK, OK, INPATH, OK, INPATH, OK, OK,
    // ` a b c d e f g
    OK, OK89AB, OK89AB, OK, OK, OK, OK, OK,
    // h i j k l m n o
    OK, OK, OK, OK, OK, OK, OK, OK,
    // p q r s t u v w
    OK, OK, OK, OK, OK, OK, OK, OK,
    // x y z { | } ~ 0x7f
    OK, OK, OK, OK, OK, OK, OK, UNSAFE,
    // 0x80 0x81 0x82 0x83 0x84 0x85 0x86 0x87
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x88 0x89 0x8a 0x8b 0x8c 0x8d 0x8e 0x8f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x90 0x91 0x92 0x93 0x94 0x95 0x96 0x97
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0x98 0x99 0x9a 0x9b 0x9c 0x9d 0x9e 0x9f
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa0 0xa1 0xa2 0xa3 0xa4 0xa5 0xa6 0xa7
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xa8 0xa9 0xaa 0xab 0xac 0xad 0xae 0xaf
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb0 0xb1 0xb2 0xb3 0xb4 0xb5 0xb6 0xb7
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xb8 0xb9 0xba 0xbb 0xbc 0xbd 0xbe 0xbf
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE,
    // 0xc0 0xc1 0xc2 0xc3 0xc4 0xc5 0xc6 0xc7
    UNSAFE, UNSAFE, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2,
    // 0xc8 0xc9 0xca 0xcb 0xcc 0xcd 0xce 0xcf
    SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2,
    // 0xd0 0xd1 0xd2 0xd3 0xd4 0xd5 0xd6 0xd7
    SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2,
    // 0xd8 0xd9 0xda 0xdb 0xdc 0xdd 0xde 0xdf
    SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2, SEQ2,
    // 0xe0 0xe1 0xe2 0xe3 0xe4 0xe5 0xe6 0xe7
    SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3,
    // 0xe8 0xe9 0xea 0xeb 0xec 0xed 0xee 0xef
    SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3, SEQ3,
    // 0xf0 0xf1 0xf2 0xf3 0xf4 0xf5 0xf6 0xf7
    SEQ4, SEQ4, SEQ4, SEQ4, SEQ4, UNSAFE, UNSAFE, UNSAFE,
    // 0xf8 0xf9 0xfa 0xfb 0xfc 0xfd 0xfe 0xff
    UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE, UNSAFE
};
280 // Test if the 3 characters of s from offset i are '%', one of [89abAB]
281 // and a hex digit.
282 inline bool
283 encoded_ucont(const std::string & s, size_t i)
285 return s[i] == '%' &&
286 url_chars[static_cast<unsigned char>(s[i + 1])] == OK89AB &&
287 C_isxdigit(s[i + 2]);
290 /** Prettify a URL.
292 * Undo RFC3986 escaping which doesn't affect semantics in practice, to make
293 * a prettier version of a URL to show the user, but which should still work
294 * if copied and pasted.
296 inline void
297 url_prettify(std::string & url)
299 size_t pcent = url.find('%');
300 // Fast path for URLs without a '%' in.
301 if (pcent == std::string::npos)
302 return;
304 if (url.size() < 3)
305 return;
307 // Don't try to decode the query or fragment, and don't try to decode if
308 // there aren't 2 characters after the '%'.
309 size_t pretty_limit = std::min(url.find_first_of("?#"), url.size() - 2);
310 if (pcent >= pretty_limit)
311 return;
313 size_t slash = std::string::npos;
314 size_t start = 0;
315 std::string in;
316 swap(in, url);
317 url.reserve(in.size());
318 while (true) {
319 // We've checked there are at least two bytes after the '%' already.
320 if (C_isxdigit(in[pcent + 1]) && C_isxdigit(in[pcent + 2])) {
321 int ch = (hex_digit(in[pcent + 1]) << 4);
322 ch |= hex_digit(in[pcent + 2]);
323 bool safe = true;
324 switch (url_chars[ch]) {
325 case UNSAFE:
326 safe = false;
327 break;
328 case SEQ2:
329 if (in.size() - (pcent + 2) < 3 ||
330 !encoded_ucont(in, pcent + 3)) {
331 safe = false;
332 break;
334 url.append(in, start, pcent - start);
335 url += char(ch);
336 pcent += 3;
337 ch = (hex_digit(in[pcent + 1]) << 4);
338 ch |= hex_digit(in[pcent + 2]);
339 start = pcent;
340 break;
341 case SEQ3:
342 if (in.size() - (pcent + 2) < 3 * 2 ||
343 !encoded_ucont(in, pcent + 3) ||
344 !encoded_ucont(in, pcent + 6) ||
345 (ch == 0xe0 && in[pcent + 4] <= '9')) {
346 safe = false;
347 break;
349 url.append(in, start, pcent - start);
350 url += char(ch);
351 pcent += 3;
352 ch = (hex_digit(in[pcent + 1]) << 4);
353 ch |= hex_digit(in[pcent + 2]);
354 url += char(ch);
355 pcent += 3;
356 ch = (hex_digit(in[pcent + 1]) << 4);
357 ch |= hex_digit(in[pcent + 2]);
358 start = pcent;
359 break;
360 case SEQ4:
361 if (in.size() - (pcent + 2) < 3 * 3 ||
362 !encoded_ucont(in, pcent + 3) ||
363 !encoded_ucont(in, pcent + 6) ||
364 !encoded_ucont(in, pcent + 9) ||
365 (ch == 0xf0 && in[pcent + 4] == '8') ||
366 (ch == 0xf4 && in[pcent + 4] >= '9')) {
367 safe = false;
368 break;
370 url.append(in, start, pcent - start);
371 url += char(ch);
372 pcent += 3;
373 ch = (hex_digit(in[pcent + 1]) << 4);
374 ch |= hex_digit(in[pcent + 2]);
375 url += char(ch);
376 pcent += 3;
377 ch = (hex_digit(in[pcent + 1]) << 4);
378 ch |= hex_digit(in[pcent + 2]);
379 url += char(ch);
380 pcent += 3;
381 ch = (hex_digit(in[pcent + 1]) << 4);
382 ch |= hex_digit(in[pcent + 2]);
383 start = pcent;
384 break;
385 case INPATH:
386 // ':' is safe to decode if there is a single '/' earlier in
387 // the URL.
388 if (slash == std::string::npos) {
389 // Lazily set slash to the position of the first single '/'.
390 const char * d = in.data();
391 slash = 0;
392 while (true) {
393 const void* s = std::memchr(d + slash, '/',
394 pretty_limit - slash);
395 if (s == NULL) {
396 slash = in.size();
397 break;
399 slash = reinterpret_cast<const char *>(s) - d;
400 if (slash == in.size() - 1 || d[slash + 1] != '/')
401 break;
402 ++slash;
403 while (++slash < in.size() - 1 && d[slash] == '/') { }
406 safe = (pcent > slash);
407 break;
410 if (safe) {
411 url.append(in, start, pcent - start);
412 url += char(ch);
413 pcent += 3;
414 start = pcent;
415 } else {
416 pcent += 3;
418 } else {
419 ++pcent;
421 pcent = in.find('%', pcent);
423 if (pcent >= pretty_limit) {
424 url.append(in, start, std::string::npos);
425 return;
430 #endif // OMEGA_INCLUDED_URLDECODE_H