1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/logging.h"
6 #include "url/url_canon.h"
7 #include "url/url_canon_internal.h"
8 #include "url/url_parse_internal.h"
15 // Pass through unchanged, whether escaped or unescaped. This doesn't
16 // actually set anything so you can't OR it to check, it's just to make the
17 // table below more clear when neither ESCAPE nor UNESCAPE is set.
20 // This character requires special handling in DoPartialPath. Doing this test
21 // first allows us to filter out the common cases of regular characters that
22 // can be directly copied.
25 // This character must be escaped in the canonical output. Note that all
26 // escaped chars also have the "special" bit set so that the code that looks
27 // for this is triggered. Not valid with PASS or UNESCAPE.
29 ESCAPE
= ESCAPE_BIT
| SPECIAL
,
31 // This character must be unescaped in canonical output. Not valid with
32 // ESCAPE or PASS. We DON'T set the SPECIAL flag since if we encounter these
33 // characters unescaped, they should just be copied.
36 // This character is disallowed in URLs. Note that the "special" bit is also
37 // set to trigger handling.
39 INVALID
= INVALID_BIT
| SPECIAL
,
42 // This table contains one of the above flag values. Note some flags are more
43 // than one bit because they also turn on the "special" flag. Special is the
44 // only flag that may be combined with others.
46 // This table is designed to match exactly what IE does with the characters.
48 // Dot is even more special, and the escaped version is handled specially by
49 // IsDot. Therefore, we don't need the "escape" flag, and even the "unescape"
50 // bit is never handled (we just need the "special" bit).
51 const unsigned char kPathCharLookup
[0x100] = {
52 // NULL control chars...
53 INVALID
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
55 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
56 // ' ' ! " # $ % & ' ( ) * + , - . /
57 ESCAPE
, PASS
, ESCAPE
, ESCAPE
, PASS
, ESCAPE
, PASS
, PASS
, PASS
, PASS
, PASS
, PASS
, PASS
, UNESCAPE
,SPECIAL
, PASS
,
58 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
59 UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,PASS
, PASS
, ESCAPE
, PASS
, ESCAPE
, ESCAPE
,
60 // @ A B C D E F G H I J K L M N O
61 PASS
, UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,
62 // P Q R S T U V W X Y Z [ \ ] ^ _
63 UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,PASS
, ESCAPE
, PASS
, ESCAPE
, UNESCAPE
,
64 // ` a b c d e f g h i j k l m n o
65 ESCAPE
, UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,
66 // p q r s t u v w x y z { | } ~ <DEL>
67 UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,UNESCAPE
,ESCAPE
, ESCAPE
, ESCAPE
, UNESCAPE
,ESCAPE
,
68 // ...all the high-bit characters are escaped
69 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
70 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
71 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
72 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
73 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
74 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
75 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
,
76 ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
, ESCAPE
};
79 // The given dot is just part of a filename and is not special.
82 // The given dot is the current directory.
85 // The given dot is the first of a double dot that should take us up one.
89 // When the path resolver finds a dot, this function is called with the
90 // character following that dot to see what it is. The return value
91 // indicates what type this dot is (see above). This code handles the case
92 // where the dot is at the end of the input.
94 // |*consumed_len| will contain the number of characters in the input that
95 // express what we found.
97 // If the input is "../foo", |after_dot| = 1, |end| = 6, and
98 // at the end, |*consumed_len| = 2 for the "./" this function consumed. The
99 // original dot length should be handled by the caller.
// Classifies the dot that immediately precedes |spec[after_dot]|, where |end|
// marks the end of the input. A lone dot (at the end of the input or followed
// by a slash) yields DIRECTORY_CUR; dots followed by anything else yield
// NOT_A_DIRECTORY. |*consumed_len| receives the number of characters after the
// first dot that belong to the construct.
// NOTE(review): the embedded original line numbers skip inside this function
// (105, 107, 112-113, 120-121, 125-128, 130), so some statements and closing
// braces are not visible in this listing -- confirm against the full file.
100 template<typename CHAR
>
101 DotDisposition
ClassifyAfterDot(const CHAR
* spec
, int after_dot
,
102 int end
, int* consumed_len
) {
103 if (after_dot
== end
) {
104 // Single dot at the end.
106 return DIRECTORY_CUR
;
108 if (IsURLSlash(spec
[after_dot
])) {
109 // Single dot followed by a slash.
110 *consumed_len
= 1; // Consume the slash
111 return DIRECTORY_CUR
;
// IsDot's result is used as a length: a nonzero value means a second dot
// starts at |after_dot| and spans that many input characters.
114 int second_dot_len
= IsDot(spec
, after_dot
, end
);
115 if (second_dot_len
) {
116 int after_second_dot
= after_dot
+ second_dot_len
;
117 if (after_second_dot
== end
) {
118 // Double dot at the end.
119 *consumed_len
= second_dot_len
;
122 if (IsURLSlash(spec
[after_second_dot
])) {
123 // Double dot followed by a slash.
// Consumes the second dot plus the slash that follows it.
124 *consumed_len
= second_dot_len
+ 1;
129 // The dots are followed by something else, not a directory.
131 return NOT_A_DIRECTORY
;
134 // Rewinds the output to the previous slash. It is assumed that the output
135 // ends with a slash and this doesn't count (we call this when we are
136 // appending directory paths, so the previous path component has an ending
139 // This will stop at the first slash (assumed to be at position
140 // |path_begin_in_output|) and not go any higher than that. Some web pages
141 // do ".." too many times, so we need to handle that brokenness.
143 // It searches for a literal slash rather than including a backslash as well
144 // because it is run only on the canonical output.
146 // The output is guaranteed to end in a slash when this function completes.
// Shrinks |output| so that it ends at the slash preceding its current trailing
// slash, without backing up past index |path_begin_in_output|.
// NOTE(review): original lines 150, 155, 157 and 159-160 are absent from this
// listing, so the statement(s) that move |i| backwards are not visible here --
// confirm against the full file.
147 void BackUpToPreviousSlash(int path_begin_in_output
,
148 CanonOutput
* output
) {
149 DCHECK(output
->length() > 0);
// |i| starts at the index of the last character, which must be a slash.
151 int i
= output
->length() - 1;
152 DCHECK(output
->at(i
) == '/');
153 if (i
== path_begin_in_output
)
154 return; // We're at the first slash, nothing to do.
156 // Now back up (skipping the trailing slash) until we find another slash.
158 while (output
->at(i
) != '/' && i
> path_begin_in_output
)
161 // Now shrink the output to just include that last slash we found.
// Keeping i + 1 characters makes output[i] the new final character.
162 output
->set_length(i
+ 1);
165 // Appends the given path to the output. It assumes that if the input path
166 // starts with a slash, it should be copied to the output. If no path has
167 // already been appended to the output (the case when not resolving
168 // relative URLs), the path should begin with a slash.
170 // If there are already path components (this mode is used when appending
171 // relative paths for resolving), it assumes that the output already has
172 // a trailing slash and that if the input begins with a slash, it should be
173 // copied to the output.
175 // We do not collapse multiple slashes in a row to a single slash. It seems
176 // no web browsers do this, and we don't want incompatibilities, even though
177 // it would be correct for most systems.
178 template<typename CHAR
, typename UCHAR
>
179 bool DoPartialPath(const CHAR
* spec
,
180 const Component
& path
,
181 int path_begin_in_output
,
182 CanonOutput
* output
) {
183 int end
= path
.end();
186 for (int i
= path
.begin
; i
< end
; i
++) {
187 UCHAR uch
= static_cast<UCHAR
>(spec
[i
]);
188 if (sizeof(CHAR
) > sizeof(char) && uch
>= 0x80) {
189 // We only need to test wide input for having non-ASCII characters. For
190 // narrow input, we'll always just use the lookup table. We don't try to
191 // do anything tricky with decoding/validating UTF-8. This function will
192 // read one or two UTF-16 characters and append the output as UTF-8. This
193 // call will be removed in 8-bit mode.
194 success
&= AppendUTF8EscapedChar(spec
, &i
, end
, output
);
196 // Normal ASCII character or 8-bit input, use the lookup table.
197 unsigned char out_ch
= static_cast<unsigned char>(uch
);
198 unsigned char flags
= kPathCharLookup
[out_ch
];
199 if (flags
& SPECIAL
) {
200 // Needs special handling of some sort.
202 if ((dotlen
= IsDot(spec
, i
, end
)) > 0) {
203 // See if this dot was preceded by a slash in the output. We
204 // assume that when canonicalizing paths, they will always
205 // start with a slash and not a dot, so we don't have to
206 // bounds check the output.
208 // Note that we check this in the case of dots so we don't have to
209 // special case slashes. Since slashes are much more common than
210 // dots, this actually increases performance measurably (though
212 DCHECK(output
->length() > path_begin_in_output
);
213 if (output
->length() > path_begin_in_output
&&
214 output
->at(output
->length() - 1) == '/') {
215 // Slash followed by a dot, check to see if this means relative
217 switch (ClassifyAfterDot
<CHAR
>(spec
, i
+ dotlen
, end
,
219 case NOT_A_DIRECTORY
:
220 // Copy the dot to the output, it means nothing special.
221 output
->push_back('.');
224 case DIRECTORY_CUR
: // Current directory, just skip the input.
225 i
+= dotlen
+ consumed_len
- 1;
228 BackUpToPreviousSlash(path_begin_in_output
, output
);
229 i
+= dotlen
+ consumed_len
- 1;
233 // This dot is not preceded by a slash, it is just part of some
235 output
->push_back('.');
239 } else if (out_ch
== '\\') {
240 // Convert backslashes to forward slashes
241 output
->push_back('/');
243 } else if (out_ch
== '%') {
244 // Handle escape sequences.
245 unsigned char unescaped_value
;
246 if (DecodeEscaped(spec
, &i
, end
, &unescaped_value
)) {
247 // Valid escape sequence, see if we keep, reject, or unescape it.
248 char unescaped_flags
= kPathCharLookup
[unescaped_value
];
250 if (unescaped_flags
& UNESCAPE
) {
251 // This escaped value shouldn't be escaped, copy it.
252 output
->push_back(unescaped_value
);
253 } else if (unescaped_flags
& INVALID_BIT
) {
254 // Invalid escaped character, copy it and remember the error.
255 output
->push_back('%');
256 output
->push_back(static_cast<char>(spec
[i
- 1]));
257 output
->push_back(static_cast<char>(spec
[i
]));
260 // Valid escaped character but we should keep it escaped. We
261 // don't want to change the case of any hex letters in case
262 // the server is sensitive to that, so we just copy the two
263 // characters without checking (DecodeEscaped will have advanced
264 // to the last character of the pair).
265 output
->push_back('%');
266 output
->push_back(static_cast<char>(spec
[i
- 1]));
267 output
->push_back(static_cast<char>(spec
[i
]));
270 // Invalid escape sequence. IE7 rejects any URLs with such
271 // sequences, while Firefox, IE6, and Safari all pass it through
272 // unchanged. We are more permissive unlike IE7. I don't think this
273 // can cause significant problems, if it does, we should change
274 // to be more like IE7.
275 output
->push_back('%');
278 } else if (flags
& INVALID_BIT
) {
279 // For NULLs, etc. fail.
280 AppendEscapedChar(out_ch
, output
);
283 } else if (flags
& ESCAPE_BIT
) {
284 // This character should be escaped.
285 AppendEscapedChar(out_ch
, output
);
288 // Nothing special about this character, just append it.
289 output
->push_back(out_ch
);
// Canonicalizes the path component |path| of |spec| into |output| and records
// in |out_path| where the canonical path begins in |output| and how long it
// is. The per-character work is delegated to DoPartialPath.
// NOTE(review): several original lines (e.g. 299, 301, 303, 307, 310, 312,
// 315) are absent from this listing, so the declarations of |output| and
// |success|, the branch that chooses between the two push_back('/') paths and
// the return statement are not visible -- confirm against the full file.
296 template<typename CHAR
, typename UCHAR
>
297 bool DoPath(const CHAR
* spec
,
298 const Component
& path
,
300 Component
* out_path
) {
// The canonical path starts at the current end of the output.
302 out_path
->begin
= output
->length();
304 // Write out an initial slash if the input has none. If we just parse a URL
305 // and then canonicalize it, it will of course have a slash already. This
306 // check is for the replacement and relative URL resolving cases of file
308 if (!IsURLSlash(spec
[path
.begin
]))
309 output
->push_back('/');
// |out_path->begin| is handed to DoPartialPath as the position in the output
// where this path begins.
311 success
= DoPartialPath
<CHAR
, UCHAR
>(spec
, path
, out_path
->begin
, output
);
313 // No input, canonical path is a slash.
314 output
->push_back('/');
// The length is everything appended since |out_path->begin| was recorded.
316 out_path
->len
= output
->length() - out_path
->begin
;
// 8-bit overload: forwards to DoPath instantiated with CHAR = char and
// UCHAR = unsigned char, and returns its result unchanged.
// NOTE(review): the declaration of the |output| parameter (presumably the
// missing original line 324) is not visible in this listing.
322 bool CanonicalizePath(const char* spec
,
323 const Component
& path
,
325 Component
* out_path
) {
326 return DoPath
<char, unsigned char>(spec
, path
, output
, out_path
);
// 16-bit overload: forwards to DoPath instantiated with both CHAR and UCHAR
// set to base::char16, and returns its result unchanged.
// NOTE(review): the declaration of the |output| parameter (presumably the
// missing original line 331) is not visible in this listing.
329 bool CanonicalizePath(const base::char16
* spec
,
330 const Component
& path
,
332 Component
* out_path
) {
333 return DoPath
<base::char16
, base::char16
>(spec
, path
, output
, out_path
);
// 8-bit overload: forwards to DoPartialPath instantiated with CHAR = char and
// UCHAR = unsigned char, and returns its result unchanged.
// NOTE(review): the tail of the call (remaining argument and closing brace,
// original lines 341 onward) is not visible in this listing.
336 bool CanonicalizePartialPath(const char* spec
,
337 const Component
& path
,
338 int path_begin_in_output
,
339 CanonOutput
* output
) {
340 return DoPartialPath
<char, unsigned char>(spec
, path
, path_begin_in_output
,
344 bool CanonicalizePartialPath(const base::char16
* spec
,
345 const Component
& path
,
346 int path_begin_in_output
,
347 CanonOutput
* output
) {
348 return DoPartialPath
<base::char16
, base::char16
>(spec
, path
,
349 path_begin_in_output
,