Add workaround for mmap() with PROT_EXEC on Chrome OS.
[chromium-blink-merge.git] / url / url_canon_etc.cc
bloba1512f6f63e66b8ebab0a71ecdba3b9d0fd76a62
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // Canonicalizers for random bits that aren't big enough for their own files.
7 #include <string.h>
9 #include "url/url_canon.h"
10 #include "url/url_canon_internal.h"
12 namespace url_canon {
14 namespace {
// Reports whether |ch| is one of the characters (carriage return, line feed,
// or tab) that get stripped out of the middle of a URL.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
22 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23 // It sucks that we have to do this, since this takes about 13% of the total URL
24 // canonicalization time.
25 template<typename CHAR>
26 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
27 CanonOutputT<CHAR>* buffer,
28 int* output_len) {
29 // Fast verification that there's nothing that needs removal. This is the 99%
30 // case, so we want it to be fast and don't care about impacting the speed
31 // when we do find whitespace.
32 int found_whitespace = false;
33 for (int i = 0; i < input_len; i++) {
34 if (!IsRemovableURLWhitespace(input[i]))
35 continue;
36 found_whitespace = true;
37 break;
40 if (!found_whitespace) {
41 // Didn't find any whitespace, we don't need to do anything. We can just
42 // return the input as the output.
43 *output_len = input_len;
44 return input;
47 // Remove the whitespace into the new buffer and return it.
48 for (int i = 0; i < input_len; i++) {
49 if (!IsRemovableURLWhitespace(input[i]))
50 buffer->push_back(input[i]);
52 *output_len = buffer->length();
53 return buffer->data();
// Lookup table, indexed by 7-bit character code, giving the canonical form of
// that character inside a scheme. Letters map to their lower-case form;
// digits, '+', '-' and '.' map to themselves. An entry of 0 means the
// character is not allowed in a scheme.
const char kSchemeCanonical[0x80] = {
  // 0x00-0x0f: control characters, all invalid.
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  // 0x10-0x1f: control characters, all invalid.
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  // 0x20-0x2f: space and punctuation; only '+', '-' and '.' are accepted.
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   '+', 0,   '-', '.', 0,
  // 0x30-0x3f: the digits, followed by ':' ';' '<' '=' '>' '?'.
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0,   0,   0,   0,   0,   0,
  // 0x40-0x4f: '@', then 'A' through 'O' folded to lower case.
  0,   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x50-0x5f: 'P' through 'Z' folded to lower case, then five punctuation
  // characters.
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0,   0,   0,   0,   0,
  // 0x60-0x6f: '`', then 'a' through 'o'.
  0,   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x70-0x7f: 'p' through 'z', then '{' '|' '}' '~' and DEL.
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0,   0,   0,   0,   0
};
// Reports whether |c| may begin a scheme, i.e. whether it is an ASCII letter
// of either case.
//
// This could also be folded into the lookup table by setting the high bit on
// every valid character, but it is only called once per URL and keeping it
// separate leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower_alpha = c >= 'a' && c <= 'z';
  const bool is_upper_alpha = c >= 'A' && c <= 'Z';
  return is_lower_alpha || is_upper_alpha;
}
83 template<typename CHAR, typename UCHAR>
84 bool DoScheme(const CHAR* spec,
85 const url_parse::Component& scheme,
86 CanonOutput* output,
87 url_parse::Component* out_scheme) {
88 if (scheme.len <= 0) {
89 // Scheme is unspecified or empty, convert to empty by appending a colon.
90 *out_scheme = url_parse::Component(output->length(), 0);
91 output->push_back(':');
92 return true;
95 // The output scheme starts from the current position.
96 out_scheme->begin = output->length();
98 // Danger: it's important that this code does not strip any characters: it
99 // only emits the canonical version (be it valid or escaped) of each of
100 // the input characters. Stripping would put it out of sync with
101 // url_util::FindAndCompareScheme, which could cause some security checks on
102 // schemes to be incorrect.
103 bool success = true;
104 int end = scheme.end();
105 for (int i = scheme.begin; i < end; i++) {
106 UCHAR ch = static_cast<UCHAR>(spec[i]);
107 char replacement = 0;
108 if (ch < 0x80) {
109 if (i == scheme.begin) {
110 // Need to do a special check for the first letter of the scheme.
111 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
112 replacement = kSchemeCanonical[ch];
113 } else {
114 replacement = kSchemeCanonical[ch];
118 if (replacement) {
119 output->push_back(replacement);
120 } else if (ch == '%') {
121 // Canonicalizing the scheme multiple times should lead to the same
122 // result. Since invalid characters will be escaped, we need to preserve
123 // the percent to avoid multiple escaping. The scheme will be invalid.
124 success = false;
125 output->push_back('%');
126 } else {
127 // Invalid character, store it but mark this scheme as invalid.
128 success = false;
130 // This will escape the output and also handle encoding issues.
131 // Ignore the return value since we already failed.
132 AppendUTF8EscapedChar(spec, &i, end, output);
136 // The output scheme ends with the the current position, before appending
137 // the colon.
138 out_scheme->len = output->length() - out_scheme->begin;
139 output->push_back(':');
140 return success;
143 // The username and password components reference ranges in the corresponding
144 // *_spec strings. Typically, these specs will be the same (we're
145 // canonicalizing a single source string), but may be different when
146 // replacing components.
147 template<typename CHAR, typename UCHAR>
148 bool DoUserInfo(const CHAR* username_spec,
149 const url_parse::Component& username,
150 const CHAR* password_spec,
151 const url_parse::Component& password,
152 CanonOutput* output,
153 url_parse::Component* out_username,
154 url_parse::Component* out_password) {
155 if (username.len <= 0 && password.len <= 0) {
156 // Common case: no user info. We strip empty username/passwords.
157 *out_username = url_parse::Component();
158 *out_password = url_parse::Component();
159 return true;
162 // Write the username.
163 out_username->begin = output->length();
164 if (username.len > 0) {
165 // This will escape characters not valid for the username.
166 AppendStringOfType(&username_spec[username.begin], username.len,
167 CHAR_USERINFO, output);
169 out_username->len = output->length() - out_username->begin;
171 // When there is a password, we need the separator. Note that we strip
172 // empty but specified passwords.
173 if (password.len > 0) {
174 output->push_back(':');
175 out_password->begin = output->length();
176 AppendStringOfType(&password_spec[password.begin], password.len,
177 CHAR_USERINFO, output);
178 out_password->len = output->length() - out_password->begin;
179 } else {
180 *out_password = url_parse::Component();
183 output->push_back('@');
184 return true;
187 // Helper functions for converting port integers to strings.
188 inline void WritePortInt(char* output, int output_len, int port) {
189 _itoa_s(port, output, output_len, 10);
192 // This function will prepend the colon if there will be a port.
193 template<typename CHAR, typename UCHAR>
194 bool DoPort(const CHAR* spec,
195 const url_parse::Component& port,
196 int default_port_for_scheme,
197 CanonOutput* output,
198 url_parse::Component* out_port) {
199 int port_num = url_parse::ParsePort(spec, port);
200 if (port_num == url_parse::PORT_UNSPECIFIED ||
201 port_num == default_port_for_scheme) {
202 *out_port = url_parse::Component();
203 return true; // Leave port empty.
206 if (port_num == url_parse::PORT_INVALID) {
207 // Invalid port: We'll copy the text from the input so the user can see
208 // what the error was, and mark the URL as invalid by returning false.
209 output->push_back(':');
210 out_port->begin = output->length();
211 AppendInvalidNarrowString(spec, port.begin, port.end(), output);
212 out_port->len = output->length() - out_port->begin;
213 return false;
216 // Convert port number back to an integer. Max port value is 5 digits, and
217 // the Parsed::ExtractPort will have made sure the integer is in range.
218 const int buf_size = 6;
219 char buf[buf_size];
220 WritePortInt(buf, buf_size, port_num);
222 // Append the port number to the output, preceeded by a colon.
223 output->push_back(':');
224 out_port->begin = output->length();
225 for (int i = 0; i < buf_size && buf[i]; i++)
226 output->push_back(buf[i]);
228 out_port->len = output->length() - out_port->begin;
229 return true;
232 template<typename CHAR, typename UCHAR>
233 void DoCanonicalizeRef(const CHAR* spec,
234 const url_parse::Component& ref,
235 CanonOutput* output,
236 url_parse::Component* out_ref) {
237 if (ref.len < 0) {
238 // Common case of no ref.
239 *out_ref = url_parse::Component();
240 return;
243 // Append the ref separator. Note that we need to do this even when the ref
244 // is empty but present.
245 output->push_back('#');
246 out_ref->begin = output->length();
248 // Now iterate through all the characters, converting to UTF-8 and validating.
249 int end = ref.end();
250 for (int i = ref.begin; i < end; i++) {
251 if (spec[i] == 0) {
252 // IE just strips NULLs, so we do too.
253 continue;
254 } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
255 // Unline IE seems to, we escape control characters. This will probably
256 // make the reference fragment unusable on a web page, but people
257 // shouldn't be using control characters in their anchor names.
258 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
259 } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
260 // Normal ASCII characters are just appended.
261 output->push_back(static_cast<char>(spec[i]));
262 } else {
263 // Non-ASCII characters are appended unescaped, but only when they are
264 // valid. Invalid Unicode characters are replaced with the "invalid
265 // character" as IE seems to (ReadUTFChar puts the unicode replacement
266 // character in the output on failure for us).
267 unsigned code_point;
268 ReadUTFChar(spec, &i, end, &code_point);
269 AppendUTF8Value(code_point, output);
273 out_ref->len = output->length() - out_ref->begin;
276 } // namespace
278 const char* RemoveURLWhitespace(const char* input, int input_len,
279 CanonOutputT<char>* buffer,
280 int* output_len) {
281 return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
284 const base::char16* RemoveURLWhitespace(const base::char16* input,
285 int input_len,
286 CanonOutputT<base::char16>* buffer,
287 int* output_len) {
288 return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
291 char CanonicalSchemeChar(base::char16 ch) {
292 if (ch >= 0x80)
293 return 0; // Non-ASCII is not supported by schemes.
294 return kSchemeCanonical[ch];
297 bool CanonicalizeScheme(const char* spec,
298 const url_parse::Component& scheme,
299 CanonOutput* output,
300 url_parse::Component* out_scheme) {
301 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
304 bool CanonicalizeScheme(const base::char16* spec,
305 const url_parse::Component& scheme,
306 CanonOutput* output,
307 url_parse::Component* out_scheme) {
308 return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
311 bool CanonicalizeUserInfo(const char* username_source,
312 const url_parse::Component& username,
313 const char* password_source,
314 const url_parse::Component& password,
315 CanonOutput* output,
316 url_parse::Component* out_username,
317 url_parse::Component* out_password) {
318 return DoUserInfo<char, unsigned char>(
319 username_source, username, password_source, password,
320 output, out_username, out_password);
323 bool CanonicalizeUserInfo(const base::char16* username_source,
324 const url_parse::Component& username,
325 const base::char16* password_source,
326 const url_parse::Component& password,
327 CanonOutput* output,
328 url_parse::Component* out_username,
329 url_parse::Component* out_password) {
330 return DoUserInfo<base::char16, base::char16>(
331 username_source, username, password_source, password,
332 output, out_username, out_password);
335 bool CanonicalizePort(const char* spec,
336 const url_parse::Component& port,
337 int default_port_for_scheme,
338 CanonOutput* output,
339 url_parse::Component* out_port) {
340 return DoPort<char, unsigned char>(spec, port,
341 default_port_for_scheme,
342 output, out_port);
345 bool CanonicalizePort(const base::char16* spec,
346 const url_parse::Component& port,
347 int default_port_for_scheme,
348 CanonOutput* output,
349 url_parse::Component* out_port) {
350 return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
351 output, out_port);
354 void CanonicalizeRef(const char* spec,
355 const url_parse::Component& ref,
356 CanonOutput* output,
357 url_parse::Component* out_ref) {
358 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
361 void CanonicalizeRef(const base::char16* spec,
362 const url_parse::Component& ref,
363 CanonOutput* output,
364 url_parse::Component* out_ref) {
365 DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
368 } // namespace url_canon