Roll src/third_party/WebKit 5668f22:1929caf (svn 192731:192750)
[chromium-blink-merge.git] / components / search_engines / template_url_parser.cc
blob14298da11b6b5381db9598403584cf259bf2b487
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/search_engines/template_url_parser.h"
7 #include <algorithm>
8 #include <map>
9 #include <vector>
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "components/search_engines/template_url.h"
17 #include "libxml/parser.h"
18 #include "libxml/xmlwriter.h"
19 #include "ui/gfx/favicon_size.h"
20 #include "url/gurl.h"
21 #include "url/url_constants.h"
23 namespace {
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char.  This avoids
// having to convert to wide, then do comparisons.

// Element names of the OSD document. These are compared against element names
// after any namespace prefix has been stripped (see StartElementImpl()).
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";
const char kAliasElement[] = "Alias";

// Various XML attributes used.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
// Despite its name, this attribute is read from Url elements (see ParseURL()).
const char kParamMethodAttribute[] = "method";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
55 std::string XMLCharToString(const xmlChar* value) {
56 return std::string(reinterpret_cast<const char*>(value));
59 // Returns true if input_encoding contains a valid input encoding string. This
60 // doesn't verify that we have a valid encoding for the string, just that the
61 // string contains characters that constitute a valid input encoding.
62 bool IsValidEncodingString(const std::string& input_encoding) {
63 if (input_encoding.empty())
64 return false;
66 if (!IsAsciiAlpha(input_encoding[0]))
67 return false;
69 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
70 char c = input_encoding[i];
71 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
72 c != '-') {
73 return false;
76 return true;
// Appends one parameter to the query string |query|.
//   - A '&' separator is written first unless |query| is still empty.
//   - "<key>=" is written only when |key| is non-empty; a parameter with an
//     empty key is emitted as the bare |value|.
// |key| and |value| are appended verbatim; no escaping is performed here.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  if (!query->empty())
    query->append("&");
  if (!key.empty()) {
    query->append(key);
    query->append("=");
  }
  query->append(value);
}
91 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
92 bool IsHTTPRef(const std::string& url) {
93 if (url.empty())
94 return true;
95 GURL gurl(url);
96 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
97 gurl.SchemeIs(url::kHttpsScheme));
100 } // namespace
103 // TemplateURLParsingContext --------------------------------------------------
105 // To minimize memory overhead while parsing, a SAX style parser is used.
106 // TemplateURLParsingContext is used to maintain the state we're in the document
107 // while parsing.
108 class TemplateURLParsingContext {
109 public:
110 // Enum of the known element types.
111 enum ElementType {
112 UNKNOWN,
113 OPEN_SEARCH_DESCRIPTION,
114 URL,
115 PARAM,
116 SHORT_NAME,
117 IMAGE,
118 INPUT_ENCODING,
119 ALIAS,
122 enum Method {
123 GET,
124 POST
127 // Key/value of a Param node.
128 typedef std::pair<std::string, std::string> Param;
130 explicit TemplateURLParsingContext(
131 TemplateURLParser::ParameterFilter* parameter_filter);
133 static void StartElementImpl(void* ctx,
134 const xmlChar* name,
135 const xmlChar** atts);
136 static void EndElementImpl(void* ctx, const xmlChar* name);
137 static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
139 // Returns a heap-allocated TemplateURL representing the result of parsing.
140 // This will be NULL if parsing failed or if the results were invalid for some
141 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
142 // a resulting TemplateURLRef was invalid, etc.).
143 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data,
144 bool show_in_default_list);
146 private:
147 // Key is UTF8 encoded.
148 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
150 static void InitMapping();
152 void ParseURL(const xmlChar** atts);
153 void ParseImage(const xmlChar** atts);
154 void ParseParam(const xmlChar** atts);
155 void ProcessURLParams();
157 // Returns the current ElementType.
158 ElementType GetKnownType();
160 static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
162 // Data that gets updated as we parse, and is converted to a TemplateURL by
163 // GetTemplateURL().
164 TemplateURLData data_;
166 std::vector<ElementType> elements_;
167 bool image_is_valid_for_favicon_;
169 // Character content for the current element.
170 base::string16 string_;
172 TemplateURLParser::ParameterFilter* parameter_filter_;
174 // The list of parameters parsed in the Param nodes of a Url node.
175 std::vector<Param> extra_params_;
177 // The HTTP methods used.
178 Method method_;
179 Method suggestion_method_;
181 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
182 // search. Note that we don't need a stack as URL nodes cannot be nested.
183 bool is_suggest_url_;
185 // If true, the user has set a keyword and we should use it. Otherwise,
186 // we generate a keyword based on the URL.
187 bool has_custom_keyword_;
189 // Whether we should derive the image from the URL (when images are data
190 // URLs).
191 bool derive_image_from_url_;
193 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
196 // static
197 TemplateURLParsingContext::ElementNameToElementTypeMap*
198 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
200 TemplateURLParsingContext::TemplateURLParsingContext(
201 TemplateURLParser::ParameterFilter* parameter_filter)
202 : image_is_valid_for_favicon_(false),
203 parameter_filter_(parameter_filter),
204 method_(GET),
205 suggestion_method_(GET),
206 is_suggest_url_(false),
207 has_custom_keyword_(false),
208 derive_image_from_url_(false) {
209 if (kElementNameToElementTypeMap == NULL)
210 InitMapping();
213 // static
214 void TemplateURLParsingContext::StartElementImpl(void* ctx,
215 const xmlChar* name,
216 const xmlChar** atts) {
217 // Remove the namespace from |name|, ex: os:Url -> Url.
218 std::string node_name(XMLCharToString(name));
219 size_t index = node_name.find_first_of(":");
220 if (index != std::string::npos)
221 node_name.erase(0, index + 1);
223 TemplateURLParsingContext* context =
224 reinterpret_cast<TemplateURLParsingContext*>(ctx);
225 context->elements_.push_back(
226 context->kElementNameToElementTypeMap->count(node_name) ?
227 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
228 switch (context->GetKnownType()) {
229 case TemplateURLParsingContext::URL:
230 context->extra_params_.clear();
231 context->ParseURL(atts);
232 break;
233 case TemplateURLParsingContext::IMAGE:
234 context->ParseImage(atts);
235 break;
236 case TemplateURLParsingContext::PARAM:
237 context->ParseParam(atts);
238 break;
239 default:
240 break;
242 context->string_.clear();
245 // static
246 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
247 TemplateURLParsingContext* context =
248 reinterpret_cast<TemplateURLParsingContext*>(ctx);
249 switch (context->GetKnownType()) {
250 case TemplateURLParsingContext::URL:
251 context->ProcessURLParams();
252 break;
253 case TemplateURLParsingContext::SHORT_NAME:
254 context->data_.short_name = context->string_;
255 break;
256 case TemplateURLParsingContext::IMAGE: {
257 GURL image_url(base::UTF16ToUTF8(context->string_));
258 if (image_url.SchemeIs(url::kDataScheme)) {
259 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
260 // decode the data URL in the renderer. For now, we'll just point to the
261 // favicon from the URL.
262 context->derive_image_from_url_ = true;
263 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
264 (image_url.SchemeIs(url::kHttpScheme) ||
265 image_url.SchemeIs(url::kHttpsScheme))) {
266 context->data_.favicon_url = image_url;
268 context->image_is_valid_for_favicon_ = false;
269 break;
271 case TemplateURLParsingContext::INPUT_ENCODING: {
272 std::string input_encoding = base::UTF16ToASCII(context->string_);
273 if (IsValidEncodingString(input_encoding))
274 context->data_.input_encodings.push_back(input_encoding);
275 break;
277 case TemplateURLParsingContext::ALIAS: {
278 context->data_.SetKeyword(context->string_);
279 context->has_custom_keyword_ = true;
280 break;
282 default:
283 break;
285 context->string_.clear();
286 context->elements_.pop_back();
289 // static
290 void TemplateURLParsingContext::CharactersImpl(void* ctx,
291 const xmlChar* ch,
292 int len) {
293 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
294 base::UTF8ToUTF16(
295 base::StringPiece(reinterpret_cast<const char*>(ch), len));
298 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
299 const SearchTermsData& search_terms_data,
300 bool show_in_default_list) {
301 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
302 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
303 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
304 return NULL;
305 if (suggestion_method_ == TemplateURLParsingContext::POST)
306 data_.suggestions_url.clear();
308 // If the image was a data URL, use the favicon from the search URL instead.
309 // (see the TODO in EndElementImpl()).
310 GURL search_url(data_.url());
311 if (derive_image_from_url_ && data_.favicon_url.is_empty())
312 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
314 // Generate a keyword for this search engine if a custom one was not present
315 // in the imported data.
316 if (!has_custom_keyword_)
317 data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
319 data_.show_in_default_list = show_in_default_list;
321 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
322 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
323 if (template_url->url().empty() ||
324 !template_url->url_ref().IsValid(search_terms_data) ||
325 (!template_url->suggestions_url().empty() &&
326 !template_url->suggestions_url_ref().IsValid(search_terms_data))) {
327 return NULL;
330 return template_url.release();
333 // static
334 void TemplateURLParsingContext::InitMapping() {
335 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
336 (*kElementNameToElementTypeMap)[kURLElement] = URL;
337 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
338 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
339 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
340 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
341 OPEN_SEARCH_DESCRIPTION;
342 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
343 OPEN_SEARCH_DESCRIPTION;
344 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
345 (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS;
348 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
349 if (!atts)
350 return;
352 std::string template_url;
353 bool is_post = false;
354 bool is_html_url = false;
355 bool is_suggest_url = false;
356 for (; *atts; atts += 2) {
357 std::string name(XMLCharToString(*atts));
358 const xmlChar* value = atts[1];
359 if (name == kURLTypeAttribute) {
360 std::string type = XMLCharToString(value);
361 is_html_url = (type == kHTMLType);
362 is_suggest_url = (type == kSuggestionType);
363 } else if (name == kURLTemplateAttribute) {
364 template_url = XMLCharToString(value);
365 } else if (name == kParamMethodAttribute) {
366 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
370 if (is_html_url && !template_url.empty()) {
371 data_.SetURL(template_url);
372 is_suggest_url_ = false;
373 if (is_post)
374 method_ = POST;
375 } else if (is_suggest_url) {
376 data_.suggestions_url = template_url;
377 is_suggest_url_ = true;
378 if (is_post)
379 suggestion_method_ = POST;
383 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
384 if (!atts)
385 return;
387 int width = 0;
388 int height = 0;
389 std::string type;
390 for (; *atts; atts += 2) {
391 std::string name(XMLCharToString(*atts));
392 const xmlChar* value = atts[1];
393 if (name == kImageTypeAttribute) {
394 type = XMLCharToString(value);
395 } else if (name == kImageWidthAttribute) {
396 base::StringToInt(XMLCharToString(value), &width);
397 } else if (name == kImageHeightAttribute) {
398 base::StringToInt(XMLCharToString(value), &height);
402 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
403 (height == gfx::kFaviconSize) &&
404 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
407 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
408 if (!atts)
409 return;
411 std::string key, value;
412 for (; *atts; atts += 2) {
413 std::string name(XMLCharToString(*atts));
414 const xmlChar* val = atts[1];
415 if (name == kParamNameAttribute) {
416 key = XMLCharToString(val);
417 } else if (name == kParamValueAttribute) {
418 value = XMLCharToString(val);
422 if (!key.empty() &&
423 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
424 extra_params_.push_back(Param(key, value));
427 void TemplateURLParsingContext::ProcessURLParams() {
428 if (!parameter_filter_ && extra_params_.empty())
429 return;
431 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
432 if (url.is_empty())
433 return;
435 // If there is a parameter filter, parse the existing URL and remove any
436 // unwanted parameter.
437 std::string new_query;
438 bool modified = false;
439 if (parameter_filter_) {
440 url::Component query = url.parsed_for_possibly_invalid_spec().query;
441 url::Component key, value;
442 const char* url_spec = url.spec().c_str();
443 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
444 std::string key_str(url_spec, key.begin, key.len);
445 std::string value_str(url_spec, value.begin, value.len);
446 if (parameter_filter_->KeepParameter(key_str, value_str)) {
447 AppendParamToQuery(key_str, value_str, &new_query);
448 } else {
449 modified = true;
453 if (!modified)
454 new_query = url.query();
456 // Add the extra parameters if any.
457 if (!extra_params_.empty()) {
458 modified = true;
459 for (std::vector<Param>::const_iterator iter(extra_params_.begin());
460 iter != extra_params_.end(); ++iter)
461 AppendParamToQuery(iter->first, iter->second, &new_query);
464 if (modified) {
465 GURL::Replacements repl;
466 repl.SetQueryStr(new_query);
467 url = url.ReplaceComponents(repl);
468 if (is_suggest_url_)
469 data_.suggestions_url = url.spec();
470 else if (url.is_valid())
471 data_.SetURL(url.spec());
475 TemplateURLParsingContext::ElementType
476 TemplateURLParsingContext::GetKnownType() {
477 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
478 return elements_[1];
479 // We only expect PARAM nodes under the URL node.
480 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
481 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
485 // TemplateURLParser ----------------------------------------------------------
487 // static
488 TemplateURL* TemplateURLParser::Parse(
489 const SearchTermsData& search_terms_data,
490 bool show_in_default_list,
491 const char* data,
492 size_t length,
493 TemplateURLParser::ParameterFilter* param_filter) {
494 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
495 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
496 // If this becomes problematic we'll need to provide our own entity
497 // type for &amp;, or strip out &#38; by hand after parsing.
498 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
499 TemplateURLParsingContext context(param_filter);
500 xmlSAXHandler sax_handler;
501 memset(&sax_handler, 0, sizeof(sax_handler));
502 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
503 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
504 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
505 int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
506 static_cast<int>(length));
507 xmlSubstituteEntitiesDefault(last_sub_entities_value);
509 return error ?
510 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list);