1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/search_engines/template_url_parser.h"
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/string_number_conversions.h"
14 #include "base/string_util.h"
15 #include "base/utf_string_conversions.h"
16 #include "chrome/browser/search_engines/template_url.h"
17 #include "chrome/common/url_constants.h"
18 #include "googleurl/src/gurl.h"
19 #include "libxml/parser.h"
20 #include "libxml/xmlwriter.h"
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.

// Defines for element names of the OSD document:
static const char kURLElement[] = "Url";
static const char kParamElement[] = "Param";
static const char kShortNameElement[] = "ShortName";
static const char kDescriptionElement[] = "Description";
static const char kImageElement[] = "Image";
static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
static const char kLanguageElement[] = "Language";
static const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used.
static const char kURLTypeAttribute[] = "type";
static const char kURLTemplateAttribute[] = "template";
static const char kImageTypeAttribute[] = "type";
static const char kImageWidthAttribute[] = "width";
static const char kImageHeightAttribute[] = "height";
static const char kURLIndexOffsetAttribute[] = "indexOffset";
static const char kURLPageOffsetAttribute[] = "pageOffset";
static const char kParamNameAttribute[] = "name";
static const char kParamValueAttribute[] = "value";
static const char kParamMethodAttribute[] = "method";

// Mime type for search results.
static const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
static const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
static const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
// Removes the namespace from the specified |name|, ex: os:Url -> Url.
// Everything up to and including the first ':' is erased; a |name| that
// contains no ':' is left unchanged. |name| is modified in place and must
// not be NULL (it is dereferenced unconditionally).
static void PruneNamespace(std::string* name) {
  const size_t index = name->find(':');
  if (index != std::string::npos)
    name->erase(0, index + 1);
}
72 // To minimize memory overhead while parsing, a SAX style parser is used.
73 // ParsingContext is used to maintain the state we're in the document
// State shared by the SAX callbacks while one document is parsed: the stack
// of element types currently open (elements_), the image reference being
// built (current_image_), the Param key/value pairs collected for the current
// Url node (extra_params_) and a few per-URL flags.
// NOTE(review): this listing omits a number of original lines (the embedded
// numbering jumps, e.g. 80 -> 95), so the enum body, some method headers and
// some statements are not visible here.
75 class ParsingContext
{
77 // Enum of the known element types.
80 OPEN_SEARCH_DESCRIPTION
,
95 // Key/value of a Param node.
96 typedef std::pair
<std::string
, std::string
> Param
;
// Stores |parameter_filter| and starts with suggestion_method_ == GET and
// both boolean flags cleared.
98 ParsingContext(TemplateURLParser::ParameterFilter
* parameter_filter
,
101 parameter_filter_(parameter_filter
),
103 suggestion_method_(GET
),
104 is_suggest_url_(false),
105 derive_image_from_url_(false) {
// Only acts while the shared name -> type map is still NULL; the guarded
// statement is not visible here (presumably a call to InitMapping()).
106 if (kElementNameToElementTypeMap
== NULL
)
110 // Invoked when an element starts.
// Looks |element| up in kElementNameToElementTypeMap and pushes a type onto
// elements_. The branch taken for names missing from the map is not visible.
111 void PushElement(const std::string
& element
) {
113 if (kElementNameToElementTypeMap
->find(element
) ==
114 kElementNameToElementTypeMap
->end()) {
117 type
= (*kElementNameToElementTypeMap
)[element
];
119 elements_
.push_back(type
);
// Removes the most recently pushed element type. The enclosing method header
// is not visible (presumably PopElement, which EndElementImpl calls).
123 elements_
.pop_back();
126 // Returns the current ElementType.
// Two shapes of the element stack are tested: depth 2 directly below
// OPEN_SEARCH_DESCRIPTION, and depth 3 of the form
// OPEN_SEARCH_DESCRIPTION / URL / PARAM. The values returned are not visible.
127 ElementType
GetKnownType() {
128 if (elements_
.size() == 2 && elements_
[0] == OPEN_SEARCH_DESCRIPTION
)
131 // We only expect PARAM nodes under the Url node
132 if (elements_
.size() == 3 && elements_
[0] == OPEN_SEARCH_DESCRIPTION
&&
133 elements_
[1] == URL
&& elements_
[2] == PARAM
)
// Returns url_, the TemplateURL being filled in.
139 TemplateURL
* template_url() { return url_
; }
// Starts a new pending image reference, but only when both |width| and
// |height| are positive.
141 void AddImageRef(const std::string
& type
, int width
, int height
) {
142 if (width
> 0 && height
> 0)
143 current_image_
.reset(new TemplateURL::ImageRef(type
, width
, height
));
// Drops any pending image reference (enclosing method header not visible).
147 current_image_
.reset();
// If an image reference is pending, stores |url| on it, adds a copy to url_
// and clears the pending reference.
150 void SetImageURL(const GURL
& url
) {
151 if (current_image_
.get()) {
152 current_image_
->url
= url
;
153 url_
->add_image_ref(*current_image_
);
154 current_image_
.reset();
// Character-data accessors; their bodies are not visible in this listing.
162 void AppendString(const string16
& string
) {
166 const string16
& GetString() {
// Forgets all Param key/value pairs collected so far.
170 void ResetExtraParams() {
171 extra_params_
.clear();
// Records a Param key/value pair. When a filter is set and it rejects the
// pair, the statement guarded by the if runs first; that statement is not
// visible (presumably an early return so the pair is not stored).
174 void AddExtraParams(const std::string
& key
, const std::string
& value
) {
175 if (parameter_filter_
&& !parameter_filter_
->KeepParameter(key
, value
))
177 extra_params_
.push_back(Param(key
, value
));
180 const std::vector
<Param
>& extra_params() const { return extra_params_
; }
182 void set_is_suggestion(bool value
) { is_suggest_url_
= value
; }
183 bool is_suggestion() const { return is_suggest_url_
; }
185 TemplateURLParser::ParameterFilter
* parameter_filter() const {
186 return parameter_filter_
;
189 void set_derive_image_from_url(bool derive_image_from_url
) {
190 derive_image_from_url_
= derive_image_from_url
;
193 void set_method(Method method
) { method_
= method
; }
194 Method
method() { return method_
; }
196 void set_suggestion_method(Method method
) { suggestion_method_
= method
; }
197 Method
suggestion_method() { return suggestion_method_
; }
199 // Builds the image URL from the Template search URL if no image URL has been
// Only acts when derive_image_from_url_ is set, url_ has no favicon URL yet
// and url_ has a search URL; the favicon URL is then generated from it.
201 void DeriveImageFromURL() {
202 if (derive_image_from_url_
&&
203 url_
->GetFaviconURL().is_empty() && url_
->url()) {
204 GURL
url(url_
->url()->url()); // More url's please...
205 url_
->SetFaviconURL(TemplateURL::GenerateFaviconURL(url
));
// Allocates kElementNameToElementTypeMap and registers the element names.
// Both "OpenSearchDescription" and "SearchPlugin" map to
// OPEN_SEARCH_DESCRIPTION. The values assigned for the Language and
// InputEncoding names are not visible in this listing.
210 static void InitMapping() {
211 kElementNameToElementTypeMap
= new std::map
<std::string
, ElementType
>;
212 (*kElementNameToElementTypeMap
)[kURLElement
] = URL
;
213 (*kElementNameToElementTypeMap
)[kParamElement
] = PARAM
;
214 (*kElementNameToElementTypeMap
)[kShortNameElement
] = SHORT_NAME
;
215 (*kElementNameToElementTypeMap
)[kDescriptionElement
] = DESCRIPTION
;
216 (*kElementNameToElementTypeMap
)[kImageElement
] = IMAGE
;
217 (*kElementNameToElementTypeMap
)[kOpenSearchDescriptionElement
] =
218 OPEN_SEARCH_DESCRIPTION
;
219 (*kElementNameToElementTypeMap
)[kFirefoxSearchDescriptionElement
] =
220 OPEN_SEARCH_DESCRIPTION
;
221 (*kElementNameToElementTypeMap
)[kLanguageElement
] =
223 (*kElementNameToElementTypeMap
)[kInputEncodingElement
] =
227 // Key is UTF8 encoded.
228 static std::map
<std::string
, ElementType
>* kElementNameToElementTypeMap
;
229 // TemplateURL supplied to Read method. It's owned by the caller, so we
230 // don't need to free it.
// Stack of the element types that are currently open.
232 std::vector
<ElementType
> elements_
;
// Image reference under construction; empty when no image is pending.
233 scoped_ptr
<TemplateURL::ImageRef
> current_image_
;
235 // Character content for the current element.
// Optional filter consulted before a parameter is kept; it is null-checked
// before use in AddExtraParams.
238 TemplateURLParser::ParameterFilter
* parameter_filter_
;
240 // The list of parameters parsed in the Param nodes of a Url node.
241 std::vector
<Param
> extra_params_
;
243 // The HTTP methods used.
245 Method suggestion_method_
;
247 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
248 // search. Note that we don't need a stack as Url nodes cannot be nested.
249 bool is_suggest_url_
;
251 // Whether we should derive the image from the URL (when images are data
253 bool derive_image_from_url_
;
255 DISALLOW_COPY_AND_ASSIGN(ParsingContext
);
259 std::map
<std::string
, ParsingContext::ElementType
>*
260 ParsingContext::kElementNameToElementTypeMap
= NULL
;
262 string16
XMLCharToUTF16(const xmlChar
* value
, int length
) {
263 return UTF8ToUTF16(std::string((const char*)value
, length
));
266 std::string
XMLCharToString(const xmlChar
* value
) {
267 return std::string((const char*)value
);
270 // Returns true if input_encoding contains a valid input encoding string. This
271 // doesn't verify that we have a valid encoding for the string, just that the
272 // string contains characters that constitute a valid input encoding.
// NOTE(review): the return statements and the tail of the final condition are
// missing from this listing; only the tests themselves are visible.
273 bool IsValidEncodingString(const std::string
& input_encoding
) {
// Empty input is special-cased (result not visible; presumably invalid).
274 if (input_encoding
.empty())
// The first character is tested for being an ASCII letter.
277 if (!IsAsciiAlpha(input_encoding
[0]))
// Each remaining character is tested against ASCII letters, ASCII digits,
// '.', '_' and at least one further alternative that is cut off below.
280 for (size_t i
= 1, max
= input_encoding
.size(); i
< max
; ++i
) {
281 char c
= input_encoding
[i
];
282 if (!IsAsciiAlpha(c
) && !IsAsciiDigit(c
) && c
!= '.' && c
!= '_' &&
// Reads the attributes of a Url element and records the result on the
// TemplateURL held by |context|. |atts| is walked as a NULL-terminated array
// in which attributes[0] is a name and attributes[1] its value.
// NOTE(review): the declaration of page_offset, the loop increment and the
// conditions of the trailing if/else chain are missing from this listing.
290 void ParseURL(const xmlChar
** atts
, ParsingContext
* context
) {
294 TemplateURL
* turl
= context
->template_url();
295 const xmlChar
** attributes
= atts
;
296 std::string template_url
;
297 bool is_post
= false;
298 bool is_html_url
= false;
299 bool is_suggest_url
= false;
300 int index_offset
= 1;
303 while (*attributes
) {
304 std::string
name(XMLCharToString(*attributes
));
305 const xmlChar
* value
= attributes
[1];
// "type" decides whether this is the HTML search URL or the suggestion URL.
306 if (name
== kURLTypeAttribute
) {
307 std::string type
= XMLCharToString(value
);
308 is_html_url
= (type
== kHTMLType
);
309 is_suggest_url
= (type
== kSuggestionType
);
310 } else if (name
== kURLTemplateAttribute
) {
311 template_url
= XMLCharToString(value
);
// Both offsets are parsed as integers and then raised to at least 1.
312 } else if (name
== kURLIndexOffsetAttribute
) {
313 base::StringToInt(XMLCharToString(value
), &index_offset
);
314 index_offset
= std::max(1, index_offset
);
315 } else if (name
== kURLPageOffsetAttribute
) {
316 base::StringToInt(XMLCharToString(value
), &page_offset
);
317 page_offset
= std::max(1, page_offset
);
// "method" is compared against "post" using LowerCaseEqualsASCII.
318 } else if (name
== kParamMethodAttribute
) {
319 is_post
= LowerCaseEqualsASCII(XMLCharToString(value
), "post");
// Search-URL branch: stores the template and marks the context as not
// parsing a suggestion URL. The guards around these statements (and around
// set_method(POST)) are not visible here.
324 turl
->SetURL(template_url
, index_offset
, page_offset
);
325 context
->set_is_suggestion(false);
327 context
->set_method(ParsingContext::POST
);
// Suggestion-URL branch: same bookkeeping for the suggestions template.
328 } else if (is_suggest_url
) {
329 turl
->SetSuggestionsURL(template_url
, index_offset
, page_offset
);
330 context
->set_is_suggestion(true);
332 context
->set_suggestion_method(ParsingContext::POST
);
// Reads the type, width and height attributes of an Image element and, when
// width and height are positive and the type is non-empty, registers a
// pending image reference on |context|.
// NOTE(review): the declarations of type/width/height and the loop increment
// are missing from this listing.
336 void ParseImage(const xmlChar
** atts
, ParsingContext
* context
) {
340 const xmlChar
** attributes
= atts
;
// |atts| is walked until a NULL name; attributes[1] is the matching value.
344 while (*attributes
) {
345 std::string
name(XMLCharToString(*attributes
));
346 const xmlChar
* value
= attributes
[1];
347 if (name
== kImageTypeAttribute
) {
348 type
= XMLCharToString(value
);
349 } else if (name
== kImageWidthAttribute
) {
350 base::StringToInt(XMLCharToString(value
), &width
);
351 } else if (name
== kImageHeightAttribute
) {
352 base::StringToInt(XMLCharToString(value
), &height
);
356 if (width
> 0 && height
> 0 && !type
.empty()) {
358 context
->AddImageRef(type
, width
, height
);
// Reads the name and value attributes of a Param element and hands the pair
// to context->AddExtraParams().
// NOTE(review): the loop increment and any checks between the loop and the
// final call are missing from this listing.
362 void ParseParam(const xmlChar
** atts
, ParsingContext
* context
) {
366 const xmlChar
** attributes
= atts
;
367 std::string key
, value
;
// |atts| is walked until a NULL name; attributes[1] is the matching value.
368 while (*attributes
) {
369 std::string
name(XMLCharToString(*attributes
));
370 const xmlChar
* val
= attributes
[1];
371 if (name
== kParamNameAttribute
) {
372 key
= XMLCharToString(val
);
373 } else if (name
== kParamValueAttribute
) {
374 value
= XMLCharToString(val
);
379 context
->AddExtraParams(key
, value
);
// Appends a parameter to |*query|; the only visible statement appends
// |value|.
// NOTE(review): the lines that handle |key| and any separators are missing
// from this listing.
382 static void AppendParamToQuery(const std::string
& key
,
383 const std::string
& value
,
384 std::string
* query
) {
391 query
->append(value
);
// Rebuilds the query string of the URL that was just parsed: existing
// parameters are run through the parameter filter, the extra Param pairs
// collected on |context| are appended, and the resulting URL is written back
// to the TemplateURL with the original index/page offsets.
// NOTE(review): several lines (the second arm of the ?: expression, early
// returns, else branches, uses of |modified|) are missing from this listing.
394 void ProcessURLParams(ParsingContext
* context
) {
395 TemplateURL
* t_url
= context
->template_url();
// Picks the suggestions URL when the context is parsing a suggestion URL.
396 const TemplateURLRef
* t_url_ref
=
397 context
->is_suggestion() ? t_url
->suggestions_url() :
// No filter and no extra parameters: the guarded statement is not visible
// (presumably an early return, since nothing would change).
402 if (!context
->parameter_filter() && context
->extra_params().empty())
405 GURL
url(t_url_ref
->url());
406 // If there is a parameter filter, parse the existing URL and remove any
407 // unwanted parameter.
408 TemplateURLParser::ParameterFilter
* filter
= context
->parameter_filter();
409 std::string new_query
;
410 bool modified
= false;
412 url_parse::Component query
= url
.parsed_for_possibly_invalid_spec().query
;
413 url_parse::Component key
, value
;
414 const char* url_spec
= url
.spec().c_str();
// Walks the key/value pairs of the existing query; pairs accepted by the
// filter are copied into new_query.
415 while (url_parse::ExtractQueryKeyValue(url_spec
, &query
, &key
, &value
)) {
416 std::string
key_str(url_spec
, key
.begin
, key
.len
);
417 std::string
value_str(url_spec
, value
.begin
, value
.len
);
418 if (filter
->KeepParameter(key_str
, value_str
)) {
419 AppendParamToQuery(key_str
, value_str
, &new_query
);
// Takes the existing query unchanged (the guarding condition is not visible).
426 new_query
= url
.query();
428 // Add the extra parameters if any.
429 const std::vector
<ParsingContext::Param
>& params
= context
->extra_params();
430 if (!params
.empty()) {
432 std::vector
<ParsingContext::Param
>::const_iterator iter
;
433 for (iter
= params
.begin(); iter
!= params
.end(); ++iter
)
434 AppendParamToQuery(iter
->first
, iter
->second
, &new_query
);
// Swaps the rebuilt query into the URL and stores it on the matching slot.
438 GURL::Replacements repl
;
439 repl
.SetQueryStr(new_query
);
440 url
= url
.ReplaceComponents(repl
);
441 if (context
->is_suggestion()) {
442 t_url
->SetSuggestionsURL(url
.spec(),
443 t_url_ref
->index_offset(),
444 t_url_ref
->page_offset());
446 t_url
->SetURL(url
.spec(),
447 t_url_ref
->index_offset(),
448 t_url_ref
->page_offset());
// SAX start-element callback (installed as sax_handler.startElement).
// |ctx| is the ParsingContext passed to the parser. The namespace prefix is
// stripped from |name|, the element is pushed on the context, and Url, Image
// and Param elements have their attributes parsed.
// NOTE(review): the break/default lines of the switch are missing from this
// listing.
453 void StartElementImpl(void *ctx
, const xmlChar
*name
, const xmlChar
**atts
) {
454 ParsingContext
* context
= reinterpret_cast<ParsingContext
*>(ctx
);
455 std::string
node_name((const char*)name
);
456 PruneNamespace(&node_name
);
457 context
->PushElement(node_name
);
458 switch (context
->GetKnownType()) {
459 case ParsingContext::URL
:
// Params collected for a previous Url node are cleared first.
460 context
->ResetExtraParams();
461 ParseURL(atts
, context
);
463 case ParsingContext::IMAGE
:
464 ParseImage(atts
, context
);
466 case ParsingContext::PARAM
:
467 ParseParam(atts
, context
);
// Clears the accumulated character data for the new element.
472 context
->ResetString();
// SAX end-element callback (installed as sax_handler.endElement). Applies the
// character data gathered for the element that is closing to the TemplateURL,
// then clears the string and pops the element. |name| is not used in the
// visible lines.
// NOTE(review): the break/else/default lines of the switch are missing from
// this listing.
475 void EndElementImpl(void *ctx
, const xmlChar
*name
) {
476 ParsingContext
* context
= reinterpret_cast<ParsingContext
*>(ctx
);
477 switch (context
->GetKnownType()) {
478 case ParsingContext::SHORT_NAME
:
479 context
->template_url()->set_short_name(context
->GetString());
481 case ParsingContext::DESCRIPTION
:
482 context
->template_url()->set_description(context
->GetString());
484 case ParsingContext::IMAGE
: {
485 GURL
image_url(UTF16ToUTF8(context
->GetString()));
486 if (image_url
.SchemeIs(chrome::kDataScheme
)) {
487 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
488 // decode the data URL in the renderer. For now, we'll just point to the
489 // favicon from the URL.
490 context
->set_derive_image_from_url(true);
492 context
->SetImageURL(image_url
);
497 case ParsingContext::LANGUAGE
:
498 context
->template_url()->add_language(context
->GetString());
500 case ParsingContext::INPUT_ENCODING
: {
// Only encodings accepted by IsValidEncodingString() are recorded.
501 std::string input_encoding
= UTF16ToASCII(context
->GetString());
502 if (IsValidEncodingString(input_encoding
))
503 context
->template_url()->add_input_encoding(input_encoding
);
506 case ParsingContext::URL
:
// Applies the parameter filter and extra Param pairs to the finished URL.
507 ProcessURLParams(context
);
512 context
->ResetString();
513 context
->PopElement();
516 void CharactersImpl(void *ctx
, const xmlChar
*ch
, int len
) {
517 ParsingContext
* context
= reinterpret_cast<ParsingContext
*>(ctx
);
518 context
->AppendString(XMLCharToUTF16(ch
, len
));
521 // Returns true if the ref is null, or the url wrapped by ref is
522 // valid with a spec of http/https.
// NOTE(review): the lines between the signature and the GURL construction
// are missing from this listing (presumably the null check that the comment
// above this function describes).
523 bool IsHTTPRef(const TemplateURLRef
* ref
) {
526 GURL
url(ref
->url());
// Valid URL whose scheme is http or https.
527 return (url
.is_valid() && (url
.SchemeIs(chrome::kHttpScheme
) ||
528 url
.SchemeIs(chrome::kHttpsScheme
)));
531 // Returns true if the TemplateURL is legal. A legal TemplateURL is one
532 // where all URLs have a spec of http/https.
// NOTE(review): the return statements are missing from this listing; only
// the checks themselves are visible.
533 bool IsLegal(TemplateURL
* url
) {
// Both the search URL and the suggestions URL are tested with IsHTTPRef().
534 if (!IsHTTPRef(url
->url()) || !IsHTTPRef(url
->suggestions_url()))
536 // Make sure all the image refs are legal.
537 const std::vector
<TemplateURL::ImageRef
>& image_refs
= url
->image_refs();
538 for (size_t i
= 0; i
< image_refs
.size(); i
++) {
539 GURL
image_url(image_refs
[i
].url
);
// Flags an image URL that is invalid or whose scheme is not http/https.
540 if (!image_url
.is_valid() ||
541 !(image_url
.SchemeIs(chrome::kHttpScheme
) ||
542 image_url
.SchemeIs(chrome::kHttpsScheme
))) {
552 bool TemplateURLParser::Parse(const unsigned char* data
, size_t length
,
553 TemplateURLParser::ParameterFilter
* param_filter
,
556 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
557 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
558 // If this becomes problematic we'll need to provide our own entity
559 // type for &amp;, or strip out &#34; by hand after parsing.
560 int last_sub_entities_value
= xmlSubstituteEntitiesDefault(1);
561 ParsingContext
context(param_filter
, url
);
562 xmlSAXHandler sax_handler
;
563 memset(&sax_handler
, 0, sizeof(sax_handler
));
564 sax_handler
.startElement
= &StartElementImpl
;
565 sax_handler
.endElement
= &EndElementImpl
;
566 sax_handler
.characters
= &CharactersImpl
;
567 xmlSAXUserParseMemory(&sax_handler
, &context
,
568 reinterpret_cast<const char*>(data
),
569 static_cast<int>(length
));
570 xmlSubstituteEntitiesDefault(last_sub_entities_value
);
571 // If the image was a data URL, use the favicon from the search URL instead.
572 // (see TODO in EndElementImpl()).
573 context
.DeriveImageFromURL();
575 // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
576 // that use POST yet.
577 if (context
.method() == ParsingContext::POST
)
579 if (context
.suggestion_method() == ParsingContext::POST
)
580 url
->SetSuggestionsURL("", 0, 0);
582 if (!url
->short_name().empty() && !url
->description().empty()) {
583 // So far so good, make sure the urls are http.