1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome_frame/html_utils.h"
10 #include "base/string_util.h"
11 #include "base/string_tokenizer.h"
12 #include "chrome_frame/utils.h"
// The double-quote and single-quote characters.
const wchar_t kQuotes[] = L"\"'";
// Header name that HasFrameBustingHeader() looks for.
const char kXFrameOptionsHeader[] = "X-Frame-Options";
// Header value that HasFrameBustingHeader() compares against, ignoring ASCII
// case.
const char kXFrameOptionsValueAllowAll[] = "allowall";
18 HTMLScanner::StringRange::StringRange() {
21 HTMLScanner::StringRange::StringRange(StrPos start
, StrPos end
)
22 : start_(start
), end_(end
) {
25 bool HTMLScanner::StringRange::LowerCaseEqualsASCII(const char* other
) const {
26 return ::LowerCaseEqualsASCII(start_
, end_
, other
);
29 bool HTMLScanner::StringRange::Equals(const wchar_t* other
) const {
30 int ret
= wcsncmp(&start_
[0], other
, end_
- start_
);
32 ret
= (other
[end_
- start_
] == L
'\0') ? 0 : -1;
36 std::wstring
HTMLScanner::StringRange::Copy() const {
37 return std::wstring(start_
, end_
);
40 bool HTMLScanner::StringRange::GetTagName(std::wstring
* tag_name
) const {
41 if (*start_
!= L
'<') {
42 LOG(ERROR
) << "Badly formatted tag found";
46 StrPos name_start
= start_
;
48 while (name_start
< end_
&& IsWhitespace(*name_start
))
51 if (name_start
>= end_
) {
52 // We seem to have a degenerate tag (i.e. < >). Return false here.
56 StrPos name_end
= name_start
+ 1;
57 while (name_end
< end_
&& !IsWhitespace(*name_end
))
60 if (name_end
> end_
) {
61 // This looks like an improperly formatted tab ('<foo'). Return false here.
65 tag_name
->assign(name_start
, name_end
);
// Searches this tag for the attribute named |attribute_name| and, when it is
// found, points |attribute_value| at the token holding its value (after
// calling UnQuote() on it).
// The range is split into tokens on ' ', '=' and '/', with quoted strings kept
// as single tokens and delimiters returned as tokens. The first non-delimiter
// token (the tag name) is skipped; after that each '=' toggles between
// expecting an attribute name and expecting a value. Names are matched with
// ::LowerCaseEqualsASCII().
// NOTE(review): several statements of this function (the early exits, the
// loop `continue`s and the final return) are not visible in this listing;
// confirm the exact return values against the complete file.
70 bool HTMLScanner::StringRange::GetTagAttribute(const wchar_t* attribute_name
,
71 StringRange
* attribute_value
) const {
72 if (NULL
== attribute_name
|| NULL
== attribute_value
) {
77 // Use this so we can use the convenience method LowerCaseEqualsASCII()
78 // from string_util.h.
79 std::string
search_name_ascii(WideToASCII(attribute_name
));
81 WStringTokenizer
tokenizer(start_
, end_
, L
" =/");
82 tokenizer
.set_options(WStringTokenizer::RETURN_DELIMS
);
84 // Set up the quote chars so that we get quoted attribute values as single
// tokens.
86 tokenizer
.set_quote_chars(L
"\"'");
// The parse state is a two-valued flag that is flipped on every '='.
88 const bool PARSE_STATE_NAME
= true;
89 const bool PARSE_STATE_VALUE
= false;
90 bool parse_state
= PARSE_STATE_NAME
;
92 // Used to skip the first token, which is the tag name.
93 bool first_token_skipped
= false;
95 // This is set during a loop iteration in which an '=' sign was spotted.
96 // It is used to filter out degenerate tags in which another '=' follows
// before any non-delimiter token has been seen.
98 bool last_token_was_delim
= false;
100 // Set this if the attribute name has been found that we might then
101 // pick up the value in the next loop iteration.
102 bool attribute_name_found
= false;
104 while (tokenizer
.GetNext()) {
105 // If we have a whitespace delimiter, just keep going. Cases of this should
106 // be reduced by the CollapseWhitespace call. If we have an '=' character,
107 // we update our state and reiterate.
108 if (tokenizer
.token_is_delim()) {
109 if (*tokenizer
.token_begin() == L
'=') {
110 if (last_token_was_delim
) {
111 // Looks like we have a badly formed tag, just stop parsing now.
114 parse_state
= !parse_state
;
115 last_token_was_delim
= true;
120 last_token_was_delim
= false;
122 // The first non-delimiter token is the tag name, which we don't want.
123 if (!first_token_skipped
) {
124 first_token_skipped
= true;
128 if (PARSE_STATE_NAME
== parse_state
) {
129 // We have an attribute name, check to see if it matches our target name:
130 if (::LowerCaseEqualsASCII(tokenizer
.token_begin(), tokenizer
.token_end(),
131 search_name_ascii
.c_str())) {
132 attribute_name_found
= true;
135 } else if (PARSE_STATE_VALUE
== parse_state
&& attribute_name_found
) {
136 attribute_value
->start_
= tokenizer
.token_begin();
137 attribute_value
->end_
= tokenizer
.token_end();
139 // Unquote the attribute value if need be.
140 attribute_value
->UnQuote();
143 } else if (PARSE_STATE_VALUE
== parse_state
) {
144 // If we haven't found the attribute name we want yet, ignore this token
145 // and go back to looking for our name.
146 parse_state
= PARSE_STATE_NAME
;
// Tests whether this range both starts and ends with the same quote character,
// either ' or ". Ranges shorter than two characters are rejected up front.
// NOTE(review): the statements executed in each branch are not visible in this
// listing. Presumably the surrounding quotes are stripped by moving start_ and
// end_ inward; confirm against the complete file.
153 bool HTMLScanner::StringRange::UnQuote() {
154 if (start_
+ 2 > end_
) {
155 // String's too short to be quoted, bail.
// The last character of the range is at end_ - 1.
159 if ((*start_
== L
'\'' && *(end_
- 1) == L
'\'') ||
160 (*start_
== L
'"' && *(end_
- 1) == L
'"')) {
// Builds a scanner over |html_string|. The text is stored in html_string_
// after being passed through CollapseWhitespace(html_string, true).
// NOTE(review): the remainder of the initializer list and the constructor body
// are not visible in this listing. Presumably quotes_ is initialized there;
// confirm against the complete file.
169 HTMLScanner::HTMLScanner(const wchar_t* html_string
)
170 : html_string_(CollapseWhitespace(html_string
, true)),
174 void HTMLScanner::GetTagsByName(const wchar_t* name
, StringRangeList
* tag_list
,
175 const wchar_t* stop_tag
) {
176 DCHECK(NULL
!= name
);
177 DCHECK(NULL
!= tag_list
);
178 DCHECK(NULL
!= stop_tag
);
180 StringRange
remaining_html(html_string_
.begin(), html_string_
.end());
182 std::wstring
search_name(name
);
183 TrimWhitespace(search_name
, TRIM_ALL
, &search_name
);
185 // Use this so we can use the convenience method LowerCaseEqualsASCII()
186 // from string_util.h.
187 std::string
search_name_ascii(WideToASCII(search_name
));
188 std::string
stop_tag_ascii(WideToASCII(stop_tag
));
190 StringRange current_tag
;
191 std::wstring current_name
;
192 while (NextTag(&remaining_html
, ¤t_tag
)) {
193 if (current_tag
.GetTagName(¤t_name
)) {
194 if (LowerCaseEqualsASCII(current_name
, search_name_ascii
.c_str())) {
195 tag_list
->push_back(current_tag
);
196 } else if (LowerCaseEqualsASCII(current_name
, stop_tag_ascii
.c_str())) {
197 // We hit the stop tag so it's time to go home.
// Starts with both in_quote and in_escape cleared.
// NOTE(review): the enclosing struct declaration is not visible in this
// listing. A quote_char member is used by the scanning code but is not
// initialized here; confirm it is always written before it is read.
208 ScanState() : in_quote(false), in_escape(false) {}
211 bool HTMLScanner::IsQuote(wchar_t c
) {
212 return quotes_
.find(c
) != std::wstring::npos
;
// Checks whether the two characters immediately before |pos| are both '-'.
// The check is only made when |pos| lies before html_string->end_ and more
// than two characters past html_string->start_, so pos - 1 and pos - 2 are
// always inside the range.
// NOTE(review): the last line of the condition and the fall-through return are
// not visible in this listing. Presumably the hidden condition tests that
// *pos is '>'; confirm against the complete file.
215 bool HTMLScanner::IsHTMLCommentClose(StringRange
* html_string
, StrPos pos
) {
216 if (pos
< html_string
->end_
&& pos
> html_string
->start_
+ 2 &&
218 return *(pos
-1) == L
'-' && *(pos
-2) == L
'-';
// Locates the next tag in |html_string| and stores its range in |tag|.
// tag->start_ begins at html_string->start_ and is moved to the first '<'.
// If the tag's name starts with "!--" the end of the tag is searched for with
// IsHTMLCommentClose(). Otherwise the scan tracks quoted strings and backslash
// escapes, so that a '>' is only examined while not inside a quoted string.
// At the end, html_string->start_ is set to one past tag->end_.
// NOTE(review): the loop increments, the declaration of |state| and all
// return statements are not visible in this listing; confirm the exact return
// values against the complete file.
223 bool HTMLScanner::NextTag(StringRange
* html_string
, StringRange
* tag
) {
224 DCHECK(NULL
!= html_string
);
227 tag
->start_
= html_string
->start_
;
228 while (tag
->start_
< html_string
->end_
&& *tag
->start_
!= L
'<') {
232 // No '<' was found: we went past the end of the string.
233 if (tag
->start_
>= html_string
->end_
) {
237 tag
->end_
= tag
->start_
+ 1;
239 // Get the tag name to see if we are in an HTML comment. If we are, then
240 // don't consider quotes. This should work for example:
241 // <!-- foo ' --> <meta foo='bar'>
242 std::wstring tag_name
;
// The name is read from a range that runs to the end of the input, because
// the end of this tag is not known yet.
243 StringRange
start_range(tag
->start_
, html_string
->end_
);
244 start_range
.GetTagName(&tag_name
);
245 if (StartsWith(tag_name
, L
"!--", true)) {
246 // We're inside a comment tag, keep going until we get out of it.
247 while (tag
->end_
< html_string
->end_
&&
248 !IsHTMLCommentClose(html_string
, tag
->end_
)) {
252 // Properly handle quoted strings within non-comment tags by maintaining
253 // some state while scanning. Specifically, we have to maintain state on
254 // whether we are inside a string, what the string terminating character
255 // will be and whether we are inside an escape sequence.
257 while (tag
->end_
< html_string
->end_
) {
258 if (state
.in_quote
) {
259 if (state
.in_escape
) {
260 state
.in_escape
= false;
261 } else if (*tag
->end_
== '\\') {
262 state
.in_escape
= true;
263 } else if (*tag
->end_
== state
.quote_char
) {
264 state
.in_quote
= false;
// Records the current character as the candidate terminator and enters the
// quoted state if that character is a quote.
267 state
.in_quote
= IsQuote(state
.quote_char
= *tag
->end_
);
270 if (!state
.in_quote
&& *tag
->end_
== L
'>') {
277 // We hit the end_ but found no matching tag closure. Consider this an
278 // incomplete tag and do not report it.
279 if (tag
->end_
>= html_string
->end_
)
282 // Modify html_string to point to just beyond the end_ of the current tag.
283 html_string
->start_
= tag
->end_
+ 1;
288 namespace http_utils
{
// Token used as the product name in the string built by
// GetChromeFrameUserAgent() and searched for in existing user agent values.
const char kChromeFrameUserAgent[] = "chromeframe";
292 const char* GetChromeFrameUserAgent() {
293 static char cf_user_agent
[100] = {0};
294 if (!cf_user_agent
[0]) {
295 _pAtlModule
->m_csStaticDataInitAndTypeInfo
.Lock();
296 if (!cf_user_agent
[0]) {
297 uint32 high_version
= 0, low_version
= 0;
298 GetModuleVersion(reinterpret_cast<HMODULE
>(&__ImageBase
), &high_version
,
300 wsprintfA(cf_user_agent
, "%s/%i.%i.%i.%i", kChromeFrameUserAgent
,
301 HIWORD(high_version
), LOWORD(high_version
),
302 HIWORD(low_version
), LOWORD(low_version
));
304 _pAtlModule
->m_csStaticDataInitAndTypeInfo
.Unlock();
306 return cf_user_agent
;
// Produces a user agent value that includes the Chrome Frame token.
// |value| is expected to be the bare header value: a DCHECK verifies that it
// does not start with "User-Agent:". If |value| already contains
// kChromeFrameUserAgent it is treated as already tagged. Otherwise a copy of
// |value| is made and the result of GetChromeFrameUserAgent() is appended.
// NOTE(review): the condition guarding the "empty user agent value" warning,
// any separator inserted before the appended token, and all return statements
// are not visible in this listing; confirm against the complete file.
309 std::string
AddChromeFrameToUserAgentValue(const std::string
& value
) {
311 DLOG(WARNING
) << "empty user agent value";
315 DCHECK_EQ(false, StartsWithASCII(value
, "User-Agent:", true));
317 if (value
.find(kChromeFrameUserAgent
) != std::string::npos
) {
318 // Our user agent has already been added.
322 std::string
ret(value
);
324 ret
+= GetChromeFrameUserAgent();
329 std::string
GetDefaultUserAgentHeaderWithCFTag() {
330 std::string
ua(GetDefaultUserAgent());
331 return "User-Agent: " + AddChromeFrameToUserAgentValue(ua
);
// Obtains a user agent string by calling ::ObtainUserAgentString().
// The call is made with a buffer of size + 1 characters, starting with
// size == MAX_PATH, and is repeated (at most 10 attempts in total) for as long
// as it returns E_OUTOFMEMORY. On success the string is resized to size - 1.
// A final DCHECK verifies that the string's length matches lstrlenA() of its
// contents.
// NOTE(review): after the first E_OUTOFMEMORY result |retries| is still 1, so
// the second attempt uses MAX_PATH again; confirm whether that is intended.
// NOTE(review): the declaration of |ret|, the branch containing NOTREACHED()
// and the return statement are not visible in this listing.
334 std::string
GetDefaultUserAgent() {
336 DWORD size
= MAX_PATH
; // NOLINT
// Seeded with E_OUTOFMEMORY so that the loop body runs at least once.
337 HRESULT hr
= E_OUTOFMEMORY
;
338 for (int retries
= 1; hr
== E_OUTOFMEMORY
&& retries
<= 10; ++retries
) {
339 hr
= ::ObtainUserAgentString(0, WriteInto(&ret
, size
+ 1), &size
);
340 if (hr
== E_OUTOFMEMORY
) {
341 size
= MAX_PATH
* retries
;
342 } else if (SUCCEEDED(hr
)) {
343 // Truncate the extra allocation.
344 DCHECK(size
> 0); // NOLINT
345 ret
.resize(size
- sizeof(char)); // NOLINT
350 NOTREACHED() << StringPrintf("ObtainUserAgentString==0x%08X", hr
);
353 DCHECK(ret
.length() == lstrlenA(ret
.c_str()));
// Walks the header lines in |http_headers| (split on "\r\n") looking for one
// whose name equals kXFrameOptionsHeader. For such a header, the length of the
// value is compared with the length of "allowall" and the characters are
// compared with std::equal() using a CaseInsensitiveCompareASCII predicate.
// NOTE(review): one argument of std::equal() and all return statements are not
// visible in this listing. Presumably the function returns true when an
// X-Frame-Options header with a value other than "allowall" is present;
// confirm against the complete file.
// NOTE(review): the header name is compared with operator==, which looks
// case-sensitive; confirm that differently-cased header names need not match.
359 bool HasFrameBustingHeader(const std::string
& http_headers
) {
360 net::HttpUtil::HeadersIterator
it(
361 http_headers
.begin(), http_headers
.end(), "\r\n");
362 while (it
.GetNext()) {
363 if (it
.name() == kXFrameOptionsHeader
) {
364 std::string
allow_all(kXFrameOptionsValueAllowAll
);
365 if (it
.values_end() - it
.values_begin() != allow_all
.length() ||
366 !std::equal(it
.values_begin(), it
.values_end(),
368 CaseInsensitiveCompareASCII
<const char>())) {
377 } // namespace http_utils