1 /* htmlparsetest.cc: test the MyHtmlParser class
3 * Copyright (C) 2006,2008,2011,2012,2013,2015,2016,2018 Olly Betts
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
27 #include "myhtmlparse.h"
35 const char * keywords
;
39 static const testcase tests
[] = {
40 { "<body>test<!--htdig_noindex-->icle<!--/htdig_noindex-->s</body>",
41 "tests", "", "", "" },
42 { "<body>test<!--htdig_noindex-->ing</body>", "test", "", "", "" },
43 { "hello<!-- bl>ah --> world", "hello world", "", "", "" },
44 { "hello<!-- blah > world", "hello world", "", "", "" },
45 { "<script>\nif (a<b) a = b;</script>test", "test", "", "", "" },
46 // Regression test for bug first noticed in 1.0.0 (but present earlier).
47 { "<b>not</b>\n<b>able</b>", "not able", "", "", "" },
48 // Check that whitespace is handled as intended.
49 { " <b>not </b>\n<b>\table\t</b>\r\n", "not able", "", "", "" },
50 { "<html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
51 { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=iso-8859-1\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
52 { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=utf-8\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
53 { "<html><head><meta charset='utf-8'><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
54 { "<html><head><title>\xc2\xae</title><meta charset=\"utf-8\"></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
55 { "<html><body><p>This is \nthe text</p><p>This is \nthe tex</p></body></html>", "This is the text\rThis is the tex", "", "", "" },
56 // Check we default to UTF-8 for HTML5.
57 { "<!DOCTYPE html><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
58 { "<!Doctype\tHTML ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
59 { "<!Doctype HTML\t><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
60 { "<!DOCTYPE system 'about:legacy-compat'><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
61 { "<!doctype SyStem \"about:legacy-compat\" ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
62 // Check we default to UTF-8 for XML.
63 { "<?xml version=\"1.0\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
64 // Check we handle specifying a charset for XML.
65 { "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
66 { "<!--UdmComment-->test<!--/UdmComment--><div id='body'>test</div>", "test", "", "", "" },
67 { "Foo<![CDATA[ & bar <literal>\"]]> ok", "Foo & bar <literal>\" ok", "", "", "" },
68 { "Foo<![CDATA", "Foo", "", "", "" },
69 { "foo<![CDATA[bar", "foobar", "", "", "" },
70 // Test that handling of multiple body tags matches modern browser behaviour (ticket#599).
71 { "a<html>b<head>c<title>bad</title>d</head>e<body>f</body>g<body>h</body>i</html>j<body>k", "abcdefghijk", "bad", "", "" },
72 { "check<object id='foo'>for<applet foo=\"bar\" />spaces<br> in <p>\tout</p>put\r\n", "check for spaces\rin\rout\rput", "", "", "" },
73 { "tab:<table><tr><th>col 1</th><th>col 2</th></tr><tr><td>test</td><td><img src='foo.jpg'> <img src='bar.jpg'></td></tr><tr><td colspan=2>hello world</td></tr></table>done", "tab:\rcol 1\tcol 2\rtest\rhello world\rdone", "", "", "" },
75 { "<html><body>1 < 2, 3 > 2</body></html>", "1 < 2, 3 > 2", "", "", "" },
76 { "<html><body>&amp;</body></html>", "&", "", "", "" },
77 { "<html><body><Unknown &ent;-ity></body></html>", "<Unknown &ent;-ity>", "", "", "" },
78 { "<html><body>Does a < ä ?</body></html>", "Does a < รค ?", "", "", "" },
79 { "A@>", "A@>", "", "", "" },
86 for (size_t i
= 0; tests
[i
].html
; ++i
) {
89 p
.parse_html(tests
[i
].html
, "iso-8859-1", false);
90 } catch (const string
&newcharset
) {
92 p
.parse_html(tests
[i
].html
, newcharset
, true);
94 if (!p
.indexing_allowed
) {
95 cout
<< "indexing disallowed by meta tag - skipping\n";
98 if (tests
[i
].dump
!= p
.dump
) {
99 cout
<< "DUMP " << i
<< ": [" << p
.dump
<< "] != [" << tests
[i
].dump
<< "]" << endl
;
102 if (tests
[i
].title
!= p
.title
) {
103 cout
<< "TITLE " << i
<< ": [" << p
.title
<< "] != [" << tests
[i
].title
<< "]" << endl
;
106 if (tests
[i
].keywords
!= p
.keywords
) {
107 cout
<< "KEYWORDS " << i
<< ": [" << p
.keywords
<< "] != [" << tests
[i
].keywords
<< "]" << endl
;
110 const char *sample
= tests
[i
].sample
;
111 if (sample
== NULL
) sample
= tests
[i
].dump
;
112 if (sample
!= p
.sample
) {
113 cout
<< "SAMPLE " << i
<< ": [" << p
.sample
<< "] != [" << sample
<< "]" << endl
;