xapian-applications/omega/htmlparsertest.cc

   1 /** @file
   2  * @brief test the HtmlParser class
   3  */
   4 /* Copyright (C) 2006-2023 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation; either version 2 of the
   9  * License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  19  * USA
  20  */
  21
  22 #include <config.h>
  23
  24 #include <cstdlib>
  25 #include <cstring>
  26 #include <iostream>
  27 #include <string>
  28 #include <string_view>
  29
  30 #include "htmlparser.h"
  31
  32 using namespace std;
  33
  34 struct testcase {
  35     const char * html;
  36     const char * dump;
  37     const char * title;
  38     const char * keywords;
  39     const char * sample;
  40 };
  41
  42 // Wide character test data is signalled by a single leading nul and terminated
  43 // by a double nul.
  44 #define WIDE(X) "\0" X "\0"
  45
  46 static const testcase tests[] = {
  47     { "<body>test<!--htdig_noindex-->icle<!--/htdig_noindex-->s</body>",
  48       "tests", "", "", "" },
  49     { "<body>test<!--htdig_noindex-->ing</body>", "test", "", "", "" },
  50     { "hello<!-- bl>ah --> world", "hello world", "", "", "" },
  51     { "hello<!-- blah > world", "hello world", "", "", "" },
  52     { "<script>\nif (a<b) a = b;</script>test", "test", "", "", "" },
  53     // Regression test for bug first noticed in 1.0.0 (but present earlier).
  54     { "<b>not</b>\n<b>able</b>", "not able", "", "", "" },
  55     // Check that whitespace is handled as intended.
  56     { " <b>not </b>\n<b>\table\t</b>\r\n", "not able", "", "", "" },
  57     { "<html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  58     { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=iso-8859-1\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  59     { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=utf-8\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  60     { "<html><head><meta charset='utf-8'><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  61     { "<html><head><title>\xc2\xae</title><meta charset=\"utf-8\"></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  62     // The UTF-8 "BOM" should also set the charset to utf-8.
  63     { "\xef\xbb\xbf<html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  64     { "<title>X</title>", "", "X", "", "" },
  65     { WIDE("\xff\xfe<\0t\0i\0t\0l\0e\0>\0\x20\x26<\0/\0t\0i\0t\0l\0e\0>\0"), "", "\xe2\x98\xa0", "", "" },
  66     { WIDE("\xfe\xff\0<\0t\0i\0t\0l\0e\0>\x26\x20\0<\0/\0t\0i\0t\0l\0e\0>"), "", "\xe2\x98\xa0", "", "" },
  67     { "<html><body><p>This is \nthe text</p><p>This is \nthe tex</p></body></html>", "This is the text This is the tex", "", "", "" },
  68     // Check we default to UTF-8 for HTML5.
  69     { "<!DOCTYPE html><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  70     { "<!Doctype\tHTML  ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  71     { "<!Doctype  HTML\t><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  72     { "<!DOCTYPE system 'about:legacy-compat'><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  73     { "<!doctype SyStem \"about:legacy-compat\" ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  74     // Check we default to UTF-8 for XML.
  75     { "<?xml version=\"1.0\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  76     // Check we handle specify a charset for XML.
  77     { "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  78     // Check the XML gets case-sensitive handling.
  79     { "<?xml version=\"1.0\"?><html><head><TITLE>Not really a title</TITLE><meta Name='keywords' value='not really keywords'/></head><body>test</body></html>", "Not really a titletest", "", "", "" },
  80     { "<!--UdmComment-->test<!--/UdmComment--><div id='body'>test</div>", "test", "", "", "" },
  81     { "Foo<![CDATA[ & bar <literal>\"]]> ok", "Foo & bar <literal>\" ok", "", "", "" },
  82     { "Foo<![CDATA", "Foo", "", "", "" },
  83     { "foo<![CDATA[bar", "foobar", "", "", "" },
  84     // Test that handling of multiple body tags matches modern browser behaviour (ticket#599).
  85     { "a<html>b<head>c<title>bad</title>d</head>e<body>f</body>g<body>h</body>i</html>j<body>k", "abcdefghijk", "bad", "", "" },
  86     { "check<object id='foo'>for<applet foo=\"bar\" />spaces<br> in <p>\tout</p>put\r\n", "check for spaces in out put", "", "", "" },
  87     { "tab:<table><tr><th>col 1</th><th>col 2</th></tr><tr><td>test</td><td><img src='foo.jpg'> <img src='bar.jpg'></td></tr><tr><td colspan=2>hello world</td></tr></table>done", "tab: col 1 col 2 test hello world done", "", "", "" },
  88     // Test HTML checkboxes are converted to Unicode symbols.
  89     { "<input type=checkbox><input checked=checked type=checkbox><input type=checkbox checked>", "\xe2\x98\x90\xe2\x98\x91\xe2\x98\x91", "", "", "" },
  90     // Test entities.
  91     { "<html><body>1 &lt; 2, 3 &gt; 2</body></html>", "1 < 2, 3 > 2", "", "", "" },
  92     { "<html><body>&amp;amp;</body></html>", "&amp;", "", "", "" },
  93     { "<html><body>&lt;Unknown &ent;-ity&gt;</body></html>", "<Unknown &ent;-ity>", "", "", "" },
  94     { "<html><body>&#68;oes &#97; &lt; &auml; &#x3f</body></html>", "Does a < ä ?", "", "", "" },
  95     { "&#65;&#x40;&gt", "A@>", "", "", "" },
  96     // Test empty tags.
  97     //
  98     // First two cases are a regression test - in Omega < 1.4.16 the title
  99     // wasn't closed and any body content was put into the title instead.
 100     { "<head><title xml:lang=\"en-US\"/></head><body><p>Body</p></body>", "Body", "", "", "" },
 101     { "<head><title xml:lang='en-US'/></head><body><p>Body</p></body>", "Body", "", "", "" },
 102     { "<head><title xml:lang=\"en-US\" /></head><body><p>Body</p></body>", "Body", "", "", "" },
 103     { "<head><title xml:lang='en-US\" /></head><body><p>Body</p></body>", "Body", "", "", "" },
 104     { "<head><title/></head><body><p>Body</p></body>", "Body", "", "", "" },
 105     { "<head><title /></head><body><p>Body</p></body>", "Body", "", "", "" },
 106     // Test attribute names are handled case-insensitively in HTML but not XHTML.
 107     { "<html><head><MeTa Name=KeywordS CONTENT='testing'></head><body>Body</body></html>", "Body", "", "testing", "" },
 108     { "<?xml version=\"1.0\"?><html><head><meta name=keywords content='testing'/></head><body>Body</body></html>", "Body", "", "testing", "" },
 109     { "<?xml version=\"1.0\"?><html><head><meta Name=keywords content='testing'/></head><body>Body</body></html>", "Body", "", "", "" },
 110     { "<?xml version=\"1.0\"?><html><head><meta name=keywords Content='testing'/></head><body>Body</body></html>", "Body", "", "", "" },
 111     // Test handling of PHP tags.
 112     { "T<?php $a=PHP_MAJOR_VERSION > 7 ?>\r\ne<? if ($a) new(); ?>\ns<?= $a ?>\rting<? ?>\n\nPHP<?php $a=0;", "Testing PHP", "", "", "" },
 113     { 0, 0, 0, 0, 0 }
 114 };
 115
 116 int
 117 main()
 118 {
 119     for (size_t i = 0; tests[i].html; ++i) {
 120         HtmlParser p;
 121         const char* html_begin = tests[i].html;
 122         size_t html_len = strlen(html_begin);
 123         if (html_len == 0) {
 124             // Wide character test data is signalled by a single leading nul
 125             // and terminated by a double nul.
 126             ++html_begin;
 127             while (html_begin[html_len] || html_begin[html_len + 1]) {
 128                 html_len += 2;
 129             }
 130         }
 131         string_view html(html_begin, html_len);
 132         try {
 133             p.parse(html, "iso-8859-1", false);
 134         } catch (const string &newcharset) {
 135             p.reset();
 136             p.parse(html, newcharset, true);
 137         }
 138         if (!p.indexing_allowed) {
 139             cout << "indexing disallowed by meta tag - skipping\n";
 140             continue;
 141         }
 142         if (tests[i].dump != p.dump) {
 143             cout << "DUMP " << i << ": [" << p.dump << "] != [" << tests[i].dump << "]" << endl;
 144             exit(1);
 145         }
 146         if (tests[i].title != p.title) {
 147             cout << "TITLE " << i << ": [" << p.title << "] != [" << tests[i].title << "]" << endl;
 148             exit(1);
 149         }
 150         if (tests[i].keywords != p.keywords) {
 151             cout << "KEYWORDS " << i << ": [" << p.keywords << "] != [" << tests[i].keywords << "]" << endl;
 152             exit(1);
 153         }
 154         const char *sample = tests[i].sample;
 155         if (sample == NULL) sample = tests[i].dump;
 156         if (sample != p.sample) {
 157             cout << "SAMPLE " << i << ": [" << p.sample << "] != [" << sample << "]" << endl;
 158             exit(1);
 159         }
 160     }
 161 }