xapian-applications/omega/htmlparsetest.cc

   1 /* htmlparsetest.cc: test the MyHtmlParser class
   2  *
   3  * Copyright (C) 2006,2008,2011,2012,2013,2015,2016,2018 Olly Betts
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
  18  * USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include <cstdlib>
  24 #include <iostream>
  25 #include <string>
  26
  27 #include "myhtmlparse.h"
  28
  29 using namespace std;
  30
  31 struct testcase {
  32     const char * html;
  33     const char * dump;
  34     const char * title;
  35     const char * keywords;
  36     const char * sample;
  37 };
  38
  39 static const testcase tests[] = {
  40     { "<body>test<!--htdig_noindex-->icle<!--/htdig_noindex-->s</body>",
  41       "tests", "", "", "" },
  42     { "<body>test<!--htdig_noindex-->ing</body>", "test", "", "", "" },
  43     { "hello<!-- bl>ah --> world", "hello world", "", "", "" },
  44     { "hello<!-- blah > world", "hello world", "", "", "" },
  45     { "<script>\nif (a<b) a = b;</script>test", "test", "", "", "" },
  46     // Regression test for bug first noticed in 1.0.0 (but present earlier).
  47     { "<b>not</b>\n<b>able</b>", "not able", "", "", "" },
  48     // Check that whitespace is handled as intended.
  49     { " <b>not </b>\n<b>\table\t</b>\r\n", "not able", "", "", "" },
  50     { "<html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  51     { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=iso-8859-1\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  52     { "<html><head><meta http-equiv=Content-Type content=\"text/html;charset=utf-8\"><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  53     { "<html><head><meta charset='utf-8'><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  54     { "<html><head><title>\xc2\xae</title><meta charset=\"utf-8\"></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  55     // Check we default to UTF-8 for HTML5.
  56     { "<!DOCTYPE html><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  57     { "<!Doctype\tHTML  ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  58     { "<!Doctype  HTML\t><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  59     { "<!DOCTYPE system 'about:legacy-compat'><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  60     { "<!doctype SyStem \"about:legacy-compat\" ><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  61     // Check we default to UTF-8 for XML.
  62     { "<?xml version=\"1.0\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc2\xa3", "\xc2\xae", "", "" },
  63     // Check we handle specify a charset for XML.
  64     { "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><html><head><title>\xc2\xae</title></head><body>\xc2\xa3</body></html>", "\xc3\x82\xc2\xa3", "\xc3\x82\xc2\xae", "", "" },
  65     { "<!--UdmComment-->test<!--/UdmComment--><div id='body'>test</div>", "test", "", "", "" },
  66     { "Foo<![CDATA[ & bar <literal>\"]]> ok", "Foo & bar <literal>\" ok", "", "", "" },
  67     { "Foo<![CDATA", "Foo", "", "", "" },
  68     { "foo<![CDATA[bar", "foobar", "", "", "" },
  69     // Test that handling of multiple body tags matches modern browser behaviour (ticket#599).
  70     { "a<html>b<head>c<title>bad</title>d</head>e<body>f</body>g<body>h</body>i</html>j<body>k", "abcdefghijk", "bad", "", "" },
  71     { "check<object id='foo'>for<applet foo=\"bar\" />spaces<br> in <p>\tout</p>put\r\n", "check for spaces\rin\rout\rput", "", "", "" },
  72     { "tab:<table><tr><th>col 1</th><th>col 2</th></tr><tr><td>test</td><td><img src='foo.jpg'> <img src='bar.jpg'></td></tr><tr><td colspan=2>hello world</td></tr></table>done", "tab:\rcol 1\tcol 2\rtest\rhello world\rdone", "", "", "" },
  73     // Test entities.
  74     { "<html><body>1 &lt; 2, 3 &gt; 2</body></html>", "1 < 2, 3 > 2", "", "", "" },
  75     { "<html><body>&amp;amp;</body></html>", "&amp;", "", "", "" },
  76     { "<html><body>&lt;Unknown &ent;-ity&gt;</body></html>", "<Unknown &ent;-ity>", "", "", "" },
  77     { "<html><body>&#68;oes &#97; &lt; &auml; &#x3f</body></html>", "Does a < ä ?", "", "", "" },
  78     { "&#65;&#x40;&gt", "A@>", "", "", "" },
  79     { 0, 0, 0, 0, 0 }
  80 };
  81
  82 int
  83 main()
  84 {
  85     for (size_t i = 0; tests[i].html; ++i) {
  86         MyHtmlParser p;
  87         try {
  88             p.parse_html(tests[i].html, "iso-8859-1", false);
  89         } catch (const string &newcharset) {
  90             p.reset();
  91             p.parse_html(tests[i].html, newcharset, true);
  92         }
  93         if (!p.indexing_allowed) {
  94             cout << "indexing disallowed by meta tag - skipping\n";
  95             continue;
  96         }
  97         if (tests[i].dump != p.dump) {
  98             cout << "DUMP " << i << ": [" << p.dump << "] != [" << tests[i].dump << "]" << endl;
  99             exit(1);
 100         }
 101         if (tests[i].title != p.title) {
 102             cout << "TITLE " << i << ": [" << p.title << "] != [" << tests[i].title << "]" << endl;
 103             exit(1);
 104         }
 105         if (tests[i].keywords != p.keywords) {
 106             cout << "KEYWORDS " << i << ": [" << p.keywords << "] != [" << tests[i].keywords << "]" << endl;
 107             exit(1);
 108         }
 109         const char *sample = tests[i].sample;
 110         if (sample == NULL) sample = tests[i].dump;
 111         if (sample != p.sample) {
 112             cout << "SAMPLE " << i << ": [" << p.sample << "] != [" << sample << "]" << endl;
 113             exit(1);
 114         }
 115     }
 116 }