2 * This file is part of duit.
4 * duit is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation; either version 2.1 of the License, or
7 * (at your option) any later version.
9 * duit is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with duit; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * Extracts the text content of an HTML document.
23 module utils
.HtmlStrip
;
28 public class HtmlStrip
31 private import std
.file
;
32 private import std
.stdio
;
// Option flags consulted by the stripping code. All default to true.

// When set, the stripping code watches for the character pairs "/*" and "*/"
// in the text; on "*/" it overwrites the last character of the output with '+'.
// NOTE(review): presumably this rewrites C-style comment delimiters so they
// cannot terminate an enclosing comment - confirm against the full source.
34 public bit convertComment
= true;
// When set, a tag named "hr" or "HR" is emitted into the output as "\n<hr>\n".
35 public bit markHR
= true;
// When set, tags named "p" or "P" get special handling
// (the action taken is not visible in this excerpt).
36 public bit markP
= true;
// When set, closing tags "/h2", "/H2", "/h3" and "/H3" get special handling
// (the action taken is not visible in this excerpt).
37 public bit markH
= true;
// Tested together with "the output already ends in two newline characters".
// NOTE(review): presumably suppresses further blank lines - confirm.
38 public bit removeEmptyLines
= true;
// Tested together with "current and previous characters are both ' '".
// NOTE(review): presumably collapses runs of spaces - confirm.
39 public bit removeExtraSpaces
= true;
46 char[] strip(char[] htmlText
, bool checkUTF
=true)
55 foreach ( char c
; htmlText
)
66 if ( markHR
&& (mark
== "hr" || mark
== "HR") )
68 stripped
~= "\n<hr>\n";
70 else if ( markP
&& (mark
== "p" || mark
== "P") )
74 else if ( markH
&& (mark
== "/h2" || mark
== "/H2") )
78 else if ( markH
&& (mark
== "/h3" || mark
== "/H3") )
94 debug(amper
) writefln("amper = ",amper
);
97 case "<" : c
= '<'; break;
98 case ">" : c
= '>'; break;
99 case " ": c
= ' '; break;
100 default: c
= '\0'; break;
115 else if ( markupCount
<= 0 )
117 if ( convertComment
&& pc
== '/' && c
== '*' )
121 if ( convertComment
&& pc
== '*' && c
== '/' )
123 stripped
[stripped
.length
-1] = '+';
126 else if ( removeEmptyLines
127 && stripped
.length
> 2
129 && stripped
[stripped
.length
-1] == '\n'
130 && stripped
[stripped
.length
-2] == '\n'
135 else if ( removeEmptyLines
136 && stripped
.length
> 2
138 && stripped
[stripped
.length
-1] == 0x0A
139 && stripped
[stripped
.length
-2] == 0x0A
144 else if ( removeExtraSpaces
&& c
== ' ' && pc
== ' ' )
170 char[] stripFile(char[] filename
)
172 debug(file
)writefln("HtmlStrip.stripFile filename = %s", filename
);
173 char[] text
= cast(char[])std
.file
.read(filename
);
175 //writefln("Original html:\n%s", text);
180 private import std
.utf
;
182 public void cleanUTF(inout char[] str)
184 //printf("before utfClean\n%s\nend before utfClean\n", (str~"\0").ptr);
186 while ( i
< str.length
)
190 std
.utf
.decode(str, i
);
192 catch ( UtfException e
)
200 //writefln("after utfClean\n%s\nend after utfClean", str);
210 private import std
.stdio
;
215 HtmlStrip strip
= new HtmlStrip();
216 char[] stripped
= strip
.stripFile("/home/data/down/GTK/API/glib/glib-The-Main-Event-Loop.html");
218 writefln("Stripped html:\n%s", stripped
);