I've no idea here...
[gtkD.git] / wrap / utils / HtmlStrip.d
blobecc531711b279ede462668fcbf5e405e3d673b7f
/*
 * This file is part of duit.
 *
 * duit is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * duit is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with duit; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
/**
 * Grabs the text of an HTML document: markup is removed and the
 * remaining character data is returned as plain text.
 */

module utils.HtmlStrip;

// Uncomment to enable the corresponding debug(...) trace statements below.
//debug=amper;
//debug=file;
/**
 * Strips the markup from an HTML document and returns the remaining text.
 *
 * The scanner is a simple character-level state machine: everything between
 * '<' and '>' is collected as the tag name/body and discarded, a small set
 * of character entities is decoded, and a few tags are turned into line
 * breaks so the paragraph structure of the document survives.
 * The public flags below select which of these transformations are applied.
 */
public class HtmlStrip
{
	private import std.file;
	private import std.stdio;
	private import std.utf;

	/** Rewrite C comment delimiters found in the text: "/*" becomes "/+*" and "*" "/" becomes "+/". */
	public bit convertComment = true;
	/** Emit "\n<hr>\n" for every hr tag. */
	public bit markHR = true;
	/** Emit a line break for every p tag. */
	public bit markP = true;
	/** Emit a line break after every closing h2 / h3 tag. */
	public bit markH = true;
	/** Never let more than two consecutive line breaks accumulate in the output. */
	public bit removeEmptyLines = true;
	/** Collapse runs of spaces into a single space. */
	public bit removeExtraSpaces = true;

	/**
	 * Upper bound on the length of a pending entity ('&' included).
	 * Anything longer cannot be a character reference and is emitted as text.
	 */
	private const int maxEntityLength = 32;

	public this()
	{
	}

	/**
	 * Removes the markup from htmlText.
	 * Params:
	 *  htmlText = the HTML source
	 *  checkUTF = when true the result is passed through cleanUTF() before
	 *             it is returned
	 * Returns: the text content of the document
	 */
	char[] strip(char[] htmlText, bool checkUTF=true)
	{
		int markupCount = 0;	// nesting depth of '<' ... '>'
		char[] stripped;	// the text collected so far
		char pc = ' ';		// previously processed character
		char[] mark;		// body of the tag currently being read
		bool inAmper = false;	// true while a "&...;" entity is being read
		char[] amper;		// the pending entity, starting with '&'
		char amperPc = ' ';	// value of pc just before the pending '&'

		foreach ( char ch ; htmlText )
		{
			char c = ch;
			// set when c is the result of a decoded entity: such a character
			// is plain data and must not be seen as '<', '>' or '&' markup
			bool literal = false;

			if ( inAmper )
			{
				if ( c == ';' )
				{
					debug(amper) writefln("amper = ",amper);
					c = decodeEntity(amper);
					inAmper = false;
					amper.length = 0;
					literal = true;
				}
				else if ( isEntityChar(c) && amper.length <= maxEntityLength )
				{
					amper ~= c;
					pc = '\0';
					continue;
				}
				else
				{
					// Not a character reference after all (e.g. "a & b" or
					// "AT&T"): emit what was collected as ordinary text and
					// process the current character normally.
					pc = flushPending(amper, amperPc, markupCount, stripped, mark);
					inAmper = false;
					amper.length = 0;
				}
			}

			if ( literal )
			{
				appendChar(c, pc, markupCount, stripped, mark);
			}
			else switch ( c )
			{
				case '<':
					++markupCount;
					mark.length = 0;
					break;

				case '>':
					if ( markupCount <= 0 )
					{
						// a '>' that closes nothing is text; counting it would
						// leave the depth negative and leak the next tag's body
						appendChar(c, pc, markupCount, stripped, mark);
						break;
					}
					--markupCount;
					if ( markHR && (mark == "hr" || mark == "HR") )
					{
						stripped ~= "\n<hr>\n";
					}
					else if ( markP && (mark == "p" || mark == "P") )
					{
						stripped ~= "\n";
					}
					else if ( markH
						&& (mark == "/h2" || mark == "/H2"
							|| mark == "/h3" || mark == "/H3") )
					{
						stripped ~= "\n";
					}
					break;

				case '&':
					inAmper = true;
					amperPc = pc;
					amper = "&";
					break;

				default:
					appendChar(c, pc, markupCount, stripped, mark);
					break;
			}

			pc = c;
		}

		if ( inAmper )
		{
			// the input ended in the middle of a would-be entity: keep the text
			flushPending(amper, amperPc, markupCount, stripped, mark);
		}

		if ( checkUTF )
		{
			cleanUTF(stripped);
		}

		return stripped;
	}

	/**
	 * Reads the file and strips its contents.
	 * Params:
	 *  filename = path of the HTML file
	 * Returns: the text content of the file, see strip()
	 */
	char[] stripFile(char[] filename)
	{
		debug(file)writefln("HtmlStrip.stripFile filename = %s", filename);
		char[] text = cast(char[])std.file.read(filename);

		//writefln("Original html:\n%s", text);

		return strip(text);
	}

	/**
	 * Replaces, in place, every position at which std.utf.decode reports
	 * an invalid sequence with a space.
	 * Params:
	 *  str = the text to clean; modified in place
	 */
	public void cleanUTF(inout char[] str)
	{
		//printf("before utfClean\n%s\nend before utfClean\n", (str~"\0").ptr);
		size_t i = 0;
		while ( i < str.length )
		{
			try
			{
				// advances i past one complete code point
				std.utf.decode(str, i);
			}
			catch ( UtfException e )
			{
				// NOTE(review): relies on decode leaving i untouched when it
				// throws, so that i still addresses the offending byte - confirm
				str[i] = ' ';
				++i;
			}
		}
		//writefln("after utfClean\n%s\nend after utfClean", str);
	}

	/**
	 * Tells whether c may appear in an entity between '&' and ';'
	 * (letters, digits and '#').
	 */
	private bool isEntityChar(char c)
	{
		return (c >= 'a' && c <= 'z')
			|| (c >= 'A' && c <= 'Z')
			|| (c >= '0' && c <= '9')
			|| c == '#';
	}

	/**
	 * Maps a collected entity (leading '&' included, trailing ';' excluded)
	 * to the character it stands for.
	 * Returns: the character, or '\0' for an entity that is not supported
	 *          (the caller drops those)
	 */
	private char decodeEntity(char[] entity)
	{
		switch ( entity )
		{
			case "&lt" : return '<';
			case "&gt" : return '>';
			case "&nbsp": return ' ';
			case "&amp": return '&';
			case "&quot": return '"';
			case "&apos": return '\'';
			default: return '\0';
		}
	}

	/**
	 * Emits a pending, abandoned entity as ordinary characters.
	 * Params:
	 *  pending = the collected characters, starting with '&'
	 *  before = the character processed just before the '&'
	 * Returns: the last character emitted, i.e. the new "previous character"
	 */
	private char flushPending(char[] pending, char before, int markupCount,
		inout char[] stripped, inout char[] mark)
	{
		char p = before;
		foreach ( char a ; pending )
		{
			appendChar(a, p, markupCount, stripped, mark);
			p = a;
		}
		return p;
	}

	/**
	 * Routes one data character: inside a tag it is added to the tag body,
	 * outside it is added to the output subject to the public flags.
	 * Params:
	 *  c = the character; '\0' is ignored
	 *  pc = the previously processed character
	 *  markupCount = current tag nesting depth
	 *  stripped = the output text
	 *  mark = the body of the tag being read
	 */
	private void appendChar(char c, char pc, int markupCount,
		inout char[] stripped, inout char[] mark)
	{
		if ( c == '\0' )
		{
			return;
		}
		if ( markupCount > 0 )
		{
			mark ~= c;
			return;
		}

		// the opening delimiter gets a '+' inserted before its '*' ...
		if ( convertComment && pc == '/' && c == '*' )
		{
			stripped ~= '+';
		}

		// ... while the '*' of the closing delimiter is replaced by '+'
		if ( convertComment && pc == '*' && c == '/' && stripped.length > 0 )
		{
			stripped[stripped.length-1] = '+';
			stripped ~= c;
		}
		else if ( removeEmptyLines
			&& stripped.length > 2
			&& c == '\n'
			&& stripped[stripped.length-1] == '\n'
			&& stripped[stripped.length-2] == '\n' )
		{
			// ignore this EOL
		}
		else if ( removeExtraSpaces && c == ' ' && pc == ' ' )
		{
			// skip this space
		}
		else
		{
			stripped ~= c;
		}
	}
}
version (standAlone)
{
	private import std.stdio;

	/**
	 * Stand-alone driver: strips one HTML file and prints the result.
	 * Params:
	 *  args = command line; args[1], when present, is the file to strip.
	 *         Without it the original built-in sample path is used.
	 * Returns: 0
	 */
	int main(char[][] args)
	{
		char[] filename = "/home/data/down/GTK/API/glib/glib-The-Main-Event-Loop.html";
		if ( args.length > 1 )
		{
			filename = args[1];
		}

		HtmlStrip strip = new HtmlStrip();
		char[] stripped = strip.stripFile(filename);

		writefln("Stripped html:\n%s", stripped);

		return 0;
	}
}