adding all of botlist, initial add
[botlist.git] / botlistprojects / botsocial / lib / erlang / www_tools / html_tokenise.erl
blobb44c3354112c14a3d7b006379658b70d3d263b6b
1 -module(html_tokenise).
3 %IA Joe Armstrong
4 %ID 970314
5 %IK [html,token]
6 %IH Tokenise an html file
7 %IT Tokenises an HTML file:
8 % <p><b>html_tokenise:file2toks(File) -> Toks tokenises a file
9 % <p><b>html_tokenise:bin2toks(Bin) -> Toks tokenises a binary
10 % <p><b>html_tokenise:string2toks(String) -> Toks. tokenises a string
11 % <p><b>html_tokenise:toks2file(Toks, File) -> ok write Toks to File.html
12 % <p><b>html_tokenise:toks2socket(Toks, Socket) -> ok send Toks to TCP/IP socket
15 -export([file2toks/1, disk_cache2toks/1,
16 bin2toks/1,
17 string2toks/1, toks2file/2, toks2socket/2]).
19 -import(lists, [foreach/2, reverse/1]).
21 %% -deftype token() = tagEnd{string()}
22 %% | tagError{int()|string()}
23 %% | tagStart{string()}
24 %% | tagStart{string(), [{string(), string()}]}
25 %% | raw{string()}.
27 %% -exportdeftype([token/0]).
29 %% -type file2toks(string()) -> [token()].
%% Reads the whole of File and tokenises its contents.
%% Returns the token list produced by bin2toks/1.
%% Exits with reason `badfile' when the file cannot be read.
file2toks(File) ->
    case file:read_file(File) of
        {error, _Reason} ->
            exit(badfile);
        {ok, Contents} ->
            bin2toks(Contents)
    end.
%% Fetches the document stored for URL via disk_cache:fetch/1 and
%% tokenises it with bin2toks/1.
%% Exits with reason `badcache' when the fetch returns an error.
disk_cache2toks(URL) ->
    case disk_cache:fetch(URL) of
        {error, _Reason} ->
            exit(badcache);
        {ok, Cached} ->
            bin2toks(Cached)
    end.
47 %% -type bin2toks(bin()) -> [token()].
%% Tokenises a binary: the binary is converted to a list of bytes and
%% handed to string2toks/1.
bin2toks(Bin) ->
    Chars = binary_to_list(Bin),
    string2toks(Chars).
51 %% -type toks2file([token()], string()) -> true.
%% Writes every token in Toks to the file named File ++ ".html" and
%% returns true.
%% Crashes with a badmatch if the file cannot be opened.
%% The handle is closed in an `after' section so that it is released
%% even when dumping one of the tokens raises an exception.
toks2file(Toks, File) ->
    {ok, Out} = file:open(File ++ ".html", [write]),
    OutFun = fun(Text) -> io:put_chars(Out, Text) end,
    try
        lists:foreach(fun(X) -> dump_token(OutFun, X) end, Toks)
    after
        file:close(Out)
    end,
    true.
61 %% -type toks2socket([token()], port()) -> true.
%% Sends the textual form of every token in Toks over Socket using
%% gen_tcp:send/2 and returns true. The result of each send is not
%% inspected.
toks2socket(Toks, Socket) ->
    Send = fun(Text) -> gen_tcp:send(Socket, Text) end,
    DumpOne = fun(Tok) -> dump_token(Send, Tok) end,
    lists:foreach(DumpOne, Toks),
    true.
69 %% -type string2toks(string()) -> [token()].
%% Tokenises a string (list of character codes) by running tokenise/2
%% with an empty token accumulator.
string2toks(Str) ->
    NoTokensYet = [],
    tokenise(Str, NoTokensYet).
73 %% -type tokenise(string(), [token()]) -> [token()].
%% Walks the input and accumulates tokens in reverse order; the
%% accumulator is reversed once the input is exhausted.
%%
%% A `<' starts a tag: collect_names/1 yields the tag name, its
%% attributes and the remaining input. When the remaining input starts
%% with `>' the tag is classified by tag_token/2; otherwise a
%% {tagError, 4} token is emitted. Anything else is gathered into a
%% {raw, Text} token by collect_raw/1.
tokenise([$< | AfterOpen], Acc) ->
    {Name, Args, Remainder} = collect_names(AfterOpen),
    case Remainder of
        [$> | Next] ->
            tokenise(Next, [tag_token(Name, Args) | Acc]);
        _ ->
            tokenise(Remainder, [{tagError, 4} | Acc])
    end;
tokenise([_ | _] = Text, Acc) ->
    {Raw, Remainder} = collect_raw(Text),
    tokenise(Remainder, [{raw, Raw} | Acc]);
tokenise([], Acc) ->
    lists:reverse(Acc).

%% Builds the token for a complete tag from its name and attributes:
%%   "/name" without attributes -> {tagEnd, name}
%%   "/name" with attributes    -> {tagError, name}
%%   name without attributes    -> {tagStart, name}
%%   name with attributes       -> {tagStart, name, Args}
%% The name is lower-cased in every case.
tag_token([$/ | Name], []) -> {tagEnd, to_lower(Name)};
tag_token([$/ | Name], _Args) -> {tagError, to_lower(Name)};
tag_token(Name, []) -> {tagStart, to_lower(Name)};
tag_token(Name, Args) -> {tagStart, to_lower(Name), Args}.
96 %% -type to_lower(string()) -> string().
%% Returns Str with the ASCII letters A-Z replaced by a-z; every other
%% element is passed through untouched.
to_lower(Str) ->
    [lower_char(C) || C <- Str].

%% Lower-cases a single ASCII upper-case letter.
lower_char(C) when $A =< C, C =< $Z -> C - $A + $a;
lower_char(C) -> C.
102 %% collect names is called after we hit <
104 %% -type collect_names(string()) -> {string(), [{string(), string()}], string()}.
%% Called with the input that follows a `<'. Leading white space is
%% skipped, then the tag name and its attributes are collected.
%% Returns {Name, Args, Rest}. Name and Args are both empty when the
%% input (after white space) is empty or starts directly with `>'.
collect_names(Str) ->
    names_from(skip_white(Str)).

%% Does the work of collect_names/1 once leading white space is gone.
names_from([$> | _] = Rest) ->
    {[], [], Rest};
names_from([]) ->
    {[], [], []};
names_from(Rest) ->
    {Tag, AfterTag} = collect_name(Rest),
    {Attrs, AfterAttrs} = collect_args(AfterTag),
    {Tag, Attrs, AfterAttrs}.
119 %% Args = (name = arg)*
121 %% -type collect_args(string()) -> {[{string(), string()}], string()}.
%% Collects the `name = value' attribute pairs of a tag.
%% Returns {Args, Rest} where Args is a list of {Name, Value} pairs
%% (Name lower-cased, Value as written) and Rest is the remaining
%% input, which is either empty or starts with `>'.
%%
%% A name that is not followed by `=' ends the scan: the input is
%% skipped up to the next `>' and no further attributes are returned.
collect_args(Str) ->
    case skip_white(Str) of
        [$> | _] = Rest ->
            {[], Rest};
        [] ->
            {[], []};
        Str1 ->
            {Name, Str2} = collect_name(Str1),
            case skip_white(Str2) of
                [$= | Str4] ->
                    {Val, Str6} = collect_name(skip_white(Str4)),
                    {ArgT, Str7} = collect_args(Str6),
                    {[{to_lower(Name), Val} | ArgT], Str7};
                Str3 ->
                    {[], skip_to($>, Str3)}
            end
    end.
146 %% -type skip_to(char(), string()) -> string().
%% Drops characters from Str until Char is at the front and returns
%% what is left, Char included. When Char does not occur at all the
%% one-element list [Char] is returned.
skip_to(Char, Str) ->
    case lists:dropwhile(fun(C) -> C =/= Char end, Str) of
        [] -> [Char];
        Found -> Found
    end.
152 %% -type skip_white(string()) -> string().
%% Removes leading spaces, newlines and carriage returns from Str.
%% Other characters (tabs included) are left in place.
skip_white([C | Rest]) when C =:= $\s; C =:= $\n; C =:= $\r ->
    skip_white(Rest);
skip_white(Str) ->
    Str.
159 %% -type collect_name(string()) -> {string(), string()}.
160 %% -type collect_name(string(), string()) -> {string(), string()}.
161 %% -type collect_quoted_name(string(), string()) -> {string(), string()}.
%% Collects one name or value from the front of Str and returns
%% {Name, Rest}. Input that starts with a double quote is read as a
%% quoted string by collect_quoted_name/2; anything else is read as a
%% bare name by collect_name/2.
collect_name(Str) ->
    case Str of
        [$" | Quoted] -> collect_quoted_name(Quoted, []);
        _ -> collect_name(Str, [])
    end.
%% Accumulates a bare name in Acc (reversed) until a delimiter or the
%% end of input is reached, and returns {Name, Rest}.
%% A space, newline or carriage return ends the name and is consumed;
%% `>' and `=' end the name but stay at the front of Rest.
collect_name([], Acc) ->
    {lists:reverse(Acc), []};
collect_name([C | Rest], Acc) when C =:= $\s; C =:= $\n; C =:= $\r ->
    {lists:reverse(Acc), Rest};
collect_name([C | _] = Str, Acc) when C =:= $>; C =:= $= ->
    {lists:reverse(Acc), Str};
collect_name([C | Rest], Acc) ->
    collect_name(Rest, [C | Acc]).
%% Reads the body of a double-quoted string (the opening quote has
%% already been removed) and returns {Text, Rest}.
%% A backslash followed by a quote yields a literal quote, a carriage
%% return is stored as a newline, and an unescaped quote ends the
%% string and is consumed. Running out of input also ends the string.
collect_quoted_name([], Acc) ->
    {lists:reverse(Acc), []};
collect_quoted_name([$\\, $" | Rest], Acc) ->
    collect_quoted_name(Rest, [$" | Acc]);
collect_quoted_name([$" | Rest], Acc) ->
    {lists:reverse(Acc), Rest};
collect_quoted_name([$\r | Rest], Acc) ->
    collect_quoted_name(Rest, [$\n | Acc]);
collect_quoted_name([C | Rest], Acc) ->
    collect_quoted_name(Rest, [C | Acc]).
181 %% collect_raw(Str) -> {Raw', Str'}
183 %% -type collect_raw(string()) -> {string(), string()}.
%% Collects raw text from the front of Str by running collect_raw/2
%% with an empty accumulator. Returns {Raw, Rest}.
collect_raw(Str) ->
    Collected = [],
    collect_raw(Str, Collected).
187 %% -type collect_raw(string(), string()) -> {string(), string()}.
%% Accumulates raw text in Acc (reversed) until an unescaped `<' or
%% the end of input, and returns {Raw, Rest}.
%%   - a backslash followed by `<' yields a literal `<';
%%   - a carriage return is stored as a newline;
%%   - `&' starts an entity reference: when translate_amp/1 knows the
%%     name the reference is replaced by its character code, otherwise
%%     the `&' is kept and scanning resumes right after it.
collect_raw([], Acc) ->
    {lists:reverse(Acc), []};
collect_raw([$\\, $< | Rest], Acc) ->
    collect_raw(Rest, [$< | Acc]);
collect_raw([$< | _] = Str, Acc) ->
    {lists:reverse(Acc), Str};
collect_raw([$\r | Rest], Acc) ->
    collect_raw(Rest, [$\n | Acc]);
collect_raw([$& | Rest], Acc) ->
    {AfterEntity, Name} = collect_amp(Rest, []),
    {Next, Char} =
        case translate_amp(Name) of
            error -> {Rest, $&};
            Code -> {AfterEntity, Code}
        end,
    collect_raw(Next, [Char | Acc]);
collect_raw([C | Rest], Acc) ->
    collect_raw(Rest, [C | Acc]).
204 %% -type collect_amp(string(), string()) -> {string(), string()}.
%% Collects the name of an entity reference (the text that follows an
%% `&') and returns {Rest, Name}.
%% The name ends at `;', at white space (space, newline, carriage
%% return) or at the end of input. Only the `;' belongs to the
%% reference and is consumed; white space is ordinary text and is left
%% at the front of Rest so that it is not lost from the output.
collect_amp([$; | T], L) ->
    {T, lists:reverse(L)};
collect_amp([C | _] = Rest, L) when C =:= $\s; C =:= $\n; C =:= $\r ->
    {Rest, lists:reverse(L)};
collect_amp([H | T], L) ->
    collect_amp(T, [H | L]);
collect_amp([], L) ->
    {[], lists:reverse(L)}.
213 %% -type translate_amp(string()) -> int() | error.
%% Translates the name of an entity reference (without the leading `&'
%% and trailing `;') into a character code, or returns `error' when the
%% name is not known.
%% A name starting with `#' is a numeric reference and is decoded by
%% amp_digits/2. Named entities are matched case-sensitively.
translate_amp([$# | Ds]) -> amp_digits(Ds, 0);
translate_amp("lt") -> $<;
translate_amp("gt") -> $>;
translate_amp("amp") -> $&;
translate_amp("quot") -> $";
translate_amp("nbsp") -> 160;
translate_amp("iexcl") -> 161;
translate_amp("cent") -> 162;
translate_amp("pound") -> 163;
translate_amp("curren") -> 164;
translate_amp("yen") -> 165;
translate_amp("brvbar") -> 166;
translate_amp("sect") -> 167;
translate_amp("uml") -> 168;
translate_amp("copy") -> 169;
translate_amp("ordf") -> 170;
translate_amp("laquo") -> 171;
translate_amp("not") -> 172;
translate_amp("shy") -> 173;
translate_amp("reg") -> 174;
translate_amp("macr") -> 175;
translate_amp("deg") -> 176;
translate_amp("plusmn") -> 177;
translate_amp("sup2") -> 178;
translate_amp("sup3") -> 179;
translate_amp("acute") -> 180;
translate_amp("micro") -> 181;
translate_amp("para") -> 182;
translate_amp("middot") -> 183;
translate_amp("cedil") -> 184;
translate_amp("sup1") -> 185;
translate_amp("ordm") -> 186;
translate_amp("raquo") -> 187;
translate_amp("frac14") -> 188;
translate_amp("frac12") -> 189;
translate_amp("frac34") -> 190;
translate_amp("iquest") -> 191;
translate_amp("Agrave") -> 192;
translate_amp("Aacute") -> 193;
translate_amp("Acirc") -> 194;
translate_amp("Atilde") -> 195;
translate_amp("Auml") -> 196;
translate_amp("Aring") -> 197;
translate_amp("AElig") -> 198;
translate_amp("Ccedil") -> 199;
translate_amp("Egrave") -> 200;
translate_amp("Eacute") -> 201;
translate_amp("Ecirc") -> 202;
translate_amp("Euml") -> 203;
translate_amp("Igrave") -> 204;
translate_amp("Iacute") -> 205;
translate_amp("Icirc") -> 206;
translate_amp("Iuml") -> 207;
translate_amp("ETH") -> 208;
translate_amp("Ntilde") -> 209;
translate_amp("Ograve") -> 210;
translate_amp("Oacute") -> 211;
translate_amp("Ocirc") -> 212;
translate_amp("Otilde") -> 213;
translate_amp("Ouml") -> 214;
translate_amp("times") -> 215;
translate_amp("Oslash") -> 216;
translate_amp("Ugrave") -> 217;
translate_amp("Uacute") -> 218;
translate_amp("Ucirc") -> 219;
translate_amp("Uuml") -> 220;
translate_amp("Yacute") -> 221;
translate_amp("THORN") -> 222;
translate_amp("szlig") -> 223;
translate_amp("agrave") -> 224;
translate_amp("aacute") -> 225;
translate_amp("acirc") -> 226;
translate_amp("atilde") -> 227;
translate_amp("auml") -> 228;
translate_amp("aring") -> 229;
translate_amp("aelig") -> 230;
translate_amp("ccedil") -> 231;
translate_amp("egrave") -> 232;
translate_amp("eacute") -> 233;
translate_amp("ecirc") -> 234;
translate_amp("euml") -> 235;
translate_amp("igrave") -> 236;
translate_amp("iacute") -> 237;
translate_amp("icirc") -> 238;
translate_amp("iuml") -> 239;
translate_amp("eth") -> 240;
translate_amp("ntilde") -> 241;
translate_amp("ograve") -> 242;
translate_amp("oacute") -> 243;
translate_amp("ocirc") -> 244;
translate_amp("otilde") -> 245;
translate_amp("ouml") -> 246;
translate_amp("divide") -> 247;
translate_amp("oslash") -> 248;
translate_amp("ugrave") -> 249;
translate_amp("uacute") -> 250;
translate_amp("ucirc") -> 251;
translate_amp("uuml") -> 252;
translate_amp("yacute") -> 253;
translate_amp("thorn") -> 254;
translate_amp("yuml") -> 255;
%% Unknown names are a defined outcome, not a programming error.
translate_amp(_Unknown) -> error.
322 %% -type amp_digits(string(), int()) -> int() | error.
%% Decodes the decimal digits of a numeric character reference.
%% N is the value accumulated so far (callers start with 0).
%% Returns the character code, or `error' when
%%   - the code is in 0..8, 127..159 or above 255, or
%%   - the input contains anything other than decimal digits
%%     (for example the hexadecimal form "x41"), so malformed
%%     references are reported instead of crashing the tokeniser.
amp_digits([X | Xs], N) when X >= $0, X =< $9 ->
    amp_digits(Xs, N * 10 + (X - $0));
amp_digits([], N) ->
    if
        N >= 0, N =< 8 -> error;
        N >= 127, N =< 159 -> error;
        N > 255 -> error;
        true -> N
    end;
amp_digits(_Malformed, _N) ->
    error.
%% Writes the textual form of one token through O and returns whatever
%% the output step returns.
%% O is either a fun of one argument that receives the text, or a pid,
%% in which case the text is written with io:put_chars(O, Text).
%%   {raw, R}                -> R
%%   {tagStart, Tag, Args}   -> <Tag key="val" ...>
%%   {tagStart, Tag}         -> <Tag>
%%   {tagEnd, Tag}           -> </Tag>
%% Any other token is not written through O; it is reported on standard
%% output instead.
dump_token(O, Token) when is_pid(O) ->
    dump_token(fun(Text) -> io:put_chars(O, Text) end, Token);
dump_token(O, {raw, R}) ->
    O(io_lib:fwrite("~s", [R]));
dump_token(O, {tagStart, Tag, Args}) ->
    %% The attribute text is built as a nested list rather than by
    %% repeated `++', which copied the accumulated text for every
    %% attribute.
    Attrs = lists:map(
        fun({Key, Val}) -> io_lib:fwrite(" ~s=\"~s\"", [Key, Val]) end,
        Args
    ),
    O([io_lib:fwrite("<~s", [Tag]), Attrs, ">"]);
dump_token(O, {tagStart, Tag}) ->
    O(io_lib:fwrite("<~s>", [Tag]));
dump_token(O, {tagEnd, Tag}) ->
    O(io_lib:fwrite("</~s>", [Tag]));
dump_token(_O, Other) ->
    io:format("dump_token ????~p~n", [Other]).