1 -module(html_tokenise
).
6 %IH Tokenise an html file
7 %IT Tokenises an HTML file:
8 % <p><b>html_tokenise:file2toks(File) -> Toks tokenises a file
9 % <p><b>html_tokenise:bin2toks(Bin) -> Toks tokenises a binary
10 % <p><b>html_tokenise:string2toks(String) -> Toks. tokenises a string
11 % <p><b>html_tokenise:toks2file(Toks, File) -> ok write Toks to File.html
12 % <p><b>html_tokenise:toks2socket(Toks, Socket) -> ok send Toks to TCP/IP socket
%% Public interface. bin2toks/1 is exported because the module header
%% documents html_tokenise:bin2toks(Bin) as one of the entry points.
-export([file2toks/1, disk_cache2toks/1, bin2toks/1,
         string2toks/1, toks2file/2, toks2socket/2]).
19 -import(lists
, [foreach
/2, reverse
/1]).
21 %% -deftype token() = tagEnd{string()}
22 %% | tagError{int()|string()}
23 %% | tagStart{string()}
24 %% | tagStart{string(), [{string(), string()}]}
27 %% -exportdeftype([token/0]).
29 %% -type file2toks(string()) -> [token()].
32 case file:read_file(File
) of
39 disk_cache2toks(URL
) ->
40 case disk_cache:fetch(URL
) of
%% -type bin2toks(bin()) -> [token()].
%% Tokenise HTML held in a binary: the binary is converted to a list with
%% binary_to_list/1 and the result of string2toks/1 on that list is returned.
bin2toks(Bin) ->
    Chars = binary_to_list(Bin),
    string2toks(Chars).
51 %% -type toks2file([token()], string()) -> true.
53 toks2file(Toks
, File
) ->
54 %% io:format("Dumping tokens to:~p~n", [File]),
55 {ok
, Out
} = file:open(File
++ ".html", write
),
56 OutFun
= fun(Text
) -> io:put_chars(Out
, Text
) end,
57 foreach(fun(X
) -> dump_token(OutFun
, X
) end, Toks
),
61 %% -type toks2socket([token()], port()) -> true.
63 toks2socket(Toks
, Socket
) ->
64 %% io:format("Dumping tokens to:~p~n", [Socket]),
65 OutFun
= fun(Text
) -> gen_tcp:send(Socket
, Text
) end,
66 foreach(fun(X
) -> dump_token(OutFun
, X
) end, Toks
),
%% -type string2toks(string()) -> [token()].
%% Tokenise an HTML string by handing it to tokenise/2 together with an
%% empty token accumulator; whatever tokenise/2 returns is the result.
string2toks(Str) ->
    NoTokensYet = [],
    tokenise(Str, NoTokensYet).
73 %% -type tokenise(string(), [token()]) -> [token()].
75 tokenise([$
<|T
], L
) ->
76 case collect_names(T
) of
77 {[$
/|Rest
], [], [$
>|T1
]} ->
78 tokenise(T1
, [{tagEnd
, to_lower(Rest
)}|L
]);
79 {[$
/|Rest
], _
, [$
>|T1
]} ->
80 tokenise(T1
, [{tagError
, to_lower(Rest
)}|L
]);
82 tokenise(T1
, [{tagStart
, to_lower(Tag
)}|L
]);
83 {Tag
, Args
, [$
>|T1
]} ->
84 tokenise(T1
, [{tagStart
, to_lower(Tag
), Args
}|L
]);
86 tokenise(T1
, [{tagError
, 3}|L
]);
88 tokenise(T1
, [{tagError
, 4}|L
])
91 {Raw
, T1
} = collect_raw([H
|T
]),
92 tokenise(T1
, [{raw
, Raw
}|L
]);
96 %% -type to_lower(string()) -> string().
98 to_lower([H
|T
]) when $A
=< H
, H
=< $Z
-> [H
- $A
+ $a
|to_lower(T
)];
99 to_lower([H
|T
]) -> [H
|to_lower(T
)];
102 %% collect names is called after we hit <
104 %% -type collect_names(string()) -> {string(), [{string(), string()}], string()}.
106 collect_names(Str
) ->
107 Str1
= skip_white(Str
),
114 {Name
, Str2
} = collect_name(Str1
),
115 {Args
, Str3
} = collect_args(Str2
),
119 %% Args = (name = arg)*
121 %% -type collect_args(string()) -> {[{string(), string()}], string()}.
124 %% io:format("here collect args:~s\n", [Str]),
125 Str1
= skip_white(Str
),
132 {Name
, Str2
} = collect_name(Str1
),
133 Str3
= skip_white(Str2
),
136 Str5
= skip_white(Str4
),
137 {Val
, Str6
} = collect_name(Str5
),
138 {ArgT
, Str7
} = collect_args(Str6
),
139 {[{to_lower(Name
),Val
}|ArgT
], Str7
};
141 Str4
= skip_to($
>, Str3
),
%% -type skip_to(char(), string()) -> string().
%% Discard characters from the front of Str until the first one that is
%% exactly equal to Stop, and return the remainder *including* that
%% character. If Stop never occurs, the one-element string [Stop] is
%% returned, so the result always begins with Stop.
skip_to(Stop, Str) ->
    case lists:dropwhile(fun(Ch) -> Ch =/= Stop end, Str) of
        [] -> [Stop];
        FromStop -> FromStop
    end.
%% -type skip_white(string()) -> string().
%% Strip leading whitespace from Str and return what is left.
%% Whitespace here means space, tab, line feed, carriage return and form
%% feed. Tab and form feed are included so that markup such as
%% "<a\thref=...>" is not read with the tab glued onto a name.
%% A string that does not start with whitespace (including "") is
%% returned unchanged.
skip_white([C|T]) when C =:= $\s; C =:= $\t; C =:= $\n; C =:= $\r; C =:= $\f ->
    skip_white(T);
skip_white(Str) ->
    Str.
159 %% -type collect_name(string()) -> {string(), string()}.
160 %% -type collect_name(string(), string()) -> {string(), string()}.
161 %% -type collect_quoted_name(string(), string()) -> {string(), string()}.
%% collect_name(Str) -> {Name, Rest}
%% Read one name or value from the front of Str.
%% A leading double quote switches to collect_quoted_name/2, which reads
%% up to the closing quote; otherwise the unquoted rules of
%% collect_name/2 apply.
collect_name([$"|T]) -> collect_quoted_name(T, []);
collect_name(Str) -> collect_name(Str, []).

%% collect_name(Str, Acc) -> {Name, Rest}
%% Acc holds the characters read so far, most recent first.
%%  * whitespace (space, tab, LF, CR) ends the name and is consumed;
%%    tab is treated like the other whitespace characters so that
%%    "a\thref" is not read as a single name;
%%  * '>' and '=' end the name but are left at the front of Rest so the
%%    caller can see them;
%%  * end of input ends the name with an empty Rest.
collect_name([C|T], L) when C =:= $\s; C =:= $\t; C =:= $\n; C =:= $\r ->
    {lists:reverse(L), T};
collect_name([C|_] = Rest, L) when C =:= $>; C =:= $= ->
    {lists:reverse(L), Rest};
collect_name([H|T], L) ->
    collect_name(T, [H|L]);
collect_name([], L) ->
    {lists:reverse(L), []}.
%% collect_quoted_name(Str, Acc) -> {Value, Rest}
%% Read the body of a double-quoted value; Str starts just after the
%% opening quote and Acc holds the characters read so far, newest first.
%%  * backslash followed by a double quote contributes a literal quote;
%%  * an unescaped double quote ends the value and is consumed;
%%  * a carriage return (13) is stored as a line feed;
%%  * every other character, line feed included, is copied as is;
%%  * running out of input ends the value with an empty Rest.
collect_quoted_name([$\\, $" | Rest], Acc) ->
    collect_quoted_name(Rest, [$" | Acc]);
collect_quoted_name([$" | Rest], Acc) ->
    {lists:reverse(Acc), Rest};
collect_quoted_name([$\r | Rest], Acc) ->
    collect_quoted_name(Rest, [$\n | Acc]);
collect_quoted_name([Ch | Rest], Acc) ->
    collect_quoted_name(Rest, [Ch | Acc]);
collect_quoted_name([], Acc) ->
    {lists:reverse(Acc), []}.
181 %% collect_raw(Str) -> {Raw', Str'}
183 %% -type collect_raw(string()) -> {string(), string()}.
%% Entry point for collecting raw text: starts collect_raw/2 on Str with
%% an empty accumulator and returns its result.
collect_raw(Str) ->
    EmptyAcc = [],
    collect_raw(Str, EmptyAcc).
187 %% -type collect_raw(string(), string()) -> {string(), string()}.
189 collect_raw([$
\\,$
<|T
], L
) -> collect_raw(T
, [$
<|L
]);
190 collect_raw([$
\n|T
], L
) -> collect_raw(T
, [$
\n|L
]);
191 collect_raw([13|T
], L
) -> collect_raw(T
, [$
\n|L
]);
192 collect_raw([$
<|T
], L
) -> {reverse(L
), [$
<|T
]};
193 collect_raw([$
&|T
], L
) ->
194 {NT
, Name
} = collect_amp(T
, []),
195 case translate_amp(Name
) of
196 error
-> collect_raw(T
, [$
&|L
]);
197 Code
-> collect_raw(NT
, [Code
| L
])
199 collect_raw([H
|T
], L
) ->
200 collect_raw(T
, [H
|L
]);
201 collect_raw([], L
) ->
%% -type collect_amp(string(), string()) -> {string(), string()}.
%% collect_amp(Str, Acc) -> {Rest, Name}
%% Read the name of a character reference; Str is the text that follows
%% the '&' and Acc holds the characters read so far, newest first.
%% Note the result order: remaining input first, then the name.
%%  * ';' ends the name and is consumed;
%%  * whitespace (space, LF, CR) ends the name but stays at the front of
%%    Rest. When the caller accepts the entity it resumes from Rest, so
%%    consuming the whitespace would turn "fish &amp chips" into
%%    "fish &chips";
%%  * end of input ends the name with an empty Rest.
collect_amp([$; | T], L) ->
    {T, lists:reverse(L)};
collect_amp([C | _] = Rest, L) when C =:= $\s; C =:= $\n; C =:= $\r ->
    {Rest, lists:reverse(L)};
collect_amp([H | T], L) ->
    collect_amp(T, [H | L]);
collect_amp([], L) ->
    {[], lists:reverse(L)}.
213 %% -type translate_amp(string()) -> int() | error.
215 translate_amp([$#
| Ds
]) ->
217 translate_amp(Name
) ->
322 %% -type amp_digits(string(), int()) -> int() | error.
324 amp_digits([X | Xs], N) when X >= $0, X =< $9 ->
325 amp_digits(Xs, N*10 + (X-$0));
328 N >= 0, N =< 8 -> error;
329 N >= 127, N =< 159 -> error;
335 dump_token(O, Token) when is_pid(O) ->
336 dump_token(fun(Text) -> io:put_chars(O, Text) end, Token);
338 dump_token(O, {raw, R}) ->
339 O(io_lib:fwrite("~s
", [R]));
340 dump_token(O, {tagStart, Tag, Args}) ->
341 O(io_lib:fwrite("<~s
", [Tag]) ++
342 lists:foldl(fun({Key,Val},Acc) ->
343 Acc ++ io_lib:fwrite(" ~s
=\"~s
\"", [Key,Val])
345 io_lib:fwrite(">", []));
346 dump_token(O, {tagStart, Tag}) ->
347 O(io_lib:fwrite("<~s
>", [Tag]));
348 dump_token(O, {tagEnd, Tag}) ->
349 O(io_lib:fwrite("</~s
>", [Tag]));
350 dump_token(O, Other) ->
351 io:format("dump_token ????~p~n
", [Other]).