/*
 * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
 * Copyright (C) 1999-2016 Hiroyuki Yamamoto and the Claws Mail team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
21 #include "claws-features.h"
33 #include "file-utils.h"
35 #define SC_HTMLBUFSIZE 8192
36 #define HR_STR "────────────────────────────────────────────────"
39 static SC_HTMLState
sc_html_read_line (SC_HTMLParser
*parser
);
40 static void sc_html_append_char (SC_HTMLParser
*parser
,
42 static void sc_html_append_str (SC_HTMLParser
*parser
,
45 static SC_HTMLState
sc_html_parse_tag (SC_HTMLParser
*parser
);
46 static void sc_html_parse_special (SC_HTMLParser
*parser
);
47 static void sc_html_get_parenthesis (SC_HTMLParser
*parser
,
52 SC_HTMLParser
*sc_html_parser_new(FILE *fp
, CodeConverter
*conv
)
54 SC_HTMLParser
*parser
;
56 cm_return_val_if_fail(fp
!= NULL
, NULL
);
57 cm_return_val_if_fail(conv
!= NULL
, NULL
);
59 parser
= g_new0(SC_HTMLParser
, 1);
62 parser
->str
= g_string_new(NULL
);
63 parser
->buf
= g_string_new(NULL
);
64 parser
->bufp
= parser
->buf
->str
;
65 parser
->state
= SC_HTML_NORMAL
;
67 parser
->newline
= TRUE
;
68 parser
->empty_line
= TRUE
;
69 parser
->space
= FALSE
;
76 void sc_html_parser_destroy(SC_HTMLParser
*parser
)
78 g_string_free(parser
->str
, TRUE
);
79 g_string_free(parser
->buf
, TRUE
);
84 gchar
*sc_html_parse(SC_HTMLParser
*parser
)
86 parser
->state
= SC_HTML_NORMAL
;
87 g_string_truncate(parser
->str
, 0);
89 if (*parser
->bufp
== '\0') {
90 g_string_truncate(parser
->buf
, 0);
91 parser
->bufp
= parser
->buf
->str
;
92 if (sc_html_read_line(parser
) == SC_HTML_EOF
)
96 while (*parser
->bufp
!= '\0') {
97 switch (*parser
->bufp
) {
100 st
= sc_html_parse_tag(parser
);
101 /* when we see an href, we need to flush the str
102 * buffer. Then collect all the chars until we
103 * see the end anchor tag
105 if (SC_HTML_HREF_BEG
== st
|| SC_HTML_HREF
== st
)
106 return parser
->str
->str
;
110 sc_html_parse_special(parser
);
116 if (parser
->bufp
[0] == '\r' && parser
->bufp
[1] == '\n')
120 if (!parser
->newline
)
121 parser
->space
= TRUE
;
128 sc_html_append_char(parser
, *parser
->bufp
++);
132 return parser
->str
->str
;
135 static SC_HTMLState
sc_html_read_line(SC_HTMLParser
*parser
)
137 gchar buf
[SC_HTMLBUFSIZE
];
138 gchar buf2
[SC_HTMLBUFSIZE
*4];
142 if (parser
->fp
== NULL
)
145 n
= claws_fread(buf
, 1, sizeof(buf
) - 1, parser
->fp
);
147 parser
->state
= SC_HTML_EOF
;
152 if (conv_convert(parser
->conv
, buf2
, sizeof(buf2
), buf
) < 0) {
153 index
= parser
->bufp
- parser
->buf
->str
;
155 conv_utf8todisp(buf2
, sizeof(buf2
), buf
);
156 g_string_append(parser
->buf
, buf2
);
158 parser
->bufp
= parser
->buf
->str
+ index
;
160 return SC_HTML_CONV_FAILED
;
163 index
= parser
->bufp
- parser
->buf
->str
;
165 g_string_append(parser
->buf
, buf2
);
167 parser
->bufp
= parser
->buf
->str
+ index
;
169 return SC_HTML_NORMAL
;
172 static void sc_html_append_char(SC_HTMLParser
*parser
, gchar ch
)
174 GString
*str
= parser
->str
;
176 if (!parser
->pre
&& parser
->space
) {
177 g_string_append_c(str
, ' ');
178 parser
->space
= FALSE
;
181 g_string_append_c(str
, ch
);
183 parser
->empty_line
= FALSE
;
185 parser
->newline
= TRUE
;
186 if (str
->len
> 1 && str
->str
[str
->len
- 2] == '\n')
187 parser
->empty_line
= TRUE
;
188 if (parser
->indent
> 0) {
189 gint i
, n
= parser
->indent
;
190 for (i
= 0; i
< n
; i
++)
191 g_string_append_c(str
, '>');
192 g_string_append_c(str
, ' ');
195 parser
->newline
= FALSE
;
198 static void sc_html_append_str(SC_HTMLParser
*parser
, const gchar
*str
, gint len
)
200 GString
*string
= parser
->str
;
202 if (!parser
->pre
&& parser
->space
) {
203 g_string_append_c(string
, ' ');
204 parser
->space
= FALSE
;
207 if (len
== 0) return;
209 g_string_append(string
, str
);
212 Xstrndup_a(s
, str
, len
, return);
213 g_string_append(string
, s
);
216 parser
->empty_line
= FALSE
;
217 if (string
->len
> 0 && string
->str
[string
->len
- 1] == '\n') {
218 parser
->newline
= TRUE
;
219 if (string
->len
> 1 && string
->str
[string
->len
- 2] == '\n')
220 parser
->empty_line
= TRUE
;
222 parser
->newline
= FALSE
;
225 static SC_HTMLTag
*sc_html_get_tag(const gchar
*str
)
231 cm_return_val_if_fail(str
!= NULL
, NULL
);
233 if (*str
== '\0' || *str
== '!') return NULL
;
235 Xstrdup_a(tmp
, str
, return NULL
);
237 tag
= g_new0(SC_HTMLTag
, 1);
239 for (tmpp
= tmp
; *tmpp
!= '\0' && !g_ascii_isspace(*tmpp
); tmpp
++)
243 tag
->name
= g_utf8_strdown(tmp
, -1);
247 tag
->name
= g_utf8_strdown(tmp
, -1);
250 while (*tmpp
!= '\0') {
257 while (g_ascii_isspace(*tmpp
)) tmpp
++;
260 while (*tmpp
!= '\0' && !g_ascii_isspace(*tmpp
) &&
263 if (*tmpp
!= '\0' && *tmpp
!= '=') {
265 while (g_ascii_isspace(*tmpp
)) tmpp
++;
270 while (g_ascii_isspace(*tmpp
)) tmpp
++;
272 if (*tmpp
== '"' || *tmpp
== '\'') {
277 if ((p
= strchr(attr_value
, quote
)) == NULL
) {
278 if (debug_get_mode()) {
279 g_warning("sc_html_get_tag(): syntax error in tag: '%s'",
282 gchar
*cut
= g_strndup(str
, 100);
283 g_warning("sc_html_get_tag(): syntax error in tag: '%s%s'",
284 cut
, strlen(str
)>100?"...":".");
291 while (g_ascii_isspace(*tmpp
)) tmpp
++;
295 while (*tmpp
!= '\0' && !g_ascii_isspace(*tmpp
)) tmpp
++;
302 g_strchomp(attr_name
);
303 attr
= g_new(SC_HTMLAttr
, 1);
304 attr
->name
= g_utf8_strdown(attr_name
, -1);
305 attr
->value
= g_strdup(attr_value
);
306 tag
->attr
= g_list_append(tag
->attr
, attr
);
312 static void sc_html_free_tag(SC_HTMLTag
*tag
)
317 while (tag
->attr
!= NULL
) {
318 SC_HTMLAttr
*attr
= (SC_HTMLAttr
*)tag
->attr
->data
;
322 tag
->attr
= g_list_remove(tag
->attr
, tag
->attr
->data
);
327 static void decode_href(SC_HTMLParser
*parser
)
330 SC_HTMLParser
*tparser
= g_new0(SC_HTMLParser
, 1);
332 tparser
->str
= g_string_new(NULL
);
333 tparser
->buf
= g_string_new(parser
->href
);
334 tparser
->bufp
= tparser
->buf
->str
;
336 tmp
= sc_html_parse(tparser
);
338 g_free(parser
->href
);
339 parser
->href
= g_strdup(tmp
);
341 sc_html_parser_destroy(tparser
);
344 static SC_HTMLState
sc_html_parse_tag(SC_HTMLParser
*parser
)
346 gchar buf
[SC_HTMLBUFSIZE
];
349 sc_html_get_parenthesis(parser
, buf
, sizeof(buf
));
351 tag
= sc_html_get_tag(buf
);
353 parser
->state
= SC_HTML_UNKNOWN
;
354 if (!tag
) return SC_HTML_UNKNOWN
;
356 if (!strcmp(tag
->name
, "br") || !strcmp(tag
->name
, "br/")) {
357 parser
->space
= FALSE
;
358 sc_html_append_char(parser
, '\n');
359 parser
->state
= SC_HTML_BR
;
360 } else if (!strcmp(tag
->name
, "a")) {
362 if (parser
->href
!= NULL
) {
363 g_free(parser
->href
);
366 for (cur
= tag
->attr
; cur
!= NULL
; cur
= cur
->next
) {
367 if (cur
->data
&& !strcmp(((SC_HTMLAttr
*)cur
->data
)->name
, "href")) {
368 g_free(parser
->href
);
369 parser
->href
= g_strdup(((SC_HTMLAttr
*)cur
->data
)->value
);
371 parser
->state
= SC_HTML_HREF_BEG
;
375 if (parser
->href
== NULL
)
376 parser
->href
= g_strdup("");
377 parser
->state
= SC_HTML_HREF_BEG
;
378 } else if (!strcmp(tag
->name
, "/a")) {
379 parser
->state
= SC_HTML_HREF
;
380 } else if (!strcmp(tag
->name
, "p")) {
381 parser
->space
= FALSE
;
382 if (!parser
->empty_line
) {
383 parser
->space
= FALSE
;
384 if (!parser
->newline
) sc_html_append_char(parser
, '\n');
385 sc_html_append_char(parser
, '\n');
387 parser
->state
= SC_HTML_PAR
;
388 } else if (!strcmp(tag
->name
, "pre")) {
390 parser
->state
= SC_HTML_PRE
;
391 } else if (!strcmp(tag
->name
, "/pre")) {
393 parser
->state
= SC_HTML_NORMAL
;
394 } else if (!strcmp(tag
->name
, "hr")) {
395 if (!parser
->newline
) {
396 parser
->space
= FALSE
;
397 sc_html_append_char(parser
, '\n');
399 sc_html_append_str(parser
, HR_STR
, -1);
400 sc_html_append_char(parser
, '\n');
401 parser
->state
= SC_HTML_HR
;
402 } else if (!strcmp(tag
->name
, "div") ||
403 !strcmp(tag
->name
, "ul") ||
404 !strcmp(tag
->name
, "li") ||
405 !strcmp(tag
->name
, "table") ||
406 !strcmp(tag
->name
, "dd") ||
407 !strcmp(tag
->name
, "tr")) {
408 if (!parser
->newline
) {
409 parser
->space
= FALSE
;
410 sc_html_append_char(parser
, '\n');
412 if (!strcmp(tag
->name
, "li")) {
413 sc_html_append_str(parser
, LI_STR
, -1);
415 parser
->state
= SC_HTML_NORMAL
;
416 } else if (tag
->name
[0] == 'h' && g_ascii_isdigit(tag
->name
[1])) {
417 if (!parser
->newline
) {
418 parser
->space
= FALSE
;
419 sc_html_append_char(parser
, '\n');
421 sc_html_append_char(parser
, '\n');
422 } else if (!strcmp(tag
->name
, "blockquote")) {
423 parser
->state
= SC_HTML_NORMAL
;
425 } else if (!strcmp(tag
->name
, "/blockquote")) {
426 parser
->state
= SC_HTML_NORMAL
;
428 } else if (!strcmp(tag
->name
, "/table") ||
429 (tag
->name
[0] == '/' &&
430 tag
->name
[1] == 'h' &&
431 g_ascii_isdigit(tag
->name
[2]))) {
432 if (!parser
->empty_line
) {
433 parser
->space
= FALSE
;
434 if (!parser
->newline
) sc_html_append_char(parser
, '\n');
435 sc_html_append_char(parser
, '\n');
437 parser
->state
= SC_HTML_NORMAL
;
438 } else if (!strcmp(tag
->name
, "/div") ||
439 !strcmp(tag
->name
, "/ul") ||
440 !strcmp(tag
->name
, "/li")) {
441 if (!parser
->newline
) {
442 parser
->space
= FALSE
;
443 sc_html_append_char(parser
, '\n');
445 parser
->state
= SC_HTML_NORMAL
;
448 sc_html_free_tag(tag
);
450 return parser
->state
;
453 static void sc_html_parse_special(SC_HTMLParser
*parser
)
457 parser
->state
= SC_HTML_UNKNOWN
;
458 cm_return_if_fail(*parser
->bufp
== '&');
460 entity
= entity_decode(parser
->bufp
);
461 if (entity
!= NULL
) {
462 sc_html_append_str(parser
, entity
, -1);
464 while (*parser
->bufp
++ != ';');
466 /* output literal `&' */
467 sc_html_append_char(parser
, *parser
->bufp
++);
469 parser
->state
= SC_HTML_NORMAL
;
472 static gchar
*sc_html_find_tag(SC_HTMLParser
*parser
, const gchar
*tag
)
474 gchar
*cur
= parser
->bufp
;
475 gint len
= strlen(tag
);
480 while ((cur
= strstr(cur
, "<")) != NULL
) {
481 if (!g_ascii_strncasecmp(cur
, tag
, len
))
488 static void sc_html_get_parenthesis(SC_HTMLParser
*parser
, gchar
*buf
, gint len
)
493 cm_return_if_fail(*parser
->bufp
== '<');
495 /* ignore comment / CSS / script stuff */
496 if (!strncmp(parser
->bufp
, "<!--", 4)) {
498 while ((p
= strstr(parser
->bufp
, "-->")) == NULL
)
499 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
500 parser
->bufp
= p
+ 3;
503 if (!g_ascii_strncasecmp(parser
->bufp
, "<style", 6)) {
505 while ((p
= sc_html_find_tag(parser
, "</style>")) == NULL
)
506 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
507 parser
->bufp
= p
+ 8;
510 if (!g_ascii_strncasecmp(parser
->bufp
, "<script", 7)) {
512 while ((p
= sc_html_find_tag(parser
, "</script>")) == NULL
)
513 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
514 parser
->bufp
= p
+ 9;
519 while ((p
= strchr(parser
->bufp
, '>')) == NULL
)
520 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
522 strncpy2(buf
, parser
->bufp
, MIN(p
- parser
->bufp
+ 1, len
));
524 parser
->bufp
= p
+ 1;