/*
 * Claws Mail -- a GTK+ based, lightweight, and fast e-mail client
 * Copyright (C) 1999-2016 Hiroyuki Yamamoto and the Claws Mail team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
/* Size in bytes of the stack buffers used to read input and to hold one tag. */
#define SC_HTMLBUFSIZE	8192
/* Text written to the output in place of an <hr> element. */
#define HR_STR		"────────────────────────────────────────────────"
33 static SC_HTMLState
sc_html_read_line (SC_HTMLParser
*parser
);
34 static void sc_html_append_char (SC_HTMLParser
*parser
,
36 static void sc_html_append_str (SC_HTMLParser
*parser
,
39 static SC_HTMLState
sc_html_parse_tag (SC_HTMLParser
*parser
);
40 static void sc_html_parse_special (SC_HTMLParser
*parser
);
41 static void sc_html_get_parenthesis (SC_HTMLParser
*parser
,
46 SC_HTMLParser
*sc_html_parser_new(FILE *fp
, CodeConverter
*conv
)
48 SC_HTMLParser
*parser
;
50 cm_return_val_if_fail(fp
!= NULL
, NULL
);
51 cm_return_val_if_fail(conv
!= NULL
, NULL
);
53 parser
= g_new0(SC_HTMLParser
, 1);
56 parser
->str
= g_string_new(NULL
);
57 parser
->buf
= g_string_new(NULL
);
58 parser
->bufp
= parser
->buf
->str
;
59 parser
->state
= SC_HTML_NORMAL
;
61 parser
->newline
= TRUE
;
62 parser
->empty_line
= TRUE
;
63 parser
->space
= FALSE
;
70 void sc_html_parser_destroy(SC_HTMLParser
*parser
)
72 g_string_free(parser
->str
, TRUE
);
73 g_string_free(parser
->buf
, TRUE
);
78 gchar
*sc_html_parse(SC_HTMLParser
*parser
)
80 parser
->state
= SC_HTML_NORMAL
;
81 g_string_truncate(parser
->str
, 0);
83 if (*parser
->bufp
== '\0') {
84 g_string_truncate(parser
->buf
, 0);
85 parser
->bufp
= parser
->buf
->str
;
86 if (sc_html_read_line(parser
) == SC_HTML_EOF
)
90 while (*parser
->bufp
!= '\0') {
91 switch (*parser
->bufp
) {
94 st
= sc_html_parse_tag(parser
);
95 /* when we see an href, we need to flush the str
96 * buffer. Then collect all the chars until we
97 * see the end anchor tag
99 if (SC_HTML_HREF_BEG
== st
|| SC_HTML_HREF
== st
)
100 return parser
->str
->str
;
104 sc_html_parse_special(parser
);
110 if (parser
->bufp
[0] == '\r' && parser
->bufp
[1] == '\n')
114 if (!parser
->newline
)
115 parser
->space
= TRUE
;
122 sc_html_append_char(parser
, *parser
->bufp
++);
126 return parser
->str
->str
;
129 static SC_HTMLState
sc_html_read_line(SC_HTMLParser
*parser
)
131 gchar buf
[SC_HTMLBUFSIZE
];
132 gchar buf2
[SC_HTMLBUFSIZE
];
136 if (parser
->fp
== NULL
)
139 n
= fread(buf
, 1, sizeof(buf
) - 1, parser
->fp
);
141 parser
->state
= SC_HTML_EOF
;
146 if (conv_convert(parser
->conv
, buf2
, sizeof(buf2
), buf
) < 0) {
147 index
= parser
->bufp
- parser
->buf
->str
;
149 conv_utf8todisp(buf2
, sizeof(buf2
), buf
);
150 g_string_append(parser
->buf
, buf2
);
152 parser
->bufp
= parser
->buf
->str
+ index
;
154 return SC_HTML_CONV_FAILED
;
157 index
= parser
->bufp
- parser
->buf
->str
;
159 g_string_append(parser
->buf
, buf2
);
161 parser
->bufp
= parser
->buf
->str
+ index
;
163 return SC_HTML_NORMAL
;
166 static void sc_html_append_char(SC_HTMLParser
*parser
, gchar ch
)
168 GString
*str
= parser
->str
;
170 if (!parser
->pre
&& parser
->space
) {
171 g_string_append_c(str
, ' ');
172 parser
->space
= FALSE
;
175 g_string_append_c(str
, ch
);
177 parser
->empty_line
= FALSE
;
179 parser
->newline
= TRUE
;
180 if (str
->len
> 1 && str
->str
[str
->len
- 2] == '\n')
181 parser
->empty_line
= TRUE
;
182 if (parser
->indent
> 0) {
183 gint i
, n
= parser
->indent
;
184 for (i
= 0; i
< n
; i
++)
185 g_string_append_c(str
, '>');
186 g_string_append_c(str
, ' ');
189 parser
->newline
= FALSE
;
192 static void sc_html_append_str(SC_HTMLParser
*parser
, const gchar
*str
, gint len
)
194 GString
*string
= parser
->str
;
196 if (!parser
->pre
&& parser
->space
) {
197 g_string_append_c(string
, ' ');
198 parser
->space
= FALSE
;
201 if (len
== 0) return;
203 g_string_append(string
, str
);
206 Xstrndup_a(s
, str
, len
, return);
207 g_string_append(string
, s
);
210 parser
->empty_line
= FALSE
;
211 if (string
->len
> 0 && string
->str
[string
->len
- 1] == '\n') {
212 parser
->newline
= TRUE
;
213 if (string
->len
> 1 && string
->str
[string
->len
- 2] == '\n')
214 parser
->empty_line
= TRUE
;
216 parser
->newline
= FALSE
;
219 static SC_HTMLTag
*sc_html_get_tag(const gchar
*str
)
225 cm_return_val_if_fail(str
!= NULL
, NULL
);
227 if (*str
== '\0' || *str
== '!') return NULL
;
229 Xstrdup_a(tmp
, str
, return NULL
);
231 tag
= g_new0(SC_HTMLTag
, 1);
233 for (tmpp
= tmp
; *tmpp
!= '\0' && !g_ascii_isspace(*tmpp
); tmpp
++)
237 tag
->name
= g_utf8_strdown(tmp
, -1);
241 tag
->name
= g_utf8_strdown(tmp
, -1);
244 while (*tmpp
!= '\0') {
251 while (g_ascii_isspace(*tmpp
)) tmpp
++;
254 while (*tmpp
!= '\0' && !g_ascii_isspace(*tmpp
) &&
257 if (*tmpp
!= '\0' && *tmpp
!= '=') {
259 while (g_ascii_isspace(*tmpp
)) tmpp
++;
264 while (g_ascii_isspace(*tmpp
)) tmpp
++;
266 if (*tmpp
== '"' || *tmpp
== '\'') {
271 if ((p
= strchr(attr_value
, quote
)) == NULL
) {
272 if (debug_get_mode()) {
273 g_warning("sc_html_get_tag(): syntax error in tag: '%s'",
276 gchar
*cut
= g_strndup(str
, 100);
277 g_warning("sc_html_get_tag(): syntax error in tag: '%s%s'",
278 cut
, strlen(str
)>100?"...":".");
285 while (g_ascii_isspace(*tmpp
)) tmpp
++;
289 while (*tmpp
!= '\0' && !g_ascii_isspace(*tmpp
)) tmpp
++;
296 g_strchomp(attr_name
);
297 attr
= g_new(SC_HTMLAttr
, 1);
298 attr
->name
= g_utf8_strdown(attr_name
, -1);
299 attr
->value
= g_strdup(attr_value
);
300 tag
->attr
= g_list_append(tag
->attr
, attr
);
306 static void sc_html_free_tag(SC_HTMLTag
*tag
)
311 while (tag
->attr
!= NULL
) {
312 SC_HTMLAttr
*attr
= (SC_HTMLAttr
*)tag
->attr
->data
;
316 tag
->attr
= g_list_remove(tag
->attr
, tag
->attr
->data
);
321 static void decode_href(SC_HTMLParser
*parser
)
324 SC_HTMLParser
*tparser
= g_new0(SC_HTMLParser
, 1);
326 tparser
->str
= g_string_new(NULL
);
327 tparser
->buf
= g_string_new(parser
->href
);
328 tparser
->bufp
= tparser
->buf
->str
;
330 tmp
= sc_html_parse(tparser
);
332 g_free(parser
->href
);
333 parser
->href
= g_strdup(tmp
);
335 sc_html_parser_destroy(tparser
);
338 static SC_HTMLState
sc_html_parse_tag(SC_HTMLParser
*parser
)
340 gchar buf
[SC_HTMLBUFSIZE
];
343 sc_html_get_parenthesis(parser
, buf
, sizeof(buf
));
345 tag
= sc_html_get_tag(buf
);
347 parser
->state
= SC_HTML_UNKNOWN
;
348 if (!tag
) return SC_HTML_UNKNOWN
;
350 if (!strcmp(tag
->name
, "br") || !strcmp(tag
->name
, "br/")) {
351 parser
->space
= FALSE
;
352 sc_html_append_char(parser
, '\n');
353 parser
->state
= SC_HTML_BR
;
354 } else if (!strcmp(tag
->name
, "a")) {
357 for (cur
= tag
->attr
; cur
!= NULL
; cur
= cur
->next
) {
358 if (cur
->data
&& !strcmp(((SC_HTMLAttr
*)cur
->data
)->name
, "href")) {
359 g_free(parser
->href
);
360 parser
->href
= g_strdup(((SC_HTMLAttr
*)cur
->data
)->value
);
362 parser
->state
= SC_HTML_HREF_BEG
;
366 if (parser
->href
== NULL
)
367 parser
->href
= g_strdup("");
368 parser
->state
= SC_HTML_HREF_BEG
;
369 } else if (!strcmp(tag
->name
, "/a")) {
370 parser
->state
= SC_HTML_HREF
;
371 } else if (!strcmp(tag
->name
, "p")) {
372 parser
->space
= FALSE
;
373 if (!parser
->empty_line
) {
374 parser
->space
= FALSE
;
375 if (!parser
->newline
) sc_html_append_char(parser
, '\n');
376 sc_html_append_char(parser
, '\n');
378 parser
->state
= SC_HTML_PAR
;
379 } else if (!strcmp(tag
->name
, "pre")) {
381 parser
->state
= SC_HTML_PRE
;
382 } else if (!strcmp(tag
->name
, "/pre")) {
384 parser
->state
= SC_HTML_NORMAL
;
385 } else if (!strcmp(tag
->name
, "hr")) {
386 if (!parser
->newline
) {
387 parser
->space
= FALSE
;
388 sc_html_append_char(parser
, '\n');
390 sc_html_append_str(parser
, HR_STR
, -1);
391 sc_html_append_char(parser
, '\n');
392 parser
->state
= SC_HTML_HR
;
393 } else if (!strcmp(tag
->name
, "div") ||
394 !strcmp(tag
->name
, "ul") ||
395 !strcmp(tag
->name
, "li") ||
396 !strcmp(tag
->name
, "table") ||
397 !strcmp(tag
->name
, "dd") ||
398 !strcmp(tag
->name
, "tr") ||
399 (tag
->name
[0] == 'h' && g_ascii_isdigit(tag
->name
[1]))) {
400 if (!parser
->newline
) {
401 parser
->space
= FALSE
;
402 sc_html_append_char(parser
, '\n');
404 if (!strcmp(tag
->name
, "li")) {
405 sc_html_append_str(parser
, LI_STR
, -1);
407 parser
->state
= SC_HTML_NORMAL
;
408 } else if (!strcmp(tag
->name
, "blockquote")) {
409 parser
->state
= SC_HTML_NORMAL
;
411 } else if (!strcmp(tag
->name
, "/blockquote")) {
412 parser
->state
= SC_HTML_NORMAL
;
414 } else if (!strcmp(tag
->name
, "/table") ||
415 (tag
->name
[0] == '/' &&
416 tag
->name
[1] == 'h' &&
417 g_ascii_isdigit(tag
->name
[1]))) {
418 if (!parser
->empty_line
) {
419 parser
->space
= FALSE
;
420 if (!parser
->newline
) sc_html_append_char(parser
, '\n');
421 sc_html_append_char(parser
, '\n');
423 parser
->state
= SC_HTML_NORMAL
;
424 } else if (!strcmp(tag
->name
, "/div") ||
425 !strcmp(tag
->name
, "/ul") ||
426 !strcmp(tag
->name
, "/li")) {
427 if (!parser
->newline
) {
428 parser
->space
= FALSE
;
429 sc_html_append_char(parser
, '\n');
431 parser
->state
= SC_HTML_NORMAL
;
434 sc_html_free_tag(tag
);
436 return parser
->state
;
439 static void sc_html_parse_special(SC_HTMLParser
*parser
)
443 parser
->state
= SC_HTML_UNKNOWN
;
444 cm_return_if_fail(*parser
->bufp
== '&');
446 entity
= entity_decode(parser
->bufp
);
447 if (entity
!= NULL
) {
448 sc_html_append_str(parser
, entity
, -1);
450 while (*parser
->bufp
++ != ';');
452 /* output literal `&' */
453 sc_html_append_char(parser
, *parser
->bufp
++);
455 parser
->state
= SC_HTML_NORMAL
;
458 static gchar
*sc_html_find_tag(SC_HTMLParser
*parser
, const gchar
*tag
)
460 gchar
*cur
= parser
->bufp
;
461 gint len
= strlen(tag
);
466 while ((cur
= strstr(cur
, "<")) != NULL
) {
467 if (!g_ascii_strncasecmp(cur
, tag
, len
))
474 static void sc_html_get_parenthesis(SC_HTMLParser
*parser
, gchar
*buf
, gint len
)
479 cm_return_if_fail(*parser
->bufp
== '<');
481 /* ignore comment / CSS / script stuff */
482 if (!strncmp(parser
->bufp
, "<!--", 4)) {
484 while ((p
= strstr(parser
->bufp
, "-->")) == NULL
)
485 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
486 parser
->bufp
= p
+ 3;
489 if (!g_ascii_strncasecmp(parser
->bufp
, "<style", 6)) {
491 while ((p
= sc_html_find_tag(parser
, "</style>")) == NULL
)
492 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
493 parser
->bufp
= p
+ 8;
496 if (!g_ascii_strncasecmp(parser
->bufp
, "<script", 7)) {
498 while ((p
= sc_html_find_tag(parser
, "</script>")) == NULL
)
499 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
500 parser
->bufp
= p
+ 9;
505 while ((p
= strchr(parser
->bufp
, '>')) == NULL
)
506 if (sc_html_read_line(parser
) == SC_HTML_EOF
) return;
508 strncpy2(buf
, parser
->bufp
, MIN(p
- parser
->bufp
+ 1, len
));
510 parser
->bufp
= p
+ 1;