/*
 * Snapshot provenance: claws.git, src/html.c
 * 2007-08-27 [colin] 2.10.0cvs179
 * blob e124c6b3d18434f36fc2b4cf1b9adc186b5ed9fa
 */
/*
 * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
 * Copyright (C) 1999-2007 Hiroyuki Yamamoto and the Claws Mail team
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
20 #include <glib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <ctype.h>
25 #include "html.h"
26 #include "codeconv.h"
27 #include "utils.h"
/* Size, in bytes, of the fixed scratch buffers used for input lines and tags. */
#define SC_HTMLBUFSIZE	8192
/* Text rule emitted in place of an <hr> element. */
#define HR_STR		"------------------------------------------------"
32 typedef struct _SC_HTMLSymbol SC_HTMLSymbol;
34 struct _SC_HTMLSymbol
36 gchar *const key;
37 gchar *const val;
40 static SC_HTMLSymbol symbol_list[] = {
41 {"&#34;", "\42"},
42 {"&#38;", "\46"},
43 {"&#39;", "\47"},
44 {"&#60;", "\74"},
45 {"&#62;", "\76"},
46 {"&#146;", "\47"},
47 {"&#153;", "\342\204\242"},
48 {"&#160;", "\40"},
49 {"&#161;", "\302\241"},
50 {"&#162;", "\302\242"},
51 {"&#163;", "\302\243"},
52 {"&#164;", "\302\244"},
53 {"&#165;", "\302\245"},
54 {"&#166;", "\302\246"},
55 {"&#167;", "\302\247"},
56 {"&#168;", "\302\250"},
57 {"&#169;", "\302\251"},
58 {"&#170;", "\302\252"},
59 {"&#171;", "\302\253"},
60 {"&#172;", "\302\254"},
61 {"&#173;", "\302\255"},
62 {"&#174;", "\302\256"},
63 {"&#175;", "\302\257"},
64 {"&#176;", "\302\260"},
65 {"&#177;", "\302\261"},
66 {"&#178;", "\302\262"},
67 {"&#179;", "\302\263"},
68 {"&#180;", "\302\264"},
69 {"&#181;", "\302\265"},
70 {"&#182;", "\302\266"},
71 {"&#183;", "\302\267"},
72 {"&#184;", "\302\270"},
73 {"&#185;", "\302\271"},
74 {"&#186;", "\302\272"},
75 {"&#187;", "\302\273"},
76 {"&#188;", "\302\274"},
77 {"&#189;", "\302\275"},
78 {"&#190;", "\302\276"},
79 {"&#191;", "\302\277"},
80 {"&#192;", "\303\200"},
81 {"&#193;", "\303\201"},
82 {"&#194;", "\303\202"},
83 {"&#195;", "\303\203"},
84 {"&#196;", "\303\204"},
85 {"&#197;", "\303\205"},
86 {"&#198;", "\303\206"},
87 {"&#199;", "\303\207"},
88 {"&#200;", "\303\210"},
89 {"&#201;", "\303\211"},
90 {"&#202;", "\303\212"},
91 {"&#203;", "\303\213"},
92 {"&#204;", "\303\214"},
93 {"&#205;", "\303\215"},
94 {"&#206;", "\303\216"},
95 {"&#207;", "\303\217"},
96 {"&#208;", "\303\220"},
97 {"&#209;", "\303\221"},
98 {"&#210;", "\303\222"},
99 {"&#211;", "\303\223"},
100 {"&#212;", "\303\224"},
101 {"&#213;", "\303\225"},
102 {"&#214;", "\303\226"},
103 {"&#215;", "\303\227"},
104 {"&#216;", "\303\230"},
105 {"&#217;", "\303\231"},
106 {"&#218;", "\303\232"},
107 {"&#219;", "\303\233"},
108 {"&#220;", "\303\234"},
109 {"&#221;", "\303\235"},
110 {"&#222;", "\303\236"},
111 {"&#223;", "\303\237"},
112 {"&#224;", "\303\240"},
113 {"&#225;", "\303\241"},
114 {"&#226;", "\303\242"},
115 {"&#227;", "\303\243"},
116 {"&#228;", "\303\244"},
117 {"&#229;", "\303\245"},
118 {"&#230;", "\303\246"},
119 {"&#231;", "\303\247"},
120 {"&#232;", "\303\250"},
121 {"&#233;", "\303\251"},
122 {"&#234;", "\303\252"},
123 {"&#235;", "\303\253"},
124 {"&#236;", "\303\254"},
125 {"&#237;", "\303\255"},
126 {"&#238;", "\303\256"},
127 {"&#239;", "\303\257"},
128 {"&#240;", "\303\260"},
129 {"&#241;", "\303\261"},
130 {"&#242;", "\303\262"},
131 {"&#243;", "\303\263"},
132 {"&#244;", "\303\264"},
133 {"&#245;", "\303\265"},
134 {"&#246;", "\303\266"},
135 {"&#247;", "\303\267"},
136 {"&#248;", "\303\270"},
137 {"&#249;", "\303\271"},
138 {"&#250;", "\303\272"},
139 {"&#251;", "\303\273"},
140 {"&#252;", "\303\274"},
141 {"&#253;", "\303\275"},
142 {"&#254;", "\303\276"},
143 {"&#255;", "\303\277"},
144 {"&#338;", "\305\222"},
145 {"&#339;", "\305\223"},
146 {"&#352;", "\305\240"},
147 {"&#353;", "\305\241"},
148 {"&#376;", "\305\270"},
149 {"&#710;", "\313\206"},
150 {"&#732;", "\313\234"},
151 {"&#8194;", "\342\200\202"},
152 {"&#8195;", "\342\200\203"},
153 {"&#8201;", "\342\200\211"},
154 {"&#8211;", "\342\200\223"},
155 {"&#8212;", "\342\200\224"},
156 {"&#8216;", "\342\200\230"},
157 {"&#8217;", "\342\200\231"},
158 {"&#8218;", "\342\200\232"},
159 {"&#8220;", "\342\200\234"},
160 {"&#8221;", "\342\200\235"},
161 {"&#8222;", "\342\200\236"},
162 {"&#8224;", "\342\200\240"},
163 {"&#8225;", "\342\200\241"},
164 {"&#8226;", "\342\200\242"},
165 {"&#8230;", "\342\200\246"},
166 {"&#8240;", "\342\200\260"},
167 {"&#8249;", "\342\200\271"},
168 {"&#8250;", "\342\200\272"},
169 {"&#8364;", "\342\202\254"},
170 {"&#8482;", "\342\204\242"},
171 {"&quot;", "\42"},
172 {"&amp;", "\46"},
173 {"&apos;", "\47"},
174 {"&lt;", "\74"},
175 {"&gt;", "\76"},
176 {"&squot;", "\47"},
177 {"&nbsp;", "\40"},
178 {"&iexcl;", "\302\241"},
179 {"&cent;", "\302\242"},
180 {"&pound;", "\302\243"},
181 {"&curren;", "\302\244"},
182 {"&yen;", "\302\245"},
183 {"&brvbar;", "\302\246"},
184 {"&sect;", "\302\247"},
185 {"&uml;", "\302\250"},
186 {"&copy;", "\302\251"},
187 {"&ordf;", "\302\252"},
188 {"&laquo;", "\302\253"},
189 {"&not;", "\302\254"},
190 {"&shy;", "\302\255"},
191 {"&reg;", "\302\256"},
192 {"&macr;", "\302\257"},
193 {"&deg;", "\302\260"},
194 {"&plusmn;", "\302\261"},
195 {"&sup2;", "\302\262"},
196 {"&sup3;", "\302\263"},
197 {"&acute;", "\302\264"},
198 {"&micro;", "\302\265"},
199 {"&para;", "\302\266"},
200 {"&middot;", "\302\267"},
201 {"&cedil;", "\302\270"},
202 {"&sup1;", "\302\271"},
203 {"&ordm;", "\302\272"},
204 {"&raquo;", "\302\273"},
205 {"&frac14;", "\302\274"},
206 {"&frac12;", "\302\275"},
207 {"&frac34;", "\302\276"},
208 {"&iquest;", "\302\277"},
209 {"&Agrave;", "\303\200"},
210 {"&Aacute;", "\303\201"},
211 {"&Acirc;", "\303\202"},
212 {"&Atilde;", "\303\203"},
213 {"&Auml;", "\303\204"},
214 {"&Aring;", "\303\205"},
215 {"&AElig;", "\303\206"},
216 {"&Ccedil;", "\303\207"},
217 {"&Egrave;", "\303\210"},
218 {"&Eacute;", "\303\211"},
219 {"&Ecirc;", "\303\212"},
220 {"&Euml;", "\303\213"},
221 {"&Igrave;", "\303\214"},
222 {"&Iacute;", "\303\215"},
223 {"&Icirc;", "\303\216"},
224 {"&Iuml;", "\303\217"},
225 {"&ETH;", "\303\220"},
226 {"&Ntilde;", "\303\221"},
227 {"&Ograve;", "\303\222"},
228 {"&Oacute;", "\303\223"},
229 {"&Ocirc;", "\303\224"},
230 {"&Otilde;", "\303\225"},
231 {"&Ouml;", "\303\226"},
232 {"&times;", "\303\227"},
233 {"&Oslash;", "\303\230"},
234 {"&Ugrave;", "\303\231"},
235 {"&Uacute;", "\303\232"},
236 {"&Ucirc;", "\303\233"},
237 {"&Uuml;", "\303\234"},
238 {"&Yacute;", "\303\235"},
239 {"&THORN;", "\303\236"},
240 {"&szlig;", "\303\237"},
241 {"&agrave;", "\303\240"},
242 {"&aacute;", "\303\241"},
243 {"&acirc;", "\303\242"},
244 {"&atilde;", "\303\243"},
245 {"&auml;", "\303\244"},
246 {"&aring;", "\303\245"},
247 {"&aelig;", "\303\246"},
248 {"&ccedil;", "\303\247"},
249 {"&egrave;", "\303\250"},
250 {"&eacute;", "\303\251"},
251 {"&ecirc;", "\303\252"},
252 {"&euml;", "\303\253"},
253 {"&igrave;", "\303\254"},
254 {"&iacute;", "\303\255"},
255 {"&icirc;", "\303\256"},
256 {"&iuml;", "\303\257"},
257 {"&eth;", "\303\260"},
258 {"&ntilde;", "\303\261"},
259 {"&ograve;", "\303\262"},
260 {"&oacute;", "\303\263"},
261 {"&ocirc;", "\303\264"},
262 {"&otilde;", "\303\265"},
263 {"&ouml;", "\303\266"},
264 {"&divide;", "\303\267"},
265 {"&oslash;", "\303\270"},
266 {"&ugrave;", "\303\271"},
267 {"&uacute;", "\303\272"},
268 {"&ucirc;", "\303\273"},
269 {"&uuml;", "\303\274"},
270 {"&yacute;", "\303\275"},
271 {"&thorn;", "\303\276"},
272 {"&yuml;", "\303\277"},
273 {"&OElig;", "\305\222"},
274 {"&oelig;", "\305\223"},
275 {"&Scaron;", "\305\240"},
276 {"&scaron;", "\305\241"},
277 {"&Yuml;", "\305\270"},
278 {"&circ;", "\313\206"},
279 {"&tilde;", "\313\234"},
280 {"&ensp;", "\342\200\202"},
281 {"&emsp;", "\342\200\203"},
282 {"&thinsp;", "\342\200\211"},
283 {"&ndash;", "\342\200\223"},
284 {"&mdash;", "\342\200\224"},
285 {"&lsquo;", "\342\200\230"},
286 {"&rsquo;", "\342\200\231"},
287 {"&sbquo;", "\342\200\232"},
288 {"&ldquo;", "\342\200\234"},
289 {"&rdquo;", "\342\200\235"},
290 {"&bdquo;", "\342\200\236"},
291 {"&dagger;", "\342\200\240"},
292 {"&Dagger;", "\342\200\241"},
293 {"&bull;", "\342\200\242"},
294 {"&hellip;", "\342\200\246"},
295 {"&permil;", "\342\200\260"},
296 {"&lsaquo;", "\342\200\271"},
297 {"&rsaquo;", "\342\200\272"},
298 {"&euro;", "\342\202\254"},
299 {"&trade;", "\342\204\242"}
302 typedef struct _SC_HTMLAltSymbol SC_HTMLAltSymbol;
304 struct _SC_HTMLAltSymbol
306 gint key;
307 gchar *const val;
310 static GHashTable *default_symbol_table;
312 static SC_HTMLState sc_html_read_line (SC_HTMLParser *parser);
313 static void sc_html_append_char (SC_HTMLParser *parser,
314 gchar ch);
315 static void sc_html_append_str (SC_HTMLParser *parser,
316 const gchar *str,
317 gint len);
318 static SC_HTMLState sc_html_parse_tag (SC_HTMLParser *parser);
319 static void sc_html_parse_special (SC_HTMLParser *parser);
320 static void sc_html_get_parenthesis (SC_HTMLParser *parser,
321 gchar *buf,
322 gint len);
325 SC_HTMLParser *sc_html_parser_new(FILE *fp, CodeConverter *conv)
327 SC_HTMLParser *parser;
329 g_return_val_if_fail(fp != NULL, NULL);
330 g_return_val_if_fail(conv != NULL, NULL);
332 parser = g_new0(SC_HTMLParser, 1);
333 parser->fp = fp;
334 parser->conv = conv;
335 parser->str = g_string_new(NULL);
336 parser->buf = g_string_new(NULL);
337 parser->bufp = parser->buf->str;
338 parser->state = SC_HTML_NORMAL;
339 parser->href = NULL;
340 parser->newline = TRUE;
341 parser->empty_line = TRUE;
342 parser->space = FALSE;
343 parser->pre = FALSE;
345 #define SYMBOL_TABLE_ADD(table, list) \
347 gint i; \
349 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
350 g_hash_table_insert(table, list[i].key, list[i].val); \
352 #define SYMBOL_TABLE_REF_ADD(table, list) \
354 gint i; \
356 for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
357 g_hash_table_insert(table, &list[i].key, list[i].val); \
360 if (!default_symbol_table) {
361 default_symbol_table =
362 g_hash_table_new(g_str_hash, g_str_equal);
363 SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
366 #undef SYMBOL_TABLE_ADD
367 #undef SYMBOL_TABLE_REF_ADD
369 parser->symbol_table = default_symbol_table;
371 return parser;
374 void sc_html_parser_destroy(SC_HTMLParser *parser)
376 g_string_free(parser->str, TRUE);
377 g_string_free(parser->buf, TRUE);
378 g_free(parser->href);
379 g_free(parser);
382 gchar *sc_html_parse(SC_HTMLParser *parser)
384 parser->state = SC_HTML_NORMAL;
385 g_string_truncate(parser->str, 0);
387 if (*parser->bufp == '\0') {
388 g_string_truncate(parser->buf, 0);
389 parser->bufp = parser->buf->str;
390 if (sc_html_read_line(parser) == SC_HTML_EOF)
391 return NULL;
394 while (*parser->bufp != '\0') {
395 switch (*parser->bufp) {
396 case '<': {
397 SC_HTMLState st;
398 st = sc_html_parse_tag(parser);
399 /* when we see an href, we need to flush the str
400 * buffer. Then collect all the chars until we
401 * see the end anchor tag
403 if (SC_HTML_HREF_BEG == st || SC_HTML_HREF == st)
404 return parser->str->str;
406 break;
407 case '&':
408 sc_html_parse_special(parser);
409 break;
410 case ' ':
411 case '\t':
412 case '\r':
413 case '\n':
414 if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n')
415 parser->bufp++;
417 if (!parser->pre) {
418 if (!parser->newline)
419 parser->space = TRUE;
421 parser->bufp++;
422 break;
424 /* fallthrough */
425 default:
426 sc_html_append_char(parser, *parser->bufp++);
430 return parser->str->str;
433 static SC_HTMLState sc_html_read_line(SC_HTMLParser *parser)
435 gchar buf[SC_HTMLBUFSIZE];
436 gchar buf2[SC_HTMLBUFSIZE];
437 gint index;
439 if (fgets(buf, sizeof(buf), parser->fp) == NULL) {
440 parser->state = SC_HTML_EOF;
441 return SC_HTML_EOF;
444 if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) {
445 index = parser->bufp - parser->buf->str;
447 conv_utf8todisp(buf2, sizeof(buf2), buf);
448 g_string_append(parser->buf, buf2);
450 parser->bufp = parser->buf->str + index;
452 return SC_HTML_CONV_FAILED;
455 index = parser->bufp - parser->buf->str;
457 g_string_append(parser->buf, buf2);
459 parser->bufp = parser->buf->str + index;
461 return SC_HTML_NORMAL;
464 static void sc_html_append_char(SC_HTMLParser *parser, gchar ch)
466 GString *str = parser->str;
468 if (!parser->pre && parser->space) {
469 g_string_append_c(str, ' ');
470 parser->space = FALSE;
473 g_string_append_c(str, ch);
475 parser->empty_line = FALSE;
476 if (ch == '\n') {
477 parser->newline = TRUE;
478 if (str->len > 1 && str->str[str->len - 2] == '\n')
479 parser->empty_line = TRUE;
480 } else
481 parser->newline = FALSE;
484 static void sc_html_append_str(SC_HTMLParser *parser, const gchar *str, gint len)
486 GString *string = parser->str;
488 if (!parser->pre && parser->space) {
489 g_string_append_c(string, ' ');
490 parser->space = FALSE;
493 if (len == 0) return;
494 if (len < 0)
495 g_string_append(string, str);
496 else {
497 gchar *s;
498 Xstrndup_a(s, str, len, return);
499 g_string_append(string, s);
502 parser->empty_line = FALSE;
503 if (string->len > 0 && string->str[string->len - 1] == '\n') {
504 parser->newline = TRUE;
505 if (string->len > 1 && string->str[string->len - 2] == '\n')
506 parser->empty_line = TRUE;
507 } else
508 parser->newline = FALSE;
511 static SC_HTMLTag *sc_html_get_tag(const gchar *str)
513 SC_HTMLTag *tag;
514 gchar *tmp;
515 guchar *tmpp;
517 g_return_val_if_fail(str != NULL, NULL);
519 if (*str == '\0' || *str == '!') return NULL;
521 Xstrdup_a(tmp, str, return NULL);
523 tag = g_new0(SC_HTMLTag, 1);
525 for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++)
528 if (*tmpp == '\0') {
529 g_strdown(tmp);
530 tag->name = g_strdup(tmp);
531 return tag;
532 } else {
533 *tmpp++ = '\0';
534 g_strdown(tmp);
535 tag->name = g_strdup(tmp);
538 while (*tmpp != '\0') {
539 SC_HTMLAttr *attr;
540 gchar *attr_name;
541 gchar *attr_value;
542 gchar *p;
543 gchar quote;
545 while (g_ascii_isspace(*tmpp)) tmpp++;
546 attr_name = tmpp;
548 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
549 *tmpp != '=')
550 tmpp++;
551 if (*tmpp != '\0' && *tmpp != '=') {
552 *tmpp++ = '\0';
553 while (g_ascii_isspace(*tmpp)) tmpp++;
556 if (*tmpp == '=') {
557 *tmpp++ = '\0';
558 while (g_ascii_isspace(*tmpp)) tmpp++;
560 if (*tmpp == '"' || *tmpp == '\'') {
561 /* name="value" */
562 quote = *tmpp;
563 tmpp++;
564 attr_value = tmpp;
565 if ((p = strchr(attr_value, quote)) == NULL) {
566 g_warning("sc_html_get_tag(): syntax error in tag: '%s'\n", str);
567 return tag;
569 tmpp = p;
570 *tmpp++ = '\0';
571 while (g_ascii_isspace(*tmpp)) tmpp++;
572 } else {
573 /* name=value */
574 attr_value = tmpp;
575 while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
576 if (*tmpp != '\0')
577 *tmpp++ = '\0';
579 } else
580 attr_value = "";
582 g_strchomp(attr_name);
583 g_strdown(attr_name);
584 attr = g_new(SC_HTMLAttr, 1);
585 attr->name = g_strdup(attr_name);
586 attr->value = g_strdup(attr_value);
587 tag->attr = g_list_append(tag->attr, attr);
590 return tag;
593 static void sc_html_free_tag(SC_HTMLTag *tag)
595 if (!tag) return;
597 g_free(tag->name);
598 while (tag->attr != NULL) {
599 SC_HTMLAttr *attr = (SC_HTMLAttr *)tag->attr->data;
600 g_free(attr->name);
601 g_free(attr->value);
602 g_free(attr);
603 tag->attr = g_list_remove(tag->attr, tag->attr->data);
605 g_free(tag);
608 static SC_HTMLState sc_html_parse_tag(SC_HTMLParser *parser)
610 gchar buf[SC_HTMLBUFSIZE];
611 SC_HTMLTag *tag;
613 sc_html_get_parenthesis(parser, buf, sizeof(buf));
615 tag = sc_html_get_tag(buf);
617 parser->state = SC_HTML_UNKNOWN;
618 if (!tag) return SC_HTML_UNKNOWN;
620 if (!strcmp(tag->name, "br")) {
621 parser->space = FALSE;
622 sc_html_append_char(parser, '\n');
623 parser->state = SC_HTML_BR;
624 } else if (!strcmp(tag->name, "a")) {
625 GList *cur;
626 for (cur = tag->attr; cur != NULL; cur = cur->next) {
627 if (cur->data && !strcmp(((SC_HTMLAttr *)cur->data)->name, "href")) {
628 g_free(parser->href);
629 parser->href = g_strdup(((SC_HTMLAttr *)cur->data)->value);
630 parser->state = SC_HTML_HREF_BEG;
631 break;
634 } else if (!strcmp(tag->name, "/a")) {
635 parser->state = SC_HTML_HREF;
636 } else if (!strcmp(tag->name, "p")) {
637 parser->space = FALSE;
638 if (!parser->empty_line) {
639 parser->space = FALSE;
640 if (!parser->newline) sc_html_append_char(parser, '\n');
641 sc_html_append_char(parser, '\n');
643 parser->state = SC_HTML_PAR;
644 } else if (!strcmp(tag->name, "pre")) {
645 parser->pre = TRUE;
646 parser->state = SC_HTML_PRE;
647 } else if (!strcmp(tag->name, "/pre")) {
648 parser->pre = FALSE;
649 parser->state = SC_HTML_NORMAL;
650 } else if (!strcmp(tag->name, "hr")) {
651 if (!parser->newline) {
652 parser->space = FALSE;
653 sc_html_append_char(parser, '\n');
655 sc_html_append_str(parser, HR_STR "\n", -1);
656 parser->state = SC_HTML_HR;
657 } else if (!strcmp(tag->name, "div") ||
658 !strcmp(tag->name, "ul") ||
659 !strcmp(tag->name, "li") ||
660 !strcmp(tag->name, "table") ||
661 !strcmp(tag->name, "tr") ||
662 (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) {
663 if (!parser->newline) {
664 parser->space = FALSE;
665 sc_html_append_char(parser, '\n');
667 parser->state = SC_HTML_NORMAL;
668 } else if (!strcmp(tag->name, "/table") ||
669 (tag->name[0] == '/' &&
670 tag->name[1] == 'h' &&
671 g_ascii_isdigit(tag->name[1]))) {
672 if (!parser->empty_line) {
673 parser->space = FALSE;
674 if (!parser->newline) sc_html_append_char(parser, '\n');
675 sc_html_append_char(parser, '\n');
677 parser->state = SC_HTML_NORMAL;
678 } else if (!strcmp(tag->name, "/div") ||
679 !strcmp(tag->name, "/ul") ||
680 !strcmp(tag->name, "/li")) {
681 if (!parser->newline) {
682 parser->space = FALSE;
683 sc_html_append_char(parser, '\n');
685 parser->state = SC_HTML_NORMAL;
688 sc_html_free_tag(tag);
690 return parser->state;
693 static void sc_html_parse_special(SC_HTMLParser *parser)
695 gchar symbol_name[9];
696 gint n;
697 const gchar *val;
699 parser->state = SC_HTML_UNKNOWN;
700 g_return_if_fail(*parser->bufp == '&');
702 /* &foo; */
703 for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
705 if (n > 7 || parser->bufp[n] != ';') {
706 /* output literal `&' */
707 sc_html_append_char(parser, *parser->bufp++);
708 parser->state = SC_HTML_NORMAL;
709 return;
711 strncpy2(symbol_name, parser->bufp, n + 2);
712 parser->bufp += n + 1;
714 if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
715 != NULL) {
716 sc_html_append_str(parser, val, -1);
717 parser->state = SC_HTML_NORMAL;
718 return;
721 sc_html_append_str(parser, symbol_name, -1);
724 static void sc_html_get_parenthesis(SC_HTMLParser *parser, gchar *buf, gint len)
726 gchar *p;
728 buf[0] = '\0';
729 g_return_if_fail(*parser->bufp == '<');
731 /* ignore comment / CSS / script stuff */
732 if (!strncmp(parser->bufp, "<!--", 4)) {
733 parser->bufp += 4;
734 while ((p = strstr(parser->bufp, "-->")) == NULL)
735 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
736 parser->bufp = p + 3;
737 return;
739 if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
740 parser->bufp += 6;
741 while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
742 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
743 parser->bufp = p + 8;
744 return;
746 if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
747 parser->bufp += 7;
748 while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
749 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
750 parser->bufp = p + 9;
751 return;
754 parser->bufp++;
755 while ((p = strchr(parser->bufp, '>')) == NULL)
756 if (sc_html_read_line(parser) == SC_HTML_EOF) return;
758 strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
759 g_strstrip(buf);
760 parser->bufp = p + 1;