Update Spanish translation
[gnumeric.git] / plugins / html / html_read.c
blobc4dd9003973db45d11875c96be09a288bd60261e
/*
 * html_read.c
 *
 * Copyright (C) 1999, 2000 Rasca, Berlin
 * EMail: thron@gmx.de
 * Copyright (c) 2001 Andreas J. Guelzow
 * EMail: aguelzow@taliesin.ca
 * Copyright (c) 2002 Jody Goldberg
 * EMail: jody@gnome.org
 *
 * Contributors :
 *   Almer S. Tigelaar <almer1@dds.nl>
 *   Andreas J. Guelzow <aguelzow@taliesin.ca>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses/>.
 */
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
31 #include <gnumeric.h>
32 #include <string.h>
33 #include "html.h"
35 #include <sheet-object-cell-comment.h>
36 #include <workbook-view.h>
37 #include <workbook.h>
38 #include <sheet.h>
39 #include <sheet-merge.h>
40 #include <sheet-style.h>
41 #include <style.h>
42 #include <style-color.h>
43 #include <hlink.h>
44 #include <cell.h>
45 #include <ranges.h>
46 #include <goffice/goffice.h>
48 #include <gsf/gsf-input.h>
49 #include <libxml/HTMLparser.h>
50 #include <libxml/HTMLtree.h>
/*
 * Cast helpers between plain C strings and libxml2's xmlChar strings.
 * The "C" prefix/suffix variants keep the const qualifier.
 */
#define CC2XML(s)	((xmlChar const *)(s))
#define C2XML(s)	((xmlChar *)(s))
#define CXML2C(s)	((char const *)(s))
#define XML2C(s)	((char *)(s))
57 typedef struct {
58 Sheet *sheet;
59 int row;
60 WorkbookView *wb_view;
61 } GnmHtmlTableCtxt;
63 static void html_read_table (htmlNodePtr cur, htmlDocPtr doc,
64 WorkbookView *wb_view,
65 GnmHtmlTableCtxt *tc);
68 static Sheet *
69 html_get_sheet (char const *name, Workbook *wb)
71 Sheet *sheet = NULL;
73 if (name) {
74 sheet = workbook_sheet_by_name (wb, name);
75 if (sheet == NULL) {
76 sheet = sheet_new (wb, name, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
77 workbook_sheet_attach (wb, sheet);
79 } else
80 sheet = workbook_sheet_add (wb, -1, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
81 return sheet;
84 static void
85 html_append_text (GString *buf, const xmlChar *text)
87 const xmlChar *p;
89 while (*text) {
90 while (g_unichar_isspace (g_utf8_get_char (text)))
91 text = g_utf8_next_char (text);
92 if (*text) {
93 for (p = text;
94 *p && !g_unichar_isspace (g_utf8_get_char (p));
95 p = g_utf8_next_char (p))
97 if (buf->len > 0)
98 g_string_append_c (buf, ' ');
99 g_string_append_len (buf, text, p - text);
100 text = p;
105 static void
106 html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
107 xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
108 htmlDocPtr doc, GnmHtmlTableCtxt *tc)
110 htmlNodePtr ptr;
112 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
113 if (ptr->type == XML_TEXT_NODE) {
114 if (g_utf8_validate (ptr->content, -1, NULL))
115 html_append_text (buf, ptr->content);
116 else
117 g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
118 } else if (ptr->type == XML_ELEMENT_NODE) {
119 if (first) {
120 if (xmlStrEqual (ptr->name, CC2XML ("i"))
121 || xmlStrEqual (ptr->name, CC2XML ("em")))
122 gnm_style_set_font_italic (mstyle, TRUE);
123 if (xmlStrEqual (ptr->name, CC2XML ("b")))
124 gnm_style_set_font_bold (mstyle, TRUE);
126 if (xmlStrEqual (ptr->name, CC2XML ("a"))) {
127 xmlAttrPtr props;
128 props = ptr->properties;
129 while (props) {
130 if (xmlStrEqual (props->name, CC2XML ("href")) && props->children) {
131 *hrefs = g_slist_prepend (
132 *hrefs, props->children);
135 props = props->next;
138 if (xmlStrEqual (ptr->name, CC2XML ("img"))) {
139 xmlAttrPtr props;
140 props = ptr->properties;
141 while (props) {
142 if (xmlStrEqual (props->name, CC2XML ("src")) && props->children) {
143 htmlNodeDump (a_buf, doc, props->children);
144 xmlBufferAdd (a_buf, CC2XML ("\n"), -1);
146 props = props->next;
149 if (xmlStrEqual (ptr->name, CC2XML ("table"))) {
150 Sheet *last_sheet = tc->sheet;
151 int last_row = tc->row;
152 tc->sheet = NULL;
153 tc->row = -1;
154 html_read_table (ptr, doc, tc->wb_view, tc);
155 if (tc->sheet) {
156 g_string_append_printf (buf, _("[see sheet %s]"), tc->sheet->name_quoted);
157 xmlBufferAdd (a_buf, CC2XML (_("The original html file is\n"
158 "using nested tables.")), -1);
160 tc->sheet = last_sheet;
161 tc->row = last_row;
162 } else
163 html_read_content
164 (ptr, buf, mstyle, a_buf, hrefs, first, doc, tc);
166 first = FALSE;
170 static void
171 html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
173 htmlNodePtr ptr;
174 int col = -1;
176 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
177 if (xmlStrEqual (ptr->name, CC2XML ("td")) ||
178 xmlStrEqual (ptr->name, CC2XML ("th"))) {
179 GString *buf;
180 xmlBufferPtr a_buf;
181 xmlAttrPtr props;
182 int colspan = 1;
183 int rowspan = 1;
184 GnmCellPos pos;
185 GnmStyle *mstyle;
186 GSList *hrefs = NULL;
187 GnmHLink *lnk = NULL;
189 /* Check whether we need to skip merges from above */
190 pos.row = tc->row;
191 pos.col = col + 1;
192 while (gnm_sheet_merge_contains_pos (tc->sheet, &pos)) {
193 col++;
194 pos.col++;
197 /* Do we span across multiple rows or cols? */
198 props = ptr->properties;
199 while (props) {
200 if (xmlStrEqual (props->name, CC2XML ("colspan")) && props->children)
201 colspan = atoi (CXML2C (props->children->content));
202 if (xmlStrEqual (props->name, CC2XML ("rowspan")) && props->children)
203 rowspan = atoi (CXML2C (props->children->content));
204 props = props->next;
206 if (colspan < 1)
207 colspan = 1;
208 if (rowspan < 1)
209 rowspan = 1;
211 /* Let's figure out the content of the cell */
212 buf = g_string_new (NULL);
213 a_buf = xmlBufferCreate ();
215 mstyle = gnm_style_new_default ();
216 if (xmlStrEqual (ptr->name, CC2XML ("th")))
217 gnm_style_set_font_bold (mstyle, TRUE);
219 html_read_content (ptr, buf, mstyle, a_buf,
220 &hrefs, TRUE, doc, tc);
223 if (g_slist_length (hrefs) >= 1 &&
224 buf->len > 0) {
225 /* One hyperlink, and text to make it
226 * visible */
227 char *url;
228 xmlBufferPtr h_buf = xmlBufferCreate ();
230 hrefs = g_slist_reverse (hrefs);
231 htmlNodeDump (
232 h_buf, doc, (htmlNodePtr)hrefs->data);
233 url = g_strndup (
234 CXML2C (h_buf->content), h_buf->use);
235 if (strncmp (url, "mailto:",
236 strlen ("mailto:")) == 0)
237 lnk = gnm_hlink_new (
238 gnm_hlink_email_get_type (),
239 tc->sheet);
240 else
241 lnk = gnm_hlink_new (
242 gnm_hlink_url_get_type (),
243 tc->sheet);
244 gnm_hlink_set_target (lnk, url);
245 gnm_style_set_hlink (mstyle, lnk);
246 gnm_style_set_font_uline (mstyle,
247 UNDERLINE_SINGLE);
248 gnm_style_set_font_color (mstyle,
249 gnm_color_new_go (GO_COLOR_BLUE));
250 g_free (url);
251 xmlBufferFree (h_buf);
253 if (g_slist_length (hrefs) > 1 || buf->len <= 0) {
254 /* Multiple links,
255 * or no text to give hyperlink style,
256 * so put them in a comment */
257 GSList *l;
259 for (l = hrefs; l != NULL; l = l->next) {
260 htmlNodeDump (a_buf, doc,
261 (htmlNodePtr)l->data);
262 xmlBufferAdd (a_buf, CC2XML ("\n"),
263 -1);
266 g_slist_free (hrefs);
267 if (buf->len > 0) {
268 GnmCell *cell = sheet_cell_fetch (tc->sheet, col + 1, tc->row);
269 sheet_style_set_pos (tc->sheet, col + 1, tc->row, mstyle);
270 gnm_cell_set_text (cell, buf->str);
271 } else
272 gnm_style_unref (mstyle);
274 if (a_buf->use > 0) {
275 char *name;
277 name = g_strndup (CXML2C (a_buf->content), a_buf->use);
278 cell_set_comment (tc->sheet, &pos, NULL, name, NULL);
279 g_free (name);
281 g_string_free (buf, TRUE);
282 xmlBufferFree (a_buf);
284 /* If necessary create the merge */
285 if (colspan > 1 || rowspan > 1) {
286 GnmRange range;
287 GnmRange *r = &range;
289 range_init (r, col + 1, tc->row, col + colspan, tc->row + rowspan - 1);
290 gnm_sheet_merge_add (tc->sheet, r, FALSE, NULL);
293 col += colspan;
298 static void
299 html_read_rows (htmlNodePtr cur, htmlDocPtr doc, Workbook *wb,
300 GnmHtmlTableCtxt *tc)
302 htmlNodePtr ptr;
304 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
305 if (ptr->type != XML_ELEMENT_NODE)
306 continue;
307 if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
308 tc->row++;
309 if (tc->sheet == NULL)
310 tc->sheet = html_get_sheet (NULL, wb);
311 html_read_row (ptr, doc, tc);
316 static void
317 html_read_table (htmlNodePtr cur, htmlDocPtr doc, WorkbookView *wb_view,
318 GnmHtmlTableCtxt *tc)
320 Workbook *wb;
321 htmlNodePtr ptr, ptr2;
323 g_return_if_fail (cur != NULL);
324 g_return_if_fail (wb_view != NULL);
326 wb = wb_view_get_workbook (wb_view);
327 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
328 if (ptr->type != XML_ELEMENT_NODE)
329 continue;
330 if (xmlStrEqual (ptr->name, CC2XML ("caption"))) {
331 xmlBufferPtr buf;
332 buf = xmlBufferCreate ();
333 for (ptr2 = ptr->children; ptr2 != NULL ; ptr2 = ptr2->next) {
334 htmlNodeDump (buf, doc, ptr2);
336 if (buf->use > 0) {
337 char *name;
338 name = g_strndup (CXML2C (buf->content), buf->use);
339 tc->sheet = html_get_sheet (name, wb);
340 g_free (name);
342 xmlBufferFree (buf);
343 } else if (xmlStrEqual (ptr->name, CC2XML ("thead")) ||
344 xmlStrEqual (ptr->name, CC2XML ("tfoot")) ||
345 xmlStrEqual (ptr->name, CC2XML ("tbody"))) {
346 html_read_rows (ptr, doc, wb, tc);
347 } else if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
348 html_read_rows (cur, doc, wb, tc);
349 break;
/* Element types which imply that we are inside a table */
static char const *table_start_elt_types[] = {
	"caption", "col", "colgroup",
	"tbody", "tfoot", "thead",
	"tr",
	NULL	/* terminator */
};
/* Element types which imply that we are inside a row */
static char const *row_start_elt_types[] = {
	"td", "th",
	NULL	/* terminator */
};
/* Element types which occur inside tables and rows, but also outside */
static char const *cont_elt_types[] = {
	"del", "ins",
	NULL	/* terminator */
};
380 static gboolean
381 is_elt_type (htmlNodePtr ptr, char const ** types)
383 char const **p;
384 gboolean ret = FALSE;
386 for (p = types; *p; p++)
387 if (xmlStrEqual (ptr->name, CC2XML ((*p)))) {
388 ret = TRUE;
389 break;
392 return ret;
395 static gboolean
396 starts_inferred_table (htmlNodePtr ptr)
398 return ((ptr->type == XML_ELEMENT_NODE) &&
399 is_elt_type (ptr, table_start_elt_types));
402 static gboolean
403 ends_inferred_table (htmlNodePtr ptr)
405 return ((ptr->type == XML_ELEMENT_NODE) &&
406 !(is_elt_type (ptr, table_start_elt_types) ||
407 is_elt_type (ptr, cont_elt_types)));
410 static gboolean
411 starts_inferred_row (htmlNodePtr ptr)
413 return ((ptr->type == XML_ELEMENT_NODE) &&
414 is_elt_type (ptr, row_start_elt_types));
417 static gboolean
418 ends_inferred_row (htmlNodePtr ptr)
420 return ((ptr->type == XML_ELEMENT_NODE) &&
421 !(is_elt_type (ptr, row_start_elt_types) ||
422 is_elt_type (ptr, cont_elt_types)));
426 * Handles incomplete html fragments as may occur on the clipboard,
427 * e.g. a <td> without <tr> and <table> in front of it.
429 static void
430 html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
431 WorkbookView *wb_view, GnmHtmlTableCtxt *tc)
433 htmlNodePtr ptr;
435 if (cur == NULL) {
436 xmlGenericError(xmlGenericErrorContext,
437 "htmlNodeDumpFormatOutput : node == NULL\n");
438 return;
441 if (cur->type != XML_ELEMENT_NODE)
442 return;
444 if (xmlStrEqual (cur->name, CC2XML ("table"))) {
445 html_read_table (cur, doc, wb_view, tc);
446 } else if (starts_inferred_table (cur) || starts_inferred_row (cur)) {
447 htmlNodePtr tnode = xmlNewNode (NULL, "table");
449 /* Link in a table node */
450 xmlAddPrevSibling (cur, tnode);
451 if (starts_inferred_row (cur)) {
452 htmlNodePtr rnode = xmlNewNode (NULL, "tr");
454 /* Link in a row node */
455 xmlAddChild (tnode, rnode);
456 /* Make following elements children of the row node,
457 * until we meet one which isn't legal in a row. */
458 while ((ptr = tnode->next) != NULL) {
459 if (ends_inferred_row (ptr))
460 break;
461 xmlUnlinkNode (ptr);
462 xmlAddChild (rnode, ptr);
465 /* Make following elements children of the row node,
466 * until we meet one which isn't legal in a table. */
467 while ((ptr = tnode->next) != NULL) {
468 if (ends_inferred_table (ptr))
469 break;
470 xmlUnlinkNode (ptr);
471 xmlAddChild (tnode, ptr);
473 html_read_table (tnode, doc, wb_view, tc);
474 } else {
475 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
476 html_search_for_tables (ptr, doc, wb_view, tc);
477 /* ptr may now have been pushed down in the tree,
478 * if so, ptr->next is not the right pointer to
479 * follow */
480 while (ptr->parent != cur)
481 ptr = ptr->parent;
486 void
487 html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
488 WorkbookView *wb_view, GsfInput *input)
490 guint8 const *buf;
491 gsf_off_t size;
492 int len, bomlen;
493 htmlParserCtxtPtr ctxt;
494 htmlDocPtr doc = NULL;
495 xmlCharEncoding enc;
496 GnmHtmlTableCtxt tc;
498 g_return_if_fail (input != NULL);
500 if (gsf_input_seek (input, 0, G_SEEK_SET))
501 return;
503 size = gsf_input_size (input);
504 if (size >= 4) {
505 size -= 4;
506 buf = gsf_input_read (input, 4, NULL);
507 if (buf != NULL) {
508 enc = xmlDetectCharEncoding(buf, 4);
509 switch (enc) { /* Skip byte order mark */
510 case XML_CHAR_ENCODING_UCS4BE:
511 case XML_CHAR_ENCODING_UCS4LE:
512 case XML_CHAR_ENCODING_UCS4_2143:
513 case XML_CHAR_ENCODING_UCS4_3412:
514 case XML_CHAR_ENCODING_EBCDIC:
515 bomlen = 4;
516 break;
517 case XML_CHAR_ENCODING_UTF16BE:
518 case XML_CHAR_ENCODING_UTF16LE:
519 bomlen = 2;
520 break;
521 case XML_CHAR_ENCODING_UTF8:
522 if (buf[0] == 0xef)
523 bomlen = 3;
524 else if (buf[0] == 0x3c)
525 bomlen = 4;
526 else
527 bomlen = 0;
528 break;
529 case XML_CHAR_ENCODING_NONE:
530 bomlen = 0;
531 /* Try to detect unmarked UTF16LE
532 (Firefox Windows clipboard, drag data all platforms) */
533 if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
534 buf[1] == 0 &&
535 (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
536 buf[3] == 0)
537 enc = XML_CHAR_ENCODING_UTF16LE;
538 break;
539 default:
540 bomlen = 0;
542 ctxt = htmlCreatePushParserCtxt (
543 NULL, NULL, (char const *)(buf + bomlen),
544 4 - bomlen, gsf_input_name (input), enc);
546 for (; size > 0 ; size -= len) {
547 len = MIN (4096, size);
548 buf = gsf_input_read (input, len, NULL);
549 if (buf == NULL)
550 break;
551 htmlParseChunk (
552 ctxt, (char const *)buf, len, 0);
555 htmlParseChunk (ctxt, (char const *)buf, 0, 1);
556 doc = ctxt->myDoc;
557 htmlFreeParserCtxt (ctxt);
561 if (doc != NULL) {
562 xmlNodePtr ptr;
563 tc.sheet = NULL;
564 tc.row = -1;
565 tc.wb_view = wb_view;
566 for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
567 html_search_for_tables (ptr, doc, wb_view, &tc);
568 xmlFreeDoc (doc);
569 } else
570 go_io_error_info_set (io_context,
571 go_error_info_new_str (_("Unable to parse the html.")));
574 /* Quick and dirty html probe. */
575 gboolean
576 html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
577 G_GNUC_UNUSED GOFileProbeLevel pl)
579 gsf_off_t size = 200;
580 guint8 const* buf = gsf_input_read (input, size, NULL);
581 gchar *ulstr = NULL;
582 GString *ustr;
583 gboolean res = FALSE;
585 /* Avoid seeking in large streams - try to read, fall back if
586 * stream is too short. (Actually, currently _size does not
587 * involve any syscalls -- MW). */
588 if (!buf) {
589 size = gsf_input_size (input);
590 buf = gsf_input_read (input, size, NULL);
591 if (!buf)
592 return res;
595 if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
596 ulstr = g_utf8_strdown (ustr->str, -1);
597 g_string_free (ustr, TRUE);
600 if (!ulstr)
601 return res;
603 res = (strstr (ulstr, "<table") != NULL ||
604 strstr (ulstr, "<html") != NULL ||
605 strstr (ulstr, "<!doctype html") != NULL);
607 g_free (ulstr);
609 return res;