Undo: fix problem with col widths after paste undo.
[gnumeric.git] / plugins / html / html_read.c
blobec759118c407970bd2ff85ca57e8e84d760aef38
/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * html_read.c
 *
 * Copyright (C) 1999, 2000 Rasca, Berlin
 * EMail: thron@gmx.de
 * Copyright (c) 2001 Andreas J. Guelzow
 * EMail: aguelzow@taliesin.ca
 * Copyright (c) 2002 Jody Goldberg
 * EMail: jody@gnome.org
 *
 * Contributors :
 *   Almer S. Tigelaar <almer1@dds.nl>
 *   Andreas J. Guelzow <aguelzow@taliesin.ca>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses/>.
 */
#include <gnumeric-config.h>
#include <glib/gi18n-lib.h>
#include <gnumeric.h>
#include <stdlib.h>
#include <string.h>
#include "html.h"

#include <sheet-object-cell-comment.h>
#include <workbook-view.h>
#include <workbook.h>
#include <sheet.h>
#include <sheet-merge.h>
#include <sheet-style.h>
#include <style.h>
#include <style-color.h>
#include <hlink.h>
#include <cell.h>
#include <ranges.h>
#include <goffice/goffice.h>

#include <gsf/gsf-input.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
/*
 * Cast helpers between plain C strings and libxml2's xmlChar strings.
 * The "C" prefix/suffix variants produce const-qualified pointers.
 */
#define CC2XML(s) ((xmlChar const *)(s))	/* char const * -> xmlChar const * */
#define C2XML(s) ((xmlChar *)(s))		/* char *       -> xmlChar *       */
#define CXML2C(s) ((char const *)(s))		/* xmlChar const * -> char const * */
#define XML2C(s) ((char *)(s))			/* xmlChar *       -> char *       */
58 typedef struct {
59 Sheet *sheet;
60 int row;
61 WorkbookView *wb_view;
62 } GnmHtmlTableCtxt;
64 static void html_read_table (htmlNodePtr cur, htmlDocPtr doc,
65 WorkbookView *wb_view,
66 GnmHtmlTableCtxt *tc);
69 static Sheet *
70 html_get_sheet (char const *name, Workbook *wb)
72 Sheet *sheet = NULL;
74 if (name) {
75 sheet = workbook_sheet_by_name (wb, name);
76 if (sheet == NULL) {
77 sheet = sheet_new (wb, name, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
78 workbook_sheet_attach (wb, sheet);
80 } else
81 sheet = workbook_sheet_add (wb, -1, GNM_DEFAULT_COLS, GNM_DEFAULT_ROWS);
82 return sheet;
85 static void
86 html_append_text (GString *buf, const xmlChar *text)
88 const xmlChar *p;
90 while (*text) {
91 while (g_unichar_isspace (g_utf8_get_char (text)))
92 text = g_utf8_next_char (text);
93 if (*text) {
94 for (p = text;
95 *p && !g_unichar_isspace (g_utf8_get_char (p));
96 p = g_utf8_next_char (p))
98 if (buf->len > 0)
99 g_string_append_c (buf, ' ');
100 g_string_append_len (buf, text, p - text);
101 text = p;
106 static void
107 html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
108 xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
109 htmlDocPtr doc, GnmHtmlTableCtxt *tc)
111 htmlNodePtr ptr;
113 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
114 if (ptr->type == XML_TEXT_NODE) {
115 if (g_utf8_validate (ptr->content, -1, NULL))
116 html_append_text (buf, ptr->content);
117 else
118 g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
119 } else if (ptr->type == XML_ELEMENT_NODE) {
120 if (first) {
121 if (xmlStrEqual (ptr->name, CC2XML ("i"))
122 || xmlStrEqual (ptr->name, CC2XML ("em")))
123 gnm_style_set_font_italic (mstyle, TRUE);
124 if (xmlStrEqual (ptr->name, CC2XML ("b")))
125 gnm_style_set_font_bold (mstyle, TRUE);
127 if (xmlStrEqual (ptr->name, CC2XML ("a"))) {
128 xmlAttrPtr props;
129 props = ptr->properties;
130 while (props) {
131 if (xmlStrEqual (props->name, CC2XML ("href")) && props->children) {
132 *hrefs = g_slist_prepend (
133 *hrefs, props->children);
136 props = props->next;
139 if (xmlStrEqual (ptr->name, CC2XML ("img"))) {
140 xmlAttrPtr props;
141 props = ptr->properties;
142 while (props) {
143 if (xmlStrEqual (props->name, CC2XML ("src")) && props->children) {
144 htmlNodeDump (a_buf, doc, props->children);
145 xmlBufferAdd (a_buf, CC2XML ("\n"), -1);
147 props = props->next;
150 if (xmlStrEqual (ptr->name, CC2XML ("table"))) {
151 Sheet *last_sheet = tc->sheet;
152 int last_row = tc->row;
153 tc->sheet = NULL;
154 tc->row = -1;
155 html_read_table (ptr, doc, tc->wb_view, tc);
156 if (tc->sheet) {
157 g_string_append_printf (buf, _("[see sheet %s]"), tc->sheet->name_quoted);
158 xmlBufferAdd (a_buf, CC2XML (_("The original html file is\n"
159 "using nested tables.")), -1);
161 tc->sheet = last_sheet;
162 tc->row = last_row;
163 } else
164 html_read_content
165 (ptr, buf, mstyle, a_buf, hrefs, first, doc, tc);
167 first = FALSE;
171 static void
172 html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
174 htmlNodePtr ptr;
175 int col = -1;
177 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
178 if (xmlStrEqual (ptr->name, CC2XML ("td")) ||
179 xmlStrEqual (ptr->name, CC2XML ("th"))) {
180 GString *buf;
181 xmlBufferPtr a_buf;
182 xmlAttrPtr props;
183 int colspan = 1;
184 int rowspan = 1;
185 GnmCellPos pos;
186 GnmStyle *mstyle;
187 GSList *hrefs = NULL;
188 GnmHLink *lnk = NULL;
190 /* Check whether we need to skip merges from above */
191 pos.row = tc->row;
192 pos.col = col + 1;
193 while (gnm_sheet_merge_contains_pos (tc->sheet, &pos)) {
194 col++;
195 pos.col++;
198 /* Do we span across multiple rows or cols? */
199 props = ptr->properties;
200 while (props) {
201 if (xmlStrEqual (props->name, CC2XML ("colspan")) && props->children)
202 colspan = atoi (CXML2C (props->children->content));
203 if (xmlStrEqual (props->name, CC2XML ("rowspan")) && props->children)
204 rowspan = atoi (CXML2C (props->children->content));
205 props = props->next;
207 if (colspan < 1)
208 colspan = 1;
209 if (rowspan < 1)
210 rowspan = 1;
212 /* Let's figure out the content of the cell */
213 buf = g_string_new (NULL);
214 a_buf = xmlBufferCreate ();
216 mstyle = gnm_style_new_default ();
217 if (xmlStrEqual (ptr->name, CC2XML ("th")))
218 gnm_style_set_font_bold (mstyle, TRUE);
220 html_read_content (ptr, buf, mstyle, a_buf,
221 &hrefs, TRUE, doc, tc);
224 if (g_slist_length (hrefs) >= 1 &&
225 buf->len > 0) {
226 /* One hyperlink, and text to make it
227 * visible */
228 char *url;
229 xmlBufferPtr h_buf = xmlBufferCreate ();
231 hrefs = g_slist_reverse (hrefs);
232 htmlNodeDump (
233 h_buf, doc, (htmlNodePtr)hrefs->data);
234 url = g_strndup (
235 CXML2C (h_buf->content), h_buf->use);
236 if (strncmp (url, "mailto:",
237 strlen ("mailto:")) == 0)
238 lnk = gnm_hlink_new (
239 gnm_hlink_email_get_type (),
240 tc->sheet);
241 else
242 lnk = gnm_hlink_new (
243 gnm_hlink_url_get_type (),
244 tc->sheet);
245 gnm_hlink_set_target (lnk, url);
246 gnm_style_set_hlink (mstyle, lnk);
247 gnm_style_set_font_uline (mstyle,
248 UNDERLINE_SINGLE);
249 gnm_style_set_font_color (mstyle,
250 gnm_color_new_go (GO_COLOR_BLUE));
251 g_free (url);
252 xmlBufferFree (h_buf);
254 if (g_slist_length (hrefs) > 1 || buf->len <= 0) {
255 /* Multiple links,
256 * or no text to give hyperlink style,
257 * so put them in a comment */
258 GSList *l;
260 for (l = hrefs; l != NULL; l = l->next) {
261 htmlNodeDump (a_buf, doc,
262 (htmlNodePtr)l->data);
263 xmlBufferAdd (a_buf, CC2XML ("\n"),
264 -1);
267 g_slist_free (hrefs);
268 if (buf->len > 0) {
269 GnmCell *cell = sheet_cell_fetch (tc->sheet, col + 1, tc->row);
270 sheet_style_set_pos (tc->sheet, col + 1, tc->row, mstyle);
271 gnm_cell_set_text (cell, buf->str);
272 } else
273 gnm_style_unref (mstyle);
275 if (a_buf->use > 0) {
276 char *name;
278 name = g_strndup (CXML2C (a_buf->content), a_buf->use);
279 cell_set_comment (tc->sheet, &pos, NULL, name, NULL);
280 g_free (name);
282 g_string_free (buf, TRUE);
283 xmlBufferFree (a_buf);
285 /* If necessary create the merge */
286 if (colspan > 1 || rowspan > 1) {
287 GnmRange range;
288 GnmRange *r = &range;
290 range_init (r, col + 1, tc->row, col + colspan, tc->row + rowspan - 1);
291 gnm_sheet_merge_add (tc->sheet, r, FALSE, NULL);
294 col += colspan;
299 static void
300 html_read_rows (htmlNodePtr cur, htmlDocPtr doc, Workbook *wb,
301 GnmHtmlTableCtxt *tc)
303 htmlNodePtr ptr;
305 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
306 if (ptr->type != XML_ELEMENT_NODE)
307 continue;
308 if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
309 tc->row++;
310 if (tc->sheet == NULL)
311 tc->sheet = html_get_sheet (NULL, wb);
312 html_read_row (ptr, doc, tc);
317 static void
318 html_read_table (htmlNodePtr cur, htmlDocPtr doc, WorkbookView *wb_view,
319 GnmHtmlTableCtxt *tc)
321 Workbook *wb;
322 htmlNodePtr ptr, ptr2;
324 g_return_if_fail (cur != NULL);
325 g_return_if_fail (wb_view != NULL);
327 wb = wb_view_get_workbook (wb_view);
328 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
329 if (ptr->type != XML_ELEMENT_NODE)
330 continue;
331 if (xmlStrEqual (ptr->name, CC2XML ("caption"))) {
332 xmlBufferPtr buf;
333 buf = xmlBufferCreate ();
334 for (ptr2 = ptr->children; ptr2 != NULL ; ptr2 = ptr2->next) {
335 htmlNodeDump (buf, doc, ptr2);
337 if (buf->use > 0) {
338 char *name;
339 name = g_strndup (CXML2C (buf->content), buf->use);
340 tc->sheet = html_get_sheet (name, wb);
341 g_free (name);
343 xmlBufferFree (buf);
344 } else if (xmlStrEqual (ptr->name, CC2XML ("thead")) ||
345 xmlStrEqual (ptr->name, CC2XML ("tfoot")) ||
346 xmlStrEqual (ptr->name, CC2XML ("tbody"))) {
347 html_read_rows (ptr, doc, wb, tc);
348 } else if (xmlStrEqual (ptr->name, CC2XML ("tr"))) {
349 html_read_rows (cur, doc, wb, tc);
350 break;
/* Element types which imply that we are inside a table (NULL-terminated) */
static char const *table_start_elt_types[] = {
	"caption",
	"col",
	"colgroup",
	"tbody",
	"tfoot",
	"thead",
	"tr",
	NULL
};
/* Element types which imply that we are inside a row (NULL-terminated) */
static char const *row_start_elt_types[] = {
	"td",
	"th",
	NULL
};
/* Element types which occur inside tables and rows, but also outside
 * (NULL-terminated) */
static char const *cont_elt_types[] = {
	"del",
	"ins",
	NULL
};
381 static gboolean
382 is_elt_type (htmlNodePtr ptr, char const ** types)
384 char const **p;
385 gboolean ret = FALSE;
387 for (p = types; *p; p++)
388 if (xmlStrEqual (ptr->name, CC2XML ((*p)))) {
389 ret = TRUE;
390 break;
393 return ret;
396 static gboolean
397 starts_inferred_table (htmlNodePtr ptr)
399 return ((ptr->type == XML_ELEMENT_NODE) &&
400 is_elt_type (ptr, table_start_elt_types));
403 static gboolean
404 ends_inferred_table (htmlNodePtr ptr)
406 return ((ptr->type == XML_ELEMENT_NODE) &&
407 !(is_elt_type (ptr, table_start_elt_types) ||
408 is_elt_type (ptr, cont_elt_types)));
411 static gboolean
412 starts_inferred_row (htmlNodePtr ptr)
414 return ((ptr->type == XML_ELEMENT_NODE) &&
415 is_elt_type (ptr, row_start_elt_types));
418 static gboolean
419 ends_inferred_row (htmlNodePtr ptr)
421 return ((ptr->type == XML_ELEMENT_NODE) &&
422 !(is_elt_type (ptr, row_start_elt_types) ||
423 is_elt_type (ptr, cont_elt_types)));
427 * Handles incomplete html fragments as may occur on the clipboard,
428 * e.g. a <td> without <tr> and <table> in front of it.
430 static void
431 html_search_for_tables (htmlNodePtr cur, htmlDocPtr doc,
432 WorkbookView *wb_view, GnmHtmlTableCtxt *tc)
434 htmlNodePtr ptr;
436 if (cur == NULL) {
437 xmlGenericError(xmlGenericErrorContext,
438 "htmlNodeDumpFormatOutput : node == NULL\n");
439 return;
442 if (cur->type != XML_ELEMENT_NODE)
443 return;
445 if (xmlStrEqual (cur->name, CC2XML ("table"))) {
446 html_read_table (cur, doc, wb_view, tc);
447 } else if (starts_inferred_table (cur) || starts_inferred_row (cur)) {
448 htmlNodePtr tnode = xmlNewNode (NULL, "table");
450 /* Link in a table node */
451 xmlAddPrevSibling (cur, tnode);
452 if (starts_inferred_row (cur)) {
453 htmlNodePtr rnode = xmlNewNode (NULL, "tr");
455 /* Link in a row node */
456 xmlAddChild (tnode, rnode);
457 /* Make following elements children of the row node,
458 * until we meet one which isn't legal in a row. */
459 while ((ptr = tnode->next) != NULL) {
460 if (ends_inferred_row (ptr))
461 break;
462 xmlUnlinkNode (ptr);
463 xmlAddChild (rnode, ptr);
466 /* Make following elements children of the row node,
467 * until we meet one which isn't legal in a table. */
468 while ((ptr = tnode->next) != NULL) {
469 if (ends_inferred_table (ptr))
470 break;
471 xmlUnlinkNode (ptr);
472 xmlAddChild (tnode, ptr);
474 html_read_table (tnode, doc, wb_view, tc);
475 } else {
476 for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
477 html_search_for_tables (ptr, doc, wb_view, tc);
478 /* ptr may now have been pushed down in the tree,
479 * if so, ptr->next is not the right pointer to
480 * follow */
481 while (ptr->parent != cur)
482 ptr = ptr->parent;
487 void
488 html_file_open (G_GNUC_UNUSED GOFileOpener const *fo, GOIOContext *io_context,
489 WorkbookView *wb_view, GsfInput *input)
491 guint8 const *buf;
492 gsf_off_t size;
493 int len, bomlen;
494 htmlParserCtxtPtr ctxt;
495 htmlDocPtr doc = NULL;
496 xmlCharEncoding enc;
497 GnmHtmlTableCtxt tc;
499 g_return_if_fail (input != NULL);
501 if (gsf_input_seek (input, 0, G_SEEK_SET))
502 return;
504 size = gsf_input_size (input);
505 if (size >= 4) {
506 size -= 4;
507 buf = gsf_input_read (input, 4, NULL);
508 if (buf != NULL) {
509 enc = xmlDetectCharEncoding(buf, 4);
510 switch (enc) { /* Skip byte order mark */
511 case XML_CHAR_ENCODING_UCS4BE:
512 case XML_CHAR_ENCODING_UCS4LE:
513 case XML_CHAR_ENCODING_UCS4_2143:
514 case XML_CHAR_ENCODING_UCS4_3412:
515 case XML_CHAR_ENCODING_EBCDIC:
516 bomlen = 4;
517 break;
518 case XML_CHAR_ENCODING_UTF16BE:
519 case XML_CHAR_ENCODING_UTF16LE:
520 bomlen = 2;
521 break;
522 case XML_CHAR_ENCODING_UTF8:
523 if (buf[0] == 0xef)
524 bomlen = 3;
525 else if (buf[0] == 0x3c)
526 bomlen = 4;
527 else
528 bomlen = 0;
529 break;
530 case XML_CHAR_ENCODING_NONE:
531 bomlen = 0;
532 /* Try to detect unmarked UTF16LE
533 (Firefox Windows clipboard, drag data all platforms) */
534 if ((buf[0] >= 0x20 || g_ascii_isspace(buf[0])) &&
535 buf[1] == 0 &&
536 (buf[2] >= 0x20 || g_ascii_isspace(buf[2])) &&
537 buf[3] == 0)
538 enc = XML_CHAR_ENCODING_UTF16LE;
539 break;
540 default:
541 bomlen = 0;
543 ctxt = htmlCreatePushParserCtxt (
544 NULL, NULL, (char const *)(buf + bomlen),
545 4 - bomlen, gsf_input_name (input), enc);
547 for (; size > 0 ; size -= len) {
548 len = MIN (4096, size);
549 buf = gsf_input_read (input, len, NULL);
550 if (buf == NULL)
551 break;
552 htmlParseChunk (
553 ctxt, (char const *)buf, len, 0);
556 htmlParseChunk (ctxt, (char const *)buf, 0, 1);
557 doc = ctxt->myDoc;
558 htmlFreeParserCtxt (ctxt);
562 if (doc != NULL) {
563 xmlNodePtr ptr;
564 tc.sheet = NULL;
565 tc.row = -1;
566 tc.wb_view = wb_view;
567 for (ptr = doc->children; ptr != NULL ; ptr = ptr->next)
568 html_search_for_tables (ptr, doc, wb_view, &tc);
569 xmlFreeDoc (doc);
570 } else
571 go_io_error_info_set (io_context,
572 go_error_info_new_str (_("Unable to parse the html.")));
575 /* Quick and dirty html probe. */
576 gboolean
577 html_file_probe (G_GNUC_UNUSED GOFileOpener const *fo, GsfInput *input,
578 G_GNUC_UNUSED GOFileProbeLevel pl)
580 gsf_off_t size = 200;
581 guint8 const* buf = gsf_input_read (input, size, NULL);
582 gchar *ulstr = NULL;
583 GString *ustr;
584 gboolean res = FALSE;
586 /* Avoid seeking in large streams - try to read, fall back if
587 * stream is too short. (Actually, currently _size does not
588 * involve any syscalls -- MW). */
589 if (!buf) {
590 size = gsf_input_size (input);
591 buf = gsf_input_read (input, size, NULL);
592 if (!buf)
593 return res;
596 if (go_guess_encoding (buf, size, NULL, &ustr, NULL)) {
597 ulstr = g_utf8_strdown (ustr->str, -1);
598 g_string_free (ustr, TRUE);
601 if (!ulstr)
602 return res;
604 res = (strstr (ulstr, "<table") != NULL ||
605 strstr (ulstr, "<html") != NULL ||
606 strstr (ulstr, "<!doctype html") != NULL);
608 g_free (ulstr);
610 return res;