4 * Copyright (C) 1999, 2000 Rasca, Berlin
6 * Copyright (c) 2001 Andreas J. Guelzow
7 * EMail: aguelzow@taliesin.ca
8 * Copyright (c) 2002 Jody Goldberg
9 * EMail: jody@gnome.org
12 * Almer S. Tigelaar <almer1@dds.nl>
13 * Andreas J. Guelzow <aguelzow@taliesin.ca>
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with this program; if not, see <https://www.gnu.org/licenses/>.
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
35 #include <sheet-object-cell-comment.h>
36 #include <workbook-view.h>
39 #include <sheet-merge.h>
40 #include <sheet-style.h>
42 #include <style-color.h>
46 #include <goffice/goffice.h>
48 #include <gsf/gsf-input.h>
49 #include <libxml/HTMLparser.h>
50 #include <libxml/HTMLtree.h>
/* Cast helpers between plain C strings and libxml2 xmlChar strings.
 * CC2XML / C2XML cast to xmlChar (const / non-const pointer);
 * CXML2C / XML2C cast back to char (const / non-const pointer).
 * They are pure pointer casts; no copy or conversion takes place. */
52 #define CC2XML(s) ((xmlChar const *)(s))
53 #define C2XML(s) ((xmlChar *)(s))
54 #define CXML2C(s) ((char const *)(s))
55 #define XML2C(s) ((char *)(s))
/* Workbook view member.  NOTE(review): the enclosing declaration is not
 * visible in this excerpt; presumably this is the wb_view field of
 * GnmHtmlTableCtxt, which the code below reads as tc->wb_view -- confirm. */
60 WorkbookView
*wb_view
;
/* Forward declaration: html_read_content calls html_read_table for
 * nested <table> elements before html_read_table is defined below. */
63 static void html_read_table (htmlNodePtr cur
, htmlDocPtr doc
,
64 WorkbookView
*wb_view
,
65 GnmHtmlTableCtxt
*tc
);
/*
 * html_get_sheet:
 * @name: requested sheet name; html_read_rows passes NULL
 * @wb: workbook that receives the sheet
 *
 * Obtains a sheet of @wb.  Three calls are visible here: a lookup with
 * workbook_sheet_by_name, creation of a named sheet with sheet_new
 * (default column/row counts) followed by workbook_sheet_attach, and
 * workbook_sheet_add with the default dimensions.
 *
 * NOTE(review): the conditions selecting between these calls are not
 * visible in this excerpt.  Presumably: look up by name, create the
 * named sheet if the lookup fails, and add an unnamed sheet when @name
 * is NULL -- confirm against the full source.
 */
69 html_get_sheet (char const *name
, Workbook
*wb
)
74 sheet
= workbook_sheet_by_name (wb
, name
);
76 sheet
= sheet_new (wb
, name
, GNM_DEFAULT_COLS
, GNM_DEFAULT_ROWS
);
77 workbook_sheet_attach (wb
, sheet
);
80 sheet
= workbook_sheet_add (wb
, -1, GNM_DEFAULT_COLS
, GNM_DEFAULT_ROWS
);
/*
 * html_append_text:
 * @buf: destination string
 * @text: UTF-8 text taken from an HTML text node
 *
 * Copies @text into @buf one whitespace-delimited run at a time.
 * Whitespace in front of a run is skipped character by character
 * (g_unichar_isspace on each decoded UTF-8 character), the end of the
 * run is located by advancing p until a whitespace character or the
 * terminating NUL, and the bytes between text and p are appended.
 * A single ' ' is appended as a separator.
 *
 * NOTE(review): the outer loop and the condition guarding the
 * separator are not visible in this excerpt; presumably the space is
 * only added when @buf already holds text -- confirm.
 */
85 html_append_text (GString
*buf
, const xmlChar
*text
)
90 while (g_unichar_isspace (g_utf8_get_char (text
)))
91 text
= g_utf8_next_char (text
);
94 *p
&& !g_unichar_isspace (g_utf8_get_char (p
));
95 p
= g_utf8_next_char (p
))
98 g_string_append_c (buf
, ' ');
99 g_string_append_len (buf
, text
, p
- text
);
/*
 * html_read_content:
 * @cur: node whose children are scanned
 * @buf: receives the visible text of the cell
 * @mstyle: style that is updated from inline markup
 * @a_buf: receives auxiliary text (image sources, nested-table notice)
 * @hrefs: list onto which the child node of each <a href=...> attribute
 *         is prepended
 * @first: forwarded unchanged in the call visible at the end of the loop
 * @doc: document, needed by htmlNodeDump
 * @tc: table context; tc->sheet is saved and restored around nested
 *      tables
 *
 * Walks the children of @cur:
 *  - Text nodes are validated with g_utf8_validate and appended through
 *    html_append_text; otherwise a translated warning text is appended
 *    to @buf instead.
 *  - <i> and <em> switch the style to italic, <b> switches it to bold.
 *  - <a>: the properties are searched for "href"; the attribute's child
 *    node is prepended to *@hrefs.
 *  - <img>: the "src" attribute's child node is dumped into @a_buf,
 *    followed by a newline.
 *  - <table>: the nested table is imported through html_read_table, a
 *    "[see sheet %s]" reference naming tc->sheet is appended to @buf,
 *    a notice about nested tables is added to @a_buf, and tc->sheet is
 *    set back to the sheet that was current before the nested table.
 *
 * NOTE(review): tc->row is captured in last_row, but the statement
 * restoring it is not visible in this excerpt.  Likewise the name of
 * the function called with (ptr, buf, mstyle, a_buf, hrefs, first, doc,
 * tc) is on a line not shown; the argument list matches this function's
 * own signature, so it is presumably the recursive descent -- confirm.
 */
106 html_read_content (htmlNodePtr cur
, GString
*buf
, GnmStyle
*mstyle
,
107 xmlBufferPtr a_buf
, GSList
**hrefs
, gboolean first
,
108 htmlDocPtr doc
, GnmHtmlTableCtxt
*tc
)
112 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
113 if (ptr
->type
== XML_TEXT_NODE
) {
114 if (g_utf8_validate (ptr
->content
, -1, NULL
))
115 html_append_text (buf
, ptr
->content
);
117 g_string_append (buf
, _("[Warning: Invalid text string has been removed.]"));
118 } else if (ptr
->type
== XML_ELEMENT_NODE
) {
120 if (xmlStrEqual (ptr
->name
, CC2XML ("i"))
121 || xmlStrEqual (ptr
->name
, CC2XML ("em")))
122 gnm_style_set_font_italic (mstyle
, TRUE
);
123 if (xmlStrEqual (ptr
->name
, CC2XML ("b")))
124 gnm_style_set_font_bold (mstyle
, TRUE
);
126 if (xmlStrEqual (ptr
->name
, CC2XML ("a"))) {
128 props
= ptr
->properties
;
130 if (xmlStrEqual (props
->name
, CC2XML ("href")) && props
->children
) {
131 *hrefs
= g_slist_prepend (
132 *hrefs
, props
->children
);
138 if (xmlStrEqual (ptr
->name
, CC2XML ("img"))) {
140 props
= ptr
->properties
;
142 if (xmlStrEqual (props
->name
, CC2XML ("src")) && props
->children
) {
143 htmlNodeDump (a_buf
, doc
, props
->children
);
144 xmlBufferAdd (a_buf
, CC2XML ("\n"), -1);
149 if (xmlStrEqual (ptr
->name
, CC2XML ("table"))) {
150 Sheet
*last_sheet
= tc
->sheet
;
151 int last_row
= tc
->row
;
154 html_read_table (ptr
, doc
, tc
->wb_view
, tc
);
156 g_string_append_printf (buf
, _("[see sheet %s]"), tc
->sheet
->name_quoted
);
157 xmlBufferAdd (a_buf
, CC2XML (_("The original html file is\n"
158 "using nested tables.")), -1);
160 tc
->sheet
= last_sheet
;
164 (ptr
, buf
, mstyle
, a_buf
, hrefs
, first
, doc
, tc
);
/*
 * html_read_row:
 * @cur: the row node whose cells are imported
 * @doc: document, needed by htmlNodeDump
 * @tc: table context supplying the target sheet and the current row
 *
 * Imports every <td> / <th> child of @cur into tc->sheet at tc->row.
 * For each such cell:
 *  - positions already covered by a merged region (created by an
 *    earlier cell's span) are skipped via gnm_sheet_merge_contains_pos;
 *  - the "colspan" and "rowspan" attributes are read with atoi;
 *  - a text buffer, an auxiliary xmlBuffer and a default style are
 *    created (<th> cells start out bold) and filled by
 *    html_read_content;
 *  - when hyperlinks were collected and usable text exists, the first
 *    link (after reversing the prepended list) is dumped to a temporary
 *    buffer and turned into an e-mail link if it starts with "mailto:",
 *    otherwise into a URL link; the style gets the link, an underline
 *    and a blue font colour;
 *  - when there is more than one link or no text, all links are dumped
 *    into the auxiliary buffer, one per line;
 *  - the cell at (col + 1, tc->row) receives the style and the text;
 *  - a non-empty auxiliary buffer becomes a cell comment at pos;
 *  - if colspan or rowspan exceeds 1, a merged range from col + 1 to
 *    col + colspan and from tc->row to tc->row + rowspan - 1 is added.
 *
 * NOTE(review): declarations, several conditions and all updates of
 * col, pos and tc->row are on lines not visible in this excerpt.
 * atoi reports no errors; any clamping of colspan/rowspan to sane
 * values is presumably done on one of the missing lines -- confirm.
 * Freeing of the duplicated comment string is likewise not visible.
 */
171 html_read_row (htmlNodePtr cur
, htmlDocPtr doc
, GnmHtmlTableCtxt
*tc
)
176 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
177 if (xmlStrEqual (ptr
->name
, CC2XML ("td")) ||
178 xmlStrEqual (ptr
->name
, CC2XML ("th"))) {
186 GSList
*hrefs
= NULL
;
187 GnmHLink
*lnk
= NULL
;
189 /* Check whether we need to skip merges from above */
192 while (gnm_sheet_merge_contains_pos (tc
->sheet
, &pos
)) {
197 /* Do we span across multiple rows or cols? */
198 props
= ptr
->properties
;
200 if (xmlStrEqual (props
->name
, CC2XML ("colspan")) && props
->children
)
201 colspan
= atoi (CXML2C (props
->children
->content
));
202 if (xmlStrEqual (props
->name
, CC2XML ("rowspan")) && props
->children
)
203 rowspan
= atoi (CXML2C (props
->children
->content
));
211 /* Let's figure out the content of the cell */
212 buf
= g_string_new (NULL
);
213 a_buf
= xmlBufferCreate ();
215 mstyle
= gnm_style_new_default ();
216 if (xmlStrEqual (ptr
->name
, CC2XML ("th")))
217 gnm_style_set_font_bold (mstyle
, TRUE
);
219 html_read_content (ptr
, buf
, mstyle
, a_buf
,
220 &hrefs
, TRUE
, doc
, tc
);
223 if (g_slist_length (hrefs
) >= 1 &&
225 /* One hyperlink, and text to make it
228 xmlBufferPtr h_buf
= xmlBufferCreate ();
230 hrefs
= g_slist_reverse (hrefs
);
232 h_buf
, doc
, (htmlNodePtr
)hrefs
->data
);
234 CXML2C (h_buf
->content
), h_buf
->use
);
235 if (strncmp (url
, "mailto:",
236 strlen ("mailto:")) == 0)
237 lnk
= gnm_hlink_new (
238 gnm_hlink_email_get_type (),
241 lnk
= gnm_hlink_new (
242 gnm_hlink_url_get_type (),
244 gnm_hlink_set_target (lnk
, url
);
245 gnm_style_set_hlink (mstyle
, lnk
);
246 gnm_style_set_font_uline (mstyle
,
248 gnm_style_set_font_color (mstyle
,
249 gnm_color_new_go (GO_COLOR_BLUE
));
251 xmlBufferFree (h_buf
);
253 if (g_slist_length (hrefs
) > 1 || buf
->len
<= 0) {
255 * or no text to give hyperlink style,
256 * so put them in a comment */
259 for (l
= hrefs
; l
!= NULL
; l
= l
->next
) {
260 htmlNodeDump (a_buf
, doc
,
261 (htmlNodePtr
)l
->data
);
262 xmlBufferAdd (a_buf
, CC2XML ("\n"),
266 g_slist_free (hrefs
);
268 GnmCell
*cell
= sheet_cell_fetch (tc
->sheet
, col
+ 1, tc
->row
);
269 sheet_style_set_pos (tc
->sheet
, col
+ 1, tc
->row
, mstyle
);
270 gnm_cell_set_text (cell
, buf
->str
);
272 gnm_style_unref (mstyle
);
274 if (a_buf
->use
> 0) {
277 name
= g_strndup (CXML2C (a_buf
->content
), a_buf
->use
);
278 cell_set_comment (tc
->sheet
, &pos
, NULL
, name
, NULL
);
281 g_string_free (buf
, TRUE
);
282 xmlBufferFree (a_buf
);
284 /* If necessary create the merge */
285 if (colspan
> 1 || rowspan
> 1) {
287 GnmRange
*r
= &range
;
289 range_init (r
, col
+ 1, tc
->row
, col
+ colspan
, tc
->row
+ rowspan
- 1);
290 gnm_sheet_merge_add (tc
->sheet
, r
, FALSE
, NULL
);
/*
 * html_read_rows:
 * @cur: node whose children are scanned for rows
 * @doc: document, passed on to html_read_row
 * @wb: workbook used when a sheet has to be created
 * @tc: table context
 *
 * Walks the children of @cur.  For every <tr> element a target sheet
 * is created on demand (html_get_sheet with a NULL name, only while
 * tc->sheet is still NULL) and the row is imported with html_read_row.
 *
 * NOTE(review): the statement executed for non-element children is not
 * visible in this excerpt; presumably they are skipped -- confirm.
 */
299 html_read_rows (htmlNodePtr cur
, htmlDocPtr doc
, Workbook
*wb
,
300 GnmHtmlTableCtxt
*tc
)
304 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
305 if (ptr
->type
!= XML_ELEMENT_NODE
)
307 if (xmlStrEqual (ptr
->name
, CC2XML ("tr"))) {
309 if (tc
->sheet
== NULL
)
310 tc
->sheet
= html_get_sheet (NULL
, wb
);
311 html_read_row (ptr
, doc
, tc
);
/*
 * html_read_table:
 * @cur: the table node; must not be NULL
 * @doc: document, needed by htmlNodeDump
 * @wb_view: workbook view; must not be NULL
 * @tc: table context that receives the target sheet
 *
 * Imports one table into the workbook of @wb_view.  The element
 * children of @cur are handled as follows:
 *  - <caption>: its children are dumped into a buffer, the buffer
 *    content is duplicated and used as the sheet name passed to
 *    html_get_sheet; the result becomes tc->sheet.
 *  - <thead>, <tfoot>, <tbody>: their rows are read with
 *    html_read_rows on that child.
 *  - <tr> directly below the table: html_read_rows is called on @cur
 *    itself (not on the child), i.e. all rows of the table are read in
 *    that single call.
 *
 * NOTE(review): the statement after the bare-<tr> call is not visible
 * in this excerpt; presumably the loop is left there, otherwise every
 * further <tr> would re-import all rows -- confirm.  Release of the
 * caption buffer and of the duplicated name is not visible either.
 */
317 html_read_table (htmlNodePtr cur
, htmlDocPtr doc
, WorkbookView
*wb_view
,
318 GnmHtmlTableCtxt
*tc
)
321 htmlNodePtr ptr
, ptr2
;
323 g_return_if_fail (cur
!= NULL
);
324 g_return_if_fail (wb_view
!= NULL
);
326 wb
= wb_view_get_workbook (wb_view
);
327 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
328 if (ptr
->type
!= XML_ELEMENT_NODE
)
330 if (xmlStrEqual (ptr
->name
, CC2XML ("caption"))) {
332 buf
= xmlBufferCreate ();
333 for (ptr2
= ptr
->children
; ptr2
!= NULL
; ptr2
= ptr2
->next
) {
334 htmlNodeDump (buf
, doc
, ptr2
);
338 name
= g_strndup (CXML2C (buf
->content
), buf
->use
);
339 tc
->sheet
= html_get_sheet (name
, wb
);
343 } else if (xmlStrEqual (ptr
->name
, CC2XML ("thead")) ||
344 xmlStrEqual (ptr
->name
, CC2XML ("tfoot")) ||
345 xmlStrEqual (ptr
->name
, CC2XML ("tbody"))) {
346 html_read_rows (ptr
, doc
, wb
, tc
);
347 } else if (xmlStrEqual (ptr
->name
, CC2XML ("tr"))) {
348 html_read_rows (cur
, doc
, wb
, tc
);
/* The three name tables below are passed to is_elt_type, which walks a
 * table until it reaches a NULL entry, so each initializer has to end
 * with a NULL sentinel.  NOTE(review): the initializer contents are not
 * visible in this excerpt. */
354 /* Element types which imply that we are inside a table */
355 static char const *table_start_elt_types
[] = {
366 /* Element types which imply that we are inside a row */
367 static char const *row_start_elt_types
[] = {
373 /* Element types which occur inside tables and rows, but also outside */
374 static char const *cont_elt_types
[] = {
/*
 * is_elt_type:
 * @ptr: node whose element name is tested
 * @types: NULL-terminated array of element names
 *
 * Compares ptr->name against each entry of @types with xmlStrEqual.
 * The result variable starts out FALSE.  NOTE(review): the body of the
 * match branch and the return statement are not visible in this
 * excerpt; presumably the result is set to TRUE on the first match and
 * returned -- confirm.
 */
381 is_elt_type (htmlNodePtr ptr
, char const ** types
)
384 gboolean ret
= FALSE
;
386 for (p
= types
; *p
; p
++)
387 if (xmlStrEqual (ptr
->name
, CC2XML ((*p
)))) {
/* starts_inferred_table: true when @ptr is an element node whose name
 * is listed in table_start_elt_types, i.e. an element that implies an
 * enclosing table. */
396 starts_inferred_table (htmlNodePtr ptr
)
398 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
399 is_elt_type (ptr
, table_start_elt_types
));
/* ends_inferred_table: true when @ptr is an element node that is listed
 * neither in table_start_elt_types nor in cont_elt_types.  Non-element
 * nodes never end an inferred table. */
403 ends_inferred_table (htmlNodePtr ptr
)
405 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
406 !(is_elt_type (ptr
, table_start_elt_types
) ||
407 is_elt_type (ptr
, cont_elt_types
)));
/* starts_inferred_row: true when @ptr is an element node whose name is
 * listed in row_start_elt_types, i.e. an element that implies an
 * enclosing row. */
411 starts_inferred_row (htmlNodePtr ptr
)
413 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
414 is_elt_type (ptr
, row_start_elt_types
));
/* ends_inferred_row: true when @ptr is an element node that is listed
 * neither in row_start_elt_types nor in cont_elt_types.  Non-element
 * nodes never end an inferred row. */
418 ends_inferred_row (htmlNodePtr ptr
)
420 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
421 !(is_elt_type (ptr
, row_start_elt_types
) ||
422 is_elt_type (ptr
, cont_elt_types
)));
426 * Handles incomplete html fragments as may occur on the clipboard,
427 * e.g. a <td> without <tr> and <table> in front of it.
/*
 * html_search_for_tables:
 * @cur: node at which the search starts
 * @doc: document, passed on to html_read_table
 * @wb_view: workbook view, passed on to html_read_table
 * @tc: table context
 *
 * Recursively looks for tables below @cur and imports them.
 *  - A <table> element is imported directly with html_read_table.
 *  - An element that implies a table or a row (starts_inferred_table /
 *    starts_inferred_row) gets a synthetic <table> node inserted as its
 *    previous sibling.  For a row-level element a synthetic <tr> is
 *    added below the new table and the following siblings are moved
 *    into it until ends_inferred_row matches; otherwise the following
 *    siblings are moved into the table itself until ends_inferred_table
 *    matches.  The synthetic table is then imported.
 *  - For any other element the children are searched recursively.
 *    Because a child may have been moved below a synthetic table, the
 *    loop checks ptr->parent against @cur before following ptr->next.
 *
 * The error message emitted through xmlGenericError names
 * htmlNodeDumpFormatOutput although it is issued from this function.
 *
 * NOTE(review): the guard in front of the xmlGenericError call, the
 * statements taken when an "ends" predicate matches, and the body of
 * the parent-fixup loop are not visible in this excerpt.
 */
430 html_search_for_tables (htmlNodePtr cur
, htmlDocPtr doc
,
431 WorkbookView
*wb_view
, GnmHtmlTableCtxt
*tc
)
436 xmlGenericError(xmlGenericErrorContext
,
437 "htmlNodeDumpFormatOutput : node == NULL\n");
441 if (cur
->type
!= XML_ELEMENT_NODE
)
444 if (xmlStrEqual (cur
->name
, CC2XML ("table"))) {
445 html_read_table (cur
, doc
, wb_view
, tc
);
446 } else if (starts_inferred_table (cur
) || starts_inferred_row (cur
)) {
447 htmlNodePtr tnode
= xmlNewNode (NULL
, "table");
449 /* Link in a table node */
450 xmlAddPrevSibling (cur
, tnode
);
451 if (starts_inferred_row (cur
)) {
452 htmlNodePtr rnode
= xmlNewNode (NULL
, "tr");
454 /* Link in a row node */
455 xmlAddChild (tnode
, rnode
);
456 /* Make following elements children of the row node,
457 * until we meet one which isn't legal in a row. */
458 while ((ptr
= tnode
->next
) != NULL
) {
459 if (ends_inferred_row (ptr
))
462 xmlAddChild (rnode
, ptr
);
465 /* Make following elements children of the table node,
466 * until we meet one which isn't legal in a table. */
467 while ((ptr
= tnode
->next
) != NULL
) {
468 if (ends_inferred_table (ptr
))
471 xmlAddChild (tnode
, ptr
);
473 html_read_table (tnode
, doc
, wb_view
, tc
);
475 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
476 html_search_for_tables (ptr
, doc
, wb_view
, tc
);
477 /* ptr may now have been pushed down in the tree,
478 * if so, ptr->next is not the right pointer to
480 while (ptr
->parent
!= cur
)
/*
 * html_file_open:
 * @fo: file opener (unused)
 * @io_context: receives the error if parsing fails
 * @wb_view: workbook view that receives the imported sheets
 * @input: stream to read; must not be NULL
 *
 * Parses @input as HTML with libxml2's push parser and imports every
 * table found.
 *
 * Steps visible here: seek to the start of the stream, query its size,
 * read the first 4 bytes and let xmlDetectCharEncoding classify them.
 * The switch over the detected encoding handles the byte order mark;
 * for XML_CHAR_ENCODING_NONE a heuristic on the first bytes selects
 * UTF-16LE for unmarked data.  The push parser is created with the
 * remaining 4 - bomlen initial bytes, the rest of the stream is fed in
 * chunks of at most 4096 bytes, a final zero-length chunk with the
 * terminate flag set finishes parsing, and the parser context is freed.
 * On success each top-level child of the document is handed to
 * html_search_for_tables; otherwise "Unable to parse the html." is
 * reported through @io_context.
 *
 * NOTE(review): several lines are not visible in this excerpt: the
 * guard around the initial 4-byte read (confirm inputs shorter than
 * 4 bytes are handled), the values assigned to bomlen, the adjustment
 * of size before the chunk loop, the check of each chunk read, and the
 * place where doc is taken from the parser context.
 */
487 html_file_open (G_GNUC_UNUSED GOFileOpener
const *fo
, GOIOContext
*io_context
,
488 WorkbookView
*wb_view
, GsfInput
*input
)
493 htmlParserCtxtPtr ctxt
;
494 htmlDocPtr doc
= NULL
;
498 g_return_if_fail (input
!= NULL
);
500 if (gsf_input_seek (input
, 0, G_SEEK_SET
))
503 size
= gsf_input_size (input
);
506 buf
= gsf_input_read (input
, 4, NULL
);
508 enc
= xmlDetectCharEncoding(buf
, 4);
509 switch (enc
) { /* Skip byte order mark */
510 case XML_CHAR_ENCODING_UCS4BE
:
511 case XML_CHAR_ENCODING_UCS4LE
:
512 case XML_CHAR_ENCODING_UCS4_2143
:
513 case XML_CHAR_ENCODING_UCS4_3412
:
514 case XML_CHAR_ENCODING_EBCDIC
:
517 case XML_CHAR_ENCODING_UTF16BE
:
518 case XML_CHAR_ENCODING_UTF16LE
:
521 case XML_CHAR_ENCODING_UTF8
:
524 else if (buf
[0] == 0x3c)
529 case XML_CHAR_ENCODING_NONE
:
531 /* Try to detect unmarked UTF16LE
532 (Firefox Windows clipboard, drag data all platforms) */
533 if ((buf
[0] >= 0x20 || g_ascii_isspace(buf
[0])) &&
535 (buf
[2] >= 0x20 || g_ascii_isspace(buf
[2])) &&
537 enc
= XML_CHAR_ENCODING_UTF16LE
;
542 ctxt
= htmlCreatePushParserCtxt (
543 NULL
, NULL
, (char const *)(buf
+ bomlen
),
544 4 - bomlen
, gsf_input_name (input
), enc
);
546 for (; size
> 0 ; size
-= len
) {
547 len
= MIN (4096, size
);
548 buf
= gsf_input_read (input
, len
, NULL
);
552 ctxt
, (char const *)buf
, len
, 0);
555 htmlParseChunk (ctxt
, (char const *)buf
, 0, 1);
557 htmlFreeParserCtxt (ctxt
);
565 tc
.wb_view
= wb_view
;
566 for (ptr
= doc
->children
; ptr
!= NULL
; ptr
= ptr
->next
)
567 html_search_for_tables (ptr
, doc
, wb_view
, &tc
);
570 go_io_error_info_set (io_context
,
571 go_error_info_new_str (_("Unable to parse the html.")));
/*
 * html_file_probe:
 * @fo: file opener (unused)
 * @input: stream to inspect
 * @pl: probe level (unused)
 *
 * Heuristic content probe.  Tries to read the first 200 bytes of
 * @input; a second read using the full stream size serves as the
 * fallback for short streams.  The bytes are converted with
 * go_guess_encoding, lower-cased with g_utf8_strdown, and the result
 * is TRUE when the text contains "<table", "<html" or
 * "<!doctype html".  The result variable starts out FALSE.
 *
 * NOTE(review): the conditions guarding the fallback read, the early
 * exits and the release of the lower-cased string are on lines not
 * visible in this excerpt.
 */
574 /* Quick and dirty html probe. */
576 html_file_probe (G_GNUC_UNUSED GOFileOpener
const *fo
, GsfInput
*input
,
577 G_GNUC_UNUSED GOFileProbeLevel pl
)
579 gsf_off_t size
= 200;
580 guint8
const* buf
= gsf_input_read (input
, size
, NULL
);
583 gboolean res
= FALSE
;
585 /* Avoid seeking in large streams - try to read, fall back if
586 * stream is too short. (Actually, currently _size does not
587 * involve any syscalls -- MW). */
589 size
= gsf_input_size (input
);
590 buf
= gsf_input_read (input
, size
, NULL
);
595 if (go_guess_encoding (buf
, size
, NULL
, &ustr
, NULL
)) {
596 ulstr
= g_utf8_strdown (ustr
->str
, -1);
597 g_string_free (ustr
, TRUE
);
603 res
= (strstr (ulstr
, "<table") != NULL
||
604 strstr (ulstr
, "<html") != NULL
||
605 strstr (ulstr
, "<!doctype html") != NULL
);