1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
5 * Copyright (C) 1999, 2000 Rasca, Berlin
7 * Copyright (c) 2001 Andreas J. Guelzow
8 * EMail: aguelzow@taliesin.ca
9 * Copyright (c) 2002 Jody Goldberg
10 * EMail: jody@gnome.org
13 * Almer S. Tigelaar <almer1@dds.nl>
14 * Andreas J. Guelzow <aguelzow@taliesin.ca>
16 * This program is free software; you can redistribute it and/or modify
17 * it under the terms of the GNU General Public License as published by
18 * the Free Software Foundation; either version 2 of the License, or
19 * (at your option) any later version.
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
26 * You should have received a copy of the GNU General Public License
27 * along with this program; if not, see <https://www.gnu.org/licenses/>.
30 #include <gnumeric-config.h>
31 #include <glib/gi18n-lib.h>
36 #include <sheet-object-cell-comment.h>
37 #include <workbook-view.h>
40 #include <sheet-merge.h>
41 #include <sheet-style.h>
43 #include <style-color.h>
47 #include <goffice/goffice.h>
49 #include <gsf/gsf-input.h>
50 #include <libxml/HTMLparser.h>
51 #include <libxml/HTMLtree.h>
/* Cast helpers between plain C strings and libxml2 xmlChar strings:
 * CC2XML and C2XML cast to const and non-const xmlChar pointers,
 * CXML2C and XML2C cast to const and non-const char pointers.
 * They only cast; no conversion or copying takes place. */
53 #define CC2XML(s) ((xmlChar const *)(s))
54 #define C2XML(s) ((xmlChar *)(s))
55 #define CXML2C(s) ((char const *)(s))
56 #define XML2C(s) ((char *)(s))
61 WorkbookView
*wb_view
;
/* Forward declaration: html_read_content calls html_read_table for
 * nested tables before html_read_table itself is defined. */
64 static void html_read_table (htmlNodePtr cur
, htmlDocPtr doc
,
65 WorkbookView
*wb_view
,
66 GnmHtmlTableCtxt
*tc
);
/* html_get_sheet: obtain a sheet in workbook @wb for an imported table.
 *
 * The visible calls either look up a sheet called @name
 * (workbook_sheet_by_name), create one with that name and attach it
 * (sheet_new followed by workbook_sheet_attach), or add a sheet without
 * an explicit name (workbook_sheet_add with position -1).  New sheets
 * use GNM_DEFAULT_COLS x GNM_DEFAULT_ROWS.  html_read_rows calls this
 * with a NULL @name.
 *
 * NOTE(review): the conditions choosing between these paths and the
 * return statement are not visible in this excerpt.  Presumably the
 * lookup/create pair runs when @name is non-NULL and the lookup fails,
 * and workbook_sheet_add runs when @name is NULL -- confirm. */
70 html_get_sheet (char const *name
, Workbook
*wb
)
75 sheet
= workbook_sheet_by_name (wb
, name
);
77 sheet
= sheet_new (wb
, name
, GNM_DEFAULT_COLS
, GNM_DEFAULT_ROWS
);
78 workbook_sheet_attach (wb
, sheet
);
81 sheet
= workbook_sheet_add (wb
, -1, GNM_DEFAULT_COLS
, GNM_DEFAULT_ROWS
);
/* html_append_text: append the words of the UTF-8 string @text to @buf.
 *
 * Leading whitespace is skipped one Unicode character at a time, then
 * @p is advanced to the next whitespace character (or the terminating
 * NUL) and the bytes in [text, p) are appended to @buf.  A single ' '
 * is appended as a separator.
 *
 * NOTE(review): the enclosing loop over successive words and the
 * condition guarding the separator are not visible in this excerpt;
 * presumably the separator is only added when @buf already holds
 * text -- confirm. */
86 html_append_text (GString
*buf
, const xmlChar
*text
)
/* Skip whitespace in front of the next word. */
91 while (g_unichar_isspace (g_utf8_get_char (text
)))
92 text
= g_utf8_next_char (text
);
95 *p
&& !g_unichar_isspace (g_utf8_get_char (p
));
96 p
= g_utf8_next_char (p
))
99 g_string_append_c (buf
, ' ');
/* p - text is the byte length of the word that was just scanned. */
100 g_string_append_len (buf
, text
, p
- text
);
/* html_read_content: collect the content of the children of @cur.
 *
 * @cur:    node whose children are scanned
 * @buf:    receives the visible text
 * @mstyle: style that is updated from formatting elements
 * @a_buf:  receives annotation text (image sources, nested table note)
 * @hrefs:  list that receives the child nodes of every href attribute
 * @first:  passed through unchanged to the call at the end of the loop
 * @doc:    document used when dumping nodes
 * @tc:     table context; its sheet is saved and restored around
 *          nested tables
 *
 * Text nodes are appended to @buf when they are valid UTF-8, otherwise
 * a translated warning is appended instead.  For element nodes:
 * i/em set italic, b sets bold, a records its href, img dumps its src
 * into @a_buf, and table is imported through html_read_table.
 *
 * NOTE(review): several lines are missing from this excerpt (local
 * declarations, the loops over the attribute list, the handling of
 * last_row, and the name of the function called at the end of the
 * loop body -- presumably a recursive call to html_read_content). */
107 html_read_content (htmlNodePtr cur
, GString
*buf
, GnmStyle
*mstyle
,
108 xmlBufferPtr a_buf
, GSList
**hrefs
, gboolean first
,
109 htmlDocPtr doc
, GnmHtmlTableCtxt
*tc
)
113 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
/* Plain text: only validated UTF-8 is passed on to the buffer. */
114 if (ptr
->type
== XML_TEXT_NODE
) {
115 if (g_utf8_validate (ptr
->content
, -1, NULL
))
116 html_append_text (buf
, ptr
->content
);
118 g_string_append (buf
, _("[Warning: Invalid text string has been removed.]"));
119 } else if (ptr
->type
== XML_ELEMENT_NODE
) {
/* Formatting elements change the style shared by the whole cell. */
121 if (xmlStrEqual (ptr
->name
, CC2XML ("i"))
122 || xmlStrEqual (ptr
->name
, CC2XML ("em")))
123 gnm_style_set_font_italic (mstyle
, TRUE
);
124 if (xmlStrEqual (ptr
->name
, CC2XML ("b")))
125 gnm_style_set_font_bold (mstyle
, TRUE
);
/* Anchors: the href attribute's child node is prepended to *hrefs.
 * The list stores the node pointer itself, not a copy. */
127 if (xmlStrEqual (ptr
->name
, CC2XML ("a"))) {
129 props
= ptr
->properties
;
131 if (xmlStrEqual (props
->name
, CC2XML ("href")) && props
->children
) {
132 *hrefs
= g_slist_prepend (
133 *hrefs
, props
->children
);
/* Images: the src attribute value is dumped into the annotation
 * buffer, followed by a newline. */
139 if (xmlStrEqual (ptr
->name
, CC2XML ("img"))) {
141 props
= ptr
->properties
;
143 if (xmlStrEqual (props
->name
, CC2XML ("src")) && props
->children
) {
144 htmlNodeDump (a_buf
, doc
, props
->children
);
145 xmlBufferAdd (a_buf
, CC2XML ("\n"), -1);
/* Nested table: remember the current sheet and row, import the inner
 * table, leave a pointer to its sheet in the text and a note in the
 * annotation buffer, then switch back to the outer sheet. */
150 if (xmlStrEqual (ptr
->name
, CC2XML ("table"))) {
151 Sheet
*last_sheet
= tc
->sheet
;
152 int last_row
= tc
->row
;
155 html_read_table (ptr
, doc
, tc
->wb_view
, tc
);
157 g_string_append_printf (buf
, _("[see sheet %s]"), tc
->sheet
->name_quoted
);
158 xmlBufferAdd (a_buf
, CC2XML (_("The original html file is\n"
159 "using nested tables.")), -1);
161 tc
->sheet
= last_sheet
;
/* NOTE(review): the callee name is on a line missing from this
 * excerpt; the argument list matches html_read_content's own
 * signature, so this looks like the recursive descent -- confirm. */
165 (ptr
, buf
, mstyle
, a_buf
, hrefs
, first
, doc
, tc
);
/* html_read_row: import the td/th children of the row node @cur into
 * tc->sheet at row tc->row.
 *
 * For every td or th child the visible code:
 *  - loops while the current position lies inside an existing merge
 *    (gnm_sheet_merge_contains_pos),
 *  - reads colspan and rowspan from the attribute list,
 *  - gathers text, style, annotations and hyperlinks with
 *    html_read_content (th cells start out bold),
 *  - turns a hyperlink into a GnmHLink on the style (e-mail type for
 *    targets starting with "mailto:", URL type otherwise), underlined
 *    and coloured with GO_COLOR_BLUE,
 *  - dumps the links into the annotation buffer when there is more
 *    than one link or no text,
 *  - stores style and text in cell (col + 1, tc->row),
 *  - attaches a cell comment when the annotation buffer is not empty,
 *  - creates a merge from (col + 1, tc->row) to
 *    (col + colspan, tc->row + rowspan - 1) when either span exceeds 1.
 *
 * NOTE(review): many lines are missing from this excerpt (local
 * declarations, the body of the merge-skipping loop, column advance,
 * parts of the hyperlink condition); the notes above cover only what
 * the visible calls show. */
172 html_read_row (htmlNodePtr cur
, htmlDocPtr doc
, GnmHtmlTableCtxt
*tc
)
177 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
178 if (xmlStrEqual (ptr
->name
, CC2XML ("td")) ||
179 xmlStrEqual (ptr
->name
, CC2XML ("th"))) {
187 GSList
*hrefs
= NULL
;
188 GnmHLink
*lnk
= NULL
;
190 /* Check whether we need to skip merges from above */
193 while (gnm_sheet_merge_contains_pos (tc
->sheet
, &pos
)) {
198 /* Do we span across multiple rows or cols? */
/* NOTE(review): atoi cannot report malformed attribute values and
 * returns 0 for non-numeric text; any clamping of colspan/rowspan
 * to a minimum of 1 is not visible in this excerpt -- confirm. */
199 props
= ptr
->properties
;
201 if (xmlStrEqual (props
->name
, CC2XML ("colspan")) && props
->children
)
202 colspan
= atoi (CXML2C (props
->children
->content
));
203 if (xmlStrEqual (props
->name
, CC2XML ("rowspan")) && props
->children
)
204 rowspan
= atoi (CXML2C (props
->children
->content
));
212 /* Let's figure out the content of the cell */
213 buf
= g_string_new (NULL
);
214 a_buf
= xmlBufferCreate ();
216 mstyle
= gnm_style_new_default ();
217 if (xmlStrEqual (ptr
->name
, CC2XML ("th")))
218 gnm_style_set_font_bold (mstyle
, TRUE
);
220 html_read_content (ptr
, buf
, mstyle
, a_buf
,
221 &hrefs
, TRUE
, doc
, tc
);
224 if (g_slist_length (hrefs
) >= 1 &&
226 /* One hyperlink, and text to make it
229 xmlBufferPtr h_buf
= xmlBufferCreate ();
231 hrefs
= g_slist_reverse (hrefs
);
233 h_buf
, doc
, (htmlNodePtr
)hrefs
->data
);
235 CXML2C (h_buf
->content
), h_buf
->use
);
236 if (strncmp (url
, "mailto:",
237 strlen ("mailto:")) == 0)
238 lnk
= gnm_hlink_new (
239 gnm_hlink_email_get_type (),
242 lnk
= gnm_hlink_new (
243 gnm_hlink_url_get_type (),
245 gnm_hlink_set_target (lnk
, url
);
246 gnm_style_set_hlink (mstyle
, lnk
);
247 gnm_style_set_font_uline (mstyle
,
249 gnm_style_set_font_color (mstyle
,
250 gnm_color_new_go (GO_COLOR_BLUE
));
252 xmlBufferFree (h_buf
);
254 if (g_slist_length (hrefs
) > 1 || buf
->len
<= 0) {
256 * or no text to give hyperlink style,
257 * so put them in a comment */
260 for (l
= hrefs
; l
!= NULL
; l
= l
->next
) {
261 htmlNodeDump (a_buf
, doc
,
262 (htmlNodePtr
)l
->data
);
263 xmlBufferAdd (a_buf
, CC2XML ("\n"),
/* Only the list cells are released; the nodes they point to were
 * taken from the document tree by html_read_content. */
267 g_slist_free (hrefs
);
269 GnmCell
*cell
= sheet_cell_fetch (tc
->sheet
, col
+ 1, tc
->row
);
270 sheet_style_set_pos (tc
->sheet
, col
+ 1, tc
->row
, mstyle
);
271 gnm_cell_set_text (cell
, buf
->str
);
273 gnm_style_unref (mstyle
);
275 if (a_buf
->use
> 0) {
/* NOTE(review): name is a fresh copy made by g_strndup; the matching
 * g_free is not visible in this excerpt -- confirm it is released
 * after cell_set_comment. */
278 name
= g_strndup (CXML2C (a_buf
->content
), a_buf
->use
);
279 cell_set_comment (tc
->sheet
, &pos
, NULL
, name
, NULL
);
282 g_string_free (buf
, TRUE
);
283 xmlBufferFree (a_buf
);
285 /* If necessary create the merge */
286 if (colspan
> 1 || rowspan
> 1) {
288 GnmRange
*r
= &range
;
290 range_init (r
, col
+ 1, tc
->row
, col
+ colspan
, tc
->row
+ rowspan
- 1);
291 gnm_sheet_merge_add (tc
->sheet
, r
, FALSE
, NULL
);
/* html_read_rows: import every tr element found among the children of
 * @cur (a table or a table section).
 *
 * Non-element children are filtered out by the type test.  When no
 * sheet has been selected yet (tc->sheet is NULL) one is obtained from
 * html_get_sheet with a NULL name before the row is handed to
 * html_read_row.
 *
 * NOTE(review): the statement following the type test and any update
 * of tc->row are on lines missing from this excerpt. */
300 html_read_rows (htmlNodePtr cur
, htmlDocPtr doc
, Workbook
*wb
,
301 GnmHtmlTableCtxt
*tc
)
305 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
306 if (ptr
->type
!= XML_ELEMENT_NODE
)
308 if (xmlStrEqual (ptr
->name
, CC2XML ("tr"))) {
/* Create the target sheet lazily, on the first row that is found. */
310 if (tc
->sheet
== NULL
)
311 tc
->sheet
= html_get_sheet (NULL
, wb
);
312 html_read_row (ptr
, doc
, tc
);
/* html_read_table: import the table node @cur into the workbook shown
 * by @wb_view, using @tc as the running table context.
 *
 * Returns early (g_return_if_fail) when @cur or @wb_view is NULL.
 * Element children are handled by name:
 *  - caption: its children are dumped into a buffer and the resulting
 *    text is used as the sheet name passed to html_get_sheet,
 *  - thead, tfoot, tbody: rows are read from that section,
 *  - tr: rows are read from @cur itself.
 *
 * NOTE(review): lines are missing from this excerpt, including local
 * declarations, any check that the caption buffer is non-empty, and
 * the release of the caption buffer and of name. */
318 html_read_table (htmlNodePtr cur
, htmlDocPtr doc
, WorkbookView
*wb_view
,
319 GnmHtmlTableCtxt
*tc
)
322 htmlNodePtr ptr
, ptr2
;
324 g_return_if_fail (cur
!= NULL
);
325 g_return_if_fail (wb_view
!= NULL
);
327 wb
= wb_view_get_workbook (wb_view
);
328 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
329 if (ptr
->type
!= XML_ELEMENT_NODE
)
331 if (xmlStrEqual (ptr
->name
, CC2XML ("caption"))) {
333 buf
= xmlBufferCreate ();
334 for (ptr2
= ptr
->children
; ptr2
!= NULL
; ptr2
= ptr2
->next
) {
335 htmlNodeDump (buf
, doc
, ptr2
);
339 name
= g_strndup (CXML2C (buf
->content
), buf
->use
);
340 tc
->sheet
= html_get_sheet (name
, wb
);
344 } else if (xmlStrEqual (ptr
->name
, CC2XML ("thead")) ||
345 xmlStrEqual (ptr
->name
, CC2XML ("tfoot")) ||
346 xmlStrEqual (ptr
->name
, CC2XML ("tbody"))) {
347 html_read_rows (ptr
, doc
, wb
, tc
);
348 } else if (xmlStrEqual (ptr
->name
, CC2XML ("tr"))) {
/* Passes cur, not ptr: html_read_rows scans all children of the
 * table, so one call covers every direct tr child.
 * NOTE(review): whether the loop stops after this call is not
 * visible here; without a break each further tr would trigger
 * another full pass -- confirm. */
349 html_read_rows (cur
, doc
, wb
, tc
);
/* Name lists consulted through is_elt_type by the starts_inferred_*
 * and ends_inferred_* predicates.  is_elt_type walks a list until it
 * reaches a NULL entry, so each list has to end with NULL.
 * NOTE(review): the initializer entries are not visible in this
 * excerpt. */
355 /* Element types which imply that we are inside a table */
356 static char const *table_start_elt_types
[] = {
367 /* Element types which imply that we are inside a row */
368 static char const *row_start_elt_types
[] = {
374 /* Element types which occur inside tables and rows, but also outside */
375 static char const *cont_elt_types
[] = {
/* is_elt_type: test whether the name of node @ptr appears in @types.
 *
 * @types is scanned until a NULL entry is reached; names are compared
 * with xmlStrEqual.  The result starts out FALSE.
 * NOTE(review): the body of the match branch and the return statement
 * are not visible in this excerpt; presumably ret becomes TRUE on the
 * first match -- confirm. */
382 is_elt_type (htmlNodePtr ptr
, char const ** types
)
385 gboolean ret
= FALSE
;
387 for (p
= types
; *p
; p
++)
388 if (xmlStrEqual (ptr
->name
, CC2XML ((*p
)))) {
/* starts_inferred_table: true when @ptr is an element node whose name
 * is listed in table_start_elt_types. */
397 starts_inferred_table (htmlNodePtr ptr
)
399 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
400 is_elt_type (ptr
, table_start_elt_types
));
/* ends_inferred_table: true when @ptr is an element node whose name is
 * in neither table_start_elt_types nor cont_elt_types.  Non-element
 * nodes never end an inferred table. */
404 ends_inferred_table (htmlNodePtr ptr
)
406 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
407 !(is_elt_type (ptr
, table_start_elt_types
) ||
408 is_elt_type (ptr
, cont_elt_types
)));
/* starts_inferred_row: true when @ptr is an element node whose name is
 * listed in row_start_elt_types. */
412 starts_inferred_row (htmlNodePtr ptr
)
414 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
415 is_elt_type (ptr
, row_start_elt_types
));
/* ends_inferred_row: true when @ptr is an element node whose name is
 * in neither row_start_elt_types nor cont_elt_types.  Non-element
 * nodes never end an inferred row. */
419 ends_inferred_row (htmlNodePtr ptr
)
421 return ((ptr
->type
== XML_ELEMENT_NODE
) &&
422 !(is_elt_type (ptr
, row_start_elt_types
) ||
423 is_elt_type (ptr
, cont_elt_types
)));
427 * Handles incomplete html fragments as may occur on the clipboard,
428 * e.g. a <td> without <tr> and <table> in front of it.
/* html_search_for_tables: walk the tree below @cur and import every
 * table that is found, including tables implied by stray table or row
 * content.
 *
 * - A table element is imported directly with html_read_table.
 * - An element that starts an inferred table or row gets a synthetic
 *   table node inserted in front of it; for row content a synthetic tr
 *   node is added below that.  Following siblings are then moved under
 *   the synthetic node until one is met that cannot belong there, and
 *   the synthetic table is imported.
 * - Any other element is searched recursively through its children.
 *
 * NOTE(review): the guard in front of the error report, the statements
 * after the type test and inside the two sibling loops, and the end of
 * the final loop are on lines missing from this excerpt. */
431 html_search_for_tables (htmlNodePtr cur
, htmlDocPtr doc
,
432 WorkbookView
*wb_view
, GnmHtmlTableCtxt
*tc
)
/* NOTE(review): the message names htmlNodeDumpFormatOutput, which is
 * not the function reporting the error; it looks copied from
 * elsewhere. */
437 xmlGenericError(xmlGenericErrorContext
,
438 "htmlNodeDumpFormatOutput : node == NULL\n");
442 if (cur
->type
!= XML_ELEMENT_NODE
)
445 if (xmlStrEqual (cur
->name
, CC2XML ("table"))) {
446 html_read_table (cur
, doc
, wb_view
, tc
);
447 } else if (starts_inferred_table (cur
) || starts_inferred_row (cur
)) {
/* NOTE(review): xmlNewNode is given plain string literals here and
 * for the row node below, while the rest of the file wraps literals
 * in CC2XML. */
448 htmlNodePtr tnode
= xmlNewNode (NULL
, "table");
450 /* Link in a table node */
451 xmlAddPrevSibling (cur
, tnode
);
452 if (starts_inferred_row (cur
)) {
453 htmlNodePtr rnode
= xmlNewNode (NULL
, "tr");
455 /* Link in a row node */
456 xmlAddChild (tnode
, rnode
);
457 /* Make following elements children of the row node,
458 * until we meet one which isn't legal in a row. */
459 while ((ptr
= tnode
->next
) != NULL
) {
460 if (ends_inferred_row (ptr
))
463 xmlAddChild (rnode
, ptr
);
466 /* Make following elements children of the table node,
467 * until we meet one which isn't legal in a table. */
468 while ((ptr
= tnode
->next
) != NULL
) {
469 if (ends_inferred_table (ptr
))
472 xmlAddChild (tnode
, ptr
);
474 html_read_table (tnode
, doc
, wb_view
, tc
);
476 for (ptr
= cur
->children
; ptr
!= NULL
; ptr
= ptr
->next
) {
477 html_search_for_tables (ptr
, doc
, wb_view
, tc
);
478 /* ptr may now have been pushed down in the tree,
479 * if so, ptr->next is not the right pointer to
481 while (ptr
->parent
!= cur
)
// html_file_open: file-opener entry point that parses @input as HTML
// and imports its tables into the workbook shown by @wb_view.
//
// @fo:         unused
// @io_context: receives the error when no document could be parsed
// @wb_view:    destination workbook view, stored in the table context
// @input:      stream to read; must not be NULL
//
// Visible steps: rewind the stream, take its size, read the first four
// bytes and let xmlDetectCharEncoding guess the encoding, adjust for a
// byte order mark, create a push parser from those first bytes, feed
// the rest in chunks of at most 4096 bytes, finish the parse, then
// walk the top-level nodes of the document with
// html_search_for_tables.  The translated message
// "Unable to parse the html." is reported through @io_context.
//
// NOTE(review): many lines are missing from this excerpt, including
// local declarations, the values assigned to bomlen in each case, the
// reduction of size by the bytes already read, NULL checks on the
// read buffers, the assignment of doc, and the release of doc.
488 html_file_open (G_GNUC_UNUSED GOFileOpener
const *fo
, GOIOContext
*io_context
,
489 WorkbookView
*wb_view
, GsfInput
*input
)
494 htmlParserCtxtPtr ctxt
;
495 htmlDocPtr doc
= NULL
;
499 g_return_if_fail (input
!= NULL
);
501 if (gsf_input_seek (input
, 0, G_SEEK_SET
))
504 size
= gsf_input_size (input
);
// The first four bytes are enough for xmlDetectCharEncoding.
507 buf
= gsf_input_read (input
, 4, NULL
);
509 enc
= xmlDetectCharEncoding(buf
, 4);
510 switch (enc
) { /* Skip byte order mark */
511 case XML_CHAR_ENCODING_UCS4BE
:
512 case XML_CHAR_ENCODING_UCS4LE
:
513 case XML_CHAR_ENCODING_UCS4_2143
:
514 case XML_CHAR_ENCODING_UCS4_3412
:
515 case XML_CHAR_ENCODING_EBCDIC
:
518 case XML_CHAR_ENCODING_UTF16BE
:
519 case XML_CHAR_ENCODING_UTF16LE
:
522 case XML_CHAR_ENCODING_UTF8
:
525 else if (buf
[0] == 0x3c)
530 case XML_CHAR_ENCODING_NONE
:
532 /* Try to detect unmarked UTF16LE
533 (Firefox Windows clipboard, drag data all platforms) */
534 if ((buf
[0] >= 0x20 || g_ascii_isspace(buf
[0])) &&
536 (buf
[2] >= 0x20 || g_ascii_isspace(buf
[2])) &&
538 enc
= XML_CHAR_ENCODING_UTF16LE
;
// Seed the push parser with what is left of the first four bytes once
// the byte order mark has been skipped.
543 ctxt
= htmlCreatePushParserCtxt (
544 NULL
, NULL
, (char const *)(buf
+ bomlen
),
545 4 - bomlen
, gsf_input_name (input
), enc
);
// Feed the remaining input in chunks of at most 4096 bytes.
547 for (; size
> 0 ; size
-= len
) {
548 len
= MIN (4096, size
);
549 buf
= gsf_input_read (input
, len
, NULL
);
553 ctxt
, (char const *)buf
, len
, 0);
// Zero-length chunk with the terminate flag set ends the parse.
556 htmlParseChunk (ctxt
, (char const *)buf
, 0, 1);
558 htmlFreeParserCtxt (ctxt
);
566 tc
.wb_view
= wb_view
;
567 for (ptr
= doc
->children
; ptr
!= NULL
; ptr
= ptr
->next
)
568 html_search_for_tables (ptr
, doc
, wb_view
, &tc
);
571 go_io_error_info_set (io_context
,
572 go_error_info_new_str (_("Unable to parse the html.")));
575 /* Quick and dirty html probe. */
/* html_file_probe: guess whether @input contains HTML.
 *
 * @fo and @pl are unused.  Up to 200 bytes are read from @input; the
 * fallback reads gsf_input_size bytes instead.  The bytes are
 * converted with go_guess_encoding, lower-cased with g_utf8_strdown,
 * and the result is TRUE when the text contains "<table", "<html" or
 * "<!doctype html".
 *
 * NOTE(review): the condition that triggers the fallback read, the
 * handling of a failed read or failed encoding guess, the release of
 * ulstr and the return statement are on lines missing from this
 * excerpt. */
577 html_file_probe (G_GNUC_UNUSED GOFileOpener
const *fo
, GsfInput
*input
,
578 G_GNUC_UNUSED GOFileProbeLevel pl
)
580 gsf_off_t size
= 200;
581 guint8
const* buf
= gsf_input_read (input
, size
, NULL
);
584 gboolean res
= FALSE
;
586 /* Avoid seeking in large streams - try to read, fall back if
587 * stream is too short. (Actually, currently _size does not
588 * involve any syscalls -- MW). */
590 size
= gsf_input_size (input
);
591 buf
= gsf_input_read (input
, size
, NULL
);
596 if (go_guess_encoding (buf
, size
, NULL
, &ustr
, NULL
)) {
/* Lower-case copy so the tag search below is case-insensitive; the
 * converted GString is released straight away. */
597 ulstr
= g_utf8_strdown (ustr
->str
, -1);
598 g_string_free (ustr
, TRUE
);
604 res
= (strstr (ulstr
, "<table") != NULL
||
605 strstr (ulstr
, "<html") != NULL
||
606 strstr (ulstr
, "<!doctype html") != NULL
);