ssdiff: move comparison engine into its own file.
[gnumeric.git] / src / stf.c
blobf7ca89ca8d58ca627fdd4b305bc60cbaa53af725
1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * stf.c : Utilizes the stf-parse engine and the dialog-stf to provide a plug-in for
4 * importing text files with a structure (CSV/fixed width)
6 * Copyright (C) Almer. S. Tigelaar <almer@gnome.org>
7 * Copyright (C) 1999-2009 Morten Welinder (terra@gnome.org)
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, see <https://www.gnu.org/licenses/>.
23 #include <gnumeric-config.h>
24 #include <glib/gi18n-lib.h>
25 #include "gnumeric.h"
26 #include "stf.h"
27 #include "stf-export.h"
29 #include <goffice/goffice.h>
30 #include "cell.h"
31 #include "sheet.h"
32 #include "sheet-view.h"
33 #include "sheet-style.h"
34 #include "style.h"
35 #include "mstyle.h"
36 #include "command-context.h"
37 #include "wbc-gtk.h"
38 #include "workbook-view.h"
39 #include "workbook.h"
40 #include "dialog-stf.h"
41 #include "dialog-stf-export.h"
42 #include "position.h"
43 #include "expr.h"
44 #include "value.h"
45 #include "gnm-format.h"
46 #include "selection.h"
47 #include "ranges.h"
48 #include "clipboard.h"
49 #include "parse-util.h"
50 #include "commands.h"
51 #include "gui-util.h"
52 #include "gutils.h"
54 #include <gsf/gsf-input.h>
55 #include <string.h>
56 #include <gsf/gsf-output.h>
57 #include <gsf/gsf-output-memory.h>
58 #include <gsf/gsf-utils.h>
59 #include <locale.h>
62 static void
63 stf_warning (GOIOContext *context, char const *msg)
66 * Using go_cmd_context_error_import will destroy the
67 * successfully imported portion. We ought to have a
68 * way to issue a warning.
70 if (GNM_IS_WBC_GTK (context->impl))
71 go_gtk_notice_dialog
72 (wbcg_toplevel (WBC_GTK (context->impl)),
73 GTK_MESSAGE_WARNING,
74 "%s", msg);
75 else
76 g_warning ("%s", msg);
81 * stf_open_and_read:
82 * @filename: name of the file to open&read
84 * Will open filename, read the file into a g_alloced memory buffer
86 * NOTE : The returned buffer has to be g_freed by the calling routine.
88 * returns : a buffer containing the file contents
90 static char *
91 stf_open_and_read (G_GNUC_UNUSED GOIOContext *context, GsfInput *input, size_t *readsize)
93 gpointer result;
94 gulong allocsize;
95 gsf_off_t size = gsf_input_size (input);
97 if (gsf_input_seek (input, 0, G_SEEK_SET))
98 return NULL;
100 *readsize = (size_t) size;
101 if ((gsf_off_t) *readsize != size) /* Check for overflow */
102 return NULL;
103 size++;
104 allocsize = (gulong) size;
105 if ((gsf_off_t) allocsize != size) /* Check for overflow */
106 return NULL;
107 result = g_try_malloc (allocsize);
108 if (result == NULL)
109 return NULL;
111 *((char *)result + *readsize) = '\0';
113 if (*readsize > 0 && gsf_input_read (input, *readsize, result) == NULL) {
114 g_warning ("gsf_input_read failed.");
115 g_free (result);
116 result = NULL;
118 return result;
121 static char *
122 stf_preparse (GOIOContext *context, GsfInput *input, size_t *data_len)
124 char *data;
126 data = stf_open_and_read (context, input, data_len);
128 if (!data) {
129 if (context)
130 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
131 _("Error while trying to read file"));
132 return NULL;
135 return data;
138 static gboolean
139 stf_store_results (DialogStfResult_t *dialogresult,
140 Sheet *sheet, int start_col, int start_row)
142 return stf_parse_sheet (dialogresult->parseoptions,
143 dialogresult->text, NULL, sheet,
144 start_col, start_row);
147 static void
148 resize_columns (Sheet *sheet)
150 GnmRange r;
152 if (gnm_debug_flag ("stf"))
153 g_printerr ("Auto-fitting columns...\n");
155 /* If we have lots of rows, auto-fitting will take a very long
156 time. It is probably better to look at only, say, 1000 rows
157 of data. */
158 range_init_full_sheet (&r, sheet);
159 r.end.row = MIN (r.end.row, 1000);
161 colrow_autofit (sheet, &r, TRUE,
162 TRUE, /* Ignore strings */
163 TRUE, /* Don't shrink */
164 TRUE, /* Don't shrink */
165 NULL, NULL);
166 if (gnm_debug_flag ("stf"))
167 g_printerr ("Auto-fitting columns... done\n");
169 sheet_queue_respan (sheet, 0, gnm_sheet_get_last_row (sheet));
174 * stf_read_workbook:
175 * @fo: file opener
176 * @enc: encoding of file
177 * @context: command context
178 * @book: workbook
179 * @input: file to read from+convert
181 * Main routine, handles importing a file including all dialog mumbo-jumbo
183 static void
184 stf_read_workbook (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
185 GOIOContext *context, GoView *view, GsfInput *input)
187 DialogStfResult_t *dialogresult = NULL;
188 char *name, *nameutf8 = NULL;
189 char *data = NULL;
190 size_t data_len;
191 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
193 if (!GNM_IS_WBC_GTK (context->impl)) {
194 go_io_error_string (context, _("This importer can only be used with a GUI."));
195 return;
198 name = g_path_get_basename (gsf_input_name (input));
199 nameutf8 = g_filename_to_utf8 (name, -1, NULL, NULL, NULL);
200 g_free (name);
201 if (!nameutf8) {
202 g_warning ("Failed to convert filename to UTF-8. This shouldn't happen here.");
203 goto out;
206 data = stf_preparse (context, input, &data_len);
207 if (!data)
208 goto out;
210 dialogresult = stf_dialog (WBC_GTK (context->impl),
211 enc, FALSE, NULL, FALSE,
212 nameutf8, data, data_len);
213 if (dialogresult != NULL) {
214 Workbook *book = wb_view_get_workbook (wbv);
215 int cols = dialogresult->colcount, rows = dialogresult->rowcount;
216 Sheet *sheet;
218 gnm_sheet_suggest_size (&cols, &rows);
219 sheet = sheet_new (book, nameutf8, cols, rows);
220 workbook_sheet_attach (book, sheet);
221 if (stf_store_results (dialogresult, sheet, 0, 0)) {
222 workbook_recalc_all (book);
223 resize_columns (sheet);
224 workbook_set_saveinfo
225 (book,
226 GO_FILE_FL_WRITE_ONLY,
227 go_file_saver_for_id
228 ("Gnumeric_stf:stf_assistant"));
229 } else {
230 /* the user has cancelled */
231 /* the caller should notice that we have no sheets */
232 workbook_sheet_delete (sheet);
236 out:
237 g_free (nameutf8);
238 g_free (data);
239 if (dialogresult != NULL)
240 stf_dialog_result_free (dialogresult);
243 static GnmValue *
244 cb_get_content (GnmCellIter const *iter, GsfOutput *buf)
246 GnmCell *cell;
248 if (NULL != (cell = iter->cell)) {
249 char *tmp;
250 if (gnm_cell_has_expr (cell))
251 tmp = gnm_expr_top_as_string (cell->base.texpr,
252 &iter->pp, iter->pp.sheet->convs);
253 else if (VALUE_FMT (cell->value) != NULL)
254 tmp = format_value (NULL, cell->value, -1,
255 workbook_date_conv (iter->pp.wb));
256 else
257 tmp = value_get_as_string (cell->value);
259 gsf_output_write (buf, strlen (tmp), tmp);
260 g_free (tmp);
262 gsf_output_write (buf, 1, "\n");
264 return NULL;
268 * stf_text_to_columns:
269 * @wbc: The control making the request
270 * @cc:
272 * Main routine, handles importing a file including all dialog mumbo-jumbo
274 void
275 stf_text_to_columns (WorkbookControl *wbc, GOCmdContext *cc)
277 DialogStfResult_t *dialogresult = NULL;
278 SheetView *sv;
279 Sheet *src_sheet, *target_sheet;
280 GnmRange const *src;
281 GnmRange target;
282 GsfOutput *buf;
283 guint8 const *data;
284 size_t data_len;
286 sv = wb_control_cur_sheet_view (wbc);
287 src_sheet = sv_sheet (sv);
288 src = selection_first_range (sv, cc, _("Text to Columns"));
289 if (src == NULL)
290 return;
291 if (range_width (src) > 1) {
292 go_cmd_context_error (cc, g_error_new (go_error_invalid (), 0,
293 _("Only one column of input data can be parsed at a time")));
294 return;
297 /* FIXME : how to do this cleanly ? */
298 if (!GNM_IS_WBC_GTK (wbc))
299 return;
301 #warning Add UI for this
302 target_sheet = src_sheet;
303 target = *src;
304 range_translate (&target, target_sheet, 1, 0);
306 buf = gsf_output_memory_new ();
307 sheet_foreach_cell_in_range (src_sheet,
308 CELL_ITER_ALL,
309 src->start.col, src->start.row,
310 src->end.col, src->end.row,
311 (CellIterFunc) &cb_get_content, buf);
313 gsf_output_close (buf);
314 data = gsf_output_memory_get_bytes (GSF_OUTPUT_MEMORY (buf));
315 data_len = (size_t)gsf_output_size (buf);
316 if (data_len == 0) {
317 go_cmd_context_error_import (GO_CMD_CONTEXT (cc),
318 _("There is no data "
319 "to convert"));
320 } else {
321 dialogresult = stf_dialog (WBC_GTK (wbc),
322 NULL, FALSE, NULL, FALSE,
323 _("Text to Columns"),
324 data, data_len);
326 if (dialogresult != NULL) {
327 GnmCellRegion *cr = stf_parse_region (dialogresult->parseoptions,
328 dialogresult->text, NULL, target_sheet->workbook);
329 if (cr != NULL) {
330 stf_dialog_result_attach_formats_to_cr (dialogresult, cr);
331 target.end.col = target.start.col + cr->cols - 1;
332 target.end.row = target.start.row + cr->rows - 1;
334 if (cr == NULL ||
335 cmd_text_to_columns (wbc, src, src_sheet,
336 &target, target_sheet, cr))
337 go_cmd_context_error_import (GO_CMD_CONTEXT (cc),
338 _("Error while trying to "
339 "parse data into sheet"));
340 stf_dialog_result_free (dialogresult);
343 g_object_unref (buf);
346 static void
347 clear_stray_NULs (GOIOContext *context, GString *utf8data)
349 char *cpointer, *endpointer;
350 int null_chars = 0;
351 char const *valid_end;
353 cpointer = utf8data->str;
354 endpointer = utf8data->str + utf8data->len;
355 while (*cpointer != 0)
356 cpointer++;
357 while (cpointer != endpointer) {
358 null_chars++;
359 *cpointer = ' ';
360 while (*cpointer != 0)
361 cpointer++;
363 if (null_chars > 0) {
364 gchar const *format;
365 gchar *msg;
366 format = ngettext ("The file contains %d NULL character. "
367 "It has been changed to a space.",
368 "The file contains %d NULL characters. "
369 "They have been changed to spaces.",
370 null_chars);
371 msg = g_strdup_printf (format, null_chars);
372 stf_warning (context, msg);
373 g_free (msg);
376 if (!g_utf8_validate (utf8data->str, utf8data->len, &valid_end)) {
377 g_string_truncate (utf8data, valid_end - utf8data->str);
378 stf_warning (context, _("The file contains invalid UTF-8 encoded characters and has been truncated"));
383 * stf_read_workbook_auto_csvtab:
384 * @fo: file opener
385 * @enc: optional encoding
386 * @context: command context
387 * @book: workbook
388 * @input: file to read from+convert
390 * Attempt to auto-detect CSV or tab-delimited file
392 static void
393 stf_read_workbook_auto_csvtab (G_GNUC_UNUSED GOFileOpener const *fo, gchar const *enc,
394 GOIOContext *context,
395 GoView *view, GsfInput *input)
397 Sheet *sheet;
398 Workbook *book;
399 char *name;
400 char *data;
401 GString *utf8data;
402 size_t data_len;
403 StfParseOptions_t *po;
404 const char *gsfname;
405 int cols, rows, i;
406 GStringChunk *lines_chunk;
407 GPtrArray *lines;
408 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
410 g_return_if_fail (context != NULL);
411 g_return_if_fail (wbv != NULL);
413 book = wb_view_get_workbook (wbv);
415 data = stf_preparse (context, input, &data_len);
416 if (!data)
417 return;
419 enc = go_guess_encoding (data, data_len, enc, &utf8data, NULL);
420 g_free (data);
422 if (!enc) {
423 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
424 _("That file is not in the given encoding."));
425 return;
428 clear_stray_NULs (context, utf8data);
431 * Try to get the filename we're reading from. This is not a
432 * great way.
434 gsfname = gsf_input_name (input);
437 const char *ext = gsf_extension_pointer (gsfname);
438 gboolean iscsv = ext && strcasecmp (ext, "csv") == 0;
439 if (iscsv)
440 po = stf_parse_options_guess_csv (utf8data->str);
441 else
442 po = stf_parse_options_guess (utf8data->str);
445 lines_chunk = g_string_chunk_new (100 * 1024);
446 lines = stf_parse_general (po, lines_chunk,
447 utf8data->str, utf8data->str + utf8data->len);
448 rows = lines->len;
449 cols = 0;
450 for (i = 0; i < rows; i++) {
451 GPtrArray *line = g_ptr_array_index (lines, i);
452 cols = MAX (cols, (int)line->len);
454 gnm_sheet_suggest_size (&cols, &rows);
455 stf_parse_general_free (lines);
456 g_string_chunk_free (lines_chunk);
458 name = g_path_get_basename (gsfname);
459 sheet = sheet_new (book, name, cols, rows);
460 g_free (name);
461 workbook_sheet_attach (book, sheet);
463 if (stf_parse_sheet (po, utf8data->str, NULL, sheet, 0, 0)) {
464 gboolean is_csv;
465 workbook_recalc_all (book);
466 resize_columns (sheet);
467 if (po->cols_exceeded || po->rows_exceeded) {
468 stf_warning (context,
469 _("Some data did not fit on the "
470 "sheet and was dropped."));
472 is_csv = po->sep.chr && po->sep.chr[0] == ',';
473 workbook_set_saveinfo
474 (book,
475 GO_FILE_FL_WRITE_ONLY,
476 go_file_saver_for_id
477 (is_csv ? "Gnumeric_stf:stf_csv" : "Gnumeric_stf:stf_assistant"));
478 } else {
479 workbook_sheet_delete (sheet);
480 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
481 _("Parse error while trying to parse data into sheet"));
485 stf_parse_options_free (po);
486 g_string_free (utf8data, TRUE);
489 /***********************************************************************************/
491 static void
492 stf_write_csv (G_GNUC_UNUSED GOFileSaver const *fs, GOIOContext *context,
493 GoView const *view, GsfOutput *output)
495 Sheet *sheet;
496 GnmRangeRef const *range;
497 WorkbookView *wbv = GNM_WORKBOOK_VIEW (view);
499 GnmStfExport *config = g_object_new
500 (GNM_STF_EXPORT_TYPE,
501 "sink", output,
502 "quoting-triggers", ", \t\n\"",
503 NULL);
505 /* FIXME: this is crap in both branches of the "if". */
506 range = g_object_get_data (G_OBJECT (wb_view_get_workbook (wbv)), "ssconvert-range");
507 if (range && range->a.sheet)
508 sheet = range->a.sheet;
509 else
510 sheet = wb_view_cur_sheet (wbv);
512 gnm_stf_export_options_sheet_list_add (config, sheet);
514 if (gnm_stf_export (config) == FALSE)
515 go_cmd_context_error_import (GO_CMD_CONTEXT (context),
516 _("Error while trying to write CSV file"));
518 g_object_unref (config);
521 static gboolean
522 csv_tsv_probe (GOFileOpener const *fo, GsfInput *input, GOFileProbeLevel pl)
524 /* Rough and ready heuristic. If the first N bytes have no
525 * unprintable characters this may be text */
526 const gsf_off_t N = 512;
528 if (pl == GO_FILE_PROBE_CONTENT) {
529 guint8 const *header;
530 gsf_off_t i;
531 char const *enc = NULL;
532 GString *header_utf8;
533 char const *p;
534 gboolean ok = TRUE;
536 if (gsf_input_seek (input, 0, G_SEEK_SET))
537 return FALSE;
538 i = gsf_input_remaining (input);
540 /* If someone ships us an empty file, accept it only if
541 it has a proper name. */
542 if (i == 0)
543 return csv_tsv_probe (fo, input, GO_FILE_PROBE_FILE_NAME);
545 if (i > N) i = N;
546 if (NULL == (header = gsf_input_read (input, i, NULL)))
547 return FALSE;
549 enc = go_guess_encoding (header, i, NULL, &header_utf8, NULL);
550 if (!enc)
551 return FALSE;
553 for (p = header_utf8->str; *p; p = g_utf8_next_char (p)) {
554 gunichar uc = g_utf8_get_char (p);
555 /* isprint might not be true for these: */
556 if (uc == '\n' || uc == '\t' || uc == '\r')
557 continue;
558 /* Also, ignore a byte-order mark which may be used to
559 * indicate UTF-8; see
560 * http://en.wikipedia.org/wiki/Byte_Order_Mark for
561 * background.
563 if (p == header_utf8->str && uc == 0x0000FEFF) {
564 continue;
566 if (!g_unichar_isprint (uc)) {
567 ok = FALSE;
568 break;
572 g_string_free (header_utf8, TRUE);
573 return ok;
574 } else {
575 char const *name = gsf_input_name (input);
576 if (name == NULL)
577 return FALSE;
578 name = gsf_extension_pointer (name);
579 return (name != NULL &&
580 (g_ascii_strcasecmp (name, "csv") == 0 ||
581 g_ascii_strcasecmp (name, "tsv") == 0 ||
582 g_ascii_strcasecmp (name, "txt") == 0));
586 void
587 stf_init (void)
589 GSList *suffixes = go_slist_create (
590 g_strdup ("csv"),
591 g_strdup ("tsv"),
592 g_strdup ("txt"),
593 NULL);
594 GSList *mimes = go_slist_create (
595 g_strdup ("application/tab-separated-values"),
596 g_strdup ("text/comma-separated-values"),
597 g_strdup ("text/csv"),
598 g_strdup ("text/x-csv"),
599 g_strdup ("text/spreadsheet"),
600 g_strdup ("text/tab-separated-values"),
601 NULL);
602 GSList *mimes_txt = go_slist_create (
603 g_strdup ("text/plain"),
604 g_strdup ("text/csv"),
605 g_strdup ("text/x-csv"),
606 g_strdup ("text/comma-separated-values"),
607 g_strdup ("text/tab-separated-values"),
608 NULL);
609 GOFileSaver *saver;
610 GOFileOpener *opener;
612 opener = go_file_opener_new_with_enc (
613 "Gnumeric_stf:stf_csvtab",
614 _("Comma or tab separated values (CSV/TSV)"),
615 suffixes, mimes,
616 csv_tsv_probe, stf_read_workbook_auto_csvtab);
617 go_file_opener_register (opener, 0);
618 g_object_unref (opener);
620 opener = go_file_opener_new_with_enc (
621 "Gnumeric_stf:stf_assistant",
622 _("Text import (configurable)"),
623 NULL, mimes_txt,
624 NULL, stf_read_workbook);
625 g_object_set (G_OBJECT (opener), "interactive-only", TRUE, NULL);
626 go_file_opener_register (opener, 0);
627 g_object_unref (opener);
629 saver = gnm_stf_file_saver_create ("Gnumeric_stf:stf_assistant");
630 /* Unlike the opener, the saver doesn't require interaction. */
631 go_file_saver_register (saver);
632 g_object_unref (saver);
634 saver = go_file_saver_new (
635 "Gnumeric_stf:stf_csv", "csv",
636 _("Comma separated values (CSV)"),
637 GO_FILE_FL_MANUAL_REMEMBER, stf_write_csv);
638 go_file_saver_set_save_scope (saver, GO_FILE_SAVE_SHEET);
639 go_file_saver_register (saver);
640 g_object_unref (saver);
/*
 * stf_shutdown:
 *
 * Unregisters the two savers and then the two openers, each looked up
 * by its id.
 */
void
stf_shutdown (void)
{
	static char const * const saver_ids[] = {
		"Gnumeric_stf:stf_assistant",
		"Gnumeric_stf:stf_csv"
	};
	static char const * const opener_ids[] = {
		"Gnumeric_stf:stf_csvtab",
		"Gnumeric_stf:stf_assistant"
	};
	unsigned int i;

	for (i = 0; i < sizeof saver_ids / sizeof saver_ids[0]; i++)
		go_file_saver_unregister (go_file_saver_for_id (saver_ids[i]));

	for (i = 0; i < sizeof opener_ids / sizeof opener_ids[0]; i++)
		go_file_opener_unregister (go_file_opener_for_id (opener_ids[i]));
}