stf: honour text formats.
[gnumeric.git] / src / stf-parse.c
blob81b13a9afb5858f8b8df53cf7b56de3704cbd3c2
1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
31 #include "gnumeric.h"
32 #include "stf-parse.h"
33 #include "stf-export.h"
35 #include "workbook.h"
36 #include "cell.h"
37 #include "sheet.h"
38 #include "expr.h"
39 #include "clipboard.h"
40 #include "sheet-style.h"
41 #include "value.h"
42 #include "mstyle.h"
43 #include "number-match.h"
44 #include "gutils.h"
45 #include "parse-util.h"
46 #include "number-match.h"
47 #include "gnm-format.h"
48 #include "ranges.h"
49 #include <goffice/goffice.h>
51 #include <stdlib.h>
52 #include <locale.h>
53 #include <string.h>
/*
 * Helpers for temporarily switching the locale around a parse.
 * START_LOCALE_SWITCH expects a variable named parseoptions in scope, and
 * both START_ and END_ expect SETUP_LOCALE_SWITCH to have been used first.
 */

/* Declares the variable that remembers the locale to restore. */
#define SETUP_LOCALE_SWITCH char *oldlocale = NULL

/* If the options name a locale, save the current one and switch to it. */
#define START_LOCALE_SWITCH						\
	if (parseoptions->locale) {					\
		oldlocale = g_strdup (go_setlocale (LC_ALL, NULL));	\
		go_setlocale (LC_ALL, parseoptions->locale);		\
	}

/* Restore and free the locale saved by START_LOCALE_SWITCH, if any. */
#define END_LOCALE_SWITCH						\
	if (oldlocale) {						\
		go_setlocale (LC_ALL, oldlocale);			\
		g_free (oldlocale);					\
	}
65 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
66 typedef struct {
67 GStringChunk *chunk;
68 char const *position; /* Indicates the current position within data */
70 /* Used internally for fixed width parsing */
71 int splitpos; /* Indicates current position in splitpositions array */
72 int linepos; /* Position on the current line */
73 } Source_t;
/*
 * AutoDiscovery_t: one run of spaces found on a line during column
 * autodiscovery.
 */
typedef struct {
	int start;	/* Line position of the first space of the run */
	int stop;	/* Line position of the first non-space after the run */
} AutoDiscovery_t;
82 * Some silly dude make the length field an unsigned int. C just does
83 * not deal very well with that.
85 static inline int
86 my_garray_len (GArray const *a)
88 return (int)a->len;
91 static char *
92 my_utf8_strchr (const char *p, gunichar uc)
94 return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
97 static int
98 compare_terminator (char const *s, StfParseOptions_t *parseoptions)
100 guchar const *us = (guchar const *)s;
101 GSList *l;
103 if (*us > parseoptions->compiled_terminator.max ||
104 *us < parseoptions->compiled_terminator.min)
105 return 0;
107 for (l = parseoptions->terminator; l; l = l->next) {
108 char const *term = l->data;
109 char const *d = s;
111 while (*term) {
112 if (*d != *term)
113 goto next;
114 term++;
115 d++;
117 return d - s;
119 next:
122 return 0;
126 /*******************************************************************************************************
127 * STF PARSE OPTIONS : StfParseOptions related
128 *******************************************************************************************************/
130 static void
131 gnm_g_string_free (GString *s)
133 if (s) g_string_free (s, TRUE);
138 * stf_parse_options_new:
140 * This will return a new StfParseOptions_t struct.
141 * The struct should, after being used, freed with stf_parse_options_free.
143 static StfParseOptions_t *
144 stf_parse_options_new (void)
146 StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
148 parseoptions->parsetype = PARSE_TYPE_NOTSET;
150 parseoptions->terminator = NULL;
151 stf_parse_options_add_line_terminator (parseoptions, "\r\n");
152 stf_parse_options_add_line_terminator (parseoptions, "\n");
153 stf_parse_options_add_line_terminator (parseoptions, "\r");
155 parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
156 parseoptions->locale = NULL;
158 parseoptions->splitpositions = NULL;
159 stf_parse_options_fixed_splitpositions_clear (parseoptions);
161 parseoptions->stringindicator = '"';
162 parseoptions->indicator_2x_is_single = TRUE;
163 parseoptions->sep.duplicates = FALSE;
164 parseoptions->trim_seps = FALSE;
166 parseoptions->sep.str = NULL;
167 parseoptions->sep.chr = NULL;
169 parseoptions->col_autofit_array = NULL;
170 parseoptions->col_import_array = NULL;
171 parseoptions->col_import_array_len = 0;
172 parseoptions->formats = g_ptr_array_new_with_free_func ((GDestroyNotify)go_format_unref);
173 parseoptions->formats_decimal = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
174 parseoptions->formats_thousand = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
175 parseoptions->formats_curr = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
177 parseoptions->cols_exceeded = FALSE;
178 parseoptions->rows_exceeded = FALSE;
179 parseoptions->ref_count = 1;
181 return parseoptions;
185 * stf_parse_options_free:
187 * will free @parseoptions, note that this will not free the splitpositions
188 * member (GArray) of the struct, the caller is responsible for that.
190 void
191 stf_parse_options_free (StfParseOptions_t *parseoptions)
193 g_return_if_fail (parseoptions != NULL);
195 if (parseoptions->ref_count-- > 1)
196 return;
198 g_free (parseoptions->col_import_array);
199 g_free (parseoptions->col_autofit_array);
200 g_free (parseoptions->locale);
201 g_free (parseoptions->sep.chr);
203 if (parseoptions->sep.str) {
204 GSList *l;
206 for (l = parseoptions->sep.str; l != NULL; l = l->next)
207 g_free ((char *) l->data);
208 g_slist_free (parseoptions->sep.str);
211 g_array_free (parseoptions->splitpositions, TRUE);
213 stf_parse_options_clear_line_terminator (parseoptions);
215 g_ptr_array_free (parseoptions->formats, TRUE);
216 g_ptr_array_free (parseoptions->formats_decimal, TRUE);
217 g_ptr_array_free (parseoptions->formats_thousand, TRUE);
218 g_ptr_array_free (parseoptions->formats_curr, TRUE);
220 g_free (parseoptions);
223 static StfParseOptions_t *
224 stf_parse_options_ref (StfParseOptions_t *parseoptions)
226 parseoptions->ref_count++;
227 return parseoptions;
230 GType
231 stf_parse_options_get_type (void)
233 static GType t = 0;
235 if (t == 0) {
236 t = g_boxed_type_register_static ("StfParseOptions_t",
237 (GBoxedCopyFunc)stf_parse_options_ref,
238 (GBoxedFreeFunc)stf_parse_options_free);
240 return t;
243 void
244 stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
246 g_return_if_fail (parseoptions != NULL);
247 g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);
249 parseoptions->parsetype = parsetype;
252 static gint
253 long_string_first (gchar const *a, gchar const *b)
255 /* This actually is UTF-8 safe. */
256 return strlen (b) - strlen (a);
259 static void
260 compile_terminators (StfParseOptions_t *parseoptions)
262 GSList *l;
263 GO_SLIST_SORT (parseoptions->terminator, (GCompareFunc)long_string_first);
265 parseoptions->compiled_terminator.min = 255;
266 parseoptions->compiled_terminator.max = 0;
267 for (l = parseoptions->terminator; l; l = l->next) {
268 const guchar *term = l->data;
269 parseoptions->compiled_terminator.min =
270 MIN (parseoptions->compiled_terminator.min, *term);
271 parseoptions->compiled_terminator.max =
272 MAX (parseoptions->compiled_terminator.max, *term);
277 * stf_parse_options_add_line_terminator:
279 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
280 * this indicates the end of a row.
283 void
284 stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
286 g_return_if_fail (parseoptions != NULL);
287 g_return_if_fail (terminator != NULL && *terminator != 0);
289 GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
290 compile_terminators (parseoptions);
294 * stf_parse_options_clear_line_terminator:
296 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
297 * this indicates the end of a row.
300 void
301 stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
303 g_return_if_fail (parseoptions != NULL);
305 g_slist_free_full (parseoptions->terminator, g_free);
306 parseoptions->terminator = NULL;
307 compile_terminators (parseoptions);
311 * stf_parse_options_set_trim_spaces:
313 * If enabled will trim spaces in every parsed field on left and/or right
314 * sides.
316 void
317 stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
319 g_return_if_fail (parseoptions != NULL);
321 parseoptions->trim_spaces = trim_spaces;
325 * stf_parse_options_csv_set_separators:
326 * @parseoptions: #StfParseOptions_t
327 * @character:
328 * @string: (element-type char):
330 * A copy is made of the parameters.
332 void
333 stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions, char const *character,
334 GSList const *string)
336 g_return_if_fail (parseoptions != NULL);
338 g_free (parseoptions->sep.chr);
339 parseoptions->sep.chr = g_strdup (character);
341 g_slist_free_full (parseoptions->sep.str, g_free);
342 parseoptions->sep.str = go_slist_map (string, (GOMapFunc)g_strdup);
345 void
346 stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
348 g_return_if_fail (parseoptions != NULL);
350 parseoptions->stringindicator = stringindicator;
354 * stf_parse_options_csv_set_indicator_2x_is_single:
355 * @indic_2x: a boolean value indicating whether we want to see two
356 * adjacent string indicators as a single string indicator
357 * that is part of the cell, rather than a terminator.
359 void
360 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
361 gboolean const indic_2x)
363 g_return_if_fail (parseoptions != NULL);
365 parseoptions->indicator_2x_is_single = indic_2x;
369 * stf_parse_options_csv_set_duplicates:
370 * @parseoptions:
371 * @duplicates: a boolean value indicating whether we want to see two
372 * separators right behind each other as one
374 void
375 stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
377 g_return_if_fail (parseoptions != NULL);
379 parseoptions->sep.duplicates = duplicates;
383 * stf_parse_options_csv_set_trim_seps:
384 * @trim_seps: a boolean value indicating whether we want to ignore
385 * separators at the beginning of lines
387 void
388 stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
390 g_return_if_fail (parseoptions != NULL);
392 parseoptions->trim_seps = trim_seps;
396 * stf_parse_options_fixed_splitpositions_clear:
398 * This will clear the splitpositions (== points on which a line is split)
400 void
401 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
403 int minus_one = -1;
404 g_return_if_fail (parseoptions != NULL);
406 if (parseoptions->splitpositions)
407 g_array_free (parseoptions->splitpositions, TRUE);
408 parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));
410 g_array_append_val (parseoptions->splitpositions, minus_one);
414 * stf_parse_options_fixed_splitpositions_add:
416 * @position will be added to the splitpositions.
418 void
419 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
421 unsigned int ui;
423 g_return_if_fail (parseoptions != NULL);
424 g_return_if_fail (position >= 0);
426 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
427 int here = g_array_index (parseoptions->splitpositions, int, ui);
428 if (position == here)
429 return;
430 if (position < here)
431 break;
434 g_array_insert_val (parseoptions->splitpositions, ui, position);
437 void
438 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
440 unsigned int ui;
442 g_return_if_fail (parseoptions != NULL);
443 g_return_if_fail (position >= 0);
445 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
446 int here = g_array_index (parseoptions->splitpositions, int, ui);
447 if (position == here)
448 g_array_remove_index (parseoptions->splitpositions, ui);
449 if (position <= here)
450 return;
455 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
457 return parseoptions->splitpositions->len;
461 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
463 return g_array_index (parseoptions->splitpositions, int, n);
468 * stf_parse_options_valid:
469 * @parseoptions: an import options struct
471 * Checks if @parseoptions is correctly filled
473 * returns : TRUE if it is correctly filled, FALSE otherwise.
475 static gboolean
476 stf_parse_options_valid (StfParseOptions_t *parseoptions)
478 g_return_val_if_fail (parseoptions != NULL, FALSE);
480 if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
481 if (!parseoptions->splitpositions) {
482 g_warning ("STF: No splitpositions in struct");
483 return FALSE;
487 return TRUE;
490 /*******************************************************************************************************
491 * STF PARSE : The actual routines that do the 'trick'
492 *******************************************************************************************************/
494 static void
495 trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
497 if (!field) return;
499 if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
500 char *s = field;
502 while (g_unichar_isspace (g_utf8_get_char (s)))
503 s = g_utf8_next_char (s);
505 if (s != field)
506 memmove (field, s, 1 + strlen (s));
509 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
510 char *s = field + strlen (field);
512 while (field != s) {
513 s = g_utf8_prev_char (s);
514 if (!g_unichar_isspace (g_utf8_get_char (s)))
515 break;
516 *s = 0;
522 * stf_parse_csv_is_separator:
524 * returns NULL if @character is not a separator, a pointer to the character
525 * after the separator otherwise.
527 static char const *
528 stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
530 g_return_val_if_fail (character != NULL, NULL);
532 if (*character == 0)
533 return NULL;
535 if (str) {
536 GSList const *l;
538 for (l = str; l != NULL; l = l->next) {
539 char const *s = l->data;
540 char const *r;
541 glong cnt;
542 glong const len = g_utf8_strlen (s, -1);
544 /* Don't compare past the end of the buffer! */
545 for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
546 if (*r == '\0')
547 break;
549 if ((cnt == len) && (memcmp (character, s, len) == 0))
550 return g_utf8_offset_to_pointer (character, len);
554 if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
555 return g_utf8_next_char(character);
557 return NULL;
561 * stf_parse_eat_separators:
563 * skip over leading separators
567 static void
568 stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
570 char const *cur, *next;
572 g_return_if_fail (src != NULL);
573 g_return_if_fail (parseoptions != NULL);
575 cur = src->position;
577 if (*cur == '\0' || compare_terminator (cur, parseoptions))
578 return;
579 while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
580 cur = next;
581 src->position = cur;
582 return;
/* Outcome of parsing a single CSV cell. */
typedef enum {
	STF_CELL_ERROR,		/* A precondition failed */
	STF_CELL_EOF,		/* End of data reached, no field parsed */
	STF_CELL_EOL,		/* Line terminator reached, no field parsed */
	STF_CELL_FIELD_NO_SEP,	/* Field parsed, no separator after it */
	STF_CELL_FIELD_SEP	/* Field parsed, a separator followed it */
} StfParseCellRes;
594 static StfParseCellRes
595 stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
597 char const *cur;
598 gboolean saw_sep = FALSE;
600 g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
601 g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
603 cur = src->position;
604 g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
606 /* Skip whitespace, but stop at line terminators. */
607 while (1) {
608 int term_len;
610 if (*cur == 0) {
611 src->position = cur;
612 return STF_CELL_EOF;
615 term_len = compare_terminator (cur, parseoptions);
616 if (term_len) {
617 src->position = cur + term_len;
618 return STF_CELL_EOL;
621 if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
622 break;
624 if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
625 parseoptions->sep.str))
626 break;
628 if (!g_unichar_isspace (g_utf8_get_char (cur)))
629 break;
630 cur = g_utf8_next_char (cur);
633 if (parseoptions->stringindicator != 0 &&
634 g_utf8_get_char (cur) == parseoptions->stringindicator) {
635 cur = g_utf8_next_char (cur);
636 while (*cur) {
637 gunichar uc = g_utf8_get_char (cur);
638 cur = g_utf8_next_char (cur);
640 if (uc == parseoptions->stringindicator) {
641 if (parseoptions->indicator_2x_is_single &&
642 g_utf8_get_char (cur) == parseoptions->stringindicator)
643 cur = g_utf8_next_char (cur);
644 else {
645 /* "field content"dropped-garbage, */
646 while (*cur && !compare_terminator (cur, parseoptions)) {
647 char const *post = stf_parse_csv_is_separator
648 (cur, parseoptions->sep.chr, parseoptions->sep.str);
649 if (post) {
650 cur = post;
651 saw_sep = TRUE;
652 break;
654 cur = g_utf8_next_char (cur);
656 break;
660 g_string_append_unichar (text, uc);
663 /* We silently allow a missing terminating quote. */
664 } else {
665 /* Unquoted field. */
667 while (*cur && !compare_terminator (cur, parseoptions)) {
669 char const *post = stf_parse_csv_is_separator
670 (cur, parseoptions->sep.chr, parseoptions->sep.str);
671 if (post) {
672 cur = post;
673 saw_sep = TRUE;
674 break;
677 g_string_append_unichar (text, g_utf8_get_char (cur));
678 cur = g_utf8_next_char (cur);
681 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
682 while (text->len) {
683 const char *last = g_utf8_prev_char (text->str + text->len);
684 if (!g_unichar_isspace (g_utf8_get_char (last)))
685 break;
686 g_string_truncate (text, last - text->str);
691 src->position = cur;
693 if (saw_sep && parseoptions->sep.duplicates)
694 stf_parse_eat_separators (src, parseoptions);
696 return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
700 * stf_parse_csv_line:
702 * This will parse one line from the current @src->position.
703 * NOTE: The calling routine is responsible for freeing the result.
705 * returns : a GPtrArray of char*'s
707 static GPtrArray *
708 stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
710 GPtrArray *line;
711 gboolean cont = FALSE;
712 GString *text;
714 g_return_val_if_fail (src != NULL, NULL);
715 g_return_val_if_fail (parseoptions != NULL, NULL);
717 line = g_ptr_array_new ();
718 if (parseoptions->trim_seps)
719 stf_parse_eat_separators (src, parseoptions);
721 text = g_string_sized_new (30);
723 while (1) {
724 char *ctext;
725 StfParseCellRes res =
726 stf_parse_csv_cell (text, src, parseoptions);
727 trim_spaces_inplace (text->str, parseoptions);
728 ctext = g_string_chunk_insert_len (src->chunk,
729 text->str, text->len);
730 g_string_truncate (text, 0);
732 switch (res) {
733 case STF_CELL_FIELD_NO_SEP:
734 g_ptr_array_add (line, ctext);
735 cont = FALSE;
736 break;
738 case STF_CELL_FIELD_SEP:
739 g_ptr_array_add (line, ctext);
740 cont = TRUE; /* Make sure we see one more field. */
741 break;
743 default:
744 if (cont)
745 g_ptr_array_add (line, ctext);
746 g_string_free (text, TRUE);
747 return line;
753 * stf_parse_fixed_cell:
755 * returns a pointer to the parsed cell contents.
757 static char *
758 stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
760 char *res;
761 char const *cur;
762 int splitval;
764 g_return_val_if_fail (src != NULL, NULL);
765 g_return_val_if_fail (parseoptions != NULL, NULL);
767 cur = src->position;
769 if (src->splitpos < my_garray_len (parseoptions->splitpositions))
770 splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
771 else
772 splitval = -1;
774 while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
775 src->linepos++;
776 cur = g_utf8_next_char (cur);
779 res = g_string_chunk_insert_len (src->chunk,
780 src->position,
781 cur - src->position);
783 src->position = cur;
785 return res;
789 * stf_parse_fixed_line:
791 * This will parse one line from the current @src->position.
792 * It will return a GPtrArray with the cell contents as strings.
794 * NOTE: The calling routine is responsible for freeing result.
796 static GPtrArray *
797 stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
799 GPtrArray *line;
801 g_return_val_if_fail (src != NULL, NULL);
802 g_return_val_if_fail (parseoptions != NULL, NULL);
804 src->linepos = 0;
805 src->splitpos = 0;
807 line = g_ptr_array_new ();
808 while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
809 char *field = stf_parse_fixed_cell (src, parseoptions);
811 trim_spaces_inplace (field, parseoptions);
812 g_ptr_array_add (line, field);
814 src->splitpos++;
817 while (line->len < parseoptions->splitpositions->len)
818 g_ptr_array_add (line, g_strdup (""));
820 return line;
824 * stf_parse_general_free: (skip)
826 void
827 stf_parse_general_free (GPtrArray *lines)
829 unsigned lineno;
830 for (lineno = 0; lineno < lines->len; lineno++) {
831 GPtrArray *line = g_ptr_array_index (lines, lineno);
832 /* Fields are not freed here. */
833 if (line)
834 g_ptr_array_free (line, TRUE);
836 g_ptr_array_free (lines, TRUE);
841 * stf_parse_general: (skip)
843 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
844 * GPtrArray of strings.
846 * The caller must free this entire structure, for example by calling
847 * stf_parse_general_free.
849 GPtrArray *
850 stf_parse_general (StfParseOptions_t *parseoptions,
851 GStringChunk *lines_chunk,
852 char const *data, char const *data_end)
854 GPtrArray *lines;
855 Source_t src;
856 int row;
857 char const *valid_end = data_end;
859 g_return_val_if_fail (parseoptions != NULL, NULL);
860 g_return_val_if_fail (data != NULL, NULL);
861 g_return_val_if_fail (data_end != NULL, NULL);
862 g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
863 g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);
865 src.chunk = lines_chunk;
866 src.position = data;
867 row = 0;
869 if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
870 /* Skip over byte-order mark */
871 src.position += 3;
874 lines = g_ptr_array_new ();
875 while (*src.position != '\0' && src.position < data_end) {
876 GPtrArray *line;
878 if (row == GNM_MAX_ROWS) {
879 parseoptions->rows_exceeded = TRUE;
880 break;
883 line = parseoptions->parsetype == PARSE_TYPE_CSV
884 ? stf_parse_csv_line (&src, parseoptions)
885 : stf_parse_fixed_line (&src, parseoptions);
887 g_ptr_array_add (lines, line);
888 if (parseoptions->parsetype != PARSE_TYPE_CSV)
889 src.position += compare_terminator (src.position, parseoptions);
890 row++;
893 return lines;
897 * stf_parse_lines: (skip)
898 * @parseoptions: #StfParseOptions_t
899 * @lines_chunk:
900 * @data:
901 * @maxlines:
902 * @with_lineno:
904 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
905 * GPtrArray of strings.
907 * The caller must free this entire structure, for example by calling
908 * stf_parse_general_free.
910 GPtrArray *
911 stf_parse_lines (StfParseOptions_t *parseoptions,
912 GStringChunk *lines_chunk,
913 char const *data,
914 int maxlines, gboolean with_lineno)
916 GPtrArray *lines;
917 int lineno = 1;
919 g_return_val_if_fail (data != NULL, NULL);
921 lines = g_ptr_array_new ();
922 while (*data) {
923 char const *data0 = data;
924 GPtrArray *line = g_ptr_array_new ();
926 if (with_lineno) {
927 char buf[4 * sizeof (int)];
928 sprintf (buf, "%d", lineno);
929 g_ptr_array_add (line,
930 g_string_chunk_insert (lines_chunk, buf));
933 while (1) {
934 int termlen = compare_terminator (data, parseoptions);
935 if (termlen > 0 || *data == 0) {
936 g_ptr_array_add (line,
937 g_string_chunk_insert_len (lines_chunk,
938 data0,
939 data - data0));
940 data += termlen;
941 break;
942 } else
943 data = g_utf8_next_char (data);
946 g_ptr_array_add (lines, line);
948 lineno++;
949 if (lineno >= maxlines)
950 break;
952 return lines;
955 char const *
956 stf_parse_find_line (StfParseOptions_t *parseoptions,
957 char const *data,
958 int line)
960 while (line > 0) {
961 int termlen = compare_terminator (data, parseoptions);
962 if (termlen > 0) {
963 data += termlen;
964 line--;
965 } else if (*data == 0) {
966 return data;
967 } else {
968 data = g_utf8_next_char (data);
971 return data;
976 * stf_parse_options_fixed_autodiscover:
977 * @parseoptions: a Parse options struct.
978 * @data: The actual data.
979 * @data_end: data end.
981 * Automatically try to discover columns in the text to be parsed.
982 * We ignore empty lines (only containing parseoptions->terminator)
984 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
985 * Think hard of a better more flexible solution...
987 void
988 stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
989 char const *data, char const *data_end)
991 char const *iterator = data;
992 GSList *list = NULL;
993 GSList *list_start = NULL;
994 int lines = 0;
995 int effective_lines = 0;
996 int max_line_length = 0;
997 int *line_begin_hits = NULL;
998 int *line_end_hits = NULL;
999 int i;
1001 stf_parse_options_fixed_splitpositions_clear (parseoptions);
1004 * First take a look at all possible white space combinations
1006 while (*iterator && iterator < data_end) {
1007 gboolean begin_recorded = FALSE;
1008 AutoDiscovery_t *disc = NULL;
1009 int position = 0;
1010 int termlen = 0;
1012 while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
1013 if (!begin_recorded && *iterator == ' ') {
1014 disc = g_new0 (AutoDiscovery_t, 1);
1016 disc->start = position;
1018 begin_recorded = TRUE;
1019 } else if (begin_recorded && *iterator != ' ') {
1020 disc->stop = position;
1021 list = g_slist_prepend (list, disc);
1023 begin_recorded = FALSE;
1024 disc = NULL;
1027 position++;
1028 iterator++;
1031 if (position > max_line_length)
1032 max_line_length = position;
1035 * If there are excess spaces at the end of
1036 * the line : ignore them
1038 g_free (disc);
1041 * Hop over the terminator
1043 iterator += termlen;
1045 if (position != 0)
1046 effective_lines++;
1048 lines++;
1051 list = g_slist_reverse (list);
1052 list_start = list;
1055 * Kewl stuff :
1056 * Look at the number of hits at each line position
1057 * if the number of hits equals the number of lines
1058 * we can be pretty sure this is the start or end
1059 * of a column, we filter out empty columns
1060 * later
1062 line_begin_hits = g_new0 (int, max_line_length + 1);
1063 line_end_hits = g_new0 (int, max_line_length + 1);
1065 while (list) {
1066 AutoDiscovery_t *disc = list->data;
1068 line_begin_hits[disc->start]++;
1069 line_end_hits[disc->stop]++;
1071 g_free (disc);
1073 list = g_slist_next (list);
1075 g_slist_free (list_start);
1077 for (i = 0; i < max_line_length + 1; i++)
1078 if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
1079 stf_parse_options_fixed_splitpositions_add (parseoptions, i);
1082 * Do some corrections to the initial columns
1083 * detected here, we obviously don't need to
1084 * do this if there are no columns at all.
1086 if (my_garray_len (parseoptions->splitpositions) > 0) {
1088 * Try to find columns that look like :
1090 * Example 100
1091 * Example2 9
1093 * (In other words : Columns with left & right justification with
1094 * a minimum of 2 spaces in the middle)
1095 * Split these columns in 2
1098 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1099 int begin = g_array_index (parseoptions->splitpositions, int, i);
1100 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1101 int num_spaces = -1;
1102 int spaces_start = 0;
1103 gboolean right_aligned = TRUE;
1104 gboolean left_aligned = TRUE;
1105 gboolean has_2_spaces = TRUE;
1107 iterator = data;
1108 lines = 0;
1109 while (*iterator && iterator < data_end) {
1110 gboolean trigger = FALSE;
1111 gboolean space_trigger = FALSE;
1112 int pos = 0;
1114 num_spaces = -1;
1115 spaces_start = 0;
1116 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1117 if (pos == begin) {
1118 if (*iterator == ' ')
1119 left_aligned = FALSE;
1121 trigger = TRUE;
1122 } else if (pos == end - 1) {
1123 if (*iterator == ' ')
1124 right_aligned = FALSE;
1126 trigger = FALSE;
1129 if (trigger || pos == end - 1) {
1130 if (!space_trigger && *iterator == ' ') {
1131 space_trigger = TRUE;
1132 spaces_start = pos;
1133 } else if (space_trigger && *iterator != ' ') {
1134 space_trigger = FALSE;
1135 num_spaces = pos - spaces_start;
1139 iterator++;
1140 pos++;
1143 if (num_spaces < 2)
1144 has_2_spaces = FALSE;
1146 if (*iterator)
1147 iterator++;
1149 lines++;
1153 * If this column meets all the criteria
1154 * split it into two at the last measured
1155 * spaces_start + num_spaces
1157 if (has_2_spaces && right_aligned && left_aligned) {
1158 int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
1160 g_array_insert_val (parseoptions->splitpositions, i + 1, val);
1163 * Skip over the inserted column
1165 i++;
1170 * Remove empty columns here if needed
1172 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1173 int begin = g_array_index (parseoptions->splitpositions, int, i);
1174 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1175 gboolean only_spaces = TRUE;
1177 iterator = data;
1178 lines = 0;
1179 while (*iterator && iterator < data_end) {
1180 gboolean trigger = FALSE;
1181 int pos = 0;
1183 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1184 if (pos == begin)
1185 trigger = TRUE;
1186 else if (pos == end)
1187 trigger = FALSE;
1189 if (trigger) {
1190 if (*iterator != ' ')
1191 only_spaces = FALSE;
1194 iterator++;
1195 pos++;
1198 if (*iterator)
1199 iterator++;
1201 lines++;
1205 * The column only contains spaces
1206 * remove it
1208 if (only_spaces) {
1209 g_array_remove_index (parseoptions->splitpositions, i);
1212 * We HAVE to make sure that the next column (end) also
1213 * gets checked out. If we don't decrease "i" here, we
1214 * will skip over it as the indexes shift down after
1215 * the removal
1217 i--;
1222 g_free (line_begin_hits);
1223 g_free (line_end_hits);
1226 /*******************************************************************************************************
1227 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1228 * functions into something meaningful (== application specific)
1229 *******************************************************************************************************/
1232 * This is more or less as gnm_cell_set_text, except...
1233 * 1. Unknown names are not allowed.
1234 * 2. Only '=' can start an expression.
1237 static void
1238 stf_cell_set_text (GnmCell *cell, char const *text)
1240 GnmExprTop const *texpr;
1241 GnmValue *val;
1242 GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
1243 const GODateConventions *date_conv =
1244 workbook_date_conv (cell->base.sheet->workbook);
1246 if (!go_format_is_text (fmt) && *text == '=' && text[1] != 0) {
1247 GnmExprParseFlags flags =
1248 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
1249 const char *expr_start = text + 1;
1250 GnmParsePos pos;
1251 val = NULL;
1252 parse_pos_init_cell (&pos, cell);
1253 texpr = gnm_expr_parse_str (expr_start, &pos, flags,
1254 NULL, NULL);
1255 } else {
1256 texpr = NULL;
1257 val = format_match (text, fmt, date_conv);
1260 if (!val && !texpr)
1261 val = value_new_string (text);
1263 if (val)
1264 gnm_cell_set_value (cell, val);
1265 else {
1266 gnm_cell_set_expr (cell, texpr);
1267 gnm_expr_top_unref (texpr);
1271 static void
1272 stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
1274 if (po->parsetype == PARSE_TYPE_CSV) {
1275 GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
1276 char quote[6];
1277 int length = g_unichar_to_utf8 (po->stringindicator, quote);
1278 if (length > 5) {
1279 quote[0] = '"';
1280 quote[1] = '\0';
1281 } else quote[length] = '\0';
1283 g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);
1285 if ((po->terminator != NULL) && (po->terminator->data != NULL))
1286 g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
1290 gboolean
1291 stf_parse_sheet (StfParseOptions_t *parseoptions,
1292 char const *data, char const *data_end,
1293 Sheet *sheet, int start_col, int start_row)
1295 int row;
1296 unsigned int lrow;
1297 GStringChunk *lines_chunk;
1298 GPtrArray *lines;
1299 gboolean result = TRUE;
1300 int col;
1301 unsigned int lcol;
1303 SETUP_LOCALE_SWITCH;
1305 g_return_val_if_fail (parseoptions != NULL, FALSE);
1306 g_return_val_if_fail (data != NULL, FALSE);
1307 g_return_val_if_fail (IS_SHEET (sheet), FALSE);
1309 if (!data_end)
1310 data_end = data + strlen (data);
1312 lines_chunk = g_string_chunk_new (100 * 1024);
1313 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1314 if (lines == NULL)
1315 result = FALSE;
1317 col = start_col;
1318 for (lcol = 0; lcol < parseoptions->formats->len; lcol++) {
1319 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1320 GnmStyle *mstyle;
1321 gboolean want_col =
1322 (parseoptions->col_import_array == NULL ||
1323 parseoptions->col_import_array_len <= lcol ||
1324 parseoptions->col_import_array[lcol]);
1325 if (!want_col || col >= gnm_sheet_get_max_cols (sheet))
1326 continue;
1328 if (fmt && !go_format_is_general (fmt)) {
1329 GnmRange r;
1330 int end_row = MIN (start_row + (int)lines->len - 1,
1331 gnm_sheet_get_last_row (sheet));
1333 range_init (&r, col, start_row, col, end_row);
1334 mstyle = gnm_style_new ();
1335 gnm_style_set_format (mstyle, fmt);
1336 sheet_apply_style (sheet, &r, mstyle);
1338 col++;
1341 START_LOCALE_SWITCH;
1342 for (row = start_row, lrow = 0;
1343 result && lrow < lines->len;
1344 row++, lrow++) {
1345 GPtrArray *line;
1347 if (row >= gnm_sheet_get_max_rows (sheet)) {
1348 if (!parseoptions->rows_exceeded) {
1349 /* FIXME: What locale? */
1350 g_warning (_("There are more rows of data than "
1351 "there is room for in the sheet. Extra "
1352 "rows will be ignored."));
1353 parseoptions->rows_exceeded = TRUE;
1355 break;
1358 col = start_col;
1359 line = g_ptr_array_index (lines, lrow);
1361 for (lcol = 0; lcol < line->len; lcol++) {
1362 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1363 char const *text = g_ptr_array_index (line, lcol);
1364 gboolean want_col =
1365 (parseoptions->col_import_array == NULL ||
1366 parseoptions->col_import_array_len <= lcol ||
1367 parseoptions->col_import_array[lcol]);
1368 if (!want_col)
1369 continue;
1371 if (col >= gnm_sheet_get_max_cols (sheet)) {
1372 if (!parseoptions->cols_exceeded) {
1373 /* FIXME: What locale? */
1374 g_warning (_("There are more columns of data than "
1375 "there is room for in the sheet. Extra "
1376 "columns will be ignored."));
1377 parseoptions->cols_exceeded = TRUE;
1379 break;
1381 if (text && *text) {
1382 GnmCell *cell = sheet_cell_fetch (sheet, col, row);
1383 if (!go_format_is_text (fmt) &&
1384 lcol < parseoptions->formats_decimal->len &&
1385 g_ptr_array_index (parseoptions->formats_decimal, lcol)) {
1386 GOFormatFamily fam;
1387 GnmValue *v = format_match_decimal_number_with_locale
1388 (text, &fam,
1389 g_ptr_array_index (parseoptions->formats_curr, lcol),
1390 g_ptr_array_index (parseoptions->formats_thousand, lcol),
1391 g_ptr_array_index (parseoptions->formats_decimal, lcol));
1392 if (!v)
1393 v = value_new_string (text);
1394 sheet_cell_set_value (cell, v);
1395 } else {
1397 stf_cell_set_text (cell, text);
1400 col++;
1403 g_ptr_array_index (lines, lrow) = NULL;
1404 g_ptr_array_free (line, TRUE);
1406 END_LOCALE_SWITCH;
1408 for (lcol = 0, col = start_col;
1409 lcol < parseoptions->col_import_array_len && col < gnm_sheet_get_max_cols (sheet);
1410 lcol++) {
1411 if (parseoptions->col_import_array == NULL ||
1412 parseoptions->col_import_array_len <= lcol ||
1413 parseoptions->col_import_array[lcol]) {
1414 if (parseoptions->col_autofit_array == NULL ||
1415 parseoptions->col_autofit_array[lcol]) {
1416 ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
1417 ColRowStateGroup *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
1418 colrow_index_list_destroy (list);
1419 g_slist_free (state);
1421 col++;
1425 g_string_chunk_free (lines_chunk);
1426 if (lines)
1427 stf_parse_general_free (lines);
1428 if (result)
1429 stf_read_remember_settings (sheet->workbook, parseoptions);
1430 return result;
1433 GnmCellRegion *
1434 stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
1435 Workbook const *wb)
1437 static GODateConventions const default_conv = {FALSE};
1438 GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;
1440 GnmCellRegion *cr;
1441 unsigned int row, colhigh = 0;
1442 GStringChunk *lines_chunk;
1443 GPtrArray *lines;
1444 size_t nformats;
1446 SETUP_LOCALE_SWITCH;
1448 g_return_val_if_fail (parseoptions != NULL, NULL);
1449 g_return_val_if_fail (data != NULL, NULL);
1451 START_LOCALE_SWITCH;
1453 cr = gnm_cell_region_new (NULL);
1455 if (!data_end)
1456 data_end = data + strlen (data);
1457 lines_chunk = g_string_chunk_new (100 * 1024);
1458 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1459 nformats = parseoptions->formats->len;
1460 for (row = 0; row < lines->len; row++) {
1461 GPtrArray *line = g_ptr_array_index (lines, row);
1462 unsigned int col, targetcol = 0;
1463 for (col = 0; col < line->len; col++) {
1464 if (parseoptions->col_import_array == NULL ||
1465 parseoptions->col_import_array_len <= col ||
1466 parseoptions->col_import_array[col]) {
1467 const char *text = g_ptr_array_index (line, col);
1468 if (text) {
1469 GOFormat *fmt = NULL;
1470 GnmValue *v;
1471 GnmCellCopy *cc;
1473 if (col < nformats)
1474 fmt = g_ptr_array_index (parseoptions->formats, col);
1475 v = format_match (text, fmt, date_conv);
1476 if (!v)
1477 v = value_new_string (text);
1479 cc = gnm_cell_copy_new (cr, targetcol, row);
1480 cc->val = v;
1481 cc->texpr = NULL;
1482 targetcol++;
1483 if (targetcol > colhigh)
1484 colhigh = targetcol;
1489 stf_parse_general_free (lines);
1490 g_string_chunk_free (lines_chunk);
1492 END_LOCALE_SWITCH;
1494 cr->cols = (colhigh > 0) ? colhigh : 1;
1495 cr->rows = row;
1497 return cr;
/*
 * int_sort:
 * @a: pointer to the first int.
 * @b: pointer to the second int.
 *
 * qsort-style comparator for ints in ascending order.  The result is
 * computed from comparisons rather than a subtraction, so it cannot
 * overflow whatever the operands are.
 *
 * Returns: -1, 0 or 1 when *@a is less than, equal to or greater than *@b.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1506 static int
1507 count_character (GPtrArray *lines, gunichar c, double quantile)
1509 int *counts, res;
1510 unsigned int lno, cno;
1512 if (lines->len == 0)
1513 return 0;
1515 counts = g_new (int, lines->len);
1516 for (lno = cno = 0; lno < lines->len; lno++) {
1517 int count = 0;
1518 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1519 char const *line = g_ptr_array_index (boxline, 0);
1521 /* Ignore empty lines. */
1522 if (*line == 0)
1523 continue;
1525 while (*line) {
1526 if (g_utf8_get_char (line) == c)
1527 count++;
1528 line = g_utf8_next_char (line);
1531 counts[cno++] = count;
1534 if (cno == 0)
1535 res = 0;
1536 else {
1537 unsigned int qi = (unsigned int)ceil (quantile * cno);
1538 qsort (counts, cno, sizeof (counts[0]), int_sort);
1539 if (qi == cno)
1540 qi--;
1541 res = counts[qi];
1544 g_free (counts);
1546 return res;
1549 static void
1550 dump_guessed_options (const StfParseOptions_t *res)
1552 GSList *l;
1553 char ubuffer[6 + 1];
1554 unsigned ui;
1556 g_printerr ("Guessed format:\n");
1557 switch (res->parsetype) {
1558 case PARSE_TYPE_CSV:
1559 g_printerr (" type = sep\n");
1560 g_printerr (" separator = %s\n",
1561 res->sep.chr ? res->sep.chr : "(none)");
1562 g_printerr (" see two as one = %s\n",
1563 res->sep.duplicates ? "yes" : "no");
1564 break;
1565 case PARSE_TYPE_FIXED:
1566 g_printerr (" type = sep\n");
1567 break;
1568 default:
1571 g_printerr (" trim space = %d\n", res->trim_spaces);
1573 ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
1574 g_printerr (" string indicator = %s\n", ubuffer);
1575 g_printerr (" see two as one = %s\n",
1576 res->indicator_2x_is_single ? "yes" : "no");
1578 g_printerr (" line terminators =");
1579 for (l = res->terminator; l; l = l->next) {
1580 const char *t = l->data;
1581 if (strcmp (t, "\n") == 0)
1582 g_printerr (" unix");
1583 else if (strcmp (t, "\r") == 0)
1584 g_printerr (" mac");
1585 else if (strcmp (t, "\r\n") == 0)
1586 g_printerr (" dos");
1587 else
1588 g_printerr (" other");
1590 g_printerr ("\n");
1592 for (ui = 0; ui < res->formats->len; ui++) {
1593 GOFormat const *fmt = g_ptr_array_index (res->formats, ui);
1594 const GString *decimal = ui < res->formats_decimal->len
1595 ? g_ptr_array_index (res->formats_decimal, ui)
1596 : NULL;
1597 const GString *thousand = ui < res->formats_thousand->len
1598 ? g_ptr_array_index (res->formats_thousand, ui)
1599 : NULL;
1601 g_printerr (" fmt.%d = %s\n", ui, go_format_as_XL (fmt));
1602 if (decimal)
1603 g_printerr (" fmt.%d.dec = %s\n", ui, decimal->str);
1604 if (thousand)
1605 g_printerr (" fmt.%d.thou = %s\n", ui, thousand->str);
1610 * stf_parse_options_guess:
1611 * @data: the input data.
1613 * Returns: (transfer full): the guessed options.
1615 StfParseOptions_t *
1616 stf_parse_options_guess (char const *data)
1618 StfParseOptions_t *res;
1619 GStringChunk *lines_chunk;
1620 GPtrArray *lines;
1621 int tabcount;
1622 int sepcount;
1623 gunichar sepchar = go_locale_get_arg_sep ();
1625 g_return_val_if_fail (data != NULL, NULL);
1627 res = stf_parse_options_new ();
1628 lines_chunk = g_string_chunk_new (100 * 1024);
1629 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1631 tabcount = count_character (lines, '\t', 0.2);
1632 sepcount = count_character (lines, sepchar, 0.2);
1634 /* At least one tab per line and enough to separate every
1635 would-be sepchars. */
1636 if (tabcount >= 1 && tabcount >= sepcount - 1)
1637 stf_parse_options_csv_set_separators (res, "\t", NULL);
1638 else {
1639 gunichar c;
1642 * Try a few more or less likely characters and pick the first
1643 * one that occurs on at least half the lines.
1645 * The order is mostly random, although ' ' and '!' which
1646 * could very easily occur in text are put last.
1648 if (count_character (lines, (c = sepchar), 0.5) > 0 ||
1649 count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
1650 count_character (lines, (c = ':'), 0.5) > 0 ||
1651 count_character (lines, (c = ','), 0.5) > 0 ||
1652 count_character (lines, (c = ';'), 0.5) > 0 ||
1653 count_character (lines, (c = '|'), 0.5) > 0 ||
1654 count_character (lines, (c = '!'), 0.5) > 0 ||
1655 count_character (lines, (c = ' '), 0.5) > 0) {
1656 char sep[7];
1657 sep[g_unichar_to_utf8 (c, sep)] = 0;
1658 if (c == ' ')
1659 strcat (sep, "\t");
1660 stf_parse_options_csv_set_separators (res, sep, NULL);
1664 // For now, always separated:
1665 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1667 switch (res->parsetype) {
1668 case PARSE_TYPE_CSV: {
1669 gboolean dups =
1670 res->sep.chr &&
1671 strchr (res->sep.chr, ' ') != NULL;
1672 gboolean trim =
1673 res->sep.chr &&
1674 strchr (res->sep.chr, ' ') != NULL;
1676 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1677 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1678 stf_parse_options_csv_set_duplicates (res, dups);
1679 stf_parse_options_csv_set_trim_seps (res, trim);
1681 stf_parse_options_csv_set_stringindicator (res, '"');
1682 break;
1685 case PARSE_TYPE_FIXED:
1686 break;
1688 default:
1689 g_assert_not_reached ();
1692 stf_parse_general_free (lines);
1693 g_string_chunk_free (lines_chunk);
1695 stf_parse_options_guess_formats (res, data);
1697 if (gnm_debug_flag ("stf"))
1698 dump_guessed_options (res);
1700 return res;
1704 * stf_parse_options_guess_csv:
1705 * @data: the CSV input data.
1707 * Returns: (transfer full): the guessed options.
1709 StfParseOptions_t *
1710 stf_parse_options_guess_csv (char const *data)
1712 StfParseOptions_t *res;
1713 GStringChunk *lines_chunk;
1714 GPtrArray *lines;
1715 char *sep = NULL;
1716 char const *quoteline = NULL;
1717 int pass;
1718 gunichar stringind = '"';
1720 g_return_val_if_fail (data != NULL, NULL);
1722 res = stf_parse_options_new ();
1723 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1724 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1725 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1726 stf_parse_options_csv_set_duplicates (res, FALSE);
1727 stf_parse_options_csv_set_trim_seps (res, FALSE);
1728 stf_parse_options_csv_set_stringindicator (res, stringind);
1730 lines_chunk = g_string_chunk_new (100 * 1024);
1731 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1734 * Find a line containing a quote; skip first line unless it is
1735 * the only one. Prefer a line with the quote first.
1737 for (pass = 1; !quoteline && pass <= 2; pass++) {
1738 size_t lno;
1739 for (lno = MIN (1, lines->len - 1);
1740 !quoteline && lno < lines->len;
1741 lno++) {
1742 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1743 const char *line = g_ptr_array_index (boxline, 0);
1744 switch (pass) {
1745 case 1:
1746 if (g_utf8_get_char (line) == stringind)
1747 quoteline = line;
1748 break;
1749 case 2:
1750 if (my_utf8_strchr (line, stringind))
1751 quoteline = line;
1752 break;
1757 if (quoteline) {
1758 const char *p0 = my_utf8_strchr (quoteline, stringind);
1759 const char *p = p0;
1761 do {
1762 p = g_utf8_next_char (p);
1763 } while (*p && g_utf8_get_char (p) != stringind);
1764 if (*p) p = g_utf8_next_char (p);
1765 while (*p && g_unichar_isspace (g_utf8_get_char (p)))
1766 p = g_utf8_next_char (p);
1767 if (*p) {
1768 /* Use the character after the quote. */
1769 sep = g_strndup (p, g_utf8_next_char (p) - p);
1770 } else {
1771 /* Try to use character before the quote. */
1772 while (p0 > quoteline && !sep) {
1773 p = p0;
1774 p0 = g_utf8_prev_char (p0);
1775 if (!g_unichar_isspace (g_utf8_get_char (p0)))
1776 sep = g_strndup (p0, p - p0);
1781 if (!sep)
1782 sep = g_strdup (",");
1783 stf_parse_options_csv_set_separators (res, sep, NULL);
1784 g_free (sep);
1786 stf_parse_general_free (lines);
1787 g_string_chunk_free (lines_chunk);
1789 stf_parse_options_guess_formats (res, data);
1791 if (gnm_debug_flag ("stf"))
1792 dump_guessed_options (res);
1794 return res;
/*
 * Bit flags naming the interpretations that are still possible for a
 * column while its formats are being guessed.
 */
typedef enum {
	/* Date field orders. */
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	/* Decimal separator of a number. */
	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER =
		STF_GUESS_NUMBER_DEC_POINT | STF_GUESS_NUMBER_DEC_COMMA,

	/* Every interpretation at once. */
	STF_GUESS_ALL =
		STF_GUESS_DATE_DMY | STF_GUESS_DATE_MDY | STF_GUESS_DATE_YMD |
		STF_GUESS_NUMBER_DEC_EITHER
} StfGuessFormats;
1809 static void
1810 do_check_date (const char *data, StfGuessFormats flag,
1811 gboolean mbd, gboolean ybm,
1812 unsigned *possible,
1813 GODateConventions const *date_conv)
1815 GnmValue *v;
1816 gboolean this_mbd, this_ybm;
1817 int imbd;
1819 if (!(*possible & flag))
1820 return;
1822 v = format_match_datetime (data, date_conv, mbd, TRUE, FALSE);
1823 if (!v || !VALUE_FMT (v))
1824 goto fail;
1826 imbd = go_format_month_before_day (VALUE_FMT (v));
1827 this_mbd = (imbd >= 1);
1828 this_ybm = (imbd == 2);
1829 if (mbd != this_mbd || ybm != this_ybm)
1830 goto fail;
1832 goto done;
1834 fail:
1835 *possible &= ~flag;
1836 done:
1837 value_release (v);
1841 static void
1842 do_check_number (const char *data, StfGuessFormats flag,
1843 const GString *dec, const GString *thousand, const GString *curr,
1844 unsigned *possible, int *decimals)
1846 GnmValue *v;
1847 GOFormatFamily family;
1848 const char *pthou;
1850 if (!(*possible & flag))
1851 return;
1853 v = format_match_decimal_number_with_locale (data, &family, curr, thousand, dec);
1854 if (!v)
1855 goto fail;
1857 if (*decimals != -2) {
1858 const char *pdec = strstr (data, dec->str);
1859 int this_decimals = 0;
1860 if (pdec) {
1861 pdec += dec->len;
1862 while (g_ascii_isdigit (*pdec)) {
1863 pdec++;
1864 this_decimals++;
1867 if (*decimals == -1)
1868 *decimals = this_decimals;
1869 else if (*decimals != this_decimals)
1870 *decimals = -2;
1873 pthou = strstr (data, thousand->str);
1874 if (pthou) {
1875 const char *p;
1876 int digits = 0, nonzero_digits = 0;
1877 for (p = data; p < pthou; p = g_utf8_next_char (p)) {
1878 if (g_unichar_isdigit (g_utf8_get_char (p))) {
1879 digits++;
1880 if (*p != '0')
1881 nonzero_digits++;
1884 // "-.222" implies that "." is not a thousands separator.
1885 // "0.222" implies that "." is not a thousands separator.
1886 // "12345,555" implies that "," is not a thousands separator.
1887 if (nonzero_digits == 0 || digits > 3)
1888 goto fail;
1891 goto done;
1893 fail:
1894 *possible &= ~flag;
1895 done:
1896 value_release (v);
1901 * stf_parse_options_guess_formats:
1902 * @data: the CSV input data.
1904 * This function attempts to recognize data formats on a column-by-column
1905 * basis under the assumption that the data in a text file will generally
1906 * use the same data formats.
1908 * This is useful because not all values give sufficient information by
1909 * themselves to tell what format the data is in. For example, "1/2/2000"
1910 * is likely to be a date in year 2000, but it is not clear if it is in
1911 * January or February. If another value in the same column is "31/1/1999"
1912 * then it is likely that the former date was in February.
1914 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1915 * A later value of "111,200.22" would clear up the confusion.
1918 void
1919 stf_parse_options_guess_formats (StfParseOptions_t *po, char const *data)
1921 GStringChunk *lines_chunk;
1922 GPtrArray *lines;
1923 unsigned lno, col, colcount, sline;
1924 GODateConventions const *date_conv = go_date_conv_from_str ("Lotus:1900");
1925 GString *s_comma = g_string_new (",");
1926 GString *s_dot = g_string_new (".");
1927 GString *s_dollar = g_string_new ("$");
1928 gboolean debug = gnm_debug_flag ("stf");
1930 g_ptr_array_set_size (po->formats, 0);
1931 g_ptr_array_set_size (po->formats_decimal, 0);
1932 g_ptr_array_set_size (po->formats_thousand, 0);
1933 g_ptr_array_set_size (po->formats_curr, 0);
1935 lines_chunk = g_string_chunk_new (100 * 1024);
1936 lines = stf_parse_general (po, lines_chunk, data, data + strlen (data));
1938 colcount = 0;
1939 for (lno = 0; lno < lines->len; lno++) {
1940 GPtrArray *line = g_ptr_array_index (lines, lno);
1941 colcount = MAX (colcount, line->len);
1944 // Ignore first line unless it is the only one
1945 sline = MIN ((int)lines->len - 1, 1);
1947 g_ptr_array_set_size (po->formats, colcount);
1948 g_ptr_array_set_size (po->formats_decimal, colcount);
1949 g_ptr_array_set_size (po->formats_thousand, colcount);
1950 g_ptr_array_set_size (po->formats_curr, colcount);
1951 for (col = 0; col < colcount; col++) {
1952 unsigned possible = STF_GUESS_ALL;
1953 GOFormat *fmt = NULL;
1954 gboolean seen_dot = FALSE;
1955 gboolean seen_comma = FALSE;
1956 int decimals_if_point = -1; // -1: unset; -2: inconsistent; >=0: count
1957 int decimals_if_comma = -1; // -1: unset; -2: inconsistent; >=0: count
1959 for (lno = sline; possible && lno < lines->len; lno++) {
1960 GPtrArray *line = g_ptr_array_index (lines, lno);
1961 const char *data = col < line->len ? g_ptr_array_index (line, col) : "";
1962 unsigned prev_possible = possible;
1964 if (*data == 0 || data[0] == '\'')
1965 continue;
1967 do_check_date (data, STF_GUESS_DATE_DMY, FALSE, FALSE, &possible, date_conv);
1968 do_check_date (data, STF_GUESS_DATE_MDY, TRUE, FALSE, &possible, date_conv);
1969 do_check_date (data, STF_GUESS_DATE_YMD, TRUE, TRUE, &possible, date_conv);
1971 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER) {
1972 const char *pdot = strstr (data, s_dot->str);
1973 const char *pcomma = strstr (data, s_comma->str);
1974 if (pdot && pcomma) {
1975 // Both -- last one is the decimal separator
1976 if (pdot > pcomma)
1977 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1978 else
1979 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1980 } else if (pdot && strstr (pdot + s_dot->len, s_dot->str)) {
1981 // Two dots so they are thousands separators
1982 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1983 } else if (pcomma && strstr (pcomma + s_comma->len, s_comma->str)) {
1984 // Two commas so they are thousands separators
1985 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1988 seen_dot = seen_dot || (pdot != 0);
1989 seen_comma = seen_comma || (pcomma != 0);
1991 do_check_number (data, STF_GUESS_NUMBER_DEC_POINT,
1992 s_dot, s_comma, s_dollar,
1993 &possible, &decimals_if_point);
1994 do_check_number (data, STF_GUESS_NUMBER_DEC_COMMA,
1995 s_comma, s_dot, s_dollar,
1996 &possible, &decimals_if_comma);
1998 if (possible != prev_possible && debug)
1999 g_printerr ("col=%d; after [%s] possible=0x%x\n", col, data, possible);
2002 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER &&
2003 !seen_dot && !seen_comma) {
2004 // It doesn't matter what the separators are
2005 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
2008 switch (possible) {
2009 case STF_GUESS_DATE_DMY:
2010 fmt = go_format_new_from_XL ("d-mmm-yyyy");
2011 break;
2012 case STF_GUESS_DATE_MDY:
2013 fmt = go_format_new_from_XL ("m/d/yyyy");
2014 break;
2015 case STF_GUESS_DATE_YMD:
2016 fmt = go_format_new_from_XL ("yyyy-mm-dd");
2017 break;
2018 case STF_GUESS_NUMBER_DEC_POINT:
2019 g_ptr_array_index (po->formats_decimal, col) = g_string_new (".");
2020 g_ptr_array_index (po->formats_thousand, col) = g_string_new (",");
2021 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2022 if (decimals_if_point > 0) {
2023 // Don't set format if decimals is zero
2024 GString *fmt_str = g_string_new (NULL);
2025 go_format_generate_number_str (fmt_str, 1, decimals_if_point, seen_comma, FALSE, FALSE, "", "");
2026 fmt = go_format_new_from_XL (fmt_str->str);
2027 g_string_free (fmt_str, TRUE);
2029 break;
2030 case STF_GUESS_NUMBER_DEC_COMMA:
2031 g_ptr_array_index (po->formats_decimal, col) = g_string_new (",");
2032 g_ptr_array_index (po->formats_thousand, col) = g_string_new (".");
2033 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2034 if (decimals_if_comma > 0) {
2035 // Don't set format if decimals is zero
2036 GString *fmt_str = g_string_new (NULL);
2037 go_format_generate_number_str (fmt_str, 1, decimals_if_comma, seen_dot, FALSE, FALSE, "", "");
2038 fmt = go_format_new_from_XL (fmt_str->str);
2039 g_string_free (fmt_str, TRUE);
2041 break;
2042 default:
2043 break;
2046 if (!fmt)
2047 fmt = go_format_ref (go_format_general ());
2048 g_ptr_array_index (po->formats, col) = fmt;
2051 stf_parse_general_free (lines);
2052 g_string_chunk_free (lines_chunk);
2054 g_string_free (s_dot, TRUE);
2055 g_string_free (s_comma, TRUE);
2056 g_string_free (s_dollar, TRUE);