Update Spanish translation
[gnumeric.git] / src / stf-parse.c
blob11009b90ae5afbf356ccb3b9602f56bf50fb8905
1 /*
2 * stf-parse.c : Structured Text Format parser. (STF)
3 * A general purpose engine for parsing data
4 * in CSV and Fixed width format.
7 * Copyright (C) Almer. S. Tigelaar.
8 * EMail: almer1@dds.nl or almer-t@bigfoot.com
10 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
11 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, see <https://www.gnu.org/licenses/>.
27 #include <gnumeric-config.h>
28 #include <glib/gi18n-lib.h>
29 #include <gnumeric.h>
30 #include <stf-parse.h>
31 #include <stf-export.h>
33 #include <workbook.h>
34 #include <cell.h>
35 #include <sheet.h>
36 #include <expr.h>
37 #include <clipboard.h>
38 #include <sheet-style.h>
39 #include <value.h>
40 #include <mstyle.h>
41 #include <number-match.h>
42 #include <gutils.h>
43 #include <parse-util.h>
44 #include <number-match.h>
45 #include <gnm-format.h>
46 #include <ranges.h>
47 #include <goffice/goffice.h>
49 #include <stdlib.h>
50 #include <locale.h>
51 #include <string.h>
/*
 * Helpers for temporarily switching the locale while parsing.  They
 * expect a variable named `parseoptions` to be in scope at the point of
 * use.  SETUP_LOCALE_SWITCH declares the `oldlocale` local that the other
 * two macros read and write.
 */
#define SETUP_LOCALE_SWITCH char *oldlocale = NULL

/* If parseoptions names a locale, save the current one and switch to it. */
#define START_LOCALE_SWITCH if (parseoptions->locale) {			\
		oldlocale = g_strdup (go_setlocale (LC_ALL, NULL));	\
		go_setlocale (LC_ALL, parseoptions->locale);}

/* Restore and free the locale saved by START_LOCALE_SWITCH, if any. */
#define END_LOCALE_SWITCH if (oldlocale) {		\
		go_setlocale (LC_ALL, oldlocale);	\
		g_free (oldlocale);}
63 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
64 typedef struct {
65 GStringChunk *chunk;
66 char const *position; /* Indicates the current position within data */
68 /* Used internally for fixed width parsing */
69 int splitpos; /* Indicates current position in splitpositions array */
70 int linepos; /* Position on the current line */
71 } Source_t;
/*
 * AutoDiscovery_t: one run of spaces found on a line during column
 * autodiscovery.
 */
typedef struct {
	int start;	/* Line position at which the run of spaces begins */
	int stop;	/* Line position of the first non-space after the run */
} AutoDiscovery_t;
80 * Some silly dude make the length field an unsigned int. C just does
81 * not deal very well with that.
83 static inline int
84 my_garray_len (GArray const *a)
86 return (int)a->len;
89 static char *
90 my_utf8_strchr (const char *p, gunichar uc)
92 return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
95 static int
96 compare_terminator (char const *s, StfParseOptions_t *parseoptions)
98 guchar const *us = (guchar const *)s;
99 GSList *l;
101 if (*us > parseoptions->compiled_terminator.max ||
102 *us < parseoptions->compiled_terminator.min)
103 return 0;
105 for (l = parseoptions->terminator; l; l = l->next) {
106 char const *term = l->data;
107 char const *d = s;
109 while (*term) {
110 if (*d != *term)
111 goto next;
112 term++;
113 d++;
115 return d - s;
117 next:
120 return 0;
124 /*******************************************************************************************************
125 * STF PARSE OPTIONS : StfParseOptions related
126 *******************************************************************************************************/
128 static void
129 gnm_g_string_free (GString *s)
131 if (s) g_string_free (s, TRUE);
136 * stf_parse_options_new:
138 * This will return a new StfParseOptions_t struct.
139 * The struct should, after being used, freed with stf_parse_options_free.
141 static StfParseOptions_t *
142 stf_parse_options_new (void)
144 StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
146 parseoptions->parsetype = PARSE_TYPE_NOTSET;
148 parseoptions->terminator = NULL;
149 stf_parse_options_add_line_terminator (parseoptions, "\r\n");
150 stf_parse_options_add_line_terminator (parseoptions, "\n");
151 stf_parse_options_add_line_terminator (parseoptions, "\r");
153 parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
154 parseoptions->locale = NULL;
156 parseoptions->splitpositions = NULL;
157 stf_parse_options_fixed_splitpositions_clear (parseoptions);
159 parseoptions->stringindicator = '"';
160 parseoptions->indicator_2x_is_single = TRUE;
161 parseoptions->sep.duplicates = FALSE;
162 parseoptions->trim_seps = FALSE;
164 parseoptions->sep.str = NULL;
165 parseoptions->sep.chr = NULL;
167 parseoptions->col_autofit_array = NULL;
168 parseoptions->col_import_array = NULL;
169 parseoptions->col_import_array_len = 0;
170 parseoptions->formats = g_ptr_array_new_with_free_func ((GDestroyNotify)go_format_unref);
171 parseoptions->formats_decimal = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
172 parseoptions->formats_thousand = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
173 parseoptions->formats_curr = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
175 parseoptions->cols_exceeded = FALSE;
176 parseoptions->rows_exceeded = FALSE;
177 parseoptions->ref_count = 1;
179 return parseoptions;
183 * stf_parse_options_free:
185 * will free @parseoptions, note that this will not free the splitpositions
186 * member (GArray) of the struct, the caller is responsible for that.
188 void
189 stf_parse_options_free (StfParseOptions_t *parseoptions)
191 g_return_if_fail (parseoptions != NULL);
193 if (parseoptions->ref_count-- > 1)
194 return;
196 g_free (parseoptions->col_import_array);
197 g_free (parseoptions->col_autofit_array);
198 g_free (parseoptions->locale);
199 g_free (parseoptions->sep.chr);
201 if (parseoptions->sep.str) {
202 GSList *l;
204 for (l = parseoptions->sep.str; l != NULL; l = l->next)
205 g_free ((char *) l->data);
206 g_slist_free (parseoptions->sep.str);
209 g_array_free (parseoptions->splitpositions, TRUE);
211 stf_parse_options_clear_line_terminator (parseoptions);
213 g_ptr_array_free (parseoptions->formats, TRUE);
214 g_ptr_array_free (parseoptions->formats_decimal, TRUE);
215 g_ptr_array_free (parseoptions->formats_thousand, TRUE);
216 g_ptr_array_free (parseoptions->formats_curr, TRUE);
218 g_free (parseoptions);
221 static StfParseOptions_t *
222 stf_parse_options_ref (StfParseOptions_t *parseoptions)
224 parseoptions->ref_count++;
225 return parseoptions;
228 GType
229 stf_parse_options_get_type (void)
231 static GType t = 0;
233 if (t == 0) {
234 t = g_boxed_type_register_static ("StfParseOptions_t",
235 (GBoxedCopyFunc)stf_parse_options_ref,
236 (GBoxedFreeFunc)stf_parse_options_free);
238 return t;
241 void
242 stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
244 g_return_if_fail (parseoptions != NULL);
245 g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);
247 parseoptions->parsetype = parsetype;
250 static gint
251 long_string_first (gchar const *a, gchar const *b)
253 /* This actually is UTF-8 safe. */
254 return strlen (b) - strlen (a);
257 static void
258 compile_terminators (StfParseOptions_t *parseoptions)
260 GSList *l;
262 parseoptions->terminator =
263 g_slist_sort (parseoptions->terminator,
264 (GCompareFunc)long_string_first);
265 parseoptions->compiled_terminator.min = 255;
266 parseoptions->compiled_terminator.max = 0;
267 for (l = parseoptions->terminator; l; l = l->next) {
268 const guchar *term = l->data;
269 parseoptions->compiled_terminator.min =
270 MIN (parseoptions->compiled_terminator.min, *term);
271 parseoptions->compiled_terminator.max =
272 MAX (parseoptions->compiled_terminator.max, *term);
277 * stf_parse_options_add_line_terminator:
279 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
280 * this indicates the end of a row.
283 void
284 stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
286 g_return_if_fail (parseoptions != NULL);
287 g_return_if_fail (terminator != NULL && *terminator != 0);
289 GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
290 compile_terminators (parseoptions);
294 * stf_parse_options_clear_line_terminator:
296 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
297 * this indicates the end of a row.
300 void
301 stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
303 g_return_if_fail (parseoptions != NULL);
305 g_slist_free_full (parseoptions->terminator, g_free);
306 parseoptions->terminator = NULL;
307 compile_terminators (parseoptions);
311 * stf_parse_options_set_trim_spaces:
313 * If enabled will trim spaces in every parsed field on left and/or right
314 * sides.
316 void
317 stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
319 g_return_if_fail (parseoptions != NULL);
321 parseoptions->trim_spaces = trim_spaces;
325 * stf_parse_options_csv_set_separators:
326 * @parseoptions: #StfParseOptions_t
327 * @character:
328 * @seps: (element-type utf8): the separators to be used
330 * A copy is made of the parameters.
332 void
333 stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions,
334 char const *character,
335 GSList const *seps)
337 g_return_if_fail (parseoptions != NULL);
339 g_free (parseoptions->sep.chr);
340 parseoptions->sep.chr = g_strdup (character);
342 g_slist_free_full (parseoptions->sep.str, g_free);
343 parseoptions->sep.str =
344 g_slist_copy_deep ((GSList *)seps, (GCopyFunc)g_strdup, NULL);
347 void
348 stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
350 g_return_if_fail (parseoptions != NULL);
352 parseoptions->stringindicator = stringindicator;
356 * stf_parse_options_csv_set_indicator_2x_is_single:
357 * @indic_2x: a boolean value indicating whether we want to see two
358 * adjacent string indicators as a single string indicator
359 * that is part of the cell, rather than a terminator.
361 void
362 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
363 gboolean const indic_2x)
365 g_return_if_fail (parseoptions != NULL);
367 parseoptions->indicator_2x_is_single = indic_2x;
371 * stf_parse_options_csv_set_duplicates:
372 * @parseoptions:
373 * @duplicates: a boolean value indicating whether we want to see two
374 * separators right behind each other as one
376 void
377 stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
379 g_return_if_fail (parseoptions != NULL);
381 parseoptions->sep.duplicates = duplicates;
385 * stf_parse_options_csv_set_trim_seps:
386 * @trim_seps: a boolean value indicating whether we want to ignore
387 * separators at the beginning of lines
389 void
390 stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
392 g_return_if_fail (parseoptions != NULL);
394 parseoptions->trim_seps = trim_seps;
398 * stf_parse_options_fixed_splitpositions_clear:
400 * This will clear the splitpositions (== points on which a line is split)
402 void
403 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
405 int minus_one = -1;
406 g_return_if_fail (parseoptions != NULL);
408 if (parseoptions->splitpositions)
409 g_array_free (parseoptions->splitpositions, TRUE);
410 parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));
412 g_array_append_val (parseoptions->splitpositions, minus_one);
416 * stf_parse_options_fixed_splitpositions_add:
418 * @position will be added to the splitpositions.
420 void
421 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
423 unsigned int ui;
425 g_return_if_fail (parseoptions != NULL);
426 g_return_if_fail (position >= 0);
428 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
429 int here = g_array_index (parseoptions->splitpositions, int, ui);
430 if (position == here)
431 return;
432 if (position < here)
433 break;
436 g_array_insert_val (parseoptions->splitpositions, ui, position);
439 void
440 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
442 unsigned int ui;
444 g_return_if_fail (parseoptions != NULL);
445 g_return_if_fail (position >= 0);
447 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
448 int here = g_array_index (parseoptions->splitpositions, int, ui);
449 if (position == here)
450 g_array_remove_index (parseoptions->splitpositions, ui);
451 if (position <= here)
452 return;
457 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
459 return parseoptions->splitpositions->len;
463 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
465 return g_array_index (parseoptions->splitpositions, int, n);
470 * stf_parse_options_valid:
471 * @parseoptions: an import options struct
473 * Checks if @parseoptions is correctly filled
475 * Returns: %TRUE if it is correctly filled, %FALSE otherwise.
477 static gboolean
478 stf_parse_options_valid (StfParseOptions_t *parseoptions)
480 g_return_val_if_fail (parseoptions != NULL, FALSE);
482 if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
483 if (!parseoptions->splitpositions) {
484 g_warning ("STF: No splitpositions in struct");
485 return FALSE;
489 return TRUE;
492 /*******************************************************************************************************
493 * STF PARSE : The actual routines that do the 'trick'
494 *******************************************************************************************************/
496 static void
497 trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
499 if (!field) return;
501 if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
502 char *s = field;
504 while (g_unichar_isspace (g_utf8_get_char (s)))
505 s = g_utf8_next_char (s);
507 if (s != field)
508 memmove (field, s, 1 + strlen (s));
511 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
512 char *s = field + strlen (field);
514 while (field != s) {
515 s = g_utf8_prev_char (s);
516 if (!g_unichar_isspace (g_utf8_get_char (s)))
517 break;
518 *s = 0;
524 * stf_parse_csv_is_separator:
526 * Returns: %NULL if @character is not a separator, a pointer to the character
527 * after the separator otherwise.
529 static char const *
530 stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
532 g_return_val_if_fail (character != NULL, NULL);
534 if (*character == 0)
535 return NULL;
537 if (str) {
538 GSList const *l;
540 for (l = str; l != NULL; l = l->next) {
541 char const *s = l->data;
542 char const *r;
543 glong cnt;
544 glong const len = g_utf8_strlen (s, -1);
546 /* Don't compare past the end of the buffer! */
547 for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
548 if (*r == '\0')
549 break;
551 if ((cnt == len) && (memcmp (character, s, len) == 0))
552 return g_utf8_offset_to_pointer (character, len);
556 if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
557 return g_utf8_next_char(character);
559 return NULL;
563 * stf_parse_eat_separators:
565 * skip over leading separators
569 static void
570 stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
572 char const *cur, *next;
574 g_return_if_fail (src != NULL);
575 g_return_if_fail (parseoptions != NULL);
577 cur = src->position;
579 if (*cur == '\0' || compare_terminator (cur, parseoptions))
580 return;
581 while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
582 cur = next;
583 src->position = cur;
584 return;
/* Outcome of parsing a single CSV cell; see stf_parse_csv_cell. */
typedef enum {
	STF_CELL_ERROR,		/* A precondition check failed */
	STF_CELL_EOF,		/* End of data reached before any field */
	STF_CELL_EOL,		/* Line terminator found before any field */
	STF_CELL_FIELD_NO_SEP,	/* Field read, not followed by a separator */
	STF_CELL_FIELD_SEP	/* Field read and its separator consumed */
} StfParseCellRes;
596 static StfParseCellRes
597 stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
599 char const *cur;
600 gboolean saw_sep = FALSE;
602 g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
603 g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
605 cur = src->position;
606 g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
608 /* Skip whitespace, but stop at line terminators. */
609 while (1) {
610 int term_len;
612 if (*cur == 0) {
613 src->position = cur;
614 return STF_CELL_EOF;
617 term_len = compare_terminator (cur, parseoptions);
618 if (term_len) {
619 src->position = cur + term_len;
620 return STF_CELL_EOL;
623 if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
624 break;
626 if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
627 parseoptions->sep.str))
628 break;
630 if (!g_unichar_isspace (g_utf8_get_char (cur)))
631 break;
632 cur = g_utf8_next_char (cur);
635 if (parseoptions->stringindicator != 0 &&
636 g_utf8_get_char (cur) == parseoptions->stringindicator) {
637 cur = g_utf8_next_char (cur);
638 while (*cur) {
639 gunichar uc = g_utf8_get_char (cur);
640 cur = g_utf8_next_char (cur);
642 if (uc == parseoptions->stringindicator) {
643 if (parseoptions->indicator_2x_is_single &&
644 g_utf8_get_char (cur) == parseoptions->stringindicator)
645 cur = g_utf8_next_char (cur);
646 else {
647 /* "field content"dropped-garbage, */
648 while (*cur && !compare_terminator (cur, parseoptions)) {
649 char const *post = stf_parse_csv_is_separator
650 (cur, parseoptions->sep.chr, parseoptions->sep.str);
651 if (post) {
652 cur = post;
653 saw_sep = TRUE;
654 break;
656 cur = g_utf8_next_char (cur);
658 break;
662 g_string_append_unichar (text, uc);
665 /* We silently allow a missing terminating quote. */
666 } else {
667 /* Unquoted field. */
669 while (*cur && !compare_terminator (cur, parseoptions)) {
671 char const *post = stf_parse_csv_is_separator
672 (cur, parseoptions->sep.chr, parseoptions->sep.str);
673 if (post) {
674 cur = post;
675 saw_sep = TRUE;
676 break;
679 g_string_append_unichar (text, g_utf8_get_char (cur));
680 cur = g_utf8_next_char (cur);
683 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
684 while (text->len) {
685 const char *last = g_utf8_prev_char (text->str + text->len);
686 if (!g_unichar_isspace (g_utf8_get_char (last)))
687 break;
688 g_string_truncate (text, last - text->str);
693 src->position = cur;
695 if (saw_sep && parseoptions->sep.duplicates)
696 stf_parse_eat_separators (src, parseoptions);
698 return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
702 * stf_parse_csv_line:
704 * This will parse one line from the current @src->position.
705 * NOTE: The calling routine is responsible for freeing the result.
707 * returns : a GPtrArray of char*'s
709 static GPtrArray *
710 stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
712 GPtrArray *line;
713 gboolean cont = FALSE;
714 GString *text;
716 g_return_val_if_fail (src != NULL, NULL);
717 g_return_val_if_fail (parseoptions != NULL, NULL);
719 line = g_ptr_array_new ();
720 if (parseoptions->trim_seps)
721 stf_parse_eat_separators (src, parseoptions);
723 text = g_string_sized_new (30);
725 while (1) {
726 char *ctext;
727 StfParseCellRes res =
728 stf_parse_csv_cell (text, src, parseoptions);
729 trim_spaces_inplace (text->str, parseoptions);
730 ctext = g_string_chunk_insert_len (src->chunk,
731 text->str, text->len);
732 g_string_truncate (text, 0);
734 switch (res) {
735 case STF_CELL_FIELD_NO_SEP:
736 g_ptr_array_add (line, ctext);
737 cont = FALSE;
738 break;
740 case STF_CELL_FIELD_SEP:
741 g_ptr_array_add (line, ctext);
742 cont = TRUE; /* Make sure we see one more field. */
743 break;
745 default:
746 if (cont)
747 g_ptr_array_add (line, ctext);
748 g_string_free (text, TRUE);
749 return line;
755 * stf_parse_fixed_cell:
757 * returns a pointer to the parsed cell contents.
759 static char *
760 stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
762 char *res;
763 char const *cur;
764 int splitval;
766 g_return_val_if_fail (src != NULL, NULL);
767 g_return_val_if_fail (parseoptions != NULL, NULL);
769 cur = src->position;
771 if (src->splitpos < my_garray_len (parseoptions->splitpositions))
772 splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
773 else
774 splitval = -1;
776 while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
777 src->linepos++;
778 cur = g_utf8_next_char (cur);
781 res = g_string_chunk_insert_len (src->chunk,
782 src->position,
783 cur - src->position);
785 src->position = cur;
787 return res;
791 * stf_parse_fixed_line:
793 * This will parse one line from the current @src->position.
794 * It will return a GPtrArray with the cell contents as strings.
796 * NOTE: The calling routine is responsible for freeing result.
798 static GPtrArray *
799 stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
801 GPtrArray *line;
803 g_return_val_if_fail (src != NULL, NULL);
804 g_return_val_if_fail (parseoptions != NULL, NULL);
806 src->linepos = 0;
807 src->splitpos = 0;
809 line = g_ptr_array_new ();
810 while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
811 char *field = stf_parse_fixed_cell (src, parseoptions);
813 trim_spaces_inplace (field, parseoptions);
814 g_ptr_array_add (line, field);
816 src->splitpos++;
819 while (line->len < parseoptions->splitpositions->len)
820 g_ptr_array_add (line, g_strdup (""));
822 return line;
826 * stf_parse_general_free: (skip)
828 void
829 stf_parse_general_free (GPtrArray *lines)
831 unsigned lineno;
832 for (lineno = 0; lineno < lines->len; lineno++) {
833 GPtrArray *line = g_ptr_array_index (lines, lineno);
834 /* Fields are not freed here. */
835 if (line)
836 g_ptr_array_free (line, TRUE);
838 g_ptr_array_free (lines, TRUE);
843 * stf_parse_general: (skip)
845 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
846 * GPtrArray of strings.
848 * The caller must free this entire structure, for example by calling
849 * stf_parse_general_free.
851 GPtrArray *
852 stf_parse_general (StfParseOptions_t *parseoptions,
853 GStringChunk *lines_chunk,
854 char const *data, char const *data_end)
856 GPtrArray *lines;
857 Source_t src;
858 int row;
859 char const *valid_end = data_end;
861 g_return_val_if_fail (parseoptions != NULL, NULL);
862 g_return_val_if_fail (data != NULL, NULL);
863 g_return_val_if_fail (data_end != NULL, NULL);
864 g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
865 g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);
867 src.chunk = lines_chunk;
868 src.position = data;
869 row = 0;
871 if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
872 /* Skip over byte-order mark */
873 src.position += 3;
876 lines = g_ptr_array_new ();
877 while (*src.position != '\0' && src.position < data_end) {
878 GPtrArray *line;
880 if (row == GNM_MAX_ROWS) {
881 parseoptions->rows_exceeded = TRUE;
882 break;
885 line = parseoptions->parsetype == PARSE_TYPE_CSV
886 ? stf_parse_csv_line (&src, parseoptions)
887 : stf_parse_fixed_line (&src, parseoptions);
889 g_ptr_array_add (lines, line);
890 if (parseoptions->parsetype != PARSE_TYPE_CSV)
891 src.position += compare_terminator (src.position, parseoptions);
892 row++;
895 return lines;
899 * stf_parse_lines: (skip)
900 * @parseoptions: #StfParseOptions_t
901 * @lines_chunk:
902 * @data:
903 * @maxlines:
904 * @with_lineno:
906 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
907 * GPtrArray of strings.
909 * The caller must free this entire structure, for example by calling
910 * stf_parse_general_free.
912 GPtrArray *
913 stf_parse_lines (StfParseOptions_t *parseoptions,
914 GStringChunk *lines_chunk,
915 char const *data,
916 int maxlines, gboolean with_lineno)
918 GPtrArray *lines;
919 int lineno = 1;
921 g_return_val_if_fail (data != NULL, NULL);
923 lines = g_ptr_array_new ();
924 while (*data) {
925 char const *data0 = data;
926 GPtrArray *line = g_ptr_array_new ();
928 if (with_lineno) {
929 char buf[4 * sizeof (int)];
930 sprintf (buf, "%d", lineno);
931 g_ptr_array_add (line,
932 g_string_chunk_insert (lines_chunk, buf));
935 while (1) {
936 int termlen = compare_terminator (data, parseoptions);
937 if (termlen > 0 || *data == 0) {
938 g_ptr_array_add (line,
939 g_string_chunk_insert_len (lines_chunk,
940 data0,
941 data - data0));
942 data += termlen;
943 break;
944 } else
945 data = g_utf8_next_char (data);
948 g_ptr_array_add (lines, line);
950 lineno++;
951 if (lineno >= maxlines)
952 break;
954 return lines;
957 char const *
958 stf_parse_find_line (StfParseOptions_t *parseoptions,
959 char const *data,
960 int line)
962 while (line > 0) {
963 int termlen = compare_terminator (data, parseoptions);
964 if (termlen > 0) {
965 data += termlen;
966 line--;
967 } else if (*data == 0) {
968 return data;
969 } else {
970 data = g_utf8_next_char (data);
973 return data;
978 * stf_parse_options_fixed_autodiscover:
979 * @parseoptions: a Parse options struct.
980 * @data: The actual data.
981 * @data_end: data end.
983 * Automatically try to discover columns in the text to be parsed.
984 * We ignore empty lines (only containing parseoptions->terminator)
986 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
987 * Think hard of a better more flexible solution...
989 void
990 stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
991 char const *data, char const *data_end)
993 char const *iterator = data;
994 GSList *list = NULL;
995 GSList *list_start = NULL;
996 int lines = 0;
997 int effective_lines = 0;
998 int max_line_length = 0;
999 int *line_begin_hits = NULL;
1000 int *line_end_hits = NULL;
1001 int i;
1003 stf_parse_options_fixed_splitpositions_clear (parseoptions);
1006 * First take a look at all possible white space combinations
1008 while (*iterator && iterator < data_end) {
1009 gboolean begin_recorded = FALSE;
1010 AutoDiscovery_t *disc = NULL;
1011 int position = 0;
1012 int termlen = 0;
1014 while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
1015 if (!begin_recorded && *iterator == ' ') {
1016 disc = g_new0 (AutoDiscovery_t, 1);
1018 disc->start = position;
1020 begin_recorded = TRUE;
1021 } else if (begin_recorded && *iterator != ' ') {
1022 disc->stop = position;
1023 list = g_slist_prepend (list, disc);
1025 begin_recorded = FALSE;
1026 disc = NULL;
1029 position++;
1030 iterator++;
1033 if (position > max_line_length)
1034 max_line_length = position;
1037 * If there are excess spaces at the end of
1038 * the line : ignore them
1040 g_free (disc);
1043 * Hop over the terminator
1045 iterator += termlen;
1047 if (position != 0)
1048 effective_lines++;
1050 lines++;
1053 list = g_slist_reverse (list);
1054 list_start = list;
1057 * Kewl stuff:
1058 * Look at the number of hits at each line position
1059 * if the number of hits equals the number of lines
1060 * we can be pretty sure this is the start or end
1061 * of a column, we filter out empty columns
1062 * later
1064 line_begin_hits = g_new0 (int, max_line_length + 1);
1065 line_end_hits = g_new0 (int, max_line_length + 1);
1067 while (list) {
1068 AutoDiscovery_t *disc = list->data;
1070 line_begin_hits[disc->start]++;
1071 line_end_hits[disc->stop]++;
1073 g_free (disc);
1075 list = g_slist_next (list);
1077 g_slist_free (list_start);
1079 for (i = 0; i < max_line_length + 1; i++)
1080 if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
1081 stf_parse_options_fixed_splitpositions_add (parseoptions, i);
1084 * Do some corrections to the initial columns
1085 * detected here, we obviously don't need to
1086 * do this if there are no columns at all.
1088 if (my_garray_len (parseoptions->splitpositions) > 0) {
1090 * Try to find columns that look like:
1092 * Example 100
1093 * Example2 9
1095 * (In other words : Columns with left & right justification with
1096 * a minimum of 2 spaces in the middle)
1097 * Split these columns in 2
1100 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1101 int begin = g_array_index (parseoptions->splitpositions, int, i);
1102 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1103 int num_spaces = -1;
1104 int spaces_start = 0;
1105 gboolean right_aligned = TRUE;
1106 gboolean left_aligned = TRUE;
1107 gboolean has_2_spaces = TRUE;
1109 iterator = data;
1110 lines = 0;
1111 while (*iterator && iterator < data_end) {
1112 gboolean trigger = FALSE;
1113 gboolean space_trigger = FALSE;
1114 int pos = 0;
1116 num_spaces = -1;
1117 spaces_start = 0;
1118 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1119 if (pos == begin) {
1120 if (*iterator == ' ')
1121 left_aligned = FALSE;
1123 trigger = TRUE;
1124 } else if (pos == end - 1) {
1125 if (*iterator == ' ')
1126 right_aligned = FALSE;
1128 trigger = FALSE;
1131 if (trigger || pos == end - 1) {
1132 if (!space_trigger && *iterator == ' ') {
1133 space_trigger = TRUE;
1134 spaces_start = pos;
1135 } else if (space_trigger && *iterator != ' ') {
1136 space_trigger = FALSE;
1137 num_spaces = pos - spaces_start;
1141 iterator++;
1142 pos++;
1145 if (num_spaces < 2)
1146 has_2_spaces = FALSE;
1148 if (*iterator)
1149 iterator++;
1151 lines++;
1155 * If this column meets all the criteria
1156 * split it into two at the last measured
1157 * spaces_start + num_spaces
1159 if (has_2_spaces && right_aligned && left_aligned) {
1160 int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
1162 g_array_insert_val (parseoptions->splitpositions, i + 1, val);
1165 * Skip over the inserted column
1167 i++;
1172 * Remove empty columns here if needed
1174 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1175 int begin = g_array_index (parseoptions->splitpositions, int, i);
1176 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1177 gboolean only_spaces = TRUE;
1179 iterator = data;
1180 lines = 0;
1181 while (*iterator && iterator < data_end) {
1182 gboolean trigger = FALSE;
1183 int pos = 0;
1185 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1186 if (pos == begin)
1187 trigger = TRUE;
1188 else if (pos == end)
1189 trigger = FALSE;
1191 if (trigger) {
1192 if (*iterator != ' ')
1193 only_spaces = FALSE;
1196 iterator++;
1197 pos++;
1200 if (*iterator)
1201 iterator++;
1203 lines++;
1207 * The column only contains spaces
1208 * remove it
1210 if (only_spaces) {
1211 g_array_remove_index (parseoptions->splitpositions, i);
1214 * We HAVE to make sure that the next column (end) also
1215 * gets checked out. If we don't decrease "i" here, we
1216 * will skip over it as the indexes shift down after
1217 * the removal
1219 i--;
1224 g_free (line_begin_hits);
1225 g_free (line_end_hits);
1228 /*******************************************************************************************************
1229 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1230 * functions into something meaningful (== application specific)
1231 *******************************************************************************************************/
1234 * This is more or less as gnm_cell_set_text, except...
1235 * 1. Unknown names are not allowed.
1236 * 2. Only '=' can start an expression.
1239 static void
1240 stf_cell_set_text (GnmCell *cell, char const *text)
1242 GnmExprTop const *texpr;
1243 GnmValue *val;
1244 GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
1245 const GODateConventions *date_conv = sheet_date_conv (cell->base.sheet);
1247 if (!go_format_is_text (fmt) && *text == '=' && text[1] != 0) {
1248 GnmExprParseFlags flags =
1249 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
1250 const char *expr_start = text + 1;
1251 GnmParsePos pos;
1252 val = NULL;
1253 parse_pos_init_cell (&pos, cell);
1254 texpr = gnm_expr_parse_str (expr_start, &pos, flags,
1255 NULL, NULL);
1256 } else {
1257 texpr = NULL;
1258 val = format_match (text, fmt, date_conv);
1261 if (!val && !texpr)
1262 val = value_new_string (text);
1264 if (val)
1265 gnm_cell_set_value (cell, val);
1266 else {
1267 gnm_cell_set_expr (cell, texpr);
1268 gnm_expr_top_unref (texpr);
1272 static void
1273 stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
1275 if (po->parsetype == PARSE_TYPE_CSV) {
1276 GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
1277 char quote[6];
1278 int length = g_unichar_to_utf8 (po->stringindicator, quote);
1279 if (length > 5) {
1280 quote[0] = '"';
1281 quote[1] = '\0';
1282 } else quote[length] = '\0';
1284 g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);
1286 if ((po->terminator != NULL) && (po->terminator->data != NULL))
1287 g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
1291 gboolean
1292 stf_parse_sheet (StfParseOptions_t *parseoptions,
1293 char const *data, char const *data_end,
1294 Sheet *sheet, int start_col, int start_row)
1296 int row;
1297 unsigned int lrow;
1298 GStringChunk *lines_chunk;
1299 GPtrArray *lines;
1300 gboolean result = TRUE;
1301 int col;
1302 unsigned int lcol;
1303 size_t nformats;
1305 SETUP_LOCALE_SWITCH;
1307 g_return_val_if_fail (parseoptions != NULL, FALSE);
1308 g_return_val_if_fail (data != NULL, FALSE);
1309 g_return_val_if_fail (IS_SHEET (sheet), FALSE);
1311 if (!data_end)
1312 data_end = data + strlen (data);
1314 lines_chunk = g_string_chunk_new (100 * 1024);
1315 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1316 if (lines == NULL)
1317 result = FALSE;
1319 col = start_col;
1320 nformats = parseoptions->formats->len;
1321 for (lcol = 0; lcol < nformats; lcol++) {
1322 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1323 GnmStyle *mstyle;
1324 gboolean want_col =
1325 (parseoptions->col_import_array == NULL ||
1326 parseoptions->col_import_array_len <= lcol ||
1327 parseoptions->col_import_array[lcol]);
1328 if (!want_col || col >= gnm_sheet_get_max_cols (sheet))
1329 continue;
1331 if (fmt && !go_format_is_general (fmt)) {
1332 GnmRange r;
1333 int end_row = MIN (start_row + (int)lines->len - 1,
1334 gnm_sheet_get_last_row (sheet));
1336 range_init (&r, col, start_row, col, end_row);
1337 mstyle = gnm_style_new ();
1338 gnm_style_set_format (mstyle, fmt);
1339 sheet_apply_style (sheet, &r, mstyle);
1341 col++;
1344 START_LOCALE_SWITCH;
1345 for (row = start_row, lrow = 0;
1346 result && lrow < lines->len;
1347 row++, lrow++) {
1348 GPtrArray *line;
1350 if (row >= gnm_sheet_get_max_rows (sheet)) {
1351 if (!parseoptions->rows_exceeded) {
1352 /* FIXME: What locale? */
1353 g_warning (_("There are more rows of data than "
1354 "there is room for in the sheet. Extra "
1355 "rows will be ignored."));
1356 parseoptions->rows_exceeded = TRUE;
1358 break;
1361 col = start_col;
1362 line = g_ptr_array_index (lines, lrow);
1364 for (lcol = 0; lcol < line->len; lcol++) {
1365 GOFormat const *fmt = lcol < nformats
1366 ? g_ptr_array_index (parseoptions->formats, lcol)
1367 : go_format_general ();
1368 char const *text = g_ptr_array_index (line, lcol);
1369 gboolean want_col =
1370 (parseoptions->col_import_array == NULL ||
1371 parseoptions->col_import_array_len <= lcol ||
1372 parseoptions->col_import_array[lcol]);
1373 if (!want_col)
1374 continue;
1376 if (col >= gnm_sheet_get_max_cols (sheet)) {
1377 if (!parseoptions->cols_exceeded) {
1378 /* FIXME: What locale? */
1379 g_warning (_("There are more columns of data than "
1380 "there is room for in the sheet. Extra "
1381 "columns will be ignored."));
1382 parseoptions->cols_exceeded = TRUE;
1384 break;
1386 if (text && *text) {
1387 GnmCell *cell = sheet_cell_fetch (sheet, col, row);
1388 if (!go_format_is_text (fmt) &&
1389 lcol < parseoptions->formats_decimal->len &&
1390 g_ptr_array_index (parseoptions->formats_decimal, lcol)) {
1391 GOFormatFamily fam;
1392 GnmValue *v = format_match_decimal_number_with_locale
1393 (text, &fam,
1394 g_ptr_array_index (parseoptions->formats_curr, lcol),
1395 g_ptr_array_index (parseoptions->formats_thousand, lcol),
1396 g_ptr_array_index (parseoptions->formats_decimal, lcol));
1397 if (!v)
1398 v = value_new_string (text);
1399 sheet_cell_set_value (cell, v);
1400 } else {
1402 stf_cell_set_text (cell, text);
1405 col++;
1408 g_ptr_array_index (lines, lrow) = NULL;
1409 g_ptr_array_free (line, TRUE);
1411 END_LOCALE_SWITCH;
1413 for (lcol = 0, col = start_col;
1414 lcol < parseoptions->col_import_array_len && col < gnm_sheet_get_max_cols (sheet);
1415 lcol++) {
1416 if (parseoptions->col_import_array == NULL ||
1417 parseoptions->col_import_array_len <= lcol ||
1418 parseoptions->col_import_array[lcol]) {
1419 if (parseoptions->col_autofit_array == NULL ||
1420 parseoptions->col_autofit_array[lcol]) {
1421 ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
1422 ColRowStateGroup *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
1423 colrow_index_list_destroy (list);
1424 g_slist_free (state);
1426 col++;
1430 g_string_chunk_free (lines_chunk);
1431 if (lines)
1432 stf_parse_general_free (lines);
1433 if (result)
1434 stf_read_remember_settings (sheet->workbook, parseoptions);
1435 return result;
1438 GnmCellRegion *
1439 stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
1440 Workbook const *wb)
1442 static GODateConventions const default_conv = {FALSE};
1443 GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;
1445 GnmCellRegion *cr;
1446 unsigned int row, colhigh = 0;
1447 GStringChunk *lines_chunk;
1448 GPtrArray *lines;
1449 size_t nformats;
1451 SETUP_LOCALE_SWITCH;
1453 g_return_val_if_fail (parseoptions != NULL, NULL);
1454 g_return_val_if_fail (data != NULL, NULL);
1456 START_LOCALE_SWITCH;
1458 cr = gnm_cell_region_new (NULL);
1460 if (!data_end)
1461 data_end = data + strlen (data);
1462 lines_chunk = g_string_chunk_new (100 * 1024);
1463 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1464 nformats = parseoptions->formats->len;
1465 for (row = 0; row < lines->len; row++) {
1466 GPtrArray *line = g_ptr_array_index (lines, row);
1467 unsigned int col, targetcol = 0;
1468 for (col = 0; col < line->len; col++) {
1469 if (parseoptions->col_import_array == NULL ||
1470 parseoptions->col_import_array_len <= col ||
1471 parseoptions->col_import_array[col]) {
1472 const char *text = g_ptr_array_index (line, col);
1473 if (text) {
1474 GOFormat *fmt = NULL;
1475 GnmValue *v;
1476 GnmCellCopy *cc;
1478 if (col < nformats)
1479 fmt = g_ptr_array_index (parseoptions->formats, col);
1480 v = format_match (text, fmt, date_conv);
1481 if (!v)
1482 v = value_new_string (text);
1484 cc = gnm_cell_copy_new (cr, targetcol, row);
1485 cc->val = v;
1486 cc->texpr = NULL;
1487 targetcol++;
1488 if (targetcol > colhigh)
1489 colhigh = targetcol;
1494 stf_parse_general_free (lines);
1495 g_string_chunk_free (lines_chunk);
1497 END_LOCALE_SWITCH;
1499 cr->cols = (colhigh > 0) ? colhigh : 1;
1500 cr->rows = row;
1502 return cr;
/*
 * qsort comparison function for arrays of int, ascending order.
 *
 * Returns a negative, zero or positive value when *a is less than, equal
 * to or greater than *b.  The values are compared rather than subtracted
 * so that operands far apart cannot overflow.
 */
static int
int_sort (void const *a, void const *b)
{
	int const x = *(int const *)a;
	int const y = *(int const *)b;

	return (x > y) - (x < y);
}
1511 static int
1512 count_character (GPtrArray *lines, gunichar c, double quantile)
1514 int *counts, res;
1515 unsigned int lno, cno;
1517 if (lines->len == 0)
1518 return 0;
1520 counts = g_new (int, lines->len);
1521 for (lno = cno = 0; lno < lines->len; lno++) {
1522 int count = 0;
1523 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1524 char const *line = g_ptr_array_index (boxline, 0);
1526 /* Ignore empty lines. */
1527 if (*line == 0)
1528 continue;
1530 while (*line) {
1531 if (g_utf8_get_char (line) == c)
1532 count++;
1533 line = g_utf8_next_char (line);
1536 counts[cno++] = count;
1539 if (cno == 0)
1540 res = 0;
1541 else {
1542 unsigned int qi = (unsigned int)ceil (quantile * cno);
1543 qsort (counts, cno, sizeof (counts[0]), int_sort);
1544 if (qi == cno)
1545 qi--;
1546 res = counts[qi];
1549 g_free (counts);
1551 return res;
1554 static void
1555 dump_guessed_options (const StfParseOptions_t *res)
1557 GSList *l;
1558 char ubuffer[6 + 1];
1559 unsigned ui;
1561 g_printerr ("Guessed format:\n");
1562 switch (res->parsetype) {
1563 case PARSE_TYPE_CSV:
1564 g_printerr (" type = sep\n");
1565 g_printerr (" separator = %s\n",
1566 res->sep.chr ? res->sep.chr : "(none)");
1567 g_printerr (" see two as one = %s\n",
1568 res->sep.duplicates ? "yes" : "no");
1569 break;
1570 case PARSE_TYPE_FIXED:
1571 g_printerr (" type = sep\n");
1572 break;
1573 default:
1576 g_printerr (" trim space = %d\n", res->trim_spaces);
1578 ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
1579 g_printerr (" string indicator = %s\n", ubuffer);
1580 g_printerr (" see two as one = %s\n",
1581 res->indicator_2x_is_single ? "yes" : "no");
1583 g_printerr (" line terminators =");
1584 for (l = res->terminator; l; l = l->next) {
1585 const char *t = l->data;
1586 if (strcmp (t, "\n") == 0)
1587 g_printerr (" unix");
1588 else if (strcmp (t, "\r") == 0)
1589 g_printerr (" mac");
1590 else if (strcmp (t, "\r\n") == 0)
1591 g_printerr (" dos");
1592 else
1593 g_printerr (" other");
1595 g_printerr ("\n");
1597 for (ui = 0; ui < res->formats->len; ui++) {
1598 GOFormat const *fmt = g_ptr_array_index (res->formats, ui);
1599 const GString *decimal = ui < res->formats_decimal->len
1600 ? g_ptr_array_index (res->formats_decimal, ui)
1601 : NULL;
1602 const GString *thousand = ui < res->formats_thousand->len
1603 ? g_ptr_array_index (res->formats_thousand, ui)
1604 : NULL;
1606 g_printerr (" fmt.%d = %s\n", ui, go_format_as_XL (fmt));
1607 if (decimal)
1608 g_printerr (" fmt.%d.dec = %s\n", ui, decimal->str);
1609 if (thousand)
1610 g_printerr (" fmt.%d.thou = %s\n", ui, thousand->str);
1615 * stf_parse_options_guess:
1616 * @data: the input data.
1618 * Returns: (transfer full): the guessed options.
1620 StfParseOptions_t *
1621 stf_parse_options_guess (char const *data)
1623 StfParseOptions_t *res;
1624 GStringChunk *lines_chunk;
1625 GPtrArray *lines;
1626 int tabcount;
1627 int sepcount;
1628 gunichar sepchar = go_locale_get_arg_sep ();
1630 g_return_val_if_fail (data != NULL, NULL);
1632 res = stf_parse_options_new ();
1633 lines_chunk = g_string_chunk_new (100 * 1024);
1634 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1636 tabcount = count_character (lines, '\t', 0.2);
1637 sepcount = count_character (lines, sepchar, 0.2);
1639 /* At least one tab per line and enough to separate every
1640 would-be sepchars. */
1641 if (tabcount >= 1 && tabcount >= sepcount - 1)
1642 stf_parse_options_csv_set_separators (res, "\t", NULL);
1643 else {
1644 gunichar c;
1647 * Try a few more or less likely characters and pick the first
1648 * one that occurs on at least half the lines.
1650 * The order is mostly random, although ' ' and '!' which
1651 * could very easily occur in text are put last.
1653 if (count_character (lines, (c = sepchar), 0.5) > 0 ||
1654 count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
1655 count_character (lines, (c = ':'), 0.5) > 0 ||
1656 count_character (lines, (c = ','), 0.5) > 0 ||
1657 count_character (lines, (c = ';'), 0.5) > 0 ||
1658 count_character (lines, (c = '|'), 0.5) > 0 ||
1659 count_character (lines, (c = '!'), 0.5) > 0 ||
1660 count_character (lines, (c = ' '), 0.5) > 0) {
1661 char sep[7];
1662 sep[g_unichar_to_utf8 (c, sep)] = 0;
1663 if (c == ' ')
1664 strcat (sep, "\t");
1665 stf_parse_options_csv_set_separators (res, sep, NULL);
1669 // For now, always separated:
1670 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1672 switch (res->parsetype) {
1673 case PARSE_TYPE_CSV: {
1674 gboolean dups =
1675 res->sep.chr &&
1676 strchr (res->sep.chr, ' ') != NULL;
1677 gboolean trim =
1678 res->sep.chr &&
1679 strchr (res->sep.chr, ' ') != NULL;
1681 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1682 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1683 stf_parse_options_csv_set_duplicates (res, dups);
1684 stf_parse_options_csv_set_trim_seps (res, trim);
1686 stf_parse_options_csv_set_stringindicator (res, '"');
1687 break;
1690 case PARSE_TYPE_FIXED:
1691 break;
1693 default:
1694 g_assert_not_reached ();
1697 stf_parse_general_free (lines);
1698 g_string_chunk_free (lines_chunk);
1700 stf_parse_options_guess_formats (res, data);
1702 if (gnm_debug_flag ("stf"))
1703 dump_guessed_options (res);
1705 return res;
1709 * stf_parse_options_guess_csv:
1710 * @data: the CSV input data.
1712 * Returns: (transfer full): the guessed options.
1714 StfParseOptions_t *
1715 stf_parse_options_guess_csv (char const *data)
1717 StfParseOptions_t *res;
1718 GStringChunk *lines_chunk;
1719 GPtrArray *lines;
1720 char *sep = NULL;
1721 char const *quoteline = NULL;
1722 int pass;
1723 gunichar stringind = '"';
1725 g_return_val_if_fail (data != NULL, NULL);
1727 res = stf_parse_options_new ();
1728 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1729 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1730 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1731 stf_parse_options_csv_set_duplicates (res, FALSE);
1732 stf_parse_options_csv_set_trim_seps (res, FALSE);
1733 stf_parse_options_csv_set_stringindicator (res, stringind);
1735 lines_chunk = g_string_chunk_new (100 * 1024);
1736 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1739 * Find a line containing a quote; skip first line unless it is
1740 * the only one. Prefer a line with the quote first.
1742 for (pass = 1; !quoteline && pass <= 2; pass++) {
1743 size_t lno;
1744 for (lno = MIN (1, lines->len - 1);
1745 !quoteline && lno < lines->len;
1746 lno++) {
1747 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1748 const char *line = g_ptr_array_index (boxline, 0);
1749 switch (pass) {
1750 case 1:
1751 if (g_utf8_get_char (line) == stringind)
1752 quoteline = line;
1753 break;
1754 case 2:
1755 if (my_utf8_strchr (line, stringind))
1756 quoteline = line;
1757 break;
1762 if (quoteline) {
1763 const char *p0 = my_utf8_strchr (quoteline, stringind);
1764 const char *p = p0;
1766 do {
1767 p = g_utf8_next_char (p);
1768 } while (*p && g_utf8_get_char (p) != stringind);
1769 if (*p) p = g_utf8_next_char (p);
1770 while (*p && g_unichar_isspace (g_utf8_get_char (p)))
1771 p = g_utf8_next_char (p);
1772 if (*p) {
1773 /* Use the character after the quote. */
1774 sep = g_strndup (p, g_utf8_next_char (p) - p);
1775 } else {
1776 /* Try to use character before the quote. */
1777 while (p0 > quoteline && !sep) {
1778 p = p0;
1779 p0 = g_utf8_prev_char (p0);
1780 if (!g_unichar_isspace (g_utf8_get_char (p0)))
1781 sep = g_strndup (p0, p - p0);
1786 if (!sep)
1787 sep = g_strdup (",");
1788 stf_parse_options_csv_set_separators (res, sep, NULL);
1789 g_free (sep);
1791 stf_parse_general_free (lines);
1792 g_string_chunk_free (lines_chunk);
1794 stf_parse_options_guess_formats (res, data);
1796 if (gnm_debug_flag ("stf"))
1797 dump_guessed_options (res);
1799 return res;
/*
 * Bit flags for the interpretations that are still possible for a
 * column while guessing its format.
 */
typedef enum {
	/* Date field orders. */
	STF_GUESS_DATE_DMY = 0x01,
	STF_GUESS_DATE_MDY = 0x02,
	STF_GUESS_DATE_YMD = 0x04,

	/* Numbers, by decimal separator. */
	STF_GUESS_NUMBER_DEC_POINT = 0x10,
	STF_GUESS_NUMBER_DEC_COMMA = 0x20,
	STF_GUESS_NUMBER_DEC_EITHER =
		STF_GUESS_NUMBER_DEC_POINT | STF_GUESS_NUMBER_DEC_COMMA,

	/* Every flag above. */
	STF_GUESS_ALL =
		STF_GUESS_DATE_DMY | STF_GUESS_DATE_MDY | STF_GUESS_DATE_YMD |
		STF_GUESS_NUMBER_DEC_EITHER
} StfGuessFormats;
1814 static void
1815 do_check_date (const char *data, StfGuessFormats flag,
1816 gboolean mbd, gboolean ybm,
1817 unsigned *possible,
1818 GODateConventions const *date_conv)
1820 GnmValue *v;
1821 gboolean this_mbd, this_ybm;
1822 int imbd;
1824 if (!(*possible & flag))
1825 return;
1827 v = format_match_datetime (data, date_conv, mbd, TRUE, FALSE);
1828 if (!v || !VALUE_FMT (v))
1829 goto fail;
1831 imbd = go_format_month_before_day (VALUE_FMT (v));
1832 this_mbd = (imbd >= 1);
1833 this_ybm = (imbd == 2);
1834 if (mbd != this_mbd || ybm != this_ybm)
1835 goto fail;
1837 goto done;
1839 fail:
1840 *possible &= ~flag;
1841 done:
1842 value_release (v);
1846 static void
1847 do_check_number (const char *data, StfGuessFormats flag,
1848 const GString *dec, const GString *thousand, const GString *curr,
1849 unsigned *possible, int *decimals)
1851 GnmValue *v;
1852 GOFormatFamily family;
1853 const char *pthou;
1855 if (!(*possible & flag))
1856 return;
1858 v = format_match_decimal_number_with_locale (data, &family, curr, thousand, dec);
1859 if (!v)
1860 goto fail;
1862 if (*decimals != -2) {
1863 const char *pdec = strstr (data, dec->str);
1864 int this_decimals = 0;
1865 if (pdec) {
1866 pdec += dec->len;
1867 while (g_ascii_isdigit (*pdec)) {
1868 pdec++;
1869 this_decimals++;
1872 if (*decimals == -1)
1873 *decimals = this_decimals;
1874 else if (*decimals != this_decimals)
1875 *decimals = -2;
1878 pthou = strstr (data, thousand->str);
1879 if (pthou) {
1880 const char *p;
1881 int digits = 0, nonzero_digits = 0;
1882 for (p = data; p < pthou; p = g_utf8_next_char (p)) {
1883 if (g_unichar_isdigit (g_utf8_get_char (p))) {
1884 digits++;
1885 if (*p != '0')
1886 nonzero_digits++;
1889 // "-.222" implies that "." is not a thousands separator.
1890 // "0.222" implies that "." is not a thousands separator.
1891 // "12345,555" implies that "," is not a thousands separator.
1892 if (nonzero_digits == 0 || digits > 3)
1893 goto fail;
1896 goto done;
1898 fail:
1899 *possible &= ~flag;
1900 done:
1901 value_release (v);
1906 * stf_parse_options_guess_formats:
1907 * @data: the CSV input data.
1909 * This function attempts to recognize data formats on a column-by-column
1910 * basis under the assumption that the data in a text file will generally
1911 * use the same data formats.
1913 * This is useful because not all values give sufficient information by
1914 * themselves to tell what format the data is in. For example, "1/2/2000"
1915 * is likely to be a date in year 2000, but it is not clear if it is in
1916 * January or February. If another value in the same column is "31/1/1999"
1917 * then it is likely that the former date was in February.
1919 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1920 * A later value of "111,200.22" would clear up the confusion.
1923 void
1924 stf_parse_options_guess_formats (StfParseOptions_t *po, char const *data)
1926 GStringChunk *lines_chunk;
1927 GPtrArray *lines;
1928 unsigned lno, col, colcount, sline;
1929 GODateConventions const *date_conv = go_date_conv_from_str ("Lotus:1900");
1930 GString *s_comma = g_string_new (",");
1931 GString *s_dot = g_string_new (".");
1932 GString *s_dollar = g_string_new ("$");
1933 gboolean debug = gnm_debug_flag ("stf");
1935 g_ptr_array_set_size (po->formats, 0);
1936 g_ptr_array_set_size (po->formats_decimal, 0);
1937 g_ptr_array_set_size (po->formats_thousand, 0);
1938 g_ptr_array_set_size (po->formats_curr, 0);
1940 lines_chunk = g_string_chunk_new (100 * 1024);
1941 lines = stf_parse_general (po, lines_chunk, data, data + strlen (data));
1943 colcount = 0;
1944 for (lno = 0; lno < lines->len; lno++) {
1945 GPtrArray *line = g_ptr_array_index (lines, lno);
1946 colcount = MAX (colcount, line->len);
1949 // Ignore first line unless it is the only one
1950 sline = MIN ((int)lines->len - 1, 1);
1952 g_ptr_array_set_size (po->formats, colcount);
1953 g_ptr_array_set_size (po->formats_decimal, colcount);
1954 g_ptr_array_set_size (po->formats_thousand, colcount);
1955 g_ptr_array_set_size (po->formats_curr, colcount);
1956 for (col = 0; col < colcount; col++) {
1957 unsigned possible = STF_GUESS_ALL;
1958 GOFormat *fmt = NULL;
1959 gboolean seen_dot = FALSE;
1960 gboolean seen_comma = FALSE;
1961 int decimals_if_point = -1; // -1: unset; -2: inconsistent; >=0: count
1962 int decimals_if_comma = -1; // -1: unset; -2: inconsistent; >=0: count
1964 for (lno = sline; possible && lno < lines->len; lno++) {
1965 GPtrArray *line = g_ptr_array_index (lines, lno);
1966 const char *data = col < line->len ? g_ptr_array_index (line, col) : "";
1967 unsigned prev_possible = possible;
1969 if (*data == 0 || data[0] == '\'')
1970 continue;
1972 do_check_date (data, STF_GUESS_DATE_DMY, FALSE, FALSE, &possible, date_conv);
1973 do_check_date (data, STF_GUESS_DATE_MDY, TRUE, FALSE, &possible, date_conv);
1974 do_check_date (data, STF_GUESS_DATE_YMD, TRUE, TRUE, &possible, date_conv);
1976 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER) {
1977 const char *pdot = strstr (data, s_dot->str);
1978 const char *pcomma = strstr (data, s_comma->str);
1979 if (pdot && pcomma) {
1980 // Both -- last one is the decimal separator
1981 if (pdot > pcomma)
1982 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1983 else
1984 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1985 } else if (pdot && strstr (pdot + s_dot->len, s_dot->str)) {
1986 // Two dots so they are thousands separators
1987 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1988 } else if (pcomma && strstr (pcomma + s_comma->len, s_comma->str)) {
1989 // Two commas so they are thousands separators
1990 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1993 seen_dot = seen_dot || (pdot != 0);
1994 seen_comma = seen_comma || (pcomma != 0);
1996 do_check_number (data, STF_GUESS_NUMBER_DEC_POINT,
1997 s_dot, s_comma, s_dollar,
1998 &possible, &decimals_if_point);
1999 do_check_number (data, STF_GUESS_NUMBER_DEC_COMMA,
2000 s_comma, s_dot, s_dollar,
2001 &possible, &decimals_if_comma);
2003 if (possible != prev_possible && debug)
2004 g_printerr ("col=%d; after [%s] possible=0x%x\n", col, data, possible);
2007 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER &&
2008 !seen_dot && !seen_comma) {
2009 // It doesn't matter what the separators are
2010 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
2013 switch (possible) {
2014 case STF_GUESS_DATE_DMY:
2015 fmt = go_format_new_from_XL ("d-mmm-yyyy");
2016 break;
2017 case STF_GUESS_DATE_MDY:
2018 fmt = go_format_new_from_XL ("m/d/yyyy");
2019 break;
2020 case STF_GUESS_DATE_YMD:
2021 fmt = go_format_new_from_XL ("yyyy-mm-dd");
2022 break;
2023 case STF_GUESS_NUMBER_DEC_POINT:
2024 g_ptr_array_index (po->formats_decimal, col) = g_string_new (".");
2025 g_ptr_array_index (po->formats_thousand, col) = g_string_new (",");
2026 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2027 if (decimals_if_point > 0) {
2028 // Don't set format if decimals is zero
2029 GString *fmt_str = g_string_new (NULL);
2030 go_format_generate_number_str (fmt_str, 1, decimals_if_point, seen_comma, FALSE, FALSE, "", "");
2031 fmt = go_format_new_from_XL (fmt_str->str);
2032 g_string_free (fmt_str, TRUE);
2034 break;
2035 case STF_GUESS_NUMBER_DEC_COMMA:
2036 g_ptr_array_index (po->formats_decimal, col) = g_string_new (",");
2037 g_ptr_array_index (po->formats_thousand, col) = g_string_new (".");
2038 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2039 if (decimals_if_comma > 0) {
2040 // Don't set format if decimals is zero
2041 GString *fmt_str = g_string_new (NULL);
2042 go_format_generate_number_str (fmt_str, 1, decimals_if_comma, seen_dot, FALSE, FALSE, "", "");
2043 fmt = go_format_new_from_XL (fmt_str->str);
2044 g_string_free (fmt_str, TRUE);
2046 break;
2047 default:
2048 break;
2051 if (!fmt)
2052 fmt = go_format_ref (go_format_general ());
2053 g_ptr_array_index (po->formats, col) = fmt;
2056 stf_parse_general_free (lines);
2057 g_string_chunk_free (lines_chunk);
2059 g_string_free (s_dot, TRUE);
2060 g_string_free (s_comma, TRUE);
2061 g_string_free (s_dollar, TRUE);