GETENV: check for proper UTF-8.
[gnumeric.git] / src / stf-parse.c
blob44c13392c31f92af027dee3a9c0a213bfb049bfb
/* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 * stf-parse.c : Structured Text Format parser. (STF)
 *               A general purpose engine for parsing data
 *               in CSV and Fixed width format.
 *
 * Copyright (C) Almer. S. Tigelaar.
 * EMail: almer1@dds.nl or almer-t@bigfoot.com
 *
 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses/>.
 */
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
30 #include "gnumeric.h"
31 #include "stf-parse.h"
32 #include "stf-export.h"
34 #include "workbook.h"
35 #include "cell.h"
36 #include "sheet.h"
37 #include "expr.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
40 #include "value.h"
41 #include "mstyle.h"
42 #include "number-match.h"
43 #include "gutils.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
47 #include "ranges.h"
48 #include <goffice/goffice.h>
50 #include <stdlib.h>
51 #include <locale.h>
52 #include <string.h>
/*
 * Helpers for temporarily running under the locale named in the parse
 * options.  START_LOCALE_SWITCH expects a variable called `parseoptions'
 * to be in scope; START and END both expect the `oldlocale' variable
 * that SETUP_LOCALE_SWITCH declares.
 */

/* Declare the variable that remembers which locale to restore. */
#define SETUP_LOCALE_SWITCH	char *oldlocale = NULL

/*
 * If the options name a locale, remember the result of
 * go_setlocale (LC_ALL, NULL) and switch to the requested locale.
 */
#define START_LOCALE_SWITCH						\
	if (parseoptions->locale) {					\
		oldlocale = g_strdup (go_setlocale (LC_ALL, NULL));	\
		go_setlocale (LC_ALL, parseoptions->locale);		\
	}

/* Switch back to the remembered locale, if there is one, and free it. */
#define END_LOCALE_SWITCH				\
	if (oldlocale) {				\
		go_setlocale (LC_ALL, oldlocale);	\
		g_free (oldlocale);			\
	}
64 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
65 typedef struct {
66 GStringChunk *chunk;
67 char const *position; /* Indicates the current position within data */
69 /* Used internally for fixed width parsing */
70 int splitpos; /* Indicates current position in splitpositions array */
71 int linepos; /* Position on the current line */
72 } Source_t;
/*
 * AutoDiscovery_t: one run of spaces recorded during fixed-width
 * column autodiscovery.
 */
typedef struct {
	int start;	/* Position of the first space of the run */
	int stop;	/* Position of the first non-space after the run */
} AutoDiscovery_t;
81 * Some silly dude make the length field an unsigned int. C just does
82 * not deal very well with that.
84 static inline int
85 my_garray_len (GArray const *a)
87 return (int)a->len;
90 static char *
91 my_utf8_strchr (const char *p, gunichar uc)
93 return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
96 static int
97 compare_terminator (char const *s, StfParseOptions_t *parseoptions)
99 guchar const *us = (guchar const *)s;
100 GSList *l;
102 if (*us > parseoptions->compiled_terminator.max ||
103 *us < parseoptions->compiled_terminator.min)
104 return 0;
106 for (l = parseoptions->terminator; l; l = l->next) {
107 char const *term = l->data;
108 char const *d = s;
110 while (*term) {
111 if (*d != *term)
112 goto next;
113 term++;
114 d++;
116 return d - s;
118 next:
121 return 0;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
129 static void
130 gnm_g_string_free (GString *s)
132 if (s) g_string_free (s, TRUE);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * The struct should, after being used, freed with stf_parse_options_free.
142 static StfParseOptions_t *
143 stf_parse_options_new (void)
145 StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
147 parseoptions->parsetype = PARSE_TYPE_NOTSET;
149 parseoptions->terminator = NULL;
150 stf_parse_options_add_line_terminator (parseoptions, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions, "\n");
152 stf_parse_options_add_line_terminator (parseoptions, "\r");
154 parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
155 parseoptions->locale = NULL;
157 parseoptions->splitpositions = NULL;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions);
160 parseoptions->stringindicator = '"';
161 parseoptions->indicator_2x_is_single = TRUE;
162 parseoptions->sep.duplicates = FALSE;
163 parseoptions->trim_seps = FALSE;
165 parseoptions->sep.str = NULL;
166 parseoptions->sep.chr = NULL;
168 parseoptions->col_autofit_array = NULL;
169 parseoptions->col_import_array = NULL;
170 parseoptions->col_import_array_len = 0;
171 parseoptions->formats = g_ptr_array_new_with_free_func ((GDestroyNotify)go_format_unref);
172 parseoptions->formats_decimal = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
173 parseoptions->formats_thousand = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
174 parseoptions->formats_curr = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
176 parseoptions->cols_exceeded = FALSE;
177 parseoptions->rows_exceeded = FALSE;
178 parseoptions->ref_count = 1;
180 return parseoptions;
184 * stf_parse_options_free:
186 * will free @parseoptions, note that this will not free the splitpositions
187 * member (GArray) of the struct, the caller is responsible for that.
189 void
190 stf_parse_options_free (StfParseOptions_t *parseoptions)
192 g_return_if_fail (parseoptions != NULL);
194 if (parseoptions->ref_count-- > 1)
195 return;
197 g_free (parseoptions->col_import_array);
198 g_free (parseoptions->col_autofit_array);
199 g_free (parseoptions->locale);
200 g_free (parseoptions->sep.chr);
202 if (parseoptions->sep.str) {
203 GSList *l;
205 for (l = parseoptions->sep.str; l != NULL; l = l->next)
206 g_free ((char *) l->data);
207 g_slist_free (parseoptions->sep.str);
210 g_array_free (parseoptions->splitpositions, TRUE);
212 stf_parse_options_clear_line_terminator (parseoptions);
214 g_ptr_array_free (parseoptions->formats, TRUE);
215 g_ptr_array_free (parseoptions->formats_decimal, TRUE);
216 g_ptr_array_free (parseoptions->formats_thousand, TRUE);
217 g_ptr_array_free (parseoptions->formats_curr, TRUE);
219 g_free (parseoptions);
222 static StfParseOptions_t *
223 stf_parse_options_ref (StfParseOptions_t *parseoptions)
225 parseoptions->ref_count++;
226 return parseoptions;
229 GType
230 stf_parse_options_get_type (void)
232 static GType t = 0;
234 if (t == 0) {
235 t = g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc)stf_parse_options_ref,
237 (GBoxedFreeFunc)stf_parse_options_free);
239 return t;
242 void
243 stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
245 g_return_if_fail (parseoptions != NULL);
246 g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);
248 parseoptions->parsetype = parsetype;
251 static gint
252 long_string_first (gchar const *a, gchar const *b)
254 /* This actually is UTF-8 safe. */
255 return strlen (b) - strlen (a);
258 static void
259 compile_terminators (StfParseOptions_t *parseoptions)
261 GSList *l;
262 GO_SLIST_SORT (parseoptions->terminator, (GCompareFunc)long_string_first);
264 parseoptions->compiled_terminator.min = 255;
265 parseoptions->compiled_terminator.max = 0;
266 for (l = parseoptions->terminator; l; l = l->next) {
267 const guchar *term = l->data;
268 parseoptions->compiled_terminator.min =
269 MIN (parseoptions->compiled_terminator.min, *term);
270 parseoptions->compiled_terminator.max =
271 MAX (parseoptions->compiled_terminator.max, *term);
276 * stf_parse_options_add_line_terminator:
278 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
279 * this indicates the end of a row.
282 void
283 stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
285 g_return_if_fail (parseoptions != NULL);
286 g_return_if_fail (terminator != NULL && *terminator != 0);
288 GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
289 compile_terminators (parseoptions);
293 * stf_parse_options_clear_line_terminator:
295 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
296 * this indicates the end of a row.
299 void
300 stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
302 g_return_if_fail (parseoptions != NULL);
304 g_slist_free_full (parseoptions->terminator, g_free);
305 parseoptions->terminator = NULL;
306 compile_terminators (parseoptions);
310 * stf_parse_options_set_trim_spaces:
312 * If enabled will trim spaces in every parsed field on left and/or right
313 * sides.
315 void
316 stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
318 g_return_if_fail (parseoptions != NULL);
320 parseoptions->trim_spaces = trim_spaces;
324 * stf_parse_options_csv_set_separators:
325 * @parseoptions: #StfParseOptions_t
326 * @character:
327 * @string: (element-type char):
329 * A copy is made of the parameters.
331 void
332 stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions, char const *character,
333 GSList const *string)
335 g_return_if_fail (parseoptions != NULL);
337 g_free (parseoptions->sep.chr);
338 parseoptions->sep.chr = g_strdup (character);
340 g_slist_free_full (parseoptions->sep.str, g_free);
341 parseoptions->sep.str = go_slist_map (string, (GOMapFunc)g_strdup);
344 void
345 stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
347 g_return_if_fail (parseoptions != NULL);
349 parseoptions->stringindicator = stringindicator;
353 * stf_parse_options_csv_set_indicator_2x_is_single:
354 * @indic_2x: a boolean value indicating whether we want to see two
355 * adjacent string indicators as a single string indicator
356 * that is part of the cell, rather than a terminator.
358 void
359 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
360 gboolean const indic_2x)
362 g_return_if_fail (parseoptions != NULL);
364 parseoptions->indicator_2x_is_single = indic_2x;
368 * stf_parse_options_csv_set_duplicates:
369 * @parseoptions:
370 * @duplicates: a boolean value indicating whether we want to see two
371 * separators right behind each other as one
373 void
374 stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
376 g_return_if_fail (parseoptions != NULL);
378 parseoptions->sep.duplicates = duplicates;
382 * stf_parse_options_csv_set_trim_seps:
383 * @trim_seps: a boolean value indicating whether we want to ignore
384 * separators at the beginning of lines
386 void
387 stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
389 g_return_if_fail (parseoptions != NULL);
391 parseoptions->trim_seps = trim_seps;
395 * stf_parse_options_fixed_splitpositions_clear:
397 * This will clear the splitpositions (== points on which a line is split)
399 void
400 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
402 int minus_one = -1;
403 g_return_if_fail (parseoptions != NULL);
405 if (parseoptions->splitpositions)
406 g_array_free (parseoptions->splitpositions, TRUE);
407 parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));
409 g_array_append_val (parseoptions->splitpositions, minus_one);
413 * stf_parse_options_fixed_splitpositions_add:
415 * @position will be added to the splitpositions.
417 void
418 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
420 unsigned int ui;
422 g_return_if_fail (parseoptions != NULL);
423 g_return_if_fail (position >= 0);
425 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
426 int here = g_array_index (parseoptions->splitpositions, int, ui);
427 if (position == here)
428 return;
429 if (position < here)
430 break;
433 g_array_insert_val (parseoptions->splitpositions, ui, position);
436 void
437 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
439 unsigned int ui;
441 g_return_if_fail (parseoptions != NULL);
442 g_return_if_fail (position >= 0);
444 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
445 int here = g_array_index (parseoptions->splitpositions, int, ui);
446 if (position == here)
447 g_array_remove_index (parseoptions->splitpositions, ui);
448 if (position <= here)
449 return;
454 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
456 return parseoptions->splitpositions->len;
460 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
462 return g_array_index (parseoptions->splitpositions, int, n);
467 * stf_parse_options_valid:
468 * @parseoptions: an import options struct
470 * Checks if @parseoptions is correctly filled
472 * returns : TRUE if it is correctly filled, FALSE otherwise.
474 static gboolean
475 stf_parse_options_valid (StfParseOptions_t *parseoptions)
477 g_return_val_if_fail (parseoptions != NULL, FALSE);
479 if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
480 if (!parseoptions->splitpositions) {
481 g_warning ("STF: No splitpositions in struct");
482 return FALSE;
486 return TRUE;
489 /*******************************************************************************************************
490 * STF PARSE : The actual routines that do the 'trick'
491 *******************************************************************************************************/
493 static void
494 trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
496 if (!field) return;
498 if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
499 char *s = field;
501 while (g_unichar_isspace (g_utf8_get_char (s)))
502 s = g_utf8_next_char (s);
504 if (s != field)
505 memmove (field, s, 1 + strlen (s));
508 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
509 char *s = field + strlen (field);
511 while (field != s) {
512 s = g_utf8_prev_char (s);
513 if (!g_unichar_isspace (g_utf8_get_char (s)))
514 break;
515 *s = 0;
521 * stf_parse_csv_is_separator:
523 * returns NULL if @character is not a separator, a pointer to the character
524 * after the separator otherwise.
526 static char const *
527 stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
529 g_return_val_if_fail (character != NULL, NULL);
531 if (*character == 0)
532 return NULL;
534 if (str) {
535 GSList const *l;
537 for (l = str; l != NULL; l = l->next) {
538 char const *s = l->data;
539 char const *r;
540 glong cnt;
541 glong const len = g_utf8_strlen (s, -1);
543 /* Don't compare past the end of the buffer! */
544 for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
545 if (*r == '\0')
546 break;
548 if ((cnt == len) && (memcmp (character, s, len) == 0))
549 return g_utf8_offset_to_pointer (character, len);
553 if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
554 return g_utf8_next_char(character);
556 return NULL;
560 * stf_parse_eat_separators:
562 * skip over leading separators
566 static void
567 stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
569 char const *cur, *next;
571 g_return_if_fail (src != NULL);
572 g_return_if_fail (parseoptions != NULL);
574 cur = src->position;
576 if (*cur == '\0' || compare_terminator (cur, parseoptions))
577 return;
578 while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
579 cur = next;
580 src->position = cur;
581 return;
/* Outcome of parsing one CSV cell, as reported by stf_parse_csv_cell. */
typedef enum {
	STF_CELL_ERROR,		/* A precondition check failed */
	STF_CELL_EOF,		/* End of data was reached instead of a field */
	STF_CELL_EOL,		/* A line terminator was found and consumed */
	STF_CELL_FIELD_NO_SEP,	/* A field was read; no separator followed it */
	STF_CELL_FIELD_SEP	/* A field was read and its separator consumed */
} StfParseCellRes;
593 static StfParseCellRes
594 stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
596 char const *cur;
597 gboolean saw_sep = FALSE;
599 g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
600 g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
602 cur = src->position;
603 g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
605 /* Skip whitespace, but stop at line terminators. */
606 while (1) {
607 int term_len;
609 if (*cur == 0) {
610 src->position = cur;
611 return STF_CELL_EOF;
614 term_len = compare_terminator (cur, parseoptions);
615 if (term_len) {
616 src->position = cur + term_len;
617 return STF_CELL_EOL;
620 if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
621 break;
623 if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
624 parseoptions->sep.str))
625 break;
627 if (!g_unichar_isspace (g_utf8_get_char (cur)))
628 break;
629 cur = g_utf8_next_char (cur);
632 if (parseoptions->stringindicator != 0 &&
633 g_utf8_get_char (cur) == parseoptions->stringindicator) {
634 cur = g_utf8_next_char (cur);
635 while (*cur) {
636 gunichar uc = g_utf8_get_char (cur);
637 cur = g_utf8_next_char (cur);
639 if (uc == parseoptions->stringindicator) {
640 if (parseoptions->indicator_2x_is_single &&
641 g_utf8_get_char (cur) == parseoptions->stringindicator)
642 cur = g_utf8_next_char (cur);
643 else {
644 /* "field content"dropped-garbage, */
645 while (*cur && !compare_terminator (cur, parseoptions)) {
646 char const *post = stf_parse_csv_is_separator
647 (cur, parseoptions->sep.chr, parseoptions->sep.str);
648 if (post) {
649 cur = post;
650 saw_sep = TRUE;
651 break;
653 cur = g_utf8_next_char (cur);
655 break;
659 g_string_append_unichar (text, uc);
662 /* We silently allow a missing terminating quote. */
663 } else {
664 /* Unquoted field. */
666 while (*cur && !compare_terminator (cur, parseoptions)) {
668 char const *post = stf_parse_csv_is_separator
669 (cur, parseoptions->sep.chr, parseoptions->sep.str);
670 if (post) {
671 cur = post;
672 saw_sep = TRUE;
673 break;
676 g_string_append_unichar (text, g_utf8_get_char (cur));
677 cur = g_utf8_next_char (cur);
680 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
681 while (text->len) {
682 const char *last = g_utf8_prev_char (text->str + text->len);
683 if (!g_unichar_isspace (g_utf8_get_char (last)))
684 break;
685 g_string_truncate (text, last - text->str);
690 src->position = cur;
692 if (saw_sep && parseoptions->sep.duplicates)
693 stf_parse_eat_separators (src, parseoptions);
695 return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
699 * stf_parse_csv_line:
701 * This will parse one line from the current @src->position.
702 * NOTE: The calling routine is responsible for freeing the result.
704 * returns : a GPtrArray of char*'s
706 static GPtrArray *
707 stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
709 GPtrArray *line;
710 gboolean cont = FALSE;
711 GString *text;
713 g_return_val_if_fail (src != NULL, NULL);
714 g_return_val_if_fail (parseoptions != NULL, NULL);
716 line = g_ptr_array_new ();
717 if (parseoptions->trim_seps)
718 stf_parse_eat_separators (src, parseoptions);
720 text = g_string_sized_new (30);
722 while (1) {
723 char *ctext;
724 StfParseCellRes res =
725 stf_parse_csv_cell (text, src, parseoptions);
726 trim_spaces_inplace (text->str, parseoptions);
727 ctext = g_string_chunk_insert_len (src->chunk,
728 text->str, text->len);
729 g_string_truncate (text, 0);
731 switch (res) {
732 case STF_CELL_FIELD_NO_SEP:
733 g_ptr_array_add (line, ctext);
734 cont = FALSE;
735 break;
737 case STF_CELL_FIELD_SEP:
738 g_ptr_array_add (line, ctext);
739 cont = TRUE; /* Make sure we see one more field. */
740 break;
742 default:
743 if (cont)
744 g_ptr_array_add (line, ctext);
745 g_string_free (text, TRUE);
746 return line;
752 * stf_parse_fixed_cell:
754 * returns a pointer to the parsed cell contents.
756 static char *
757 stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
759 char *res;
760 char const *cur;
761 int splitval;
763 g_return_val_if_fail (src != NULL, NULL);
764 g_return_val_if_fail (parseoptions != NULL, NULL);
766 cur = src->position;
768 if (src->splitpos < my_garray_len (parseoptions->splitpositions))
769 splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
770 else
771 splitval = -1;
773 while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
774 src->linepos++;
775 cur = g_utf8_next_char (cur);
778 res = g_string_chunk_insert_len (src->chunk,
779 src->position,
780 cur - src->position);
782 src->position = cur;
784 return res;
788 * stf_parse_fixed_line:
790 * This will parse one line from the current @src->position.
791 * It will return a GPtrArray with the cell contents as strings.
793 * NOTE: The calling routine is responsible for freeing result.
795 static GPtrArray *
796 stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
798 GPtrArray *line;
800 g_return_val_if_fail (src != NULL, NULL);
801 g_return_val_if_fail (parseoptions != NULL, NULL);
803 src->linepos = 0;
804 src->splitpos = 0;
806 line = g_ptr_array_new ();
807 while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
808 char *field = stf_parse_fixed_cell (src, parseoptions);
810 trim_spaces_inplace (field, parseoptions);
811 g_ptr_array_add (line, field);
813 src->splitpos++;
816 while (line->len < parseoptions->splitpositions->len)
817 g_ptr_array_add (line, g_strdup (""));
819 return line;
823 * stf_parse_general_free: (skip)
825 void
826 stf_parse_general_free (GPtrArray *lines)
828 unsigned lineno;
829 for (lineno = 0; lineno < lines->len; lineno++) {
830 GPtrArray *line = g_ptr_array_index (lines, lineno);
831 /* Fields are not freed here. */
832 if (line)
833 g_ptr_array_free (line, TRUE);
835 g_ptr_array_free (lines, TRUE);
840 * stf_parse_general: (skip)
842 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
843 * GPtrArray of strings.
845 * The caller must free this entire structure, for example by calling
846 * stf_parse_general_free.
848 GPtrArray *
849 stf_parse_general (StfParseOptions_t *parseoptions,
850 GStringChunk *lines_chunk,
851 char const *data, char const *data_end)
853 GPtrArray *lines;
854 Source_t src;
855 int row;
856 char const *valid_end = data_end;
858 g_return_val_if_fail (parseoptions != NULL, NULL);
859 g_return_val_if_fail (data != NULL, NULL);
860 g_return_val_if_fail (data_end != NULL, NULL);
861 g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
862 g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);
864 src.chunk = lines_chunk;
865 src.position = data;
866 row = 0;
868 if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
869 /* Skip over byte-order mark */
870 src.position += 3;
873 lines = g_ptr_array_new ();
874 while (*src.position != '\0' && src.position < data_end) {
875 GPtrArray *line;
877 if (row == GNM_MAX_ROWS) {
878 parseoptions->rows_exceeded = TRUE;
879 break;
882 line = parseoptions->parsetype == PARSE_TYPE_CSV
883 ? stf_parse_csv_line (&src, parseoptions)
884 : stf_parse_fixed_line (&src, parseoptions);
886 g_ptr_array_add (lines, line);
887 if (parseoptions->parsetype != PARSE_TYPE_CSV)
888 src.position += compare_terminator (src.position, parseoptions);
889 row++;
892 return lines;
896 * stf_parse_lines: (skip)
897 * @parseoptions: #StfParseOptions_t
898 * @lines_chunk:
899 * @data:
900 * @maxlines:
901 * @with_lineno:
903 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
904 * GPtrArray of strings.
906 * The caller must free this entire structure, for example by calling
907 * stf_parse_general_free.
909 GPtrArray *
910 stf_parse_lines (StfParseOptions_t *parseoptions,
911 GStringChunk *lines_chunk,
912 char const *data,
913 int maxlines, gboolean with_lineno)
915 GPtrArray *lines;
916 int lineno = 1;
918 g_return_val_if_fail (data != NULL, NULL);
920 lines = g_ptr_array_new ();
921 while (*data) {
922 char const *data0 = data;
923 GPtrArray *line = g_ptr_array_new ();
925 if (with_lineno) {
926 char buf[4 * sizeof (int)];
927 sprintf (buf, "%d", lineno);
928 g_ptr_array_add (line,
929 g_string_chunk_insert (lines_chunk, buf));
932 while (1) {
933 int termlen = compare_terminator (data, parseoptions);
934 if (termlen > 0 || *data == 0) {
935 g_ptr_array_add (line,
936 g_string_chunk_insert_len (lines_chunk,
937 data0,
938 data - data0));
939 data += termlen;
940 break;
941 } else
942 data = g_utf8_next_char (data);
945 g_ptr_array_add (lines, line);
947 lineno++;
948 if (lineno >= maxlines)
949 break;
951 return lines;
954 char const *
955 stf_parse_find_line (StfParseOptions_t *parseoptions,
956 char const *data,
957 int line)
959 while (line > 0) {
960 int termlen = compare_terminator (data, parseoptions);
961 if (termlen > 0) {
962 data += termlen;
963 line--;
964 } else if (*data == 0) {
965 return data;
966 } else {
967 data = g_utf8_next_char (data);
970 return data;
975 * stf_parse_options_fixed_autodiscover:
976 * @parseoptions: a Parse options struct.
977 * @data: The actual data.
978 * @data_end: data end.
980 * Automatically try to discover columns in the text to be parsed.
981 * We ignore empty lines (only containing parseoptions->terminator)
983 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
984 * Think hard of a better more flexible solution...
986 void
987 stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
988 char const *data, char const *data_end)
990 char const *iterator = data;
991 GSList *list = NULL;
992 GSList *list_start = NULL;
993 int lines = 0;
994 int effective_lines = 0;
995 int max_line_length = 0;
996 int *line_begin_hits = NULL;
997 int *line_end_hits = NULL;
998 int i;
1000 stf_parse_options_fixed_splitpositions_clear (parseoptions);
1003 * First take a look at all possible white space combinations
1005 while (*iterator && iterator < data_end) {
1006 gboolean begin_recorded = FALSE;
1007 AutoDiscovery_t *disc = NULL;
1008 int position = 0;
1009 int termlen = 0;
1011 while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
1012 if (!begin_recorded && *iterator == ' ') {
1013 disc = g_new0 (AutoDiscovery_t, 1);
1015 disc->start = position;
1017 begin_recorded = TRUE;
1018 } else if (begin_recorded && *iterator != ' ') {
1019 disc->stop = position;
1020 list = g_slist_prepend (list, disc);
1022 begin_recorded = FALSE;
1023 disc = NULL;
1026 position++;
1027 iterator++;
1030 if (position > max_line_length)
1031 max_line_length = position;
1034 * If there are excess spaces at the end of
1035 * the line : ignore them
1037 g_free (disc);
1040 * Hop over the terminator
1042 iterator += termlen;
1044 if (position != 0)
1045 effective_lines++;
1047 lines++;
1050 list = g_slist_reverse (list);
1051 list_start = list;
1054 * Kewl stuff :
1055 * Look at the number of hits at each line position
1056 * if the number of hits equals the number of lines
1057 * we can be pretty sure this is the start or end
1058 * of a column, we filter out empty columns
1059 * later
1061 line_begin_hits = g_new0 (int, max_line_length + 1);
1062 line_end_hits = g_new0 (int, max_line_length + 1);
1064 while (list) {
1065 AutoDiscovery_t *disc = list->data;
1067 line_begin_hits[disc->start]++;
1068 line_end_hits[disc->stop]++;
1070 g_free (disc);
1072 list = g_slist_next (list);
1074 g_slist_free (list_start);
1076 for (i = 0; i < max_line_length + 1; i++)
1077 if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
1078 stf_parse_options_fixed_splitpositions_add (parseoptions, i);
1081 * Do some corrections to the initial columns
1082 * detected here, we obviously don't need to
1083 * do this if there are no columns at all.
1085 if (my_garray_len (parseoptions->splitpositions) > 0) {
1087 * Try to find columns that look like :
1089 * Example 100
1090 * Example2 9
1092 * (In other words : Columns with left & right justification with
1093 * a minimum of 2 spaces in the middle)
1094 * Split these columns in 2
1097 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1098 int begin = g_array_index (parseoptions->splitpositions, int, i);
1099 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1100 int num_spaces = -1;
1101 int spaces_start = 0;
1102 gboolean right_aligned = TRUE;
1103 gboolean left_aligned = TRUE;
1104 gboolean has_2_spaces = TRUE;
1106 iterator = data;
1107 lines = 0;
1108 while (*iterator && iterator < data_end) {
1109 gboolean trigger = FALSE;
1110 gboolean space_trigger = FALSE;
1111 int pos = 0;
1113 num_spaces = -1;
1114 spaces_start = 0;
1115 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1116 if (pos == begin) {
1117 if (*iterator == ' ')
1118 left_aligned = FALSE;
1120 trigger = TRUE;
1121 } else if (pos == end - 1) {
1122 if (*iterator == ' ')
1123 right_aligned = FALSE;
1125 trigger = FALSE;
1128 if (trigger || pos == end - 1) {
1129 if (!space_trigger && *iterator == ' ') {
1130 space_trigger = TRUE;
1131 spaces_start = pos;
1132 } else if (space_trigger && *iterator != ' ') {
1133 space_trigger = FALSE;
1134 num_spaces = pos - spaces_start;
1138 iterator++;
1139 pos++;
1142 if (num_spaces < 2)
1143 has_2_spaces = FALSE;
1145 if (*iterator)
1146 iterator++;
1148 lines++;
1152 * If this column meets all the criteria
1153 * split it into two at the last measured
1154 * spaces_start + num_spaces
1156 if (has_2_spaces && right_aligned && left_aligned) {
1157 int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
1159 g_array_insert_val (parseoptions->splitpositions, i + 1, val);
1162 * Skip over the inserted column
1164 i++;
1169 * Remove empty columns here if needed
1171 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1172 int begin = g_array_index (parseoptions->splitpositions, int, i);
1173 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1174 gboolean only_spaces = TRUE;
1176 iterator = data;
1177 lines = 0;
1178 while (*iterator && iterator < data_end) {
1179 gboolean trigger = FALSE;
1180 int pos = 0;
1182 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1183 if (pos == begin)
1184 trigger = TRUE;
1185 else if (pos == end)
1186 trigger = FALSE;
1188 if (trigger) {
1189 if (*iterator != ' ')
1190 only_spaces = FALSE;
1193 iterator++;
1194 pos++;
1197 if (*iterator)
1198 iterator++;
1200 lines++;
1204 * The column only contains spaces
1205 * remove it
1207 if (only_spaces) {
1208 g_array_remove_index (parseoptions->splitpositions, i);
1211 * We HAVE to make sure that the next column (end) also
1212 * gets checked out. If we don't decrease "i" here, we
1213 * will skip over it as the indexes shift down after
1214 * the removal
1216 i--;
1221 g_free (line_begin_hits);
1222 g_free (line_end_hits);
1225 /*******************************************************************************************************
1226 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1227 * functions into something meaningful (== application specific)
1228 *******************************************************************************************************/
1231 * This is more or less as gnm_cell_set_text, except...
1232 * 1. Unknown names are not allowed.
1233 * 2. Only '=' can start an expression.
1236 static void
1237 stf_cell_set_text (GnmCell *cell, char const *text)
1239 GnmExprTop const *texpr;
1240 GnmValue *val;
1241 GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
1242 const GODateConventions *date_conv =
1243 workbook_date_conv (cell->base.sheet->workbook);
1245 if (!go_format_is_text (fmt) && *text == '=' && text[1] != 0) {
1246 GnmExprParseFlags flags =
1247 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
1248 const char *expr_start = text + 1;
1249 GnmParsePos pos;
1250 val = NULL;
1251 parse_pos_init_cell (&pos, cell);
1252 texpr = gnm_expr_parse_str (expr_start, &pos, flags,
1253 NULL, NULL);
1254 } else {
1255 texpr = NULL;
1256 val = format_match (text, fmt, date_conv);
1259 if (!val && !texpr)
1260 val = value_new_string (text);
1262 if (val)
1263 gnm_cell_set_value (cell, val);
1264 else {
1265 gnm_cell_set_expr (cell, texpr);
1266 gnm_expr_top_unref (texpr);
1270 static void
1271 stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
1273 if (po->parsetype == PARSE_TYPE_CSV) {
1274 GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
1275 char quote[6];
1276 int length = g_unichar_to_utf8 (po->stringindicator, quote);
1277 if (length > 5) {
1278 quote[0] = '"';
1279 quote[1] = '\0';
1280 } else quote[length] = '\0';
1282 g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);
1284 if ((po->terminator != NULL) && (po->terminator->data != NULL))
1285 g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
1289 gboolean
1290 stf_parse_sheet (StfParseOptions_t *parseoptions,
1291 char const *data, char const *data_end,
1292 Sheet *sheet, int start_col, int start_row)
1294 int row;
1295 unsigned int lrow;
1296 GStringChunk *lines_chunk;
1297 GPtrArray *lines;
1298 gboolean result = TRUE;
1299 int col;
1300 unsigned int lcol;
1301 size_t nformats;
1303 SETUP_LOCALE_SWITCH;
1305 g_return_val_if_fail (parseoptions != NULL, FALSE);
1306 g_return_val_if_fail (data != NULL, FALSE);
1307 g_return_val_if_fail (IS_SHEET (sheet), FALSE);
1309 if (!data_end)
1310 data_end = data + strlen (data);
1312 lines_chunk = g_string_chunk_new (100 * 1024);
1313 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1314 if (lines == NULL)
1315 result = FALSE;
1317 col = start_col;
1318 nformats = parseoptions->formats->len;
1319 for (lcol = 0; lcol < nformats; lcol++) {
1320 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1321 GnmStyle *mstyle;
1322 gboolean want_col =
1323 (parseoptions->col_import_array == NULL ||
1324 parseoptions->col_import_array_len <= lcol ||
1325 parseoptions->col_import_array[lcol]);
1326 if (!want_col || col >= gnm_sheet_get_max_cols (sheet))
1327 continue;
1329 if (fmt && !go_format_is_general (fmt)) {
1330 GnmRange r;
1331 int end_row = MIN (start_row + (int)lines->len - 1,
1332 gnm_sheet_get_last_row (sheet));
1334 range_init (&r, col, start_row, col, end_row);
1335 mstyle = gnm_style_new ();
1336 gnm_style_set_format (mstyle, fmt);
1337 sheet_apply_style (sheet, &r, mstyle);
1339 col++;
1342 START_LOCALE_SWITCH;
1343 for (row = start_row, lrow = 0;
1344 result && lrow < lines->len;
1345 row++, lrow++) {
1346 GPtrArray *line;
1348 if (row >= gnm_sheet_get_max_rows (sheet)) {
1349 if (!parseoptions->rows_exceeded) {
1350 /* FIXME: What locale? */
1351 g_warning (_("There are more rows of data than "
1352 "there is room for in the sheet. Extra "
1353 "rows will be ignored."));
1354 parseoptions->rows_exceeded = TRUE;
1356 break;
1359 col = start_col;
1360 line = g_ptr_array_index (lines, lrow);
1362 for (lcol = 0; lcol < line->len; lcol++) {
1363 GOFormat const *fmt = lcol < nformats
1364 ? g_ptr_array_index (parseoptions->formats, lcol)
1365 : go_format_general ();
1366 char const *text = g_ptr_array_index (line, lcol);
1367 gboolean want_col =
1368 (parseoptions->col_import_array == NULL ||
1369 parseoptions->col_import_array_len <= lcol ||
1370 parseoptions->col_import_array[lcol]);
1371 if (!want_col)
1372 continue;
1374 if (col >= gnm_sheet_get_max_cols (sheet)) {
1375 if (!parseoptions->cols_exceeded) {
1376 /* FIXME: What locale? */
1377 g_warning (_("There are more columns of data than "
1378 "there is room for in the sheet. Extra "
1379 "columns will be ignored."));
1380 parseoptions->cols_exceeded = TRUE;
1382 break;
1384 if (text && *text) {
1385 GnmCell *cell = sheet_cell_fetch (sheet, col, row);
1386 if (!go_format_is_text (fmt) &&
1387 lcol < parseoptions->formats_decimal->len &&
1388 g_ptr_array_index (parseoptions->formats_decimal, lcol)) {
1389 GOFormatFamily fam;
1390 GnmValue *v = format_match_decimal_number_with_locale
1391 (text, &fam,
1392 g_ptr_array_index (parseoptions->formats_curr, lcol),
1393 g_ptr_array_index (parseoptions->formats_thousand, lcol),
1394 g_ptr_array_index (parseoptions->formats_decimal, lcol));
1395 if (!v)
1396 v = value_new_string (text);
1397 sheet_cell_set_value (cell, v);
1398 } else {
1400 stf_cell_set_text (cell, text);
1403 col++;
1406 g_ptr_array_index (lines, lrow) = NULL;
1407 g_ptr_array_free (line, TRUE);
1409 END_LOCALE_SWITCH;
1411 for (lcol = 0, col = start_col;
1412 lcol < parseoptions->col_import_array_len && col < gnm_sheet_get_max_cols (sheet);
1413 lcol++) {
1414 if (parseoptions->col_import_array == NULL ||
1415 parseoptions->col_import_array_len <= lcol ||
1416 parseoptions->col_import_array[lcol]) {
1417 if (parseoptions->col_autofit_array == NULL ||
1418 parseoptions->col_autofit_array[lcol]) {
1419 ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
1420 ColRowStateGroup *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
1421 colrow_index_list_destroy (list);
1422 g_slist_free (state);
1424 col++;
1428 g_string_chunk_free (lines_chunk);
1429 if (lines)
1430 stf_parse_general_free (lines);
1431 if (result)
1432 stf_read_remember_settings (sheet->workbook, parseoptions);
1433 return result;
1436 GnmCellRegion *
1437 stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
1438 Workbook const *wb)
1440 static GODateConventions const default_conv = {FALSE};
1441 GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;
1443 GnmCellRegion *cr;
1444 unsigned int row, colhigh = 0;
1445 GStringChunk *lines_chunk;
1446 GPtrArray *lines;
1447 size_t nformats;
1449 SETUP_LOCALE_SWITCH;
1451 g_return_val_if_fail (parseoptions != NULL, NULL);
1452 g_return_val_if_fail (data != NULL, NULL);
1454 START_LOCALE_SWITCH;
1456 cr = gnm_cell_region_new (NULL);
1458 if (!data_end)
1459 data_end = data + strlen (data);
1460 lines_chunk = g_string_chunk_new (100 * 1024);
1461 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1462 nformats = parseoptions->formats->len;
1463 for (row = 0; row < lines->len; row++) {
1464 GPtrArray *line = g_ptr_array_index (lines, row);
1465 unsigned int col, targetcol = 0;
1466 for (col = 0; col < line->len; col++) {
1467 if (parseoptions->col_import_array == NULL ||
1468 parseoptions->col_import_array_len <= col ||
1469 parseoptions->col_import_array[col]) {
1470 const char *text = g_ptr_array_index (line, col);
1471 if (text) {
1472 GOFormat *fmt = NULL;
1473 GnmValue *v;
1474 GnmCellCopy *cc;
1476 if (col < nformats)
1477 fmt = g_ptr_array_index (parseoptions->formats, col);
1478 v = format_match (text, fmt, date_conv);
1479 if (!v)
1480 v = value_new_string (text);
1482 cc = gnm_cell_copy_new (cr, targetcol, row);
1483 cc->val = v;
1484 cc->texpr = NULL;
1485 targetcol++;
1486 if (targetcol > colhigh)
1487 colhigh = targetcol;
1492 stf_parse_general_free (lines);
1493 g_string_chunk_free (lines_chunk);
1495 END_LOCALE_SWITCH;
1497 cr->cols = (colhigh > 0) ? colhigh : 1;
1498 cr->rows = row;
1500 return cr;
/*
 * qsort() comparator for ints, ascending.
 *
 * Compares instead of subtracting: a difference can overflow (undefined
 * behaviour, and in practice the wrong sign) for values far apart.
 * Returns -1, 0 or 1.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1509 static int
1510 count_character (GPtrArray *lines, gunichar c, double quantile)
1512 int *counts, res;
1513 unsigned int lno, cno;
1515 if (lines->len == 0)
1516 return 0;
1518 counts = g_new (int, lines->len);
1519 for (lno = cno = 0; lno < lines->len; lno++) {
1520 int count = 0;
1521 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1522 char const *line = g_ptr_array_index (boxline, 0);
1524 /* Ignore empty lines. */
1525 if (*line == 0)
1526 continue;
1528 while (*line) {
1529 if (g_utf8_get_char (line) == c)
1530 count++;
1531 line = g_utf8_next_char (line);
1534 counts[cno++] = count;
1537 if (cno == 0)
1538 res = 0;
1539 else {
1540 unsigned int qi = (unsigned int)ceil (quantile * cno);
1541 qsort (counts, cno, sizeof (counts[0]), int_sort);
1542 if (qi == cno)
1543 qi--;
1544 res = counts[qi];
1547 g_free (counts);
1549 return res;
1552 static void
1553 dump_guessed_options (const StfParseOptions_t *res)
1555 GSList *l;
1556 char ubuffer[6 + 1];
1557 unsigned ui;
1559 g_printerr ("Guessed format:\n");
1560 switch (res->parsetype) {
1561 case PARSE_TYPE_CSV:
1562 g_printerr (" type = sep\n");
1563 g_printerr (" separator = %s\n",
1564 res->sep.chr ? res->sep.chr : "(none)");
1565 g_printerr (" see two as one = %s\n",
1566 res->sep.duplicates ? "yes" : "no");
1567 break;
1568 case PARSE_TYPE_FIXED:
1569 g_printerr (" type = sep\n");
1570 break;
1571 default:
1574 g_printerr (" trim space = %d\n", res->trim_spaces);
1576 ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
1577 g_printerr (" string indicator = %s\n", ubuffer);
1578 g_printerr (" see two as one = %s\n",
1579 res->indicator_2x_is_single ? "yes" : "no");
1581 g_printerr (" line terminators =");
1582 for (l = res->terminator; l; l = l->next) {
1583 const char *t = l->data;
1584 if (strcmp (t, "\n") == 0)
1585 g_printerr (" unix");
1586 else if (strcmp (t, "\r") == 0)
1587 g_printerr (" mac");
1588 else if (strcmp (t, "\r\n") == 0)
1589 g_printerr (" dos");
1590 else
1591 g_printerr (" other");
1593 g_printerr ("\n");
1595 for (ui = 0; ui < res->formats->len; ui++) {
1596 GOFormat const *fmt = g_ptr_array_index (res->formats, ui);
1597 const GString *decimal = ui < res->formats_decimal->len
1598 ? g_ptr_array_index (res->formats_decimal, ui)
1599 : NULL;
1600 const GString *thousand = ui < res->formats_thousand->len
1601 ? g_ptr_array_index (res->formats_thousand, ui)
1602 : NULL;
1604 g_printerr (" fmt.%d = %s\n", ui, go_format_as_XL (fmt));
1605 if (decimal)
1606 g_printerr (" fmt.%d.dec = %s\n", ui, decimal->str);
1607 if (thousand)
1608 g_printerr (" fmt.%d.thou = %s\n", ui, thousand->str);
1613 * stf_parse_options_guess:
1614 * @data: the input data.
1616 * Returns: (transfer full): the guessed options.
1618 StfParseOptions_t *
1619 stf_parse_options_guess (char const *data)
1621 StfParseOptions_t *res;
1622 GStringChunk *lines_chunk;
1623 GPtrArray *lines;
1624 int tabcount;
1625 int sepcount;
1626 gunichar sepchar = go_locale_get_arg_sep ();
1628 g_return_val_if_fail (data != NULL, NULL);
1630 res = stf_parse_options_new ();
1631 lines_chunk = g_string_chunk_new (100 * 1024);
1632 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1634 tabcount = count_character (lines, '\t', 0.2);
1635 sepcount = count_character (lines, sepchar, 0.2);
1637 /* At least one tab per line and enough to separate every
1638 would-be sepchars. */
1639 if (tabcount >= 1 && tabcount >= sepcount - 1)
1640 stf_parse_options_csv_set_separators (res, "\t", NULL);
1641 else {
1642 gunichar c;
1645 * Try a few more or less likely characters and pick the first
1646 * one that occurs on at least half the lines.
1648 * The order is mostly random, although ' ' and '!' which
1649 * could very easily occur in text are put last.
1651 if (count_character (lines, (c = sepchar), 0.5) > 0 ||
1652 count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
1653 count_character (lines, (c = ':'), 0.5) > 0 ||
1654 count_character (lines, (c = ','), 0.5) > 0 ||
1655 count_character (lines, (c = ';'), 0.5) > 0 ||
1656 count_character (lines, (c = '|'), 0.5) > 0 ||
1657 count_character (lines, (c = '!'), 0.5) > 0 ||
1658 count_character (lines, (c = ' '), 0.5) > 0) {
1659 char sep[7];
1660 sep[g_unichar_to_utf8 (c, sep)] = 0;
1661 if (c == ' ')
1662 strcat (sep, "\t");
1663 stf_parse_options_csv_set_separators (res, sep, NULL);
1667 // For now, always separated:
1668 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1670 switch (res->parsetype) {
1671 case PARSE_TYPE_CSV: {
1672 gboolean dups =
1673 res->sep.chr &&
1674 strchr (res->sep.chr, ' ') != NULL;
1675 gboolean trim =
1676 res->sep.chr &&
1677 strchr (res->sep.chr, ' ') != NULL;
1679 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1680 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1681 stf_parse_options_csv_set_duplicates (res, dups);
1682 stf_parse_options_csv_set_trim_seps (res, trim);
1684 stf_parse_options_csv_set_stringindicator (res, '"');
1685 break;
1688 case PARSE_TYPE_FIXED:
1689 break;
1691 default:
1692 g_assert_not_reached ();
1695 stf_parse_general_free (lines);
1696 g_string_chunk_free (lines_chunk);
1698 stf_parse_options_guess_formats (res, data);
1700 if (gnm_debug_flag ("stf"))
1701 dump_guessed_options (res);
1703 return res;
1707 * stf_parse_options_guess_csv:
1708 * @data: the CSV input data.
1710 * Returns: (transfer full): the guessed options.
1712 StfParseOptions_t *
1713 stf_parse_options_guess_csv (char const *data)
1715 StfParseOptions_t *res;
1716 GStringChunk *lines_chunk;
1717 GPtrArray *lines;
1718 char *sep = NULL;
1719 char const *quoteline = NULL;
1720 int pass;
1721 gunichar stringind = '"';
1723 g_return_val_if_fail (data != NULL, NULL);
1725 res = stf_parse_options_new ();
1726 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1727 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1728 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1729 stf_parse_options_csv_set_duplicates (res, FALSE);
1730 stf_parse_options_csv_set_trim_seps (res, FALSE);
1731 stf_parse_options_csv_set_stringindicator (res, stringind);
1733 lines_chunk = g_string_chunk_new (100 * 1024);
1734 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1737 * Find a line containing a quote; skip first line unless it is
1738 * the only one. Prefer a line with the quote first.
1740 for (pass = 1; !quoteline && pass <= 2; pass++) {
1741 size_t lno;
1742 for (lno = MIN (1, lines->len - 1);
1743 !quoteline && lno < lines->len;
1744 lno++) {
1745 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1746 const char *line = g_ptr_array_index (boxline, 0);
1747 switch (pass) {
1748 case 1:
1749 if (g_utf8_get_char (line) == stringind)
1750 quoteline = line;
1751 break;
1752 case 2:
1753 if (my_utf8_strchr (line, stringind))
1754 quoteline = line;
1755 break;
1760 if (quoteline) {
1761 const char *p0 = my_utf8_strchr (quoteline, stringind);
1762 const char *p = p0;
1764 do {
1765 p = g_utf8_next_char (p);
1766 } while (*p && g_utf8_get_char (p) != stringind);
1767 if (*p) p = g_utf8_next_char (p);
1768 while (*p && g_unichar_isspace (g_utf8_get_char (p)))
1769 p = g_utf8_next_char (p);
1770 if (*p) {
1771 /* Use the character after the quote. */
1772 sep = g_strndup (p, g_utf8_next_char (p) - p);
1773 } else {
1774 /* Try to use character before the quote. */
1775 while (p0 > quoteline && !sep) {
1776 p = p0;
1777 p0 = g_utf8_prev_char (p0);
1778 if (!g_unichar_isspace (g_utf8_get_char (p0)))
1779 sep = g_strndup (p0, p - p0);
1784 if (!sep)
1785 sep = g_strdup (",");
1786 stf_parse_options_csv_set_separators (res, sep, NULL);
1787 g_free (sep);
1789 stf_parse_general_free (lines);
1790 g_string_chunk_free (lines_chunk);
1792 stf_parse_options_guess_formats (res, data);
1794 if (gnm_debug_flag ("stf"))
1795 dump_guessed_options (res);
1797 return res;
/*
 * Bit flags for the interpretations still possible for a column while
 * guessing formats.  Bits are cleared as values rule them out.
 */
typedef enum {
	/* Date orderings. */
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	/* Decimal separator of numbers. */
	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER =
		STF_GUESS_NUMBER_DEC_POINT | STF_GUESS_NUMBER_DEC_COMMA,

	/* Everything above. */
	STF_GUESS_ALL =
		STF_GUESS_DATE_DMY | STF_GUESS_DATE_MDY | STF_GUESS_DATE_YMD |
		STF_GUESS_NUMBER_DEC_EITHER
} StfGuessFormats;
1812 static void
1813 do_check_date (const char *data, StfGuessFormats flag,
1814 gboolean mbd, gboolean ybm,
1815 unsigned *possible,
1816 GODateConventions const *date_conv)
1818 GnmValue *v;
1819 gboolean this_mbd, this_ybm;
1820 int imbd;
1822 if (!(*possible & flag))
1823 return;
1825 v = format_match_datetime (data, date_conv, mbd, TRUE, FALSE);
1826 if (!v || !VALUE_FMT (v))
1827 goto fail;
1829 imbd = go_format_month_before_day (VALUE_FMT (v));
1830 this_mbd = (imbd >= 1);
1831 this_ybm = (imbd == 2);
1832 if (mbd != this_mbd || ybm != this_ybm)
1833 goto fail;
1835 goto done;
1837 fail:
1838 *possible &= ~flag;
1839 done:
1840 value_release (v);
1844 static void
1845 do_check_number (const char *data, StfGuessFormats flag,
1846 const GString *dec, const GString *thousand, const GString *curr,
1847 unsigned *possible, int *decimals)
1849 GnmValue *v;
1850 GOFormatFamily family;
1851 const char *pthou;
1853 if (!(*possible & flag))
1854 return;
1856 v = format_match_decimal_number_with_locale (data, &family, curr, thousand, dec);
1857 if (!v)
1858 goto fail;
1860 if (*decimals != -2) {
1861 const char *pdec = strstr (data, dec->str);
1862 int this_decimals = 0;
1863 if (pdec) {
1864 pdec += dec->len;
1865 while (g_ascii_isdigit (*pdec)) {
1866 pdec++;
1867 this_decimals++;
1870 if (*decimals == -1)
1871 *decimals = this_decimals;
1872 else if (*decimals != this_decimals)
1873 *decimals = -2;
1876 pthou = strstr (data, thousand->str);
1877 if (pthou) {
1878 const char *p;
1879 int digits = 0, nonzero_digits = 0;
1880 for (p = data; p < pthou; p = g_utf8_next_char (p)) {
1881 if (g_unichar_isdigit (g_utf8_get_char (p))) {
1882 digits++;
1883 if (*p != '0')
1884 nonzero_digits++;
1887 // "-.222" implies that "." is not a thousands separator.
1888 // "0.222" implies that "." is not a thousands separator.
1889 // "12345,555" implies that "," is not a thousands separator.
1890 if (nonzero_digits == 0 || digits > 3)
1891 goto fail;
1894 goto done;
1896 fail:
1897 *possible &= ~flag;
1898 done:
1899 value_release (v);
1904 * stf_parse_options_guess_formats:
1905 * @data: the CSV input data.
1907 * This function attempts to recognize data formats on a column-by-column
1908 * basis under the assumption that the data in a text file will generally
1909 * use the same data formats.
1911 * This is useful because not all values give sufficient information by
1912 * themselves to tell what format the data is in. For example, "1/2/2000"
1913 * is likely to be a date in year 2000, but it is not clear if it is in
1914 * January or February. If another value in the same column is "31/1/1999"
1915 * then it is likely that the former date was in February.
1917 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1918 * A later value of "111,200.22" would clear up the confusion.
1921 void
1922 stf_parse_options_guess_formats (StfParseOptions_t *po, char const *data)
1924 GStringChunk *lines_chunk;
1925 GPtrArray *lines;
1926 unsigned lno, col, colcount, sline;
1927 GODateConventions const *date_conv = go_date_conv_from_str ("Lotus:1900");
1928 GString *s_comma = g_string_new (",");
1929 GString *s_dot = g_string_new (".");
1930 GString *s_dollar = g_string_new ("$");
1931 gboolean debug = gnm_debug_flag ("stf");
1933 g_ptr_array_set_size (po->formats, 0);
1934 g_ptr_array_set_size (po->formats_decimal, 0);
1935 g_ptr_array_set_size (po->formats_thousand, 0);
1936 g_ptr_array_set_size (po->formats_curr, 0);
1938 lines_chunk = g_string_chunk_new (100 * 1024);
1939 lines = stf_parse_general (po, lines_chunk, data, data + strlen (data));
1941 colcount = 0;
1942 for (lno = 0; lno < lines->len; lno++) {
1943 GPtrArray *line = g_ptr_array_index (lines, lno);
1944 colcount = MAX (colcount, line->len);
1947 // Ignore first line unless it is the only one
1948 sline = MIN ((int)lines->len - 1, 1);
1950 g_ptr_array_set_size (po->formats, colcount);
1951 g_ptr_array_set_size (po->formats_decimal, colcount);
1952 g_ptr_array_set_size (po->formats_thousand, colcount);
1953 g_ptr_array_set_size (po->formats_curr, colcount);
1954 for (col = 0; col < colcount; col++) {
1955 unsigned possible = STF_GUESS_ALL;
1956 GOFormat *fmt = NULL;
1957 gboolean seen_dot = FALSE;
1958 gboolean seen_comma = FALSE;
1959 int decimals_if_point = -1; // -1: unset; -2: inconsistent; >=0: count
1960 int decimals_if_comma = -1; // -1: unset; -2: inconsistent; >=0: count
1962 for (lno = sline; possible && lno < lines->len; lno++) {
1963 GPtrArray *line = g_ptr_array_index (lines, lno);
1964 const char *data = col < line->len ? g_ptr_array_index (line, col) : "";
1965 unsigned prev_possible = possible;
1967 if (*data == 0 || data[0] == '\'')
1968 continue;
1970 do_check_date (data, STF_GUESS_DATE_DMY, FALSE, FALSE, &possible, date_conv);
1971 do_check_date (data, STF_GUESS_DATE_MDY, TRUE, FALSE, &possible, date_conv);
1972 do_check_date (data, STF_GUESS_DATE_YMD, TRUE, TRUE, &possible, date_conv);
1974 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER) {
1975 const char *pdot = strstr (data, s_dot->str);
1976 const char *pcomma = strstr (data, s_comma->str);
1977 if (pdot && pcomma) {
1978 // Both -- last one is the decimal separator
1979 if (pdot > pcomma)
1980 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1981 else
1982 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1983 } else if (pdot && strstr (pdot + s_dot->len, s_dot->str)) {
1984 // Two dots so they are thousands separators
1985 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1986 } else if (pcomma && strstr (pcomma + s_comma->len, s_comma->str)) {
1987 // Two commas so they are thousands separators
1988 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1991 seen_dot = seen_dot || (pdot != 0);
1992 seen_comma = seen_comma || (pcomma != 0);
1994 do_check_number (data, STF_GUESS_NUMBER_DEC_POINT,
1995 s_dot, s_comma, s_dollar,
1996 &possible, &decimals_if_point);
1997 do_check_number (data, STF_GUESS_NUMBER_DEC_COMMA,
1998 s_comma, s_dot, s_dollar,
1999 &possible, &decimals_if_comma);
2001 if (possible != prev_possible && debug)
2002 g_printerr ("col=%d; after [%s] possible=0x%x\n", col, data, possible);
2005 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER &&
2006 !seen_dot && !seen_comma) {
2007 // It doesn't matter what the separators are
2008 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
2011 switch (possible) {
2012 case STF_GUESS_DATE_DMY:
2013 fmt = go_format_new_from_XL ("d-mmm-yyyy");
2014 break;
2015 case STF_GUESS_DATE_MDY:
2016 fmt = go_format_new_from_XL ("m/d/yyyy");
2017 break;
2018 case STF_GUESS_DATE_YMD:
2019 fmt = go_format_new_from_XL ("yyyy-mm-dd");
2020 break;
2021 case STF_GUESS_NUMBER_DEC_POINT:
2022 g_ptr_array_index (po->formats_decimal, col) = g_string_new (".");
2023 g_ptr_array_index (po->formats_thousand, col) = g_string_new (",");
2024 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2025 if (decimals_if_point > 0) {
2026 // Don't set format if decimals is zero
2027 GString *fmt_str = g_string_new (NULL);
2028 go_format_generate_number_str (fmt_str, 1, decimals_if_point, seen_comma, FALSE, FALSE, "", "");
2029 fmt = go_format_new_from_XL (fmt_str->str);
2030 g_string_free (fmt_str, TRUE);
2032 break;
2033 case STF_GUESS_NUMBER_DEC_COMMA:
2034 g_ptr_array_index (po->formats_decimal, col) = g_string_new (",");
2035 g_ptr_array_index (po->formats_thousand, col) = g_string_new (".");
2036 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2037 if (decimals_if_comma > 0) {
2038 // Don't set format if decimals is zero
2039 GString *fmt_str = g_string_new (NULL);
2040 go_format_generate_number_str (fmt_str, 1, decimals_if_comma, seen_dot, FALSE, FALSE, "", "");
2041 fmt = go_format_new_from_XL (fmt_str->str);
2042 g_string_free (fmt_str, TRUE);
2044 break;
2045 default:
2046 break;
2049 if (!fmt)
2050 fmt = go_format_ref (go_format_general ());
2051 g_ptr_array_index (po->formats, col) = fmt;
2054 stf_parse_general_free (lines);
2055 g_string_chunk_free (lines_chunk);
2057 g_string_free (s_dot, TRUE);
2058 g_string_free (s_comma, TRUE);
2059 g_string_free (s_dollar, TRUE);