1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see <https://www.gnu.org/licenses/>.
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
31 #include "stf-parse.h"
32 #include "stf-export.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
42 #include "number-match.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
48 #include <goffice/goffice.h>
/* Declares the local that remembers the locale in effect before a switch. */
54 #define SETUP_LOCALE_SWITCH char *oldlocale = NULL
/*
 * When the parse options name a locale: save a copy of the current LC_ALL
 * locale in oldlocale and switch to the requested one.  Expects a variable
 * called parseoptions in the expanding scope.
 */
56 #define START_LOCALE_SWITCH if (parseoptions->locale) {\
57 oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
58 go_setlocale(LC_ALL, parseoptions->locale);}
/*
 * Switches back to the saved locale if one was recorded.
 * NOTE(review): the tail of this macro is not visible in this view.
 */
60 #define END_LOCALE_SWITCH if (oldlocale) {\
61 go_setlocale(LC_ALL, oldlocale);\
64 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
/*
 * NOTE(review): only part of the member list is visible here; the line
 * parsers below also use a `chunk` member of this struct.
 */
67 char const *position
; /* Indicates the current position within data */
69 /* Used internally for fixed width parsing */
70 int splitpos
; /* Indicates current position in splitpositions array */
71 int linepos
; /* Position on the current line */
74 /* Struct used for autodiscovery */
81 * Some silly dude made the length field an unsigned int. C just does
82 * not deal very well with that.
/*
 * Helper taking a GArray; callers in this file compare its result against
 * signed int indices.
 * NOTE(review): the body is not visible in this view.
 */
85 my_garray_len (GArray
const *a
)
91 my_utf8_strchr (const char *p
, gunichar uc
)
93 return uc
< 0x7f ? strchr (p
, uc
) : g_utf8_strchr (p
, -1, uc
);
/*
 * Examines the text at @s against the configured line terminators.
 * Callers in this file use the result both as a truth value and as the
 * number of bytes to advance past a terminator.
 */
97 compare_terminator (char const *s
, StfParseOptions_t
*parseoptions
)
99 guchar
const *us
= (guchar
const *)s
;
/*
 * Cheap pre-check: the first byte is compared against the [min, max]
 * range of terminator first bytes cached by compile_terminators.
 */
102 if (*us
> parseoptions
->compiled_terminator
.max
||
103 *us
< parseoptions
->compiled_terminator
.min
)
/*
 * Walk the terminator list.
 * NOTE(review): the per-terminator comparison is not visible in this view.
 */
106 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
107 char const *term
= l
->data
;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
130 gnm_g_string_free (GString
*s
)
132 if (s
) g_string_free (s
, TRUE
);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * The struct should, after being used, freed with stf_parse_options_free.
142 static StfParseOptions_t
*
143 stf_parse_options_new (void)
145 StfParseOptions_t
* parseoptions
= g_new0 (StfParseOptions_t
, 1);
147 parseoptions
->parsetype
= PARSE_TYPE_NOTSET
;
149 parseoptions
->terminator
= NULL
;
150 stf_parse_options_add_line_terminator (parseoptions
, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions
, "\n");
152 stf_parse_options_add_line_terminator (parseoptions
, "\r");
154 parseoptions
->trim_spaces
= (TRIM_TYPE_RIGHT
| TRIM_TYPE_LEFT
);
155 parseoptions
->locale
= NULL
;
157 parseoptions
->splitpositions
= NULL
;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
160 parseoptions
->stringindicator
= '"';
161 parseoptions
->indicator_2x_is_single
= TRUE
;
162 parseoptions
->sep
.duplicates
= FALSE
;
163 parseoptions
->trim_seps
= FALSE
;
165 parseoptions
->sep
.str
= NULL
;
166 parseoptions
->sep
.chr
= NULL
;
168 parseoptions
->col_autofit_array
= NULL
;
169 parseoptions
->col_import_array
= NULL
;
170 parseoptions
->col_import_array_len
= 0;
171 parseoptions
->formats
= g_ptr_array_new_with_free_func ((GDestroyNotify
)go_format_unref
);
172 parseoptions
->formats_decimal
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
173 parseoptions
->formats_thousand
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
174 parseoptions
->formats_curr
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
176 parseoptions
->cols_exceeded
= FALSE
;
177 parseoptions
->rows_exceeded
= FALSE
;
178 parseoptions
->ref_count
= 1;
184 * stf_parse_options_free:
186 * will free @parseoptions, note that this will not free the splitpositions
187 * member (GArray) of the struct, the caller is responsible for that.
190 stf_parse_options_free (StfParseOptions_t
*parseoptions
)
192 g_return_if_fail (parseoptions
!= NULL
);
194 if (parseoptions
->ref_count
-- > 1)
197 g_free (parseoptions
->col_import_array
);
198 g_free (parseoptions
->col_autofit_array
);
199 g_free (parseoptions
->locale
);
200 g_free (parseoptions
->sep
.chr
);
202 if (parseoptions
->sep
.str
) {
205 for (l
= parseoptions
->sep
.str
; l
!= NULL
; l
= l
->next
)
206 g_free ((char *) l
->data
);
207 g_slist_free (parseoptions
->sep
.str
);
210 g_array_free (parseoptions
->splitpositions
, TRUE
);
212 stf_parse_options_clear_line_terminator (parseoptions
);
214 g_ptr_array_free (parseoptions
->formats
, TRUE
);
215 g_ptr_array_free (parseoptions
->formats_decimal
, TRUE
);
216 g_ptr_array_free (parseoptions
->formats_thousand
, TRUE
);
217 g_ptr_array_free (parseoptions
->formats_curr
, TRUE
);
219 g_free (parseoptions
);
222 static StfParseOptions_t
*
223 stf_parse_options_ref (StfParseOptions_t
*parseoptions
)
225 parseoptions
->ref_count
++;
/*
 * GType accessor for the parse options struct.
 * NOTE(review): the declaration of t, any caching guard and the return
 * statement are not visible in this view.
 */
230 stf_parse_options_get_type (void)
/*
 * Register "StfParseOptions_t" as a boxed type: copying takes a
 * reference, freeing drops one.
 */
235 t
= g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc
)stf_parse_options_ref
,
237 (GBoxedFreeFunc
)stf_parse_options_free
);
243 stf_parse_options_set_type (StfParseOptions_t
*parseoptions
, StfParseType_t
const parsetype
)
245 g_return_if_fail (parseoptions
!= NULL
);
246 g_return_if_fail (parsetype
== PARSE_TYPE_CSV
|| parsetype
== PARSE_TYPE_FIXED
);
248 parseoptions
->parsetype
= parsetype
;
252 long_string_first (gchar
const *a
, gchar
const *b
)
254 /* This actually is UTF-8 safe. */
255 return strlen (b
) - strlen (a
);
259 compile_terminators (StfParseOptions_t
*parseoptions
)
262 GO_SLIST_SORT (parseoptions
->terminator
, (GCompareFunc
)long_string_first
);
264 parseoptions
->compiled_terminator
.min
= 255;
265 parseoptions
->compiled_terminator
.max
= 0;
266 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
267 const guchar
*term
= l
->data
;
268 parseoptions
->compiled_terminator
.min
=
269 MIN (parseoptions
->compiled_terminator
.min
, *term
);
270 parseoptions
->compiled_terminator
.max
=
271 MAX (parseoptions
->compiled_terminator
.max
, *term
);
276 * stf_parse_options_add_line_terminator:
278 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
279 * this indicates the end of a row.
283 stf_parse_options_add_line_terminator (StfParseOptions_t
*parseoptions
, char const *terminator
)
285 g_return_if_fail (parseoptions
!= NULL
);
286 g_return_if_fail (terminator
!= NULL
&& *terminator
!= 0);
288 GO_SLIST_PREPEND (parseoptions
->terminator
, g_strdup (terminator
));
289 compile_terminators (parseoptions
);
293 * stf_parse_options_clear_line_terminator:
295 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
296 * this indicates the end of a row.
300 stf_parse_options_clear_line_terminator (StfParseOptions_t
*parseoptions
)
302 g_return_if_fail (parseoptions
!= NULL
);
304 g_slist_free_full (parseoptions
->terminator
, g_free
);
305 parseoptions
->terminator
= NULL
;
306 compile_terminators (parseoptions
);
310 * stf_parse_options_set_trim_spaces:
312 * If enabled will trim spaces in every parsed field on left and/or right
316 stf_parse_options_set_trim_spaces (StfParseOptions_t
*parseoptions
, StfTrimType_t
const trim_spaces
)
318 g_return_if_fail (parseoptions
!= NULL
);
320 parseoptions
->trim_spaces
= trim_spaces
;
324 * stf_parse_options_csv_set_separators:
325 * @parseoptions: #StfParseOptions_t
327 * @string: (element-type char):
329 * A copy is made of the parameters.
332 stf_parse_options_csv_set_separators (StfParseOptions_t
*parseoptions
, char const *character
,
333 GSList
const *string
)
335 g_return_if_fail (parseoptions
!= NULL
);
337 g_free (parseoptions
->sep
.chr
);
338 parseoptions
->sep
.chr
= g_strdup (character
);
340 g_slist_free_full (parseoptions
->sep
.str
, g_free
);
341 parseoptions
->sep
.str
= go_slist_map (string
, (GOMapFunc
)g_strdup
);
345 stf_parse_options_csv_set_stringindicator (StfParseOptions_t
*parseoptions
, gunichar
const stringindicator
)
347 g_return_if_fail (parseoptions
!= NULL
);
349 parseoptions
->stringindicator
= stringindicator
;
353 * stf_parse_options_csv_set_indicator_2x_is_single:
354 * @indic_2x: a boolean value indicating whether we want to see two
355 * adjacent string indicators as a single string indicator
356 * that is part of the cell, rather than a terminator.
359 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t
*parseoptions
,
360 gboolean
const indic_2x
)
362 g_return_if_fail (parseoptions
!= NULL
);
364 parseoptions
->indicator_2x_is_single
= indic_2x
;
368 * stf_parse_options_csv_set_duplicates:
370 * @duplicates: a boolean value indicating whether we want to see two
371 * separators right behind each other as one
374 stf_parse_options_csv_set_duplicates (StfParseOptions_t
*parseoptions
, gboolean
const duplicates
)
376 g_return_if_fail (parseoptions
!= NULL
);
378 parseoptions
->sep
.duplicates
= duplicates
;
382 * stf_parse_options_csv_set_trim_seps:
383 * @trim_seps: a boolean value indicating whether we want to ignore
384 * separators at the beginning of lines
387 stf_parse_options_csv_set_trim_seps (StfParseOptions_t
*parseoptions
, gboolean
const trim_seps
)
389 g_return_if_fail (parseoptions
!= NULL
);
391 parseoptions
->trim_seps
= trim_seps
;
395 * stf_parse_options_fixed_splitpositions_clear:
397 * This will clear the splitpositions (== points on which a line is split)
/*
 * Discards any existing splitpositions array and replaces it with a new
 * array of ints holding a single entry.
 * NOTE(review): the declaration of minus_one is not visible in this view;
 * presumably it holds -1 -- confirm.
 */
400 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t
*parseoptions
)
403 g_return_if_fail (parseoptions
!= NULL
);
/* The array may still be NULL when called from the constructor. */
405 if (parseoptions
->splitpositions
)
406 g_array_free (parseoptions
->splitpositions
, TRUE
);
407 parseoptions
->splitpositions
= g_array_new (FALSE
, FALSE
, sizeof (int));
/* The array is never left empty: one trailing entry is always present. */
409 g_array_append_val (parseoptions
->splitpositions
, minus_one
);
413 * stf_parse_options_fixed_splitpositions_add:
415 * @position will be added to the splitpositions.
/*
 * Inserts the non-negative @position into the splitpositions array at
 * the index where the scan below stops.
 */
418 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t
*parseoptions
, int position
)
422 g_return_if_fail (parseoptions
!= NULL
);
423 g_return_if_fail (position
>= 0);
/* Scan every entry except the last one. */
425 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
426 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
/*
 * NOTE(review): what happens on an exact match, and the condition that
 * ends the scan, are on lines not visible in this view.
 */
427 if (position
== here
)
433 g_array_insert_val (parseoptions
->splitpositions
, ui
, position
);
/*
 * Removes the entry equal to the non-negative @position from the
 * splitpositions array, if the scan below finds one.
 */
437 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t
*parseoptions
, int position
)
441 g_return_if_fail (parseoptions
!= NULL
);
442 g_return_if_fail (position
>= 0);
/* Scan every entry except the last one. */
444 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
445 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
446 if (position
== here
)
447 g_array_remove_index (parseoptions
->splitpositions
, ui
);
/*
 * NOTE(review): the statement controlled by this test is not visible in
 * this view; presumably the scan stops once an entry >= @position is
 * reached -- confirm.
 */
448 if (position
<= here
)
/*
 * Returns the number of entries in the splitpositions array.  This count
 * includes the entry appended by stf_parse_options_fixed_splitpositions_clear.
 */
454 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t
*parseoptions
)
456 return parseoptions
->splitpositions
->len
;
/*
 * Returns the split position stored at index @n.  No bounds check is
 * performed on @n here.
 */
460 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t
*parseoptions
, int n
)
462 return g_array_index (parseoptions
->splitpositions
, int, n
);
467 * stf_parse_options_valid:
468 * @parseoptions: an import options struct
470 * Checks if @parseoptions is correctly filled
472 * returns : TRUE if it is correctly filled, FALSE otherwise.
/*
 * Sanity check of @parseoptions before parsing; a NULL pointer fails the
 * precondition and yields FALSE.
 */
475 stf_parse_options_valid (StfParseOptions_t
*parseoptions
)
477 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
/* Fixed width parsing needs a splitpositions array to work with. */
479 if (parseoptions
->parsetype
== PARSE_TYPE_FIXED
) {
480 if (!parseoptions
->splitpositions
) {
481 g_warning ("STF: No splitpositions in struct");
489 /*******************************************************************************************************
490 * STF PARSE : The actual routines that do the 'trick'
491 *******************************************************************************************************/
/*
 * Trims Unicode whitespace from the NUL-terminated UTF-8 string @field in
 * place, on the sides selected by parseoptions->trim_spaces.
 */
494 trim_spaces_inplace (char *field
, StfParseOptions_t
const *parseoptions
)
498 if (parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) {
/* Advance s past the leading whitespace characters. */
501 while (g_unichar_isspace (g_utf8_get_char (s
)))
502 s
= g_utf8_next_char (s
);
/* Shift the remainder, including its NUL terminator, to the front. */
505 memmove (field
, s
, 1 + strlen (s
));
508 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
/* Start at the terminator and step backwards one character at a time. */
509 char *s
= field
+ strlen (field
);
512 s
= g_utf8_prev_char (s
);
/* NOTE(review): the loop bounds and the truncation are not visible in this view. */
513 if (!g_unichar_isspace (g_utf8_get_char (s
)))
521 * stf_parse_csv_is_separator:
523 * returns NULL if @character is not a separator, a pointer to the character
524 * after the separator otherwise.
/*
 * Tests whether the text at @character starts with a separator: first
 * against each separator string in @str, then against the individual
 * separator characters in @chr.
 */
527 stf_parse_csv_is_separator (char const *character
, char const *chr
, GSList
const *str
)
529 g_return_val_if_fail (character
!= NULL
, NULL
);
537 for (l
= str
; l
!= NULL
; l
= l
->next
) {
538 char const *s
= l
->data
;
/* len is the separator's length in characters, not in bytes. */
541 glong
const len
= g_utf8_strlen (s
, -1);
543 /* Don't compare past the end of the buffer! */
544 for (r
= character
, cnt
= 0; cnt
< len
; cnt
++, r
= g_utf8_next_char (r
))
/*
 * NOTE(review): memcmp is given the character count len as a byte count.
 * For separators containing multi-byte characters this compares fewer
 * bytes than the separator occupies -- confirm whether that is intended.
 */
548 if ((cnt
== len
) && (memcmp (character
, s
, len
) == 0))
549 return g_utf8_offset_to_pointer (character
, len
);
/* Single-character separators: is the current character listed in @chr? */
553 if (chr
&& my_utf8_strchr (chr
, g_utf8_get_char (character
)))
554 return g_utf8_next_char(character
);
560 * stf_parse_eat_separators:
562 * skip over leading separators
/*
 * Skips over separators found at the parse position of @src.
 * NOTE(review): the initialisation of cur and the update of src->position
 * are on lines not visible in this view.
 */
567 stf_parse_eat_separators (Source_t
*src
, StfParseOptions_t
*parseoptions
)
569 char const *cur
, *next
;
571 g_return_if_fail (src
!= NULL
);
572 g_return_if_fail (parseoptions
!= NULL
);
/* End of data or a line terminator here gets special treatment. */
576 if (*cur
== '\0' || compare_terminator (cur
, parseoptions
))
/* Keep going for as long as a separator starts at cur. */
578 while ((next
= stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
)))
/*
 * One enumerator of the result codes returned by stf_parse_csv_cell: a
 * field was read without a separator being seen after it.
 * NOTE(review): the rest of the enum is not visible in this view.
 */
589 STF_CELL_FIELD_NO_SEP
,
/*
 * Parses one CSV field starting at src->position and appends its content
 * to @text.  The result reports whether a separator was seen after the
 * field (STF_CELL_FIELD_SEP) or not (STF_CELL_FIELD_NO_SEP);
 * STF_CELL_ERROR is returned when a precondition fails.
 * NOTE(review): several lines of this function are not visible in this
 * view, including the initialisation of cur and the final update of
 * src->position.
 */
593 static StfParseCellRes
594 stf_parse_csv_cell (GString
*text
, Source_t
*src
, StfParseOptions_t
*parseoptions
)
597 gboolean saw_sep
= FALSE
;
599 g_return_val_if_fail (src
!= NULL
, STF_CELL_ERROR
);
600 g_return_val_if_fail (parseoptions
!= NULL
, STF_CELL_ERROR
);
603 g_return_val_if_fail (cur
!= NULL
, STF_CELL_ERROR
);
605 /* Skip whitespace, but stop at line terminators. */
614 term_len
= compare_terminator (cur
, parseoptions
);
/* A terminator here: continue parsing just after it. */
616 src
->position
= cur
+ term_len
;
/* Leading whitespace is only skipped when left-trimming is enabled. */
620 if ((parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) == 0)
/* A separator is checked for before the whitespace test. */
623 if (stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
,
624 parseoptions
->sep
.str
))
627 if (!g_unichar_isspace (g_utf8_get_char (cur
)))
629 cur
= g_utf8_next_char (cur
);
/* Quoted field: enabled string indicator found at the field start. */
632 if (parseoptions
->stringindicator
!= 0 &&
633 g_utf8_get_char (cur
) == parseoptions
->stringindicator
) {
634 cur
= g_utf8_next_char (cur
);
636 gunichar uc
= g_utf8_get_char (cur
);
637 cur
= g_utf8_next_char (cur
);
639 if (uc
== parseoptions
->stringindicator
) {
/* A doubled indicator: step over the second one as well. */
640 if (parseoptions
->indicator_2x_is_single
&&
641 g_utf8_get_char (cur
) == parseoptions
->stringindicator
)
642 cur
= g_utf8_next_char (cur
);
644 /* "field content"dropped-garbage, */
645 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
646 char const *post
= stf_parse_csv_is_separator
647 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
653 cur
= g_utf8_next_char (cur
);
659 g_string_append_unichar (text
, uc
);
662 /* We silently allow a missing terminating quote. */
664 /* Unquoted field. */
/* Copy characters until end of data, a line terminator or a separator. */
666 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
668 char const *post
= stf_parse_csv_is_separator
669 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
676 g_string_append_unichar (text
, g_utf8_get_char (cur
));
677 cur
= g_utf8_next_char (cur
);
/* Right-trim: cut trailing whitespace characters off @text. */
680 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
682 const char *last
= g_utf8_prev_char (text
->str
+ text
->len
);
683 if (!g_unichar_isspace (g_utf8_get_char (last
)))
685 g_string_truncate (text
, last
- text
->str
);
/* With "duplicates" on, any further adjacent separators are consumed too. */
692 if (saw_sep
&& parseoptions
->sep
.duplicates
)
693 stf_parse_eat_separators (src
, parseoptions
);
695 return saw_sep
? STF_CELL_FIELD_SEP
: STF_CELL_FIELD_NO_SEP
;
699 * stf_parse_csv_line:
701 * This will parse one line from the current @src->position.
702 * NOTE: The calling routine is responsible for freeing the result.
704 * returns : a GPtrArray of char*'s
/*
 * Parses one CSV line starting at src->position into a new GPtrArray of
 * field strings.  The field strings are stored in src->chunk.
 * NOTE(review): the loop and switch structure around the visible
 * statements is on lines not visible in this view.
 */
707 stf_parse_csv_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
710 gboolean cont
= FALSE
;
713 g_return_val_if_fail (src
!= NULL
, NULL
);
714 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
716 line
= g_ptr_array_new ();
/* Optionally drop separators at the very start of the line. */
717 if (parseoptions
->trim_seps
)
718 stf_parse_eat_separators (src
, parseoptions
);
/* Scratch buffer reused for every field of the line. */
720 text
= g_string_sized_new (30);
724 StfParseCellRes res
=
725 stf_parse_csv_cell (text
, src
, parseoptions
);
/*
 * NOTE(review): trimming edits text->str directly and does not update
 * text->len, so the copy below may include stale bytes after the new NUL.
 */
726 trim_spaces_inplace (text
->str
, parseoptions
);
727 ctext
= g_string_chunk_insert_len (src
->chunk
,
728 text
->str
, text
->len
);
729 g_string_truncate (text
, 0);
732 case STF_CELL_FIELD_NO_SEP
:
733 g_ptr_array_add (line
, ctext
);
737 case STF_CELL_FIELD_SEP
:
738 g_ptr_array_add (line
, ctext
);
739 cont
= TRUE
; /* Make sure we see one more field. */
744 g_ptr_array_add (line
, ctext
);
745 g_string_free (text
, TRUE
);
752 * stf_parse_fixed_cell:
754 * returns a pointer to the parsed cell contents.
/*
 * Reads one fixed width cell starting at src->position and stores its
 * text in src->chunk.
 * NOTE(review): the initialisation of cur and splitval, the bookkeeping
 * inside the loop and the return statement are not visible in this view.
 */
757 stf_parse_fixed_cell (Source_t
*src
, StfParseOptions_t
*parseoptions
)
763 g_return_val_if_fail (src
!= NULL
, NULL
);
764 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
/* Fetch the next split position while src->splitpos is inside the array. */
768 if (src
->splitpos
< my_garray_len (parseoptions
->splitpositions
))
769 splitval
= (int) g_array_index (parseoptions
->splitpositions
, int, src
->splitpos
);
/* Advance until end of data, a line terminator or the split position. */
773 while (*cur
!= 0 && !compare_terminator (cur
, parseoptions
) && splitval
!= src
->linepos
) {
775 cur
= g_utf8_next_char (cur
);
/* The cell spans the bytes from src->position up to cur. */
778 res
= g_string_chunk_insert_len (src
->chunk
,
780 cur
- src
->position
);
788 * stf_parse_fixed_line:
790 * This will parse one line from the current @src->position.
791 * It will return a GPtrArray with the cell contents as strings.
793 * NOTE: The calling routine is responsible for freeing result.
/*
 * Parses one fixed width line starting at src->position into a new
 * GPtrArray of field strings, padded with empty strings up to the number
 * of split positions.
 * NOTE(review): the reset of the per-line state in @src and the return
 * statement are on lines not visible in this view.
 */
796 stf_parse_fixed_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
800 g_return_val_if_fail (src
!= NULL
, NULL
);
801 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
806 line
= g_ptr_array_new ();
/* One cell per iteration until end of data or a line terminator. */
807 while (*src
->position
!= '\0' && !compare_terminator (src
->position
, parseoptions
)) {
808 char *field
= stf_parse_fixed_cell (src
, parseoptions
);
810 trim_spaces_inplace (field
, parseoptions
);
811 g_ptr_array_add (line
, field
);
/*
 * NOTE(review): these padding strings are heap copies, unlike the fields
 * above which live in src->chunk, and stf_parse_general_free does not
 * free fields -- check who releases them.
 */
816 while (line
->len
< parseoptions
->splitpositions
->len
)
817 g_ptr_array_add (line
, g_strdup (""));
823 * stf_parse_general_free: (skip)
/*
 * Releases the array of lines returned by the parsers: every per-line
 * array, then the outer array.  The field strings themselves are left
 * alone.
 * NOTE(review): a line between the comment and the inner free is not
 * visible in this view; stf_parse_sheet stores NULL entries in this
 * array, so a NULL check is presumably there -- confirm.
 */
826 stf_parse_general_free (GPtrArray
*lines
)
829 for (lineno
= 0; lineno
< lines
->len
; lineno
++) {
830 GPtrArray
*line
= g_ptr_array_index (lines
, lineno
);
831 /* Fields are not freed here. */
833 g_ptr_array_free (line
, TRUE
);
835 g_ptr_array_free (lines
, TRUE
);
840 * stf_parse_general: (skip)
842 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
843 * GPtrArray of strings.
845 * The caller must free this entire structure, for example by calling
846 * stf_parse_general_free.
/*
 * Splits the text in [@data, @data_end) into lines of fields using the
 * parser selected by parseoptions->parsetype.  Field strings are stored
 * in @lines_chunk.  Preconditions: non-NULL arguments, valid options and
 * valid UTF-8 input; otherwise NULL is returned.
 * NOTE(review): the initialisation of src and row and the return
 * statement are on lines not visible in this view.
 */
849 stf_parse_general (StfParseOptions_t
*parseoptions
,
850 GStringChunk
*lines_chunk
,
851 char const *data
, char const *data_end
)
856 char const *valid_end
= data_end
;
858 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
859 g_return_val_if_fail (data
!= NULL
, NULL
);
860 g_return_val_if_fail (data_end
!= NULL
, NULL
);
861 g_return_val_if_fail (stf_parse_options_valid (parseoptions
), NULL
);
862 g_return_val_if_fail (g_utf8_validate (data
, data_end
-data
, &valid_end
), NULL
);
864 src
.chunk
= lines_chunk
;
/* A leading UTF-8 byte-order mark (EF BB BF) is detected here. */
868 if ((data_end
-data
>= 3) && !strncmp(src
.position
, "\xEF\xBB\xBF", 3)) {
869 /* Skip over byte-order mark */
873 lines
= g_ptr_array_new ();
/* One line per iteration, until a NUL byte or @data_end is reached. */
874 while (*src
.position
!= '\0' && src
.position
< data_end
) {
/* Row limit reached: record the overflow in the options. */
877 if (row
== GNM_MAX_ROWS
) {
878 parseoptions
->rows_exceeded
= TRUE
;
882 line
= parseoptions
->parsetype
== PARSE_TYPE_CSV
883 ? stf_parse_csv_line (&src
, parseoptions
)
884 : stf_parse_fixed_line (&src
, parseoptions
);
886 g_ptr_array_add (lines
, line
);
/* Only the non-CSV path steps over the line terminator here. */
887 if (parseoptions
->parsetype
!= PARSE_TYPE_CSV
)
888 src
.position
+= compare_terminator (src
.position
, parseoptions
);
896 * stf_parse_lines: (skip)
897 * @parseoptions: #StfParseOptions_t
903 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
904 * GPtrArray of strings.
906 * The caller must free this entire structure, for example by calling
907 * stf_parse_general_free.
/*
 * Splits raw text into lines without splitting fields: each entry of the
 * returned array is a GPtrArray for one line, with strings stored in
 * @lines_chunk.
 * NOTE(review): the data parameter, the loop structure and the return
 * statement are on lines not visible in this view.
 */
910 stf_parse_lines (StfParseOptions_t
*parseoptions
,
911 GStringChunk
*lines_chunk
,
913 int maxlines
, gboolean with_lineno
)
918 g_return_val_if_fail (data
!= NULL
, NULL
);
920 lines
= g_ptr_array_new ();
/* data0 remembers where the current line starts. */
922 char const *data0
= data
;
923 GPtrArray
*line
= g_ptr_array_new ();
/* The line number is formatted in decimal and added as a string. */
926 char buf
[4 * sizeof (int)];
927 sprintf (buf
, "%d", lineno
);
928 g_ptr_array_add (line
,
929 g_string_chunk_insert (lines_chunk
, buf
));
933 int termlen
= compare_terminator (data
, parseoptions
);
/* A terminator or the end of the data ends the current line. */
934 if (termlen
> 0 || *data
== 0) {
935 g_ptr_array_add (line
,
936 g_string_chunk_insert_len (lines_chunk
,
942 data
= g_utf8_next_char (data
);
945 g_ptr_array_add (lines
, line
);
/* Check against the caller's line limit. */
948 if (lineno
>= maxlines
)
/*
 * Scans forward through the data one character at a time, checking for a
 * line terminator or the end of the data at each step.
 * NOTE(review): the remaining parameters, the line counting and the
 * return value are on lines not visible in this view.
 */
955 stf_parse_find_line (StfParseOptions_t
*parseoptions
,
960 int termlen
= compare_terminator (data
, parseoptions
);
964 } else if (*data
== 0) {
967 data
= g_utf8_next_char (data
);
975 * stf_parse_options_fixed_autodiscover:
976 * @parseoptions: a Parse options struct.
977 * @data: The actual data.
978 * @data_end: data end.
980 * Automatically try to discover columns in the text to be parsed.
981 * We ignore empty lines (only containing parseoptions->terminator)
983 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
984 * Think hard of a better more flexible solution...
/*
 * Guesses fixed width column boundaries from the text in [@data, @data_end)
 * and stores them in parseoptions->splitpositions, replacing whatever was
 * there.  It works in passes: record runs of spaces on every line, keep
 * the positions where a run starts or ends on every counted line, split
 * columns that look left and right justified, then drop columns that
 * contain only spaces.
 * NOTE(review): many lines of this function are not visible in this view.
 */
987 stf_parse_options_fixed_autodiscover (StfParseOptions_t
*parseoptions
,
988 char const *data
, char const *data_end
)
990 char const *iterator
= data
;
992 GSList
*list_start
= NULL
;
994 int effective_lines
= 0;
995 int max_line_length
= 0;
996 int *line_begin_hits
= NULL
;
997 int *line_end_hits
= NULL
;
/* Start from an empty set of split positions. */
1000 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
1003 * First take a look at all possible white space combinations
1005 while (*iterator
&& iterator
< data_end
) {
1006 gboolean begin_recorded
= FALSE
;
1007 AutoDiscovery_t
*disc
= NULL
;
/* Walk one line, up to its terminator. */
1011 while (*iterator
&& (termlen
= compare_terminator (iterator
, parseoptions
)) == 0) {
/* A run of spaces begins: open a new record at this position. */
1012 if (!begin_recorded
&& *iterator
== ' ') {
1013 disc
= g_new0 (AutoDiscovery_t
, 1);
1015 disc
->start
= position
;
1017 begin_recorded
= TRUE
;
/* The run ends at the first non-space: close the record and keep it. */
1018 } else if (begin_recorded
&& *iterator
!= ' ') {
1019 disc
->stop
= position
;
1020 list
= g_slist_prepend (list
, disc
);
1022 begin_recorded
= FALSE
;
/* Track the length of the longest line seen. */
1030 if (position
> max_line_length
)
1031 max_line_length
= position
;
1034 * If there are excess spaces at the end of
1035 * the line : ignore them
1040 * Hop over the terminator
1042 iterator
+= termlen
;
/* Records were prepended; put them back in order of discovery. */
1050 list
= g_slist_reverse (list
);
1055 * Look at the number of hits at each line position
1056 * if the number of hits equals the number of lines
1057 * we can be pretty sure this is the start or end
1058 * of a column, we filter out empty columns
/* Zero-filled hit counters, one slot per position on the longest line. */
1061 line_begin_hits
= g_new0 (int, max_line_length
+ 1);
1062 line_end_hits
= g_new0 (int, max_line_length
+ 1);
1065 AutoDiscovery_t
*disc
= list
->data
;
1067 line_begin_hits
[disc
->start
]++;
1068 line_end_hits
[disc
->stop
]++;
1072 list
= g_slist_next (list
);
1074 g_slist_free (list_start
);
/* A position hit on every counted line becomes a split position. */
1076 for (i
= 0; i
< max_line_length
+ 1; i
++)
1077 if (line_begin_hits
[i
] == effective_lines
|| line_end_hits
[i
] == effective_lines
)
1078 stf_parse_options_fixed_splitpositions_add (parseoptions
, i
);
1081 * Do some corrections to the initial columns
1082 * detected here, we obviously don't need to
1083 * do this if there are no columns at all.
1085 if (my_garray_len (parseoptions
->splitpositions
) > 0) {
1087 * Try to find columns that look like :
1092 * (In other words : Columns with left & right justification with
1093 * a minimum of 2 spaces in the middle)
1094 * Split these columns in 2
/* Examine each pair of neighbouring split positions as one column. */
1097 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1098 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1099 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1100 int num_spaces
= -1;
1101 int spaces_start
= 0;
/* Assume the column qualifies until some line proves otherwise. */
1102 gboolean right_aligned
= TRUE
;
1103 gboolean left_aligned
= TRUE
;
1104 gboolean has_2_spaces
= TRUE
;
1108 while (*iterator
&& iterator
< data_end
) {
1109 gboolean trigger
= FALSE
;
1110 gboolean space_trigger
= FALSE
;
1115 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
/* A space at the column edge rules out that alignment. */
1117 if (*iterator
== ' ')
1118 left_aligned
= FALSE
;
1121 } else if (pos
== end
- 1) {
1122 if (*iterator
== ' ')
1123 right_aligned
= FALSE
;
/* Inside the column: measure the run of spaces. */
1128 if (trigger
|| pos
== end
- 1) {
1129 if (!space_trigger
&& *iterator
== ' ') {
1130 space_trigger
= TRUE
;
1132 } else if (space_trigger
&& *iterator
!= ' ') {
1133 space_trigger
= FALSE
;
1134 num_spaces
= pos
- spaces_start
;
1143 has_2_spaces
= FALSE
;
1152 * If this column meets all the criteria
1153 * split it into two at the last measured
1154 * spaces_start + num_spaces
1156 if (has_2_spaces
&& right_aligned
&& left_aligned
) {
/* The expression reduces to spaces_start + num_spaces / 2. */
1157 int val
= (((spaces_start
+ num_spaces
) - spaces_start
) / 2) + spaces_start
;
1159 g_array_insert_val (parseoptions
->splitpositions
, i
+ 1, val
);
1162 * Skip over the inserted column
1169 * Remove empty columns here if needed
1171 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1172 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1173 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1174 gboolean only_spaces
= TRUE
;
1178 while (*iterator
&& iterator
< data_end
) {
1179 gboolean trigger
= FALSE
;
1182 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1185 else if (pos
== end
)
/* Any non-space character means the column is not empty. */
1189 if (*iterator
!= ' ')
1190 only_spaces
= FALSE
;
1204 * The column only contains spaces
1208 g_array_remove_index (parseoptions
->splitpositions
, i
);
1211 * We HAVE to make sure that the next column (end) also
1212 * gets checked out. If we don't decrease "i" here, we
1213 * will skip over it as the indexes shift down after
1221 g_free (line_begin_hits
);
1222 g_free (line_end_hits
);
1225 /*******************************************************************************************************
1226 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1227 * functions into something meaningful (== application specific)
1228 *******************************************************************************************************/
1231 * This is more or less as gnm_cell_set_text, except...
1232 * 1. Unknown names are not allowed.
1233 * 2. Only '=' can start an expression.
/*
 * Stores @text in @cell either as an expression or as a value.  Only text
 * starting with '=' followed by at least one more character, in a cell
 * whose format is not text, is handed to the expression parser.
 * NOTE(review): the branching between the visible statements is on lines
 * not visible in this view.
 */
1237 stf_cell_set_text (GnmCell
*cell
, char const *text
)
1239 GnmExprTop
const *texpr
;
1241 GOFormat
const *fmt
= gnm_style_get_format (gnm_cell_get_style (cell
));
1242 const GODateConventions
*date_conv
=
1243 workbook_date_conv (cell
->base
.sheet
->workbook
);
1245 if (!go_format_is_text (fmt
) && *text
== '=' && text
[1] != 0) {
/* Unknown names make the expression invalid rather than being accepted. */
1246 GnmExprParseFlags flags
=
1247 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID
;
/* The expression text starts after the '='. */
1248 const char *expr_start
= text
+ 1;
1251 parse_pos_init_cell (&pos
, cell
);
1252 texpr
= gnm_expr_parse_str (expr_start
, &pos
, flags
,
/* Value path: match @text against the cell format. */
1256 val
= format_match (text
, fmt
, date_conv
);
/* A plain string value is also created on one of the paths. */
1260 val
= value_new_string (text
);
1263 gnm_cell_set_value (cell
, val
);
/* Expression path: the cell is given the expression, then our reference is dropped. */
1265 gnm_cell_set_expr (cell
, texpr
);
1266 gnm_expr_top_unref (texpr
);
/*
 * For CSV parsing, copies the separator characters, the quote character
 * and the first line terminator from @po into the export settings object
 * obtained for @book.
 * NOTE(review): the declaration of quote and the branch preceding the
 * `else` are on lines not visible in this view.
 */
1271 stf_read_remember_settings (Workbook
*book
, StfParseOptions_t
*po
)
1273 if (po
->parsetype
== PARSE_TYPE_CSV
) {
1274 GnmStfExport
*stfe
= gnm_stf_get_stfe (G_OBJECT (book
));
/* Encode the string indicator as UTF-8; length is the byte count written. */
1276 int length
= g_unichar_to_utf8 (po
->stringindicator
, quote
);
/* Terminate the encoded quote string. */
1280 } else quote
[length
] = '\0';
1282 g_object_set (G_OBJECT (stfe
), "separator", po
->sep
.chr
, "quote", quote
, NULL
);
/* Only the first terminator in the list is passed on as "eol". */
1284 if ((po
->terminator
!= NULL
) && (po
->terminator
->data
!= NULL
))
1285 g_object_set (G_OBJECT (stfe
), "eol", po
->terminator
->data
, NULL
);
/*
 * Parses the text in [@data, @data_end) and writes the resulting fields
 * into @sheet starting at (@start_col, @start_row).  Column formats from
 * the options are applied first, then the cells are filled under the
 * locale named in the options, and finally column widths are adjusted.
 * Data beyond the sheet limits is reported once and flagged in the options.
 * NOTE(review): many lines of this function are not visible in this view,
 * including the column bookkeeping, END_LOCALE_SWITCH and the return.
 */
1290 stf_parse_sheet (StfParseOptions_t
*parseoptions
,
1291 char const *data
, char const *data_end
,
1292 Sheet
*sheet
, int start_col
, int start_row
)
1296 GStringChunk
*lines_chunk
;
1298 gboolean result
= TRUE
;
1303 SETUP_LOCALE_SWITCH
;
1305 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
1306 g_return_val_if_fail (data
!= NULL
, FALSE
);
1307 g_return_val_if_fail (IS_SHEET (sheet
), FALSE
);
/* NOTE(review): the condition guarding this assignment is not visible in this view. */
1310 data_end
= data
+ strlen (data
);
/* All field strings live in this chunk until it is freed below. */
1312 lines_chunk
= g_string_chunk_new (100 * 1024);
1313 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
/* Pass 1: apply the per-column formats as styles on the target range. */
1318 nformats
= parseoptions
->formats
->len
;
1319 for (lcol
= 0; lcol
< nformats
; lcol
++) {
1320 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
/* A column is wanted unless the import array explicitly says otherwise. */
1323 (parseoptions
->col_import_array
== NULL
||
1324 parseoptions
->col_import_array_len
<= lcol
||
1325 parseoptions
->col_import_array
[lcol
]);
1326 if (!want_col
|| col
>= gnm_sheet_get_max_cols (sheet
))
1329 if (fmt
&& !go_format_is_general (fmt
)) {
/* Style rows from start_row to the last parsed line, clamped to the sheet. */
1331 int end_row
= MIN (start_row
+ (int)lines
->len
- 1,
1332 gnm_sheet_get_last_row (sheet
));
1334 range_init (&r
, col
, start_row
, col
, end_row
);
1335 mstyle
= gnm_style_new ();
1336 gnm_style_set_format (mstyle
, fmt
);
1337 sheet_apply_style (sheet
, &r
, mstyle
);
/* Pass 2: fill the cells, under the locale requested by the options. */
1342 START_LOCALE_SWITCH
;
1343 for (row
= start_row
, lrow
= 0;
1344 result
&& lrow
< lines
->len
;
1348 if (row
>= gnm_sheet_get_max_rows (sheet
)) {
/* Warn only once about rows that do not fit. */
1349 if (!parseoptions
->rows_exceeded
) {
1350 /* FIXME: What locale? */
1351 g_warning (_("There are more rows of data than "
1352 "there is room for in the sheet. Extra "
1353 "rows will be ignored."));
1354 parseoptions
->rows_exceeded
= TRUE
;
1360 line
= g_ptr_array_index (lines
, lrow
);
1362 for (lcol
= 0; lcol
< line
->len
; lcol
++) {
/* Columns beyond the configured formats use the general format. */
1363 GOFormat
const *fmt
= lcol
< nformats
1364 ? g_ptr_array_index (parseoptions
->formats
, lcol
)
1365 : go_format_general ();
1366 char const *text
= g_ptr_array_index (line
, lcol
);
1368 (parseoptions
->col_import_array
== NULL
||
1369 parseoptions
->col_import_array_len
<= lcol
||
1370 parseoptions
->col_import_array
[lcol
]);
1374 if (col
>= gnm_sheet_get_max_cols (sheet
)) {
/* Warn only once about columns that do not fit. */
1375 if (!parseoptions
->cols_exceeded
) {
1376 /* FIXME: What locale? */
1377 g_warning (_("There are more columns of data than "
1378 "there is room for in the sheet. Extra "
1379 "columns will be ignored."));
1380 parseoptions
->cols_exceeded
= TRUE
;
/* Empty fields do not create cells. */
1384 if (text
&& *text
) {
1385 GnmCell
*cell
= sheet_cell_fetch (sheet
, col
, row
);
/* Non-text column with an explicit decimal setting: parse as a number with the column's own symbols. */
1386 if (!go_format_is_text (fmt
) &&
1387 lcol
< parseoptions
->formats_decimal
->len
&&
1388 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
)) {
1390 GnmValue
*v
= format_match_decimal_number_with_locale
1392 g_ptr_array_index (parseoptions
->formats_curr
, lcol
),
1393 g_ptr_array_index (parseoptions
->formats_thousand
, lcol
),
1394 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
));
/* NOTE(review): presumably the fallback when no number matched -- the guard is not visible. */
1396 v
= value_new_string (text
);
1397 sheet_cell_set_value (cell
, v
);
1400 stf_cell_set_text (cell
, text
);
/* The line is consumed: clear its slot so the final cleanup skips it. */
1406 g_ptr_array_index (lines
, lrow
) = NULL
;
1407 g_ptr_array_free (line
, TRUE
);
/* Pass 3: adjust the width of imported columns that have autofit enabled. */
1411 for (lcol
= 0, col
= start_col
;
1412 lcol
< parseoptions
->col_import_array_len
&& col
< gnm_sheet_get_max_cols (sheet
);
1414 if (parseoptions
->col_import_array
== NULL
||
1415 parseoptions
->col_import_array_len
<= lcol
||
1416 parseoptions
->col_import_array
[lcol
]) {
1417 if (parseoptions
->col_autofit_array
== NULL
||
1418 parseoptions
->col_autofit_array
[lcol
]) {
1419 ColRowIndexList
*list
= colrow_get_index_list (col
, col
, NULL
);
1420 ColRowStateGroup
*state
= colrow_set_sizes (sheet
, TRUE
, list
, -1, 0, -1);
1421 colrow_index_list_destroy (list
);
/* NOTE(review): only the list nodes are freed here; check whether the saved state needs a dedicated destroy call. */
1422 g_slist_free (state
);
1428 g_string_chunk_free (lines_chunk
);
1430 stf_parse_general_free (lines
);
/* Remember the CSV settings on the workbook for later export. */
1432 stf_read_remember_settings (sheet
->workbook
, parseoptions
);
/*
 * Parses the text in [@data, @data_end) into a new cell region instead of
 * a sheet.  Date conventions come from the workbook wb when there is one,
 * otherwise from a zero-initialised default.
 * NOTE(review): the remaining parameters, the cell value assignment,
 * END_LOCALE_SWITCH and the return are on lines not visible in this view.
 */
1437 stf_parse_region (StfParseOptions_t
*parseoptions
, char const *data
, char const *data_end
,
1440 static GODateConventions
const default_conv
= {FALSE
};
1441 GODateConventions
const *date_conv
= wb
? workbook_date_conv (wb
) : &default_conv
;
1444 unsigned int row
, colhigh
= 0;
1445 GStringChunk
*lines_chunk
;
1449 SETUP_LOCALE_SWITCH
;
1451 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
1452 g_return_val_if_fail (data
!= NULL
, NULL
);
/* Parse under the locale requested by the options. */
1454 START_LOCALE_SWITCH
;
1456 cr
= gnm_cell_region_new (NULL
);
/* NOTE(review): the condition guarding this assignment is not visible in this view. */
1459 data_end
= data
+ strlen (data
);
1460 lines_chunk
= g_string_chunk_new (100 * 1024);
1461 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1462 nformats
= parseoptions
->formats
->len
;
1463 for (row
= 0; row
< lines
->len
; row
++) {
1464 GPtrArray
*line
= g_ptr_array_index (lines
, row
);
/* targetcol counts only the columns that are actually imported. */
1465 unsigned int col
, targetcol
= 0;
1466 for (col
= 0; col
< line
->len
; col
++) {
/* A column is imported unless the import array explicitly says otherwise. */
1467 if (parseoptions
->col_import_array
== NULL
||
1468 parseoptions
->col_import_array_len
<= col
||
1469 parseoptions
->col_import_array
[col
]) {
1470 const char *text
= g_ptr_array_index (line
, col
);
1472 GOFormat
*fmt
= NULL
;
1477 fmt
= g_ptr_array_index (parseoptions
->formats
, col
);
1478 v
= format_match (text
, fmt
, date_conv
);
/* NOTE(review): presumably the fallback when nothing matched -- the guard is not visible. */
1480 v
= value_new_string (text
);
1482 cc
= gnm_cell_copy_new (cr
, targetcol
, row
);
/* Track the widest row seen. */
1486 if (targetcol
> colhigh
)
1487 colhigh
= targetcol
;
1492 stf_parse_general_free (lines
);
1493 g_string_chunk_free (lines_chunk
);
/* The region is always at least one column wide. */
1497 cr
->cols
= (colhigh
> 0) ? colhigh
: 1;
/*
 * qsort() comparator ordering ints ascending.
 *
 * Returns a negative, zero or positive value when *a is less than,
 * equal to or greater than *b.  The result is computed from comparisons
 * rather than a subtraction so that it cannot overflow for operands
 * that are far apart (e.g. INT_MIN and a positive value).
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1510 count_character (GPtrArray
*lines
, gunichar c
, double quantile
)
1513 unsigned int lno
, cno
;
1515 if (lines
->len
== 0)
1518 counts
= g_new (int, lines
->len
);
1519 for (lno
= cno
= 0; lno
< lines
->len
; lno
++) {
1521 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1522 char const *line
= g_ptr_array_index (boxline
, 0);
1524 /* Ignore empty lines. */
1529 if (g_utf8_get_char (line
) == c
)
1531 line
= g_utf8_next_char (line
);
1534 counts
[cno
++] = count
;
1540 unsigned int qi
= (unsigned int)ceil (quantile
* cno
);
1541 qsort (counts
, cno
, sizeof (counts
[0]), int_sort
);
1553 dump_guessed_options (const StfParseOptions_t
*res
)
1556 char ubuffer
[6 + 1];
1559 g_printerr ("Guessed format:\n");
1560 switch (res
->parsetype
) {
1561 case PARSE_TYPE_CSV
:
1562 g_printerr (" type = sep\n");
1563 g_printerr (" separator = %s\n",
1564 res
->sep
.chr
? res
->sep
.chr
: "(none)");
1565 g_printerr (" see two as one = %s\n",
1566 res
->sep
.duplicates
? "yes" : "no");
1568 case PARSE_TYPE_FIXED
:
1569 g_printerr (" type = sep\n");
1574 g_printerr (" trim space = %d\n", res
->trim_spaces
);
1576 ubuffer
[g_unichar_to_utf8 (res
->stringindicator
, ubuffer
)] = 0;
1577 g_printerr (" string indicator = %s\n", ubuffer
);
1578 g_printerr (" see two as one = %s\n",
1579 res
->indicator_2x_is_single
? "yes" : "no");
1581 g_printerr (" line terminators =");
1582 for (l
= res
->terminator
; l
; l
= l
->next
) {
1583 const char *t
= l
->data
;
1584 if (strcmp (t
, "\n") == 0)
1585 g_printerr (" unix");
1586 else if (strcmp (t
, "\r") == 0)
1587 g_printerr (" mac");
1588 else if (strcmp (t
, "\r\n") == 0)
1589 g_printerr (" dos");
1591 g_printerr (" other");
1595 for (ui
= 0; ui
< res
->formats
->len
; ui
++) {
1596 GOFormat
const *fmt
= g_ptr_array_index (res
->formats
, ui
);
1597 const GString
*decimal
= ui
< res
->formats_decimal
->len
1598 ? g_ptr_array_index (res
->formats_decimal
, ui
)
1600 const GString
*thousand
= ui
< res
->formats_thousand
->len
1601 ? g_ptr_array_index (res
->formats_thousand
, ui
)
1604 g_printerr (" fmt.%d = %s\n", ui
, go_format_as_XL (fmt
));
1606 g_printerr (" fmt.%d.dec = %s\n", ui
, decimal
->str
);
1608 g_printerr (" fmt.%d.thou = %s\n", ui
, thousand
->str
);
1613 * stf_parse_options_guess:
1614 * @data: the input data.
1616 * Returns: (transfer full): the guessed options.
1619 stf_parse_options_guess (char const *data
)
1621 StfParseOptions_t
*res
;
1622 GStringChunk
*lines_chunk
;
1626 gunichar sepchar
= go_locale_get_arg_sep ();
1628 g_return_val_if_fail (data
!= NULL
, NULL
);
1630 res
= stf_parse_options_new ();
1631 lines_chunk
= g_string_chunk_new (100 * 1024);
1632 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1634 tabcount
= count_character (lines
, '\t', 0.2);
1635 sepcount
= count_character (lines
, sepchar
, 0.2);
1637 /* At least one tab per line and enough to separate every
1638 would-be sepchars. */
1639 if (tabcount
>= 1 && tabcount
>= sepcount
- 1)
1640 stf_parse_options_csv_set_separators (res
, "\t", NULL
);
1645 * Try a few more or less likely characters and pick the first
1646 * one that occurs on at least half the lines.
1648 * The order is mostly random, although ' ' and '!' which
1649 * could very easily occur in text are put last.
1651 if (count_character (lines
, (c
= sepchar
), 0.5) > 0 ||
1652 count_character (lines
, (c
= go_locale_get_col_sep ()), 0.5) > 0 ||
1653 count_character (lines
, (c
= ':'), 0.5) > 0 ||
1654 count_character (lines
, (c
= ','), 0.5) > 0 ||
1655 count_character (lines
, (c
= ';'), 0.5) > 0 ||
1656 count_character (lines
, (c
= '|'), 0.5) > 0 ||
1657 count_character (lines
, (c
= '!'), 0.5) > 0 ||
1658 count_character (lines
, (c
= ' '), 0.5) > 0) {
1660 sep
[g_unichar_to_utf8 (c
, sep
)] = 0;
1663 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1667 // For now, always separated:
1668 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1670 switch (res
->parsetype
) {
1671 case PARSE_TYPE_CSV
: {
1674 strchr (res
->sep
.chr
, ' ') != NULL
;
1677 strchr (res
->sep
.chr
, ' ') != NULL
;
1679 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1680 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1681 stf_parse_options_csv_set_duplicates (res
, dups
);
1682 stf_parse_options_csv_set_trim_seps (res
, trim
);
1684 stf_parse_options_csv_set_stringindicator (res
, '"');
1688 case PARSE_TYPE_FIXED
:
1692 g_assert_not_reached ();
1695 stf_parse_general_free (lines
);
1696 g_string_chunk_free (lines_chunk
);
1698 stf_parse_options_guess_formats (res
, data
);
1700 if (gnm_debug_flag ("stf"))
1701 dump_guessed_options (res
);
1707 * stf_parse_options_guess_csv:
1708 * @data: the CSV input data.
1710 * Returns: (transfer full): the guessed options.
1713 stf_parse_options_guess_csv (char const *data
)
1715 StfParseOptions_t
*res
;
1716 GStringChunk
*lines_chunk
;
1719 char const *quoteline
= NULL
;
1721 gunichar stringind
= '"';
1723 g_return_val_if_fail (data
!= NULL
, NULL
);
1725 res
= stf_parse_options_new ();
1726 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1727 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1728 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1729 stf_parse_options_csv_set_duplicates (res
, FALSE
);
1730 stf_parse_options_csv_set_trim_seps (res
, FALSE
);
1731 stf_parse_options_csv_set_stringindicator (res
, stringind
);
1733 lines_chunk
= g_string_chunk_new (100 * 1024);
1734 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1737 * Find a line containing a quote; skip first line unless it is
1738 * the only one. Prefer a line with the quote first.
1740 for (pass
= 1; !quoteline
&& pass
<= 2; pass
++) {
1742 for (lno
= MIN (1, lines
->len
- 1);
1743 !quoteline
&& lno
< lines
->len
;
1745 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1746 const char *line
= g_ptr_array_index (boxline
, 0);
1749 if (g_utf8_get_char (line
) == stringind
)
1753 if (my_utf8_strchr (line
, stringind
))
1761 const char *p0
= my_utf8_strchr (quoteline
, stringind
);
1765 p
= g_utf8_next_char (p
);
1766 } while (*p
&& g_utf8_get_char (p
) != stringind
);
1767 if (*p
) p
= g_utf8_next_char (p
);
1768 while (*p
&& g_unichar_isspace (g_utf8_get_char (p
)))
1769 p
= g_utf8_next_char (p
);
1771 /* Use the character after the quote. */
1772 sep
= g_strndup (p
, g_utf8_next_char (p
) - p
);
1774 /* Try to use character before the quote. */
1775 while (p0
> quoteline
&& !sep
) {
1777 p0
= g_utf8_prev_char (p0
);
1778 if (!g_unichar_isspace (g_utf8_get_char (p0
)))
1779 sep
= g_strndup (p0
, p
- p0
);
1785 sep
= g_strdup (",");
1786 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1789 stf_parse_general_free (lines
);
1790 g_string_chunk_free (lines_chunk
);
1792 stf_parse_options_guess_formats (res
, data
);
1794 if (gnm_debug_flag ("stf"))
1795 dump_guessed_options (res
);
/*
 * Bit flags for the interpretations still considered possible for a
 * column while guessing its format.  The low bits are the date field
 * orders; the 0x10 and 0x20 bits are the two decimal-separator choices.
 */
typedef enum {
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER = (STF_GUESS_NUMBER_DEC_POINT |
				       STF_GUESS_NUMBER_DEC_COMMA),

	STF_GUESS_ALL = (STF_GUESS_DATE_DMY |
			 STF_GUESS_DATE_MDY |
			 STF_GUESS_DATE_YMD |
			 STF_GUESS_NUMBER_DEC_EITHER)
} StfGuessFormats;
1813 do_check_date (const char *data
, StfGuessFormats flag
,
1814 gboolean mbd
, gboolean ybm
,
1816 GODateConventions
const *date_conv
)
1819 gboolean this_mbd
, this_ybm
;
1822 if (!(*possible
& flag
))
1825 v
= format_match_datetime (data
, date_conv
, mbd
, TRUE
, FALSE
);
1826 if (!v
|| !VALUE_FMT (v
))
1829 imbd
= go_format_month_before_day (VALUE_FMT (v
));
1830 this_mbd
= (imbd
>= 1);
1831 this_ybm
= (imbd
== 2);
1832 if (mbd
!= this_mbd
|| ybm
!= this_ybm
)
1845 do_check_number (const char *data
, StfGuessFormats flag
,
1846 const GString
*dec
, const GString
*thousand
, const GString
*curr
,
1847 unsigned *possible
, int *decimals
)
1850 GOFormatFamily family
;
1853 if (!(*possible
& flag
))
1856 v
= format_match_decimal_number_with_locale (data
, &family
, curr
, thousand
, dec
);
1860 if (*decimals
!= -2) {
1861 const char *pdec
= strstr (data
, dec
->str
);
1862 int this_decimals
= 0;
1865 while (g_ascii_isdigit (*pdec
)) {
1870 if (*decimals
== -1)
1871 *decimals
= this_decimals
;
1872 else if (*decimals
!= this_decimals
)
1876 pthou
= strstr (data
, thousand
->str
);
1879 int digits
= 0, nonzero_digits
= 0;
1880 for (p
= data
; p
< pthou
; p
= g_utf8_next_char (p
)) {
1881 if (g_unichar_isdigit (g_utf8_get_char (p
))) {
1887 // "-.222" implies that "." is not a thousands separator.
1888 // "0.222" implies that "." is not a thousands separator.
1889 // "12345,555" implies that "," is not a thousands separator.
1890 if (nonzero_digits
== 0 || digits
> 3)
1904 * stf_parse_options_guess_formats:
1905 * @data: the CSV input data.
1907 * This function attempts to recognize data formats on a column-by-column
1908 * basis under the assumption that the data in a text file will generally
1909 * use the same data formats.
1911 * This is useful because not all values give sufficient information by
1912 * themselves to tell what format the data is in. For example, "1/2/2000"
1913 * is likely to be a date in year 2000, but it is not clear if it is in
1914 * January or February. If another value in the same column is "31/1/1999"
1915 * then it is likely that the former date was in February.
1917 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1918 * A later value of "111,200.22" would clear up the confusion.
1922 stf_parse_options_guess_formats (StfParseOptions_t
*po
, char const *data
)
1924 GStringChunk
*lines_chunk
;
1926 unsigned lno
, col
, colcount
, sline
;
1927 GODateConventions
const *date_conv
= go_date_conv_from_str ("Lotus:1900");
1928 GString
*s_comma
= g_string_new (",");
1929 GString
*s_dot
= g_string_new (".");
1930 GString
*s_dollar
= g_string_new ("$");
1931 gboolean debug
= gnm_debug_flag ("stf");
1933 g_ptr_array_set_size (po
->formats
, 0);
1934 g_ptr_array_set_size (po
->formats_decimal
, 0);
1935 g_ptr_array_set_size (po
->formats_thousand
, 0);
1936 g_ptr_array_set_size (po
->formats_curr
, 0);
1938 lines_chunk
= g_string_chunk_new (100 * 1024);
1939 lines
= stf_parse_general (po
, lines_chunk
, data
, data
+ strlen (data
));
1942 for (lno
= 0; lno
< lines
->len
; lno
++) {
1943 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1944 colcount
= MAX (colcount
, line
->len
);
1947 // Ignore first line unless it is the only one
1948 sline
= MIN ((int)lines
->len
- 1, 1);
1950 g_ptr_array_set_size (po
->formats
, colcount
);
1951 g_ptr_array_set_size (po
->formats_decimal
, colcount
);
1952 g_ptr_array_set_size (po
->formats_thousand
, colcount
);
1953 g_ptr_array_set_size (po
->formats_curr
, colcount
);
1954 for (col
= 0; col
< colcount
; col
++) {
1955 unsigned possible
= STF_GUESS_ALL
;
1956 GOFormat
*fmt
= NULL
;
1957 gboolean seen_dot
= FALSE
;
1958 gboolean seen_comma
= FALSE
;
1959 int decimals_if_point
= -1; // -1: unset; -2: inconsistent; >=0: count
1960 int decimals_if_comma
= -1; // -1: unset; -2: inconsistent; >=0: count
1962 for (lno
= sline
; possible
&& lno
< lines
->len
; lno
++) {
1963 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1964 const char *data
= col
< line
->len
? g_ptr_array_index (line
, col
) : "";
1965 unsigned prev_possible
= possible
;
1967 if (*data
== 0 || data
[0] == '\'')
1970 do_check_date (data
, STF_GUESS_DATE_DMY
, FALSE
, FALSE
, &possible
, date_conv
);
1971 do_check_date (data
, STF_GUESS_DATE_MDY
, TRUE
, FALSE
, &possible
, date_conv
);
1972 do_check_date (data
, STF_GUESS_DATE_YMD
, TRUE
, TRUE
, &possible
, date_conv
);
1974 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
) {
1975 const char *pdot
= strstr (data
, s_dot
->str
);
1976 const char *pcomma
= strstr (data
, s_comma
->str
);
1977 if (pdot
&& pcomma
) {
1978 // Both -- last one is the decimal separator
1980 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1982 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1983 } else if (pdot
&& strstr (pdot
+ s_dot
->len
, s_dot
->str
)) {
1984 // Two dots so they are thousands separators
1985 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1986 } else if (pcomma
&& strstr (pcomma
+ s_comma
->len
, s_comma
->str
)) {
1987 // Two commas so they are thousands separators
1988 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1991 seen_dot
= seen_dot
|| (pdot
!= 0);
1992 seen_comma
= seen_comma
|| (pcomma
!= 0);
1994 do_check_number (data
, STF_GUESS_NUMBER_DEC_POINT
,
1995 s_dot
, s_comma
, s_dollar
,
1996 &possible
, &decimals_if_point
);
1997 do_check_number (data
, STF_GUESS_NUMBER_DEC_COMMA
,
1998 s_comma
, s_dot
, s_dollar
,
1999 &possible
, &decimals_if_comma
);
2001 if (possible
!= prev_possible
&& debug
)
2002 g_printerr ("col=%d; after [%s] possible=0x%x\n", col
, data
, possible
);
2005 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
&&
2006 !seen_dot
&& !seen_comma
) {
2007 // It doesn't matter what the separators are
2008 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
2012 case STF_GUESS_DATE_DMY
:
2013 fmt
= go_format_new_from_XL ("d-mmm-yyyy");
2015 case STF_GUESS_DATE_MDY
:
2016 fmt
= go_format_new_from_XL ("m/d/yyyy");
2018 case STF_GUESS_DATE_YMD
:
2019 fmt
= go_format_new_from_XL ("yyyy-mm-dd");
2021 case STF_GUESS_NUMBER_DEC_POINT
:
2022 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (".");
2023 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (",");
2024 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2025 if (decimals_if_point
> 0) {
2026 // Don't set format if decimals is zero
2027 GString
*fmt_str
= g_string_new (NULL
);
2028 go_format_generate_number_str (fmt_str
, 1, decimals_if_point
, seen_comma
, FALSE
, FALSE
, "", "");
2029 fmt
= go_format_new_from_XL (fmt_str
->str
);
2030 g_string_free (fmt_str
, TRUE
);
2033 case STF_GUESS_NUMBER_DEC_COMMA
:
2034 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (",");
2035 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (".");
2036 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2037 if (decimals_if_comma
> 0) {
2038 // Don't set format if decimals is zero
2039 GString
*fmt_str
= g_string_new (NULL
);
2040 go_format_generate_number_str (fmt_str
, 1, decimals_if_comma
, seen_dot
, FALSE
, FALSE
, "", "");
2041 fmt
= go_format_new_from_XL (fmt_str
->str
);
2042 g_string_free (fmt_str
, TRUE
);
2050 fmt
= go_format_ref (go_format_general ());
2051 g_ptr_array_index (po
->formats
, col
) = fmt
;
2054 stf_parse_general_free (lines
);
2055 g_string_chunk_free (lines_chunk
);
2057 g_string_free (s_dot
, TRUE
);
2058 g_string_free (s_comma
, TRUE
);
2059 g_string_free (s_dollar
, TRUE
);