1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see <https://www.gnu.org/licenses/>.
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
31 #include "stf-parse.h"
32 #include "stf-export.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
42 #include "number-match.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
48 #include <goffice/goffice.h>
54 #define SETUP_LOCALE_SWITCH char *oldlocale = NULL
56 #define START_LOCALE_SWITCH if (parseoptions->locale) {\
57 oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
58 go_setlocale(LC_ALL, parseoptions->locale);}
60 #define END_LOCALE_SWITCH if (oldlocale) {\
61 go_setlocale(LC_ALL, oldlocale);\
64 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
67 char const *position
; /* Indicates the current position within data */
69 /* Used internally for fixed width parsing */
70 int splitpos
; /* Indicates current position in splitpositions array */
71 int linepos
; /* Position on the current line */
74 /* Struct used for autodiscovery */
81 * Somebody made the GArray length field an unsigned int. C just does
82 * not deal very well with that, so this helper returns it as a signed int.
85 my_garray_len (GArray
const *a
)
91 my_utf8_strchr (const char *p
, gunichar uc
)
93 return uc
< 0x7f ? strchr (p
, uc
) : g_utf8_strchr (p
, -1, uc
);
97 compare_terminator (char const *s
, StfParseOptions_t
*parseoptions
)
99 guchar
const *us
= (guchar
const *)s
;
102 if (*us
> parseoptions
->compiled_terminator
.max
||
103 *us
< parseoptions
->compiled_terminator
.min
)
106 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
107 char const *term
= l
->data
;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
130 gnm_g_string_free (GString
*s
)
132 if (s
) g_string_free (s
, TRUE
);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * After use, the struct should be freed with stf_parse_options_free.
142 static StfParseOptions_t
*
143 stf_parse_options_new (void)
145 StfParseOptions_t
* parseoptions
= g_new0 (StfParseOptions_t
, 1);
147 parseoptions
->parsetype
= PARSE_TYPE_NOTSET
;
149 parseoptions
->terminator
= NULL
;
150 stf_parse_options_add_line_terminator (parseoptions
, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions
, "\n");
152 stf_parse_options_add_line_terminator (parseoptions
, "\r");
154 parseoptions
->trim_spaces
= (TRIM_TYPE_RIGHT
| TRIM_TYPE_LEFT
);
155 parseoptions
->locale
= NULL
;
157 parseoptions
->splitpositions
= NULL
;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
160 parseoptions
->stringindicator
= '"';
161 parseoptions
->indicator_2x_is_single
= TRUE
;
162 parseoptions
->sep
.duplicates
= FALSE
;
163 parseoptions
->trim_seps
= FALSE
;
165 parseoptions
->sep
.str
= NULL
;
166 parseoptions
->sep
.chr
= NULL
;
168 parseoptions
->col_autofit_array
= NULL
;
169 parseoptions
->col_import_array
= NULL
;
170 parseoptions
->col_import_array_len
= 0;
171 parseoptions
->formats
= g_ptr_array_new_with_free_func ((GDestroyNotify
)go_format_unref
);
172 parseoptions
->formats_decimal
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
173 parseoptions
->formats_thousand
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
174 parseoptions
->formats_curr
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
176 parseoptions
->cols_exceeded
= FALSE
;
177 parseoptions
->rows_exceeded
= FALSE
;
178 parseoptions
->ref_count
= 1;
184 * stf_parse_options_free:
186 * Drops one reference to @parseoptions. When the last reference is released,
187 * the struct and its members (including the splitpositions GArray) are freed.
190 stf_parse_options_free (StfParseOptions_t
*parseoptions
)
192 g_return_if_fail (parseoptions
!= NULL
);
194 if (parseoptions
->ref_count
-- > 1)
197 g_free (parseoptions
->col_import_array
);
198 g_free (parseoptions
->col_autofit_array
);
199 g_free (parseoptions
->locale
);
200 g_free (parseoptions
->sep
.chr
);
202 if (parseoptions
->sep
.str
) {
205 for (l
= parseoptions
->sep
.str
; l
!= NULL
; l
= l
->next
)
206 g_free ((char *) l
->data
);
207 g_slist_free (parseoptions
->sep
.str
);
210 g_array_free (parseoptions
->splitpositions
, TRUE
);
212 stf_parse_options_clear_line_terminator (parseoptions
);
214 g_ptr_array_free (parseoptions
->formats
, TRUE
);
215 g_ptr_array_free (parseoptions
->formats_decimal
, TRUE
);
216 g_ptr_array_free (parseoptions
->formats_thousand
, TRUE
);
217 g_ptr_array_free (parseoptions
->formats_curr
, TRUE
);
219 g_free (parseoptions
);
222 static StfParseOptions_t
*
223 stf_parse_options_ref (StfParseOptions_t
*parseoptions
)
225 parseoptions
->ref_count
++;
230 stf_parse_options_get_type (void)
235 t
= g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc
)stf_parse_options_ref
,
237 (GBoxedFreeFunc
)stf_parse_options_free
);
243 stf_parse_options_set_type (StfParseOptions_t
*parseoptions
, StfParseType_t
const parsetype
)
245 g_return_if_fail (parseoptions
!= NULL
);
246 g_return_if_fail (parsetype
== PARSE_TYPE_CSV
|| parsetype
== PARSE_TYPE_FIXED
);
248 parseoptions
->parsetype
= parsetype
;
252 long_string_first (gchar
const *a
, gchar
const *b
)
254 /* This actually is UTF-8 safe. */
255 return strlen (b
) - strlen (a
);
259 compile_terminators (StfParseOptions_t
*parseoptions
)
263 parseoptions
->terminator
=
264 g_slist_sort (parseoptions
->terminator
,
265 (GCompareFunc
)long_string_first
);
266 parseoptions
->compiled_terminator
.min
= 255;
267 parseoptions
->compiled_terminator
.max
= 0;
268 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
269 const guchar
*term
= l
->data
;
270 parseoptions
->compiled_terminator
.min
=
271 MIN (parseoptions
->compiled_terminator
.min
, *term
);
272 parseoptions
->compiled_terminator
.max
=
273 MAX (parseoptions
->compiled_terminator
.max
, *term
);
278 * stf_parse_options_add_line_terminator:
280 * Adds @terminator to the set of line terminators. In both the fixed-width and
281 * CSV importers, a line terminator indicates the end of a row.
285 stf_parse_options_add_line_terminator (StfParseOptions_t
*parseoptions
, char const *terminator
)
287 g_return_if_fail (parseoptions
!= NULL
);
288 g_return_if_fail (terminator
!= NULL
&& *terminator
!= 0);
290 GO_SLIST_PREPEND (parseoptions
->terminator
, g_strdup (terminator
));
291 compile_terminators (parseoptions
);
295 * stf_parse_options_clear_line_terminator:
297 * Removes all line terminators. In both the fixed-width and CSV importers,
298 * a line terminator indicates the end of a row.
302 stf_parse_options_clear_line_terminator (StfParseOptions_t
*parseoptions
)
304 g_return_if_fail (parseoptions
!= NULL
);
306 g_slist_free_full (parseoptions
->terminator
, g_free
);
307 parseoptions
->terminator
= NULL
;
308 compile_terminators (parseoptions
);
312 * stf_parse_options_set_trim_spaces:
314 * If enabled will trim spaces in every parsed field on left and/or right
318 stf_parse_options_set_trim_spaces (StfParseOptions_t
*parseoptions
, StfTrimType_t
const trim_spaces
)
320 g_return_if_fail (parseoptions
!= NULL
);
322 parseoptions
->trim_spaces
= trim_spaces
;
326 * stf_parse_options_csv_set_separators:
327 * @parseoptions: #StfParseOptions_t
329 * @seps: (element-type utf8): the separators to be used
331 * A copy is made of the parameters.
334 stf_parse_options_csv_set_separators (StfParseOptions_t
*parseoptions
,
335 char const *character
,
338 g_return_if_fail (parseoptions
!= NULL
);
340 g_free (parseoptions
->sep
.chr
);
341 parseoptions
->sep
.chr
= g_strdup (character
);
343 g_slist_free_full (parseoptions
->sep
.str
, g_free
);
344 parseoptions
->sep
.str
=
345 g_slist_copy_deep ((GSList
*)seps
, (GCopyFunc
)g_strdup
, NULL
);
349 stf_parse_options_csv_set_stringindicator (StfParseOptions_t
*parseoptions
, gunichar
const stringindicator
)
351 g_return_if_fail (parseoptions
!= NULL
);
353 parseoptions
->stringindicator
= stringindicator
;
357 * stf_parse_options_csv_set_indicator_2x_is_single:
358 * @indic_2x: a boolean value indicating whether we want to see two
359 * adjacent string indicators as a single string indicator
360 * that is part of the cell, rather than a terminator.
363 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t
*parseoptions
,
364 gboolean
const indic_2x
)
366 g_return_if_fail (parseoptions
!= NULL
);
368 parseoptions
->indicator_2x_is_single
= indic_2x
;
372 * stf_parse_options_csv_set_duplicates:
374 * @duplicates: a boolean value indicating whether we want to see two
375 * separators right behind each other as one
378 stf_parse_options_csv_set_duplicates (StfParseOptions_t
*parseoptions
, gboolean
const duplicates
)
380 g_return_if_fail (parseoptions
!= NULL
);
382 parseoptions
->sep
.duplicates
= duplicates
;
386 * stf_parse_options_csv_set_trim_seps:
387 * @trim_seps: a boolean value indicating whether we want to ignore
388 * separators at the beginning of lines
391 stf_parse_options_csv_set_trim_seps (StfParseOptions_t
*parseoptions
, gboolean
const trim_seps
)
393 g_return_if_fail (parseoptions
!= NULL
);
395 parseoptions
->trim_seps
= trim_seps
;
399 * stf_parse_options_fixed_splitpositions_clear:
401 * This will clear the splitpositions (== points on which a line is split)
404 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t
*parseoptions
)
407 g_return_if_fail (parseoptions
!= NULL
);
409 if (parseoptions
->splitpositions
)
410 g_array_free (parseoptions
->splitpositions
, TRUE
);
411 parseoptions
->splitpositions
= g_array_new (FALSE
, FALSE
, sizeof (int));
413 g_array_append_val (parseoptions
->splitpositions
, minus_one
);
417 * stf_parse_options_fixed_splitpositions_add:
419 * @position will be added to the splitpositions.
422 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t
*parseoptions
, int position
)
426 g_return_if_fail (parseoptions
!= NULL
);
427 g_return_if_fail (position
>= 0);
429 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
430 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
431 if (position
== here
)
437 g_array_insert_val (parseoptions
->splitpositions
, ui
, position
);
441 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t
*parseoptions
, int position
)
445 g_return_if_fail (parseoptions
!= NULL
);
446 g_return_if_fail (position
>= 0);
448 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
449 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
450 if (position
== here
)
451 g_array_remove_index (parseoptions
->splitpositions
, ui
);
452 if (position
<= here
)
458 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t
*parseoptions
)
460 return parseoptions
->splitpositions
->len
;
464 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t
*parseoptions
, int n
)
466 return g_array_index (parseoptions
->splitpositions
, int, n
);
471 * stf_parse_options_valid:
472 * @parseoptions: an import options struct
474 * Checks if @parseoptions is correctly filled
476 * returns : TRUE if it is correctly filled, FALSE otherwise.
479 stf_parse_options_valid (StfParseOptions_t
*parseoptions
)
481 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
483 if (parseoptions
->parsetype
== PARSE_TYPE_FIXED
) {
484 if (!parseoptions
->splitpositions
) {
485 g_warning ("STF: No splitpositions in struct");
493 /*******************************************************************************************************
494 * STF PARSE : The actual routines that do the 'trick'
495 *******************************************************************************************************/
498 trim_spaces_inplace (char *field
, StfParseOptions_t
const *parseoptions
)
502 if (parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) {
505 while (g_unichar_isspace (g_utf8_get_char (s
)))
506 s
= g_utf8_next_char (s
);
509 memmove (field
, s
, 1 + strlen (s
));
512 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
513 char *s
= field
+ strlen (field
);
516 s
= g_utf8_prev_char (s
);
517 if (!g_unichar_isspace (g_utf8_get_char (s
)))
525 * stf_parse_csv_is_separator:
527 * Returns NULL if @character is not a separator; otherwise returns a pointer
528 * to the character after the separator.
531 stf_parse_csv_is_separator (char const *character
, char const *chr
, GSList
const *str
)
533 g_return_val_if_fail (character
!= NULL
, NULL
);
541 for (l
= str
; l
!= NULL
; l
= l
->next
) {
542 char const *s
= l
->data
;
545 glong
const len
= g_utf8_strlen (s
, -1);
547 /* Don't compare past the end of the buffer! */
548 for (r
= character
, cnt
= 0; cnt
< len
; cnt
++, r
= g_utf8_next_char (r
))
552 if ((cnt
== len
) && (memcmp (character
, s
, len
) == 0))
553 return g_utf8_offset_to_pointer (character
, len
);
557 if (chr
&& my_utf8_strchr (chr
, g_utf8_get_char (character
)))
558 return g_utf8_next_char(character
);
564 * stf_parse_eat_separators:
566 * skip over leading separators
571 stf_parse_eat_separators (Source_t
*src
, StfParseOptions_t
*parseoptions
)
573 char const *cur
, *next
;
575 g_return_if_fail (src
!= NULL
);
576 g_return_if_fail (parseoptions
!= NULL
);
580 if (*cur
== '\0' || compare_terminator (cur
, parseoptions
))
582 while ((next
= stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
)))
593 STF_CELL_FIELD_NO_SEP
,
597 static StfParseCellRes
598 stf_parse_csv_cell (GString
*text
, Source_t
*src
, StfParseOptions_t
*parseoptions
)
601 gboolean saw_sep
= FALSE
;
603 g_return_val_if_fail (src
!= NULL
, STF_CELL_ERROR
);
604 g_return_val_if_fail (parseoptions
!= NULL
, STF_CELL_ERROR
);
607 g_return_val_if_fail (cur
!= NULL
, STF_CELL_ERROR
);
609 /* Skip whitespace, but stop at line terminators. */
618 term_len
= compare_terminator (cur
, parseoptions
);
620 src
->position
= cur
+ term_len
;
624 if ((parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) == 0)
627 if (stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
,
628 parseoptions
->sep
.str
))
631 if (!g_unichar_isspace (g_utf8_get_char (cur
)))
633 cur
= g_utf8_next_char (cur
);
636 if (parseoptions
->stringindicator
!= 0 &&
637 g_utf8_get_char (cur
) == parseoptions
->stringindicator
) {
638 cur
= g_utf8_next_char (cur
);
640 gunichar uc
= g_utf8_get_char (cur
);
641 cur
= g_utf8_next_char (cur
);
643 if (uc
== parseoptions
->stringindicator
) {
644 if (parseoptions
->indicator_2x_is_single
&&
645 g_utf8_get_char (cur
) == parseoptions
->stringindicator
)
646 cur
= g_utf8_next_char (cur
);
648 /* "field content"dropped-garbage, */
649 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
650 char const *post
= stf_parse_csv_is_separator
651 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
657 cur
= g_utf8_next_char (cur
);
663 g_string_append_unichar (text
, uc
);
666 /* We silently allow a missing terminating quote. */
668 /* Unquoted field. */
670 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
672 char const *post
= stf_parse_csv_is_separator
673 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
680 g_string_append_unichar (text
, g_utf8_get_char (cur
));
681 cur
= g_utf8_next_char (cur
);
684 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
686 const char *last
= g_utf8_prev_char (text
->str
+ text
->len
);
687 if (!g_unichar_isspace (g_utf8_get_char (last
)))
689 g_string_truncate (text
, last
- text
->str
);
696 if (saw_sep
&& parseoptions
->sep
.duplicates
)
697 stf_parse_eat_separators (src
, parseoptions
);
699 return saw_sep
? STF_CELL_FIELD_SEP
: STF_CELL_FIELD_NO_SEP
;
703 * stf_parse_csv_line:
705 * This will parse one line from the current @src->position.
706 * NOTE: The calling routine is responsible for freeing the result.
708 * returns : a GPtrArray of char*'s
711 stf_parse_csv_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
714 gboolean cont
= FALSE
;
717 g_return_val_if_fail (src
!= NULL
, NULL
);
718 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
720 line
= g_ptr_array_new ();
721 if (parseoptions
->trim_seps
)
722 stf_parse_eat_separators (src
, parseoptions
);
724 text
= g_string_sized_new (30);
728 StfParseCellRes res
=
729 stf_parse_csv_cell (text
, src
, parseoptions
);
730 trim_spaces_inplace (text
->str
, parseoptions
);
731 ctext
= g_string_chunk_insert_len (src
->chunk
,
732 text
->str
, text
->len
);
733 g_string_truncate (text
, 0);
736 case STF_CELL_FIELD_NO_SEP
:
737 g_ptr_array_add (line
, ctext
);
741 case STF_CELL_FIELD_SEP
:
742 g_ptr_array_add (line
, ctext
);
743 cont
= TRUE
; /* Make sure we see one more field. */
748 g_ptr_array_add (line
, ctext
);
749 g_string_free (text
, TRUE
);
756 * stf_parse_fixed_cell:
758 * returns a pointer to the parsed cell contents.
761 stf_parse_fixed_cell (Source_t
*src
, StfParseOptions_t
*parseoptions
)
767 g_return_val_if_fail (src
!= NULL
, NULL
);
768 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
772 if (src
->splitpos
< my_garray_len (parseoptions
->splitpositions
))
773 splitval
= (int) g_array_index (parseoptions
->splitpositions
, int, src
->splitpos
);
777 while (*cur
!= 0 && !compare_terminator (cur
, parseoptions
) && splitval
!= src
->linepos
) {
779 cur
= g_utf8_next_char (cur
);
782 res
= g_string_chunk_insert_len (src
->chunk
,
784 cur
- src
->position
);
792 * stf_parse_fixed_line:
794 * This will parse one line from the current @src->position.
795 * It will return a GPtrArray with the cell contents as strings.
797 * NOTE: The calling routine is responsible for freeing result.
800 stf_parse_fixed_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
804 g_return_val_if_fail (src
!= NULL
, NULL
);
805 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
810 line
= g_ptr_array_new ();
811 while (*src
->position
!= '\0' && !compare_terminator (src
->position
, parseoptions
)) {
812 char *field
= stf_parse_fixed_cell (src
, parseoptions
);
814 trim_spaces_inplace (field
, parseoptions
);
815 g_ptr_array_add (line
, field
);
820 while (line
->len
< parseoptions
->splitpositions
->len
)
821 g_ptr_array_add (line
, g_strdup (""));
827 * stf_parse_general_free: (skip)
830 stf_parse_general_free (GPtrArray
*lines
)
833 for (lineno
= 0; lineno
< lines
->len
; lineno
++) {
834 GPtrArray
*line
= g_ptr_array_index (lines
, lineno
);
835 /* Fields are not freed here. */
837 g_ptr_array_free (line
, TRUE
);
839 g_ptr_array_free (lines
, TRUE
);
844 * stf_parse_general: (skip)
846 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
847 * GPtrArray of strings.
849 * The caller must free this entire structure, for example by calling
850 * stf_parse_general_free.
853 stf_parse_general (StfParseOptions_t
*parseoptions
,
854 GStringChunk
*lines_chunk
,
855 char const *data
, char const *data_end
)
860 char const *valid_end
= data_end
;
862 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
863 g_return_val_if_fail (data
!= NULL
, NULL
);
864 g_return_val_if_fail (data_end
!= NULL
, NULL
);
865 g_return_val_if_fail (stf_parse_options_valid (parseoptions
), NULL
);
866 g_return_val_if_fail (g_utf8_validate (data
, data_end
-data
, &valid_end
), NULL
);
868 src
.chunk
= lines_chunk
;
872 if ((data_end
-data
>= 3) && !strncmp(src
.position
, "\xEF\xBB\xBF", 3)) {
873 /* Skip over byte-order mark */
877 lines
= g_ptr_array_new ();
878 while (*src
.position
!= '\0' && src
.position
< data_end
) {
881 if (row
== GNM_MAX_ROWS
) {
882 parseoptions
->rows_exceeded
= TRUE
;
886 line
= parseoptions
->parsetype
== PARSE_TYPE_CSV
887 ? stf_parse_csv_line (&src
, parseoptions
)
888 : stf_parse_fixed_line (&src
, parseoptions
);
890 g_ptr_array_add (lines
, line
);
891 if (parseoptions
->parsetype
!= PARSE_TYPE_CSV
)
892 src
.position
+= compare_terminator (src
.position
, parseoptions
);
900 * stf_parse_lines: (skip)
901 * @parseoptions: #StfParseOptions_t
907 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
908 * GPtrArray of strings.
910 * The caller must free this entire structure, for example by calling
911 * stf_parse_general_free.
914 stf_parse_lines (StfParseOptions_t
*parseoptions
,
915 GStringChunk
*lines_chunk
,
917 int maxlines
, gboolean with_lineno
)
922 g_return_val_if_fail (data
!= NULL
, NULL
);
924 lines
= g_ptr_array_new ();
926 char const *data0
= data
;
927 GPtrArray
*line
= g_ptr_array_new ();
930 char buf
[4 * sizeof (int)];
931 sprintf (buf
, "%d", lineno
);
932 g_ptr_array_add (line
,
933 g_string_chunk_insert (lines_chunk
, buf
));
937 int termlen
= compare_terminator (data
, parseoptions
);
938 if (termlen
> 0 || *data
== 0) {
939 g_ptr_array_add (line
,
940 g_string_chunk_insert_len (lines_chunk
,
946 data
= g_utf8_next_char (data
);
949 g_ptr_array_add (lines
, line
);
952 if (lineno
>= maxlines
)
959 stf_parse_find_line (StfParseOptions_t
*parseoptions
,
964 int termlen
= compare_terminator (data
, parseoptions
);
968 } else if (*data
== 0) {
971 data
= g_utf8_next_char (data
);
979 * stf_parse_options_fixed_autodiscover:
980 * @parseoptions: a Parse options struct.
981 * @data: The actual data.
982 * @data_end: data end.
984 * Automatically try to discover columns in the text to be parsed.
985 * We ignore empty lines (only containing parseoptions->terminator)
987 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
988 * Think hard of a better more flexible solution...
991 stf_parse_options_fixed_autodiscover (StfParseOptions_t
*parseoptions
,
992 char const *data
, char const *data_end
)
994 char const *iterator
= data
;
996 GSList
*list_start
= NULL
;
998 int effective_lines
= 0;
999 int max_line_length
= 0;
1000 int *line_begin_hits
= NULL
;
1001 int *line_end_hits
= NULL
;
1004 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
1007 * First take a look at all possible white space combinations
1009 while (*iterator
&& iterator
< data_end
) {
1010 gboolean begin_recorded
= FALSE
;
1011 AutoDiscovery_t
*disc
= NULL
;
1015 while (*iterator
&& (termlen
= compare_terminator (iterator
, parseoptions
)) == 0) {
1016 if (!begin_recorded
&& *iterator
== ' ') {
1017 disc
= g_new0 (AutoDiscovery_t
, 1);
1019 disc
->start
= position
;
1021 begin_recorded
= TRUE
;
1022 } else if (begin_recorded
&& *iterator
!= ' ') {
1023 disc
->stop
= position
;
1024 list
= g_slist_prepend (list
, disc
);
1026 begin_recorded
= FALSE
;
1034 if (position
> max_line_length
)
1035 max_line_length
= position
;
1038 * If there are excess spaces at the end of
1039 * the line : ignore them
1044 * Hop over the terminator
1046 iterator
+= termlen
;
1054 list
= g_slist_reverse (list
);
1059 * Look at the number of hits at each line position.
1060 * If the number of hits equals the number of lines,
1061 * we can be pretty sure this is the start or end
1062 * of a column. We filter out empty columns
1065 line_begin_hits
= g_new0 (int, max_line_length
+ 1);
1066 line_end_hits
= g_new0 (int, max_line_length
+ 1);
1069 AutoDiscovery_t
*disc
= list
->data
;
1071 line_begin_hits
[disc
->start
]++;
1072 line_end_hits
[disc
->stop
]++;
1076 list
= g_slist_next (list
);
1078 g_slist_free (list_start
);
1080 for (i
= 0; i
< max_line_length
+ 1; i
++)
1081 if (line_begin_hits
[i
] == effective_lines
|| line_end_hits
[i
] == effective_lines
)
1082 stf_parse_options_fixed_splitpositions_add (parseoptions
, i
);
1085 * Do some corrections to the initial columns
1086 * detected here, we obviously don't need to
1087 * do this if there are no columns at all.
1089 if (my_garray_len (parseoptions
->splitpositions
) > 0) {
1091 * Try to find columns that look like:
1096 * (In other words : Columns with left & right justification with
1097 * a minimum of 2 spaces in the middle)
1098 * Split these columns in 2
1101 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1102 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1103 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1104 int num_spaces
= -1;
1105 int spaces_start
= 0;
1106 gboolean right_aligned
= TRUE
;
1107 gboolean left_aligned
= TRUE
;
1108 gboolean has_2_spaces
= TRUE
;
1112 while (*iterator
&& iterator
< data_end
) {
1113 gboolean trigger
= FALSE
;
1114 gboolean space_trigger
= FALSE
;
1119 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1121 if (*iterator
== ' ')
1122 left_aligned
= FALSE
;
1125 } else if (pos
== end
- 1) {
1126 if (*iterator
== ' ')
1127 right_aligned
= FALSE
;
1132 if (trigger
|| pos
== end
- 1) {
1133 if (!space_trigger
&& *iterator
== ' ') {
1134 space_trigger
= TRUE
;
1136 } else if (space_trigger
&& *iterator
!= ' ') {
1137 space_trigger
= FALSE
;
1138 num_spaces
= pos
- spaces_start
;
1147 has_2_spaces
= FALSE
;
1156 * If this column meets all the criteria
1157 * split it into two at the last measured
1158 * spaces_start + num_spaces
1160 if (has_2_spaces
&& right_aligned
&& left_aligned
) {
1161 int val
= (((spaces_start
+ num_spaces
) - spaces_start
) / 2) + spaces_start
;
1163 g_array_insert_val (parseoptions
->splitpositions
, i
+ 1, val
);
1166 * Skip over the inserted column
1173 * Remove empty columns here if needed
1175 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1176 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1177 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1178 gboolean only_spaces
= TRUE
;
1182 while (*iterator
&& iterator
< data_end
) {
1183 gboolean trigger
= FALSE
;
1186 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1189 else if (pos
== end
)
1193 if (*iterator
!= ' ')
1194 only_spaces
= FALSE
;
1208 * The column only contains spaces
1212 g_array_remove_index (parseoptions
->splitpositions
, i
);
1215 * We HAVE to make sure that the next column (end) also
1216 * gets checked out. If we don't decrease "i" here, we
1217 * will skip over it as the indexes shift down after
1225 g_free (line_begin_hits
);
1226 g_free (line_end_hits
);
1229 /*******************************************************************************************************
1230 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1231 * functions into something meaningful (== application specific)
1232 *******************************************************************************************************/
1235 * This is more or less like gnm_cell_set_text, except:
1236 * 1. Unknown names are not allowed.
1237 * 2. Only '=' can start an expression.
1241 stf_cell_set_text (GnmCell
*cell
, char const *text
)
1243 GnmExprTop
const *texpr
;
1245 GOFormat
const *fmt
= gnm_style_get_format (gnm_cell_get_style (cell
));
1246 const GODateConventions
*date_conv
= sheet_date_conv (cell
->base
.sheet
);
1248 if (!go_format_is_text (fmt
) && *text
== '=' && text
[1] != 0) {
1249 GnmExprParseFlags flags
=
1250 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID
;
1251 const char *expr_start
= text
+ 1;
1254 parse_pos_init_cell (&pos
, cell
);
1255 texpr
= gnm_expr_parse_str (expr_start
, &pos
, flags
,
1259 val
= format_match (text
, fmt
, date_conv
);
1263 val
= value_new_string (text
);
1266 gnm_cell_set_value (cell
, val
);
1268 gnm_cell_set_expr (cell
, texpr
);
1269 gnm_expr_top_unref (texpr
);
1274 stf_read_remember_settings (Workbook
*book
, StfParseOptions_t
*po
)
1276 if (po
->parsetype
== PARSE_TYPE_CSV
) {
1277 GnmStfExport
*stfe
= gnm_stf_get_stfe (G_OBJECT (book
));
1279 int length
= g_unichar_to_utf8 (po
->stringindicator
, quote
);
1283 } else quote
[length
] = '\0';
1285 g_object_set (G_OBJECT (stfe
), "separator", po
->sep
.chr
, "quote", "e
, NULL
);
1287 if ((po
->terminator
!= NULL
) && (po
->terminator
->data
!= NULL
))
1288 g_object_set (G_OBJECT (stfe
), "eol", po
->terminator
->data
, NULL
);
1293 stf_parse_sheet (StfParseOptions_t
*parseoptions
,
1294 char const *data
, char const *data_end
,
1295 Sheet
*sheet
, int start_col
, int start_row
)
1299 GStringChunk
*lines_chunk
;
1301 gboolean result
= TRUE
;
1306 SETUP_LOCALE_SWITCH
;
1308 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
1309 g_return_val_if_fail (data
!= NULL
, FALSE
);
1310 g_return_val_if_fail (IS_SHEET (sheet
), FALSE
);
1313 data_end
= data
+ strlen (data
);
1315 lines_chunk
= g_string_chunk_new (100 * 1024);
1316 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1321 nformats
= parseoptions
->formats
->len
;
1322 for (lcol
= 0; lcol
< nformats
; lcol
++) {
1323 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
1326 (parseoptions
->col_import_array
== NULL
||
1327 parseoptions
->col_import_array_len
<= lcol
||
1328 parseoptions
->col_import_array
[lcol
]);
1329 if (!want_col
|| col
>= gnm_sheet_get_max_cols (sheet
))
1332 if (fmt
&& !go_format_is_general (fmt
)) {
1334 int end_row
= MIN (start_row
+ (int)lines
->len
- 1,
1335 gnm_sheet_get_last_row (sheet
));
1337 range_init (&r
, col
, start_row
, col
, end_row
);
1338 mstyle
= gnm_style_new ();
1339 gnm_style_set_format (mstyle
, fmt
);
1340 sheet_apply_style (sheet
, &r
, mstyle
);
1345 START_LOCALE_SWITCH
;
1346 for (row
= start_row
, lrow
= 0;
1347 result
&& lrow
< lines
->len
;
1351 if (row
>= gnm_sheet_get_max_rows (sheet
)) {
1352 if (!parseoptions
->rows_exceeded
) {
1353 /* FIXME: What locale? */
1354 g_warning (_("There are more rows of data than "
1355 "there is room for in the sheet. Extra "
1356 "rows will be ignored."));
1357 parseoptions
->rows_exceeded
= TRUE
;
1363 line
= g_ptr_array_index (lines
, lrow
);
1365 for (lcol
= 0; lcol
< line
->len
; lcol
++) {
1366 GOFormat
const *fmt
= lcol
< nformats
1367 ? g_ptr_array_index (parseoptions
->formats
, lcol
)
1368 : go_format_general ();
1369 char const *text
= g_ptr_array_index (line
, lcol
);
1371 (parseoptions
->col_import_array
== NULL
||
1372 parseoptions
->col_import_array_len
<= lcol
||
1373 parseoptions
->col_import_array
[lcol
]);
1377 if (col
>= gnm_sheet_get_max_cols (sheet
)) {
1378 if (!parseoptions
->cols_exceeded
) {
1379 /* FIXME: What locale? */
1380 g_warning (_("There are more columns of data than "
1381 "there is room for in the sheet. Extra "
1382 "columns will be ignored."));
1383 parseoptions
->cols_exceeded
= TRUE
;
1387 if (text
&& *text
) {
1388 GnmCell
*cell
= sheet_cell_fetch (sheet
, col
, row
);
1389 if (!go_format_is_text (fmt
) &&
1390 lcol
< parseoptions
->formats_decimal
->len
&&
1391 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
)) {
1393 GnmValue
*v
= format_match_decimal_number_with_locale
1395 g_ptr_array_index (parseoptions
->formats_curr
, lcol
),
1396 g_ptr_array_index (parseoptions
->formats_thousand
, lcol
),
1397 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
));
1399 v
= value_new_string (text
);
1400 sheet_cell_set_value (cell
, v
);
1403 stf_cell_set_text (cell
, text
);
1409 g_ptr_array_index (lines
, lrow
) = NULL
;
1410 g_ptr_array_free (line
, TRUE
);
1414 for (lcol
= 0, col
= start_col
;
1415 lcol
< parseoptions
->col_import_array_len
&& col
< gnm_sheet_get_max_cols (sheet
);
1417 if (parseoptions
->col_import_array
== NULL
||
1418 parseoptions
->col_import_array_len
<= lcol
||
1419 parseoptions
->col_import_array
[lcol
]) {
1420 if (parseoptions
->col_autofit_array
== NULL
||
1421 parseoptions
->col_autofit_array
[lcol
]) {
1422 ColRowIndexList
*list
= colrow_get_index_list (col
, col
, NULL
);
1423 ColRowStateGroup
*state
= colrow_set_sizes (sheet
, TRUE
, list
, -1, 0, -1);
1424 colrow_index_list_destroy (list
);
1425 g_slist_free (state
);
1431 g_string_chunk_free (lines_chunk
);
1433 stf_parse_general_free (lines
);
1435 stf_read_remember_settings (sheet
->workbook
, parseoptions
);
1440 stf_parse_region (StfParseOptions_t
*parseoptions
, char const *data
, char const *data_end
,
1443 static GODateConventions
const default_conv
= {FALSE
};
1444 GODateConventions
const *date_conv
= wb
? workbook_date_conv (wb
) : &default_conv
;
1447 unsigned int row
, colhigh
= 0;
1448 GStringChunk
*lines_chunk
;
1452 SETUP_LOCALE_SWITCH
;
1454 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
1455 g_return_val_if_fail (data
!= NULL
, NULL
);
1457 START_LOCALE_SWITCH
;
1459 cr
= gnm_cell_region_new (NULL
);
1462 data_end
= data
+ strlen (data
);
1463 lines_chunk
= g_string_chunk_new (100 * 1024);
1464 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1465 nformats
= parseoptions
->formats
->len
;
1466 for (row
= 0; row
< lines
->len
; row
++) {
1467 GPtrArray
*line
= g_ptr_array_index (lines
, row
);
1468 unsigned int col
, targetcol
= 0;
1469 for (col
= 0; col
< line
->len
; col
++) {
1470 if (parseoptions
->col_import_array
== NULL
||
1471 parseoptions
->col_import_array_len
<= col
||
1472 parseoptions
->col_import_array
[col
]) {
1473 const char *text
= g_ptr_array_index (line
, col
);
1475 GOFormat
*fmt
= NULL
;
1480 fmt
= g_ptr_array_index (parseoptions
->formats
, col
);
1481 v
= format_match (text
, fmt
, date_conv
);
1483 v
= value_new_string (text
);
1485 cc
= gnm_cell_copy_new (cr
, targetcol
, row
);
1489 if (targetcol
> colhigh
)
1490 colhigh
= targetcol
;
1495 stf_parse_general_free (lines
);
1496 g_string_chunk_free (lines_chunk
);
1500 cr
->cols
= (colhigh
> 0) ? colhigh
: 1;
/*
 * int_sort:
 * @a: pointer to the first int.
 * @b: pointer to the second int.
 *
 * qsort() comparator that orders ints ascending.  The result is computed
 * from comparisons rather than a subtraction, so operands that are far
 * apart (e.g. INT_MAX and INT_MIN) cannot overflow.
 *
 * Returns: negative, zero or positive as *a is less than, equal to or
 * greater than *b.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1513 count_character (GPtrArray
*lines
, gunichar c
, double quantile
)
1516 unsigned int lno
, cno
;
1518 if (lines
->len
== 0)
1521 counts
= g_new (int, lines
->len
);
1522 for (lno
= cno
= 0; lno
< lines
->len
; lno
++) {
1524 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1525 char const *line
= g_ptr_array_index (boxline
, 0);
1527 /* Ignore empty lines. */
1532 if (g_utf8_get_char (line
) == c
)
1534 line
= g_utf8_next_char (line
);
1537 counts
[cno
++] = count
;
1543 unsigned int qi
= (unsigned int)ceil (quantile
* cno
);
1544 qsort (counts
, cno
, sizeof (counts
[0]), int_sort
);
1556 dump_guessed_options (const StfParseOptions_t
*res
)
1559 char ubuffer
[6 + 1];
1562 g_printerr ("Guessed format:\n");
1563 switch (res
->parsetype
) {
1564 case PARSE_TYPE_CSV
:
1565 g_printerr (" type = sep\n");
1566 g_printerr (" separator = %s\n",
1567 res
->sep
.chr
? res
->sep
.chr
: "(none)");
1568 g_printerr (" see two as one = %s\n",
1569 res
->sep
.duplicates
? "yes" : "no");
1571 case PARSE_TYPE_FIXED
:
1572 g_printerr (" type = sep\n");
1577 g_printerr (" trim space = %d\n", res
->trim_spaces
);
1579 ubuffer
[g_unichar_to_utf8 (res
->stringindicator
, ubuffer
)] = 0;
1580 g_printerr (" string indicator = %s\n", ubuffer
);
1581 g_printerr (" see two as one = %s\n",
1582 res
->indicator_2x_is_single
? "yes" : "no");
1584 g_printerr (" line terminators =");
1585 for (l
= res
->terminator
; l
; l
= l
->next
) {
1586 const char *t
= l
->data
;
1587 if (strcmp (t
, "\n") == 0)
1588 g_printerr (" unix");
1589 else if (strcmp (t
, "\r") == 0)
1590 g_printerr (" mac");
1591 else if (strcmp (t
, "\r\n") == 0)
1592 g_printerr (" dos");
1594 g_printerr (" other");
1598 for (ui
= 0; ui
< res
->formats
->len
; ui
++) {
1599 GOFormat
const *fmt
= g_ptr_array_index (res
->formats
, ui
);
1600 const GString
*decimal
= ui
< res
->formats_decimal
->len
1601 ? g_ptr_array_index (res
->formats_decimal
, ui
)
1603 const GString
*thousand
= ui
< res
->formats_thousand
->len
1604 ? g_ptr_array_index (res
->formats_thousand
, ui
)
1607 g_printerr (" fmt.%d = %s\n", ui
, go_format_as_XL (fmt
));
1609 g_printerr (" fmt.%d.dec = %s\n", ui
, decimal
->str
);
1611 g_printerr (" fmt.%d.thou = %s\n", ui
, thousand
->str
);
1616 * stf_parse_options_guess:
1617 * @data: the input data.
1619 * Returns: (transfer full): the guessed options.
1622 stf_parse_options_guess (char const *data
)
1624 StfParseOptions_t
*res
;
1625 GStringChunk
*lines_chunk
;
1629 gunichar sepchar
= go_locale_get_arg_sep ();
1631 g_return_val_if_fail (data
!= NULL
, NULL
);
1633 res
= stf_parse_options_new ();
1634 lines_chunk
= g_string_chunk_new (100 * 1024);
1635 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1637 tabcount
= count_character (lines
, '\t', 0.2);
1638 sepcount
= count_character (lines
, sepchar
, 0.2);
1640 /* At least one tab per line and enough to separate every
1641 would-be sepchars. */
1642 if (tabcount
>= 1 && tabcount
>= sepcount
- 1)
1643 stf_parse_options_csv_set_separators (res
, "\t", NULL
);
1648 * Try a few more or less likely characters and pick the first
1649 * one that occurs on at least half the lines.
1651 * The order is mostly random, although ' ' and '!' which
1652 * could very easily occur in text are put last.
1654 if (count_character (lines
, (c
= sepchar
), 0.5) > 0 ||
1655 count_character (lines
, (c
= go_locale_get_col_sep ()), 0.5) > 0 ||
1656 count_character (lines
, (c
= ':'), 0.5) > 0 ||
1657 count_character (lines
, (c
= ','), 0.5) > 0 ||
1658 count_character (lines
, (c
= ';'), 0.5) > 0 ||
1659 count_character (lines
, (c
= '|'), 0.5) > 0 ||
1660 count_character (lines
, (c
= '!'), 0.5) > 0 ||
1661 count_character (lines
, (c
= ' '), 0.5) > 0) {
1663 sep
[g_unichar_to_utf8 (c
, sep
)] = 0;
1666 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1670 // For now, always separated:
1671 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1673 switch (res
->parsetype
) {
1674 case PARSE_TYPE_CSV
: {
1677 strchr (res
->sep
.chr
, ' ') != NULL
;
1680 strchr (res
->sep
.chr
, ' ') != NULL
;
1682 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1683 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1684 stf_parse_options_csv_set_duplicates (res
, dups
);
1685 stf_parse_options_csv_set_trim_seps (res
, trim
);
1687 stf_parse_options_csv_set_stringindicator (res
, '"');
1691 case PARSE_TYPE_FIXED
:
1695 g_assert_not_reached ();
1698 stf_parse_general_free (lines
);
1699 g_string_chunk_free (lines_chunk
);
1701 stf_parse_options_guess_formats (res
, data
);
1703 if (gnm_debug_flag ("stf"))
1704 dump_guessed_options (res
);
1710 * stf_parse_options_guess_csv:
1711 * @data: the CSV input data.
1713 * Returns: (transfer full): the guessed options.
1716 stf_parse_options_guess_csv (char const *data
)
1718 StfParseOptions_t
*res
;
1719 GStringChunk
*lines_chunk
;
1722 char const *quoteline
= NULL
;
1724 gunichar stringind
= '"';
1726 g_return_val_if_fail (data
!= NULL
, NULL
);
1728 res
= stf_parse_options_new ();
1729 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1730 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1731 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1732 stf_parse_options_csv_set_duplicates (res
, FALSE
);
1733 stf_parse_options_csv_set_trim_seps (res
, FALSE
);
1734 stf_parse_options_csv_set_stringindicator (res
, stringind
);
1736 lines_chunk
= g_string_chunk_new (100 * 1024);
1737 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1740 * Find a line containing a quote; skip first line unless it is
1741 * the only one. Prefer a line with the quote first.
1743 for (pass
= 1; !quoteline
&& pass
<= 2; pass
++) {
1745 for (lno
= MIN (1, lines
->len
- 1);
1746 !quoteline
&& lno
< lines
->len
;
1748 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1749 const char *line
= g_ptr_array_index (boxline
, 0);
1752 if (g_utf8_get_char (line
) == stringind
)
1756 if (my_utf8_strchr (line
, stringind
))
1764 const char *p0
= my_utf8_strchr (quoteline
, stringind
);
1768 p
= g_utf8_next_char (p
);
1769 } while (*p
&& g_utf8_get_char (p
) != stringind
);
1770 if (*p
) p
= g_utf8_next_char (p
);
1771 while (*p
&& g_unichar_isspace (g_utf8_get_char (p
)))
1772 p
= g_utf8_next_char (p
);
1774 /* Use the character after the quote. */
1775 sep
= g_strndup (p
, g_utf8_next_char (p
) - p
);
1777 /* Try to use character before the quote. */
1778 while (p0
> quoteline
&& !sep
) {
1780 p0
= g_utf8_prev_char (p0
);
1781 if (!g_unichar_isspace (g_utf8_get_char (p0
)))
1782 sep
= g_strndup (p0
, p
- p0
);
1788 sep
= g_strdup (",");
1789 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1792 stf_parse_general_free (lines
);
1793 g_string_chunk_free (lines_chunk
);
1795 stf_parse_options_guess_formats (res
, data
);
1797 if (gnm_debug_flag ("stf"))
1798 dump_guessed_options (res
);
/*
 * Bit flags for the column formats that are still possible while
 * guessing.  Flags are cleared as values in a column rule them out.
 */
typedef enum {
	/* Date field orders. */
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	/* Numbers, by which character is the decimal separator. */
	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER = (STF_GUESS_NUMBER_DEC_POINT |
				       STF_GUESS_NUMBER_DEC_COMMA),

	/* Every flag above. */
	STF_GUESS_ALL = (STF_GUESS_DATE_DMY |
			 STF_GUESS_DATE_MDY |
			 STF_GUESS_DATE_YMD |
			 STF_GUESS_NUMBER_DEC_EITHER)
} StfGuessFormats;
1816 do_check_date (const char *data
, StfGuessFormats flag
,
1817 gboolean mbd
, gboolean ybm
,
1819 GODateConventions
const *date_conv
)
1822 gboolean this_mbd
, this_ybm
;
1825 if (!(*possible
& flag
))
1828 v
= format_match_datetime (data
, date_conv
, mbd
, TRUE
, FALSE
);
1829 if (!v
|| !VALUE_FMT (v
))
1832 imbd
= go_format_month_before_day (VALUE_FMT (v
));
1833 this_mbd
= (imbd
>= 1);
1834 this_ybm
= (imbd
== 2);
1835 if (mbd
!= this_mbd
|| ybm
!= this_ybm
)
1848 do_check_number (const char *data
, StfGuessFormats flag
,
1849 const GString
*dec
, const GString
*thousand
, const GString
*curr
,
1850 unsigned *possible
, int *decimals
)
1853 GOFormatFamily family
;
1856 if (!(*possible
& flag
))
1859 v
= format_match_decimal_number_with_locale (data
, &family
, curr
, thousand
, dec
);
1863 if (*decimals
!= -2) {
1864 const char *pdec
= strstr (data
, dec
->str
);
1865 int this_decimals
= 0;
1868 while (g_ascii_isdigit (*pdec
)) {
1873 if (*decimals
== -1)
1874 *decimals
= this_decimals
;
1875 else if (*decimals
!= this_decimals
)
1879 pthou
= strstr (data
, thousand
->str
);
1882 int digits
= 0, nonzero_digits
= 0;
1883 for (p
= data
; p
< pthou
; p
= g_utf8_next_char (p
)) {
1884 if (g_unichar_isdigit (g_utf8_get_char (p
))) {
1890 // "-.222" implies that "." is not a thousands separator.
1891 // "0.222" implies that "." is not a thousands separator.
1892 // "12345,555" implies that "," is not a thousands separator.
1893 if (nonzero_digits
== 0 || digits
> 3)
1907 * stf_parse_options_guess_formats:
1908 * @data: the CSV input data.
1910 * This function attempts to recognize data formats on a column-by-column
1911 * basis under the assumption that the data in a text file will generally
1912 * use the same data formats.
1914 * This is useful because not all values give sufficient information by
1915 * themselves to tell what format the data is in. For example, "1/2/2000"
1916 * is likely to be a date in year 2000, but it is not clear if it is in
1917 * January or February. If another value in the same column is "31/1/1999"
1918 * then it is likely that the former date was in February.
1920 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1921 * A later value of "111,200.22" would clear up the confusion.
1925 stf_parse_options_guess_formats (StfParseOptions_t
*po
, char const *data
)
1927 GStringChunk
*lines_chunk
;
1929 unsigned lno
, col
, colcount
, sline
;
1930 GODateConventions
const *date_conv
= go_date_conv_from_str ("Lotus:1900");
1931 GString
*s_comma
= g_string_new (",");
1932 GString
*s_dot
= g_string_new (".");
1933 GString
*s_dollar
= g_string_new ("$");
1934 gboolean debug
= gnm_debug_flag ("stf");
1936 g_ptr_array_set_size (po
->formats
, 0);
1937 g_ptr_array_set_size (po
->formats_decimal
, 0);
1938 g_ptr_array_set_size (po
->formats_thousand
, 0);
1939 g_ptr_array_set_size (po
->formats_curr
, 0);
1941 lines_chunk
= g_string_chunk_new (100 * 1024);
1942 lines
= stf_parse_general (po
, lines_chunk
, data
, data
+ strlen (data
));
1945 for (lno
= 0; lno
< lines
->len
; lno
++) {
1946 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1947 colcount
= MAX (colcount
, line
->len
);
1950 // Ignore first line unless it is the only one
1951 sline
= MIN ((int)lines
->len
- 1, 1);
1953 g_ptr_array_set_size (po
->formats
, colcount
);
1954 g_ptr_array_set_size (po
->formats_decimal
, colcount
);
1955 g_ptr_array_set_size (po
->formats_thousand
, colcount
);
1956 g_ptr_array_set_size (po
->formats_curr
, colcount
);
1957 for (col
= 0; col
< colcount
; col
++) {
1958 unsigned possible
= STF_GUESS_ALL
;
1959 GOFormat
*fmt
= NULL
;
1960 gboolean seen_dot
= FALSE
;
1961 gboolean seen_comma
= FALSE
;
1962 int decimals_if_point
= -1; // -1: unset; -2: inconsistent; >=0: count
1963 int decimals_if_comma
= -1; // -1: unset; -2: inconsistent; >=0: count
1965 for (lno
= sline
; possible
&& lno
< lines
->len
; lno
++) {
1966 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1967 const char *data
= col
< line
->len
? g_ptr_array_index (line
, col
) : "";
1968 unsigned prev_possible
= possible
;
1970 if (*data
== 0 || data
[0] == '\'')
1973 do_check_date (data
, STF_GUESS_DATE_DMY
, FALSE
, FALSE
, &possible
, date_conv
);
1974 do_check_date (data
, STF_GUESS_DATE_MDY
, TRUE
, FALSE
, &possible
, date_conv
);
1975 do_check_date (data
, STF_GUESS_DATE_YMD
, TRUE
, TRUE
, &possible
, date_conv
);
1977 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
) {
1978 const char *pdot
= strstr (data
, s_dot
->str
);
1979 const char *pcomma
= strstr (data
, s_comma
->str
);
1980 if (pdot
&& pcomma
) {
1981 // Both -- last one is the decimal separator
1983 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1985 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1986 } else if (pdot
&& strstr (pdot
+ s_dot
->len
, s_dot
->str
)) {
1987 // Two dots so they are thousands separators
1988 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1989 } else if (pcomma
&& strstr (pcomma
+ s_comma
->len
, s_comma
->str
)) {
1990 // Two commas so they are thousands separators
1991 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1994 seen_dot
= seen_dot
|| (pdot
!= 0);
1995 seen_comma
= seen_comma
|| (pcomma
!= 0);
1997 do_check_number (data
, STF_GUESS_NUMBER_DEC_POINT
,
1998 s_dot
, s_comma
, s_dollar
,
1999 &possible
, &decimals_if_point
);
2000 do_check_number (data
, STF_GUESS_NUMBER_DEC_COMMA
,
2001 s_comma
, s_dot
, s_dollar
,
2002 &possible
, &decimals_if_comma
);
2004 if (possible
!= prev_possible
&& debug
)
2005 g_printerr ("col=%d; after [%s] possible=0x%x\n", col
, data
, possible
);
2008 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
&&
2009 !seen_dot
&& !seen_comma
) {
2010 // It doesn't matter what the separators are
2011 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
2015 case STF_GUESS_DATE_DMY
:
2016 fmt
= go_format_new_from_XL ("d-mmm-yyyy");
2018 case STF_GUESS_DATE_MDY
:
2019 fmt
= go_format_new_from_XL ("m/d/yyyy");
2021 case STF_GUESS_DATE_YMD
:
2022 fmt
= go_format_new_from_XL ("yyyy-mm-dd");
2024 case STF_GUESS_NUMBER_DEC_POINT
:
2025 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (".");
2026 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (",");
2027 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2028 if (decimals_if_point
> 0) {
2029 // Don't set format if decimals is zero
2030 GString
*fmt_str
= g_string_new (NULL
);
2031 go_format_generate_number_str (fmt_str
, 1, decimals_if_point
, seen_comma
, FALSE
, FALSE
, "", "");
2032 fmt
= go_format_new_from_XL (fmt_str
->str
);
2033 g_string_free (fmt_str
, TRUE
);
2036 case STF_GUESS_NUMBER_DEC_COMMA
:
2037 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (",");
2038 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (".");
2039 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2040 if (decimals_if_comma
> 0) {
2041 // Don't set format if decimals is zero
2042 GString
*fmt_str
= g_string_new (NULL
);
2043 go_format_generate_number_str (fmt_str
, 1, decimals_if_comma
, seen_dot
, FALSE
, FALSE
, "", "");
2044 fmt
= go_format_new_from_XL (fmt_str
->str
);
2045 g_string_free (fmt_str
, TRUE
);
2053 fmt
= go_format_ref (go_format_general ());
2054 g_ptr_array_index (po
->formats
, col
) = fmt
;
2057 stf_parse_general_free (lines
);
2058 g_string_chunk_free (lines_chunk
);
2060 g_string_free (s_dot
, TRUE
);
2061 g_string_free (s_comma
, TRUE
);
2062 g_string_free (s_dollar
, TRUE
);