2 * stf-parse.c : Structured Text Format parser. (STF)
3 * A general purpose engine for parsing data
4 * in CSV and Fixed width format.
7 * Copyright (C) Almer. S. Tigelaar.
8 * EMail: almer1@dds.nl or almer-t@bigfoot.com
10 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
11 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, see <https://www.gnu.org/licenses/>.
27 #include <gnumeric-config.h>
28 #include <glib/gi18n-lib.h>
30 #include <stf-parse.h>
31 #include <stf-export.h>
37 #include <clipboard.h>
38 #include <sheet-style.h>
41 #include <number-match.h>
43 #include <parse-util.h>
44 #include <number-match.h>
45 #include <gnm-format.h>
47 #include <goffice/goffice.h>
53 #define SETUP_LOCALE_SWITCH char *oldlocale = NULL
55 #define START_LOCALE_SWITCH if (parseoptions->locale) {\
56 oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
57 go_setlocale(LC_ALL, parseoptions->locale);}
59 #define END_LOCALE_SWITCH if (oldlocale) {\
60 go_setlocale(LC_ALL, oldlocale);\
63 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
66 char const *position
; /* Indicates the current position within data */
68 /* Used internally for fixed width parsing */
69 int splitpos
; /* Indicates current position in splitpositions array */
70 int linepos
; /* Position on the current line */
73 /* Struct used for autodiscovery */
80 * Some silly dude make the length field an unsigned int. C just does
81 * not deal very well with that.
84 my_garray_len (GArray
const *a
)
90 my_utf8_strchr (const char *p
, gunichar uc
)
92 return uc
< 0x7f ? strchr (p
, uc
) : g_utf8_strchr (p
, -1, uc
);
96 compare_terminator (char const *s
, StfParseOptions_t
*parseoptions
)
98 guchar
const *us
= (guchar
const *)s
;
101 if (*us
> parseoptions
->compiled_terminator
.max
||
102 *us
< parseoptions
->compiled_terminator
.min
)
105 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
106 char const *term
= l
->data
;
124 /*******************************************************************************************************
125 * STF PARSE OPTIONS : StfParseOptions related
126 *******************************************************************************************************/
129 gnm_g_string_free (GString
*s
)
131 if (s
) g_string_free (s
, TRUE
);
136 * stf_parse_options_new:
138 * This will return a new StfParseOptions_t struct.
139 * The struct should, after being used, freed with stf_parse_options_free.
141 static StfParseOptions_t
*
142 stf_parse_options_new (void)
144 StfParseOptions_t
* parseoptions
= g_new0 (StfParseOptions_t
, 1);
146 parseoptions
->parsetype
= PARSE_TYPE_NOTSET
;
148 parseoptions
->terminator
= NULL
;
149 stf_parse_options_add_line_terminator (parseoptions
, "\r\n");
150 stf_parse_options_add_line_terminator (parseoptions
, "\n");
151 stf_parse_options_add_line_terminator (parseoptions
, "\r");
153 parseoptions
->trim_spaces
= (TRIM_TYPE_RIGHT
| TRIM_TYPE_LEFT
);
154 parseoptions
->locale
= NULL
;
156 parseoptions
->splitpositions
= NULL
;
157 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
159 parseoptions
->stringindicator
= '"';
160 parseoptions
->indicator_2x_is_single
= TRUE
;
161 parseoptions
->sep
.duplicates
= FALSE
;
162 parseoptions
->trim_seps
= FALSE
;
164 parseoptions
->sep
.str
= NULL
;
165 parseoptions
->sep
.chr
= NULL
;
167 parseoptions
->col_autofit_array
= NULL
;
168 parseoptions
->col_import_array
= NULL
;
169 parseoptions
->col_import_array_len
= 0;
170 parseoptions
->formats
= g_ptr_array_new_with_free_func ((GDestroyNotify
)go_format_unref
);
171 parseoptions
->formats_decimal
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
172 parseoptions
->formats_thousand
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
173 parseoptions
->formats_curr
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
175 parseoptions
->cols_exceeded
= FALSE
;
176 parseoptions
->rows_exceeded
= FALSE
;
177 parseoptions
->ref_count
= 1;
183 * stf_parse_options_free:
185 * will free @parseoptions, note that this will not free the splitpositions
186 * member (GArray) of the struct, the caller is responsible for that.
189 stf_parse_options_free (StfParseOptions_t
*parseoptions
)
191 g_return_if_fail (parseoptions
!= NULL
);
193 if (parseoptions
->ref_count
-- > 1)
196 g_free (parseoptions
->col_import_array
);
197 g_free (parseoptions
->col_autofit_array
);
198 g_free (parseoptions
->locale
);
199 g_free (parseoptions
->sep
.chr
);
201 if (parseoptions
->sep
.str
) {
204 for (l
= parseoptions
->sep
.str
; l
!= NULL
; l
= l
->next
)
205 g_free ((char *) l
->data
);
206 g_slist_free (parseoptions
->sep
.str
);
209 g_array_free (parseoptions
->splitpositions
, TRUE
);
211 stf_parse_options_clear_line_terminator (parseoptions
);
213 g_ptr_array_free (parseoptions
->formats
, TRUE
);
214 g_ptr_array_free (parseoptions
->formats_decimal
, TRUE
);
215 g_ptr_array_free (parseoptions
->formats_thousand
, TRUE
);
216 g_ptr_array_free (parseoptions
->formats_curr
, TRUE
);
218 g_free (parseoptions
);
221 static StfParseOptions_t
*
222 stf_parse_options_ref (StfParseOptions_t
*parseoptions
)
224 parseoptions
->ref_count
++;
229 stf_parse_options_get_type (void)
234 t
= g_boxed_type_register_static ("StfParseOptions_t",
235 (GBoxedCopyFunc
)stf_parse_options_ref
,
236 (GBoxedFreeFunc
)stf_parse_options_free
);
242 stf_parse_options_set_type (StfParseOptions_t
*parseoptions
, StfParseType_t
const parsetype
)
244 g_return_if_fail (parseoptions
!= NULL
);
245 g_return_if_fail (parsetype
== PARSE_TYPE_CSV
|| parsetype
== PARSE_TYPE_FIXED
);
247 parseoptions
->parsetype
= parsetype
;
251 long_string_first (gchar
const *a
, gchar
const *b
)
253 /* This actually is UTF-8 safe. */
254 return strlen (b
) - strlen (a
);
258 compile_terminators (StfParseOptions_t
*parseoptions
)
262 parseoptions
->terminator
=
263 g_slist_sort (parseoptions
->terminator
,
264 (GCompareFunc
)long_string_first
);
265 parseoptions
->compiled_terminator
.min
= 255;
266 parseoptions
->compiled_terminator
.max
= 0;
267 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
268 const guchar
*term
= l
->data
;
269 parseoptions
->compiled_terminator
.min
=
270 MIN (parseoptions
->compiled_terminator
.min
, *term
);
271 parseoptions
->compiled_terminator
.max
=
272 MAX (parseoptions
->compiled_terminator
.max
, *term
);
277 * stf_parse_options_add_line_terminator:
279 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
280 * this indicates the end of a row.
284 stf_parse_options_add_line_terminator (StfParseOptions_t
*parseoptions
, char const *terminator
)
286 g_return_if_fail (parseoptions
!= NULL
);
287 g_return_if_fail (terminator
!= NULL
&& *terminator
!= 0);
289 GO_SLIST_PREPEND (parseoptions
->terminator
, g_strdup (terminator
));
290 compile_terminators (parseoptions
);
294 * stf_parse_options_clear_line_terminator:
296 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
297 * this indicates the end of a row.
301 stf_parse_options_clear_line_terminator (StfParseOptions_t
*parseoptions
)
303 g_return_if_fail (parseoptions
!= NULL
);
305 g_slist_free_full (parseoptions
->terminator
, g_free
);
306 parseoptions
->terminator
= NULL
;
307 compile_terminators (parseoptions
);
311 * stf_parse_options_set_trim_spaces:
313 * If enabled will trim spaces in every parsed field on left and/or right
317 stf_parse_options_set_trim_spaces (StfParseOptions_t
*parseoptions
, StfTrimType_t
const trim_spaces
)
319 g_return_if_fail (parseoptions
!= NULL
);
321 parseoptions
->trim_spaces
= trim_spaces
;
325 * stf_parse_options_csv_set_separators:
326 * @parseoptions: #StfParseOptions_t
328 * @seps: (element-type utf8): the separators to be used
330 * A copy is made of the parameters.
333 stf_parse_options_csv_set_separators (StfParseOptions_t
*parseoptions
,
334 char const *character
,
337 g_return_if_fail (parseoptions
!= NULL
);
339 g_free (parseoptions
->sep
.chr
);
340 parseoptions
->sep
.chr
= g_strdup (character
);
342 g_slist_free_full (parseoptions
->sep
.str
, g_free
);
343 parseoptions
->sep
.str
=
344 g_slist_copy_deep ((GSList
*)seps
, (GCopyFunc
)g_strdup
, NULL
);
348 stf_parse_options_csv_set_stringindicator (StfParseOptions_t
*parseoptions
, gunichar
const stringindicator
)
350 g_return_if_fail (parseoptions
!= NULL
);
352 parseoptions
->stringindicator
= stringindicator
;
356 * stf_parse_options_csv_set_indicator_2x_is_single:
357 * @indic_2x: a boolean value indicating whether we want to see two
358 * adjacent string indicators as a single string indicator
359 * that is part of the cell, rather than a terminator.
362 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t
*parseoptions
,
363 gboolean
const indic_2x
)
365 g_return_if_fail (parseoptions
!= NULL
);
367 parseoptions
->indicator_2x_is_single
= indic_2x
;
371 * stf_parse_options_csv_set_duplicates:
373 * @duplicates: a boolean value indicating whether we want to see two
374 * separators right behind each other as one
377 stf_parse_options_csv_set_duplicates (StfParseOptions_t
*parseoptions
, gboolean
const duplicates
)
379 g_return_if_fail (parseoptions
!= NULL
);
381 parseoptions
->sep
.duplicates
= duplicates
;
385 * stf_parse_options_csv_set_trim_seps:
386 * @trim_seps: a boolean value indicating whether we want to ignore
387 * separators at the beginning of lines
390 stf_parse_options_csv_set_trim_seps (StfParseOptions_t
*parseoptions
, gboolean
const trim_seps
)
392 g_return_if_fail (parseoptions
!= NULL
);
394 parseoptions
->trim_seps
= trim_seps
;
398 * stf_parse_options_fixed_splitpositions_clear:
400 * This will clear the splitpositions (== points on which a line is split)
403 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t
*parseoptions
)
406 g_return_if_fail (parseoptions
!= NULL
);
408 if (parseoptions
->splitpositions
)
409 g_array_free (parseoptions
->splitpositions
, TRUE
);
410 parseoptions
->splitpositions
= g_array_new (FALSE
, FALSE
, sizeof (int));
412 g_array_append_val (parseoptions
->splitpositions
, minus_one
);
416 * stf_parse_options_fixed_splitpositions_add:
418 * @position will be added to the splitpositions.
421 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t
*parseoptions
, int position
)
425 g_return_if_fail (parseoptions
!= NULL
);
426 g_return_if_fail (position
>= 0);
428 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
429 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
430 if (position
== here
)
436 g_array_insert_val (parseoptions
->splitpositions
, ui
, position
);
440 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t
*parseoptions
, int position
)
444 g_return_if_fail (parseoptions
!= NULL
);
445 g_return_if_fail (position
>= 0);
447 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
448 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
449 if (position
== here
)
450 g_array_remove_index (parseoptions
->splitpositions
, ui
);
451 if (position
<= here
)
457 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t
*parseoptions
)
459 return parseoptions
->splitpositions
->len
;
463 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t
*parseoptions
, int n
)
465 return g_array_index (parseoptions
->splitpositions
, int, n
);
470 * stf_parse_options_valid:
471 * @parseoptions: an import options struct
473 * Checks if @parseoptions is correctly filled
475 * returns : TRUE if it is correctly filled, FALSE otherwise.
478 stf_parse_options_valid (StfParseOptions_t
*parseoptions
)
480 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
482 if (parseoptions
->parsetype
== PARSE_TYPE_FIXED
) {
483 if (!parseoptions
->splitpositions
) {
484 g_warning ("STF: No splitpositions in struct");
492 /*******************************************************************************************************
493 * STF PARSE : The actual routines that do the 'trick'
494 *******************************************************************************************************/
497 trim_spaces_inplace (char *field
, StfParseOptions_t
const *parseoptions
)
501 if (parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) {
504 while (g_unichar_isspace (g_utf8_get_char (s
)))
505 s
= g_utf8_next_char (s
);
508 memmove (field
, s
, 1 + strlen (s
));
511 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
512 char *s
= field
+ strlen (field
);
515 s
= g_utf8_prev_char (s
);
516 if (!g_unichar_isspace (g_utf8_get_char (s
)))
524 * stf_parse_csv_is_separator:
526 * returns NULL if @character is not a separator, a pointer to the character
527 * after the separator otherwise.
530 stf_parse_csv_is_separator (char const *character
, char const *chr
, GSList
const *str
)
532 g_return_val_if_fail (character
!= NULL
, NULL
);
540 for (l
= str
; l
!= NULL
; l
= l
->next
) {
541 char const *s
= l
->data
;
544 glong
const len
= g_utf8_strlen (s
, -1);
546 /* Don't compare past the end of the buffer! */
547 for (r
= character
, cnt
= 0; cnt
< len
; cnt
++, r
= g_utf8_next_char (r
))
551 if ((cnt
== len
) && (memcmp (character
, s
, len
) == 0))
552 return g_utf8_offset_to_pointer (character
, len
);
556 if (chr
&& my_utf8_strchr (chr
, g_utf8_get_char (character
)))
557 return g_utf8_next_char(character
);
563 * stf_parse_eat_separators:
565 * skip over leading separators
570 stf_parse_eat_separators (Source_t
*src
, StfParseOptions_t
*parseoptions
)
572 char const *cur
, *next
;
574 g_return_if_fail (src
!= NULL
);
575 g_return_if_fail (parseoptions
!= NULL
);
579 if (*cur
== '\0' || compare_terminator (cur
, parseoptions
))
581 while ((next
= stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
)))
592 STF_CELL_FIELD_NO_SEP
,
596 static StfParseCellRes
597 stf_parse_csv_cell (GString
*text
, Source_t
*src
, StfParseOptions_t
*parseoptions
)
600 gboolean saw_sep
= FALSE
;
602 g_return_val_if_fail (src
!= NULL
, STF_CELL_ERROR
);
603 g_return_val_if_fail (parseoptions
!= NULL
, STF_CELL_ERROR
);
606 g_return_val_if_fail (cur
!= NULL
, STF_CELL_ERROR
);
608 /* Skip whitespace, but stop at line terminators. */
617 term_len
= compare_terminator (cur
, parseoptions
);
619 src
->position
= cur
+ term_len
;
623 if ((parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) == 0)
626 if (stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
,
627 parseoptions
->sep
.str
))
630 if (!g_unichar_isspace (g_utf8_get_char (cur
)))
632 cur
= g_utf8_next_char (cur
);
635 if (parseoptions
->stringindicator
!= 0 &&
636 g_utf8_get_char (cur
) == parseoptions
->stringindicator
) {
637 cur
= g_utf8_next_char (cur
);
639 gunichar uc
= g_utf8_get_char (cur
);
640 cur
= g_utf8_next_char (cur
);
642 if (uc
== parseoptions
->stringindicator
) {
643 if (parseoptions
->indicator_2x_is_single
&&
644 g_utf8_get_char (cur
) == parseoptions
->stringindicator
)
645 cur
= g_utf8_next_char (cur
);
647 /* "field content"dropped-garbage, */
648 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
649 char const *post
= stf_parse_csv_is_separator
650 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
656 cur
= g_utf8_next_char (cur
);
662 g_string_append_unichar (text
, uc
);
665 /* We silently allow a missing terminating quote. */
667 /* Unquoted field. */
669 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
671 char const *post
= stf_parse_csv_is_separator
672 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
679 g_string_append_unichar (text
, g_utf8_get_char (cur
));
680 cur
= g_utf8_next_char (cur
);
683 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
685 const char *last
= g_utf8_prev_char (text
->str
+ text
->len
);
686 if (!g_unichar_isspace (g_utf8_get_char (last
)))
688 g_string_truncate (text
, last
- text
->str
);
695 if (saw_sep
&& parseoptions
->sep
.duplicates
)
696 stf_parse_eat_separators (src
, parseoptions
);
698 return saw_sep
? STF_CELL_FIELD_SEP
: STF_CELL_FIELD_NO_SEP
;
702 * stf_parse_csv_line:
704 * This will parse one line from the current @src->position.
705 * NOTE: The calling routine is responsible for freeing the result.
707 * returns : a GPtrArray of char*'s
710 stf_parse_csv_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
713 gboolean cont
= FALSE
;
716 g_return_val_if_fail (src
!= NULL
, NULL
);
717 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
719 line
= g_ptr_array_new ();
720 if (parseoptions
->trim_seps
)
721 stf_parse_eat_separators (src
, parseoptions
);
723 text
= g_string_sized_new (30);
727 StfParseCellRes res
=
728 stf_parse_csv_cell (text
, src
, parseoptions
);
729 trim_spaces_inplace (text
->str
, parseoptions
);
730 ctext
= g_string_chunk_insert_len (src
->chunk
,
731 text
->str
, text
->len
);
732 g_string_truncate (text
, 0);
735 case STF_CELL_FIELD_NO_SEP
:
736 g_ptr_array_add (line
, ctext
);
740 case STF_CELL_FIELD_SEP
:
741 g_ptr_array_add (line
, ctext
);
742 cont
= TRUE
; /* Make sure we see one more field. */
747 g_ptr_array_add (line
, ctext
);
748 g_string_free (text
, TRUE
);
755 * stf_parse_fixed_cell:
757 * returns a pointer to the parsed cell contents.
760 stf_parse_fixed_cell (Source_t
*src
, StfParseOptions_t
*parseoptions
)
766 g_return_val_if_fail (src
!= NULL
, NULL
);
767 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
771 if (src
->splitpos
< my_garray_len (parseoptions
->splitpositions
))
772 splitval
= (int) g_array_index (parseoptions
->splitpositions
, int, src
->splitpos
);
776 while (*cur
!= 0 && !compare_terminator (cur
, parseoptions
) && splitval
!= src
->linepos
) {
778 cur
= g_utf8_next_char (cur
);
781 res
= g_string_chunk_insert_len (src
->chunk
,
783 cur
- src
->position
);
791 * stf_parse_fixed_line:
793 * This will parse one line from the current @src->position.
794 * It will return a GPtrArray with the cell contents as strings.
796 * NOTE: The calling routine is responsible for freeing result.
799 stf_parse_fixed_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
803 g_return_val_if_fail (src
!= NULL
, NULL
);
804 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
809 line
= g_ptr_array_new ();
810 while (*src
->position
!= '\0' && !compare_terminator (src
->position
, parseoptions
)) {
811 char *field
= stf_parse_fixed_cell (src
, parseoptions
);
813 trim_spaces_inplace (field
, parseoptions
);
814 g_ptr_array_add (line
, field
);
819 while (line
->len
< parseoptions
->splitpositions
->len
)
820 g_ptr_array_add (line
, g_strdup (""));
826 * stf_parse_general_free: (skip)
829 stf_parse_general_free (GPtrArray
*lines
)
832 for (lineno
= 0; lineno
< lines
->len
; lineno
++) {
833 GPtrArray
*line
= g_ptr_array_index (lines
, lineno
);
834 /* Fields are not freed here. */
836 g_ptr_array_free (line
, TRUE
);
838 g_ptr_array_free (lines
, TRUE
);
843 * stf_parse_general: (skip)
845 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
846 * GPtrArray of strings.
848 * The caller must free this entire structure, for example by calling
849 * stf_parse_general_free.
852 stf_parse_general (StfParseOptions_t
*parseoptions
,
853 GStringChunk
*lines_chunk
,
854 char const *data
, char const *data_end
)
859 char const *valid_end
= data_end
;
861 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
862 g_return_val_if_fail (data
!= NULL
, NULL
);
863 g_return_val_if_fail (data_end
!= NULL
, NULL
);
864 g_return_val_if_fail (stf_parse_options_valid (parseoptions
), NULL
);
865 g_return_val_if_fail (g_utf8_validate (data
, data_end
-data
, &valid_end
), NULL
);
867 src
.chunk
= lines_chunk
;
871 if ((data_end
-data
>= 3) && !strncmp(src
.position
, "\xEF\xBB\xBF", 3)) {
872 /* Skip over byte-order mark */
876 lines
= g_ptr_array_new ();
877 while (*src
.position
!= '\0' && src
.position
< data_end
) {
880 if (row
== GNM_MAX_ROWS
) {
881 parseoptions
->rows_exceeded
= TRUE
;
885 line
= parseoptions
->parsetype
== PARSE_TYPE_CSV
886 ? stf_parse_csv_line (&src
, parseoptions
)
887 : stf_parse_fixed_line (&src
, parseoptions
);
889 g_ptr_array_add (lines
, line
);
890 if (parseoptions
->parsetype
!= PARSE_TYPE_CSV
)
891 src
.position
+= compare_terminator (src
.position
, parseoptions
);
899 * stf_parse_lines: (skip)
900 * @parseoptions: #StfParseOptions_t
906 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
907 * GPtrArray of strings.
909 * The caller must free this entire structure, for example by calling
910 * stf_parse_general_free.
913 stf_parse_lines (StfParseOptions_t
*parseoptions
,
914 GStringChunk
*lines_chunk
,
916 int maxlines
, gboolean with_lineno
)
921 g_return_val_if_fail (data
!= NULL
, NULL
);
923 lines
= g_ptr_array_new ();
925 char const *data0
= data
;
926 GPtrArray
*line
= g_ptr_array_new ();
929 char buf
[4 * sizeof (int)];
930 sprintf (buf
, "%d", lineno
);
931 g_ptr_array_add (line
,
932 g_string_chunk_insert (lines_chunk
, buf
));
936 int termlen
= compare_terminator (data
, parseoptions
);
937 if (termlen
> 0 || *data
== 0) {
938 g_ptr_array_add (line
,
939 g_string_chunk_insert_len (lines_chunk
,
945 data
= g_utf8_next_char (data
);
948 g_ptr_array_add (lines
, line
);
951 if (lineno
>= maxlines
)
958 stf_parse_find_line (StfParseOptions_t
*parseoptions
,
963 int termlen
= compare_terminator (data
, parseoptions
);
967 } else if (*data
== 0) {
970 data
= g_utf8_next_char (data
);
978 * stf_parse_options_fixed_autodiscover:
979 * @parseoptions: a Parse options struct.
980 * @data: The actual data.
981 * @data_end: data end.
983 * Automatically try to discover columns in the text to be parsed.
984 * We ignore empty lines (only containing parseoptions->terminator)
986 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
987 * Think hard of a better more flexible solution...
990 stf_parse_options_fixed_autodiscover (StfParseOptions_t
*parseoptions
,
991 char const *data
, char const *data_end
)
993 char const *iterator
= data
;
995 GSList
*list_start
= NULL
;
997 int effective_lines
= 0;
998 int max_line_length
= 0;
999 int *line_begin_hits
= NULL
;
1000 int *line_end_hits
= NULL
;
1003 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
1006 * First take a look at all possible white space combinations
1008 while (*iterator
&& iterator
< data_end
) {
1009 gboolean begin_recorded
= FALSE
;
1010 AutoDiscovery_t
*disc
= NULL
;
1014 while (*iterator
&& (termlen
= compare_terminator (iterator
, parseoptions
)) == 0) {
1015 if (!begin_recorded
&& *iterator
== ' ') {
1016 disc
= g_new0 (AutoDiscovery_t
, 1);
1018 disc
->start
= position
;
1020 begin_recorded
= TRUE
;
1021 } else if (begin_recorded
&& *iterator
!= ' ') {
1022 disc
->stop
= position
;
1023 list
= g_slist_prepend (list
, disc
);
1025 begin_recorded
= FALSE
;
1033 if (position
> max_line_length
)
1034 max_line_length
= position
;
1037 * If there are excess spaces at the end of
1038 * the line : ignore them
1043 * Hop over the terminator
1045 iterator
+= termlen
;
1053 list
= g_slist_reverse (list
);
1058 * Look at the number of hits at each line position
1059 * if the number of hits equals the number of lines
1060 * we can be pretty sure this is the start or end
1061 * of a column, we filter out empty columns
1064 line_begin_hits
= g_new0 (int, max_line_length
+ 1);
1065 line_end_hits
= g_new0 (int, max_line_length
+ 1);
1068 AutoDiscovery_t
*disc
= list
->data
;
1070 line_begin_hits
[disc
->start
]++;
1071 line_end_hits
[disc
->stop
]++;
1075 list
= g_slist_next (list
);
1077 g_slist_free (list_start
);
1079 for (i
= 0; i
< max_line_length
+ 1; i
++)
1080 if (line_begin_hits
[i
] == effective_lines
|| line_end_hits
[i
] == effective_lines
)
1081 stf_parse_options_fixed_splitpositions_add (parseoptions
, i
);
1084 * Do some corrections to the initial columns
1085 * detected here, we obviously don't need to
1086 * do this if there are no columns at all.
1088 if (my_garray_len (parseoptions
->splitpositions
) > 0) {
1090 * Try to find columns that look like:
1095 * (In other words : Columns with left & right justification with
1096 * a minimum of 2 spaces in the middle)
1097 * Split these columns in 2
1100 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1101 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1102 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1103 int num_spaces
= -1;
1104 int spaces_start
= 0;
1105 gboolean right_aligned
= TRUE
;
1106 gboolean left_aligned
= TRUE
;
1107 gboolean has_2_spaces
= TRUE
;
1111 while (*iterator
&& iterator
< data_end
) {
1112 gboolean trigger
= FALSE
;
1113 gboolean space_trigger
= FALSE
;
1118 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1120 if (*iterator
== ' ')
1121 left_aligned
= FALSE
;
1124 } else if (pos
== end
- 1) {
1125 if (*iterator
== ' ')
1126 right_aligned
= FALSE
;
1131 if (trigger
|| pos
== end
- 1) {
1132 if (!space_trigger
&& *iterator
== ' ') {
1133 space_trigger
= TRUE
;
1135 } else if (space_trigger
&& *iterator
!= ' ') {
1136 space_trigger
= FALSE
;
1137 num_spaces
= pos
- spaces_start
;
1146 has_2_spaces
= FALSE
;
1155 * If this column meets all the criteria
1156 * split it into two at the last measured
1157 * spaces_start + num_spaces
1159 if (has_2_spaces
&& right_aligned
&& left_aligned
) {
1160 int val
= (((spaces_start
+ num_spaces
) - spaces_start
) / 2) + spaces_start
;
1162 g_array_insert_val (parseoptions
->splitpositions
, i
+ 1, val
);
1165 * Skip over the inserted column
1172 * Remove empty columns here if needed
1174 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1175 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1176 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1177 gboolean only_spaces
= TRUE
;
1181 while (*iterator
&& iterator
< data_end
) {
1182 gboolean trigger
= FALSE
;
1185 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1188 else if (pos
== end
)
1192 if (*iterator
!= ' ')
1193 only_spaces
= FALSE
;
1207 * The column only contains spaces
1211 g_array_remove_index (parseoptions
->splitpositions
, i
);
1214 * We HAVE to make sure that the next column (end) also
1215 * gets checked out. If we don't decrease "i" here, we
1216 * will skip over it as the indexes shift down after
1224 g_free (line_begin_hits
);
1225 g_free (line_end_hits
);
1228 /*******************************************************************************************************
1229 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1230 * functions into something meaningful (== application specific)
1231 *******************************************************************************************************/
1234 * This is more or less as gnm_cell_set_text, except...
1235 * 1. Unknown names are not allowed.
1236 * 2. Only '=' can start an expression.
1240 stf_cell_set_text (GnmCell
*cell
, char const *text
)
1242 GnmExprTop
const *texpr
;
1244 GOFormat
const *fmt
= gnm_style_get_format (gnm_cell_get_style (cell
));
1245 const GODateConventions
*date_conv
= sheet_date_conv (cell
->base
.sheet
);
1247 if (!go_format_is_text (fmt
) && *text
== '=' && text
[1] != 0) {
1248 GnmExprParseFlags flags
=
1249 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID
;
1250 const char *expr_start
= text
+ 1;
1253 parse_pos_init_cell (&pos
, cell
);
1254 texpr
= gnm_expr_parse_str (expr_start
, &pos
, flags
,
1258 val
= format_match (text
, fmt
, date_conv
);
1262 val
= value_new_string (text
);
1265 gnm_cell_set_value (cell
, val
);
1267 gnm_cell_set_expr (cell
, texpr
);
1268 gnm_expr_top_unref (texpr
);
1273 stf_read_remember_settings (Workbook
*book
, StfParseOptions_t
*po
)
1275 if (po
->parsetype
== PARSE_TYPE_CSV
) {
1276 GnmStfExport
*stfe
= gnm_stf_get_stfe (G_OBJECT (book
));
1278 int length
= g_unichar_to_utf8 (po
->stringindicator
, quote
);
1282 } else quote
[length
] = '\0';
1284 g_object_set (G_OBJECT (stfe
), "separator", po
->sep
.chr
, "quote", "e
, NULL
);
1286 if ((po
->terminator
!= NULL
) && (po
->terminator
->data
!= NULL
))
1287 g_object_set (G_OBJECT (stfe
), "eol", po
->terminator
->data
, NULL
);
1292 stf_parse_sheet (StfParseOptions_t
*parseoptions
,
1293 char const *data
, char const *data_end
,
1294 Sheet
*sheet
, int start_col
, int start_row
)
1298 GStringChunk
*lines_chunk
;
1300 gboolean result
= TRUE
;
1305 SETUP_LOCALE_SWITCH
;
1307 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
1308 g_return_val_if_fail (data
!= NULL
, FALSE
);
1309 g_return_val_if_fail (IS_SHEET (sheet
), FALSE
);
1312 data_end
= data
+ strlen (data
);
1314 lines_chunk
= g_string_chunk_new (100 * 1024);
1315 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1320 nformats
= parseoptions
->formats
->len
;
1321 for (lcol
= 0; lcol
< nformats
; lcol
++) {
1322 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
1325 (parseoptions
->col_import_array
== NULL
||
1326 parseoptions
->col_import_array_len
<= lcol
||
1327 parseoptions
->col_import_array
[lcol
]);
1328 if (!want_col
|| col
>= gnm_sheet_get_max_cols (sheet
))
1331 if (fmt
&& !go_format_is_general (fmt
)) {
1333 int end_row
= MIN (start_row
+ (int)lines
->len
- 1,
1334 gnm_sheet_get_last_row (sheet
));
1336 range_init (&r
, col
, start_row
, col
, end_row
);
1337 mstyle
= gnm_style_new ();
1338 gnm_style_set_format (mstyle
, fmt
);
1339 sheet_apply_style (sheet
, &r
, mstyle
);
1344 START_LOCALE_SWITCH
;
1345 for (row
= start_row
, lrow
= 0;
1346 result
&& lrow
< lines
->len
;
1350 if (row
>= gnm_sheet_get_max_rows (sheet
)) {
1351 if (!parseoptions
->rows_exceeded
) {
1352 /* FIXME: What locale? */
1353 g_warning (_("There are more rows of data than "
1354 "there is room for in the sheet. Extra "
1355 "rows will be ignored."));
1356 parseoptions
->rows_exceeded
= TRUE
;
1362 line
= g_ptr_array_index (lines
, lrow
);
1364 for (lcol
= 0; lcol
< line
->len
; lcol
++) {
1365 GOFormat
const *fmt
= lcol
< nformats
1366 ? g_ptr_array_index (parseoptions
->formats
, lcol
)
1367 : go_format_general ();
1368 char const *text
= g_ptr_array_index (line
, lcol
);
1370 (parseoptions
->col_import_array
== NULL
||
1371 parseoptions
->col_import_array_len
<= lcol
||
1372 parseoptions
->col_import_array
[lcol
]);
1376 if (col
>= gnm_sheet_get_max_cols (sheet
)) {
1377 if (!parseoptions
->cols_exceeded
) {
1378 /* FIXME: What locale? */
1379 g_warning (_("There are more columns of data than "
1380 "there is room for in the sheet. Extra "
1381 "columns will be ignored."));
1382 parseoptions
->cols_exceeded
= TRUE
;
1386 if (text
&& *text
) {
1387 GnmCell
*cell
= sheet_cell_fetch (sheet
, col
, row
);
1388 if (!go_format_is_text (fmt
) &&
1389 lcol
< parseoptions
->formats_decimal
->len
&&
1390 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
)) {
1392 GnmValue
*v
= format_match_decimal_number_with_locale
1394 g_ptr_array_index (parseoptions
->formats_curr
, lcol
),
1395 g_ptr_array_index (parseoptions
->formats_thousand
, lcol
),
1396 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
));
1398 v
= value_new_string (text
);
1399 sheet_cell_set_value (cell
, v
);
1402 stf_cell_set_text (cell
, text
);
1408 g_ptr_array_index (lines
, lrow
) = NULL
;
1409 g_ptr_array_free (line
, TRUE
);
1413 for (lcol
= 0, col
= start_col
;
1414 lcol
< parseoptions
->col_import_array_len
&& col
< gnm_sheet_get_max_cols (sheet
);
1416 if (parseoptions
->col_import_array
== NULL
||
1417 parseoptions
->col_import_array_len
<= lcol
||
1418 parseoptions
->col_import_array
[lcol
]) {
1419 if (parseoptions
->col_autofit_array
== NULL
||
1420 parseoptions
->col_autofit_array
[lcol
]) {
1421 ColRowIndexList
*list
= colrow_get_index_list (col
, col
, NULL
);
1422 ColRowStateGroup
*state
= colrow_set_sizes (sheet
, TRUE
, list
, -1, 0, -1);
1423 colrow_index_list_destroy (list
);
1424 g_slist_free (state
);
1430 g_string_chunk_free (lines_chunk
);
1432 stf_parse_general_free (lines
);
1434 stf_read_remember_settings (sheet
->workbook
, parseoptions
);
/*
 * stf_parse_region: parse the text starting at data with the given parse
 * options and collect the results in a cell region created with
 * gnm_cell_region_new.
 *
 * The text is split by stf_parse_general; each imported field is passed
 * to format_match together with a column format and the date convention,
 * and a cell copy is created for it with gnm_cell_copy_new at
 * (targetcol, row).
 *
 * The g_return_val_if_fail guards yield NULL when parseoptions or data is
 * NULL.
 * NOTE(review): the return type, the remaining parameter (wb is used
 * below) and the final return are not visible in this excerpt; presumably
 * the region cr is returned -- confirm.
 */
1439 stf_parse_region (StfParseOptions_t
*parseoptions
, char const *data
, char const *data_end
,
/* Date convention: taken from wb when wb is non-NULL, otherwise the static default below. */
1442 static GODateConventions
const default_conv
= {FALSE
};
1443 GODateConventions
const *date_conv
= wb
? workbook_date_conv (wb
) : &default_conv
;
1446 unsigned int row
, colhigh
= 0;
1447 GStringChunk
*lines_chunk
;
/* The locale switch macros change LC_ALL to parseoptions->locale when one is set. */
1451 SETUP_LOCALE_SWITCH
;
1453 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
1454 g_return_val_if_fail (data
!= NULL
, NULL
);
1456 START_LOCALE_SWITCH
;
1458 cr
= gnm_cell_region_new (NULL
);
/*
 * Derive the end of the input from its NUL terminator.
 * NOTE(review): the condition guarding this assignment is not visible
 * here; presumably it applies only when no data_end was passed -- confirm.
 */
1461 data_end
= data
+ strlen (data
);
1462 lines_chunk
= g_string_chunk_new (100 * 1024);
1463 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1464 nformats
= parseoptions
->formats
->len
;
1465 for (row
= 0; row
< lines
->len
; row
++) {
1466 GPtrArray
*line
= g_ptr_array_index (lines
, row
);
1467 unsigned int col
, targetcol
= 0;
1468 for (col
= 0; col
< line
->len
; col
++) {
/*
 * A column is imported when there is no import array, when col lies
 * beyond the end of that array, or when its entry is set.
 */
1469 if (parseoptions
->col_import_array
== NULL
||
1470 parseoptions
->col_import_array_len
<= col
||
1471 parseoptions
->col_import_array
[col
]) {
1472 const char *text
= g_ptr_array_index (line
, col
);
1474 GOFormat
*fmt
= NULL
;
/* NOTE(review): any bounds check of col against nformats is not visible here -- confirm. */
1479 fmt
= g_ptr_array_index (parseoptions
->formats
, col
);
1480 v
= format_match (text
, fmt
, date_conv
);
/*
 * Keep the raw text as a string value.
 * NOTE(review): presumably only when format_match produced no value; the
 * guarding test is not visible here.
 */
1482 v
= value_new_string (text
);
1484 cc
= gnm_cell_copy_new (cr
, targetcol
, row
);
/* Remember the largest targetcol reached on any row. */
1488 if (targetcol
> colhigh
)
1489 colhigh
= targetcol
;
1494 stf_parse_general_free (lines
);
1495 g_string_chunk_free (lines_chunk
);
/* The region width is colhigh, but never less than 1. */
1499 cr
->cols
= (colhigh
> 0) ? colhigh
: 1;
/*
 * int_sort: qsort() comparator that orders two ints ascending.
 *
 * @a, @b: pointers to the int elements being compared.
 *
 * Returns a negative value, zero or a positive value when *a is less
 * than, equal to or greater than *b.
 *
 * The result is built from two comparisons rather than from the
 * difference *a - *b: the subtraction overflows (undefined behaviour)
 * when the operands are far apart, e.g. INT_MIN and INT_MAX, and can
 * then report the wrong sign.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * count_character: compare the UTF-8 characters of each line against c,
 * record one count per line, sort those counts and select an index into
 * them from quantile.
 *
 * lines is an array whose elements are themselves arrays; only element 0
 * of each inner array is scanned.  The selected index qi is
 * ceil (quantile * cno), where cno is the number of counts stored.
 *
 * NOTE(review): the statement that increments the per-line count and the
 * return statements are not visible in this excerpt; presumably the count
 * at index qi of the sorted array is returned -- confirm.
 */
1512 count_character (GPtrArray
*lines
, gunichar c
, double quantile
)
1515 unsigned int lno
, cno
;
1517 if (lines
->len
== 0)
/* One slot per input line; cno counts the slots actually filled. */
1520 counts
= g_new (int, lines
->len
);
1521 for (lno
= cno
= 0; lno
< lines
->len
; lno
++) {
1523 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1524 char const *line
= g_ptr_array_index (boxline
, 0);
1526 /* Ignore empty lines. */
1531 if (g_utf8_get_char (line
) == c
)
1533 line
= g_utf8_next_char (line
);
1536 counts
[cno
++] = count
;
1542 unsigned int qi
= (unsigned int)ceil (quantile
* cno
);
/* Sort the filled slots ascending so that an index corresponds to a quantile. */
1543 qsort (counts
, cno
, sizeof (counts
[0]), int_sort
);
/*
 * dump_guessed_options: print the guessed parse options in res with
 * g_printerr.  The callers in this file invoke it only when the stf debug
 * flag is set.
 *
 * The output covers the parse type, the separator settings for the
 * separated case, space trimming, the string indicator, the line
 * terminators, and for every column its format plus the decimal and
 * thousands separator strings.
 */
1555 dump_guessed_options (const StfParseOptions_t
*res
)
/* Holds one UTF-8 encoded character (at most 6 bytes) plus the terminating NUL. */
1558 char ubuffer
[6 + 1];
1561 g_printerr ("Guessed format:\n");
1562 switch (res
->parsetype
) {
1563 case PARSE_TYPE_CSV
:
1564 g_printerr (" type = sep\n");
1565 g_printerr (" separator = %s\n",
1566 res
->sep
.chr
? res
->sep
.chr
: "(none)");
1567 g_printerr (" see two as one = %s\n",
1568 res
->sep
.duplicates
? "yes" : "no");
1570 case PARSE_TYPE_FIXED
:
/*
 * NOTE(review): this prints the same type text as the separated case
 * above; it looks like a copy-and-paste slip -- confirm the intended
 * output for fixed width.
 */
1571 g_printerr (" type = sep\n");
1576 g_printerr (" trim space = %d\n", res
->trim_spaces
);
/* g_unichar_to_utf8 returns the number of bytes written, which is where the NUL goes. */
1578 ubuffer
[g_unichar_to_utf8 (res
->stringindicator
, ubuffer
)] = 0;
1579 g_printerr (" string indicator = %s\n", ubuffer
);
1580 g_printerr (" see two as one = %s\n",
1581 res
->indicator_2x_is_single
? "yes" : "no");
1583 g_printerr (" line terminators =");
/* Classify each terminator string by comparing it with the three common line endings. */
1584 for (l
= res
->terminator
; l
; l
= l
->next
) {
1585 const char *t
= l
->data
;
1586 if (strcmp (t
, "\n") == 0)
1587 g_printerr (" unix");
1588 else if (strcmp (t
, "\r") == 0)
1589 g_printerr (" mac");
1590 else if (strcmp (t
, "\r\n") == 0)
1591 g_printerr (" dos");
1593 g_printerr (" other");
1597 for (ui
= 0; ui
< res
->formats
->len
; ui
++) {
1598 GOFormat
const *fmt
= g_ptr_array_index (res
->formats
, ui
);
/*
 * The separator arrays are bounds-checked against ui before indexing.
 * NOTE(review): the fallback arm of each conditional below is not
 * visible in this excerpt.
 */
1599 const GString
*decimal
= ui
< res
->formats_decimal
->len
1600 ? g_ptr_array_index (res
->formats_decimal
, ui
)
1602 const GString
*thousand
= ui
< res
->formats_thousand
->len
1603 ? g_ptr_array_index (res
->formats_thousand
, ui
)
/*
 * NOTE(review): %d is used with ui in the three calls below; if ui is
 * declared unsigned, %u is the matching conversion -- the declaration is
 * not visible here.
 */
1606 g_printerr (" fmt.%d = %s\n", ui
, go_format_as_XL (fmt
));
1608 g_printerr (" fmt.%d.dec = %s\n", ui
, decimal
->str
);
1610 g_printerr (" fmt.%d.thou = %s\n", ui
, thousand
->str
);
1615 * stf_parse_options_guess:
1616 * @data: the input data.
1618 * Returns: (transfer full): the guessed options.
/*
 * Implementation outline: create a fresh option set, split data into
 * lines with stf_parse_lines, and choose a field separator from per-line
 * character counts obtained with count_character.  A tab is tried first;
 * otherwise a chain of candidate characters is tested.  The options are
 * then completed for the separated type, the temporary line data is
 * freed, and stf_parse_options_guess_formats is run on the full data.
 *
 * The g_return_val_if_fail guard yields NULL when data is NULL.
 * NOTE(review): the 1000 passed to stf_parse_lines is presumably a line
 * limit, and the final return is not visible in this excerpt -- confirm.
 */
1621 stf_parse_options_guess (char const *data
)
1623 StfParseOptions_t
*res
;
1624 GStringChunk
*lines_chunk
;
1628 gunichar sepchar
= go_locale_get_arg_sep ();
1630 g_return_val_if_fail (data
!= NULL
, NULL
);
1632 res
= stf_parse_options_new ();
1633 lines_chunk
= g_string_chunk_new (100 * 1024);
1634 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
/* Counts of tabs and of the locale argument separator, both requested at quantile 0.2. */
1636 tabcount
= count_character (lines
, '\t', 0.2);
1637 sepcount
= count_character (lines
, sepchar
, 0.2);
1639 /* At least one tab per line and enough to separate every
1640 would-be sepchars. */
1641 if (tabcount
>= 1 && tabcount
>= sepcount
- 1)
1642 stf_parse_options_csv_set_separators (res
, "\t", NULL
);
1647 * Try a few more or less likely characters and pick the first
1648 * one that occurs on at least half the lines.
1650 * The order is mostly random, although ' ' and '!' which
1651 * could very easily occur in text are put last.
/*
 * Each call assigns its candidate to c before testing it, so when the
 * chain short-circuits c holds the first candidate that succeeded.
 */
1653 if (count_character (lines
, (c
= sepchar
), 0.5) > 0 ||
1654 count_character (lines
, (c
= go_locale_get_col_sep ()), 0.5) > 0 ||
1655 count_character (lines
, (c
= ':'), 0.5) > 0 ||
1656 count_character (lines
, (c
= ','), 0.5) > 0 ||
1657 count_character (lines
, (c
= ';'), 0.5) > 0 ||
1658 count_character (lines
, (c
= '|'), 0.5) > 0 ||
1659 count_character (lines
, (c
= '!'), 0.5) > 0 ||
1660 count_character (lines
, (c
= ' '), 0.5) > 0) {
/* Encode c as UTF-8 into sep and NUL-terminate it. */
1662 sep
[g_unichar_to_utf8 (c
, sep
)] = 0;
1665 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1669 // For now, always separated:
1670 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1672 switch (res
->parsetype
) {
1673 case PARSE_TYPE_CSV
: {
/*
 * NOTE(review): the left-hand sides of the next two expressions are not
 * visible in this excerpt; presumably they initialise the dups and trim
 * values passed to the setters below.  Both test whether a space is among
 * the separators.
 */
1676 strchr (res
->sep
.chr
, ' ') != NULL
;
1679 strchr (res
->sep
.chr
, ' ') != NULL
;
1681 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1682 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1683 stf_parse_options_csv_set_duplicates (res
, dups
);
1684 stf_parse_options_csv_set_trim_seps (res
, trim
);
1686 stf_parse_options_csv_set_stringindicator (res
, '"');
1690 case PARSE_TYPE_FIXED
:
1694 g_assert_not_reached ();
/* Release the split lines and their string chunk; format guessing reparses data itself. */
1697 stf_parse_general_free (lines
);
1698 g_string_chunk_free (lines_chunk
);
1700 stf_parse_options_guess_formats (res
, data
);
1702 if (gnm_debug_flag ("stf"))
1703 dump_guessed_options (res
);
1709 * stf_parse_options_guess_csv:
1710 * @data: the CSV input data.
1712 * Returns: (transfer full): the guessed options.
/*
 * Implementation outline: create an option set fixed to the separated
 * type (trim on both sides, doubled indicator read as single, no
 * duplicate separators, no separator trimming, double quote as string
 * indicator), then look for a line containing the string indicator and
 * derive the separator from the text around the quoted field: the
 * character following the closing indicator (after any whitespace), or
 * failing that a non-space character before the opening indicator.  A
 * comma is the remaining choice.  Formats are then guessed from the full
 * data.
 *
 * The g_return_val_if_fail guard yields NULL when data is NULL.
 * NOTE(review): several guards, the declarations of sep, pass, lno and p,
 * and the final return are not visible in this excerpt -- confirm the
 * outline against the full source.
 */
1715 stf_parse_options_guess_csv (char const *data
)
1717 StfParseOptions_t
*res
;
1718 GStringChunk
*lines_chunk
;
1721 char const *quoteline
= NULL
;
1723 gunichar stringind
= '"';
1725 g_return_val_if_fail (data
!= NULL
, NULL
);
1727 res
= stf_parse_options_new ();
1728 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1729 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1730 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1731 stf_parse_options_csv_set_duplicates (res
, FALSE
);
1732 stf_parse_options_csv_set_trim_seps (res
, FALSE
);
1733 stf_parse_options_csv_set_stringindicator (res
, stringind
);
1735 lines_chunk
= g_string_chunk_new (100 * 1024);
1736 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1739 * Find a line containing a quote; skip first line unless it is
1740 * the only one. Prefer a line with the quote first.
/*
 * At most two passes; both loops stop as soon as quoteline is set.
 * NOTE(review): how the two passes differ is only partly visible here.
 */
1742 for (pass
= 1; !quoteline
&& pass
<= 2; pass
++) {
/* Start at line index 1 when there is more than one line. */
1744 for (lno
= MIN (1, lines
->len
- 1);
1745 !quoteline
&& lno
< lines
->len
;
1747 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1748 const char *line
= g_ptr_array_index (boxline
, 0);
1751 if (g_utf8_get_char (line
) == stringind
)
1755 if (my_utf8_strchr (line
, stringind
))
1763 const char *p0
= my_utf8_strchr (quoteline
, stringind
);
/* Advance p until the next string indicator or the end of the line. */
1767 p
= g_utf8_next_char (p
);
1768 } while (*p
&& g_utf8_get_char (p
) != stringind
);
/* Step over that indicator, then over any whitespace following it. */
1769 if (*p
) p
= g_utf8_next_char (p
);
1770 while (*p
&& g_unichar_isspace (g_utf8_get_char (p
)))
1771 p
= g_utf8_next_char (p
);
1773 /* Use the character after the quote. */
1774 sep
= g_strndup (p
, g_utf8_next_char (p
) - p
);
1776 /* Try to use character before the quote. */
/* Walk backwards from the first indicator toward the start of quoteline. */
1777 while (p0
> quoteline
&& !sep
) {
1779 p0
= g_utf8_prev_char (p0
);
1780 if (!g_unichar_isspace (g_utf8_get_char (p0
)))
1781 sep
= g_strndup (p0
, p
- p0
);
/*
 * Fall back to a comma.
 * NOTE(review): presumably only when no separator was found above; the
 * guarding test and the release of sep are not visible here.
 */
1787 sep
= g_strdup (",");
1788 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1791 stf_parse_general_free (lines
);
1792 g_string_chunk_free (lines_chunk
);
1794 stf_parse_options_guess_formats (res
, data
);
1796 if (gnm_debug_flag ("stf"))
1797 dump_guessed_options (res
);
/*
 * Bit flags for the interpretations that are still possible for a column
 * while its format is being guessed.  The three low bits are the date
 * orders; 0x10 and 0x20 are the two decimal separator choices, 0x30 is
 * both of them together, and STF_GUESS_ALL (0x37) is every flag set.
 * NOTE(review): the enclosing enum declaration is not visible in this
 * excerpt; the type name StfGuessFormats is taken from its use as a
 * parameter type in do_check_date and do_check_number.
 */
1803 STF_GUESS_DATE_DMY
= 1,
1804 STF_GUESS_DATE_MDY
= 2,
1805 STF_GUESS_DATE_YMD
= 4,
1807 STF_GUESS_NUMBER_DEC_POINT
= 0x10,
1808 STF_GUESS_NUMBER_DEC_COMMA
= 0x20,
1809 STF_GUESS_NUMBER_DEC_EITHER
= 0x30,
1811 STF_GUESS_ALL
= 0x37
/*
 * do_check_date: test whether data can be read as a date in the order
 * identified by flag.
 *
 * @data: the field text.
 * @flag: the date order being tested.
 * @mbd, @ybm: the month-before-day and year-before-month settings that
 *   correspond to flag.
 * @date_conv: date convention passed to format_match_datetime.
 *
 * The first test checks whether flag is still set in *possible.  data is
 * then matched with format_match_datetime using mbd, and the value from
 * go_format_month_before_day for the resulting format is compared with
 * the requested mbd and ybm.
 *
 * NOTE(review): the possible parameter declaration and the statements
 * taken after each test are not visible in this excerpt; presumably a
 * failed match clears flag from *possible -- confirm.
 */
1815 do_check_date (const char *data
, StfGuessFormats flag
,
1816 gboolean mbd
, gboolean ybm
,
1818 GODateConventions
const *date_conv
)
1821 gboolean this_mbd
, this_ybm
;
1824 if (!(*possible
& flag
))
1827 v
= format_match_datetime (data
, date_conv
, mbd
, TRUE
, FALSE
);
1828 if (!v
|| !VALUE_FMT (v
))
/* imbd >= 1 is treated as month before day; imbd == 2 also as year before month. */
1831 imbd
= go_format_month_before_day (VALUE_FMT (v
));
1832 this_mbd
= (imbd
>= 1);
1833 this_ybm
= (imbd
== 2);
1834 if (mbd
!= this_mbd
|| ybm
!= this_ybm
)
/*
 * do_check_number: test whether data can be read as a number using the
 * given decimal, thousands and currency strings.
 *
 * @data: the field text.
 * @flag: the decimal separator choice being tested.
 * @dec, @thousand, @curr: separator and currency strings handed to
 *   format_match_decimal_number_with_locale.
 * @possible: set of candidate flags; tested against flag first.
 * @decimals: running count of digits after the decimal separator for the
 *   column; the caller documents -1 as unset and -2 as inconsistent.
 *
 * After the match, the digits following the decimal separator are
 * compared with *decimals, and the digits preceding the first thousands
 * separator are examined.
 *
 * NOTE(review): the statements taken after each test are not visible in
 * this excerpt; presumably a failed check clears flag from *possible and
 * a differing digit count sets *decimals to -2 -- confirm.
 */
1847 do_check_number (const char *data
, StfGuessFormats flag
,
1848 const GString
*dec
, const GString
*thousand
, const GString
*curr
,
1849 unsigned *possible
, int *decimals
)
1852 GOFormatFamily family
;
1855 if (!(*possible
& flag
))
1858 v
= format_match_decimal_number_with_locale (data
, &family
, curr
, thousand
, dec
);
/* No bookkeeping once the count has been marked with -2. */
1862 if (*decimals
!= -2) {
1863 const char *pdec
= strstr (data
, dec
->str
);
1864 int this_decimals
= 0;
1867 while (g_ascii_isdigit (*pdec
)) {
/* The first sample fixes the count; later samples are compared with it. */
1872 if (*decimals
== -1)
1873 *decimals
= this_decimals
;
1874 else if (*decimals
!= this_decimals
)
1878 pthou
= strstr (data
, thousand
->str
);
1881 int digits
= 0, nonzero_digits
= 0;
/* Scan the characters that precede the first thousands separator. */
1882 for (p
= data
; p
< pthou
; p
= g_utf8_next_char (p
)) {
1883 if (g_unichar_isdigit (g_utf8_get_char (p
))) {
1889 // "-.222" implies that "." is not a thousands separator.
1890 // "0.222" implies that "." is not a thousands separator.
1891 // "12345,555" implies that "," is not a thousands separator.
1892 if (nonzero_digits
== 0 || digits
> 3)
1906 * stf_parse_options_guess_formats:
1907 * @data: the CSV input data.
1909 * This function attempts to recognize data formats on a column-by-column
1910 * basis under the assumption that the data in a text file will generally
1911 * use the same data formats.
1913 * This is useful because not all values give sufficient information by
1914 * themselves to tell what format the data is in. For example, "1/2/2000"
1915 * is likely to be a date in year 2000, but it is not clear if it is in
1916 * January or February. If another value in the same column is "31/1/1999"
1917 * then it is likely that the former date was in February.
1919 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1920 * A later value of "111,200.22" would clear up the confusion.
/*
 * Implementation outline: the four per-column arrays in po (formats,
 * formats_decimal, formats_thousand, formats_curr) are emptied, data is
 * parsed with po, and the arrays are resized to colcount.  For every
 * column the candidate set starts at STF_GUESS_ALL and is narrowed by
 * running each field through the three date checks and the two number
 * checks.  The case labels further down then choose a date format or the
 * separator strings and number format for the column, and the result is
 * stored in po->formats.
 *
 * NOTE(review): several statements (the colcount initialisation, the
 * action for empty or quote-prefixed fields, the switch expression, the
 * guard on the general-format fallback) are not visible in this excerpt.
 */
1924 stf_parse_options_guess_formats (StfParseOptions_t
*po
, char const *data
)
1926 GStringChunk
*lines_chunk
;
1928 unsigned lno
, col
, colcount
, sline
;
/* Dates are matched against a fixed convention obtained from the Lotus:1900 string. */
1929 GODateConventions
const *date_conv
= go_date_conv_from_str ("Lotus:1900");
1930 GString
*s_comma
= g_string_new (",");
1931 GString
*s_dot
= g_string_new (".");
1932 GString
*s_dollar
= g_string_new ("$");
1933 gboolean debug
= gnm_debug_flag ("stf");
1935 g_ptr_array_set_size (po
->formats
, 0);
1936 g_ptr_array_set_size (po
->formats_decimal
, 0);
1937 g_ptr_array_set_size (po
->formats_thousand
, 0);
1938 g_ptr_array_set_size (po
->formats_curr
, 0);
1940 lines_chunk
= g_string_chunk_new (100 * 1024);
1941 lines
= stf_parse_general (po
, lines_chunk
, data
, data
+ strlen (data
));
1944 for (lno
= 0; lno
< lines
->len
; lno
++) {
1945 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
/*
 * colcount becomes the length of the longest line.
 * NOTE(review): its initialisation is not visible in this excerpt --
 * confirm it starts at 0.
 */
1946 colcount
= MAX (colcount
, line
->len
);
1949 // Ignore first line unless it is the only one
1950 sline
= MIN ((int)lines
->len
- 1, 1);
1952 g_ptr_array_set_size (po
->formats
, colcount
);
1953 g_ptr_array_set_size (po
->formats_decimal
, colcount
);
1954 g_ptr_array_set_size (po
->formats_thousand
, colcount
);
1955 g_ptr_array_set_size (po
->formats_curr
, colcount
);
1956 for (col
= 0; col
< colcount
; col
++) {
1957 unsigned possible
= STF_GUESS_ALL
;
1958 GOFormat
*fmt
= NULL
;
1959 gboolean seen_dot
= FALSE
;
1960 gboolean seen_comma
= FALSE
;
1961 int decimals_if_point
= -1; // -1: unset; -2: inconsistent; >=0: count
1962 int decimals_if_comma
= -1; // -1: unset; -2: inconsistent; >=0: count
/* Scanning stops early once no candidate remains (possible is 0). */
1964 for (lno
= sline
; possible
&& lno
< lines
->len
; lno
++) {
1965 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
/* A line with no field at index col contributes an empty string. */
1966 const char *data
= col
< line
->len
? g_ptr_array_index (line
, col
) : "";
1967 unsigned prev_possible
= possible
;
/*
 * NOTE(review): the statement taken for empty fields and for fields that
 * start with a single quote is not visible; presumably they are skipped.
 */
1969 if (*data
== 0 || data
[0] == '\'')
1972 do_check_date (data
, STF_GUESS_DATE_DMY
, FALSE
, FALSE
, &possible
, date_conv
);
1973 do_check_date (data
, STF_GUESS_DATE_MDY
, TRUE
, FALSE
, &possible
, date_conv
);
1974 do_check_date (data
, STF_GUESS_DATE_YMD
, TRUE
, TRUE
, &possible
, date_conv
);
/* Only while both decimal separator choices are still open. */
1976 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
) {
1977 const char *pdot
= strstr (data
, s_dot
->str
);
1978 const char *pcomma
= strstr (data
, s_comma
->str
);
1979 if (pdot
&& pcomma
) {
1980 // Both -- last one is the decimal separator
1982 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1984 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1985 } else if (pdot
&& strstr (pdot
+ s_dot
->len
, s_dot
->str
)) {
1986 // Two dots so they are thousands separators
1987 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1988 } else if (pcomma
&& strstr (pcomma
+ s_comma
->len
, s_comma
->str
)) {
1989 // Two commas so they are thousands separators
1990 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1993 seen_dot
= seen_dot
|| (pdot
!= 0);
1994 seen_comma
= seen_comma
|| (pcomma
!= 0);
/* Try the field once with dot as decimal separator and once with comma. */
1996 do_check_number (data
, STF_GUESS_NUMBER_DEC_POINT
,
1997 s_dot
, s_comma
, s_dollar
,
1998 &possible
, &decimals_if_point
);
1999 do_check_number (data
, STF_GUESS_NUMBER_DEC_COMMA
,
2000 s_comma
, s_dot
, s_dollar
,
2001 &possible
, &decimals_if_comma
);
/* NOTE(review): col is declared unsigned but is printed with %d below. */
2003 if (possible
!= prev_possible
&& debug
)
2004 g_printerr ("col=%d; after [%s] possible=0x%x\n", col
, data
, possible
);
2007 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
&&
2008 !seen_dot
&& !seen_comma
) {
2009 // It doesn't matter what the separators are
2010 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
/*
 * NOTE(review): the switch expression is not visible; the case labels
 * indicate a dispatch on the single flag left in possible.
 */
2014 case STF_GUESS_DATE_DMY
:
2015 fmt
= go_format_new_from_XL ("d-mmm-yyyy");
2017 case STF_GUESS_DATE_MDY
:
2018 fmt
= go_format_new_from_XL ("m/d/yyyy");
2020 case STF_GUESS_DATE_YMD
:
2021 fmt
= go_format_new_from_XL ("yyyy-mm-dd");
2023 case STF_GUESS_NUMBER_DEC_POINT
:
2024 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (".");
2025 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (",");
2026 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2027 if (decimals_if_point
> 0) {
2028 // Don't set format if decimals is zero
2029 GString
*fmt_str
= g_string_new (NULL
);
2030 go_format_generate_number_str (fmt_str
, 1, decimals_if_point
, seen_comma
, FALSE
, FALSE
, "", "");
2031 fmt
= go_format_new_from_XL (fmt_str
->str
);
2032 g_string_free (fmt_str
, TRUE
);
2035 case STF_GUESS_NUMBER_DEC_COMMA
:
2036 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (",");
2037 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (".");
2038 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2039 if (decimals_if_comma
> 0) {
2040 // Don't set format if decimals is zero
2041 GString
*fmt_str
= g_string_new (NULL
);
2042 go_format_generate_number_str (fmt_str
, 1, decimals_if_comma
, seen_dot
, FALSE
, FALSE
, "", "");
2043 fmt
= go_format_new_from_XL (fmt_str
->str
);
2044 g_string_free (fmt_str
, TRUE
);
/*
 * Take a reference on the general format.
 * NOTE(review): presumably only when no format was chosen above; the
 * guarding test is not visible here.
 */
2052 fmt
= go_format_ref (go_format_general ());
2053 g_ptr_array_index (po
->formats
, col
) = fmt
;
2056 stf_parse_general_free (lines
);
2057 g_string_chunk_free (lines_chunk
);
/* Free the three helper strings together with their character data. */
2059 g_string_free (s_dot
, TRUE
);
2060 g_string_free (s_comma
, TRUE
);
2061 g_string_free (s_dollar
, TRUE
);