1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see <https://www.gnu.org/licenses/>.
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
31 #include "stf-parse.h"
32 #include "stf-export.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
42 #include "number-match.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
48 #include <goffice/goffice.h>
54 #define SETUP_LOCALE_SWITCH char *oldlocale = NULL
56 #define START_LOCALE_SWITCH if (parseoptions->locale) {\
57 oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
58 go_setlocale(LC_ALL, parseoptions->locale);}
60 #define END_LOCALE_SWITCH if (oldlocale) {\
61 go_setlocale(LC_ALL, oldlocale);\
64 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
67 char const *position
; /* Indicates the current position within data */
69 /* Used internally for fixed width parsing */
70 int splitpos
; /* Indicates current position in splitpositions array */
71 int linepos
; /* Position on the current line */
74 /* Struct used for autodiscovery */
81 * Someone made the length field of GArray an unsigned int. C does
82 * not deal very well with that when it is mixed with signed values.
85 my_garray_len (GArray
const *a
)
91 my_utf8_strchr (const char *p
, gunichar uc
)
93 return uc
< 0x7f ? strchr (p
, uc
) : g_utf8_strchr (p
, -1, uc
);
97 compare_terminator (char const *s
, StfParseOptions_t
*parseoptions
)
99 guchar
const *us
= (guchar
const *)s
;
102 if (*us
> parseoptions
->compiled_terminator
.max
||
103 *us
< parseoptions
->compiled_terminator
.min
)
106 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
107 char const *term
= l
->data
;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
130 gnm_g_string_free (GString
*s
)
132 if (s
) g_string_free (s
, TRUE
);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * The struct should, after being used, be freed with stf_parse_options_free.
142 static StfParseOptions_t
*
143 stf_parse_options_new (void)
145 StfParseOptions_t
* parseoptions
= g_new0 (StfParseOptions_t
, 1);
147 parseoptions
->parsetype
= PARSE_TYPE_NOTSET
;
149 parseoptions
->terminator
= NULL
;
150 stf_parse_options_add_line_terminator (parseoptions
, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions
, "\n");
152 stf_parse_options_add_line_terminator (parseoptions
, "\r");
154 parseoptions
->trim_spaces
= (TRIM_TYPE_RIGHT
| TRIM_TYPE_LEFT
);
155 parseoptions
->locale
= NULL
;
157 parseoptions
->splitpositions
= NULL
;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
160 parseoptions
->stringindicator
= '"';
161 parseoptions
->indicator_2x_is_single
= TRUE
;
162 parseoptions
->sep
.duplicates
= FALSE
;
163 parseoptions
->trim_seps
= FALSE
;
165 parseoptions
->sep
.str
= NULL
;
166 parseoptions
->sep
.chr
= NULL
;
168 parseoptions
->col_autofit_array
= NULL
;
169 parseoptions
->col_import_array
= NULL
;
170 parseoptions
->col_import_array_len
= 0;
171 parseoptions
->formats
= g_ptr_array_new_with_free_func ((GDestroyNotify
)go_format_unref
);
172 parseoptions
->formats_decimal
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
173 parseoptions
->formats_thousand
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
174 parseoptions
->formats_curr
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
176 parseoptions
->cols_exceeded
= FALSE
;
177 parseoptions
->rows_exceeded
= FALSE
;
178 parseoptions
->ref_count
= 1;
184 * stf_parse_options_free:
186 * will drop one reference to @parseoptions and free it when that was the last one; note that this
187 * also frees the splitpositions member (GArray) of the struct, so the caller must not free it.
190 stf_parse_options_free (StfParseOptions_t
*parseoptions
)
192 g_return_if_fail (parseoptions
!= NULL
);
194 if (parseoptions
->ref_count
-- > 1)
197 g_free (parseoptions
->col_import_array
);
198 g_free (parseoptions
->col_autofit_array
);
199 g_free (parseoptions
->locale
);
200 g_free (parseoptions
->sep
.chr
);
202 if (parseoptions
->sep
.str
) {
205 for (l
= parseoptions
->sep
.str
; l
!= NULL
; l
= l
->next
)
206 g_free ((char *) l
->data
);
207 g_slist_free (parseoptions
->sep
.str
);
210 g_array_free (parseoptions
->splitpositions
, TRUE
);
212 stf_parse_options_clear_line_terminator (parseoptions
);
214 g_ptr_array_free (parseoptions
->formats
, TRUE
);
215 g_ptr_array_free (parseoptions
->formats_decimal
, TRUE
);
216 g_ptr_array_free (parseoptions
->formats_thousand
, TRUE
);
217 g_ptr_array_free (parseoptions
->formats_curr
, TRUE
);
219 g_free (parseoptions
);
222 static StfParseOptions_t
*
223 stf_parse_options_ref (StfParseOptions_t
*parseoptions
)
225 parseoptions
->ref_count
++;
230 stf_parse_options_get_type (void)
235 t
= g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc
)stf_parse_options_ref
,
237 (GBoxedFreeFunc
)stf_parse_options_free
);
243 stf_parse_options_set_type (StfParseOptions_t
*parseoptions
, StfParseType_t
const parsetype
)
245 g_return_if_fail (parseoptions
!= NULL
);
246 g_return_if_fail (parsetype
== PARSE_TYPE_CSV
|| parsetype
== PARSE_TYPE_FIXED
);
248 parseoptions
->parsetype
= parsetype
;
252 long_string_first (gchar
const *a
, gchar
const *b
)
254 /* This actually is UTF-8 safe. */
255 return strlen (b
) - strlen (a
);
259 compile_terminators (StfParseOptions_t
*parseoptions
)
262 GO_SLIST_SORT (parseoptions
->terminator
, (GCompareFunc
)long_string_first
);
264 parseoptions
->compiled_terminator
.min
= 255;
265 parseoptions
->compiled_terminator
.max
= 0;
266 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
267 const guchar
*term
= l
->data
;
268 parseoptions
->compiled_terminator
.min
=
269 MIN (parseoptions
->compiled_terminator
.min
, *term
);
270 parseoptions
->compiled_terminator
.max
=
271 MAX (parseoptions
->compiled_terminator
.max
, *term
);
276 * stf_parse_options_add_line_terminator:
278 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
279 * this indicates the end of a row.
283 stf_parse_options_add_line_terminator (StfParseOptions_t
*parseoptions
, char const *terminator
)
285 g_return_if_fail (parseoptions
!= NULL
);
286 g_return_if_fail (terminator
!= NULL
&& *terminator
!= 0);
288 GO_SLIST_PREPEND (parseoptions
->terminator
, g_strdup (terminator
));
289 compile_terminators (parseoptions
);
293 * stf_parse_options_clear_line_terminator:
295 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
296 * this indicates the end of a row.
300 stf_parse_options_clear_line_terminator (StfParseOptions_t
*parseoptions
)
302 g_return_if_fail (parseoptions
!= NULL
);
304 g_slist_free_full (parseoptions
->terminator
, g_free
);
305 parseoptions
->terminator
= NULL
;
306 compile_terminators (parseoptions
);
310 * stf_parse_options_set_trim_spaces:
312 * If enabled will trim spaces in every parsed field on left and/or right
316 stf_parse_options_set_trim_spaces (StfParseOptions_t
*parseoptions
, StfTrimType_t
const trim_spaces
)
318 g_return_if_fail (parseoptions
!= NULL
);
320 parseoptions
->trim_spaces
= trim_spaces
;
324 * stf_parse_options_csv_set_separators:
325 * @parseoptions: #StfParseOptions_t
327 * @seps: (element-type utf8): the separators to be used
329 * A copy is made of the parameters.
332 stf_parse_options_csv_set_separators (StfParseOptions_t
*parseoptions
,
333 char const *character
,
336 g_return_if_fail (parseoptions
!= NULL
);
338 g_free (parseoptions
->sep
.chr
);
339 parseoptions
->sep
.chr
= g_strdup (character
);
341 g_slist_free_full (parseoptions
->sep
.str
, g_free
);
342 parseoptions
->sep
.str
= go_slist_map (seps
, (GOMapFunc
)g_strdup
);
346 stf_parse_options_csv_set_stringindicator (StfParseOptions_t
*parseoptions
, gunichar
const stringindicator
)
348 g_return_if_fail (parseoptions
!= NULL
);
350 parseoptions
->stringindicator
= stringindicator
;
354 * stf_parse_options_csv_set_indicator_2x_is_single:
355 * @indic_2x: a boolean value indicating whether we want to see two
356 * adjacent string indicators as a single string indicator
357 * that is part of the cell, rather than a terminator.
360 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t
*parseoptions
,
361 gboolean
const indic_2x
)
363 g_return_if_fail (parseoptions
!= NULL
);
365 parseoptions
->indicator_2x_is_single
= indic_2x
;
369 * stf_parse_options_csv_set_duplicates:
371 * @duplicates: a boolean value indicating whether we want to see two
372 * separators right behind each other as one
375 stf_parse_options_csv_set_duplicates (StfParseOptions_t
*parseoptions
, gboolean
const duplicates
)
377 g_return_if_fail (parseoptions
!= NULL
);
379 parseoptions
->sep
.duplicates
= duplicates
;
383 * stf_parse_options_csv_set_trim_seps:
384 * @trim_seps: a boolean value indicating whether we want to ignore
385 * separators at the beginning of lines
388 stf_parse_options_csv_set_trim_seps (StfParseOptions_t
*parseoptions
, gboolean
const trim_seps
)
390 g_return_if_fail (parseoptions
!= NULL
);
392 parseoptions
->trim_seps
= trim_seps
;
396 * stf_parse_options_fixed_splitpositions_clear:
398 * This will clear the splitpositions (== points on which a line is split)
401 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t
*parseoptions
)
404 g_return_if_fail (parseoptions
!= NULL
);
406 if (parseoptions
->splitpositions
)
407 g_array_free (parseoptions
->splitpositions
, TRUE
);
408 parseoptions
->splitpositions
= g_array_new (FALSE
, FALSE
, sizeof (int));
410 g_array_append_val (parseoptions
->splitpositions
, minus_one
);
414 * stf_parse_options_fixed_splitpositions_add:
416 * @position will be added to the splitpositions.
419 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t
*parseoptions
, int position
)
423 g_return_if_fail (parseoptions
!= NULL
);
424 g_return_if_fail (position
>= 0);
426 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
427 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
428 if (position
== here
)
434 g_array_insert_val (parseoptions
->splitpositions
, ui
, position
);
438 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t
*parseoptions
, int position
)
442 g_return_if_fail (parseoptions
!= NULL
);
443 g_return_if_fail (position
>= 0);
445 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
446 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
447 if (position
== here
)
448 g_array_remove_index (parseoptions
->splitpositions
, ui
);
449 if (position
<= here
)
455 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t
*parseoptions
)
457 return parseoptions
->splitpositions
->len
;
461 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t
*parseoptions
, int n
)
463 return g_array_index (parseoptions
->splitpositions
, int, n
);
468 * stf_parse_options_valid:
469 * @parseoptions: an import options struct
471 * Checks if @parseoptions is correctly filled
473 * returns : TRUE if it is correctly filled, FALSE otherwise.
476 stf_parse_options_valid (StfParseOptions_t
*parseoptions
)
478 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
480 if (parseoptions
->parsetype
== PARSE_TYPE_FIXED
) {
481 if (!parseoptions
->splitpositions
) {
482 g_warning ("STF: No splitpositions in struct");
490 /*******************************************************************************************************
491 * STF PARSE : The actual routines that do the 'trick'
492 *******************************************************************************************************/
495 trim_spaces_inplace (char *field
, StfParseOptions_t
const *parseoptions
)
499 if (parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) {
502 while (g_unichar_isspace (g_utf8_get_char (s
)))
503 s
= g_utf8_next_char (s
);
506 memmove (field
, s
, 1 + strlen (s
));
509 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
510 char *s
= field
+ strlen (field
);
513 s
= g_utf8_prev_char (s
);
514 if (!g_unichar_isspace (g_utf8_get_char (s
)))
522 * stf_parse_csv_is_separator:
524 * returns NULL if @character is not a separator, a pointer to the character
525 * after the separator otherwise.
528 stf_parse_csv_is_separator (char const *character
, char const *chr
, GSList
const *str
)
530 g_return_val_if_fail (character
!= NULL
, NULL
);
538 for (l
= str
; l
!= NULL
; l
= l
->next
) {
539 char const *s
= l
->data
;
542 glong
const len
= g_utf8_strlen (s
, -1);
544 /* Don't compare past the end of the buffer! */
545 for (r
= character
, cnt
= 0; cnt
< len
; cnt
++, r
= g_utf8_next_char (r
))
549 if ((cnt
== len
) && (memcmp (character
, s
, len
) == 0))
550 return g_utf8_offset_to_pointer (character
, len
);
554 if (chr
&& my_utf8_strchr (chr
, g_utf8_get_char (character
)))
555 return g_utf8_next_char(character
);
561 * stf_parse_eat_separators:
563 * skip over leading separators
568 stf_parse_eat_separators (Source_t
*src
, StfParseOptions_t
*parseoptions
)
570 char const *cur
, *next
;
572 g_return_if_fail (src
!= NULL
);
573 g_return_if_fail (parseoptions
!= NULL
);
577 if (*cur
== '\0' || compare_terminator (cur
, parseoptions
))
579 while ((next
= stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
)))
590 STF_CELL_FIELD_NO_SEP
,
594 static StfParseCellRes
595 stf_parse_csv_cell (GString
*text
, Source_t
*src
, StfParseOptions_t
*parseoptions
)
598 gboolean saw_sep
= FALSE
;
600 g_return_val_if_fail (src
!= NULL
, STF_CELL_ERROR
);
601 g_return_val_if_fail (parseoptions
!= NULL
, STF_CELL_ERROR
);
604 g_return_val_if_fail (cur
!= NULL
, STF_CELL_ERROR
);
606 /* Skip whitespace, but stop at line terminators. */
615 term_len
= compare_terminator (cur
, parseoptions
);
617 src
->position
= cur
+ term_len
;
621 if ((parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) == 0)
624 if (stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
,
625 parseoptions
->sep
.str
))
628 if (!g_unichar_isspace (g_utf8_get_char (cur
)))
630 cur
= g_utf8_next_char (cur
);
633 if (parseoptions
->stringindicator
!= 0 &&
634 g_utf8_get_char (cur
) == parseoptions
->stringindicator
) {
635 cur
= g_utf8_next_char (cur
);
637 gunichar uc
= g_utf8_get_char (cur
);
638 cur
= g_utf8_next_char (cur
);
640 if (uc
== parseoptions
->stringindicator
) {
641 if (parseoptions
->indicator_2x_is_single
&&
642 g_utf8_get_char (cur
) == parseoptions
->stringindicator
)
643 cur
= g_utf8_next_char (cur
);
645 /* "field content"dropped-garbage, */
646 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
647 char const *post
= stf_parse_csv_is_separator
648 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
654 cur
= g_utf8_next_char (cur
);
660 g_string_append_unichar (text
, uc
);
663 /* We silently allow a missing terminating quote. */
665 /* Unquoted field. */
667 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
669 char const *post
= stf_parse_csv_is_separator
670 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
677 g_string_append_unichar (text
, g_utf8_get_char (cur
));
678 cur
= g_utf8_next_char (cur
);
681 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
683 const char *last
= g_utf8_prev_char (text
->str
+ text
->len
);
684 if (!g_unichar_isspace (g_utf8_get_char (last
)))
686 g_string_truncate (text
, last
- text
->str
);
693 if (saw_sep
&& parseoptions
->sep
.duplicates
)
694 stf_parse_eat_separators (src
, parseoptions
);
696 return saw_sep
? STF_CELL_FIELD_SEP
: STF_CELL_FIELD_NO_SEP
;
700 * stf_parse_csv_line:
702 * This will parse one line from the current @src->position.
703 * NOTE: The calling routine is responsible for freeing the result.
705 * returns : a GPtrArray of char*'s
708 stf_parse_csv_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
711 gboolean cont
= FALSE
;
714 g_return_val_if_fail (src
!= NULL
, NULL
);
715 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
717 line
= g_ptr_array_new ();
718 if (parseoptions
->trim_seps
)
719 stf_parse_eat_separators (src
, parseoptions
);
721 text
= g_string_sized_new (30);
725 StfParseCellRes res
=
726 stf_parse_csv_cell (text
, src
, parseoptions
);
727 trim_spaces_inplace (text
->str
, parseoptions
);
728 ctext
= g_string_chunk_insert_len (src
->chunk
,
729 text
->str
, text
->len
);
730 g_string_truncate (text
, 0);
733 case STF_CELL_FIELD_NO_SEP
:
734 g_ptr_array_add (line
, ctext
);
738 case STF_CELL_FIELD_SEP
:
739 g_ptr_array_add (line
, ctext
);
740 cont
= TRUE
; /* Make sure we see one more field. */
745 g_ptr_array_add (line
, ctext
);
746 g_string_free (text
, TRUE
);
753 * stf_parse_fixed_cell:
755 * returns a pointer to the parsed cell contents.
758 stf_parse_fixed_cell (Source_t
*src
, StfParseOptions_t
*parseoptions
)
764 g_return_val_if_fail (src
!= NULL
, NULL
);
765 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
769 if (src
->splitpos
< my_garray_len (parseoptions
->splitpositions
))
770 splitval
= (int) g_array_index (parseoptions
->splitpositions
, int, src
->splitpos
);
774 while (*cur
!= 0 && !compare_terminator (cur
, parseoptions
) && splitval
!= src
->linepos
) {
776 cur
= g_utf8_next_char (cur
);
779 res
= g_string_chunk_insert_len (src
->chunk
,
781 cur
- src
->position
);
789 * stf_parse_fixed_line:
791 * This will parse one line from the current @src->position.
792 * It will return a GPtrArray with the cell contents as strings.
794 * NOTE: The calling routine is responsible for freeing result.
797 stf_parse_fixed_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
801 g_return_val_if_fail (src
!= NULL
, NULL
);
802 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
807 line
= g_ptr_array_new ();
808 while (*src
->position
!= '\0' && !compare_terminator (src
->position
, parseoptions
)) {
809 char *field
= stf_parse_fixed_cell (src
, parseoptions
);
811 trim_spaces_inplace (field
, parseoptions
);
812 g_ptr_array_add (line
, field
);
817 while (line
->len
< parseoptions
->splitpositions
->len
)
818 g_ptr_array_add (line
, g_strdup (""));
824 * stf_parse_general_free: (skip)
827 stf_parse_general_free (GPtrArray
*lines
)
830 for (lineno
= 0; lineno
< lines
->len
; lineno
++) {
831 GPtrArray
*line
= g_ptr_array_index (lines
, lineno
);
832 /* Fields are not freed here. */
834 g_ptr_array_free (line
, TRUE
);
836 g_ptr_array_free (lines
, TRUE
);
841 * stf_parse_general: (skip)
843 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
844 * GPtrArray of strings.
846 * The caller must free this entire structure, for example by calling
847 * stf_parse_general_free.
850 stf_parse_general (StfParseOptions_t
*parseoptions
,
851 GStringChunk
*lines_chunk
,
852 char const *data
, char const *data_end
)
857 char const *valid_end
= data_end
;
859 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
860 g_return_val_if_fail (data
!= NULL
, NULL
);
861 g_return_val_if_fail (data_end
!= NULL
, NULL
);
862 g_return_val_if_fail (stf_parse_options_valid (parseoptions
), NULL
);
863 g_return_val_if_fail (g_utf8_validate (data
, data_end
-data
, &valid_end
), NULL
);
865 src
.chunk
= lines_chunk
;
869 if ((data_end
-data
>= 3) && !strncmp(src
.position
, "\xEF\xBB\xBF", 3)) {
870 /* Skip over byte-order mark */
874 lines
= g_ptr_array_new ();
875 while (*src
.position
!= '\0' && src
.position
< data_end
) {
878 if (row
== GNM_MAX_ROWS
) {
879 parseoptions
->rows_exceeded
= TRUE
;
883 line
= parseoptions
->parsetype
== PARSE_TYPE_CSV
884 ? stf_parse_csv_line (&src
, parseoptions
)
885 : stf_parse_fixed_line (&src
, parseoptions
);
887 g_ptr_array_add (lines
, line
);
888 if (parseoptions
->parsetype
!= PARSE_TYPE_CSV
)
889 src
.position
+= compare_terminator (src
.position
, parseoptions
);
897 * stf_parse_lines: (skip)
898 * @parseoptions: #StfParseOptions_t
904 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
905 * GPtrArray of strings.
907 * The caller must free this entire structure, for example by calling
908 * stf_parse_general_free.
911 stf_parse_lines (StfParseOptions_t
*parseoptions
,
912 GStringChunk
*lines_chunk
,
914 int maxlines
, gboolean with_lineno
)
919 g_return_val_if_fail (data
!= NULL
, NULL
);
921 lines
= g_ptr_array_new ();
923 char const *data0
= data
;
924 GPtrArray
*line
= g_ptr_array_new ();
927 char buf
[4 * sizeof (int)];
928 sprintf (buf
, "%d", lineno
);
929 g_ptr_array_add (line
,
930 g_string_chunk_insert (lines_chunk
, buf
));
934 int termlen
= compare_terminator (data
, parseoptions
);
935 if (termlen
> 0 || *data
== 0) {
936 g_ptr_array_add (line
,
937 g_string_chunk_insert_len (lines_chunk
,
943 data
= g_utf8_next_char (data
);
946 g_ptr_array_add (lines
, line
);
949 if (lineno
>= maxlines
)
956 stf_parse_find_line (StfParseOptions_t
*parseoptions
,
961 int termlen
= compare_terminator (data
, parseoptions
);
965 } else if (*data
== 0) {
968 data
= g_utf8_next_char (data
);
976 * stf_parse_options_fixed_autodiscover:
977 * @parseoptions: a Parse options struct.
978 * @data: The actual data.
979 * @data_end: data end.
981 * Automatically try to discover columns in the text to be parsed.
982 * We ignore empty lines (only containing parseoptions->terminator)
984 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
985 * Think hard of a better more flexible solution...
988 stf_parse_options_fixed_autodiscover (StfParseOptions_t
*parseoptions
,
989 char const *data
, char const *data_end
)
991 char const *iterator
= data
;
993 GSList
*list_start
= NULL
;
995 int effective_lines
= 0;
996 int max_line_length
= 0;
997 int *line_begin_hits
= NULL
;
998 int *line_end_hits
= NULL
;
1001 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
1004 * First take a look at all possible white space combinations
1006 while (*iterator
&& iterator
< data_end
) {
1007 gboolean begin_recorded
= FALSE
;
1008 AutoDiscovery_t
*disc
= NULL
;
1012 while (*iterator
&& (termlen
= compare_terminator (iterator
, parseoptions
)) == 0) {
1013 if (!begin_recorded
&& *iterator
== ' ') {
1014 disc
= g_new0 (AutoDiscovery_t
, 1);
1016 disc
->start
= position
;
1018 begin_recorded
= TRUE
;
1019 } else if (begin_recorded
&& *iterator
!= ' ') {
1020 disc
->stop
= position
;
1021 list
= g_slist_prepend (list
, disc
);
1023 begin_recorded
= FALSE
;
1031 if (position
> max_line_length
)
1032 max_line_length
= position
;
1035 * If there are excess spaces at the end of
1036 * the line : ignore them
1041 * Hop over the terminator
1043 iterator
+= termlen
;
1051 list
= g_slist_reverse (list
);
1056 * Look at the number of hits at each line position
1057 * if the number of hits equals the number of lines
1058 * we can be pretty sure this is the start or end
1059 * of a column, we filter out empty columns
1062 line_begin_hits
= g_new0 (int, max_line_length
+ 1);
1063 line_end_hits
= g_new0 (int, max_line_length
+ 1);
1066 AutoDiscovery_t
*disc
= list
->data
;
1068 line_begin_hits
[disc
->start
]++;
1069 line_end_hits
[disc
->stop
]++;
1073 list
= g_slist_next (list
);
1075 g_slist_free (list_start
);
1077 for (i
= 0; i
< max_line_length
+ 1; i
++)
1078 if (line_begin_hits
[i
] == effective_lines
|| line_end_hits
[i
] == effective_lines
)
1079 stf_parse_options_fixed_splitpositions_add (parseoptions
, i
);
1082 * Do some corrections to the initial columns
1083 * detected here, we obviously don't need to
1084 * do this if there are no columns at all.
1086 if (my_garray_len (parseoptions
->splitpositions
) > 0) {
1088 * Try to find columns that look like :
1093 * (In other words : Columns with left & right justification with
1094 * a minimum of 2 spaces in the middle)
1095 * Split these columns in 2
1098 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1099 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1100 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1101 int num_spaces
= -1;
1102 int spaces_start
= 0;
1103 gboolean right_aligned
= TRUE
;
1104 gboolean left_aligned
= TRUE
;
1105 gboolean has_2_spaces
= TRUE
;
1109 while (*iterator
&& iterator
< data_end
) {
1110 gboolean trigger
= FALSE
;
1111 gboolean space_trigger
= FALSE
;
1116 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1118 if (*iterator
== ' ')
1119 left_aligned
= FALSE
;
1122 } else if (pos
== end
- 1) {
1123 if (*iterator
== ' ')
1124 right_aligned
= FALSE
;
1129 if (trigger
|| pos
== end
- 1) {
1130 if (!space_trigger
&& *iterator
== ' ') {
1131 space_trigger
= TRUE
;
1133 } else if (space_trigger
&& *iterator
!= ' ') {
1134 space_trigger
= FALSE
;
1135 num_spaces
= pos
- spaces_start
;
1144 has_2_spaces
= FALSE
;
1153 * If this column meets all the criteria
1154 * split it into two at the last measured
1155 * spaces_start + num_spaces
1157 if (has_2_spaces
&& right_aligned
&& left_aligned
) {
1158 int val
= (((spaces_start
+ num_spaces
) - spaces_start
) / 2) + spaces_start
;
1160 g_array_insert_val (parseoptions
->splitpositions
, i
+ 1, val
);
1163 * Skip over the inserted column
1170 * Remove empty columns here if needed
1172 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1173 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1174 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1175 gboolean only_spaces
= TRUE
;
1179 while (*iterator
&& iterator
< data_end
) {
1180 gboolean trigger
= FALSE
;
1183 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1186 else if (pos
== end
)
1190 if (*iterator
!= ' ')
1191 only_spaces
= FALSE
;
1205 * The column only contains spaces
1209 g_array_remove_index (parseoptions
->splitpositions
, i
);
1212 * We HAVE to make sure that the next column (end) also
1213 * gets checked out. If we don't decrease "i" here, we
1214 * will skip over it as the indexes shift down after
1222 g_free (line_begin_hits
);
1223 g_free (line_end_hits
);
1226 /*******************************************************************************************************
1227 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1228 * functions into something meaningful (== application specific)
1229 *******************************************************************************************************/
1232 * This is more or less as gnm_cell_set_text, except...
1233 * 1. Unknown names are not allowed.
1234 * 2. Only '=' can start an expression.
1238 stf_cell_set_text (GnmCell
*cell
, char const *text
)
1240 GnmExprTop
const *texpr
;
1242 GOFormat
const *fmt
= gnm_style_get_format (gnm_cell_get_style (cell
));
1243 const GODateConventions
*date_conv
=
1244 workbook_date_conv (cell
->base
.sheet
->workbook
);
1246 if (!go_format_is_text (fmt
) && *text
== '=' && text
[1] != 0) {
1247 GnmExprParseFlags flags
=
1248 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID
;
1249 const char *expr_start
= text
+ 1;
1252 parse_pos_init_cell (&pos
, cell
);
1253 texpr
= gnm_expr_parse_str (expr_start
, &pos
, flags
,
1257 val
= format_match (text
, fmt
, date_conv
);
1261 val
= value_new_string (text
);
1264 gnm_cell_set_value (cell
, val
);
1266 gnm_cell_set_expr (cell
, texpr
);
1267 gnm_expr_top_unref (texpr
);
1272 stf_read_remember_settings (Workbook
*book
, StfParseOptions_t
*po
)
1274 if (po
->parsetype
== PARSE_TYPE_CSV
) {
1275 GnmStfExport
*stfe
= gnm_stf_get_stfe (G_OBJECT (book
));
1277 int length
= g_unichar_to_utf8 (po
->stringindicator
, quote
);
1281 } else quote
[length
] = '\0';
1283 g_object_set (G_OBJECT (stfe
), "separator", po
->sep
.chr
, "quote", "e
, NULL
);
1285 if ((po
->terminator
!= NULL
) && (po
->terminator
->data
!= NULL
))
1286 g_object_set (G_OBJECT (stfe
), "eol", po
->terminator
->data
, NULL
);
1291 stf_parse_sheet (StfParseOptions_t
*parseoptions
,
1292 char const *data
, char const *data_end
,
1293 Sheet
*sheet
, int start_col
, int start_row
)
1297 GStringChunk
*lines_chunk
;
1299 gboolean result
= TRUE
;
1304 SETUP_LOCALE_SWITCH
;
1306 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
1307 g_return_val_if_fail (data
!= NULL
, FALSE
);
1308 g_return_val_if_fail (IS_SHEET (sheet
), FALSE
);
1311 data_end
= data
+ strlen (data
);
1313 lines_chunk
= g_string_chunk_new (100 * 1024);
1314 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1319 nformats
= parseoptions
->formats
->len
;
1320 for (lcol
= 0; lcol
< nformats
; lcol
++) {
1321 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
1324 (parseoptions
->col_import_array
== NULL
||
1325 parseoptions
->col_import_array_len
<= lcol
||
1326 parseoptions
->col_import_array
[lcol
]);
1327 if (!want_col
|| col
>= gnm_sheet_get_max_cols (sheet
))
1330 if (fmt
&& !go_format_is_general (fmt
)) {
1332 int end_row
= MIN (start_row
+ (int)lines
->len
- 1,
1333 gnm_sheet_get_last_row (sheet
));
1335 range_init (&r
, col
, start_row
, col
, end_row
);
1336 mstyle
= gnm_style_new ();
1337 gnm_style_set_format (mstyle
, fmt
);
1338 sheet_apply_style (sheet
, &r
, mstyle
);
1343 START_LOCALE_SWITCH
;
1344 for (row
= start_row
, lrow
= 0;
1345 result
&& lrow
< lines
->len
;
1349 if (row
>= gnm_sheet_get_max_rows (sheet
)) {
1350 if (!parseoptions
->rows_exceeded
) {
1351 /* FIXME: What locale? */
1352 g_warning (_("There are more rows of data than "
1353 "there is room for in the sheet. Extra "
1354 "rows will be ignored."));
1355 parseoptions
->rows_exceeded
= TRUE
;
1361 line
= g_ptr_array_index (lines
, lrow
);
1363 for (lcol
= 0; lcol
< line
->len
; lcol
++) {
1364 GOFormat
const *fmt
= lcol
< nformats
1365 ? g_ptr_array_index (parseoptions
->formats
, lcol
)
1366 : go_format_general ();
1367 char const *text
= g_ptr_array_index (line
, lcol
);
1369 (parseoptions
->col_import_array
== NULL
||
1370 parseoptions
->col_import_array_len
<= lcol
||
1371 parseoptions
->col_import_array
[lcol
]);
1375 if (col
>= gnm_sheet_get_max_cols (sheet
)) {
1376 if (!parseoptions
->cols_exceeded
) {
1377 /* FIXME: What locale? */
1378 g_warning (_("There are more columns of data than "
1379 "there is room for in the sheet. Extra "
1380 "columns will be ignored."));
1381 parseoptions
->cols_exceeded
= TRUE
;
1385 if (text
&& *text
) {
1386 GnmCell
*cell
= sheet_cell_fetch (sheet
, col
, row
);
1387 if (!go_format_is_text (fmt
) &&
1388 lcol
< parseoptions
->formats_decimal
->len
&&
1389 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
)) {
1391 GnmValue
*v
= format_match_decimal_number_with_locale
1393 g_ptr_array_index (parseoptions
->formats_curr
, lcol
),
1394 g_ptr_array_index (parseoptions
->formats_thousand
, lcol
),
1395 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
));
1397 v
= value_new_string (text
);
1398 sheet_cell_set_value (cell
, v
);
1401 stf_cell_set_text (cell
, text
);
1407 g_ptr_array_index (lines
, lrow
) = NULL
;
1408 g_ptr_array_free (line
, TRUE
);
1412 for (lcol
= 0, col
= start_col
;
1413 lcol
< parseoptions
->col_import_array_len
&& col
< gnm_sheet_get_max_cols (sheet
);
1415 if (parseoptions
->col_import_array
== NULL
||
1416 parseoptions
->col_import_array_len
<= lcol
||
1417 parseoptions
->col_import_array
[lcol
]) {
1418 if (parseoptions
->col_autofit_array
== NULL
||
1419 parseoptions
->col_autofit_array
[lcol
]) {
1420 ColRowIndexList
*list
= colrow_get_index_list (col
, col
, NULL
);
1421 ColRowStateGroup
*state
= colrow_set_sizes (sheet
, TRUE
, list
, -1, 0, -1);
1422 colrow_index_list_destroy (list
);
1423 g_slist_free (state
);
1429 g_string_chunk_free (lines_chunk
);
1431 stf_parse_general_free (lines
);
1433 stf_read_remember_settings (sheet
->workbook
, parseoptions
);
1438 stf_parse_region (StfParseOptions_t
*parseoptions
, char const *data
, char const *data_end
,
1441 static GODateConventions
const default_conv
= {FALSE
};
1442 GODateConventions
const *date_conv
= wb
? workbook_date_conv (wb
) : &default_conv
;
1445 unsigned int row
, colhigh
= 0;
1446 GStringChunk
*lines_chunk
;
1450 SETUP_LOCALE_SWITCH
;
1452 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
1453 g_return_val_if_fail (data
!= NULL
, NULL
);
1455 START_LOCALE_SWITCH
;
1457 cr
= gnm_cell_region_new (NULL
);
1460 data_end
= data
+ strlen (data
);
1461 lines_chunk
= g_string_chunk_new (100 * 1024);
1462 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1463 nformats
= parseoptions
->formats
->len
;
1464 for (row
= 0; row
< lines
->len
; row
++) {
1465 GPtrArray
*line
= g_ptr_array_index (lines
, row
);
1466 unsigned int col
, targetcol
= 0;
1467 for (col
= 0; col
< line
->len
; col
++) {
1468 if (parseoptions
->col_import_array
== NULL
||
1469 parseoptions
->col_import_array_len
<= col
||
1470 parseoptions
->col_import_array
[col
]) {
1471 const char *text
= g_ptr_array_index (line
, col
);
1473 GOFormat
*fmt
= NULL
;
1478 fmt
= g_ptr_array_index (parseoptions
->formats
, col
);
1479 v
= format_match (text
, fmt
, date_conv
);
1481 v
= value_new_string (text
);
1483 cc
= gnm_cell_copy_new (cr
, targetcol
, row
);
1487 if (targetcol
> colhigh
)
1488 colhigh
= targetcol
;
1493 stf_parse_general_free (lines
);
1494 g_string_chunk_free (lines_chunk
);
1498 cr
->cols
= (colhigh
> 0) ? colhigh
: 1;
/*
 * int_sort:
 * @a: pointer to the first int.
 * @b: pointer to the second int.
 *
 * qsort() comparison callback that orders ints ascending.
 *
 * Returns: a negative value, zero, or a positive value when *@a is
 * less than, equal to, or greater than *@b respectively.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	/* Compare instead of subtracting: a difference can overflow int. */
	return (lhs > rhs) - (lhs < rhs);
}
1511 count_character (GPtrArray
*lines
, gunichar c
, double quantile
)
1514 unsigned int lno
, cno
;
1516 if (lines
->len
== 0)
1519 counts
= g_new (int, lines
->len
);
1520 for (lno
= cno
= 0; lno
< lines
->len
; lno
++) {
1522 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1523 char const *line
= g_ptr_array_index (boxline
, 0);
1525 /* Ignore empty lines. */
1530 if (g_utf8_get_char (line
) == c
)
1532 line
= g_utf8_next_char (line
);
1535 counts
[cno
++] = count
;
1541 unsigned int qi
= (unsigned int)ceil (quantile
* cno
);
1542 qsort (counts
, cno
, sizeof (counts
[0]), int_sort
);
1554 dump_guessed_options (const StfParseOptions_t
*res
)
1557 char ubuffer
[6 + 1];
1560 g_printerr ("Guessed format:\n");
1561 switch (res
->parsetype
) {
1562 case PARSE_TYPE_CSV
:
1563 g_printerr (" type = sep\n");
1564 g_printerr (" separator = %s\n",
1565 res
->sep
.chr
? res
->sep
.chr
: "(none)");
1566 g_printerr (" see two as one = %s\n",
1567 res
->sep
.duplicates
? "yes" : "no");
1569 case PARSE_TYPE_FIXED
:
1570 g_printerr (" type = sep\n");
1575 g_printerr (" trim space = %d\n", res
->trim_spaces
);
1577 ubuffer
[g_unichar_to_utf8 (res
->stringindicator
, ubuffer
)] = 0;
1578 g_printerr (" string indicator = %s\n", ubuffer
);
1579 g_printerr (" see two as one = %s\n",
1580 res
->indicator_2x_is_single
? "yes" : "no");
1582 g_printerr (" line terminators =");
1583 for (l
= res
->terminator
; l
; l
= l
->next
) {
1584 const char *t
= l
->data
;
1585 if (strcmp (t
, "\n") == 0)
1586 g_printerr (" unix");
1587 else if (strcmp (t
, "\r") == 0)
1588 g_printerr (" mac");
1589 else if (strcmp (t
, "\r\n") == 0)
1590 g_printerr (" dos");
1592 g_printerr (" other");
1596 for (ui
= 0; ui
< res
->formats
->len
; ui
++) {
1597 GOFormat
const *fmt
= g_ptr_array_index (res
->formats
, ui
);
1598 const GString
*decimal
= ui
< res
->formats_decimal
->len
1599 ? g_ptr_array_index (res
->formats_decimal
, ui
)
1601 const GString
*thousand
= ui
< res
->formats_thousand
->len
1602 ? g_ptr_array_index (res
->formats_thousand
, ui
)
1605 g_printerr (" fmt.%d = %s\n", ui
, go_format_as_XL (fmt
));
1607 g_printerr (" fmt.%d.dec = %s\n", ui
, decimal
->str
);
1609 g_printerr (" fmt.%d.thou = %s\n", ui
, thousand
->str
);
1614 * stf_parse_options_guess:
1615 * @data: the input data.
1617 * Returns: (transfer full): the guessed options.
1620 stf_parse_options_guess (char const *data
)
1622 StfParseOptions_t
*res
;
1623 GStringChunk
*lines_chunk
;
1627 gunichar sepchar
= go_locale_get_arg_sep ();
1629 g_return_val_if_fail (data
!= NULL
, NULL
);
1631 res
= stf_parse_options_new ();
1632 lines_chunk
= g_string_chunk_new (100 * 1024);
1633 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1635 tabcount
= count_character (lines
, '\t', 0.2);
1636 sepcount
= count_character (lines
, sepchar
, 0.2);
1638 /* At least one tab per line and enough to separate every
1639 would-be sepchars. */
1640 if (tabcount
>= 1 && tabcount
>= sepcount
- 1)
1641 stf_parse_options_csv_set_separators (res
, "\t", NULL
);
1646 * Try a few more or less likely characters and pick the first
1647 * one that occurs on at least half the lines.
1649 * The order is mostly random, although ' ' and '!' which
1650 * could very easily occur in text are put last.
1652 if (count_character (lines
, (c
= sepchar
), 0.5) > 0 ||
1653 count_character (lines
, (c
= go_locale_get_col_sep ()), 0.5) > 0 ||
1654 count_character (lines
, (c
= ':'), 0.5) > 0 ||
1655 count_character (lines
, (c
= ','), 0.5) > 0 ||
1656 count_character (lines
, (c
= ';'), 0.5) > 0 ||
1657 count_character (lines
, (c
= '|'), 0.5) > 0 ||
1658 count_character (lines
, (c
= '!'), 0.5) > 0 ||
1659 count_character (lines
, (c
= ' '), 0.5) > 0) {
1661 sep
[g_unichar_to_utf8 (c
, sep
)] = 0;
1664 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1668 // For now, always separated:
1669 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1671 switch (res
->parsetype
) {
1672 case PARSE_TYPE_CSV
: {
1675 strchr (res
->sep
.chr
, ' ') != NULL
;
1678 strchr (res
->sep
.chr
, ' ') != NULL
;
1680 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1681 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1682 stf_parse_options_csv_set_duplicates (res
, dups
);
1683 stf_parse_options_csv_set_trim_seps (res
, trim
);
1685 stf_parse_options_csv_set_stringindicator (res
, '"');
1689 case PARSE_TYPE_FIXED
:
1693 g_assert_not_reached ();
1696 stf_parse_general_free (lines
);
1697 g_string_chunk_free (lines_chunk
);
1699 stf_parse_options_guess_formats (res
, data
);
1701 if (gnm_debug_flag ("stf"))
1702 dump_guessed_options (res
);
1708 * stf_parse_options_guess_csv:
1709 * @data: the CSV input data.
1711 * Returns: (transfer full): the guessed options.
1714 stf_parse_options_guess_csv (char const *data
)
1716 StfParseOptions_t
*res
;
1717 GStringChunk
*lines_chunk
;
1720 char const *quoteline
= NULL
;
1722 gunichar stringind
= '"';
1724 g_return_val_if_fail (data
!= NULL
, NULL
);
1726 res
= stf_parse_options_new ();
1727 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1728 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1729 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1730 stf_parse_options_csv_set_duplicates (res
, FALSE
);
1731 stf_parse_options_csv_set_trim_seps (res
, FALSE
);
1732 stf_parse_options_csv_set_stringindicator (res
, stringind
);
1734 lines_chunk
= g_string_chunk_new (100 * 1024);
1735 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1738 * Find a line containing a quote; skip first line unless it is
1739 * the only one. Prefer a line with the quote first.
1741 for (pass
= 1; !quoteline
&& pass
<= 2; pass
++) {
1743 for (lno
= MIN (1, lines
->len
- 1);
1744 !quoteline
&& lno
< lines
->len
;
1746 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1747 const char *line
= g_ptr_array_index (boxline
, 0);
1750 if (g_utf8_get_char (line
) == stringind
)
1754 if (my_utf8_strchr (line
, stringind
))
1762 const char *p0
= my_utf8_strchr (quoteline
, stringind
);
1766 p
= g_utf8_next_char (p
);
1767 } while (*p
&& g_utf8_get_char (p
) != stringind
);
1768 if (*p
) p
= g_utf8_next_char (p
);
1769 while (*p
&& g_unichar_isspace (g_utf8_get_char (p
)))
1770 p
= g_utf8_next_char (p
);
1772 /* Use the character after the quote. */
1773 sep
= g_strndup (p
, g_utf8_next_char (p
) - p
);
1775 /* Try to use character before the quote. */
1776 while (p0
> quoteline
&& !sep
) {
1778 p0
= g_utf8_prev_char (p0
);
1779 if (!g_unichar_isspace (g_utf8_get_char (p0
)))
1780 sep
= g_strndup (p0
, p
- p0
);
1786 sep
= g_strdup (",");
1787 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1790 stf_parse_general_free (lines
);
1791 g_string_chunk_free (lines_chunk
);
1793 stf_parse_options_guess_formats (res
, data
);
1795 if (gnm_debug_flag ("stf"))
1796 dump_guessed_options (res
);
/*
 * Bit flags for the interpretations of a column that are still
 * possible while guessing its format.
 */
typedef enum {
	/* Date component orders. */
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	/* Decimal separator candidates for numbers. */
	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER = STF_GUESS_NUMBER_DEC_POINT |
				      STF_GUESS_NUMBER_DEC_COMMA,

	/* Every flag above. */
	STF_GUESS_ALL = STF_GUESS_DATE_DMY |
			STF_GUESS_DATE_MDY |
			STF_GUESS_DATE_YMD |
			STF_GUESS_NUMBER_DEC_EITHER
} StfGuessFormats;
1814 do_check_date (const char *data
, StfGuessFormats flag
,
1815 gboolean mbd
, gboolean ybm
,
1817 GODateConventions
const *date_conv
)
1820 gboolean this_mbd
, this_ybm
;
1823 if (!(*possible
& flag
))
1826 v
= format_match_datetime (data
, date_conv
, mbd
, TRUE
, FALSE
);
1827 if (!v
|| !VALUE_FMT (v
))
1830 imbd
= go_format_month_before_day (VALUE_FMT (v
));
1831 this_mbd
= (imbd
>= 1);
1832 this_ybm
= (imbd
== 2);
1833 if (mbd
!= this_mbd
|| ybm
!= this_ybm
)
1846 do_check_number (const char *data
, StfGuessFormats flag
,
1847 const GString
*dec
, const GString
*thousand
, const GString
*curr
,
1848 unsigned *possible
, int *decimals
)
1851 GOFormatFamily family
;
1854 if (!(*possible
& flag
))
1857 v
= format_match_decimal_number_with_locale (data
, &family
, curr
, thousand
, dec
);
1861 if (*decimals
!= -2) {
1862 const char *pdec
= strstr (data
, dec
->str
);
1863 int this_decimals
= 0;
1866 while (g_ascii_isdigit (*pdec
)) {
1871 if (*decimals
== -1)
1872 *decimals
= this_decimals
;
1873 else if (*decimals
!= this_decimals
)
1877 pthou
= strstr (data
, thousand
->str
);
1880 int digits
= 0, nonzero_digits
= 0;
1881 for (p
= data
; p
< pthou
; p
= g_utf8_next_char (p
)) {
1882 if (g_unichar_isdigit (g_utf8_get_char (p
))) {
1888 // "-.222" implies that "." is not a thousands separator.
1889 // "0.222" implies that "." is not a thousands separator.
1890 // "12345,555" implies that "," is not a thousands separator.
1891 if (nonzero_digits
== 0 || digits
> 3)
1905 * stf_parse_options_guess_formats:
1906 * @data: the CSV input data.
1908 * This function attempts to recognize data formats on a column-by-column
1909 * basis under the assumption that the data in a text file will generally
1910 * use the same data formats.
1912 * This is useful because not all values give sufficient information by
1913 * themselves to tell what format the data is in. For example, "1/2/2000"
1914 * is likely to be a date in year 2000, but it is not clear if it is in
1915 * January or February. If another value in the same column is "31/1/1999"
1916 * then it is likely that the former date was in February.
1918 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1919 * A later value of "111,200.22" would clear up the confusion.
1923 stf_parse_options_guess_formats (StfParseOptions_t
*po
, char const *data
)
1925 GStringChunk
*lines_chunk
;
1927 unsigned lno
, col
, colcount
, sline
;
1928 GODateConventions
const *date_conv
= go_date_conv_from_str ("Lotus:1900");
1929 GString
*s_comma
= g_string_new (",");
1930 GString
*s_dot
= g_string_new (".");
1931 GString
*s_dollar
= g_string_new ("$");
1932 gboolean debug
= gnm_debug_flag ("stf");
1934 g_ptr_array_set_size (po
->formats
, 0);
1935 g_ptr_array_set_size (po
->formats_decimal
, 0);
1936 g_ptr_array_set_size (po
->formats_thousand
, 0);
1937 g_ptr_array_set_size (po
->formats_curr
, 0);
1939 lines_chunk
= g_string_chunk_new (100 * 1024);
1940 lines
= stf_parse_general (po
, lines_chunk
, data
, data
+ strlen (data
));
1943 for (lno
= 0; lno
< lines
->len
; lno
++) {
1944 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1945 colcount
= MAX (colcount
, line
->len
);
1948 // Ignore first line unless it is the only one
1949 sline
= MIN ((int)lines
->len
- 1, 1);
1951 g_ptr_array_set_size (po
->formats
, colcount
);
1952 g_ptr_array_set_size (po
->formats_decimal
, colcount
);
1953 g_ptr_array_set_size (po
->formats_thousand
, colcount
);
1954 g_ptr_array_set_size (po
->formats_curr
, colcount
);
1955 for (col
= 0; col
< colcount
; col
++) {
1956 unsigned possible
= STF_GUESS_ALL
;
1957 GOFormat
*fmt
= NULL
;
1958 gboolean seen_dot
= FALSE
;
1959 gboolean seen_comma
= FALSE
;
1960 int decimals_if_point
= -1; // -1: unset; -2: inconsistent; >=0: count
1961 int decimals_if_comma
= -1; // -1: unset; -2: inconsistent; >=0: count
1963 for (lno
= sline
; possible
&& lno
< lines
->len
; lno
++) {
1964 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1965 const char *data
= col
< line
->len
? g_ptr_array_index (line
, col
) : "";
1966 unsigned prev_possible
= possible
;
1968 if (*data
== 0 || data
[0] == '\'')
1971 do_check_date (data
, STF_GUESS_DATE_DMY
, FALSE
, FALSE
, &possible
, date_conv
);
1972 do_check_date (data
, STF_GUESS_DATE_MDY
, TRUE
, FALSE
, &possible
, date_conv
);
1973 do_check_date (data
, STF_GUESS_DATE_YMD
, TRUE
, TRUE
, &possible
, date_conv
);
1975 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
) {
1976 const char *pdot
= strstr (data
, s_dot
->str
);
1977 const char *pcomma
= strstr (data
, s_comma
->str
);
1978 if (pdot
&& pcomma
) {
1979 // Both -- last one is the decimal separator
1981 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1983 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1984 } else if (pdot
&& strstr (pdot
+ s_dot
->len
, s_dot
->str
)) {
1985 // Two dots so they are thousands separators
1986 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1987 } else if (pcomma
&& strstr (pcomma
+ s_comma
->len
, s_comma
->str
)) {
1988 // Two commas so they are thousands separators
1989 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1992 seen_dot
= seen_dot
|| (pdot
!= 0);
1993 seen_comma
= seen_comma
|| (pcomma
!= 0);
1995 do_check_number (data
, STF_GUESS_NUMBER_DEC_POINT
,
1996 s_dot
, s_comma
, s_dollar
,
1997 &possible
, &decimals_if_point
);
1998 do_check_number (data
, STF_GUESS_NUMBER_DEC_COMMA
,
1999 s_comma
, s_dot
, s_dollar
,
2000 &possible
, &decimals_if_comma
);
2002 if (possible
!= prev_possible
&& debug
)
2003 g_printerr ("col=%d; after [%s] possible=0x%x\n", col
, data
, possible
);
2006 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
&&
2007 !seen_dot
&& !seen_comma
) {
2008 // It doesn't matter what the separators are
2009 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
2013 case STF_GUESS_DATE_DMY
:
2014 fmt
= go_format_new_from_XL ("d-mmm-yyyy");
2016 case STF_GUESS_DATE_MDY
:
2017 fmt
= go_format_new_from_XL ("m/d/yyyy");
2019 case STF_GUESS_DATE_YMD
:
2020 fmt
= go_format_new_from_XL ("yyyy-mm-dd");
2022 case STF_GUESS_NUMBER_DEC_POINT
:
2023 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (".");
2024 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (",");
2025 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2026 if (decimals_if_point
> 0) {
2027 // Don't set format if decimals is zero
2028 GString
*fmt_str
= g_string_new (NULL
);
2029 go_format_generate_number_str (fmt_str
, 1, decimals_if_point
, seen_comma
, FALSE
, FALSE
, "", "");
2030 fmt
= go_format_new_from_XL (fmt_str
->str
);
2031 g_string_free (fmt_str
, TRUE
);
2034 case STF_GUESS_NUMBER_DEC_COMMA
:
2035 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (",");
2036 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (".");
2037 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2038 if (decimals_if_comma
> 0) {
2039 // Don't set format if decimals is zero
2040 GString
*fmt_str
= g_string_new (NULL
);
2041 go_format_generate_number_str (fmt_str
, 1, decimals_if_comma
, seen_dot
, FALSE
, FALSE
, "", "");
2042 fmt
= go_format_new_from_XL (fmt_str
->str
);
2043 g_string_free (fmt_str
, TRUE
);
2051 fmt
= go_format_ref (go_format_general ());
2052 g_ptr_array_index (po
->formats
, col
) = fmt
;
2055 stf_parse_general_free (lines
);
2056 g_string_chunk_free (lines_chunk
);
2058 g_string_free (s_dot
, TRUE
);
2059 g_string_free (s_comma
, TRUE
);
2060 g_string_free (s_dollar
, TRUE
);