1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 #include <gnumeric-config.h>
30 #include <glib/gi18n-lib.h>
32 #include "stf-parse.h"
33 #include "stf-export.h"
39 #include "clipboard.h"
40 #include "sheet-style.h"
43 #include "number-match.h"
45 #include "parse-util.h"
46 #include "number-match.h"
47 #include "gnm-format.h"
49 #include <goffice/goffice.h>
55 #define SETUP_LOCALE_SWITCH char *oldlocale = NULL
57 #define START_LOCALE_SWITCH if (parseoptions->locale) {\
58 oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
59 go_setlocale(LC_ALL, parseoptions->locale);}
61 #define END_LOCALE_SWITCH if (oldlocale) {\
62 go_setlocale(LC_ALL, oldlocale);\
65 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
68 char const *position
; /* Indicates the current position within data */
70 /* Used internally for fixed width parsing */
71 int splitpos
; /* Indicates current position in splitpositions array */
72 int linepos
; /* Position on the current line */
75 /* Struct used for autodiscovery */
82 * Some silly dude make the length field an unsigned int. C just does
83 * not deal very well with that.
86 my_garray_len (GArray
const *a
)
92 my_utf8_strchr (const char *p
, gunichar uc
)
94 return uc
< 0x7f ? strchr (p
, uc
) : g_utf8_strchr (p
, -1, uc
);
98 compare_terminator (char const *s
, StfParseOptions_t
*parseoptions
)
100 guchar
const *us
= (guchar
const *)s
;
103 if (*us
> parseoptions
->compiled_terminator
.max
||
104 *us
< parseoptions
->compiled_terminator
.min
)
107 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
108 char const *term
= l
->data
;
126 /*******************************************************************************************************
127 * STF PARSE OPTIONS : StfParseOptions related
128 *******************************************************************************************************/
131 gnm_g_string_free (GString
*s
)
133 if (s
) g_string_free (s
, TRUE
);
138 * stf_parse_options_new:
140 * This will return a new StfParseOptions_t struct.
141 * The struct should, after being used, freed with stf_parse_options_free.
143 static StfParseOptions_t
*
144 stf_parse_options_new (void)
146 StfParseOptions_t
* parseoptions
= g_new0 (StfParseOptions_t
, 1);
148 parseoptions
->parsetype
= PARSE_TYPE_NOTSET
;
150 parseoptions
->terminator
= NULL
;
151 stf_parse_options_add_line_terminator (parseoptions
, "\r\n");
152 stf_parse_options_add_line_terminator (parseoptions
, "\n");
153 stf_parse_options_add_line_terminator (parseoptions
, "\r");
155 parseoptions
->trim_spaces
= (TRIM_TYPE_RIGHT
| TRIM_TYPE_LEFT
);
156 parseoptions
->locale
= NULL
;
158 parseoptions
->splitpositions
= NULL
;
159 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
161 parseoptions
->stringindicator
= '"';
162 parseoptions
->indicator_2x_is_single
= TRUE
;
163 parseoptions
->sep
.duplicates
= FALSE
;
164 parseoptions
->trim_seps
= FALSE
;
166 parseoptions
->sep
.str
= NULL
;
167 parseoptions
->sep
.chr
= NULL
;
169 parseoptions
->col_autofit_array
= NULL
;
170 parseoptions
->col_import_array
= NULL
;
171 parseoptions
->col_import_array_len
= 0;
172 parseoptions
->formats
= g_ptr_array_new_with_free_func ((GDestroyNotify
)go_format_unref
);
173 parseoptions
->formats_decimal
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
174 parseoptions
->formats_thousand
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
175 parseoptions
->formats_curr
= g_ptr_array_new_with_free_func ((GDestroyNotify
)gnm_g_string_free
);
177 parseoptions
->cols_exceeded
= FALSE
;
178 parseoptions
->rows_exceeded
= FALSE
;
179 parseoptions
->ref_count
= 1;
185 * stf_parse_options_free:
187 * will free @parseoptions, note that this will not free the splitpositions
188 * member (GArray) of the struct, the caller is responsible for that.
191 stf_parse_options_free (StfParseOptions_t
*parseoptions
)
193 g_return_if_fail (parseoptions
!= NULL
);
195 if (parseoptions
->ref_count
-- > 1)
198 g_free (parseoptions
->col_import_array
);
199 g_free (parseoptions
->col_autofit_array
);
200 g_free (parseoptions
->locale
);
201 g_free (parseoptions
->sep
.chr
);
203 if (parseoptions
->sep
.str
) {
206 for (l
= parseoptions
->sep
.str
; l
!= NULL
; l
= l
->next
)
207 g_free ((char *) l
->data
);
208 g_slist_free (parseoptions
->sep
.str
);
211 g_array_free (parseoptions
->splitpositions
, TRUE
);
213 stf_parse_options_clear_line_terminator (parseoptions
);
215 g_ptr_array_free (parseoptions
->formats
, TRUE
);
216 g_ptr_array_free (parseoptions
->formats_decimal
, TRUE
);
217 g_ptr_array_free (parseoptions
->formats_thousand
, TRUE
);
218 g_ptr_array_free (parseoptions
->formats_curr
, TRUE
);
220 g_free (parseoptions
);
223 static StfParseOptions_t
*
224 stf_parse_options_ref (StfParseOptions_t
*parseoptions
)
226 parseoptions
->ref_count
++;
231 stf_parse_options_get_type (void)
236 t
= g_boxed_type_register_static ("StfParseOptions_t",
237 (GBoxedCopyFunc
)stf_parse_options_ref
,
238 (GBoxedFreeFunc
)stf_parse_options_free
);
244 stf_parse_options_set_type (StfParseOptions_t
*parseoptions
, StfParseType_t
const parsetype
)
246 g_return_if_fail (parseoptions
!= NULL
);
247 g_return_if_fail (parsetype
== PARSE_TYPE_CSV
|| parsetype
== PARSE_TYPE_FIXED
);
249 parseoptions
->parsetype
= parsetype
;
253 long_string_first (gchar
const *a
, gchar
const *b
)
255 /* This actually is UTF-8 safe. */
256 return strlen (b
) - strlen (a
);
260 compile_terminators (StfParseOptions_t
*parseoptions
)
263 GO_SLIST_SORT (parseoptions
->terminator
, (GCompareFunc
)long_string_first
);
265 parseoptions
->compiled_terminator
.min
= 255;
266 parseoptions
->compiled_terminator
.max
= 0;
267 for (l
= parseoptions
->terminator
; l
; l
= l
->next
) {
268 const guchar
*term
= l
->data
;
269 parseoptions
->compiled_terminator
.min
=
270 MIN (parseoptions
->compiled_terminator
.min
, *term
);
271 parseoptions
->compiled_terminator
.max
=
272 MAX (parseoptions
->compiled_terminator
.max
, *term
);
277 * stf_parse_options_add_line_terminator:
279 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
280 * this indicates the end of a row.
284 stf_parse_options_add_line_terminator (StfParseOptions_t
*parseoptions
, char const *terminator
)
286 g_return_if_fail (parseoptions
!= NULL
);
287 g_return_if_fail (terminator
!= NULL
&& *terminator
!= 0);
289 GO_SLIST_PREPEND (parseoptions
->terminator
, g_strdup (terminator
));
290 compile_terminators (parseoptions
);
294 * stf_parse_options_clear_line_terminator:
296 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
297 * this indicates the end of a row.
301 stf_parse_options_clear_line_terminator (StfParseOptions_t
*parseoptions
)
303 g_return_if_fail (parseoptions
!= NULL
);
305 g_slist_free_full (parseoptions
->terminator
, g_free
);
306 parseoptions
->terminator
= NULL
;
307 compile_terminators (parseoptions
);
311 * stf_parse_options_set_trim_spaces:
313 * If enabled will trim spaces in every parsed field on left and/or right
317 stf_parse_options_set_trim_spaces (StfParseOptions_t
*parseoptions
, StfTrimType_t
const trim_spaces
)
319 g_return_if_fail (parseoptions
!= NULL
);
321 parseoptions
->trim_spaces
= trim_spaces
;
325 * stf_parse_options_csv_set_separators:
326 * @parseoptions: #StfParseOptions_t
328 * @string: (element-type char):
330 * A copy is made of the parameters.
333 stf_parse_options_csv_set_separators (StfParseOptions_t
*parseoptions
, char const *character
,
334 GSList
const *string
)
336 g_return_if_fail (parseoptions
!= NULL
);
338 g_free (parseoptions
->sep
.chr
);
339 parseoptions
->sep
.chr
= g_strdup (character
);
341 g_slist_free_full (parseoptions
->sep
.str
, g_free
);
342 parseoptions
->sep
.str
= go_slist_map (string
, (GOMapFunc
)g_strdup
);
346 stf_parse_options_csv_set_stringindicator (StfParseOptions_t
*parseoptions
, gunichar
const stringindicator
)
348 g_return_if_fail (parseoptions
!= NULL
);
350 parseoptions
->stringindicator
= stringindicator
;
354 * stf_parse_options_csv_set_indicator_2x_is_single:
355 * @indic_2x: a boolean value indicating whether we want to see two
356 * adjacent string indicators as a single string indicator
357 * that is part of the cell, rather than a terminator.
360 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t
*parseoptions
,
361 gboolean
const indic_2x
)
363 g_return_if_fail (parseoptions
!= NULL
);
365 parseoptions
->indicator_2x_is_single
= indic_2x
;
369 * stf_parse_options_csv_set_duplicates:
371 * @duplicates: a boolean value indicating whether we want to see two
372 * separators right behind each other as one
375 stf_parse_options_csv_set_duplicates (StfParseOptions_t
*parseoptions
, gboolean
const duplicates
)
377 g_return_if_fail (parseoptions
!= NULL
);
379 parseoptions
->sep
.duplicates
= duplicates
;
383 * stf_parse_options_csv_set_trim_seps:
384 * @trim_seps: a boolean value indicating whether we want to ignore
385 * separators at the beginning of lines
388 stf_parse_options_csv_set_trim_seps (StfParseOptions_t
*parseoptions
, gboolean
const trim_seps
)
390 g_return_if_fail (parseoptions
!= NULL
);
392 parseoptions
->trim_seps
= trim_seps
;
396 * stf_parse_options_fixed_splitpositions_clear:
398 * This will clear the splitpositions (== points on which a line is split)
401 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t
*parseoptions
)
404 g_return_if_fail (parseoptions
!= NULL
);
406 if (parseoptions
->splitpositions
)
407 g_array_free (parseoptions
->splitpositions
, TRUE
);
408 parseoptions
->splitpositions
= g_array_new (FALSE
, FALSE
, sizeof (int));
410 g_array_append_val (parseoptions
->splitpositions
, minus_one
);
414 * stf_parse_options_fixed_splitpositions_add:
416 * @position will be added to the splitpositions.
419 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t
*parseoptions
, int position
)
423 g_return_if_fail (parseoptions
!= NULL
);
424 g_return_if_fail (position
>= 0);
426 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
427 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
428 if (position
== here
)
434 g_array_insert_val (parseoptions
->splitpositions
, ui
, position
);
438 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t
*parseoptions
, int position
)
442 g_return_if_fail (parseoptions
!= NULL
);
443 g_return_if_fail (position
>= 0);
445 for (ui
= 0; ui
< parseoptions
->splitpositions
->len
- 1; ui
++) {
446 int here
= g_array_index (parseoptions
->splitpositions
, int, ui
);
447 if (position
== here
)
448 g_array_remove_index (parseoptions
->splitpositions
, ui
);
449 if (position
<= here
)
455 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t
*parseoptions
)
457 return parseoptions
->splitpositions
->len
;
461 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t
*parseoptions
, int n
)
463 return g_array_index (parseoptions
->splitpositions
, int, n
);
468 * stf_parse_options_valid:
469 * @parseoptions: an import options struct
471 * Checks if @parseoptions is correctly filled
473 * returns : TRUE if it is correctly filled, FALSE otherwise.
476 stf_parse_options_valid (StfParseOptions_t
*parseoptions
)
478 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
480 if (parseoptions
->parsetype
== PARSE_TYPE_FIXED
) {
481 if (!parseoptions
->splitpositions
) {
482 g_warning ("STF: No splitpositions in struct");
490 /*******************************************************************************************************
491 * STF PARSE : The actual routines that do the 'trick'
492 *******************************************************************************************************/
495 trim_spaces_inplace (char *field
, StfParseOptions_t
const *parseoptions
)
499 if (parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) {
502 while (g_unichar_isspace (g_utf8_get_char (s
)))
503 s
= g_utf8_next_char (s
);
506 memmove (field
, s
, 1 + strlen (s
));
509 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
510 char *s
= field
+ strlen (field
);
513 s
= g_utf8_prev_char (s
);
514 if (!g_unichar_isspace (g_utf8_get_char (s
)))
522 * stf_parse_csv_is_separator:
524 * returns NULL if @character is not a separator, a pointer to the character
525 * after the separator otherwise.
528 stf_parse_csv_is_separator (char const *character
, char const *chr
, GSList
const *str
)
530 g_return_val_if_fail (character
!= NULL
, NULL
);
538 for (l
= str
; l
!= NULL
; l
= l
->next
) {
539 char const *s
= l
->data
;
542 glong
const len
= g_utf8_strlen (s
, -1);
544 /* Don't compare past the end of the buffer! */
545 for (r
= character
, cnt
= 0; cnt
< len
; cnt
++, r
= g_utf8_next_char (r
))
549 if ((cnt
== len
) && (memcmp (character
, s
, len
) == 0))
550 return g_utf8_offset_to_pointer (character
, len
);
554 if (chr
&& my_utf8_strchr (chr
, g_utf8_get_char (character
)))
555 return g_utf8_next_char(character
);
561 * stf_parse_eat_separators:
563 * skip over leading separators
568 stf_parse_eat_separators (Source_t
*src
, StfParseOptions_t
*parseoptions
)
570 char const *cur
, *next
;
572 g_return_if_fail (src
!= NULL
);
573 g_return_if_fail (parseoptions
!= NULL
);
577 if (*cur
== '\0' || compare_terminator (cur
, parseoptions
))
579 while ((next
= stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
)))
590 STF_CELL_FIELD_NO_SEP
,
594 static StfParseCellRes
595 stf_parse_csv_cell (GString
*text
, Source_t
*src
, StfParseOptions_t
*parseoptions
)
598 gboolean saw_sep
= FALSE
;
600 g_return_val_if_fail (src
!= NULL
, STF_CELL_ERROR
);
601 g_return_val_if_fail (parseoptions
!= NULL
, STF_CELL_ERROR
);
604 g_return_val_if_fail (cur
!= NULL
, STF_CELL_ERROR
);
606 /* Skip whitespace, but stop at line terminators. */
615 term_len
= compare_terminator (cur
, parseoptions
);
617 src
->position
= cur
+ term_len
;
621 if ((parseoptions
->trim_spaces
& TRIM_TYPE_LEFT
) == 0)
624 if (stf_parse_csv_is_separator (cur
, parseoptions
->sep
.chr
,
625 parseoptions
->sep
.str
))
628 if (!g_unichar_isspace (g_utf8_get_char (cur
)))
630 cur
= g_utf8_next_char (cur
);
633 if (parseoptions
->stringindicator
!= 0 &&
634 g_utf8_get_char (cur
) == parseoptions
->stringindicator
) {
635 cur
= g_utf8_next_char (cur
);
637 gunichar uc
= g_utf8_get_char (cur
);
638 cur
= g_utf8_next_char (cur
);
640 if (uc
== parseoptions
->stringindicator
) {
641 if (parseoptions
->indicator_2x_is_single
&&
642 g_utf8_get_char (cur
) == parseoptions
->stringindicator
)
643 cur
= g_utf8_next_char (cur
);
645 /* "field content"dropped-garbage, */
646 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
647 char const *post
= stf_parse_csv_is_separator
648 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
654 cur
= g_utf8_next_char (cur
);
660 g_string_append_unichar (text
, uc
);
663 /* We silently allow a missing terminating quote. */
665 /* Unquoted field. */
667 while (*cur
&& !compare_terminator (cur
, parseoptions
)) {
669 char const *post
= stf_parse_csv_is_separator
670 (cur
, parseoptions
->sep
.chr
, parseoptions
->sep
.str
);
677 g_string_append_unichar (text
, g_utf8_get_char (cur
));
678 cur
= g_utf8_next_char (cur
);
681 if (parseoptions
->trim_spaces
& TRIM_TYPE_RIGHT
) {
683 const char *last
= g_utf8_prev_char (text
->str
+ text
->len
);
684 if (!g_unichar_isspace (g_utf8_get_char (last
)))
686 g_string_truncate (text
, last
- text
->str
);
693 if (saw_sep
&& parseoptions
->sep
.duplicates
)
694 stf_parse_eat_separators (src
, parseoptions
);
696 return saw_sep
? STF_CELL_FIELD_SEP
: STF_CELL_FIELD_NO_SEP
;
700 * stf_parse_csv_line:
702 * This will parse one line from the current @src->position.
703 * NOTE: The calling routine is responsible for freeing the result.
705 * returns : a GPtrArray of char*'s
708 stf_parse_csv_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
711 gboolean cont
= FALSE
;
714 g_return_val_if_fail (src
!= NULL
, NULL
);
715 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
717 line
= g_ptr_array_new ();
718 if (parseoptions
->trim_seps
)
719 stf_parse_eat_separators (src
, parseoptions
);
721 text
= g_string_sized_new (30);
725 StfParseCellRes res
=
726 stf_parse_csv_cell (text
, src
, parseoptions
);
727 trim_spaces_inplace (text
->str
, parseoptions
);
728 ctext
= g_string_chunk_insert_len (src
->chunk
,
729 text
->str
, text
->len
);
730 g_string_truncate (text
, 0);
733 case STF_CELL_FIELD_NO_SEP
:
734 g_ptr_array_add (line
, ctext
);
738 case STF_CELL_FIELD_SEP
:
739 g_ptr_array_add (line
, ctext
);
740 cont
= TRUE
; /* Make sure we see one more field. */
745 g_ptr_array_add (line
, ctext
);
746 g_string_free (text
, TRUE
);
753 * stf_parse_fixed_cell:
755 * returns a pointer to the parsed cell contents.
758 stf_parse_fixed_cell (Source_t
*src
, StfParseOptions_t
*parseoptions
)
764 g_return_val_if_fail (src
!= NULL
, NULL
);
765 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
769 if (src
->splitpos
< my_garray_len (parseoptions
->splitpositions
))
770 splitval
= (int) g_array_index (parseoptions
->splitpositions
, int, src
->splitpos
);
774 while (*cur
!= 0 && !compare_terminator (cur
, parseoptions
) && splitval
!= src
->linepos
) {
776 cur
= g_utf8_next_char (cur
);
779 res
= g_string_chunk_insert_len (src
->chunk
,
781 cur
- src
->position
);
789 * stf_parse_fixed_line:
791 * This will parse one line from the current @src->position.
792 * It will return a GPtrArray with the cell contents as strings.
794 * NOTE: The calling routine is responsible for freeing result.
797 stf_parse_fixed_line (Source_t
*src
, StfParseOptions_t
*parseoptions
)
801 g_return_val_if_fail (src
!= NULL
, NULL
);
802 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
807 line
= g_ptr_array_new ();
808 while (*src
->position
!= '\0' && !compare_terminator (src
->position
, parseoptions
)) {
809 char *field
= stf_parse_fixed_cell (src
, parseoptions
);
811 trim_spaces_inplace (field
, parseoptions
);
812 g_ptr_array_add (line
, field
);
817 while (line
->len
< parseoptions
->splitpositions
->len
)
818 g_ptr_array_add (line
, g_strdup (""));
824 * stf_parse_general_free: (skip)
827 stf_parse_general_free (GPtrArray
*lines
)
830 for (lineno
= 0; lineno
< lines
->len
; lineno
++) {
831 GPtrArray
*line
= g_ptr_array_index (lines
, lineno
);
832 /* Fields are not freed here. */
834 g_ptr_array_free (line
, TRUE
);
836 g_ptr_array_free (lines
, TRUE
);
841 * stf_parse_general: (skip)
843 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
844 * GPtrArray of strings.
846 * The caller must free this entire structure, for example by calling
847 * stf_parse_general_free.
850 stf_parse_general (StfParseOptions_t
*parseoptions
,
851 GStringChunk
*lines_chunk
,
852 char const *data
, char const *data_end
)
857 char const *valid_end
= data_end
;
859 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
860 g_return_val_if_fail (data
!= NULL
, NULL
);
861 g_return_val_if_fail (data_end
!= NULL
, NULL
);
862 g_return_val_if_fail (stf_parse_options_valid (parseoptions
), NULL
);
863 g_return_val_if_fail (g_utf8_validate (data
, data_end
-data
, &valid_end
), NULL
);
865 src
.chunk
= lines_chunk
;
869 if ((data_end
-data
>= 3) && !strncmp(src
.position
, "\xEF\xBB\xBF", 3)) {
870 /* Skip over byte-order mark */
874 lines
= g_ptr_array_new ();
875 while (*src
.position
!= '\0' && src
.position
< data_end
) {
878 if (row
== GNM_MAX_ROWS
) {
879 parseoptions
->rows_exceeded
= TRUE
;
883 line
= parseoptions
->parsetype
== PARSE_TYPE_CSV
884 ? stf_parse_csv_line (&src
, parseoptions
)
885 : stf_parse_fixed_line (&src
, parseoptions
);
887 g_ptr_array_add (lines
, line
);
888 if (parseoptions
->parsetype
!= PARSE_TYPE_CSV
)
889 src
.position
+= compare_terminator (src
.position
, parseoptions
);
897 * stf_parse_lines: (skip)
898 * @parseoptions: #StfParseOptions_t
904 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
905 * GPtrArray of strings.
907 * The caller must free this entire structure, for example by calling
908 * stf_parse_general_free.
911 stf_parse_lines (StfParseOptions_t
*parseoptions
,
912 GStringChunk
*lines_chunk
,
914 int maxlines
, gboolean with_lineno
)
919 g_return_val_if_fail (data
!= NULL
, NULL
);
921 lines
= g_ptr_array_new ();
923 char const *data0
= data
;
924 GPtrArray
*line
= g_ptr_array_new ();
927 char buf
[4 * sizeof (int)];
928 sprintf (buf
, "%d", lineno
);
929 g_ptr_array_add (line
,
930 g_string_chunk_insert (lines_chunk
, buf
));
934 int termlen
= compare_terminator (data
, parseoptions
);
935 if (termlen
> 0 || *data
== 0) {
936 g_ptr_array_add (line
,
937 g_string_chunk_insert_len (lines_chunk
,
943 data
= g_utf8_next_char (data
);
946 g_ptr_array_add (lines
, line
);
949 if (lineno
>= maxlines
)
956 stf_parse_find_line (StfParseOptions_t
*parseoptions
,
961 int termlen
= compare_terminator (data
, parseoptions
);
965 } else if (*data
== 0) {
968 data
= g_utf8_next_char (data
);
976 * stf_parse_options_fixed_autodiscover:
977 * @parseoptions: a Parse options struct.
978 * @data: The actual data.
979 * @data_end: data end.
981 * Automatically try to discover columns in the text to be parsed.
982 * We ignore empty lines (only containing parseoptions->terminator)
984 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
985 * Think hard of a better more flexible solution...
988 stf_parse_options_fixed_autodiscover (StfParseOptions_t
*parseoptions
,
989 char const *data
, char const *data_end
)
991 char const *iterator
= data
;
993 GSList
*list_start
= NULL
;
995 int effective_lines
= 0;
996 int max_line_length
= 0;
997 int *line_begin_hits
= NULL
;
998 int *line_end_hits
= NULL
;
1001 stf_parse_options_fixed_splitpositions_clear (parseoptions
);
1004 * First take a look at all possible white space combinations
1006 while (*iterator
&& iterator
< data_end
) {
1007 gboolean begin_recorded
= FALSE
;
1008 AutoDiscovery_t
*disc
= NULL
;
1012 while (*iterator
&& (termlen
= compare_terminator (iterator
, parseoptions
)) == 0) {
1013 if (!begin_recorded
&& *iterator
== ' ') {
1014 disc
= g_new0 (AutoDiscovery_t
, 1);
1016 disc
->start
= position
;
1018 begin_recorded
= TRUE
;
1019 } else if (begin_recorded
&& *iterator
!= ' ') {
1020 disc
->stop
= position
;
1021 list
= g_slist_prepend (list
, disc
);
1023 begin_recorded
= FALSE
;
1031 if (position
> max_line_length
)
1032 max_line_length
= position
;
1035 * If there are excess spaces at the end of
1036 * the line : ignore them
1041 * Hop over the terminator
1043 iterator
+= termlen
;
1051 list
= g_slist_reverse (list
);
1056 * Look at the number of hits at each line position
1057 * if the number of hits equals the number of lines
1058 * we can be pretty sure this is the start or end
1059 * of a column, we filter out empty columns
1062 line_begin_hits
= g_new0 (int, max_line_length
+ 1);
1063 line_end_hits
= g_new0 (int, max_line_length
+ 1);
1066 AutoDiscovery_t
*disc
= list
->data
;
1068 line_begin_hits
[disc
->start
]++;
1069 line_end_hits
[disc
->stop
]++;
1073 list
= g_slist_next (list
);
1075 g_slist_free (list_start
);
1077 for (i
= 0; i
< max_line_length
+ 1; i
++)
1078 if (line_begin_hits
[i
] == effective_lines
|| line_end_hits
[i
] == effective_lines
)
1079 stf_parse_options_fixed_splitpositions_add (parseoptions
, i
);
1082 * Do some corrections to the initial columns
1083 * detected here, we obviously don't need to
1084 * do this if there are no columns at all.
1086 if (my_garray_len (parseoptions
->splitpositions
) > 0) {
1088 * Try to find columns that look like :
1093 * (In other words : Columns with left & right justification with
1094 * a minimum of 2 spaces in the middle)
1095 * Split these columns in 2
1098 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1099 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1100 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1101 int num_spaces
= -1;
1102 int spaces_start
= 0;
1103 gboolean right_aligned
= TRUE
;
1104 gboolean left_aligned
= TRUE
;
1105 gboolean has_2_spaces
= TRUE
;
1109 while (*iterator
&& iterator
< data_end
) {
1110 gboolean trigger
= FALSE
;
1111 gboolean space_trigger
= FALSE
;
1116 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1118 if (*iterator
== ' ')
1119 left_aligned
= FALSE
;
1122 } else if (pos
== end
- 1) {
1123 if (*iterator
== ' ')
1124 right_aligned
= FALSE
;
1129 if (trigger
|| pos
== end
- 1) {
1130 if (!space_trigger
&& *iterator
== ' ') {
1131 space_trigger
= TRUE
;
1133 } else if (space_trigger
&& *iterator
!= ' ') {
1134 space_trigger
= FALSE
;
1135 num_spaces
= pos
- spaces_start
;
1144 has_2_spaces
= FALSE
;
1153 * If this column meets all the criteria
1154 * split it into two at the last measured
1155 * spaces_start + num_spaces
1157 if (has_2_spaces
&& right_aligned
&& left_aligned
) {
1158 int val
= (((spaces_start
+ num_spaces
) - spaces_start
) / 2) + spaces_start
;
1160 g_array_insert_val (parseoptions
->splitpositions
, i
+ 1, val
);
1163 * Skip over the inserted column
1170 * Remove empty columns here if needed
1172 for (i
= 0; i
< my_garray_len (parseoptions
->splitpositions
) - 1; i
++) {
1173 int begin
= g_array_index (parseoptions
->splitpositions
, int, i
);
1174 int end
= g_array_index (parseoptions
->splitpositions
, int, i
+ 1);
1175 gboolean only_spaces
= TRUE
;
1179 while (*iterator
&& iterator
< data_end
) {
1180 gboolean trigger
= FALSE
;
1183 while (*iterator
&& !compare_terminator (iterator
, parseoptions
)) {
1186 else if (pos
== end
)
1190 if (*iterator
!= ' ')
1191 only_spaces
= FALSE
;
1205 * The column only contains spaces
1209 g_array_remove_index (parseoptions
->splitpositions
, i
);
1212 * We HAVE to make sure that the next column (end) also
1213 * gets checked out. If we don't decrease "i" here, we
1214 * will skip over it as the indexes shift down after
1222 g_free (line_begin_hits
);
1223 g_free (line_end_hits
);
1226 /*******************************************************************************************************
1227 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1228 * functions into something meaningful (== application specific)
1229 *******************************************************************************************************/
1232 * This is more or less as gnm_cell_set_text, except...
1233 * 1. Unknown names are not allowed.
1234 * 2. Only '=' can start an expression.
1238 stf_cell_set_text (GnmCell
*cell
, char const *text
)
1240 GnmExprTop
const *texpr
;
1242 GOFormat
const *fmt
= gnm_style_get_format (gnm_cell_get_style (cell
));
1243 const GODateConventions
*date_conv
=
1244 workbook_date_conv (cell
->base
.sheet
->workbook
);
1246 if (!go_format_is_text (fmt
) && *text
== '=' && text
[1] != 0) {
1247 GnmExprParseFlags flags
=
1248 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID
;
1249 const char *expr_start
= text
+ 1;
1252 parse_pos_init_cell (&pos
, cell
);
1253 texpr
= gnm_expr_parse_str (expr_start
, &pos
, flags
,
1257 val
= format_match (text
, fmt
, date_conv
);
1261 val
= value_new_string (text
);
1264 gnm_cell_set_value (cell
, val
);
1266 gnm_cell_set_expr (cell
, texpr
);
1267 gnm_expr_top_unref (texpr
);
1272 stf_read_remember_settings (Workbook
*book
, StfParseOptions_t
*po
)
1274 if (po
->parsetype
== PARSE_TYPE_CSV
) {
1275 GnmStfExport
*stfe
= gnm_stf_get_stfe (G_OBJECT (book
));
1277 int length
= g_unichar_to_utf8 (po
->stringindicator
, quote
);
1281 } else quote
[length
] = '\0';
1283 g_object_set (G_OBJECT (stfe
), "separator", po
->sep
.chr
, "quote", "e
, NULL
);
1285 if ((po
->terminator
!= NULL
) && (po
->terminator
->data
!= NULL
))
1286 g_object_set (G_OBJECT (stfe
), "eol", po
->terminator
->data
, NULL
);
1291 stf_parse_sheet (StfParseOptions_t
*parseoptions
,
1292 char const *data
, char const *data_end
,
1293 Sheet
*sheet
, int start_col
, int start_row
)
1297 GStringChunk
*lines_chunk
;
1299 gboolean result
= TRUE
;
1303 SETUP_LOCALE_SWITCH
;
1305 g_return_val_if_fail (parseoptions
!= NULL
, FALSE
);
1306 g_return_val_if_fail (data
!= NULL
, FALSE
);
1307 g_return_val_if_fail (IS_SHEET (sheet
), FALSE
);
1310 data_end
= data
+ strlen (data
);
1312 lines_chunk
= g_string_chunk_new (100 * 1024);
1313 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1318 for (lcol
= 0; lcol
< parseoptions
->formats
->len
; lcol
++) {
1319 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
1322 (parseoptions
->col_import_array
== NULL
||
1323 parseoptions
->col_import_array_len
<= lcol
||
1324 parseoptions
->col_import_array
[lcol
]);
1325 if (!want_col
|| col
>= gnm_sheet_get_max_cols (sheet
))
1328 if (fmt
&& !go_format_is_general (fmt
)) {
1330 int end_row
= MIN (start_row
+ (int)lines
->len
- 1,
1331 gnm_sheet_get_last_row (sheet
));
1333 range_init (&r
, col
, start_row
, col
, end_row
);
1334 mstyle
= gnm_style_new ();
1335 gnm_style_set_format (mstyle
, fmt
);
1336 sheet_apply_style (sheet
, &r
, mstyle
);
1341 START_LOCALE_SWITCH
;
1342 for (row
= start_row
, lrow
= 0;
1343 result
&& lrow
< lines
->len
;
1347 if (row
>= gnm_sheet_get_max_rows (sheet
)) {
1348 if (!parseoptions
->rows_exceeded
) {
1349 /* FIXME: What locale? */
1350 g_warning (_("There are more rows of data than "
1351 "there is room for in the sheet. Extra "
1352 "rows will be ignored."));
1353 parseoptions
->rows_exceeded
= TRUE
;
1359 line
= g_ptr_array_index (lines
, lrow
);
1361 for (lcol
= 0; lcol
< line
->len
; lcol
++) {
1362 GOFormat
const *fmt
= g_ptr_array_index (parseoptions
->formats
, lcol
);
1363 char const *text
= g_ptr_array_index (line
, lcol
);
1365 (parseoptions
->col_import_array
== NULL
||
1366 parseoptions
->col_import_array_len
<= lcol
||
1367 parseoptions
->col_import_array
[lcol
]);
1371 if (col
>= gnm_sheet_get_max_cols (sheet
)) {
1372 if (!parseoptions
->cols_exceeded
) {
1373 /* FIXME: What locale? */
1374 g_warning (_("There are more columns of data than "
1375 "there is room for in the sheet. Extra "
1376 "columns will be ignored."));
1377 parseoptions
->cols_exceeded
= TRUE
;
1381 if (text
&& *text
) {
1382 GnmCell
*cell
= sheet_cell_fetch (sheet
, col
, row
);
1383 if (!go_format_is_text (fmt
) &&
1384 lcol
< parseoptions
->formats_decimal
->len
&&
1385 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
)) {
1387 GnmValue
*v
= format_match_decimal_number_with_locale
1389 g_ptr_array_index (parseoptions
->formats_curr
, lcol
),
1390 g_ptr_array_index (parseoptions
->formats_thousand
, lcol
),
1391 g_ptr_array_index (parseoptions
->formats_decimal
, lcol
));
1393 v
= value_new_string (text
);
1394 sheet_cell_set_value (cell
, v
);
1397 stf_cell_set_text (cell
, text
);
1403 g_ptr_array_index (lines
, lrow
) = NULL
;
1404 g_ptr_array_free (line
, TRUE
);
1408 for (lcol
= 0, col
= start_col
;
1409 lcol
< parseoptions
->col_import_array_len
&& col
< gnm_sheet_get_max_cols (sheet
);
1411 if (parseoptions
->col_import_array
== NULL
||
1412 parseoptions
->col_import_array_len
<= lcol
||
1413 parseoptions
->col_import_array
[lcol
]) {
1414 if (parseoptions
->col_autofit_array
== NULL
||
1415 parseoptions
->col_autofit_array
[lcol
]) {
1416 ColRowIndexList
*list
= colrow_get_index_list (col
, col
, NULL
);
1417 ColRowStateGroup
*state
= colrow_set_sizes (sheet
, TRUE
, list
, -1, 0, -1);
1418 colrow_index_list_destroy (list
);
1419 g_slist_free (state
);
1425 g_string_chunk_free (lines_chunk
);
1427 stf_parse_general_free (lines
);
1429 stf_read_remember_settings (sheet
->workbook
, parseoptions
);
/*
 * stf_parse_region: split the text starting at data into lines and fields
 * with stf_parse_general () and record one value per imported field in a
 * region created by gnm_cell_region_new ().
 *
 * Both g_return_val_if_fail () guards yield NULL when parseoptions or data
 * is NULL.
 *
 * NOTE(review): the return type, any parameter after data_end, and the
 * declarations of cr, lines, nformats, v and cc are not visible in this
 * excerpt.  wb is presumably one of the missing parameters -- confirm
 * against the full file.
 */
1434 stf_parse_region (StfParseOptions_t
*parseoptions
, char const *data
, char const *data_end
,
/* Fallback date conventions, used below when wb is NULL. */
1437 static GODateConventions
const default_conv
= {FALSE
};
1438 GODateConventions
const *date_conv
= wb
? workbook_date_conv (wb
) : &default_conv
;
1441 unsigned int row
, colhigh
= 0;
1442 GStringChunk
*lines_chunk
;
/* Declares the oldlocale variable used by the locale-switch macros. */
1446 SETUP_LOCALE_SWITCH
;
1448 g_return_val_if_fail (parseoptions
!= NULL
, NULL
);
1449 g_return_val_if_fail (data
!= NULL
, NULL
);
/* Switches LC_ALL to parseoptions->locale when one is set. */
1451 START_LOCALE_SWITCH
;
1453 cr
= gnm_cell_region_new (NULL
);
/* NOTE(review): any condition guarding this assignment is not visible here. */
1456 data_end
= data
+ strlen (data
);
/* lines_chunk is handed to stf_parse_general () and released after the loop. */
1457 lines_chunk
= g_string_chunk_new (100 * 1024);
1458 lines
= stf_parse_general (parseoptions
, lines_chunk
, data
, data_end
);
1459 nformats
= parseoptions
->formats
->len
;
1460 for (row
= 0; row
< lines
->len
; row
++) {
1461 GPtrArray
*line
= g_ptr_array_index (lines
, row
);
/* col indexes the parsed fields; targetcol is the column used in the region. */
1462 unsigned int col
, targetcol
= 0;
1463 for (col
= 0; col
< line
->len
; col
++) {
/*
 * A field is imported unless col_import_array exists, covers col,
 * and holds a false entry for it.
 */
1464 if (parseoptions
->col_import_array
== NULL
||
1465 parseoptions
->col_import_array_len
<= col
||
1466 parseoptions
->col_import_array
[col
]) {
1467 const char *text
= g_ptr_array_index (line
, col
);
1469 GOFormat
*fmt
= NULL
;
/*
 * NOTE(review): presumably guarded by a bounds check against nformats;
 * the guard is not visible in this excerpt.
 */
1474 fmt
= g_ptr_array_index (parseoptions
->formats
, col
);
1475 v
= format_match (text
, fmt
, date_conv
);
/*
 * NOTE(review): presumably the fallback used when format_match ()
 * yields no value -- the condition is not visible here.
 */
1477 v
= value_new_string (text
);
1479 cc
= gnm_cell_copy_new (cr
, targetcol
, row
);
/* Remember the largest targetcol seen; it becomes cr->cols below. */
1483 if (targetcol
> colhigh
)
1484 colhigh
= targetcol
;
1489 stf_parse_general_free (lines
);
1490 g_string_chunk_free (lines_chunk
);
/* A region in which colhigh stayed 0 is still reported as one column wide. */
1494 cr
->cols
= (colhigh
> 0) ? colhigh
: 1;
/*
 * int_sort: qsort () comparator that orders two ints ascending.
 *
 * @a, @b: pointers to the int elements being compared.
 *
 * Returns a negative value, zero, or a positive value when *a is less
 * than, equal to, or greater than *b.
 *
 * The result is built from two comparisons instead of the subtraction
 * *a - *b, because that subtraction is a signed overflow (undefined
 * behaviour) when the operands are far apart, e.g. INT_MIN and 1.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * count_character: walk the lines in @lines, looking only at field 0 of
 * each one and comparing its UTF-8 characters with @c.  One integer per
 * line that is not skipped is stored in counts; the stored values are then
 * sorted ascending with int_sort (), and @quantile times the number of
 * stored values, rounded up, is computed as the index qi.
 *
 * NOTE(review): the return type, the declarations of counts and count, the
 * loop that walks each line, and everything after the qsort () call are
 * not visible in this excerpt, so how qi selects the result cannot be
 * confirmed from here.
 */
1507 count_character (GPtrArray
*lines
, gunichar c
, double quantile
)
1510 unsigned int lno
, cno
;
/* NOTE(review): the statement executed for an empty @lines is not visible here. */
1512 if (lines
->len
== 0)
/* One slot per line suffices: cno grows at most once per loop iteration. */
1515 counts
= g_new (int, lines
->len
);
/* lno indexes input lines; cno indexes the slots of counts filled so far. */
1516 for (lno
= cno
= 0; lno
< lines
->len
; lno
++) {
1518 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1519 char const *line
= g_ptr_array_index (boxline
, 0);
1521 /* Ignore empty lines. */
/* Compare one UTF-8 character at a time, then advance to the next one. */
1526 if (g_utf8_get_char (line
) == c
)
1528 line
= g_utf8_next_char (line
);
1531 counts
[cno
++] = count
;
/*
 * ceil (quantile * cno) equals cno when quantile is 1.
 * NOTE(review): confirm how the full file keeps qi inside counts.
 */
1537 unsigned int qi
= (unsigned int)ceil (quantile
* cno
);
1538 qsort (counts
, cno
, sizeof (counts
[0]), int_sort
);
/*
 * dump_guessed_options: debugging aid that prints the fields of @res with
 * g_printerr (): the parse type, separator settings, trim flags, string
 * indicator, line terminators, and one entry per element of res->formats
 * together with its decimal and thousands strings.
 *
 * NOTE(review): the return type and the declarations of l and ui are not
 * visible in this excerpt.
 */
1550 dump_guessed_options (const StfParseOptions_t
*res
)
/* g_unichar_to_utf8 () writes at most 6 bytes; the extra byte holds the terminator. */
1553 char ubuffer
[6 + 1];
1556 g_printerr ("Guessed format:\n");
1557 switch (res
->parsetype
) {
1558 case PARSE_TYPE_CSV
:
1559 g_printerr (" type = sep\n");
1560 g_printerr (" separator = %s\n",
1561 res
->sep
.chr
? res
->sep
.chr
: "(none)");
1562 g_printerr (" see two as one = %s\n",
1563 res
->sep
.duplicates
? "yes" : "no");
/*
 * NOTE(review): this branch prints the same type = sep text as the
 * PARSE_TYPE_CSV branch above.  That looks like a copy/paste slip; the
 * word fixed was probably intended.
 */
1565 case PARSE_TYPE_FIXED
:
1566 g_printerr (" type = sep\n");
1571 g_printerr (" trim space = %d\n", res
->trim_spaces
);
/* Encode the string indicator as UTF-8 and terminate it at the returned length. */
1573 ubuffer
[g_unichar_to_utf8 (res
->stringindicator
, ubuffer
)] = 0;
1574 g_printerr (" string indicator = %s\n", ubuffer
);
1575 g_printerr (" see two as one = %s\n",
1576 res
->indicator_2x_is_single
? "yes" : "no");
1578 g_printerr (" line terminators =");
/* Name each terminator string in the list by the convention it matches. */
1579 for (l
= res
->terminator
; l
; l
= l
->next
) {
1580 const char *t
= l
->data
;
1581 if (strcmp (t
, "\n") == 0)
1582 g_printerr (" unix");
1583 else if (strcmp (t
, "\r") == 0)
1584 g_printerr (" mac");
1585 else if (strcmp (t
, "\r\n") == 0)
1586 g_printerr (" dos");
1588 g_printerr (" other");
/*
 * formats_decimal and formats_thousand are only indexed when ui is
 * below their own length.
 * NOTE(review): the alternative of each conditional expression and any
 * guard before the ->str dereferences are not visible here.  Also, ui is
 * printed with %d; if it is declared unsigned, %u would be the matching
 * conversion.
 */
1592 for (ui
= 0; ui
< res
->formats
->len
; ui
++) {
1593 GOFormat
const *fmt
= g_ptr_array_index (res
->formats
, ui
);
1594 const GString
*decimal
= ui
< res
->formats_decimal
->len
1595 ? g_ptr_array_index (res
->formats_decimal
, ui
)
1597 const GString
*thousand
= ui
< res
->formats_thousand
->len
1598 ? g_ptr_array_index (res
->formats_thousand
, ui
)
1601 g_printerr (" fmt.%d = %s\n", ui
, go_format_as_XL (fmt
));
1603 g_printerr (" fmt.%d.dec = %s\n", ui
, decimal
->str
);
1605 g_printerr (" fmt.%d.thou = %s\n", ui
, thousand
->str
);
1610 * stf_parse_options_guess:
1611 * @data: the input data.
1613 * Returns: (transfer full): the guessed options.
/*
 * Outline of stf_parse_options_guess, as far as this excerpt shows it:
 *  1. create fresh options and split @data with stf_parse_lines ();
 *  2. pick a separator: tab when tabs are frequent enough, otherwise the
 *     first candidate character that count_character () reports with a
 *     positive count;
 *  3. force the parse type to PARSE_TYPE_CSV and set the CSV trim,
 *     duplicate and string-indicator options;
 *  4. free the temporary lines, run stf_parse_options_guess_formats (),
 *     and dump the result when the stf debug flag is set.
 * A NULL @data makes g_return_val_if_fail () yield NULL.
 *
 * NOTE(review): the return type and the declarations of lines, tabcount,
 * sepcount, c, sep, dups and trim are not visible in this excerpt.
 */
1616 stf_parse_options_guess (char const *data
)
1618 StfParseOptions_t
*res
;
1619 GStringChunk
*lines_chunk
;
1623 gunichar sepchar
= go_locale_get_arg_sep ();
1625 g_return_val_if_fail (data
!= NULL
, NULL
);
1627 res
= stf_parse_options_new ();
1628 lines_chunk
= g_string_chunk_new (100 * 1024);
/*
 * NOTE(review): 1000 and FALSE are presumably a line limit and a
 * line-number flag; the prototype of stf_parse_lines () is not visible
 * here -- confirm.
 */
1629 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
/* Both counts use 0.2 as the quantile argument of count_character (). */
1631 tabcount
= count_character (lines
, '\t', 0.2);
1632 sepcount
= count_character (lines
, sepchar
, 0.2);
1634 /* At least one tab per line and enough to separate every
1635 would-be sepchars. */
1636 if (tabcount
>= 1 && tabcount
>= sepcount
- 1)
1637 stf_parse_options_csv_set_separators (res
, "\t", NULL
);
1642 * Try a few more or less likely characters and pick the first
1643 * one that occurs on at least half the lines.
1645 * The order is mostly random, although ' ' and '!' which
1646 * could very easily occur in text are put last.
/*
 * Each call assigns its candidate to c before counting, so once the
 * short-circuit chain stops, c holds the first candidate whose count
 * is positive.
 */
1648 if (count_character (lines
, (c
= sepchar
), 0.5) > 0 ||
1649 count_character (lines
, (c
= go_locale_get_col_sep ()), 0.5) > 0 ||
1650 count_character (lines
, (c
= ':'), 0.5) > 0 ||
1651 count_character (lines
, (c
= ','), 0.5) > 0 ||
1652 count_character (lines
, (c
= ';'), 0.5) > 0 ||
1653 count_character (lines
, (c
= '|'), 0.5) > 0 ||
1654 count_character (lines
, (c
= '!'), 0.5) > 0 ||
1655 count_character (lines
, (c
= ' '), 0.5) > 0) {
/* Write c into sep as UTF-8 and terminate it at the returned length. */
1657 sep
[g_unichar_to_utf8 (c
, sep
)] = 0;
1660 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1664 // For now, always separated:
1665 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1667 switch (res
->parsetype
) {
1668 case PARSE_TYPE_CSV
: {
/*
 * NOTE(review): the left-hand sides of the next two expressions are not
 * visible; dups and trim, used below, are presumably the targets.  Both
 * test whether the separator string contains a space.
 */
1671 strchr (res
->sep
.chr
, ' ') != NULL
;
1674 strchr (res
->sep
.chr
, ' ') != NULL
;
1676 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1677 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1678 stf_parse_options_csv_set_duplicates (res
, dups
);
1679 stf_parse_options_csv_set_trim_seps (res
, trim
);
1681 stf_parse_options_csv_set_stringindicator (res
, '"');
1685 case PARSE_TYPE_FIXED
:
/* NOTE(review): the label that this assertion belongs to is not visible here. */
1689 g_assert_not_reached ();
/* The sampled lines are only needed for separator guessing. */
1692 stf_parse_general_free (lines
);
1693 g_string_chunk_free (lines_chunk
);
1695 stf_parse_options_guess_formats (res
, data
);
1697 if (gnm_debug_flag ("stf"))
1698 dump_guessed_options (res
);
1704 * stf_parse_options_guess_csv:
1705 * @data: the CSV input data.
1707 * Returns: (transfer full): the guessed options.
/*
 * Outline of stf_parse_options_guess_csv, as far as this excerpt shows it:
 *  1. create options preset for CSV: trim on both sides, doubled string
 *     indicator treated as single, no duplicate or trimmed separators,
 *     and stringind as the string indicator;
 *  2. split @data with stf_parse_lines () and search the lines, in up to
 *     two passes, for one that contains stringind (quoteline);
 *  3. derive the separator from the character after the quoted text or,
 *     failing that, from a non-space character before the quote;
 *  4. install the separator, free the temporary lines, run
 *     stf_parse_options_guess_formats (), and dump the result when the
 *     stf debug flag is set.
 * A NULL @data makes g_return_val_if_fail () yield NULL.
 *
 * NOTE(review): the return type, the declarations of lines, pass, lno, sep
 * and p, and most branch bodies are not visible in this excerpt.
 */
1710 stf_parse_options_guess_csv (char const *data
)
1712 StfParseOptions_t
*res
;
1713 GStringChunk
*lines_chunk
;
1716 char const *quoteline
= NULL
;
1718 gunichar stringind
= '"';
1720 g_return_val_if_fail (data
!= NULL
, NULL
);
1722 res
= stf_parse_options_new ();
1723 stf_parse_options_set_type (res
, PARSE_TYPE_CSV
);
1724 stf_parse_options_set_trim_spaces (res
, TRIM_TYPE_LEFT
| TRIM_TYPE_RIGHT
);
1725 stf_parse_options_csv_set_indicator_2x_is_single (res
, TRUE
);
1726 stf_parse_options_csv_set_duplicates (res
, FALSE
);
1727 stf_parse_options_csv_set_trim_seps (res
, FALSE
);
1728 stf_parse_options_csv_set_stringindicator (res
, stringind
);
1730 lines_chunk
= g_string_chunk_new (100 * 1024);
1731 lines
= stf_parse_lines (res
, lines_chunk
, data
, 1000, FALSE
);
1734 * Find a line containing a quote; skip first line unless it is
1735 * the only one. Prefer a line with the quote first.
/* Both loops stop as soon as quoteline has been set. */
1737 for (pass
= 1; !quoteline
&& pass
<= 2; pass
++) {
/*
 * Start at index 1 when there are at least two lines and at index 0
 * when there is exactly one.
 */
1739 for (lno
= MIN (1, lines
->len
- 1);
1740 !quoteline
&& lno
< lines
->len
;
1742 GPtrArray
*boxline
= g_ptr_array_index (lines
, lno
);
1743 const char *line
= g_ptr_array_index (boxline
, 0);
/*
 * First test: the line starts with stringind.  Second test: the line
 * contains stringind anywhere.
 * NOTE(review): which pass uses which test is not visible here.
 */
1746 if (g_utf8_get_char (line
) == stringind
)
1750 if (my_utf8_strchr (line
, stringind
))
/* p0 is the position of the first stringind in the chosen line. */
1758 const char *p0
= my_utf8_strchr (quoteline
, stringind
);
/* Advance p until the next stringind or the end of the string. */
1762 p
= g_utf8_next_char (p
);
1763 } while (*p
&& g_utf8_get_char (p
) != stringind
);
/* Step over that stringind, then over any whitespace after it. */
1764 if (*p
) p
= g_utf8_next_char (p
);
1765 while (*p
&& g_unichar_isspace (g_utf8_get_char (p
)))
1766 p
= g_utf8_next_char (p
);
1768 /* Use the character after the quote. */
1769 sep
= g_strndup (p
, g_utf8_next_char (p
) - p
);
1771 /* Try to use character before the quote. */
1772 while (p0
> quoteline
&& !sep
) {
1774 p0
= g_utf8_prev_char (p0
);
1775 if (!g_unichar_isspace (g_utf8_get_char (p0
)))
1776 sep
= g_strndup (p0
, p
- p0
);
/*
 * NOTE(review): presumably the default used when no separator was
 * found -- the condition is not visible here.
 */
1782 sep
= g_strdup (",");
1783 stf_parse_options_csv_set_separators (res
, sep
, NULL
);
1786 stf_parse_general_free (lines
);
1787 g_string_chunk_free (lines_chunk
);
1789 stf_parse_options_guess_formats (res
, data
);
1791 if (gnm_debug_flag ("stf"))
1792 dump_guessed_options (res
);
/*
 * Bit flags for the interpretations that format guessing can still
 * consider for a column; do_check_date () and do_check_number () test
 * their flag argument against a mask of these bits.
 */
/* Date layouts: one bit each. */
1798 STF_GUESS_DATE_DMY
= 1,
1799 STF_GUESS_DATE_MDY
= 2,
1800 STF_GUESS_DATE_YMD
= 4,
/* Number layouts: one bit per decimal separator; EITHER is both bits (0x10 | 0x20). */
1802 STF_GUESS_NUMBER_DEC_POINT
= 0x10,
1803 STF_GUESS_NUMBER_DEC_COMMA
= 0x20,
1804 STF_GUESS_NUMBER_DEC_EITHER
= 0x30,
/* Every bit above: the three date bits (0x07) plus both number bits (0x30). */
1806 STF_GUESS_ALL
= 0x37
/*
 * do_check_date: test whether @data fits the date layout named by @flag.
 * @mbd and @ybm describe the expected layout; the text is matched with
 * format_match_datetime () and the layout of the matched format, as
 * reported by go_format_month_before_day (), is compared with them.
 *
 * NOTE(review): the return type, the parameter that declares possible, the
 * declarations of v and imbd, and the statement executed on each failing
 * test are not visible in this excerpt.  Presumably a failure removes
 * @flag from *possible -- confirm against the full file.
 */
1810 do_check_date (const char *data
, StfGuessFormats flag
,
1811 gboolean mbd
, gboolean ybm
,
1813 GODateConventions
const *date_conv
)
1816 gboolean this_mbd
, this_ybm
;
/* Nothing to test when @flag is no longer set in *possible. */
1819 if (!(*possible
& flag
))
1822 v
= format_match_datetime (data
, date_conv
, mbd
, TRUE
, FALSE
);
/* No match, or a match that carries no format. */
1823 if (!v
|| !VALUE_FMT (v
))
/* imbd of 1 or more is treated as month-before-day; exactly 2 as year-before-month. */
1826 imbd
= go_format_month_before_day (VALUE_FMT (v
));
1827 this_mbd
= (imbd
>= 1);
1828 this_ybm
= (imbd
== 2);
/* The matched layout differs from the one being tested. */
1829 if (mbd
!= this_mbd
|| ybm
!= this_ybm
)
/*
 * do_check_number: test whether @data fits the number layout named by
 * @flag, using @dec as decimal separator, @thousand as thousands separator
 * and @curr as currency string.
 *
 * @possible: mask of layouts still under consideration.
 * @decimals: running count of decimals for the column.  The callers seen
 *            in this file start it at -1 (unset) and use -2 for
 *            inconsistent, per the comments on their declarations.
 *
 * NOTE(review): the return type, the declarations of v, pthou and p, and
 * the statement executed on each failing test are not visible in this
 * excerpt.
 */
1842 do_check_number (const char *data
, StfGuessFormats flag
,
1843 const GString
*dec
, const GString
*thousand
, const GString
*curr
,
1844 unsigned *possible
, int *decimals
)
1847 GOFormatFamily family
;
/* Nothing to test when @flag is no longer set in *possible. */
1850 if (!(*possible
& flag
))
1853 v
= format_match_decimal_number_with_locale (data
, &family
, curr
, thousand
, dec
);
/* Decimal counting is skipped once the count is marked as -2. */
1857 if (*decimals
!= -2) {
1858 const char *pdec
= strstr (data
, dec
->str
);
1859 int this_decimals
= 0;
/*
 * NOTE(review): the lines that position pdec before this loop, and any
 * check that strstr () found a separator, are not visible here.
 */
1862 while (g_ascii_isdigit (*pdec
)) {
/* The first value sets the count; a later value is compared with it. */
1867 if (*decimals
== -1)
1868 *decimals
= this_decimals
;
1869 else if (*decimals
!= this_decimals
)
1873 pthou
= strstr (data
, thousand
->str
);
/* Examine the characters that precede the first thousands separator. */
1876 int digits
= 0, nonzero_digits
= 0;
1877 for (p
= data
; p
< pthou
; p
= g_utf8_next_char (p
)) {
1878 if (g_unichar_isdigit (g_utf8_get_char (p
))) {
1884 // "-.222" implies that "." is not a thousands separator.
1885 // "0.222" implies that "." is not a thousands separator.
1886 // "12345,555" implies that "," is not a thousands separator.
1887 if (nonzero_digits
== 0 || digits
> 3)
1901 * stf_parse_options_guess_formats:
1902 * @data: the CSV input data.
1904 * This function attempts to recognize data formats on a column-by-column
1905 * basis under the assumption that the data in a text file will generally
1906 * use the same data formats.
1908 * This is useful because not all values give sufficient information by
1909 * themselves to tell what format the data is in. For example, "1/2/2000"
1910 * is likely to be a date in year 2000, but it is not clear if it is in
1911 * January or February. If another value in the same column is "31/1/1999"
1912 * then it is likely that the former date was in February.
1914 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1915 * A later value of "111,200.22" would clear up the confusion.
1919 stf_parse_options_guess_formats (StfParseOptions_t
*po
, char const *data
)
1921 GStringChunk
*lines_chunk
;
1923 unsigned lno
, col
, colcount
, sline
;
1924 GODateConventions
const *date_conv
= go_date_conv_from_str ("Lotus:1900");
1925 GString
*s_comma
= g_string_new (",");
1926 GString
*s_dot
= g_string_new (".");
1927 GString
*s_dollar
= g_string_new ("$");
1928 gboolean debug
= gnm_debug_flag ("stf");
1930 g_ptr_array_set_size (po
->formats
, 0);
1931 g_ptr_array_set_size (po
->formats_decimal
, 0);
1932 g_ptr_array_set_size (po
->formats_thousand
, 0);
1933 g_ptr_array_set_size (po
->formats_curr
, 0);
1935 lines_chunk
= g_string_chunk_new (100 * 1024);
1936 lines
= stf_parse_general (po
, lines_chunk
, data
, data
+ strlen (data
));
1939 for (lno
= 0; lno
< lines
->len
; lno
++) {
1940 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1941 colcount
= MAX (colcount
, line
->len
);
1944 // Ignore first line unless it is the only one
1945 sline
= MIN ((int)lines
->len
- 1, 1);
1947 g_ptr_array_set_size (po
->formats
, colcount
);
1948 g_ptr_array_set_size (po
->formats_decimal
, colcount
);
1949 g_ptr_array_set_size (po
->formats_thousand
, colcount
);
1950 g_ptr_array_set_size (po
->formats_curr
, colcount
);
1951 for (col
= 0; col
< colcount
; col
++) {
1952 unsigned possible
= STF_GUESS_ALL
;
1953 GOFormat
*fmt
= NULL
;
1954 gboolean seen_dot
= FALSE
;
1955 gboolean seen_comma
= FALSE
;
1956 int decimals_if_point
= -1; // -1: unset; -2: inconsistent; >=0: count
1957 int decimals_if_comma
= -1; // -1: unset; -2: inconsistent; >=0: count
1959 for (lno
= sline
; possible
&& lno
< lines
->len
; lno
++) {
1960 GPtrArray
*line
= g_ptr_array_index (lines
, lno
);
1961 const char *data
= col
< line
->len
? g_ptr_array_index (line
, col
) : "";
1962 unsigned prev_possible
= possible
;
1964 if (*data
== 0 || data
[0] == '\'')
1967 do_check_date (data
, STF_GUESS_DATE_DMY
, FALSE
, FALSE
, &possible
, date_conv
);
1968 do_check_date (data
, STF_GUESS_DATE_MDY
, TRUE
, FALSE
, &possible
, date_conv
);
1969 do_check_date (data
, STF_GUESS_DATE_YMD
, TRUE
, TRUE
, &possible
, date_conv
);
1971 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
) {
1972 const char *pdot
= strstr (data
, s_dot
->str
);
1973 const char *pcomma
= strstr (data
, s_comma
->str
);
1974 if (pdot
&& pcomma
) {
1975 // Both -- last one is the decimal separator
1977 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1979 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1980 } else if (pdot
&& strstr (pdot
+ s_dot
->len
, s_dot
->str
)) {
1981 // Two dots so they are thousands separators
1982 possible
&= ~STF_GUESS_NUMBER_DEC_POINT
;
1983 } else if (pcomma
&& strstr (pcomma
+ s_comma
->len
, s_comma
->str
)) {
1984 // Two commas so they are thousands separators
1985 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
1988 seen_dot
= seen_dot
|| (pdot
!= 0);
1989 seen_comma
= seen_comma
|| (pcomma
!= 0);
1991 do_check_number (data
, STF_GUESS_NUMBER_DEC_POINT
,
1992 s_dot
, s_comma
, s_dollar
,
1993 &possible
, &decimals_if_point
);
1994 do_check_number (data
, STF_GUESS_NUMBER_DEC_COMMA
,
1995 s_comma
, s_dot
, s_dollar
,
1996 &possible
, &decimals_if_comma
);
1998 if (possible
!= prev_possible
&& debug
)
1999 g_printerr ("col=%d; after [%s] possible=0x%x\n", col
, data
, possible
);
2002 if ((possible
& STF_GUESS_NUMBER_DEC_EITHER
) == STF_GUESS_NUMBER_DEC_EITHER
&&
2003 !seen_dot
&& !seen_comma
) {
2004 // It doesn't matter what the separators are
2005 possible
&= ~STF_GUESS_NUMBER_DEC_COMMA
;
2009 case STF_GUESS_DATE_DMY
:
2010 fmt
= go_format_new_from_XL ("d-mmm-yyyy");
2012 case STF_GUESS_DATE_MDY
:
2013 fmt
= go_format_new_from_XL ("m/d/yyyy");
2015 case STF_GUESS_DATE_YMD
:
2016 fmt
= go_format_new_from_XL ("yyyy-mm-dd");
2018 case STF_GUESS_NUMBER_DEC_POINT
:
2019 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (".");
2020 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (",");
2021 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2022 if (decimals_if_point
> 0) {
2023 // Don't set format if decimals is zero
2024 GString
*fmt_str
= g_string_new (NULL
);
2025 go_format_generate_number_str (fmt_str
, 1, decimals_if_point
, seen_comma
, FALSE
, FALSE
, "", "");
2026 fmt
= go_format_new_from_XL (fmt_str
->str
);
2027 g_string_free (fmt_str
, TRUE
);
2030 case STF_GUESS_NUMBER_DEC_COMMA
:
2031 g_ptr_array_index (po
->formats_decimal
, col
) = g_string_new (",");
2032 g_ptr_array_index (po
->formats_thousand
, col
) = g_string_new (".");
2033 g_ptr_array_index (po
->formats_curr
, col
) = g_string_new (s_dollar
->str
);
2034 if (decimals_if_comma
> 0) {
2035 // Don't set format if decimals is zero
2036 GString
*fmt_str
= g_string_new (NULL
);
2037 go_format_generate_number_str (fmt_str
, 1, decimals_if_comma
, seen_dot
, FALSE
, FALSE
, "", "");
2038 fmt
= go_format_new_from_XL (fmt_str
->str
);
2039 g_string_free (fmt_str
, TRUE
);
2047 fmt
= go_format_ref (go_format_general ());
2048 g_ptr_array_index (po
->formats
, col
) = fmt
;
2051 stf_parse_general_free (lines
);
2052 g_string_chunk_free (lines_chunk
);
2054 g_string_free (s_dot
, TRUE
);
2055 g_string_free (s_comma
, TRUE
);
2056 g_string_free (s_dollar
, TRUE
);