Introspection fixes
[gnumeric.git] / src / stf-parse.c
blob8e5d7e8edfd39d097aad2fb5cec37173feb3fe24
1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see <https://www.gnu.org/licenses/>.
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
30 #include "gnumeric.h"
31 #include "stf-parse.h"
32 #include "stf-export.h"
34 #include "workbook.h"
35 #include "cell.h"
36 #include "sheet.h"
37 #include "expr.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
40 #include "value.h"
41 #include "mstyle.h"
42 #include "number-match.h"
43 #include "gutils.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
47 #include "ranges.h"
48 #include <goffice/goffice.h>
50 #include <stdlib.h>
51 #include <locale.h>
52 #include <string.h>
/*
 * Helpers for temporarily switching the locale while parsing.
 *
 * SETUP_LOCALE_SWITCH declares the local "oldlocale" used by the other two.
 * START_LOCALE_SWITCH saves the current LC_ALL locale and installs
 * parseoptions->locale, but only when that field is set.
 * END_LOCALE_SWITCH restores and frees the saved locale, if one was saved.
 *
 * All three expect a variable named "parseoptions" to be in scope at the
 * point of use.
 */
#define SETUP_LOCALE_SWITCH char *oldlocale = NULL

#define START_LOCALE_SWITCH if (parseoptions->locale) {\
oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
go_setlocale(LC_ALL, parseoptions->locale);}

#define END_LOCALE_SWITCH if (oldlocale) {\
go_setlocale(LC_ALL, oldlocale);\
g_free (oldlocale);}
64 /* Source_t struct, used for interchanging parsing information between the low level parse functions */
65 typedef struct {
66 GStringChunk *chunk;
67 char const *position; /* Indicates the current position within data */
69 /* Used internally for fixed width parsing */
70 int splitpos; /* Indicates current position in splitpositions array */
71 int linepos; /* Position on the current line */
72 } Source_t;
/*
 * AutoDiscovery_t: one run of spaces seen on a line during fixed-width
 * column autodiscovery.
 */
typedef struct {
	int start;	/* Line position of the first space of the run */
	int stop;	/* Line position of the first non-space after the run */
} AutoDiscovery_t;
81 * Some silly dude make the length field an unsigned int. C just does
82 * not deal very well with that.
84 static inline int
85 my_garray_len (GArray const *a)
87 return (int)a->len;
90 static char *
91 my_utf8_strchr (const char *p, gunichar uc)
93 return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
96 static int
97 compare_terminator (char const *s, StfParseOptions_t *parseoptions)
99 guchar const *us = (guchar const *)s;
100 GSList *l;
102 if (*us > parseoptions->compiled_terminator.max ||
103 *us < parseoptions->compiled_terminator.min)
104 return 0;
106 for (l = parseoptions->terminator; l; l = l->next) {
107 char const *term = l->data;
108 char const *d = s;
110 while (*term) {
111 if (*d != *term)
112 goto next;
113 term++;
114 d++;
116 return d - s;
118 next:
121 return 0;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
129 static void
130 gnm_g_string_free (GString *s)
132 if (s) g_string_free (s, TRUE);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * The struct should, after being used, freed with stf_parse_options_free.
142 static StfParseOptions_t *
143 stf_parse_options_new (void)
145 StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
147 parseoptions->parsetype = PARSE_TYPE_NOTSET;
149 parseoptions->terminator = NULL;
150 stf_parse_options_add_line_terminator (parseoptions, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions, "\n");
152 stf_parse_options_add_line_terminator (parseoptions, "\r");
154 parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
155 parseoptions->locale = NULL;
157 parseoptions->splitpositions = NULL;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions);
160 parseoptions->stringindicator = '"';
161 parseoptions->indicator_2x_is_single = TRUE;
162 parseoptions->sep.duplicates = FALSE;
163 parseoptions->trim_seps = FALSE;
165 parseoptions->sep.str = NULL;
166 parseoptions->sep.chr = NULL;
168 parseoptions->col_autofit_array = NULL;
169 parseoptions->col_import_array = NULL;
170 parseoptions->col_import_array_len = 0;
171 parseoptions->formats = g_ptr_array_new_with_free_func ((GDestroyNotify)go_format_unref);
172 parseoptions->formats_decimal = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
173 parseoptions->formats_thousand = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
174 parseoptions->formats_curr = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
176 parseoptions->cols_exceeded = FALSE;
177 parseoptions->rows_exceeded = FALSE;
178 parseoptions->ref_count = 1;
180 return parseoptions;
184 * stf_parse_options_free:
186 * will free @parseoptions, note that this will not free the splitpositions
187 * member (GArray) of the struct, the caller is responsible for that.
189 void
190 stf_parse_options_free (StfParseOptions_t *parseoptions)
192 g_return_if_fail (parseoptions != NULL);
194 if (parseoptions->ref_count-- > 1)
195 return;
197 g_free (parseoptions->col_import_array);
198 g_free (parseoptions->col_autofit_array);
199 g_free (parseoptions->locale);
200 g_free (parseoptions->sep.chr);
202 if (parseoptions->sep.str) {
203 GSList *l;
205 for (l = parseoptions->sep.str; l != NULL; l = l->next)
206 g_free ((char *) l->data);
207 g_slist_free (parseoptions->sep.str);
210 g_array_free (parseoptions->splitpositions, TRUE);
212 stf_parse_options_clear_line_terminator (parseoptions);
214 g_ptr_array_free (parseoptions->formats, TRUE);
215 g_ptr_array_free (parseoptions->formats_decimal, TRUE);
216 g_ptr_array_free (parseoptions->formats_thousand, TRUE);
217 g_ptr_array_free (parseoptions->formats_curr, TRUE);
219 g_free (parseoptions);
222 static StfParseOptions_t *
223 stf_parse_options_ref (StfParseOptions_t *parseoptions)
225 parseoptions->ref_count++;
226 return parseoptions;
229 GType
230 stf_parse_options_get_type (void)
232 static GType t = 0;
234 if (t == 0) {
235 t = g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc)stf_parse_options_ref,
237 (GBoxedFreeFunc)stf_parse_options_free);
239 return t;
242 void
243 stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
245 g_return_if_fail (parseoptions != NULL);
246 g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);
248 parseoptions->parsetype = parsetype;
251 static gint
252 long_string_first (gchar const *a, gchar const *b)
254 /* This actually is UTF-8 safe. */
255 return strlen (b) - strlen (a);
258 static void
259 compile_terminators (StfParseOptions_t *parseoptions)
261 GSList *l;
262 GO_SLIST_SORT (parseoptions->terminator, (GCompareFunc)long_string_first);
264 parseoptions->compiled_terminator.min = 255;
265 parseoptions->compiled_terminator.max = 0;
266 for (l = parseoptions->terminator; l; l = l->next) {
267 const guchar *term = l->data;
268 parseoptions->compiled_terminator.min =
269 MIN (parseoptions->compiled_terminator.min, *term);
270 parseoptions->compiled_terminator.max =
271 MAX (parseoptions->compiled_terminator.max, *term);
276 * stf_parse_options_add_line_terminator:
278 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
279 * this indicates the end of a row.
282 void
283 stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
285 g_return_if_fail (parseoptions != NULL);
286 g_return_if_fail (terminator != NULL && *terminator != 0);
288 GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
289 compile_terminators (parseoptions);
293 * stf_parse_options_clear_line_terminator:
295 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
296 * this indicates the end of a row.
299 void
300 stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
302 g_return_if_fail (parseoptions != NULL);
304 g_slist_free_full (parseoptions->terminator, g_free);
305 parseoptions->terminator = NULL;
306 compile_terminators (parseoptions);
310 * stf_parse_options_set_trim_spaces:
312 * If enabled will trim spaces in every parsed field on left and/or right
313 * sides.
315 void
316 stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
318 g_return_if_fail (parseoptions != NULL);
320 parseoptions->trim_spaces = trim_spaces;
324 * stf_parse_options_csv_set_separators:
325 * @parseoptions: #StfParseOptions_t
326 * @character:
327 * @seps: (element-type utf8): the separators to be used
329 * A copy is made of the parameters.
331 void
332 stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions,
333 char const *character,
334 GSList const *seps)
336 g_return_if_fail (parseoptions != NULL);
338 g_free (parseoptions->sep.chr);
339 parseoptions->sep.chr = g_strdup (character);
341 g_slist_free_full (parseoptions->sep.str, g_free);
342 parseoptions->sep.str = go_slist_map (seps, (GOMapFunc)g_strdup);
345 void
346 stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
348 g_return_if_fail (parseoptions != NULL);
350 parseoptions->stringindicator = stringindicator;
354 * stf_parse_options_csv_set_indicator_2x_is_single:
355 * @indic_2x: a boolean value indicating whether we want to see two
356 * adjacent string indicators as a single string indicator
357 * that is part of the cell, rather than a terminator.
359 void
360 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
361 gboolean const indic_2x)
363 g_return_if_fail (parseoptions != NULL);
365 parseoptions->indicator_2x_is_single = indic_2x;
369 * stf_parse_options_csv_set_duplicates:
370 * @parseoptions:
371 * @duplicates: a boolean value indicating whether we want to see two
372 * separators right behind each other as one
374 void
375 stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
377 g_return_if_fail (parseoptions != NULL);
379 parseoptions->sep.duplicates = duplicates;
383 * stf_parse_options_csv_set_trim_seps:
384 * @trim_seps: a boolean value indicating whether we want to ignore
385 * separators at the beginning of lines
387 void
388 stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
390 g_return_if_fail (parseoptions != NULL);
392 parseoptions->trim_seps = trim_seps;
396 * stf_parse_options_fixed_splitpositions_clear:
398 * This will clear the splitpositions (== points on which a line is split)
400 void
401 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
403 int minus_one = -1;
404 g_return_if_fail (parseoptions != NULL);
406 if (parseoptions->splitpositions)
407 g_array_free (parseoptions->splitpositions, TRUE);
408 parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));
410 g_array_append_val (parseoptions->splitpositions, minus_one);
414 * stf_parse_options_fixed_splitpositions_add:
416 * @position will be added to the splitpositions.
418 void
419 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
421 unsigned int ui;
423 g_return_if_fail (parseoptions != NULL);
424 g_return_if_fail (position >= 0);
426 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
427 int here = g_array_index (parseoptions->splitpositions, int, ui);
428 if (position == here)
429 return;
430 if (position < here)
431 break;
434 g_array_insert_val (parseoptions->splitpositions, ui, position);
437 void
438 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
440 unsigned int ui;
442 g_return_if_fail (parseoptions != NULL);
443 g_return_if_fail (position >= 0);
445 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
446 int here = g_array_index (parseoptions->splitpositions, int, ui);
447 if (position == here)
448 g_array_remove_index (parseoptions->splitpositions, ui);
449 if (position <= here)
450 return;
455 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
457 return parseoptions->splitpositions->len;
461 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
463 return g_array_index (parseoptions->splitpositions, int, n);
468 * stf_parse_options_valid:
469 * @parseoptions: an import options struct
471 * Checks if @parseoptions is correctly filled
473 * returns : TRUE if it is correctly filled, FALSE otherwise.
475 static gboolean
476 stf_parse_options_valid (StfParseOptions_t *parseoptions)
478 g_return_val_if_fail (parseoptions != NULL, FALSE);
480 if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
481 if (!parseoptions->splitpositions) {
482 g_warning ("STF: No splitpositions in struct");
483 return FALSE;
487 return TRUE;
490 /*******************************************************************************************************
491 * STF PARSE : The actual routines that do the 'trick'
492 *******************************************************************************************************/
494 static void
495 trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
497 if (!field) return;
499 if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
500 char *s = field;
502 while (g_unichar_isspace (g_utf8_get_char (s)))
503 s = g_utf8_next_char (s);
505 if (s != field)
506 memmove (field, s, 1 + strlen (s));
509 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
510 char *s = field + strlen (field);
512 while (field != s) {
513 s = g_utf8_prev_char (s);
514 if (!g_unichar_isspace (g_utf8_get_char (s)))
515 break;
516 *s = 0;
522 * stf_parse_csv_is_separator:
524 * returns NULL if @character is not a separator, a pointer to the character
525 * after the separator otherwise.
527 static char const *
528 stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
530 g_return_val_if_fail (character != NULL, NULL);
532 if (*character == 0)
533 return NULL;
535 if (str) {
536 GSList const *l;
538 for (l = str; l != NULL; l = l->next) {
539 char const *s = l->data;
540 char const *r;
541 glong cnt;
542 glong const len = g_utf8_strlen (s, -1);
544 /* Don't compare past the end of the buffer! */
545 for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
546 if (*r == '\0')
547 break;
549 if ((cnt == len) && (memcmp (character, s, len) == 0))
550 return g_utf8_offset_to_pointer (character, len);
554 if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
555 return g_utf8_next_char(character);
557 return NULL;
561 * stf_parse_eat_separators:
563 * skip over leading separators
567 static void
568 stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
570 char const *cur, *next;
572 g_return_if_fail (src != NULL);
573 g_return_if_fail (parseoptions != NULL);
575 cur = src->position;
577 if (*cur == '\0' || compare_terminator (cur, parseoptions))
578 return;
579 while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
580 cur = next;
581 src->position = cur;
582 return;
/* Result of parsing one CSV cell with stf_parse_csv_cell. */
typedef enum {
	STF_CELL_ERROR,		/* An argument check failed */
	STF_CELL_EOF,		/* End of data reached; no field parsed */
	STF_CELL_EOL,		/* Line terminator consumed; no field parsed */
	STF_CELL_FIELD_NO_SEP,	/* Field parsed, not ended by a separator */
	STF_CELL_FIELD_SEP	/* Field parsed, ended by a separator */
} StfParseCellRes;
594 static StfParseCellRes
595 stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
597 char const *cur;
598 gboolean saw_sep = FALSE;
600 g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
601 g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
603 cur = src->position;
604 g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
606 /* Skip whitespace, but stop at line terminators. */
607 while (1) {
608 int term_len;
610 if (*cur == 0) {
611 src->position = cur;
612 return STF_CELL_EOF;
615 term_len = compare_terminator (cur, parseoptions);
616 if (term_len) {
617 src->position = cur + term_len;
618 return STF_CELL_EOL;
621 if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
622 break;
624 if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
625 parseoptions->sep.str))
626 break;
628 if (!g_unichar_isspace (g_utf8_get_char (cur)))
629 break;
630 cur = g_utf8_next_char (cur);
633 if (parseoptions->stringindicator != 0 &&
634 g_utf8_get_char (cur) == parseoptions->stringindicator) {
635 cur = g_utf8_next_char (cur);
636 while (*cur) {
637 gunichar uc = g_utf8_get_char (cur);
638 cur = g_utf8_next_char (cur);
640 if (uc == parseoptions->stringindicator) {
641 if (parseoptions->indicator_2x_is_single &&
642 g_utf8_get_char (cur) == parseoptions->stringindicator)
643 cur = g_utf8_next_char (cur);
644 else {
645 /* "field content"dropped-garbage, */
646 while (*cur && !compare_terminator (cur, parseoptions)) {
647 char const *post = stf_parse_csv_is_separator
648 (cur, parseoptions->sep.chr, parseoptions->sep.str);
649 if (post) {
650 cur = post;
651 saw_sep = TRUE;
652 break;
654 cur = g_utf8_next_char (cur);
656 break;
660 g_string_append_unichar (text, uc);
663 /* We silently allow a missing terminating quote. */
664 } else {
665 /* Unquoted field. */
667 while (*cur && !compare_terminator (cur, parseoptions)) {
669 char const *post = stf_parse_csv_is_separator
670 (cur, parseoptions->sep.chr, parseoptions->sep.str);
671 if (post) {
672 cur = post;
673 saw_sep = TRUE;
674 break;
677 g_string_append_unichar (text, g_utf8_get_char (cur));
678 cur = g_utf8_next_char (cur);
681 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
682 while (text->len) {
683 const char *last = g_utf8_prev_char (text->str + text->len);
684 if (!g_unichar_isspace (g_utf8_get_char (last)))
685 break;
686 g_string_truncate (text, last - text->str);
691 src->position = cur;
693 if (saw_sep && parseoptions->sep.duplicates)
694 stf_parse_eat_separators (src, parseoptions);
696 return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
700 * stf_parse_csv_line:
702 * This will parse one line from the current @src->position.
703 * NOTE: The calling routine is responsible for freeing the result.
705 * returns : a GPtrArray of char*'s
707 static GPtrArray *
708 stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
710 GPtrArray *line;
711 gboolean cont = FALSE;
712 GString *text;
714 g_return_val_if_fail (src != NULL, NULL);
715 g_return_val_if_fail (parseoptions != NULL, NULL);
717 line = g_ptr_array_new ();
718 if (parseoptions->trim_seps)
719 stf_parse_eat_separators (src, parseoptions);
721 text = g_string_sized_new (30);
723 while (1) {
724 char *ctext;
725 StfParseCellRes res =
726 stf_parse_csv_cell (text, src, parseoptions);
727 trim_spaces_inplace (text->str, parseoptions);
728 ctext = g_string_chunk_insert_len (src->chunk,
729 text->str, text->len);
730 g_string_truncate (text, 0);
732 switch (res) {
733 case STF_CELL_FIELD_NO_SEP:
734 g_ptr_array_add (line, ctext);
735 cont = FALSE;
736 break;
738 case STF_CELL_FIELD_SEP:
739 g_ptr_array_add (line, ctext);
740 cont = TRUE; /* Make sure we see one more field. */
741 break;
743 default:
744 if (cont)
745 g_ptr_array_add (line, ctext);
746 g_string_free (text, TRUE);
747 return line;
753 * stf_parse_fixed_cell:
755 * returns a pointer to the parsed cell contents.
757 static char *
758 stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
760 char *res;
761 char const *cur;
762 int splitval;
764 g_return_val_if_fail (src != NULL, NULL);
765 g_return_val_if_fail (parseoptions != NULL, NULL);
767 cur = src->position;
769 if (src->splitpos < my_garray_len (parseoptions->splitpositions))
770 splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
771 else
772 splitval = -1;
774 while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
775 src->linepos++;
776 cur = g_utf8_next_char (cur);
779 res = g_string_chunk_insert_len (src->chunk,
780 src->position,
781 cur - src->position);
783 src->position = cur;
785 return res;
789 * stf_parse_fixed_line:
791 * This will parse one line from the current @src->position.
792 * It will return a GPtrArray with the cell contents as strings.
794 * NOTE: The calling routine is responsible for freeing result.
796 static GPtrArray *
797 stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
799 GPtrArray *line;
801 g_return_val_if_fail (src != NULL, NULL);
802 g_return_val_if_fail (parseoptions != NULL, NULL);
804 src->linepos = 0;
805 src->splitpos = 0;
807 line = g_ptr_array_new ();
808 while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
809 char *field = stf_parse_fixed_cell (src, parseoptions);
811 trim_spaces_inplace (field, parseoptions);
812 g_ptr_array_add (line, field);
814 src->splitpos++;
817 while (line->len < parseoptions->splitpositions->len)
818 g_ptr_array_add (line, g_strdup (""));
820 return line;
824 * stf_parse_general_free: (skip)
826 void
827 stf_parse_general_free (GPtrArray *lines)
829 unsigned lineno;
830 for (lineno = 0; lineno < lines->len; lineno++) {
831 GPtrArray *line = g_ptr_array_index (lines, lineno);
832 /* Fields are not freed here. */
833 if (line)
834 g_ptr_array_free (line, TRUE);
836 g_ptr_array_free (lines, TRUE);
841 * stf_parse_general: (skip)
843 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
844 * GPtrArray of strings.
846 * The caller must free this entire structure, for example by calling
847 * stf_parse_general_free.
849 GPtrArray *
850 stf_parse_general (StfParseOptions_t *parseoptions,
851 GStringChunk *lines_chunk,
852 char const *data, char const *data_end)
854 GPtrArray *lines;
855 Source_t src;
856 int row;
857 char const *valid_end = data_end;
859 g_return_val_if_fail (parseoptions != NULL, NULL);
860 g_return_val_if_fail (data != NULL, NULL);
861 g_return_val_if_fail (data_end != NULL, NULL);
862 g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
863 g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);
865 src.chunk = lines_chunk;
866 src.position = data;
867 row = 0;
869 if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
870 /* Skip over byte-order mark */
871 src.position += 3;
874 lines = g_ptr_array_new ();
875 while (*src.position != '\0' && src.position < data_end) {
876 GPtrArray *line;
878 if (row == GNM_MAX_ROWS) {
879 parseoptions->rows_exceeded = TRUE;
880 break;
883 line = parseoptions->parsetype == PARSE_TYPE_CSV
884 ? stf_parse_csv_line (&src, parseoptions)
885 : stf_parse_fixed_line (&src, parseoptions);
887 g_ptr_array_add (lines, line);
888 if (parseoptions->parsetype != PARSE_TYPE_CSV)
889 src.position += compare_terminator (src.position, parseoptions);
890 row++;
893 return lines;
897 * stf_parse_lines: (skip)
898 * @parseoptions: #StfParseOptions_t
899 * @lines_chunk:
900 * @data:
901 * @maxlines:
902 * @with_lineno:
904 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
905 * GPtrArray of strings.
907 * The caller must free this entire structure, for example by calling
908 * stf_parse_general_free.
910 GPtrArray *
911 stf_parse_lines (StfParseOptions_t *parseoptions,
912 GStringChunk *lines_chunk,
913 char const *data,
914 int maxlines, gboolean with_lineno)
916 GPtrArray *lines;
917 int lineno = 1;
919 g_return_val_if_fail (data != NULL, NULL);
921 lines = g_ptr_array_new ();
922 while (*data) {
923 char const *data0 = data;
924 GPtrArray *line = g_ptr_array_new ();
926 if (with_lineno) {
927 char buf[4 * sizeof (int)];
928 sprintf (buf, "%d", lineno);
929 g_ptr_array_add (line,
930 g_string_chunk_insert (lines_chunk, buf));
933 while (1) {
934 int termlen = compare_terminator (data, parseoptions);
935 if (termlen > 0 || *data == 0) {
936 g_ptr_array_add (line,
937 g_string_chunk_insert_len (lines_chunk,
938 data0,
939 data - data0));
940 data += termlen;
941 break;
942 } else
943 data = g_utf8_next_char (data);
946 g_ptr_array_add (lines, line);
948 lineno++;
949 if (lineno >= maxlines)
950 break;
952 return lines;
955 char const *
956 stf_parse_find_line (StfParseOptions_t *parseoptions,
957 char const *data,
958 int line)
960 while (line > 0) {
961 int termlen = compare_terminator (data, parseoptions);
962 if (termlen > 0) {
963 data += termlen;
964 line--;
965 } else if (*data == 0) {
966 return data;
967 } else {
968 data = g_utf8_next_char (data);
971 return data;
976 * stf_parse_options_fixed_autodiscover:
977 * @parseoptions: a Parse options struct.
978 * @data: The actual data.
979 * @data_end: data end.
981 * Automatically try to discover columns in the text to be parsed.
982 * We ignore empty lines (only containing parseoptions->terminator)
984 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
985 * Think hard of a better more flexible solution...
987 void
988 stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
989 char const *data, char const *data_end)
991 char const *iterator = data;
992 GSList *list = NULL;
993 GSList *list_start = NULL;
994 int lines = 0;
995 int effective_lines = 0;
996 int max_line_length = 0;
997 int *line_begin_hits = NULL;
998 int *line_end_hits = NULL;
999 int i;
1001 stf_parse_options_fixed_splitpositions_clear (parseoptions);
1004 * First take a look at all possible white space combinations
1006 while (*iterator && iterator < data_end) {
1007 gboolean begin_recorded = FALSE;
1008 AutoDiscovery_t *disc = NULL;
1009 int position = 0;
1010 int termlen = 0;
1012 while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
1013 if (!begin_recorded && *iterator == ' ') {
1014 disc = g_new0 (AutoDiscovery_t, 1);
1016 disc->start = position;
1018 begin_recorded = TRUE;
1019 } else if (begin_recorded && *iterator != ' ') {
1020 disc->stop = position;
1021 list = g_slist_prepend (list, disc);
1023 begin_recorded = FALSE;
1024 disc = NULL;
1027 position++;
1028 iterator++;
1031 if (position > max_line_length)
1032 max_line_length = position;
1035 * If there are excess spaces at the end of
1036 * the line : ignore them
1038 g_free (disc);
1041 * Hop over the terminator
1043 iterator += termlen;
1045 if (position != 0)
1046 effective_lines++;
1048 lines++;
1051 list = g_slist_reverse (list);
1052 list_start = list;
1055 * Kewl stuff:
1056 * Look at the number of hits at each line position
1057 * if the number of hits equals the number of lines
1058 * we can be pretty sure this is the start or end
1059 * of a column, we filter out empty columns
1060 * later
1062 line_begin_hits = g_new0 (int, max_line_length + 1);
1063 line_end_hits = g_new0 (int, max_line_length + 1);
1065 while (list) {
1066 AutoDiscovery_t *disc = list->data;
1068 line_begin_hits[disc->start]++;
1069 line_end_hits[disc->stop]++;
1071 g_free (disc);
1073 list = g_slist_next (list);
1075 g_slist_free (list_start);
1077 for (i = 0; i < max_line_length + 1; i++)
1078 if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
1079 stf_parse_options_fixed_splitpositions_add (parseoptions, i);
1082 * Do some corrections to the initial columns
1083 * detected here, we obviously don't need to
1084 * do this if there are no columns at all.
1086 if (my_garray_len (parseoptions->splitpositions) > 0) {
1088 * Try to find columns that look like:
1090 * Example 100
1091 * Example2 9
1093 * (In other words : Columns with left & right justification with
1094 * a minimum of 2 spaces in the middle)
1095 * Split these columns in 2
1098 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1099 int begin = g_array_index (parseoptions->splitpositions, int, i);
1100 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1101 int num_spaces = -1;
1102 int spaces_start = 0;
1103 gboolean right_aligned = TRUE;
1104 gboolean left_aligned = TRUE;
1105 gboolean has_2_spaces = TRUE;
1107 iterator = data;
1108 lines = 0;
1109 while (*iterator && iterator < data_end) {
1110 gboolean trigger = FALSE;
1111 gboolean space_trigger = FALSE;
1112 int pos = 0;
1114 num_spaces = -1;
1115 spaces_start = 0;
1116 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1117 if (pos == begin) {
1118 if (*iterator == ' ')
1119 left_aligned = FALSE;
1121 trigger = TRUE;
1122 } else if (pos == end - 1) {
1123 if (*iterator == ' ')
1124 right_aligned = FALSE;
1126 trigger = FALSE;
1129 if (trigger || pos == end - 1) {
1130 if (!space_trigger && *iterator == ' ') {
1131 space_trigger = TRUE;
1132 spaces_start = pos;
1133 } else if (space_trigger && *iterator != ' ') {
1134 space_trigger = FALSE;
1135 num_spaces = pos - spaces_start;
1139 iterator++;
1140 pos++;
1143 if (num_spaces < 2)
1144 has_2_spaces = FALSE;
1146 if (*iterator)
1147 iterator++;
1149 lines++;
1153 * If this column meets all the criteria
1154 * split it into two at the last measured
1155 * spaces_start + num_spaces
1157 if (has_2_spaces && right_aligned && left_aligned) {
1158 int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
1160 g_array_insert_val (parseoptions->splitpositions, i + 1, val);
1163 * Skip over the inserted column
1165 i++;
1170 * Remove empty columns here if needed
1172 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1173 int begin = g_array_index (parseoptions->splitpositions, int, i);
1174 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1175 gboolean only_spaces = TRUE;
1177 iterator = data;
1178 lines = 0;
1179 while (*iterator && iterator < data_end) {
1180 gboolean trigger = FALSE;
1181 int pos = 0;
1183 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1184 if (pos == begin)
1185 trigger = TRUE;
1186 else if (pos == end)
1187 trigger = FALSE;
1189 if (trigger) {
1190 if (*iterator != ' ')
1191 only_spaces = FALSE;
1194 iterator++;
1195 pos++;
1198 if (*iterator)
1199 iterator++;
1201 lines++;
1205 * The column only contains spaces
1206 * remove it
1208 if (only_spaces) {
1209 g_array_remove_index (parseoptions->splitpositions, i);
1212 * We HAVE to make sure that the next column (end) also
1213 * gets checked out. If we don't decrease "i" here, we
1214 * will skip over it as the indexes shift down after
1215 * the removal
1217 i--;
1222 g_free (line_begin_hits);
1223 g_free (line_end_hits);
1226 /*******************************************************************************************************
1227 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1228 * functions into something meaningful (== application specific)
1229 *******************************************************************************************************/
1232 * This is more or less as gnm_cell_set_text, except...
1233 * 1. Unknown names are not allowed.
1234 * 2. Only '=' can start an expression.
1237 static void
1238 stf_cell_set_text (GnmCell *cell, char const *text)
1240 GnmExprTop const *texpr;
1241 GnmValue *val;
1242 GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
1243 const GODateConventions *date_conv =
1244 workbook_date_conv (cell->base.sheet->workbook);
1246 if (!go_format_is_text (fmt) && *text == '=' && text[1] != 0) {
1247 GnmExprParseFlags flags =
1248 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
1249 const char *expr_start = text + 1;
1250 GnmParsePos pos;
1251 val = NULL;
1252 parse_pos_init_cell (&pos, cell);
1253 texpr = gnm_expr_parse_str (expr_start, &pos, flags,
1254 NULL, NULL);
1255 } else {
1256 texpr = NULL;
1257 val = format_match (text, fmt, date_conv);
1260 if (!val && !texpr)
1261 val = value_new_string (text);
1263 if (val)
1264 gnm_cell_set_value (cell, val);
1265 else {
1266 gnm_cell_set_expr (cell, texpr);
1267 gnm_expr_top_unref (texpr);
1271 static void
1272 stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
1274 if (po->parsetype == PARSE_TYPE_CSV) {
1275 GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
1276 char quote[6];
1277 int length = g_unichar_to_utf8 (po->stringindicator, quote);
1278 if (length > 5) {
1279 quote[0] = '"';
1280 quote[1] = '\0';
1281 } else quote[length] = '\0';
1283 g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);
1285 if ((po->terminator != NULL) && (po->terminator->data != NULL))
1286 g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
1290 gboolean
1291 stf_parse_sheet (StfParseOptions_t *parseoptions,
1292 char const *data, char const *data_end,
1293 Sheet *sheet, int start_col, int start_row)
1295 int row;
1296 unsigned int lrow;
1297 GStringChunk *lines_chunk;
1298 GPtrArray *lines;
1299 gboolean result = TRUE;
1300 int col;
1301 unsigned int lcol;
1302 size_t nformats;
1304 SETUP_LOCALE_SWITCH;
1306 g_return_val_if_fail (parseoptions != NULL, FALSE);
1307 g_return_val_if_fail (data != NULL, FALSE);
1308 g_return_val_if_fail (IS_SHEET (sheet), FALSE);
1310 if (!data_end)
1311 data_end = data + strlen (data);
1313 lines_chunk = g_string_chunk_new (100 * 1024);
1314 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1315 if (lines == NULL)
1316 result = FALSE;
1318 col = start_col;
1319 nformats = parseoptions->formats->len;
1320 for (lcol = 0; lcol < nformats; lcol++) {
1321 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1322 GnmStyle *mstyle;
1323 gboolean want_col =
1324 (parseoptions->col_import_array == NULL ||
1325 parseoptions->col_import_array_len <= lcol ||
1326 parseoptions->col_import_array[lcol]);
1327 if (!want_col || col >= gnm_sheet_get_max_cols (sheet))
1328 continue;
1330 if (fmt && !go_format_is_general (fmt)) {
1331 GnmRange r;
1332 int end_row = MIN (start_row + (int)lines->len - 1,
1333 gnm_sheet_get_last_row (sheet));
1335 range_init (&r, col, start_row, col, end_row);
1336 mstyle = gnm_style_new ();
1337 gnm_style_set_format (mstyle, fmt);
1338 sheet_apply_style (sheet, &r, mstyle);
1340 col++;
1343 START_LOCALE_SWITCH;
1344 for (row = start_row, lrow = 0;
1345 result && lrow < lines->len;
1346 row++, lrow++) {
1347 GPtrArray *line;
1349 if (row >= gnm_sheet_get_max_rows (sheet)) {
1350 if (!parseoptions->rows_exceeded) {
1351 /* FIXME: What locale? */
1352 g_warning (_("There are more rows of data than "
1353 "there is room for in the sheet. Extra "
1354 "rows will be ignored."));
1355 parseoptions->rows_exceeded = TRUE;
1357 break;
1360 col = start_col;
1361 line = g_ptr_array_index (lines, lrow);
1363 for (lcol = 0; lcol < line->len; lcol++) {
1364 GOFormat const *fmt = lcol < nformats
1365 ? g_ptr_array_index (parseoptions->formats, lcol)
1366 : go_format_general ();
1367 char const *text = g_ptr_array_index (line, lcol);
1368 gboolean want_col =
1369 (parseoptions->col_import_array == NULL ||
1370 parseoptions->col_import_array_len <= lcol ||
1371 parseoptions->col_import_array[lcol]);
1372 if (!want_col)
1373 continue;
1375 if (col >= gnm_sheet_get_max_cols (sheet)) {
1376 if (!parseoptions->cols_exceeded) {
1377 /* FIXME: What locale? */
1378 g_warning (_("There are more columns of data than "
1379 "there is room for in the sheet. Extra "
1380 "columns will be ignored."));
1381 parseoptions->cols_exceeded = TRUE;
1383 break;
1385 if (text && *text) {
1386 GnmCell *cell = sheet_cell_fetch (sheet, col, row);
1387 if (!go_format_is_text (fmt) &&
1388 lcol < parseoptions->formats_decimal->len &&
1389 g_ptr_array_index (parseoptions->formats_decimal, lcol)) {
1390 GOFormatFamily fam;
1391 GnmValue *v = format_match_decimal_number_with_locale
1392 (text, &fam,
1393 g_ptr_array_index (parseoptions->formats_curr, lcol),
1394 g_ptr_array_index (parseoptions->formats_thousand, lcol),
1395 g_ptr_array_index (parseoptions->formats_decimal, lcol));
1396 if (!v)
1397 v = value_new_string (text);
1398 sheet_cell_set_value (cell, v);
1399 } else {
1401 stf_cell_set_text (cell, text);
1404 col++;
1407 g_ptr_array_index (lines, lrow) = NULL;
1408 g_ptr_array_free (line, TRUE);
1410 END_LOCALE_SWITCH;
1412 for (lcol = 0, col = start_col;
1413 lcol < parseoptions->col_import_array_len && col < gnm_sheet_get_max_cols (sheet);
1414 lcol++) {
1415 if (parseoptions->col_import_array == NULL ||
1416 parseoptions->col_import_array_len <= lcol ||
1417 parseoptions->col_import_array[lcol]) {
1418 if (parseoptions->col_autofit_array == NULL ||
1419 parseoptions->col_autofit_array[lcol]) {
1420 ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
1421 ColRowStateGroup *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
1422 colrow_index_list_destroy (list);
1423 g_slist_free (state);
1425 col++;
1429 g_string_chunk_free (lines_chunk);
1430 if (lines)
1431 stf_parse_general_free (lines);
1432 if (result)
1433 stf_read_remember_settings (sheet->workbook, parseoptions);
1434 return result;
1437 GnmCellRegion *
1438 stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
1439 Workbook const *wb)
1441 static GODateConventions const default_conv = {FALSE};
1442 GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;
1444 GnmCellRegion *cr;
1445 unsigned int row, colhigh = 0;
1446 GStringChunk *lines_chunk;
1447 GPtrArray *lines;
1448 size_t nformats;
1450 SETUP_LOCALE_SWITCH;
1452 g_return_val_if_fail (parseoptions != NULL, NULL);
1453 g_return_val_if_fail (data != NULL, NULL);
1455 START_LOCALE_SWITCH;
1457 cr = gnm_cell_region_new (NULL);
1459 if (!data_end)
1460 data_end = data + strlen (data);
1461 lines_chunk = g_string_chunk_new (100 * 1024);
1462 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1463 nformats = parseoptions->formats->len;
1464 for (row = 0; row < lines->len; row++) {
1465 GPtrArray *line = g_ptr_array_index (lines, row);
1466 unsigned int col, targetcol = 0;
1467 for (col = 0; col < line->len; col++) {
1468 if (parseoptions->col_import_array == NULL ||
1469 parseoptions->col_import_array_len <= col ||
1470 parseoptions->col_import_array[col]) {
1471 const char *text = g_ptr_array_index (line, col);
1472 if (text) {
1473 GOFormat *fmt = NULL;
1474 GnmValue *v;
1475 GnmCellCopy *cc;
1477 if (col < nformats)
1478 fmt = g_ptr_array_index (parseoptions->formats, col);
1479 v = format_match (text, fmt, date_conv);
1480 if (!v)
1481 v = value_new_string (text);
1483 cc = gnm_cell_copy_new (cr, targetcol, row);
1484 cc->val = v;
1485 cc->texpr = NULL;
1486 targetcol++;
1487 if (targetcol > colhigh)
1488 colhigh = targetcol;
1493 stf_parse_general_free (lines);
1494 g_string_chunk_free (lines_chunk);
1496 END_LOCALE_SWITCH;
1498 cr->cols = (colhigh > 0) ? colhigh : 1;
1499 cr->rows = row;
1501 return cr;
/*
 * int_sort:
 * @a: pointer to the first int.
 * @b: pointer to the second int.
 *
 * qsort-style comparison of two ints.  Uses a three-way comparison
 * instead of a subtraction so that far-apart values cannot overflow.
 *
 * Returns: a negative, zero or positive value when *@a is less than,
 * equal to or greater than *@b.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1510 static int
1511 count_character (GPtrArray *lines, gunichar c, double quantile)
1513 int *counts, res;
1514 unsigned int lno, cno;
1516 if (lines->len == 0)
1517 return 0;
1519 counts = g_new (int, lines->len);
1520 for (lno = cno = 0; lno < lines->len; lno++) {
1521 int count = 0;
1522 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1523 char const *line = g_ptr_array_index (boxline, 0);
1525 /* Ignore empty lines. */
1526 if (*line == 0)
1527 continue;
1529 while (*line) {
1530 if (g_utf8_get_char (line) == c)
1531 count++;
1532 line = g_utf8_next_char (line);
1535 counts[cno++] = count;
1538 if (cno == 0)
1539 res = 0;
1540 else {
1541 unsigned int qi = (unsigned int)ceil (quantile * cno);
1542 qsort (counts, cno, sizeof (counts[0]), int_sort);
1543 if (qi == cno)
1544 qi--;
1545 res = counts[qi];
1548 g_free (counts);
1550 return res;
1553 static void
1554 dump_guessed_options (const StfParseOptions_t *res)
1556 GSList *l;
1557 char ubuffer[6 + 1];
1558 unsigned ui;
1560 g_printerr ("Guessed format:\n");
1561 switch (res->parsetype) {
1562 case PARSE_TYPE_CSV:
1563 g_printerr (" type = sep\n");
1564 g_printerr (" separator = %s\n",
1565 res->sep.chr ? res->sep.chr : "(none)");
1566 g_printerr (" see two as one = %s\n",
1567 res->sep.duplicates ? "yes" : "no");
1568 break;
1569 case PARSE_TYPE_FIXED:
1570 g_printerr (" type = sep\n");
1571 break;
1572 default:
1575 g_printerr (" trim space = %d\n", res->trim_spaces);
1577 ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
1578 g_printerr (" string indicator = %s\n", ubuffer);
1579 g_printerr (" see two as one = %s\n",
1580 res->indicator_2x_is_single ? "yes" : "no");
1582 g_printerr (" line terminators =");
1583 for (l = res->terminator; l; l = l->next) {
1584 const char *t = l->data;
1585 if (strcmp (t, "\n") == 0)
1586 g_printerr (" unix");
1587 else if (strcmp (t, "\r") == 0)
1588 g_printerr (" mac");
1589 else if (strcmp (t, "\r\n") == 0)
1590 g_printerr (" dos");
1591 else
1592 g_printerr (" other");
1594 g_printerr ("\n");
1596 for (ui = 0; ui < res->formats->len; ui++) {
1597 GOFormat const *fmt = g_ptr_array_index (res->formats, ui);
1598 const GString *decimal = ui < res->formats_decimal->len
1599 ? g_ptr_array_index (res->formats_decimal, ui)
1600 : NULL;
1601 const GString *thousand = ui < res->formats_thousand->len
1602 ? g_ptr_array_index (res->formats_thousand, ui)
1603 : NULL;
1605 g_printerr (" fmt.%d = %s\n", ui, go_format_as_XL (fmt));
1606 if (decimal)
1607 g_printerr (" fmt.%d.dec = %s\n", ui, decimal->str);
1608 if (thousand)
1609 g_printerr (" fmt.%d.thou = %s\n", ui, thousand->str);
1614 * stf_parse_options_guess:
1615 * @data: the input data.
1617 * Returns: (transfer full): the guessed options.
1619 StfParseOptions_t *
1620 stf_parse_options_guess (char const *data)
1622 StfParseOptions_t *res;
1623 GStringChunk *lines_chunk;
1624 GPtrArray *lines;
1625 int tabcount;
1626 int sepcount;
1627 gunichar sepchar = go_locale_get_arg_sep ();
1629 g_return_val_if_fail (data != NULL, NULL);
1631 res = stf_parse_options_new ();
1632 lines_chunk = g_string_chunk_new (100 * 1024);
1633 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1635 tabcount = count_character (lines, '\t', 0.2);
1636 sepcount = count_character (lines, sepchar, 0.2);
1638 /* At least one tab per line and enough to separate every
1639 would-be sepchars. */
1640 if (tabcount >= 1 && tabcount >= sepcount - 1)
1641 stf_parse_options_csv_set_separators (res, "\t", NULL);
1642 else {
1643 gunichar c;
1646 * Try a few more or less likely characters and pick the first
1647 * one that occurs on at least half the lines.
1649 * The order is mostly random, although ' ' and '!' which
1650 * could very easily occur in text are put last.
1652 if (count_character (lines, (c = sepchar), 0.5) > 0 ||
1653 count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
1654 count_character (lines, (c = ':'), 0.5) > 0 ||
1655 count_character (lines, (c = ','), 0.5) > 0 ||
1656 count_character (lines, (c = ';'), 0.5) > 0 ||
1657 count_character (lines, (c = '|'), 0.5) > 0 ||
1658 count_character (lines, (c = '!'), 0.5) > 0 ||
1659 count_character (lines, (c = ' '), 0.5) > 0) {
1660 char sep[7];
1661 sep[g_unichar_to_utf8 (c, sep)] = 0;
1662 if (c == ' ')
1663 strcat (sep, "\t");
1664 stf_parse_options_csv_set_separators (res, sep, NULL);
1668 // For now, always separated:
1669 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1671 switch (res->parsetype) {
1672 case PARSE_TYPE_CSV: {
1673 gboolean dups =
1674 res->sep.chr &&
1675 strchr (res->sep.chr, ' ') != NULL;
1676 gboolean trim =
1677 res->sep.chr &&
1678 strchr (res->sep.chr, ' ') != NULL;
1680 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1681 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1682 stf_parse_options_csv_set_duplicates (res, dups);
1683 stf_parse_options_csv_set_trim_seps (res, trim);
1685 stf_parse_options_csv_set_stringindicator (res, '"');
1686 break;
1689 case PARSE_TYPE_FIXED:
1690 break;
1692 default:
1693 g_assert_not_reached ();
1696 stf_parse_general_free (lines);
1697 g_string_chunk_free (lines_chunk);
1699 stf_parse_options_guess_formats (res, data);
1701 if (gnm_debug_flag ("stf"))
1702 dump_guessed_options (res);
1704 return res;
1708 * stf_parse_options_guess_csv:
1709 * @data: the CSV input data.
1711 * Returns: (transfer full): the guessed options.
1713 StfParseOptions_t *
1714 stf_parse_options_guess_csv (char const *data)
1716 StfParseOptions_t *res;
1717 GStringChunk *lines_chunk;
1718 GPtrArray *lines;
1719 char *sep = NULL;
1720 char const *quoteline = NULL;
1721 int pass;
1722 gunichar stringind = '"';
1724 g_return_val_if_fail (data != NULL, NULL);
1726 res = stf_parse_options_new ();
1727 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1728 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1729 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1730 stf_parse_options_csv_set_duplicates (res, FALSE);
1731 stf_parse_options_csv_set_trim_seps (res, FALSE);
1732 stf_parse_options_csv_set_stringindicator (res, stringind);
1734 lines_chunk = g_string_chunk_new (100 * 1024);
1735 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1738 * Find a line containing a quote; skip first line unless it is
1739 * the only one. Prefer a line with the quote first.
1741 for (pass = 1; !quoteline && pass <= 2; pass++) {
1742 size_t lno;
1743 for (lno = MIN (1, lines->len - 1);
1744 !quoteline && lno < lines->len;
1745 lno++) {
1746 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1747 const char *line = g_ptr_array_index (boxline, 0);
1748 switch (pass) {
1749 case 1:
1750 if (g_utf8_get_char (line) == stringind)
1751 quoteline = line;
1752 break;
1753 case 2:
1754 if (my_utf8_strchr (line, stringind))
1755 quoteline = line;
1756 break;
1761 if (quoteline) {
1762 const char *p0 = my_utf8_strchr (quoteline, stringind);
1763 const char *p = p0;
1765 do {
1766 p = g_utf8_next_char (p);
1767 } while (*p && g_utf8_get_char (p) != stringind);
1768 if (*p) p = g_utf8_next_char (p);
1769 while (*p && g_unichar_isspace (g_utf8_get_char (p)))
1770 p = g_utf8_next_char (p);
1771 if (*p) {
1772 /* Use the character after the quote. */
1773 sep = g_strndup (p, g_utf8_next_char (p) - p);
1774 } else {
1775 /* Try to use character before the quote. */
1776 while (p0 > quoteline && !sep) {
1777 p = p0;
1778 p0 = g_utf8_prev_char (p0);
1779 if (!g_unichar_isspace (g_utf8_get_char (p0)))
1780 sep = g_strndup (p0, p - p0);
1785 if (!sep)
1786 sep = g_strdup (",");
1787 stf_parse_options_csv_set_separators (res, sep, NULL);
1788 g_free (sep);
1790 stf_parse_general_free (lines);
1791 g_string_chunk_free (lines_chunk);
1793 stf_parse_options_guess_formats (res, data);
1795 if (gnm_debug_flag ("stf"))
1796 dump_guessed_options (res);
1798 return res;
/*
 * Bit flags for the interpretations still considered possible for a
 * column while guessing formats.  The low bits cover the date field
 * orders, the 0x10/0x20 bits the choice of decimal separator.
 */
typedef enum {
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER = (1 << 4) | (1 << 5),

	/* Every date and number flag together.  */
	STF_GUESS_ALL = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 4) | (1 << 5)
} StfGuessFormats;
1813 static void
1814 do_check_date (const char *data, StfGuessFormats flag,
1815 gboolean mbd, gboolean ybm,
1816 unsigned *possible,
1817 GODateConventions const *date_conv)
1819 GnmValue *v;
1820 gboolean this_mbd, this_ybm;
1821 int imbd;
1823 if (!(*possible & flag))
1824 return;
1826 v = format_match_datetime (data, date_conv, mbd, TRUE, FALSE);
1827 if (!v || !VALUE_FMT (v))
1828 goto fail;
1830 imbd = go_format_month_before_day (VALUE_FMT (v));
1831 this_mbd = (imbd >= 1);
1832 this_ybm = (imbd == 2);
1833 if (mbd != this_mbd || ybm != this_ybm)
1834 goto fail;
1836 goto done;
1838 fail:
1839 *possible &= ~flag;
1840 done:
1841 value_release (v);
1845 static void
1846 do_check_number (const char *data, StfGuessFormats flag,
1847 const GString *dec, const GString *thousand, const GString *curr,
1848 unsigned *possible, int *decimals)
1850 GnmValue *v;
1851 GOFormatFamily family;
1852 const char *pthou;
1854 if (!(*possible & flag))
1855 return;
1857 v = format_match_decimal_number_with_locale (data, &family, curr, thousand, dec);
1858 if (!v)
1859 goto fail;
1861 if (*decimals != -2) {
1862 const char *pdec = strstr (data, dec->str);
1863 int this_decimals = 0;
1864 if (pdec) {
1865 pdec += dec->len;
1866 while (g_ascii_isdigit (*pdec)) {
1867 pdec++;
1868 this_decimals++;
1871 if (*decimals == -1)
1872 *decimals = this_decimals;
1873 else if (*decimals != this_decimals)
1874 *decimals = -2;
1877 pthou = strstr (data, thousand->str);
1878 if (pthou) {
1879 const char *p;
1880 int digits = 0, nonzero_digits = 0;
1881 for (p = data; p < pthou; p = g_utf8_next_char (p)) {
1882 if (g_unichar_isdigit (g_utf8_get_char (p))) {
1883 digits++;
1884 if (*p != '0')
1885 nonzero_digits++;
1888 // "-.222" implies that "." is not a thousands separator.
1889 // "0.222" implies that "." is not a thousands separator.
1890 // "12345,555" implies that "," is not a thousands separator.
1891 if (nonzero_digits == 0 || digits > 3)
1892 goto fail;
1895 goto done;
1897 fail:
1898 *possible &= ~flag;
1899 done:
1900 value_release (v);
1905 * stf_parse_options_guess_formats:
1906 * @data: the CSV input data.
1908 * This function attempts to recognize data formats on a column-by-column
1909 * basis under the assumption that the data in a text file will generally
1910 * use the same data formats.
1912 * This is useful because not all values give sufficient information by
1913 * themselves to tell what format the data is in. For example, "1/2/2000"
1914 * is likely to be a date in year 2000, but it is not clear if it is in
1915 * January or February. If another value in the same column is "31/1/1999"
1916 * then it is likely that the former date was in February.
1918 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1919 * A later value of "111,200.22" would clear up the confusion.
1922 void
1923 stf_parse_options_guess_formats (StfParseOptions_t *po, char const *data)
1925 GStringChunk *lines_chunk;
1926 GPtrArray *lines;
1927 unsigned lno, col, colcount, sline;
1928 GODateConventions const *date_conv = go_date_conv_from_str ("Lotus:1900");
1929 GString *s_comma = g_string_new (",");
1930 GString *s_dot = g_string_new (".");
1931 GString *s_dollar = g_string_new ("$");
1932 gboolean debug = gnm_debug_flag ("stf");
1934 g_ptr_array_set_size (po->formats, 0);
1935 g_ptr_array_set_size (po->formats_decimal, 0);
1936 g_ptr_array_set_size (po->formats_thousand, 0);
1937 g_ptr_array_set_size (po->formats_curr, 0);
1939 lines_chunk = g_string_chunk_new (100 * 1024);
1940 lines = stf_parse_general (po, lines_chunk, data, data + strlen (data));
1942 colcount = 0;
1943 for (lno = 0; lno < lines->len; lno++) {
1944 GPtrArray *line = g_ptr_array_index (lines, lno);
1945 colcount = MAX (colcount, line->len);
1948 // Ignore first line unless it is the only one
1949 sline = MIN ((int)lines->len - 1, 1);
1951 g_ptr_array_set_size (po->formats, colcount);
1952 g_ptr_array_set_size (po->formats_decimal, colcount);
1953 g_ptr_array_set_size (po->formats_thousand, colcount);
1954 g_ptr_array_set_size (po->formats_curr, colcount);
1955 for (col = 0; col < colcount; col++) {
1956 unsigned possible = STF_GUESS_ALL;
1957 GOFormat *fmt = NULL;
1958 gboolean seen_dot = FALSE;
1959 gboolean seen_comma = FALSE;
1960 int decimals_if_point = -1; // -1: unset; -2: inconsistent; >=0: count
1961 int decimals_if_comma = -1; // -1: unset; -2: inconsistent; >=0: count
1963 for (lno = sline; possible && lno < lines->len; lno++) {
1964 GPtrArray *line = g_ptr_array_index (lines, lno);
1965 const char *data = col < line->len ? g_ptr_array_index (line, col) : "";
1966 unsigned prev_possible = possible;
1968 if (*data == 0 || data[0] == '\'')
1969 continue;
1971 do_check_date (data, STF_GUESS_DATE_DMY, FALSE, FALSE, &possible, date_conv);
1972 do_check_date (data, STF_GUESS_DATE_MDY, TRUE, FALSE, &possible, date_conv);
1973 do_check_date (data, STF_GUESS_DATE_YMD, TRUE, TRUE, &possible, date_conv);
1975 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER) {
1976 const char *pdot = strstr (data, s_dot->str);
1977 const char *pcomma = strstr (data, s_comma->str);
1978 if (pdot && pcomma) {
1979 // Both -- last one is the decimal separator
1980 if (pdot > pcomma)
1981 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1982 else
1983 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1984 } else if (pdot && strstr (pdot + s_dot->len, s_dot->str)) {
1985 // Two dots so they are thousands separators
1986 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1987 } else if (pcomma && strstr (pcomma + s_comma->len, s_comma->str)) {
1988 // Two commas so they are thousands separators
1989 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1992 seen_dot = seen_dot || (pdot != 0);
1993 seen_comma = seen_comma || (pcomma != 0);
1995 do_check_number (data, STF_GUESS_NUMBER_DEC_POINT,
1996 s_dot, s_comma, s_dollar,
1997 &possible, &decimals_if_point);
1998 do_check_number (data, STF_GUESS_NUMBER_DEC_COMMA,
1999 s_comma, s_dot, s_dollar,
2000 &possible, &decimals_if_comma);
2002 if (possible != prev_possible && debug)
2003 g_printerr ("col=%d; after [%s] possible=0x%x\n", col, data, possible);
2006 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER &&
2007 !seen_dot && !seen_comma) {
2008 // It doesn't matter what the separators are
2009 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
2012 switch (possible) {
2013 case STF_GUESS_DATE_DMY:
2014 fmt = go_format_new_from_XL ("d-mmm-yyyy");
2015 break;
2016 case STF_GUESS_DATE_MDY:
2017 fmt = go_format_new_from_XL ("m/d/yyyy");
2018 break;
2019 case STF_GUESS_DATE_YMD:
2020 fmt = go_format_new_from_XL ("yyyy-mm-dd");
2021 break;
2022 case STF_GUESS_NUMBER_DEC_POINT:
2023 g_ptr_array_index (po->formats_decimal, col) = g_string_new (".");
2024 g_ptr_array_index (po->formats_thousand, col) = g_string_new (",");
2025 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2026 if (decimals_if_point > 0) {
2027 // Don't set format if decimals is zero
2028 GString *fmt_str = g_string_new (NULL);
2029 go_format_generate_number_str (fmt_str, 1, decimals_if_point, seen_comma, FALSE, FALSE, "", "");
2030 fmt = go_format_new_from_XL (fmt_str->str);
2031 g_string_free (fmt_str, TRUE);
2033 break;
2034 case STF_GUESS_NUMBER_DEC_COMMA:
2035 g_ptr_array_index (po->formats_decimal, col) = g_string_new (",");
2036 g_ptr_array_index (po->formats_thousand, col) = g_string_new (".");
2037 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2038 if (decimals_if_comma > 0) {
2039 // Don't set format if decimals is zero
2040 GString *fmt_str = g_string_new (NULL);
2041 go_format_generate_number_str (fmt_str, 1, decimals_if_comma, seen_dot, FALSE, FALSE, "", "");
2042 fmt = go_format_new_from_XL (fmt_str->str);
2043 g_string_free (fmt_str, TRUE);
2045 break;
2046 default:
2047 break;
2050 if (!fmt)
2051 fmt = go_format_ref (go_format_general ());
2052 g_ptr_array_index (po->formats, col) = fmt;
2055 stf_parse_general_free (lines);
2056 g_string_chunk_free (lines_chunk);
2058 g_string_free (s_dot, TRUE);
2059 g_string_free (s_comma, TRUE);
2060 g_string_free (s_dollar, TRUE);