GUI: reduce vertical size of the toolbar area
[gnumeric.git] / src / stf-parse.c
blobe658fde755e55c2f23e5cc428994afad71304627
1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * stf-parse.c : Structured Text Format parser. (STF)
4 * A general purpose engine for parsing data
5 * in CSV and Fixed width format.
8 * Copyright (C) Almer. S. Tigelaar.
9 * EMail: almer1@dds.nl or almer-t@bigfoot.com
11 * Copyright (C) 2003 Andreas J. Guelzow <aguelzow@taliesin.ca>
12 * Copyright (C) 2003,2008-2009 Morten Welinder <terra@gnome.org>
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, see <https://www.gnu.org/licenses/>.
28 #include <gnumeric-config.h>
29 #include <glib/gi18n-lib.h>
30 #include "gnumeric.h"
31 #include "stf-parse.h"
32 #include "stf-export.h"
34 #include "workbook.h"
35 #include "cell.h"
36 #include "sheet.h"
37 #include "expr.h"
38 #include "clipboard.h"
39 #include "sheet-style.h"
40 #include "value.h"
41 #include "mstyle.h"
42 #include "number-match.h"
43 #include "gutils.h"
44 #include "parse-util.h"
45 #include "number-match.h"
46 #include "gnm-format.h"
47 #include "ranges.h"
48 #include <goffice/goffice.h>
50 #include <stdlib.h>
51 #include <locale.h>
52 #include <string.h>
/*
 * Locale-switch helpers, meant to be used together inside one function.
 */

/* Declares the local that remembers the previous locale (NULL = no switch done). */
#define SETUP_LOCALE_SWITCH char *oldlocale = NULL

/*
 * If parseoptions->locale is set, saves a copy of the current LC_ALL locale
 * in oldlocale and switches to parseoptions->locale.  Expects a variable
 * named parseoptions to be in scope.
 */
#define START_LOCALE_SWITCH if (parseoptions->locale) {\
	oldlocale = g_strdup(go_setlocale (LC_ALL, NULL)); \
	go_setlocale(LC_ALL, parseoptions->locale);}

/* Restores the locale saved by START_LOCALE_SWITCH (if any) and frees the copy. */
#define END_LOCALE_SWITCH if (oldlocale) {\
	go_setlocale(LC_ALL, oldlocale);\
	g_free (oldlocale);}
/*
 * Source_t: parsing state shared between the low level parse functions.
 */
typedef struct {
	GStringChunk *chunk;     /* Storage that parsed field strings are inserted into */
	char const *position;    /* Indicates the current position within data */

	/* Used internally for fixed width parsing */
	int splitpos;            /* Indicates current position in splitpositions array */
	int linepos;             /* Position on the current line */
} Source_t;
/*
 * Struct used for autodiscovery: one run of spaces found on a line,
 * recorded as the line position where the run starts and the position of
 * the first non-space character after it.
 */
typedef struct {
	int start;
	int stop;
} AutoDiscovery_t;
81 * Some silly dude make the length field an unsigned int. C just does
82 * not deal very well with that.
84 static inline int
85 my_garray_len (GArray const *a)
87 return (int)a->len;
90 static char *
91 my_utf8_strchr (const char *p, gunichar uc)
93 return uc < 0x7f ? strchr (p, uc) : g_utf8_strchr (p, -1, uc);
96 static int
97 compare_terminator (char const *s, StfParseOptions_t *parseoptions)
99 guchar const *us = (guchar const *)s;
100 GSList *l;
102 if (*us > parseoptions->compiled_terminator.max ||
103 *us < parseoptions->compiled_terminator.min)
104 return 0;
106 for (l = parseoptions->terminator; l; l = l->next) {
107 char const *term = l->data;
108 char const *d = s;
110 while (*term) {
111 if (*d != *term)
112 goto next;
113 term++;
114 d++;
116 return d - s;
118 next:
121 return 0;
125 /*******************************************************************************************************
126 * STF PARSE OPTIONS : StfParseOptions related
127 *******************************************************************************************************/
129 static void
130 gnm_g_string_free (GString *s)
132 if (s) g_string_free (s, TRUE);
137 * stf_parse_options_new:
139 * This will return a new StfParseOptions_t struct.
140 * The struct should, after being used, freed with stf_parse_options_free.
142 static StfParseOptions_t *
143 stf_parse_options_new (void)
145 StfParseOptions_t* parseoptions = g_new0 (StfParseOptions_t, 1);
147 parseoptions->parsetype = PARSE_TYPE_NOTSET;
149 parseoptions->terminator = NULL;
150 stf_parse_options_add_line_terminator (parseoptions, "\r\n");
151 stf_parse_options_add_line_terminator (parseoptions, "\n");
152 stf_parse_options_add_line_terminator (parseoptions, "\r");
154 parseoptions->trim_spaces = (TRIM_TYPE_RIGHT | TRIM_TYPE_LEFT);
155 parseoptions->locale = NULL;
157 parseoptions->splitpositions = NULL;
158 stf_parse_options_fixed_splitpositions_clear (parseoptions);
160 parseoptions->stringindicator = '"';
161 parseoptions->indicator_2x_is_single = TRUE;
162 parseoptions->sep.duplicates = FALSE;
163 parseoptions->trim_seps = FALSE;
165 parseoptions->sep.str = NULL;
166 parseoptions->sep.chr = NULL;
168 parseoptions->col_autofit_array = NULL;
169 parseoptions->col_import_array = NULL;
170 parseoptions->col_import_array_len = 0;
171 parseoptions->formats = g_ptr_array_new_with_free_func ((GDestroyNotify)go_format_unref);
172 parseoptions->formats_decimal = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
173 parseoptions->formats_thousand = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
174 parseoptions->formats_curr = g_ptr_array_new_with_free_func ((GDestroyNotify)gnm_g_string_free);
176 parseoptions->cols_exceeded = FALSE;
177 parseoptions->rows_exceeded = FALSE;
178 parseoptions->ref_count = 1;
180 return parseoptions;
184 * stf_parse_options_free:
186 * will free @parseoptions, note that this will not free the splitpositions
187 * member (GArray) of the struct, the caller is responsible for that.
189 void
190 stf_parse_options_free (StfParseOptions_t *parseoptions)
192 g_return_if_fail (parseoptions != NULL);
194 if (parseoptions->ref_count-- > 1)
195 return;
197 g_free (parseoptions->col_import_array);
198 g_free (parseoptions->col_autofit_array);
199 g_free (parseoptions->locale);
200 g_free (parseoptions->sep.chr);
202 if (parseoptions->sep.str) {
203 GSList *l;
205 for (l = parseoptions->sep.str; l != NULL; l = l->next)
206 g_free ((char *) l->data);
207 g_slist_free (parseoptions->sep.str);
210 g_array_free (parseoptions->splitpositions, TRUE);
212 stf_parse_options_clear_line_terminator (parseoptions);
214 g_ptr_array_free (parseoptions->formats, TRUE);
215 g_ptr_array_free (parseoptions->formats_decimal, TRUE);
216 g_ptr_array_free (parseoptions->formats_thousand, TRUE);
217 g_ptr_array_free (parseoptions->formats_curr, TRUE);
219 g_free (parseoptions);
222 static StfParseOptions_t *
223 stf_parse_options_ref (StfParseOptions_t *parseoptions)
225 parseoptions->ref_count++;
226 return parseoptions;
229 GType
230 stf_parse_options_get_type (void)
232 static GType t = 0;
234 if (t == 0) {
235 t = g_boxed_type_register_static ("StfParseOptions_t",
236 (GBoxedCopyFunc)stf_parse_options_ref,
237 (GBoxedFreeFunc)stf_parse_options_free);
239 return t;
242 void
243 stf_parse_options_set_type (StfParseOptions_t *parseoptions, StfParseType_t const parsetype)
245 g_return_if_fail (parseoptions != NULL);
246 g_return_if_fail (parsetype == PARSE_TYPE_CSV || parsetype == PARSE_TYPE_FIXED);
248 parseoptions->parsetype = parsetype;
251 static gint
252 long_string_first (gchar const *a, gchar const *b)
254 /* This actually is UTF-8 safe. */
255 return strlen (b) - strlen (a);
258 static void
259 compile_terminators (StfParseOptions_t *parseoptions)
261 GSList *l;
263 parseoptions->terminator =
264 g_slist_sort (parseoptions->terminator,
265 (GCompareFunc)long_string_first);
266 parseoptions->compiled_terminator.min = 255;
267 parseoptions->compiled_terminator.max = 0;
268 for (l = parseoptions->terminator; l; l = l->next) {
269 const guchar *term = l->data;
270 parseoptions->compiled_terminator.min =
271 MIN (parseoptions->compiled_terminator.min, *term);
272 parseoptions->compiled_terminator.max =
273 MAX (parseoptions->compiled_terminator.max, *term);
278 * stf_parse_options_add_line_terminator:
280 * This will add to the line terminators, in both the Fixed width and CSV delimited importers
281 * this indicates the end of a row.
284 void
285 stf_parse_options_add_line_terminator (StfParseOptions_t *parseoptions, char const *terminator)
287 g_return_if_fail (parseoptions != NULL);
288 g_return_if_fail (terminator != NULL && *terminator != 0);
290 GO_SLIST_PREPEND (parseoptions->terminator, g_strdup (terminator));
291 compile_terminators (parseoptions);
295 * stf_parse_options_clear_line_terminator:
297 * This will clear the line terminator, in both the Fixed width and CSV delimited importers
298 * this indicates the end of a row.
301 void
302 stf_parse_options_clear_line_terminator (StfParseOptions_t *parseoptions)
304 g_return_if_fail (parseoptions != NULL);
306 g_slist_free_full (parseoptions->terminator, g_free);
307 parseoptions->terminator = NULL;
308 compile_terminators (parseoptions);
312 * stf_parse_options_set_trim_spaces:
314 * If enabled will trim spaces in every parsed field on left and/or right
315 * sides.
317 void
318 stf_parse_options_set_trim_spaces (StfParseOptions_t *parseoptions, StfTrimType_t const trim_spaces)
320 g_return_if_fail (parseoptions != NULL);
322 parseoptions->trim_spaces = trim_spaces;
326 * stf_parse_options_csv_set_separators:
327 * @parseoptions: #StfParseOptions_t
328 * @character:
329 * @seps: (element-type utf8): the separators to be used
331 * A copy is made of the parameters.
333 void
334 stf_parse_options_csv_set_separators (StfParseOptions_t *parseoptions,
335 char const *character,
336 GSList const *seps)
338 g_return_if_fail (parseoptions != NULL);
340 g_free (parseoptions->sep.chr);
341 parseoptions->sep.chr = g_strdup (character);
343 g_slist_free_full (parseoptions->sep.str, g_free);
344 parseoptions->sep.str =
345 g_slist_copy_deep ((GSList *)seps, (GCopyFunc)g_strdup, NULL);
348 void
349 stf_parse_options_csv_set_stringindicator (StfParseOptions_t *parseoptions, gunichar const stringindicator)
351 g_return_if_fail (parseoptions != NULL);
353 parseoptions->stringindicator = stringindicator;
357 * stf_parse_options_csv_set_indicator_2x_is_single:
358 * @indic_2x: a boolean value indicating whether we want to see two
359 * adjacent string indicators as a single string indicator
360 * that is part of the cell, rather than a terminator.
362 void
363 stf_parse_options_csv_set_indicator_2x_is_single (StfParseOptions_t *parseoptions,
364 gboolean const indic_2x)
366 g_return_if_fail (parseoptions != NULL);
368 parseoptions->indicator_2x_is_single = indic_2x;
372 * stf_parse_options_csv_set_duplicates:
373 * @parseoptions:
374 * @duplicates: a boolean value indicating whether we want to see two
375 * separators right behind each other as one
377 void
378 stf_parse_options_csv_set_duplicates (StfParseOptions_t *parseoptions, gboolean const duplicates)
380 g_return_if_fail (parseoptions != NULL);
382 parseoptions->sep.duplicates = duplicates;
386 * stf_parse_options_csv_set_trim_seps:
387 * @trim_seps: a boolean value indicating whether we want to ignore
388 * separators at the beginning of lines
390 void
391 stf_parse_options_csv_set_trim_seps (StfParseOptions_t *parseoptions, gboolean const trim_seps)
393 g_return_if_fail (parseoptions != NULL);
395 parseoptions->trim_seps = trim_seps;
399 * stf_parse_options_fixed_splitpositions_clear:
401 * This will clear the splitpositions (== points on which a line is split)
403 void
404 stf_parse_options_fixed_splitpositions_clear (StfParseOptions_t *parseoptions)
406 int minus_one = -1;
407 g_return_if_fail (parseoptions != NULL);
409 if (parseoptions->splitpositions)
410 g_array_free (parseoptions->splitpositions, TRUE);
411 parseoptions->splitpositions = g_array_new (FALSE, FALSE, sizeof (int));
413 g_array_append_val (parseoptions->splitpositions, minus_one);
417 * stf_parse_options_fixed_splitpositions_add:
419 * @position will be added to the splitpositions.
421 void
422 stf_parse_options_fixed_splitpositions_add (StfParseOptions_t *parseoptions, int position)
424 unsigned int ui;
426 g_return_if_fail (parseoptions != NULL);
427 g_return_if_fail (position >= 0);
429 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
430 int here = g_array_index (parseoptions->splitpositions, int, ui);
431 if (position == here)
432 return;
433 if (position < here)
434 break;
437 g_array_insert_val (parseoptions->splitpositions, ui, position);
440 void
441 stf_parse_options_fixed_splitpositions_remove (StfParseOptions_t *parseoptions, int position)
443 unsigned int ui;
445 g_return_if_fail (parseoptions != NULL);
446 g_return_if_fail (position >= 0);
448 for (ui = 0; ui < parseoptions->splitpositions->len - 1; ui++) {
449 int here = g_array_index (parseoptions->splitpositions, int, ui);
450 if (position == here)
451 g_array_remove_index (parseoptions->splitpositions, ui);
452 if (position <= here)
453 return;
458 stf_parse_options_fixed_splitpositions_count (StfParseOptions_t *parseoptions)
460 return parseoptions->splitpositions->len;
464 stf_parse_options_fixed_splitpositions_nth (StfParseOptions_t *parseoptions, int n)
466 return g_array_index (parseoptions->splitpositions, int, n);
471 * stf_parse_options_valid:
472 * @parseoptions: an import options struct
474 * Checks if @parseoptions is correctly filled
476 * returns : TRUE if it is correctly filled, FALSE otherwise.
478 static gboolean
479 stf_parse_options_valid (StfParseOptions_t *parseoptions)
481 g_return_val_if_fail (parseoptions != NULL, FALSE);
483 if (parseoptions->parsetype == PARSE_TYPE_FIXED) {
484 if (!parseoptions->splitpositions) {
485 g_warning ("STF: No splitpositions in struct");
486 return FALSE;
490 return TRUE;
493 /*******************************************************************************************************
494 * STF PARSE : The actual routines that do the 'trick'
495 *******************************************************************************************************/
497 static void
498 trim_spaces_inplace (char *field, StfParseOptions_t const *parseoptions)
500 if (!field) return;
502 if (parseoptions->trim_spaces & TRIM_TYPE_LEFT) {
503 char *s = field;
505 while (g_unichar_isspace (g_utf8_get_char (s)))
506 s = g_utf8_next_char (s);
508 if (s != field)
509 memmove (field, s, 1 + strlen (s));
512 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
513 char *s = field + strlen (field);
515 while (field != s) {
516 s = g_utf8_prev_char (s);
517 if (!g_unichar_isspace (g_utf8_get_char (s)))
518 break;
519 *s = 0;
525 * stf_parse_csv_is_separator:
527 * returns NULL if @character is not a separator, a pointer to the character
528 * after the separator otherwise.
530 static char const *
531 stf_parse_csv_is_separator (char const *character, char const *chr, GSList const *str)
533 g_return_val_if_fail (character != NULL, NULL);
535 if (*character == 0)
536 return NULL;
538 if (str) {
539 GSList const *l;
541 for (l = str; l != NULL; l = l->next) {
542 char const *s = l->data;
543 char const *r;
544 glong cnt;
545 glong const len = g_utf8_strlen (s, -1);
547 /* Don't compare past the end of the buffer! */
548 for (r = character, cnt = 0; cnt < len; cnt++, r = g_utf8_next_char (r))
549 if (*r == '\0')
550 break;
552 if ((cnt == len) && (memcmp (character, s, len) == 0))
553 return g_utf8_offset_to_pointer (character, len);
557 if (chr && my_utf8_strchr (chr, g_utf8_get_char (character)))
558 return g_utf8_next_char(character);
560 return NULL;
564 * stf_parse_eat_separators:
566 * skip over leading separators
570 static void
571 stf_parse_eat_separators (Source_t *src, StfParseOptions_t *parseoptions)
573 char const *cur, *next;
575 g_return_if_fail (src != NULL);
576 g_return_if_fail (parseoptions != NULL);
578 cur = src->position;
580 if (*cur == '\0' || compare_terminator (cur, parseoptions))
581 return;
582 while ((next = stf_parse_csv_is_separator (cur, parseoptions->sep.chr, parseoptions->sep.str)))
583 cur = next;
584 src->position = cur;
585 return;
/* Result of parsing one CSV cell with stf_parse_csv_cell. */
typedef enum {
	STF_CELL_ERROR,		/* Invalid arguments */
	STF_CELL_EOF,		/* End of data reached before any field content */
	STF_CELL_EOL,		/* Line terminator reached before any field content */
	STF_CELL_FIELD_NO_SEP,	/* A field was read; no separator followed it */
	STF_CELL_FIELD_SEP	/* A field was read and a separator followed it */
} StfParseCellRes;
597 static StfParseCellRes
598 stf_parse_csv_cell (GString *text, Source_t *src, StfParseOptions_t *parseoptions)
600 char const *cur;
601 gboolean saw_sep = FALSE;
603 g_return_val_if_fail (src != NULL, STF_CELL_ERROR);
604 g_return_val_if_fail (parseoptions != NULL, STF_CELL_ERROR);
606 cur = src->position;
607 g_return_val_if_fail (cur != NULL, STF_CELL_ERROR);
609 /* Skip whitespace, but stop at line terminators. */
610 while (1) {
611 int term_len;
613 if (*cur == 0) {
614 src->position = cur;
615 return STF_CELL_EOF;
618 term_len = compare_terminator (cur, parseoptions);
619 if (term_len) {
620 src->position = cur + term_len;
621 return STF_CELL_EOL;
624 if ((parseoptions->trim_spaces & TRIM_TYPE_LEFT) == 0)
625 break;
627 if (stf_parse_csv_is_separator (cur, parseoptions->sep.chr,
628 parseoptions->sep.str))
629 break;
631 if (!g_unichar_isspace (g_utf8_get_char (cur)))
632 break;
633 cur = g_utf8_next_char (cur);
636 if (parseoptions->stringindicator != 0 &&
637 g_utf8_get_char (cur) == parseoptions->stringindicator) {
638 cur = g_utf8_next_char (cur);
639 while (*cur) {
640 gunichar uc = g_utf8_get_char (cur);
641 cur = g_utf8_next_char (cur);
643 if (uc == parseoptions->stringindicator) {
644 if (parseoptions->indicator_2x_is_single &&
645 g_utf8_get_char (cur) == parseoptions->stringindicator)
646 cur = g_utf8_next_char (cur);
647 else {
648 /* "field content"dropped-garbage, */
649 while (*cur && !compare_terminator (cur, parseoptions)) {
650 char const *post = stf_parse_csv_is_separator
651 (cur, parseoptions->sep.chr, parseoptions->sep.str);
652 if (post) {
653 cur = post;
654 saw_sep = TRUE;
655 break;
657 cur = g_utf8_next_char (cur);
659 break;
663 g_string_append_unichar (text, uc);
666 /* We silently allow a missing terminating quote. */
667 } else {
668 /* Unquoted field. */
670 while (*cur && !compare_terminator (cur, parseoptions)) {
672 char const *post = stf_parse_csv_is_separator
673 (cur, parseoptions->sep.chr, parseoptions->sep.str);
674 if (post) {
675 cur = post;
676 saw_sep = TRUE;
677 break;
680 g_string_append_unichar (text, g_utf8_get_char (cur));
681 cur = g_utf8_next_char (cur);
684 if (parseoptions->trim_spaces & TRIM_TYPE_RIGHT) {
685 while (text->len) {
686 const char *last = g_utf8_prev_char (text->str + text->len);
687 if (!g_unichar_isspace (g_utf8_get_char (last)))
688 break;
689 g_string_truncate (text, last - text->str);
694 src->position = cur;
696 if (saw_sep && parseoptions->sep.duplicates)
697 stf_parse_eat_separators (src, parseoptions);
699 return saw_sep ? STF_CELL_FIELD_SEP : STF_CELL_FIELD_NO_SEP;
703 * stf_parse_csv_line:
705 * This will parse one line from the current @src->position.
706 * NOTE: The calling routine is responsible for freeing the result.
708 * returns : a GPtrArray of char*'s
710 static GPtrArray *
711 stf_parse_csv_line (Source_t *src, StfParseOptions_t *parseoptions)
713 GPtrArray *line;
714 gboolean cont = FALSE;
715 GString *text;
717 g_return_val_if_fail (src != NULL, NULL);
718 g_return_val_if_fail (parseoptions != NULL, NULL);
720 line = g_ptr_array_new ();
721 if (parseoptions->trim_seps)
722 stf_parse_eat_separators (src, parseoptions);
724 text = g_string_sized_new (30);
726 while (1) {
727 char *ctext;
728 StfParseCellRes res =
729 stf_parse_csv_cell (text, src, parseoptions);
730 trim_spaces_inplace (text->str, parseoptions);
731 ctext = g_string_chunk_insert_len (src->chunk,
732 text->str, text->len);
733 g_string_truncate (text, 0);
735 switch (res) {
736 case STF_CELL_FIELD_NO_SEP:
737 g_ptr_array_add (line, ctext);
738 cont = FALSE;
739 break;
741 case STF_CELL_FIELD_SEP:
742 g_ptr_array_add (line, ctext);
743 cont = TRUE; /* Make sure we see one more field. */
744 break;
746 default:
747 if (cont)
748 g_ptr_array_add (line, ctext);
749 g_string_free (text, TRUE);
750 return line;
756 * stf_parse_fixed_cell:
758 * returns a pointer to the parsed cell contents.
760 static char *
761 stf_parse_fixed_cell (Source_t *src, StfParseOptions_t *parseoptions)
763 char *res;
764 char const *cur;
765 int splitval;
767 g_return_val_if_fail (src != NULL, NULL);
768 g_return_val_if_fail (parseoptions != NULL, NULL);
770 cur = src->position;
772 if (src->splitpos < my_garray_len (parseoptions->splitpositions))
773 splitval = (int) g_array_index (parseoptions->splitpositions, int, src->splitpos);
774 else
775 splitval = -1;
777 while (*cur != 0 && !compare_terminator (cur, parseoptions) && splitval != src->linepos) {
778 src->linepos++;
779 cur = g_utf8_next_char (cur);
782 res = g_string_chunk_insert_len (src->chunk,
783 src->position,
784 cur - src->position);
786 src->position = cur;
788 return res;
792 * stf_parse_fixed_line:
794 * This will parse one line from the current @src->position.
795 * It will return a GPtrArray with the cell contents as strings.
797 * NOTE: The calling routine is responsible for freeing result.
799 static GPtrArray *
800 stf_parse_fixed_line (Source_t *src, StfParseOptions_t *parseoptions)
802 GPtrArray *line;
804 g_return_val_if_fail (src != NULL, NULL);
805 g_return_val_if_fail (parseoptions != NULL, NULL);
807 src->linepos = 0;
808 src->splitpos = 0;
810 line = g_ptr_array_new ();
811 while (*src->position != '\0' && !compare_terminator (src->position, parseoptions)) {
812 char *field = stf_parse_fixed_cell (src, parseoptions);
814 trim_spaces_inplace (field, parseoptions);
815 g_ptr_array_add (line, field);
817 src->splitpos++;
820 while (line->len < parseoptions->splitpositions->len)
821 g_ptr_array_add (line, g_strdup (""));
823 return line;
827 * stf_parse_general_free: (skip)
829 void
830 stf_parse_general_free (GPtrArray *lines)
832 unsigned lineno;
833 for (lineno = 0; lineno < lines->len; lineno++) {
834 GPtrArray *line = g_ptr_array_index (lines, lineno);
835 /* Fields are not freed here. */
836 if (line)
837 g_ptr_array_free (line, TRUE);
839 g_ptr_array_free (lines, TRUE);
844 * stf_parse_general: (skip)
846 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
847 * GPtrArray of strings.
849 * The caller must free this entire structure, for example by calling
850 * stf_parse_general_free.
852 GPtrArray *
853 stf_parse_general (StfParseOptions_t *parseoptions,
854 GStringChunk *lines_chunk,
855 char const *data, char const *data_end)
857 GPtrArray *lines;
858 Source_t src;
859 int row;
860 char const *valid_end = data_end;
862 g_return_val_if_fail (parseoptions != NULL, NULL);
863 g_return_val_if_fail (data != NULL, NULL);
864 g_return_val_if_fail (data_end != NULL, NULL);
865 g_return_val_if_fail (stf_parse_options_valid (parseoptions), NULL);
866 g_return_val_if_fail (g_utf8_validate (data, data_end-data, &valid_end), NULL);
868 src.chunk = lines_chunk;
869 src.position = data;
870 row = 0;
872 if ((data_end-data >= 3) && !strncmp(src.position, "\xEF\xBB\xBF", 3)) {
873 /* Skip over byte-order mark */
874 src.position += 3;
877 lines = g_ptr_array_new ();
878 while (*src.position != '\0' && src.position < data_end) {
879 GPtrArray *line;
881 if (row == GNM_MAX_ROWS) {
882 parseoptions->rows_exceeded = TRUE;
883 break;
886 line = parseoptions->parsetype == PARSE_TYPE_CSV
887 ? stf_parse_csv_line (&src, parseoptions)
888 : stf_parse_fixed_line (&src, parseoptions);
890 g_ptr_array_add (lines, line);
891 if (parseoptions->parsetype != PARSE_TYPE_CSV)
892 src.position += compare_terminator (src.position, parseoptions);
893 row++;
896 return lines;
900 * stf_parse_lines: (skip)
901 * @parseoptions: #StfParseOptions_t
902 * @lines_chunk:
903 * @data:
904 * @maxlines:
905 * @with_lineno:
907 * Returns: (transfer full): a GPtrArray of lines, where each line is itself a
908 * GPtrArray of strings.
910 * The caller must free this entire structure, for example by calling
911 * stf_parse_general_free.
913 GPtrArray *
914 stf_parse_lines (StfParseOptions_t *parseoptions,
915 GStringChunk *lines_chunk,
916 char const *data,
917 int maxlines, gboolean with_lineno)
919 GPtrArray *lines;
920 int lineno = 1;
922 g_return_val_if_fail (data != NULL, NULL);
924 lines = g_ptr_array_new ();
925 while (*data) {
926 char const *data0 = data;
927 GPtrArray *line = g_ptr_array_new ();
929 if (with_lineno) {
930 char buf[4 * sizeof (int)];
931 sprintf (buf, "%d", lineno);
932 g_ptr_array_add (line,
933 g_string_chunk_insert (lines_chunk, buf));
936 while (1) {
937 int termlen = compare_terminator (data, parseoptions);
938 if (termlen > 0 || *data == 0) {
939 g_ptr_array_add (line,
940 g_string_chunk_insert_len (lines_chunk,
941 data0,
942 data - data0));
943 data += termlen;
944 break;
945 } else
946 data = g_utf8_next_char (data);
949 g_ptr_array_add (lines, line);
951 lineno++;
952 if (lineno >= maxlines)
953 break;
955 return lines;
958 char const *
959 stf_parse_find_line (StfParseOptions_t *parseoptions,
960 char const *data,
961 int line)
963 while (line > 0) {
964 int termlen = compare_terminator (data, parseoptions);
965 if (termlen > 0) {
966 data += termlen;
967 line--;
968 } else if (*data == 0) {
969 return data;
970 } else {
971 data = g_utf8_next_char (data);
974 return data;
979 * stf_parse_options_fixed_autodiscover:
980 * @parseoptions: a Parse options struct.
981 * @data: The actual data.
982 * @data_end: data end.
984 * Automatically try to discover columns in the text to be parsed.
985 * We ignore empty lines (only containing parseoptions->terminator)
987 * FIXME: This is so extremely ugly that I am too tired to rewrite it right now.
988 * Think hard of a better more flexible solution...
990 void
991 stf_parse_options_fixed_autodiscover (StfParseOptions_t *parseoptions,
992 char const *data, char const *data_end)
994 char const *iterator = data;
995 GSList *list = NULL;
996 GSList *list_start = NULL;
997 int lines = 0;
998 int effective_lines = 0;
999 int max_line_length = 0;
1000 int *line_begin_hits = NULL;
1001 int *line_end_hits = NULL;
1002 int i;
1004 stf_parse_options_fixed_splitpositions_clear (parseoptions);
1007 * First take a look at all possible white space combinations
1009 while (*iterator && iterator < data_end) {
1010 gboolean begin_recorded = FALSE;
1011 AutoDiscovery_t *disc = NULL;
1012 int position = 0;
1013 int termlen = 0;
1015 while (*iterator && (termlen = compare_terminator (iterator, parseoptions)) == 0) {
1016 if (!begin_recorded && *iterator == ' ') {
1017 disc = g_new0 (AutoDiscovery_t, 1);
1019 disc->start = position;
1021 begin_recorded = TRUE;
1022 } else if (begin_recorded && *iterator != ' ') {
1023 disc->stop = position;
1024 list = g_slist_prepend (list, disc);
1026 begin_recorded = FALSE;
1027 disc = NULL;
1030 position++;
1031 iterator++;
1034 if (position > max_line_length)
1035 max_line_length = position;
1038 * If there are excess spaces at the end of
1039 * the line : ignore them
1041 g_free (disc);
1044 * Hop over the terminator
1046 iterator += termlen;
1048 if (position != 0)
1049 effective_lines++;
1051 lines++;
1054 list = g_slist_reverse (list);
1055 list_start = list;
1058 * Kewl stuff:
1059 * Look at the number of hits at each line position
1060 * if the number of hits equals the number of lines
1061 * we can be pretty sure this is the start or end
1062 * of a column, we filter out empty columns
1063 * later
1065 line_begin_hits = g_new0 (int, max_line_length + 1);
1066 line_end_hits = g_new0 (int, max_line_length + 1);
1068 while (list) {
1069 AutoDiscovery_t *disc = list->data;
1071 line_begin_hits[disc->start]++;
1072 line_end_hits[disc->stop]++;
1074 g_free (disc);
1076 list = g_slist_next (list);
1078 g_slist_free (list_start);
1080 for (i = 0; i < max_line_length + 1; i++)
1081 if (line_begin_hits[i] == effective_lines || line_end_hits[i] == effective_lines)
1082 stf_parse_options_fixed_splitpositions_add (parseoptions, i);
1085 * Do some corrections to the initial columns
1086 * detected here, we obviously don't need to
1087 * do this if there are no columns at all.
1089 if (my_garray_len (parseoptions->splitpositions) > 0) {
1091 * Try to find columns that look like:
1093 * Example 100
1094 * Example2 9
1096 * (In other words : Columns with left & right justification with
1097 * a minimum of 2 spaces in the middle)
1098 * Split these columns in 2
1101 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1102 int begin = g_array_index (parseoptions->splitpositions, int, i);
1103 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1104 int num_spaces = -1;
1105 int spaces_start = 0;
1106 gboolean right_aligned = TRUE;
1107 gboolean left_aligned = TRUE;
1108 gboolean has_2_spaces = TRUE;
1110 iterator = data;
1111 lines = 0;
1112 while (*iterator && iterator < data_end) {
1113 gboolean trigger = FALSE;
1114 gboolean space_trigger = FALSE;
1115 int pos = 0;
1117 num_spaces = -1;
1118 spaces_start = 0;
1119 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1120 if (pos == begin) {
1121 if (*iterator == ' ')
1122 left_aligned = FALSE;
1124 trigger = TRUE;
1125 } else if (pos == end - 1) {
1126 if (*iterator == ' ')
1127 right_aligned = FALSE;
1129 trigger = FALSE;
1132 if (trigger || pos == end - 1) {
1133 if (!space_trigger && *iterator == ' ') {
1134 space_trigger = TRUE;
1135 spaces_start = pos;
1136 } else if (space_trigger && *iterator != ' ') {
1137 space_trigger = FALSE;
1138 num_spaces = pos - spaces_start;
1142 iterator++;
1143 pos++;
1146 if (num_spaces < 2)
1147 has_2_spaces = FALSE;
1149 if (*iterator)
1150 iterator++;
1152 lines++;
1156 * If this column meets all the criteria
1157 * split it into two at the last measured
1158 * spaces_start + num_spaces
1160 if (has_2_spaces && right_aligned && left_aligned) {
1161 int val = (((spaces_start + num_spaces) - spaces_start) / 2) + spaces_start;
1163 g_array_insert_val (parseoptions->splitpositions, i + 1, val);
1166 * Skip over the inserted column
1168 i++;
1173 * Remove empty columns here if needed
1175 for (i = 0; i < my_garray_len (parseoptions->splitpositions) - 1; i++) {
1176 int begin = g_array_index (parseoptions->splitpositions, int, i);
1177 int end = g_array_index (parseoptions->splitpositions, int, i + 1);
1178 gboolean only_spaces = TRUE;
1180 iterator = data;
1181 lines = 0;
1182 while (*iterator && iterator < data_end) {
1183 gboolean trigger = FALSE;
1184 int pos = 0;
1186 while (*iterator && !compare_terminator (iterator, parseoptions)) {
1187 if (pos == begin)
1188 trigger = TRUE;
1189 else if (pos == end)
1190 trigger = FALSE;
1192 if (trigger) {
1193 if (*iterator != ' ')
1194 only_spaces = FALSE;
1197 iterator++;
1198 pos++;
1201 if (*iterator)
1202 iterator++;
1204 lines++;
1208 * The column only contains spaces
1209 * remove it
1211 if (only_spaces) {
1212 g_array_remove_index (parseoptions->splitpositions, i);
1215 * We HAVE to make sure that the next column (end) also
1216 * gets checked out. If we don't decrease "i" here, we
1217 * will skip over it as the indexes shift down after
1218 * the removal
1220 i--;
1225 g_free (line_begin_hits);
1226 g_free (line_end_hits);
1229 /*******************************************************************************************************
1230 * STF PARSE HL: high-level functions that dump the raw data returned by the low-level parsing
1231 * functions into something meaningful (== application specific)
1232 *******************************************************************************************************/
1235 * This is more or less as gnm_cell_set_text, except...
1236 * 1. Unknown names are not allowed.
1237 * 2. Only '=' can start an expression.
1240 static void
1241 stf_cell_set_text (GnmCell *cell, char const *text)
1243 GnmExprTop const *texpr;
1244 GnmValue *val;
1245 GOFormat const *fmt = gnm_style_get_format (gnm_cell_get_style (cell));
1246 const GODateConventions *date_conv = sheet_date_conv (cell->base.sheet);
1248 if (!go_format_is_text (fmt) && *text == '=' && text[1] != 0) {
1249 GnmExprParseFlags flags =
1250 GNM_EXPR_PARSE_UNKNOWN_NAMES_ARE_INVALID;
1251 const char *expr_start = text + 1;
1252 GnmParsePos pos;
1253 val = NULL;
1254 parse_pos_init_cell (&pos, cell);
1255 texpr = gnm_expr_parse_str (expr_start, &pos, flags,
1256 NULL, NULL);
1257 } else {
1258 texpr = NULL;
1259 val = format_match (text, fmt, date_conv);
1262 if (!val && !texpr)
1263 val = value_new_string (text);
1265 if (val)
1266 gnm_cell_set_value (cell, val);
1267 else {
1268 gnm_cell_set_expr (cell, texpr);
1269 gnm_expr_top_unref (texpr);
1273 static void
1274 stf_read_remember_settings (Workbook *book, StfParseOptions_t *po)
1276 if (po->parsetype == PARSE_TYPE_CSV) {
1277 GnmStfExport *stfe = gnm_stf_get_stfe (G_OBJECT (book));
1278 char quote[6];
1279 int length = g_unichar_to_utf8 (po->stringindicator, quote);
1280 if (length > 5) {
1281 quote[0] = '"';
1282 quote[1] = '\0';
1283 } else quote[length] = '\0';
1285 g_object_set (G_OBJECT (stfe), "separator", po->sep.chr, "quote", &quote, NULL);
1287 if ((po->terminator != NULL) && (po->terminator->data != NULL))
1288 g_object_set (G_OBJECT (stfe), "eol", po->terminator->data, NULL);
1292 gboolean
1293 stf_parse_sheet (StfParseOptions_t *parseoptions,
1294 char const *data, char const *data_end,
1295 Sheet *sheet, int start_col, int start_row)
1297 int row;
1298 unsigned int lrow;
1299 GStringChunk *lines_chunk;
1300 GPtrArray *lines;
1301 gboolean result = TRUE;
1302 int col;
1303 unsigned int lcol;
1304 size_t nformats;
1306 SETUP_LOCALE_SWITCH;
1308 g_return_val_if_fail (parseoptions != NULL, FALSE);
1309 g_return_val_if_fail (data != NULL, FALSE);
1310 g_return_val_if_fail (IS_SHEET (sheet), FALSE);
1312 if (!data_end)
1313 data_end = data + strlen (data);
1315 lines_chunk = g_string_chunk_new (100 * 1024);
1316 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1317 if (lines == NULL)
1318 result = FALSE;
1320 col = start_col;
1321 nformats = parseoptions->formats->len;
1322 for (lcol = 0; lcol < nformats; lcol++) {
1323 GOFormat const *fmt = g_ptr_array_index (parseoptions->formats, lcol);
1324 GnmStyle *mstyle;
1325 gboolean want_col =
1326 (parseoptions->col_import_array == NULL ||
1327 parseoptions->col_import_array_len <= lcol ||
1328 parseoptions->col_import_array[lcol]);
1329 if (!want_col || col >= gnm_sheet_get_max_cols (sheet))
1330 continue;
1332 if (fmt && !go_format_is_general (fmt)) {
1333 GnmRange r;
1334 int end_row = MIN (start_row + (int)lines->len - 1,
1335 gnm_sheet_get_last_row (sheet));
1337 range_init (&r, col, start_row, col, end_row);
1338 mstyle = gnm_style_new ();
1339 gnm_style_set_format (mstyle, fmt);
1340 sheet_apply_style (sheet, &r, mstyle);
1342 col++;
1345 START_LOCALE_SWITCH;
1346 for (row = start_row, lrow = 0;
1347 result && lrow < lines->len;
1348 row++, lrow++) {
1349 GPtrArray *line;
1351 if (row >= gnm_sheet_get_max_rows (sheet)) {
1352 if (!parseoptions->rows_exceeded) {
1353 /* FIXME: What locale? */
1354 g_warning (_("There are more rows of data than "
1355 "there is room for in the sheet. Extra "
1356 "rows will be ignored."));
1357 parseoptions->rows_exceeded = TRUE;
1359 break;
1362 col = start_col;
1363 line = g_ptr_array_index (lines, lrow);
1365 for (lcol = 0; lcol < line->len; lcol++) {
1366 GOFormat const *fmt = lcol < nformats
1367 ? g_ptr_array_index (parseoptions->formats, lcol)
1368 : go_format_general ();
1369 char const *text = g_ptr_array_index (line, lcol);
1370 gboolean want_col =
1371 (parseoptions->col_import_array == NULL ||
1372 parseoptions->col_import_array_len <= lcol ||
1373 parseoptions->col_import_array[lcol]);
1374 if (!want_col)
1375 continue;
1377 if (col >= gnm_sheet_get_max_cols (sheet)) {
1378 if (!parseoptions->cols_exceeded) {
1379 /* FIXME: What locale? */
1380 g_warning (_("There are more columns of data than "
1381 "there is room for in the sheet. Extra "
1382 "columns will be ignored."));
1383 parseoptions->cols_exceeded = TRUE;
1385 break;
1387 if (text && *text) {
1388 GnmCell *cell = sheet_cell_fetch (sheet, col, row);
1389 if (!go_format_is_text (fmt) &&
1390 lcol < parseoptions->formats_decimal->len &&
1391 g_ptr_array_index (parseoptions->formats_decimal, lcol)) {
1392 GOFormatFamily fam;
1393 GnmValue *v = format_match_decimal_number_with_locale
1394 (text, &fam,
1395 g_ptr_array_index (parseoptions->formats_curr, lcol),
1396 g_ptr_array_index (parseoptions->formats_thousand, lcol),
1397 g_ptr_array_index (parseoptions->formats_decimal, lcol));
1398 if (!v)
1399 v = value_new_string (text);
1400 sheet_cell_set_value (cell, v);
1401 } else {
1403 stf_cell_set_text (cell, text);
1406 col++;
1409 g_ptr_array_index (lines, lrow) = NULL;
1410 g_ptr_array_free (line, TRUE);
1412 END_LOCALE_SWITCH;
1414 for (lcol = 0, col = start_col;
1415 lcol < parseoptions->col_import_array_len && col < gnm_sheet_get_max_cols (sheet);
1416 lcol++) {
1417 if (parseoptions->col_import_array == NULL ||
1418 parseoptions->col_import_array_len <= lcol ||
1419 parseoptions->col_import_array[lcol]) {
1420 if (parseoptions->col_autofit_array == NULL ||
1421 parseoptions->col_autofit_array[lcol]) {
1422 ColRowIndexList *list = colrow_get_index_list (col, col, NULL);
1423 ColRowStateGroup *state = colrow_set_sizes (sheet, TRUE, list, -1, 0, -1);
1424 colrow_index_list_destroy (list);
1425 g_slist_free (state);
1427 col++;
1431 g_string_chunk_free (lines_chunk);
1432 if (lines)
1433 stf_parse_general_free (lines);
1434 if (result)
1435 stf_read_remember_settings (sheet->workbook, parseoptions);
1436 return result;
1439 GnmCellRegion *
1440 stf_parse_region (StfParseOptions_t *parseoptions, char const *data, char const *data_end,
1441 Workbook const *wb)
1443 static GODateConventions const default_conv = {FALSE};
1444 GODateConventions const *date_conv = wb ? workbook_date_conv (wb) : &default_conv;
1446 GnmCellRegion *cr;
1447 unsigned int row, colhigh = 0;
1448 GStringChunk *lines_chunk;
1449 GPtrArray *lines;
1450 size_t nformats;
1452 SETUP_LOCALE_SWITCH;
1454 g_return_val_if_fail (parseoptions != NULL, NULL);
1455 g_return_val_if_fail (data != NULL, NULL);
1457 START_LOCALE_SWITCH;
1459 cr = gnm_cell_region_new (NULL);
1461 if (!data_end)
1462 data_end = data + strlen (data);
1463 lines_chunk = g_string_chunk_new (100 * 1024);
1464 lines = stf_parse_general (parseoptions, lines_chunk, data, data_end);
1465 nformats = parseoptions->formats->len;
1466 for (row = 0; row < lines->len; row++) {
1467 GPtrArray *line = g_ptr_array_index (lines, row);
1468 unsigned int col, targetcol = 0;
1469 for (col = 0; col < line->len; col++) {
1470 if (parseoptions->col_import_array == NULL ||
1471 parseoptions->col_import_array_len <= col ||
1472 parseoptions->col_import_array[col]) {
1473 const char *text = g_ptr_array_index (line, col);
1474 if (text) {
1475 GOFormat *fmt = NULL;
1476 GnmValue *v;
1477 GnmCellCopy *cc;
1479 if (col < nformats)
1480 fmt = g_ptr_array_index (parseoptions->formats, col);
1481 v = format_match (text, fmt, date_conv);
1482 if (!v)
1483 v = value_new_string (text);
1485 cc = gnm_cell_copy_new (cr, targetcol, row);
1486 cc->val = v;
1487 cc->texpr = NULL;
1488 targetcol++;
1489 if (targetcol > colhigh)
1490 colhigh = targetcol;
1495 stf_parse_general_free (lines);
1496 g_string_chunk_free (lines_chunk);
1498 END_LOCALE_SWITCH;
1500 cr->cols = (colhigh > 0) ? colhigh : 1;
1501 cr->rows = row;
1503 return cr;
/*
 * qsort-style comparison of two ints, ascending.
 *
 * Returns a negative, zero or positive value when *a is less than, equal to
 * or greater than *b.  The values are compared rather than subtracted so
 * that operands far apart (e.g. INT_MIN and 1) cannot overflow.
 */
static int
int_sort (void const *a, void const *b)
{
	int const lhs = *(int const *)a;
	int const rhs = *(int const *)b;

	return (lhs > rhs) - (lhs < rhs);
}
1512 static int
1513 count_character (GPtrArray *lines, gunichar c, double quantile)
1515 int *counts, res;
1516 unsigned int lno, cno;
1518 if (lines->len == 0)
1519 return 0;
1521 counts = g_new (int, lines->len);
1522 for (lno = cno = 0; lno < lines->len; lno++) {
1523 int count = 0;
1524 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1525 char const *line = g_ptr_array_index (boxline, 0);
1527 /* Ignore empty lines. */
1528 if (*line == 0)
1529 continue;
1531 while (*line) {
1532 if (g_utf8_get_char (line) == c)
1533 count++;
1534 line = g_utf8_next_char (line);
1537 counts[cno++] = count;
1540 if (cno == 0)
1541 res = 0;
1542 else {
1543 unsigned int qi = (unsigned int)ceil (quantile * cno);
1544 qsort (counts, cno, sizeof (counts[0]), int_sort);
1545 if (qi == cno)
1546 qi--;
1547 res = counts[qi];
1550 g_free (counts);
1552 return res;
1555 static void
1556 dump_guessed_options (const StfParseOptions_t *res)
1558 GSList *l;
1559 char ubuffer[6 + 1];
1560 unsigned ui;
1562 g_printerr ("Guessed format:\n");
1563 switch (res->parsetype) {
1564 case PARSE_TYPE_CSV:
1565 g_printerr (" type = sep\n");
1566 g_printerr (" separator = %s\n",
1567 res->sep.chr ? res->sep.chr : "(none)");
1568 g_printerr (" see two as one = %s\n",
1569 res->sep.duplicates ? "yes" : "no");
1570 break;
1571 case PARSE_TYPE_FIXED:
1572 g_printerr (" type = sep\n");
1573 break;
1574 default:
1577 g_printerr (" trim space = %d\n", res->trim_spaces);
1579 ubuffer[g_unichar_to_utf8 (res->stringindicator, ubuffer)] = 0;
1580 g_printerr (" string indicator = %s\n", ubuffer);
1581 g_printerr (" see two as one = %s\n",
1582 res->indicator_2x_is_single ? "yes" : "no");
1584 g_printerr (" line terminators =");
1585 for (l = res->terminator; l; l = l->next) {
1586 const char *t = l->data;
1587 if (strcmp (t, "\n") == 0)
1588 g_printerr (" unix");
1589 else if (strcmp (t, "\r") == 0)
1590 g_printerr (" mac");
1591 else if (strcmp (t, "\r\n") == 0)
1592 g_printerr (" dos");
1593 else
1594 g_printerr (" other");
1596 g_printerr ("\n");
1598 for (ui = 0; ui < res->formats->len; ui++) {
1599 GOFormat const *fmt = g_ptr_array_index (res->formats, ui);
1600 const GString *decimal = ui < res->formats_decimal->len
1601 ? g_ptr_array_index (res->formats_decimal, ui)
1602 : NULL;
1603 const GString *thousand = ui < res->formats_thousand->len
1604 ? g_ptr_array_index (res->formats_thousand, ui)
1605 : NULL;
1607 g_printerr (" fmt.%d = %s\n", ui, go_format_as_XL (fmt));
1608 if (decimal)
1609 g_printerr (" fmt.%d.dec = %s\n", ui, decimal->str);
1610 if (thousand)
1611 g_printerr (" fmt.%d.thou = %s\n", ui, thousand->str);
1616 * stf_parse_options_guess:
1617 * @data: the input data.
1619 * Returns: (transfer full): the guessed options.
1621 StfParseOptions_t *
1622 stf_parse_options_guess (char const *data)
1624 StfParseOptions_t *res;
1625 GStringChunk *lines_chunk;
1626 GPtrArray *lines;
1627 int tabcount;
1628 int sepcount;
1629 gunichar sepchar = go_locale_get_arg_sep ();
1631 g_return_val_if_fail (data != NULL, NULL);
1633 res = stf_parse_options_new ();
1634 lines_chunk = g_string_chunk_new (100 * 1024);
1635 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1637 tabcount = count_character (lines, '\t', 0.2);
1638 sepcount = count_character (lines, sepchar, 0.2);
1640 /* At least one tab per line and enough to separate every
1641 would-be sepchars. */
1642 if (tabcount >= 1 && tabcount >= sepcount - 1)
1643 stf_parse_options_csv_set_separators (res, "\t", NULL);
1644 else {
1645 gunichar c;
1648 * Try a few more or less likely characters and pick the first
1649 * one that occurs on at least half the lines.
1651 * The order is mostly random, although ' ' and '!' which
1652 * could very easily occur in text are put last.
1654 if (count_character (lines, (c = sepchar), 0.5) > 0 ||
1655 count_character (lines, (c = go_locale_get_col_sep ()), 0.5) > 0 ||
1656 count_character (lines, (c = ':'), 0.5) > 0 ||
1657 count_character (lines, (c = ','), 0.5) > 0 ||
1658 count_character (lines, (c = ';'), 0.5) > 0 ||
1659 count_character (lines, (c = '|'), 0.5) > 0 ||
1660 count_character (lines, (c = '!'), 0.5) > 0 ||
1661 count_character (lines, (c = ' '), 0.5) > 0) {
1662 char sep[7];
1663 sep[g_unichar_to_utf8 (c, sep)] = 0;
1664 if (c == ' ')
1665 strcat (sep, "\t");
1666 stf_parse_options_csv_set_separators (res, sep, NULL);
1670 // For now, always separated:
1671 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1673 switch (res->parsetype) {
1674 case PARSE_TYPE_CSV: {
1675 gboolean dups =
1676 res->sep.chr &&
1677 strchr (res->sep.chr, ' ') != NULL;
1678 gboolean trim =
1679 res->sep.chr &&
1680 strchr (res->sep.chr, ' ') != NULL;
1682 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1683 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1684 stf_parse_options_csv_set_duplicates (res, dups);
1685 stf_parse_options_csv_set_trim_seps (res, trim);
1687 stf_parse_options_csv_set_stringindicator (res, '"');
1688 break;
1691 case PARSE_TYPE_FIXED:
1692 break;
1694 default:
1695 g_assert_not_reached ();
1698 stf_parse_general_free (lines);
1699 g_string_chunk_free (lines_chunk);
1701 stf_parse_options_guess_formats (res, data);
1703 if (gnm_debug_flag ("stf"))
1704 dump_guessed_options (res);
1706 return res;
1710 * stf_parse_options_guess_csv:
1711 * @data: the CSV input data.
1713 * Returns: (transfer full): the guessed options.
1715 StfParseOptions_t *
1716 stf_parse_options_guess_csv (char const *data)
1718 StfParseOptions_t *res;
1719 GStringChunk *lines_chunk;
1720 GPtrArray *lines;
1721 char *sep = NULL;
1722 char const *quoteline = NULL;
1723 int pass;
1724 gunichar stringind = '"';
1726 g_return_val_if_fail (data != NULL, NULL);
1728 res = stf_parse_options_new ();
1729 stf_parse_options_set_type (res, PARSE_TYPE_CSV);
1730 stf_parse_options_set_trim_spaces (res, TRIM_TYPE_LEFT | TRIM_TYPE_RIGHT);
1731 stf_parse_options_csv_set_indicator_2x_is_single (res, TRUE);
1732 stf_parse_options_csv_set_duplicates (res, FALSE);
1733 stf_parse_options_csv_set_trim_seps (res, FALSE);
1734 stf_parse_options_csv_set_stringindicator (res, stringind);
1736 lines_chunk = g_string_chunk_new (100 * 1024);
1737 lines = stf_parse_lines (res, lines_chunk, data, 1000, FALSE);
1740 * Find a line containing a quote; skip first line unless it is
1741 * the only one. Prefer a line with the quote first.
1743 for (pass = 1; !quoteline && pass <= 2; pass++) {
1744 size_t lno;
1745 for (lno = MIN (1, lines->len - 1);
1746 !quoteline && lno < lines->len;
1747 lno++) {
1748 GPtrArray *boxline = g_ptr_array_index (lines, lno);
1749 const char *line = g_ptr_array_index (boxline, 0);
1750 switch (pass) {
1751 case 1:
1752 if (g_utf8_get_char (line) == stringind)
1753 quoteline = line;
1754 break;
1755 case 2:
1756 if (my_utf8_strchr (line, stringind))
1757 quoteline = line;
1758 break;
1763 if (quoteline) {
1764 const char *p0 = my_utf8_strchr (quoteline, stringind);
1765 const char *p = p0;
1767 do {
1768 p = g_utf8_next_char (p);
1769 } while (*p && g_utf8_get_char (p) != stringind);
1770 if (*p) p = g_utf8_next_char (p);
1771 while (*p && g_unichar_isspace (g_utf8_get_char (p)))
1772 p = g_utf8_next_char (p);
1773 if (*p) {
1774 /* Use the character after the quote. */
1775 sep = g_strndup (p, g_utf8_next_char (p) - p);
1776 } else {
1777 /* Try to use character before the quote. */
1778 while (p0 > quoteline && !sep) {
1779 p = p0;
1780 p0 = g_utf8_prev_char (p0);
1781 if (!g_unichar_isspace (g_utf8_get_char (p0)))
1782 sep = g_strndup (p0, p - p0);
1787 if (!sep)
1788 sep = g_strdup (",");
1789 stf_parse_options_csv_set_separators (res, sep, NULL);
1790 g_free (sep);
1792 stf_parse_general_free (lines);
1793 g_string_chunk_free (lines_chunk);
1795 stf_parse_options_guess_formats (res, data);
1797 if (gnm_debug_flag ("stf"))
1798 dump_guessed_options (res);
1800 return res;
/* Bit flags naming the candidate interpretations of a column.  */
typedef enum {
	STF_GUESS_DATE_DMY = 1 << 0,
	STF_GUESS_DATE_MDY = 1 << 1,
	STF_GUESS_DATE_YMD = 1 << 2,

	STF_GUESS_NUMBER_DEC_POINT = 1 << 4,
	STF_GUESS_NUMBER_DEC_COMMA = 1 << 5,
	STF_GUESS_NUMBER_DEC_EITHER =
		STF_GUESS_NUMBER_DEC_POINT | STF_GUESS_NUMBER_DEC_COMMA,

	STF_GUESS_ALL =
		STF_GUESS_DATE_DMY | STF_GUESS_DATE_MDY | STF_GUESS_DATE_YMD |
		STF_GUESS_NUMBER_DEC_EITHER
} StfGuessFormats;
1815 static void
1816 do_check_date (const char *data, StfGuessFormats flag,
1817 gboolean mbd, gboolean ybm,
1818 unsigned *possible,
1819 GODateConventions const *date_conv)
1821 GnmValue *v;
1822 gboolean this_mbd, this_ybm;
1823 int imbd;
1825 if (!(*possible & flag))
1826 return;
1828 v = format_match_datetime (data, date_conv, mbd, TRUE, FALSE);
1829 if (!v || !VALUE_FMT (v))
1830 goto fail;
1832 imbd = go_format_month_before_day (VALUE_FMT (v));
1833 this_mbd = (imbd >= 1);
1834 this_ybm = (imbd == 2);
1835 if (mbd != this_mbd || ybm != this_ybm)
1836 goto fail;
1838 goto done;
1840 fail:
1841 *possible &= ~flag;
1842 done:
1843 value_release (v);
1847 static void
1848 do_check_number (const char *data, StfGuessFormats flag,
1849 const GString *dec, const GString *thousand, const GString *curr,
1850 unsigned *possible, int *decimals)
1852 GnmValue *v;
1853 GOFormatFamily family;
1854 const char *pthou;
1856 if (!(*possible & flag))
1857 return;
1859 v = format_match_decimal_number_with_locale (data, &family, curr, thousand, dec);
1860 if (!v)
1861 goto fail;
1863 if (*decimals != -2) {
1864 const char *pdec = strstr (data, dec->str);
1865 int this_decimals = 0;
1866 if (pdec) {
1867 pdec += dec->len;
1868 while (g_ascii_isdigit (*pdec)) {
1869 pdec++;
1870 this_decimals++;
1873 if (*decimals == -1)
1874 *decimals = this_decimals;
1875 else if (*decimals != this_decimals)
1876 *decimals = -2;
1879 pthou = strstr (data, thousand->str);
1880 if (pthou) {
1881 const char *p;
1882 int digits = 0, nonzero_digits = 0;
1883 for (p = data; p < pthou; p = g_utf8_next_char (p)) {
1884 if (g_unichar_isdigit (g_utf8_get_char (p))) {
1885 digits++;
1886 if (*p != '0')
1887 nonzero_digits++;
1890 // "-.222" implies that "." is not a thousands separator.
1891 // "0.222" implies that "." is not a thousands separator.
1892 // "12345,555" implies that "," is not a thousands separator.
1893 if (nonzero_digits == 0 || digits > 3)
1894 goto fail;
1897 goto done;
1899 fail:
1900 *possible &= ~flag;
1901 done:
1902 value_release (v);
1907 * stf_parse_options_guess_formats:
1908 * @data: the CSV input data.
1910 * This function attempts to recognize data formats on a column-by-column
1911 * basis under the assumption that the data in a text file will generally
1912 * use the same data formats.
1914 * This is useful because not all values give sufficient information by
1915 * themselves to tell what format the data is in. For example, "1/2/2000"
1916 * is likely to be a date in year 2000, but it is not clear if it is in
1917 * January or February. If another value in the same column is "31/1/1999"
1918 * then it is likely that the former date was in February.
1920 * Likewise, a value of "123,456" could mean either 1.23456e5 or 1.23456e2.
1921 * A later value of "111,200.22" would clear up the confusion.
1924 void
1925 stf_parse_options_guess_formats (StfParseOptions_t *po, char const *data)
1927 GStringChunk *lines_chunk;
1928 GPtrArray *lines;
1929 unsigned lno, col, colcount, sline;
1930 GODateConventions const *date_conv = go_date_conv_from_str ("Lotus:1900");
1931 GString *s_comma = g_string_new (",");
1932 GString *s_dot = g_string_new (".");
1933 GString *s_dollar = g_string_new ("$");
1934 gboolean debug = gnm_debug_flag ("stf");
1936 g_ptr_array_set_size (po->formats, 0);
1937 g_ptr_array_set_size (po->formats_decimal, 0);
1938 g_ptr_array_set_size (po->formats_thousand, 0);
1939 g_ptr_array_set_size (po->formats_curr, 0);
1941 lines_chunk = g_string_chunk_new (100 * 1024);
1942 lines = stf_parse_general (po, lines_chunk, data, data + strlen (data));
1944 colcount = 0;
1945 for (lno = 0; lno < lines->len; lno++) {
1946 GPtrArray *line = g_ptr_array_index (lines, lno);
1947 colcount = MAX (colcount, line->len);
1950 // Ignore first line unless it is the only one
1951 sline = MIN ((int)lines->len - 1, 1);
1953 g_ptr_array_set_size (po->formats, colcount);
1954 g_ptr_array_set_size (po->formats_decimal, colcount);
1955 g_ptr_array_set_size (po->formats_thousand, colcount);
1956 g_ptr_array_set_size (po->formats_curr, colcount);
1957 for (col = 0; col < colcount; col++) {
1958 unsigned possible = STF_GUESS_ALL;
1959 GOFormat *fmt = NULL;
1960 gboolean seen_dot = FALSE;
1961 gboolean seen_comma = FALSE;
1962 int decimals_if_point = -1; // -1: unset; -2: inconsistent; >=0: count
1963 int decimals_if_comma = -1; // -1: unset; -2: inconsistent; >=0: count
1965 for (lno = sline; possible && lno < lines->len; lno++) {
1966 GPtrArray *line = g_ptr_array_index (lines, lno);
1967 const char *data = col < line->len ? g_ptr_array_index (line, col) : "";
1968 unsigned prev_possible = possible;
1970 if (*data == 0 || data[0] == '\'')
1971 continue;
1973 do_check_date (data, STF_GUESS_DATE_DMY, FALSE, FALSE, &possible, date_conv);
1974 do_check_date (data, STF_GUESS_DATE_MDY, TRUE, FALSE, &possible, date_conv);
1975 do_check_date (data, STF_GUESS_DATE_YMD, TRUE, TRUE, &possible, date_conv);
1977 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER) {
1978 const char *pdot = strstr (data, s_dot->str);
1979 const char *pcomma = strstr (data, s_comma->str);
1980 if (pdot && pcomma) {
1981 // Both -- last one is the decimal separator
1982 if (pdot > pcomma)
1983 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1984 else
1985 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1986 } else if (pdot && strstr (pdot + s_dot->len, s_dot->str)) {
1987 // Two dots so they are thousands separators
1988 possible &= ~STF_GUESS_NUMBER_DEC_POINT;
1989 } else if (pcomma && strstr (pcomma + s_comma->len, s_comma->str)) {
1990 // Two commas so they are thousands separators
1991 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
1994 seen_dot = seen_dot || (pdot != 0);
1995 seen_comma = seen_comma || (pcomma != 0);
1997 do_check_number (data, STF_GUESS_NUMBER_DEC_POINT,
1998 s_dot, s_comma, s_dollar,
1999 &possible, &decimals_if_point);
2000 do_check_number (data, STF_GUESS_NUMBER_DEC_COMMA,
2001 s_comma, s_dot, s_dollar,
2002 &possible, &decimals_if_comma);
2004 if (possible != prev_possible && debug)
2005 g_printerr ("col=%d; after [%s] possible=0x%x\n", col, data, possible);
2008 if ((possible & STF_GUESS_NUMBER_DEC_EITHER) == STF_GUESS_NUMBER_DEC_EITHER &&
2009 !seen_dot && !seen_comma) {
2010 // It doesn't matter what the separators are
2011 possible &= ~STF_GUESS_NUMBER_DEC_COMMA;
2014 switch (possible) {
2015 case STF_GUESS_DATE_DMY:
2016 fmt = go_format_new_from_XL ("d-mmm-yyyy");
2017 break;
2018 case STF_GUESS_DATE_MDY:
2019 fmt = go_format_new_from_XL ("m/d/yyyy");
2020 break;
2021 case STF_GUESS_DATE_YMD:
2022 fmt = go_format_new_from_XL ("yyyy-mm-dd");
2023 break;
2024 case STF_GUESS_NUMBER_DEC_POINT:
2025 g_ptr_array_index (po->formats_decimal, col) = g_string_new (".");
2026 g_ptr_array_index (po->formats_thousand, col) = g_string_new (",");
2027 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2028 if (decimals_if_point > 0) {
2029 // Don't set format if decimals is zero
2030 GString *fmt_str = g_string_new (NULL);
2031 go_format_generate_number_str (fmt_str, 1, decimals_if_point, seen_comma, FALSE, FALSE, "", "");
2032 fmt = go_format_new_from_XL (fmt_str->str);
2033 g_string_free (fmt_str, TRUE);
2035 break;
2036 case STF_GUESS_NUMBER_DEC_COMMA:
2037 g_ptr_array_index (po->formats_decimal, col) = g_string_new (",");
2038 g_ptr_array_index (po->formats_thousand, col) = g_string_new (".");
2039 g_ptr_array_index (po->formats_curr, col) = g_string_new (s_dollar->str);
2040 if (decimals_if_comma > 0) {
2041 // Don't set format if decimals is zero
2042 GString *fmt_str = g_string_new (NULL);
2043 go_format_generate_number_str (fmt_str, 1, decimals_if_comma, seen_dot, FALSE, FALSE, "", "");
2044 fmt = go_format_new_from_XL (fmt_str->str);
2045 g_string_free (fmt_str, TRUE);
2047 break;
2048 default:
2049 break;
2052 if (!fmt)
2053 fmt = go_format_ref (go_format_general ());
2054 g_ptr_array_index (po->formats, col) = fmt;
2057 stf_parse_general_free (lines);
2058 g_string_chunk_free (lines_chunk);
2060 g_string_free (s_dot, TRUE);
2061 g_string_free (s_comma, TRUE);
2062 g_string_free (s_dollar, TRUE);