Implement SET LEADZERO.
[pspp.git] / src / data / gnumeric-reader.c
blob1378469c8a50e640ef563510cf746914cbf469dd
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016,
3 2020 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <config.h>
20 #include "data/gnumeric-reader.h"
21 #include "spreadsheet-reader.h"
23 #include <assert.h>
24 #include <stdbool.h>
25 #include <errno.h>
26 #include <libxml/xmlreader.h>
27 #include <zlib.h>
29 #include "data/case.h"
30 #include "data/casereader-provider.h"
31 #include "data/data-in.h"
32 #include "data/dictionary.h"
33 #include "data/format.h"
34 #include "data/identifier.h"
35 #include "data/value.h"
36 #include "data/variable.h"
37 #include "libpspp/i18n.h"
38 #include "libpspp/message.h"
39 #include "libpspp/misc.h"
40 #include "libpspp/hmap.h"
41 #include "libpspp/hash-functions.h"
43 #include "libpspp/str.h"
45 #include "gl/c-strtod.h"
46 #include "gl/minmax.h"
47 #include "gl/xalloc.h"
49 #include "gettext.h"
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) (msgid)
/* Controls whether gnumeric_get_sheet_cell consults and fills the
   per-spreadsheet cell cache.  Turning it off can help with debugging
   and development, but remember to turn it back on: without the cache
   all but the smallest spreadsheets display VERY slowly.  */
static const bool use_cache = true;
/* Type codes found in the "ValueType" attribute of a cell.
   Shamelessly lifted from the Gnumeric sources:
   https://git.gnome.org/browse/gnumeric/tree/src/value.h  */
enum gnm_value_type
  {
    VALUE_EMPTY     = 10,
    VALUE_BOOLEAN   = 20,

    /* Note, this was removed from gnumeric in 2006 - old versions may of
       course still be around.  New ones are supposed to use float.  */
    VALUE_INTEGER   = 30,

    VALUE_FLOAT     = 40,
    VALUE_ERROR     = 50,
    VALUE_STRING    = 60,
    VALUE_CELLRANGE = 70,
    VALUE_ARRAY     = 80
  };
75 static void gnm_file_casereader_destroy (struct casereader *, void *);
77 static struct ccase *gnm_file_casereader_read (struct casereader *, void *);
80 static const struct casereader_class gnm_file_casereader_class =
82 gnm_file_casereader_read,
83 gnm_file_casereader_destroy,
84 NULL,
85 NULL,
/* States of the XML scanning state machine driven by process_node. */
enum reader_state
  {
    /* Initial state */
    STATE_PRE_INIT = 0,
    /* Found the sheet index */
    STATE_SHEET_COUNT,
    /* Other Initial state */
    STATE_INIT,
    /* Found the start of a sheet */
    STATE_SHEET_START,
    /* Found the sheet name */
    STATE_SHEET_NAME,
    /* Inside a gnm:MaxRow element */
    STATE_MAXROW,
    /* Inside a gnm:MaxCol element */
    STATE_MAXCOL,
    /* Found the sheet that we actually want */
    STATE_SHEET_FOUND,
    /* Found the start of the cell array */
    STATE_CELLS_START,
    /* Found a cell */
    STATE_CELL
  };
102 struct state_data
104 gzFile gz;
106 /* The libxml reader for this instance */
107 xmlTextReaderPtr xtr;
109 /* An internal state variable */
110 enum reader_state state;
112 int node_type;
113 int current_sheet;
115 int row;
116 int col;
118 int min_col;
122 static void
123 state_data_destroy (struct state_data *sd)
125 xmlFreeTextReader (sd->xtr);
129 struct gnumeric_reader
131 struct spreadsheet spreadsheet;
133 struct state_data rsd;
134 struct state_data msd;
136 const xmlChar *target_sheet_name;
137 int target_sheet_index;
139 enum gnm_value_type vtype;
141 /* The total number of sheets in the "workbook" */
142 int n_sheets;
144 struct hmap cache;
147 /* A value to be kept in the hash table for cache purposes. */
148 struct cache_datum
150 struct hmap_node node;
152 /* The cell's row. */
153 int row;
155 /* The cell's column. */
156 int col;
158 /* The value of the cell. */
159 char *value;
162 static void
163 gnumeric_destroy (struct spreadsheet *s)
165 struct gnumeric_reader *r = (struct gnumeric_reader *) s;
167 int i;
169 for (i = 0; i < r->n_sheets; ++i)
171 xmlFree (r->spreadsheet.sheets[i].name);
174 if (s->dict)
175 dict_unref (s->dict);
177 free (r->spreadsheet.sheets);
178 state_data_destroy (&r->msd);
180 free (s->file_name);
182 struct cache_datum *cell;
183 struct cache_datum *next;
184 HMAP_FOR_EACH_SAFE (cell, next, struct cache_datum, node, &r->cache)
186 free (cell->value);
187 free (cell);
190 hmap_destroy (&r->cache);
192 free (r);
196 static const char *
197 gnumeric_get_sheet_name (struct spreadsheet *s, int n)
199 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
200 assert (n < gr->n_sheets);
202 return gr->spreadsheet.sheets[n].name;
206 static void process_node (struct gnumeric_reader *r, struct state_data *sd);
209 static int
210 gnumeric_get_sheet_n_sheets (struct spreadsheet *s)
212 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
214 int ret;
215 while (1 == (ret = xmlTextReaderRead (gr->msd.xtr)))
217 process_node (gr, &gr->msd);
220 return gr->n_sheets;
224 static char *
225 gnumeric_get_sheet_range (struct spreadsheet *s, int n)
227 int ret;
228 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
230 while ((gr->spreadsheet.sheets[n].last_col == -1)
232 (1 == (ret = xmlTextReaderRead (gr->msd.xtr))))
234 process_node (gr, &gr->msd);
237 assert (n < gr->n_sheets);
238 return create_cell_range (
239 gr->spreadsheet.sheets[n].first_col,
240 gr->spreadsheet.sheets[n].first_row,
241 gr->spreadsheet.sheets[n].last_col,
242 gr->spreadsheet.sheets[n].last_row);
246 static unsigned int
247 gnumeric_get_sheet_n_rows (struct spreadsheet *s, int n)
249 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
251 while ((gr->spreadsheet.sheets[n].last_col == -1)
253 (1 == xmlTextReaderRead (gr->msd.xtr)))
255 process_node (gr, &gr->msd);
258 assert (n < gr->n_sheets);
259 return gr->spreadsheet.sheets[n].last_row + 1;
262 static unsigned int
263 gnumeric_get_sheet_n_columns (struct spreadsheet *s, int n)
265 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
267 while ((gr->spreadsheet.sheets[n].last_col == -1)
269 (1 == xmlTextReaderRead (gr->msd.xtr)))
271 process_node (gr, &gr->msd);
274 assert (n < gr->n_sheets);
275 return gr->spreadsheet.sheets[n].last_col + 1;
278 static struct gnumeric_reader *
279 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors);
282 static char *
283 gnumeric_get_sheet_cell (struct spreadsheet *s, int n, int row, int column)
285 struct gnumeric_reader *gr = (struct gnumeric_reader *) s;
287 /* See if this cell is in the cache. If it is, then use it. */
288 if (use_cache)
290 struct cache_datum *lookup = NULL;
291 unsigned int hash = hash_int (row, 0);
292 hash = hash_int (column, hash);
294 HMAP_FOR_EACH_WITH_HASH (lookup, struct cache_datum, node, hash,
295 &gr->cache)
297 if (lookup->row == row && lookup->col == column)
299 break;
302 if (lookup)
304 return strdup (lookup->value);
308 struct state_data sd;
310 sd.state = STATE_PRE_INIT;
311 sd.current_sheet = -1;
312 sd.row = -1;
313 sd.col = -1;
314 sd.min_col = 0;
315 sd.gz = gzopen (s->file_name, "r");
317 sd.xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
318 (xmlInputCloseCallback) gzclose,
319 sd.gz,
320 NULL, NULL,
324 gr->target_sheet_name = NULL;
326 int current_row = -1;
327 int current_col = -1;
329 /* Spool to the target cell, caching values of cells as they are encountered. */
330 for (int ret = 1; ret; )
332 while ((ret = xmlTextReaderRead (sd.xtr)))
334 process_node (gr, &sd);
335 if (sd.state == STATE_CELL)
337 if (sd.current_sheet == n)
339 current_row = sd.row;
340 current_col = sd.col;
341 break;
345 if (current_row >= row && current_col >= column - 1)
346 break;
348 while ((ret = xmlTextReaderRead (sd.xtr)))
350 process_node (gr, &sd);
351 if (sd.node_type == XML_READER_TYPE_TEXT)
352 break;
355 if (use_cache)
357 /* See if this cell has already been cached ... */
358 unsigned int hash = hash_int (current_row, 0);
359 hash = hash_int (current_col, hash);
360 struct cache_datum *probe = NULL;
361 HMAP_FOR_EACH_WITH_HASH (probe, struct cache_datum, node, hash,
362 &gr->cache)
364 if (probe->row == current_row && probe->col == current_col)
365 break;
367 /* If not, then cache it. */
368 if (!probe)
370 char *str = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
371 struct cache_datum *cell_data = XMALLOC (struct cache_datum);
372 cell_data->row = current_row;
373 cell_data->col = current_col;
374 cell_data->value = str;
375 hmap_insert (&gr->cache, &cell_data->node, hash);
380 while (xmlTextReaderRead (sd.xtr))
382 process_node (gr, &sd);
383 if (sd.state == STATE_CELL && sd.node_type == XML_READER_TYPE_TEXT)
385 if (sd.current_sheet == n)
387 if (row == sd.row && column == sd.col)
388 break;
393 char *cell_content = CHAR_CAST (char *, xmlTextReaderValue (sd.xtr));
394 xmlFreeTextReader (sd.xtr);
395 return cell_content;
399 static void
400 gnm_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
402 struct gnumeric_reader *r = r_;
404 if (r == NULL)
405 return ;
407 state_data_destroy (&r->rsd);
409 if (r->spreadsheet.first_case && ! r->spreadsheet.used_first_case)
410 case_unref (r->spreadsheet.first_case);
412 if (r->spreadsheet.proto)
413 caseproto_unref (r->spreadsheet.proto);
415 spreadsheet_unref (&r->spreadsheet);
419 static void
420 process_node (struct gnumeric_reader *r, struct state_data *sd)
422 xmlChar *name = xmlTextReaderName (sd->xtr);
423 if (name == NULL)
424 name = xmlStrdup (_xml ("--"));
426 sd->node_type = xmlTextReaderNodeType (sd->xtr);
428 switch (sd->state)
430 case STATE_PRE_INIT:
431 sd->current_sheet = -1;
432 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
433 XML_READER_TYPE_ELEMENT == sd->node_type)
435 sd->state = STATE_SHEET_COUNT;
437 break;
439 case STATE_SHEET_COUNT:
440 if (0 == xmlStrcasecmp (name, _xml("gnm:SheetName")) &&
441 XML_READER_TYPE_ELEMENT == sd->node_type)
443 ++sd->current_sheet;
444 if (sd->current_sheet + 1 > r->n_sheets)
446 struct sheet_detail *detail ;
447 r->spreadsheet.sheets = xrealloc (r->spreadsheet.sheets, (sd->current_sheet + 1) * sizeof *r->spreadsheet.sheets);
448 detail = &r->spreadsheet.sheets[sd->current_sheet];
449 detail->first_col = detail->last_col = detail->first_row = detail->last_row = -1;
450 detail->name = NULL;
451 r->n_sheets = sd->current_sheet + 1;
454 else if (0 == xmlStrcasecmp (name, _xml("gnm:SheetNameIndex")) &&
455 XML_READER_TYPE_END_ELEMENT == sd->node_type)
457 sd->state = STATE_INIT;
458 sd->current_sheet = -1;
460 else if (XML_READER_TYPE_TEXT == sd->node_type)
462 if (r->spreadsheet.sheets [r->n_sheets - 1].name == NULL)
463 r->spreadsheet.sheets [r->n_sheets - 1].name =
464 CHAR_CAST (char *, xmlTextReaderValue (sd->xtr));
466 break;
468 case STATE_INIT:
469 if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
470 XML_READER_TYPE_ELEMENT == sd->node_type)
472 ++sd->current_sheet;
473 sd->state = STATE_SHEET_START;
475 break;
476 case STATE_SHEET_START:
477 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
478 XML_READER_TYPE_ELEMENT == sd->node_type)
480 sd->state = STATE_SHEET_NAME;
482 break;
483 case STATE_SHEET_NAME:
484 if (0 == xmlStrcasecmp (name, _xml("gnm:Name")) &&
485 XML_READER_TYPE_END_ELEMENT == sd->node_type)
487 sd->state = STATE_INIT;
489 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
490 XML_READER_TYPE_END_ELEMENT == sd->node_type)
492 sd->state = STATE_INIT;
494 else if (XML_READER_TYPE_TEXT == sd->node_type)
496 if (r->target_sheet_name != NULL)
498 xmlChar *value = xmlTextReaderValue (sd->xtr);
499 if (0 == xmlStrcmp (value, r->target_sheet_name))
500 sd->state = STATE_SHEET_FOUND;
501 free (value);
503 else if (r->target_sheet_index == sd->current_sheet + 1)
505 sd->state = STATE_SHEET_FOUND;
507 else if (r->target_sheet_index == -1)
509 sd->state = STATE_SHEET_FOUND;
512 break;
513 case STATE_SHEET_FOUND:
514 if (0 == xmlStrcasecmp (name, _xml("gnm:Cells")) &&
515 XML_READER_TYPE_ELEMENT == sd->node_type)
517 sd->min_col = INT_MAX;
518 if (! xmlTextReaderIsEmptyElement (sd->xtr))
519 sd->state = STATE_CELLS_START;
521 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
522 XML_READER_TYPE_ELEMENT == sd->node_type)
524 sd->state = STATE_MAXROW;
526 else if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
527 XML_READER_TYPE_ELEMENT == sd->node_type)
529 sd->state = STATE_MAXCOL;
531 else if (0 == xmlStrcasecmp (name, _xml("gnm:Sheet")) &&
532 XML_READER_TYPE_END_ELEMENT == sd->node_type)
534 sd->state = STATE_INIT;
536 break;
537 case STATE_MAXROW:
538 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxRow")) &&
539 XML_READER_TYPE_END_ELEMENT == sd->node_type)
541 sd->state = STATE_SHEET_FOUND;
543 else if (sd->node_type == XML_READER_TYPE_TEXT)
545 xmlChar *value = xmlTextReaderValue (sd->xtr);
546 xmlFree (value);
548 break;
549 case STATE_MAXCOL:
550 if (0 == xmlStrcasecmp (name, _xml("gnm:MaxCol")) &&
551 XML_READER_TYPE_END_ELEMENT == sd->node_type)
553 sd->state = STATE_SHEET_FOUND;
555 else if (sd->node_type == XML_READER_TYPE_TEXT)
557 xmlChar *value = xmlTextReaderValue (sd->xtr);
558 xmlFree (value);
560 break;
561 case STATE_CELLS_START:
562 if (0 == xmlStrcasecmp (name, _xml ("gnm:Cell")) &&
563 XML_READER_TYPE_ELEMENT == sd->node_type)
565 xmlChar *attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Col"));
566 sd->col = _xmlchar_to_int (attr);
567 free (attr);
569 if (sd->col < sd->min_col)
570 sd->min_col = sd->col;
572 attr = xmlTextReaderGetAttribute (sd->xtr, _xml ("Row"));
573 sd->row = _xmlchar_to_int (attr);
574 free (attr);
576 if (r->spreadsheet.sheets[sd->current_sheet].first_row == -1)
578 r->spreadsheet.sheets[sd->current_sheet].first_row = sd->row;
581 if (r->spreadsheet.sheets[sd->current_sheet].first_col == -1)
583 r->spreadsheet.sheets[sd->current_sheet].first_col = sd->col;
585 if (! xmlTextReaderIsEmptyElement (sd->xtr))
586 sd->state = STATE_CELL;
588 else if ((0 == xmlStrcasecmp (name, _xml("gnm:Cells")))
589 && (XML_READER_TYPE_END_ELEMENT == sd->node_type))
591 r->spreadsheet.sheets[sd->current_sheet].last_col = sd->col;
592 r->spreadsheet.sheets[sd->current_sheet].last_row = sd->row;
593 sd->state = STATE_SHEET_NAME;
595 break;
596 case STATE_CELL:
597 if (0 == xmlStrcasecmp (name, _xml("gnm:Cell"))
598 && XML_READER_TYPE_END_ELEMENT == sd->node_type)
600 sd->state = STATE_CELLS_START;
602 break;
603 default:
604 break;
607 xmlFree (name);
612 Sets the VAR of case C, to the value corresponding to the xml string XV
614 static void
615 convert_xml_string_to_value (struct ccase *c, const struct variable *var,
616 const xmlChar *xv, enum gnm_value_type type, int col, int row)
618 union value *v = case_data_rw (c, var);
620 if (xv == NULL)
621 value_set_missing (v, var_get_width (var));
622 else if (var_is_alpha (var))
623 value_copy_str_rpad (v, var_get_width (var), xv, ' ');
624 else if (type == VALUE_FLOAT || type == VALUE_INTEGER)
626 const char *text = CHAR_CAST (const char *, xv);
627 char *endptr;
629 errno = 0;
630 v->f = c_strtod (text, &endptr);
631 if (errno != 0 || endptr == text)
632 v->f = SYSMIS;
634 else
636 const char *text = CHAR_CAST (const char *, xv);
638 const struct fmt_spec *fmt = var_get_write_format (var);
640 char *m = data_in (ss_cstr (text), "UTF-8", fmt->type,
641 settings_get_fmt_settings (), v, var_get_width (var),
642 "UTF-8");
644 if (m)
646 char buf [FMT_STRING_LEN_MAX + 1];
647 char *cell = create_cell_ref (col, row);
649 msg (MW, _("Cannot convert the value in the spreadsheet cell %s to format (%s): %s"),
650 cell, fmt_to_string (fmt, buf), m);
651 free (cell);
653 free (m);
657 struct var_spec
659 char *name;
660 int width;
661 xmlChar *first_value;
662 int first_type;
666 static void
667 gnumeric_error_handler (void *ctx, const char *mesg,
668 xmlParserSeverities sev UNUSED,
669 xmlTextReaderLocatorPtr loc)
671 struct gnumeric_reader *r = ctx;
673 msg (MW, _("There was a problem whilst reading the %s file `%s' (near line %d): `%s'"),
674 "Gnumeric",
675 r->spreadsheet.file_name,
676 xmlTextReaderLocatorLineNumber (loc),
677 mesg);
680 static struct casereader *
681 gnumeric_make_reader (struct spreadsheet *spreadsheet,
682 const struct spreadsheet_read_options *opts)
684 int type = 0;
685 int x = 0;
686 struct gnumeric_reader *r = NULL;
687 unsigned long int vstart = 0;
688 int ret;
689 casenumber n_cases = CASENUMBER_MAX;
690 int i;
691 struct var_spec *var_spec = NULL;
692 int n_var_specs = 0;
694 r = (struct gnumeric_reader *) (spreadsheet);
696 r = gnumeric_reopen (r, NULL, true);
698 if (opts->cell_range)
700 if (! convert_cell_ref (opts->cell_range,
701 &r->spreadsheet.start_col, &r->spreadsheet.start_row,
702 &r->spreadsheet.stop_col, &r->spreadsheet.stop_row))
704 msg (SE, _("Invalid cell range `%s'"),
705 opts->cell_range);
706 goto error;
709 else
711 r->spreadsheet.start_col = -1;
712 r->spreadsheet.start_row = 0;
713 r->spreadsheet.stop_col = -1;
714 r->spreadsheet.stop_row = -1;
717 r->target_sheet_name = BAD_CAST opts->sheet_name;
718 r->target_sheet_index = opts->sheet_index;
719 r->rsd.row = r->rsd.col = -1;
720 r->rsd.current_sheet = -1;
721 r->spreadsheet.first_case = NULL;
722 r->spreadsheet.proto = NULL;
724 /* Advance to the start of the cells for the target sheet */
725 while ((r->rsd.state != STATE_CELL || r->rsd.row < r->spreadsheet.start_row)
726 && 1 == (ret = xmlTextReaderRead (r->rsd.xtr)))
728 xmlChar *value ;
729 process_node (r, &r->rsd);
730 value = xmlTextReaderValue (r->rsd.xtr);
732 if (r->rsd.state == STATE_MAXROW && r->rsd.node_type == XML_READER_TYPE_TEXT)
734 n_cases = 1 + _xmlchar_to_int (value) ;
736 free (value);
739 /* If a range has been given, then use that to calculate the number
740 of cases */
741 if (opts->cell_range)
743 n_cases = MIN (n_cases, r->spreadsheet.stop_row - r->spreadsheet.start_row + 1);
746 if (opts->read_names)
748 r->spreadsheet.start_row++;
749 n_cases --;
753 /* Read in the first row of cells,
754 including the headers if read_names was set */
755 while (
756 ((r->rsd.state == STATE_CELLS_START && r->rsd.row <= r->spreadsheet.start_row) || r->rsd.state == STATE_CELL)
757 && (ret = xmlTextReaderRead (r->rsd.xtr))
760 int idx;
762 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_TEXT)
764 xmlChar *attr =
765 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
767 type = _xmlchar_to_int (attr);
769 xmlFree (attr);
772 process_node (r, &r->rsd);
774 if (r->rsd.row > r->spreadsheet.start_row)
776 xmlChar *attr =
777 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
779 r->vtype = _xmlchar_to_int (attr);
781 xmlFree (attr);
782 break;
785 if (r->rsd.col < r->spreadsheet.start_col ||
786 (r->spreadsheet.stop_col != -1 && r->rsd.col > r->spreadsheet.stop_col))
787 continue;
789 idx = r->rsd.col - r->spreadsheet.start_col;
791 if (idx >= n_var_specs)
793 int i;
794 var_spec = xrealloc (var_spec, sizeof (*var_spec) * (idx + 1));
795 for (i = n_var_specs; i <= idx; ++i)
797 var_spec [i].name = NULL;
798 var_spec [i].width = -1;
799 var_spec [i].first_value = NULL;
800 var_spec [i].first_type = -1;
802 n_var_specs = idx + 1 ;
805 var_spec [idx].first_type = type;
807 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
809 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
810 const char *text = CHAR_CAST (const char *, value);
812 if (r->rsd.row < r->spreadsheet.start_row)
814 if (opts->read_names)
816 var_spec [idx].name = xstrdup (text);
819 else
821 var_spec [idx].first_value = xmlStrdup (value);
823 if (-1 == var_spec [idx].width)
824 var_spec [idx].width = (opts->asw == -1) ?
825 ROUND_UP (strlen(text), SPREADSHEET_DEFAULT_WIDTH) : opts->asw;
828 free (value);
830 else if (r->rsd.node_type == XML_READER_TYPE_ELEMENT
831 && r->rsd.state == STATE_CELL)
833 if (r->rsd.row == r->spreadsheet.start_row)
835 xmlChar *attr =
836 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
838 if (NULL == attr || VALUE_STRING != _xmlchar_to_int (attr))
839 var_spec [idx].width = 0;
841 free (attr);
847 const xmlChar *enc = xmlTextReaderConstEncoding (r->rsd.xtr);
848 if (enc == NULL)
849 goto error;
850 /* Create the dictionary and populate it */
851 spreadsheet->dict = dict_create (CHAR_CAST (const char *, enc));
854 for (i = 0 ; i < n_var_specs ; ++i)
856 char *name;
858 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
859 continue;
861 /* Probably no data exists for this variable, so allocate a
862 default width */
863 if (var_spec[i].width == -1)
864 var_spec[i].width = SPREADSHEET_DEFAULT_WIDTH;
866 name = dict_make_unique_var_name (r->spreadsheet.dict, var_spec[i].name, &vstart);
867 dict_create_var (r->spreadsheet.dict, name, var_spec[i].width);
868 free (name);
871 /* Create the first case, and cache it */
872 r->spreadsheet.used_first_case = false;
874 if (n_var_specs == 0)
876 msg (MW, _("Selected sheet or range of spreadsheet `%s' is empty."),
877 spreadsheet->file_name);
878 goto error;
881 r->spreadsheet.proto = caseproto_ref (dict_get_proto (r->spreadsheet.dict));
882 r->spreadsheet.first_case = case_create (r->spreadsheet.proto);
883 case_set_missing (r->spreadsheet.first_case);
886 for (i = 0 ; i < n_var_specs ; ++i)
888 const struct variable *var;
890 if ((var_spec[i].name == NULL) && (var_spec[i].first_value == NULL))
891 continue;
893 var = dict_get_var (r->spreadsheet.dict, x++);
895 convert_xml_string_to_value (r->spreadsheet.first_case, var,
896 var_spec[i].first_value,
897 var_spec[i].first_type,
898 r->rsd.col + i - 1,
899 r->rsd.row - 1);
902 for (i = 0 ; i < n_var_specs ; ++i)
904 free (var_spec[i].first_value);
905 free (var_spec[i].name);
908 free (var_spec);
911 return casereader_create_sequential
912 (NULL,
913 r->spreadsheet.proto,
914 n_cases,
915 &gnm_file_casereader_class, r);
918 error:
919 for (i = 0 ; i < n_var_specs ; ++i)
921 free (var_spec[i].first_value);
922 free (var_spec[i].name);
925 free (var_spec);
927 gnm_file_casereader_destroy (NULL, r);
929 return NULL;
933 /* Reads and returns one case from READER's file. Returns a null
934 pointer on failure. */
935 static struct ccase *
936 gnm_file_casereader_read (struct casereader *reader UNUSED, void *r_)
938 struct ccase *c;
939 int ret = 0;
941 struct gnumeric_reader *r = r_;
942 int current_row = r->rsd.row;
944 if (!r->spreadsheet.used_first_case)
946 r->spreadsheet.used_first_case = true;
947 return r->spreadsheet.first_case;
950 c = case_create (r->spreadsheet.proto);
951 case_set_missing (c);
953 if (r->spreadsheet.start_col == -1)
954 r->spreadsheet.start_col = r->rsd.min_col;
957 while ((r->rsd.state == STATE_CELL || r->rsd.state == STATE_CELLS_START)
958 && r->rsd.row == current_row && (ret = xmlTextReaderRead (r->rsd.xtr)))
960 process_node (r, &r->rsd);
962 if (r->rsd.state == STATE_CELL && r->rsd.node_type == XML_READER_TYPE_ELEMENT)
964 xmlChar *attr =
965 xmlTextReaderGetAttribute (r->rsd.xtr, _xml ("ValueType"));
967 r->vtype = _xmlchar_to_int (attr);
969 xmlFree (attr);
972 if (r->rsd.col < r->spreadsheet.start_col || (r->spreadsheet.stop_col != -1 &&
973 r->rsd.col > r->spreadsheet.stop_col))
974 continue;
976 if (r->rsd.col - r->spreadsheet.start_col >= caseproto_get_n_widths (r->spreadsheet.proto))
977 continue;
979 if (r->spreadsheet.stop_row != -1 && r->rsd.row > r->spreadsheet.stop_row)
980 break;
983 if (r->rsd.node_type == XML_READER_TYPE_TEXT)
985 xmlChar *value = xmlTextReaderValue (r->rsd.xtr);
986 const int idx = r->rsd.col - r->spreadsheet.start_col;
987 const struct variable *var = dict_get_var (r->spreadsheet.dict, idx);
989 convert_xml_string_to_value (c, var, value, r->vtype,
990 r->rsd.col, r->rsd.row);
992 xmlFree (value);
996 if (ret == 1)
997 return c;
998 else
1000 case_unref (c);
1001 return NULL;
1005 static struct gnumeric_reader *
1006 gnumeric_reopen (struct gnumeric_reader *r, const char *filename, bool show_errors)
1008 int ret = -1;
1009 struct state_data *sd;
1011 xmlTextReaderPtr xtr;
1012 gzFile gz;
1014 assert (r == NULL || filename == NULL);
1016 if (filename)
1018 gz = gzopen (filename, "r");
1020 else
1022 gz = gzopen (r->spreadsheet.file_name, "r");
1025 if (NULL == gz)
1026 return NULL;
1028 if (r == NULL)
1030 r = xzalloc (sizeof *r);
1031 r->n_sheets = -1;
1032 r->spreadsheet.file_name = strdup (filename);
1033 struct spreadsheet *s = SPREADSHEET_CAST (r);
1034 strcpy (s->type, "GNM");
1035 s->destroy = gnumeric_destroy;
1036 s->make_reader = gnumeric_make_reader;
1037 s->get_sheet_name = gnumeric_get_sheet_name;
1038 s->get_sheet_range = gnumeric_get_sheet_range;
1039 s->get_sheet_n_sheets = gnumeric_get_sheet_n_sheets;
1040 s->get_sheet_n_rows = gnumeric_get_sheet_n_rows;
1041 s->get_sheet_n_columns = gnumeric_get_sheet_n_columns;
1042 s->get_sheet_cell = gnumeric_get_sheet_cell;
1044 sd = &r->msd;
1045 hmap_init (&r->cache);
1047 else
1049 sd = &r->rsd;
1051 sd->gz = gz;
1053 r = (struct gnumeric_reader *) spreadsheet_ref (SPREADSHEET_CAST (r));
1056 xtr = xmlReaderForIO ((xmlInputReadCallback) gzread,
1057 (xmlInputCloseCallback) gzclose, gz,
1058 NULL, NULL,
1059 show_errors ? 0 : (XML_PARSE_NOERROR | XML_PARSE_NOWARNING));
1061 if (xtr == NULL)
1063 gzclose (gz);
1064 free (r);
1065 return NULL;
1068 if (show_errors)
1069 xmlTextReaderSetErrorHandler (xtr, gnumeric_error_handler, r);
1071 sd->row = sd->col = -1;
1072 sd->state = STATE_PRE_INIT;
1073 sd->xtr = xtr;
1076 r->target_sheet_name = NULL;
1077 r->target_sheet_index = -1;
1080 /* Advance to the start of the workbook.
1081 This gives us some confidence that we are actually dealing with a gnumeric
1082 spreadsheet.
1084 while ((sd->state != STATE_INIT)
1085 && 1 == (ret = xmlTextReaderRead (sd->xtr)))
1087 process_node (r, sd);
1090 if (ret != 1)
1092 /* Does not seem to be a gnumeric file */
1093 spreadsheet_unref (&r->spreadsheet);
1094 return NULL;
1097 if (show_errors)
1099 const xmlChar *enc = xmlTextReaderConstEncoding (sd->xtr);
1100 xmlCharEncoding xce = xmlParseCharEncoding (CHAR_CAST (const char *, enc));
1102 if (XML_CHAR_ENCODING_UTF8 != xce)
1104 /* I have been told that ALL gnumeric files are UTF8 encoded. If that is correct, this
1105 can never happen. */
1106 msg (MW, _("The gnumeric file `%s' is encoded as %s instead of the usual UTF-8 encoding. "
1107 "Any non-ascii characters will be incorrectly imported."),
1108 r->spreadsheet.file_name,
1109 enc);
1113 return r;
1117 struct spreadsheet *
1118 gnumeric_probe (const char *filename, bool report_errors)
1120 struct gnumeric_reader *r = gnumeric_reopen (NULL, filename, report_errors);
1122 return &r->spreadsheet;