sys-file-reader: Fully verify multiple response set names.
[pspp.git] / src / data / sys-file-reader.c
blobdbc61345a85869eda4c0d1eb7f0f0aa04fe4c9ce
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "data/sys-file-private.h"
21 #include <errno.h>
22 #include <float.h>
23 #include <inttypes.h>
24 #include <stdlib.h>
25 #include <sys/stat.h>
26 #include <zlib.h>
28 #include "data/any-reader.h"
29 #include "data/attributes.h"
30 #include "data/case.h"
31 #include "data/casereader-provider.h"
32 #include "data/casereader.h"
33 #include "data/dictionary.h"
34 #include "data/file-handle-def.h"
35 #include "data/file-name.h"
36 #include "data/format.h"
37 #include "data/identifier.h"
38 #include "data/missing-values.h"
39 #include "data/mrset.h"
40 #include "data/short-names.h"
41 #include "data/value-labels.h"
42 #include "data/value.h"
43 #include "data/variable.h"
44 #include "libpspp/array.h"
45 #include "libpspp/assertion.h"
46 #include "libpspp/compiler.h"
47 #include "libpspp/i18n.h"
48 #include "libpspp/ll.h"
49 #include "libpspp/message.h"
50 #include "libpspp/misc.h"
51 #include "libpspp/pool.h"
52 #include "libpspp/str.h"
53 #include "libpspp/stringi-set.h"
55 #include "gl/c-strtod.h"
56 #include "gl/c-ctype.h"
57 #include "gl/inttostr.h"
58 #include "gl/localcharset.h"
59 #include "gl/minmax.h"
60 #include "gl/unlocked-io.h"
61 #include "gl/xalloc.h"
62 #include "gl/xalloc-oversized.h"
63 #include "gl/xsize.h"
65 #include "gettext.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) (msgid)
/* Subtypes of extension records (record type 7).  A subtype is also used as
   the index of its record in the 'extensions' array of struct sfm_reader.
   Subtype values that have no enumerator here are unknown. */
enum
  {
    /* Subtypes 0, 1, and 2 are unknown. */
    EXT_INTEGER = 3,            /* Machine integer info. */
    EXT_FLOAT = 4,              /* Machine floating-point info. */
    EXT_VAR_SETS = 5,           /* Variable sets. */
    EXT_DATE = 6,               /* DATE. */
    EXT_MRSETS = 7,             /* Multiple response sets. */
    EXT_DATA_ENTRY = 8,         /* SPSS Data Entry. */
    /* Subtype 9 is unknown. */
    EXT_PRODUCT_INFO = 10,      /* Extra product info text. */
    EXT_DISPLAY = 11,           /* Variable display parameters. */
    /* Subtype 12 is unknown. */
    EXT_LONG_NAMES = 13,        /* Long variable names. */
    EXT_LONG_STRINGS = 14,      /* Long strings. */
    /* Subtype 15 is unknown. */
    EXT_NCASES = 16,            /* Extended number of cases. */
    EXT_FILE_ATTRS = 17,        /* Data file attributes. */
    EXT_VAR_ATTRS = 18,         /* Variable attributes. */
    EXT_MRSETS2 = 19,           /* Multiple response sets (extended). */
    EXT_ENCODING = 20,          /* Character encoding. */
    EXT_LONG_LABELS = 21,       /* Value labels for long strings. */
    EXT_LONG_MISSING = 22,      /* Missing values for long strings. */
    EXT_DATAVIEW = 24           /* "Format properties in dataview table". */
  };
/* Contents of the top-level header record, as read from the file. */
struct sfm_header_record
  {
    char magic[5];              /* First 4 bytes of file, null-terminated. */
    int weight_idx;             /* Var index of weight, or 0 if unweighted. */
    int nominal_case_size;      /* Number of var positions. */

    /* The strings below hold the same information as members of struct
       any_file_info or of a dictionary, but they are kept in the system
       file's own encoding rather than ASCII. */
    char creation_date[10];     /* "dd mmm yy". */
    char creation_time[9];      /* "hh:mm:ss". */
    char eye_catcher[61];       /* Eye-catcher string, then product name. */
    char file_label[65];        /* File label. */
  };
/* One variable record, as read from the file. */
struct sfm_var_record
  {
    off_t pos;                  /* File offset associated with the record. */
    int width;                  /* Width; records with width -1 are skipped
                                   when variables are enumerated. */
    char name[9];               /* Name, null-terminated. */
    int print_format;           /* Raw print format. */
    int write_format;           /* Raw write format. */
    int missing_value_code;     /* Raw missing value code. */
    uint8_t missing[24];        /* Raw missing value data. */
    char *label;                /* Variable label, or null if none. */
    struct variable *var;       /* Corresponding dictionary variable. */
  };
/* A single value/label pair from a value label record. */
struct sfm_value_label
  {
    uint8_t value[8];           /* Raw 8-byte value. */
    char *label;                /* Label text for the value. */
  };
/* A value label record: a set of labels plus the variables they apply to. */
struct sfm_value_label_record
  {
    off_t pos;                          /* File offset of the record. */

    struct sfm_value_label *labels;     /* The value/label pairs. */
    unsigned int n_labels;              /* Number of elements in 'labels'. */

    int *vars;                          /* Indexes of the variables. */
    unsigned int n_vars;                /* Number of elements in 'vars'. */
  };
/* The document record. */
struct sfm_document_record
  {
    off_t pos;                  /* File offset of the record. */
    char *documents;            /* Document text, 80 bytes per line. */
    size_t n_lines;             /* Number of 80-byte lines in 'documents'. */
  };
146 struct sfm_mrset
148 const char *name; /* Name. */
149 const char *label; /* Human-readable label for group. */
150 enum mrset_type type; /* Group type. */
151 const char **vars; /* Constituent variables' names. */
152 size_t n_vars; /* Number of constituent variables. */
154 /* MRSET_MD only. */
155 enum mrset_md_cat_source cat_source; /* Source of category labels. */
156 bool label_from_var_label; /* 'label' taken from variable label? */
157 const char *counted; /* Counted value, as string. */
160 struct sfm_extension_record
162 struct ll ll; /* In struct sfm_reader 'var_attrs' list. */
163 int subtype; /* Record subtype. */
164 off_t pos; /* Starting offset in file. */
165 unsigned int size; /* Size of data elements. */
166 unsigned int count; /* Number of data elements. */
167 void *data; /* Contents. */
170 /* System file reader. */
171 struct sfm_reader
173 struct any_reader any_reader;
175 /* Resource tracking. */
176 struct pool *pool; /* All system file state. */
178 /* File data. */
179 struct any_read_info info;
180 struct sfm_header_record header;
181 struct sfm_var_record *vars;
182 size_t n_vars;
183 struct sfm_value_label_record *labels;
184 size_t n_labels;
185 struct sfm_document_record *document;
186 struct sfm_mrset *mrsets;
187 size_t n_mrsets;
188 struct sfm_extension_record *extensions[32];
189 struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */
191 /* File state. */
192 struct file_handle *fh; /* File handle. */
193 struct fh_lock *lock; /* Mutual exclusion for file handle. */
194 FILE *file; /* File stream. */
195 off_t pos; /* Position in file. */
196 bool error; /* I/O or corruption error? */
197 struct caseproto *proto; /* Format of output cases. */
199 /* File format. */
200 enum integer_format integer_format; /* On-disk integer format. */
201 enum float_format float_format; /* On-disk floating point format. */
202 struct sfm_var *sfm_vars; /* Variables. */
203 size_t sfm_var_cnt; /* Number of variables. */
204 int case_cnt; /* Number of cases */
205 const char *encoding; /* String encoding. */
206 bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */
208 /* Decompression. */
209 enum any_compression compression;
210 double bias; /* Compression bias, usually 100.0. */
211 uint8_t opcodes[8]; /* Current block of opcodes. */
212 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
213 bool corruption_warning; /* Warned about possible corruption? */
215 /* ZLIB decompression. */
216 long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
217 #define ZIN_BUF_SIZE 4096
218 uint8_t *zin_buf; /* Inflation input buffer. */
219 #define ZOUT_BUF_SIZE 16384
220 uint8_t *zout_buf; /* Inflation output buffer. */
221 unsigned int zout_end; /* Number of bytes of data in zout_buf. */
222 unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
223 z_stream zstream; /* ZLIB inflater. */
226 static const struct casereader_class sys_file_casereader_class;
228 static struct sfm_reader *
229 sfm_reader_cast (const struct any_reader *r_)
231 assert (r_->klass == &sys_file_reader_class);
232 return UP_CAST (r_, struct sfm_reader, any_reader);
235 static bool sfm_close (struct any_reader *);
237 static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
238 const struct sfm_var_record *,
239 size_t n, int idx);
241 static void sys_msg (struct sfm_reader *r, off_t, int class,
242 const char *format, va_list args)
243 PRINTF_FORMAT (4, 0);
244 static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
245 PRINTF_FORMAT (3, 4);
246 static void sys_error (struct sfm_reader *, off_t, const char *, ...)
247 PRINTF_FORMAT (3, 4);
249 static bool read_bytes (struct sfm_reader *, void *, size_t)
250 WARN_UNUSED_RESULT;
251 static int try_read_bytes (struct sfm_reader *, void *, size_t)
252 WARN_UNUSED_RESULT;
253 static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
254 static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
255 static bool read_int64 (struct sfm_reader *, long long int *)
256 WARN_UNUSED_RESULT;
257 static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
258 WARN_UNUSED_RESULT;
259 static bool read_string (struct sfm_reader *, char *, size_t)
260 WARN_UNUSED_RESULT;
261 static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
263 /* ZLIB compressed data handling. */
264 static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
265 static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
266 static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
267 static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
268 WARN_UNUSED_RESULT;
269 static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
270 WARN_UNUSED_RESULT;
271 static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
272 WARN_UNUSED_RESULT;
273 static bool read_compressed_float (struct sfm_reader *, double *)
274 WARN_UNUSED_RESULT;
276 static char *fix_line_ends (const char *);
278 static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
279 static double parse_float (const struct sfm_reader *,
280 const void *data, size_t ofs);
282 static bool read_variable_record (struct sfm_reader *,
283 struct sfm_var_record *);
284 static bool read_value_label_record (struct sfm_reader *,
285 struct sfm_value_label_record *);
286 static bool read_document_record (struct sfm_reader *);
287 static bool read_extension_record (struct sfm_reader *, int subtype,
288 struct sfm_extension_record **);
289 static bool skip_extension_record (struct sfm_reader *, int subtype);
291 static struct text_record *open_text_record (
292 struct sfm_reader *, const struct sfm_extension_record *,
293 bool recode_to_utf8);
294 static void close_text_record (struct sfm_reader *,
295 struct text_record *);
296 static bool read_variable_to_value_pair (struct sfm_reader *,
297 struct dictionary *,
298 struct text_record *,
299 struct variable **var, char **value);
300 static void text_warn (struct sfm_reader *r, struct text_record *text,
301 const char *format, ...) PRINTF_FORMAT (3, 4);
302 static char *text_get_token (struct text_record *,
303 struct substring delimiters, char *delimiter);
304 static bool text_match (struct text_record *, char c);
305 static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
306 struct text_record *,
307 struct substring delimiters,
308 struct variable **);
309 static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
310 struct text_record *,
311 struct substring delimiters,
312 struct variable **);
313 static const char *text_parse_counted_string (struct sfm_reader *,
314 struct text_record *);
315 static size_t text_pos (const struct text_record *);
316 static const char *text_get_all (const struct text_record *);
318 /* Dictionary reader. */
/* Selects which of a variable's two formats parse_format_spec() is given. */
enum which_format
  {
    PRINT_FORMAT,
    WRITE_FORMAT
  };
/* Reading the raw dictionary records. */
static bool read_dictionary (struct sfm_reader *);
static bool read_record (struct sfm_reader *, int type,
                         size_t *allocated_vars, size_t *allocated_labels);
static bool read_header (struct sfm_reader *, struct any_read_info *,
                         struct sfm_header_record *);

/* Decoding the header, variable, and document records. */
static void parse_header (struct sfm_reader *,
                          const struct sfm_header_record *,
                          struct any_read_info *, struct dictionary *);
static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
                                    struct sfm_var_record *, size_t n);
static void parse_format_spec (struct sfm_reader *, off_t pos,
                               unsigned int format, enum which_format,
                               struct variable *, int *format_warning_cnt);
static void parse_document (struct dictionary *, struct sfm_document_record *);

/* Decoding the extension records. */
static void parse_display_parameters (struct sfm_reader *,
                                      const struct sfm_extension_record *,
                                      struct dictionary *);
static bool parse_machine_integer_info (struct sfm_reader *,
                                        const struct sfm_extension_record *,
                                        struct any_read_info *);
static void parse_machine_float_info (struct sfm_reader *,
                                      const struct sfm_extension_record *);
static void parse_extra_product_info (struct sfm_reader *,
                                      const struct sfm_extension_record *,
                                      struct any_read_info *);
static void parse_mrsets (struct sfm_reader *,
                          const struct sfm_extension_record *,
                          size_t *allocated_mrsets);
static void decode_mrsets (struct sfm_reader *, struct dictionary *);
static void parse_long_var_name_map (struct sfm_reader *,
                                     const struct sfm_extension_record *,
                                     struct dictionary *);
static bool parse_long_string_map (struct sfm_reader *,
                                   const struct sfm_extension_record *,
                                   struct dictionary *);
static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
                                const struct sfm_var_record *,
                                size_t n_var_recs,
                                const struct sfm_value_label_record *);
static void parse_data_file_attributes (struct sfm_reader *,
                                        const struct sfm_extension_record *,
                                        struct dictionary *);
static void parse_variable_attributes (struct sfm_reader *,
                                       const struct sfm_extension_record *,
                                       struct dictionary *);
static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
static void parse_long_string_value_labels (struct sfm_reader *,
                                            const struct sfm_extension_record *,
                                            struct dictionary *);
static void parse_long_string_missing_values (
  struct sfm_reader *, const struct sfm_extension_record *,
  struct dictionary *);
379 /* Frees the strings inside INFO. */
380 void
381 any_read_info_destroy (struct any_read_info *info)
383 if (info)
385 free (info->creation_date);
386 free (info->creation_time);
387 free (info->product);
388 free (info->product_ext);
392 /* Tries to open FH for reading as a system file. Returns an sfm_reader if
393 successful, otherwise NULL. */
394 static struct any_reader *
395 sfm_open (struct file_handle *fh)
397 size_t allocated_mrsets = 0;
398 struct sfm_reader *r;
400 /* Create and initialize reader. */
401 r = xzalloc (sizeof *r);
402 r->any_reader.klass = &sys_file_reader_class;
403 r->pool = pool_create ();
404 pool_register (r->pool, free, r);
405 r->fh = fh_ref (fh);
406 r->opcode_idx = sizeof r->opcodes;
407 ll_init (&r->var_attrs);
409 /* TRANSLATORS: this fragment will be interpolated into
410 messages in fh_lock() that identify types of files. */
411 r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
412 if (r->lock == NULL)
413 goto error;
415 r->file = fn_open (fh, "rb");
416 if (r->file == NULL)
418 msg (ME, _("Error opening `%s' for reading as a system file: %s."),
419 fh_get_file_name (r->fh), strerror (errno));
420 goto error;
423 if (!read_dictionary (r))
424 goto error;
426 if (r->extensions[EXT_MRSETS] != NULL)
427 parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
429 if (r->extensions[EXT_MRSETS2] != NULL)
430 parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
432 return &r->any_reader;
434 error:
435 if (r)
436 sfm_close (&r->any_reader);
437 return NULL;
440 static bool
441 read_dictionary (struct sfm_reader *r)
443 size_t allocated_vars;
444 size_t allocated_labels;
446 if (!read_header (r, &r->info, &r->header))
447 return false;
449 allocated_vars = 0;
450 allocated_labels = 0;
451 for (;;)
453 int type;
455 if (!read_int (r, &type))
456 return false;
457 if (type == 999)
458 break;
459 if (!read_record (r, type, &allocated_vars, &allocated_labels))
460 return false;
463 if (!skip_bytes (r, 4))
464 return false;
466 if (r->compression == ANY_COMP_ZLIB && !read_zheader (r))
467 return false;
469 return true;
472 static bool
473 read_record (struct sfm_reader *r, int type,
474 size_t *allocated_vars, size_t *allocated_labels)
476 int subtype;
478 switch (type)
480 case 2:
481 if (r->n_vars >= *allocated_vars)
482 r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
483 sizeof *r->vars);
484 return read_variable_record (r, &r->vars[r->n_vars++]);
486 case 3:
487 if (r->n_labels >= *allocated_labels)
488 r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
489 sizeof *r->labels);
490 return read_value_label_record (r, &r->labels[r->n_labels++]);
492 case 4:
493 /* A Type 4 record is always immediately after a type 3 record,
494 so the code for type 3 records reads the type 4 record too. */
495 sys_error (r, r->pos, _("Misplaced type 4 record."));
496 return false;
498 case 6:
499 if (r->document != NULL)
501 sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
502 return false;
504 return read_document_record (r);
506 case 7:
507 if (!read_int (r, &subtype))
508 return false;
509 else if (subtype < 0
510 || subtype >= sizeof r->extensions / sizeof *r->extensions)
512 sys_warn (r, r->pos,
513 _("Unrecognized record type 7, subtype %d. For help, "
514 "please send this file to %s and mention that you were "
515 "using %s."),
516 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
517 return skip_extension_record (r, subtype);
519 else if (subtype == 18)
521 /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin"
522 put each variable attribute into a separate record with subtype
523 18. I'm surprised that SPSS puts up with this. */
524 struct sfm_extension_record *ext;
525 bool ok = read_extension_record (r, subtype, &ext);
526 if (ok && ext)
527 ll_push_tail (&r->var_attrs, &ext->ll);
528 return ok;
530 else if (r->extensions[subtype] != NULL)
532 sys_warn (r, r->pos,
533 _("Record type 7, subtype %d found here has the same "
534 "type as the record found near offset 0x%llx. For "
535 "help, please send this file to %s and mention that "
536 "you were using %s."),
537 subtype, (long long int) r->extensions[subtype]->pos,
538 PACKAGE_BUGREPORT, PACKAGE_STRING);
539 return skip_extension_record (r, subtype);
541 else
542 return read_extension_record (r, subtype, &r->extensions[subtype]);
544 default:
545 sys_error (r, r->pos, _("Unrecognized record type %d."), type);
546 return false;
549 NOT_REACHED ();
552 /* Returns the character encoding obtained from R, or a null pointer if R
553 doesn't have an indication of its character encoding. */
554 static const char *
555 sfm_get_encoding (const struct sfm_reader *r)
557 /* The EXT_ENCODING record is the best way to determine dictionary
558 encoding. */
559 if (r->extensions[EXT_ENCODING])
560 return r->extensions[EXT_ENCODING]->data;
562 /* But EXT_INTEGER is better than nothing as a fallback. */
563 if (r->extensions[EXT_INTEGER])
565 int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4);
566 const char *encoding;
568 switch (codepage)
570 case 1:
571 return "EBCDIC-US";
573 case 2:
574 case 3:
575 /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
576 respectively. However, many files have character code 2 but data
577 which are clearly not ASCII. Therefore, ignore these values. */
578 break;
580 case 4:
581 return "MS_KANJI";
583 default:
584 encoding = sys_get_encoding_from_codepage (codepage);
585 if (encoding != NULL)
586 return encoding;
587 break;
591 /* If the file magic number is EBCDIC then its character data is too. */
592 if (!strcmp (r->header.magic, EBCDIC_MAGIC))
593 return "EBCDIC-US";
595 return NULL;
/* Accumulator for sfm_get_strings(): three parallel arrays that grow
   together, all allocated from 'pool'. */
struct get_strings_aux
  {
    struct pool *pool;          /* Pool that owns the arrays and strings. */
    char **titles;              /* Description of each string. */
    char **strings;             /* The strings themselves. */
    bool *ids;                  /* Was each string added as an identifier? */
    size_t allocated;           /* Allocated elements in each array. */
    size_t n;                   /* Elements in use in each array. */
  };
608 static void
609 add_string__ (struct get_strings_aux *aux,
610 const char *string, bool id, char *title)
612 if (aux->n >= aux->allocated)
614 aux->allocated = 2 * (aux->allocated + 1);
615 aux->titles = pool_realloc (aux->pool, aux->titles,
616 aux->allocated * sizeof *aux->titles);
617 aux->strings = pool_realloc (aux->pool, aux->strings,
618 aux->allocated * sizeof *aux->strings);
619 aux->ids = pool_realloc (aux->pool, aux->ids,
620 aux->allocated * sizeof *aux->ids);
623 aux->titles[aux->n] = title;
624 aux->strings[aux->n] = pool_strdup (aux->pool, string);
625 aux->ids[aux->n] = id;
626 aux->n++;
629 static void PRINTF_FORMAT (3, 4)
630 add_string (struct get_strings_aux *aux,
631 const char *string, const char *title, ...)
633 va_list args;
635 va_start (args, title);
636 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
637 va_end (args);
640 static void PRINTF_FORMAT (3, 4)
641 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
643 va_list args;
645 va_start (args, title);
646 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
647 va_end (args);
650 /* Retrieves significant string data from R in its raw format, to allow the
651 caller to try to detect the encoding in use.
653 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
654 and *STRINGSP to an array of N elements allocated from POOL. For each I in
655 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
656 whatever encoding system file R uses. *IDS[I] is true if *STRINGSP[I] must
657 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
658 text. */
659 static size_t
660 sfm_get_strings (const struct any_reader *r_, struct pool *pool,
661 char ***titlesp, bool **idsp, char ***stringsp)
663 struct sfm_reader *r = sfm_reader_cast (r_);
664 const struct sfm_mrset *mrset;
665 struct get_strings_aux aux;
666 size_t var_idx;
667 size_t i, j, k;
669 aux.pool = pool;
670 aux.titles = NULL;
671 aux.strings = NULL;
672 aux.ids = NULL;
673 aux.allocated = 0;
674 aux.n = 0;
676 var_idx = 0;
677 for (i = 0; i < r->n_vars; i++)
678 if (r->vars[i].width != -1)
679 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
681 var_idx = 0;
682 for (i = 0; i < r->n_vars; i++)
683 if (r->vars[i].width != -1)
685 var_idx++;
686 if (r->vars[i].label)
687 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
688 var_idx);
691 k = 0;
692 for (i = 0; i < r->n_labels; i++)
693 for (j = 0; j < r->labels[i].n_labels; j++)
694 add_string (&aux, r->labels[i].labels[j].label,
695 _("Value Label %zu"), k++);
697 add_string (&aux, r->header.creation_date, _("Creation Date"));
698 add_string (&aux, r->header.creation_time, _("Creation Time"));
699 add_string (&aux, r->header.eye_catcher, _("Product"));
700 add_string (&aux, r->header.file_label, _("File Label"));
702 if (r->extensions[EXT_PRODUCT_INFO])
703 add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
704 _("Extra Product Info"));
706 if (r->document)
708 size_t i;
710 for (i = 0; i < r->document->n_lines; i++)
712 char line[81];
714 memcpy (line, r->document->documents + i * 80, 80);
715 line[80] = '\0';
717 add_string (&aux, line, _("Document Line %zu"), i + 1);
721 for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
723 size_t mrset_idx = mrset - r->mrsets + 1;
725 add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
726 if (mrset->label[0])
727 add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
729 /* Skip the variables because they ought to be duplicates. */
731 if (mrset->counted)
732 add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
733 mrset_idx);
736 /* data file attributes */
737 /* variable attributes */
738 /* long var map */
739 /* long string value labels */
740 /* long string missing values */
742 *titlesp = aux.titles;
743 *idsp = aux.ids;
744 *stringsp = aux.strings;
745 return aux.n;
748 /* Decodes the dictionary read from R, saving it into into *DICT. Character
749 strings in R are decoded using ENCODING, or an encoding obtained from R if
750 ENCODING is null, or the locale encoding if R specifies no encoding.
752 If INFOP is non-null, then it receives additional info about the system
753 file, which the caller must eventually free with any_read_info_destroy()
754 when it is no longer needed.
756 This function consumes R. The caller must use it again later, even to
757 destroy it with sfm_close(). */
758 static struct casereader *
759 sfm_decode (struct any_reader *r_, const char *encoding,
760 struct dictionary **dictp, struct any_read_info *infop)
762 struct sfm_reader *r = sfm_reader_cast (r_);
763 struct dictionary *dict;
764 size_t i;
766 if (encoding == NULL)
768 encoding = sfm_get_encoding (r);
769 if (encoding == NULL)
771 sys_warn (r, -1, _("This system file does not indicate its own "
772 "character encoding. Using default encoding "
773 "%s. For best results, specify an encoding "
774 "explicitly. Use SYSFILE INFO with "
775 "ENCODING=\"DETECT\" to analyze the possible "
776 "encodings."),
777 locale_charset ());
778 encoding = locale_charset ();
782 dict = dict_create (encoding);
783 r->encoding = dict_get_encoding (dict);
785 /* These records don't use variables at all. */
786 if (r->document != NULL)
787 parse_document (dict, r->document);
789 if (r->extensions[EXT_INTEGER] != NULL
790 && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
791 goto error;
793 if (r->extensions[EXT_FLOAT] != NULL)
794 parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
796 if (r->extensions[EXT_PRODUCT_INFO] != NULL)
797 parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
799 if (r->extensions[EXT_FILE_ATTRS] != NULL)
800 parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
802 parse_header (r, &r->header, &r->info, dict);
804 /* Parse the variable records, the basis of almost everything else. */
805 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
806 goto error;
808 /* Parse value labels and the weight variable immediately after the variable
809 records. These records use indexes into var_recs[], so we must parse them
810 before those indexes become invalidated by very long string variables. */
811 for (i = 0; i < r->n_labels; i++)
812 if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
813 goto error;
814 if (r->header.weight_idx != 0)
816 struct variable *weight_var;
818 weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
819 r->header.weight_idx);
820 if (weight_var != NULL)
822 if (var_is_numeric (weight_var))
823 dict_set_weight (dict, weight_var);
824 else
825 sys_warn (r, -1, _("Ignoring string variable `%s' set "
826 "as weighting variable."),
827 var_get_name (weight_var));
831 if (r->extensions[EXT_DISPLAY] != NULL)
832 parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
834 /* The following records use short names, so they need to be parsed before
835 parse_long_var_name_map() changes short names to long names. */
836 decode_mrsets (r, dict);
838 if (r->extensions[EXT_LONG_STRINGS] != NULL
839 && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
840 goto error;
842 /* Now rename variables to their long names. */
843 parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
845 /* The following records use long names, so they need to follow renaming. */
846 if (!ll_is_empty (&r->var_attrs))
848 struct sfm_extension_record *ext;
849 ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs)
850 parse_variable_attributes (r, ext, dict);
852 /* Roles use the $@Role attribute. */
853 assign_variable_roles (r, dict);
855 if (r->extensions[EXT_LONG_LABELS] != NULL)
856 parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
857 if (r->extensions[EXT_LONG_MISSING] != NULL)
858 parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
859 dict);
861 /* Warn if the actual amount of data per case differs from the
862 amount that the header claims. SPSS version 13 gets this
863 wrong when very long strings are involved, so don't warn in
864 that case. */
865 if (r->header.nominal_case_size > 0
866 && r->header.nominal_case_size != r->n_vars
867 && r->info.version_major != 13)
868 sys_warn (r, -1, _("File header claims %d variable positions but "
869 "%zu were read from file."),
870 r->header.nominal_case_size, r->n_vars);
872 /* Create an index of dictionary variable widths for
873 sfm_read_case to use. We cannot use the `struct variable's
874 from the dictionary we created, because the caller owns the
875 dictionary and may destroy or modify its variables. */
876 sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
877 pool_register (r->pool, free, r->sfm_vars);
878 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
880 *dictp = dict;
881 if (infop)
883 *infop = r->info;
884 memset (&r->info, 0, sizeof r->info);
887 return casereader_create_sequential
888 (NULL, r->proto,
889 r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
890 &sys_file_casereader_class, r);
892 error:
893 sfm_close (r_);
894 dict_destroy (dict);
895 *dictp = NULL;
896 return NULL;
899 /* Closes R, which should have been returned by sfm_open() but not already
900 closed with sfm_decode() or this function.
901 Returns true if an I/O error has occurred on READER, false
902 otherwise. */
903 static bool
904 sfm_close (struct any_reader *r_)
906 struct sfm_reader *r = sfm_reader_cast (r_);
907 bool error;
909 if (r->file)
911 if (fn_close (r->fh, r->file) == EOF)
913 msg (ME, _("Error closing system file `%s': %s."),
914 fh_get_file_name (r->fh), strerror (errno));
915 r->error = true;
917 r->file = NULL;
920 any_read_info_destroy (&r->info);
921 fh_unlock (r->lock);
922 fh_unref (r->fh);
924 error = r->error;
925 pool_destroy (r->pool);
927 return !error;
930 /* Destroys READER. */
931 static void
932 sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
934 struct sfm_reader *r = r_;
935 sfm_close (&r->any_reader);
938 /* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
939 a negative errno value if there is an error reading FILE. */
940 static int
941 sfm_detect (FILE *file)
943 char magic[5];
945 if (fseek (file, 0, SEEK_SET) != 0)
946 return -errno;
947 if (fread (magic, 4, 1, file) != 1)
948 return ferror (file) ? -errno : 0;
949 magic[4] = '\0';
951 return (!strcmp (ASCII_MAGIC, magic)
952 || !strcmp (ASCII_ZMAGIC, magic)
953 || !strcmp (EBCDIC_MAGIC, magic));
956 /* Reads the global header of the system file. Initializes *HEADER and *INFO,
957 except for the string fields in *INFO, which parse_header() will initialize
958 later once the file's encoding is known. */
959 static bool
960 read_header (struct sfm_reader *r, struct any_read_info *info,
961 struct sfm_header_record *header)
963 uint8_t raw_layout_code[4];
964 uint8_t raw_bias[8];
965 int compressed;
966 bool zmagic;
968 if (!read_string (r, header->magic, sizeof header->magic)
969 || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
970 return false;
971 r->written_by_readstat = strstr (header->eye_catcher,
972 "https://github.com/WizardMac/ReadStat");
974 if (!strcmp (ASCII_MAGIC, header->magic)
975 || !strcmp (EBCDIC_MAGIC, header->magic))
976 zmagic = false;
977 else if (!strcmp (ASCII_ZMAGIC, header->magic))
978 zmagic = true;
979 else
981 sys_error (r, 0, _("This is not an SPSS system file."));
982 return false;
985 /* Identify integer format. */
986 if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
987 return false;
988 if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
989 &r->integer_format)
990 && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
991 &r->integer_format))
992 || (r->integer_format != INTEGER_MSB_FIRST
993 && r->integer_format != INTEGER_LSB_FIRST))
995 sys_error (r, 64, _("This is not an SPSS system file."));
996 return false;
999 if (!read_int (r, &header->nominal_case_size))
1000 return false;
1002 if (header->nominal_case_size < 0
1003 || header->nominal_case_size > INT_MAX / 16)
1004 header->nominal_case_size = -1;
1006 if (!read_int (r, &compressed))
1007 return false;
1008 if (!zmagic)
1010 if (compressed == 0)
1011 r->compression = ANY_COMP_NONE;
1012 else if (compressed == 1)
1013 r->compression = ANY_COMP_SIMPLE;
1014 else if (compressed != 0)
1016 sys_error (r, 0, "System file header has invalid compression "
1017 "value %d.", compressed);
1018 return false;
1021 else
1023 if (compressed == 2)
1024 r->compression = ANY_COMP_ZLIB;
1025 else
1027 sys_error (r, 0, "ZLIB-compressed system file header has invalid "
1028 "compression value %d.", compressed);
1029 return false;
1033 if (!read_int (r, &header->weight_idx))
1034 return false;
1036 if (!read_int (r, &r->case_cnt))
1037 return false;
1038 if ( r->case_cnt > INT_MAX / 2)
1039 r->case_cnt = -1;
1041 /* Identify floating-point format and obtain compression bias. */
1042 if (!read_bytes (r, raw_bias, sizeof raw_bias))
1043 return false;
1044 if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
1046 uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
1048 if (memcmp (raw_bias, zero_bias, 8))
1049 sys_warn (r, r->pos - 8,
1050 _("Compression bias is not the usual "
1051 "value of 100, or system file uses unrecognized "
1052 "floating-point format."));
1053 else
1055 /* Some software is known to write all-zeros to this
1056 field. Such software also writes floating-point
1057 numbers in the format that we expect by default
1058 (it seems that all software most likely does, in
1059 reality), so don't warn in this case. */
1062 if (r->integer_format == INTEGER_MSB_FIRST)
1063 r->float_format = FLOAT_IEEE_DOUBLE_BE;
1064 else
1065 r->float_format = FLOAT_IEEE_DOUBLE_LE;
1067 float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
1069 if (!read_string (r, header->creation_date, sizeof header->creation_date)
1070 || !read_string (r, header->creation_time, sizeof header->creation_time)
1071 || !read_string (r, header->file_label, sizeof header->file_label)
1072 || !skip_bytes (r, 3))
1073 return false;
1075 info->integer_format = r->integer_format;
1076 info->float_format = r->float_format;
1077 info->compression = r->compression;
1078 info->case_cnt = r->case_cnt;
1080 return true;
1083 /* Reads a variable (type 2) record from R into RECORD. */
1084 static bool
1085 read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
1087 int has_variable_label;
1089 memset (record, 0, sizeof *record);
1091 record->pos = r->pos;
1092 if (!read_int (r, &record->width)
1093 || !read_int (r, &has_variable_label)
1094 || !read_int (r, &record->missing_value_code)
1095 || !read_int (r, &record->print_format)
1096 || !read_int (r, &record->write_format)
1097 || !read_string (r, record->name, sizeof record->name))
1098 return false;
1100 if (has_variable_label == 1)
1102 enum { MAX_LABEL_LEN = 65536 };
1103 unsigned int len, read_len;
1105 if (!read_uint (r, &len))
1106 return false;
1108 /* Read up to MAX_LABEL_LEN bytes of label. */
1109 read_len = MIN (MAX_LABEL_LEN, len);
1110 record->label = pool_malloc (r->pool, read_len + 1);
1111 if (!read_string (r, record->label, read_len + 1))
1112 return false;
1114 /* Skip unread label bytes. */
1115 if (!skip_bytes (r, len - read_len))
1116 return false;
1118 /* Skip label padding up to multiple of 4 bytes. */
1119 if (!skip_bytes (r, ROUND_UP (len, 4) - len))
1120 return false;
1122 else if (has_variable_label != 0)
1124 sys_error (r, record->pos,
1125 _("Variable label indicator field is not 0 or 1."));
1126 return false;
1129 /* Set missing values. */
1130 if (record->missing_value_code != 0)
1132 int code = record->missing_value_code;
1133 if (record->width == 0)
1135 if (code < -3 || code > 3 || code == -1)
1137 sys_error (r, record->pos,
1138 _("Numeric missing value indicator field is not "
1139 "-3, -2, 0, 1, 2, or 3."));
1140 return false;
1143 else
1145 if (code < 1 || code > 3)
1147 sys_error (r, record->pos,
1148 _("String missing value indicator field is not "
1149 "0, 1, 2, or 3."));
1150 return false;
1154 if (!read_bytes (r, record->missing, 8 * abs (code)))
1155 return false;
1158 return true;
1161 /* Reads value labels from R into RECORD. */
1162 static bool
1163 read_value_label_record (struct sfm_reader *r,
1164 struct sfm_value_label_record *record)
1166 size_t i;
1167 int type;
1169 /* Read type 3 record. */
1170 record->pos = r->pos;
1171 if (!read_uint (r, &record->n_labels))
1172 return false;
1173 if (record->n_labels > UINT_MAX / sizeof *record->labels)
1175 sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
1176 record->n_labels);
1177 return false;
1179 record->labels = pool_nmalloc (r->pool, record->n_labels,
1180 sizeof *record->labels);
1181 for (i = 0; i < record->n_labels; i++)
1183 struct sfm_value_label *label = &record->labels[i];
1184 unsigned char label_len;
1185 size_t padded_len;
1187 if (!read_bytes (r, label->value, sizeof label->value))
1188 return false;
1190 /* Read label length. */
1191 if (!read_bytes (r, &label_len, sizeof label_len))
1192 return false;
1193 padded_len = ROUND_UP (label_len + 1, 8);
1195 /* Read label, padding. */
1196 label->label = pool_malloc (r->pool, padded_len + 1);
1197 if (!read_bytes (r, label->label, padded_len - 1))
1198 return false;
1199 label->label[label_len] = '\0';
1202 /* Read record type of type 4 record. */
1203 if (!read_int (r, &type))
1204 return false;
1205 if (type != 4)
1207 sys_error (r, r->pos - 4,
1208 _("Variable index record (type 4) does not immediately "
1209 "follow value label record (type 3) as it should."));
1210 return false;
1213 /* Read number of variables associated with value label from type 4
1214 record. */
1215 if (!read_uint (r, &record->n_vars))
1216 return false;
1217 if (record->n_vars < 1 || record->n_vars > r->n_vars)
1219 sys_error (r, r->pos - 4,
1220 _("Number of variables associated with a value label (%u) "
1221 "is not between 1 and the number of variables (%zu)."),
1222 record->n_vars, r->n_vars);
1223 return false;
1226 record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
1227 for (i = 0; i < record->n_vars; i++)
1228 if (!read_int (r, &record->vars[i]))
1229 return false;
1231 return true;
1234 /* Reads a document record from R. Returns true if successful, false on
1235 error. */
1236 static bool
1237 read_document_record (struct sfm_reader *r)
1239 int n_lines;
1240 if (!read_int (r, &n_lines))
1241 return false;
1242 else if (n_lines == 0)
1243 return true;
1244 else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH)
1246 sys_error (r, r->pos,
1247 _("Number of document lines (%d) "
1248 "must be greater than 0 and less than %d."),
1249 n_lines, INT_MAX / DOC_LINE_LENGTH);
1250 return false;
1253 struct sfm_document_record *record;
1254 record = pool_malloc (r->pool, sizeof *record);
1255 record->pos = r->pos;
1256 record->n_lines = n_lines;
1257 record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
1258 if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines))
1259 return false;
1261 r->document = record;
1262 return true;
1265 static bool
1266 read_extension_record_header (struct sfm_reader *r, int subtype,
1267 struct sfm_extension_record *record)
1269 record->subtype = subtype;
1270 record->pos = r->pos;
1271 if (!read_uint (r, &record->size) || !read_uint (r, &record->count))
1272 return false;
1274 /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
1275 allows an extra byte for a null terminator, used by some
1276 extension processing routines. */
1277 if (record->size != 0
1278 && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX)
1280 sys_error (r, record->pos, "Record type 7 subtype %d too large.",
1281 subtype);
1282 return false;
1285 return true;
1288 /* Reads an extension record from R into RECORD. */
1289 static bool
1290 read_extension_record (struct sfm_reader *r, int subtype,
1291 struct sfm_extension_record **recordp)
1293 struct extension_record_type
1295 int subtype;
1296 int size;
1297 int count;
1300 static const struct extension_record_type types[] =
1302 /* Implemented record types. */
1303 { EXT_INTEGER, 4, 8 },
1304 { EXT_FLOAT, 8, 3 },
1305 { EXT_MRSETS, 1, 0 },
1306 { EXT_PRODUCT_INFO, 1, 0 },
1307 { EXT_DISPLAY, 4, 0 },
1308 { EXT_LONG_NAMES, 1, 0 },
1309 { EXT_LONG_STRINGS, 1, 0 },
1310 { EXT_NCASES, 8, 2 },
1311 { EXT_FILE_ATTRS, 1, 0 },
1312 { EXT_VAR_ATTRS, 1, 0 },
1313 { EXT_MRSETS2, 1, 0 },
1314 { EXT_ENCODING, 1, 0 },
1315 { EXT_LONG_LABELS, 1, 0 },
1316 { EXT_LONG_MISSING, 1, 0 },
1318 /* Ignored record types. */
1319 { EXT_VAR_SETS, 0, 0 },
1320 { EXT_DATE, 0, 0 },
1321 { EXT_DATA_ENTRY, 0, 0 },
1322 { EXT_DATAVIEW, 0, 0 },
1325 const struct extension_record_type *type;
1326 struct sfm_extension_record *record;
1327 size_t n_bytes;
1329 *recordp = NULL;
1330 record = pool_malloc (r->pool, sizeof *record);
1331 if (!read_extension_record_header (r, subtype, record))
1332 return false;
1333 n_bytes = record->count * record->size;
1335 for (type = types; type < &types[sizeof types / sizeof *types]; type++)
1336 if (subtype == type->subtype)
1338 if (type->size > 0 && record->size != type->size)
1339 sys_warn (r, record->pos,
1340 _("Record type 7, subtype %d has bad size %u "
1341 "(expected %d)."), subtype, record->size, type->size);
1342 else if (type->count > 0 && record->count != type->count)
1343 sys_warn (r, record->pos,
1344 _("Record type 7, subtype %d has bad count %u "
1345 "(expected %d)."), subtype, record->count, type->count);
1346 else if (type->count == 0 && type->size == 0)
1348 /* Ignore this record. */
1350 else
1352 char *data = pool_malloc (r->pool, n_bytes + 1);
1353 data[n_bytes] = '\0';
1355 record->data = data;
1356 if (!read_bytes (r, record->data, n_bytes))
1357 return false;
1358 *recordp = record;
1359 return true;
1362 goto skip;
1365 sys_warn (r, record->pos,
1366 _("Unrecognized record type 7, subtype %d. For help, please "
1367 "send this file to %s and mention that you were using %s."),
1368 subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
1370 skip:
1371 return skip_bytes (r, n_bytes);
1374 static bool
1375 skip_extension_record (struct sfm_reader *r, int subtype)
1377 struct sfm_extension_record record;
1379 return (read_extension_record_header (r, subtype, &record)
1380 && skip_bytes (r, record.count * record.size));
1383 static void
1384 parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
1385 struct any_read_info *info, struct dictionary *dict)
1387 const char *dict_encoding = dict_get_encoding (dict);
1388 struct substring product;
1389 struct substring label;
1390 char *fixed_label;
1392 /* Convert file label to UTF-8 and put it into DICT. */
1393 label = recode_substring_pool ("UTF-8", dict_encoding,
1394 ss_cstr (header->file_label), r->pool);
1395 ss_trim (&label, ss_cstr (" "));
1396 label.string[label.length] = '\0';
1397 fixed_label = fix_line_ends (label.string);
1398 dict_set_label (dict, fixed_label);
1399 free (fixed_label);
1401 /* Put creation date and time in UTF-8 into INFO. */
1402 info->creation_date = recode_string ("UTF-8", dict_encoding,
1403 header->creation_date, -1);
1404 info->creation_time = recode_string ("UTF-8", dict_encoding,
1405 header->creation_time, -1);
1407 /* Put product name into INFO, dropping eye-catcher string if present. */
1408 product = recode_substring_pool ("UTF-8", dict_encoding,
1409 ss_cstr (header->eye_catcher), r->pool);
1410 ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
1411 ss_trim (&product, ss_cstr (" "));
1412 info->product = ss_xstrdup (product);
1415 /* Reads a variable (type 2) record from R and adds the
1416 corresponding variable to DICT.
1417 Also skips past additional variable records for long string
1418 variables. */
1419 static bool
1420 parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
1421 struct sfm_var_record *var_recs, size_t n_var_recs)
1423 const char *dict_encoding = dict_get_encoding (dict);
1424 struct sfm_var_record *rec;
1425 int n_warnings = 0;
1427 for (rec = var_recs; rec < &var_recs[n_var_recs]; )
1429 struct variable *var;
1430 size_t n_values;
1431 char *name;
1432 size_t i;
1434 name = recode_string_pool ("UTF-8", dict_encoding,
1435 rec->name, -1, r->pool);
1436 name[strcspn (name, " ")] = '\0';
1438 if (!dict_id_is_valid (dict, name, false)
1439 || name[0] == '$' || name[0] == '#')
1441 sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
1442 return false;
1445 if (rec->width < 0 || rec->width > 255)
1447 sys_error (r, rec->pos,
1448 _("Bad width %d for variable %s."), rec->width, name);
1449 return false;
1452 var = rec->var = dict_create_var (dict, name, rec->width);
1453 if (var == NULL)
1455 char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
1456 sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
1457 "`%s' to `%s'."),
1458 name, new_name);
1459 var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
1460 free (new_name);
1463 /* Set the short name the same as the long name. */
1464 var_set_short_name (var, 0, name);
1466 /* Get variable label, if any. */
1467 if (rec->label)
1469 char *utf8_label;
1471 utf8_label = recode_string_pool ("UTF-8", dict_encoding,
1472 rec->label, -1, r->pool);
1473 var_set_label (var, utf8_label);
1476 /* Set missing values. */
1477 if (rec->missing_value_code != 0)
1479 int width = var_get_width (var);
1480 struct missing_values mv;
1482 mv_init_pool (r->pool, &mv, width);
1483 if (var_is_numeric (var))
1485 bool has_range = rec->missing_value_code < 0;
1486 int n_discrete = (has_range
1487 ? rec->missing_value_code == -3
1488 : rec->missing_value_code);
1489 int ofs = 0;
1491 if (has_range)
1493 double low = parse_float (r, rec->missing, 0);
1494 double high = parse_float (r, rec->missing, 8);
1496 /* Deal with SPSS 21 change in representation. */
1497 if (low == SYSMIS)
1498 low = LOWEST;
1500 mv_add_range (&mv, low, high);
1501 ofs += 16;
1504 for (i = 0; i < n_discrete; i++)
1506 mv_add_num (&mv, parse_float (r, rec->missing, ofs));
1507 ofs += 8;
1510 else
1511 for (i = 0; i < rec->missing_value_code; i++)
1512 mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
1513 var_set_missing_values (var, &mv);
1516 /* Set formats. */
1517 parse_format_spec (r, rec->pos + 12, rec->print_format,
1518 PRINT_FORMAT, var, &n_warnings);
1519 parse_format_spec (r, rec->pos + 16, rec->write_format,
1520 WRITE_FORMAT, var, &n_warnings);
1522 /* Account for values.
1523 Skip long string continuation records, if any. */
1524 n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
1525 for (i = 1; i < n_values; i++)
1526 if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
1528 sys_error (r, rec->pos, _("Missing string continuation record."));
1529 return false;
1531 rec += n_values;
1534 return true;
1537 /* Translates the format spec from sysfile format to internal
1538 format. */
1539 static void
1540 parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
1541 enum which_format which, struct variable *v,
1542 int *n_warnings)
1544 const int max_warnings = 8;
1545 uint8_t raw_type = format >> 16;
1546 uint8_t w = format >> 8;
1547 uint8_t d = format;
1548 struct fmt_spec f;
1549 bool ok;
1551 f.w = w;
1552 f.d = d;
1554 msg_disable ();
1555 ok = (fmt_from_io (raw_type, &f.type)
1556 && fmt_check_output (&f)
1557 && fmt_check_width_compat (&f, var_get_width (v)));
1558 msg_enable ();
1560 if (ok)
1562 if (which == PRINT_FORMAT)
1563 var_set_print_format (v, &f);
1564 else
1565 var_set_write_format (v, &f);
1567 else if (format == 0)
1569 /* Actually observed in the wild. No point in warning about it. */
1571 else if (++*n_warnings <= max_warnings)
1573 if (which == PRINT_FORMAT)
1574 sys_warn (r, pos, _("Variable %s with width %d has invalid print "
1575 "format 0x%x."),
1576 var_get_name (v), var_get_width (v), format);
1577 else
1578 sys_warn (r, pos, _("Variable %s with width %d has invalid write "
1579 "format 0x%x."),
1580 var_get_name (v), var_get_width (v), format);
1582 if (*n_warnings == max_warnings)
1583 sys_warn (r, -1, _("Suppressing further invalid format warnings."));
1587 static void
1588 parse_document (struct dictionary *dict, struct sfm_document_record *record)
1590 const char *p;
1592 for (p = record->documents;
1593 p < record->documents + DOC_LINE_LENGTH * record->n_lines;
1594 p += DOC_LINE_LENGTH)
1596 struct substring line;
1598 line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
1599 ss_buffer (p, DOC_LINE_LENGTH), NULL);
1600 ss_rtrim (&line, ss_cstr (" "));
1601 line.string[line.length] = '\0';
1603 dict_add_document_line (dict, line.string, false);
1605 ss_dealloc (&line);
1609 /* Parses record type 7, subtype 3. */
1610 static bool
1611 parse_machine_integer_info (struct sfm_reader *r,
1612 const struct sfm_extension_record *record,
1613 struct any_read_info *info)
1615 int float_representation, expected_float_format;
1616 int integer_representation, expected_integer_format;
1618 /* Save version info. */
1619 info->version_major = parse_int (r, record->data, 0);
1620 info->version_minor = parse_int (r, record->data, 4);
1621 info->version_revision = parse_int (r, record->data, 8);
1623 /* Check floating point format. */
1624 float_representation = parse_int (r, record->data, 16);
1625 if (r->float_format == FLOAT_IEEE_DOUBLE_BE
1626 || r->float_format == FLOAT_IEEE_DOUBLE_LE)
1627 expected_float_format = 1;
1628 else if (r->float_format == FLOAT_Z_LONG)
1629 expected_float_format = 2;
1630 else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D)
1631 expected_float_format = 3;
1632 else
1633 NOT_REACHED ();
1634 if (float_representation != expected_float_format)
1636 sys_error (r, record->pos,
1637 _("Floating-point representation indicated by "
1638 "system file (%d) differs from expected (%d)."),
1639 float_representation, expected_float_format);
1640 return false;
1643 /* Check integer format. */
1644 integer_representation = parse_int (r, record->data, 24);
1645 if (r->integer_format == INTEGER_MSB_FIRST)
1646 expected_integer_format = 1;
1647 else if (r->integer_format == INTEGER_LSB_FIRST)
1648 expected_integer_format = 2;
1649 else
1650 NOT_REACHED ();
1651 if (integer_representation != expected_integer_format)
1652 sys_warn (r, record->pos,
1653 _("Integer format indicated by system file (%d) "
1654 "differs from expected (%d)."),
1655 integer_representation, expected_integer_format);
1657 return true;
1660 /* Parses record type 7, subtype 4. */
1661 static void
1662 parse_machine_float_info (struct sfm_reader *r,
1663 const struct sfm_extension_record *record)
1665 double sysmis = parse_float (r, record->data, 0);
1666 double highest = parse_float (r, record->data, 8);
1667 double lowest = parse_float (r, record->data, 16);
1669 if (sysmis != SYSMIS)
1670 sys_warn (r, record->pos,
1671 _("File specifies unexpected value %g (%a) as %s, "
1672 "instead of %g (%a)."),
1673 sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
1675 if (highest != HIGHEST)
1676 sys_warn (r, record->pos,
1677 _("File specifies unexpected value %g (%a) as %s, "
1678 "instead of %g (%a)."),
1679 highest, highest, "HIGHEST", HIGHEST, HIGHEST);
1681 /* SPSS before version 21 used a unique value just bigger than SYSMIS as
1682 LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
1683 appears in a context (missing values) where SYSMIS cannot. */
1684 if (lowest != LOWEST && lowest != SYSMIS)
1685 sys_warn (r, record->pos,
1686 _("File specifies unexpected value %g (%a) as %s, "
1687 "instead of %g (%a) or %g (%a)."),
1688 lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
1691 /* Parses record type 7, subtype 10. */
1692 static void
1693 parse_extra_product_info (struct sfm_reader *r,
1694 const struct sfm_extension_record *record,
1695 struct any_read_info *info)
1697 struct text_record *text;
1699 text = open_text_record (r, record, true);
1700 info->product_ext = fix_line_ends (text_get_all (text));
1701 close_text_record (r, text);
1704 /* Parses record type 7, subtype 7 or 19. */
1705 static void
1706 parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
1707 size_t *allocated_mrsets)
1709 struct text_record *text;
1711 text = open_text_record (r, record, false);
1712 for (;;)
1714 struct sfm_mrset *mrset;
1715 size_t allocated_vars;
1716 char delimiter;
1718 /* Skip extra line feeds if present. */
1719 while (text_match (text, '\n'))
1720 continue;
1722 if (r->n_mrsets >= *allocated_mrsets)
1723 r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
1724 sizeof *r->mrsets);
1725 mrset = &r->mrsets[r->n_mrsets];
1726 memset(mrset, 0, sizeof *mrset);
1728 mrset->name = text_get_token (text, ss_cstr ("="), NULL);
1729 if (mrset->name == NULL)
1730 break;
1732 if (text_match (text, 'C'))
1734 mrset->type = MRSET_MC;
1735 if (!text_match (text, ' '))
1737 sys_warn (r, record->pos,
1738 _("Missing space following `%c' at offset %zu "
1739 "in MRSETS record."), 'C', text_pos (text));
1740 break;
1743 else if (text_match (text, 'D'))
1745 mrset->type = MRSET_MD;
1746 mrset->cat_source = MRSET_VARLABELS;
1748 else if (text_match (text, 'E'))
1750 char *number;
1752 mrset->type = MRSET_MD;
1753 mrset->cat_source = MRSET_COUNTEDVALUES;
1754 if (!text_match (text, ' '))
1756 sys_warn (r, record->pos,
1757 _("Missing space following `%c' at offset %zu "
1758 "in MRSETS record."), 'E', text_pos (text));
1759 break;
1762 number = text_get_token (text, ss_cstr (" "), NULL);
1763 if (!strcmp (number, "11"))
1764 mrset->label_from_var_label = true;
1765 else if (strcmp (number, "1"))
1766 sys_warn (r, record->pos,
1767 _("Unexpected label source value following `E' "
1768 "at offset %zu in MRSETS record."),
1769 text_pos (text));
1771 else
1773 sys_warn (r, record->pos,
1774 _("Missing `C', `D', or `E' at offset %zu "
1775 "in MRSETS record."),
1776 text_pos (text));
1777 break;
1780 if (mrset->type == MRSET_MD)
1782 mrset->counted = text_parse_counted_string (r, text);
1783 if (mrset->counted == NULL)
1784 break;
1787 mrset->label = text_parse_counted_string (r, text);
1788 if (mrset->label == NULL)
1789 break;
1791 allocated_vars = 0;
1794 const char *var;
1796 var = text_get_token (text, ss_cstr (" \n"), &delimiter);
1797 if (var == NULL)
1799 if (delimiter != '\n')
1800 sys_warn (r, record->pos,
1801 _("Missing new-line parsing variable names "
1802 "at offset %zu in MRSETS record."),
1803 text_pos (text));
1804 break;
1807 if (mrset->n_vars >= allocated_vars)
1808 mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
1809 &allocated_vars,
1810 sizeof *mrset->vars);
1811 mrset->vars[mrset->n_vars++] = var;
1813 while (delimiter != '\n');
1815 r->n_mrsets++;
1817 close_text_record (r, text);
1820 static void
1821 decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
1823 const struct sfm_mrset *s;
1825 for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
1827 struct stringi_set var_names;
1828 struct mrset *mrset;
1829 char *name;
1830 int width;
1831 size_t i;
1833 name = recode_string ("UTF-8", r->encoding, s->name, -1);
1834 if (!mrset_is_valid_name (name, dict_get_encoding (dict), false))
1836 sys_warn (r, -1, _("Invalid multiple response set name `%s'."),
1837 name);
1838 free (name);
1839 continue;
1842 mrset = xzalloc (sizeof *mrset);
1843 mrset->name = name;
1844 mrset->type = s->type;
1845 mrset->cat_source = s->cat_source;
1846 mrset->label_from_var_label = s->label_from_var_label;
1847 if (s->label[0] != '\0')
1848 mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
1850 stringi_set_init (&var_names);
1851 mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
1852 width = INT_MAX;
1853 for (i = 0; i < s->n_vars; i++)
1855 struct variable *var;
1856 char *var_name;
1858 var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
1860 var = dict_lookup_var (dict, var_name);
1861 if (var == NULL)
1863 free (var_name);
1864 continue;
1866 if (!stringi_set_insert (&var_names, var_name))
1868 sys_warn (r, -1,
1869 _("MRSET %s contains duplicate variable name %s."),
1870 mrset->name, var_name);
1871 free (var_name);
1872 continue;
1874 free (var_name);
1876 if (mrset->label == NULL && mrset->label_from_var_label
1877 && var_has_label (var))
1878 mrset->label = xstrdup (var_get_label (var));
1880 if (mrset->n_vars
1881 && var_get_type (var) != var_get_type (mrset->vars[0]))
1883 sys_warn (r, -1,
1884 _("MRSET %s contains both string and "
1885 "numeric variables."), mrset->name);
1886 continue;
1888 width = MIN (width, var_get_width (var));
1890 mrset->vars[mrset->n_vars++] = var;
1893 if (mrset->n_vars < 2)
1895 if (mrset->n_vars == 0)
1896 sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
1897 else
1898 sys_warn (r, -1, _("MRSET %s has only one variable."),
1899 mrset->name);
1900 mrset_destroy (mrset);
1901 stringi_set_destroy (&var_names);
1902 continue;
1905 if (mrset->type == MRSET_MD)
1907 mrset->width = width;
1908 value_init (&mrset->counted, width);
1909 if (width == 0)
1910 mrset->counted.f = c_strtod (s->counted, NULL);
1911 else
1912 value_copy_str_rpad (&mrset->counted, width,
1913 (const uint8_t *) s->counted, ' ');
1916 dict_add_mrset (dict, mrset);
1917 stringi_set_destroy (&var_names);
1921 /* Read record type 7, subtype 11, which specifies how variables
1922 should be displayed in GUI environments. */
1923 static void
1924 parse_display_parameters (struct sfm_reader *r,
1925 const struct sfm_extension_record *record,
1926 struct dictionary *dict)
1928 bool includes_width;
1929 bool warned = false;
1930 size_t n_vars;
1931 size_t ofs;
1932 size_t i;
1934 n_vars = dict_get_var_cnt (dict);
1935 if (record->count == 3 * n_vars)
1936 includes_width = true;
1937 else if (record->count == 2 * n_vars)
1938 includes_width = false;
1939 else
1941 sys_warn (r, record->pos,
1942 _("Extension 11 has bad count %u (for %zu variables)."),
1943 record->count, n_vars);
1944 return;
1947 ofs = 0;
1948 for (i = 0; i < n_vars; ++i)
1950 struct variable *v = dict_get_var (dict, i);
1951 int measure, width, align;
1953 measure = parse_int (r, record->data, ofs);
1954 ofs += 4;
1956 if (includes_width)
1958 width = parse_int (r, record->data, ofs);
1959 ofs += 4;
1961 else
1962 width = 0;
1964 align = parse_int (r, record->data, ofs);
1965 ofs += 4;
1967 /* SPSS sometimes seems to set variables' measure to zero. */
1968 if (0 == measure)
1969 measure = 1;
1971 if (measure < 1 || measure > 3 || align < 0 || align > 2)
1973 if (!warned)
1974 sys_warn (r, record->pos,
1975 _("Invalid variable display parameters for variable "
1976 "%zu (%s). Default parameters substituted."),
1977 i, var_get_name (v));
1978 warned = true;
1979 continue;
1982 var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
1983 : measure == 2 ? MEASURE_ORDINAL
1984 : MEASURE_SCALE));
1985 var_set_alignment (v, (align == 0 ? ALIGN_LEFT
1986 : align == 1 ? ALIGN_RIGHT
1987 : ALIGN_CENTRE));
1989 /* Older versions (SPSS 9.0) sometimes set the display
1990 width to zero. This causes confusion in the GUI, so
1991 only set the width if it is nonzero. */
1992 if (width > 0)
1993 var_set_display_width (v, width);
/* Renames VAR in DICT to NEW_NAME while keeping all of VAR's short names
   as they were before the rename. */
static void
rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
                                 const char *new_name)
{
  /* Renaming a variable may clear its short names, but we
     want to retain them, so we save them and re-set them
     afterward. */
  const size_t n_saved = var_get_short_name_cnt (var);
  char **saved = xnmalloc (n_saved, sizeof *saved);
  size_t k;

  for (k = 0; k < n_saved; k++)
    {
      const char *short_name = var_get_short_name (var, k);

      if (short_name != NULL)
        saved[k] = xstrdup (short_name);
      else
        saved[k] = NULL;
    }

  /* Set long name. */
  dict_rename_var (dict, var, new_name);

  /* Restore short names and release the copies. */
  for (k = 0; k < n_saved; k++)
    {
      var_set_short_name (var, k, saved[k]);
      free (saved[k]);
    }
  free (saved);
}
2028 /* Parses record type 7, subtype 13, which gives the long name that corresponds
2029 to each short name. Modifies variable names in DICT accordingly. */
2030 static void
2031 parse_long_var_name_map (struct sfm_reader *r,
2032 const struct sfm_extension_record *record,
2033 struct dictionary *dict)
2035 struct text_record *text;
2036 struct variable *var;
2037 char *long_name;
2039 if (record == NULL)
2041 /* There are no long variable names. Use the short variable names,
2042 converted to lowercase, as the long variable names. */
2043 size_t i;
2045 for (i = 0; i < dict_get_var_cnt (dict); i++)
2047 struct variable *var = dict_get_var (dict, i);
2048 char *new_name;
2050 new_name = utf8_to_lower (var_get_name (var));
2051 rename_var_and_save_short_names (dict, var, new_name);
2052 free (new_name);
2055 return;
2058 /* Rename each of the variables, one by one. (In a correctly constructed
2059 system file, this cannot create any intermediate duplicate variable names,
2060 because all of the new variable names are longer than any of the old
2061 variable names and thus there cannot be any overlaps.) */
2062 text = open_text_record (r, record, true);
2063 while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
2065 /* Validate long name. */
2066 if (!dict_id_is_valid (dict, long_name, false)
2067 || long_name[0] == '$' || long_name[0] == '#')
2069 sys_warn (r, record->pos,
2070 _("Long variable mapping from %s to invalid "
2071 "variable name `%s'."),
2072 var_get_name (var), long_name);
2073 continue;
2076 /* Identify any duplicates. */
2077 if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
2078 && dict_lookup_var (dict, long_name) != NULL)
2080 sys_warn (r, record->pos,
2081 _("Duplicate long variable name `%s'."), long_name);
2082 continue;
2085 rename_var_and_save_short_names (dict, var, long_name);
2087 close_text_record (r, text);
2090 /* Reads record type 7, subtype 14, which gives the real length
2091 of each very long string. Rearranges DICT accordingly. */
2092 static bool
2093 parse_long_string_map (struct sfm_reader *r,
2094 const struct sfm_extension_record *record,
2095 struct dictionary *dict)
2097 struct text_record *text;
2098 struct variable *var;
2099 char *length_s;
2101 text = open_text_record (r, record, true);
2102 while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
2104 size_t idx = var_get_dict_index (var);
2105 long int length;
2106 int segment_cnt;
2107 int i;
2109 /* Get length. */
2110 length = strtol (length_s, NULL, 10);
2111 if (length < 1 || length > MAX_STRING)
2113 sys_warn (r, record->pos,
2114 _("%s listed as string of invalid length %s "
2115 "in very long string record."),
2116 var_get_name (var), length_s);
2117 continue;
2120 /* Check segments. */
2121 segment_cnt = sfm_width_to_segments (length);
2122 if (segment_cnt == 1)
2124 sys_warn (r, record->pos,
2125 _("%s listed in very long string record with width %s, "
2126 "which requires only one segment."),
2127 var_get_name (var), length_s);
2128 continue;
2130 if (idx + segment_cnt > dict_get_var_cnt (dict))
2132 sys_error (r, record->pos,
2133 _("Very long string %s overflows dictionary."),
2134 var_get_name (var));
2135 return false;
2138 /* Get the short names from the segments and check their
2139 lengths. */
2140 for (i = 0; i < segment_cnt; i++)
2142 struct variable *seg = dict_get_var (dict, idx + i);
2143 int alloc_width = sfm_segment_alloc_width (length, i);
2144 int width = var_get_width (seg);
2146 if (i > 0)
2147 var_set_short_name (var, i, var_get_short_name (seg, 0));
2148 if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
2150 sys_error (r, record->pos,
2151 _("Very long string with width %ld has segment %d "
2152 "of width %d (expected %d)."),
2153 length, i, width, alloc_width);
2154 return false;
2157 dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
2158 var_set_width (var, length);
2160 close_text_record (r, text);
2161 dict_compact_values (dict);
2163 return true;
2166 static bool
2167 parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
2168 const struct sfm_var_record *var_recs, size_t n_var_recs,
2169 const struct sfm_value_label_record *record)
2171 struct variable **vars;
2172 char **utf8_labels;
2173 size_t i;
2175 utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
2176 for (i = 0; i < record->n_labels; i++)
2177 utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
2178 record->labels[i].label, -1,
2179 r->pool);
2181 vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
2182 for (i = 0; i < record->n_vars; i++)
2184 vars[i] = lookup_var_by_index (r, record->pos,
2185 var_recs, n_var_recs, record->vars[i]);
2186 if (vars[i] == NULL)
2187 return false;
2190 for (i = 1; i < record->n_vars; i++)
2191 if (var_get_type (vars[i]) != var_get_type (vars[0]))
2193 sys_error (r, record->pos,
2194 _("Variables associated with value label are not all of "
2195 "identical type. Variable %s is %s, but variable "
2196 "%s is %s."),
2197 var_get_name (vars[0]),
2198 var_is_numeric (vars[0]) ? _("numeric") : _("string"),
2199 var_get_name (vars[i]),
2200 var_is_numeric (vars[i]) ? _("numeric") : _("string"));
2201 return false;
2204 for (i = 0; i < record->n_vars; i++)
2206 struct variable *var = vars[i];
2207 int width;
2208 size_t j;
2210 width = var_get_width (var);
2211 if (width > 8)
2213 sys_error (r, record->pos,
2214 _("Value labels may not be added to long string "
2215 "variables (e.g. %s) using records types 3 and 4."),
2216 var_get_name (var));
2217 return false;
2220 for (j = 0; j < record->n_labels; j++)
2222 struct sfm_value_label *label = &record->labels[j];
2223 union value value;
2225 value_init (&value, width);
2226 if (width == 0)
2227 value.f = parse_float (r, label->value, 0);
2228 else
2229 memcpy (value_str_rw (&value, width), label->value, width);
2231 if (!var_add_value_label (var, &value, utf8_labels[j]))
2233 if (r->written_by_readstat)
2235 /* Ignore the problem. ReadStat is buggy and emits value
2236 labels whose values are longer than string variables'
2237 widths, that are identical in the actual width of the
2238 variable, e.g. both values "ABC123" and "ABC456" for a
2239 string variable with width 3. */
2241 else if (var_is_numeric (var))
2242 sys_warn (r, record->pos,
2243 _("Duplicate value label for %g on %s."),
2244 value.f, var_get_name (var));
2245 else
2246 sys_warn (r, record->pos,
2247 _("Duplicate value label for `%.*s' on %s."),
2248 width, value_str (&value, width),
2249 var_get_name (var));
2252 value_destroy (&value, width);
2256 pool_free (r->pool, vars);
2257 for (i = 0; i < record->n_labels; i++)
2258 pool_free (r->pool, utf8_labels[i]);
2259 pool_free (r->pool, utf8_labels);
2261 return true;
2264 static struct variable *
2265 lookup_var_by_index (struct sfm_reader *r, off_t offset,
2266 const struct sfm_var_record *var_recs, size_t n_var_recs,
2267 int idx)
2269 const struct sfm_var_record *rec;
2271 if (idx < 1 || idx > n_var_recs)
2273 sys_error (r, offset,
2274 _("Variable index %d not in valid range 1...%zu."),
2275 idx, n_var_recs);
2276 return NULL;
2279 rec = &var_recs[idx - 1];
2280 if (rec->var == NULL)
2282 sys_error (r, offset,
2283 _("Variable index %d refers to long string continuation."),
2284 idx);
2285 return NULL;
2288 return rec->var;
/* Parses a set of custom attributes from TEXT into ATTRS.
   ATTRS may be a null pointer, in which case the attributes are
   read but discarded.

   Each attribute is a key terminated by `(' followed by one value per line;
   a `)' after a value ends the attribute, and a `/' after the attribute
   ends the whole set. */
static void
parse_attributes (struct sfm_reader *r, struct text_record *text,
                  struct attrset *attrs)
{
  for (;;)
    {
      struct attribute *attr;
      char *key;
      int index = 1;

      /* Parse the key. */
      key = text_get_token (text, ss_cstr ("("), NULL);
      if (key == NULL)
        return;

      attr = attribute_create (key);
      for (;;)
        {
          /* Parse the value. */
          char *value = text_get_token (text, ss_cstr ("\n"), NULL);
          size_t length;

          if (value == NULL)
            {
              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
                         key, index);
              break;
            }

          /* Values are expected to be enclosed in single quotes; strip
             them when present, otherwise warn and keep the raw text. */
          length = strlen (value);
          if (length < 2 || value[0] != '\'' || value[length - 1] != '\'')
            {
              text_warn (r, text,
                         _("Attribute value %s[%d] is not quoted: %s."),
                         key, index, value);
              attribute_add_value (attr, value);
            }
          else
            {
              value[length - 1] = '\0';
              attribute_add_value (attr, value + 1);
            }

          /* Was this the last value for this attribute? */
          if (text_match (text, ')'))
            break;

          index++;
        }

      if (attrs == NULL)
        attribute_destroy (attr);
      else
        attrset_add (attrs, attr);

      /* A slash terminates the attribute set. */
      if (text_match (text, '/'))
        return;
    }
}
/* Parses record type 7, subtype 17, which lists custom attributes on the
   data file, and stores them in DICT's attribute set. */
static void
parse_data_file_attributes (struct sfm_reader *r,
                            const struct sfm_extension_record *record,
                            struct dictionary *dict)
{
  struct attrset *file_attrs;
  struct text_record *text;

  text = open_text_record (r, record, true);
  file_attrs = dict_get_attributes (dict);
  parse_attributes (r, text, file_attrs);
  close_text_record (r, text);
}
/* Parses record type 7, subtype 18, which lists custom attributes on
   individual variables.  Each entry is a variable name terminated by `:'
   followed by that variable's attribute set. */
static void
parse_variable_attributes (struct sfm_reader *r,
                           const struct sfm_extension_record *record,
                           struct dictionary *dict)
{
  struct text_record *text = open_text_record (r, record, true);
  struct variable *var;

  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
    {
      /* With no variable, the attributes are parsed but discarded. */
      struct attrset *attrs = NULL;
      if (var != NULL)
        attrs = var_get_attributes (var);
      parse_attributes (r, text, attrs);
    }

  close_text_record (r, text);
}
2378 static void
2379 assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
2381 size_t n_warnings = 0;
2382 size_t i;
2384 for (i = 0; i < dict_get_var_cnt (dict); i++)
2386 struct variable *var = dict_get_var (dict, i);
2387 struct attrset *attrs = var_get_attributes (var);
2388 const struct attribute *attr = attrset_lookup (attrs, "$@Role");
2389 if (attr != NULL)
2391 int value = atoi (attribute_get_value (attr, 0));
2392 enum var_role role;
2394 switch (value)
2396 case 0:
2397 role = ROLE_INPUT;
2398 break;
2400 case 1:
2401 role = ROLE_TARGET;
2402 break;
2404 case 2:
2405 role = ROLE_BOTH;
2406 break;
2408 case 3:
2409 role = ROLE_NONE;
2410 break;
2412 case 4:
2413 role = ROLE_PARTITION;
2414 break;
2416 case 5:
2417 role = ROLE_SPLIT;
2418 break;
2420 default:
2421 role = ROLE_INPUT;
2422 if (n_warnings++ == 0)
2423 sys_warn (r, -1, _("Invalid role for variable %s."),
2424 var_get_name (var));
2427 var_set_role (var, role);
2431 if (n_warnings > 1)
2432 sys_warn (r, -1, _("%zu other variables had invalid roles."),
2433 n_warnings - 1);
2436 static bool
2437 check_overflow (struct sfm_reader *r,
2438 const struct sfm_extension_record *record,
2439 size_t ofs, size_t length)
2441 size_t end = record->size * record->count;
2442 if (length >= end || ofs + length > end)
2444 sys_warn (r, record->pos + end,
2445 _("Extension record subtype %d ends unexpectedly."),
2446 record->subtype);
2447 return false;
2449 return true;
2452 static void
2453 parse_long_string_value_labels (struct sfm_reader *r,
2454 const struct sfm_extension_record *record,
2455 struct dictionary *dict)
2457 const char *dict_encoding = dict_get_encoding (dict);
2458 size_t end = record->size * record->count;
2459 size_t ofs = 0;
2461 while (ofs < end)
2463 char *var_name;
2464 size_t n_labels, i;
2465 struct variable *var;
2466 union value value;
2467 int var_name_len;
2468 int width;
2470 /* Parse variable name length. */
2471 if (!check_overflow (r, record, ofs, 4))
2472 return;
2473 var_name_len = parse_int (r, record->data, ofs);
2474 ofs += 4;
2476 /* Parse variable name, width, and number of labels. */
2477 if (!check_overflow (r, record, ofs, var_name_len)
2478 || !check_overflow (r, record, ofs, var_name_len + 8))
2479 return;
2480 var_name = recode_string_pool ("UTF-8", dict_encoding,
2481 (const char *) record->data + ofs,
2482 var_name_len, r->pool);
2483 width = parse_int (r, record->data, ofs + var_name_len);
2484 n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
2485 ofs += var_name_len + 8;
2487 /* Look up 'var' and validate. */
2488 var = dict_lookup_var (dict, var_name);
2489 if (var == NULL)
2490 sys_warn (r, record->pos + ofs,
2491 _("Ignoring long string value label record for "
2492 "unknown variable %s."), var_name);
2493 else if (var_is_numeric (var))
2495 sys_warn (r, record->pos + ofs,
2496 _("Ignoring long string value label record for "
2497 "numeric variable %s."), var_name);
2498 var = NULL;
2500 else if (width != var_get_width (var))
2502 sys_warn (r, record->pos + ofs,
2503 _("Ignoring long string value label record for variable "
2504 "%s because the record's width (%d) does not match the "
2505 "variable's width (%d)."),
2506 var_name, width, var_get_width (var));
2507 var = NULL;
2510 /* Parse values. */
2511 value_init_pool (r->pool, &value, width);
2512 for (i = 0; i < n_labels; i++)
2514 size_t value_length, label_length;
2515 bool skip = var == NULL;
2517 /* Parse value length. */
2518 if (!check_overflow (r, record, ofs, 4))
2519 return;
2520 value_length = parse_int (r, record->data, ofs);
2521 ofs += 4;
2523 /* Parse value. */
2524 if (!check_overflow (r, record, ofs, value_length))
2525 return;
2526 if (!skip)
2528 if (value_length == width)
2529 memcpy (value_str_rw (&value, width),
2530 (const uint8_t *) record->data + ofs, width);
2531 else
2533 sys_warn (r, record->pos + ofs,
2534 _("Ignoring long string value label %zu for "
2535 "variable %s, with width %d, that has bad value "
2536 "width %zu."),
2537 i, var_get_name (var), width, value_length);
2538 skip = true;
2541 ofs += value_length;
2543 /* Parse label length. */
2544 if (!check_overflow (r, record, ofs, 4))
2545 return;
2546 label_length = parse_int (r, record->data, ofs);
2547 ofs += 4;
2549 /* Parse label. */
2550 if (!check_overflow (r, record, ofs, label_length))
2551 return;
2552 if (!skip)
2554 char *label;
2556 label = recode_string_pool ("UTF-8", dict_encoding,
2557 (const char *) record->data + ofs,
2558 label_length, r->pool);
2559 if (!var_add_value_label (var, &value, label))
2560 sys_warn (r, record->pos + ofs,
2561 _("Duplicate value label for `%.*s' on %s."),
2562 width, value_str (&value, width),
2563 var_get_name (var));
2564 pool_free (r->pool, label);
2566 ofs += label_length;
2571 static void
2572 parse_long_string_missing_values (struct sfm_reader *r,
2573 const struct sfm_extension_record *record,
2574 struct dictionary *dict)
2576 const char *dict_encoding = dict_get_encoding (dict);
2577 size_t end = record->size * record->count;
2578 size_t ofs = 0;
2580 while (ofs < end)
2582 struct missing_values mv;
2583 char *var_name;
2584 struct variable *var;
2585 int n_missing_values;
2586 int var_name_len;
2587 size_t i;
2589 /* Parse variable name length. */
2590 if (!check_overflow (r, record, ofs, 4))
2591 return;
2592 var_name_len = parse_int (r, record->data, ofs);
2593 ofs += 4;
2595 /* Parse variable name. */
2596 if (!check_overflow (r, record, ofs, var_name_len)
2597 || !check_overflow (r, record, ofs, var_name_len + 1))
2598 return;
2599 var_name = recode_string_pool ("UTF-8", dict_encoding,
2600 (const char *) record->data + ofs,
2601 var_name_len, r->pool);
2602 ofs += var_name_len;
2604 /* Parse number of missing values. */
2605 n_missing_values = ((const uint8_t *) record->data)[ofs];
2606 if (n_missing_values < 1 || n_missing_values > 3)
2607 sys_warn (r, record->pos + ofs,
2608 _("Long string missing values record says variable %s "
2609 "has %d missing values, but only 1 to 3 missing values "
2610 "are allowed."),
2611 var_name, n_missing_values);
2612 ofs++;
2614 /* Look up 'var' and validate. */
2615 var = dict_lookup_var (dict, var_name);
2616 if (var == NULL)
2617 sys_warn (r, record->pos + ofs,
2618 _("Ignoring long string missing value record for "
2619 "unknown variable %s."), var_name);
2620 else if (var_is_numeric (var))
2622 sys_warn (r, record->pos + ofs,
2623 _("Ignoring long string missing value record for "
2624 "numeric variable %s."), var_name);
2625 var = NULL;
2628 /* Parse values. */
2629 mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
2630 for (i = 0; i < n_missing_values; i++)
2632 size_t value_length;
2634 /* Parse value length. */
2635 if (!check_overflow (r, record, ofs, 4))
2636 return;
2637 value_length = parse_int (r, record->data, ofs);
2638 ofs += 4;
2640 /* Parse value. */
2641 if (!check_overflow (r, record, ofs, value_length))
2642 return;
2643 if (var != NULL
2644 && i < 3
2645 && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
2646 value_length))
2647 sys_warn (r, record->pos + ofs,
2648 _("Ignoring long string missing value %zu for variable "
2649 "%s, with width %d, that has bad value width %zu."),
2650 i, var_get_name (var), var_get_width (var),
2651 value_length);
2652 ofs += value_length;
2654 if (var != NULL)
2655 var_set_missing_values (var, &mv);
/* Case reader. */

/* Forward declarations for the helpers that read case data. */
static void partial_record (struct sfm_reader *r);
static void read_error (struct casereader *r, const struct sfm_reader *sfm);

static bool read_case_number (struct sfm_reader *r, double *d);
static int read_case_string (struct sfm_reader *r, uint8_t *s, size_t length);
static int read_opcode (struct sfm_reader *r);
static bool read_compressed_number (struct sfm_reader *r, double *d);
static int read_compressed_string (struct sfm_reader *r, uint8_t *dst);
static int read_whole_strings (struct sfm_reader *r, uint8_t *s,
                               size_t length);
static bool skip_whole_strings (struct sfm_reader *r, size_t length);
2673 /* Reads and returns one case from READER's file. Returns a null
2674 pointer if not successful. */
2675 static struct ccase *
2676 sys_file_casereader_read (struct casereader *reader, void *r_)
2678 struct sfm_reader *r = r_;
2679 struct ccase *c;
2680 int retval;
2681 int i;
2683 if (r->error || !r->sfm_var_cnt)
2684 return NULL;
2686 c = case_create (r->proto);
2688 for (i = 0; i < r->sfm_var_cnt; i++)
2690 struct sfm_var *sv = &r->sfm_vars[i];
2691 union value *v = case_data_rw_idx (c, sv->case_index);
2693 if (sv->var_width == 0)
2694 retval = read_case_number (r, &v->f);
2695 else
2697 uint8_t *s = value_str_rw (v, sv->var_width);
2698 retval = read_case_string (r, s + sv->offset, sv->segment_width);
2699 if (retval == 1)
2701 retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
2702 if (retval == 0)
2703 sys_error (r, r->pos, _("File ends in partial string value."));
2707 if (retval != 1)
2708 goto eof;
2710 return c;
2712 eof:
2713 if (i != 0)
2714 partial_record (r);
2715 if (r->case_cnt != -1)
2716 read_error (reader, r);
2717 case_unref (c);
2718 return NULL;
2721 /* Issues an error that R ends in a partial record. */
2722 static void
2723 partial_record (struct sfm_reader *r)
2725 sys_error (r, r->pos, _("File ends in partial case."));
2728 /* Issues an error that an unspecified error occurred SFM, and
2729 marks R tainted. */
2730 static void
2731 read_error (struct casereader *r, const struct sfm_reader *sfm)
2733 msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
2734 casereader_force_error (r);
2737 /* Reads a number from R and stores its value in *D.
2738 If R is compressed, reads a compressed number;
2739 otherwise, reads a number in the regular way.
2740 Returns true if successful, false if end of file is
2741 reached immediately. */
2742 static bool
2743 read_case_number (struct sfm_reader *r, double *d)
2745 if (r->compression == ANY_COMP_NONE)
2747 uint8_t number[8];
2748 if (!try_read_bytes (r, number, sizeof number))
2749 return false;
2750 float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
2751 return true;
2753 else
2754 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.  Data is always consumed in
   multiples of 8 bytes: if LENGTH is not a multiple of 8, a final 8-byte
   unit is read and only its leading LENGTH % 8 bytes are stored in S.
   Reads compressed strings if R is compressed.  Returns 1 if successful, 0
   if end of file is reached immediately, or -1 for some kind of error. */
static int
read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
  size_t partial = length % 8;
  size_t whole = length - partial;

  if (whole > 0)
    {
      int status = read_whole_strings (r, s, whole);
      if (status != 1)
        return status;
    }

  if (partial > 0)
    {
      uint8_t bounce[8];
      int status = read_whole_strings (r, bounce, sizeof bounce);

      if (status == -1)
        return -1;
      if (status == 0)
        {
          /* End of file is only clean if nothing was read before it. */
          if (whole == 0)
            return 0;
          partial_record (r);
          return -1;
        }
      memcpy (s + whole, bounce, partial);
    }

  return 1;
}
2796 /* Reads and returns the next compression opcode from R. */
2797 static int
2798 read_opcode (struct sfm_reader *r)
2800 assert (r->compression != ANY_COMP_NONE);
2801 for (;;)
2803 int opcode;
2804 if (r->opcode_idx >= sizeof r->opcodes)
2807 int retval = try_read_compressed_bytes (r, r->opcodes,
2808 sizeof r->opcodes);
2809 if (retval != 1)
2810 return -1;
2811 r->opcode_idx = 0;
2813 opcode = r->opcodes[r->opcode_idx++];
2815 if (opcode != 0)
2816 return opcode;
2820 /* Reads a compressed number from R and stores its value in D.
2821 Returns true if successful, false if end of file is
2822 reached immediately. */
2823 static bool
2824 read_compressed_number (struct sfm_reader *r, double *d)
2826 int opcode = read_opcode (r);
2827 switch (opcode)
2829 case -1:
2830 case 252:
2831 return false;
2833 case 253:
2834 return read_compressed_float (r, d);
2836 case 254:
2837 float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
2838 if (!r->corruption_warning)
2840 r->corruption_warning = true;
2841 sys_warn (r, r->pos,
2842 _("Possible compressed data corruption: "
2843 "compressed spaces appear in numeric field."));
2845 break;
2847 case 255:
2848 *d = SYSMIS;
2849 break;
2851 default:
2852 *d = opcode - r->bias;
2853 break;
2856 return true;
2859 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
2860 static int
2861 read_compressed_string (struct sfm_reader *r, uint8_t *dst)
2863 int opcode;
2864 int retval;
2866 opcode = read_opcode (r);
2867 switch (opcode)
2869 case -1:
2870 case 252:
2871 return 0;
2873 case 253:
2874 retval = read_compressed_bytes (r, dst, 8);
2875 return retval == 1 ? 1 : -1;
2877 case 254:
2878 memset (dst, ' ', 8);
2879 return 1;
2881 default:
2883 double value = opcode - r->bias;
2884 float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
2885 if (value == 0.0)
2887 /* This has actually been seen "in the wild". The submitter of the
2888 file that showed that the contents decoded as spaces, but they
2889 were at the end of the field so it's possible that the null
2890 bytes just acted as null terminators. */
2892 else if (!r->corruption_warning)
2894 r->corruption_warning = true;
2895 sys_warn (r, r->pos,
2896 _("Possible compressed data corruption: "
2897 "string contains compressed integer (opcode %d)."),
2898 opcode);
2901 return 1;
2905 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
2906 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
2907 end of file is reached immediately, or -1 for some kind of error. */
2908 static int
2909 read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
2911 assert (length % 8 == 0);
2912 if (r->compression == ANY_COMP_NONE)
2913 return try_read_bytes (r, s, length);
2914 else
2916 size_t ofs;
2918 for (ofs = 0; ofs < length; ofs += 8)
2920 int retval = read_compressed_string (r, s + ofs);
2921 if (retval != 1)
2923 if (ofs != 0)
2925 partial_record (r);
2926 return -1;
2928 return retval;
2931 return 1;
/* Skips LENGTH string bytes from R.
   LENGTH must be a multiple of 8.
   (LENGTH is also limited to 1024, but that's only because the
   current caller never needs more than that many bytes.)
   Returns true if successful, false if end of file is
   reached immediately or if an error occurs. */
static bool
skip_whole_strings (struct sfm_reader *r, size_t length)
{
  uint8_t buffer[1024];
  assert (length < sizeof buffer);

  /* read_whole_strings returns 1, 0, or -1.  Converting that directly to
     bool would turn the -1 error code into true, so compare explicitly. */
  return read_whole_strings (r, buffer, length) == 1;
}
2949 /* Helpers for reading records that contain structured text
2950 strings. */
2952 /* Maximum number of warnings to issue for a single text
2953 record. */
2954 #define MAX_TEXT_WARNINGS 5
2956 /* State. */
2957 struct text_record
2959 struct substring buffer; /* Record contents. */
2960 off_t start; /* Starting offset in file. */
2961 size_t pos; /* Current position in buffer. */
2962 int n_warnings; /* Number of warnings issued or suppressed. */
2963 bool recoded; /* Recoded into UTF-8? */
2966 static struct text_record *
2967 open_text_record (struct sfm_reader *r,
2968 const struct sfm_extension_record *record,
2969 bool recode_to_utf8)
2971 struct text_record *text;
2972 struct substring raw;
2974 text = pool_alloc (r->pool, sizeof *text);
2975 raw = ss_buffer (record->data, record->size * record->count);
2976 text->start = record->pos;
2977 text->buffer = (recode_to_utf8
2978 ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool)
2979 : raw);
2980 text->pos = 0;
2981 text->n_warnings = 0;
2982 text->recoded = recode_to_utf8;
2984 return text;
2987 /* Closes TEXT, frees its storage, and issues a final warning
2988 about suppressed warnings if necessary. */
2989 static void
2990 close_text_record (struct sfm_reader *r, struct text_record *text)
2992 if (text->n_warnings > MAX_TEXT_WARNINGS)
2993 sys_warn (r, -1, _("Suppressed %d additional related warnings."),
2994 text->n_warnings - MAX_TEXT_WARNINGS);
2995 if (text->recoded)
2996 pool_free (r->pool, ss_data (text->buffer));
2999 /* Reads a variable=value pair from TEXT.
3000 Looks up the variable in DICT and stores it into *VAR.
3001 Stores a null-terminated value into *VALUE. */
3002 static bool
3003 read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
3004 struct text_record *text,
3005 struct variable **var, char **value)
3007 for (;;)
3009 if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
3010 return false;
3012 *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
3013 if (*value == NULL)
3014 return false;
3016 text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
3017 ss_buffer ("\t\0", 2));
3019 if (*var != NULL)
3020 return true;
3024 static bool
3025 text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
3026 struct text_record *text, struct substring delimiters,
3027 struct variable **var)
3029 char *name;
3031 name = text_get_token (text, delimiters, NULL);
3032 if (name == NULL)
3033 return false;
3035 *var = dict_lookup_var (dict, name);
3036 if (*var != NULL)
3037 return true;
3039 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3040 name);
3041 return false;
3045 static bool
3046 text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
3047 struct text_record *text, struct substring delimiters,
3048 struct variable **var)
3050 char *short_name = text_get_token (text, delimiters, NULL);
3051 if (short_name == NULL)
3052 return false;
3054 *var = dict_lookup_var (dict, short_name);
3055 if (*var == NULL)
3056 text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
3057 short_name);
3058 return true;
3061 /* Displays a warning for the current file position, limiting the
3062 number to MAX_TEXT_WARNINGS for TEXT. */
3063 static void
3064 text_warn (struct sfm_reader *r, struct text_record *text,
3065 const char *format, ...)
3067 if (text->n_warnings++ < MAX_TEXT_WARNINGS)
3069 va_list args;
3071 va_start (args, format);
3072 sys_msg (r, text->start + text->pos, MW, format, args);
3073 va_end (args);
3077 static char *
3078 text_get_token (struct text_record *text, struct substring delimiters,
3079 char *delimiter)
3081 struct substring token;
3082 char *end;
3084 if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
3085 return NULL;
3087 end = &ss_data (token)[ss_length (token)];
3088 if (delimiter != NULL)
3089 *delimiter = *end;
3090 *end = '\0';
3091 return ss_data (token);
3094 /* Reads a integer value expressed in decimal, then a space, then a string that
3095 consists of exactly as many bytes as specified by the integer, then a space,
3096 from TEXT. Returns the string, null-terminated, as a subset of TEXT's
3097 buffer (so the caller should not free the string). */
3098 static const char *
3099 text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
3101 size_t start;
3102 size_t n;
3103 char *s;
3105 start = text->pos;
3106 n = 0;
3107 while (text->pos < text->buffer.length)
3109 int c = text->buffer.string[text->pos];
3110 if (c < '0' || c > '9')
3111 break;
3112 n = (n * 10) + (c - '0');
3113 text->pos++;
3115 if (text->pos >= text->buffer.length || start == text->pos)
3117 sys_warn (r, text->start,
3118 _("Expecting digit at offset %zu in MRSETS record."),
3119 text->pos);
3120 return NULL;
3123 if (!text_match (text, ' '))
3125 sys_warn (r, text->start,
3126 _("Expecting space at offset %zu in MRSETS record."),
3127 text->pos);
3128 return NULL;
3131 if (text->pos + n > text->buffer.length)
3133 sys_warn (r, text->start,
3134 _("%zu-byte string starting at offset %zu "
3135 "exceeds record length %zu."),
3136 n, text->pos, text->buffer.length);
3137 return NULL;
3140 s = &text->buffer.string[text->pos];
3141 if (s[n] != ' ')
3143 sys_warn (r, text->start,
3144 _("Expecting space at offset %zu following %zu-byte string."),
3145 text->pos + n, n);
3146 return NULL;
3148 s[n] = '\0';
3149 text->pos += n + 1;
3150 return s;
3153 static bool
3154 text_match (struct text_record *text, char c)
3156 if (text->pos >= text->buffer.length)
3157 return false;
3159 if (text->buffer.string[text->pos] == c)
3161 text->pos++;
3162 return true;
3164 else
3165 return false;
3168 /* Returns the current byte offset (as converted to UTF-8, if it was converted)
3169 inside the TEXT's string. */
3170 static size_t
3171 text_pos (const struct text_record *text)
3173 return text->pos;
3176 static const char *
3177 text_get_all (const struct text_record *text)
3179 return text->buffer.string;
3182 /* Messages. */
3184 /* Displays a corruption message. */
3185 static void
3186 sys_msg (struct sfm_reader *r, off_t offset,
3187 int class, const char *format, va_list args)
3189 struct msg m;
3190 struct string text;
3192 ds_init_empty (&text);
3193 if (offset >= 0)
3194 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
3195 fh_get_file_name (r->fh), (long long int) offset);
3196 else
3197 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
3198 ds_put_vformat (&text, format, args);
3200 m.category = msg_class_to_category (class);
3201 m.severity = msg_class_to_severity (class);
3202 m.file_name = NULL;
3203 m.first_line = 0;
3204 m.last_line = 0;
3205 m.first_column = 0;
3206 m.last_column = 0;
3207 m.text = ds_cstr (&text);
3209 msg_emit (&m);
3212 /* Displays a warning for offset OFFSET in the file. */
3213 static void
3214 sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
3216 va_list args;
3218 va_start (args, format);
3219 sys_msg (r, offset, MW, format, args);
3220 va_end (args);
3223 /* Displays an error for the current file position and marks it as in an error
3224 state. */
3225 static void
3226 sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
3228 va_list args;
3230 va_start (args, format);
3231 sys_msg (r, offset, ME, format, args);
3232 va_end (args);
3234 r->error = true;
3237 /* Reads BYTE_CNT bytes into BUF.
3238 Returns 1 if exactly BYTE_CNT bytes are successfully read.
3239 Returns -1 if an I/O error or a partial read occurs.
3240 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
3241 an error. */
3242 static inline int
3243 read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
3244 void *buf, size_t byte_cnt)
3246 size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
3247 r->pos += bytes_read;
3248 if (bytes_read == byte_cnt)
3249 return 1;
3250 else if (ferror (r->file))
3252 sys_error (r, r->pos, _("System error: %s."), strerror (errno));
3253 return -1;
3255 else if (!eof_is_ok || bytes_read != 0)
3257 sys_error (r, r->pos, _("Unexpected end of file."));
3258 return -1;
3260 else
3261 return 0;
/* Reads BYTE_CNT bytes from R into BUF, treating end of file as an error.
   Returns true only if all BYTE_CNT bytes were read; returns false upon
   I/O error or if end-of-file is encountered. */
static bool
read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, false, buf, byte_cnt);

  return status == 1;
}
/* Reads BYTE_CNT bytes from R into BUF, tolerating an immediate end of
   file.
   Returns 1 if exactly BYTE_CNT bytes are successfully read.
   Returns 0 if an immediate end-of-file is encountered.
   Returns -1 if an I/O error or a partial read occurs. */
static int
try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
  int status = read_bytes_internal (r, true, buf, byte_cnt);

  return status;
}
3283 /* Reads a 32-bit signed integer from R and stores its value in host format in
3284 *X. Returns true if successful, otherwise false. */
3285 static bool
3286 read_int (struct sfm_reader *r, int *x)
3288 uint8_t integer[4];
3289 if (read_bytes (r, integer, sizeof integer) != 1)
3290 return false;
3291 *x = integer_get (r->integer_format, integer, sizeof integer);
3292 return true;
/* Reads a 32-bit unsigned integer from R and stores its value in host
   format in *X.  Returns true if successful, otherwise false, in which
   case *X is set to 0. */
static bool
read_uint (struct sfm_reader *r, unsigned int *x)
{
  /* read_int leaves its output untouched on failure, so start from a
     defined value instead of copying an uninitialized one into *X. */
  int y = 0;
  bool ok;

  ok = read_int (r, &y);
  *x = y;
  return ok;
}
3306 /* Reads a 64-bit signed integer from R and returns its value in
3307 host format. */
3308 static bool
3309 read_int64 (struct sfm_reader *r, long long int *x)
3311 uint8_t integer[8];
3312 if (read_bytes (r, integer, sizeof integer) != 1)
3313 return false;
3314 *x = integer_get (r->integer_format, integer, sizeof integer);
3315 return true;
/* Reads a 64-bit unsigned integer from R and stores its value in host
   format in *X.  Returns true if successful, otherwise false, in which
   case *X is set to 0. */
static bool
read_uint64 (struct sfm_reader *r, unsigned long long int *x)
{
  /* read_int64 leaves its output untouched on failure, so start from a
     defined value instead of copying an uninitialized one into *X. */
  long long int y = 0;
  bool ok;

  ok = read_int64 (r, &y);
  *x = y;
  return ok;
}
3331 static int
3332 parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
3334 return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
3337 static double
3338 parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
3340 return float_get_double (r->float_format, (const uint8_t *) data + ofs);
/* Reads exactly SIZE - 1 bytes from R into BUFFER and, if that succeeds,
   stores a null byte into BUFFER[SIZE - 1].  SIZE must be positive.
   Returns true if successful, false if the read fails. */
static bool
read_string (struct sfm_reader *r, char *buffer, size_t size)
{
  assert (size > 0);

  if (!read_bytes (r, buffer, size - 1))
    return false;

  buffer[size - 1] = '\0';
  return true;
}
/* Skips BYTES bytes forward in R by reading them, in chunks, into a scratch
   buffer and discarding them.  Returns true if successful, false if a read
   fails. */
static bool
skip_bytes (struct sfm_reader *r, size_t bytes)
{
  char scratch[1024];
  size_t left;

  for (left = bytes; left > 0; )
    {
      size_t chunk = MIN (sizeof scratch, left);

      if (!read_bytes (r, scratch, chunk))
        return false;
      left -= chunk;
    }

  return true;
}
/* Returns a copy of S, allocated with xmalloc(), in which every CR LF pair
   and every lone CR has been replaced by a single LF.

   (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
   files that use CR-only line ends in the file label and extra product
   info.) */
static char *
fix_line_ends (const char *s)
{
  char *out = xmalloc (strlen (s) + 1);
  char *p = out;

  for (; *s != '\0'; s++)
    {
      if (*s != '\r')
        *p++ = *s;
      else
        {
          /* Swallow the LF of a CR LF pair so that only one LF results. */
          if (s[1] == '\n')
            s++;
          *p++ = '\n';
        }
    }
  *p = '\0';

  return out;
}
/* Forward declaration; read_zheader calls this before its definition. */
static bool read_ztrailer (struct sfm_reader *r,
                           long long int zheader_ofs,
                           long long int ztrailer_len);
3407 static void *
3408 zalloc (voidpf pool_, uInt items, uInt size)
3410 struct pool *pool = pool_;
3412 return (!size || xalloc_oversized (items, size)
3413 ? Z_NULL
3414 : pool_malloc (pool, items * size));
3417 static void
3418 zfree (voidpf pool_, voidpf address)
3420 struct pool *pool = pool_;
3422 pool_free (pool, address);
3425 static bool
3426 read_zheader (struct sfm_reader *r)
3428 off_t pos = r->pos;
3429 long long int zheader_ofs;
3430 long long int ztrailer_ofs;
3431 long long int ztrailer_len;
3433 if (!read_int64 (r, &zheader_ofs)
3434 || !read_int64 (r, &ztrailer_ofs)
3435 || !read_int64 (r, &ztrailer_len))
3436 return false;
3438 if (zheader_ofs != pos)
3440 sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
3441 "(expected %#llx)."),
3442 zheader_ofs, (long long int) pos);
3443 return false;
3446 if (ztrailer_ofs < r->pos)
3448 sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
3449 ztrailer_ofs);
3450 return false;
3453 if (ztrailer_len < 24 || ztrailer_len % 24)
3455 sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
3456 return false;
3459 r->ztrailer_ofs = ztrailer_ofs;
3460 if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
3461 return false;
3463 if (r->zin_buf == NULL)
3465 r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
3466 r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
3467 r->zstream.next_in = NULL;
3468 r->zstream.avail_in = 0;
3471 r->zstream.zalloc = zalloc;
3472 r->zstream.zfree = zfree;
3473 r->zstream.opaque = r->pool;
3475 return open_zstream (r);
3478 static void
3479 seek (struct sfm_reader *r, off_t offset)
3481 if (fseeko (r->file, offset, SEEK_SET))
3482 sys_error (r, 0, _("%s: seek failed (%s)."),
3483 fh_get_file_name (r->fh), strerror (errno));
3484 r->pos = offset;
3487 /* Performs some additional consistency checks on the ZLIB compressed data
3488 trailer. */
3489 static bool
3490 read_ztrailer (struct sfm_reader *r,
3491 long long int zheader_ofs,
3492 long long int ztrailer_len)
3494 long long int expected_uncmp_ofs;
3495 long long int expected_cmp_ofs;
3496 long long int bias;
3497 long long int zero;
3498 unsigned int block_size;
3499 unsigned int n_blocks;
3500 unsigned int i;
3501 struct stat s;
3503 if (fstat (fileno (r->file), &s))
3505 sys_error (ME, 0, _("%s: stat failed (%s)."),
3506 fh_get_file_name (r->fh), strerror (errno));
3507 return false;
3510 if (!S_ISREG (s.st_mode))
3512 /* We can't seek to the trailer and then back to the data in this file,
3513 so skip doing extra checks. */
3514 return true;
3517 if (r->ztrailer_ofs + ztrailer_len != s.st_size)
3518 sys_warn (r, r->pos,
3519 _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
3520 r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
3522 seek (r, r->ztrailer_ofs);
3524 /* Read fixed header from ZLIB data trailer. */
3525 if (!read_int64 (r, &bias))
3526 return false;
3527 if (-bias != r->bias)
3529 sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
3530 "file header bias (%.2f)."),
3531 -bias, r->bias);
3532 return false;
3535 if (!read_int64 (r, &zero))
3536 return false;
3537 if (zero != 0)
3538 sys_warn (r, r->pos,
3539 _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
3541 if (!read_uint (r, &block_size))
3542 return false;
3543 if (block_size != ZBLOCK_SIZE)
3544 sys_warn (r, r->pos,
3545 _("ZLIB trailer specifies unexpected %u-byte block size."),
3546 block_size);
3548 if (!read_uint (r, &n_blocks))
3549 return false;
3550 if (n_blocks != (ztrailer_len - 24) / 24)
3552 sys_error (r, r->pos,
3553 _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
3554 "%lld)."),
3555 ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
3556 return false;
3559 expected_uncmp_ofs = zheader_ofs;
3560 expected_cmp_ofs = zheader_ofs + 24;
3561 for (i = 0; i < n_blocks; i++)
3563 off_t desc_ofs = r->pos;
3564 unsigned long long int uncompressed_ofs;
3565 unsigned long long int compressed_ofs;
3566 unsigned int uncompressed_size;
3567 unsigned int compressed_size;
3569 if (!read_uint64 (r, &uncompressed_ofs)
3570 || !read_uint64 (r, &compressed_ofs)
3571 || !read_uint (r, &uncompressed_size)
3572 || !read_uint (r, &compressed_size))
3573 return false;
3575 if (uncompressed_ofs != expected_uncmp_ofs)
3577 sys_error (r, desc_ofs,
3578 _("ZLIB block descriptor %u reported uncompressed data "
3579 "offset %#llx, when %#llx was expected."),
3580 i, uncompressed_ofs, expected_uncmp_ofs);
3581 return false;
3584 if (compressed_ofs != expected_cmp_ofs)
3586 sys_error (r, desc_ofs,
3587 _("ZLIB block descriptor %u reported compressed data "
3588 "offset %#llx, when %#llx was expected."),
3589 i, compressed_ofs, expected_cmp_ofs);
3590 return false;
3593 if (i < n_blocks - 1)
3595 if (uncompressed_size != block_size)
3596 sys_warn (r, desc_ofs,
3597 _("ZLIB block descriptor %u reported block size %#x, "
3598 "when %#x was expected."),
3599 i, uncompressed_size, block_size);
3601 else
3603 if (uncompressed_size > block_size)
3604 sys_warn (r, desc_ofs,
3605 _("ZLIB block descriptor %u reported block size %#x, "
3606 "when at most %#x was expected."),
3607 i, uncompressed_size, block_size);
3610 /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
3611 from compression, with worst-case parameters, is 13.5% plus 11 bytes.
3612 This code checks for an expansion of more than 14.3% plus 11
3613 bytes. */
3614 if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
3616 sys_error (r, desc_ofs,
3617 _("ZLIB block descriptor %u reports compressed size %u "
3618 "and uncompressed size %u."),
3619 i, compressed_size, uncompressed_size);
3620 return false;
3623 expected_uncmp_ofs += uncompressed_size;
3624 expected_cmp_ofs += compressed_size;
3627 if (expected_cmp_ofs != r->ztrailer_ofs)
3629 sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
3630 "would be expected from block descriptors."),
3631 r->ztrailer_ofs, expected_cmp_ofs);
3632 return false;
3635 seek (r, zheader_ofs + 24);
3636 return true;
3639 static bool
3640 open_zstream (struct sfm_reader *r)
3642 int error;
3644 r->zout_pos = r->zout_end = 0;
3645 error = inflateInit (&r->zstream);
3646 if (error != Z_OK)
3648 sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
3649 r->zstream.msg);
3650 return false;
3652 return true;
3655 static bool
3656 close_zstream (struct sfm_reader *r)
3658 int error;
3660 error = inflateEnd (&r->zstream);
3661 if (error != Z_OK)
3663 sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
3664 r->zstream.msg);
3665 return false;
3667 return true;
3670 static int
3671 read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
3673 uint8_t *buf = buf_;
3675 if (byte_cnt == 0)
3676 return 1;
3678 for (;;)
3680 int error;
3682 /* Use already inflated data if there is any. */
3683 if (r->zout_pos < r->zout_end)
3685 unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
3686 memcpy (buf, &r->zout_buf[r->zout_pos], n);
3687 r->zout_pos += n;
3688 byte_cnt -= n;
3689 buf += n;
3691 if (byte_cnt == 0)
3692 return 1;
3695 /* We need to inflate some more data.
3696 Get some more input data if we don't have any. */
3697 if (r->zstream.avail_in == 0)
3699 unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
3700 if (n == 0)
3701 return 0;
3702 else
3704 int retval = try_read_bytes (r, r->zin_buf, n);
3705 if (retval != 1)
3706 return retval;
3707 r->zstream.avail_in = n;
3708 r->zstream.next_in = r->zin_buf;
3712 /* Inflate the (remaining) input data. */
3713 r->zstream.avail_out = ZOUT_BUF_SIZE;
3714 r->zstream.next_out = r->zout_buf;
3715 error = inflate (&r->zstream, Z_SYNC_FLUSH);
3716 r->zout_pos = 0;
3717 r->zout_end = r->zstream.next_out - r->zout_buf;
3718 if (r->zout_end == 0)
3720 if (error != Z_STREAM_END)
3722 sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
3723 r->zstream.msg);
3724 return -1;
3726 else if (!close_zstream (r) || !open_zstream (r))
3727 return -1;
3729 else
3731 /* Process the output data and ignore 'error' for now. ZLIB will
3732 present it to us again on the next inflate() call. */
3737 static int
3738 read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3740 if (r->compression == ANY_COMP_SIMPLE)
3741 return read_bytes (r, buf, byte_cnt);
3742 else
3744 int retval = read_bytes_zlib (r, buf, byte_cnt);
3745 if (retval == 0)
3746 sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
3747 return retval;
3751 static int
3752 try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
3754 if (r->compression == ANY_COMP_SIMPLE)
3755 return try_read_bytes (r, buf, byte_cnt);
3756 else
3757 return read_bytes_zlib (r, buf, byte_cnt);
3760 /* Reads a 64-bit floating-point number from R and returns its
3761 value in host format. */
3762 static bool
3763 read_compressed_float (struct sfm_reader *r, double *d)
3765 uint8_t number[8];
3767 if (!read_compressed_bytes (r, number, sizeof number))
3768 return false;
3770 *d = float_get_double (r->float_format, number);
3771 return true;
3774 static const struct casereader_class sys_file_casereader_class =
3776 sys_file_casereader_read,
3777 sys_file_casereader_destroy,
3778 NULL,
3779 NULL,
3782 const struct any_reader_class sys_file_reader_class =
3784 N_("SPSS System File"),
3785 sfm_detect,
3786 sfm_open,
3787 sfm_close,
3788 sfm_decode,
3789 sfm_get_strings,