psppire: Sort by name or label case-insensitively in dictionary view.
[pspp.git] / src / data / pc+-file-reader.c
blobcb3716cb5079decbf29835694318e02d90a17054
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include <errno.h>
20 #include <float.h>
21 #include <inttypes.h>
22 #include <stdlib.h>
23 #include <sys/stat.h>
25 #include "data/any-reader.h"
26 #include "data/case.h"
27 #include "data/casereader-provider.h"
28 #include "data/casereader.h"
29 #include "data/dictionary.h"
30 #include "data/file-handle-def.h"
31 #include "data/file-name.h"
32 #include "data/format.h"
33 #include "data/identifier.h"
34 #include "data/missing-values.h"
35 #include "data/value-labels.h"
36 #include "data/value.h"
37 #include "data/variable.h"
38 #include "libpspp/float-format.h"
39 #include "libpspp/i18n.h"
40 #include "libpspp/integer-format.h"
41 #include "libpspp/message.h"
42 #include "libpspp/misc.h"
43 #include "libpspp/pool.h"
44 #include "libpspp/str.h"
46 #include "gl/localcharset.h"
47 #include "gl/minmax.h"
48 #include "gl/xalloc.h"
49 #include "gl/xsize.h"
51 #include "gettext.h"
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) (msgid)
/* One directory entry: where a record lives in the file and how big it
   is. */
struct pcp_dir_entry
  {
    unsigned int ofs;           /* Offset of the record within the file. */
    unsigned int len;           /* Length of the record, in bytes. */
  };
61 struct pcp_directory
63 struct pcp_dir_entry main;
64 struct pcp_dir_entry variables;
65 struct pcp_dir_entry labels;
66 struct pcp_dir_entry data;
/* Fields of the main header record that are retained after it is read.
   The character arrays hold strings in the file's own encoding. */
struct pcp_main_header
  {
    char product[63];               /* "PCSPSS SYSTEM FILE..." */
    unsigned int nominal_case_size; /* Number of var positions. */
    char creation_date[9];          /* "[m]m/dd/yy". */
    char creation_time[9];          /* "[H]H:MM:SS". */
    char file_label[65];            /* File label. */
    unsigned int weight_index;      /* Index of weighting variable,
                                       0 if none. */
  };
79 struct pcp_var_record
81 unsigned int pos;
83 bool drop;
84 union value tmp;
86 char name[9];
87 int width;
88 struct fmt_spec format;
89 uint8_t missing[8];
90 char *label;
92 bool weight;
94 struct pcp_value_label *val_labs;
95 size_t n_val_labs;
/* One value label, kept in the form in which it was read. */
struct pcp_value_label
  {
    uint8_t value[8];           /* Raw 8-byte value the label applies to. */
    char *label;                /* Null-terminated label text. */
  };
104 /* System file reader. */
105 struct pcp_reader
107 struct any_reader any_reader;
109 /* Resource tracking. */
110 struct pool *pool; /* All system file state. */
112 /* File data. */
113 unsigned int file_size;
114 struct any_read_info info;
115 struct pcp_directory directory;
116 struct pcp_main_header header;
117 struct pcp_var_record *vars;
118 size_t n_vars;
120 /* File state. */
121 struct file_handle *fh; /* File handle. */
122 struct fh_lock *lock; /* Mutual exclusion for file handle. */
123 FILE *file; /* File stream. */
124 unsigned int pos; /* Position in file. */
125 bool error; /* I/O or corruption error? */
126 struct caseproto *proto; /* Format of output cases. */
128 /* File format. */
129 unsigned int n_cases; /* Number of cases */
130 const char *encoding; /* String encoding. */
132 /* Decompression. */
133 bool compressed;
134 uint8_t opcodes[8]; /* Current block of opcodes. */
135 size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
136 bool corruption_warning; /* Warned about possible corruption? */
139 static struct pcp_reader *
140 pcp_reader_cast (const struct any_reader *r_)
142 assert (r_->klass == &pcp_file_reader_class);
143 return UP_CAST (r_, struct pcp_reader, any_reader);
146 static const struct casereader_class pcp_file_casereader_class;
148 static bool pcp_close (struct any_reader *);
150 static bool read_variables_record (struct pcp_reader *);
152 static void pcp_msg (struct pcp_reader *r, off_t, int class,
153 const char *format, va_list args)
154 PRINTF_FORMAT (4, 0);
155 static void pcp_warn (struct pcp_reader *, off_t, const char *, ...)
156 PRINTF_FORMAT (3, 4);
157 static void pcp_error (struct pcp_reader *, off_t, const char *, ...)
158 PRINTF_FORMAT (3, 4);
160 static bool read_bytes (struct pcp_reader *, void *, size_t)
161 WARN_UNUSED_RESULT;
162 static int try_read_bytes (struct pcp_reader *, void *, size_t)
163 WARN_UNUSED_RESULT;
164 static bool read_uint16 (struct pcp_reader *, unsigned int *)
165 WARN_UNUSED_RESULT;
166 static bool read_uint32 (struct pcp_reader *, unsigned int *)
167 WARN_UNUSED_RESULT;
168 static bool read_float (struct pcp_reader *, double *)
169 WARN_UNUSED_RESULT;
170 static double parse_float (const uint8_t number[8]);
171 static bool read_string (struct pcp_reader *, char *, size_t)
172 WARN_UNUSED_RESULT;
173 static bool skip_bytes (struct pcp_reader *, size_t) WARN_UNUSED_RESULT;
175 static bool pcp_seek (struct pcp_reader *, off_t);
177 static bool pcp_is_sysmis(const uint8_t *);
179 /* Dictionary reader. */
181 static bool read_dictionary (struct pcp_reader *);
182 static bool read_main_header (struct pcp_reader *, struct pcp_main_header *);
183 static void parse_header (struct pcp_reader *,
184 const struct pcp_main_header *,
185 struct any_read_info *, struct dictionary *);
186 static bool parse_variable_records (struct pcp_reader *, struct dictionary *,
187 struct pcp_var_record *, size_t n);
189 /* Tries to open FH for reading as an SPSS/PC+ system file. Returns a
190 pcp_reader if successful, otherwise NULL. */
191 static struct any_reader *
192 pcp_open (struct file_handle *fh)
194 struct stat s;
196 /* Create and initialize reader. */
197 struct pcp_reader *r = XZALLOC (struct pcp_reader);
198 r->any_reader.klass = &pcp_file_reader_class;
199 r->pool = pool_create ();
200 pool_register (r->pool, free, r);
201 r->fh = fh_ref (fh);
202 r->opcode_idx = sizeof r->opcodes;
204 /* TRANSLATORS: this fragment will be interpolated into
205 messages in fh_lock() that identify types of files. */
206 r->lock = fh_lock (fh, FH_REF_FILE, N_("SPSS/PC+ system file"),
207 FH_ACC_READ, false);
208 if (r->lock == NULL)
209 goto error;
211 /* Open file. */
212 r->file = fn_open (fh, "rb");
213 if (r->file == NULL)
215 msg (ME, _("Error opening `%s' for reading as an SPSS/PC+ "
216 "system file: %s."),
217 fh_get_file_name (r->fh), strerror (errno));
218 goto error;
221 /* Fetch file size. */
222 if (fstat (fileno (r->file), &s))
224 pcp_error (r, 0, _("%s: stat failed (%s)."),
225 fh_get_file_name (r->fh), strerror (errno));
226 goto error;
228 if (s.st_size > UINT_MAX)
230 pcp_error (r, 0, _("%s: file too large."), fh_get_file_name (r->fh));
231 goto error;
233 r->file_size = s.st_size;
235 /* Read dictionary. */
236 if (!read_dictionary (r))
237 goto error;
239 if (!pcp_seek (r, r->directory.data.ofs))
240 goto error;
242 return &r->any_reader;
244 error:
245 pcp_close (&r->any_reader);
246 return NULL;
249 static bool
250 pcp_read_dir_entry (struct pcp_reader *r, struct pcp_dir_entry *de)
252 if (!read_uint32 (r, &de->ofs) || !read_uint32 (r, &de->len))
253 return false;
255 if (de->len > r->file_size || de->ofs > r->file_size - de->len)
257 pcp_error (r, r->pos - 8, _("Directory entry is for a %u-byte record "
258 "starting at offset %u but file is only "
259 "%u bytes long."),
260 de->ofs, de->len, r->file_size);
261 return false;
264 return true;
267 static bool
268 read_dictionary (struct pcp_reader *r)
270 unsigned int two, zero;
272 if (!read_uint32 (r, &two) || !read_uint32 (r, &zero))
273 return false;
274 if (two != 2 || zero != 0)
275 pcp_warn (r, 0, _("Directory fields have unexpected values "
276 "(%u,%u)."), two, zero);
278 if (!pcp_read_dir_entry (r, &r->directory.main)
279 || !pcp_read_dir_entry (r, &r->directory.variables)
280 || !pcp_read_dir_entry (r, &r->directory.labels)
281 || !pcp_read_dir_entry (r, &r->directory.data))
282 return false;
284 if (!read_main_header (r, &r->header))
285 return false;
287 read_variables_record (r);
289 return true;
/* Accumulator for pcp_get_strings(): three parallel arrays that grow
   together, all allocated from POOL. */
struct get_strings_aux
  {
    struct pool *pool;          /* Owner of the arrays and their strings. */
    char **titles;              /* Description of each string. */
    char **strings;             /* The strings themselves. */
    bool *ids;                  /* Is each string an identifier? */
    size_t allocated;           /* Capacity of each array, in elements. */
    size_t n;                   /* Number of elements in use. */
  };
302 static void
303 add_string__ (struct get_strings_aux *aux,
304 const char *string, bool id, char *title)
306 if (aux->n >= aux->allocated)
308 aux->allocated = 2 * (aux->allocated + 1);
309 aux->titles = pool_realloc (aux->pool, aux->titles,
310 aux->allocated * sizeof *aux->titles);
311 aux->strings = pool_realloc (aux->pool, aux->strings,
312 aux->allocated * sizeof *aux->strings);
313 aux->ids = pool_realloc (aux->pool, aux->ids,
314 aux->allocated * sizeof *aux->ids);
317 aux->titles[aux->n] = title;
318 aux->strings[aux->n] = pool_strdup (aux->pool, string);
319 aux->ids[aux->n] = id;
320 aux->n++;
323 static void PRINTF_FORMAT (3, 4)
324 add_string (struct get_strings_aux *aux,
325 const char *string, const char *title, ...)
327 va_list args;
329 va_start (args, title);
330 add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
331 va_end (args);
334 static void PRINTF_FORMAT (3, 4)
335 add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
337 va_list args;
339 va_start (args, title);
340 add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
341 va_end (args);
344 /* Retrieves significant string data from R in its raw format, to allow the
345 caller to try to detect the encoding in use.
347 Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
348 and *STRINGSP to an array of N elements allocated from POOL. For each I in
349 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
350 whatever encoding system file R uses. *IDS[I] is true if *STRINGSP[I] must
351 be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
352 text. */
353 static size_t
354 pcp_get_strings (const struct any_reader *r_, struct pool *pool,
355 char ***titlesp, bool **idsp, char ***stringsp)
357 struct pcp_reader *r = pcp_reader_cast (r_);
358 struct get_strings_aux aux;
359 size_t var_idx;
360 size_t i, j;
362 aux.pool = pool;
363 aux.titles = NULL;
364 aux.strings = NULL;
365 aux.ids = NULL;
366 aux.allocated = 0;
367 aux.n = 0;
369 var_idx = 0;
370 for (i = 0; i < r->n_vars; i++)
371 if (r->vars[i].width != -1)
372 add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
374 var_idx = 0;
375 for (i = 0; i < r->n_vars; i++)
376 if (r->vars[i].width != -1)
378 var_idx++;
379 if (r->vars[i].label)
380 add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
381 var_idx);
383 for (j = 0; j < r->vars[i].n_val_labs; j++)
384 add_string (&aux, r->vars[i].label,
385 _("Variable %zu Value Label %zu"), var_idx, j);
388 add_string (&aux, r->header.creation_date, _("Creation Date"));
389 add_string (&aux, r->header.creation_time, _("Creation Time"));
390 add_string (&aux, r->header.product, _("Product"));
391 add_string (&aux, r->header.file_label, _("File Label"));
393 *titlesp = aux.titles;
394 *idsp = aux.ids;
395 *stringsp = aux.strings;
396 return aux.n;
399 /* Decodes the dictionary read from R, saving it into *DICT. Character
400 strings in R are decoded using ENCODING, or an encoding obtained from R if
401 ENCODING is null, or the locale encoding if R specifies no encoding.
403 If INFOP is non-null, then it receives additional info about the system
404 file, which the caller must eventually free with any_read_info_destroy()
405 when it is no longer needed.
407 This function consumes R. The caller must use it again later, even to
408 destroy it with pcp_close(). */
409 static struct casereader *
410 pcp_decode (struct any_reader *r_, const char *encoding,
411 struct dictionary **dictp, struct any_read_info *infop)
413 struct pcp_reader *r = pcp_reader_cast (r_);
414 struct dictionary *dict;
416 if (encoding == NULL)
418 encoding = locale_charset ();
419 pcp_warn (r, -1, _("Using default encoding %s to read this SPSS/PC+ "
420 "system file. For best results, specify an "
421 "encoding explicitly. Use SYSFILE INFO with "
422 "ENCODING=\"DETECT\" to analyze the possible "
423 "encodings."),
424 encoding);
427 dict = dict_create (encoding);
428 r->encoding = dict_get_encoding (dict);
430 parse_header (r, &r->header, &r->info, dict);
431 if (!parse_variable_records (r, dict, r->vars, r->n_vars))
432 goto error;
434 /* Create an index of dictionary variable widths for
435 pcp_read_case to use. We cannot use the `struct variable's
436 from the dictionary we created, because the caller owns the
437 dictionary and may destroy or modify its variables. */
438 r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
440 *dictp = dict;
441 if (infop)
443 *infop = r->info;
444 memset (&r->info, 0, sizeof r->info);
447 return casereader_create_sequential
448 (NULL, r->proto, r->n_cases, &pcp_file_casereader_class, r);
450 error:
451 pcp_close (&r->any_reader);
452 dict_unref (dict);
453 *dictp = NULL;
454 return NULL;
457 /* Closes R, which should have been returned by pcp_open() but not already
458 closed with pcp_decode() or this function.
459 Returns true if an I/O error has occurred on READER, false
460 otherwise. */
461 static bool
462 pcp_close (struct any_reader *r_)
464 struct pcp_reader *r = pcp_reader_cast (r_);
465 bool error;
467 if (r->file)
469 if (fn_close (r->fh, r->file) == EOF)
471 msg (ME, _("Error closing system file `%s': %s."),
472 fh_get_file_name (r->fh), strerror (errno));
473 r->error = true;
475 r->file = NULL;
478 any_read_info_destroy (&r->info);
479 fh_unlock (r->lock);
480 fh_unref (r->fh);
482 error = r->error;
483 pool_destroy (r->pool);
485 return !error;
488 /* Destroys READER. */
489 static void
490 pcp_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
492 struct pcp_reader *r = r_;
493 pcp_close (&r->any_reader);
/* Detects whether FILE is an SPSS/PC+ system file, by looking for the bytes
   "SPSS" at offset 0x104.  Returns 1 if so, 0 if not (including when the
   file is too short to contain them), and a negative errno value if there
   is an error reading FILE. */
static int
pcp_detect (FILE *file)
{
  static const char expected[4] = "SPSS";
  char actual[sizeof expected];

  if (fseek (file, 0x104, SEEK_SET) != 0)
    return -errno;

  if (fread (actual, sizeof actual, 1, file) != 1)
    {
      /* A short read is an error only if the stream says so; otherwise the
         file simply is not long enough to be a match. */
      if (ferror (file))
        return -errno;
      return 0;
    }

  return memcmp (actual, expected, sizeof actual) == 0;
}
513 /* Reads the main header of the SPSS/PC+ system file. Initializes *HEADER and
514 *INFO, except for the string fields in *INFO, which parse_header() will
515 initialize later once the file's encoding is known. */
516 static bool
517 read_main_header (struct pcp_reader *r, struct pcp_main_header *header)
519 unsigned int base_ofs = r->directory.main.ofs;
520 unsigned int zero0, zero1, zero2, zero3;
521 size_t min_values, min_data_size;
522 unsigned int one0, one1;
523 unsigned int compressed;
524 unsigned int n_cases1;
525 uint8_t sysmis[8];
527 if (!pcp_seek (r, base_ofs))
528 return false;
530 if (r->directory.main.len < 0xb0)
532 pcp_error (r, r->pos, _("This is not an SPSS/PC+ system file."));
533 return false;
535 else if (r->directory.main.len > 0xb0)
536 pcp_warn (r, r->pos, _("Record 0 has unexpected length %u."),
537 r->directory.main.len);
539 if (!read_uint16 (r, &one0)
540 || !read_string (r, header->product, sizeof header->product)
541 || !read_bytes (r, sysmis, sizeof sysmis)
542 || !read_uint32 (r, &zero0)
543 || !read_uint32 (r, &zero1)
544 || !read_uint16 (r, &one1)
545 || !read_uint16 (r, &compressed)
546 || !read_uint16 (r, &header->nominal_case_size)
547 || !read_uint16 (r, &r->n_cases)
548 || !read_uint16 (r, &header->weight_index)
549 || !read_uint16 (r, &zero2)
550 || !read_uint16 (r, &n_cases1)
551 || !read_uint16 (r, &zero3)
552 || !read_string (r, header->creation_date, sizeof header->creation_date)
553 || !read_string (r, header->creation_time, sizeof header->creation_time)
554 || !read_string (r, header->file_label, sizeof header->file_label))
555 return false;
557 if (!pcp_is_sysmis (sysmis))
559 double d = parse_float (sysmis);
560 pcp_warn (r, base_ofs, _("Record 0 specifies unexpected system missing "
561 "value %g (%a)."), d, d);
563 if (one0 != 1 || one1 != 1
564 || zero0 != 0 || zero1 != 0 || zero2 != 0 || zero3 != 0)
565 pcp_warn (r, base_ofs, _("Record 0 reserved fields have unexpected values "
566 "(%u,%u,%u,%u,%u,%u)."),
567 one0, one1, zero0, zero1, zero2, zero3);
568 if (n_cases1 != r->n_cases)
569 pcp_warn (r, base_ofs, _("Record 0 case counts differ (%u versus %u)."),
570 r->n_cases, n_cases1);
571 if (compressed != 0 && compressed != 1)
573 pcp_error (r, base_ofs, _("Invalid compression type %u."), compressed);
574 return false;
577 r->compressed = compressed != 0;
579 min_values = xtimes (header->nominal_case_size, r->n_cases);
580 min_data_size = xtimes (compressed ? 1 : 8, min_values);
581 if (r->directory.data.len < min_data_size
582 || size_overflow_p (min_data_size))
584 pcp_warn (r, base_ofs, _("Record 0 claims %u cases with %u values per "
585 "case (requiring at least %zu bytes) but data "
586 "record is only %u bytes long."),
587 r->n_cases, header->nominal_case_size, min_data_size,
588 r->directory.data.len);
589 return true;
592 return true;
595 static bool
596 read_value_labels (struct pcp_reader *r, struct pcp_var_record *var,
597 unsigned int start, unsigned int end)
599 size_t allocated_val_labs = 0;
601 start += 7;
602 end += 7;
603 if (end > r->directory.labels.len)
605 pcp_warn (r, r->pos - 32,
606 _("Value labels claimed to end at offset %u in labels record "
607 "but labels record is only %u bytes."),
608 end, r->directory.labels.len);
609 return true;
612 start += r->directory.labels.ofs;
613 end += r->directory.labels.ofs;
614 if (start > end || end > r->file_size)
616 pcp_warn (r, r->pos - 32,
617 _("Value labels claimed to be at offset %u with length %u "
618 "but file size is only %u bytes."),
619 start, end - start, r->file_size);
620 return true;
623 if (!pcp_seek (r, start))
624 return false;
626 while (r->pos < end && end - r->pos > 8)
628 struct pcp_value_label *vl;
629 uint8_t len;
631 if (var->n_val_labs >= allocated_val_labs)
632 var->val_labs = pool_2nrealloc (r->pool, var->val_labs,
633 &allocated_val_labs,
634 sizeof *var->val_labs);
635 vl = &var->val_labs[var->n_val_labs];
637 if (!read_bytes (r, vl->value, sizeof vl->value)
638 || !read_bytes (r, &len, 1))
639 return false;
641 if (end - r->pos < len)
643 pcp_warn (r, r->pos,
644 _("Value labels end with partial label (%u bytes left in "
645 "record, label length %"PRIu8")."),
646 end - r->pos, len);
647 return true;
649 vl->label = pool_malloc (r->pool, len + 1);
650 if (!read_bytes (r, vl->label, len))
651 return false;
653 vl->label[len] = '\0';
654 var->n_val_labs++;
656 if (r->pos < end)
657 pcp_warn (r, r->pos, _("%u leftover bytes following value labels."),
658 end - r->pos);
660 return true;
663 static bool
664 read_var_label (struct pcp_reader *r, struct pcp_var_record *var,
665 unsigned int ofs)
667 uint8_t len;
669 ofs += 7;
670 if (ofs >= r->directory.labels.len)
672 pcp_warn (r, r->pos - 32,
673 _("Variable label claimed to start at offset %u in labels "
674 "record but labels record is only %u bytes."),
675 ofs, r->directory.labels.len);
676 return true;
679 if (!pcp_seek (r, ofs + r->directory.labels.ofs) || !read_bytes (r, &len, 1))
680 return false;
682 if (len >= r->directory.labels.len - ofs)
684 pcp_warn (r, r->pos - 1,
685 _("Variable label with length %u starting at offset %u in "
686 "labels record overruns end of %u-byte labels record."),
687 len, ofs + 1, r->directory.labels.len);
688 return false;
691 var->label = pool_malloc (r->pool, len + 1);
692 var->label[len] = '\0';
693 return read_bytes (r, var->label, len);
696 /* Reads the variables record (record 1) into R. */
697 static bool
698 read_variables_record (struct pcp_reader *r)
700 unsigned int i;
701 bool weighted;
703 if (!pcp_seek (r, r->directory.variables.ofs))
704 return false;
705 if (r->directory.variables.len != r->header.nominal_case_size * 32)
707 pcp_error (r, r->pos, _("Record 1 has length %u (expected %u)."),
708 r->directory.variables.len, r->header.nominal_case_size * 32);
709 return false;
712 r->vars = pool_calloc (r->pool,
713 r->header.nominal_case_size, sizeof *r->vars);
714 weighted = false;
715 for (i = 0; i < r->header.nominal_case_size; i++)
717 struct pcp_var_record *var = &r->vars[r->n_vars++];
718 unsigned int value_label_start, value_label_end;
719 unsigned int var_label_ofs;
720 unsigned int format;
721 uint8_t raw_type;
723 var->pos = r->pos;
724 if (!read_uint32 (r, &value_label_start)
725 || !read_uint32 (r, &value_label_end)
726 || !read_uint32 (r, &var_label_ofs)
727 || !read_uint32 (r, &format)
728 || !read_string (r, var->name, sizeof var->name)
729 || !read_bytes (r, var->missing, sizeof var->missing))
730 return false;
732 var->weight = r->header.weight_index && i == r->header.weight_index - 1;
733 if (var->weight)
734 weighted = true;
736 raw_type = format >> 16;
737 if (!fmt_from_io (raw_type, &var->format.type))
739 pcp_error (r, var->pos, _("Variable %u has invalid type %"PRIu8"."),
740 i, raw_type);
741 return false;
744 var->format.w = (format >> 8) & 0xff;
745 var->format.d = format & 0xff;
746 fmt_fix_output (&var->format);
747 var->width = fmt_var_width (var->format);
749 if (var_label_ofs)
751 unsigned int save_pos = r->pos;
752 if (!read_var_label (r, var, var_label_ofs)
753 || !pcp_seek (r, save_pos))
754 return false;
757 if (value_label_end > value_label_start && var->width <= 8)
759 unsigned int save_pos = r->pos;
760 if (!read_value_labels (r, var, value_label_start, value_label_end)
761 || !pcp_seek (r, save_pos))
762 return false;
765 if (var->width > 8)
767 int extra = DIV_RND_UP (var->width - 8, 8);
768 i += extra;
769 if (!skip_bytes (r, 32 * extra))
770 return false;
774 if (r->header.weight_index && !weighted)
775 pcp_warn (r, -1, _("Invalid weight index %u."), r->header.weight_index);
777 return true;
780 static char *
781 recode_and_trim_string (struct pool *pool, const char *from, const char *in)
783 struct substring out;
785 out = recode_substring_pool ("UTF-8", from, ss_cstr (in), pool);
786 ss_trim (&out, ss_cstr (" "));
787 return ss_xstrdup (out);
790 static void
791 parse_header (struct pcp_reader *r, const struct pcp_main_header *header,
792 struct any_read_info *info, struct dictionary *dict)
794 const char *dict_encoding = dict_get_encoding (dict);
795 char *label;
797 memset (info, 0, sizeof *info);
799 info->integer_format = INTEGER_LSB_FIRST;
800 info->float_format = FLOAT_IEEE_DOUBLE_LE;
801 info->compression = r->compressed ? ANY_COMP_SIMPLE : ANY_COMP_NONE;
802 info->n_cases = r->n_cases;
804 /* Convert file label to UTF-8 and put it into DICT. */
805 label = recode_and_trim_string (r->pool, dict_encoding, header->file_label);
806 dict_set_label (dict, label);
807 free (label);
809 /* Put creation date, time, and product in UTF-8 into INFO. */
810 info->creation_date = recode_and_trim_string (r->pool, dict_encoding,
811 header->creation_date);
812 info->creation_time = recode_and_trim_string (r->pool, dict_encoding,
813 header->creation_time);
814 info->product = recode_and_trim_string (r->pool, dict_encoding,
815 header->product);
818 /* Reads a variable (type 2) record from R and adds the
819 corresponding variable to DICT.
820 Also skips past additional variable records for long string
821 variables. */
822 static bool
823 parse_variable_records (struct pcp_reader *r, struct dictionary *dict,
824 struct pcp_var_record *var_recs, size_t n_var_recs)
826 const char *dict_encoding = dict_get_encoding (dict);
827 struct pcp_var_record *rec;
829 for (rec = var_recs; rec < &var_recs[n_var_recs]; rec++)
831 char *name;
832 size_t i;
834 name = recode_string_pool ("UTF-8", dict_encoding,
835 rec->name, -1, r->pool);
836 name[strcspn (name, " ")] = '\0';
838 /* Drop system variables. */
839 rec->drop = name[0] == '$';
840 if (rec->drop)
842 value_init_pool (r->pool, &rec->tmp, rec->width);
843 continue;
846 if (!dict_id_is_valid (dict, name, DC_ORDINARY))
848 pcp_error (r, rec->pos, _("Invalid variable name `%s'."), name);
849 return false;
852 struct variable *var = dict_create_var (dict, name, rec->width);
853 if (var == NULL)
855 var = dict_create_var_with_unique_name (dict, name, rec->width);
856 pcp_warn (r, rec->pos, _("Renaming variable with duplicate name "
857 "`%s' to `%s'."),
858 name, var_get_name (var));
860 if (rec->weight)
862 if (!rec->width)
863 dict_set_weight (dict, var);
864 else
865 pcp_warn (r, rec->pos,
866 _("Cannot weight by string variable `%s'."), name);
869 /* Set the short name the same as the long name. */
870 var_set_short_name (var, 0, name);
872 /* Get variable label, if any. */
873 if (rec->label)
875 char *utf8_label;
877 utf8_label = recode_string ("UTF-8", dict_encoding, rec->label, -1);
878 var_set_label (var, utf8_label);
879 free (utf8_label);
882 /* Add value labels. */
883 for (i = 0; i < rec->n_val_labs; i++)
885 union value value;
886 char *utf8_label;
888 value_init (&value, rec->width);
889 if (var_is_numeric (var))
890 value.f = parse_float (rec->val_labs[i].value);
891 else
892 memcpy (value.s, rec->val_labs[i].value, rec->width);
894 utf8_label = recode_string ("UTF-8", dict_encoding,
895 rec->val_labs[i].label, -1);
896 var_add_value_label (var, &value, utf8_label);
897 free (utf8_label);
899 value_destroy (&value, rec->width);
902 /* Set missing values. */
903 if (rec->width <= 8 && !pcp_is_sysmis (rec->missing))
905 int width = var_get_width (var);
906 struct missing_values mv;
908 mv_init_pool (r->pool, &mv, width);
909 if (var_is_numeric (var))
910 mv_add_num (&mv, parse_float (rec->missing));
911 else
912 mv_add_str (&mv, rec->missing, MIN (width, 8));
913 var_set_missing_values (var, &mv);
916 /* Set formats. */
917 var_set_both_formats (var, rec->format);
920 return true;
/* Case reader. */

static void read_error (struct casereader *, const struct pcp_reader *);

/* Reading one value of a case. */
static bool read_case_number (struct pcp_reader *, double *);
static int read_case_string (struct pcp_reader *, uint8_t *, size_t);
static int read_whole_strings (struct pcp_reader *, uint8_t *, size_t);

/* Compressed data. */
static int read_opcode (struct pcp_reader *);
static bool read_compressed_number (struct pcp_reader *, double *);
static int read_compressed_string (struct pcp_reader *, uint8_t *);
934 /* Reads and returns one case from READER's file. Returns a null
935 pointer if not successful. */
936 static struct ccase *
937 pcp_file_casereader_read (struct casereader *reader, void *r_)
939 struct pcp_reader *r = r_;
940 unsigned int start_pos = r->pos;
941 struct ccase *c;
942 int retval;
943 int i;
945 if (r->error || !r->n_cases)
946 return NULL;
947 r->n_cases--;
949 c = case_create (r->proto);
950 size_t case_idx = 0;
951 for (i = 0; i < r->n_vars; i++)
953 struct pcp_var_record *var = &r->vars[i];
954 union value *v = var->drop ? &var->tmp : case_data_rw_idx (c, case_idx++);
956 if (var->width == 0)
957 retval = read_case_number (r, &v->f);
958 else
959 retval = read_case_string (r, v->s, var->width);
961 if (retval != 1)
963 pcp_error (r, r->pos, _("File ends in partial case."));
964 goto error;
967 if (r->pos > r->directory.data.ofs + r->directory.data.len)
969 pcp_error (r, r->pos, _("Case beginning at offset 0x%08x extends past "
970 "end of data record at offset 0x%08x."),
971 start_pos, r->directory.data.ofs + r->directory.data.len);
972 goto error;
975 return c;
977 error:
978 read_error (reader, r);
979 case_unref (c);
980 return NULL;
983 /* Issues an error that an unspecified error occurred PCP, and
984 marks R tainted. */
985 static void
986 read_error (struct casereader *r, const struct pcp_reader *pcp)
988 msg (ME, _("Error reading case from file %s."), fh_get_name (pcp->fh));
989 casereader_force_error (r);
992 /* Reads a number from R and stores its value in *D.
993 If R is compressed, reads a compressed number;
994 otherwise, reads a number in the regular way.
995 Returns true if successful, false if end of file is
996 reached immediately. */
997 static bool
998 read_case_number (struct pcp_reader *r, double *d)
1000 if (!r->compressed)
1002 uint8_t number[8];
1003 if (!try_read_bytes (r, number, sizeof number))
1004 return false;
1005 *d = parse_float (number);
1006 return true;
1008 else
1009 return read_compressed_number (r, d);
/* Reads LENGTH string bytes from R into S.  Always reads a multiple of 8
   bytes; if LENGTH is not a multiple of 8, then extra bytes are read and
   discarded without being written to S.  Reads compressed strings if R is
   compressed.

   Returns 1 if successful.  If reading the leading multiple-of-8 part
   fails, returns read_whole_strings()'s result unchanged (0 or -1).  If
   reading the final partial segment fails, returns -1. */
static int
read_case_string (struct pcp_reader *r, uint8_t *s, size_t length)
{
  size_t partial = length % 8;
  size_t whole = length - partial;

  if (whole > 0)
    {
      int retval = read_whole_strings (r, s, whole);
      if (retval != 1)
        return retval;
    }

  if (partial > 0)
    {
      /* Read a full 8-byte segment into a bounce buffer and keep only the
         part that fits in S. */
      uint8_t bounce[8];
      if (read_whole_strings (r, bounce, sizeof bounce) <= 0)
        return -1;
      memcpy (s + whole, bounce, partial);
    }

  return 1;
}
1042 /* Reads and returns the next compression opcode from R. */
1043 static int
1044 read_opcode (struct pcp_reader *r)
1046 assert (r->compressed);
1047 if (r->opcode_idx >= sizeof r->opcodes)
1049 int retval = try_read_bytes (r, r->opcodes, sizeof r->opcodes);
1050 if (retval != 1)
1051 return -1;
1052 r->opcode_idx = 0;
1054 return r->opcodes[r->opcode_idx++];
1057 /* Reads a compressed number from R and stores its value in D.
1058 Returns true if successful, false if end of file is
1059 reached immediately. */
1060 static bool
1061 read_compressed_number (struct pcp_reader *r, double *d)
1063 int opcode = read_opcode (r);
1064 switch (opcode)
1066 case -1:
1067 return false;
1069 case 0:
1070 *d = SYSMIS;
1071 return true;
1073 case 1:
1074 return read_float (r, d);
1076 default:
1077 *d = opcode - 105.0;
1078 return true;
1082 /* Reads a compressed 8-byte string segment from R and stores it in DST. */
1083 static int
1084 read_compressed_string (struct pcp_reader *r, uint8_t *dst)
1086 int opcode;
1087 int retval;
1089 opcode = read_opcode (r);
1090 switch (opcode)
1092 case -1:
1093 return 0;
1095 case 1:
1096 retval = read_bytes (r, dst, 8);
1097 return retval == 1 ? 1 : -1;
1099 default:
1100 if (!r->corruption_warning)
1102 r->corruption_warning = true;
1103 pcp_warn (r, r->pos,
1104 _("Possible compressed data corruption: "
1105 "string contains compressed integer (opcode %d)."),
1106 opcode);
1108 memset (dst, ' ', 8);
1109 return 1;
1113 /* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8.
1114 Reads compressed strings if S is compressed. Returns 1 if successful, 0 if
1115 end of file is reached immediately, or -1 for some kind of error. */
1116 static int
1117 read_whole_strings (struct pcp_reader *r, uint8_t *s, size_t length)
1119 assert (length % 8 == 0);
1120 if (!r->compressed)
1121 return try_read_bytes (r, s, length);
1122 else
1124 size_t ofs;
1126 for (ofs = 0; ofs < length; ofs += 8)
1128 int retval = read_compressed_string (r, s + ofs);
1129 if (retval != 1)
1130 return -1;
1132 return 1;
1136 /* Messages. */
1138 /* Displays a corruption message. */
1139 static void
1140 pcp_msg (struct pcp_reader *r, off_t offset,
1141 int class, const char *format, va_list args)
1143 struct string text;
1144 ds_init_empty (&text);
1145 if (offset >= 0)
1146 ds_put_format (&text, _("`%s' near offset 0x%llx: "),
1147 fh_get_file_name (r->fh), (long long int) offset);
1148 else
1149 ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
1150 ds_put_vformat (&text, format, args);
1152 struct msg *m = xmalloc (sizeof *m);
1153 *m = (struct msg) {
1154 .category = msg_class_to_category (class),
1155 .severity = msg_class_to_severity (class),
1156 .text = ds_steal_cstr (&text),
1158 msg_emit (m);
1161 /* Displays a warning for offset OFFSET in the file. */
1162 static void
1163 pcp_warn (struct pcp_reader *r, off_t offset, const char *format, ...)
1165 va_list args;
1167 va_start (args, format);
1168 pcp_msg (r, offset, MW, format, args);
1169 va_end (args);
1172 /* Displays an error for the current file position,
1173 marks it as in an error state,
1174 and aborts reading it using longjmp. */
1175 static void
1176 pcp_error (struct pcp_reader *r, off_t offset, const char *format, ...)
1178 va_list args;
1180 va_start (args, format);
1181 pcp_msg (r, offset, ME, format, args);
1182 va_end (args);
1184 r->error = true;
1187 /* Reads BYTE_CNT bytes into BUF.
1188 Returns 1 if exactly BYTE_CNT bytes are successfully read.
1189 Returns -1 if an I/O error or a partial read occurs.
1190 Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports
1191 an error. */
1192 static inline int
1193 read_bytes_internal (struct pcp_reader *r, bool eof_is_ok,
1194 void *buf, size_t n_bytes)
1196 size_t bytes_read = fread (buf, 1, n_bytes, r->file);
1197 r->pos += bytes_read;
1198 if (bytes_read == n_bytes)
1199 return 1;
1200 else if (ferror (r->file))
1202 pcp_error (r, r->pos, _("System error: %s."), strerror (errno));
1203 return -1;
1205 else if (!eof_is_ok || bytes_read != 0)
1207 pcp_error (r, r->pos, _("Unexpected end of file."));
1208 return -1;
1210 else
1211 return 0;
/* Reads N_BYTES bytes from R into BUF, treating end of file as an error.
   Returns true only if all N_BYTES bytes were read; returns false upon I/O
   error or if end of file is encountered. */
static bool
read_bytes (struct pcp_reader *r, void *buf, size_t n_bytes)
{
  const int status = read_bytes_internal (r, false, buf, n_bytes);
  return status == 1;
}
/* Reads N_BYTES bytes from R into BUF, tolerating end of file at the very
   start of the read.
   Returns 1 if exactly N_BYTES bytes are successfully read.
   Returns 0 if an immediate end-of-file is encountered.
   Returns -1 if an I/O error or a partial read occurs. */
static int
try_read_bytes (struct pcp_reader *r, void *buf, size_t n_bytes)
{
  const bool eof_is_ok = true;
  return read_bytes_internal (r, eof_is_ok, buf, n_bytes);
}
1233 /* Reads a 16-bit signed integer from R and stores its value in host format in
1234 *X. Returns true if successful, otherwise false. */
1235 static bool
1236 read_uint16 (struct pcp_reader *r, unsigned int *x)
1238 uint8_t integer[2];
1239 if (read_bytes (r, integer, sizeof integer) != 1)
1240 return false;
1241 *x = integer_get (INTEGER_LSB_FIRST, integer, sizeof integer);
1242 return true;
1245 /* Reads a 32-bit signed integer from R and stores its value in host format in
1246 *X. Returns true if successful, otherwise false. */
1247 static bool
1248 read_uint32 (struct pcp_reader *r, unsigned int *x)
1250 uint8_t integer[4];
1251 if (read_bytes (r, integer, sizeof integer) != 1)
1252 return false;
1253 *x = integer_get (INTEGER_LSB_FIRST, integer, sizeof integer);
1254 return true;
/* Reads exactly SIZE - 1 bytes from R into BUFFER and, on success, stores a
   null byte into BUFFER[SIZE - 1].  SIZE must be positive.  Returns true if
   the bytes were read, false otherwise. */
static bool
read_string (struct pcp_reader *r, char *buffer, size_t size)
{
  assert (size > 0);

  const size_t n_chars = size - 1;
  if (!read_bytes (r, buffer, n_chars))
    return false;

  buffer[n_chars] = '\0';
  return true;
}
/* Skips BYTES bytes forward in R by reading them, at most 1024 at a time,
   into a scratch buffer that is then discarded.  Returns true if all of
   them could be read, false as soon as a read fails. */
static bool
skip_bytes (struct pcp_reader *r, size_t bytes)
{
  char discard[1024];

  for (size_t left = bytes; left > 0; )
    {
      const size_t n = MIN (sizeof discard, left);
      if (!read_bytes (r, discard, n))
        return false;
      left -= n;
    }

  return true;
}
1287 static bool
1288 pcp_seek (struct pcp_reader *r, off_t offset)
1290 if (fseeko (r->file, offset, SEEK_SET))
1292 pcp_error (r, 0, _("%s: seek failed (%s)."),
1293 fh_get_file_name (r->fh), strerror (errno));
1294 return false;
1296 r->pos = offset;
1297 return true;
/* Reads an 8-byte floating-point number from R and stores its value, in host
   format, in *D.  Returns true if successful; returns false, leaving *D
   untouched, if the 8 bytes could not be read. */
static bool
read_float (struct pcp_reader *r, double *d)
{
  uint8_t raw[8];
  const bool ok = read_bytes (r, raw, sizeof raw);

  if (ok)
    *d = parse_float (raw);
  return ok;
}
1316 static double
1317 parse_float (const uint8_t number[8])
1319 return (pcp_is_sysmis (number)
1320 ? SYSMIS
1321 : float_get_double (FLOAT_IEEE_DOUBLE_LE, number));
/* Returns true if the 8 bytes at P are exactly the byte pattern that this
   reader treats as the system-missing value, false otherwise. */
static bool
pcp_is_sysmis(const uint8_t *p)
{
  static const uint8_t sysmis_pattern[8] = {
    0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff
  };

  return memcmp (p, sysmis_pattern, sizeof sysmis_pattern) == 0;
}
1332 static const struct casereader_class pcp_file_casereader_class =
1334 pcp_file_casereader_read,
1335 pcp_file_casereader_destroy,
1336 NULL,
1337 NULL,
1340 const struct any_reader_class pcp_file_reader_class =
1342 N_("SPSS/PC+ System File"),
1343 pcp_detect,
1344 pcp_open,
1345 pcp_close,
1346 pcp_decode,
1347 pcp_get_strings,