src/language/commands/get-data.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012,
   3                  2013, 2015, 2016 Free Software Foundation, Inc.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  17
  18 #include <config.h>
  19
  20 #include <stdlib.h>
  21
  22 #include <string.h>
  23
  24 #include "data/dataset.h"
  25 #include "data/dictionary.h"
  26 #include "data/format.h"
  27 #include "data/gnumeric-reader.h"
  28 #include "data/ods-reader.h"
  29 #include "data/spreadsheet-reader.h"
  30 #include "data/psql-reader.h"
  31 #include "data/settings.h"
  32 #include "language/command.h"
  33 #include "language/commands/data-parser.h"
  34 #include "language/commands/data-reader.h"
  35 #include "language/commands/file-handle.h"
  36 #include "language/commands/placement-parser.h"
  37 #include "language/lexer/format-parser.h"
  38 #include "language/lexer/lexer.h"
  39 #include "libpspp/cast.h"
  40 #include "libpspp/i18n.h"
  41 #include "libpspp/message.h"
  42
  43 #include "gl/xalloc.h"
  44
  45 #include "gettext.h"
  46 #define _(msgid) gettext (msgid)
  47 #define N_(msgid) (msgid)
  48
/* Parses the /FILE subcommand and the optional subcommands that follow it
   for the spreadsheet variants of GET DATA.  Stores the file name in
   *FILENAME and the parsed settings in *OPTS.  Returns true on success,
   false on a syntax error. */
static bool parse_spreadsheet (struct lexer *lexer, char **filename,
                               struct spreadsheet_read_options *opts);

/* Frees the strings owned by a struct spreadsheet_read_options (but not the
   structure itself). */
static void destroy_spreadsheet_read_info (struct spreadsheet_read_options *);

/* One parser per /TYPE= value accepted by cmd_get_data().  Each is called
   after the type keyword has been consumed and returns a CMD_* result code.
   The spreadsheet parser additionally takes the PROBE function used to open
   the file for the particular spreadsheet format. */
static int parse_get_txt (struct lexer *, struct dataset *);
static int parse_get_psql (struct lexer *, struct dataset *);
static int parse_get_spreadsheet (struct lexer *, struct dataset *,
                                  struct spreadsheet *(*probe)(
                                    const char *filename, bool report_errors));
  59
  60 int
  61 cmd_get_data (struct lexer *lexer, struct dataset *ds)
  62 {
  63   if (!lex_force_match_phrase (lexer, "/TYPE="))
  64     return CMD_FAILURE;
  65
  66   if (lex_match_id (lexer, "TXT"))
  67     return parse_get_txt (lexer, ds);
  68   else if (lex_match_id (lexer, "PSQL"))
  69     return parse_get_psql (lexer, ds);
  70   else if (lex_match_id (lexer, "GNM"))
  71     return parse_get_spreadsheet (lexer, ds, gnumeric_probe);
  72   else if (lex_match_id (lexer, "ODS"))
  73     return parse_get_spreadsheet (lexer, ds, ods_probe);
  74   else
  75     {
  76       lex_error_expecting (lexer, "TXT", "PSQL", "GNM", "ODS");
  77       return CMD_FAILURE;
  78     }
  79 }
  80
  81 static int
  82 parse_get_spreadsheet (struct lexer *lexer, struct dataset *ds,
  83                        struct spreadsheet *(*probe)(
  84                          const char *filename, bool report_errors))
  85 {
  86   struct spreadsheet_read_options opts;
  87   char *filename;
  88   if (!parse_spreadsheet (lexer, &filename, &opts))
  89     return CMD_FAILURE;
  90
  91   bool ok = false;
  92   struct spreadsheet *spreadsheet = probe (filename, true);
  93   if (!spreadsheet)
  94     {
  95       msg (SE, _("error reading file `%s'"), filename);
  96       goto done;
  97     }
  98
  99   struct casereader *reader = spreadsheet_make_reader (spreadsheet, &opts);
 100   if (reader)
 101     {
 102       dataset_set_dict (ds, dict_clone (spreadsheet->dict));
 103       dataset_set_source (ds, reader);
 104       ok = true;
 105     }
 106   spreadsheet_unref (spreadsheet);
 107
 108 done:
 109   free (filename);
 110   destroy_spreadsheet_read_info (&opts);
 111   return ok ? CMD_SUCCESS : CMD_FAILURE;
 112 }
 113
 114 static int
 115 parse_get_psql (struct lexer *lexer, struct dataset *ds)
 116 {
 117   if (!lex_force_match_phrase (lexer, "/CONNECT=") || !lex_force_string (lexer))
 118     return CMD_FAILURE;
 119
 120   struct psql_read_info psql = {
 121     .str_width = -1,
 122     .bsize = -1,
 123     .conninfo = ss_xstrdup (lex_tokss (lexer)),
 124   };
 125   bool ok = false;
 126
 127   lex_get (lexer);
 128
 129   while (lex_match (lexer, T_SLASH))
 130     {
 131       if (lex_match_id (lexer, "ASSUMEDSTRWIDTH"))
 132         {
 133           lex_match (lexer, T_EQUALS);
 134           if (!lex_force_int_range (lexer, "ASSUMEDSTRWIDTH", 1, 32767))
 135             goto done;
 136           psql.str_width = lex_integer (lexer);
 137           lex_get (lexer);
 138         }
 139       else if (lex_match_id (lexer, "BSIZE"))
 140         {
 141           lex_match (lexer, T_EQUALS);
 142           if (!lex_force_int_range (lexer, "BSIZE", 1, INT_MAX))
 143             goto done;
 144           psql.bsize = lex_integer (lexer);
 145           lex_get (lexer);
 146         }
 147       else if (lex_match_id (lexer, "UNENCRYPTED"))
 148         psql.allow_clear = true;
 149       else if (lex_match_id (lexer, "SQL"))
 150         {
 151           lex_match (lexer, T_EQUALS);
 152           if (!lex_force_string (lexer))
 153             goto done;
 154
 155           free (psql.sql);
 156           psql.sql = ss_xstrdup (lex_tokss (lexer));
 157           lex_get (lexer);
 158         }
 159      }
 160
 161   struct dictionary *dict = NULL;
 162   struct casereader *reader = psql_open_reader (&psql, &dict);
 163   if (reader)
 164     {
 165       dataset_set_dict (ds, dict);
 166       dataset_set_source (ds, reader);
 167     }
 168
 169  done:
 170   free (psql.conninfo);
 171   free (psql.sql);
 172
 173   return ok ? CMD_SUCCESS : CMD_FAILURE;
 174 }
 175
 176 static bool
 177 parse_spreadsheet (struct lexer *lexer, char **filename,
 178                    struct spreadsheet_read_options *opts)
 179 {
 180   *opts = (struct spreadsheet_read_options) {
 181     .sheet_index = 1,
 182     .read_names = true,
 183     .asw = -1,
 184   };
 185   *filename = NULL;
 186
 187   if (!lex_force_match_phrase (lexer, "/FILE=") || !lex_force_string (lexer))
 188     goto error;
 189
 190   *filename = utf8_to_filename (lex_tokcstr (lexer));
 191   lex_get (lexer);
 192
 193   while (lex_match (lexer, T_SLASH))
 194     {
 195       if (lex_match_id (lexer, "ASSUMEDSTRWIDTH"))
 196         {
 197           lex_match (lexer, T_EQUALS);
 198           if (!lex_force_int_range (lexer, "ASSUMEDSTRWIDTH", 1, 32767))
 199             goto error;
 200           opts->asw = lex_integer (lexer);
 201           lex_get (lexer);
 202         }
 203       else if (lex_match_id (lexer, "SHEET"))
 204         {
 205           lex_match (lexer, T_EQUALS);
 206           if (lex_match_id (lexer, "NAME"))
 207             {
 208               if (!lex_force_string (lexer))
 209                 goto error;
 210
 211               opts->sheet_name = ss_xstrdup (lex_tokss (lexer));
 212               opts->sheet_index = -1;
 213
 214               lex_get (lexer);
 215             }
 216           else if (lex_match_id (lexer, "INDEX"))
 217             {
 218               if (!lex_force_int_range (lexer, "INDEX", 1, INT_MAX))
 219                 goto error;
 220               opts->sheet_index = lex_integer (lexer);
 221               lex_get (lexer);
 222             }
 223           else
 224             {
 225               lex_error_expecting (lexer, "NAME", "INDEX");
 226               goto error;
 227             }
 228         }
 229       else if (lex_match_id (lexer, "CELLRANGE"))
 230         {
 231           lex_match (lexer, T_EQUALS);
 232
 233           if (lex_match_id (lexer, "FULL"))
 234             opts->cell_range = NULL;
 235           else if (lex_match_id (lexer, "RANGE"))
 236             {
 237               if (!lex_force_string (lexer))
 238                 goto error;
 239
 240               opts->cell_range = ss_xstrdup (lex_tokss (lexer));
 241               lex_get (lexer);
 242             }
 243           else
 244             {
 245               lex_error_expecting (lexer, "FULL", "RANGE");
 246               goto error;
 247             }
 248         }
 249       else if (lex_match_id (lexer, "READNAMES"))
 250         {
 251           lex_match (lexer, T_EQUALS);
 252
 253           if (lex_match_id (lexer, "ON"))
 254             opts->read_names = true;
 255           else if (lex_match_id (lexer, "OFF"))
 256             opts->read_names = false;
 257           else
 258             {
 259               lex_error_expecting (lexer, "ON", "OFF");
 260               goto error;
 261             }
 262         }
 263       else
 264         {
 265           lex_error_expecting (lexer, "ASSUMEDSTRWIDTH", "SHEET", "CELLRANGE",
 266                                "READNAMES");
 267           goto error;
 268         }
 269     }
 270
 271   return true;
 272
 273  error:
 274   destroy_spreadsheet_read_info (opts);
 275   free (*filename);
 276   return false;
 277 }
 278
 279
 280 static bool
 281 set_type (struct lexer *lexer, struct data_parser *parser,
 282           enum data_parser_type type,
 283           int type_start, int type_end, int *type_startp, int *type_endp)
 284 {
 285   if (!*type_startp)
 286     {
 287       data_parser_set_type (parser, type);
 288       *type_startp = type_start;
 289       *type_endp = type_end;
 290     }
 291   else if (type != data_parser_get_type (parser))
 292     {
 293       msg (SE, _("FIXED and DELIMITED arrangements are mutually exclusive."));
 294       lex_ofs_msg (lexer, SN, type_start, type_end,
 295                    _("This syntax requires %s arrangement."),
 296                    type == DP_FIXED ? "FIXED" : "DELIMITED");
 297       lex_ofs_msg (lexer, SN, *type_startp, *type_endp,
 298                    _("This syntax requires %s arrangement."),
 299                    type == DP_FIXED ? "DELIMITED" : "FIXED");
 300       return false;
 301     }
 302   return true;
 303 }
 304
/* Parses and executes GET DATA /TYPE=TXT (everything after the type
   keyword).

   Syntax: /FILE=handle, then any number of '/'-introduced subcommands
   (ENCODING, ARRANGEMENT, FIRSTCASE, DELCASE, FIXCASE, IMPORTCASES,
   DELIMITERS, QUALIFIERS), then /VARIABLES followed by variable
   specifications that run to the end of the command.

   The subcommands configure a data parser; the variable specifications
   populate a new dictionary and add one field per variable to the parser.

   On success, opens a reader on the file handle, passes the parser, DS, the
   reader and the dictionary to data_parser_make_active_file(), and returns
   CMD_SUCCESS.  (The success path does not destroy the parser or the
   dictionary, so presumably that function takes them over -- confirm
   against data-parser.h.)  On any error, destroys the parser and the
   dictionary, releases the file handle and the local strings, and returns
   CMD_CASCADING_FAILURE. */
static int
parse_get_txt (struct lexer *lexer, struct dataset *ds)
{
  struct dictionary *dict = dict_create (get_default_encoding ());
  struct data_parser *parser = data_parser_create ();
  struct file_handle *fh = NULL;
  char *encoding = NULL;        /* Argument of /ENCODING, if given. */
  char *name = NULL;            /* Name of the variable being parsed. */

  if (!lex_force_match_phrase (lexer, "/FILE="))
    goto error;
  fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
  if (fh == NULL)
    goto error;

  /* Token span of the first subcommand that fixed the arrangement, for use
     by set_type().  Zero means that no subcommand has done so yet. */
  int type_start = 0, type_end = 0;

  /* Parser settings in effect unless subcommands override them. */
  data_parser_set_type (parser, DP_DELIMITED);
  data_parser_set_span (parser, false);
  data_parser_set_quotes (parser, ss_empty ());
  data_parser_set_quote_escape (parser, true);
  data_parser_set_empty_line_has_field (parser, true);

  /* Parse subcommands until /VARIABLES, which is mandatory: the only way
     out of this loop other than an error is the "break" for VARIABLES. */
  for (;;)
    {
      if (!lex_force_match (lexer, T_SLASH))
        goto error;

      if (lex_match_id (lexer, "ENCODING"))
        {
          lex_match (lexer, T_EQUALS);
          if (!lex_force_string (lexer))
            goto error;

          /* A later ENCODING replaces an earlier one. */
          free (encoding);
          encoding = ss_xstrdup (lex_tokss (lexer));

          lex_get (lexer);
        }
      else if (lex_match_id (lexer, "ARRANGEMENT"))
        {
          bool ok;

          lex_match (lexer, T_EQUALS);

          /* The span passed to set_type() runs from three tokens back to
             the keyword just matched, which covers "ARRANGEMENT = keyword"
             when the optional '=' was given. */
          if (lex_match_id (lexer, "FIXED"))
            ok = set_type (lexer, parser, DP_FIXED,
                           lex_ofs (lexer) - 3, lex_ofs (lexer) - 1,
                           &type_start, &type_end);
          else if (lex_match_id (lexer, "DELIMITED"))
            ok = set_type (lexer, parser, DP_DELIMITED,
                           lex_ofs (lexer) - 3, lex_ofs (lexer) - 1,
                           &type_start, &type_end);
          else
            {
              lex_error_expecting (lexer, "FIXED", "DELIMITED");
              goto error;
            }
          if (!ok)
            goto error;
        }
      else if (lex_match_id (lexer, "FIRSTCASE"))
        {
          lex_match (lexer, T_EQUALS);
          if (!lex_force_int_range (lexer, "FIRSTCASE", 1, INT_MAX))
            goto error;

          /* FIRSTCASE is 1-based, so FIRSTCASE=n skips n - 1. */
          data_parser_set_skip (parser, lex_integer (lexer) - 1);
          lex_get (lexer);
        }
      else if (lex_match_id_n (lexer, "DELCASE", 4))
        {
          /* DELCASE implies DELIMITED arrangement. */
          if (!set_type (lexer, parser, DP_DELIMITED,
                         lex_ofs (lexer) - 1, lex_ofs (lexer) - 1,
                         &type_start, &type_end))
            goto error;
          lex_match (lexer, T_EQUALS);
          if (lex_match_id (lexer, "LINE"))
            data_parser_set_span (parser, false);
          else if (lex_match_id (lexer, "VARIABLES"))
            {
              data_parser_set_span (parser, true);

              /* VARIABLES takes an integer argument, but for no
                 good reason.  We just ignore it. */
              if (!lex_force_int (lexer))
                goto error;
              lex_get (lexer);
            }
          else
            {
              lex_error_expecting (lexer, "LINE", "VARIABLES");
              goto error;
            }
        }
      else if (lex_match_id (lexer, "FIXCASE"))
        {
          /* FIXCASE implies FIXED arrangement; its argument is passed to
             data_parser_set_records(). */
          if (!set_type (lexer, parser, DP_FIXED,
                         lex_ofs (lexer) - 1, lex_ofs (lexer) - 1,
                         &type_start, &type_end))
            goto error;
          lex_match (lexer, T_EQUALS);
          if (!lex_force_int_range (lexer, "FIXCASE", 1, INT_MAX))
            goto error;
          data_parser_set_records (parser, lex_integer (lexer));
          lex_get (lexer);
        }
      else if (lex_match_id (lexer, "IMPORTCASES"))
        {
          /* IMPORTCASES is parsed only so that it can be skipped: its
             arguments are discarded and a warning covering the whole
             subcommand is issued. */
          int start_ofs = lex_ofs (lexer) - 1;
          lex_match (lexer, T_EQUALS);
          if (lex_match (lexer, T_ALL))
            {
              /* Nothing to do. */
            }
          else if (lex_match_id (lexer, "FIRST"))
            {
              if (!lex_force_int (lexer))
                goto error;
              lex_get (lexer);
            }
          else if (lex_match_id (lexer, "PERCENT"))
            {
              if (!lex_force_int (lexer))
                goto error;
              lex_get (lexer);
            }
          lex_ofs_msg (lexer, SW, start_ofs, lex_ofs (lexer) - 1,
                       _("Ignoring obsolete IMPORTCASES subcommand.  (N OF "
                         "CASES or SAMPLE may be used to substitute.)"));
        }
      else if (lex_match_id_n (lexer, "DELIMITERS", 4))
        {
          /* DELIMITERS implies DELIMITED arrangement. */
          if (!set_type (lexer, parser, DP_DELIMITED,
                         lex_ofs (lexer) - 1, lex_ofs (lexer) - 1,
                         &type_start, &type_end))
            goto error;
          lex_match (lexer, T_EQUALS);

          if (!lex_force_string (lexer))
            goto error;

          /* Split the string into delimiters.  A backslash followed by "t"
             at the start stands for a tab, and two backslashes at the
             (then current) start stand for one backslash; both checks rely
             on ss_match_string() consuming the prefix it matches.  Of the
             remaining bytes, a space selects " " as the soft delimiter and
             every other byte is added to the hard delimiters. */
          /* XXX should support multibyte UTF-8 characters */
          struct substring s = lex_tokss (lexer);
          struct string hard_seps = DS_EMPTY_INITIALIZER;
          const char *soft_seps = "";
          if (ss_match_string (&s, ss_cstr ("\\t")))
            ds_put_cstr (&hard_seps, "\t");
          if (ss_match_string (&s, ss_cstr ("\\\\")))
            ds_put_cstr (&hard_seps, "\\");
          int c;
          while ((c = ss_get_byte (&s)) != EOF)
            if (c == ' ')
              soft_seps = " ";
            else
              ds_put_byte (&hard_seps, c);
          data_parser_set_soft_delimiters (parser, ss_cstr (soft_seps));
          data_parser_set_hard_delimiters (parser, ds_ss (&hard_seps));
          ds_destroy (&hard_seps);

          lex_get (lexer);
        }
      else if (lex_match_id (lexer, "QUALIFIERS"))
        {
          /* QUALIFIERS implies DELIMITED arrangement; its string becomes
             the parser's set of quote characters. */
          if (!set_type (lexer, parser, DP_DELIMITED,
                         lex_ofs (lexer) - 1, lex_ofs (lexer) - 1,
                         &type_start, &type_end))
            goto error;
          lex_match (lexer, T_EQUALS);

          if (!lex_force_string (lexer))
            goto error;

          /* XXX should support multibyte UTF-8 characters */
          if (settings_get_syntax () == COMPATIBLE
              && ss_length (lex_tokss (lexer)) != 1)
            {
              lex_error (lexer, _("In compatible syntax mode, the QUALIFIER "
                                  "string must contain exactly one character."));
              goto error;
            }

          data_parser_set_quotes (parser, lex_tokss (lexer));
          lex_get (lexer);
        }
      else if (lex_match_id (lexer, "VARIABLES"))
        break;
      else
        {
          lex_error_expecting (lexer, "VARIABLES");
          goto error;
        }
    }
  lex_match (lexer, T_EQUALS);

  /* Parse variable specifications up to the end of the command.  The
     arrangement is final at this point, so it is read once into TYPE.
     RECORD is the record number assigned to FIXED fields; it starts at 1
     and changes only through "/n" markers. */
  int record = 1;
  enum data_parser_type type = data_parser_get_type (parser);
  do
    {
      /* FIXED only: "/n" before a variable selects record N, which may not
         be less than the current record nor more than the parser's record
         count. */
      while (type == DP_FIXED && lex_match (lexer, T_SLASH))
        {
          if (!lex_force_int_range (lexer, NULL, record,
                                    data_parser_get_records (parser)))
            goto error;
          record = lex_integer (lexer);
          lex_get (lexer);
        }

      /* Variable name.  NAME stays allocated until the field has been
         added, so the error path must (and does) free it. */
      int name_ofs = lex_ofs (lexer);
      if (!lex_force_id (lexer))
        goto error;
      name = xstrdup (lex_tokcstr (lexer));
      char *error = dict_id_is_valid__ (dict, name);
      if (error)
        {
          lex_error (lexer, "%s", error);
          free (error);
          goto error;
        }
      lex_get (lexer);

      /* INPUT is the format used to read the field, OUTPUT the format given
         to the variable.  FC and LC are set only for FIXED arrangement, by
         parse_column_range(). */
      struct fmt_spec input, output;
      int fc, lc;
      if (type == DP_DELIMITED)
        {
          /* DELIMITED: a full input format follows the name; the output
             format is derived from it. */
          if (!parse_format_specifier (lexer, &input))
            goto error;
          error = fmt_check_input__ (input);
          if (error)
            {
              lex_next_error (lexer, -1, -1, "%s", error);
              free (error);
              goto error;
            }
          output = fmt_for_output_from_input (input,
                                              settings_get_fmt_settings ());
        }
      else
        {
          /* FIXED: a column range and then a format follow the name.
             START_OFS...END_OFS spans both, for error messages. */
          int start_ofs = lex_ofs (lexer);
          if (!parse_column_range (lexer, 0, &fc, &lc, NULL))
            goto error;

          /* Accept a format (e.g. F8.2) or just a type name (e.g. DOLLAR).  */
          char fmt_type_name[FMT_TYPE_LEN_MAX + 1];
          uint16_t w;
          uint8_t d;
          if (!parse_abstract_format_specifier (lexer, fmt_type_name, &w, &d))
            goto error;

          enum fmt_type fmt_type;
          if (!fmt_from_name (fmt_type_name, &fmt_type))
            {
              lex_next_error (lexer, -1, -1,
                              _("Unknown format type `%s'."), fmt_type_name);
              goto error;
            }
          int end_ofs = lex_ofs (lexer) - 1;

          /* Compose input format. */
          /* The input width comes from the column range, not from the
             width written in the format. */
          input = (struct fmt_spec) { .type = fmt_type, .w = lc - fc + 1 };
          error = fmt_check_input__ (input);
          if (error)
            {
              lex_ofs_error (lexer, start_ofs, end_ofs, "%s", error);
              free (error);
              goto error;
            }

          /* Compose output format. */
          /* A nonzero W means that the format carried an explicit width,
             which (with D) is then used for output; otherwise the output
             format is derived from the input format. */
          if (w != 0)
            {
              output = (struct fmt_spec) { .type = fmt_type, .w = w, .d = d };
              error = fmt_check_output__ (output);
              if (error)
                {
                  lex_ofs_error (lexer, start_ofs, end_ofs, "%s", error);
                  free (error);
                  goto error;
                }
            }
          else
            output = fmt_for_output_from_input (input,
                                                settings_get_fmt_settings ());
        }

      /* Create the variable; a null return is reported as a duplicate
         name. */
      struct variable *v = dict_create_var (dict, name, fmt_var_width (input));
      if (!v)
        {
          lex_ofs_error (lexer, name_ofs, name_ofs,
                         _("%s is a duplicate variable name."), name);
          goto error;
        }
      var_set_both_formats (v, output);
      if (type == DP_DELIMITED)
        data_parser_add_delimited_field (parser, input,
                                         var_get_dict_index (v),
                                         name);
      else
        data_parser_add_fixed_field (parser, input, var_get_dict_index (v),
                                     name, record, fc);

      /* Reset NAME so that the error path cannot free it a second time. */
      free (name);
      name = NULL;
    }
  while (lex_token (lexer) != T_ENDCMD);

  struct dfm_reader *reader = dfm_open_reader (fh, lexer, encoding);
  if (!reader)
    goto error;

  /* Success: PARSER and DICT are handed over here and are not destroyed by
     this function; only the file handle reference and ENCODING are
     released.  NAME is already NULL at this point. */
  data_parser_make_active_file (parser, ds, reader, dict, NULL, NULL);
  fh_unref (fh);
  free (encoding);
  return CMD_SUCCESS;

 error:
  /* FH, NAME and ENCODING may still be NULL here, depending on how far
     parsing got. */
  data_parser_destroy (parser);
  dict_unref (dict);
  fh_unref (fh);
  free (name);
  free (encoding);
  return CMD_CASCADING_FAILURE;
}
 624
 625 static void
 626 destroy_spreadsheet_read_info (struct spreadsheet_read_options *opts)
 627 {
 628   free (opts->cell_range);
 629   free (opts->sheet_name);
 630 }