expressions: Simplify type declarations for nodes.
[pspp.git] / src / language / expressions / parse.c
blob96b892b2efc75d811b717b227b8d45383fc0ed3d
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "private.h"
21 #include <ctype.h>
22 #include <float.h>
23 #include <limits.h>
24 #include <stdlib.h>
26 #include "data/case.h"
27 #include "data/dictionary.h"
28 #include "data/settings.h"
29 #include "data/variable.h"
30 #include "language/expressions/helpers.h"
31 #include "language/lexer/format-parser.h"
32 #include "language/lexer/lexer.h"
33 #include "language/lexer/variable-parser.h"
34 #include "libpspp/array.h"
35 #include "libpspp/assertion.h"
36 #include "libpspp/i18n.h"
37 #include "libpspp/message.h"
38 #include "libpspp/misc.h"
39 #include "libpspp/pool.h"
40 #include "libpspp/str.h"
42 #include "gl/c-strcase.h"
43 #include "gl/minmax.h"
44 #include "gl/xalloc.h"
46 /* Declarations. */
48 /* Recursive descent parser in order of increasing precedence. */
49 typedef struct expr_node *parse_recursively_func (struct lexer *, struct expression *);
50 static parse_recursively_func parse_or, parse_and, parse_not;
51 static parse_recursively_func parse_rel, parse_add, parse_mul;
52 static parse_recursively_func parse_neg, parse_exp;
53 static parse_recursively_func parse_primary;
54 static parse_recursively_func parse_vector_element, parse_function;
56 /* Utility functions. */
57 static struct expression *expr_create (struct dataset *ds);
58 atom_type expr_node_returns (const struct expr_node *);
60 static const char *atom_type_name (atom_type);
61 static struct expression *finish_expression (struct expr_node *,
62 struct expression *);
63 static bool type_check (const struct expr_node *, enum val_type expected_type);
64 static struct expr_node *allocate_unary_variable (struct expression *,
65 const struct variable *);
67 /* Public functions. */
69 /* Parses an expression of the given TYPE. If DS is nonnull then variables and
70 vectors within it may be referenced within the expression; otherwise, the
71 expression must not reference any variables or vectors. Returns the new
72 expression if successful or a null pointer otherwise. */
73 struct expression *
74 expr_parse (struct lexer *lexer, struct dataset *ds, enum val_type type)
76 assert (val_type_is_valid (type));
78 struct expression *e = expr_create (ds);
79 struct expr_node *n = parse_or (lexer, e);
80 if (!n || !type_check (n, type))
82 expr_free (e);
83 return NULL;
86 return finish_expression (expr_optimize (n, e), e);
89 /* Parses a boolean expression, otherwise similar to expr_parse(). */
90 struct expression *
91 expr_parse_bool (struct lexer *lexer, struct dataset *ds)
93 struct expression *e = expr_create (ds);
94 struct expr_node *n = parse_or (lexer, e);
95 if (!n)
97 expr_free (e);
98 return NULL;
101 atom_type actual_type = expr_node_returns (n);
102 if (actual_type == OP_number)
103 n = expr_allocate_unary (e, OP_NUM_TO_BOOLEAN, n);
104 else if (actual_type != OP_boolean)
106 msg (SE, _("Type mismatch: expression has %s type, "
107 "but a boolean value is required here."),
108 atom_type_name (actual_type));
109 expr_free (e);
110 return NULL;
113 return finish_expression (expr_optimize (n, e), e);
116 /* Parses a numeric expression that is intended to be assigned to newly created
117 variable NEW_VAR_NAME. (This allows for a better error message if the
118 expression is not numeric.) Otherwise similar to expr_parse(). */
119 struct expression *
120 expr_parse_new_variable (struct lexer *lexer, struct dataset *ds,
121 const char *new_var_name)
123 struct expression *e = expr_create (ds);
124 struct expr_node *n = parse_or (lexer, e);
125 if (!n)
127 expr_free (e);
128 return NULL;
131 atom_type actual_type = expr_node_returns (n);
132 if (actual_type != OP_number && actual_type != OP_boolean)
134 msg (SE, _("This command tries to create a new variable %s by assigning a "
135 "string value to it, but this is not supported. Use "
136 "the STRING command to create the new variable with the "
137 "correct width before assigning to it, e.g. STRING %s(A20)."),
138 new_var_name, new_var_name);
139 expr_free (e);
140 return NULL;
143 return finish_expression (expr_optimize (n, e), e);
146 /* Free expression E. */
147 void
148 expr_free (struct expression *e)
150 if (e != NULL)
151 pool_destroy (e->expr_pool);
154 struct expression *
155 expr_parse_any (struct lexer *lexer, struct dataset *ds, bool optimize)
157 struct expr_node *n;
158 struct expression *e;
160 e = expr_create (ds);
161 n = parse_or (lexer, e);
162 if (n == NULL)
164 expr_free (e);
165 return NULL;
168 if (optimize)
169 n = expr_optimize (n, e);
170 return finish_expression (n, e);
173 /* Finishing up expression building. */
175 /* Height of an expression's stacks. */
176 struct stack_heights
178 int number_height; /* Height of number stack. */
179 int string_height; /* Height of string stack. */
182 /* Stack heights used by different kinds of arguments. */
183 static const struct stack_heights on_number_stack = {1, 0};
184 static const struct stack_heights on_string_stack = {0, 1};
185 static const struct stack_heights not_on_stack = {0, 0};
187 /* Returns the stack heights used by an atom of the given
188 TYPE. */
189 static const struct stack_heights *
190 atom_type_stack (atom_type type)
192 assert (is_atom (type));
194 switch (type)
196 case OP_number:
197 case OP_boolean:
198 return &on_number_stack;
200 case OP_string:
201 return &on_string_stack;
203 case OP_format:
204 case OP_ni_format:
205 case OP_no_format:
206 case OP_num_var:
207 case OP_str_var:
208 case OP_integer:
209 case OP_pos_int:
210 case OP_vector:
211 return &not_on_stack;
213 default:
214 NOT_REACHED ();
218 /* Measures the stack height needed for node N, supposing that
219 the stack height is initially *HEIGHT and updating *HEIGHT to
220 the final stack height. Updates *MAX, if necessary, to
221 reflect the maximum intermediate or final height. */
222 static void
223 measure_stack (const struct expr_node *n,
224 struct stack_heights *height, struct stack_heights *max)
226 const struct stack_heights *return_height;
228 if (is_composite (n->type))
230 struct stack_heights args;
231 int i;
233 args = *height;
234 for (i = 0; i < n->n_args; i++)
235 measure_stack (n->args[i], &args, max);
237 return_height = atom_type_stack (operations[n->type].returns);
239 else
240 return_height = atom_type_stack (n->type);
242 height->number_height += return_height->number_height;
243 height->string_height += return_height->string_height;
245 if (height->number_height > max->number_height)
246 max->number_height = height->number_height;
247 if (height->string_height > max->string_height)
248 max->string_height = height->string_height;
251 /* Allocates stacks within E sufficient for evaluating node N. */
252 static void
253 allocate_stacks (struct expr_node *n, struct expression *e)
255 struct stack_heights initial = {0, 0};
256 struct stack_heights max = {0, 0};
258 measure_stack (n, &initial, &max);
259 e->number_stack = pool_alloc (e->expr_pool,
260 sizeof *e->number_stack * max.number_height);
261 e->string_stack = pool_alloc (e->expr_pool,
262 sizeof *e->string_stack * max.string_height);
265 /* Finalizes expression E for evaluating node N. */
266 static struct expression *
267 finish_expression (struct expr_node *n, struct expression *e)
269 /* Allocate stacks. */
270 allocate_stacks (n, e);
272 /* Output postfix representation. */
273 expr_flatten (n, e);
275 /* The eval_pool might have been used for allocating strings
276 during optimization. We need to keep those strings around
277 for all subsequent evaluations, so start a new eval_pool. */
278 e->eval_pool = pool_create_subpool (e->expr_pool);
280 return e;
283 /* Verifies that expression E, whose root node is *N, can be
284 converted to type EXPECTED_TYPE, inserting a conversion at *N
285 if necessary. Returns true if successful, false on failure. */
286 static bool
287 type_check (const struct expr_node *n, enum val_type expected_type)
289 atom_type actual_type = expr_node_returns (n);
291 switch (expected_type)
293 case VAL_NUMERIC:
294 if (actual_type != OP_number && actual_type != OP_boolean)
296 msg (SE, _("Type mismatch: expression has %s type, "
297 "but a numeric value is required here."),
298 atom_type_name (actual_type));
299 return false;
301 break;
303 case VAL_STRING:
304 if (actual_type != OP_string)
306 msg (SE, _("Type mismatch: expression has %s type, "
307 "but a string value is required here."),
308 atom_type_name (actual_type));
309 return false;
311 break;
313 default:
314 NOT_REACHED ();
317 return true;
320 /* Recursive-descent expression parser. */
322 static void
323 free_msg_location (void *loc_)
325 struct msg_location *loc = loc_;
326 msg_location_destroy (loc);
329 static void
330 expr_location__ (struct expression *e,
331 const struct expr_node *node,
332 const struct msg_location **minp,
333 const struct msg_location **maxp)
335 struct msg_location *loc = node->location;
336 if (loc)
338 const struct msg_location *min = *minp;
339 if (loc->start.line
340 && (!min
341 || loc->start.line < min->start.line
342 || (loc->start.line == min->start.line
343 && loc->start.column < min->start.column)))
344 *minp = loc;
346 const struct msg_location *max = *maxp;
347 if (loc->end.line
348 && (!max
349 || loc->end.line > max->end.line
350 || (loc->end.line == max->end.line
351 && loc->end.column > max->end.column)))
352 *maxp = loc;
354 return;
357 if (is_composite (node->type))
358 for (size_t i = 0; i < node->n_args; i++)
359 expr_location__ (e, node->args[i], minp, maxp);
362 /* Returns the source code location corresponding to expression NODE, computing
363 it lazily if needed. */
364 static const struct msg_location *
365 expr_location (struct expression *e, const struct expr_node *node_)
367 struct expr_node *node = CONST_CAST (struct expr_node *, node_);
368 if (!node)
369 return NULL;
371 if (!node->location)
373 const struct msg_location *min = NULL;
374 const struct msg_location *max = NULL;
375 expr_location__ (e, node, &min, &max);
376 if (min && max)
378 node->location = msg_location_dup (min);
379 node->location->end = max->end;
380 pool_register (e->expr_pool, free_msg_location, node->location);
383 return node->location;
386 /* Sets e->location to the tokens in S's lexer from offset START_OFS to the
387 token before the current one. Has no effect if E already has a location or
388 if E is null. */
389 static void
390 expr_add_location (struct lexer *lexer, struct expression *e,
391 int start_ofs, struct expr_node *node)
393 if (node && !node->location)
395 node->location = lex_ofs_location (lexer, start_ofs, lex_ofs (lexer) - 1);
396 pool_register (e->expr_pool, free_msg_location, node->location);
400 static bool
401 type_coercion__ (struct expression *e, struct expr_node *node, size_t arg_idx,
402 bool do_coercion)
404 assert (!!do_coercion == (e != NULL));
406 if (!node)
407 return false;
409 struct expr_node **argp = &node->args[arg_idx];
410 struct expr_node *arg = *argp;
411 if (!arg)
412 return false;
414 const struct operation *op = &operations[node->type];
415 atom_type required_type = op->args[MIN (arg_idx, op->n_args - 1)];
416 atom_type actual_type = expr_node_returns (arg);
417 if (actual_type == required_type)
419 /* Type match. */
420 return true;
423 switch (required_type)
425 case OP_number:
426 if (actual_type == OP_boolean)
428 /* To enforce strict typing rules, insert Boolean to
429 numeric "conversion". This conversion is a no-op,
430 so it will be removed later. */
431 if (do_coercion)
432 *argp = expr_allocate_unary (e, OP_BOOLEAN_TO_NUM, arg);
433 return true;
435 break;
437 case OP_string:
438 /* No coercion to string. */
439 break;
441 case OP_boolean:
442 if (actual_type == OP_number)
444 /* Convert numeric to boolean. */
445 if (do_coercion)
446 *argp = expr_allocate_unary (e, OP_NUM_TO_BOOLEAN, arg);
447 return true;
449 break;
451 case OP_format:
452 NOT_REACHED ();
454 case OP_ni_format:
455 msg_disable ();
456 if (arg->type == OP_format
457 && fmt_check_input (&arg->format)
458 && fmt_check_type_compat (&arg->format, VAL_NUMERIC))
460 msg_enable ();
461 if (do_coercion)
462 arg->type = OP_ni_format;
463 return true;
465 msg_enable ();
466 break;
468 case OP_no_format:
469 msg_disable ();
470 if (arg->type == OP_format
471 && fmt_check_output (&arg->format)
472 && fmt_check_type_compat (&arg->format, VAL_NUMERIC))
474 msg_enable ();
475 if (do_coercion)
476 arg->type = OP_no_format;
477 return true;
479 msg_enable ();
480 break;
482 case OP_num_var:
483 if (arg->type == OP_NUM_VAR)
485 if (do_coercion)
486 *argp = arg->args[0];
487 return true;
489 break;
491 case OP_str_var:
492 if (arg->type == OP_STR_VAR)
494 if (do_coercion)
495 *argp = arg->args[0];
496 return true;
498 break;
500 case OP_var:
501 if (arg->type == OP_NUM_VAR || arg->type == OP_STR_VAR)
503 if (do_coercion)
504 *argp = arg->args[0];
505 return true;
507 break;
509 case OP_pos_int:
510 if (arg->type == OP_number
511 && floor (arg->number) == arg->number
512 && arg->number > 0 && arg->number < INT_MAX)
514 if (do_coercion)
515 *argp = expr_allocate_pos_int (e, arg->number);
516 return true;
518 break;
520 default:
521 NOT_REACHED ();
523 return false;
526 static bool
527 type_coercion (struct expression *e, struct expr_node *node, size_t arg_idx)
529 return type_coercion__ (e, node, arg_idx, true);
532 static bool
533 is_coercible (const struct expr_node *node_, size_t arg_idx)
535 struct expr_node *node = CONST_CAST (struct expr_node *, node_);
536 return type_coercion__ (NULL, node, arg_idx, false);
539 /* How to parse an operator. */
540 struct operator
542 enum token_type token; /* Operator token. */
543 operation_type type; /* Operation. */
546 /* Attempts to match the current token against the tokens for the
547 OP_CNT operators in OPS[]. If successful, returns true
548 and, if OPERATOR is non-null, sets *OPERATOR to the operator.
549 On failure, returns false and, if OPERATOR is non-null, sets
550 *OPERATOR to a null pointer. */
551 static const struct operator *
552 match_operator (struct lexer *lexer, const struct operator ops[], size_t n_ops,
553 const struct expr_node *lhs)
555 bool lhs_is_numeric = operations[lhs->type].returns != OP_string;
556 for (const struct operator *op = ops; op < ops + n_ops; op++)
557 if (lex_token (lexer) == op->token)
559 bool op_is_numeric = operations[op->type].args[0] != OP_string;
560 if (op_is_numeric == lhs_is_numeric)
562 if (op->token != T_NEG_NUM)
563 lex_get (lexer);
564 return op;
567 return NULL;
570 static const char *
571 operator_name (const struct operator *op)
573 return op->token == T_NEG_NUM ? "-" : token_type_to_string (op->token);
576 static struct expr_node *
577 parse_binary_operators__ (struct lexer *lexer, struct expression *e,
578 const struct operator ops[], size_t n_ops,
579 parse_recursively_func *parse_next_level,
580 const char *chain_warning, struct expr_node *lhs)
582 for (int op_count = 0; ; op_count++)
584 const struct operator *operator = match_operator (lexer, ops, n_ops, lhs);
585 if (!operator)
587 if (op_count > 1 && chain_warning)
588 msg_at (SW, expr_location (e, lhs), "%s", chain_warning);
590 return lhs;
593 struct expr_node *rhs = parse_next_level (lexer, e);
594 if (!rhs)
595 return NULL;
597 struct expr_node *node = expr_allocate_binary (e, operator->type,
598 lhs, rhs);
599 bool lhs_ok = type_coercion (e, node, 0);
600 bool rhs_ok = type_coercion (e, node, 1);
602 if (!lhs_ok || !rhs_ok)
604 int n_matches = 0;
605 for (size_t i = 0; i < n_ops; i++)
606 if (ops[i].token == operator->token)
607 n_matches++;
609 const char *name = operator_name (operator);
610 if (n_matches > 1)
611 msg_at (SE, expr_location (e, node),
612 _("The operands of %s must have the same type."), name);
613 else if (operations[node->type].args[0] != OP_string)
614 msg_at (SE, expr_location (e, node),
615 _("Both operands of %s must be numeric."), name);
616 else
617 msg_at (SE, expr_location (e, node),
618 _("Both operands of %s must be strings."), name);
620 msg_at (SN, expr_location (e, node->args[0]),
621 _("The left-hand operand of %s has type '%s'."),
622 name, atom_type_name (expr_node_returns (node->args[0])));
623 msg_at (SN, expr_location (e, node->args[1]),
624 _("The right-hand operand of %s has type '%s'."),
625 name, atom_type_name (expr_node_returns (node->args[1])));
627 return NULL;
630 lhs = node;
634 static struct expr_node *
635 parse_binary_operators (struct lexer *lexer, struct expression *e,
636 const struct operator ops[], size_t n_ops,
637 parse_recursively_func *parse_next_level,
638 const char *chain_warning)
640 struct expr_node *lhs = parse_next_level (lexer, e);
641 if (!lhs)
642 return NULL;
644 return parse_binary_operators__ (lexer, e, ops, n_ops, parse_next_level,
645 chain_warning, lhs);
648 static struct expr_node *
649 parse_inverting_unary_operator (struct lexer *lexer, struct expression *e,
650 const struct operator *op,
651 parse_recursively_func *parse_next_level)
653 int start_ofs = lex_ofs (lexer);
654 unsigned int op_count = 0;
655 while (lex_match (lexer, op->token))
656 op_count++;
658 struct expr_node *inner = parse_next_level (lexer, e);
659 if (!inner || !op_count)
660 return inner;
662 struct expr_node *outer = expr_allocate_unary (e, op->type, inner);
663 expr_add_location (lexer, e, start_ofs, outer);
665 if (!type_coercion (e, outer, 0))
667 assert (operations[outer->type].args[0] != OP_string);
669 const char *name = operator_name (op);
670 msg_at (SE, expr_location (e, outer),
671 _("The unary %s operator requires a numeric operand."), name);
673 msg_at (SN, expr_location (e, outer->args[0]),
674 _("The operand of %s has type '%s'."),
675 name, atom_type_name (expr_node_returns (outer->args[0])));
677 return NULL;
680 return op_count % 2 ? outer : outer->args[0];
683 /* Parses the OR level. */
684 static struct expr_node *
685 parse_or (struct lexer *lexer, struct expression *e)
687 static const struct operator op = { T_OR, OP_OR };
688 return parse_binary_operators (lexer, e, &op, 1, parse_and, NULL);
691 /* Parses the AND level. */
692 static struct expr_node *
693 parse_and (struct lexer *lexer, struct expression *e)
695 static const struct operator op = { T_AND, OP_AND };
697 return parse_binary_operators (lexer, e, &op, 1, parse_not, NULL);
700 /* Parses the NOT level. */
701 static struct expr_node *
702 parse_not (struct lexer *lexer, struct expression *e)
704 static const struct operator op = { T_NOT, OP_NOT };
705 return parse_inverting_unary_operator (lexer, e, &op, parse_rel);
708 /* Parse relational operators. */
709 static struct expr_node *
710 parse_rel (struct lexer *lexer, struct expression *e)
712 const char *chain_warning =
713 _("Chaining relational operators (e.g. `a < b < c') will "
714 "not produce the mathematically expected result. "
715 "Use the AND logical operator to fix the problem "
716 "(e.g. `a < b AND b < c'). "
717 "If chaining is really intended, parentheses will disable "
718 "this warning (e.g. `(a < b) < c'.)");
720 static const struct operator ops[] =
722 /* Numeric operators. */
723 { T_EQUALS, OP_EQ },
724 { T_EQ, OP_EQ },
725 { T_GE, OP_GE },
726 { T_GT, OP_GT },
727 { T_LE, OP_LE },
728 { T_LT, OP_LT },
729 { T_NE, OP_NE },
731 /* String operators. */
732 { T_EQUALS, OP_EQ_STRING },
733 { T_EQ, OP_EQ_STRING },
734 { T_GE, OP_GE_STRING },
735 { T_GT, OP_GT_STRING },
736 { T_LE, OP_LE_STRING },
737 { T_LT, OP_LT_STRING },
738 { T_NE, OP_NE_STRING },
741 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
742 parse_add, chain_warning);
745 /* Parses the addition and subtraction level. */
746 static struct expr_node *
747 parse_add (struct lexer *lexer, struct expression *e)
749 static const struct operator ops[] =
751 { T_PLUS, OP_ADD },
752 { T_DASH, OP_SUB },
753 { T_NEG_NUM, OP_ADD },
756 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
757 parse_mul, NULL);
760 /* Parses the multiplication and division level. */
761 static struct expr_node *
762 parse_mul (struct lexer *lexer, struct expression *e)
764 static const struct operator ops[] =
766 { T_ASTERISK, OP_MUL },
767 { T_SLASH, OP_DIV },
770 return parse_binary_operators (lexer, e, ops, sizeof ops / sizeof *ops,
771 parse_neg, NULL);
774 /* Parses the unary minus level. */
775 static struct expr_node *
776 parse_neg (struct lexer *lexer, struct expression *e)
778 static const struct operator op = { T_DASH, OP_NEG };
779 return parse_inverting_unary_operator (lexer, e, &op, parse_exp);
782 static struct expr_node *
783 parse_exp (struct lexer *lexer, struct expression *e)
785 static const struct operator op = { T_EXP, OP_POW };
787 const char *chain_warning =
788 _("The exponentiation operator (`**') is left-associative, "
789 "even though right-associative semantics are more useful. "
790 "That is, `a**b**c' equals `(a**b)**c', not as `a**(b**c)'. "
791 "To disable this warning, insert parentheses.");
793 if (lex_token (lexer) != T_NEG_NUM || lex_next_token (lexer, 1) != T_EXP)
794 return parse_binary_operators (lexer, e, &op, 1,
795 parse_primary, chain_warning);
797 /* Special case for situations like "-5**6", which must be parsed as
798 -(5**6). */
800 int start_ofs = lex_ofs (lexer);
801 struct expr_node *lhs = expr_allocate_number (e, -lex_tokval (lexer));
802 lex_get (lexer);
804 struct expr_node *node = parse_binary_operators__ (
805 lexer, e, &op, 1, parse_primary, chain_warning, lhs);
806 if (!node)
807 return NULL;
809 node = expr_allocate_unary (e, OP_NEG, node);
810 expr_add_location (lexer, e, start_ofs, node);
811 return node;
814 /* Parses system variables. */
815 static struct expr_node *
816 parse_sysvar (struct lexer *lexer, struct expression *e)
818 if (lex_match_id (lexer, "$CASENUM"))
819 return expr_allocate_nullary (e, OP_CASENUM);
820 else if (lex_match_id (lexer, "$DATE"))
822 static const char *months[12] =
824 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
825 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
828 time_t last_proc_time = time_of_last_procedure (e->ds);
829 struct tm *time;
830 char temp_buf[10];
831 struct substring s;
833 time = localtime (&last_proc_time);
834 sprintf (temp_buf, "%02d %s %02d", abs (time->tm_mday) % 100,
835 months[abs (time->tm_mon) % 12], abs (time->tm_year) % 100);
837 ss_alloc_substring (&s, ss_cstr (temp_buf));
838 return expr_allocate_string (e, s);
840 else if (lex_match_id (lexer, "$TRUE"))
841 return expr_allocate_boolean (e, 1.0);
842 else if (lex_match_id (lexer, "$FALSE"))
843 return expr_allocate_boolean (e, 0.0);
844 else if (lex_match_id (lexer, "$SYSMIS"))
845 return expr_allocate_number (e, SYSMIS);
846 else if (lex_match_id (lexer, "$JDATE"))
848 time_t time = time_of_last_procedure (e->ds);
849 struct tm *tm = localtime (&time);
850 return expr_allocate_number (e, expr_ymd_to_ofs (tm->tm_year + 1900,
851 tm->tm_mon + 1,
852 tm->tm_mday));
854 else if (lex_match_id (lexer, "$TIME"))
856 time_t time = time_of_last_procedure (e->ds);
857 struct tm *tm = localtime (&time);
858 return expr_allocate_number (e,
859 expr_ymd_to_date (tm->tm_year + 1900,
860 tm->tm_mon + 1,
861 tm->tm_mday)
862 + tm->tm_hour * 60 * 60.
863 + tm->tm_min * 60.
864 + tm->tm_sec);
866 else if (lex_match_id (lexer, "$LENGTH"))
867 return expr_allocate_number (e, settings_get_viewlength ());
868 else if (lex_match_id (lexer, "$WIDTH"))
869 return expr_allocate_number (e, settings_get_viewwidth ());
870 else
872 msg (SE, _("Unknown system variable %s."), lex_tokcstr (lexer));
873 return NULL;
877 /* Parses numbers, varnames, etc. */
878 static struct expr_node *
879 parse_primary__ (struct lexer *lexer, struct expression *e)
881 switch (lex_token (lexer))
883 case T_ID:
884 if (lex_next_token (lexer, 1) == T_LPAREN)
886 /* An identifier followed by a left parenthesis may be
887 a vector element reference. If not, it's a function
888 call. */
889 if (e->ds != NULL && dict_lookup_vector (dataset_dict (e->ds), lex_tokcstr (lexer)) != NULL)
890 return parse_vector_element (lexer, e);
891 else
892 return parse_function (lexer, e);
894 else if (lex_tokcstr (lexer)[0] == '$')
896 /* $ at the beginning indicates a system variable. */
897 return parse_sysvar (lexer, e);
899 else if (e->ds != NULL && dict_lookup_var (dataset_dict (e->ds), lex_tokcstr (lexer)))
901 /* It looks like a user variable.
902 (It could be a format specifier, but we'll assume
903 it's a variable unless proven otherwise. */
904 return allocate_unary_variable (e, parse_variable (lexer, dataset_dict (e->ds)));
906 else
908 /* Try to parse it as a format specifier. */
909 struct fmt_spec fmt;
910 bool ok;
912 msg_disable ();
913 ok = parse_format_specifier (lexer, &fmt);
914 msg_enable ();
916 if (ok)
917 return expr_allocate_format (e, &fmt);
919 /* All attempts failed. */
920 msg (SE, _("Unknown identifier %s."), lex_tokcstr (lexer));
921 return NULL;
923 break;
925 case T_POS_NUM:
926 case T_NEG_NUM:
928 struct expr_node *node = expr_allocate_number (e, lex_tokval (lexer));
929 lex_get (lexer);
930 return node;
933 case T_STRING:
935 const char *dict_encoding;
936 struct expr_node *node;
937 char *s;
939 dict_encoding = (e->ds != NULL
940 ? dict_get_encoding (dataset_dict (e->ds))
941 : "UTF-8");
942 s = recode_string_pool (dict_encoding, "UTF-8", lex_tokcstr (lexer),
943 ss_length (lex_tokss (lexer)), e->expr_pool);
944 node = expr_allocate_string (e, ss_cstr (s));
946 lex_get (lexer);
947 return node;
950 case T_LPAREN:
952 /* Count number of left parentheses so that we can match them against
953 an equal number of right parentheses. This defeats trivial attempts
954 to exhaust the stack with a lot of left parentheses. (More
955 sophisticated attacks will still succeed.) */
956 size_t n = 0;
957 while (lex_match (lexer, T_LPAREN))
958 n++;
960 struct expr_node *node = parse_or (lexer, e);
961 if (!node)
962 return NULL;
964 for (size_t i = 0; i < n; i++)
965 if (!lex_force_match (lexer, T_RPAREN))
966 return NULL;
968 return node;
971 default:
972 lex_error (lexer, NULL);
973 return NULL;
977 static struct expr_node *
978 parse_primary (struct lexer *lexer, struct expression *e)
980 int start_ofs = lex_ofs (lexer);
981 struct expr_node *node = parse_primary__ (lexer, e);
982 expr_add_location (lexer, e, start_ofs, node);
983 return node;
986 static struct expr_node *
987 parse_vector_element (struct lexer *lexer, struct expression *e)
989 int vector_start_ofs = lex_ofs (lexer);
991 /* Find vector, skip token.
992 The caller must already have verified that the current token
993 is the name of a vector. */
994 const struct vector *vector = dict_lookup_vector (dataset_dict (e->ds),
995 lex_tokcstr (lexer));
996 assert (vector != NULL);
997 lex_get (lexer);
999 /* Skip left parenthesis token.
1000 The caller must have verified that the lookahead is a left
1001 parenthesis. */
1002 assert (lex_token (lexer) == T_LPAREN);
1003 lex_get (lexer);
1005 int element_start_ofs = lex_ofs (lexer);
1006 struct expr_node *element = parse_or (lexer, e);
1007 if (!element)
1008 return NULL;
1009 expr_add_location (lexer, e, element_start_ofs, element);
1011 if (!lex_match (lexer, T_RPAREN))
1012 return NULL;
1014 operation_type type = (vector_get_type (vector) == VAL_NUMERIC
1015 ? OP_VEC_ELEM_NUM : OP_VEC_ELEM_STR);
1016 struct expr_node *node = expr_allocate_binary (
1017 e, type, element, expr_allocate_vector (e, vector));
1018 expr_add_location (lexer, e, vector_start_ofs, node);
1020 if (!type_coercion (e, node, 1))
1022 msg_at (SE, expr_location (e, node),
1023 _("A vector index must be numeric."));
1025 msg_at (SN, expr_location (e, node->args[0]),
1026 _("This vector index has type '%s'."),
1027 atom_type_name (expr_node_returns (node->args[0])));
1029 return NULL;
1032 return node;
1035 /* Individual function parsing. */
1037 const struct operation operations[OP_first + n_OP] = {
1038 #include "parse.inc"
1041 static bool
1042 word_matches (const char **test, const char **name)
1044 size_t test_len = strcspn (*test, ".");
1045 size_t name_len = strcspn (*name, ".");
1046 if (test_len == name_len)
1048 if (buf_compare_case (*test, *name, test_len))
1049 return false;
1051 else if (test_len < 3 || test_len > name_len)
1052 return false;
1053 else
1055 if (buf_compare_case (*test, *name, test_len))
1056 return false;
1059 *test += test_len;
1060 *name += name_len;
1061 if (**test != **name)
1062 return false;
1064 if (**test == '.')
1066 (*test)++;
1067 (*name)++;
1069 return true;
1072 /* Returns 0 if TOKEN and FUNC do not match,
1073 1 if TOKEN is an acceptable abbreviation for FUNC,
1074 2 if TOKEN equals FUNC. */
1075 static int
1076 compare_function_names (const char *token_, const char *func_)
1078 const char *token = token_;
1079 const char *func = func_;
1080 while (*token || *func)
1081 if (!word_matches (&token, &func))
1082 return 0;
1083 return !c_strcasecmp (token_, func_) ? 2 : 1;
1086 static bool
1087 lookup_function (const char *token,
1088 const struct operation **first,
1089 const struct operation **last)
1091 *first = *last = NULL;
1092 const struct operation *best = NULL;
1094 for (const struct operation *f = operations + OP_function_first;
1095 f <= operations + OP_function_last; f++)
1097 int score = compare_function_names (token, f->name);
1098 if (score == 2)
1100 best = f;
1101 break;
1103 else if (score == 1 && !(f->flags & OPF_NO_ABBREV) && !best)
1104 best = f;
1107 if (!best)
1108 return false;
1110 *first = best;
1112 const struct operation *f = best;
1113 while (f <= operations + OP_function_last
1114 && !c_strcasecmp (f->name, best->name))
1115 f++;
1116 *last = f;
1118 return true;
1121 static int
1122 extract_min_valid (const char *s)
1124 char *p = strrchr (s, '.');
1125 if (p == NULL
1126 || p[1] < '0' || p[1] > '9'
1127 || strspn (p + 1, "0123456789") != strlen (p + 1))
1128 return -1;
1129 *p = '\0';
1130 return atoi (p + 1);
1133 static bool
1134 match_function__ (struct expr_node *node, const struct operation *f)
1136 if (node->n_args < f->n_args
1137 || (node->n_args > f->n_args && (f->flags & OPF_ARRAY_OPERAND) == 0)
1138 || node->n_args - (f->n_args - 1) < f->array_min_elems)
1139 return false;
1141 for (size_t i = 0; i < node->n_args; i++)
1142 if (!is_coercible (node, i))
1143 return false;
1145 return true;
1148 static const struct operation *
1149 match_function (struct expr_node *node,
1150 const struct operation *first, const struct operation *last)
1152 for (const struct operation *f = first; f < last; f++)
1153 if (match_function__ (node, f))
1154 return f;
1155 return NULL;
1158 static bool
1159 validate_function_args (const struct operation *f, int n_args, int min_valid)
1161 /* Count the function arguments that go into the trailing array (if any). We
1162 know that there must be at least the minimum number because
1163 match_function() already checked. */
1164 int array_n_args = n_args - (f->n_args - 1);
1165 assert (array_n_args >= f->array_min_elems);
1167 if ((f->flags & OPF_ARRAY_OPERAND)
1168 && array_n_args % f->array_granularity != 0)
1170 /* RANGE is the only case we have so far. It has paired arguments with
1171 one initial argument, and that's the only special case we deal with
1172 here. */
1173 assert (f->array_granularity == 2);
1174 assert (n_args % 2 == 0);
1175 msg (SE, _("%s must have an odd number of arguments."), f->prototype);
1176 return false;
1179 if (min_valid != -1)
1181 if (f->array_min_elems == 0)
1183 assert ((f->flags & OPF_MIN_VALID) == 0);
1184 msg (SE, _("%s function cannot accept suffix .%d to specify the "
1185 "minimum number of valid arguments."),
1186 f->prototype, min_valid);
1187 return false;
1189 else
1191 assert (f->flags & OPF_MIN_VALID);
1192 if (min_valid > array_n_args)
1194 msg (SE, _("For %s with %d arguments, at most %d (not %d) may be "
1195 "required to be valid."),
1196 f->prototype, n_args, array_n_args, min_valid);
1197 return false;
1202 return true;
1205 static void
1206 add_arg (struct expr_node ***args, size_t *n_args, size_t *allocated_args,
1207 struct expr_node *arg,
1208 struct expression *e, struct lexer *lexer, int arg_start_ofs)
1210 if (*n_args >= *allocated_args)
1211 *args = x2nrealloc (*args, allocated_args, sizeof **args);
1213 expr_add_location (lexer, e, arg_start_ofs, arg);
1214 (*args)[(*n_args)++] = arg;
1217 static void
1218 put_invocation (struct string *s,
1219 const char *func_name, struct expr_node **args, size_t n_args)
1221 size_t i;
1223 ds_put_format (s, "%s(", func_name);
1224 for (i = 0; i < n_args; i++)
1226 if (i > 0)
1227 ds_put_cstr (s, ", ");
1228 ds_put_cstr (s, operations[expr_node_returns (args[i])].prototype);
1230 ds_put_byte (s, ')');
1233 static void
1234 no_match (const char *func_name,
1235 struct expr_node **args, size_t n_args,
1236 const struct operation *first, const struct operation *last)
1238 struct string s;
1239 const struct operation *f;
1241 ds_init_empty (&s);
1243 if (last - first == 1)
1245 ds_put_format (&s, _("Type mismatch invoking %s as "), first->prototype);
1246 put_invocation (&s, func_name, args, n_args);
1248 else
1250 ds_put_cstr (&s, _("Function invocation "));
1251 put_invocation (&s, func_name, args, n_args);
1252 ds_put_cstr (&s, _(" does not match any known function. Candidates are:"));
1254 for (f = first; f < last; f++)
1255 ds_put_format (&s, "\n%s", f->prototype);
1257 ds_put_byte (&s, '.');
1259 msg (SE, "%s", ds_cstr (&s));
1261 ds_destroy (&s);
1264 static struct expr_node *
1265 parse_function (struct lexer *lexer, struct expression *e)
1267 struct string func_name;
1268 ds_init_substring (&func_name, lex_tokss (lexer));
1270 int min_valid = extract_min_valid (lex_tokcstr (lexer));
1272 const struct operation *first, *last;
1273 if (!lookup_function (lex_tokcstr (lexer), &first, &last))
1275 msg (SE, _("No function or vector named %s."), lex_tokcstr (lexer));
1276 ds_destroy (&func_name);
1277 return NULL;
1280 int func_start_ofs = lex_ofs (lexer);
1281 lex_get (lexer);
1282 if (!lex_force_match (lexer, T_LPAREN))
1284 ds_destroy (&func_name);
1285 return NULL;
1288 struct expr_node **args = NULL;
1289 size_t n_args = 0;
1290 size_t allocated_args = 0;
1291 if (lex_token (lexer) != T_RPAREN)
1292 for (;;)
1294 int arg_start_ofs = lex_ofs (lexer);
1295 if (lex_token (lexer) == T_ID
1296 && lex_next_token (lexer, 1) == T_TO)
1298 const struct variable **vars;
1299 size_t n_vars;
1301 if (!parse_variables_const (lexer, dataset_dict (e->ds),
1302 &vars, &n_vars, PV_SINGLE))
1303 goto fail;
1304 for (size_t i = 0; i < n_vars; i++)
1305 add_arg (&args, &n_args, &allocated_args,
1306 allocate_unary_variable (e, vars[i]),
1307 e, lexer, arg_start_ofs);
1308 free (vars);
1310 else
1312 struct expr_node *arg = parse_or (lexer, e);
1313 if (arg == NULL)
1314 goto fail;
1316 add_arg (&args, &n_args, &allocated_args, arg,
1317 e, lexer, arg_start_ofs);
1319 if (lex_match (lexer, T_RPAREN))
1320 break;
1321 else if (!lex_match (lexer, T_COMMA))
1323 lex_error_expecting (lexer, "`,'", "`)'");
1324 goto fail;
1328 struct expr_node *n = expr_allocate_composite (e, first - operations,
1329 args, n_args);
1330 expr_add_location (lexer, e, func_start_ofs, n);
1331 const struct operation *f = match_function (n, first, last);
1332 if (!f)
1334 no_match (ds_cstr (&func_name), args, n_args, first, last);
1335 goto fail;
1337 n->type = f - operations;
1338 n->min_valid = min_valid != -1 ? min_valid : f->array_min_elems;
1340 for (size_t i = 0; i < n_args; i++)
1341 if (!type_coercion (e, n, i))
1343 /* Unreachable because match_function already checked that the
1344 arguments were coercible. */
1345 NOT_REACHED ();
1347 if (!validate_function_args (f, n_args, min_valid))
1348 goto fail;
1350 if ((f->flags & OPF_EXTENSION) && settings_get_syntax () == COMPATIBLE)
1351 msg (SW, _("%s is a PSPP extension."), f->prototype);
1352 if (f->flags & OPF_UNIMPLEMENTED)
1354 msg (SE, _("%s is not available in this version of PSPP."),
1355 f->prototype);
1356 goto fail;
1358 if ((f->flags & OPF_PERM_ONLY) &&
1359 proc_in_temporary_transformations (e->ds))
1361 msg (SE, _("%s may not appear after %s."), f->prototype, "TEMPORARY");
1362 goto fail;
1365 if (n->type == OP_LAG_Vn || n->type == OP_LAG_Vs)
1366 dataset_need_lag (e->ds, 1);
1367 else if (n->type == OP_LAG_Vnn || n->type == OP_LAG_Vsn)
1369 assert (n->n_args == 2);
1370 assert (n->args[1]->type == OP_pos_int);
1371 dataset_need_lag (e->ds, n->args[1]->integer);
1374 free (args);
1375 ds_destroy (&func_name);
1376 return n;
1378 fail:
1379 free (args);
1380 ds_destroy (&func_name);
1381 return NULL;
1384 /* Utility functions. */
1386 static struct expression *
1387 expr_create (struct dataset *ds)
1389 struct pool *pool = pool_create ();
1390 struct expression *e = pool_alloc (pool, sizeof *e);
1391 *e = (struct expression) {
1392 .expr_pool = pool,
1393 .ds = ds,
1394 .eval_pool = pool_create_subpool (pool),
1396 return e;
1399 atom_type
1400 expr_node_returns (const struct expr_node *n)
1402 assert (n != NULL);
1403 assert (is_operation (n->type));
1404 if (is_atom (n->type))
1405 return n->type;
1406 else if (is_composite (n->type))
1407 return operations[n->type].returns;
1408 else
1409 NOT_REACHED ();
1412 static const char *
1413 atom_type_name (atom_type type)
1415 assert (is_atom (type));
1416 return operations[type].name;
1419 struct expr_node *
1420 expr_allocate_nullary (struct expression *e, operation_type op)
1422 return expr_allocate_composite (e, op, NULL, 0);
1425 struct expr_node *
1426 expr_allocate_unary (struct expression *e, operation_type op,
1427 struct expr_node *arg0)
1429 return expr_allocate_composite (e, op, &arg0, 1);
1432 struct expr_node *
1433 expr_allocate_binary (struct expression *e, operation_type op,
1434 struct expr_node *arg0, struct expr_node *arg1)
1436 struct expr_node *args[2];
1437 args[0] = arg0;
1438 args[1] = arg1;
1439 return expr_allocate_composite (e, op, args, 2);
1442 struct expr_node *
1443 expr_allocate_composite (struct expression *e, operation_type op,
1444 struct expr_node **args, size_t n_args)
1446 for (size_t i = 0; i < n_args; i++)
1447 if (!args[i])
1448 return NULL;
1450 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1451 *n = (struct expr_node) {
1452 .type = op,
1453 .n_args = n_args,
1454 .args = pool_clone (e->expr_pool, args, sizeof *n->args * n_args),
1456 return n;
1459 struct expr_node *
1460 expr_allocate_number (struct expression *e, double d)
1462 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1463 *n = (struct expr_node) { .type = OP_number, .number = d };
1464 return n;
1467 struct expr_node *
1468 expr_allocate_boolean (struct expression *e, double b)
1470 assert (b == 0.0 || b == 1.0 || b == SYSMIS);
1472 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1473 *n = (struct expr_node) { .type = OP_boolean, .number = b };
1474 return n;
1477 struct expr_node *
1478 expr_allocate_integer (struct expression *e, int i)
1480 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1481 *n = (struct expr_node) { .type = OP_integer, .integer = i };
1482 return n;
1485 struct expr_node *
1486 expr_allocate_pos_int (struct expression *e, int i)
1488 assert (i > 0);
1490 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1491 *n = (struct expr_node) { .type = OP_pos_int, .integer = i };
1492 return n;
1495 struct expr_node *
1496 expr_allocate_vector (struct expression *e, const struct vector *vector)
1498 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1499 *n = (struct expr_node) { .type = OP_vector, .vector = vector };
1500 return n;
1503 struct expr_node *
1504 expr_allocate_string (struct expression *e, struct substring s)
1506 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1507 *n = (struct expr_node) { .type = OP_string, .string = s };
1508 return n;
1511 struct expr_node *
1512 expr_allocate_variable (struct expression *e, const struct variable *v)
1514 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1515 *n = (struct expr_node) {
1516 .type = var_is_numeric (v) ? OP_num_var : OP_str_var,
1517 .variable = v
1519 return n;
1522 struct expr_node *
1523 expr_allocate_format (struct expression *e, const struct fmt_spec *format)
1525 struct expr_node *n = pool_alloc (e->expr_pool, sizeof *n);
1526 *n = (struct expr_node) { .type = OP_format, .format = *format };
1527 return n;
1530 /* Allocates a unary composite node that represents the value of
1531 variable V in expression E. */
1532 static struct expr_node *
1533 allocate_unary_variable (struct expression *e, const struct variable *v)
1535 assert (v != NULL);
1536 return expr_allocate_unary (e, var_is_numeric (v) ? OP_NUM_VAR : OP_STR_VAR,
1537 expr_allocate_variable (e, v));
1540 /* Export function details to other modules. */
1542 /* Returns the operation structure for the function with the
1543 given IDX. */
1544 const struct operation *
1545 expr_get_function (size_t idx)
1547 assert (idx < n_OP_function);
1548 return &operations[OP_function_first + idx];
1551 /* Returns the number of expression functions. */
1552 size_t
1553 expr_get_n_functions (void)
1555 return n_OP_function;
1558 /* Returns the name of operation OP. */
1559 const char *
1560 expr_operation_get_name (const struct operation *op)
1562 return op->name;
1565 /* Returns the human-readable prototype for operation OP. */
1566 const char *
1567 expr_operation_get_prototype (const struct operation *op)
1569 return op->prototype;
1572 /* Returns the number of arguments for operation OP. */
1574 expr_operation_get_n_args (const struct operation *op)
1576 return op->n_args;