lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2024 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include "flexmember.h"
  28 #include "idx.h"
  29 #include "verify.h"
  30
  31 #include <assert.h>
  32 #include <ctype.h>
  33 #include <stdint.h>
  34 #include <stdio.h>
  35 #include <stdlib.h>
  36 #include <limits.h>
  37 #include <string.h>
  38 #include <wchar.h>
  39
  40 #include "xalloc.h"
  41 #include "localeinfo.h"
  42
  43 #include "gettext.h"
  44 #define _(str) gettext (str)
  45
  46 #if GAWK
  47 /* Use ISO C 99 API.  */
  48 # include <wctype.h>
  49 # define char32_t wchar_t
  50 # define mbrtoc32 mbrtowc
  51 # define c32rtomb wcrtomb
  52 # define c32tob wctob
  53 # define c32isprint iswprint
  54 # define c32isspace iswspace
  55 # define mbszero(p) memset ((p), 0, sizeof (mbstate_t))
  56 #else
  57 /* Use ISO C 11 + gnulib API.  */
  58 # include <uchar.h>
  59 #endif
  60
  61 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
  62    understandably cannot deduce that the input comes from a
  63    well-formed regular expression.  There's little point to the
  64    runtime overhead of 'assert' instead of 'assume_nonnull' when the
  65    MMU will check anyway.  */
  66 #define assume_nonnull(x) assume ((x) != NULL)
  67
  68 static bool
  69 str_eq (char const *a, char const *b)
  70 {
  71   return strcmp (a, b) == 0;
  72 }
  73
  74 static bool
  75 c_isdigit (char c)
  76 {
  77   return '0' <= c && c <= '9';
  78 }
  79
  80 #ifndef FALLTHROUGH
  81 # if 201710L < __STDC_VERSION__
  82 #  define FALLTHROUGH [[__fallthrough__]]
  83 # elif ((__GNUC__ >= 7) \
  84         || (defined __apple_build_version__ \
  85             ? __apple_build_version__ >= 12000000 \
  86             : __clang_major__ >= 10))
  87 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  88 # else
  89 #  define FALLTHROUGH ((void) 0)
  90 # endif
  91 #endif
  92
  93 #ifndef MIN
  94 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  95 #endif
  96
  97 /* HPUX defines these as macros in sys/param.h.  */
  98 #ifdef setbit
  99 # undef setbit
 100 #endif
 101 #ifdef clrbit
 102 # undef clrbit
 103 #endif
 104
 105 /* For code that does not use Gnulib’s isblank module.  */
 106 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
 107 # define isblank dfa_isblank
 108 static int
 109 isblank (int c)
 110 {
 111   return c == ' ' || c == '\t';
 112 }
 113 #endif
 114
 115 /* First integer value that is greater than any character code.  */
 116 enum { NOTCHAR = 1 << CHAR_BIT };
 117
 118 #ifdef UINT_LEAST64_MAX
 119
 120 /* Number of bits used in a charclass word.  */
 121 enum { CHARCLASS_WORD_BITS = 64 };
 122
 123 /* This represents part of a character class.  It must be unsigned and
 124    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 125 typedef uint_least64_t charclass_word;
 126
 127 /* Part of a charclass initializer that represents 64 bits' worth of a
 128    charclass, where LO and HI are the low and high-order 32 bits of
 129    the 64-bit quantity.  */
 130 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 131
 132 #else
 133 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 134 enum { CHARCLASS_WORD_BITS = 32 };
 135 typedef unsigned long charclass_word;
 136 # define CHARCLASS_PAIR(lo, hi) lo, hi
 137 #endif
 138
 139 /* An initializer for a charclass whose 32-bit words are A through H.  */
 140 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 141    {{                                                   \
 142       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 143       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 144    }}
 145
 146 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 147 static charclass_word const CHARCLASS_WORD_MASK
 148   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 149
 150 /* Number of words required to hold a bit for every character.  */
 151 enum
 152 {
 153   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 154 };
 155
 156 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 157 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 158
 159 /* Convert a possibly-signed character to an unsigned character.  This is
 160    a bit safer than casting to unsigned char, since it catches some type
 161    errors that the cast doesn't.  */
 162 static unsigned char
 163 to_uchar (char ch)
 164 {
 165   return ch;
 166 }
 167
 168 /* Contexts tell us whether a character is a newline or a word constituent.
 169    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 170    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 171    a particular character class.
 172
 173    A state also stores a context value, which is a bitmask of CTX_* values.
 174    A state's context represents a set of characters that the state's
 175    predecessors must match.  For example, a state whose context does not
 176    include CTX_LETTER will never have transitions where the previous
 177    character is a word constituent.  A state whose context is CTX_ANY
 178    might have transitions from any character.  */
 179
 180 enum
 181   {
 182     CTX_NONE = 1,
 183     CTX_LETTER = 2,
 184     CTX_NEWLINE = 4,
 185     CTX_ANY = 7
 186   };
 187
 188 /* Sometimes characters can only be matched depending on the surrounding
 189    context.  Such context decisions depend on what the previous character
 190    was, and the value of the current (lookahead) character.  Context
 191    dependent constraints are encoded as 9-bit integers.  Each bit that
 192    is set indicates that the constraint succeeds in the corresponding
 193    context.
 194
 195    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 196    bit 3-5  - valid contexts when next character is CTX_LETTER
 197    bit 0-2  - valid contexts when next character is CTX_NONE
 198
 199    succeeds_in_context determines whether a given constraint
 200    succeeds in a particular context.  Prev is a bitmask of possible
 201    context values for the previous character, curr is the (single-bit)
 202    context value for the lookahead character.  */
 203 static int
 204 newline_constraint (int constraint)
 205 {
 206   return (constraint >> 6) & 7;
 207 }
 208 static int
 209 letter_constraint (int constraint)
 210 {
 211   return (constraint >> 3) & 7;
 212 }
 213 static int
 214 other_constraint (int constraint)
 215 {
 216   return constraint & 7;
 217 }
 218
 219 static bool
 220 succeeds_in_context (int constraint, int prev, int curr)
 221 {
 222   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 223               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 224               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 225              & prev);
 226 }
 227
 228 /* The following describe what a constraint depends on.  */
 229 static bool
 230 prev_newline_dependent (int constraint)
 231 {
 232   return ((constraint ^ constraint >> 2) & 0111) != 0;
 233 }
 234 static bool
 235 prev_letter_dependent (int constraint)
 236 {
 237   return ((constraint ^ constraint >> 1) & 0111) != 0;
 238 }
 239
 240 /* Tokens that match the empty string subject to some constraint actually
 241    work by applying that constraint to determine what may follow them,
 242    taking into account what has gone before.  The following values are
 243    the constraints corresponding to the special tokens previously defined.  */
 244 enum
 245   {
 246     NO_CONSTRAINT = 0777,
 247     BEGLINE_CONSTRAINT = 0444,
 248     ENDLINE_CONSTRAINT = 0700,
 249     BEGWORD_CONSTRAINT = 0050,
 250     ENDWORD_CONSTRAINT = 0202,
 251     LIMWORD_CONSTRAINT = 0252,
 252     NOTLIMWORD_CONSTRAINT = 0525
 253   };
 254
 255 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 256    are operators and others are terminal symbols.  Most (but not all) of these
 257    codes are returned by the lexical analyzer.  */
 258
 259 typedef ptrdiff_t token;
 260 static token const TOKEN_MAX = PTRDIFF_MAX;
 261
 262 /* States are indexed by state_num values.  These are normally
 263    nonnegative but -1 is used as a special value.  */
 264 typedef ptrdiff_t state_num;
 265
 266 /* Predefined token values.  */
 267 enum
 268 {
 269   END = -1,                     /* END is a terminal symbol that matches the
 270                                    end of input; any value of END or less in
 271                                    the parse tree is such a symbol.  Accepting
 272                                    states of the DFA are those that would have
 273                                    a transition on END.  This is -1, not some
 274                                    more-negative value, to tweak the speed of
 275                                    comparisons to END.  */
 276
 277   /* Ordinary character values are terminal symbols that match themselves.  */
 278
 279   /* CSET must come last in the following list of special tokens.  Otherwise,
 280      the list order matters only for performance.  Related special tokens
 281      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 282      || CSET <= t) can be done with a single machine-level comparison.  */
 283
 284   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 285                                    the empty string.  */
 286
 287   QMARK,                        /* QMARK is an operator of one argument that
 288                                    matches zero or one occurrences of its
 289                                    argument.  */
 290
 291   STAR,                         /* STAR is an operator of one argument that
 292                                    matches the Kleene closure (zero or more
 293                                    occurrences) of its argument.  */
 294
 295   PLUS,                         /* PLUS is an operator of one argument that
 296                                    matches the positive closure (one or more
 297                                    occurrences) of its argument.  */
 298
 299   REPMN,                        /* REPMN is a lexical token corresponding
 300                                    to the {m,n} construct.  REPMN never
 301                                    appears in the compiled token vector.  */
 302
 303   CAT,                          /* CAT is an operator of two arguments that
 304                                    matches the concatenation of its
 305                                    arguments.  CAT is never returned by the
 306                                    lexical analyzer.  */
 307
 308   OR,                           /* OR is an operator of two arguments that
 309                                    matches either of its arguments.  */
 310
 311   LPAREN,                       /* LPAREN never appears in the parse tree,
 312                                    it is only a lexeme.  */
 313
 314   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 315
 316   WCHAR,                        /* Only returned by lex.  wctok contains the
 317                                    32-bit wide character representation.  */
 318
 319   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 320                                    a valid multibyte (or single byte) character.
 321                                    It is used only if MB_CUR_MAX > 1.  */
 322
 323   BEG,                          /* BEG is an initial symbol that matches the
 324                                    beginning of input.  */
 325
 326   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 327                                    the empty string at the beginning of a
 328                                    line.  */
 329
 330   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 331                                    the empty string at the end of a line.  */
 332
 333   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 334                                    the empty string at the beginning of a
 335                                    word.  */
 336
 337   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 338                                    the empty string at the end of a word.  */
 339
 340   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 341                                    the empty string at the beginning or the
 342                                    end of a word.  */
 343
 344   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 345                                    matches the empty string not at
 346                                    the beginning or end of a word.  */
 347
 348   BACKREF,                      /* BACKREF is generated by \<digit>
 349                                    or by any other construct that
 350                                    is not completely handled.  If the scanner
 351                                    detects a transition on backref, it returns
 352                                    a kind of "semi-success" indicating that
 353                                    the match will have to be verified with
 354                                    a backtracking matcher.  */
 355
 356   MBCSET,                       /* MBCSET is similar to CSET, but for
 357                                    multibyte characters.  */
 358
 359   CSET                          /* CSET and (and any value greater) is a
 360                                    terminal symbol that matches any of a
 361                                    class of characters.  */
 362 };
 363
 364
 365 /* States of the recognizer correspond to sets of positions in the parse
 366    tree, together with the constraints under which they may be matched.
 367    So a position is encoded as an index into the parse tree together with
 368    a constraint.  */
 369 typedef struct
 370 {
 371   idx_t index;                  /* Index into the parse array.  */
 372   unsigned int constraint;      /* Constraint for matching this position.  */
 373 } position;
 374
 375 /* Sets of positions are stored as arrays.  */
 376 typedef struct
 377 {
 378   position *elems;              /* Elements of this position set.  */
 379   idx_t nelem;                  /* Number of elements in this set.  */
 380   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 381 } position_set;
 382
 383 /* A state of the dfa consists of a set of positions, some flags,
 384    and the token value of the lowest-numbered position of the state that
 385    contains an END token.  */
 386 typedef struct
 387 {
 388   size_t hash;                  /* Hash of the positions of this state.  */
 389   position_set elems;           /* Positions this state could match.  */
 390   unsigned char context;        /* Context from previous state.  */
 391   unsigned short constraint;    /* Constraint for this state to accept.  */
 392   position_set mbps;            /* Positions which can match multibyte
 393                                    characters or the follows, e.g., period.
 394                                    Used only if MB_CUR_MAX > 1.  */
 395   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 396                                    negative if the state does not have
 397                                    ANYCHAR.  */
 398 } dfa_state;
 399
 400 /* Maximum for any transition table count.  This should be at least 3,
 401    for the initial state setup.  */
 402 enum { MAX_TRCOUNT = 1024 };
 403
 404 /* A bracket operator.
 405    e.g., [a-c], [[:alpha:]], etc.  */
 406 struct mb_char_classes
 407 {
 408   ptrdiff_t cset;
 409   bool invert;
 410   char32_t *chars;              /* Normal characters.  */
 411   idx_t nchars;
 412   idx_t nchars_alloc;
 413 };
 414
 415 struct regex_syntax
 416 {
 417   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 418   reg_syntax_t syntax_bits;
 419   int dfaopts;
 420   bool syntax_bits_set;
 421
 422   /* Flag for case-folding letters into sets.  */
 423   bool case_fold;
 424
 425   /* End-of-line byte in data.  */
 426   unsigned char eolbyte;
 427
 428   /* Cache of char-context values.  */
 429   char sbit[NOTCHAR];
 430
 431   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 432      multibyte character.  */
 433   bool never_trail[NOTCHAR];
 434
 435   /* Set of characters considered letters.  */
 436   charclass letters;
 437
 438   /* Set of characters that are newline.  */
 439   charclass newline;
 440 };
 441
 442 /* Lexical analyzer.  All the dross that deals with the obnoxious
 443    GNU Regex syntax bits is located here.  The poor, suffering
 444    reader is referred to the GNU Regex documentation for the
 445    meaning of the @#%!@#%^!@ syntax bits.  */
 446 struct lexer_state
 447 {
 448   char const *ptr;      /* Pointer to next input character.  */
 449   idx_t left;           /* Number of characters remaining.  */
 450   token lasttok;        /* Previous token returned; initially END.  */
 451   idx_t parens;         /* Count of outstanding left parens.  */
 452   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 453
 454   /* 32-bit wide character representation of the current multibyte character,
 455      or WEOF if there was an encoding error.  Used only if
 456      MB_CUR_MAX > 1.  */
 457   wint_t wctok;
 458
 459   /* The most recently analyzed multibyte bracket expression.  */
 460   struct mb_char_classes brack;
 461
 462   /* We're separated from beginning or (, | only by zero-width characters.  */
 463   bool laststart;
 464 };
 465
 466 /* Recursive descent parser for regular expressions.  */
 467
 468 struct parser_state
 469 {
 470   token tok;               /* Lookahead token.  */
 471   idx_t depth;             /* Current depth of a hypothetical stack
 472                               holding deferred productions.  This is
 473                               used to determine the depth that will be
 474                               required of the real stack later on in
 475                               dfaanalyze.  */
 476 };
 477
 478 /* A compiled regular expression.  */
 479 struct dfa
 480 {
 481   /* Fields filled by the scanner.  */
 482   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 483   idx_t cindex;                 /* Index for adding new charclasses.  */
 484   idx_t calloc;                 /* Number of charclasses allocated.  */
 485   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 486
 487   /* Scanner state */
 488   struct lexer_state lex;
 489
 490   /* Parser state */
 491   struct parser_state parse;
 492
 493   /* Fields filled by the parser.  */
 494   token *tokens;                /* Postfix parse array.  */
 495   idx_t tindex;                 /* Index for adding new tokens.  */
 496   idx_t talloc;                 /* Number of tokens currently allocated.  */
 497   idx_t depth;                  /* Depth required of an evaluation stack
 498                                    used for depth-first traversal of the
 499                                    parse tree.  */
 500   idx_t nleaves;                /* Number of non-EMPTY leaves
 501                                    in the parse tree.  */
 502   idx_t nregexps;               /* Count of parallel regexps being built
 503                                    with dfaparse.  */
 504   bool fast;                    /* The DFA is fast.  */
 505   bool epsilon;                 /* Does a token match only the empty string?  */
 506   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 507   mbstate_t mbs;                /* Multibyte conversion state.  */
 508
 509   /* The following are valid only if MB_CUR_MAX > 1.  */
 510
 511   /* The value of multibyte_prop[i] is defined by following rule.
 512      if tokens[i] < NOTCHAR
 513      bit 0 : tokens[i] is the first byte of a character, including
 514      single-byte characters.
 515      bit 1 : tokens[i] is the last byte of a character, including
 516      single-byte characters.
 517
 518      e.g.
 519      tokens
 520      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 521      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 522      multibyte_prop
 523      = 3     , 1               ,  0              ,  2              , 3
 524    */
 525   char *multibyte_prop;
 526
 527   /* Fields filled by the superset.  */
 528   struct dfa *superset;             /* Hint of the dfa.  */
 529
 530   /* Fields filled by the state builder.  */
 531   dfa_state *states;            /* States of the dfa.  */
 532   state_num sindex;             /* Index for adding new states.  */
 533   idx_t salloc;                 /* Number of states currently allocated.  */
 534
 535   /* Fields filled by the parse tree->NFA conversion.  */
 536   position_set *follows;        /* Array of follow sets, indexed by position
 537                                    index.  The follow of a position is the set
 538                                    of positions containing characters that
 539                                    could conceivably follow a character
 540                                    matching the given position in a string
 541                                    matching the regexp.  Allocated to the
 542                                    maximum possible position index.  */
 543   bool searchflag;              /* We are supposed to build a searching
 544                                    as opposed to an exact matcher.  A searching
 545                                    matcher finds the first and shortest string
 546                                    matching a regexp anywhere in the buffer,
 547                                    whereas an exact matcher finds the longest
 548                                    string matching, but anchored to the
 549                                    beginning of the buffer.  */
 550
 551   /* Fields filled by dfaanalyze.  */
 552   int *constraints;             /* Array of union of accepting constraints
 553                                    in the follow of a position.  */
 554   int *separates;               /* Array of contexts on follow of a
 555                                    position.  */
 556
 557   /* Fields filled by dfaexec.  */
 558   state_num tralloc;            /* Number of transition tables that have
 559                                    slots so far, not counting trans[-1] and
 560                                    trans[-2].  */
 561   int trcount;                  /* Number of transition tables that have
 562                                    been built, other than for initial
 563                                    states.  */
 564   int min_trcount;              /* Number of initial states.  Equivalently,
 565                                    the minimum state number for which trcount
 566                                    counts transitions.  */
 567   state_num **trans;            /* Transition tables for states that can
 568                                    never accept.  If the transitions for a
 569                                    state have not yet been computed, or the
 570                                    state could possibly accept, its entry in
 571                                    this table is NULL.  This points to two
 572                                    past the start of the allocated array,
 573                                    and trans[-1] and trans[-2] are always
 574                                    NULL.  */
 575   state_num **fails;            /* Transition tables after failing to accept
 576                                    on a state that potentially could do so.
 577                                    If trans[i] is non-null, fails[i] must
 578                                    be null.  */
 579   char *success;                /* Table of acceptance conditions used in
 580                                    dfaexec and computed in build_state.  */
 581   state_num *newlines;          /* Transitions on newlines.  The entry for a
 582                                    newline in any transition table is always
 583                                    -1 so we can count lines without wasting
 584                                    too many cycles.  The transition for a
 585                                    newline is stored separately and handled
 586                                    as a special case.  Newline is also used
 587                                    as a sentinel at the end of the buffer.  */
 588   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 589                                    context in multibyte locales, in which we
 590                                    do not distinguish between their contexts,
 591                                    as not supported word.  */
 592   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 593   state_num **mb_trans;         /* Transition tables for states with
 594                                    ANYCHAR.  */
 595   state_num mb_trcount;         /* Number of transition tables for states with
 596                                    ANYCHAR that have actually been built.  */
 597
 598   /* Syntax configuration.  This is near the end so that dfacopysyntax
 599      can memset up to here.  */
 600   struct regex_syntax syntax;
 601
 602   /* Information derived from the locale.  This is at the end so that
 603      a quick memset need not clear it specially.  */
 604
 605   /* dfaexec implementation.  */
 606   char *(*dfaexec) (struct dfa *, char const *, char *,
 607                     bool, idx_t *, bool *);
 608
 609   /* Other cached information derived from the locale.  */
 610   struct localeinfo localeinfo;
 611 };
 612
 613 /* User access to dfa internals.  */
 614
 615 /* S could possibly be an accepting state of R.  */
 616 static bool
 617 accepting (state_num s, struct dfa const *r)
 618 {
 619   return r->states[s].constraint != 0;
 620 }
 621
 622 /* STATE accepts in the specified context.  */
 623 static bool
 624 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 625 {
 626   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 627 }
 628
 629 static void regexp (struct dfa *dfa);
 630
 631 /* Store into *PWC the result of converting the leading bytes of the
 632    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 633    and updating the conversion state in *D.  On conversion error,
 634    convert just a single byte, to WEOF.  Return the number of bytes
 635    converted.
 636
 637    This differs from mbrtoc32 (PWC, S, N, &D->mbs) as follows:
 638
 639    * PWC points to wint_t, not to char32_t.
 640    * The last arg is a dfa *D instead of merely a multibyte conversion
 641      state D->mbs.
 642    * N is idx_t not size_t, and must be at least 1.
 643    * S[N - 1] must be a sentinel byte.
 644    * Shift encodings are not supported.
 645    * The return value is always in the range 1..N.
 646    * D->mbs is always valid afterwards.
 647    * *PWC is always set to something.  */
 648 static int
 649 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
 650 {
 651   unsigned char uc = s[0];
 652   wint_t wc = d->localeinfo.sbctowc[uc];
 653
 654   if (wc == WEOF)
 655     {
 656       char32_t wch;
 657       size_t nbytes = mbrtoc32 (&wch, s, n, &d->mbs);
 658       if (0 < nbytes && nbytes < (size_t) -2)
 659         {
 660           *pwc = wch;
 661           /* nbytes cannot be == (size) -3 here, since we rely on the
 662              'mbrtoc32-regular' module.  */
 663           return nbytes;
 664         }
 665       mbszero (&d->mbs);
 666     }
 667
 668   *pwc = wc;
 669   return 1;
 670 }
 671
 672 #ifdef DEBUG
 673
 674 static void
 675 prtok (token t)
 676 {
 677   if (t <= END)
 678     fprintf (stderr, "END");
 679   else if (0 <= t && t < NOTCHAR)
 680     {
 681       unsigned int ch = t;
 682       fprintf (stderr, "0x%02x", ch);
 683     }
 684   else
 685     {
 686       char const *s;
 687       switch (t)
 688         {
 689         case BEG:
 690           s = "BEG";
 691           break;
 692         case EMPTY:
 693           s = "EMPTY";
 694           break;
 695         case BACKREF:
 696           s = "BACKREF";
 697           break;
 698         case BEGLINE:
 699           s = "BEGLINE";
 700           break;
 701         case ENDLINE:
 702           s = "ENDLINE";
 703           break;
 704         case BEGWORD:
 705           s = "BEGWORD";
 706           break;
 707         case ENDWORD:
 708           s = "ENDWORD";
 709           break;
 710         case LIMWORD:
 711           s = "LIMWORD";
 712           break;
 713         case NOTLIMWORD:
 714           s = "NOTLIMWORD";
 715           break;
 716         case QMARK:
 717           s = "QMARK";
 718           break;
 719         case STAR:
 720           s = "STAR";
 721           break;
 722         case PLUS:
 723           s = "PLUS";
 724           break;
 725         case CAT:
 726           s = "CAT";
 727           break;
 728         case OR:
 729           s = "OR";
 730           break;
 731         case LPAREN:
 732           s = "LPAREN";
 733           break;
 734         case RPAREN:
 735           s = "RPAREN";
 736           break;
 737         case ANYCHAR:
 738           s = "ANYCHAR";
 739           break;
 740         case MBCSET:
 741           s = "MBCSET";
 742           break;
 743         default:
 744           s = "CSET";
 745           break;
 746         }
 747       fprintf (stderr, "%s", s);
 748     }
 749 }
 750 #endif /* DEBUG */
 751
 752 /* Stuff pertaining to charclasses.  */
 753
 754 static bool
 755 tstbit (unsigned int b, charclass const *c)
 756 {
 757   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 758 }
 759
 760 static void
 761 setbit (unsigned int b, charclass *c)
 762 {
 763   charclass_word one = 1;
 764   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 765 }
 766
 767 static void
 768 clrbit (unsigned int b, charclass *c)
 769 {
 770   charclass_word one = 1;
 771   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 772 }
 773
 774 static void
 775 zeroset (charclass *s)
 776 {
 777   memset (s, 0, sizeof *s);
 778 }
 779
 780 static void
 781 fillset (charclass *s)
 782 {
 783   for (int i = 0; i < CHARCLASS_WORDS; i++)
 784     s->w[i] = CHARCLASS_WORD_MASK;
 785 }
 786
 787 static void
 788 notset (charclass *s)
 789 {
 790   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 791     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 792 }
 793
 794 static bool
 795 equal (charclass const *s1, charclass const *s2)
 796 {
 797   charclass_word w = 0;
 798   for (int i = 0; i < CHARCLASS_WORDS; i++)
 799     w |= s1->w[i] ^ s2->w[i];
 800   return w == 0;
 801 }
 802
 803 static bool
 804 emptyset (charclass const *s)
 805 {
 806   charclass_word w = 0;
 807   for (int i = 0; i < CHARCLASS_WORDS; i++)
 808     w |= s->w[i];
 809   return w == 0;
 810 }
 811
 812 /* Ensure that the array addressed by PA holds at least I + 1 items.
 813    Either return PA, or reallocate the array and return its new address.
 814    Although PA may be null, the returned value is never null.
 815
 816    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 817    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 818    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 819    ITEM_SIZE is the size of one item; it must be positive.
 820    Avoid O(N**2) behavior on arrays growing linearly.  */
 821 static void *
 822 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 823                ptrdiff_t nitems_max, idx_t item_size)
 824 {
 825   if (i < *nitems)
 826     return pa;
 827   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 828 }
 829
 830 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 831 static idx_t
 832 charclass_index (struct dfa *d, charclass const *s)
 833 {
 834   idx_t i;
 835
 836   for (i = 0; i < d->cindex; ++i)
 837     if (equal (s, &d->charclasses[i]))
 838       return i;
 839   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 840                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 841   ++d->cindex;
 842   d->charclasses[i] = *s;
 843   return i;
 844 }
 845
 846 static bool
 847 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 848 {
 849   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 850 }
 851
 852 static int
 853 char_context (struct dfa const *dfa, unsigned char c)
 854 {
 855   if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
 856     return CTX_NEWLINE;
 857   if (unibyte_word_constituent (dfa, c))
 858     return CTX_LETTER;
 859   return CTX_NONE;
 860 }
 861
 862 /* Set a bit in the charclass for the given char32_t.  Do nothing if WC
 863    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 864    this may happen when folding case in weird Turkish locales where
 865    dotless i/dotted I are not included in the chosen character set.
 866    Return whether a bit was set in the charclass.  */
 867 static bool
 868 setbit_wc (char32_t wc, charclass *c)
 869 {
 870   int b = c32tob (wc);
 871   if (b < 0)
 872     return false;
 873
 874   setbit (b, c);
 875   return true;
 876 }
 877
 878 /* Set a bit for B and its case variants in the charclass C.
 879    MB_CUR_MAX must be 1.  */
 880 static void
 881 setbit_case_fold_c (int b, charclass *c)
 882 {
 883   int ub = toupper (b);
 884   for (int i = 0; i < NOTCHAR; i++)
 885     if (toupper (i) == ub)
 886       setbit (i, c);
 887 }
 888
 889 /* Fetch the next lexical input character from the pattern.  There
 890    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 891    value of the character or to WEOF depending on whether the input is
 892    a valid multibyte character (possibly of length 1).  Then return
 893    the next input byte value, except return EOF if the input is a
 894    multibyte character of length greater than 1.  */
 895 static int
 896 fetch_wc (struct dfa *dfa)
 897 {
 898   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 899                              dfa);
 900   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 901   dfa->lex.ptr += nbytes;
 902   dfa->lex.left -= nbytes;
 903   return c;
 904 }
 905
 906 /* If there is no more input, report an error about unbalanced brackets.
 907    Otherwise, behave as with fetch_wc (DFA).  */
 908 static int
 909 bracket_fetch_wc (struct dfa *dfa)
 910 {
 911   if (! dfa->lex.left)
 912     dfaerror (_("unbalanced ["));
 913   return fetch_wc (dfa);
 914 }
 915
 916 typedef int predicate (int);
 917
 918 /* The following list maps the names of the Posix named character classes
 919    to predicate functions that determine whether a given character is in
 920    the class.  The leading [ has already been eaten by the lexical
 921    analyzer.  */
 922 struct dfa_ctype
 923 {
 924   const char *name;
 925   predicate *func;
 926   bool single_byte_only;
 927 };
 928
 929 static const struct dfa_ctype prednames[] = {
 930   {"alpha", isalpha, false},
 931   {"upper", isupper, false},
 932   {"lower", islower, false},
 933   {"digit", isdigit, true},
 934   {"xdigit", isxdigit, false},
 935   {"space", isspace, false},
 936   {"punct", ispunct, false},
 937   {"alnum", isalnum, false},
 938   {"print", isprint, false},
 939   {"graph", isgraph, false},
 940   {"cntrl", iscntrl, false},
 941   {"blank", isblank, false},
 942   {NULL, NULL, false}
 943 };
 944
 945 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 946 find_pred (const char *str)
 947 {
 948   for (int i = 0; prednames[i].name; i++)
 949     if (str_eq (str, prednames[i].name))
 950       return &prednames[i];
 951   return NULL;
 952 }
 953
 954 /* Parse a bracket expression, which possibly includes multibyte
 955    characters.  */
 956 static token
 957 parse_bracket_exp (struct dfa *dfa)
 958 {
 959   /* This is a bracket expression that dfaexec is known to
 960      process correctly.  */
 961   bool known_bracket_exp = true;
 962
 963   /* Used to warn about [:space:].
 964      Bit 0 = first character is a colon.
 965      Bit 1 = last character is a colon.
 966      Bit 2 = includes any other character but a colon.
 967      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
 968   int colon_warning_state;
 969
 970   dfa->lex.brack.nchars = 0;
 971   charclass ccl;
 972   zeroset (&ccl);
 973   int c = bracket_fetch_wc (dfa);
 974   bool invert = c == '^';
 975   if (invert)
 976     {
 977       c = bracket_fetch_wc (dfa);
 978       known_bracket_exp = dfa->localeinfo.simple;
 979     }
 980   wint_t wc = dfa->lex.wctok;
 981   int c1;
 982   wint_t wc1;
 983   colon_warning_state = (c == ':');
 984   do
 985     {
 986       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
 987       colon_warning_state &= ~2;
 988
 989       /* Note that if we're looking at some other [:...:] construct,
 990          we just treat it as a bunch of ordinary characters.  We can do
 991          this because we assume regex has checked for syntax errors before
 992          dfa is ever called.  */
 993       if (c == '[')
 994         {
 995           c1 = bracket_fetch_wc (dfa);
 996           wc1 = dfa->lex.wctok;
 997
 998           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
 999               || c1 == '.' || c1 == '=')
1000             {
1001               enum { MAX_BRACKET_STRING_LEN = 32 };
1002               char str[MAX_BRACKET_STRING_LEN + 1];
1003               int len = 0;
1004               for (;;)
1005                 {
1006                   c = bracket_fetch_wc (dfa);
1007                   if (dfa->lex.left == 0
1008                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1009                     break;
1010                   if (len < MAX_BRACKET_STRING_LEN)
1011                     str[len++] = c;
1012                   else
1013                     /* This is in any case an invalid class name.  */
1014                     str[0] = '\0';
1015                 }
1016               str[len] = '\0';
1017
1018               /* Fetch bracket.  */
1019               c = bracket_fetch_wc (dfa);
1020               wc = dfa->lex.wctok;
1021               if (c1 == ':')
1022                 /* Build character class.  POSIX allows character
1023                    classes to match multicharacter collating elements,
1024                    but the regex code does not support that, so do not
1025                    worry about that possibility.  */
1026                 {
1027                   char const *class
1028                     = (dfa->syntax.case_fold && (str_eq (str, "upper")
1029                                                  || str_eq (str, "lower"))
1030                        ? "alpha" : str);
1031                   const struct dfa_ctype *pred = find_pred (class);
1032                   if (!pred)
1033                     dfaerror (_("invalid character class"));
1034
1035                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1036                     known_bracket_exp = false;
1037                   else
1038                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1039                       if (pred->func (c2))
1040                         setbit (c2, &ccl);
1041                 }
1042               else
1043                 known_bracket_exp = false;
1044
1045               colon_warning_state |= 8;
1046
1047               /* Fetch new lookahead character.  */
1048               c1 = bracket_fetch_wc (dfa);
1049               wc1 = dfa->lex.wctok;
1050               continue;
1051             }
1052
1053           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1054              are already set up.  */
1055         }
1056
1057       if (c == '\\'
1058           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1059         {
1060           c = bracket_fetch_wc (dfa);
1061           wc = dfa->lex.wctok;
1062         }
1063
1064       if (c1 == NOTCHAR)
1065         {
1066           c1 = bracket_fetch_wc (dfa);
1067           wc1 = dfa->lex.wctok;
1068         }
1069
1070       if (c1 == '-')
1071         /* build range characters.  */
1072         {
1073           int c2 = bracket_fetch_wc (dfa);
1074           wint_t wc2 = dfa->lex.wctok;
1075
1076           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1077              Treat it like [-a[.aa.]] while parsing it, and
1078              remember that the set is unknown.  */
1079           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1080             {
1081               known_bracket_exp = false;
1082               c2 = ']';
1083             }
1084
1085           if (c2 == ']')
1086             {
1087               /* In the case [x-], the - is an ordinary hyphen,
1088                  which is left in c1, the lookahead character.  */
1089               dfa->lex.ptr--;
1090               dfa->lex.left++;
1091             }
1092           else
1093             {
1094               if (c2 == '\\' && (dfa->syntax.syntax_bits
1095                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1096                 {
1097                   c2 = bracket_fetch_wc (dfa);
1098                   wc2 = dfa->lex.wctok;
1099                 }
1100
1101               colon_warning_state |= 8;
1102               c1 = bracket_fetch_wc (dfa);
1103               wc1 = dfa->lex.wctok;
1104
1105               /* Treat [x-y] as a range if x != y.  */
1106               if (wc != wc2 || wc == WEOF)
1107                 {
1108                   if (dfa->localeinfo.simple
1109                       || (c_isdigit (c) & c_isdigit (c2)))
1110                     {
1111                       for (int ci = c; ci <= c2; ci++)
1112                         if (dfa->syntax.case_fold && isalpha (ci))
1113                           setbit_case_fold_c (ci, &ccl);
1114                         else
1115                           setbit (ci, &ccl);
1116                     }
1117                   else
1118                     known_bracket_exp = false;
1119
1120                   continue;
1121                 }
1122             }
1123         }
1124
1125       colon_warning_state |= (c == ':') ? 2 : 4;
1126
1127       if (!dfa->localeinfo.multibyte)
1128         {
1129           if (dfa->syntax.case_fold && isalpha (c))
1130             setbit_case_fold_c (c, &ccl);
1131           else
1132             setbit (c, &ccl);
1133           continue;
1134         }
1135
1136       if (wc == WEOF)
1137         known_bracket_exp = false;
1138       else
1139         {
1140           char32_t folded[CASE_FOLDED_BUFSIZE + 1];
1141           int n = (dfa->syntax.case_fold
1142                    ? case_folded_counterparts (wc, folded + 1) + 1
1143                    : 1);
1144           folded[0] = wc;
1145           for (int i = 0; i < n; i++)
1146             if (!setbit_wc (folded[i], &ccl))
1147               {
1148                 dfa->lex.brack.chars
1149                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1150                                    &dfa->lex.brack.nchars_alloc, -1,
1151                                    sizeof *dfa->lex.brack.chars);
1152                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1153               }
1154         }
1155     }
1156   while ((wc = wc1, (c = c1) != ']'));
1157
1158   if (colon_warning_state == 7)
1159     {
1160       char const *msg
1161         = _("character class syntax is [[:space:]], not [:space:]");
1162       if (dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR)
1163         dfaerror (msg);
1164       else
1165         dfawarn (msg);
1166     }
1167
1168   if (! known_bracket_exp)
1169     return BACKREF;
1170
1171   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1172     {
1173       dfa->lex.brack.invert = invert;
1174       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1175       return MBCSET;
1176     }
1177
1178   if (invert)
1179     {
1180       notset (&ccl);
1181       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1182         clrbit ('\n', &ccl);
1183     }
1184
1185   return CSET + charclass_index (dfa, &ccl);
1186 }
1187
1188 struct lexptr
1189 {
1190   char const *ptr;
1191   idx_t left;
1192 };
1193
1194 static void
1195 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1196 {
1197   ls->ptr = dfa->lex.ptr;
1198   ls->left = dfa->lex.left;
1199   dfa->lex.ptr = s;
1200   dfa->lex.left = strlen (s);
1201 }
1202
1203 static void
1204 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1205 {
1206   dfa->lex.ptr = ls->ptr;
1207   dfa->lex.left = ls->left;
1208 }
1209
1210 static token
1211 lex (struct dfa *dfa)
1212 {
1213   bool backslash = false;
1214
1215   /* Basic plan: We fetch a character.  If it's a backslash,
1216      we set the backslash flag and go through the loop again.
1217      On the plus side, this avoids having a duplicate of the
1218      main switch inside the backslash case.  On the minus side,
1219      it means that just about every case tests the backslash flag.  */
1220   for (int i = 0; ; i++)
1221     {
1222       /* This loop should consume at most a backslash and some other
1223          character.  */
1224       if (2 <= i)
1225         abort ();
1226
1227       if (! dfa->lex.left)
1228         return dfa->lex.lasttok = END;
1229       int c = fetch_wc (dfa);
1230
1231       switch (c)
1232         {
1233         case '\\':
1234           if (backslash)
1235             goto normal_char;
1236           if (dfa->lex.left == 0)
1237             dfaerror (_("unfinished \\ escape"));
1238           backslash = true;
1239           break;
1240
1241         case '^':
1242           if (backslash)
1243             goto normal_char;
1244           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1245               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1246               || dfa->lex.lasttok == OR)
1247             return dfa->lex.lasttok = BEGLINE;
1248           goto normal_char;
1249
1250         case '$':
1251           if (backslash)
1252             goto normal_char;
1253           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1254               || dfa->lex.left == 0
1255               || ((dfa->lex.left
1256                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1257                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1258                                    & (dfa->lex.ptr[0] == '\\')]
1259                       == ')'))
1260               || ((dfa->lex.left
1261                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1262                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1263                                    & (dfa->lex.ptr[0] == '\\')]
1264                       == '|'))
1265               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267             return dfa->lex.lasttok = ENDLINE;
1268           goto normal_char;
1269
1270         case '1':
1271         case '2':
1272         case '3':
1273         case '4':
1274         case '5':
1275         case '6':
1276         case '7':
1277         case '8':
1278         case '9':
1279           if (!backslash)
1280             goto normal_char;
1281           if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1282             goto stray_backslash;
1283
1284           dfa->lex.laststart = false;
1285           return dfa->lex.lasttok = BACKREF;
1286
1287         case '`':
1288           if (!backslash)
1289             goto normal_char;
1290           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1291             goto stray_backslash;
1292
1293           /* FIXME: should be beginning of string */
1294           return dfa->lex.lasttok = BEGLINE;
1295
1296         case '\'':
1297           if (!backslash)
1298             goto normal_char;
1299           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1300             goto stray_backslash;
1301
1302           /* FIXME: should be end of string */
1303           return dfa->lex.lasttok = ENDLINE;
1304
1305         case '<':
1306           if (!backslash)
1307             goto normal_char;
1308           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1309             goto stray_backslash;
1310
1311           return dfa->lex.lasttok = BEGWORD;
1312
1313         case '>':
1314           if (!backslash)
1315             goto normal_char;
1316           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1317             goto stray_backslash;
1318
1319           return dfa->lex.lasttok = ENDWORD;
1320
1321         case 'b':
1322           if (!backslash)
1323             goto normal_char;
1324           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1325             goto stray_backslash;
1326
1327           return dfa->lex.lasttok = LIMWORD;
1328
1329         case 'B':
1330           if (!backslash)
1331             goto normal_char;
1332           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1333             goto stray_backslash;
1334
1335           return dfa->lex.lasttok = NOTLIMWORD;
1336
1337         case '?':
1338           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1339             goto default_case;
1340           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1341             goto normal_char;
1342           if (dfa->lex.laststart)
1343             {
1344               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1345                 goto default_case;
1346               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1347                 dfawarn (_("? at start of expression"));
1348             }
1349           return dfa->lex.lasttok = QMARK;
1350
1351         case '*':
1352           if (backslash)
1353             goto normal_char;
1354           if (dfa->lex.laststart)
1355             {
1356               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1357                 goto default_case;
1358               if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1359                 dfawarn (_("* at start of expression"));
1360             }
1361           return dfa->lex.lasttok = STAR;
1362
1363         case '+':
1364           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1365             goto default_case;
1366           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1367             goto normal_char;
1368           if (dfa->lex.laststart)
1369             {
1370               if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1371                 goto default_case;
1372               if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1373                 dfawarn (_("+ at start of expression"));
1374             }
1375           return dfa->lex.lasttok = PLUS;
1376
1377         case '{':
1378           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1379             goto default_case;
1380           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1381             goto normal_char;
1382
1383           /* Cases:
1384              {M} - exact count
1385              {M,} - minimum count, maximum is infinity
1386              {,N} - 0 through N
1387              {,} - 0 to infinity (same as '*')
1388              {M,N} - M through N */
1389           {
1390             char const *p = dfa->lex.ptr;
1391             char const *lim = p + dfa->lex.left;
1392             dfa->lex.minrep = dfa->lex.maxrep = -1;
1393             for (; p != lim && c_isdigit (*p); p++)
1394               dfa->lex.minrep = (dfa->lex.minrep < 0
1395                                  ? *p - '0'
1396                                  : MIN (RE_DUP_MAX + 1,
1397                                         dfa->lex.minrep * 10 + *p - '0'));
1398             if (p != lim)
1399               {
1400                 if (*p != ',')
1401                   dfa->lex.maxrep = dfa->lex.minrep;
1402                 else
1403                   {
1404                     if (dfa->lex.minrep < 0)
1405                       dfa->lex.minrep = 0;
1406                     while (++p != lim && c_isdigit (*p))
1407                       dfa->lex.maxrep
1408                         = (dfa->lex.maxrep < 0
1409                            ? *p - '0'
1410                            : MIN (RE_DUP_MAX + 1,
1411                                   dfa->lex.maxrep * 10 + *p - '0'));
1412                   }
1413               }
1414             bool invalid_content
1415               = ! ((! backslash || (p != lim && *p++ == '\\'))
1416                    && p != lim && *p++ == '}'
1417                    && 0 <= dfa->lex.minrep
1418                    && (dfa->lex.maxrep < 0
1419                        || dfa->lex.minrep <= dfa->lex.maxrep));
1420             if (invalid_content
1421                 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1422               goto normal_char;
1423             if (dfa->lex.laststart)
1424               {
1425                 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1426                   goto default_case;
1427                 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1428                   dfawarn (_("{...} at start of expression"));
1429               }
1430             if (invalid_content)
1431               dfaerror (_("invalid content of \\{\\}"));
1432             if (RE_DUP_MAX < dfa->lex.maxrep)
1433               dfaerror (_("regular expression too big"));
1434             dfa->lex.ptr = p;
1435             dfa->lex.left = lim - p;
1436           }
1437           dfa->lex.laststart = false;
1438           return dfa->lex.lasttok = REPMN;
1439
1440         case '|':
1441           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1442             goto default_case;
1443           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1444             goto normal_char;
1445           dfa->lex.laststart = true;
1446           return dfa->lex.lasttok = OR;
1447
1448         case '\n':
1449           if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1450             goto default_case;
1451           if (backslash)
1452             goto normal_char;
1453           dfa->lex.laststart = true;
1454           return dfa->lex.lasttok = OR;
1455
1456         case '(':
1457           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1458             goto normal_char;
1459           dfa->lex.parens++;
1460           dfa->lex.laststart = true;
1461           return dfa->lex.lasttok = LPAREN;
1462
1463         case ')':
1464           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1465             goto normal_char;
1466           if (dfa->lex.parens == 0
1467               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1468             goto normal_char;
1469           dfa->lex.parens--;
1470           dfa->lex.laststart = false;
1471           return dfa->lex.lasttok = RPAREN;
1472
1473         case '.':
1474           if (backslash)
1475             goto normal_char;
1476           if (dfa->canychar < 0)
1477             {
1478               charclass ccl;
1479               fillset (&ccl);
1480               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1481                 clrbit ('\n', &ccl);
1482               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1483                 clrbit ('\0', &ccl);
1484               if (dfa->localeinfo.multibyte)
1485                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1486                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1487                     clrbit (c2, &ccl);
1488               dfa->canychar = charclass_index (dfa, &ccl);
1489             }
1490           dfa->lex.laststart = false;
1491           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1492                                      ? ANYCHAR
1493                                      : CSET + dfa->canychar);
1494
1495         case 's':
1496         case 'S':
1497           if (!backslash)
1498             goto normal_char;
1499           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1500             goto stray_backslash;
1501
1502           if (!dfa->localeinfo.multibyte)
1503             {
1504               charclass ccl;
1505               zeroset (&ccl);
1506               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1507                 if (isspace (c2))
1508                   setbit (c2, &ccl);
1509               if (c == 'S')
1510                 notset (&ccl);
1511               dfa->lex.laststart = false;
1512               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1513             }
1514
1515           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1516              add_utf8_anychar, makes sense.  */
1517
1518           /* \s and \S are documented to be equivalent to [[:space:]] and
1519              [^[:space:]] respectively, so tell the lexer to process those
1520              strings, each minus its "already processed" '['.  */
1521           {
1522             struct lexptr ls;
1523             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1524             dfa->lex.lasttok = parse_bracket_exp (dfa);
1525             pop_lex_state (dfa, &ls);
1526           }
1527
1528           dfa->lex.laststart = false;
1529           return dfa->lex.lasttok;
1530
1531         case 'w':
1532         case 'W':
1533           if (!backslash)
1534             goto normal_char;
1535           if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1536             goto stray_backslash;
1537
1538           if (!dfa->localeinfo.multibyte)
1539             {
1540               charclass ccl;
1541               zeroset (&ccl);
1542               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1543                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1544                   setbit (c2, &ccl);
1545               if (c == 'W')
1546                 notset (&ccl);
1547               dfa->lex.laststart = false;
1548               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1549             }
1550
1551           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1552              add_utf8_anychar, makes sense.  */
1553
1554           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1555              [^_[:alnum:]] respectively, so tell the lexer to process those
1556              strings, each minus its "already processed" '['.  */
1557           {
1558             struct lexptr ls;
1559             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1560             dfa->lex.lasttok = parse_bracket_exp (dfa);
1561             pop_lex_state (dfa, &ls);
1562           }
1563
1564           dfa->lex.laststart = false;
1565           return dfa->lex.lasttok;
1566
1567         case '[':
1568           if (backslash)
1569             goto normal_char;
1570           dfa->lex.laststart = false;
1571           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1572
1573         default:
1574         default_case:
1575           if (!backslash)
1576             goto normal_char;
1577         stray_backslash:
1578           if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1579             {
1580               char const *msg;
1581               char msgbuf[100];
1582               if (!c32isprint (dfa->lex.wctok))
1583                 msg = _("stray \\ before unprintable character");
1584               else if (c32isspace (dfa->lex.wctok))
1585                 msg = _("stray \\ before white space");
1586               else
1587                 {
1588                   char buf[MB_LEN_MAX + 1];
1589                   mbstate_t s;
1590                   mbszero (&s);
1591                   size_t stored_bytes = c32rtomb (buf, dfa->lex.wctok, &s);
1592                   if (stored_bytes < (size_t) -1)
1593                     {
1594                       buf[stored_bytes] = '\0';
1595                       int n = snprintf (msgbuf, sizeof msgbuf,
1596                                         _("stray \\ before %s"), buf);
1597                       msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1598                     }
1599                   else
1600                     msg = _("stray \\");
1601                 }
1602               dfawarn (msg);
1603             }
1604           FALLTHROUGH;
1605         case ']': case '}':
1606         normal_char:
1607           dfa->lex.laststart = false;
1608           /* For multibyte character sets, folding is done in atom.  Always
1609              return WCHAR.  */
1610           if (dfa->localeinfo.multibyte)
1611             return dfa->lex.lasttok = WCHAR;
1612
1613           if (dfa->syntax.case_fold && isalpha (c))
1614             {
1615               charclass ccl;
1616               zeroset (&ccl);
1617               setbit_case_fold_c (c, &ccl);
1618               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1619             }
1620
1621           return dfa->lex.lasttok = c;
1622         }
1623     }
1624 }
1625
1626 static void
1627 addtok_mb (struct dfa *dfa, token t, char mbprop)
1628 {
1629   if (dfa->talloc == dfa->tindex)
1630     {
1631       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1632                              sizeof *dfa->tokens);
1633       if (dfa->localeinfo.multibyte)
1634         dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1635                                              sizeof *dfa->multibyte_prop);
1636     }
1637   if (dfa->localeinfo.multibyte)
1638     dfa->multibyte_prop[dfa->tindex] = mbprop;
1639   dfa->tokens[dfa->tindex++] = t;
1640
1641   switch (t)
1642     {
1643     case QMARK:
1644     case STAR:
1645     case PLUS:
1646       break;
1647
1648     case CAT:
1649     case OR:
1650       dfa->parse.depth--;
1651       break;
1652
1653     case EMPTY:
1654       dfa->epsilon = true;
1655       goto increment_depth;
1656
1657     case BACKREF:
1658       dfa->fast = false;
1659       goto increment_nleaves;
1660
1661     case BEGLINE:
1662     case ENDLINE:
1663     case BEGWORD:
1664     case ENDWORD:
1665     case LIMWORD:
1666     case NOTLIMWORD:
1667       dfa->epsilon = true;
1668       FALLTHROUGH;
1669     default:
1670     increment_nleaves:
1671       dfa->nleaves++;
1672     increment_depth:
1673       dfa->parse.depth++;
1674       if (dfa->depth < dfa->parse.depth)
1675         dfa->depth = dfa->parse.depth;
1676       break;
1677     }
1678 }
1679
1680 static void addtok_wc (struct dfa *dfa, wint_t wc);
1681
1682 /* Add the given token to the parse tree, maintaining the depth count and
1683    updating the maximum depth if necessary.  */
1684 static void
1685 addtok (struct dfa *dfa, token t)
1686 {
1687   if (dfa->localeinfo.multibyte && t == MBCSET)
1688     {
1689       bool need_or = false;
1690
1691       /* Extract wide characters into alternations for better performance.
1692          This does not require UTF-8.  */
1693       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1694         {
1695           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1696           if (need_or)
1697             addtok (dfa, OR);
1698           need_or = true;
1699         }
1700       dfa->lex.brack.nchars = 0;
1701
1702       /* Wide characters have been handled above, so it is possible
1703          that the set is empty now.  Do nothing in that case.  */
1704       if (dfa->lex.brack.cset != -1)
1705         {
1706           addtok (dfa, CSET + dfa->lex.brack.cset);
1707           if (need_or)
1708             addtok (dfa, OR);
1709         }
1710     }
1711   else
1712     {
1713       addtok_mb (dfa, t, 3);
1714     }
1715 }
1716
1717 /* We treat a multibyte character as a single atom, so that DFA
1718    can treat a multibyte character as a single expression.
1719
1720    e.g., we construct the following tree from "<mb1><mb2>".
1721    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1722    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1723 static void
1724 addtok_wc (struct dfa *dfa, wint_t wc)
1725 {
1726   unsigned char buf[MB_LEN_MAX];
1727   mbstate_t s;
1728   mbszero (&s);
1729   size_t stored_bytes = c32rtomb ((char *) buf, wc, &s);
1730   int buflen;
1731
1732   if (stored_bytes != (size_t) -1)
1733     buflen = stored_bytes;
1734   else
1735     {
1736       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1737          the addtok_mb call altogether can corrupt the heap.  */
1738       buflen = 1;
1739       buf[0] = 0;
1740     }
1741
1742   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1743   for (int i = 1; i < buflen; i++)
1744     {
1745       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1746       addtok (dfa, CAT);
1747     }
1748 }
1749
1750 static void
1751 add_utf8_anychar (struct dfa *dfa)
1752 {
1753   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1754      UTF-8 byte sequence has been defined as follows:
1755
1756      ([\x00-\x7f]
1757      |[\xc2-\xdf][\x80-\xbf]
1758      |[\xe0][\xa0-\xbf][\x80-\xbf]
1759      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1760      |[\xed][\x80-\x9f][\x80-\xbf]
1761      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1762      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1763      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1764
1765      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1766      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1767      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1768      H = [\x80-\x9f], I = [\xf0],
1769      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1770
1771      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1772
1773   /* Mnemonics for classes containing two or more bytes.  */
1774   enum { A, B, C, E, F, H, J, K, M };
1775
1776   /* Mnemonics for single-byte tokens.  */
1777   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1778
1779   static charclass const utf8_classes[] = {
1780     /* A. 00-7f: 1-byte sequence.  */
1781     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1782
1783     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1784     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1785
1786     /* C. 80-bf: non-leading bytes.  */
1787     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1788
1789     /* D. e0 (just a token).  */
1790
1791     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1792     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1793
1794     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1795     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1796
1797     /* G. ed (just a token).  */
1798
1799     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1800     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1801
1802     /* I. f0 (just a token).  */
1803
1804     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1805     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1806
1807     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1808     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1809
1810     /* L. f4 (just a token).  */
1811
1812     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1813     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1814   };
1815
1816   /* Define the character classes that are needed below.  */
1817   if (dfa->utf8_anychar_classes[0] == 0)
1818     {
1819       charclass c = utf8_classes[0];
1820       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1821         clrbit ('\n', &c);
1822       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1823         clrbit ('\0', &c);
1824       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1825
1826       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1827         dfa->utf8_anychar_classes[i]
1828           = CSET + charclass_index (dfa, &utf8_classes[i]);
1829     }
1830
1831   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1832      The token buffer is in reverse Polish order, so we get
1833      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1834       C CAT OR C CAT OR C CAT OR".  */
1835   addtok (dfa, dfa->utf8_anychar_classes[A]);
1836   addtok (dfa, dfa->utf8_anychar_classes[B]);
1837   addtok (dfa, D_token);
1838   addtok (dfa, dfa->utf8_anychar_classes[E]);
1839   addtok (dfa, CAT);
1840   addtok (dfa, OR);
1841   addtok (dfa, G_token);
1842   addtok (dfa, dfa->utf8_anychar_classes[H]);
1843   addtok (dfa, CAT);
1844   addtok (dfa, OR);
1845   addtok (dfa, dfa->utf8_anychar_classes[F]);
1846   addtok (dfa, I_token);
1847   addtok (dfa, dfa->utf8_anychar_classes[J]);
1848   addtok (dfa, CAT);
1849   addtok (dfa, OR);
1850   addtok (dfa, L_token);
1851   addtok (dfa, dfa->utf8_anychar_classes[M]);
1852   addtok (dfa, CAT);
1853   addtok (dfa, OR);
1854   addtok (dfa, dfa->utf8_anychar_classes[K]);
1855   for (int i = 0; i < 3; i++)
1856     {
1857       addtok (dfa, dfa->utf8_anychar_classes[C]);
1858       addtok (dfa, CAT);
1859       addtok (dfa, OR);
1860     }
1861 }
1862
1863 /* The grammar understood by the parser is as follows.
1864
1865    regexp:
1866      regexp OR branch
1867      branch
1868
1869    branch:
1870      branch closure
1871      closure
1872
1873    closure:
1874      closure QMARK
1875      closure STAR
1876      closure PLUS
1877      closure REPMN
1878      atom
1879
1880    atom:
1881      <normal character>
1882      <multibyte character>
1883      ANYCHAR
1884      MBCSET
1885      CSET
1886      BACKREF
1887      BEGLINE
1888      ENDLINE
1889      BEGWORD
1890      ENDWORD
1891      LIMWORD
1892      NOTLIMWORD
1893      LPAREN regexp RPAREN
1894      <empty>
1895
1896    The parser builds a parse tree in postfix form in an array of tokens.  */
1897
1898 static void
1899 atom (struct dfa *dfa)
1900 {
1901   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1902       || dfa->parse.tok >= CSET
1903       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1904       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1905       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1906       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1907       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1908     {
1909       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1910         {
1911           /* For UTF-8 expand the period to a series of CSETs that define a
1912              valid UTF-8 character.  This avoids using the slow multibyte
1913              path.  I'm pretty sure it would be both profitable and correct to
1914              do it for any encoding; however, the optimization must be done
1915              manually as it is done above in add_utf8_anychar.  So, let's
1916              start with UTF-8: it is the most used, and the structure of the
1917              encoding makes the correctness more obvious.  */
1918           add_utf8_anychar (dfa);
1919         }
1920       else
1921         addtok (dfa, dfa->parse.tok);
1922       dfa->parse.tok = lex (dfa);
1923     }
1924   else if (dfa->parse.tok == WCHAR)
1925     {
1926       if (dfa->lex.wctok == WEOF)
1927         addtok (dfa, BACKREF);
1928       else
1929         {
1930           addtok_wc (dfa, dfa->lex.wctok);
1931
1932           if (dfa->syntax.case_fold)
1933             {
1934               char32_t folded[CASE_FOLDED_BUFSIZE];
1935               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1936               for (int i = 0; i < n; i++)
1937                 {
1938                   addtok_wc (dfa, folded[i]);
1939                   addtok (dfa, OR);
1940                 }
1941             }
1942         }
1943
1944       dfa->parse.tok = lex (dfa);
1945     }
1946   else if (dfa->parse.tok == LPAREN)
1947     {
1948       dfa->parse.tok = lex (dfa);
1949       regexp (dfa);
1950       if (dfa->parse.tok != RPAREN)
1951         dfaerror (_("unbalanced ("));
1952       dfa->parse.tok = lex (dfa);
1953     }
1954   else
1955     addtok (dfa, EMPTY);
1956 }
1957
1958 /* Return the number of tokens in the given subexpression.  */
1959 static idx_t _GL_ATTRIBUTE_PURE
1960 nsubtoks (struct dfa const *dfa, idx_t tindex)
1961 {
1962   switch (dfa->tokens[tindex - 1])
1963     {
1964     default:
1965       return 1;
1966     case QMARK:
1967     case STAR:
1968     case PLUS:
1969       return 1 + nsubtoks (dfa, tindex - 1);
1970     case CAT:
1971     case OR:
1972       {
1973         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1974         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1975       }
1976     }
1977 }
1978
1979 /* Copy the given subexpression to the top of the tree.  */
1980 static void
1981 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1982 {
1983   if (dfa->localeinfo.multibyte)
1984     for (idx_t i = 0; i < ntokens; i++)
1985       addtok_mb (dfa, dfa->tokens[tindex + i],
1986                  dfa->multibyte_prop[tindex + i]);
1987   else
1988     for (idx_t i = 0; i < ntokens; i++)
1989       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1990 }
1991
1992 static void
1993 closure (struct dfa *dfa)
1994 {
1995   atom (dfa);
1996   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1997          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1998     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1999       {
2000         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
2001         idx_t tindex = dfa->tindex - ntokens;
2002         if (dfa->lex.maxrep < 0)
2003           addtok (dfa, PLUS);
2004         if (dfa->lex.minrep == 0)
2005           addtok (dfa, QMARK);
2006         int i;
2007         for (i = 1; i < dfa->lex.minrep; i++)
2008           {
2009             copytoks (dfa, tindex, ntokens);
2010             addtok (dfa, CAT);
2011           }
2012         for (; i < dfa->lex.maxrep; i++)
2013           {
2014             copytoks (dfa, tindex, ntokens);
2015             addtok (dfa, QMARK);
2016             addtok (dfa, CAT);
2017           }
2018         dfa->parse.tok = lex (dfa);
2019       }
2020     else if (dfa->parse.tok == REPMN)
2021       {
2022         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2023         dfa->parse.tok = lex (dfa);
2024         closure (dfa);
2025       }
2026     else
2027       {
2028         addtok (dfa, dfa->parse.tok);
2029         dfa->parse.tok = lex (dfa);
2030       }
2031 }
2032
2033 static void
2034 branch (struct dfa* dfa)
2035 {
2036   closure (dfa);
2037   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2038          && dfa->parse.tok >= 0)
2039     {
2040       closure (dfa);
2041       addtok (dfa, CAT);
2042     }
2043 }
2044
2045 static void
2046 regexp (struct dfa *dfa)
2047 {
2048   branch (dfa);
2049   while (dfa->parse.tok == OR)
2050     {
2051       dfa->parse.tok = lex (dfa);
2052       branch (dfa);
2053       addtok (dfa, OR);
2054     }
2055 }
2056
2057 /* Parse a string S of length LEN into D.  S can include NUL characters.
2058    This is the main entry point for the parser.  */
2059 void
2060 dfaparse (char const *s, idx_t len, struct dfa *d)
2061 {
2062   d->lex.ptr = s;
2063   d->lex.left = len;
2064   d->lex.lasttok = END;
2065   d->lex.laststart = true;
2066
2067   if (!d->syntax.syntax_bits_set)
2068     dfaerror (_("no syntax specified"));
2069
2070   if (!d->nregexps)
2071     addtok (d, BEG);
2072
2073   d->parse.tok = lex (d);
2074   d->parse.depth = d->depth;
2075
2076   regexp (d);
2077
2078   if (d->parse.tok != END)
2079     dfaerror (_("unbalanced )"));
2080
2081   addtok (d, END - d->nregexps);
2082   addtok (d, CAT);
2083
2084   if (d->nregexps)
2085     addtok (d, OR);
2086
2087   ++d->nregexps;
2088 }
2089
2090 /* Some primitives for operating on sets of positions.  */
2091
2092 /* Copy one set to another.  */
2093 static void
2094 copy (position_set const *src, position_set *dst)
2095 {
2096   if (dst->alloc < src->nelem)
2097     {
2098       free (dst->elems);
2099       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2100                             sizeof *dst->elems);
2101     }
2102   dst->nelem = src->nelem;
2103   if (src->nelem != 0)
2104     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2105 }
2106
2107 static void
2108 alloc_position_set (position_set *s, idx_t size)
2109 {
2110   s->elems = xnmalloc (size, sizeof *s->elems);
2111   s->alloc = size;
2112   s->nelem = 0;
2113 }
2114
2115 /* Insert position P in set S.  S is maintained in sorted order on
2116    decreasing index.  If there is already an entry in S with P.index
2117    then merge (logically-OR) P's constraints into the one in S.
2118    S->elems must point to an array large enough to hold the resulting set.  */
2119 static void
2120 insert (position p, position_set *s)
2121 {
2122   idx_t count = s->nelem;
2123   idx_t lo = 0, hi = count;
2124   while (lo < hi)
2125     {
2126       idx_t mid = (lo + hi) >> 1;
2127       if (s->elems[mid].index < p.index)
2128         lo = mid + 1;
2129       else if (s->elems[mid].index == p.index)
2130         {
2131           s->elems[mid].constraint |= p.constraint;
2132           return;
2133         }
2134       else
2135         hi = mid;
2136     }
2137
2138   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2139   for (idx_t i = count; i > lo; i--)
2140     s->elems[i] = s->elems[i - 1];
2141   s->elems[lo] = p;
2142   ++s->nelem;
2143 }
2144
2145 static void
2146 append (position p, position_set *s)
2147 {
2148   idx_t count = s->nelem;
2149   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2150   s->elems[s->nelem++] = p;
2151 }
2152
2153 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2154    result is as if the positions of S1, and of S2 with the additional
2155    constraint C2, were inserted into an initially empty set.  */
2156 static void
2157 merge_constrained (position_set const *s1, position_set const *s2,
2158                    unsigned int c2, position_set *m)
2159 {
2160   idx_t i = 0, j = 0;
2161
2162   if (m->alloc - s1->nelem < s2->nelem)
2163     {
2164       free (m->elems);
2165       m->alloc = s1->nelem;
2166       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2167     }
2168   m->nelem = 0;
2169   while (i < s1->nelem || j < s2->nelem)
2170     if (! (j < s2->nelem)
2171         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2172       {
2173         unsigned int c = ((i < s1->nelem && j < s2->nelem
2174                            && s1->elems[i].index == s2->elems[j].index)
2175                           ? s2->elems[j++].constraint & c2
2176                           : 0);
2177         m->elems[m->nelem].index = s1->elems[i].index;
2178         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2179       }
2180     else
2181       {
2182         if (s2->elems[j].constraint & c2)
2183           {
2184             m->elems[m->nelem].index = s2->elems[j].index;
2185             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2186           }
2187         j++;
2188       }
2189 }
2190
2191 /* Merge two sets of positions into a third.  The result is exactly as if
2192    the positions of both sets were inserted into an initially empty set.  */
2193 static void
2194 merge (position_set const *s1, position_set const *s2, position_set *m)
2195 {
2196   merge_constrained (s1, s2, -1, m);
2197 }
2198
2199 /* Merge into DST all the elements of SRC, possibly destroying
2200    the contents of the temporary M.  */
2201 static void
2202 merge2 (position_set *dst, position_set const *src, position_set *m)
2203 {
2204   if (src->nelem < 4)
2205     {
2206       for (idx_t i = 0; i < src->nelem; i++)
2207         insert (src->elems[i], dst);
2208     }
2209    else
2210     {
2211       merge (src, dst, m);
2212       copy (m, dst);
2213     }
2214 }
2215
2216 /* Delete a position from a set.  Return the nonzero constraint of the
2217    deleted position, or zero if there was no such position.  */
2218 static unsigned int
2219 delete (idx_t del, position_set *s)
2220 {
2221   idx_t count = s->nelem;
2222   idx_t lo = 0, hi = count;
2223   while (lo < hi)
2224     {
2225       idx_t mid = (lo + hi) >> 1;
2226       if (s->elems[mid].index < del)
2227         lo = mid + 1;
2228       else if (s->elems[mid].index == del)
2229         {
2230           unsigned int c = s->elems[mid].constraint;
2231           idx_t i;
2232           for (i = mid; i + 1 < count; i++)
2233             s->elems[i] = s->elems[i + 1];
2234           s->nelem = i;
2235           return c;
2236         }
2237       else
2238         hi = mid;
2239     }
2240   return 0;
2241 }
2242
2243 /* Replace a position with the followed set.  */
2244 static void
2245 replace (position_set *dst, idx_t del, position_set *add,
2246          unsigned int constraint, position_set *tmp)
2247 {
2248   unsigned int c = delete (del, dst) & constraint;
2249
2250   if (c)
2251     {
2252       copy (dst, tmp);
2253       merge_constrained (tmp, add, c, dst);
2254     }
2255 }
2256
2257 /* Find the index of the state corresponding to the given position set with
2258    the given preceding context, or create a new state if there is no such
2259    state.  Context tells whether we got here on a newline or letter.  */
2260 static state_num
2261 state_index (struct dfa *d, position_set const *s, int context)
2262 {
2263   size_t hash = 0;
2264   int constraint = 0;
2265   state_num i;
2266
2267   for (i = 0; i < s->nelem; ++i)
2268     {
2269       idx_t ind = s->elems[i].index;
2270       hash ^= ind + s->elems[i].constraint;
2271     }
2272
2273   /* Try to find a state that exactly matches the proposed one.  */
2274   for (i = 0; i < d->sindex; ++i)
2275     {
2276       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2277           || context != d->states[i].context)
2278         continue;
2279       state_num j;
2280       for (j = 0; j < s->nelem; ++j)
2281         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2282             || s->elems[j].index != d->states[i].elems.elems[j].index)
2283           break;
2284       if (j == s->nelem)
2285         return i;
2286     }
2287
2288 #ifdef DEBUG
2289   fprintf (stderr, "new state %td\n nextpos:", i);
2290   for (state_num j = 0; j < s->nelem; j++)
2291     {
2292       fprintf (stderr, " %td:", s->elems[j].index);
2293       prtok (d->tokens[s->elems[j].index]);
2294     }
2295   fprintf (stderr, "\n context:");
2296   if (context ^ CTX_ANY)
2297     {
2298       if (context & CTX_NONE)
2299         fprintf (stderr, " CTX_NONE");
2300       if (context & CTX_LETTER)
2301         fprintf (stderr, " CTX_LETTER");
2302       if (context & CTX_NEWLINE)
2303         fprintf (stderr, " CTX_NEWLINE");
2304     }
2305   else
2306     fprintf (stderr, " CTX_ANY");
2307   fprintf (stderr, "\n");
2308 #endif
2309
2310   for (state_num j = 0; j < s->nelem; j++)
2311     {
2312       int c = d->constraints[s->elems[j].index];
2313
2314       if (c != 0)
2315         {
2316           if (succeeds_in_context (c, context, CTX_ANY))
2317             constraint |= c;
2318         }
2319       else if (d->tokens[s->elems[j].index] == BACKREF)
2320         constraint = NO_CONSTRAINT;
2321     }
2322
2323
2324   /* Create a new state.  */
2325   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2326                              sizeof *d->states);
2327   d->states[i].hash = hash;
2328   alloc_position_set (&d->states[i].elems, s->nelem);
2329   copy (s, &d->states[i].elems);
2330   d->states[i].context = context;
2331   d->states[i].constraint = constraint;
2332   d->states[i].mbps.nelem = 0;
2333   d->states[i].mbps.elems = NULL;
2334   d->states[i].mb_trindex = -1;
2335
2336   ++d->sindex;
2337
2338   return i;
2339 }
2340
2341 /* Find the epsilon closure of D's set of positions.  If any position of the set
2342    contains a symbol that matches the empty string in some context, replace
2343    that position with the elements of its follow labeled with an appropriate
2344    constraint.  Repeat exhaustively until no funny positions are left.
2345    S->elems must be large enough to hold the result.  BACKWARD is D's
2346    backward set; use and update it too.  */
2347 static void
2348 epsclosure (struct dfa const *d, position_set *backward)
2349 {
2350   position_set tmp;
2351   alloc_position_set (&tmp, d->nleaves);
2352   for (idx_t i = 0; i < d->tindex; i++)
2353     if (0 < d->follows[i].nelem)
2354       {
2355         unsigned int constraint;
2356         switch (d->tokens[i])
2357           {
2358           default:
2359             continue;
2360
2361           case BEGLINE:
2362             constraint = BEGLINE_CONSTRAINT;
2363             break;
2364           case ENDLINE:
2365             constraint = ENDLINE_CONSTRAINT;
2366             break;
2367           case BEGWORD:
2368             constraint = BEGWORD_CONSTRAINT;
2369             break;
2370           case ENDWORD:
2371             constraint = ENDWORD_CONSTRAINT;
2372             break;
2373           case LIMWORD:
2374             constraint = LIMWORD_CONSTRAINT;
2375             break;
2376           case NOTLIMWORD:
2377             constraint = NOTLIMWORD_CONSTRAINT;
2378             break;
2379           case EMPTY:
2380             constraint = NO_CONSTRAINT;
2381             break;
2382           }
2383
2384         delete (i, &d->follows[i]);
2385
2386         for (idx_t j = 0; j < backward[i].nelem; j++)
2387           replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2388                    constraint, &tmp);
2389         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2390           replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2391                    NO_CONSTRAINT, &tmp);
2392       }
2393   free (tmp.elems);
2394 }
2395
2396 /* Returns the set of contexts for which there is at least one
2397    character included in C.  */
2398
2399 static int
2400 charclass_context (struct dfa const *dfa, charclass const *c)
2401 {
2402   int context = 0;
2403
2404   for (int j = 0; j < CHARCLASS_WORDS; j++)
2405     {
2406       if (c->w[j] & dfa->syntax.newline.w[j])
2407         context |= CTX_NEWLINE;
2408       if (c->w[j] & dfa->syntax.letters.w[j])
2409         context |= CTX_LETTER;
2410       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2411         context |= CTX_NONE;
2412     }
2413
2414   return context;
2415 }
2416
2417 /* Returns the contexts on which the position set S depends.  Each context
2418    in the set of returned contexts (let's call it SC) may have a different
2419    follow set than other contexts in SC, and also different from the
2420    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2421    in the complement set will have the same follow set.  */
2422
2423 static int _GL_ATTRIBUTE_PURE
2424 state_separate_contexts (struct dfa *d, position_set const *s)
2425 {
2426   int separate_contexts = 0;
2427
2428   for (idx_t j = 0; j < s->nelem; j++)
2429     separate_contexts |= d->separates[s->elems[j].index];
2430
2431   return separate_contexts;
2432 }
2433
2434 enum
2435 {
2436   /* Single token is repeated.  It is distinguished from non-repeated.  */
2437   OPT_REPEAT = (1 << 0),
2438
2439   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2440      node is not merged.  */
2441   OPT_LPAREN = (1 << 1),
2442
2443   /* Multiple branches are joined.  The node is not merged.  */
2444   OPT_RPAREN = (1 << 2),
2445
2446   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2447      flag is turned on. */
2448   OPT_WALKED = (1 << 3),
2449
2450   /* The node is queued.  The node is not queued again.  */
2451   OPT_QUEUED = (1 << 4)
2452 };
2453
2454 static void
2455 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2456                  position_set *merged)
2457 {
2458   position_set *follows = d->follows;
2459   idx_t nelem = 0;
2460
2461   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2462     {
2463       idx_t sindex = follows[tindex].elems[i].index;
2464
2465       /* Skip the node as pruned in future.  */
2466       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2467       if (iconstraint == 0)
2468         continue;
2469
2470       if (d->tokens[follows[tindex].elems[i].index] <= END)
2471         {
2472           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2473           continue;
2474         }
2475
2476       if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2477         {
2478           idx_t j;
2479
2480           for (j = 0; j < nelem; j++)
2481             {
2482               idx_t dindex = follows[tindex].elems[j].index;
2483
2484               if (dindex == tindex)
2485                 continue;
2486
2487               if (follows[tindex].elems[j].constraint != iconstraint)
2488                 continue;
2489
2490               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2491                 continue;
2492
2493               if (d->tokens[sindex] != d->tokens[dindex])
2494                 continue;
2495
2496               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2497                 continue;
2498
2499               if (flags[sindex] & OPT_REPEAT)
2500                 delete (sindex, &follows[sindex]);
2501
2502               merge2 (&follows[dindex], &follows[sindex], merged);
2503
2504               break;
2505             }
2506
2507           if (j < nelem)
2508             continue;
2509         }
2510
2511       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2512       flags[sindex] |= OPT_QUEUED;
2513     }
2514
2515   follows[tindex].nelem = nelem;
2516 }
2517
2518 static int
2519 compare (const void *a, const void *b)
2520 {
2521   position const *p = a, *q = b;
2522   return (p->index > q->index) - (p->index < q->index);
2523 }
2524
2525 static void
2526 reorder_tokens (struct dfa *d)
2527 {
2528   idx_t nleaves = 0;
2529   ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2530   map[0] = nleaves++;
2531   for (idx_t i = 1; i < d->tindex; i++)
2532     map[i] = -1;
2533
2534   token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2535   position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2536   int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2537   char *multibyte_prop = (d->localeinfo.multibyte
2538                           ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2539                           : NULL);
2540
2541   for (idx_t i = 0; i < d->tindex; i++)
2542     {
2543       if (map[i] < 0)
2544         {
2545           free (d->follows[i].elems);
2546           d->follows[i].elems = NULL;
2547           d->follows[i].nelem = 0;
2548           continue;
2549         }
2550
2551       tokens[map[i]] = d->tokens[i];
2552       follows[map[i]] = d->follows[i];
2553       constraints[map[i]] = d->constraints[i];
2554
2555       if (multibyte_prop != NULL)
2556         multibyte_prop[map[i]] = d->multibyte_prop[i];
2557
2558       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2559         {
2560           if (map[d->follows[i].elems[j].index] == -1)
2561             map[d->follows[i].elems[j].index] = nleaves++;
2562
2563           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2564         }
2565
2566       qsort (d->follows[i].elems, d->follows[i].nelem,
2567              sizeof *d->follows[i].elems, compare);
2568     }
2569
2570   for (idx_t i = 0; i < nleaves; i++)
2571     {
2572       d->tokens[i] = tokens[i];
2573       d->follows[i] = follows[i];
2574       d->constraints[i] = constraints[i];
2575
2576       if (multibyte_prop != NULL)
2577         d->multibyte_prop[i] = multibyte_prop[i];
2578     }
2579
2580   d->tindex = d->nleaves = nleaves;
2581
2582   free (tokens);
2583   free (follows);
2584   free (constraints);
2585   free (multibyte_prop);
2586   free (map);
2587 }
2588
2589 static void
2590 dfaoptimize (struct dfa *d)
2591 {
2592   char *flags = xizalloc (d->tindex);
2593
2594   for (idx_t i = 0; i < d->tindex; i++)
2595     {
2596       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2597         {
2598           if (d->follows[i].elems[j].index == i)
2599             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2600           else if (d->follows[i].elems[j].index < i)
2601             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2602           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2603             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2604           else
2605             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2606         }
2607     }
2608
2609   flags[0] |= OPT_QUEUED;
2610
2611   position_set merged0;
2612   position_set *merged = &merged0;
2613   alloc_position_set (merged, d->nleaves);
2614
2615   d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2616
2617   for (idx_t i = 0; i < d->tindex; i++)
2618     if (flags[i] & OPT_QUEUED)
2619       merge_nfa_state (d, i, flags, merged);
2620
2621   reorder_tokens (d);
2622
2623   free (merged->elems);
2624   free (flags);
2625 }
2626
2627 /* Perform bottom-up analysis on the parse tree, computing various functions.
2628    Note that at this point, we're pretending constructs like \< are real
2629    characters rather than constraints on what can follow them.
2630
2631    Nullable:  A node is nullable if it is at the root of a regexp that can
2632    match the empty string.
2633    *  EMPTY leaves are nullable.
2634    * No other leaf is nullable.
2635    * A QMARK or STAR node is nullable.
2636    * A PLUS node is nullable if its argument is nullable.
2637    * A CAT node is nullable if both its arguments are nullable.
2638    * An OR node is nullable if either argument is nullable.
2639
2640    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2641    that could correspond to the first character of a string matching the
2642    regexp rooted at the given node.
2643    * EMPTY leaves have empty firstpos.
2644    * The firstpos of a nonempty leaf is that leaf itself.
2645    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2646      argument.
2647    * The firstpos of a CAT node is the firstpos of the left argument, union
2648      the firstpos of the right if the left argument is nullable.
2649    * The firstpos of an OR node is the union of firstpos of each argument.
2650
2651    Lastpos:  The lastpos of a node is the set of positions that could
2652    correspond to the last character of a string matching the regexp at
2653    the given node.
2654    * EMPTY leaves have empty lastpos.
2655    * The lastpos of a nonempty leaf is that leaf itself.
2656    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2657      argument.
2658    * The lastpos of a CAT node is the lastpos of its right argument, union
2659      the lastpos of the left if the right argument is nullable.
2660    * The lastpos of an OR node is the union of the lastpos of each argument.
2661
2662    Follow:  The follow of a position is the set of positions that could
2663    correspond to the character following a character matching the node in
2664    a string matching the regexp.  At this point we consider special symbols
2665    that match the empty string in some context to be just normal characters.
2666    Later, if we find that a special symbol is in a follow set, we will
2667    replace it with the elements of its follow, labeled with an appropriate
2668    constraint.
2669    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2670      the follow of every node in the lastpos.
2671    * Every node in the firstpos of the second argument of a CAT node is in
2672      the follow of every node in the lastpos of the first argument.
2673
2674    Because of the postfix representation of the parse tree, the depth-first
2675    analysis is conveniently done by a linear scan with the aid of a stack.
2676    Sets are stored as arrays of the elements, obeying a stack-like allocation
2677    scheme; the number of elements in each set deeper in the stack can be
2678    used to determine the address of a particular set's array.  */
2679 static void
2680 dfaanalyze (struct dfa *d, bool searchflag)
2681 {
2682   /* Array allocated to hold position sets.  */
2683   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2684   /* Firstpos and lastpos elements.  */
2685   position *firstpos = posalloc;
2686   position *lastpos = firstpos + d->nleaves;
2687   position pos;
2688   position_set tmp;
2689
2690   /* Stack for element counts and nullable flags.  */
2691   struct
2692   {
2693     /* Whether the entry is nullable.  */
2694     bool nullable;
2695
2696     /* Counts of firstpos and lastpos sets.  */
2697     idx_t nfirstpos;
2698     idx_t nlastpos;
2699   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2700
2701   position_set merged;          /* Result of merging sets.  */
2702
2703   addtok (d, CAT);
2704   idx_t tindex = d->tindex;
2705
2706 #ifdef DEBUG
2707   fprintf (stderr, "dfaanalyze:\n");
2708   for (idx_t i = 0; i < tindex; i++)
2709     {
2710       fprintf (stderr, " %td:", i);
2711       prtok (d->tokens[i]);
2712     }
2713   putc ('\n', stderr);
2714 #endif
2715
2716   d->searchflag = searchflag;
2717   alloc_position_set (&merged, d->nleaves);
2718   d->follows = xicalloc (tindex, sizeof *d->follows);
2719   position_set *backward
2720     = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2721
2722   for (idx_t i = 0; i < tindex; i++)
2723     {
2724       switch (d->tokens[i])
2725         {
2726         case EMPTY:
2727           /* The empty set is nullable.  */
2728           stk->nullable = true;
2729
2730           /* The firstpos and lastpos of the empty leaf are both empty.  */
2731           stk->nfirstpos = stk->nlastpos = 0;
2732           stk++;
2733           break;
2734
2735         case STAR:
2736         case PLUS:
2737           /* Every element in the lastpos of the argument is in the backward
2738              set of every element in the firstpos.  */
2739           if (d->epsilon)
2740             {
2741               tmp.elems = lastpos - stk[-1].nlastpos;
2742               tmp.nelem = stk[-1].nlastpos;
2743               for (position *p = firstpos - stk[-1].nfirstpos;
2744                    p < firstpos; p++)
2745                 merge2 (&backward[p->index], &tmp, &merged);
2746             }
2747
2748           /* Every element in the firstpos of the argument is in the follow
2749              of every element in the lastpos.  */
2750           {
2751             tmp.elems = firstpos - stk[-1].nfirstpos;
2752             tmp.nelem = stk[-1].nfirstpos;
2753             for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2754               merge2 (&d->follows[p->index], &tmp, &merged);
2755           }
2756           FALLTHROUGH;
2757         case QMARK:
2758           /* A QMARK or STAR node is automatically nullable.  */
2759           if (d->tokens[i] != PLUS)
2760             stk[-1].nullable = true;
2761           break;
2762
2763         case CAT:
2764           /* Every element in the lastpos of the first argument is in
2765              the backward set of every element in the firstpos of the
2766              second argument.  */
2767           if (backward)
2768             {
2769               tmp.nelem = stk[-2].nlastpos;
2770               tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2771               for (position *p = firstpos - stk[-1].nfirstpos;
2772                    p < firstpos; p++)
2773                 merge2 (&backward[p->index], &tmp, &merged);
2774             }
2775
2776           /* Every element in the firstpos of the second argument is in the
2777              follow of every element in the lastpos of the first argument.  */
2778           {
2779             tmp.nelem = stk[-1].nfirstpos;
2780             tmp.elems = firstpos - stk[-1].nfirstpos;
2781             for (position *plim = lastpos - stk[-1].nlastpos,
2782                    *p = plim - stk[-2].nlastpos;
2783                  p < plim; p++)
2784               merge2 (&d->follows[p->index], &tmp, &merged);
2785           }
2786
2787           /* The firstpos of a CAT node is the firstpos of the first argument,
2788              union that of the second argument if the first is nullable.  */
2789           if (stk[-2].nullable)
2790             stk[-2].nfirstpos += stk[-1].nfirstpos;
2791           else
2792             firstpos -= stk[-1].nfirstpos;
2793
2794           /* The lastpos of a CAT node is the lastpos of the second argument,
2795              union that of the first argument if the second is nullable.  */
2796           if (stk[-1].nullable)
2797             stk[-2].nlastpos += stk[-1].nlastpos;
2798           else
2799             {
2800               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2801               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2802                 p[j] = p[j + stk[-2].nlastpos];
2803               lastpos -= stk[-2].nlastpos;
2804               stk[-2].nlastpos = stk[-1].nlastpos;
2805             }
2806
2807           /* A CAT node is nullable if both arguments are nullable.  */
2808           stk[-2].nullable &= stk[-1].nullable;
2809           stk--;
2810           break;
2811
2812         case OR:
2813           /* The firstpos is the union of the firstpos of each argument.  */
2814           stk[-2].nfirstpos += stk[-1].nfirstpos;
2815
2816           /* The lastpos is the union of the lastpos of each argument.  */
2817           stk[-2].nlastpos += stk[-1].nlastpos;
2818
2819           /* An OR node is nullable if either argument is nullable.  */
2820           stk[-2].nullable |= stk[-1].nullable;
2821           stk--;
2822           break;
2823
2824         default:
2825           /* Anything else is a nonempty position.  (Note that special
2826              constructs like \< are treated as nonempty strings here;
2827              an "epsilon closure" effectively makes them nullable later.
2828              Backreferences have to get a real position so we can detect
2829              transitions on them later.  But they are nullable.  */
2830           stk->nullable = d->tokens[i] == BACKREF;
2831
2832           /* This position is in its own firstpos and lastpos.  */
2833           stk->nfirstpos = stk->nlastpos = 1;
2834           stk++;
2835
2836           firstpos->index = lastpos->index = i;
2837           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2838           firstpos++, lastpos++;
2839
2840           break;
2841         }
2842 #ifdef DEBUG
2843       /* ... balance the above nonsyntactic #ifdef goo...  */
2844       fprintf (stderr, "node %td:", i);
2845       prtok (d->tokens[i]);
2846       putc ('\n', stderr);
2847       fprintf (stderr,
2848                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2849       fprintf (stderr, " firstpos:");
2850       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2851         {
2852           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2853           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2854         }
2855       fprintf (stderr, "\n lastpos:");
2856       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2857         {
2858           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2859           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2860         }
2861       putc ('\n', stderr);
2862 #endif
2863     }
2864
2865   if (backward)
2866     {
2867       /* For each follow set that is the follow set of a real position,
2868          replace it with its epsilon closure.  */
2869       epsclosure (d, backward);
2870
2871       for (idx_t i = 0; i < tindex; i++)
2872         free (backward[i].elems);
2873       free (backward);
2874     }
2875
2876   dfaoptimize (d);
2877
2878 #ifdef DEBUG
2879   for (idx_t i = 0; i < tindex; i++)
2880     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2881         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2882         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2883       {
2884         fprintf (stderr, "follows(%td:", i);
2885         prtok (d->tokens[i]);
2886         fprintf (stderr, "):");
2887         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2888           {
2889             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2890             prtok (d->tokens[d->follows[i].elems[j].index]);
2891           }
2892         putc ('\n', stderr);
2893       }
2894 #endif
2895
2896   pos.index = 0;
2897   pos.constraint = NO_CONSTRAINT;
2898
2899   alloc_position_set (&tmp, 1);
2900
2901   append (pos, &tmp);
2902
2903   d->separates = xicalloc (tindex, sizeof *d->separates);
2904
2905   for (idx_t i = 0; i < tindex; i++)
2906     {
2907       if (prev_newline_dependent (d->constraints[i]))
2908         d->separates[i] |= CTX_NEWLINE;
2909       if (prev_letter_dependent (d->constraints[i]))
2910         d->separates[i] |= CTX_LETTER;
2911
2912       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2913         {
2914           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2915             d->separates[i] |= CTX_NEWLINE;
2916           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2917             d->separates[i] |= CTX_LETTER;
2918         }
2919     }
2920
2921   /* Context wanted by some position.  */
2922   int separate_contexts = state_separate_contexts (d, &tmp);
2923
2924   /* Build the initial state.  */
2925   if (separate_contexts & CTX_NEWLINE)
2926     state_index (d, &tmp, CTX_NEWLINE);
2927   d->initstate_notbol = d->min_trcount
2928     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2929   if (separate_contexts & CTX_LETTER)
2930     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2931   d->min_trcount++;
2932   d->trcount = 0;
2933
2934   free (posalloc);
2935   free (stkalloc);
2936   free (merged.elems);
2937   free (tmp.elems);
2938 }
2939
2940 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2941 static void
2942 realloc_trans_if_necessary (struct dfa *d)
2943 {
2944   state_num oldalloc = d->tralloc;
2945   if (oldalloc < d->sindex)
2946     {
2947       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2948       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2949       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2950                            -1, sizeof *realtrans);
2951       realtrans[0] = realtrans[1] = NULL;
2952       d->trans = realtrans + 2;
2953       idx_t newalloc = d->tralloc = newalloc1 - 2;
2954       d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2955       d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2956       d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2957       if (d->localeinfo.multibyte)
2958         {
2959           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2960           realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2961           if (oldalloc == 0)
2962             realtrans[0] = realtrans[1] = NULL;
2963           d->mb_trans = realtrans + 2;
2964         }
2965       for (; oldalloc < newalloc; oldalloc++)
2966         {
2967           d->trans[oldalloc] = NULL;
2968           d->fails[oldalloc] = NULL;
2969           if (d->localeinfo.multibyte)
2970             d->mb_trans[oldalloc] = NULL;
2971         }
2972     }
2973 }
2974
2975 /*
2976    Calculate the transition table for a new state derived from state s
2977    for a compiled dfa d after input character uc, and return the new
2978    state number.
2979
2980    Do not worry about all possible input characters; calculate just the group
2981    of positions that match uc.  Label it with the set of characters that
2982    every position in the group matches (taking into account, if necessary,
2983    preceding context information of s).  Then find the union
2984    of these positions' follows, i.e., the set of positions of the
2985    new state.  For each character in the group's label, set the transition
2986    on this character to be to a state corresponding to the set's positions,
2987    and its associated backward context information, if necessary.
2988
2989    When building a searching matcher, include the positions of state
2990    0 in every state.
2991
2992    The group is constructed by building an equivalence-class
2993    partition of the positions of s.
2994
2995    For each position, find the set of characters C that it matches.  Eliminate
2996    any characters from C that fail on grounds of backward context.
2997
2998    Check whether the group's label L has nonempty
2999    intersection with C.  If L - C is nonempty, create a new group labeled
3000    L - C and having the same positions as the current group, and set L to
3001    the intersection of L and C.  Insert the position in the group, set
3002    C = C - L, and resume scanning.
3003
3004    If after comparing with every group there are characters remaining in C,
3005    create a new group labeled with the characters of C and insert this
3006    position in that group.  */
3007
3008 static state_num
3009 build_state (state_num s, struct dfa *d, unsigned char uc)
3010 {
3011   position_set follows;         /* Union of the follows for each
3012                                    position of the current state.  */
3013   position_set group;           /* Positions that match the input char.  */
3014   position_set tmp;             /* Temporary space for merging sets.  */
3015   state_num state;              /* New state.  */
3016   state_num state_newline;      /* New state on a newline transition.  */
3017   state_num state_letter;       /* New state on a letter transition.  */
3018
3019 #ifdef DEBUG
3020   fprintf (stderr, "build state %td\n", s);
3021 #endif
3022
3023   /* A pointer to the new transition table, and the table itself.  */
3024   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3025   state_num *trans = *ptrans;
3026
3027   if (!trans)
3028     {
3029       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3030          transition tables that can exist at once, other than for
3031          initial states.  Often-used transition tables are quickly
3032          rebuilt, whereas rarely-used ones are cleared away.  */
3033       if (MAX_TRCOUNT <= d->trcount)
3034         {
3035           for (state_num i = d->min_trcount; i < d->tralloc; i++)
3036             {
3037               free (d->trans[i]);
3038               free (d->fails[i]);
3039               d->trans[i] = d->fails[i] = NULL;
3040             }
3041           d->trcount = 0;
3042         }
3043
3044       d->trcount++;
3045       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3046
3047       /* Fill transition table with a default value which means that the
3048          transited state has not been calculated yet.  */
3049       for (int i = 0; i < NOTCHAR; i++)
3050         trans[i] = -2;
3051     }
3052
3053   /* Set up the success bits for this state.  */
3054   d->success[s] = 0;
3055   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3056     d->success[s] |= CTX_NEWLINE;
3057   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3058     d->success[s] |= CTX_LETTER;
3059   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3060     d->success[s] |= CTX_NONE;
3061
3062   alloc_position_set (&follows, d->nleaves);
3063
3064   /* Find the union of the follows of the positions of the group.
3065      This is a hideously inefficient loop.  Fix it someday.  */
3066   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3067     for (idx_t k = 0;
3068          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3069       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3070               &follows);
3071
3072   /* Positions that match the input char.  */
3073   alloc_position_set (&group, d->nleaves);
3074
3075   /* The group's label.  */
3076   charclass label;
3077   fillset (&label);
3078
3079   for (idx_t i = 0; i < follows.nelem; i++)
3080     {
3081       charclass matches;            /* Set of matching characters.  */
3082       position pos = follows.elems[i];
3083       bool matched = false;
3084       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3085         {
3086           zeroset (&matches);
3087           setbit (d->tokens[pos.index], &matches);
3088           if (d->tokens[pos.index] == uc)
3089             matched = true;
3090         }
3091       else if (d->tokens[pos.index] >= CSET)
3092         {
3093           matches = d->charclasses[d->tokens[pos.index] - CSET];
3094           if (tstbit (uc, &matches))
3095             matched = true;
3096         }
3097       else if (d->tokens[pos.index] == ANYCHAR)
3098         {
3099           matches = d->charclasses[d->canychar];
3100           if (tstbit (uc, &matches))
3101             matched = true;
3102
3103           /* ANYCHAR must match with a single character, so we must put
3104              it to D->states[s].mbps which contains the positions which
3105              can match with a single character not a byte.  If all
3106              positions which has ANYCHAR does not depend on context of
3107              next character, we put the follows instead of it to
3108              D->states[s].mbps to optimize.  */
3109           if (succeeds_in_context (pos.constraint, d->states[s].context,
3110                                    CTX_NONE))
3111             {
3112               if (d->states[s].mbps.nelem == 0)
3113                 alloc_position_set (&d->states[s].mbps, 1);
3114               insert (pos, &d->states[s].mbps);
3115             }
3116         }
3117       else
3118         continue;
3119
3120       /* Some characters may need to be eliminated from matches because
3121          they fail in the current context.  */
3122       if (pos.constraint != NO_CONSTRAINT)
3123         {
3124           if (!succeeds_in_context (pos.constraint,
3125                                     d->states[s].context, CTX_NEWLINE))
3126             for (int j = 0; j < CHARCLASS_WORDS; j++)
3127               matches.w[j] &= ~d->syntax.newline.w[j];
3128           if (!succeeds_in_context (pos.constraint,
3129                                     d->states[s].context, CTX_LETTER))
3130             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3131               matches.w[j] &= ~d->syntax.letters.w[j];
3132           if (!succeeds_in_context (pos.constraint,
3133                                     d->states[s].context, CTX_NONE))
3134             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3135               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3136
3137           /* If there are no characters left, there's no point in going on.  */
3138           if (emptyset (&matches))
3139             continue;
3140
3141           /* If we have reset the bit that made us declare "matched", reset
3142              that indicator, too.  This is required to avoid an infinite loop
3143              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3144           if (!tstbit (uc, &matches))
3145             matched = false;
3146         }
3147
3148 #ifdef DEBUG
3149       fprintf (stderr, " nextpos %td:", pos.index);
3150       prtok (d->tokens[pos.index]);
3151       fprintf (stderr, " of");
3152       for (unsigned j = 0; j < NOTCHAR; j++)
3153         if (tstbit (j, &matches))
3154           fprintf (stderr, " 0x%02x", j);
3155       fprintf (stderr, "\n");
3156 #endif
3157
3158       if (matched)
3159         {
3160           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3161             label.w[k] &= matches.w[k];
3162           append (pos, &group);
3163         }
3164       else
3165         {
3166           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3167             label.w[k] &= ~matches.w[k];
3168         }
3169     }
3170
3171   alloc_position_set (&tmp, d->nleaves);
3172
3173   if (group.nelem > 0)
3174     {
3175       /* If we are building a searching matcher, throw in the positions
3176          of state 0 as well, if possible.  */
3177       if (d->searchflag)
3178         {
3179           /* If a token in follows.elems is not 1st byte of a multibyte
3180              character, or the states of follows must accept the bytes
3181              which are not 1st byte of the multibyte character.
3182              Then, if a state of follows encounters a byte, it must not be
3183              a 1st byte of a multibyte character nor a single byte character.
3184              In this case, do not add state[0].follows to next state, because
3185              state[0] must accept 1st-byte.
3186
3187              For example, suppose <sb a> is a certain single byte character,
3188              <mb A> is a certain multibyte character, and the codepoint of
3189              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3190              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3191              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3192              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3193              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3194              not add state[0].  */
3195
3196           bool mergeit = !d->localeinfo.multibyte;
3197           if (!mergeit)
3198             {
3199               mergeit = true;
3200               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3201                 mergeit &= d->multibyte_prop[group.elems[j].index];
3202             }
3203           if (mergeit)
3204             merge2 (&group, &d->states[0].elems, &tmp);
3205         }
3206
3207       /* Find out if the new state will want any context information,
3208          by calculating possible contexts that the group can match,
3209          and separate contexts that the new state wants to know.  */
3210       int possible_contexts = charclass_context (d, &label);
3211       int separate_contexts = state_separate_contexts (d, &group);
3212
3213       /* Find the state(s) corresponding to the union of the follows.  */
3214       if (possible_contexts & ~separate_contexts)
3215         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3216       else
3217         state = -1;
3218       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3219         state_newline = state_index (d, &group, CTX_NEWLINE);
3220       else
3221         state_newline = state;
3222       if (separate_contexts & possible_contexts & CTX_LETTER)
3223         state_letter = state_index (d, &group, CTX_LETTER);
3224       else
3225         state_letter = state;
3226
3227       /* Reallocate now, to reallocate any newline transition properly.  */
3228       realloc_trans_if_necessary (d);
3229     }
3230
3231   /* If we are a searching matcher, the default transition is to a state
3232      containing the positions of state 0, otherwise the default transition
3233      is to fail miserably.  */
3234   else if (d->searchflag)
3235     {
3236       state_newline = 0;
3237       state_letter = d->min_trcount - 1;
3238       state = d->initstate_notbol;
3239     }
3240   else
3241     {
3242       state_newline = -1;
3243       state_letter = -1;
3244       state = -1;
3245     }
3246
3247   /* Set the transitions for each character in the label.  */
3248   for (int i = 0; i < NOTCHAR; i++)
3249     if (tstbit (i, &label))
3250       switch (d->syntax.sbit[i])
3251         {
3252         case CTX_NEWLINE:
3253           trans[i] = state_newline;
3254           break;
3255         case CTX_LETTER:
3256           trans[i] = state_letter;
3257           break;
3258         default:
3259           trans[i] = state;
3260           break;
3261         }
3262
3263 #ifdef DEBUG
3264   fprintf (stderr, "trans table %td", s);
3265   for (int i = 0; i < NOTCHAR; ++i)
3266     {
3267       if (!(i & 0xf))
3268         fprintf (stderr, "\n");
3269       fprintf (stderr, " %2td", trans[i]);
3270     }
3271   fprintf (stderr, "\n");
3272 #endif
3273
3274   free (group.elems);
3275   free (follows.elems);
3276   free (tmp.elems);
3277
3278   /* Keep the newline transition in a special place so we can use it as
3279      a sentinel.  */
3280   if (tstbit (d->syntax.eolbyte, &label))
3281     {
3282       d->newlines[s] = trans[d->syntax.eolbyte];
3283       trans[d->syntax.eolbyte] = -1;
3284     }
3285
3286   return trans[uc];
3287 }
3288
3289 /* Multibyte character handling sub-routines for dfaexec.  */
3290
3291 /* Consume a single byte and transit state from 's' to '*next_state'.
3292    This is almost the same as the state transition routine in dfaexec.
3293    But the state transition is done just once; otherwise, matching succeeds or
3294    we reach the end of the buffer.  */
3295 static state_num
3296 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3297 {
3298   state_num *t;
3299
3300   if (d->trans[s])
3301     t = d->trans[s];
3302   else if (d->fails[s])
3303     t = d->fails[s];
3304   else
3305     {
3306       build_state (s, d, **pp);
3307       if (d->trans[s])
3308         t = d->trans[s];
3309       else
3310         {
3311           t = d->fails[s];
3312           assert (t);
3313         }
3314     }
3315
3316   if (t[**pp] == -2)
3317     build_state (s, d, **pp);
3318
3319   return t[*(*pp)++];
3320 }
3321
3322 /* Transit state from s, then return new state and update the pointer of
3323    the buffer.  This function is for a period operator which can match a
3324    multi-byte character.  */
3325 static state_num
3326 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3327                unsigned char const *end)
3328 {
3329   wint_t wc;
3330
3331   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3332
3333   /* This state has some operators which can match a multibyte character.  */
3334   d->mb_follows.nelem = 0;
3335
3336   /* Calculate the state which can be reached from the state 's' by
3337      consuming 'mbclen' single bytes from the buffer.  */
3338   state_num s1 = s;
3339   int mbci;
3340   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3341     s = transit_state_singlebyte (d, s, pp);
3342   *pp += mbclen - mbci;
3343
3344   if (wc == WEOF)
3345     {
3346       /* It is an invalid character, so ANYCHAR is not accepted.  */
3347       return s;
3348     }
3349
3350   /* If all positions which have ANYCHAR do not depend on the context
3351      of the next character, calculate the next state with
3352      pre-calculated follows and cache the result.  */
3353   if (d->states[s1].mb_trindex < 0)
3354     {
3355       if (MAX_TRCOUNT <= d->mb_trcount)
3356         {
3357           state_num s3;
3358           for (s3 = -1; s3 < d->tralloc; s3++)
3359             {
3360               free (d->mb_trans[s3]);
3361               d->mb_trans[s3] = NULL;
3362             }
3363
3364           for (state_num i = 0; i < d->sindex; i++)
3365             d->states[i].mb_trindex = -1;
3366           d->mb_trcount = 0;
3367         }
3368       d->states[s1].mb_trindex = d->mb_trcount++;
3369     }
3370
3371   if (! d->mb_trans[s])
3372     {
3373       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3374       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3375       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3376       for (int i = 0; i < MAX_TRCOUNT; i++)
3377         d->mb_trans[s][i] = -1;
3378     }
3379   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3380     return d->mb_trans[s][d->states[s1].mb_trindex];
3381
3382   if (s == -1)
3383     copy (&d->states[s1].mbps, &d->mb_follows);
3384   else
3385     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3386
3387   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3388   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3389   realloc_trans_if_necessary (d);
3390
3391   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3392
3393   return s2;
3394 }
3395
3396 /* The initial state may encounter a byte which is not a single byte character
3397    nor the first byte of a multibyte character.  But it is incorrect for the
3398    initial state to accept such a byte.  For example, in Shift JIS the regular
3399    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3400    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3401    that are not a single byte character nor the first byte of a multibyte
3402    character.
3403
3404    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3405    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3406    the result is greater than P, set *WCP to the final wide character
3407    processed, or to WEOF if no wide character is processed.  Otherwise,
3408    if WCP is non-NULL, *WCP may or may not be updated.
3409
3410    Both P and MBP must be no larger than END.  */
3411 static unsigned char const *
3412 skip_remains_mb (struct dfa *d, unsigned char const *p,
3413                  unsigned char const *mbp, char const *end)
3414 {
3415   if (d->syntax.never_trail[*p])
3416     return p;
3417   while (mbp < p)
3418     {
3419       wint_t wc;
3420       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3421                            end - (char const *) mbp, d);
3422     }
3423   return mbp;
3424 }
3425
3426 /* Search through a buffer looking for a match to the struct dfa *D.
3427    Find the first occurrence of a string matching the regexp in the
3428    buffer, and the shortest possible version thereof.  Return a pointer to
3429    the first character after the match, or NULL if none is found.  BEGIN
3430    points to the beginning of the buffer, and END points to the first byte
3431    after its end.  Note however that we store a sentinel byte (usually
3432    newline) in *END, so the actual buffer must be one byte longer.
3433    When ALLOW_NL, newlines may appear in the matching string.
3434    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3435    If MULTIBYTE, the input consists of multibyte characters and/or
3436    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3437    Here is the list of features that make this DFA matcher punt:
3438     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3439     - [^...] in non-simple locale
3440     - [[=foo=]] or [[.foo.]]
3441     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3442     - back-reference: (.)\1
3443     - word-delimiter in multibyte locale: \<, \>, \b, \B
3444    See struct localeinfo.simple for the definition of "simple locale".  */
3445
3446 static inline char *
3447 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3448               idx_t *count, bool multibyte)
3449 {
3450   if (MAX_TRCOUNT <= d->sindex)
3451     {
3452       for (state_num s = d->min_trcount; s < d->sindex; s++)
3453         {
3454           free (d->states[s].elems.elems);
3455           free (d->states[s].mbps.elems);
3456         }
3457       d->sindex = d->min_trcount;
3458
3459       if (d->trans)
3460         {
3461           for (state_num s = 0; s < d->tralloc; s++)
3462             {
3463               free (d->trans[s]);
3464               free (d->fails[s]);
3465               d->trans[s] = d->fails[s] = NULL;
3466             }
3467           d->trcount = 0;
3468         }
3469
3470       if (d->localeinfo.multibyte && d->mb_trans)
3471         {
3472           for (state_num s = -1; s < d->tralloc; s++)
3473             {
3474               free (d->mb_trans[s]);
3475               d->mb_trans[s] = NULL;
3476             }
3477           for (state_num s = 0; s < d->min_trcount; s++)
3478             d->states[s].mb_trindex = -1;
3479           d->mb_trcount = 0;
3480         }
3481     }
3482
3483   if (!d->tralloc)
3484     realloc_trans_if_necessary (d);
3485
3486   /* Current state.  */
3487   state_num s = 0, s1 = 0;
3488
3489   /* Current input character.  */
3490   unsigned char const *p = (unsigned char const *) begin;
3491   unsigned char const *mbp = p;
3492
3493   /* Copy of d->trans so it can be optimized into a register.  */
3494   state_num **trans = d->trans;
3495   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3496   unsigned char saved_end = *(unsigned char *) end;
3497   *end = eol;
3498
3499   if (multibyte)
3500     {
3501       mbszero (&d->mbs);
3502       if (d->mb_follows.alloc == 0)
3503         alloc_position_set (&d->mb_follows, d->nleaves);
3504     }
3505
3506   idx_t nlcount = 0;
3507   for (;;)
3508     {
3509       state_num *t;
3510       while ((t = trans[s]) != NULL)
3511         {
3512           if (s < d->min_trcount)
3513             {
3514               if (!multibyte || d->states[s].mbps.nelem == 0)
3515                 {
3516                   while (t[*p] == s)
3517                     p++;
3518                 }
3519               if (multibyte)
3520                 p = mbp = skip_remains_mb (d, p, mbp, end);
3521             }
3522
3523           if (multibyte)
3524             {
3525               s1 = s;
3526
3527               if (d->states[s].mbps.nelem == 0
3528                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3529                 {
3530                   /* If an input character does not match ANYCHAR, do it
3531                      like a single-byte character.  */
3532                   s = t[*p++];
3533                 }
3534               else
3535                 {
3536                   s = transit_state (d, s, &p, (unsigned char *) end);
3537                   mbp = p;
3538                   trans = d->trans;
3539                 }
3540             }
3541           else
3542             {
3543               s1 = t[*p++];
3544               t = trans[s1];
3545               if (! t)
3546                 {
3547                   state_num tmp = s;
3548                   s = s1;
3549                   s1 = tmp;     /* swap */
3550                   break;
3551                 }
3552               if (s < d->min_trcount)
3553                 {
3554                   while (t[*p] == s1)
3555                     p++;
3556                 }
3557               s = t[*p++];
3558             }
3559         }
3560
3561       if (s < 0)
3562         {
3563           if (s == -2)
3564             {
3565               s = build_state (s1, d, p[-1]);
3566               trans = d->trans;
3567             }
3568           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3569             {
3570               /* The previous character was a newline.  Count it, and skip
3571                  checking of multibyte character boundary until here.  */
3572               nlcount++;
3573               mbp = p;
3574
3575               s = (allow_nl ? d->newlines[s1]
3576                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3577                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3578                    : d->initstate_notbol);
3579             }
3580           else
3581             {
3582               p = NULL;
3583               goto done;
3584             }
3585         }
3586       else if (d->fails[s])
3587         {
3588           if ((d->success[s] & d->syntax.sbit[*p])
3589               || ((char *) p == end
3590                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3591                                          d)))
3592             goto done;
3593
3594           if (multibyte && s < d->min_trcount)
3595             p = mbp = skip_remains_mb (d, p, mbp, end);
3596
3597           s1 = s;
3598           if (!multibyte || d->states[s].mbps.nelem == 0
3599               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3600             {
3601               /* If a input character does not match ANYCHAR, do it
3602                  like a single-byte character.  */
3603               s = d->fails[s][*p++];
3604             }
3605           else
3606             {
3607               s = transit_state (d, s, &p, (unsigned char *) end);
3608               mbp = p;
3609               trans = d->trans;
3610             }
3611         }
3612       else
3613         {
3614           build_state (s, d, p[0]);
3615           trans = d->trans;
3616         }
3617     }
3618
3619  done:
3620   if (count)
3621     *count += nlcount;
3622   *end = saved_end;
3623   return (char *) p;
3624 }
3625
3626 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3627    This is for performance, as dfaexec_main is an inline function.  */
3628
3629 static char *
3630 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3631             bool allow_nl, idx_t *count, bool *backref)
3632 {
3633   return dfaexec_main (d, begin, end, allow_nl, count, true);
3634 }
3635
3636 static char *
3637 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3638             bool allow_nl, idx_t *count, bool *backref)
3639 {
3640   return dfaexec_main (d, begin, end, allow_nl, count, false);
3641 }
3642
3643 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3644    any regexp that uses a construct not supported by this code.  */
3645 static char *
3646 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3647               bool allow_nl, idx_t *count, bool *backref)
3648 {
3649   *backref = true;
3650   return (char *) begin;
3651 }
3652
3653 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3654    but faster and set *BACKREF if the DFA code does not support this
3655    regexp usage.  */
3656
3657 char *
3658 dfaexec (struct dfa *d, char const *begin, char *end,
3659          bool allow_nl, idx_t *count, bool *backref)
3660 {
3661   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3662 }
3663
3664 struct dfa *
3665 dfasuperset (struct dfa const *d)
3666 {
3667   return d->superset;
3668 }
3669
3670 bool
3671 dfaisfast (struct dfa const *d)
3672 {
3673   return d->fast;
3674 }
3675
3676 static void
3677 free_mbdata (struct dfa *d)
3678 {
3679   free (d->multibyte_prop);
3680   free (d->lex.brack.chars);
3681   free (d->mb_follows.elems);
3682
3683   if (d->mb_trans)
3684     {
3685       state_num s;
3686       for (s = -1; s < d->tralloc; s++)
3687         free (d->mb_trans[s]);
3688       free (d->mb_trans - 2);
3689     }
3690 }
3691
3692 /* Return true if every construct in D is supported by this DFA matcher.  */
3693 bool
3694 dfasupported (struct dfa const *d)
3695 {
3696   for (idx_t i = 0; i < d->tindex; i++)
3697     {
3698       switch (d->tokens[i])
3699         {
3700         case BEGWORD:
3701         case ENDWORD:
3702         case LIMWORD:
3703         case NOTLIMWORD:
3704           if (!d->localeinfo.multibyte)
3705             continue;
3706           FALLTHROUGH;
3707         case BACKREF:
3708         case MBCSET:
3709           return false;
3710         }
3711     }
3712   return true;
3713 }
3714
3715 /* Disable use of the superset DFA if it is not likely to help
3716    performance.  */
3717 static void
3718 maybe_disable_superset_dfa (struct dfa *d)
3719 {
3720   if (!d->localeinfo.using_utf8)
3721     return;
3722
3723   bool have_backref = false;
3724   for (idx_t i = 0; i < d->tindex; i++)
3725     {
3726       switch (d->tokens[i])
3727         {
3728         case ANYCHAR:
3729           /* Lowered.  */
3730           abort ();
3731         case BACKREF:
3732           have_backref = true;
3733           break;
3734         case MBCSET:
3735           /* Requires multi-byte algorithm.  */
3736           return;
3737         default:
3738           break;
3739         }
3740     }
3741
3742   if (!have_backref && d->superset)
3743     {
3744       /* The superset DFA is not likely to be much faster, so remove it.  */
3745       dfafree (d->superset);
3746       free (d->superset);
3747       d->superset = NULL;
3748     }
3749
3750   free_mbdata (d);
3751   d->localeinfo.multibyte = false;
3752   d->dfaexec = dfaexec_sb;
3753   d->fast = true;
3754 }
3755
3756 static void
3757 dfassbuild (struct dfa *d)
3758 {
3759   struct dfa *sup = dfaalloc ();
3760
3761   *sup = *d;
3762   sup->localeinfo.multibyte = false;
3763   sup->dfaexec = dfaexec_sb;
3764   sup->multibyte_prop = NULL;
3765   sup->superset = NULL;
3766   sup->states = NULL;
3767   sup->sindex = 0;
3768   sup->constraints = NULL;
3769   sup->separates = NULL;
3770   sup->follows = NULL;
3771   sup->tralloc = 0;
3772   sup->trans = NULL;
3773   sup->fails = NULL;
3774   sup->success = NULL;
3775   sup->newlines = NULL;
3776
3777   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3778   if (d->cindex)
3779     {
3780       memcpy (sup->charclasses, d->charclasses,
3781               d->cindex * sizeof *sup->charclasses);
3782     }
3783
3784   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3785   sup->talloc = d->tindex * 2;
3786
3787   bool have_achar = false;
3788   bool have_nchar = false;
3789   idx_t j;
3790   for (idx_t i = j = 0; i < d->tindex; i++)
3791     {
3792       switch (d->tokens[i])
3793         {
3794         case ANYCHAR:
3795         case MBCSET:
3796         case BACKREF:
3797           {
3798             charclass ccl;
3799             fillset (&ccl);
3800             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3801             sup->tokens[j++] = STAR;
3802             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3803                 || d->tokens[i + 1] == PLUS)
3804               i++;
3805             have_achar = true;
3806           }
3807           break;
3808         case BEGWORD:
3809         case ENDWORD:
3810         case LIMWORD:
3811         case NOTLIMWORD:
3812           if (d->localeinfo.multibyte)
3813             {
3814               /* These constraints aren't supported in a multibyte locale.
3815                  Ignore them in the superset DFA.  */
3816               sup->tokens[j++] = EMPTY;
3817               break;
3818             }
3819           FALLTHROUGH;
3820         default:
3821           sup->tokens[j++] = d->tokens[i];
3822           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3823               || d->tokens[i] >= CSET)
3824             have_nchar = true;
3825           break;
3826         }
3827     }
3828   sup->tindex = j;
3829
3830   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3831     d->superset = sup;
3832   else
3833     {
3834       dfafree (sup);
3835       free (sup);
3836     }
3837 }
3838
3839 /* Parse a string S of length LEN into D (but skip this step if S is null).
3840    Then analyze D and build a matcher for it.
3841    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3842 void
3843 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3844 {
3845   if (s != NULL)
3846     dfaparse (s, len, d);
3847
3848   dfassbuild (d);
3849
3850   if (dfasupported (d))
3851     {
3852       maybe_disable_superset_dfa (d);
3853       dfaanalyze (d, searchflag);
3854     }
3855   else
3856     {
3857       d->dfaexec = dfaexec_noop;
3858     }
3859
3860   if (d->superset)
3861     {
3862       d->fast = true;
3863       dfaanalyze (d->superset, searchflag);
3864     }
3865 }
3866
3867 /* Free the storage held by the components of a dfa.  */
3868 void
3869 dfafree (struct dfa *d)
3870 {
3871   free (d->charclasses);
3872   free (d->tokens);
3873
3874   if (d->localeinfo.multibyte)
3875     free_mbdata (d);
3876
3877   free (d->constraints);
3878   free (d->separates);
3879
3880   for (idx_t i = 0; i < d->sindex; i++)
3881     {
3882       free (d->states[i].elems.elems);
3883       free (d->states[i].mbps.elems);
3884     }
3885   free (d->states);
3886
3887   if (d->follows)
3888     {
3889       for (idx_t i = 0; i < d->tindex; i++)
3890         free (d->follows[i].elems);
3891       free (d->follows);
3892     }
3893
3894   if (d->trans)
3895     {
3896       for (idx_t i = 0; i < d->tralloc; i++)
3897         {
3898           free (d->trans[i]);
3899           free (d->fails[i]);
3900         }
3901
3902       free (d->trans - 2);
3903       free (d->fails);
3904       free (d->newlines);
3905       free (d->success);
3906     }
3907
3908   if (d->superset)
3909     {
3910       dfafree (d->superset);
3911       free (d->superset);
3912     }
3913 }
3914
3915 /* Having found the postfix representation of the regular expression,
3916    try to find a long sequence of characters that must appear in any line
3917    containing the r.e.
3918    Finding a "longest" sequence is beyond the scope here;
3919    we take an easy way out and hope for the best.
3920    (Take "(ab|a)b"--please.)
3921
3922    We do a bottom-up calculation of sequences of characters that must appear
3923    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3924    representation:
3925         sequences that must appear at the left of the match ("left")
3926         sequences that must appear at the right of the match ("right")
3927         lists of sequences that must appear somewhere in the match ("in")
3928         sequences that must constitute the match ("is")
3929
3930    When we get to the root of the tree, we use one of the longest of its
3931    calculated "in" sequences as our answer.
3932
3933    The sequences calculated for the various types of node (in pseudo ANSI c)
3934    are shown below.  "p" is the operand of unary operators (and the left-hand
3935    operand of binary operators); "q" is the right-hand operand of binary
3936    operators.
3937
3938    "ZERO" means "a zero-length sequence" below.
3939
3940         Type    left            right           is              in
3941         ----    ----            -----           --              --
3942         char c  # c             # c             # c             # c
3943
3944         ANYCHAR ZERO            ZERO            ZERO            ZERO
3945
3946         MBCSET  ZERO            ZERO            ZERO            ZERO
3947
3948         CSET    ZERO            ZERO            ZERO            ZERO
3949
3950         STAR    ZERO            ZERO            ZERO            ZERO
3951
3952         QMARK   ZERO            ZERO            ZERO            ZERO
3953
3954         PLUS    p->left         p->right        ZERO            p->in
3955
3956         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3957                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3958                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3959                                                 ZERO
3960
3961         OR      longest common  longest common  (do p->is and substrings common
3962                 leading         trailing        to q->is have same p->in and
3963                 (sub)sequence   (sub)sequence   q->in length and content) ?
3964                 of p->left      of p->right
3965                 and q->left     and q->right    p->is : NULL
3966
3967    If there's anything else we recognize in the tree, all four sequences get set
3968    to zero-length sequences.  If there's something we don't recognize in the
3969    tree, we just return a zero-length sequence.
3970
3971    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3972    'aaa')?
3973
3974    And ... is it here or someplace that we might ponder "optimizations" such as
3975         egrep 'psi|epsilon'     ->      egrep 'psi'
3976         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3977                                         (Yes, we now find "epsi" as a "string
3978                                         that must occur", but we might also
3979                                         simplify the *entire* r.e. being sought)
3980         grep '[c]'              ->      grep 'c'
3981         grep '(ab|a)b'          ->      grep 'ab'
3982         grep 'ab*'              ->      grep 'a'
3983         grep 'a*b'              ->      grep 'b'
3984
3985    There are several issues:
3986
3987    Is optimization easy (enough)?
3988
3989    Does optimization actually accomplish anything,
3990    or is the automaton you get from "psi|epsilon" (for example)
3991    the same as the one you get from "psi" (for example)?
3992
3993    Are optimizable r.e.'s likely to be used in real-life situations
3994    (something like 'ab*' is probably unlikely; something like is
3995    'psi|epsilon' is likelier)?  */
3996
3997 static char *
3998 icatalloc (char *old, char const *new)
3999 {
4000   idx_t newsize = strlen (new);
4001   if (newsize == 0)
4002     return old;
4003   idx_t oldsize = strlen (old);
4004   char *result = xirealloc (old, oldsize + newsize + 1);
4005   memcpy (result + oldsize, new, newsize + 1);
4006   return result;
4007 }
4008
4009 static void
4010 freelist (char **cpp)
4011 {
4012   while (*cpp)
4013     free (*cpp++);
4014 }
4015
4016 static char **
4017 enlistnew (char **cpp, char *new)
4018 {
4019   /* Is there already something in the list that's new (or longer)?  */
4020   idx_t i;
4021   for (i = 0; cpp[i] != NULL; i++)
4022     if (strstr (cpp[i], new) != NULL)
4023       {
4024         free (new);
4025         return cpp;
4026       }
4027   /* Eliminate any obsoleted strings.  */
4028   for (idx_t j = 0; cpp[j] != NULL; )
4029     if (strstr (new, cpp[j]) == NULL)
4030       ++j;
4031     else
4032       {
4033         free (cpp[j]);
4034         if (--i == j)
4035           break;
4036         cpp[j] = cpp[i];
4037         cpp[i] = NULL;
4038       }
4039   /* Add the new string.  */
4040   cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4041   cpp[i] = new;
4042   cpp[i + 1] = NULL;
4043   return cpp;
4044 }
4045
4046 static char **
4047 enlist (char **cpp, char const *str, idx_t len)
4048 {
4049   return enlistnew (cpp, ximemdup0 (str, len));
4050 }
4051
4052 /* Given pointers to two strings, return a pointer to an allocated
4053    list of their distinct common substrings.  */
4054 static char **
4055 comsubs (char *left, char const *right)
4056 {
4057   char **cpp = xzalloc (sizeof *cpp);
4058
4059   for (char *lcp = left; *lcp != '\0'; lcp++)
4060     {
4061       idx_t len = 0;
4062       char *rcp = strchr (right, *lcp);
4063       while (rcp != NULL)
4064         {
4065           idx_t i;
4066           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4067             continue;
4068           if (i > len)
4069             len = i;
4070           rcp = strchr (rcp + 1, *lcp);
4071         }
4072       if (len != 0)
4073         cpp = enlist (cpp, lcp, len);
4074     }
4075   return cpp;
4076 }
4077
4078 static char **
4079 addlists (char **old, char **new)
4080 {
4081   for (; *new; new++)
4082     old = enlistnew (old, xstrdup (*new));
4083   return old;
4084 }
4085
4086 /* Given two lists of substrings, return a new list giving substrings
4087    common to both.  */
4088 static char **
4089 inboth (char **left, char **right)
4090 {
4091   char **both = xzalloc (sizeof *both);
4092
4093   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4094     {
4095       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4096         {
4097           char **temp = comsubs (left[lnum], right[rnum]);
4098           both = addlists (both, temp);
4099           freelist (temp);
4100           free (temp);
4101         }
4102     }
4103   return both;
4104 }
4105
4106 typedef struct must must;
4107
4108 struct must
4109 {
4110   char **in;
4111   char *left;
4112   char *right;
4113   char *is;
4114   bool begline;
4115   bool endline;
4116   must *prev;
4117 };
4118
4119 static must *
4120 allocmust (must *mp, idx_t size)
4121 {
4122   must *new_mp = xmalloc (sizeof *new_mp);
4123   new_mp->in = xzalloc (sizeof *new_mp->in);
4124   new_mp->left = xizalloc (size);
4125   new_mp->right = xizalloc (size);
4126   new_mp->is = xizalloc (size);
4127   new_mp->begline = false;
4128   new_mp->endline = false;
4129   new_mp->prev = mp;
4130   return new_mp;
4131 }
4132
4133 static void
4134 resetmust (must *mp)
4135 {
4136   freelist (mp->in);
4137   mp->in[0] = NULL;
4138   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4139   mp->begline = false;
4140   mp->endline = false;
4141 }
4142
4143 static void
4144 freemust (must *mp)
4145 {
4146   freelist (mp->in);
4147   free (mp->in);
4148   free (mp->left);
4149   free (mp->right);
4150   free (mp->is);
4151   free (mp);
4152 }
4153
4154 struct dfamust *
4155 dfamust (struct dfa const *d)
4156 {
4157   must *mp = NULL;
4158   char const *result = "";
4159   bool exact = false;
4160   bool begline = false;
4161   bool endline = false;
4162   bool need_begline = false;
4163   bool need_endline = false;
4164   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4165
4166   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4167     {
4168       token t = d->tokens[ri];
4169       switch (t)
4170         {
4171         case BEGLINE:
4172           mp = allocmust (mp, 2);
4173           mp->begline = true;
4174           need_begline = true;
4175           break;
4176         case ENDLINE:
4177           mp = allocmust (mp, 2);
4178           mp->endline = true;
4179           need_endline = true;
4180           break;
4181         case LPAREN:
4182         case RPAREN:
4183           assert (!"neither LPAREN nor RPAREN may appear here");
4184
4185         case EMPTY:
4186         case BEGWORD:
4187         case ENDWORD:
4188         case LIMWORD:
4189         case NOTLIMWORD:
4190         case BACKREF:
4191         case ANYCHAR:
4192         case MBCSET:
4193           mp = allocmust (mp, 2);
4194           break;
4195
4196         case STAR:
4197         case QMARK:
4198           assume_nonnull (mp);
4199           resetmust (mp);
4200           break;
4201
4202         case OR:
4203           {
4204             char **new;
4205             must *rmp = mp;
4206             assume_nonnull (rmp);
4207             must *lmp = mp = mp->prev;
4208             assume_nonnull (lmp);
4209             idx_t j, ln, rn, n;
4210
4211             /* Guaranteed to be.  Unlikely, but ...  */
4212             if (str_eq (lmp->is, rmp->is))
4213               {
4214                 lmp->begline &= rmp->begline;
4215                 lmp->endline &= rmp->endline;
4216               }
4217             else
4218               {
4219                 lmp->is[0] = '\0';
4220                 lmp->begline = false;
4221                 lmp->endline = false;
4222               }
4223             /* Left side--easy */
4224             idx_t i = 0;
4225             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4226               ++i;
4227             lmp->left[i] = '\0';
4228             /* Right side */
4229             ln = strlen (lmp->right);
4230             rn = strlen (rmp->right);
4231             n = ln;
4232             if (n > rn)
4233               n = rn;
4234             for (i = 0; i < n; ++i)
4235               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4236                 break;
4237             for (j = 0; j < i; ++j)
4238               lmp->right[j] = lmp->right[(ln - i) + j];
4239             lmp->right[j] = '\0';
4240             new = inboth (lmp->in, rmp->in);
4241             freelist (lmp->in);
4242             free (lmp->in);
4243             lmp->in = new;
4244             freemust (rmp);
4245           }
4246           break;
4247
4248         case PLUS:
4249           assume_nonnull (mp);
4250           mp->is[0] = '\0';
4251           break;
4252
4253         case END:
4254           assume_nonnull (mp);
4255           assert (!mp->prev);
4256           for (idx_t i = 0; mp->in[i] != NULL; i++)
4257             if (strlen (mp->in[i]) > strlen (result))
4258               result = mp->in[i];
4259           if (str_eq (result, mp->is))
4260             {
4261               if ((!need_begline || mp->begline) && (!need_endline
4262                                                      || mp->endline))
4263                 exact = true;
4264               begline = mp->begline;
4265               endline = mp->endline;
4266             }
4267           goto done;
4268
4269         case CAT:
4270           {
4271             must *rmp = mp;
4272             assume_nonnull (rmp);
4273             must *lmp = mp = mp->prev;
4274             assume_nonnull (lmp);
4275
4276             /* In.  Everything in left, plus everything in
4277                right, plus concatenation of
4278                left's right and right's left.  */
4279             lmp->in = addlists (lmp->in, rmp->in);
4280             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4281               {
4282                 idx_t lrlen = strlen (lmp->right);
4283                 idx_t rllen = strlen (rmp->left);
4284                 char *tp = ximalloc (lrlen + rllen + 1);
4285                 memcpy (tp + lrlen, rmp->left, rllen + 1);
4286                 memcpy (tp, lmp->right, lrlen);
4287                 lmp->in = enlistnew (lmp->in, tp);
4288               }
4289             /* Left-hand */
4290             if (lmp->is[0] != '\0')
4291               lmp->left = icatalloc (lmp->left, rmp->left);
4292             /* Right-hand */
4293             if (rmp->is[0] == '\0')
4294               lmp->right[0] = '\0';
4295             lmp->right = icatalloc (lmp->right, rmp->right);
4296             /* Guaranteed to be */
4297             if ((lmp->is[0] != '\0' || lmp->begline)
4298                 && (rmp->is[0] != '\0' || rmp->endline))
4299               {
4300                 lmp->is = icatalloc (lmp->is, rmp->is);
4301                 lmp->endline = rmp->endline;
4302               }
4303             else
4304               {
4305                 lmp->is[0] = '\0';
4306                 lmp->begline = false;
4307                 lmp->endline = false;
4308               }
4309             freemust (rmp);
4310           }
4311           break;
4312
4313         case '\0':
4314           /* Not on *my* shift.  */
4315           goto done;
4316
4317         default:
4318           if (CSET <= t)
4319             {
4320               /* If T is a singleton, or if case-folding in a unibyte
4321                  locale and T's members all case-fold to the same char,
4322                  convert T to one of its members.  Otherwise, do
4323                  nothing further with T.  */
4324               charclass *ccl = &d->charclasses[t - CSET];
4325               int j;
4326               for (j = 0; j < NOTCHAR; j++)
4327                 if (tstbit (j, ccl))
4328                   break;
4329               if (! (j < NOTCHAR))
4330                 {
4331                   mp = allocmust (mp, 2);
4332                   break;
4333                 }
4334               t = j;
4335               while (++j < NOTCHAR)
4336                 if (tstbit (j, ccl)
4337                     && ! (case_fold_unibyte
4338                           && toupper (j) == toupper (t)))
4339                   break;
4340               if (j < NOTCHAR)
4341                 {
4342                   mp = allocmust (mp, 2);
4343                   break;
4344                 }
4345             }
4346
4347           idx_t rj = ri + 2;
4348           if (d->tokens[ri + 1] == CAT)
4349             {
4350               for (; rj < d->tindex - 1; rj += 2)
4351                 {
4352                   if ((rj != ri && (d->tokens[rj] <= 0
4353                                     || NOTCHAR <= d->tokens[rj]))
4354                       || d->tokens[rj + 1] != CAT)
4355                     break;
4356                 }
4357             }
4358           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4359           mp->is[0] = mp->left[0] = mp->right[0]
4360             = case_fold_unibyte ? toupper (t) : t;
4361
4362           idx_t i;
4363           for (i = 1; ri + 2 < rj; i++)
4364             {
4365               ri += 2;
4366               t = d->tokens[ri];
4367               mp->is[i] = mp->left[i] = mp->right[i]
4368                 = case_fold_unibyte ? toupper (t) : t;
4369             }
4370           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4371           mp->in = enlist (mp->in, mp->is, i);
4372           break;
4373         }
4374     }
4375  done:;
4376
4377   struct dfamust *dm = NULL;
4378   if (*result)
4379     {
4380       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4381       dm->exact = exact;
4382       dm->begline = begline;
4383       dm->endline = endline;
4384       strcpy (dm->must, result);
4385     }
4386
4387   while (mp)
4388     {
4389       must *prev = mp->prev;
4390       freemust (mp);
4391       mp = prev;
4392     }
4393
4394   return dm;
4395 }
4396
4397 void
4398 dfamustfree (struct dfamust *dm)
4399 {
4400   free (dm);
4401 }
4402
4403 struct dfa *
4404 dfaalloc (void)
4405 {
4406   return xmalloc (sizeof (struct dfa));
4407 }
4408
4409 /* Initialize DFA.  */
4410 void
4411 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4412            reg_syntax_t bits, int dfaopts)
4413 {
4414   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4415   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4416   dfa->localeinfo = *linfo;
4417
4418   dfa->fast = !dfa->localeinfo.multibyte;
4419
4420   dfa->canychar = -1;
4421   dfa->syntax.syntax_bits_set = true;
4422   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4423   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4424   dfa->syntax.syntax_bits = bits;
4425   dfa->syntax.dfaopts = dfaopts;
4426
4427   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4428     {
4429       unsigned char uc = i;
4430
4431       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4432       switch (dfa->syntax.sbit[uc])
4433         {
4434         case CTX_LETTER:
4435           setbit (uc, &dfa->syntax.letters);
4436           break;
4437         case CTX_NEWLINE:
4438           setbit (uc, &dfa->syntax.newline);
4439           break;
4440         }
4441
4442       /* POSIX requires that the five bytes in "\n\r./" (including the
4443          terminating NUL) cannot occur inside a multibyte character.  */
4444       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4445                                      ? (uc & 0xc0) != 0x80
4446                                      : strchr ("\n\r./", uc) != NULL);
4447     }
4448 }
4449
4450 /* Initialize TO by copying FROM's syntax settings.  */
4451 void
4452 dfacopysyntax (struct dfa *to, struct dfa const *from)
4453 {
4454   memset (to, 0, offsetof (struct dfa, syntax));
4455   to->canychar = -1;
4456   to->fast = from->fast;
4457   to->syntax = from->syntax;
4458   to->dfaexec = from->dfaexec;
4459   to->localeinfo = from->localeinfo;
4460 }
4461
4462 /* vim:set shiftwidth=2: */