lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include "flexmember.h"
  28
  29 #include <assert.h>
  30 #include <ctype.h>
  31 #include <stdint.h>
  32 #include <stdio.h>
  33 #include <stdlib.h>
  34 #include <limits.h>
  35 #include <string.h>
  36
  37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
  38    indexes into objects.  It is signed to help catch integer overflow.
  39    It has its own name because it is for nonnegative values only.  */
  40 typedef ptrdiff_t idx_t;
  41 static idx_t const IDX_MAX = PTRDIFF_MAX;
  42
  43 static bool
  44 streq (char const *a, char const *b)
  45 {
  46   return strcmp (a, b) == 0;
  47 }
  48
  49 static bool
  50 isasciidigit (char c)
  51 {
  52   return '0' <= c && c <= '9';
  53 }
  54
  55 #include "gettext.h"
  56 #define _(str) gettext (str)
  57
  58 #include <wchar.h>
  59
  60 #include "intprops.h"
  61 #include "xalloc.h"
  62 #include "localeinfo.h"
  63
  64 #ifndef FALLTHROUGH
  65 # if 201710L < __STDC_VERSION__
  66 #  define FALLTHROUGH [[__fallthrough__]]
  67 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
  68 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  69 # else
  70 #  define FALLTHROUGH ((void) 0)
  71 # endif
  72 #endif
  73
  74 #ifndef MIN
  75 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  76 #endif
  77
  78 /* HPUX defines these as macros in sys/param.h.  */
  79 #ifdef setbit
  80 # undef setbit
  81 #endif
  82 #ifdef clrbit
  83 # undef clrbit
  84 #endif
  85
  86 /* For code that does not use Gnulib’s isblank module.  */
  87 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
  88 # define isblank dfa_isblank
  89 static int
  90 isblank (int c)
  91 {
  92   return c == ' ' || c == '\t';
  93 }
  94 #endif
  95
  96 /* First integer value that is greater than any character code.  */
  97 enum { NOTCHAR = 1 << CHAR_BIT };
  98
  99 #ifdef UINT_LEAST64_MAX
 100
 101 /* Number of bits used in a charclass word.  */
 102 enum { CHARCLASS_WORD_BITS = 64 };
 103
 104 /* This represents part of a character class.  It must be unsigned and
 105    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
 106 typedef uint_least64_t charclass_word;
 107
 108 /* Part of a charclass initializer that represents 64 bits' worth of a
 109    charclass, where LO and HI are the low and high-order 32 bits of
 110    the 64-bit quantity.  */
 111 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
 112
 113 #else
 114 /* Fallbacks for pre-C99 hosts that lack 64-bit integers.  */
 115 enum { CHARCLASS_WORD_BITS = 32 };
 116 typedef unsigned long charclass_word;
 117 # define CHARCLASS_PAIR(lo, hi) lo, hi
 118 #endif
 119
 120 /* An initializer for a charclass whose 32-bit words are A through H.  */
 121 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
 122    {{                                                   \
 123       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 124       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 125    }}
 126
 127 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 128 static charclass_word const CHARCLASS_WORD_MASK
 129   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 130
 131 /* Number of words required to hold a bit for every character.  */
 132 enum
 133 {
 134   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 135 };
 136
 137 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 138 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 139
 140 /* Convert a possibly-signed character to an unsigned character.  This is
 141    a bit safer than casting to unsigned char, since it catches some type
 142    errors that the cast doesn't.  */
 143 static unsigned char
 144 to_uchar (char ch)
 145 {
 146   return ch;
 147 }
 148
 149 /* Contexts tell us whether a character is a newline or a word constituent.
 150    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 151    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 152    a particular character class.
 153
 154    A state also stores a context value, which is a bitmask of CTX_* values.
 155    A state's context represents a set of characters that the state's
 156    predecessors must match.  For example, a state whose context does not
 157    include CTX_LETTER will never have transitions where the previous
 158    character is a word constituent.  A state whose context is CTX_ANY
 159    might have transitions from any character.  */
 160
 161 enum
 162   {
 163     CTX_NONE = 1,
 164     CTX_LETTER = 2,
 165     CTX_NEWLINE = 4,
 166     CTX_ANY = 7
 167   };
 168
 169 /* Sometimes characters can only be matched depending on the surrounding
 170    context.  Such context decisions depend on what the previous character
 171    was, and the value of the current (lookahead) character.  Context
 172    dependent constraints are encoded as 9-bit integers.  Each bit that
 173    is set indicates that the constraint succeeds in the corresponding
 174    context.
 175
 176    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 177    bit 3-5  - valid contexts when next character is CTX_LETTER
 178    bit 0-2  - valid contexts when next character is CTX_NONE
 179
 180    succeeds_in_context determines whether a given constraint
 181    succeeds in a particular context.  Prev is a bitmask of possible
 182    context values for the previous character, curr is the (single-bit)
 183    context value for the lookahead character.  */
 184 static int
 185 newline_constraint (int constraint)
 186 {
 187   return (constraint >> 6) & 7;
 188 }
 189 static int
 190 letter_constraint (int constraint)
 191 {
 192   return (constraint >> 3) & 7;
 193 }
 194 static int
 195 other_constraint (int constraint)
 196 {
 197   return constraint & 7;
 198 }
 199
 200 static bool
 201 succeeds_in_context (int constraint, int prev, int curr)
 202 {
 203   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 204               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 205               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 206              & prev);
 207 }
 208
 209 /* The following describe what a constraint depends on.  */
 210 static bool
 211 prev_newline_dependent (int constraint)
 212 {
 213   return ((constraint ^ constraint >> 2) & 0111) != 0;
 214 }
 215 static bool
 216 prev_letter_dependent (int constraint)
 217 {
 218   return ((constraint ^ constraint >> 1) & 0111) != 0;
 219 }
 220
 221 /* Tokens that match the empty string subject to some constraint actually
 222    work by applying that constraint to determine what may follow them,
 223    taking into account what has gone before.  The following values are
 224    the constraints corresponding to the special tokens previously defined.  */
 225 enum
 226   {
 227     NO_CONSTRAINT = 0777,
 228     BEGLINE_CONSTRAINT = 0444,
 229     ENDLINE_CONSTRAINT = 0700,
 230     BEGWORD_CONSTRAINT = 0050,
 231     ENDWORD_CONSTRAINT = 0202,
 232     LIMWORD_CONSTRAINT = 0252,
 233     NOTLIMWORD_CONSTRAINT = 0525
 234   };
 235
 236 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 237    are operators and others are terminal symbols.  Most (but not all) of these
 238    codes are returned by the lexical analyzer.  */
 239
 240 typedef ptrdiff_t token;
 241 static token const TOKEN_MAX = PTRDIFF_MAX;
 242
 243 /* States are indexed by state_num values.  These are normally
 244    nonnegative but -1 is used as a special value.  */
 245 typedef ptrdiff_t state_num;
 246
 247 /* Predefined token values.  */
 248 enum
 249 {
 250   END = -1,                     /* END is a terminal symbol that matches the
 251                                    end of input; any value of END or less in
 252                                    the parse tree is such a symbol.  Accepting
 253                                    states of the DFA are those that would have
 254                                    a transition on END.  This is -1, not some
 255                                    more-negative value, to tweak the speed of
 256                                    comparisons to END.  */
 257
 258   /* Ordinary character values are terminal symbols that match themselves.  */
 259
 260   /* CSET must come last in the following list of special tokens.  Otherwise,
 261      the list order matters only for performance.  Related special tokens
 262      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 263      || CSET <= t) can be done with a single machine-level comparison.  */
 264
 265   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 266                                    the empty string.  */
 267
 268   QMARK,                        /* QMARK is an operator of one argument that
 269                                    matches zero or one occurrences of its
 270                                    argument.  */
 271
 272   STAR,                         /* STAR is an operator of one argument that
 273                                    matches the Kleene closure (zero or more
 274                                    occurrences) of its argument.  */
 275
 276   PLUS,                         /* PLUS is an operator of one argument that
 277                                    matches the positive closure (one or more
 278                                    occurrences) of its argument.  */
 279
 280   REPMN,                        /* REPMN is a lexical token corresponding
 281                                    to the {m,n} construct.  REPMN never
 282                                    appears in the compiled token vector.  */
 283
 284   CAT,                          /* CAT is an operator of two arguments that
 285                                    matches the concatenation of its
 286                                    arguments.  CAT is never returned by the
 287                                    lexical analyzer.  */
 288
 289   OR,                           /* OR is an operator of two arguments that
 290                                    matches either of its arguments.  */
 291
 292   LPAREN,                       /* LPAREN never appears in the parse tree,
 293                                    it is only a lexeme.  */
 294
 295   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 296
 297   WCHAR,                        /* Only returned by lex.  wctok contains
 298                                    the wide character representation.  */
 299
 300   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 301                                    a valid multibyte (or single byte) character.
 302                                    It is used only if MB_CUR_MAX > 1.  */
 303
 304   BEG,                          /* BEG is an initial symbol that matches the
 305                                    beginning of input.  */
 306
 307   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 308                                    the empty string at the beginning of a
 309                                    line.  */
 310
 311   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 312                                    the empty string at the end of a line.  */
 313
 314   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 315                                    the empty string at the beginning of a
 316                                    word.  */
 317
 318   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 319                                    the empty string at the end of a word.  */
 320
 321   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 322                                    the empty string at the beginning or the
 323                                    end of a word.  */
 324
 325   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 326                                    matches the empty string not at
 327                                    the beginning or end of a word.  */
 328
 329   BACKREF,                      /* BACKREF is generated by \<digit>
 330                                    or by any other construct that
 331                                    is not completely handled.  If the scanner
 332                                    detects a transition on backref, it returns
 333                                    a kind of "semi-success" indicating that
 334                                    the match will have to be verified with
 335                                    a backtracking matcher.  */
 336
 337   MBCSET,                       /* MBCSET is similar to CSET, but for
 338                                    multibyte characters.  */
 339
 340   CSET                          /* CSET and (and any value greater) is a
 341                                    terminal symbol that matches any of a
 342                                    class of characters.  */
 343 };
 344
 345
 346 /* States of the recognizer correspond to sets of positions in the parse
 347    tree, together with the constraints under which they may be matched.
 348    So a position is encoded as an index into the parse tree together with
 349    a constraint.  */
 350 typedef struct
 351 {
 352   idx_t index;                  /* Index into the parse array.  */
 353   unsigned int constraint;      /* Constraint for matching this position.  */
 354 } position;
 355
 356 /* Sets of positions are stored as arrays.  */
 357 typedef struct
 358 {
 359   position *elems;              /* Elements of this position set.  */
 360   idx_t nelem;                  /* Number of elements in this set.  */
 361   idx_t alloc;                  /* Number of elements allocated in ELEMS.  */
 362 } position_set;
 363
 364 /* A state of the dfa consists of a set of positions, some flags,
 365    and the token value of the lowest-numbered position of the state that
 366    contains an END token.  */
 367 typedef struct
 368 {
 369   size_t hash;                  /* Hash of the positions of this state.  */
 370   position_set elems;           /* Positions this state could match.  */
 371   unsigned char context;        /* Context from previous state.  */
 372   unsigned short constraint;    /* Constraint for this state to accept.  */
 373   token first_end;              /* Token value of the first END in elems.  */
 374   position_set mbps;            /* Positions which can match multibyte
 375                                    characters or the follows, e.g., period.
 376                                    Used only if MB_CUR_MAX > 1.  */
 377   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 378                                    negative if the state does not have
 379                                    ANYCHAR.  */
 380 } dfa_state;
 381
 382 /* Maximum for any transition table count.  This should be at least 3,
 383    for the initial state setup.  */
 384 enum { MAX_TRCOUNT = 1024 };
 385
 386 /* A bracket operator.
 387    e.g., [a-c], [[:alpha:]], etc.  */
 388 struct mb_char_classes
 389 {
 390   ptrdiff_t cset;
 391   bool invert;
 392   wchar_t *chars;               /* Normal characters.  */
 393   idx_t nchars;
 394   idx_t nchars_alloc;
 395 };
 396
 397 struct regex_syntax
 398 {
 399   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 400   reg_syntax_t syntax_bits;
 401   bool syntax_bits_set;
 402
 403   /* Flag for case-folding letters into sets.  */
 404   bool case_fold;
 405
 406   /* True if ^ and $ match only the start and end of data, and do not match
 407      end-of-line within data.  */
 408   bool anchor;
 409
 410   /* End-of-line byte in data.  */
 411   unsigned char eolbyte;
 412
 413   /* Cache of char-context values.  */
 414   char sbit[NOTCHAR];
 415
 416   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 417      multibyte character.  */
 418   bool never_trail[NOTCHAR];
 419
 420   /* Set of characters considered letters.  */
 421   charclass letters;
 422
 423   /* Set of characters that are newline.  */
 424   charclass newline;
 425 };
 426
 427 /* Lexical analyzer.  All the dross that deals with the obnoxious
 428    GNU Regex syntax bits is located here.  The poor, suffering
 429    reader is referred to the GNU Regex documentation for the
 430    meaning of the @#%!@#%^!@ syntax bits.  */
 431 struct lexer_state
 432 {
 433   char const *ptr;      /* Pointer to next input character.  */
 434   idx_t left;           /* Number of characters remaining.  */
 435   token lasttok;        /* Previous token returned; initially END.  */
 436   idx_t parens;         /* Count of outstanding left parens.  */
 437   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 438
 439   /* Wide character representation of the current multibyte character,
 440      or WEOF if there was an encoding error.  Used only if
 441      MB_CUR_MAX > 1.  */
 442   wint_t wctok;
 443
 444   /* The most recently analyzed multibyte bracket expression.  */
 445   struct mb_char_classes brack;
 446
 447   /* We're separated from beginning or (, | only by zero-width characters.  */
 448   bool laststart;
 449 };
 450
 451 /* Recursive descent parser for regular expressions.  */
 452
 453 struct parser_state
 454 {
 455   token tok;               /* Lookahead token.  */
 456   idx_t depth;             /* Current depth of a hypothetical stack
 457                               holding deferred productions.  This is
 458                               used to determine the depth that will be
 459                               required of the real stack later on in
 460                               dfaanalyze.  */
 461 };
 462
 463 /* A compiled regular expression.  */
 464 struct dfa
 465 {
 466   /* Fields filled by the scanner.  */
 467   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 468   idx_t cindex;                 /* Index for adding new charclasses.  */
 469   idx_t calloc;                 /* Number of charclasses allocated.  */
 470   ptrdiff_t canychar;           /* Index of anychar class, or -1.  */
 471
 472   /* Scanner state */
 473   struct lexer_state lex;
 474
 475   /* Parser state */
 476   struct parser_state parse;
 477
 478   /* Fields filled by the parser.  */
 479   token *tokens;                /* Postfix parse array.  */
 480   idx_t tindex;                 /* Index for adding new tokens.  */
 481   idx_t talloc;                 /* Number of tokens currently allocated.  */
 482   idx_t depth;                  /* Depth required of an evaluation stack
 483                                    used for depth-first traversal of the
 484                                    parse tree.  */
 485   idx_t nleaves;                /* Number of leaves on the parse tree.  */
 486   idx_t nregexps;               /* Count of parallel regexps being built
 487                                    with dfaparse.  */
 488   bool fast;                    /* The DFA is fast.  */
 489   token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales.  */
 490   mbstate_t mbs;                /* Multibyte conversion state.  */
 491
 492   /* The following are valid only if MB_CUR_MAX > 1.  */
 493
 494   /* The value of multibyte_prop[i] is defined by following rule.
 495      if tokens[i] < NOTCHAR
 496      bit 0 : tokens[i] is the first byte of a character, including
 497      single-byte characters.
 498      bit 1 : tokens[i] is the last byte of a character, including
 499      single-byte characters.
 500
 501      e.g.
 502      tokens
 503      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 504      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 505      multibyte_prop
 506      = 3     , 1               ,  0              ,  2              , 3
 507    */
 508   char *multibyte_prop;
 509
 510   /* Fields filled by the superset.  */
 511   struct dfa *superset;             /* Hint of the dfa.  */
 512
 513   /* Fields filled by the state builder.  */
 514   dfa_state *states;            /* States of the dfa.  */
 515   state_num sindex;             /* Index for adding new states.  */
 516   idx_t salloc;                 /* Number of states currently allocated.  */
 517
 518   /* Fields filled by the parse tree->NFA conversion.  */
 519   position_set *follows;        /* Array of follow sets, indexed by position
 520                                    index.  The follow of a position is the set
 521                                    of positions containing characters that
 522                                    could conceivably follow a character
 523                                    matching the given position in a string
 524                                    matching the regexp.  Allocated to the
 525                                    maximum possible position index.  */
 526   bool searchflag;              /* We are supposed to build a searching
 527                                    as opposed to an exact matcher.  A searching
 528                                    matcher finds the first and shortest string
 529                                    matching a regexp anywhere in the buffer,
 530                                    whereas an exact matcher finds the longest
 531                                    string matching, but anchored to the
 532                                    beginning of the buffer.  */
 533
 534   /* Fields filled by dfaanalyze.  */
 535   int *constraints;             /* Array of union of accepting constraints
 536                                    in the follow of a position.  */
 537   int *separates;               /* Array of contexts on follow of a
 538                                    position.  */
 539
 540   /* Fields filled by dfaexec.  */
 541   state_num tralloc;            /* Number of transition tables that have
 542                                    slots so far, not counting trans[-1] and
 543                                    trans[-2].  */
 544   int trcount;                  /* Number of transition tables that have
 545                                    been built, other than for initial
 546                                    states.  */
 547   int min_trcount;              /* Number of initial states.  Equivalently,
 548                                    the minimum state number for which trcount
 549                                    counts transitions.  */
 550   state_num **trans;            /* Transition tables for states that can
 551                                    never accept.  If the transitions for a
 552                                    state have not yet been computed, or the
 553                                    state could possibly accept, its entry in
 554                                    this table is NULL.  This points to two
 555                                    past the start of the allocated array,
 556                                    and trans[-1] and trans[-2] are always
 557                                    NULL.  */
 558   state_num **fails;            /* Transition tables after failing to accept
 559                                    on a state that potentially could do so.
 560                                    If trans[i] is non-null, fails[i] must
 561                                    be null.  */
 562   char *success;                /* Table of acceptance conditions used in
 563                                    dfaexec and computed in build_state.  */
 564   state_num *newlines;          /* Transitions on newlines.  The entry for a
 565                                    newline in any transition table is always
 566                                    -1 so we can count lines without wasting
 567                                    too many cycles.  The transition for a
 568                                    newline is stored separately and handled
 569                                    as a special case.  Newline is also used
 570                                    as a sentinel at the end of the buffer.  */
 571   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 572                                    context in multibyte locales, in which we
 573                                    do not distinguish between their contexts,
 574                                    as not supported word.  */
 575   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 576   state_num **mb_trans;         /* Transition tables for states with
 577                                    ANYCHAR.  */
 578   state_num mb_trcount;         /* Number of transition tables for states with
 579                                    ANYCHAR that have actually been built.  */
 580
 581   /* Syntax configuration.  This is near the end so that dfacopysyntax
 582      can memset up to here.  */
 583   struct regex_syntax syntax;
 584
 585   /* Information derived from the locale.  This is at the end so that
 586      a quick memset need not clear it specially.  */
 587
 588   /* dfaexec implementation.  */
 589   char *(*dfaexec) (struct dfa *, char const *, char *,
 590                     bool, ptrdiff_t *, bool *);
 591
 592   /* Other cached information derived from the locale.  */
 593   struct localeinfo localeinfo;
 594 };
 595
 596 /* User access to dfa internals.  */
 597
 598 /* S could possibly be an accepting state of R.  */
 599 static bool
 600 accepting (state_num s, struct dfa const *r)
 601 {
 602   return r->states[s].constraint != 0;
 603 }
 604
 605 /* STATE accepts in the specified context.  */
 606 static bool
 607 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 608 {
 609   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 610 }
 611
 612 static void regexp (struct dfa *dfa);
 613
 614 /* Store into *PWC the result of converting the leading bytes of the
 615    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 616    and updating the conversion state in *D.  On conversion error,
 617    convert just a single byte, to WEOF.  Return the number of bytes
 618    converted.
 619
 620    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 621
 622    * PWC points to wint_t, not to wchar_t.
 623    * The last arg is a dfa *D instead of merely a multibyte conversion
 624      state D->mbs.
 625    * N must be at least 1.
 626    * S[N - 1] must be a sentinel byte.
 627    * Shift encodings are not supported.
 628    * The return value is always in the range 1..N.
 629    * D->mbs is always valid afterwards.
 630    * *PWC is always set to something.  */
 631 static int
 632 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 633 {
 634   unsigned char uc = s[0];
 635   wint_t wc = d->localeinfo.sbctowc[uc];
 636
 637   if (wc == WEOF)
 638     {
 639       wchar_t wch;
 640       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 641       if (0 < nbytes && nbytes < (size_t) -2)
 642         {
 643           *pwc = wch;
 644           return nbytes;
 645         }
 646       memset (&d->mbs, 0, sizeof d->mbs);
 647     }
 648
 649   *pwc = wc;
 650   return 1;
 651 }
 652
 653 #ifdef DEBUG
 654
 655 static void
 656 prtok (token t)
 657 {
 658   if (t <= END)
 659     fprintf (stderr, "END");
 660   else if (0 <= t && t < NOTCHAR)
 661     {
 662       unsigned int ch = t;
 663       fprintf (stderr, "0x%02x", ch);
 664     }
 665   else
 666     {
 667       char const *s;
 668       switch (t)
 669         {
 670         case BEG:
 671           s = "BEG";
 672           break;
 673         case EMPTY:
 674           s = "EMPTY";
 675           break;
 676         case BACKREF:
 677           s = "BACKREF";
 678           break;
 679         case BEGLINE:
 680           s = "BEGLINE";
 681           break;
 682         case ENDLINE:
 683           s = "ENDLINE";
 684           break;
 685         case BEGWORD:
 686           s = "BEGWORD";
 687           break;
 688         case ENDWORD:
 689           s = "ENDWORD";
 690           break;
 691         case LIMWORD:
 692           s = "LIMWORD";
 693           break;
 694         case NOTLIMWORD:
 695           s = "NOTLIMWORD";
 696           break;
 697         case QMARK:
 698           s = "QMARK";
 699           break;
 700         case STAR:
 701           s = "STAR";
 702           break;
 703         case PLUS:
 704           s = "PLUS";
 705           break;
 706         case CAT:
 707           s = "CAT";
 708           break;
 709         case OR:
 710           s = "OR";
 711           break;
 712         case LPAREN:
 713           s = "LPAREN";
 714           break;
 715         case RPAREN:
 716           s = "RPAREN";
 717           break;
 718         case ANYCHAR:
 719           s = "ANYCHAR";
 720           break;
 721         case MBCSET:
 722           s = "MBCSET";
 723           break;
 724         default:
 725           s = "CSET";
 726           break;
 727         }
 728       fprintf (stderr, "%s", s);
 729     }
 730 }
 731 #endif /* DEBUG */
 732
 733 /* Stuff pertaining to charclasses.  */
 734
 735 static bool
 736 tstbit (unsigned int b, charclass const *c)
 737 {
 738   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 739 }
 740
 741 static void
 742 setbit (unsigned int b, charclass *c)
 743 {
 744   charclass_word one = 1;
 745   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 746 }
 747
 748 static void
 749 clrbit (unsigned int b, charclass *c)
 750 {
 751   charclass_word one = 1;
 752   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 753 }
 754
 755 static void
 756 zeroset (charclass *s)
 757 {
 758   memset (s, 0, sizeof *s);
 759 }
 760
 761 static void
 762 fillset (charclass *s)
 763 {
 764   for (int i = 0; i < CHARCLASS_WORDS; i++)
 765     s->w[i] = CHARCLASS_WORD_MASK;
 766 }
 767
 768 static void
 769 notset (charclass *s)
 770 {
 771   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 772     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 773 }
 774
 775 static bool
 776 equal (charclass const *s1, charclass const *s2)
 777 {
 778   charclass_word w = 0;
 779   for (int i = 0; i < CHARCLASS_WORDS; i++)
 780     w |= s1->w[i] ^ s2->w[i];
 781   return w == 0;
 782 }
 783
 784 static bool
 785 emptyset (charclass const *s)
 786 {
 787   charclass_word w = 0;
 788   for (int i = 0; i < CHARCLASS_WORDS; i++)
 789     w |= s->w[i];
 790   return w == 0;
 791 }
 792
 793 /* Grow PA, which points to an array of *NITEMS items, and return the
 794    location of the reallocated array, updating *NITEMS to reflect its
 795    new size.  The new array will contain at least NITEMS_INCR_MIN more
 796    items, but will not contain more than NITEMS_MAX items total.
 797    ITEM_SIZE is the size of each item, in bytes.
 798
 799    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
 800    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
 801    infinity.
 802
 803    If PA is null, then allocate a new array instead of reallocating
 804    the old one.
 805
 806    Thus, to grow an array A without saving its old contents, do
 807    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
 808
 809 static void *
 810 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
 811          ptrdiff_t nitems_max, idx_t item_size)
 812 {
 813   idx_t n0 = *nitems;
 814
 815   /* The approximate size to use for initial small allocation
 816      requests.  This is the largest "small" request for the GNU C
 817      library malloc.  */
 818   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
 819
 820   /* If the array is tiny, grow it to about (but no greater than)
 821      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
 822      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
 823      NITEMS_MAX, and what the C language can represent safely.  */
 824
 825   idx_t n, nbytes;
 826   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
 827     n = IDX_MAX;
 828   if (0 <= nitems_max && nitems_max < n)
 829     n = nitems_max;
 830
 831   idx_t adjusted_nbytes
 832     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
 833        ? MIN (IDX_MAX, SIZE_MAX)
 834        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
 835   if (adjusted_nbytes)
 836     {
 837       n = adjusted_nbytes / item_size;
 838       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
 839     }
 840
 841   if (! pa)
 842     *nitems = 0;
 843   if (n - n0 < nitems_incr_min
 844       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
 845           || (0 <= nitems_max && nitems_max < n)
 846           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
 847     xalloc_die ();
 848   pa = xrealloc (pa, nbytes);
 849   *nitems = n;
 850   return pa;
 851 }
 852
 853 /* Ensure that the array addressed by PA holds at least I + 1 items.
 854    Either return PA, or reallocate the array and return its new address.
 855    Although PA may be null, the returned value is never null.
 856
 857    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 858    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 859    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 860    ITEM_SIZE is the size of one item; it must be positive.
 861    Avoid O(N**2) behavior on arrays growing linearly.  */
 862 static void *
 863 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
 864                ptrdiff_t nitems_max, idx_t item_size)
 865 {
 866   if (i < *nitems)
 867     return pa;
 868   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 869 }
 870
 871 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 872 static idx_t
 873 charclass_index (struct dfa *d, charclass const *s)
 874 {
 875   idx_t i;
 876
 877   for (i = 0; i < d->cindex; ++i)
 878     if (equal (s, &d->charclasses[i]))
 879       return i;
 880   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 881                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 882   ++d->cindex;
 883   d->charclasses[i] = *s;
 884   return i;
 885 }
 886
 887 static bool
 888 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 889 {
 890   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 891 }
 892
 893 static int
 894 char_context (struct dfa const *dfa, unsigned char c)
 895 {
 896   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 897     return CTX_NEWLINE;
 898   if (unibyte_word_constituent (dfa, c))
 899     return CTX_LETTER;
 900   return CTX_NONE;
 901 }
 902
 903 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 904    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 905    this may happen when folding case in weird Turkish locales where
 906    dotless i/dotted I are not included in the chosen character set.
 907    Return whether a bit was set in the charclass.  */
 908 static bool
 909 setbit_wc (wint_t wc, charclass *c)
 910 {
 911   int b = wctob (wc);
 912   if (b < 0)
 913     return false;
 914
 915   setbit (b, c);
 916   return true;
 917 }
 918
 919 /* Set a bit for B and its case variants in the charclass C.
 920    MB_CUR_MAX must be 1.  */
 921 static void
 922 setbit_case_fold_c (int b, charclass *c)
 923 {
 924   int ub = toupper (b);
 925   for (int i = 0; i < NOTCHAR; i++)
 926     if (toupper (i) == ub)
 927       setbit (i, c);
 928 }
 929
 930 /* Fetch the next lexical input character from the pattern.  There
 931    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 932    value of the character or to WEOF depending on whether the input is
 933    a valid multibyte character (possibly of length 1).  Then return
 934    the next input byte value, except return EOF if the input is a
 935    multibyte character of length greater than 1.  */
 936 static int
 937 fetch_wc (struct dfa *dfa)
 938 {
 939   int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 940                              dfa);
 941   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 942   dfa->lex.ptr += nbytes;
 943   dfa->lex.left -= nbytes;
 944   return c;
 945 }
 946
 947 /* If there is no more input, report an error about unbalanced brackets.
 948    Otherwise, behave as with fetch_wc (DFA).  */
 949 static int
 950 bracket_fetch_wc (struct dfa *dfa)
 951 {
 952   if (! dfa->lex.left)
 953     dfaerror (_("unbalanced ["));
 954   return fetch_wc (dfa);
 955 }
 956
 957 typedef int predicate (int);
 958
 959 /* The following list maps the names of the Posix named character classes
 960    to predicate functions that determine whether a given character is in
 961    the class.  The leading [ has already been eaten by the lexical
 962    analyzer.  */
 963 struct dfa_ctype
 964 {
 965   const char *name;
 966   predicate *func;
 967   bool single_byte_only;
 968 };
 969
 970 static const struct dfa_ctype prednames[] = {
 971   {"alpha", isalpha, false},
 972   {"upper", isupper, false},
 973   {"lower", islower, false},
 974   {"digit", isdigit, true},
 975   {"xdigit", isxdigit, false},
 976   {"space", isspace, false},
 977   {"punct", ispunct, false},
 978   {"alnum", isalnum, false},
 979   {"print", isprint, false},
 980   {"graph", isgraph, false},
 981   {"cntrl", iscntrl, false},
 982   {"blank", isblank, false},
 983   {NULL, NULL, false}
 984 };
 985
 986 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
 987 find_pred (const char *str)
 988 {
 989   for (int i = 0; prednames[i].name; i++)
 990     if (streq (str, prednames[i].name))
 991       return &prednames[i];
 992   return NULL;
 993 }
 994
 995 /* Parse a bracket expression, which possibly includes multibyte
 996    characters.  */
 997 static token
 998 parse_bracket_exp (struct dfa *dfa)
 999 {
1000   /* This is a bracket expression that dfaexec is known to
1001      process correctly.  */
1002   bool known_bracket_exp = true;
1003
1004   /* Used to warn about [:space:].
1005      Bit 0 = first character is a colon.
1006      Bit 1 = last character is a colon.
1007      Bit 2 = includes any other character but a colon.
1008      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
1009   int colon_warning_state;
1010
1011   dfa->lex.brack.nchars = 0;
1012   charclass ccl;
1013   zeroset (&ccl);
1014   int c = bracket_fetch_wc (dfa);
1015   bool invert = c == '^';
1016   if (invert)
1017     {
1018       c = bracket_fetch_wc (dfa);
1019       known_bracket_exp = dfa->localeinfo.simple;
1020     }
1021   wint_t wc = dfa->lex.wctok;
1022   int c1;
1023   wint_t wc1;
1024   colon_warning_state = (c == ':');
1025   do
1026     {
1027       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
1028       colon_warning_state &= ~2;
1029
1030       /* Note that if we're looking at some other [:...:] construct,
1031          we just treat it as a bunch of ordinary characters.  We can do
1032          this because we assume regex has checked for syntax errors before
1033          dfa is ever called.  */
1034       if (c == '[')
1035         {
1036           c1 = bracket_fetch_wc (dfa);
1037           wc1 = dfa->lex.wctok;
1038
1039           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1040               || c1 == '.' || c1 == '=')
1041             {
1042               enum { MAX_BRACKET_STRING_LEN = 32 };
1043               char str[MAX_BRACKET_STRING_LEN + 1];
1044               int len = 0;
1045               for (;;)
1046                 {
1047                   c = bracket_fetch_wc (dfa);
1048                   if (dfa->lex.left == 0
1049                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1050                     break;
1051                   if (len < MAX_BRACKET_STRING_LEN)
1052                     str[len++] = c;
1053                   else
1054                     /* This is in any case an invalid class name.  */
1055                     str[0] = '\0';
1056                 }
1057               str[len] = '\0';
1058
1059               /* Fetch bracket.  */
1060               c = bracket_fetch_wc (dfa);
1061               wc = dfa->lex.wctok;
1062               if (c1 == ':')
1063                 /* Build character class.  POSIX allows character
1064                    classes to match multicharacter collating elements,
1065                    but the regex code does not support that, so do not
1066                    worry about that possibility.  */
1067                 {
1068                   char const *class
1069                     = (dfa->syntax.case_fold && (streq (str, "upper")
1070                                                  || streq (str, "lower"))
1071                        ? "alpha" : str);
1072                   const struct dfa_ctype *pred = find_pred (class);
1073                   if (!pred)
1074                     dfaerror (_("invalid character class"));
1075
1076                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1077                     known_bracket_exp = false;
1078                   else
1079                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1080                       if (pred->func (c2))
1081                         setbit (c2, &ccl);
1082                 }
1083               else
1084                 known_bracket_exp = false;
1085
1086               colon_warning_state |= 8;
1087
1088               /* Fetch new lookahead character.  */
1089               c1 = bracket_fetch_wc (dfa);
1090               wc1 = dfa->lex.wctok;
1091               continue;
1092             }
1093
1094           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1095              are already set up.  */
1096         }
1097
1098       if (c == '\\'
1099           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1100         {
1101           c = bracket_fetch_wc (dfa);
1102           wc = dfa->lex.wctok;
1103         }
1104
1105       if (c1 == NOTCHAR)
1106         {
1107           c1 = bracket_fetch_wc (dfa);
1108           wc1 = dfa->lex.wctok;
1109         }
1110
1111       if (c1 == '-')
1112         /* build range characters.  */
1113         {
1114           int c2 = bracket_fetch_wc (dfa);
1115           wint_t wc2 = dfa->lex.wctok;
1116
1117           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1118              Treat it like [-a[.aa.]] while parsing it, and
1119              remember that the set is unknown.  */
1120           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1121             {
1122               known_bracket_exp = false;
1123               c2 = ']';
1124             }
1125
1126           if (c2 == ']')
1127             {
1128               /* In the case [x-], the - is an ordinary hyphen,
1129                  which is left in c1, the lookahead character.  */
1130               dfa->lex.ptr--;
1131               dfa->lex.left++;
1132             }
1133           else
1134             {
1135               if (c2 == '\\' && (dfa->syntax.syntax_bits
1136                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1137                 {
1138                   c2 = bracket_fetch_wc (dfa);
1139                   wc2 = dfa->lex.wctok;
1140                 }
1141
1142               colon_warning_state |= 8;
1143               c1 = bracket_fetch_wc (dfa);
1144               wc1 = dfa->lex.wctok;
1145
1146               /* Treat [x-y] as a range if x != y.  */
1147               if (wc != wc2 || wc == WEOF)
1148                 {
1149                   if (dfa->localeinfo.simple
1150                       || (isasciidigit (c) & isasciidigit (c2)))
1151                     {
1152                       for (int ci = c; ci <= c2; ci++)
1153                         if (dfa->syntax.case_fold && isalpha (ci))
1154                           setbit_case_fold_c (ci, &ccl);
1155                         else
1156                           setbit (ci, &ccl);
1157                     }
1158                   else
1159                     known_bracket_exp = false;
1160
1161                   continue;
1162                 }
1163             }
1164         }
1165
1166       colon_warning_state |= (c == ':') ? 2 : 4;
1167
1168       if (!dfa->localeinfo.multibyte)
1169         {
1170           if (dfa->syntax.case_fold && isalpha (c))
1171             setbit_case_fold_c (c, &ccl);
1172           else
1173             setbit (c, &ccl);
1174           continue;
1175         }
1176
1177       if (wc == WEOF)
1178         known_bracket_exp = false;
1179       else
1180         {
1181           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1182           int n = (dfa->syntax.case_fold
1183                    ? case_folded_counterparts (wc, folded + 1) + 1
1184                    : 1);
1185           folded[0] = wc;
1186           for (int i = 0; i < n; i++)
1187             if (!setbit_wc (folded[i], &ccl))
1188               {
1189                 dfa->lex.brack.chars
1190                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1191                                    &dfa->lex.brack.nchars_alloc, -1,
1192                                    sizeof *dfa->lex.brack.chars);
1193                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1194               }
1195         }
1196     }
1197   while ((wc = wc1, (c = c1) != ']'));
1198
1199   if (colon_warning_state == 7)
1200     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1201
1202   if (! known_bracket_exp)
1203     return BACKREF;
1204
1205   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1206     {
1207       dfa->lex.brack.invert = invert;
1208       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1209       return MBCSET;
1210     }
1211
1212   if (invert)
1213     {
1214       notset (&ccl);
1215       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1216         clrbit ('\n', &ccl);
1217     }
1218
1219   return CSET + charclass_index (dfa, &ccl);
1220 }
1221
1222 struct lexptr
1223 {
1224   char const *ptr;
1225   idx_t left;
1226 };
1227
1228 static void
1229 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1230 {
1231   ls->ptr = dfa->lex.ptr;
1232   ls->left = dfa->lex.left;
1233   dfa->lex.ptr = s;
1234   dfa->lex.left = strlen (s);
1235 }
1236
1237 static void
1238 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1239 {
1240   dfa->lex.ptr = ls->ptr;
1241   dfa->lex.left = ls->left;
1242 }
1243
1244 static token
1245 lex (struct dfa *dfa)
1246 {
1247   bool backslash = false;
1248
1249   /* Basic plan: We fetch a character.  If it's a backslash,
1250      we set the backslash flag and go through the loop again.
1251      On the plus side, this avoids having a duplicate of the
1252      main switch inside the backslash case.  On the minus side,
1253      it means that just about every case begins with
1254      "if (backslash) ...".  */
1255   for (int i = 0; i < 2; ++i)
1256     {
1257       if (! dfa->lex.left)
1258         return dfa->lex.lasttok = END;
1259       int c = fetch_wc (dfa);
1260
1261       switch (c)
1262         {
1263         case '\\':
1264           if (backslash)
1265             goto normal_char;
1266           if (dfa->lex.left == 0)
1267             dfaerror (_("unfinished \\ escape"));
1268           backslash = true;
1269           break;
1270
1271         case '^':
1272           if (backslash)
1273             goto normal_char;
1274           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1275               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1276               || dfa->lex.lasttok == OR)
1277             return dfa->lex.lasttok = BEGLINE;
1278           goto normal_char;
1279
1280         case '$':
1281           if (backslash)
1282             goto normal_char;
1283           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1284               || dfa->lex.left == 0
1285               || ((dfa->lex.left
1286                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1287                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1288                                    & (dfa->lex.ptr[0] == '\\')]
1289                       == ')'))
1290               || ((dfa->lex.left
1291                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1292                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1293                                    & (dfa->lex.ptr[0] == '\\')]
1294                       == '|'))
1295               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1296                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1297             return dfa->lex.lasttok = ENDLINE;
1298           goto normal_char;
1299
1300         case '1':
1301         case '2':
1302         case '3':
1303         case '4':
1304         case '5':
1305         case '6':
1306         case '7':
1307         case '8':
1308         case '9':
1309           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1310             {
1311               dfa->lex.laststart = false;
1312               return dfa->lex.lasttok = BACKREF;
1313             }
1314           goto normal_char;
1315
1316         case '`':
1317           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1318             {
1319               /* FIXME: should be beginning of string */
1320               return dfa->lex.lasttok = BEGLINE;
1321             }
1322           goto normal_char;
1323
1324         case '\'':
1325           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1326             {
1327               /* FIXME: should be end of string */
1328               return dfa->lex.lasttok = ENDLINE;
1329             }
1330           goto normal_char;
1331
1332         case '<':
1333           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1334             return dfa->lex.lasttok = BEGWORD;
1335           goto normal_char;
1336
1337         case '>':
1338           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1339             return dfa->lex.lasttok = ENDWORD;
1340           goto normal_char;
1341
1342         case 'b':
1343           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1344             return dfa->lex.lasttok = LIMWORD;
1345           goto normal_char;
1346
1347         case 'B':
1348           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1349             return dfa->lex.lasttok = NOTLIMWORD;
1350           goto normal_char;
1351
1352         case '?':
1353           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1354             goto normal_char;
1355           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1356             goto normal_char;
1357           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1358               && dfa->lex.laststart)
1359             goto normal_char;
1360           return dfa->lex.lasttok = QMARK;
1361
1362         case '*':
1363           if (backslash)
1364             goto normal_char;
1365           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1366               && dfa->lex.laststart)
1367             goto normal_char;
1368           return dfa->lex.lasttok = STAR;
1369
1370         case '+':
1371           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1372             goto normal_char;
1373           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1374             goto normal_char;
1375           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1376               && dfa->lex.laststart)
1377             goto normal_char;
1378           return dfa->lex.lasttok = PLUS;
1379
1380         case '{':
1381           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1382             goto normal_char;
1383           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1384             goto normal_char;
1385           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1386               && dfa->lex.laststart)
1387             goto normal_char;
1388
1389           /* Cases:
1390              {M} - exact count
1391              {M,} - minimum count, maximum is infinity
1392              {,N} - 0 through N
1393              {,} - 0 to infinity (same as '*')
1394              {M,N} - M through N */
1395           {
1396             char const *p = dfa->lex.ptr;
1397             char const *lim = p + dfa->lex.left;
1398             dfa->lex.minrep = dfa->lex.maxrep = -1;
1399             for (; p != lim && isasciidigit (*p); p++)
1400               dfa->lex.minrep = (dfa->lex.minrep < 0
1401                                  ? *p - '0'
1402                                  : MIN (RE_DUP_MAX + 1,
1403                                         dfa->lex.minrep * 10 + *p - '0'));
1404             if (p != lim)
1405               {
1406                 if (*p != ',')
1407                   dfa->lex.maxrep = dfa->lex.minrep;
1408                 else
1409                   {
1410                     if (dfa->lex.minrep < 0)
1411                       dfa->lex.minrep = 0;
1412                     while (++p != lim && isasciidigit (*p))
1413                       dfa->lex.maxrep
1414                         = (dfa->lex.maxrep < 0
1415                            ? *p - '0'
1416                            : MIN (RE_DUP_MAX + 1,
1417                                   dfa->lex.maxrep * 10 + *p - '0'));
1418                   }
1419               }
1420             if (! ((! backslash || (p != lim && *p++ == '\\'))
1421                    && p != lim && *p++ == '}'
1422                    && 0 <= dfa->lex.minrep
1423                    && (dfa->lex.maxrep < 0
1424                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1425               {
1426                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1427                   goto normal_char;
1428                 dfaerror (_("invalid content of \\{\\}"));
1429               }
1430             if (RE_DUP_MAX < dfa->lex.maxrep)
1431               dfaerror (_("regular expression too big"));
1432             dfa->lex.ptr = p;
1433             dfa->lex.left = lim - p;
1434           }
1435           dfa->lex.laststart = false;
1436           return dfa->lex.lasttok = REPMN;
1437
1438         case '|':
1439           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1440             goto normal_char;
1441           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1442             goto normal_char;
1443           dfa->lex.laststart = true;
1444           return dfa->lex.lasttok = OR;
1445
1446         case '\n':
1447           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1448               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1449             goto normal_char;
1450           dfa->lex.laststart = true;
1451           return dfa->lex.lasttok = OR;
1452
1453         case '(':
1454           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1455             goto normal_char;
1456           dfa->lex.parens++;
1457           dfa->lex.laststart = true;
1458           return dfa->lex.lasttok = LPAREN;
1459
1460         case ')':
1461           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1462             goto normal_char;
1463           if (dfa->lex.parens == 0
1464               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1465             goto normal_char;
1466           dfa->lex.parens--;
1467           dfa->lex.laststart = false;
1468           return dfa->lex.lasttok = RPAREN;
1469
1470         case '.':
1471           if (backslash)
1472             goto normal_char;
1473           if (dfa->canychar < 0)
1474             {
1475               charclass ccl;
1476               fillset (&ccl);
1477               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1478                 clrbit ('\n', &ccl);
1479               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1480                 clrbit ('\0', &ccl);
1481               if (dfa->localeinfo.multibyte)
1482                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1483                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1484                     clrbit (c2, &ccl);
1485               dfa->canychar = charclass_index (dfa, &ccl);
1486             }
1487           dfa->lex.laststart = false;
1488           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1489                                      ? ANYCHAR
1490                                      : CSET + dfa->canychar);
1491
1492         case 's':
1493         case 'S':
1494           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1495             goto normal_char;
1496           if (!dfa->localeinfo.multibyte)
1497             {
1498               charclass ccl;
1499               zeroset (&ccl);
1500               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1501                 if (isspace (c2))
1502                   setbit (c2, &ccl);
1503               if (c == 'S')
1504                 notset (&ccl);
1505               dfa->lex.laststart = false;
1506               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1507             }
1508
1509           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1510              add_utf8_anychar, makes sense.  */
1511
1512           /* \s and \S are documented to be equivalent to [[:space:]] and
1513              [^[:space:]] respectively, so tell the lexer to process those
1514              strings, each minus its "already processed" '['.  */
1515           {
1516             struct lexptr ls;
1517             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1518             dfa->lex.lasttok = parse_bracket_exp (dfa);
1519             pop_lex_state (dfa, &ls);
1520           }
1521
1522           dfa->lex.laststart = false;
1523           return dfa->lex.lasttok;
1524
1525         case 'w':
1526         case 'W':
1527           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1528             goto normal_char;
1529
1530           if (!dfa->localeinfo.multibyte)
1531             {
1532               charclass ccl;
1533               zeroset (&ccl);
1534               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1535                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1536                   setbit (c2, &ccl);
1537               if (c == 'W')
1538                 notset (&ccl);
1539               dfa->lex.laststart = false;
1540               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1541             }
1542
1543           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1544              add_utf8_anychar, makes sense.  */
1545
1546           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1547              [^_[:alnum:]] respectively, so tell the lexer to process those
1548              strings, each minus its "already processed" '['.  */
1549           {
1550             struct lexptr ls;
1551             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1552             dfa->lex.lasttok = parse_bracket_exp (dfa);
1553             pop_lex_state (dfa, &ls);
1554           }
1555
1556           dfa->lex.laststart = false;
1557           return dfa->lex.lasttok;
1558
1559         case '[':
1560           if (backslash)
1561             goto normal_char;
1562           dfa->lex.laststart = false;
1563           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1564
1565         default:
1566         normal_char:
1567           dfa->lex.laststart = false;
1568           /* For multibyte character sets, folding is done in atom.  Always
1569              return WCHAR.  */
1570           if (dfa->localeinfo.multibyte)
1571             return dfa->lex.lasttok = WCHAR;
1572
1573           if (dfa->syntax.case_fold && isalpha (c))
1574             {
1575               charclass ccl;
1576               zeroset (&ccl);
1577               setbit_case_fold_c (c, &ccl);
1578               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1579             }
1580
1581           return dfa->lex.lasttok = c;
1582         }
1583     }
1584
1585   /* The above loop should consume at most a backslash
1586      and some other character.  */
1587   abort ();
1588   return END;                   /* keeps pedantic compilers happy.  */
1589 }
1590
1591 static void
1592 addtok_mb (struct dfa *dfa, token t, char mbprop)
1593 {
1594   if (dfa->talloc == dfa->tindex)
1595     {
1596       dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1597                              sizeof *dfa->tokens);
1598       if (dfa->localeinfo.multibyte)
1599         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1600                                          sizeof *dfa->multibyte_prop);
1601     }
1602   if (dfa->localeinfo.multibyte)
1603     dfa->multibyte_prop[dfa->tindex] = mbprop;
1604   dfa->tokens[dfa->tindex++] = t;
1605
1606   switch (t)
1607     {
1608     case QMARK:
1609     case STAR:
1610     case PLUS:
1611       break;
1612
1613     case CAT:
1614     case OR:
1615       dfa->parse.depth--;
1616       break;
1617
1618     case BACKREF:
1619       dfa->fast = false;
1620       FALLTHROUGH;
1621     default:
1622       dfa->nleaves++;
1623       FALLTHROUGH;
1624     case EMPTY:
1625       dfa->parse.depth++;
1626       break;
1627     }
1628   if (dfa->parse.depth > dfa->depth)
1629     dfa->depth = dfa->parse.depth;
1630 }
1631
1632 static void addtok_wc (struct dfa *dfa, wint_t wc);
1633
1634 /* Add the given token to the parse tree, maintaining the depth count and
1635    updating the maximum depth if necessary.  */
1636 static void
1637 addtok (struct dfa *dfa, token t)
1638 {
1639   if (dfa->localeinfo.multibyte && t == MBCSET)
1640     {
1641       bool need_or = false;
1642
1643       /* Extract wide characters into alternations for better performance.
1644          This does not require UTF-8.  */
1645       for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1646         {
1647           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1648           if (need_or)
1649             addtok (dfa, OR);
1650           need_or = true;
1651         }
1652       dfa->lex.brack.nchars = 0;
1653
1654       /* Wide characters have been handled above, so it is possible
1655          that the set is empty now.  Do nothing in that case.  */
1656       if (dfa->lex.brack.cset != -1)
1657         {
1658           addtok (dfa, CSET + dfa->lex.brack.cset);
1659           if (need_or)
1660             addtok (dfa, OR);
1661         }
1662     }
1663   else
1664     {
1665       addtok_mb (dfa, t, 3);
1666     }
1667 }
1668
1669 /* We treat a multibyte character as a single atom, so that DFA
1670    can treat a multibyte character as a single expression.
1671
1672    e.g., we construct the following tree from "<mb1><mb2>".
1673    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1674    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1675 static void
1676 addtok_wc (struct dfa *dfa, wint_t wc)
1677 {
1678   unsigned char buf[MB_LEN_MAX];
1679   mbstate_t s = { 0 };
1680   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1681   int buflen;
1682
1683   if (stored_bytes != (size_t) -1)
1684     buflen = stored_bytes;
1685   else
1686     {
1687       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1688          the addtok_mb call altogether can corrupt the heap.  */
1689       buflen = 1;
1690       buf[0] = 0;
1691     }
1692
1693   addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1694   for (int i = 1; i < buflen; i++)
1695     {
1696       addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1697       addtok (dfa, CAT);
1698     }
1699 }
1700
1701 static void
1702 add_utf8_anychar (struct dfa *dfa)
1703 {
1704   /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1705      UTF-8 byte sequence has been defined as follows:
1706
1707      ([\x00-\x7f]
1708      |[\xc2-\xdf][\x80-\xbf]
1709      |[\xe0][\xa0-\xbf][\x80-\xbf]
1710      |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1711      |[\xed][\x80-\x9f][\x80-\xbf]
1712      |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1713      |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1714      |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1715
1716      which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1717      where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1718      D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1719      H = [\x80-\x9f], I = [\xf0],
1720      J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1721
1722      This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C".  */
1723
1724   /* Mnemonics for classes containing two or more bytes.  */
1725   enum { A, B, C, E, F, H, J, K, M };
1726
1727   /* Mnemonics for single-byte tokens.  */
1728   enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1729
1730   static charclass const utf8_classes[] = {
1731     /* A. 00-7f: 1-byte sequence.  */
1732     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1733
1734     /* B. c2-df: 1st byte of a 2-byte sequence.  */
1735     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1736
1737     /* C. 80-bf: non-leading bytes.  */
1738     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1739
1740     /* D. e0 (just a token).  */
1741
1742     /* E. a0-bf: 2nd byte of a "DEC" sequence.  */
1743     CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1744
1745     /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence.  */
1746     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1747
1748     /* G. ed (just a token).  */
1749
1750     /* H. 80-9f: 2nd byte of a "GHC" sequence.  */
1751     CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1752
1753     /* I. f0 (just a token).  */
1754
1755     /* J. 90-bf: 2nd byte of an "IJCC" sequence.  */
1756     CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1757
1758     /* K. f1-f3: 1st byte of a "KCCC" sequence.  */
1759     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1760
1761     /* L. f4 (just a token).  */
1762
1763     /* M. 80-8f: 2nd byte of a "LMCC" sequence.  */
1764     CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1765   };
1766
1767   /* Define the character classes that are needed below.  */
1768   if (dfa->utf8_anychar_classes[0] == 0)
1769     {
1770       charclass c = utf8_classes[0];
1771       if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1772         clrbit ('\n', &c);
1773       if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1774         clrbit ('\0', &c);
1775       dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1776
1777       for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1778         dfa->utf8_anychar_classes[i]
1779           = CSET + charclass_index (dfa, &utf8_classes[i]);
1780     }
1781
1782   /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1783      The token buffer is in reverse Polish order, so we get
1784      "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1785       C CAT OR C CAT OR C CAT OR".  */
1786   addtok (dfa, dfa->utf8_anychar_classes[A]);
1787   addtok (dfa, dfa->utf8_anychar_classes[B]);
1788   addtok (dfa, D_token);
1789   addtok (dfa, dfa->utf8_anychar_classes[E]);
1790   addtok (dfa, CAT);
1791   addtok (dfa, OR);
1792   addtok (dfa, G_token);
1793   addtok (dfa, dfa->utf8_anychar_classes[H]);
1794   addtok (dfa, CAT);
1795   addtok (dfa, OR);
1796   addtok (dfa, dfa->utf8_anychar_classes[F]);
1797   addtok (dfa, I_token);
1798   addtok (dfa, dfa->utf8_anychar_classes[J]);
1799   addtok (dfa, CAT);
1800   addtok (dfa, OR);
1801   addtok (dfa, L_token);
1802   addtok (dfa, dfa->utf8_anychar_classes[M]);
1803   addtok (dfa, CAT);
1804   addtok (dfa, OR);
1805   addtok (dfa, dfa->utf8_anychar_classes[K]);
1806   for (int i = 0; i < 3; i++)
1807     {
1808       addtok (dfa, dfa->utf8_anychar_classes[C]);
1809       addtok (dfa, CAT);
1810       addtok (dfa, OR);
1811     }
1812 }
1813
1814 /* The grammar understood by the parser is as follows.
1815
1816    regexp:
1817      regexp OR branch
1818      branch
1819
1820    branch:
1821      branch closure
1822      closure
1823
1824    closure:
1825      closure QMARK
1826      closure STAR
1827      closure PLUS
1828      closure REPMN
1829      atom
1830
1831    atom:
1832      <normal character>
1833      <multibyte character>
1834      ANYCHAR
1835      MBCSET
1836      CSET
1837      BACKREF
1838      BEGLINE
1839      ENDLINE
1840      BEGWORD
1841      ENDWORD
1842      LIMWORD
1843      NOTLIMWORD
1844      LPAREN regexp RPAREN
1845      <empty>
1846
1847    The parser builds a parse tree in postfix form in an array of tokens.  */
1848
1849 static void
1850 atom (struct dfa *dfa)
1851 {
1852   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1853       || dfa->parse.tok >= CSET
1854       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1855       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1856       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1857       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1858       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1859     {
1860       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1861         {
1862           /* For UTF-8 expand the period to a series of CSETs that define a
1863              valid UTF-8 character.  This avoids using the slow multibyte
1864              path.  I'm pretty sure it would be both profitable and correct to
1865              do it for any encoding; however, the optimization must be done
1866              manually as it is done above in add_utf8_anychar.  So, let's
1867              start with UTF-8: it is the most used, and the structure of the
1868              encoding makes the correctness more obvious.  */
1869           add_utf8_anychar (dfa);
1870         }
1871       else
1872         addtok (dfa, dfa->parse.tok);
1873       dfa->parse.tok = lex (dfa);
1874     }
1875   else if (dfa->parse.tok == WCHAR)
1876     {
1877       if (dfa->lex.wctok == WEOF)
1878         addtok (dfa, BACKREF);
1879       else
1880         {
1881           addtok_wc (dfa, dfa->lex.wctok);
1882
1883           if (dfa->syntax.case_fold)
1884             {
1885               wchar_t folded[CASE_FOLDED_BUFSIZE];
1886               int n = case_folded_counterparts (dfa->lex.wctok, folded);
1887               for (int i = 0; i < n; i++)
1888                 {
1889                   addtok_wc (dfa, folded[i]);
1890                   addtok (dfa, OR);
1891                 }
1892             }
1893         }
1894
1895       dfa->parse.tok = lex (dfa);
1896     }
1897   else if (dfa->parse.tok == LPAREN)
1898     {
1899       dfa->parse.tok = lex (dfa);
1900       regexp (dfa);
1901       if (dfa->parse.tok != RPAREN)
1902         dfaerror (_("unbalanced ("));
1903       dfa->parse.tok = lex (dfa);
1904     }
1905   else
1906     addtok (dfa, EMPTY);
1907 }
1908
1909 /* Return the number of tokens in the given subexpression.  */
1910 static idx_t _GL_ATTRIBUTE_PURE
1911 nsubtoks (struct dfa const *dfa, idx_t tindex)
1912 {
1913   switch (dfa->tokens[tindex - 1])
1914     {
1915     default:
1916       return 1;
1917     case QMARK:
1918     case STAR:
1919     case PLUS:
1920       return 1 + nsubtoks (dfa, tindex - 1);
1921     case CAT:
1922     case OR:
1923       {
1924         idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1925         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1926       }
1927     }
1928 }
1929
1930 /* Copy the given subexpression to the top of the tree.  */
1931 static void
1932 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1933 {
1934   if (dfa->localeinfo.multibyte)
1935     for (idx_t i = 0; i < ntokens; i++)
1936       addtok_mb (dfa, dfa->tokens[tindex + i],
1937                  dfa->multibyte_prop[tindex + i]);
1938   else
1939     for (idx_t i = 0; i < ntokens; i++)
1940       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1941 }
1942
1943 static void
1944 closure (struct dfa *dfa)
1945 {
1946   atom (dfa);
1947   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1948          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1949     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1950       {
1951         idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1952         idx_t tindex = dfa->tindex - ntokens;
1953         if (dfa->lex.maxrep < 0)
1954           addtok (dfa, PLUS);
1955         if (dfa->lex.minrep == 0)
1956           addtok (dfa, QMARK);
1957         int i;
1958         for (i = 1; i < dfa->lex.minrep; i++)
1959           {
1960             copytoks (dfa, tindex, ntokens);
1961             addtok (dfa, CAT);
1962           }
1963         for (; i < dfa->lex.maxrep; i++)
1964           {
1965             copytoks (dfa, tindex, ntokens);
1966             addtok (dfa, QMARK);
1967             addtok (dfa, CAT);
1968           }
1969         dfa->parse.tok = lex (dfa);
1970       }
1971     else if (dfa->parse.tok == REPMN)
1972       {
1973         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1974         dfa->parse.tok = lex (dfa);
1975         closure (dfa);
1976       }
1977     else
1978       {
1979         addtok (dfa, dfa->parse.tok);
1980         dfa->parse.tok = lex (dfa);
1981       }
1982 }
1983
1984 static void
1985 branch (struct dfa* dfa)
1986 {
1987   closure (dfa);
1988   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1989          && dfa->parse.tok >= 0)
1990     {
1991       closure (dfa);
1992       addtok (dfa, CAT);
1993     }
1994 }
1995
1996 static void
1997 regexp (struct dfa *dfa)
1998 {
1999   branch (dfa);
2000   while (dfa->parse.tok == OR)
2001     {
2002       dfa->parse.tok = lex (dfa);
2003       branch (dfa);
2004       addtok (dfa, OR);
2005     }
2006 }
2007
2008 /* Parse a string S of length LEN into D.  S can include NUL characters.
2009    This is the main entry point for the parser.  */
2010 void
2011 dfaparse (char const *s, idx_t len, struct dfa *d)
2012 {
2013   d->lex.ptr = s;
2014   d->lex.left = len;
2015   d->lex.lasttok = END;
2016   d->lex.laststart = true;
2017
2018   if (!d->syntax.syntax_bits_set)
2019     dfaerror (_("no syntax specified"));
2020
2021   if (!d->nregexps)
2022     addtok (d, BEG);
2023
2024   d->parse.tok = lex (d);
2025   d->parse.depth = d->depth;
2026
2027   regexp (d);
2028
2029   if (d->parse.tok != END)
2030     dfaerror (_("unbalanced )"));
2031
2032   addtok (d, END - d->nregexps);
2033   addtok (d, CAT);
2034
2035   if (d->nregexps)
2036     addtok (d, OR);
2037
2038   ++d->nregexps;
2039 }
2040
2041 /* Some primitives for operating on sets of positions.  */
2042
2043 /* Copy one set to another.  */
2044 static void
2045 copy (position_set const *src, position_set *dst)
2046 {
2047   if (dst->alloc < src->nelem)
2048     {
2049       free (dst->elems);
2050       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2051                             sizeof *dst->elems);
2052     }
2053   dst->nelem = src->nelem;
2054   if (src->nelem != 0)
2055     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2056 }
2057
2058 static void
2059 alloc_position_set (position_set *s, idx_t size)
2060 {
2061   s->elems = xnmalloc (size, sizeof *s->elems);
2062   s->alloc = size;
2063   s->nelem = 0;
2064 }
2065
2066 /* Insert position P in set S.  S is maintained in sorted order on
2067    decreasing index.  If there is already an entry in S with P.index
2068    then merge (logically-OR) P's constraints into the one in S.
2069    S->elems must point to an array large enough to hold the resulting set.  */
2070 static void
2071 insert (position p, position_set *s)
2072 {
2073   idx_t count = s->nelem;
2074   idx_t lo = 0, hi = count;
2075   while (lo < hi)
2076     {
2077       idx_t mid = (lo + hi) >> 1;
2078       if (s->elems[mid].index < p.index)
2079         lo = mid + 1;
2080       else if (s->elems[mid].index == p.index)
2081         {
2082           s->elems[mid].constraint |= p.constraint;
2083           return;
2084         }
2085       else
2086         hi = mid;
2087     }
2088
2089   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2090   for (idx_t i = count; i > lo; i--)
2091     s->elems[i] = s->elems[i - 1];
2092   s->elems[lo] = p;
2093   ++s->nelem;
2094 }
2095
2096 static void
2097 append (position p, position_set *s)
2098 {
2099   idx_t count = s->nelem;
2100   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2101   s->elems[s->nelem++] = p;
2102 }
2103
2104 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2105    result is as if the positions of S1, and of S2 with the additional
2106    constraint C2, were inserted into an initially empty set.  */
2107 static void
2108 merge_constrained (position_set const *s1, position_set const *s2,
2109                    unsigned int c2, position_set *m)
2110 {
2111   idx_t i = 0, j = 0;
2112
2113   if (m->alloc - s1->nelem < s2->nelem)
2114     {
2115       free (m->elems);
2116       m->alloc = s1->nelem;
2117       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2118     }
2119   m->nelem = 0;
2120   while (i < s1->nelem || j < s2->nelem)
2121     if (! (j < s2->nelem)
2122         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2123       {
2124         unsigned int c = ((i < s1->nelem && j < s2->nelem
2125                            && s1->elems[i].index == s2->elems[j].index)
2126                           ? s2->elems[j++].constraint & c2
2127                           : 0);
2128         m->elems[m->nelem].index = s1->elems[i].index;
2129         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2130       }
2131     else
2132       {
2133         if (s2->elems[j].constraint & c2)
2134           {
2135             m->elems[m->nelem].index = s2->elems[j].index;
2136             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2137           }
2138         j++;
2139       }
2140 }
2141
2142 /* Merge two sets of positions into a third.  The result is exactly as if
2143    the positions of both sets were inserted into an initially empty set.  */
2144 static void
2145 merge (position_set const *s1, position_set const *s2, position_set *m)
2146 {
2147   merge_constrained (s1, s2, -1, m);
2148 }
2149
2150 static void
2151 merge2 (position_set *dst, position_set const *src, position_set *m)
2152 {
2153   if (src->nelem < 4)
2154     {
2155       for (idx_t i = 0; i < src->nelem; i++)
2156         insert (src->elems[i], dst);
2157     }
2158    else
2159     {
2160       merge (src, dst, m);
2161       copy (m, dst);
2162     }
2163 }
2164
2165 /* Delete a position from a set.  Return the nonzero constraint of the
2166    deleted position, or zero if there was no such position.  */
2167 static unsigned int
2168 delete (idx_t del, position_set *s)
2169 {
2170   idx_t count = s->nelem;
2171   idx_t lo = 0, hi = count;
2172   while (lo < hi)
2173     {
2174       idx_t mid = (lo + hi) >> 1;
2175       if (s->elems[mid].index < del)
2176         lo = mid + 1;
2177       else if (s->elems[mid].index == del)
2178         {
2179           unsigned int c = s->elems[mid].constraint;
2180           idx_t i;
2181           for (i = mid; i + 1 < count; i++)
2182             s->elems[i] = s->elems[i + 1];
2183           s->nelem = i;
2184           return c;
2185         }
2186       else
2187         hi = mid;
2188     }
2189   return 0;
2190 }
2191
2192 /* Replace a position with the followed set.  */
2193 static void
2194 replace (position_set *dst, idx_t del, position_set *add,
2195          unsigned int constraint, position_set *tmp)
2196 {
2197   unsigned int c = delete (del, dst) & constraint;
2198
2199   if (c)
2200     {
2201       copy (dst, tmp);
2202       merge_constrained (tmp, add, c, dst);
2203     }
2204 }
2205
2206 /* Find the index of the state corresponding to the given position set with
2207    the given preceding context, or create a new state if there is no such
2208    state.  Context tells whether we got here on a newline or letter.  */
2209 static state_num
2210 state_index (struct dfa *d, position_set const *s, int context)
2211 {
2212   size_t hash = 0;
2213   int constraint = 0;
2214   state_num i;
2215   token first_end = 0;
2216
2217   for (i = 0; i < s->nelem; ++i)
2218     {
2219       size_t ind = s->elems[i].index;
2220       hash ^= ind + s->elems[i].constraint;
2221     }
2222
2223   /* Try to find a state that exactly matches the proposed one.  */
2224   for (i = 0; i < d->sindex; ++i)
2225     {
2226       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2227           || context != d->states[i].context)
2228         continue;
2229       state_num j;
2230       for (j = 0; j < s->nelem; ++j)
2231         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2232             || s->elems[j].index != d->states[i].elems.elems[j].index)
2233           break;
2234       if (j == s->nelem)
2235         return i;
2236     }
2237
2238 #ifdef DEBUG
2239   fprintf (stderr, "new state %td\n nextpos:", i);
2240   for (state_num j = 0; j < s->nelem; j++)
2241     {
2242       fprintf (stderr, " %td:", s->elems[j].index);
2243       prtok (d->tokens[s->elems[j].index]);
2244     }
2245   fprintf (stderr, "\n context:");
2246   if (context ^ CTX_ANY)
2247     {
2248       if (context & CTX_NONE)
2249         fprintf (stderr, " CTX_NONE");
2250       if (context & CTX_LETTER)
2251         fprintf (stderr, " CTX_LETTER");
2252       if (context & CTX_NEWLINE)
2253         fprintf (stderr, " CTX_NEWLINE");
2254     }
2255   else
2256     fprintf (stderr, " CTX_ANY");
2257   fprintf (stderr, "\n");
2258 #endif
2259
2260   for (state_num j = 0; j < s->nelem; j++)
2261     {
2262       int c = d->constraints[s->elems[j].index];
2263
2264       if (c != 0)
2265         {
2266           if (succeeds_in_context (c, context, CTX_ANY))
2267             constraint |= c;
2268           if (!first_end)
2269             first_end = d->tokens[s->elems[j].index];
2270         }
2271       else if (d->tokens[s->elems[j].index] == BACKREF)
2272         constraint = NO_CONSTRAINT;
2273     }
2274
2275
2276   /* Create a new state.  */
2277   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2278                              sizeof *d->states);
2279   d->states[i].hash = hash;
2280   alloc_position_set (&d->states[i].elems, s->nelem);
2281   copy (s, &d->states[i].elems);
2282   d->states[i].context = context;
2283   d->states[i].constraint = constraint;
2284   d->states[i].first_end = first_end;
2285   d->states[i].mbps.nelem = 0;
2286   d->states[i].mbps.elems = NULL;
2287   d->states[i].mb_trindex = -1;
2288
2289   ++d->sindex;
2290
2291   return i;
2292 }
2293
2294 /* Find the epsilon closure of a set of positions.  If any position of the set
2295    contains a symbol that matches the empty string in some context, replace
2296    that position with the elements of its follow labeled with an appropriate
2297    constraint.  Repeat exhaustively until no funny positions are left.
2298    S->elems must be large enough to hold the result.  */
2299 static void
2300 epsclosure (struct dfa const *d)
2301 {
2302   position_set tmp;
2303   alloc_position_set (&tmp, d->nleaves);
2304   for (idx_t i = 0; i < d->tindex; i++)
2305     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2306         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2307         && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2308       {
2309         unsigned int constraint;
2310         switch (d->tokens[i])
2311           {
2312           case BEGLINE:
2313             constraint = BEGLINE_CONSTRAINT;
2314             break;
2315           case ENDLINE:
2316             constraint = ENDLINE_CONSTRAINT;
2317             break;
2318           case BEGWORD:
2319             constraint = BEGWORD_CONSTRAINT;
2320             break;
2321           case ENDWORD:
2322             constraint = ENDWORD_CONSTRAINT;
2323             break;
2324           case LIMWORD:
2325             constraint = LIMWORD_CONSTRAINT;
2326             break;
2327           case NOTLIMWORD:
2328             constraint = NOTLIMWORD_CONSTRAINT;
2329             break;
2330           default:
2331             constraint = NO_CONSTRAINT;
2332             break;
2333           }
2334
2335         delete (i, &d->follows[i]);
2336
2337         for (idx_t j = 0; j < d->tindex; j++)
2338           if (i != j && d->follows[j].nelem > 0)
2339             replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2340       }
2341   free (tmp.elems);
2342 }
2343
2344 /* Returns the set of contexts for which there is at least one
2345    character included in C.  */
2346
2347 static int
2348 charclass_context (struct dfa const *dfa, charclass const *c)
2349 {
2350   int context = 0;
2351
2352   for (int j = 0; j < CHARCLASS_WORDS; j++)
2353     {
2354       if (c->w[j] & dfa->syntax.newline.w[j])
2355         context |= CTX_NEWLINE;
2356       if (c->w[j] & dfa->syntax.letters.w[j])
2357         context |= CTX_LETTER;
2358       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2359         context |= CTX_NONE;
2360     }
2361
2362   return context;
2363 }
2364
2365 /* Returns the contexts on which the position set S depends.  Each context
2366    in the set of returned contexts (let's call it SC) may have a different
2367    follow set than other contexts in SC, and also different from the
2368    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2369    in the complement set will have the same follow set.  */
2370
2371 static int _GL_ATTRIBUTE_PURE
2372 state_separate_contexts (struct dfa *d, position_set const *s)
2373 {
2374   int separate_contexts = 0;
2375
2376   for (idx_t j = 0; j < s->nelem; j++)
2377     separate_contexts |= d->separates[s->elems[j].index];
2378
2379   return separate_contexts;
2380 }
2381
2382 enum
2383 {
2384   /* Single token is repeated.  It is distinguished from non-repeated.  */
2385   OPT_REPEAT = (1 << 0),
2386
2387   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2388      node is not merged.  */
2389   OPT_LPAREN = (1 << 1),
2390
2391   /* Multiple branches are joined.  The node is not merged.  */
2392   OPT_RPAREN = (1 << 2),
2393
2394   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2395      flag is turned on. */
2396   OPT_WALKED = (1 << 3),
2397
2398   /* The node is queued.  The node is not queued again.  */
2399   OPT_QUEUED = (1 << 4)
2400 };
2401
2402 static void
2403 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2404                  position_set *merged)
2405 {
2406   position_set *follows = d->follows;
2407   idx_t nelem = 0;
2408
2409   d->constraints[tindex] = 0;
2410
2411   for (idx_t i = 0; i < follows[tindex].nelem; i++)
2412     {
2413       idx_t sindex = follows[tindex].elems[i].index;
2414
2415       /* Skip the node as pruned in future.  */
2416       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2417       if (iconstraint == 0)
2418         continue;
2419
2420       if (d->tokens[follows[tindex].elems[i].index] <= END)
2421         {
2422           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2423           continue;
2424         }
2425
2426       if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2427         {
2428           idx_t j;
2429
2430           for (j = 0; j < nelem; j++)
2431             {
2432               idx_t dindex = follows[tindex].elems[j].index;
2433
2434               if (follows[tindex].elems[j].constraint != iconstraint)
2435                 continue;
2436
2437               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2438                 continue;
2439
2440               if (d->tokens[sindex] != d->tokens[dindex])
2441                 continue;
2442
2443               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2444                 continue;
2445
2446               if (flags[sindex] & OPT_REPEAT)
2447                 delete (sindex, &follows[sindex]);
2448
2449               merge2 (&follows[dindex], &follows[sindex], merged);
2450
2451               break;
2452             }
2453
2454           if (j < nelem)
2455             continue;
2456         }
2457
2458       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2459       flags[sindex] |= OPT_QUEUED;
2460     }
2461
2462   follows[tindex].nelem = nelem;
2463 }
2464
2465 static int
2466 compare (const void *a, const void *b)
2467 {
2468   position const *p = a, *q = b;
2469   return (p->index > q->index) - (p->index < q->index);
2470 }
2471
2472 static void
2473 reorder_tokens (struct dfa *d)
2474 {
2475   idx_t nleaves;
2476   ptrdiff_t *map;
2477   token *tokens;
2478   position_set *follows;
2479   int *constraints;
2480   char *multibyte_prop;
2481
2482   nleaves = 0;
2483
2484   map = xnmalloc (d->tindex, sizeof *map);
2485
2486   map[0] = nleaves++;
2487
2488   for (idx_t i = 1; i < d->tindex; i++)
2489     map[i] = -1;
2490
2491   tokens = xnmalloc (d->nleaves, sizeof *tokens);
2492   follows = xnmalloc (d->nleaves, sizeof *follows);
2493   constraints = xnmalloc (d->nleaves, sizeof *constraints);
2494
2495   if (d->localeinfo.multibyte)
2496     multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2497   else
2498     multibyte_prop = NULL;
2499
2500   for (idx_t i = 0; i < d->tindex; i++)
2501     {
2502       if (map[i] == -1)
2503         {
2504           free (d->follows[i].elems);
2505           d->follows[i].elems = NULL;
2506           d->follows[i].nelem = 0;
2507           continue;
2508         }
2509
2510       tokens[map[i]] = d->tokens[i];
2511       follows[map[i]] = d->follows[i];
2512       constraints[map[i]] = d->constraints[i];
2513
2514       if (multibyte_prop != NULL)
2515         multibyte_prop[map[i]] = d->multibyte_prop[i];
2516
2517       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2518         {
2519           if (map[d->follows[i].elems[j].index] == -1)
2520             map[d->follows[i].elems[j].index] = nleaves++;
2521
2522           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2523         }
2524
2525       qsort (d->follows[i].elems, d->follows[i].nelem,
2526              sizeof *d->follows[i].elems, compare);
2527     }
2528
2529   for (idx_t i = 0; i < nleaves; i++)
2530     {
2531       d->tokens[i] = tokens[i];
2532       d->follows[i] = follows[i];
2533       d->constraints[i] = constraints[i];
2534
2535       if (multibyte_prop != NULL)
2536         d->multibyte_prop[i] = multibyte_prop[i];
2537     }
2538
2539   d->tindex = d->nleaves = nleaves;
2540
2541   free (tokens);
2542   free (follows);
2543   free (constraints);
2544   free (multibyte_prop);
2545   free (map);
2546 }
2547
2548 static void
2549 dfaoptimize (struct dfa *d)
2550 {
2551   char *flags = xzalloc (d->tindex);
2552
2553   for (idx_t i = 0; i < d->tindex; i++)
2554     {
2555       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2556         {
2557           if (d->follows[i].elems[j].index == i)
2558             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2559           else if (d->follows[i].elems[j].index < i)
2560             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2561           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2562             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2563           else
2564             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2565         }
2566     }
2567
2568   flags[0] |= OPT_QUEUED;
2569
2570   position_set merged0;
2571   position_set *merged = &merged0;
2572   alloc_position_set (merged, d->nleaves);
2573
2574   d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2575
2576   for (idx_t i = 0; i < d->tindex; i++)
2577     if (flags[i] & OPT_QUEUED)
2578       merge_nfa_state (d, i, flags, merged);
2579
2580   reorder_tokens (d);
2581
2582   free (merged->elems);
2583   free (flags);
2584 }
2585
2586 /* Perform bottom-up analysis on the parse tree, computing various functions.
2587    Note that at this point, we're pretending constructs like \< are real
2588    characters rather than constraints on what can follow them.
2589
2590    Nullable:  A node is nullable if it is at the root of a regexp that can
2591    match the empty string.
2592    *  EMPTY leaves are nullable.
2593    * No other leaf is nullable.
2594    * A QMARK or STAR node is nullable.
2595    * A PLUS node is nullable if its argument is nullable.
2596    * A CAT node is nullable if both its arguments are nullable.
2597    * An OR node is nullable if either argument is nullable.
2598
2599    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2600    that could correspond to the first character of a string matching the
2601    regexp rooted at the given node.
2602    * EMPTY leaves have empty firstpos.
2603    * The firstpos of a nonempty leaf is that leaf itself.
2604    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2605      argument.
2606    * The firstpos of a CAT node is the firstpos of the left argument, union
2607      the firstpos of the right if the left argument is nullable.
2608    * The firstpos of an OR node is the union of firstpos of each argument.
2609
2610    Lastpos:  The lastpos of a node is the set of positions that could
2611    correspond to the last character of a string matching the regexp at
2612    the given node.
2613    * EMPTY leaves have empty lastpos.
2614    * The lastpos of a nonempty leaf is that leaf itself.
2615    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2616      argument.
2617    * The lastpos of a CAT node is the lastpos of its right argument, union
2618      the lastpos of the left if the right argument is nullable.
2619    * The lastpos of an OR node is the union of the lastpos of each argument.
2620
2621    Follow:  The follow of a position is the set of positions that could
2622    correspond to the character following a character matching the node in
2623    a string matching the regexp.  At this point we consider special symbols
2624    that match the empty string in some context to be just normal characters.
2625    Later, if we find that a special symbol is in a follow set, we will
2626    replace it with the elements of its follow, labeled with an appropriate
2627    constraint.
2628    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2629      the follow of every node in the lastpos.
2630    * Every node in the firstpos of the second argument of a CAT node is in
2631      the follow of every node in the lastpos of the first argument.
2632
2633    Because of the postfix representation of the parse tree, the depth-first
2634    analysis is conveniently done by a linear scan with the aid of a stack.
2635    Sets are stored as arrays of the elements, obeying a stack-like allocation
2636    scheme; the number of elements in each set deeper in the stack can be
2637    used to determine the address of a particular set's array.  */
2638 static void
2639 dfaanalyze (struct dfa *d, bool searchflag)
2640 {
2641   /* Array allocated to hold position sets.  */
2642   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2643   /* Firstpos and lastpos elements.  */
2644   position *firstpos = posalloc;
2645   position *lastpos = firstpos + d->nleaves;
2646   position pos;
2647   position_set tmp;
2648
2649   /* Stack for element counts and nullable flags.  */
2650   struct
2651   {
2652     /* Whether the entry is nullable.  */
2653     bool nullable;
2654
2655     /* Counts of firstpos and lastpos sets.  */
2656     idx_t nfirstpos;
2657     idx_t nlastpos;
2658   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2659
2660   position_set merged;          /* Result of merging sets.  */
2661
2662   addtok (d, CAT);
2663
2664 #ifdef DEBUG
2665   fprintf (stderr, "dfaanalyze:\n");
2666   for (idx_t i = 0; i < d->tindex; i++)
2667     {
2668       fprintf (stderr, " %td:", i);
2669       prtok (d->tokens[i]);
2670     }
2671   putc ('\n', stderr);
2672 #endif
2673
2674   d->searchflag = searchflag;
2675   alloc_position_set (&merged, d->nleaves);
2676   d->follows = xcalloc (d->tindex, sizeof *d->follows);
2677
2678   for (idx_t i = 0; i < d->tindex; i++)
2679     {
2680       switch (d->tokens[i])
2681         {
2682         case EMPTY:
2683           /* The empty set is nullable.  */
2684           stk->nullable = true;
2685
2686           /* The firstpos and lastpos of the empty leaf are both empty.  */
2687           stk->nfirstpos = stk->nlastpos = 0;
2688           stk++;
2689           break;
2690
2691         case STAR:
2692         case PLUS:
2693           /* Every element in the firstpos of the argument is in the follow
2694              of every element in the lastpos.  */
2695           {
2696             tmp.elems = firstpos - stk[-1].nfirstpos;
2697             tmp.nelem = stk[-1].nfirstpos;
2698             position *p = lastpos - stk[-1].nlastpos;
2699             for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2700               {
2701                 merge (&tmp, &d->follows[p[j].index], &merged);
2702                 copy (&merged, &d->follows[p[j].index]);
2703               }
2704           }
2705           FALLTHROUGH;
2706         case QMARK:
2707           /* A QMARK or STAR node is automatically nullable.  */
2708           if (d->tokens[i] != PLUS)
2709             stk[-1].nullable = true;
2710           break;
2711
2712         case CAT:
2713           /* Every element in the firstpos of the second argument is in the
2714              follow of every element in the lastpos of the first argument.  */
2715           {
2716             tmp.nelem = stk[-1].nfirstpos;
2717             tmp.elems = firstpos - stk[-1].nfirstpos;
2718             position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2719             for (idx_t j = 0; j < stk[-2].nlastpos; j++)
2720               {
2721                 merge (&tmp, &d->follows[p[j].index], &merged);
2722                 copy (&merged, &d->follows[p[j].index]);
2723               }
2724           }
2725
2726           /* The firstpos of a CAT node is the firstpos of the first argument,
2727              union that of the second argument if the first is nullable.  */
2728           if (stk[-2].nullable)
2729             stk[-2].nfirstpos += stk[-1].nfirstpos;
2730           else
2731             firstpos -= stk[-1].nfirstpos;
2732
2733           /* The lastpos of a CAT node is the lastpos of the second argument,
2734              union that of the first argument if the second is nullable.  */
2735           if (stk[-1].nullable)
2736             stk[-2].nlastpos += stk[-1].nlastpos;
2737           else
2738             {
2739               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2740               for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2741                 p[j] = p[j + stk[-2].nlastpos];
2742               lastpos -= stk[-2].nlastpos;
2743               stk[-2].nlastpos = stk[-1].nlastpos;
2744             }
2745
2746           /* A CAT node is nullable if both arguments are nullable.  */
2747           stk[-2].nullable &= stk[-1].nullable;
2748           stk--;
2749           break;
2750
2751         case OR:
2752           /* The firstpos is the union of the firstpos of each argument.  */
2753           stk[-2].nfirstpos += stk[-1].nfirstpos;
2754
2755           /* The lastpos is the union of the lastpos of each argument.  */
2756           stk[-2].nlastpos += stk[-1].nlastpos;
2757
2758           /* An OR node is nullable if either argument is nullable.  */
2759           stk[-2].nullable |= stk[-1].nullable;
2760           stk--;
2761           break;
2762
2763         default:
2764           /* Anything else is a nonempty position.  (Note that special
2765              constructs like \< are treated as nonempty strings here;
2766              an "epsilon closure" effectively makes them nullable later.
2767              Backreferences have to get a real position so we can detect
2768              transitions on them later.  But they are nullable.  */
2769           stk->nullable = d->tokens[i] == BACKREF;
2770
2771           /* This position is in its own firstpos and lastpos.  */
2772           stk->nfirstpos = stk->nlastpos = 1;
2773           stk++;
2774
2775           firstpos->index = lastpos->index = i;
2776           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2777           firstpos++, lastpos++;
2778
2779           break;
2780         }
2781 #ifdef DEBUG
2782       /* ... balance the above nonsyntactic #ifdef goo...  */
2783       fprintf (stderr, "node %td:", i);
2784       prtok (d->tokens[i]);
2785       putc ('\n', stderr);
2786       fprintf (stderr,
2787                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2788       fprintf (stderr, " firstpos:");
2789       for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2790         {
2791           fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2792           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2793         }
2794       fprintf (stderr, "\n lastpos:");
2795       for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2796         {
2797           fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2798           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2799         }
2800       putc ('\n', stderr);
2801 #endif
2802     }
2803
2804   /* For each follow set that is the follow set of a real position, replace
2805      it with its epsilon closure.  */
2806   epsclosure (d);
2807
2808   dfaoptimize (d);
2809
2810 #ifdef DEBUG
2811   for (idx_t i = 0; i < d->tindex; i++)
2812     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2813         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2814         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2815       {
2816         fprintf (stderr, "follows(%td:", i);
2817         prtok (d->tokens[i]);
2818         fprintf (stderr, "):");
2819         for (idx_t j = 0; j < d->follows[i].nelem; j++)
2820           {
2821             fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2822             prtok (d->tokens[d->follows[i].elems[j].index]);
2823           }
2824         putc ('\n', stderr);
2825       }
2826 #endif
2827
2828   pos.index = 0;
2829   pos.constraint = NO_CONSTRAINT;
2830
2831   alloc_position_set (&tmp, 1);
2832
2833   append (pos, &tmp);
2834
2835   d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2836
2837   for (idx_t i = 0; i < d->tindex; i++)
2838     {
2839       d->separates[i] = 0;
2840
2841       if (prev_newline_dependent (d->constraints[i]))
2842         d->separates[i] |= CTX_NEWLINE;
2843       if (prev_letter_dependent (d->constraints[i]))
2844         d->separates[i] |= CTX_LETTER;
2845
2846       for (idx_t j = 0; j < d->follows[i].nelem; j++)
2847         {
2848           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2849             d->separates[i] |= CTX_NEWLINE;
2850           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2851             d->separates[i] |= CTX_LETTER;
2852         }
2853     }
2854
2855   /* Context wanted by some position.  */
2856   int separate_contexts = state_separate_contexts (d, &tmp);
2857
2858   /* Build the initial state.  */
2859   if (separate_contexts & CTX_NEWLINE)
2860     state_index (d, &tmp, CTX_NEWLINE);
2861   d->initstate_notbol = d->min_trcount
2862     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2863   if (separate_contexts & CTX_LETTER)
2864     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2865   d->min_trcount++;
2866   d->trcount = 0;
2867
2868   free (posalloc);
2869   free (stkalloc);
2870   free (merged.elems);
2871   free (tmp.elems);
2872 }
2873
2874 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2875 static void
2876 realloc_trans_if_necessary (struct dfa *d)
2877 {
2878   state_num oldalloc = d->tralloc;
2879   if (oldalloc < d->sindex)
2880     {
2881       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2882       idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2883       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2884                            -1, sizeof *realtrans);
2885       realtrans[0] = realtrans[1] = NULL;
2886       d->trans = realtrans + 2;
2887       idx_t newalloc = d->tralloc = newalloc1 - 2;
2888       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2889       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2890       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2891       if (d->localeinfo.multibyte)
2892         {
2893           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2894           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2895           if (oldalloc == 0)
2896             realtrans[0] = realtrans[1] = NULL;
2897           d->mb_trans = realtrans + 2;
2898         }
2899       for (; oldalloc < newalloc; oldalloc++)
2900         {
2901           d->trans[oldalloc] = NULL;
2902           d->fails[oldalloc] = NULL;
2903           if (d->localeinfo.multibyte)
2904             d->mb_trans[oldalloc] = NULL;
2905         }
2906     }
2907 }
2908
2909 /*
2910    Calculate the transition table for a new state derived from state s
2911    for a compiled dfa d after input character uc, and return the new
2912    state number.
2913
2914    Do not worry about all possible input characters; calculate just the group
2915    of positions that match uc.  Label it with the set of characters that
2916    every position in the group matches (taking into account, if necessary,
2917    preceding context information of s).  Then find the union
2918    of these positions' follows, i.e., the set of positions of the
2919    new state.  For each character in the group's label, set the transition
2920    on this character to be to a state corresponding to the set's positions,
2921    and its associated backward context information, if necessary.
2922
2923    When building a searching matcher, include the positions of state
2924    0 in every state.
2925
2926    The group is constructed by building an equivalence-class
2927    partition of the positions of s.
2928
2929    For each position, find the set of characters C that it matches.  Eliminate
2930    any characters from C that fail on grounds of backward context.
2931
2932    Check whether the group's label L has nonempty
2933    intersection with C.  If L - C is nonempty, create a new group labeled
2934    L - C and having the same positions as the current group, and set L to
2935    the intersection of L and C.  Insert the position in the group, set
2936    C = C - L, and resume scanning.
2937
2938    If after comparing with every group there are characters remaining in C,
2939    create a new group labeled with the characters of C and insert this
2940    position in that group.  */
2941
2942 static state_num
2943 build_state (state_num s, struct dfa *d, unsigned char uc)
2944 {
2945   position_set follows;         /* Union of the follows for each
2946                                    position of the current state.  */
2947   position_set group;           /* Positions that match the input char.  */
2948   position_set tmp;             /* Temporary space for merging sets.  */
2949   state_num state;              /* New state.  */
2950   state_num state_newline;      /* New state on a newline transition.  */
2951   state_num state_letter;       /* New state on a letter transition.  */
2952
2953 #ifdef DEBUG
2954   fprintf (stderr, "build state %td\n", s);
2955 #endif
2956
2957   /* A pointer to the new transition table, and the table itself.  */
2958   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2959   state_num *trans = *ptrans;
2960
2961   if (!trans)
2962     {
2963       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2964          transition tables that can exist at once, other than for
2965          initial states.  Often-used transition tables are quickly
2966          rebuilt, whereas rarely-used ones are cleared away.  */
2967       if (MAX_TRCOUNT <= d->trcount)
2968         {
2969           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2970             {
2971               free (d->trans[i]);
2972               free (d->fails[i]);
2973               d->trans[i] = d->fails[i] = NULL;
2974             }
2975           d->trcount = 0;
2976         }
2977
2978       d->trcount++;
2979       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2980
2981       /* Fill transition table with a default value which means that the
2982          transited state has not been calculated yet.  */
2983       for (int i = 0; i < NOTCHAR; i++)
2984         trans[i] = -2;
2985     }
2986
2987   /* Set up the success bits for this state.  */
2988   d->success[s] = 0;
2989   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2990     d->success[s] |= CTX_NEWLINE;
2991   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2992     d->success[s] |= CTX_LETTER;
2993   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2994     d->success[s] |= CTX_NONE;
2995
2996   alloc_position_set (&follows, d->nleaves);
2997
2998   /* Find the union of the follows of the positions of the group.
2999      This is a hideously inefficient loop.  Fix it someday.  */
3000   for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3001     for (idx_t k = 0;
3002          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3003       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3004               &follows);
3005
3006   /* Positions that match the input char.  */
3007   alloc_position_set (&group, d->nleaves);
3008
3009   /* The group's label.  */
3010   charclass label;
3011   fillset (&label);
3012
3013   for (idx_t i = 0; i < follows.nelem; i++)
3014     {
3015       charclass matches;            /* Set of matching characters.  */
3016       position pos = follows.elems[i];
3017       bool matched = false;
3018       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3019         {
3020           zeroset (&matches);
3021           setbit (d->tokens[pos.index], &matches);
3022           if (d->tokens[pos.index] == uc)
3023             matched = true;
3024         }
3025       else if (d->tokens[pos.index] >= CSET)
3026         {
3027           matches = d->charclasses[d->tokens[pos.index] - CSET];
3028           if (tstbit (uc, &matches))
3029             matched = true;
3030         }
3031       else if (d->tokens[pos.index] == ANYCHAR)
3032         {
3033           matches = d->charclasses[d->canychar];
3034           if (tstbit (uc, &matches))
3035             matched = true;
3036
3037           /* ANYCHAR must match with a single character, so we must put
3038              it to D->states[s].mbps which contains the positions which
3039              can match with a single character not a byte.  If all
3040              positions which has ANYCHAR does not depend on context of
3041              next character, we put the follows instead of it to
3042              D->states[s].mbps to optimize.  */
3043           if (succeeds_in_context (pos.constraint, d->states[s].context,
3044                                    CTX_NONE))
3045             {
3046               if (d->states[s].mbps.nelem == 0)
3047                 alloc_position_set (&d->states[s].mbps, 1);
3048               insert (pos, &d->states[s].mbps);
3049             }
3050         }
3051       else
3052         continue;
3053
3054       /* Some characters may need to be eliminated from matches because
3055          they fail in the current context.  */
3056       if (pos.constraint != NO_CONSTRAINT)
3057         {
3058           if (!succeeds_in_context (pos.constraint,
3059                                     d->states[s].context, CTX_NEWLINE))
3060             for (int j = 0; j < CHARCLASS_WORDS; j++)
3061               matches.w[j] &= ~d->syntax.newline.w[j];
3062           if (!succeeds_in_context (pos.constraint,
3063                                     d->states[s].context, CTX_LETTER))
3064             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3065               matches.w[j] &= ~d->syntax.letters.w[j];
3066           if (!succeeds_in_context (pos.constraint,
3067                                     d->states[s].context, CTX_NONE))
3068             for (int j = 0; j < CHARCLASS_WORDS; ++j)
3069               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3070
3071           /* If there are no characters left, there's no point in going on.  */
3072           if (emptyset (&matches))
3073             continue;
3074
3075           /* If we have reset the bit that made us declare "matched", reset
3076              that indicator, too.  This is required to avoid an infinite loop
3077              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3078           if (!tstbit (uc, &matches))
3079             matched = false;
3080         }
3081
3082 #ifdef DEBUG
3083       fprintf (stderr, " nextpos %td:", pos.index);
3084       prtok (d->tokens[pos.index]);
3085       fprintf (stderr, " of");
3086       for (unsigned j = 0; j < NOTCHAR; j++)
3087         if (tstbit (j, &matches))
3088           fprintf (stderr, " 0x%02x", j);
3089       fprintf (stderr, "\n");
3090 #endif
3091
3092       if (matched)
3093         {
3094           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3095             label.w[k] &= matches.w[k];
3096           append (pos, &group);
3097         }
3098       else
3099         {
3100           for (int k = 0; k < CHARCLASS_WORDS; ++k)
3101             label.w[k] &= ~matches.w[k];
3102         }
3103     }
3104
3105   alloc_position_set (&tmp, d->nleaves);
3106
3107   if (group.nelem > 0)
3108     {
3109       /* If we are building a searching matcher, throw in the positions
3110          of state 0 as well, if possible.  */
3111       if (d->searchflag)
3112         {
3113           /* If a token in follows.elems is not 1st byte of a multibyte
3114              character, or the states of follows must accept the bytes
3115              which are not 1st byte of the multibyte character.
3116              Then, if a state of follows encounters a byte, it must not be
3117              a 1st byte of a multibyte character nor a single byte character.
3118              In this case, do not add state[0].follows to next state, because
3119              state[0] must accept 1st-byte.
3120
3121              For example, suppose <sb a> is a certain single byte character,
3122              <mb A> is a certain multibyte character, and the codepoint of
3123              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3124              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3125              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3126              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3127              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3128              not add state[0].  */
3129
3130           bool mergeit = !d->localeinfo.multibyte;
3131           if (!mergeit)
3132             {
3133               mergeit = true;
3134               for (idx_t j = 0; mergeit && j < group.nelem; j++)
3135                 mergeit &= d->multibyte_prop[group.elems[j].index];
3136             }
3137           if (mergeit)
3138             {
3139               merge (&d->states[0].elems, &group, &tmp);
3140               copy (&tmp, &group);
3141             }
3142         }
3143
3144       /* Find out if the new state will want any context information,
3145          by calculating possible contexts that the group can match,
3146          and separate contexts that the new state wants to know.  */
3147       int possible_contexts = charclass_context (d, &label);
3148       int separate_contexts = state_separate_contexts (d, &group);
3149
3150       /* Find the state(s) corresponding to the union of the follows.  */
3151       if (possible_contexts & ~separate_contexts)
3152         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3153       else
3154         state = -1;
3155       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3156         state_newline = state_index (d, &group, CTX_NEWLINE);
3157       else
3158         state_newline = state;
3159       if (separate_contexts & possible_contexts & CTX_LETTER)
3160         state_letter = state_index (d, &group, CTX_LETTER);
3161       else
3162         state_letter = state;
3163
3164       /* Reallocate now, to reallocate any newline transition properly.  */
3165       realloc_trans_if_necessary (d);
3166     }
3167
3168   /* If we are a searching matcher, the default transition is to a state
3169      containing the positions of state 0, otherwise the default transition
3170      is to fail miserably.  */
3171   else if (d->searchflag)
3172     {
3173       state_newline = 0;
3174       state_letter = d->min_trcount - 1;
3175       state = d->initstate_notbol;
3176     }
3177   else
3178     {
3179       state_newline = -1;
3180       state_letter = -1;
3181       state = -1;
3182     }
3183
3184   /* Set the transitions for each character in the label.  */
3185   for (int i = 0; i < NOTCHAR; i++)
3186     if (tstbit (i, &label))
3187       switch (d->syntax.sbit[i])
3188         {
3189         case CTX_NEWLINE:
3190           trans[i] = state_newline;
3191           break;
3192         case CTX_LETTER:
3193           trans[i] = state_letter;
3194           break;
3195         default:
3196           trans[i] = state;
3197           break;
3198         }
3199
3200 #ifdef DEBUG
3201   fprintf (stderr, "trans table %td", s);
3202   for (int i = 0; i < NOTCHAR; ++i)
3203     {
3204       if (!(i & 0xf))
3205         fprintf (stderr, "\n");
3206       fprintf (stderr, " %2td", trans[i]);
3207     }
3208   fprintf (stderr, "\n");
3209 #endif
3210
3211   free (group.elems);
3212   free (follows.elems);
3213   free (tmp.elems);
3214
3215   /* Keep the newline transition in a special place so we can use it as
3216      a sentinel.  */
3217   if (tstbit (d->syntax.eolbyte, &label))
3218     {
3219       d->newlines[s] = trans[d->syntax.eolbyte];
3220       trans[d->syntax.eolbyte] = -1;
3221     }
3222
3223   return trans[uc];
3224 }
3225
3226 /* Multibyte character handling sub-routines for dfaexec.  */
3227
3228 /* Consume a single byte and transit state from 's' to '*next_state'.
3229    This function is almost same as the state transition routin in dfaexec.
3230    But state transition is done just once, otherwise matching succeed or
3231    reach the end of the buffer.  */
3232 static state_num
3233 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3234 {
3235   state_num *t;
3236
3237   if (d->trans[s])
3238     t = d->trans[s];
3239   else if (d->fails[s])
3240     t = d->fails[s];
3241   else
3242     {
3243       build_state (s, d, **pp);
3244       if (d->trans[s])
3245         t = d->trans[s];
3246       else
3247         {
3248           t = d->fails[s];
3249           assert (t);
3250         }
3251     }
3252
3253   if (t[**pp] == -2)
3254     build_state (s, d, **pp);
3255
3256   return t[*(*pp)++];
3257 }
3258
3259 /* Transit state from s, then return new state and update the pointer of
3260    the buffer.  This function is for a period operator which can match a
3261    multi-byte character.  */
3262 static state_num
3263 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3264                unsigned char const *end)
3265 {
3266   wint_t wc;
3267
3268   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3269
3270   /* This state has some operators which can match a multibyte character.  */
3271   d->mb_follows.nelem = 0;
3272
3273   /* Calculate the state which can be reached from the state 's' by
3274      consuming 'mbclen' single bytes from the buffer.  */
3275   state_num s1 = s;
3276   int mbci;
3277   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3278     s = transit_state_singlebyte (d, s, pp);
3279   *pp += mbclen - mbci;
3280
3281   if (wc == WEOF)
3282     {
3283       /* It is an invalid character, so ANYCHAR is not accepted.  */
3284       return s;
3285     }
3286
3287   /* If all positions which have ANYCHAR do not depend on the context
3288      of the next character, calculate the next state with
3289      pre-calculated follows and cache the result.  */
3290   if (d->states[s1].mb_trindex < 0)
3291     {
3292       if (MAX_TRCOUNT <= d->mb_trcount)
3293         {
3294           state_num s3;
3295           for (s3 = -1; s3 < d->tralloc; s3++)
3296             {
3297               free (d->mb_trans[s3]);
3298               d->mb_trans[s3] = NULL;
3299             }
3300
3301           for (state_num i = 0; i < d->sindex; i++)
3302             d->states[i].mb_trindex = -1;
3303           d->mb_trcount = 0;
3304         }
3305       d->states[s1].mb_trindex = d->mb_trcount++;
3306     }
3307
3308   if (! d->mb_trans[s])
3309     {
3310       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3311       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3312       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3313       for (int i = 0; i < MAX_TRCOUNT; i++)
3314         d->mb_trans[s][i] = -1;
3315     }
3316   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3317     return d->mb_trans[s][d->states[s1].mb_trindex];
3318
3319   if (s == -1)
3320     copy (&d->states[s1].mbps, &d->mb_follows);
3321   else
3322     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3323
3324   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3325   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3326   realloc_trans_if_necessary (d);
3327
3328   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3329
3330   return s2;
3331 }
3332
3333 /* The initial state may encounter a byte which is not a single byte character
3334    nor the first byte of a multibyte character.  But it is incorrect for the
3335    initial state to accept such a byte.  For example, in Shift JIS the regular
3336    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3337    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3338    that are not a single byte character nor the first byte of a multibyte
3339    character.
3340
3341    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3342    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3343    the result is greater than P, set *WCP to the final wide character
3344    processed, or to WEOF if no wide character is processed.  Otherwise,
3345    if WCP is non-NULL, *WCP may or may not be updated.
3346
3347    Both P and MBP must be no larger than END.  */
3348 static unsigned char const *
3349 skip_remains_mb (struct dfa *d, unsigned char const *p,
3350                  unsigned char const *mbp, char const *end)
3351 {
3352   if (d->syntax.never_trail[*p])
3353     return p;
3354   while (mbp < p)
3355     {
3356       wint_t wc;
3357       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3358                            end - (char const *) mbp, d);
3359     }
3360   return mbp;
3361 }
3362
3363 /* Search through a buffer looking for a match to the struct dfa *D.
3364    Find the first occurrence of a string matching the regexp in the
3365    buffer, and the shortest possible version thereof.  Return a pointer to
3366    the first character after the match, or NULL if none is found.  BEGIN
3367    points to the beginning of the buffer, and END points to the first byte
3368    after its end.  Note however that we store a sentinel byte (usually
3369    newline) in *END, so the actual buffer must be one byte longer.
3370    When ALLOW_NL, newlines may appear in the matching string.
3371    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3372    If MULTIBYTE, the input consists of multibyte characters and/or
3373    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3374    Here is the list of features that make this DFA matcher punt:
3375     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3376     - [^...] in non-simple locale
3377     - [[=foo=]] or [[.foo.]]
3378     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3379     - back-reference: (.)\1
3380     - word-delimiter in multibyte locale: \<, \>, \b, \B
3381    See struct localeinfo.simple for the definition of "simple locale".  */
3382
3383 static inline char *
3384 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3385               ptrdiff_t *count, bool multibyte)
3386 {
3387   if (MAX_TRCOUNT <= d->sindex)
3388     {
3389       for (state_num s = d->min_trcount; s < d->sindex; s++)
3390         {
3391           free (d->states[s].elems.elems);
3392           free (d->states[s].mbps.elems);
3393         }
3394       d->sindex = d->min_trcount;
3395
3396       if (d->trans)
3397         {
3398           for (state_num s = 0; s < d->tralloc; s++)
3399             {
3400               free (d->trans[s]);
3401               free (d->fails[s]);
3402               d->trans[s] = d->fails[s] = NULL;
3403             }
3404           d->trcount = 0;
3405         }
3406
3407       if (d->localeinfo.multibyte && d->mb_trans)
3408         {
3409           for (state_num s = -1; s < d->tralloc; s++)
3410             {
3411               free (d->mb_trans[s]);
3412               d->mb_trans[s] = NULL;
3413             }
3414           for (state_num s = 0; s < d->min_trcount; s++)
3415             d->states[s].mb_trindex = -1;
3416           d->mb_trcount = 0;
3417         }
3418     }
3419
3420   if (!d->tralloc)
3421     realloc_trans_if_necessary (d);
3422
3423   /* Current state.  */
3424   state_num s = 0, s1 = 0;
3425
3426   /* Current input character.  */
3427   unsigned char const *p = (unsigned char const *) begin;
3428   unsigned char const *mbp = p;
3429
3430   /* Copy of d->trans so it can be optimized into a register.  */
3431   state_num **trans = d->trans;
3432   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3433   unsigned char saved_end = *(unsigned char *) end;
3434   *end = eol;
3435
3436   if (multibyte)
3437     {
3438       memset (&d->mbs, 0, sizeof d->mbs);
3439       if (d->mb_follows.alloc == 0)
3440         alloc_position_set (&d->mb_follows, d->nleaves);
3441     }
3442
3443   idx_t nlcount = 0;
3444   for (;;)
3445     {
3446       state_num *t;
3447       while ((t = trans[s]) != NULL)
3448         {
3449           if (s < d->min_trcount)
3450             {
3451               if (!multibyte || d->states[s].mbps.nelem == 0)
3452                 {
3453                   while (t[*p] == s)
3454                     p++;
3455                 }
3456               if (multibyte)
3457                 p = mbp = skip_remains_mb (d, p, mbp, end);
3458             }
3459
3460           if (multibyte)
3461             {
3462               s1 = s;
3463
3464               if (d->states[s].mbps.nelem == 0
3465                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3466                 {
3467                   /* If an input character does not match ANYCHAR, do it
3468                      like a single-byte character.  */
3469                   s = t[*p++];
3470                 }
3471               else
3472                 {
3473                   s = transit_state (d, s, &p, (unsigned char *) end);
3474                   mbp = p;
3475                   trans = d->trans;
3476                 }
3477             }
3478           else
3479             {
3480               s1 = t[*p++];
3481               t = trans[s1];
3482               if (! t)
3483                 {
3484                   state_num tmp = s;
3485                   s = s1;
3486                   s1 = tmp;     /* swap */
3487                   break;
3488                 }
3489               if (s < d->min_trcount)
3490                 {
3491                   while (t[*p] == s1)
3492                     p++;
3493                 }
3494               s = t[*p++];
3495             }
3496         }
3497
3498       if (s < 0)
3499         {
3500           if (s == -2)
3501             {
3502               s = build_state (s1, d, p[-1]);
3503               trans = d->trans;
3504             }
3505           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3506             {
3507               /* The previous character was a newline.  Count it, and skip
3508                  checking of multibyte character boundary until here.  */
3509               nlcount++;
3510               mbp = p;
3511
3512               s = (allow_nl ? d->newlines[s1]
3513                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3514                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3515                    : d->initstate_notbol);
3516             }
3517           else
3518             {
3519               p = NULL;
3520               goto done;
3521             }
3522         }
3523       else if (d->fails[s])
3524         {
3525           if ((d->success[s] & d->syntax.sbit[*p])
3526               || ((char *) p == end
3527                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3528                                          d)))
3529             goto done;
3530
3531           if (multibyte && s < d->min_trcount)
3532             p = mbp = skip_remains_mb (d, p, mbp, end);
3533
3534           s1 = s;
3535           if (!multibyte || d->states[s].mbps.nelem == 0
3536               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3537             {
3538               /* If a input character does not match ANYCHAR, do it
3539                  like a single-byte character.  */
3540               s = d->fails[s][*p++];
3541             }
3542           else
3543             {
3544               s = transit_state (d, s, &p, (unsigned char *) end);
3545               mbp = p;
3546               trans = d->trans;
3547             }
3548         }
3549       else
3550         {
3551           build_state (s, d, p[0]);
3552           trans = d->trans;
3553         }
3554     }
3555
3556  done:
3557   if (count)
3558     *count += nlcount;
3559   *end = saved_end;
3560   return (char *) p;
3561 }
3562
3563 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3564    This is for performance, as dfaexec_main is an inline function.  */
3565
3566 static char *
3567 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3568             bool allow_nl, ptrdiff_t *count, bool *backref)
3569 {
3570   return dfaexec_main (d, begin, end, allow_nl, count, true);
3571 }
3572
3573 static char *
3574 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3575             bool allow_nl, ptrdiff_t *count, bool *backref)
3576 {
3577   return dfaexec_main (d, begin, end, allow_nl, count, false);
3578 }
3579
3580 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3581    any regexp that uses a construct not supported by this code.  */
3582 static char *
3583 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3584               bool allow_nl, ptrdiff_t *count, bool *backref)
3585 {
3586   *backref = true;
3587   return (char *) begin;
3588 }
3589
3590 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3591    but faster and set *BACKREF if the DFA code does not support this
3592    regexp usage.  */
3593
3594 char *
3595 dfaexec (struct dfa *d, char const *begin, char *end,
3596          bool allow_nl, ptrdiff_t *count, bool *backref)
3597 {
3598   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3599 }
3600
3601 struct dfa *
3602 dfasuperset (struct dfa const *d)
3603 {
3604   return d->superset;
3605 }
3606
3607 bool
3608 dfaisfast (struct dfa const *d)
3609 {
3610   return d->fast;
3611 }
3612
3613 static void
3614 free_mbdata (struct dfa *d)
3615 {
3616   free (d->multibyte_prop);
3617   free (d->lex.brack.chars);
3618   free (d->mb_follows.elems);
3619
3620   if (d->mb_trans)
3621     {
3622       state_num s;
3623       for (s = -1; s < d->tralloc; s++)
3624         free (d->mb_trans[s]);
3625       free (d->mb_trans - 2);
3626     }
3627 }
3628
3629 /* Return true if every construct in D is supported by this DFA matcher.  */
3630 static bool _GL_ATTRIBUTE_PURE
3631 dfa_supported (struct dfa const *d)
3632 {
3633   for (idx_t i = 0; i < d->tindex; i++)
3634     {
3635       switch (d->tokens[i])
3636         {
3637         case BEGWORD:
3638         case ENDWORD:
3639         case LIMWORD:
3640         case NOTLIMWORD:
3641           if (!d->localeinfo.multibyte)
3642             continue;
3643           FALLTHROUGH;
3644         case BACKREF:
3645         case MBCSET:
3646           return false;
3647         }
3648     }
3649   return true;
3650 }
3651
3652 /* Disable use of the superset DFA if it is not likely to help
3653    performance.  */
3654 static void
3655 maybe_disable_superset_dfa (struct dfa *d)
3656 {
3657   if (!d->localeinfo.using_utf8)
3658     return;
3659
3660   bool have_backref = false;
3661   for (idx_t i = 0; i < d->tindex; i++)
3662     {
3663       switch (d->tokens[i])
3664         {
3665         case ANYCHAR:
3666           /* Lowered.  */
3667           abort ();
3668         case BACKREF:
3669           have_backref = true;
3670           break;
3671         case MBCSET:
3672           /* Requires multi-byte algorithm.  */
3673           return;
3674         default:
3675           break;
3676         }
3677     }
3678
3679   if (!have_backref && d->superset)
3680     {
3681       /* The superset DFA is not likely to be much faster, so remove it.  */
3682       dfafree (d->superset);
3683       free (d->superset);
3684       d->superset = NULL;
3685     }
3686
3687   free_mbdata (d);
3688   d->localeinfo.multibyte = false;
3689   d->dfaexec = dfaexec_sb;
3690   d->fast = true;
3691 }
3692
3693 static void
3694 dfassbuild (struct dfa *d)
3695 {
3696   struct dfa *sup = dfaalloc ();
3697
3698   *sup = *d;
3699   sup->localeinfo.multibyte = false;
3700   sup->dfaexec = dfaexec_sb;
3701   sup->multibyte_prop = NULL;
3702   sup->superset = NULL;
3703   sup->states = NULL;
3704   sup->sindex = 0;
3705   sup->constraints = NULL;
3706   sup->separates = NULL;
3707   sup->follows = NULL;
3708   sup->tralloc = 0;
3709   sup->trans = NULL;
3710   sup->fails = NULL;
3711   sup->success = NULL;
3712   sup->newlines = NULL;
3713
3714   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3715   if (d->cindex)
3716     {
3717       memcpy (sup->charclasses, d->charclasses,
3718               d->cindex * sizeof *sup->charclasses);
3719     }
3720
3721   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3722   sup->talloc = d->tindex * 2;
3723
3724   bool have_achar = false;
3725   bool have_nchar = false;
3726   idx_t j;
3727   for (idx_t i = j = 0; i < d->tindex; i++)
3728     {
3729       switch (d->tokens[i])
3730         {
3731         case ANYCHAR:
3732         case MBCSET:
3733         case BACKREF:
3734           {
3735             charclass ccl;
3736             fillset (&ccl);
3737             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3738             sup->tokens[j++] = STAR;
3739             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3740                 || d->tokens[i + 1] == PLUS)
3741               i++;
3742             have_achar = true;
3743           }
3744           break;
3745         case BEGWORD:
3746         case ENDWORD:
3747         case LIMWORD:
3748         case NOTLIMWORD:
3749           if (d->localeinfo.multibyte)
3750             {
3751               /* These constraints aren't supported in a multibyte locale.
3752                  Ignore them in the superset DFA.  */
3753               sup->tokens[j++] = EMPTY;
3754               break;
3755             }
3756           FALLTHROUGH;
3757         default:
3758           sup->tokens[j++] = d->tokens[i];
3759           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3760               || d->tokens[i] >= CSET)
3761             have_nchar = true;
3762           break;
3763         }
3764     }
3765   sup->tindex = j;
3766
3767   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3768     d->superset = sup;
3769   else
3770     {
3771       dfafree (sup);
3772       free (sup);
3773     }
3774 }
3775
3776 /* Parse a string S of length LEN into D (but skip this step if S is null).
3777    Then analyze D and build a matcher for it.
3778    SEARCHFLAG says whether to build a searching or an exact matcher.  */
3779 void
3780 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3781 {
3782   if (s != NULL)
3783     dfaparse (s, len, d);
3784
3785   dfassbuild (d);
3786
3787   if (dfa_supported (d))
3788     {
3789       maybe_disable_superset_dfa (d);
3790       dfaanalyze (d, searchflag);
3791     }
3792   else
3793     {
3794       d->dfaexec = dfaexec_noop;
3795     }
3796
3797   if (d->superset)
3798     {
3799       d->fast = true;
3800       dfaanalyze (d->superset, searchflag);
3801     }
3802 }
3803
3804 /* Free the storage held by the components of a dfa.  */
3805 void
3806 dfafree (struct dfa *d)
3807 {
3808   free (d->charclasses);
3809   free (d->tokens);
3810
3811   if (d->localeinfo.multibyte)
3812     free_mbdata (d);
3813
3814   free (d->constraints);
3815   free (d->separates);
3816
3817   for (idx_t i = 0; i < d->sindex; i++)
3818     {
3819       free (d->states[i].elems.elems);
3820       free (d->states[i].mbps.elems);
3821     }
3822   free (d->states);
3823
3824   if (d->follows)
3825     {
3826       for (idx_t i = 0; i < d->tindex; i++)
3827         free (d->follows[i].elems);
3828       free (d->follows);
3829     }
3830
3831   if (d->trans)
3832     {
3833       for (idx_t i = 0; i < d->tralloc; i++)
3834         {
3835           free (d->trans[i]);
3836           free (d->fails[i]);
3837         }
3838
3839       free (d->trans - 2);
3840       free (d->fails);
3841       free (d->newlines);
3842       free (d->success);
3843     }
3844
3845   if (d->superset)
3846     {
3847       dfafree (d->superset);
3848       free (d->superset);
3849     }
3850 }
3851
3852 /* Having found the postfix representation of the regular expression,
3853    try to find a long sequence of characters that must appear in any line
3854    containing the r.e.
3855    Finding a "longest" sequence is beyond the scope here;
3856    we take an easy way out and hope for the best.
3857    (Take "(ab|a)b"--please.)
3858
3859    We do a bottom-up calculation of sequences of characters that must appear
3860    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3861    representation:
3862         sequences that must appear at the left of the match ("left")
3863         sequences that must appear at the right of the match ("right")
3864         lists of sequences that must appear somewhere in the match ("in")
3865         sequences that must constitute the match ("is")
3866
3867    When we get to the root of the tree, we use one of the longest of its
3868    calculated "in" sequences as our answer.
3869
3870    The sequences calculated for the various types of node (in pseudo ANSI c)
3871    are shown below.  "p" is the operand of unary operators (and the left-hand
3872    operand of binary operators); "q" is the right-hand operand of binary
3873    operators.
3874
3875    "ZERO" means "a zero-length sequence" below.
3876
3877         Type    left            right           is              in
3878         ----    ----            -----           --              --
3879         char c  # c             # c             # c             # c
3880
3881         ANYCHAR ZERO            ZERO            ZERO            ZERO
3882
3883         MBCSET  ZERO            ZERO            ZERO            ZERO
3884
3885         CSET    ZERO            ZERO            ZERO            ZERO
3886
3887         STAR    ZERO            ZERO            ZERO            ZERO
3888
3889         QMARK   ZERO            ZERO            ZERO            ZERO
3890
3891         PLUS    p->left         p->right        ZERO            p->in
3892
3893         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3894                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3895                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3896                                                 ZERO
3897
3898         OR      longest common  longest common  (do p->is and substrings common
3899                 leading         trailing        to q->is have same p->in and
3900                 (sub)sequence   (sub)sequence   q->in length and content) ?
3901                 of p->left      of p->right
3902                 and q->left     and q->right    p->is : NULL
3903
3904    If there's anything else we recognize in the tree, all four sequences get set
3905    to zero-length sequences.  If there's something we don't recognize in the
3906    tree, we just return a zero-length sequence.
3907
3908    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3909    'aaa')?
3910
3911    And ... is it here or someplace that we might ponder "optimizations" such as
3912         egrep 'psi|epsilon'     ->      egrep 'psi'
3913         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3914                                         (Yes, we now find "epsi" as a "string
3915                                         that must occur", but we might also
3916                                         simplify the *entire* r.e. being sought)
3917         grep '[c]'              ->      grep 'c'
3918         grep '(ab|a)b'          ->      grep 'ab'
3919         grep 'ab*'              ->      grep 'a'
3920         grep 'a*b'              ->      grep 'b'
3921
3922    There are several issues:
3923
3924    Is optimization easy (enough)?
3925
3926    Does optimization actually accomplish anything,
3927    or is the automaton you get from "psi|epsilon" (for example)
3928    the same as the one you get from "psi" (for example)?
3929
3930    Are optimizable r.e.'s likely to be used in real-life situations
3931    (something like 'ab*' is probably unlikely; something like is
3932    'psi|epsilon' is likelier)?  */
3933
3934 static char *
3935 icatalloc (char *old, char const *new)
3936 {
3937   idx_t newsize = strlen (new);
3938   if (newsize == 0)
3939     return old;
3940   idx_t oldsize = strlen (old);
3941   char *result = xrealloc (old, oldsize + newsize + 1);
3942   memcpy (result + oldsize, new, newsize + 1);
3943   return result;
3944 }
3945
3946 static void
3947 freelist (char **cpp)
3948 {
3949   while (*cpp)
3950     free (*cpp++);
3951 }
3952
3953 static char **
3954 enlist (char **cpp, char *new, idx_t len)
3955 {
3956   new = memcpy (xmalloc (len + 1), new, len);
3957   new[len] = '\0';
3958   /* Is there already something in the list that's new (or longer)?  */
3959   idx_t i;
3960   for (i = 0; cpp[i] != NULL; i++)
3961     if (strstr (cpp[i], new) != NULL)
3962       {
3963         free (new);
3964         return cpp;
3965       }
3966   /* Eliminate any obsoleted strings.  */
3967   for (idx_t j = 0; cpp[j] != NULL; )
3968     if (strstr (new, cpp[j]) == NULL)
3969       ++j;
3970     else
3971       {
3972         free (cpp[j]);
3973         if (--i == j)
3974           break;
3975         cpp[j] = cpp[i];
3976         cpp[i] = NULL;
3977       }
3978   /* Add the new string.  */
3979   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3980   cpp[i] = new;
3981   cpp[i + 1] = NULL;
3982   return cpp;
3983 }
3984
3985 /* Given pointers to two strings, return a pointer to an allocated
3986    list of their distinct common substrings.  */
3987 static char **
3988 comsubs (char *left, char const *right)
3989 {
3990   char **cpp = xzalloc (sizeof *cpp);
3991
3992   for (char *lcp = left; *lcp != '\0'; lcp++)
3993     {
3994       idx_t len = 0;
3995       char *rcp = strchr (right, *lcp);
3996       while (rcp != NULL)
3997         {
3998           idx_t i;
3999           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4000             continue;
4001           if (i > len)
4002             len = i;
4003           rcp = strchr (rcp + 1, *lcp);
4004         }
4005       if (len != 0)
4006         cpp = enlist (cpp, lcp, len);
4007     }
4008   return cpp;
4009 }
4010
4011 static char **
4012 addlists (char **old, char **new)
4013 {
4014   for (; *new; new++)
4015     old = enlist (old, *new, strlen (*new));
4016   return old;
4017 }
4018
4019 /* Given two lists of substrings, return a new list giving substrings
4020    common to both.  */
4021 static char **
4022 inboth (char **left, char **right)
4023 {
4024   char **both = xzalloc (sizeof *both);
4025
4026   for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4027     {
4028       for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4029         {
4030           char **temp = comsubs (left[lnum], right[rnum]);
4031           both = addlists (both, temp);
4032           freelist (temp);
4033           free (temp);
4034         }
4035     }
4036   return both;
4037 }
4038
4039 typedef struct must must;
4040
4041 struct must
4042 {
4043   char **in;
4044   char *left;
4045   char *right;
4046   char *is;
4047   bool begline;
4048   bool endline;
4049   must *prev;
4050 };
4051
4052 static must *
4053 allocmust (must *mp, idx_t size)
4054 {
4055   must *new_mp = xmalloc (sizeof *new_mp);
4056   new_mp->in = xzalloc (sizeof *new_mp->in);
4057   new_mp->left = xzalloc (size);
4058   new_mp->right = xzalloc (size);
4059   new_mp->is = xzalloc (size);
4060   new_mp->begline = false;
4061   new_mp->endline = false;
4062   new_mp->prev = mp;
4063   return new_mp;
4064 }
4065
4066 static void
4067 resetmust (must *mp)
4068 {
4069   freelist (mp->in);
4070   mp->in[0] = NULL;
4071   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4072   mp->begline = false;
4073   mp->endline = false;
4074 }
4075
4076 static void
4077 freemust (must *mp)
4078 {
4079   freelist (mp->in);
4080   free (mp->in);
4081   free (mp->left);
4082   free (mp->right);
4083   free (mp->is);
4084   free (mp);
4085 }
4086
4087 struct dfamust *
4088 dfamust (struct dfa const *d)
4089 {
4090   must *mp = NULL;
4091   char const *result = "";
4092   bool exact = false;
4093   bool begline = false;
4094   bool endline = false;
4095   bool need_begline = false;
4096   bool need_endline = false;
4097   bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4098
4099   for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4100     {
4101       token t = d->tokens[ri];
4102       switch (t)
4103         {
4104         case BEGLINE:
4105           mp = allocmust (mp, 2);
4106           mp->begline = true;
4107           need_begline = true;
4108           break;
4109         case ENDLINE:
4110           mp = allocmust (mp, 2);
4111           mp->endline = true;
4112           need_endline = true;
4113           break;
4114         case LPAREN:
4115         case RPAREN:
4116           assert (!"neither LPAREN nor RPAREN may appear here");
4117
4118         case EMPTY:
4119         case BEGWORD:
4120         case ENDWORD:
4121         case LIMWORD:
4122         case NOTLIMWORD:
4123         case BACKREF:
4124         case ANYCHAR:
4125         case MBCSET:
4126           mp = allocmust (mp, 2);
4127           break;
4128
4129         case STAR:
4130         case QMARK:
4131           resetmust (mp);
4132           break;
4133
4134         case OR:
4135           {
4136             char **new;
4137             must *rmp = mp;
4138             must *lmp = mp = mp->prev;
4139             idx_t j, ln, rn, n;
4140
4141             /* Guaranteed to be.  Unlikely, but ...  */
4142             if (streq (lmp->is, rmp->is))
4143               {
4144                 lmp->begline &= rmp->begline;
4145                 lmp->endline &= rmp->endline;
4146               }
4147             else
4148               {
4149                 lmp->is[0] = '\0';
4150                 lmp->begline = false;
4151                 lmp->endline = false;
4152               }
4153             /* Left side--easy */
4154             idx_t i = 0;
4155             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4156               ++i;
4157             lmp->left[i] = '\0';
4158             /* Right side */
4159             ln = strlen (lmp->right);
4160             rn = strlen (rmp->right);
4161             n = ln;
4162             if (n > rn)
4163               n = rn;
4164             for (i = 0; i < n; ++i)
4165               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4166                 break;
4167             for (j = 0; j < i; ++j)
4168               lmp->right[j] = lmp->right[(ln - i) + j];
4169             lmp->right[j] = '\0';
4170             new = inboth (lmp->in, rmp->in);
4171             freelist (lmp->in);
4172             free (lmp->in);
4173             lmp->in = new;
4174             freemust (rmp);
4175           }
4176           break;
4177
4178         case PLUS:
4179           mp->is[0] = '\0';
4180           break;
4181
4182         case END:
4183           assert (!mp->prev);
4184           for (idx_t i = 0; mp->in[i] != NULL; i++)
4185             if (strlen (mp->in[i]) > strlen (result))
4186               result = mp->in[i];
4187           if (streq (result, mp->is))
4188             {
4189               if ((!need_begline || mp->begline) && (!need_endline
4190                                                      || mp->endline))
4191                 exact = true;
4192               begline = mp->begline;
4193               endline = mp->endline;
4194             }
4195           goto done;
4196
4197         case CAT:
4198           {
4199             must *rmp = mp;
4200             must *lmp = mp = mp->prev;
4201
4202             /* In.  Everything in left, plus everything in
4203                right, plus concatenation of
4204                left's right and right's left.  */
4205             lmp->in = addlists (lmp->in, rmp->in);
4206             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4207               {
4208                 idx_t lrlen = strlen (lmp->right);
4209                 idx_t rllen = strlen (rmp->left);
4210                 char *tp = xmalloc (lrlen + rllen);
4211                 memcpy (tp, lmp->right, lrlen);
4212                 memcpy (tp + lrlen, rmp->left, rllen);
4213                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4214                 free (tp);
4215               }
4216             /* Left-hand */
4217             if (lmp->is[0] != '\0')
4218               lmp->left = icatalloc (lmp->left, rmp->left);
4219             /* Right-hand */
4220             if (rmp->is[0] == '\0')
4221               lmp->right[0] = '\0';
4222             lmp->right = icatalloc (lmp->right, rmp->right);
4223             /* Guaranteed to be */
4224             if ((lmp->is[0] != '\0' || lmp->begline)
4225                 && (rmp->is[0] != '\0' || rmp->endline))
4226               {
4227                 lmp->is = icatalloc (lmp->is, rmp->is);
4228                 lmp->endline = rmp->endline;
4229               }
4230             else
4231               {
4232                 lmp->is[0] = '\0';
4233                 lmp->begline = false;
4234                 lmp->endline = false;
4235               }
4236             freemust (rmp);
4237           }
4238           break;
4239
4240         case '\0':
4241           /* Not on *my* shift.  */
4242           goto done;
4243
4244         default:
4245           if (CSET <= t)
4246             {
4247               /* If T is a singleton, or if case-folding in a unibyte
4248                  locale and T's members all case-fold to the same char,
4249                  convert T to one of its members.  Otherwise, do
4250                  nothing further with T.  */
4251               charclass *ccl = &d->charclasses[t - CSET];
4252               int j;
4253               for (j = 0; j < NOTCHAR; j++)
4254                 if (tstbit (j, ccl))
4255                   break;
4256               if (! (j < NOTCHAR))
4257                 {
4258                   mp = allocmust (mp, 2);
4259                   break;
4260                 }
4261               t = j;
4262               while (++j < NOTCHAR)
4263                 if (tstbit (j, ccl)
4264                     && ! (case_fold_unibyte
4265                           && toupper (j) == toupper (t)))
4266                   break;
4267               if (j < NOTCHAR)
4268                 {
4269                   mp = allocmust (mp, 2);
4270                   break;
4271                 }
4272             }
4273
4274           idx_t rj = ri + 2;
4275           if (d->tokens[ri + 1] == CAT)
4276             {
4277               for (; rj < d->tindex - 1; rj += 2)
4278                 {
4279                   if ((rj != ri && (d->tokens[rj] <= 0
4280                                     || NOTCHAR <= d->tokens[rj]))
4281                       || d->tokens[rj + 1] != CAT)
4282                     break;
4283                 }
4284             }
4285           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4286           mp->is[0] = mp->left[0] = mp->right[0]
4287             = case_fold_unibyte ? toupper (t) : t;
4288
4289           idx_t i;
4290           for (i = 1; ri + 2 < rj; i++)
4291             {
4292               ri += 2;
4293               t = d->tokens[ri];
4294               mp->is[i] = mp->left[i] = mp->right[i]
4295                 = case_fold_unibyte ? toupper (t) : t;
4296             }
4297           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4298           mp->in = enlist (mp->in, mp->is, i);
4299           break;
4300         }
4301     }
4302  done:;
4303
4304   struct dfamust *dm = NULL;
4305   if (*result)
4306     {
4307       dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4308       dm->exact = exact;
4309       dm->begline = begline;
4310       dm->endline = endline;
4311       strcpy (dm->must, result);
4312     }
4313
4314   while (mp)
4315     {
4316       must *prev = mp->prev;
4317       freemust (mp);
4318       mp = prev;
4319     }
4320
4321   return dm;
4322 }
4323
4324 void
4325 dfamustfree (struct dfamust *dm)
4326 {
4327   free (dm);
4328 }
4329
4330 struct dfa *
4331 dfaalloc (void)
4332 {
4333   return xmalloc (sizeof (struct dfa));
4334 }
4335
4336 /* Initialize DFA.  */
4337 void
4338 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4339            reg_syntax_t bits, int dfaopts)
4340 {
4341   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4342   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4343   dfa->localeinfo = *linfo;
4344
4345   dfa->fast = !dfa->localeinfo.multibyte;
4346
4347   dfa->canychar = -1;
4348   dfa->syntax.syntax_bits_set = true;
4349   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4350   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4351   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4352   dfa->syntax.syntax_bits = bits;
4353
4354   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4355     {
4356       unsigned char uc = i;
4357
4358       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4359       switch (dfa->syntax.sbit[uc])
4360         {
4361         case CTX_LETTER:
4362           setbit (uc, &dfa->syntax.letters);
4363           break;
4364         case CTX_NEWLINE:
4365           setbit (uc, &dfa->syntax.newline);
4366           break;
4367         }
4368
4369       /* POSIX requires that the five bytes in "\n\r./" (including the
4370          terminating NUL) cannot occur inside a multibyte character.  */
4371       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4372                                      ? (uc & 0xc0) != 0x80
4373                                      : strchr ("\n\r./", uc) != NULL);
4374     }
4375 }
4376
4377 /* Initialize TO by copying FROM's syntax settings.  */
4378 void
4379 dfacopysyntax (struct dfa *to, struct dfa const *from)
4380 {
4381   memset (to, 0, offsetof (struct dfa, syntax));
4382   to->canychar = -1;
4383   to->fast = from->fast;
4384   to->syntax = from->syntax;
4385   to->dfaexec = from->dfaexec;
4386   to->localeinfo = from->localeinfo;
4387 }
4388
4389 /* vim:set shiftwidth=2: */