lib/dfa.c

   1 /* dfa.c - deterministic extended regexp routines for GNU
   2    Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2019 Free Software
   3    Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software
  17    Foundation, Inc.,
  18    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
  19
  20 /* Written June, 1988 by Mike Haertel
  21    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
  22
  23 #include <config.h>
  24
  25 #include "dfa.h"
  26
  27 #include <assert.h>
  28 #include <ctype.h>
  29 #include <stdint.h>
  30 #include <stdio.h>
  31 #include <stdlib.h>
  32 #include <limits.h>
  33 #include <string.h>
  34 #include <locale.h>
  35
  36 static bool
  37 streq (char const *a, char const *b)
  38 {
  39   return strcmp (a, b) == 0;
  40 }
  41
  42 static bool
  43 isasciidigit (char c)
  44 {
  45   return '0' <= c && c <= '9';
  46 }
  47
  48 #include "gettext.h"
  49 #define _(str) gettext (str)
  50
  51 #include <wchar.h>
  52
  53 #include "intprops.h"
  54 #include "xalloc.h"
  55 #include "localeinfo.h"
  56
  57 #ifndef FALLTHROUGH
  58 # if __GNUC__ < 7
  59 #  define FALLTHROUGH ((void) 0)
  60 # else
  61 #  define FALLTHROUGH __attribute__ ((__fallthrough__))
  62 # endif
  63 #endif
  64
  65 #ifndef MIN
  66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
  67 #endif
  68
  69 /* HPUX defines these as macros in sys/param.h.  */
  70 #ifdef setbit
  71 # undef setbit
  72 #endif
  73 #ifdef clrbit
  74 # undef clrbit
  75 #endif
  76
  77 /* First integer value that is greater than any character code.  */
  78 enum { NOTCHAR = 1 << CHAR_BIT };
  79
  80 /* This represents part of a character class.  It must be unsigned and
  81    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
  82 typedef unsigned long int charclass_word;
  83
  84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
  85    CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
  86    represents 64 bits' worth of a charclass, where LO and HI are the
  87    low and high-order 32 bits of the 64-bit quantity.  */
  88 #if ULONG_MAX >> 31 >> 31 < 3
  89 enum { CHARCLASS_WORD_BITS = 32 };
  90 # define CHARCLASS_PAIR(lo, hi) lo, hi
  91 #else
  92 enum { CHARCLASS_WORD_BITS = 64 };
  93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
  94 #endif
  95
  96 /* An initializer for a charclass whose 32-bit words are A through H.  */
  97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)          \
  98    {{                                                   \
  99       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),     \
 100       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)      \
 101    }}
 102
 103 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 104 static charclass_word const CHARCLASS_WORD_MASK
 105   = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
 106
 107 /* Number of words required to hold a bit for every character.  */
 108 enum
 109 {
 110   CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
 111 };
 112
 113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
 114 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 115
 116 /* Convert a possibly-signed character to an unsigned character.  This is
 117    a bit safer than casting to unsigned char, since it catches some type
 118    errors that the cast doesn't.  */
 119 static unsigned char
 120 to_uchar (char ch)
 121 {
 122   return ch;
 123 }
 124
 125 /* Contexts tell us whether a character is a newline or a word constituent.
 126    Word-constituent characters are those that satisfy iswalnum, plus '_'.
 127    Each character has a single CTX_* value; bitmasks of CTX_* values denote
 128    a particular character class.
 129
 130    A state also stores a context value, which is a bitmask of CTX_* values.
 131    A state's context represents a set of characters that the state's
 132    predecessors must match.  For example, a state whose context does not
 133    include CTX_LETTER will never have transitions where the previous
 134    character is a word constituent.  A state whose context is CTX_ANY
 135    might have transitions from any character.  */
 136
 137 enum
 138   {
 139     CTX_NONE = 1,
 140     CTX_LETTER = 2,
 141     CTX_NEWLINE = 4,
 142     CTX_ANY = 7
 143   };
 144
 145 /* Sometimes characters can only be matched depending on the surrounding
 146    context.  Such context decisions depend on what the previous character
 147    was, and the value of the current (lookahead) character.  Context
 148    dependent constraints are encoded as 9-bit integers.  Each bit that
 149    is set indicates that the constraint succeeds in the corresponding
 150    context.
 151
 152    bit 6-8  - valid contexts when next character is CTX_NEWLINE
 153    bit 3-5  - valid contexts when next character is CTX_LETTER
 154    bit 0-2  - valid contexts when next character is CTX_NONE
 155
 156    succeeds_in_context determines whether a given constraint
 157    succeeds in a particular context.  Prev is a bitmask of possible
 158    context values for the previous character, curr is the (single-bit)
 159    context value for the lookahead character.  */
 160 static int
 161 newline_constraint (int constraint)
 162 {
 163   return (constraint >> 6) & 7;
 164 }
 165 static int
 166 letter_constraint (int constraint)
 167 {
 168   return (constraint >> 3) & 7;
 169 }
 170 static int
 171 other_constraint (int constraint)
 172 {
 173   return constraint & 7;
 174 }
 175
 176 static bool
 177 succeeds_in_context (int constraint, int prev, int curr)
 178 {
 179   return !! (((curr & CTX_NONE      ? other_constraint (constraint) : 0) \
 180               | (curr & CTX_LETTER  ? letter_constraint (constraint) : 0) \
 181               | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
 182              & prev);
 183 }
 184
 185 /* The following describe what a constraint depends on.  */
 186 static bool
 187 prev_newline_dependent (int constraint)
 188 {
 189   return ((constraint ^ constraint >> 2) & 0111) != 0;
 190 }
 191 static bool
 192 prev_letter_dependent (int constraint)
 193 {
 194   return ((constraint ^ constraint >> 1) & 0111) != 0;
 195 }
 196
 197 /* Tokens that match the empty string subject to some constraint actually
 198    work by applying that constraint to determine what may follow them,
 199    taking into account what has gone before.  The following values are
 200    the constraints corresponding to the special tokens previously defined.  */
 201 enum
 202   {
 203     NO_CONSTRAINT = 0777,
 204     BEGLINE_CONSTRAINT = 0444,
 205     ENDLINE_CONSTRAINT = 0700,
 206     BEGWORD_CONSTRAINT = 0050,
 207     ENDWORD_CONSTRAINT = 0202,
 208     LIMWORD_CONSTRAINT = 0252,
 209     NOTLIMWORD_CONSTRAINT = 0525
 210   };
 211
 212 /* The regexp is parsed into an array of tokens in postfix form.  Some tokens
 213    are operators and others are terminal symbols.  Most (but not all) of these
 214    codes are returned by the lexical analyzer.  */
 215
 216 typedef ptrdiff_t token;
 217 static ptrdiff_t const TOKEN_MAX = PTRDIFF_MAX;
 218
 219 /* States are indexed by state_num values.  These are normally
 220    nonnegative but -1 is used as a special value.  */
 221 typedef ptrdiff_t state_num;
 222
 223 /* Predefined token values.  */
 224 enum
 225 {
 226   END = -1,                     /* END is a terminal symbol that matches the
 227                                    end of input; any value of END or less in
 228                                    the parse tree is such a symbol.  Accepting
 229                                    states of the DFA are those that would have
 230                                    a transition on END.  This is -1, not some
 231                                    more-negative value, to tweak the speed of
 232                                    comparisons to END.  */
 233
 234   /* Ordinary character values are terminal symbols that match themselves.  */
 235
 236   /* CSET must come last in the following list of special tokens.  Otherwise,
 237      the list order matters only for performance.  Related special tokens
 238      should have nearby values so that code like (t == ANYCHAR || t == MBCSET
 239      || CSET <= t) can be done with a single machine-level comparison.  */
 240
 241   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
 242                                    the empty string.  */
 243
 244   QMARK,                        /* QMARK is an operator of one argument that
 245                                    matches zero or one occurrences of its
 246                                    argument.  */
 247
 248   STAR,                         /* STAR is an operator of one argument that
 249                                    matches the Kleene closure (zero or more
 250                                    occurrences) of its argument.  */
 251
 252   PLUS,                         /* PLUS is an operator of one argument that
 253                                    matches the positive closure (one or more
 254                                    occurrences) of its argument.  */
 255
 256   REPMN,                        /* REPMN is a lexical token corresponding
 257                                    to the {m,n} construct.  REPMN never
 258                                    appears in the compiled token vector.  */
 259
 260   CAT,                          /* CAT is an operator of two arguments that
 261                                    matches the concatenation of its
 262                                    arguments.  CAT is never returned by the
 263                                    lexical analyzer.  */
 264
 265   OR,                           /* OR is an operator of two arguments that
 266                                    matches either of its arguments.  */
 267
 268   LPAREN,                       /* LPAREN never appears in the parse tree,
 269                                    it is only a lexeme.  */
 270
 271   RPAREN,                       /* RPAREN never appears in the parse tree.  */
 272
 273   WCHAR,                        /* Only returned by lex.  wctok contains
 274                                    the wide character representation.  */
 275
 276   ANYCHAR,                      /* ANYCHAR is a terminal symbol that matches
 277                                    a valid multibyte (or single byte) character.
 278                                    It is used only if MB_CUR_MAX > 1.  */
 279
 280   BEG,                          /* BEG is an initial symbol that matches the
 281                                    beginning of input.  */
 282
 283   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
 284                                    the empty string at the beginning of a
 285                                    line.  */
 286
 287   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
 288                                    the empty string at the end of a line.  */
 289
 290   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
 291                                    the empty string at the beginning of a
 292                                    word.  */
 293
 294   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
 295                                    the empty string at the end of a word.  */
 296
 297   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
 298                                    the empty string at the beginning or the
 299                                    end of a word.  */
 300
 301   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
 302                                    matches the empty string not at
 303                                    the beginning or end of a word.  */
 304
 305   BACKREF,                      /* BACKREF is generated by \<digit>
 306                                    or by any other construct that
 307                                    is not completely handled.  If the scanner
 308                                    detects a transition on backref, it returns
 309                                    a kind of "semi-success" indicating that
 310                                    the match will have to be verified with
 311                                    a backtracking matcher.  */
 312
 313   MBCSET,                       /* MBCSET is similar to CSET, but for
 314                                    multibyte characters.  */
 315
 316   CSET                          /* CSET and (and any value greater) is a
 317                                    terminal symbol that matches any of a
 318                                    class of characters.  */
 319 };
 320
 321
 322 /* States of the recognizer correspond to sets of positions in the parse
 323    tree, together with the constraints under which they may be matched.
 324    So a position is encoded as an index into the parse tree together with
 325    a constraint.  */
 326 typedef struct
 327 {
 328   size_t index;                 /* Index into the parse array.  */
 329   unsigned int constraint;      /* Constraint for matching this position.  */
 330 } position;
 331
 332 /* Sets of positions are stored as arrays.  */
 333 typedef struct
 334 {
 335   position *elems;              /* Elements of this position set.  */
 336   ptrdiff_t nelem;              /* Number of elements in this set.  */
 337   ptrdiff_t alloc;              /* Number of elements allocated in ELEMS.  */
 338 } position_set;
 339
 340 /* A state of the dfa consists of a set of positions, some flags,
 341    and the token value of the lowest-numbered position of the state that
 342    contains an END token.  */
 343 typedef struct
 344 {
 345   size_t hash;                  /* Hash of the positions of this state.  */
 346   position_set elems;           /* Positions this state could match.  */
 347   unsigned char context;        /* Context from previous state.  */
 348   unsigned short constraint;    /* Constraint for this state to accept.  */
 349   token first_end;              /* Token value of the first END in elems.  */
 350   position_set mbps;            /* Positions which can match multibyte
 351                                    characters or the follows, e.g., period.
 352                                    Used only if MB_CUR_MAX > 1.  */
 353   state_num mb_trindex;         /* Index of this state in MB_TRANS, or
 354                                    negative if the state does not have
 355                                    ANYCHAR.  */
 356 } dfa_state;
 357
 358 /* Maximum for any transition table count.  This should be at least 3,
 359    for the initial state setup.  */
 360 enum { MAX_TRCOUNT = 1024 };
 361
 362 /* A bracket operator.
 363    e.g., [a-c], [[:alpha:]], etc.  */
 364 struct mb_char_classes
 365 {
 366   ptrdiff_t cset;
 367   bool invert;
 368   wchar_t *chars;               /* Normal characters.  */
 369   ptrdiff_t nchars;
 370   ptrdiff_t nchars_alloc;
 371 };
 372
 373 struct regex_syntax
 374 {
 375   /* Syntax bits controlling the behavior of the lexical analyzer.  */
 376   reg_syntax_t syntax_bits;
 377   bool syntax_bits_set;
 378
 379   /* Flag for case-folding letters into sets.  */
 380   bool case_fold;
 381
 382   /* True if ^ and $ match only the start and end of data, and do not match
 383      end-of-line within data.  */
 384   bool anchor;
 385
 386   /* End-of-line byte in data.  */
 387   unsigned char eolbyte;
 388
 389   /* Cache of char-context values.  */
 390   char sbit[NOTCHAR];
 391
 392   /* If never_trail[B], the byte B cannot be a non-initial byte in a
 393      multibyte character.  */
 394   bool never_trail[NOTCHAR];
 395
 396   /* Set of characters considered letters.  */
 397   charclass letters;
 398
 399   /* Set of characters that are newline.  */
 400   charclass newline;
 401 };
 402
 403 /* Lexical analyzer.  All the dross that deals with the obnoxious
 404    GNU Regex syntax bits is located here.  The poor, suffering
 405    reader is referred to the GNU Regex documentation for the
 406    meaning of the @#%!@#%^!@ syntax bits.  */
 407 struct lexer_state
 408 {
 409   char const *ptr;      /* Pointer to next input character.  */
 410   size_t left;          /* Number of characters remaining.  */
 411   token lasttok;        /* Previous token returned; initially END.  */
 412   size_t parens;        /* Count of outstanding left parens.  */
 413   int minrep, maxrep;   /* Repeat counts for {m,n}.  */
 414
 415   /* Wide character representation of the current multibyte character,
 416      or WEOF if there was an encoding error.  Used only if
 417      MB_CUR_MAX > 1.  */
 418   wint_t wctok;
 419
 420   /* Length of the multibyte representation of wctok.  */
 421   int cur_mb_len;
 422
 423   /* The most recently analyzed multibyte bracket expression.  */
 424   struct mb_char_classes brack;
 425
 426   /* We're separated from beginning or (, | only by zero-width characters.  */
 427   bool laststart;
 428 };
 429
 430 /* Recursive descent parser for regular expressions.  */
 431
 432 struct parser_state
 433 {
 434   token tok;               /* Lookahead token.  */
 435   size_t depth;            /* Current depth of a hypothetical stack
 436                               holding deferred productions.  This is
 437                               used to determine the depth that will be
 438                               required of the real stack later on in
 439                               dfaanalyze.  */
 440 };
 441
 442 /* A compiled regular expression.  */
 443 struct dfa
 444 {
 445   /* Syntax configuration */
 446   struct regex_syntax syntax;
 447
 448   /* Fields filled by the scanner.  */
 449   charclass *charclasses;       /* Array of character sets for CSET tokens.  */
 450   ptrdiff_t cindex;             /* Index for adding new charclasses.  */
 451   ptrdiff_t calloc;             /* Number of charclasses allocated.  */
 452   size_t canychar;              /* Index of anychar class, or (size_t) -1.  */
 453
 454   /* Scanner state */
 455   struct lexer_state lex;
 456
 457   /* Parser state */
 458   struct parser_state parse;
 459
 460   /* Fields filled by the parser.  */
 461   token *tokens;                /* Postfix parse array.  */
 462   size_t tindex;                /* Index for adding new tokens.  */
 463   size_t talloc;                /* Number of tokens currently allocated.  */
 464   size_t depth;                 /* Depth required of an evaluation stack
 465                                    used for depth-first traversal of the
 466                                    parse tree.  */
 467   size_t nleaves;               /* Number of leaves on the parse tree.  */
 468   size_t nregexps;              /* Count of parallel regexps being built
 469                                    with dfaparse.  */
 470   bool fast;                    /* The DFA is fast.  */
 471   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
 472   mbstate_t mbs;                /* Multibyte conversion state.  */
 473
 474   /* The following are valid only if MB_CUR_MAX > 1.  */
 475
 476   /* The value of multibyte_prop[i] is defined by following rule.
 477      if tokens[i] < NOTCHAR
 478      bit 0 : tokens[i] is the first byte of a character, including
 479      single-byte characters.
 480      bit 1 : tokens[i] is the last byte of a character, including
 481      single-byte characters.
 482
 483      e.g.
 484      tokens
 485      = 'single_byte_a', 'multi_byte_A', single_byte_b'
 486      = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
 487      multibyte_prop
 488      = 3     , 1               ,  0              ,  2              , 3
 489    */
 490   char *multibyte_prop;
 491
 492   /* Fields filled by the superset.  */
 493   struct dfa *superset;             /* Hint of the dfa.  */
 494
 495   /* Fields filled by the state builder.  */
 496   dfa_state *states;            /* States of the dfa.  */
 497   state_num sindex;             /* Index for adding new states.  */
 498   ptrdiff_t salloc;             /* Number of states currently allocated.  */
 499
 500   /* Fields filled by the parse tree->NFA conversion.  */
 501   position_set *follows;        /* Array of follow sets, indexed by position
 502                                    index.  The follow of a position is the set
 503                                    of positions containing characters that
 504                                    could conceivably follow a character
 505                                    matching the given position in a string
 506                                    matching the regexp.  Allocated to the
 507                                    maximum possible position index.  */
 508   bool searchflag;              /* We are supposed to build a searching
 509                                    as opposed to an exact matcher.  A searching
 510                                    matcher finds the first and shortest string
 511                                    matching a regexp anywhere in the buffer,
 512                                    whereas an exact matcher finds the longest
 513                                    string matching, but anchored to the
 514                                    beginning of the buffer.  */
 515
 516   /* Fields filled by dfaanalyze.  */
 517   int *constraints;             /* Array of union of accepting constraints
 518                                    in the follow of a position.  */
 519   int *separates;               /* Array of contexts on follow of a
 520                                    position.  */
 521
 522   /* Fields filled by dfaexec.  */
 523   state_num tralloc;            /* Number of transition tables that have
 524                                    slots so far, not counting trans[-1] and
 525                                    trans[-2].  */
 526   int trcount;                  /* Number of transition tables that have
 527                                    been built, other than for initial
 528                                    states.  */
 529   int min_trcount;              /* Number of initial states.  Equivalently,
 530                                    the minimum state number for which trcount
 531                                    counts transitions.  */
 532   state_num **trans;            /* Transition tables for states that can
 533                                    never accept.  If the transitions for a
 534                                    state have not yet been computed, or the
 535                                    state could possibly accept, its entry in
 536                                    this table is NULL.  This points to two
 537                                    past the start of the allocated array,
 538                                    and trans[-1] and trans[-2] are always
 539                                    NULL.  */
 540   state_num **fails;            /* Transition tables after failing to accept
 541                                    on a state that potentially could do so.
 542                                    If trans[i] is non-null, fails[i] must
 543                                    be null.  */
 544   char *success;                /* Table of acceptance conditions used in
 545                                    dfaexec and computed in build_state.  */
 546   state_num *newlines;          /* Transitions on newlines.  The entry for a
 547                                    newline in any transition table is always
 548                                    -1 so we can count lines without wasting
 549                                    too many cycles.  The transition for a
 550                                    newline is stored separately and handled
 551                                    as a special case.  Newline is also used
 552                                    as a sentinel at the end of the buffer.  */
 553   state_num initstate_notbol;   /* Initial state for CTX_LETTER and CTX_NONE
 554                                    context in multibyte locales, in which we
 555                                    do not distinguish between their contexts,
 556                                    as not supported word.  */
 557   position_set mb_follows;      /* Follow set added by ANYCHAR on demand.  */
 558   state_num **mb_trans;         /* Transition tables for states with
 559                                    ANYCHAR.  */
 560   state_num mb_trcount;         /* Number of transition tables for states with
 561                                    ANYCHAR that have actually been built.  */
 562
 563   /* Information derived from the locale.  This is at the end so that
 564      a quick memset need not clear it specially.  */
 565
 566   /* dfaexec implementation.  */
 567   char *(*dfaexec) (struct dfa *, char const *, char *,
 568                     bool, size_t *, bool *);
 569
 570   /* The locale is simple, like the C locale.  These locales can be
 571      processed more efficiently, as they are single-byte, their native
 572      character set is in collating-sequence order, and they do not
 573      have multi-character collating elements.  */
 574   bool simple_locale;
 575
 576   /* Other cached information derived from the locale.  */
 577   struct localeinfo localeinfo;
 578 };
 579
 580 /* User access to dfa internals.  */
 581
 582 /* S could possibly be an accepting state of R.  */
 583 static bool
 584 accepting (state_num s, struct dfa const *r)
 585 {
 586   return r->states[s].constraint != 0;
 587 }
 588
 589 /* STATE accepts in the specified context.  */
 590 static bool
 591 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
 592 {
 593   return succeeds_in_context (dfa->states[state].constraint, prev, curr);
 594 }
 595
 596 static void regexp (struct dfa *dfa);
 597
 598 /* Store into *PWC the result of converting the leading bytes of the
 599    multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
 600    and updating the conversion state in *D.  On conversion error,
 601    convert just a single byte, to WEOF.  Return the number of bytes
 602    converted.
 603
 604    This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
 605
 606    * PWC points to wint_t, not to wchar_t.
 607    * The last arg is a dfa *D instead of merely a multibyte conversion
 608      state D->mbs.
 609    * N must be at least 1.
 610    * S[N - 1] must be a sentinel byte.
 611    * Shift encodings are not supported.
 612    * The return value is always in the range 1..N.
 613    * D->mbs is always valid afterwards.
 614    * *PWC is always set to something.  */
 615 static size_t
 616 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 617 {
 618   unsigned char uc = s[0];
 619   wint_t wc = d->localeinfo.sbctowc[uc];
 620
 621   if (wc == WEOF)
 622     {
 623       wchar_t wch;
 624       size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
 625       if (0 < nbytes && nbytes < (size_t) -2)
 626         {
 627           *pwc = wch;
 628           return nbytes;
 629         }
 630       memset (&d->mbs, 0, sizeof d->mbs);
 631     }
 632
 633   *pwc = wc;
 634   return 1;
 635 }
 636
 637 #ifdef DEBUG
 638
 639 static void
 640 prtok (token t)
 641 {
 642   if (t <= END)
 643     fprintf (stderr, "END");
 644   else if (0 <= t && t < NOTCHAR)
 645     {
 646       unsigned int ch = t;
 647       fprintf (stderr, "0x%02x", ch);
 648     }
 649   else
 650     {
 651       char const *s;
 652       switch (t)
 653         {
 654         case BEG:
 655           s = "BEG";
 656           break;
 657         case EMPTY:
 658           s = "EMPTY";
 659           break;
 660         case BACKREF:
 661           s = "BACKREF";
 662           break;
 663         case BEGLINE:
 664           s = "BEGLINE";
 665           break;
 666         case ENDLINE:
 667           s = "ENDLINE";
 668           break;
 669         case BEGWORD:
 670           s = "BEGWORD";
 671           break;
 672         case ENDWORD:
 673           s = "ENDWORD";
 674           break;
 675         case LIMWORD:
 676           s = "LIMWORD";
 677           break;
 678         case NOTLIMWORD:
 679           s = "NOTLIMWORD";
 680           break;
 681         case QMARK:
 682           s = "QMARK";
 683           break;
 684         case STAR:
 685           s = "STAR";
 686           break;
 687         case PLUS:
 688           s = "PLUS";
 689           break;
 690         case CAT:
 691           s = "CAT";
 692           break;
 693         case OR:
 694           s = "OR";
 695           break;
 696         case LPAREN:
 697           s = "LPAREN";
 698           break;
 699         case RPAREN:
 700           s = "RPAREN";
 701           break;
 702         case ANYCHAR:
 703           s = "ANYCHAR";
 704           break;
 705         case MBCSET:
 706           s = "MBCSET";
 707           break;
 708         default:
 709           s = "CSET";
 710           break;
 711         }
 712       fprintf (stderr, "%s", s);
 713     }
 714 }
 715 #endif /* DEBUG */
 716
 717 /* Stuff pertaining to charclasses.  */
 718
 719 static bool
 720 tstbit (unsigned int b, charclass const *c)
 721 {
 722   return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 723 }
 724
 725 static void
 726 setbit (unsigned int b, charclass *c)
 727 {
 728   charclass_word one = 1;
 729   c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 730 }
 731
 732 static void
 733 clrbit (unsigned int b, charclass *c)
 734 {
 735   charclass_word one = 1;
 736   c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 737 }
 738
 739 static void
 740 zeroset (charclass *s)
 741 {
 742   memset (s, 0, sizeof *s);
 743 }
 744
 745 static void
 746 fillset (charclass *s)
 747 {
 748   for (int i = 0; i < CHARCLASS_WORDS; i++)
 749     s->w[i] = CHARCLASS_WORD_MASK;
 750 }
 751
 752 static void
 753 notset (charclass *s)
 754 {
 755   for (int i = 0; i < CHARCLASS_WORDS; ++i)
 756     s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 757 }
 758
 759 static bool
 760 equal (charclass const *s1, charclass const *s2)
 761 {
 762   charclass_word w = 0;
 763   for (int i = 0; i < CHARCLASS_WORDS; i++)
 764     w |= s1->w[i] ^ s2->w[i];
 765   return w == 0;
 766 }
 767
 768 static bool
 769 emptyset (charclass const *s)
 770 {
 771   charclass_word w = 0;
 772   for (int i = 0; i < CHARCLASS_WORDS; i++)
 773     w |= s->w[i];
 774   return w == 0;
 775 }
 776
 777 /* Grow PA, which points to an array of *NITEMS items, and return the
 778    location of the reallocated array, updating *NITEMS to reflect its
 779    new size.  The new array will contain at least NITEMS_INCR_MIN more
 780    items, but will not contain more than NITEMS_MAX items total.
 781    ITEM_SIZE is the size of each item, in bytes.
 782
 783    ITEM_SIZE and NITEMS_INCR_MIN must be positive.  *NITEMS must be
 784    nonnegative.  If NITEMS_MAX is -1, it is treated as if it were
 785    infinity.
 786
 787    If PA is null, then allocate a new array instead of reallocating
 788    the old one.
 789
 790    Thus, to grow an array A without saving its old contents, do
 791    { free (A); A = xpalloc (NULL, &AITEMS, ...); }.  */
 792
 793 static void *
 794 xpalloc (void *pa, ptrdiff_t *nitems, ptrdiff_t nitems_incr_min,
 795          ptrdiff_t nitems_max, ptrdiff_t item_size)
 796 {
 797   ptrdiff_t n0 = *nitems;
 798
 799   /* The approximate size to use for initial small allocation
 800      requests.  This is the largest "small" request for the GNU C
 801      library malloc.  */
 802   enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
 803
 804   /* If the array is tiny, grow it to about (but no greater than)
 805      DEFAULT_MXFAST bytes.  Otherwise, grow it by about 50%.
 806      Adjust the growth according to three constraints: NITEMS_INCR_MIN,
 807      NITEMS_MAX, and what the C language can represent safely.  */
 808
 809   ptrdiff_t n, nbytes;
 810   if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
 811     n = PTRDIFF_MAX;
 812   if (0 <= nitems_max && nitems_max < n)
 813     n = nitems_max;
 814
 815   ptrdiff_t adjusted_nbytes
 816     = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
 817        ? MIN (PTRDIFF_MAX, SIZE_MAX)
 818        : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
 819   if (adjusted_nbytes)
 820     {
 821       n = adjusted_nbytes / item_size;
 822       nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
 823     }
 824
 825   if (! pa)
 826     *nitems = 0;
 827   if (n - n0 < nitems_incr_min
 828       && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
 829           || (0 <= nitems_max && nitems_max < n)
 830           || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
 831     xalloc_die ();
 832   pa = xrealloc (pa, nbytes);
 833   *nitems = n;
 834   return pa;
 835 }
 836
 837 /* Ensure that the array addressed by PA holds at least I + 1 items.
 838    Either return PA, or reallocate the array and return its new address.
 839    Although PA may be null, the returned value is never null.
 840
 841    The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
 842    is updated on reallocation.  If PA is null, *NITEMS must be zero.
 843    Do not allocate more than NITEMS_MAX items total; -1 means no limit.
 844    ITEM_SIZE is the size of one item; it must be positive.
 845    Avoid O(N**2) behavior on arrays growing linearly.  */
 846 static void *
 847 maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems,
 848                ptrdiff_t nitems_max, ptrdiff_t item_size)
 849 {
 850   if (i < *nitems)
 851     return pa;
 852   return xpalloc (pa, nitems, 1, nitems_max, item_size);
 853 }
 854
 855 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 856 static ptrdiff_t
 857 charclass_index (struct dfa *d, charclass *s)
 858 {
 859   ptrdiff_t i;
 860
 861   for (i = 0; i < d->cindex; ++i)
 862     if (equal (s, &d->charclasses[i]))
 863       return i;
 864   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
 865                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
 866   ++d->cindex;
 867   d->charclasses[i] = *s;
 868   return i;
 869 }
 870
 871 static bool
 872 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 873 {
 874   return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 875 }
 876
 877 static int
 878 char_context (struct dfa const *dfa, unsigned char c)
 879 {
 880   if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
 881     return CTX_NEWLINE;
 882   if (unibyte_word_constituent (dfa, c))
 883     return CTX_LETTER;
 884   return CTX_NONE;
 885 }
 886
 887 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 888    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 889    this may happen when folding case in weird Turkish locales where
 890    dotless i/dotted I are not included in the chosen character set.
 891    Return whether a bit was set in the charclass.  */
 892 static bool
 893 setbit_wc (wint_t wc, charclass *c)
 894 {
 895   int b = wctob (wc);
 896   if (b < 0)
 897     return false;
 898
 899   setbit (b, c);
 900   return true;
 901 }
 902
 903 /* Set a bit for B and its case variants in the charclass C.
 904    MB_CUR_MAX must be 1.  */
 905 static void
 906 setbit_case_fold_c (int b, charclass *c)
 907 {
 908   int ub = toupper (b);
 909   for (int i = 0; i < NOTCHAR; i++)
 910     if (toupper (i) == ub)
 911       setbit (i, c);
 912 }
 913
 914 /* Return true if the locale compatible with the C locale.  */
 915
 916 static bool
 917 using_simple_locale (bool multibyte)
 918 {
 919   /* The native character set is known to be compatible with
 920      the C locale.  The following test isn't perfect, but it's good
 921      enough in practice, as only ASCII and EBCDIC are in common use
 922      and this test correctly accepts ASCII and rejects EBCDIC.  */
 923   enum { native_c_charset =
 924     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
 925      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
 926      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
 927      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
 928      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
 929      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
 930      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
 931      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
 932      && '}' == 125 && '~' == 126)
 933   };
 934
 935   if (!native_c_charset || multibyte)
 936     return false;
 937   else
 938     {
 939       /* Treat C and POSIX locales as being compatible.  Also, treat
 940          errors as compatible, as these are invariably from stubs.  */
 941       char const *loc = setlocale (LC_ALL, NULL);
 942       return !loc || streq (loc, "C") || streq (loc, "POSIX");
 943     }
 944 }
 945
 946 /* Fetch the next lexical input character from the pattern.  There
 947    must at least one byte of pattern input.  Set DFA->lex.wctok to the
 948    value of the character or to WEOF depending on whether the input is
 949    a valid multibyte character (possibly of length 1).  Then return
 950    the next input byte value, except return EOF if the input is a
 951    multibyte character of length greater than 1.  */
 952 static int
 953 fetch_wc (struct dfa *dfa)
 954 {
 955   size_t nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
 956                                 dfa);
 957   dfa->lex.cur_mb_len = nbytes;
 958   int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
 959   dfa->lex.ptr += nbytes;
 960   dfa->lex.left -= nbytes;
 961   return c;
 962 }
 963
 964 /* If there is no more input, report an error about unbalanced brackets.
 965    Otherwise, behave as with fetch_wc (DFA).  */
 966 static int
 967 bracket_fetch_wc (struct dfa *dfa)
 968 {
 969   if (! dfa->lex.left)
 970     dfaerror (_("unbalanced ["));
 971   return fetch_wc (dfa);
 972 }
 973
 974 typedef int predicate (int);
 975
 976 /* The following list maps the names of the Posix named character classes
 977    to predicate functions that determine whether a given character is in
 978    the class.  The leading [ has already been eaten by the lexical
 979    analyzer.  */
 980 struct dfa_ctype
 981 {
 982   const char *name;
 983   predicate *func;
 984   bool single_byte_only;
 985 };
 986
 987 static const struct dfa_ctype prednames[] = {
 988   {"alpha", isalpha, false},
 989   {"upper", isupper, false},
 990   {"lower", islower, false},
 991   {"digit", isdigit, true},
 992   {"xdigit", isxdigit, false},
 993   {"space", isspace, false},
 994   {"punct", ispunct, false},
 995   {"alnum", isalnum, false},
 996   {"print", isprint, false},
 997   {"graph", isgraph, false},
 998   {"cntrl", iscntrl, false},
 999   {"blank", isblank, false},
1000   {NULL, NULL, false}
1001 };
1002
1003 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
1004 find_pred (const char *str)
1005 {
1006   for (unsigned int i = 0; prednames[i].name; ++i)
1007     if (streq (str, prednames[i].name))
1008       return &prednames[i];
1009   return NULL;
1010 }
1011
1012 /* Parse a bracket expression, which possibly includes multibyte
1013    characters.  */
1014 static token
1015 parse_bracket_exp (struct dfa *dfa)
1016 {
1017   /* This is a bracket expression that dfaexec is known to
1018      process correctly.  */
1019   bool known_bracket_exp = true;
1020
1021   /* Used to warn about [:space:].
1022      Bit 0 = first character is a colon.
1023      Bit 1 = last character is a colon.
1024      Bit 2 = includes any other character but a colon.
1025      Bit 3 = includes ranges, char/equiv classes or collation elements.  */
1026   int colon_warning_state;
1027
1028   dfa->lex.brack.nchars = 0;
1029   charclass ccl;
1030   zeroset (&ccl);
1031   int c = bracket_fetch_wc (dfa);
1032   bool invert = c == '^';
1033   if (invert)
1034     {
1035       c = bracket_fetch_wc (dfa);
1036       known_bracket_exp = dfa->simple_locale;
1037     }
1038   wint_t wc = dfa->lex.wctok;
1039   int c1;
1040   wint_t wc1;
1041   colon_warning_state = (c == ':');
1042   do
1043     {
1044       c1 = NOTCHAR;     /* Mark c1 as not initialized.  */
1045       colon_warning_state &= ~2;
1046
1047       /* Note that if we're looking at some other [:...:] construct,
1048          we just treat it as a bunch of ordinary characters.  We can do
1049          this because we assume regex has checked for syntax errors before
1050          dfa is ever called.  */
1051       if (c == '[')
1052         {
1053           c1 = bracket_fetch_wc (dfa);
1054           wc1 = dfa->lex.wctok;
1055
1056           if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1057               || c1 == '.' || c1 == '=')
1058             {
1059               enum { MAX_BRACKET_STRING_LEN = 32 };
1060               char str[MAX_BRACKET_STRING_LEN + 1];
1061               size_t len = 0;
1062               for (;;)
1063                 {
1064                   c = bracket_fetch_wc (dfa);
1065                   if (dfa->lex.left == 0
1066                       || (c == c1 && dfa->lex.ptr[0] == ']'))
1067                     break;
1068                   if (len < MAX_BRACKET_STRING_LEN)
1069                     str[len++] = c;
1070                   else
1071                     /* This is in any case an invalid class name.  */
1072                     str[0] = '\0';
1073                 }
1074               str[len] = '\0';
1075
1076               /* Fetch bracket.  */
1077               c = bracket_fetch_wc (dfa);
1078               wc = dfa->lex.wctok;
1079               if (c1 == ':')
1080                 /* Build character class.  POSIX allows character
1081                    classes to match multicharacter collating elements,
1082                    but the regex code does not support that, so do not
1083                    worry about that possibility.  */
1084                 {
1085                   char const *class
1086                     = (dfa->syntax.case_fold && (streq (str, "upper")
1087                                                  || streq (str, "lower"))
1088                        ? "alpha" : str);
1089                   const struct dfa_ctype *pred = find_pred (class);
1090                   if (!pred)
1091                     dfaerror (_("invalid character class"));
1092
1093                   if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1094                     known_bracket_exp = false;
1095                   else
1096                     for (int c2 = 0; c2 < NOTCHAR; ++c2)
1097                       if (pred->func (c2))
1098                         setbit (c2, &ccl);
1099                 }
1100               else
1101                 known_bracket_exp = false;
1102
1103               colon_warning_state |= 8;
1104
1105               /* Fetch new lookahead character.  */
1106               c1 = bracket_fetch_wc (dfa);
1107               wc1 = dfa->lex.wctok;
1108               continue;
1109             }
1110
1111           /* We treat '[' as a normal character here.  c/c1/wc/wc1
1112              are already set up.  */
1113         }
1114
1115       if (c == '\\'
1116           && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1117         {
1118           c = bracket_fetch_wc (dfa);
1119           wc = dfa->lex.wctok;
1120         }
1121
1122       if (c1 == NOTCHAR)
1123         {
1124           c1 = bracket_fetch_wc (dfa);
1125           wc1 = dfa->lex.wctok;
1126         }
1127
1128       if (c1 == '-')
1129         /* build range characters.  */
1130         {
1131           int c2 = bracket_fetch_wc (dfa);
1132           wint_t wc2 = dfa->lex.wctok;
1133
1134           /* A bracket expression like [a-[.aa.]] matches an unknown set.
1135              Treat it like [-a[.aa.]] while parsing it, and
1136              remember that the set is unknown.  */
1137           if (c2 == '[' && dfa->lex.ptr[0] == '.')
1138             {
1139               known_bracket_exp = false;
1140               c2 = ']';
1141             }
1142
1143           if (c2 == ']')
1144             {
1145               /* In the case [x-], the - is an ordinary hyphen,
1146                  which is left in c1, the lookahead character.  */
1147               dfa->lex.ptr -= dfa->lex.cur_mb_len;
1148               dfa->lex.left += dfa->lex.cur_mb_len;
1149             }
1150           else
1151             {
1152               if (c2 == '\\' && (dfa->syntax.syntax_bits
1153                                  & RE_BACKSLASH_ESCAPE_IN_LISTS))
1154                 {
1155                   c2 = bracket_fetch_wc (dfa);
1156                   wc2 = dfa->lex.wctok;
1157                 }
1158
1159               colon_warning_state |= 8;
1160               c1 = bracket_fetch_wc (dfa);
1161               wc1 = dfa->lex.wctok;
1162
1163               /* Treat [x-y] as a range if x != y.  */
1164               if (wc != wc2 || wc == WEOF)
1165                 {
1166                   if (dfa->simple_locale
1167                       || (isasciidigit (c) & isasciidigit (c2)))
1168                     {
1169                       for (int ci = c; ci <= c2; ci++)
1170                         if (dfa->syntax.case_fold && isalpha (ci))
1171                           setbit_case_fold_c (ci, &ccl);
1172                         else
1173                           setbit (ci, &ccl);
1174                     }
1175                   else
1176                     known_bracket_exp = false;
1177
1178                   continue;
1179                 }
1180             }
1181         }
1182
1183       colon_warning_state |= (c == ':') ? 2 : 4;
1184
1185       if (!dfa->localeinfo.multibyte)
1186         {
1187           if (dfa->syntax.case_fold && isalpha (c))
1188             setbit_case_fold_c (c, &ccl);
1189           else
1190             setbit (c, &ccl);
1191           continue;
1192         }
1193
1194       if (wc == WEOF)
1195         known_bracket_exp = false;
1196       else
1197         {
1198           wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1199           unsigned int n = (dfa->syntax.case_fold
1200                             ? case_folded_counterparts (wc, folded + 1) + 1
1201                             : 1);
1202           folded[0] = wc;
1203           for (unsigned int i = 0; i < n; i++)
1204             if (!setbit_wc (folded[i], &ccl))
1205               {
1206                 dfa->lex.brack.chars
1207                   = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1208                                    &dfa->lex.brack.nchars_alloc, -1,
1209                                    sizeof *dfa->lex.brack.chars);
1210                 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1211               }
1212         }
1213     }
1214   while ((wc = wc1, (c = c1) != ']'));
1215
1216   if (colon_warning_state == 7)
1217     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1218
1219   if (! known_bracket_exp)
1220     return BACKREF;
1221
1222   if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1223     {
1224       dfa->lex.brack.invert = invert;
1225       dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1226       return MBCSET;
1227     }
1228
1229   if (invert)
1230     {
1231       notset (&ccl);
1232       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1233         clrbit ('\n', &ccl);
1234     }
1235
1236   return CSET + charclass_index (dfa, &ccl);
1237 }
1238
1239 struct lexptr
1240 {
1241   char const *ptr;
1242   size_t left;
1243 };
1244
1245 static void
1246 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1247 {
1248   ls->ptr = dfa->lex.ptr;
1249   ls->left = dfa->lex.left;
1250   dfa->lex.ptr = s;
1251   dfa->lex.left = strlen (s);
1252 }
1253
1254 static void
1255 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1256 {
1257   dfa->lex.ptr = ls->ptr;
1258   dfa->lex.left = ls->left;
1259 }
1260
1261 static token
1262 lex (struct dfa *dfa)
1263 {
1264   bool backslash = false;
1265
1266   /* Basic plan: We fetch a character.  If it's a backslash,
1267      we set the backslash flag and go through the loop again.
1268      On the plus side, this avoids having a duplicate of the
1269      main switch inside the backslash case.  On the minus side,
1270      it means that just about every case begins with
1271      "if (backslash) ...".  */
1272   for (int i = 0; i < 2; ++i)
1273     {
1274       if (! dfa->lex.left)
1275         return dfa->lex.lasttok = END;
1276       int c = fetch_wc (dfa);
1277
1278       switch (c)
1279         {
1280         case '\\':
1281           if (backslash)
1282             goto normal_char;
1283           if (dfa->lex.left == 0)
1284             dfaerror (_("unfinished \\ escape"));
1285           backslash = true;
1286           break;
1287
1288         case '^':
1289           if (backslash)
1290             goto normal_char;
1291           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1292               || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1293               || dfa->lex.lasttok == OR)
1294             return dfa->lex.lasttok = BEGLINE;
1295           goto normal_char;
1296
1297         case '$':
1298           if (backslash)
1299             goto normal_char;
1300           if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1301               || dfa->lex.left == 0
1302               || ((dfa->lex.left
1303                    > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1304                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1305                                    & (dfa->lex.ptr[0] == '\\')]
1306                       == ')'))
1307               || ((dfa->lex.left
1308                    > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1309                   && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1310                                    & (dfa->lex.ptr[0] == '\\')]
1311                       == '|'))
1312               || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1313                   && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1314             return dfa->lex.lasttok = ENDLINE;
1315           goto normal_char;
1316
1317         case '1':
1318         case '2':
1319         case '3':
1320         case '4':
1321         case '5':
1322         case '6':
1323         case '7':
1324         case '8':
1325         case '9':
1326           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1327             {
1328               dfa->lex.laststart = false;
1329               return dfa->lex.lasttok = BACKREF;
1330             }
1331           goto normal_char;
1332
1333         case '`':
1334           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1335             {
1336               /* FIXME: should be beginning of string */
1337               return dfa->lex.lasttok = BEGLINE;
1338             }
1339           goto normal_char;
1340
1341         case '\'':
1342           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1343             {
1344               /* FIXME: should be end of string */
1345               return dfa->lex.lasttok = ENDLINE;
1346             }
1347           goto normal_char;
1348
1349         case '<':
1350           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1351             return dfa->lex.lasttok = BEGWORD;
1352           goto normal_char;
1353
1354         case '>':
1355           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1356             return dfa->lex.lasttok = ENDWORD;
1357           goto normal_char;
1358
1359         case 'b':
1360           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1361             return dfa->lex.lasttok = LIMWORD;
1362           goto normal_char;
1363
1364         case 'B':
1365           if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1366             return dfa->lex.lasttok = NOTLIMWORD;
1367           goto normal_char;
1368
1369         case '?':
1370           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1371             goto normal_char;
1372           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1373             goto normal_char;
1374           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1375               && dfa->lex.laststart)
1376             goto normal_char;
1377           return dfa->lex.lasttok = QMARK;
1378
1379         case '*':
1380           if (backslash)
1381             goto normal_char;
1382           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1383               && dfa->lex.laststart)
1384             goto normal_char;
1385           return dfa->lex.lasttok = STAR;
1386
1387         case '+':
1388           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1389             goto normal_char;
1390           if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1391             goto normal_char;
1392           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1393               && dfa->lex.laststart)
1394             goto normal_char;
1395           return dfa->lex.lasttok = PLUS;
1396
1397         case '{':
1398           if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1399             goto normal_char;
1400           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1401             goto normal_char;
1402           if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1403               && dfa->lex.laststart)
1404             goto normal_char;
1405
1406           /* Cases:
1407              {M} - exact count
1408              {M,} - minimum count, maximum is infinity
1409              {,N} - 0 through N
1410              {,} - 0 to infinity (same as '*')
1411              {M,N} - M through N */
1412           {
1413             char const *p = dfa->lex.ptr;
1414             char const *lim = p + dfa->lex.left;
1415             dfa->lex.minrep = dfa->lex.maxrep = -1;
1416             for (; p != lim && isasciidigit (*p); p++)
1417               dfa->lex.minrep = (dfa->lex.minrep < 0
1418                                  ? *p - '0'
1419                                  : MIN (RE_DUP_MAX + 1,
1420                                         dfa->lex.minrep * 10 + *p - '0'));
1421             if (p != lim)
1422               {
1423                 if (*p != ',')
1424                   dfa->lex.maxrep = dfa->lex.minrep;
1425                 else
1426                   {
1427                     if (dfa->lex.minrep < 0)
1428                       dfa->lex.minrep = 0;
1429                     while (++p != lim && isasciidigit (*p))
1430                       dfa->lex.maxrep
1431                         = (dfa->lex.maxrep < 0
1432                            ? *p - '0'
1433                            : MIN (RE_DUP_MAX + 1,
1434                                   dfa->lex.maxrep * 10 + *p - '0'));
1435                   }
1436               }
1437             if (! ((! backslash || (p != lim && *p++ == '\\'))
1438                    && p != lim && *p++ == '}'
1439                    && 0 <= dfa->lex.minrep
1440                    && (dfa->lex.maxrep < 0
1441                        || dfa->lex.minrep <= dfa->lex.maxrep)))
1442               {
1443                 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1444                   goto normal_char;
1445                 dfaerror (_("invalid content of \\{\\}"));
1446               }
1447             if (RE_DUP_MAX < dfa->lex.maxrep)
1448               dfaerror (_("regular expression too big"));
1449             dfa->lex.ptr = p;
1450             dfa->lex.left = lim - p;
1451           }
1452           dfa->lex.laststart = false;
1453           return dfa->lex.lasttok = REPMN;
1454
1455         case '|':
1456           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1457             goto normal_char;
1458           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1459             goto normal_char;
1460           dfa->lex.laststart = true;
1461           return dfa->lex.lasttok = OR;
1462
1463         case '\n':
1464           if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1465               || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1466             goto normal_char;
1467           dfa->lex.laststart = true;
1468           return dfa->lex.lasttok = OR;
1469
1470         case '(':
1471           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1472             goto normal_char;
1473           dfa->lex.parens++;
1474           dfa->lex.laststart = true;
1475           return dfa->lex.lasttok = LPAREN;
1476
1477         case ')':
1478           if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1479             goto normal_char;
1480           if (dfa->lex.parens == 0
1481               && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1482             goto normal_char;
1483           dfa->lex.parens--;
1484           dfa->lex.laststart = false;
1485           return dfa->lex.lasttok = RPAREN;
1486
1487         case '.':
1488           if (backslash)
1489             goto normal_char;
1490           if (dfa->canychar == (size_t) -1)
1491             {
1492               charclass ccl;
1493               fillset (&ccl);
1494               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1495                 clrbit ('\n', &ccl);
1496               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1497                 clrbit ('\0', &ccl);
1498               if (dfa->localeinfo.multibyte)
1499                 for (int c2 = 0; c2 < NOTCHAR; c2++)
1500                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
1501                     clrbit (c2, &ccl);
1502               dfa->canychar = charclass_index (dfa, &ccl);
1503             }
1504           dfa->lex.laststart = false;
1505           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1506                                      ? ANYCHAR
1507                                      : CSET + dfa->canychar);
1508
1509         case 's':
1510         case 'S':
1511           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1512             goto normal_char;
1513           if (!dfa->localeinfo.multibyte)
1514             {
1515               charclass ccl;
1516               zeroset (&ccl);
1517               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1518                 if (isspace (c2))
1519                   setbit (c2, &ccl);
1520               if (c == 'S')
1521                 notset (&ccl);
1522               dfa->lex.laststart = false;
1523               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1524             }
1525
1526           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1527              add_utf8_anychar, makes sense.  */
1528
1529           /* \s and \S are documented to be equivalent to [[:space:]] and
1530              [^[:space:]] respectively, so tell the lexer to process those
1531              strings, each minus its "already processed" '['.  */
1532           {
1533             struct lexptr ls;
1534             push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1535             dfa->lex.lasttok = parse_bracket_exp (dfa);
1536             pop_lex_state (dfa, &ls);
1537           }
1538
1539           dfa->lex.laststart = false;
1540           return dfa->lex.lasttok;
1541
1542         case 'w':
1543         case 'W':
1544           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1545             goto normal_char;
1546
1547           if (!dfa->localeinfo.multibyte)
1548             {
1549               charclass ccl;
1550               zeroset (&ccl);
1551               for (int c2 = 0; c2 < NOTCHAR; ++c2)
1552                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1553                   setbit (c2, &ccl);
1554               if (c == 'W')
1555                 notset (&ccl);
1556               dfa->lex.laststart = false;
1557               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1558             }
1559
1560           /* FIXME: see if optimizing this, as is done with ANYCHAR and
1561              add_utf8_anychar, makes sense.  */
1562
1563           /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1564              [^_[:alnum:]] respectively, so tell the lexer to process those
1565              strings, each minus its "already processed" '['.  */
1566           {
1567             struct lexptr ls;
1568             push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1569             dfa->lex.lasttok = parse_bracket_exp (dfa);
1570             pop_lex_state (dfa, &ls);
1571           }
1572
1573           dfa->lex.laststart = false;
1574           return dfa->lex.lasttok;
1575
1576         case '[':
1577           if (backslash)
1578             goto normal_char;
1579           dfa->lex.laststart = false;
1580           return dfa->lex.lasttok = parse_bracket_exp (dfa);
1581
1582         default:
1583         normal_char:
1584           dfa->lex.laststart = false;
1585           /* For multibyte character sets, folding is done in atom.  Always
1586              return WCHAR.  */
1587           if (dfa->localeinfo.multibyte)
1588             return dfa->lex.lasttok = WCHAR;
1589
1590           if (dfa->syntax.case_fold && isalpha (c))
1591             {
1592               charclass ccl;
1593               zeroset (&ccl);
1594               setbit_case_fold_c (c, &ccl);
1595               return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1596             }
1597
1598           return dfa->lex.lasttok = c;
1599         }
1600     }
1601
1602   /* The above loop should consume at most a backslash
1603      and some other character.  */
1604   abort ();
1605   return END;                   /* keeps pedantic compilers happy.  */
1606 }
1607
1608 static void
1609 addtok_mb (struct dfa *dfa, token t, char mbprop)
1610 {
1611   if (dfa->talloc == dfa->tindex)
1612     {
1613       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1614                                 sizeof *dfa->tokens);
1615       if (dfa->localeinfo.multibyte)
1616         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1617                                          sizeof *dfa->multibyte_prop);
1618     }
1619   if (dfa->localeinfo.multibyte)
1620     dfa->multibyte_prop[dfa->tindex] = mbprop;
1621   dfa->tokens[dfa->tindex++] = t;
1622
1623   switch (t)
1624     {
1625     case QMARK:
1626     case STAR:
1627     case PLUS:
1628       break;
1629
1630     case CAT:
1631     case OR:
1632       dfa->parse.depth--;
1633       break;
1634
1635     case BACKREF:
1636       dfa->fast = false;
1637       FALLTHROUGH;
1638     default:
1639       dfa->nleaves++;
1640       FALLTHROUGH;
1641     case EMPTY:
1642       dfa->parse.depth++;
1643       break;
1644     }
1645   if (dfa->parse.depth > dfa->depth)
1646     dfa->depth = dfa->parse.depth;
1647 }
1648
1649 static void addtok_wc (struct dfa *dfa, wint_t wc);
1650
1651 /* Add the given token to the parse tree, maintaining the depth count and
1652    updating the maximum depth if necessary.  */
1653 static void
1654 addtok (struct dfa *dfa, token t)
1655 {
1656   if (dfa->localeinfo.multibyte && t == MBCSET)
1657     {
1658       bool need_or = false;
1659
1660       /* Extract wide characters into alternations for better performance.
1661          This does not require UTF-8.  */
1662       for (ptrdiff_t i = 0; i < dfa->lex.brack.nchars; i++)
1663         {
1664           addtok_wc (dfa, dfa->lex.brack.chars[i]);
1665           if (need_or)
1666             addtok (dfa, OR);
1667           need_or = true;
1668         }
1669       dfa->lex.brack.nchars = 0;
1670
1671       /* Wide characters have been handled above, so it is possible
1672          that the set is empty now.  Do nothing in that case.  */
1673       if (dfa->lex.brack.cset != -1)
1674         {
1675           addtok (dfa, CSET + dfa->lex.brack.cset);
1676           if (need_or)
1677             addtok (dfa, OR);
1678         }
1679     }
1680   else
1681     {
1682       addtok_mb (dfa, t, 3);
1683     }
1684 }
1685
1686 /* We treat a multibyte character as a single atom, so that DFA
1687    can treat a multibyte character as a single expression.
1688
1689    e.g., we construct the following tree from "<mb1><mb2>".
1690    <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1691    <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1692 static void
1693 addtok_wc (struct dfa *dfa, wint_t wc)
1694 {
1695   unsigned char buf[MB_LEN_MAX];
1696   mbstate_t s = { 0 };
1697   size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1698
1699   if (stored_bytes != (size_t) -1)
1700     dfa->lex.cur_mb_len = stored_bytes;
1701   else
1702     {
1703       /* This is merely stop-gap.  buf[0] is undefined, yet skipping
1704          the addtok_mb call altogether can corrupt the heap.  */
1705       dfa->lex.cur_mb_len = 1;
1706       buf[0] = 0;
1707     }
1708
1709   addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1710   for (int i = 1; i < dfa->lex.cur_mb_len; i++)
1711     {
1712       addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1713       addtok (dfa, CAT);
1714     }
1715 }
1716
1717 static void
1718 add_utf8_anychar (struct dfa *dfa)
1719 {
1720   static charclass const utf8_classes[5] = {
1721     /* 80-bf: non-leading bytes.  */
1722     CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1723
1724     /* 00-7f: 1-byte sequence.  */
1725     CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1726
1727     /* c2-df: 2-byte sequence.  */
1728     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1729
1730     /* e0-ef: 3-byte sequence.  */
1731     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1732
1733     /* f0-f7: 4-byte sequence.  */
1734     CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1735   };
1736   const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1737
1738   /* Define the five character classes that are needed below.  */
1739   if (dfa->utf8_anychar_classes[0] == 0)
1740     for (unsigned int i = 0; i < n; i++)
1741       {
1742         charclass c = utf8_classes[i];
1743         if (i == 1)
1744           {
1745             if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1746               clrbit ('\n', &c);
1747             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1748               clrbit ('\0', &c);
1749           }
1750         dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
1751       }
1752
1753   /* A valid UTF-8 character is
1754
1755      ([0x00-0x7f]
1756      |[0xc2-0xdf][0x80-0xbf]
1757      |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1758      |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1759
1760      which I'll write more concisely "B|CA|DAA|EAAA".  Factor the [0x00-0x7f]
1761      and you get "B|(C|(D|EA)A)A".  And since the token buffer is in reverse
1762      Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR".  */
1763   unsigned int i;
1764   for (i = 1; i < n; i++)
1765     addtok (dfa, dfa->utf8_anychar_classes[i]);
1766   while (--i > 1)
1767     {
1768       addtok (dfa, dfa->utf8_anychar_classes[0]);
1769       addtok (dfa, CAT);
1770       addtok (dfa, OR);
1771     }
1772 }
1773
1774 /* The grammar understood by the parser is as follows.
1775
1776    regexp:
1777      regexp OR branch
1778      branch
1779
1780    branch:
1781      branch closure
1782      closure
1783
1784    closure:
1785      closure QMARK
1786      closure STAR
1787      closure PLUS
1788      closure REPMN
1789      atom
1790
1791    atom:
1792      <normal character>
1793      <multibyte character>
1794      ANYCHAR
1795      MBCSET
1796      CSET
1797      BACKREF
1798      BEGLINE
1799      ENDLINE
1800      BEGWORD
1801      ENDWORD
1802      LIMWORD
1803      NOTLIMWORD
1804      LPAREN regexp RPAREN
1805      <empty>
1806
1807    The parser builds a parse tree in postfix form in an array of tokens.  */
1808
1809 static void
1810 atom (struct dfa *dfa)
1811 {
1812   if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1813       || dfa->parse.tok >= CSET
1814       || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1815       || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1816       || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1817       || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1818       || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1819     {
1820       if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1821         {
1822           /* For UTF-8 expand the period to a series of CSETs that define a
1823              valid UTF-8 character.  This avoids using the slow multibyte
1824              path.  I'm pretty sure it would be both profitable and correct to
1825              do it for any encoding; however, the optimization must be done
1826              manually as it is done above in add_utf8_anychar.  So, let's
1827              start with UTF-8: it is the most used, and the structure of the
1828              encoding makes the correctness more obvious.  */
1829           add_utf8_anychar (dfa);
1830         }
1831       else
1832         addtok (dfa, dfa->parse.tok);
1833       dfa->parse.tok = lex (dfa);
1834     }
1835   else if (dfa->parse.tok == WCHAR)
1836     {
1837       if (dfa->lex.wctok == WEOF)
1838         addtok (dfa, BACKREF);
1839       else
1840         {
1841           addtok_wc (dfa, dfa->lex.wctok);
1842
1843           if (dfa->syntax.case_fold)
1844             {
1845               wchar_t folded[CASE_FOLDED_BUFSIZE];
1846               unsigned int n = case_folded_counterparts (dfa->lex.wctok,
1847                                                          folded);
1848               for (unsigned int i = 0; i < n; i++)
1849                 {
1850                   addtok_wc (dfa, folded[i]);
1851                   addtok (dfa, OR);
1852                 }
1853             }
1854         }
1855
1856       dfa->parse.tok = lex (dfa);
1857     }
1858   else if (dfa->parse.tok == LPAREN)
1859     {
1860       dfa->parse.tok = lex (dfa);
1861       regexp (dfa);
1862       if (dfa->parse.tok != RPAREN)
1863         dfaerror (_("unbalanced ("));
1864       dfa->parse.tok = lex (dfa);
1865     }
1866   else
1867     addtok (dfa, EMPTY);
1868 }
1869
1870 /* Return the number of tokens in the given subexpression.  */
1871 static size_t _GL_ATTRIBUTE_PURE
1872 nsubtoks (struct dfa const *dfa, size_t tindex)
1873 {
1874   switch (dfa->tokens[tindex - 1])
1875     {
1876     default:
1877       return 1;
1878     case QMARK:
1879     case STAR:
1880     case PLUS:
1881       return 1 + nsubtoks (dfa, tindex - 1);
1882     case CAT:
1883     case OR:
1884       {
1885         size_t ntoks1 = nsubtoks (dfa, tindex - 1);
1886         return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1887       }
1888     }
1889 }
1890
1891 /* Copy the given subexpression to the top of the tree.  */
1892 static void
1893 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1894 {
1895   if (dfa->localeinfo.multibyte)
1896     for (size_t i = 0; i < ntokens; ++i)
1897       addtok_mb (dfa, dfa->tokens[tindex + i],
1898                  dfa->multibyte_prop[tindex + i]);
1899   else
1900     for (size_t i = 0; i < ntokens; ++i)
1901       addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1902 }
1903
1904 static void
1905 closure (struct dfa *dfa)
1906 {
1907   atom (dfa);
1908   while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1909          || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1910     if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1911       {
1912         size_t ntokens = nsubtoks (dfa, dfa->tindex);
1913         size_t tindex = dfa->tindex - ntokens;
1914         if (dfa->lex.maxrep < 0)
1915           addtok (dfa, PLUS);
1916         if (dfa->lex.minrep == 0)
1917           addtok (dfa, QMARK);
1918         int i;
1919         for (i = 1; i < dfa->lex.minrep; i++)
1920           {
1921             copytoks (dfa, tindex, ntokens);
1922             addtok (dfa, CAT);
1923           }
1924         for (; i < dfa->lex.maxrep; i++)
1925           {
1926             copytoks (dfa, tindex, ntokens);
1927             addtok (dfa, QMARK);
1928             addtok (dfa, CAT);
1929           }
1930         dfa->parse.tok = lex (dfa);
1931       }
1932     else if (dfa->parse.tok == REPMN)
1933       {
1934         dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1935         dfa->parse.tok = lex (dfa);
1936         closure (dfa);
1937       }
1938     else
1939       {
1940         addtok (dfa, dfa->parse.tok);
1941         dfa->parse.tok = lex (dfa);
1942       }
1943 }
1944
1945 static void
1946 branch (struct dfa* dfa)
1947 {
1948   closure (dfa);
1949   while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1950          && dfa->parse.tok >= 0)
1951     {
1952       closure (dfa);
1953       addtok (dfa, CAT);
1954     }
1955 }
1956
1957 static void
1958 regexp (struct dfa *dfa)
1959 {
1960   branch (dfa);
1961   while (dfa->parse.tok == OR)
1962     {
1963       dfa->parse.tok = lex (dfa);
1964       branch (dfa);
1965       addtok (dfa, OR);
1966     }
1967 }
1968
1969 /* Main entry point for the parser.  S is a string to be parsed, len is the
1970    length of the string, so s can include NUL characters.  D is a pointer to
1971    the struct dfa to parse into.  */
1972 static void
1973 dfaparse (char const *s, size_t len, struct dfa *d)
1974 {
1975   d->lex.ptr = s;
1976   d->lex.left = len;
1977   d->lex.lasttok = END;
1978   d->lex.laststart = true;
1979
1980   if (!d->syntax.syntax_bits_set)
1981     dfaerror (_("no syntax specified"));
1982
1983   if (!d->nregexps)
1984     addtok (d, BEG);
1985
1986   d->parse.tok = lex (d);
1987   d->parse.depth = d->depth;
1988
1989   regexp (d);
1990
1991   if (d->parse.tok != END)
1992     dfaerror (_("unbalanced )"));
1993
1994   addtok (d, END - d->nregexps);
1995   addtok (d, CAT);
1996
1997   if (d->nregexps)
1998     addtok (d, OR);
1999
2000   ++d->nregexps;
2001 }
2002
2003 /* Some primitives for operating on sets of positions.  */
2004
2005 /* Copy one set to another.  */
2006 static void
2007 copy (position_set const *src, position_set *dst)
2008 {
2009   if (dst->alloc < src->nelem)
2010     {
2011       free (dst->elems);
2012       dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2013                             sizeof *dst->elems);
2014     }
2015   dst->nelem = src->nelem;
2016   if (src->nelem != 0)
2017     memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2018 }
2019
2020 static void
2021 alloc_position_set (position_set *s, size_t size)
2022 {
2023   s->elems = xnmalloc (size, sizeof *s->elems);
2024   s->alloc = size;
2025   s->nelem = 0;
2026 }
2027
2028 /* Insert position P in set S.  S is maintained in sorted order on
2029    decreasing index.  If there is already an entry in S with P.index
2030    then merge (logically-OR) P's constraints into the one in S.
2031    S->elems must point to an array large enough to hold the resulting set.  */
2032 static void
2033 insert (position p, position_set *s)
2034 {
2035   ptrdiff_t count = s->nelem;
2036   ptrdiff_t lo = 0, hi = count;
2037   while (lo < hi)
2038     {
2039       ptrdiff_t mid = (lo + hi) >> 1;
2040       if (s->elems[mid].index < p.index)
2041         lo = mid + 1;
2042       else if (s->elems[mid].index == p.index)
2043         {
2044           s->elems[mid].constraint |= p.constraint;
2045           return;
2046         }
2047       else
2048         hi = mid;
2049     }
2050
2051   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2052   for (ptrdiff_t i = count; i > lo; i--)
2053     s->elems[i] = s->elems[i - 1];
2054   s->elems[lo] = p;
2055   ++s->nelem;
2056 }
2057
2058 static void
2059 append (position p, position_set *s)
2060 {
2061   ptrdiff_t count = s->nelem;
2062   s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2063   s->elems[s->nelem++] = p;
2064 }
2065
2066 /* Merge S1 and S2 (with the additional constraint C2) into M.  The
2067    result is as if the positions of S1, and of S2 with the additional
2068    constraint C2, were inserted into an initially empty set.  */
2069 static void
2070 merge_constrained (position_set const *s1, position_set const *s2,
2071                    unsigned int c2, position_set *m)
2072 {
2073   ptrdiff_t i = 0, j = 0;
2074
2075   if (m->alloc - s1->nelem < s2->nelem)
2076     {
2077       free (m->elems);
2078       m->alloc = s1->nelem;
2079       m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2080     }
2081   m->nelem = 0;
2082   while (i < s1->nelem || j < s2->nelem)
2083     if (! (j < s2->nelem)
2084         || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2085       {
2086         unsigned int c = ((i < s1->nelem && j < s2->nelem
2087                            && s1->elems[i].index == s2->elems[j].index)
2088                           ? s2->elems[j++].constraint & c2
2089                           : 0);
2090         m->elems[m->nelem].index = s1->elems[i].index;
2091         m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2092       }
2093     else
2094       {
2095         if (s2->elems[j].constraint & c2)
2096           {
2097             m->elems[m->nelem].index = s2->elems[j].index;
2098             m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2099           }
2100         j++;
2101       }
2102 }
2103
2104 /* Merge two sets of positions into a third.  The result is exactly as if
2105    the positions of both sets were inserted into an initially empty set.  */
2106 static void
2107 merge (position_set const *s1, position_set const *s2, position_set *m)
2108 {
2109   merge_constrained (s1, s2, -1, m);
2110 }
2111
2112 static void
2113 merge2 (position_set *dst, position_set const *src, position_set *m)
2114 {
2115   if (src->nelem < 4)
2116     {
2117       for (ptrdiff_t i = 0; i < src->nelem; ++i)
2118         insert (src->elems[i], dst);
2119     }
2120    else
2121     {
2122       merge (src, dst, m);
2123       copy (m, dst);
2124     }
2125 }
2126
2127 /* Delete a position from a set.  Return the nonzero constraint of the
2128    deleted position, or zero if there was no such position.  */
2129 static unsigned int
2130 delete (size_t del, position_set *s)
2131 {
2132   size_t count = s->nelem;
2133   size_t lo = 0, hi = count;
2134   while (lo < hi)
2135     {
2136       size_t mid = (lo + hi) >> 1;
2137       if (s->elems[mid].index < del)
2138         lo = mid + 1;
2139       else if (s->elems[mid].index == del)
2140         {
2141           unsigned int c = s->elems[mid].constraint;
2142           size_t i;
2143           for (i = mid; i + 1 < count; i++)
2144             s->elems[i] = s->elems[i + 1];
2145           s->nelem = i;
2146           return c;
2147         }
2148       else
2149         hi = mid;
2150     }
2151   return 0;
2152 }
2153
2154 /* Replace a position with the followed set.  */
2155 static void
2156 replace (position_set *dst, size_t del, position_set *add,
2157          unsigned int constraint, position_set *tmp)
2158 {
2159   unsigned int c = delete (del, dst) & constraint;
2160
2161   if (c)
2162     {
2163       copy (dst, tmp);
2164       merge_constrained (tmp, add, c, dst);
2165     }
2166 }
2167
2168 /* Find the index of the state corresponding to the given position set with
2169    the given preceding context, or create a new state if there is no such
2170    state.  Context tells whether we got here on a newline or letter.  */
2171 static state_num
2172 state_index (struct dfa *d, position_set const *s, int context)
2173 {
2174   size_t hash = 0;
2175   int constraint = 0;
2176   state_num i;
2177   token first_end = 0;
2178
2179   for (i = 0; i < s->nelem; ++i)
2180     hash ^= s->elems[i].index + s->elems[i].constraint;
2181
2182   /* Try to find a state that exactly matches the proposed one.  */
2183   for (i = 0; i < d->sindex; ++i)
2184     {
2185       if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2186           || context != d->states[i].context)
2187         continue;
2188       state_num j;
2189       for (j = 0; j < s->nelem; ++j)
2190         if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2191             || s->elems[j].index != d->states[i].elems.elems[j].index)
2192           break;
2193       if (j == s->nelem)
2194         return i;
2195     }
2196
2197 #ifdef DEBUG
2198   fprintf (stderr, "new state %zd\n nextpos:", i);
2199   for (state_num j = 0; j < s->nelem; j++)
2200     {
2201       fprintf (stderr, " %zu:", s->elems[j].index);
2202       prtok (d->tokens[s->elems[j].index]);
2203     }
2204   fprintf (stderr, "\n context:");
2205   if (context ^ CTX_ANY)
2206     {
2207       if (context & CTX_NONE)
2208         fprintf (stderr, " CTX_NONE");
2209       if (context & CTX_LETTER)
2210         fprintf (stderr, " CTX_LETTER");
2211       if (context & CTX_NEWLINE)
2212         fprintf (stderr, " CTX_NEWLINE");
2213     }
2214   else
2215     fprintf (stderr, " CTX_ANY");
2216   fprintf (stderr, "\n");
2217 #endif
2218
2219   for (state_num j = 0; j < s->nelem; j++)
2220     {
2221       int c = d->constraints[s->elems[j].index];
2222
2223       if (c != 0)
2224         {
2225           if (succeeds_in_context (c, context, CTX_ANY))
2226             constraint |= c;
2227           if (!first_end)
2228             first_end = d->tokens[s->elems[j].index];
2229         }
2230       else if (d->tokens[s->elems[j].index] == BACKREF)
2231         constraint = NO_CONSTRAINT;
2232     }
2233
2234
2235   /* Create a new state.  */
2236   d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2237                              sizeof *d->states);
2238   d->states[i].hash = hash;
2239   alloc_position_set (&d->states[i].elems, s->nelem);
2240   copy (s, &d->states[i].elems);
2241   d->states[i].context = context;
2242   d->states[i].constraint = constraint;
2243   d->states[i].first_end = first_end;
2244   d->states[i].mbps.nelem = 0;
2245   d->states[i].mbps.elems = NULL;
2246   d->states[i].mb_trindex = -1;
2247
2248   ++d->sindex;
2249
2250   return i;
2251 }
2252
2253 /* Find the epsilon closure of a set of positions.  If any position of the set
2254    contains a symbol that matches the empty string in some context, replace
2255    that position with the elements of its follow labeled with an appropriate
2256    constraint.  Repeat exhaustively until no funny positions are left.
2257    S->elems must be large enough to hold the result.  */
2258 static void
2259 epsclosure (struct dfa const *d)
2260 {
2261   position_set tmp;
2262   alloc_position_set (&tmp, d->nleaves);
2263   for (size_t i = 0; i < d->tindex; ++i)
2264     if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2265         && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2266         && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2267       {
2268         unsigned int constraint;
2269         switch (d->tokens[i])
2270           {
2271           case BEGLINE:
2272             constraint = BEGLINE_CONSTRAINT;
2273             break;
2274           case ENDLINE:
2275             constraint = ENDLINE_CONSTRAINT;
2276             break;
2277           case BEGWORD:
2278             constraint = BEGWORD_CONSTRAINT;
2279             break;
2280           case ENDWORD:
2281             constraint = ENDWORD_CONSTRAINT;
2282             break;
2283           case LIMWORD:
2284             constraint = LIMWORD_CONSTRAINT;
2285             break;
2286           case NOTLIMWORD:
2287             constraint = NOTLIMWORD_CONSTRAINT;
2288             break;
2289           default:
2290             constraint = NO_CONSTRAINT;
2291             break;
2292           }
2293
2294         delete (i, &d->follows[i]);
2295
2296         for (size_t j = 0; j < d->tindex; j++)
2297           if (i != j && d->follows[j].nelem > 0)
2298             replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2299       }
2300   free (tmp.elems);
2301 }
2302
2303 /* Returns the set of contexts for which there is at least one
2304    character included in C.  */
2305
2306 static int
2307 charclass_context (struct dfa const *dfa, charclass const *c)
2308 {
2309   int context = 0;
2310
2311   for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j)
2312     {
2313       if (c->w[j] & dfa->syntax.newline.w[j])
2314         context |= CTX_NEWLINE;
2315       if (c->w[j] & dfa->syntax.letters.w[j])
2316         context |= CTX_LETTER;
2317       if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2318         context |= CTX_NONE;
2319     }
2320
2321   return context;
2322 }
2323
2324 /* Returns the contexts on which the position set S depends.  Each context
2325    in the set of returned contexts (let's call it SC) may have a different
2326    follow set than other contexts in SC, and also different from the
2327    follow set of the complement set (sc ^ CTX_ANY).  However, all contexts
2328    in the complement set will have the same follow set.  */
2329
2330 static int _GL_ATTRIBUTE_PURE
2331 state_separate_contexts (struct dfa *d, position_set const *s)
2332 {
2333   int separate_contexts = 0;
2334
2335   for (size_t j = 0; j < s->nelem; j++)
2336     separate_contexts |= d->separates[s->elems[j].index];
2337
2338   return separate_contexts;
2339 }
2340
2341 enum
2342 {
2343   /* Single token is repeated.  It is distinguished from non-repeated.  */
2344   OPT_REPEAT = (1 << 0),
2345
2346   /* Multiple tokens are repeated.  This flag is on at head of tokens.  The
2347      node is not merged.  */
2348   OPT_LPAREN = (1 << 1),
2349
2350   /* Multiple branches are joined.  The node is not merged.  */
2351   OPT_RPAREN = (1 << 2),
2352
2353   /* The node is walked.  If the node is found in walking again, OPT_RPAREN
2354      flag is turned on. */
2355   OPT_WALKED = (1 << 3),
2356
2357   /* The node is queued.  The node is not queued again.  */
2358   OPT_QUEUED = (1 << 4)
2359 };
2360
2361 static void
2362 merge_nfa_state (struct dfa *d, size_t tindex, char *flags,
2363                  position_set *merged)
2364 {
2365   position_set *follows = d->follows;
2366   ptrdiff_t nelem = 0;
2367
2368   d->constraints[tindex] = 0;
2369
2370   for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)
2371     {
2372       size_t sindex = follows[tindex].elems[i].index;
2373
2374       /* Skip the node as pruned in future.  */
2375       unsigned int iconstraint = follows[tindex].elems[i].constraint;
2376       if (iconstraint == 0)
2377         continue;
2378
2379       if (d->tokens[follows[tindex].elems[i].index] <= END)
2380         {
2381           d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2382           continue;
2383         }
2384
2385       if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2386         {
2387           ptrdiff_t j;
2388
2389           for (j = 0; j < nelem; j++)
2390             {
2391               size_t dindex = follows[tindex].elems[j].index;
2392
2393               if (follows[tindex].elems[j].constraint != iconstraint)
2394                 continue;
2395
2396               if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2397                 continue;
2398
2399               if (d->tokens[sindex] != d->tokens[dindex])
2400                 continue;
2401
2402               if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2403                 continue;
2404
2405               if (flags[sindex] & OPT_REPEAT)
2406                 delete (sindex, &follows[sindex]);
2407
2408               merge2 (&follows[dindex], &follows[sindex], merged);
2409
2410               break;
2411             }
2412
2413           if (j < nelem)
2414             continue;
2415         }
2416
2417       follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2418       flags[sindex] |= OPT_QUEUED;
2419     }
2420
2421   follows[tindex].nelem = nelem;
2422 }
2423
2424 static int
2425 compare (const void *a, const void *b)
2426 {
2427   int aindex;
2428   int bindex;
2429
2430   aindex = (int) ((position *) a)->index;
2431   bindex = (int) ((position *) b)->index;
2432
2433   return aindex - bindex;
2434 }
2435
2436 static void
2437 reorder_tokens (struct dfa *d)
2438 {
2439   ptrdiff_t nleaves;
2440   ptrdiff_t *map;
2441   token *tokens;
2442   position_set *follows;
2443   int *constraints;
2444   char *multibyte_prop;
2445
2446   nleaves = 0;
2447
2448   map = xnmalloc (d->tindex, sizeof *map);
2449
2450   map[0] = nleaves++;
2451
2452   for (ptrdiff_t i = 1; i < d->tindex; i++)
2453     map[i] = -1;
2454
2455   tokens = xnmalloc (d->nleaves, sizeof *tokens);
2456   follows = xnmalloc (d->nleaves, sizeof *follows);
2457   constraints = xnmalloc (d->nleaves, sizeof *constraints);
2458
2459   if (d->localeinfo.multibyte)
2460     multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2461   else
2462     multibyte_prop = NULL;
2463
2464   for (ptrdiff_t i = 0; i < d->tindex; i++)
2465     {
2466       if (map[i] == -1)
2467         {
2468           free (d->follows[i].elems);
2469           d->follows[i].elems = NULL;
2470           d->follows[i].nelem = 0;
2471           continue;
2472         }
2473
2474       tokens[map[i]] = d->tokens[i];
2475       follows[map[i]] = d->follows[i];
2476       constraints[map[i]] = d->constraints[i];
2477
2478       if (multibyte_prop != NULL)
2479         multibyte_prop[map[i]] = d->multibyte_prop[i];
2480
2481       for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2482         {
2483           if (map[d->follows[i].elems[j].index] == -1)
2484             map[d->follows[i].elems[j].index] = nleaves++;
2485
2486           d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2487         }
2488
2489       qsort (d->follows[i].elems, d->follows[i].nelem,
2490              sizeof *d->follows[i].elems, compare);
2491     }
2492
2493   for (ptrdiff_t i = 0; i < nleaves; i++)
2494     {
2495       d->tokens[i] = tokens[i];
2496       d->follows[i] = follows[i];
2497       d->constraints[i] = constraints[i];
2498
2499       if (multibyte_prop != NULL)
2500         d->multibyte_prop[i] = multibyte_prop[i];
2501     }
2502
2503   d->tindex = d->nleaves = nleaves;
2504
2505   free (tokens);
2506   free (follows);
2507   free (constraints);
2508   free (multibyte_prop);
2509   free (map);
2510 }
2511
2512 static void
2513 dfaoptimize (struct dfa *d)
2514 {
2515   char *flags;
2516   position_set merged0;
2517   position_set *merged;
2518
2519   flags = xmalloc (d->tindex * sizeof *flags);
2520   memset (flags, 0, d->tindex * sizeof *flags);
2521
2522   for (size_t i = 0; i < d->tindex; i++)
2523     {
2524       for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2525         {
2526           if (d->follows[i].elems[j].index == i)
2527             flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2528           else if (d->follows[i].elems[j].index < i)
2529             flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2530           else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2531             flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2532           else
2533             flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2534         }
2535     }
2536
2537   flags[0] |= OPT_QUEUED;
2538
2539   merged = &merged0;
2540   alloc_position_set (merged, d->nleaves);
2541
2542   d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2543
2544   for (ptrdiff_t i = 0; i < d->tindex; i++)
2545     if (flags[i] & OPT_QUEUED)
2546       merge_nfa_state (d, i, flags, merged);
2547
2548   reorder_tokens (d);
2549
2550   free (merged->elems);
2551   free (flags);
2552 }
2553
2554 /* Perform bottom-up analysis on the parse tree, computing various functions.
2555    Note that at this point, we're pretending constructs like \< are real
2556    characters rather than constraints on what can follow them.
2557
2558    Nullable:  A node is nullable if it is at the root of a regexp that can
2559    match the empty string.
2560    *  EMPTY leaves are nullable.
2561    * No other leaf is nullable.
2562    * A QMARK or STAR node is nullable.
2563    * A PLUS node is nullable if its argument is nullable.
2564    * A CAT node is nullable if both its arguments are nullable.
2565    * An OR node is nullable if either argument is nullable.
2566
2567    Firstpos:  The firstpos of a node is the set of positions (nonempty leaves)
2568    that could correspond to the first character of a string matching the
2569    regexp rooted at the given node.
2570    * EMPTY leaves have empty firstpos.
2571    * The firstpos of a nonempty leaf is that leaf itself.
2572    * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2573      argument.
2574    * The firstpos of a CAT node is the firstpos of the left argument, union
2575      the firstpos of the right if the left argument is nullable.
2576    * The firstpos of an OR node is the union of firstpos of each argument.
2577
2578    Lastpos:  The lastpos of a node is the set of positions that could
2579    correspond to the last character of a string matching the regexp at
2580    the given node.
2581    * EMPTY leaves have empty lastpos.
2582    * The lastpos of a nonempty leaf is that leaf itself.
2583    * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2584      argument.
2585    * The lastpos of a CAT node is the lastpos of its right argument, union
2586      the lastpos of the left if the right argument is nullable.
2587    * The lastpos of an OR node is the union of the lastpos of each argument.
2588
2589    Follow:  The follow of a position is the set of positions that could
2590    correspond to the character following a character matching the node in
2591    a string matching the regexp.  At this point we consider special symbols
2592    that match the empty string in some context to be just normal characters.
2593    Later, if we find that a special symbol is in a follow set, we will
2594    replace it with the elements of its follow, labeled with an appropriate
2595    constraint.
2596    * Every node in the firstpos of the argument of a STAR or PLUS node is in
2597      the follow of every node in the lastpos.
2598    * Every node in the firstpos of the second argument of a CAT node is in
2599      the follow of every node in the lastpos of the first argument.
2600
2601    Because of the postfix representation of the parse tree, the depth-first
2602    analysis is conveniently done by a linear scan with the aid of a stack.
2603    Sets are stored as arrays of the elements, obeying a stack-like allocation
2604    scheme; the number of elements in each set deeper in the stack can be
2605    used to determine the address of a particular set's array.  */
2606 static void
2607 dfaanalyze (struct dfa *d, bool searchflag)
2608 {
2609   /* Array allocated to hold position sets.  */
2610   position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2611   /* Firstpos and lastpos elements.  */
2612   position *firstpos = posalloc;
2613   position *lastpos = firstpos + d->nleaves;
2614   position pos;
2615   position_set tmp;
2616
2617   /* Stack for element counts and nullable flags.  */
2618   struct
2619   {
2620     /* Whether the entry is nullable.  */
2621     bool nullable;
2622
2623     /* Counts of firstpos and lastpos sets.  */
2624     size_t nfirstpos;
2625     size_t nlastpos;
2626   } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2627
2628   position_set merged;          /* Result of merging sets.  */
2629
2630   addtok (d, CAT);
2631
2632 #ifdef DEBUG
2633   fprintf (stderr, "dfaanalyze:\n");
2634   for (size_t i = 0; i < d->tindex; ++i)
2635     {
2636       fprintf (stderr, " %zu:", i);
2637       prtok (d->tokens[i]);
2638     }
2639   putc ('\n', stderr);
2640 #endif
2641
2642   d->searchflag = searchflag;
2643   alloc_position_set (&merged, d->nleaves);
2644   d->follows = xcalloc (d->tindex, sizeof *d->follows);
2645
2646   for (size_t i = 0; i < d->tindex; ++i)
2647     {
2648       switch (d->tokens[i])
2649         {
2650         case EMPTY:
2651           /* The empty set is nullable.  */
2652           stk->nullable = true;
2653
2654           /* The firstpos and lastpos of the empty leaf are both empty.  */
2655           stk->nfirstpos = stk->nlastpos = 0;
2656           stk++;
2657           break;
2658
2659         case STAR:
2660         case PLUS:
2661           /* Every element in the firstpos of the argument is in the follow
2662              of every element in the lastpos.  */
2663           {
2664             tmp.elems = firstpos - stk[-1].nfirstpos;
2665             tmp.nelem = stk[-1].nfirstpos;
2666             position *p = lastpos - stk[-1].nlastpos;
2667             for (size_t j = 0; j < stk[-1].nlastpos; j++)
2668               {
2669                 merge (&tmp, &d->follows[p[j].index], &merged);
2670                 copy (&merged, &d->follows[p[j].index]);
2671               }
2672           }
2673           FALLTHROUGH;
2674         case QMARK:
2675           /* A QMARK or STAR node is automatically nullable.  */
2676           if (d->tokens[i] != PLUS)
2677             stk[-1].nullable = true;
2678           break;
2679
2680         case CAT:
2681           /* Every element in the firstpos of the second argument is in the
2682              follow of every element in the lastpos of the first argument.  */
2683           {
2684             tmp.nelem = stk[-1].nfirstpos;
2685             tmp.elems = firstpos - stk[-1].nfirstpos;
2686             position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2687             for (size_t j = 0; j < stk[-2].nlastpos; j++)
2688               {
2689                 merge (&tmp, &d->follows[p[j].index], &merged);
2690                 copy (&merged, &d->follows[p[j].index]);
2691               }
2692           }
2693
2694           /* The firstpos of a CAT node is the firstpos of the first argument,
2695              union that of the second argument if the first is nullable.  */
2696           if (stk[-2].nullable)
2697             stk[-2].nfirstpos += stk[-1].nfirstpos;
2698           else
2699             firstpos -= stk[-1].nfirstpos;
2700
2701           /* The lastpos of a CAT node is the lastpos of the second argument,
2702              union that of the first argument if the second is nullable.  */
2703           if (stk[-1].nullable)
2704             stk[-2].nlastpos += stk[-1].nlastpos;
2705           else
2706             {
2707               position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708               for (size_t j = 0; j < stk[-1].nlastpos; j++)
2709                 p[j] = p[j + stk[-2].nlastpos];
2710               lastpos -= stk[-2].nlastpos;
2711               stk[-2].nlastpos = stk[-1].nlastpos;
2712             }
2713
2714           /* A CAT node is nullable if both arguments are nullable.  */
2715           stk[-2].nullable &= stk[-1].nullable;
2716           stk--;
2717           break;
2718
2719         case OR:
2720           /* The firstpos is the union of the firstpos of each argument.  */
2721           stk[-2].nfirstpos += stk[-1].nfirstpos;
2722
2723           /* The lastpos is the union of the lastpos of each argument.  */
2724           stk[-2].nlastpos += stk[-1].nlastpos;
2725
2726           /* An OR node is nullable if either argument is nullable.  */
2727           stk[-2].nullable |= stk[-1].nullable;
2728           stk--;
2729           break;
2730
2731         default:
2732           /* Anything else is a nonempty position.  (Note that special
2733              constructs like \< are treated as nonempty strings here;
2734              an "epsilon closure" effectively makes them nullable later.
2735              Backreferences have to get a real position so we can detect
2736              transitions on them later.  But they are nullable.  */
2737           stk->nullable = d->tokens[i] == BACKREF;
2738
2739           /* This position is in its own firstpos and lastpos.  */
2740           stk->nfirstpos = stk->nlastpos = 1;
2741           stk++;
2742
2743           firstpos->index = lastpos->index = i;
2744           firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745           firstpos++, lastpos++;
2746
2747           break;
2748         }
2749 #ifdef DEBUG
2750       /* ... balance the above nonsyntactic #ifdef goo...  */
2751       fprintf (stderr, "node %zu:", i);
2752       prtok (d->tokens[i]);
2753       putc ('\n', stderr);
2754       fprintf (stderr,
2755                stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756       fprintf (stderr, " firstpos:");
2757       for (size_t j = 0; j < stk[-1].nfirstpos; j++)
2758         {
2759           fprintf (stderr, " %zu:", firstpos[j - stk[-1].nfirstpos].index);
2760           prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2761         }
2762       fprintf (stderr, "\n lastpos:");
2763       for (size_t j = 0; j < stk[-1].nlastpos; j++)
2764         {
2765           fprintf (stderr, " %zu:", lastpos[j - stk[-1].nlastpos].index);
2766           prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2767         }
2768       putc ('\n', stderr);
2769 #endif
2770     }
2771
2772   /* For each follow set that is the follow set of a real position, replace
2773      it with its epsilon closure.  */
2774   epsclosure (d);
2775
2776   dfaoptimize (d);
2777
2778 #ifdef DEBUG
2779   for (size_t i = 0; i < d->tindex; ++i)
2780     if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2781         || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2782         || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2783       {
2784         fprintf (stderr, "follows(%zu:", i);
2785         prtok (d->tokens[i]);
2786         fprintf (stderr, "):");
2787         for (size_t j = 0; j < d->follows[i].nelem; j++)
2788           {
2789             fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2790             prtok (d->tokens[d->follows[i].elems[j].index]);
2791           }
2792         putc ('\n', stderr);
2793       }
2794 #endif
2795
2796   pos.index = 0;
2797   pos.constraint = NO_CONSTRAINT;
2798
2799   alloc_position_set (&tmp, 1);
2800
2801   append (pos, &tmp);
2802
2803   d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2804
2805   for (ptrdiff_t i = 0; i < d->tindex; i++)
2806     {
2807       d->separates[i] = 0;
2808
2809       if (prev_newline_dependent (d->constraints[i]))
2810         d->separates[i] |= CTX_NEWLINE;
2811       if (prev_letter_dependent (d->constraints[i]))
2812         d->separates[i] |= CTX_LETTER;
2813
2814       for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2815         {
2816           if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2817             d->separates[i] |= CTX_NEWLINE;
2818           if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2819             d->separates[i] |= CTX_LETTER;
2820         }
2821     }
2822
2823   /* Context wanted by some position.  */
2824   int separate_contexts = state_separate_contexts (d, &tmp);
2825
2826   /* Build the initial state.  */
2827   if (separate_contexts & CTX_NEWLINE)
2828     state_index (d, &tmp, CTX_NEWLINE);
2829   d->initstate_notbol = d->min_trcount
2830     = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2831   if (separate_contexts & CTX_LETTER)
2832     d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2833   d->min_trcount++;
2834   d->trcount = 0;
2835
2836   free (posalloc);
2837   free (stkalloc);
2838   free (merged.elems);
2839   free (tmp.elems);
2840 }
2841
2842 /* Make sure D's state arrays are large enough to hold NEW_STATE.  */
2843 static void
2844 realloc_trans_if_necessary (struct dfa *d)
2845 {
2846   state_num oldalloc = d->tralloc;
2847   if (oldalloc < d->sindex)
2848     {
2849       state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2850       ptrdiff_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2851       realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2852                            -1, sizeof *realtrans);
2853       realtrans[0] = realtrans[1] = NULL;
2854       d->trans = realtrans + 2;
2855       ptrdiff_t newalloc = d->tralloc = newalloc1 - 2;
2856       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2857       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2858       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2859       if (d->localeinfo.multibyte)
2860         {
2861           realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2862           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2863           if (oldalloc == 0)
2864             realtrans[0] = realtrans[1] = NULL;
2865           d->mb_trans = realtrans + 2;
2866         }
2867       for (; oldalloc < newalloc; oldalloc++)
2868         {
2869           d->trans[oldalloc] = NULL;
2870           d->fails[oldalloc] = NULL;
2871           if (d->localeinfo.multibyte)
2872             d->mb_trans[oldalloc] = NULL;
2873         }
2874     }
2875 }
2876
2877 /*
2878    Calculate the transition table for a new state derived from state s
2879    for a compiled dfa d after input character uc, and return the new
2880    state number.
2881
2882    Do not worry about all possible input characters; calculate just the group
2883    of positions that match uc.  Label it with the set of characters that
2884    every position in the group matches (taking into account, if necessary,
2885    preceding context information of s).  Then find the union
2886    of these positions' follows, i.e., the set of positions of the
2887    new state.  For each character in the group's label, set the transition
2888    on this character to be to a state corresponding to the set's positions,
2889    and its associated backward context information, if necessary.
2890
2891    When building a searching matcher, include the positions of state
2892    0 in every state.
2893
2894    The group is constructed by building an equivalence-class
2895    partition of the positions of s.
2896
2897    For each position, find the set of characters C that it matches.  Eliminate
2898    any characters from C that fail on grounds of backward context.
2899
2900    Check whether the group's label L has nonempty
2901    intersection with C.  If L - C is nonempty, create a new group labeled
2902    L - C and having the same positions as the current group, and set L to
2903    the intersection of L and C.  Insert the position in the group, set
2904    C = C - L, and resume scanning.
2905
2906    If after comparing with every group there are characters remaining in C,
2907    create a new group labeled with the characters of C and insert this
2908    position in that group.  */
2909
2910 static state_num
2911 build_state (state_num s, struct dfa *d, unsigned char uc)
2912 {
2913   position_set follows;         /* Union of the follows for each
2914                                    position of the current state.  */
2915   position_set group;           /* Positions that match the input char.  */
2916   position_set tmp;             /* Temporary space for merging sets.  */
2917   state_num state;              /* New state.  */
2918   state_num state_newline;      /* New state on a newline transition.  */
2919   state_num state_letter;       /* New state on a letter transition.  */
2920
2921 #ifdef DEBUG
2922   fprintf (stderr, "build state %td\n", s);
2923 #endif
2924
2925   /* A pointer to the new transition table, and the table itself.  */
2926   state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2927   state_num *trans = *ptrans;
2928
2929   if (!trans)
2930     {
2931       /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2932          transition tables that can exist at once, other than for
2933          initial states.  Often-used transition tables are quickly
2934          rebuilt, whereas rarely-used ones are cleared away.  */
2935       if (MAX_TRCOUNT <= d->trcount)
2936         {
2937           for (state_num i = d->min_trcount; i < d->tralloc; i++)
2938             {
2939               free (d->trans[i]);
2940               free (d->fails[i]);
2941               d->trans[i] = d->fails[i] = NULL;
2942             }
2943           d->trcount = 0;
2944         }
2945
2946       d->trcount++;
2947       *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2948
2949       /* Fill transition table with a default value which means that the
2950          transited state has not been calculated yet.  */
2951       for (int i = 0; i < NOTCHAR; i++)
2952         trans[i] = -2;
2953     }
2954
2955   /* Set up the success bits for this state.  */
2956   d->success[s] = 0;
2957   if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2958     d->success[s] |= CTX_NEWLINE;
2959   if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2960     d->success[s] |= CTX_LETTER;
2961   if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2962     d->success[s] |= CTX_NONE;
2963
2964   alloc_position_set (&follows, d->nleaves);
2965
2966   /* Find the union of the follows of the positions of the group.
2967      This is a hideously inefficient loop.  Fix it someday.  */
2968   for (size_t j = 0; j < d->states[s].elems.nelem; ++j)
2969     for (size_t k = 0;
2970          k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2971       insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2972               &follows);
2973
2974   /* Positions that match the input char.  */
2975   alloc_position_set (&group, d->nleaves);
2976
2977   /* The group's label.  */
2978   charclass label;
2979   fillset (&label);
2980
2981   for (size_t i = 0; i < follows.nelem; ++i)
2982     {
2983       charclass matches;            /* Set of matching characters.  */
2984       position pos = follows.elems[i];
2985       bool matched = false;
2986       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2987         {
2988           zeroset (&matches);
2989           setbit (d->tokens[pos.index], &matches);
2990           if (d->tokens[pos.index] == uc)
2991             matched = true;
2992         }
2993       else if (d->tokens[pos.index] >= CSET)
2994         {
2995           matches = d->charclasses[d->tokens[pos.index] - CSET];
2996           if (tstbit (uc, &matches))
2997             matched = true;
2998         }
2999       else if (d->tokens[pos.index] == ANYCHAR)
3000         {
3001           matches = d->charclasses[d->canychar];
3002           if (tstbit (uc, &matches))
3003             matched = true;
3004
3005           /* ANYCHAR must match with a single character, so we must put
3006              it to D->states[s].mbps which contains the positions which
3007              can match with a single character not a byte.  If all
3008              positions which has ANYCHAR does not depend on context of
3009              next character, we put the follows instead of it to
3010              D->states[s].mbps to optimize.  */
3011           if (succeeds_in_context (pos.constraint, d->states[s].context,
3012                                    CTX_NONE))
3013             {
3014               if (d->states[s].mbps.nelem == 0)
3015                 alloc_position_set (&d->states[s].mbps, 1);
3016               insert (pos, &d->states[s].mbps);
3017             }
3018         }
3019       else
3020         continue;
3021
3022       /* Some characters may need to be eliminated from matches because
3023          they fail in the current context.  */
3024       if (pos.constraint != NO_CONSTRAINT)
3025         {
3026           if (!succeeds_in_context (pos.constraint,
3027                                     d->states[s].context, CTX_NEWLINE))
3028             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3029               matches.w[j] &= ~d->syntax.newline.w[j];
3030           if (!succeeds_in_context (pos.constraint,
3031                                     d->states[s].context, CTX_LETTER))
3032             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3033               matches.w[j] &= ~d->syntax.letters.w[j];
3034           if (!succeeds_in_context (pos.constraint,
3035                                     d->states[s].context, CTX_NONE))
3036             for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3037               matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3038
3039           /* If there are no characters left, there's no point in going on.  */
3040           if (emptyset (&matches))
3041             continue;
3042
3043           /* If we have reset the bit that made us declare "matched", reset
3044              that indicator, too.  This is required to avoid an infinite loop
3045              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
3046           if (!tstbit (uc, &matches))
3047             matched = false;
3048         }
3049
3050 #ifdef DEBUG
3051       fprintf (stderr, " nextpos %zu:", pos.index);
3052       prtok (d->tokens[pos.index]);
3053       fprintf (stderr, " of");
3054       for (size_t j = 0; j < NOTCHAR; j++)
3055         if (tstbit (j, &matches))
3056           fprintf (stderr, " 0x%02zx", j);
3057       fprintf (stderr, "\n");
3058 #endif
3059
3060       if (matched)
3061         {
3062           for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
3063             label.w[k] &= matches.w[k];
3064           append (pos, &group);
3065         }
3066       else
3067         {
3068           for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
3069             label.w[k] &= ~matches.w[k];
3070         }
3071     }
3072
3073   alloc_position_set (&tmp, d->nleaves);
3074
3075   if (group.nelem > 0)
3076     {
3077       /* If we are building a searching matcher, throw in the positions
3078          of state 0 as well, if possible.  */
3079       if (d->searchflag)
3080         {
3081           /* If a token in follows.elems is not 1st byte of a multibyte
3082              character, or the states of follows must accept the bytes
3083              which are not 1st byte of the multibyte character.
3084              Then, if a state of follows encounters a byte, it must not be
3085              a 1st byte of a multibyte character nor a single byte character.
3086              In this case, do not add state[0].follows to next state, because
3087              state[0] must accept 1st-byte.
3088
3089              For example, suppose <sb a> is a certain single byte character,
3090              <mb A> is a certain multibyte character, and the codepoint of
3091              <sb a> equals the 2nd byte of the codepoint of <mb A>.  When
3092              state[0] accepts <sb a>, state[i] transits to state[i+1] by
3093              accepting the 1st byte of <mb A>, and state[i+1] accepts the
3094              2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3095              <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3096              not add state[0].  */
3097
3098           bool mergeit = !d->localeinfo.multibyte;
3099           if (!mergeit)
3100             {
3101               mergeit = true;
3102               for (size_t j = 0; mergeit && j < group.nelem; j++)
3103                 mergeit &= d->multibyte_prop[group.elems[j].index];
3104             }
3105           if (mergeit)
3106             {
3107               merge (&d->states[0].elems, &group, &tmp);
3108               copy (&tmp, &group);
3109             }
3110         }
3111
3112       /* Find out if the new state will want any context information,
3113          by calculating possible contexts that the group can match,
3114          and separate contexts that the new state wants to know.  */
3115       int possible_contexts = charclass_context (d, &label);
3116       int separate_contexts = state_separate_contexts (d, &group);
3117
3118       /* Find the state(s) corresponding to the union of the follows.  */
3119       if (possible_contexts & ~separate_contexts)
3120         state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3121       else
3122         state = -1;
3123       if (separate_contexts & possible_contexts & CTX_NEWLINE)
3124         state_newline = state_index (d, &group, CTX_NEWLINE);
3125       else
3126         state_newline = state;
3127       if (separate_contexts & possible_contexts & CTX_LETTER)
3128         state_letter = state_index (d, &group, CTX_LETTER);
3129       else
3130         state_letter = state;
3131
3132       /* Reallocate now, to reallocate any newline transition properly.  */
3133       realloc_trans_if_necessary (d);
3134     }
3135
3136   /* If we are a searching matcher, the default transition is to a state
3137      containing the positions of state 0, otherwise the default transition
3138      is to fail miserably.  */
3139   else if (d->searchflag)
3140     {
3141       state_newline = 0;
3142       state_letter = d->min_trcount - 1;
3143       state = d->initstate_notbol;
3144     }
3145   else
3146     {
3147       state_newline = -1;
3148       state_letter = -1;
3149       state = -1;
3150     }
3151
3152   /* Set the transitions for each character in the label.  */
3153   for (size_t i = 0; i < NOTCHAR; i++)
3154     if (tstbit (i, &label))
3155       switch (d->syntax.sbit[i])
3156         {
3157         case CTX_NEWLINE:
3158           trans[i] = state_newline;
3159           break;
3160         case CTX_LETTER:
3161           trans[i] = state_letter;
3162           break;
3163         default:
3164           trans[i] = state;
3165           break;
3166         }
3167
3168 #ifdef DEBUG
3169   fprintf (stderr, "trans table %td", s);
3170   for (size_t i = 0; i < NOTCHAR; ++i)
3171     {
3172       if (!(i & 0xf))
3173         fprintf (stderr, "\n");
3174       fprintf (stderr, " %2td", trans[i]);
3175     }
3176   fprintf (stderr, "\n");
3177 #endif
3178
3179   free (group.elems);
3180   free (follows.elems);
3181   free (tmp.elems);
3182
3183   /* Keep the newline transition in a special place so we can use it as
3184      a sentinel.  */
3185   if (tstbit (d->syntax.eolbyte, &label))
3186     {
3187       d->newlines[s] = trans[d->syntax.eolbyte];
3188       trans[d->syntax.eolbyte] = -1;
3189     }
3190
3191   return trans[uc];
3192 }
3193
3194 /* Multibyte character handling sub-routines for dfaexec.  */
3195
3196 /* Consume a single byte and transit state from 's' to '*next_state'.
3197    This function is almost same as the state transition routin in dfaexec.
3198    But state transition is done just once, otherwise matching succeed or
3199    reach the end of the buffer.  */
3200 static state_num
3201 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3202 {
3203   state_num *t;
3204
3205   if (d->trans[s])
3206     t = d->trans[s];
3207   else if (d->fails[s])
3208     t = d->fails[s];
3209   else
3210     {
3211       build_state (s, d, **pp);
3212       if (d->trans[s])
3213         t = d->trans[s];
3214       else
3215         {
3216           t = d->fails[s];
3217           assert (t);
3218         }
3219     }
3220
3221   if (t[**pp] == -2)
3222     build_state (s, d, **pp);
3223
3224   return t[*(*pp)++];
3225 }
3226
3227 /* Transit state from s, then return new state and update the pointer of
3228    the buffer.  This function is for a period operator which can match a
3229    multi-byte character.  */
3230 static state_num
3231 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3232                unsigned char const *end)
3233 {
3234   wint_t wc;
3235
3236   int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3237
3238   /* This state has some operators which can match a multibyte character.  */
3239   d->mb_follows.nelem = 0;
3240
3241   /* Calculate the state which can be reached from the state 's' by
3242      consuming 'mbclen' single bytes from the buffer.  */
3243   state_num s1 = s;
3244   int mbci;
3245   for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3246     s = transit_state_singlebyte (d, s, pp);
3247   *pp += mbclen - mbci;
3248
3249   if (wc == WEOF)
3250     {
3251       /* It is an invalid character, so ANYCHAR is not accepted.  */
3252       return s;
3253     }
3254
3255   /* If all positions which have ANYCHAR do not depend on the context
3256      of the next character, calculate the next state with
3257      pre-calculated follows and cache the result.  */
3258   if (d->states[s1].mb_trindex < 0)
3259     {
3260       if (MAX_TRCOUNT <= d->mb_trcount)
3261         {
3262           state_num s3;
3263           for (s3 = -1; s3 < d->tralloc; s3++)
3264             {
3265               free (d->mb_trans[s3]);
3266               d->mb_trans[s3] = NULL;
3267             }
3268
3269           for (state_num i = 0; i < d->sindex; i++)
3270             d->states[i].mb_trindex = -1;
3271           d->mb_trcount = 0;
3272         }
3273       d->states[s1].mb_trindex = d->mb_trcount++;
3274     }
3275
3276   if (! d->mb_trans[s])
3277     {
3278       enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3279       enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3280       d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3281       for (int i = 0; i < MAX_TRCOUNT; i++)
3282         d->mb_trans[s][i] = -1;
3283     }
3284   else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3285     return d->mb_trans[s][d->states[s1].mb_trindex];
3286
3287   if (s == -1)
3288     copy (&d->states[s1].mbps, &d->mb_follows);
3289   else
3290     merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3291
3292   int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3293   state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3294   realloc_trans_if_necessary (d);
3295
3296   d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3297
3298   return s2;
3299 }
3300
3301 /* The initial state may encounter a byte which is not a single byte character
3302    nor the first byte of a multibyte character.  But it is incorrect for the
3303    initial state to accept such a byte.  For example, in Shift JIS the regular
3304    expression "\\" accepts the codepoint 0x5c, but should not accept the second
3305    byte of the codepoint 0x815c.  Then the initial state must skip the bytes
3306    that are not a single byte character nor the first byte of a multibyte
3307    character.
3308
3309    Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3310    or exceeds P, and return the advanced MBP.  If WCP is non-NULL and
3311    the result is greater than P, set *WCP to the final wide character
3312    processed, or to WEOF if no wide character is processed.  Otherwise,
3313    if WCP is non-NULL, *WCP may or may not be updated.
3314
3315    Both P and MBP must be no larger than END.  */
3316 static unsigned char const *
3317 skip_remains_mb (struct dfa *d, unsigned char const *p,
3318                  unsigned char const *mbp, char const *end)
3319 {
3320   if (d->syntax.never_trail[*p])
3321     return p;
3322   while (mbp < p)
3323     {
3324       wint_t wc;
3325       mbp += mbs_to_wchar (&wc, (char const *) mbp,
3326                            end - (char const *) mbp, d);
3327     }
3328   return mbp;
3329 }
3330
3331 /* Search through a buffer looking for a match to the struct dfa *D.
3332    Find the first occurrence of a string matching the regexp in the
3333    buffer, and the shortest possible version thereof.  Return a pointer to
3334    the first character after the match, or NULL if none is found.  BEGIN
3335    points to the beginning of the buffer, and END points to the first byte
3336    after its end.  Note however that we store a sentinel byte (usually
3337    newline) in *END, so the actual buffer must be one byte longer.
3338    When ALLOW_NL, newlines may appear in the matching string.
3339    If COUNT is non-NULL, increment *COUNT once for each newline processed.
3340    If MULTIBYTE, the input consists of multibyte characters and/or
3341    encoding-error bytes.  Otherwise, it consists of single-byte characters.
3342    Here is the list of features that make this DFA matcher punt:
3343     - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3344     - [^...] in non-simple locale
3345     - [[=foo=]] or [[.foo.]]
3346     - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3347     - back-reference: (.)\1
3348     - word-delimiter in multibyte locale: \<, \>, \b, \B
3349    See using_simple_locale for the definition of "simple locale".  */
3350
3351 static inline char *
3352 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3353               size_t *count, bool multibyte)
3354 {
3355   if (MAX_TRCOUNT <= d->sindex)
3356     {
3357       for (state_num s = d->min_trcount; s < d->sindex; s++)
3358         {
3359           free (d->states[s].elems.elems);
3360           free (d->states[s].mbps.elems);
3361         }
3362       d->sindex = d->min_trcount;
3363
3364       if (d->trans)
3365         {
3366           for (state_num s = 0; s < d->tralloc; s++)
3367             {
3368               free (d->trans[s]);
3369               free (d->fails[s]);
3370               d->trans[s] = d->fails[s] = NULL;
3371             }
3372           d->trcount = 0;
3373         }
3374
3375       if (d->localeinfo.multibyte && d->mb_trans)
3376         {
3377           for (state_num s = -1; s < d->tralloc; s++)
3378             {
3379               free (d->mb_trans[s]);
3380               d->mb_trans[s] = NULL;
3381             }
3382           for (state_num s = 0; s < d->min_trcount; s++)
3383             d->states[s].mb_trindex = -1;
3384           d->mb_trcount = 0;
3385         }
3386     }
3387
3388   if (!d->tralloc)
3389     realloc_trans_if_necessary (d);
3390
3391   /* Current state.  */
3392   state_num s = 0, s1 = 0;
3393
3394   /* Current input character.  */
3395   unsigned char const *p = (unsigned char const *) begin;
3396   unsigned char const *mbp = p;
3397
3398   /* Copy of d->trans so it can be optimized into a register.  */
3399   state_num **trans = d->trans;
3400   unsigned char eol = d->syntax.eolbyte;  /* Likewise for eolbyte.  */
3401   unsigned char saved_end = *(unsigned char *) end;
3402   *end = eol;
3403
3404   if (multibyte)
3405     {
3406       memset (&d->mbs, 0, sizeof d->mbs);
3407       if (d->mb_follows.alloc == 0)
3408         alloc_position_set (&d->mb_follows, d->nleaves);
3409     }
3410
3411   size_t nlcount = 0;
3412   for (;;)
3413     {
3414       state_num *t;
3415       while ((t = trans[s]) != NULL)
3416         {
3417           if (s < d->min_trcount)
3418             {
3419               if (!multibyte || d->states[s].mbps.nelem == 0)
3420                 {
3421                   while (t[*p] == s)
3422                     p++;
3423                 }
3424               if (multibyte)
3425                 p = mbp = skip_remains_mb (d, p, mbp, end);
3426             }
3427
3428           if (multibyte)
3429             {
3430               s1 = s;
3431
3432               if (d->states[s].mbps.nelem == 0
3433                   || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3434                 {
3435                   /* If an input character does not match ANYCHAR, do it
3436                      like a single-byte character.  */
3437                   s = t[*p++];
3438                 }
3439               else
3440                 {
3441                   s = transit_state (d, s, &p, (unsigned char *) end);
3442                   mbp = p;
3443                   trans = d->trans;
3444                 }
3445             }
3446           else
3447             {
3448               s1 = t[*p++];
3449               t = trans[s1];
3450               if (! t)
3451                 {
3452                   state_num tmp = s;
3453                   s = s1;
3454                   s1 = tmp;     /* swap */
3455                   break;
3456                 }
3457               if (s < d->min_trcount)
3458                 {
3459                   while (t[*p] == s1)
3460                     p++;
3461                 }
3462               s = t[*p++];
3463             }
3464         }
3465
3466       if (s < 0)
3467         {
3468           if (s == -2)
3469             {
3470               s = build_state (s1, d, p[-1]);
3471               trans = d->trans;
3472             }
3473           else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3474             {
3475               /* The previous character was a newline.  Count it, and skip
3476                  checking of multibyte character boundary until here.  */
3477               nlcount++;
3478               mbp = p;
3479
3480               s = (allow_nl ? d->newlines[s1]
3481                    : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3482                    : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3483                    : d->initstate_notbol);
3484             }
3485           else
3486             {
3487               p = NULL;
3488               goto done;
3489             }
3490         }
3491       else if (d->fails[s])
3492         {
3493           if ((d->success[s] & d->syntax.sbit[*p])
3494               || ((char *) p == end
3495                   && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3496                                          d)))
3497             goto done;
3498
3499           if (multibyte && s < d->min_trcount)
3500             p = mbp = skip_remains_mb (d, p, mbp, end);
3501
3502           s1 = s;
3503           if (!multibyte || d->states[s].mbps.nelem == 0
3504               || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3505             {
3506               /* If a input character does not match ANYCHAR, do it
3507                  like a single-byte character.  */
3508               s = d->fails[s][*p++];
3509             }
3510           else
3511             {
3512               s = transit_state (d, s, &p, (unsigned char *) end);
3513               mbp = p;
3514               trans = d->trans;
3515             }
3516         }
3517       else
3518         {
3519           build_state (s, d, p[0]);
3520           trans = d->trans;
3521         }
3522     }
3523
3524  done:
3525   if (count)
3526     *count += nlcount;
3527   *end = saved_end;
3528   return (char *) p;
3529 }
3530
3531 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3532    This is for performance, as dfaexec_main is an inline function.  */
3533
3534 static char *
3535 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3536             bool allow_nl, size_t *count, bool *backref)
3537 {
3538   return dfaexec_main (d, begin, end, allow_nl, count, true);
3539 }
3540
3541 static char *
3542 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3543             bool allow_nl, size_t *count, bool *backref)
3544 {
3545   return dfaexec_main (d, begin, end, allow_nl, count, false);
3546 }
3547
3548 /* Always set *BACKREF and return BEGIN.  Use this wrapper for
3549    any regexp that uses a construct not supported by this code.  */
3550 static char *
3551 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3552               bool allow_nl, size_t *count, bool *backref)
3553 {
3554   *backref = true;
3555   return (char *) begin;
3556 }
3557
3558 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3559    but faster and set *BACKREF if the DFA code does not support this
3560    regexp usage.  */
3561
3562 char *
3563 dfaexec (struct dfa *d, char const *begin, char *end,
3564          bool allow_nl, size_t *count, bool *backref)
3565 {
3566   return d->dfaexec (d, begin, end, allow_nl, count, backref);
3567 }
3568
3569 struct dfa *
3570 dfasuperset (struct dfa const *d)
3571 {
3572   return d->superset;
3573 }
3574
3575 bool
3576 dfaisfast (struct dfa const *d)
3577 {
3578   return d->fast;
3579 }
3580
3581 static void
3582 free_mbdata (struct dfa *d)
3583 {
3584   free (d->multibyte_prop);
3585   free (d->lex.brack.chars);
3586   free (d->mb_follows.elems);
3587
3588   if (d->mb_trans)
3589     {
3590       state_num s;
3591       for (s = -1; s < d->tralloc; s++)
3592         free (d->mb_trans[s]);
3593       free (d->mb_trans - 2);
3594     }
3595 }
3596
3597 /* Return true if every construct in D is supported by this DFA matcher.  */
3598 static bool _GL_ATTRIBUTE_PURE
3599 dfa_supported (struct dfa const *d)
3600 {
3601   for (size_t i = 0; i < d->tindex; i++)
3602     {
3603       switch (d->tokens[i])
3604         {
3605         case BEGWORD:
3606         case ENDWORD:
3607         case LIMWORD:
3608         case NOTLIMWORD:
3609           if (!d->localeinfo.multibyte)
3610             continue;
3611           FALLTHROUGH;
3612         case BACKREF:
3613         case MBCSET:
3614           return false;
3615         }
3616     }
3617   return true;
3618 }
3619
3620 /* Disable use of the superset DFA if it is not likely to help
3621    performance.  */
3622 static void
3623 maybe_disable_superset_dfa (struct dfa *d)
3624 {
3625   if (!d->localeinfo.using_utf8)
3626     return;
3627
3628   bool have_backref = false;
3629   for (size_t i = 0; i < d->tindex; ++i)
3630     {
3631       switch (d->tokens[i])
3632         {
3633         case ANYCHAR:
3634           /* Lowered.  */
3635           abort ();
3636         case BACKREF:
3637           have_backref = true;
3638           break;
3639         case MBCSET:
3640           /* Requires multi-byte algorithm.  */
3641           return;
3642         default:
3643           break;
3644         }
3645     }
3646
3647   if (!have_backref && d->superset)
3648     {
3649       /* The superset DFA is not likely to be much faster, so remove it.  */
3650       dfafree (d->superset);
3651       free (d->superset);
3652       d->superset = NULL;
3653     }
3654
3655   free_mbdata (d);
3656   d->localeinfo.multibyte = false;
3657   d->dfaexec = dfaexec_sb;
3658   d->fast = true;
3659 }
3660
3661 static void
3662 dfassbuild (struct dfa *d)
3663 {
3664   struct dfa *sup = dfaalloc ();
3665
3666   *sup = *d;
3667   sup->localeinfo.multibyte = false;
3668   sup->dfaexec = dfaexec_sb;
3669   sup->multibyte_prop = NULL;
3670   sup->superset = NULL;
3671   sup->states = NULL;
3672   sup->sindex = 0;
3673   sup->constraints = NULL;
3674   sup->separates = NULL;
3675   sup->follows = NULL;
3676   sup->tralloc = 0;
3677   sup->trans = NULL;
3678   sup->fails = NULL;
3679   sup->success = NULL;
3680   sup->newlines = NULL;
3681
3682   sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3683   if (d->cindex)
3684     {
3685       memcpy (sup->charclasses, d->charclasses,
3686               d->cindex * sizeof *sup->charclasses);
3687     }
3688
3689   sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3690   sup->talloc = d->tindex * 2;
3691
3692   bool have_achar = false;
3693   bool have_nchar = false;
3694   size_t j;
3695   for (size_t i = j = 0; i < d->tindex; i++)
3696     {
3697       switch (d->tokens[i])
3698         {
3699         case ANYCHAR:
3700         case MBCSET:
3701         case BACKREF:
3702           {
3703             charclass ccl;
3704             fillset (&ccl);
3705             sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3706             sup->tokens[j++] = STAR;
3707             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3708                 || d->tokens[i + 1] == PLUS)
3709               i++;
3710             have_achar = true;
3711           }
3712           break;
3713         case BEGWORD:
3714         case ENDWORD:
3715         case LIMWORD:
3716         case NOTLIMWORD:
3717           if (d->localeinfo.multibyte)
3718             {
3719               /* These constraints aren't supported in a multibyte locale.
3720                  Ignore them in the superset DFA.  */
3721               sup->tokens[j++] = EMPTY;
3722               break;
3723             }
3724           FALLTHROUGH;
3725         default:
3726           sup->tokens[j++] = d->tokens[i];
3727           if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3728               || d->tokens[i] >= CSET)
3729             have_nchar = true;
3730           break;
3731         }
3732     }
3733   sup->tindex = j;
3734
3735   if (have_nchar && (have_achar || d->localeinfo.multibyte))
3736     d->superset = sup;
3737   else
3738     {
3739       dfafree (sup);
3740       free (sup);
3741     }
3742 }
3743
3744 /* Parse and analyze a single string of the given length.  */
3745 void
3746 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3747 {
3748   dfaparse (s, len, d);
3749   dfassbuild (d);
3750
3751   if (dfa_supported (d))
3752     {
3753       maybe_disable_superset_dfa (d);
3754       dfaanalyze (d, searchflag);
3755     }
3756   else
3757     {
3758       d->dfaexec = dfaexec_noop;
3759     }
3760
3761   if (d->superset)
3762     {
3763       d->fast = true;
3764       dfaanalyze (d->superset, searchflag);
3765     }
3766 }
3767
3768 /* Free the storage held by the components of a dfa.  */
3769 void
3770 dfafree (struct dfa *d)
3771 {
3772   free (d->charclasses);
3773   free (d->tokens);
3774
3775   if (d->localeinfo.multibyte)
3776     free_mbdata (d);
3777
3778   free (d->constraints);
3779   free (d->separates);
3780
3781   for (size_t i = 0; i < d->sindex; ++i)
3782     {
3783       free (d->states[i].elems.elems);
3784       free (d->states[i].mbps.elems);
3785     }
3786   free (d->states);
3787
3788   if (d->follows)
3789     {
3790       for (size_t i = 0; i < d->tindex; ++i)
3791         free (d->follows[i].elems);
3792       free (d->follows);
3793     }
3794
3795   if (d->trans)
3796     {
3797       for (size_t i = 0; i < d->tralloc; ++i)
3798         {
3799           free (d->trans[i]);
3800           free (d->fails[i]);
3801         }
3802
3803       free (d->trans - 2);
3804       free (d->fails);
3805       free (d->newlines);
3806       free (d->success);
3807     }
3808
3809   if (d->superset)
3810     {
3811       dfafree (d->superset);
3812       free (d->superset);
3813     }
3814 }
3815
3816 /* Having found the postfix representation of the regular expression,
3817    try to find a long sequence of characters that must appear in any line
3818    containing the r.e.
3819    Finding a "longest" sequence is beyond the scope here;
3820    we take an easy way out and hope for the best.
3821    (Take "(ab|a)b"--please.)
3822
3823    We do a bottom-up calculation of sequences of characters that must appear
3824    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3825    representation:
3826         sequences that must appear at the left of the match ("left")
3827         sequences that must appear at the right of the match ("right")
3828         lists of sequences that must appear somewhere in the match ("in")
3829         sequences that must constitute the match ("is")
3830
3831    When we get to the root of the tree, we use one of the longest of its
3832    calculated "in" sequences as our answer.
3833
3834    The sequences calculated for the various types of node (in pseudo ANSI c)
3835    are shown below.  "p" is the operand of unary operators (and the left-hand
3836    operand of binary operators); "q" is the right-hand operand of binary
3837    operators.
3838
3839    "ZERO" means "a zero-length sequence" below.
3840
3841         Type    left            right           is              in
3842         ----    ----            -----           --              --
3843         char c  # c             # c             # c             # c
3844
3845         ANYCHAR ZERO            ZERO            ZERO            ZERO
3846
3847         MBCSET  ZERO            ZERO            ZERO            ZERO
3848
3849         CSET    ZERO            ZERO            ZERO            ZERO
3850
3851         STAR    ZERO            ZERO            ZERO            ZERO
3852
3853         QMARK   ZERO            ZERO            ZERO            ZERO
3854
3855         PLUS    p->left         p->right        ZERO            p->in
3856
3857         CAT     (p->is==ZERO)?  (q->is==ZERO)?  (p->is!=ZERO && p->in plus
3858                 p->left :       q->right :      q->is!=ZERO) ?  q->in plus
3859                 p->is##q->left  p->right##q->is p->is##q->is : p->right##q->left
3860                                                 ZERO
3861
3862         OR      longest common  longest common  (do p->is and substrings common
3863                 leading         trailing        to q->is have same p->in and
3864                 (sub)sequence   (sub)sequence   q->in length and content) ?
3865                 of p->left      of p->right
3866                 and q->left     and q->right    p->is : NULL
3867
3868    If there's anything else we recognize in the tree, all four sequences get set
3869    to zero-length sequences.  If there's something we don't recognize in the
3870    tree, we just return a zero-length sequence.
3871
3872    Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3873    'aaa')?
3874
3875    And ... is it here or someplace that we might ponder "optimizations" such as
3876         egrep 'psi|epsilon'     ->      egrep 'psi'
3877         egrep 'pepsi|epsilon'   ->      egrep 'epsi'
3878                                         (Yes, we now find "epsi" as a "string
3879                                         that must occur", but we might also
3880                                         simplify the *entire* r.e. being sought)
3881         grep '[c]'              ->      grep 'c'
3882         grep '(ab|a)b'          ->      grep 'ab'
3883         grep 'ab*'              ->      grep 'a'
3884         grep 'a*b'              ->      grep 'b'
3885
3886    There are several issues:
3887
3888    Is optimization easy (enough)?
3889
3890    Does optimization actually accomplish anything,
3891    or is the automaton you get from "psi|epsilon" (for example)
3892    the same as the one you get from "psi" (for example)?
3893
3894    Are optimizable r.e.'s likely to be used in real-life situations
3895    (something like 'ab*' is probably unlikely; something like is
3896    'psi|epsilon' is likelier)?  */
3897
3898 static char *
3899 icatalloc (char *old, char const *new)
3900 {
3901   size_t newsize = strlen (new);
3902   if (newsize == 0)
3903     return old;
3904   size_t oldsize = strlen (old);
3905   char *result = xrealloc (old, oldsize + newsize + 1);
3906   memcpy (result + oldsize, new, newsize + 1);
3907   return result;
3908 }
3909
3910 static void
3911 freelist (char **cpp)
3912 {
3913   while (*cpp)
3914     free (*cpp++);
3915 }
3916
3917 static char **
3918 enlist (char **cpp, char *new, size_t len)
3919 {
3920   new = memcpy (xmalloc (len + 1), new, len);
3921   new[len] = '\0';
3922   /* Is there already something in the list that's new (or longer)?  */
3923   size_t i;
3924   for (i = 0; cpp[i] != NULL; ++i)
3925     if (strstr (cpp[i], new) != NULL)
3926       {
3927         free (new);
3928         return cpp;
3929       }
3930   /* Eliminate any obsoleted strings.  */
3931   for (size_t j = 0; cpp[j] != NULL; )
3932     if (strstr (new, cpp[j]) == NULL)
3933       ++j;
3934     else
3935       {
3936         free (cpp[j]);
3937         if (--i == j)
3938           break;
3939         cpp[j] = cpp[i];
3940         cpp[i] = NULL;
3941       }
3942   /* Add the new string.  */
3943   cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3944   cpp[i] = new;
3945   cpp[i + 1] = NULL;
3946   return cpp;
3947 }
3948
3949 /* Given pointers to two strings, return a pointer to an allocated
3950    list of their distinct common substrings.  */
3951 static char **
3952 comsubs (char *left, char const *right)
3953 {
3954   char **cpp = xzalloc (sizeof *cpp);
3955
3956   for (char *lcp = left; *lcp != '\0'; lcp++)
3957     {
3958       size_t len = 0;
3959       char *rcp = strchr (right, *lcp);
3960       while (rcp != NULL)
3961         {
3962           size_t i;
3963           for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3964             continue;
3965           if (i > len)
3966             len = i;
3967           rcp = strchr (rcp + 1, *lcp);
3968         }
3969       if (len != 0)
3970         cpp = enlist (cpp, lcp, len);
3971     }
3972   return cpp;
3973 }
3974
3975 static char **
3976 addlists (char **old, char **new)
3977 {
3978   for (; *new; new++)
3979     old = enlist (old, *new, strlen (*new));
3980   return old;
3981 }
3982
3983 /* Given two lists of substrings, return a new list giving substrings
3984    common to both.  */
3985 static char **
3986 inboth (char **left, char **right)
3987 {
3988   char **both = xzalloc (sizeof *both);
3989
3990   for (size_t lnum = 0; left[lnum] != NULL; ++lnum)
3991     {
3992       for (size_t rnum = 0; right[rnum] != NULL; ++rnum)
3993         {
3994           char **temp = comsubs (left[lnum], right[rnum]);
3995           both = addlists (both, temp);
3996           freelist (temp);
3997           free (temp);
3998         }
3999     }
4000   return both;
4001 }
4002
4003 typedef struct must must;
4004
4005 struct must
4006 {
4007   char **in;
4008   char *left;
4009   char *right;
4010   char *is;
4011   bool begline;
4012   bool endline;
4013   must *prev;
4014 };
4015
4016 static must *
4017 allocmust (must *mp, size_t size)
4018 {
4019   must *new_mp = xmalloc (sizeof *new_mp);
4020   new_mp->in = xzalloc (sizeof *new_mp->in);
4021   new_mp->left = xzalloc (size);
4022   new_mp->right = xzalloc (size);
4023   new_mp->is = xzalloc (size);
4024   new_mp->begline = false;
4025   new_mp->endline = false;
4026   new_mp->prev = mp;
4027   return new_mp;
4028 }
4029
4030 static void
4031 resetmust (must *mp)
4032 {
4033   freelist (mp->in);
4034   mp->in[0] = NULL;
4035   mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4036   mp->begline = false;
4037   mp->endline = false;
4038 }
4039
4040 static void
4041 freemust (must *mp)
4042 {
4043   freelist (mp->in);
4044   free (mp->in);
4045   free (mp->left);
4046   free (mp->right);
4047   free (mp->is);
4048   free (mp);
4049 }
4050
4051 struct dfamust *
4052 dfamust (struct dfa const *d)
4053 {
4054   must *mp = NULL;
4055   char const *result = "";
4056   bool exact = false;
4057   bool begline = false;
4058   bool endline = false;
4059   bool need_begline = false;
4060   bool need_endline = false;
4061   bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
4062
4063   for (size_t ri = 1; ri + 1 < d->tindex; ri++)
4064     {
4065       token t = d->tokens[ri];
4066       switch (t)
4067         {
4068         case BEGLINE:
4069           mp = allocmust (mp, 2);
4070           mp->begline = true;
4071           need_begline = true;
4072           break;
4073         case ENDLINE:
4074           mp = allocmust (mp, 2);
4075           mp->endline = true;
4076           need_endline = true;
4077           break;
4078         case LPAREN:
4079         case RPAREN:
4080           assert (!"neither LPAREN nor RPAREN may appear here");
4081
4082         case EMPTY:
4083         case BEGWORD:
4084         case ENDWORD:
4085         case LIMWORD:
4086         case NOTLIMWORD:
4087         case BACKREF:
4088         case ANYCHAR:
4089         case MBCSET:
4090           mp = allocmust (mp, 2);
4091           break;
4092
4093         case STAR:
4094         case QMARK:
4095           resetmust (mp);
4096           break;
4097
4098         case OR:
4099           {
4100             char **new;
4101             must *rmp = mp;
4102             must *lmp = mp = mp->prev;
4103             size_t j, ln, rn, n;
4104
4105             /* Guaranteed to be.  Unlikely, but ...  */
4106             if (streq (lmp->is, rmp->is))
4107               {
4108                 lmp->begline &= rmp->begline;
4109                 lmp->endline &= rmp->endline;
4110               }
4111             else
4112               {
4113                 lmp->is[0] = '\0';
4114                 lmp->begline = false;
4115                 lmp->endline = false;
4116               }
4117             /* Left side--easy */
4118             size_t i = 0;
4119             while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4120               ++i;
4121             lmp->left[i] = '\0';
4122             /* Right side */
4123             ln = strlen (lmp->right);
4124             rn = strlen (rmp->right);
4125             n = ln;
4126             if (n > rn)
4127               n = rn;
4128             for (i = 0; i < n; ++i)
4129               if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4130                 break;
4131             for (j = 0; j < i; ++j)
4132               lmp->right[j] = lmp->right[(ln - i) + j];
4133             lmp->right[j] = '\0';
4134             new = inboth (lmp->in, rmp->in);
4135             freelist (lmp->in);
4136             free (lmp->in);
4137             lmp->in = new;
4138             freemust (rmp);
4139           }
4140           break;
4141
4142         case PLUS:
4143           mp->is[0] = '\0';
4144           break;
4145
4146         case END:
4147           assert (!mp->prev);
4148           for (size_t i = 0; mp->in[i] != NULL; ++i)
4149             if (strlen (mp->in[i]) > strlen (result))
4150               result = mp->in[i];
4151           if (streq (result, mp->is))
4152             {
4153               if ((!need_begline || mp->begline) && (!need_endline
4154                                                      || mp->endline))
4155                 exact = true;
4156               begline = mp->begline;
4157               endline = mp->endline;
4158             }
4159           goto done;
4160
4161         case CAT:
4162           {
4163             must *rmp = mp;
4164             must *lmp = mp = mp->prev;
4165
4166             /* In.  Everything in left, plus everything in
4167                right, plus concatenation of
4168                left's right and right's left.  */
4169             lmp->in = addlists (lmp->in, rmp->in);
4170             if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4171               {
4172                 size_t lrlen = strlen (lmp->right);
4173                 size_t rllen = strlen (rmp->left);
4174                 char *tp = xmalloc (lrlen + rllen);
4175                 memcpy (tp, lmp->right, lrlen);
4176                 memcpy (tp + lrlen, rmp->left, rllen);
4177                 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4178                 free (tp);
4179               }
4180             /* Left-hand */
4181             if (lmp->is[0] != '\0')
4182               lmp->left = icatalloc (lmp->left, rmp->left);
4183             /* Right-hand */
4184             if (rmp->is[0] == '\0')
4185               lmp->right[0] = '\0';
4186             lmp->right = icatalloc (lmp->right, rmp->right);
4187             /* Guaranteed to be */
4188             if ((lmp->is[0] != '\0' || lmp->begline)
4189                 && (rmp->is[0] != '\0' || rmp->endline))
4190               {
4191                 lmp->is = icatalloc (lmp->is, rmp->is);
4192                 lmp->endline = rmp->endline;
4193               }
4194             else
4195               {
4196                 lmp->is[0] = '\0';
4197                 lmp->begline = false;
4198                 lmp->endline = false;
4199               }
4200             freemust (rmp);
4201           }
4202           break;
4203
4204         case '\0':
4205           /* Not on *my* shift.  */
4206           goto done;
4207
4208         default:
4209           if (CSET <= t)
4210             {
4211               /* If T is a singleton, or if case-folding in a unibyte
4212                  locale and T's members all case-fold to the same char,
4213                  convert T to one of its members.  Otherwise, do
4214                  nothing further with T.  */
4215               charclass *ccl = &d->charclasses[t - CSET];
4216               int j;
4217               for (j = 0; j < NOTCHAR; j++)
4218                 if (tstbit (j, ccl))
4219                   break;
4220               if (! (j < NOTCHAR))
4221                 {
4222                   mp = allocmust (mp, 2);
4223                   break;
4224                 }
4225               t = j;
4226               while (++j < NOTCHAR)
4227                 if (tstbit (j, ccl)
4228                     && ! (case_fold_unibyte
4229                           && toupper (j) == toupper (t)))
4230                   break;
4231               if (j < NOTCHAR)
4232                 {
4233                   mp = allocmust (mp, 2);
4234                   break;
4235                 }
4236             }
4237
4238           size_t rj = ri + 2;
4239           if (d->tokens[ri + 1] == CAT)
4240             {
4241               for (; rj < d->tindex - 1; rj += 2)
4242                 {
4243                   if ((rj != ri && (d->tokens[rj] <= 0
4244                                     || NOTCHAR <= d->tokens[rj]))
4245                       || d->tokens[rj + 1] != CAT)
4246                     break;
4247                 }
4248             }
4249           mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4250           mp->is[0] = mp->left[0] = mp->right[0]
4251             = case_fold_unibyte ? toupper (t) : t;
4252
4253           size_t i;
4254           for (i = 1; ri + 2 < rj; i++)
4255             {
4256               ri += 2;
4257               t = d->tokens[ri];
4258               mp->is[i] = mp->left[i] = mp->right[i]
4259                 = case_fold_unibyte ? toupper (t) : t;
4260             }
4261           mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4262           mp->in = enlist (mp->in, mp->is, i);
4263           break;
4264         }
4265     }
4266  done:;
4267
4268   struct dfamust *dm = NULL;
4269   if (*result)
4270     {
4271       dm = xmalloc (sizeof *dm);
4272       dm->exact = exact;
4273       dm->begline = begline;
4274       dm->endline = endline;
4275       dm->must = xstrdup (result);
4276     }
4277
4278   while (mp)
4279     {
4280       must *prev = mp->prev;
4281       freemust (mp);
4282       mp = prev;
4283     }
4284
4285   return dm;
4286 }
4287
4288 void
4289 dfamustfree (struct dfamust *dm)
4290 {
4291   free (dm->must);
4292   free (dm);
4293 }
4294
4295 struct dfa *
4296 dfaalloc (void)
4297 {
4298   return xmalloc (sizeof (struct dfa));
4299 }
4300
4301 /* Initialize DFA.  */
4302 void
4303 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4304            reg_syntax_t bits, int dfaopts)
4305 {
4306   memset (dfa, 0, offsetof (struct dfa, dfaexec));
4307   dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4308   dfa->simple_locale = using_simple_locale (linfo->multibyte);
4309   dfa->localeinfo = *linfo;
4310
4311   dfa->fast = !dfa->localeinfo.multibyte;
4312
4313   dfa->canychar = -1;
4314   dfa->lex.cur_mb_len = 1;
4315   dfa->syntax.syntax_bits_set = true;
4316   dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4317   dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4318   dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4319   dfa->syntax.syntax_bits = bits;
4320
4321   for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4322     {
4323       unsigned char uc = i;
4324
4325       dfa->syntax.sbit[uc] = char_context (dfa, uc);
4326       switch (dfa->syntax.sbit[uc])
4327         {
4328         case CTX_LETTER:
4329           setbit (uc, &dfa->syntax.letters);
4330           break;
4331         case CTX_NEWLINE:
4332           setbit (uc, &dfa->syntax.newline);
4333           break;
4334         }
4335
4336       /* POSIX requires that the five bytes in "\n\r./" (including the
4337          terminating NUL) cannot occur inside a multibyte character.  */
4338       dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4339                                      ? (uc & 0xc0) != 0x80
4340                                      : strchr ("\n\r./", uc) != NULL);
4341     }
4342 }
4343
4344 /* vim:set shiftwidth=2: */