src/search.c

   1 /* String search routines for GNU Emacs.
   2    Copyright (C) 1985-1987, 1993-1994, 1997-1999, 2001-2011
   3                  Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20
  21 #include <config.h>
  22 #include <setjmp.h>
  23 #include "lisp.h"
  24 #include "syntax.h"
  25 #include "category.h"
  26 #include "buffer.h"
  27 #include "character.h"
  28 #include "charset.h"
  29 #include "region-cache.h"
  30 #include "commands.h"
  31 #include "blockinput.h"
  32 #include "intervals.h"
  33
  34 #include <sys/types.h>
  35 #include "regex.h"
  36
  37 #define REGEXP_CACHE_SIZE 20
  38
  39 /* If the regexp is non-nil, then the buffer contains the compiled form
  40    of that regexp, suitable for searching.  */
  41 struct regexp_cache
  42 {
  43   struct regexp_cache *next;
  44   Lisp_Object regexp, whitespace_regexp;
  45   /* Syntax table for which the regexp applies.  We need this because
  46      of character classes.  If this is t, then the compiled pattern is valid
  47      for any syntax-table.  */
  48   Lisp_Object syntax_table;
  49   struct re_pattern_buffer buf;
  50   char fastmap[0400];
  51   /* Nonzero means regexp was compiled to do full POSIX backtracking.  */
  52   char posix;
  53 };
  54
  55 /* The instances of that struct.  */
  56 struct regexp_cache searchbufs[REGEXP_CACHE_SIZE];
  57
  58 /* The head of the linked list; points to the most recently used buffer.  */
  59 struct regexp_cache *searchbuf_head;
  60
  61
  62 /* Every call to re_match, etc., must pass &search_regs as the regs
  63    argument unless you can show it is unnecessary (i.e., if re_match
  64    is certainly going to be called again before region-around-match
  65    can be called).
  66
  67    Since the registers are now dynamically allocated, we need to make
  68    sure not to refer to the Nth register before checking that it has
  69    been allocated by checking search_regs.num_regs.
  70
  71    The regex code keeps track of whether it has allocated the search
  72    buffer using bits in the re_pattern_buffer.  This means that whenever
  73    you compile a new pattern, it completely forgets whether it has
  74    allocated any registers, and will allocate new registers the next
  75    time you call a searching or matching function.  Therefore, we need
  76    to call re_set_registers after compiling a new pattern or after
  77    setting the match registers, so that the regex functions will be
  78    able to free or re-allocate it properly.  */
  79 static struct re_registers search_regs;
  80
  81 /* The buffer in which the last search was performed, or
  82    Qt if the last search was done in a string;
  83    Qnil if no searching has been done yet.  */
  84 static Lisp_Object last_thing_searched;
  85
  86 /* error condition signaled when regexp compile_pattern fails */
  87
  88 Lisp_Object Qinvalid_regexp;
  89
  90 /* Error condition used for failing searches */
  91 Lisp_Object Qsearch_failed;
  92
  93 static void set_search_regs (EMACS_INT, EMACS_INT);
  94 static void save_search_regs (void);
  95 static EMACS_INT simple_search (EMACS_INT, unsigned char *, EMACS_INT,
  96                                 EMACS_INT, Lisp_Object, EMACS_INT, EMACS_INT,
  97                                 EMACS_INT, EMACS_INT);
  98 static EMACS_INT boyer_moore (EMACS_INT, unsigned char *, EMACS_INT, EMACS_INT,
  99                               Lisp_Object, Lisp_Object,
 100                               EMACS_INT, EMACS_INT,
 101                               EMACS_INT, EMACS_INT, int);
 102 static EMACS_INT search_buffer (Lisp_Object, EMACS_INT, EMACS_INT,
 103                                 EMACS_INT, EMACS_INT, EMACS_INT, int,
 104                                 Lisp_Object, Lisp_Object, int);
 105 static void matcher_overflow (void) NO_RETURN;
 106
 107 static void
 108 matcher_overflow (void)
 109 {
 110   error ("Stack overflow in regexp matcher");
 111 }
 112
 113 /* Compile a regexp and signal a Lisp error if anything goes wrong.
 114    PATTERN is the pattern to compile.
 115    CP is the place to put the result.
 116    TRANSLATE is a translation table for ignoring case, or nil for none.
 117    POSIX is nonzero if we want full backtracking (POSIX style)
 118    for this pattern.  0 means backtrack only enough to get a valid match.
 119
 120    The behavior also depends on Vsearch_spaces_regexp.  */
 121
 122 static void
 123 compile_pattern_1 (struct regexp_cache *cp, Lisp_Object pattern, Lisp_Object translate, int posix)
 124 {
 125   char *val;
 126   reg_syntax_t old;
 127
 128   cp->regexp = Qnil;
 129   cp->buf.translate = (! NILP (translate) ? translate : make_number (0));
 130   cp->posix = posix;
 131   cp->buf.multibyte = STRING_MULTIBYTE (pattern);
 132   cp->buf.charset_unibyte = charset_unibyte;
 133   if (STRINGP (Vsearch_spaces_regexp))
 134     cp->whitespace_regexp = Vsearch_spaces_regexp;
 135   else
 136     cp->whitespace_regexp = Qnil;
 137
 138   /* rms: I think BLOCK_INPUT is not needed here any more,
 139      because regex.c defines malloc to call xmalloc.
 140      Using BLOCK_INPUT here means the debugger won't run if an error occurs.
 141      So let's turn it off.  */
 142   /*  BLOCK_INPUT;  */
 143   old = re_set_syntax (RE_SYNTAX_EMACS
 144                        | (posix ? 0 : RE_NO_POSIX_BACKTRACKING));
 145
 146   if (STRINGP (Vsearch_spaces_regexp))
 147     re_set_whitespace_regexp (SSDATA (Vsearch_spaces_regexp));
 148   else
 149     re_set_whitespace_regexp (NULL);
 150
 151   val = (char *) re_compile_pattern (SSDATA (pattern),
 152                                      SBYTES (pattern), &cp->buf);
 153
 154   /* If the compiled pattern hard codes some of the contents of the
 155      syntax-table, it can only be reused with *this* syntax table.  */
 156   cp->syntax_table = cp->buf.used_syntax ? BVAR (current_buffer, syntax_table) : Qt;
 157
 158   re_set_whitespace_regexp (NULL);
 159
 160   re_set_syntax (old);
 161   /* UNBLOCK_INPUT;  */
 162   if (val)
 163     xsignal1 (Qinvalid_regexp, build_string (val));
 164
 165   cp->regexp = Fcopy_sequence (pattern);
 166 }
 167
 168 /* Shrink each compiled regexp buffer in the cache
 169    to the size actually used right now.
 170    This is called from garbage collection.  */
 171
 172 void
 173 shrink_regexp_cache (void)
 174 {
 175   struct regexp_cache *cp;
 176
 177   for (cp = searchbuf_head; cp != 0; cp = cp->next)
 178     {
 179       cp->buf.allocated = cp->buf.used;
 180       cp->buf.buffer
 181         = (unsigned char *) xrealloc (cp->buf.buffer, cp->buf.used);
 182     }
 183 }
 184
 185 /* Clear the regexp cache w.r.t. a particular syntax table,
 186    because it was changed.
 187    There is no danger of memory leak here because re_compile_pattern
 188    automagically manages the memory in each re_pattern_buffer struct,
 189    based on its `allocated' and `buffer' values.  */
 190 void
 191 clear_regexp_cache (void)
 192 {
 193   int i;
 194
 195   for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
 196     /* It's tempting to compare with the syntax-table we've actually changed,
 197        but it's not sufficient because char-table inheritance means that
 198        modifying one syntax-table can change others at the same time.  */
 199     if (!EQ (searchbufs[i].syntax_table, Qt))
 200       searchbufs[i].regexp = Qnil;
 201 }
 202
 203 /* Compile a regexp if necessary, but first check to see if there's one in
 204    the cache.
 205    PATTERN is the pattern to compile.
 206    TRANSLATE is a translation table for ignoring case, or nil for none.
 207    REGP is the structure that says where to store the "register"
 208    values that will result from matching this pattern.
 209    If it is 0, we should compile the pattern not to record any
 210    subexpression bounds.
 211    POSIX is nonzero if we want full backtracking (POSIX style)
 212    for this pattern.  0 means backtrack only enough to get a valid match.  */
 213
 214 struct re_pattern_buffer *
 215 compile_pattern (Lisp_Object pattern, struct re_registers *regp, Lisp_Object translate, int posix, int multibyte)
 216 {
 217   struct regexp_cache *cp, **cpp;
 218
 219   for (cpp = &searchbuf_head; ; cpp = &cp->next)
 220     {
 221       cp = *cpp;
 222       /* Entries are initialized to nil, and may be set to nil by
 223          compile_pattern_1 if the pattern isn't valid.  Don't apply
 224          string accessors in those cases.  However, compile_pattern_1
 225          is only applied to the cache entry we pick here to reuse.  So
 226          nil should never appear before a non-nil entry.  */
 227       if (NILP (cp->regexp))
 228         goto compile_it;
 229       if (SCHARS (cp->regexp) == SCHARS (pattern)
 230           && STRING_MULTIBYTE (cp->regexp) == STRING_MULTIBYTE (pattern)
 231           && !NILP (Fstring_equal (cp->regexp, pattern))
 232           && EQ (cp->buf.translate, (! NILP (translate) ? translate : make_number (0)))
 233           && cp->posix == posix
 234           && (EQ (cp->syntax_table, Qt)
 235               || EQ (cp->syntax_table, BVAR (current_buffer, syntax_table)))
 236           && !NILP (Fequal (cp->whitespace_regexp, Vsearch_spaces_regexp))
 237           && cp->buf.charset_unibyte == charset_unibyte)
 238         break;
 239
 240       /* If we're at the end of the cache, compile into the nil cell
 241          we found, or the last (least recently used) cell with a
 242          string value.  */
 243       if (cp->next == 0)
 244         {
 245         compile_it:
 246           compile_pattern_1 (cp, pattern, translate, posix);
 247           break;
 248         }
 249     }
 250
 251   /* When we get here, cp (aka *cpp) contains the compiled pattern,
 252      either because we found it in the cache or because we just compiled it.
 253      Move it to the front of the queue to mark it as most recently used.  */
 254   *cpp = cp->next;
 255   cp->next = searchbuf_head;
 256   searchbuf_head = cp;
 257
 258   /* Advise the searching functions about the space we have allocated
 259      for register data.  */
 260   if (regp)
 261     re_set_registers (&cp->buf, regp, regp->num_regs, regp->start, regp->end);
 262
 263   /* The compiled pattern can be used both for multibyte and unibyte
 264      target.  But, we have to tell which the pattern is used for. */
 265   cp->buf.target_multibyte = multibyte;
 266
 267   return &cp->buf;
 268 }
 269
 270 \f
 271 static Lisp_Object
 272 looking_at_1 (Lisp_Object string, int posix)
 273 {
 274   Lisp_Object val;
 275   unsigned char *p1, *p2;
 276   EMACS_INT s1, s2;
 277   register EMACS_INT i;
 278   struct re_pattern_buffer *bufp;
 279
 280   if (running_asynch_code)
 281     save_search_regs ();
 282
 283   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 284   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 285     = BVAR (current_buffer, case_eqv_table);
 286
 287   CHECK_STRING (string);
 288   bufp = compile_pattern (string,
 289                           (NILP (Vinhibit_changing_match_data)
 290                            ? &search_regs : NULL),
 291                           (!NILP (BVAR (current_buffer, case_fold_search))
 292                            ? BVAR (current_buffer, case_canon_table) : Qnil),
 293                           posix,
 294                           !NILP (BVAR (current_buffer, enable_multibyte_characters)));
 295
 296   immediate_quit = 1;
 297   QUIT;                 /* Do a pending quit right away, to avoid paradoxical behavior */
 298
 299   /* Get pointers and sizes of the two strings
 300      that make up the visible portion of the buffer. */
 301
 302   p1 = BEGV_ADDR;
 303   s1 = GPT_BYTE - BEGV_BYTE;
 304   p2 = GAP_END_ADDR;
 305   s2 = ZV_BYTE - GPT_BYTE;
 306   if (s1 < 0)
 307     {
 308       p2 = p1;
 309       s2 = ZV_BYTE - BEGV_BYTE;
 310       s1 = 0;
 311     }
 312   if (s2 < 0)
 313     {
 314       s1 = ZV_BYTE - BEGV_BYTE;
 315       s2 = 0;
 316     }
 317
 318   re_match_object = Qnil;
 319
 320   i = re_match_2 (bufp, (char *) p1, s1, (char *) p2, s2,
 321                   PT_BYTE - BEGV_BYTE,
 322                   (NILP (Vinhibit_changing_match_data)
 323                    ? &search_regs : NULL),
 324                   ZV_BYTE - BEGV_BYTE);
 325   immediate_quit = 0;
 326
 327   if (i == -2)
 328     matcher_overflow ();
 329
 330   val = (0 <= i ? Qt : Qnil);
 331   if (NILP (Vinhibit_changing_match_data) && i >= 0)
 332     for (i = 0; i < search_regs.num_regs; i++)
 333       if (search_regs.start[i] >= 0)
 334         {
 335           search_regs.start[i]
 336             = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
 337           search_regs.end[i]
 338             = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
 339         }
 340
 341   /* Set last_thing_searched only when match data is changed.  */
 342   if (NILP (Vinhibit_changing_match_data))
 343     XSETBUFFER (last_thing_searched, current_buffer);
 344
 345   return val;
 346 }
 347
 348 DEFUN ("looking-at", Flooking_at, Slooking_at, 1, 1, 0,
 349        doc: /* Return t if text after point matches regular expression REGEXP.
 350 This function modifies the match data that `match-beginning',
 351 `match-end' and `match-data' access; save and restore the match
 352 data if you want to preserve them.  */)
 353   (Lisp_Object regexp)
 354 {
 355   return looking_at_1 (regexp, 0);
 356 }
 357
 358 DEFUN ("posix-looking-at", Fposix_looking_at, Sposix_looking_at, 1, 1, 0,
 359        doc: /* Return t if text after point matches regular expression REGEXP.
 360 Find the longest match, in accord with Posix regular expression rules.
 361 This function modifies the match data that `match-beginning',
 362 `match-end' and `match-data' access; save and restore the match
 363 data if you want to preserve them.  */)
 364   (Lisp_Object regexp)
 365 {
 366   return looking_at_1 (regexp, 1);
 367 }
 368 \f
 369 static Lisp_Object
 370 string_match_1 (Lisp_Object regexp, Lisp_Object string, Lisp_Object start, int posix)
 371 {
 372   int val;
 373   struct re_pattern_buffer *bufp;
 374   EMACS_INT pos, pos_byte;
 375   int i;
 376
 377   if (running_asynch_code)
 378     save_search_regs ();
 379
 380   CHECK_STRING (regexp);
 381   CHECK_STRING (string);
 382
 383   if (NILP (start))
 384     pos = 0, pos_byte = 0;
 385   else
 386     {
 387       EMACS_INT len = SCHARS (string);
 388
 389       CHECK_NUMBER (start);
 390       pos = XINT (start);
 391       if (pos < 0 && -pos <= len)
 392         pos = len + pos;
 393       else if (0 > pos || pos > len)
 394         args_out_of_range (string, start);
 395       pos_byte = string_char_to_byte (string, pos);
 396     }
 397
 398   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 399   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 400     = BVAR (current_buffer, case_eqv_table);
 401
 402   bufp = compile_pattern (regexp,
 403                           (NILP (Vinhibit_changing_match_data)
 404                            ? &search_regs : NULL),
 405                           (!NILP (BVAR (current_buffer, case_fold_search))
 406                            ? BVAR (current_buffer, case_canon_table) : Qnil),
 407                           posix,
 408                           STRING_MULTIBYTE (string));
 409   immediate_quit = 1;
 410   re_match_object = string;
 411
 412   val = re_search (bufp, SSDATA (string),
 413                    SBYTES (string), pos_byte,
 414                    SBYTES (string) - pos_byte,
 415                    (NILP (Vinhibit_changing_match_data)
 416                     ? &search_regs : NULL));
 417   immediate_quit = 0;
 418
 419   /* Set last_thing_searched only when match data is changed.  */
 420   if (NILP (Vinhibit_changing_match_data))
 421     last_thing_searched = Qt;
 422
 423   if (val == -2)
 424     matcher_overflow ();
 425   if (val < 0) return Qnil;
 426
 427   if (NILP (Vinhibit_changing_match_data))
 428     for (i = 0; i < search_regs.num_regs; i++)
 429       if (search_regs.start[i] >= 0)
 430         {
 431           search_regs.start[i]
 432             = string_byte_to_char (string, search_regs.start[i]);
 433           search_regs.end[i]
 434             = string_byte_to_char (string, search_regs.end[i]);
 435         }
 436
 437   return make_number (string_byte_to_char (string, val));
 438 }
 439
 440 DEFUN ("string-match", Fstring_match, Sstring_match, 2, 3, 0,
 441        doc: /* Return index of start of first match for REGEXP in STRING, or nil.
 442 Matching ignores case if `case-fold-search' is non-nil.
 443 If third arg START is non-nil, start search at that index in STRING.
 444 For index of first char beyond the match, do (match-end 0).
 445 `match-end' and `match-beginning' also give indices of substrings
 446 matched by parenthesis constructs in the pattern.
 447
 448 You can use the function `match-string' to extract the substrings
 449 matched by the parenthesis constructions in REGEXP. */)
 450   (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
 451 {
 452   return string_match_1 (regexp, string, start, 0);
 453 }
 454
 455 DEFUN ("posix-string-match", Fposix_string_match, Sposix_string_match, 2, 3, 0,
 456        doc: /* Return index of start of first match for REGEXP in STRING, or nil.
 457 Find the longest match, in accord with Posix regular expression rules.
 458 Case is ignored if `case-fold-search' is non-nil in the current buffer.
 459 If third arg START is non-nil, start search at that index in STRING.
 460 For index of first char beyond the match, do (match-end 0).
 461 `match-end' and `match-beginning' also give indices of substrings
 462 matched by parenthesis constructs in the pattern.  */)
 463   (Lisp_Object regexp, Lisp_Object string, Lisp_Object start)
 464 {
 465   return string_match_1 (regexp, string, start, 1);
 466 }
 467
 468 /* Match REGEXP against STRING, searching all of STRING,
 469    and return the index of the match, or negative on failure.
 470    This does not clobber the match data.  */
 471
 472 int
 473 fast_string_match (Lisp_Object regexp, Lisp_Object string)
 474 {
 475   int val;
 476   struct re_pattern_buffer *bufp;
 477
 478   bufp = compile_pattern (regexp, 0, Qnil,
 479                           0, STRING_MULTIBYTE (string));
 480   immediate_quit = 1;
 481   re_match_object = string;
 482
 483   val = re_search (bufp, SSDATA (string),
 484                    SBYTES (string), 0,
 485                    SBYTES (string), 0);
 486   immediate_quit = 0;
 487   return val;
 488 }
 489
 490 /* Match REGEXP against STRING, searching all of STRING ignoring case,
 491    and return the index of the match, or negative on failure.
 492    This does not clobber the match data.
 493    We assume that STRING contains single-byte characters.  */
 494
 495 int
 496 fast_c_string_match_ignore_case (Lisp_Object regexp, const char *string)
 497 {
 498   int val;
 499   struct re_pattern_buffer *bufp;
 500   size_t len = strlen (string);
 501
 502   regexp = string_make_unibyte (regexp);
 503   re_match_object = Qt;
 504   bufp = compile_pattern (regexp, 0,
 505                           Vascii_canon_table, 0,
 506                           0);
 507   immediate_quit = 1;
 508   val = re_search (bufp, string, len, 0, len, 0);
 509   immediate_quit = 0;
 510   return val;
 511 }
 512
 513 /* Like fast_string_match but ignore case.  */
 514
 515 int
 516 fast_string_match_ignore_case (Lisp_Object regexp, Lisp_Object string)
 517 {
 518   int val;
 519   struct re_pattern_buffer *bufp;
 520
 521   bufp = compile_pattern (regexp, 0, Vascii_canon_table,
 522                           0, STRING_MULTIBYTE (string));
 523   immediate_quit = 1;
 524   re_match_object = string;
 525
 526   val = re_search (bufp, SSDATA (string),
 527                    SBYTES (string), 0,
 528                    SBYTES (string), 0);
 529   immediate_quit = 0;
 530   return val;
 531 }
 532 \f
 533 /* Match REGEXP against the characters after POS to LIMIT, and return
 534    the number of matched characters.  If STRING is non-nil, match
 535    against the characters in it.  In that case, POS and LIMIT are
 536    indices into the string.  This function doesn't modify the match
 537    data.  */
 538
 539 EMACS_INT
 540 fast_looking_at (Lisp_Object regexp, EMACS_INT pos, EMACS_INT pos_byte, EMACS_INT limit, EMACS_INT limit_byte, Lisp_Object string)
 541 {
 542   int multibyte;
 543   struct re_pattern_buffer *buf;
 544   unsigned char *p1, *p2;
 545   EMACS_INT s1, s2;
 546   EMACS_INT len;
 547
 548   if (STRINGP (string))
 549     {
 550       if (pos_byte < 0)
 551         pos_byte = string_char_to_byte (string, pos);
 552       if (limit_byte < 0)
 553         limit_byte = string_char_to_byte (string, limit);
 554       p1 = NULL;
 555       s1 = 0;
 556       p2 = SDATA (string);
 557       s2 = SBYTES (string);
 558       re_match_object = string;
 559       multibyte = STRING_MULTIBYTE (string);
 560     }
 561   else
 562     {
 563       if (pos_byte < 0)
 564         pos_byte = CHAR_TO_BYTE (pos);
 565       if (limit_byte < 0)
 566         limit_byte = CHAR_TO_BYTE (limit);
 567       pos_byte -= BEGV_BYTE;
 568       limit_byte -= BEGV_BYTE;
 569       p1 = BEGV_ADDR;
 570       s1 = GPT_BYTE - BEGV_BYTE;
 571       p2 = GAP_END_ADDR;
 572       s2 = ZV_BYTE - GPT_BYTE;
 573       if (s1 < 0)
 574         {
 575           p2 = p1;
 576           s2 = ZV_BYTE - BEGV_BYTE;
 577           s1 = 0;
 578         }
 579       if (s2 < 0)
 580         {
 581           s1 = ZV_BYTE - BEGV_BYTE;
 582           s2 = 0;
 583         }
 584       re_match_object = Qnil;
 585       multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
 586     }
 587
 588   buf = compile_pattern (regexp, 0, Qnil, 0, multibyte);
 589   immediate_quit = 1;
 590   len = re_match_2 (buf, (char *) p1, s1, (char *) p2, s2,
 591                     pos_byte, NULL, limit_byte);
 592   immediate_quit = 0;
 593
 594   return len;
 595 }
 596
 597 \f
 598 /* The newline cache: remembering which sections of text have no newlines.  */
 599
 600 /* If the user has requested newline caching, make sure it's on.
 601    Otherwise, make sure it's off.
 602    This is our cheezy way of associating an action with the change of
 603    state of a buffer-local variable.  */
 604 static void
 605 newline_cache_on_off (struct buffer *buf)
 606 {
 607   if (NILP (BVAR (buf, cache_long_line_scans)))
 608     {
 609       /* It should be off.  */
 610       if (buf->newline_cache)
 611         {
 612           free_region_cache (buf->newline_cache);
 613           buf->newline_cache = 0;
 614         }
 615     }
 616   else
 617     {
 618       /* It should be on.  */
 619       if (buf->newline_cache == 0)
 620         buf->newline_cache = new_region_cache ();
 621     }
 622 }
 623
 624 \f
 625 /* Search for COUNT instances of the character TARGET between START and END.
 626
 627    If COUNT is positive, search forwards; END must be >= START.
 628    If COUNT is negative, search backwards for the -COUNTth instance;
 629       END must be <= START.
 630    If COUNT is zero, do anything you please; run rogue, for all I care.
 631
 632    If END is zero, use BEGV or ZV instead, as appropriate for the
 633    direction indicated by COUNT.
 634
 635    If we find COUNT instances, set *SHORTAGE to zero, and return the
 636    position past the COUNTth match.  Note that for reverse motion
 637    this is not the same as the usual convention for Emacs motion commands.
 638
 639    If we don't find COUNT instances before reaching END, set *SHORTAGE
 640    to the number of TARGETs left unfound, and return END.
 641
 642    If ALLOW_QUIT is non-zero, set immediate_quit.  That's good to do
 643    except when inside redisplay.  */
 644
 645 EMACS_INT
 646 scan_buffer (register int target, EMACS_INT start, EMACS_INT end,
 647              EMACS_INT count, int *shortage, int allow_quit)
 648 {
 649   struct region_cache *newline_cache;
 650   int direction;
 651
 652   if (count > 0)
 653     {
 654       direction = 1;
 655       if (! end) end = ZV;
 656     }
 657   else
 658     {
 659       direction = -1;
 660       if (! end) end = BEGV;
 661     }
 662
 663   newline_cache_on_off (current_buffer);
 664   newline_cache = current_buffer->newline_cache;
 665
 666   if (shortage != 0)
 667     *shortage = 0;
 668
 669   immediate_quit = allow_quit;
 670
 671   if (count > 0)
 672     while (start != end)
 673       {
 674         /* Our innermost scanning loop is very simple; it doesn't know
 675            about gaps, buffer ends, or the newline cache.  ceiling is
 676            the position of the last character before the next such
 677            obstacle --- the last character the dumb search loop should
 678            examine.  */
 679         EMACS_INT ceiling_byte = CHAR_TO_BYTE (end) - 1;
 680         EMACS_INT start_byte = CHAR_TO_BYTE (start);
 681         EMACS_INT tem;
 682
 683         /* If we're looking for a newline, consult the newline cache
 684            to see where we can avoid some scanning.  */
 685         if (target == '\n' && newline_cache)
 686           {
 687             EMACS_INT next_change;
 688             immediate_quit = 0;
 689             while (region_cache_forward
 690                    (current_buffer, newline_cache, start_byte, &next_change))
 691               start_byte = next_change;
 692             immediate_quit = allow_quit;
 693
 694             /* START should never be after END.  */
 695             if (start_byte > ceiling_byte)
 696               start_byte = ceiling_byte;
 697
 698             /* Now the text after start is an unknown region, and
 699                next_change is the position of the next known region. */
 700             ceiling_byte = min (next_change - 1, ceiling_byte);
 701           }
 702
 703         /* The dumb loop can only scan text stored in contiguous
 704            bytes. BUFFER_CEILING_OF returns the last character
 705            position that is contiguous, so the ceiling is the
 706            position after that.  */
 707         tem = BUFFER_CEILING_OF (start_byte);
 708         ceiling_byte = min (tem, ceiling_byte);
 709
 710         {
 711           /* The termination address of the dumb loop.  */
 712           register unsigned char *ceiling_addr
 713             = BYTE_POS_ADDR (ceiling_byte) + 1;
 714           register unsigned char *cursor
 715             = BYTE_POS_ADDR (start_byte);
 716           unsigned char *base = cursor;
 717
 718           while (cursor < ceiling_addr)
 719             {
 720               unsigned char *scan_start = cursor;
 721
 722               /* The dumb loop.  */
 723               while (*cursor != target && ++cursor < ceiling_addr)
 724                 ;
 725
 726               /* If we're looking for newlines, cache the fact that
 727                  the region from start to cursor is free of them. */
 728               if (target == '\n' && newline_cache)
 729                 know_region_cache (current_buffer, newline_cache,
 730                                    start_byte + scan_start - base,
 731                                    start_byte + cursor - base);
 732
 733               /* Did we find the target character?  */
 734               if (cursor < ceiling_addr)
 735                 {
 736                   if (--count == 0)
 737                     {
 738                       immediate_quit = 0;
 739                       return BYTE_TO_CHAR (start_byte + cursor - base + 1);
 740                     }
 741                   cursor++;
 742                 }
 743             }
 744
 745           start = BYTE_TO_CHAR (start_byte + cursor - base);
 746         }
 747       }
 748   else
 749     while (start > end)
 750       {
 751         /* The last character to check before the next obstacle.  */
 752         EMACS_INT ceiling_byte = CHAR_TO_BYTE (end);
 753         EMACS_INT start_byte = CHAR_TO_BYTE (start);
 754         EMACS_INT tem;
 755
 756         /* Consult the newline cache, if appropriate.  */
 757         if (target == '\n' && newline_cache)
 758           {
 759             EMACS_INT next_change;
 760             immediate_quit = 0;
 761             while (region_cache_backward
 762                    (current_buffer, newline_cache, start_byte, &next_change))
 763               start_byte = next_change;
 764             immediate_quit = allow_quit;
 765
 766             /* Start should never be at or before end.  */
 767             if (start_byte <= ceiling_byte)
 768               start_byte = ceiling_byte + 1;
 769
 770             /* Now the text before start is an unknown region, and
 771                next_change is the position of the next known region. */
 772             ceiling_byte = max (next_change, ceiling_byte);
 773           }
 774
 775         /* Stop scanning before the gap.  */
 776         tem = BUFFER_FLOOR_OF (start_byte - 1);
 777         ceiling_byte = max (tem, ceiling_byte);
 778
 779         {
 780           /* The termination address of the dumb loop.  */
 781           register unsigned char *ceiling_addr = BYTE_POS_ADDR (ceiling_byte);
 782           register unsigned char *cursor = BYTE_POS_ADDR (start_byte - 1);
 783           unsigned char *base = cursor;
 784
 785           while (cursor >= ceiling_addr)
 786             {
 787               unsigned char *scan_start = cursor;
 788
 789               while (*cursor != target && --cursor >= ceiling_addr)
 790                 ;
 791
 792               /* If we're looking for newlines, cache the fact that
 793                  the region from after the cursor to start is free of them.  */
 794               if (target == '\n' && newline_cache)
 795                 know_region_cache (current_buffer, newline_cache,
 796                                    start_byte + cursor - base,
 797                                    start_byte + scan_start - base);
 798
 799               /* Did we find the target character?  */
 800               if (cursor >= ceiling_addr)
 801                 {
 802                   if (++count >= 0)
 803                     {
 804                       immediate_quit = 0;
 805                       return BYTE_TO_CHAR (start_byte + cursor - base);
 806                     }
 807                   cursor--;
 808                 }
 809             }
 810
 811           start = BYTE_TO_CHAR (start_byte + cursor - base);
 812         }
 813       }
 814
 815   immediate_quit = 0;
 816   if (shortage != 0)
 817     *shortage = count * direction;
 818   return start;
 819 }
 820 \f
 821 /* Search for COUNT instances of a line boundary, which means either a
 822    newline or (if selective display enabled) a carriage return.
 823    Start at START.  If COUNT is negative, search backwards.
 824
 825    We report the resulting position by calling TEMP_SET_PT_BOTH.
 826
 827    If we find COUNT instances. we position after (always after,
 828    even if scanning backwards) the COUNTth match, and return 0.
 829
 830    If we don't find COUNT instances before reaching the end of the
 831    buffer (or the beginning, if scanning backwards), we return
 832    the number of line boundaries left unfound, and position at
 833    the limit we bumped up against.
 834
 835    If ALLOW_QUIT is non-zero, set immediate_quit.  That's good to do
 836    except in special cases.  */
 837
 838 EMACS_INT
 839 scan_newline (EMACS_INT start, EMACS_INT start_byte,
 840               EMACS_INT limit, EMACS_INT limit_byte,
 841               register EMACS_INT count, int allow_quit)
 842 {
 843   int direction = ((count > 0) ? 1 : -1);
 844
 845   register unsigned char *cursor;
 846   unsigned char *base;
 847
 848   EMACS_INT ceiling;
 849   register unsigned char *ceiling_addr;
 850
 851   int old_immediate_quit = immediate_quit;
 852
 853   /* The code that follows is like scan_buffer
 854      but checks for either newline or carriage return.  */
 855
 856   if (allow_quit)
 857     immediate_quit++;
 858
 859   start_byte = CHAR_TO_BYTE (start);
 860
 861   if (count > 0)
 862     {
 863       while (start_byte < limit_byte)
 864         {
 865           ceiling =  BUFFER_CEILING_OF (start_byte);
 866           ceiling = min (limit_byte - 1, ceiling);
 867           ceiling_addr = BYTE_POS_ADDR (ceiling) + 1;
 868           base = (cursor = BYTE_POS_ADDR (start_byte));
 869           while (1)
 870             {
 871               while (*cursor != '\n' && ++cursor != ceiling_addr)
 872                 ;
 873
 874               if (cursor != ceiling_addr)
 875                 {
 876                   if (--count == 0)
 877                     {
 878                       immediate_quit = old_immediate_quit;
 879                       start_byte = start_byte + cursor - base + 1;
 880                       start = BYTE_TO_CHAR (start_byte);
 881                       TEMP_SET_PT_BOTH (start, start_byte);
 882                       return 0;
 883                     }
 884                   else
 885                     if (++cursor == ceiling_addr)
 886                       break;
 887                 }
 888               else
 889                 break;
 890             }
 891           start_byte += cursor - base;
 892         }
 893     }
 894   else
 895     {
 896       while (start_byte > limit_byte)
 897         {
 898           ceiling = BUFFER_FLOOR_OF (start_byte - 1);
 899           ceiling = max (limit_byte, ceiling);
 900           ceiling_addr = BYTE_POS_ADDR (ceiling) - 1;
 901           base = (cursor = BYTE_POS_ADDR (start_byte - 1) + 1);
 902           while (1)
 903             {
 904               while (--cursor != ceiling_addr && *cursor != '\n')
 905                 ;
 906
 907               if (cursor != ceiling_addr)
 908                 {
 909                   if (++count == 0)
 910                     {
 911                       immediate_quit = old_immediate_quit;
 912                       /* Return the position AFTER the match we found.  */
 913                       start_byte = start_byte + cursor - base + 1;
 914                       start = BYTE_TO_CHAR (start_byte);
 915                       TEMP_SET_PT_BOTH (start, start_byte);
 916                       return 0;
 917                     }
 918                 }
 919               else
 920                 break;
 921             }
 922           /* Here we add 1 to compensate for the last decrement
 923              of CURSOR, which took it past the valid range.  */
 924           start_byte += cursor - base + 1;
 925         }
 926     }
 927
 928   TEMP_SET_PT_BOTH (limit, limit_byte);
 929   immediate_quit = old_immediate_quit;
 930
 931   return count * direction;
 932 }
 933
 934 EMACS_INT
 935 find_next_newline_no_quit (EMACS_INT from, EMACS_INT cnt)
 936 {
 937   return scan_buffer ('\n', from, 0, cnt, (int *) 0, 0);
 938 }
 939
 940 /* Like find_next_newline, but returns position before the newline,
 941    not after, and only search up to TO.  This isn't just
 942    find_next_newline (...)-1, because you might hit TO.  */
 943
 944 EMACS_INT
 945 find_before_next_newline (EMACS_INT from, EMACS_INT to, EMACS_INT cnt)
 946 {
 947   int shortage;
 948   EMACS_INT pos = scan_buffer ('\n', from, to, cnt, &shortage, 1);
 949
 950   if (shortage == 0)
 951     pos--;
 952
 953   return pos;
 954 }
 955 \f
 956 /* Subroutines of Lisp buffer search functions. */
 957
 958 static Lisp_Object
 959 search_command (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror,
 960                 Lisp_Object count, int direction, int RE, int posix)
 961 {
 962   register int np;
 963   EMACS_INT lim, lim_byte;
 964   int n = direction;
 965
 966   if (!NILP (count))
 967     {
 968       CHECK_NUMBER (count);
 969       n *= XINT (count);
 970     }
 971
 972   CHECK_STRING (string);
 973   if (NILP (bound))
 974     {
 975       if (n > 0)
 976         lim = ZV, lim_byte = ZV_BYTE;
 977       else
 978         lim = BEGV, lim_byte = BEGV_BYTE;
 979     }
 980   else
 981     {
 982       CHECK_NUMBER_COERCE_MARKER (bound);
 983       lim = XINT (bound);
 984       if (n > 0 ? lim < PT : lim > PT)
 985         error ("Invalid search bound (wrong side of point)");
 986       if (lim > ZV)
 987         lim = ZV, lim_byte = ZV_BYTE;
 988       else if (lim < BEGV)
 989         lim = BEGV, lim_byte = BEGV_BYTE;
 990       else
 991         lim_byte = CHAR_TO_BYTE (lim);
 992     }
 993
 994   /* This is so set_image_of_range_1 in regex.c can find the EQV table.  */
 995   XCHAR_TABLE (BVAR (current_buffer, case_canon_table))->extras[2]
 996     = BVAR (current_buffer, case_eqv_table);
 997
 998   np = search_buffer (string, PT, PT_BYTE, lim, lim_byte, n, RE,
 999                       (!NILP (BVAR (current_buffer, case_fold_search))
1000                        ? BVAR (current_buffer, case_canon_table)
1001                        : Qnil),
1002                       (!NILP (BVAR (current_buffer, case_fold_search))
1003                        ? BVAR (current_buffer, case_eqv_table)
1004                        : Qnil),
1005                       posix);
1006   if (np <= 0)
1007     {
1008       if (NILP (noerror))
1009         xsignal1 (Qsearch_failed, string);
1010
1011       if (!EQ (noerror, Qt))
1012         {
1013           if (lim < BEGV || lim > ZV)
1014             abort ();
1015           SET_PT_BOTH (lim, lim_byte);
1016           return Qnil;
1017 #if 0 /* This would be clean, but maybe programs depend on
1018          a value of nil here.  */
1019           np = lim;
1020 #endif
1021         }
1022       else
1023         return Qnil;
1024     }
1025
1026   if (np < BEGV || np > ZV)
1027     abort ();
1028
1029   SET_PT (np);
1030
1031   return make_number (np);
1032 }
1033 \f
1034 /* Return 1 if REGEXP it matches just one constant string.  */
1035
1036 static int
1037 trivial_regexp_p (Lisp_Object regexp)
1038 {
1039   EMACS_INT len = SBYTES (regexp);
1040   unsigned char *s = SDATA (regexp);
1041   while (--len >= 0)
1042     {
1043       switch (*s++)
1044         {
1045         case '.': case '*': case '+': case '?': case '[': case '^': case '$':
1046           return 0;
1047         case '\\':
1048           if (--len < 0)
1049             return 0;
1050           switch (*s++)
1051             {
1052             case '|': case '(': case ')': case '`': case '\'': case 'b':
1053             case 'B': case '<': case '>': case 'w': case 'W': case 's':
1054             case 'S': case '=': case '{': case '}': case '_':
1055             case 'c': case 'C': /* for categoryspec and notcategoryspec */
1056             case '1': case '2': case '3': case '4': case '5':
1057             case '6': case '7': case '8': case '9':
1058               return 0;
1059             }
1060         }
1061     }
1062   return 1;
1063 }
1064
1065 /* Search for the n'th occurrence of STRING in the current buffer,
1066    starting at position POS and stopping at position LIM,
1067    treating STRING as a literal string if RE is false or as
1068    a regular expression if RE is true.
1069
1070    If N is positive, searching is forward and LIM must be greater than POS.
1071    If N is negative, searching is backward and LIM must be less than POS.
1072
1073    Returns -x if x occurrences remain to be found (x > 0),
1074    or else the position at the beginning of the Nth occurrence
1075    (if searching backward) or the end (if searching forward).
1076
1077    POSIX is nonzero if we want full backtracking (POSIX style)
1078    for this pattern.  0 means backtrack only enough to get a valid match.  */
1079
1080 #define TRANSLATE(out, trt, d)                  \
1081 do                                              \
1082   {                                             \
1083     if (! NILP (trt))                           \
1084       {                                         \
1085         Lisp_Object temp;                       \
1086         temp = Faref (trt, make_number (d));    \
1087         if (INTEGERP (temp))                    \
1088           out = XINT (temp);                    \
1089         else                                    \
1090           out = d;                              \
1091       }                                         \
1092     else                                        \
1093       out = d;                                  \
1094   }                                             \
1095 while (0)
1096
1097 /* Only used in search_buffer, to record the end position of the match
1098    when searching regexps and SEARCH_REGS should not be changed
1099    (i.e. Vinhibit_changing_match_data is non-nil).  */
1100 static struct re_registers search_regs_1;
1101
1102 static EMACS_INT
1103 search_buffer (Lisp_Object string, EMACS_INT pos, EMACS_INT pos_byte,
1104                EMACS_INT lim, EMACS_INT lim_byte, EMACS_INT n,
1105                int RE, Lisp_Object trt, Lisp_Object inverse_trt, int posix)
1106 {
1107   EMACS_INT len = SCHARS (string);
1108   EMACS_INT len_byte = SBYTES (string);
1109   register int i;
1110
1111   if (running_asynch_code)
1112     save_search_regs ();
1113
1114   /* Searching 0 times means don't move.  */
1115   /* Null string is found at starting position.  */
1116   if (len == 0 || n == 0)
1117     {
1118       set_search_regs (pos_byte, 0);
1119       return pos;
1120     }
1121
1122   if (RE && !(trivial_regexp_p (string) && NILP (Vsearch_spaces_regexp)))
1123     {
1124       unsigned char *p1, *p2;
1125       EMACS_INT s1, s2;
1126       struct re_pattern_buffer *bufp;
1127
1128       bufp = compile_pattern (string,
1129                               (NILP (Vinhibit_changing_match_data)
1130                                ? &search_regs : &search_regs_1),
1131                               trt, posix,
1132                               !NILP (BVAR (current_buffer, enable_multibyte_characters)));
1133
1134       immediate_quit = 1;       /* Quit immediately if user types ^G,
1135                                    because letting this function finish
1136                                    can take too long. */
1137       QUIT;                     /* Do a pending quit right away,
1138                                    to avoid paradoxical behavior */
1139       /* Get pointers and sizes of the two strings
1140          that make up the visible portion of the buffer. */
1141
1142       p1 = BEGV_ADDR;
1143       s1 = GPT_BYTE - BEGV_BYTE;
1144       p2 = GAP_END_ADDR;
1145       s2 = ZV_BYTE - GPT_BYTE;
1146       if (s1 < 0)
1147         {
1148           p2 = p1;
1149           s2 = ZV_BYTE - BEGV_BYTE;
1150           s1 = 0;
1151         }
1152       if (s2 < 0)
1153         {
1154           s1 = ZV_BYTE - BEGV_BYTE;
1155           s2 = 0;
1156         }
1157       re_match_object = Qnil;
1158
1159       while (n < 0)
1160         {
1161           EMACS_INT val;
1162           val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1163                              pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1164                              (NILP (Vinhibit_changing_match_data)
1165                               ? &search_regs : &search_regs_1),
1166                              /* Don't allow match past current point */
1167                              pos_byte - BEGV_BYTE);
1168           if (val == -2)
1169             {
1170               matcher_overflow ();
1171             }
1172           if (val >= 0)
1173             {
1174               if (NILP (Vinhibit_changing_match_data))
1175                 {
1176                   pos_byte = search_regs.start[0] + BEGV_BYTE;
1177                   for (i = 0; i < search_regs.num_regs; i++)
1178                     if (search_regs.start[i] >= 0)
1179                       {
1180                         search_regs.start[i]
1181                           = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1182                         search_regs.end[i]
1183                           = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1184                       }
1185                   XSETBUFFER (last_thing_searched, current_buffer);
1186                   /* Set pos to the new position. */
1187                   pos = search_regs.start[0];
1188                 }
1189               else
1190                 {
1191                   pos_byte = search_regs_1.start[0] + BEGV_BYTE;
1192                   /* Set pos to the new position.  */
1193                   pos = BYTE_TO_CHAR (search_regs_1.start[0] + BEGV_BYTE);
1194                 }
1195             }
1196           else
1197             {
1198               immediate_quit = 0;
1199               return (n);
1200             }
1201           n++;
1202         }
1203       while (n > 0)
1204         {
1205           EMACS_INT val;
1206           val = re_search_2 (bufp, (char *) p1, s1, (char *) p2, s2,
1207                              pos_byte - BEGV_BYTE, lim_byte - pos_byte,
1208                              (NILP (Vinhibit_changing_match_data)
1209                               ? &search_regs : &search_regs_1),
1210                              lim_byte - BEGV_BYTE);
1211           if (val == -2)
1212             {
1213               matcher_overflow ();
1214             }
1215           if (val >= 0)
1216             {
1217               if (NILP (Vinhibit_changing_match_data))
1218                 {
1219                   pos_byte = search_regs.end[0] + BEGV_BYTE;
1220                   for (i = 0; i < search_regs.num_regs; i++)
1221                     if (search_regs.start[i] >= 0)
1222                       {
1223                         search_regs.start[i]
1224                           = BYTE_TO_CHAR (search_regs.start[i] + BEGV_BYTE);
1225                         search_regs.end[i]
1226                           = BYTE_TO_CHAR (search_regs.end[i] + BEGV_BYTE);
1227                       }
1228                   XSETBUFFER (last_thing_searched, current_buffer);
1229                   pos = search_regs.end[0];
1230                 }
1231               else
1232                 {
1233                   pos_byte = search_regs_1.end[0] + BEGV_BYTE;
1234                   pos = BYTE_TO_CHAR (search_regs_1.end[0] + BEGV_BYTE);
1235                 }
1236             }
1237           else
1238             {
1239               immediate_quit = 0;
1240               return (0 - n);
1241             }
1242           n--;
1243         }
1244       immediate_quit = 0;
1245       return (pos);
1246     }
1247   else                          /* non-RE case */
1248     {
1249       unsigned char *raw_pattern, *pat;
1250       EMACS_INT raw_pattern_size;
1251       EMACS_INT raw_pattern_size_byte;
1252       unsigned char *patbuf;
1253       int multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
1254       unsigned char *base_pat;
1255       /* Set to positive if we find a non-ASCII char that need
1256          translation.  Otherwise set to zero later.  */
1257       int char_base = -1;
1258       int boyer_moore_ok = 1;
1259
1260       /* MULTIBYTE says whether the text to be searched is multibyte.
1261          We must convert PATTERN to match that, or we will not really
1262          find things right.  */
1263
1264       if (multibyte == STRING_MULTIBYTE (string))
1265         {
1266           raw_pattern = SDATA (string);
1267           raw_pattern_size = SCHARS (string);
1268           raw_pattern_size_byte = SBYTES (string);
1269         }
1270       else if (multibyte)
1271         {
1272           raw_pattern_size = SCHARS (string);
1273           raw_pattern_size_byte
1274             = count_size_as_multibyte (SDATA (string),
1275                                        raw_pattern_size);
1276           raw_pattern = (unsigned char *) alloca (raw_pattern_size_byte + 1);
1277           copy_text (SDATA (string), raw_pattern,
1278                      SCHARS (string), 0, 1);
1279         }
1280       else
1281         {
1282           /* Converting multibyte to single-byte.
1283
1284              ??? Perhaps this conversion should be done in a special way
1285              by subtracting nonascii-insert-offset from each non-ASCII char,
1286              so that only the multibyte chars which really correspond to
1287              the chosen single-byte character set can possibly match.  */
1288           raw_pattern_size = SCHARS (string);
1289           raw_pattern_size_byte = SCHARS (string);
1290           raw_pattern = (unsigned char *) alloca (raw_pattern_size + 1);
1291           copy_text (SDATA (string), raw_pattern,
1292                      SBYTES (string), 1, 0);
1293         }
1294
1295       /* Copy and optionally translate the pattern.  */
1296       len = raw_pattern_size;
1297       len_byte = raw_pattern_size_byte;
1298       patbuf = (unsigned char *) alloca (len * MAX_MULTIBYTE_LENGTH);
1299       pat = patbuf;
1300       base_pat = raw_pattern;
1301       if (multibyte)
1302         {
1303           /* Fill patbuf by translated characters in STRING while
1304              checking if we can use boyer-moore search.  If TRT is
1305              non-nil, we can use boyer-moore search only if TRT can be
1306              represented by the byte array of 256 elements.  For that,
1307              all non-ASCII case-equivalents of all case-senstive
1308              characters in STRING must belong to the same charset and
1309              row.  */
1310
1311           while (--len >= 0)
1312             {
1313               unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
1314               int c, translated, inverse;
1315               int in_charlen, charlen;
1316
1317               /* If we got here and the RE flag is set, it's because we're
1318                  dealing with a regexp known to be trivial, so the backslash
1319                  just quotes the next character.  */
1320               if (RE && *base_pat == '\\')
1321                 {
1322                   len--;
1323                   raw_pattern_size--;
1324                   len_byte--;
1325                   base_pat++;
1326                 }
1327
1328               c = STRING_CHAR_AND_LENGTH (base_pat, in_charlen);
1329
1330               if (NILP (trt))
1331                 {
1332                   str = base_pat;
1333                   charlen = in_charlen;
1334                 }
1335               else
1336                 {
1337                   /* Translate the character.  */
1338                   TRANSLATE (translated, trt, c);
1339                   charlen = CHAR_STRING (translated, str_base);
1340                   str = str_base;
1341
1342                   /* Check if C has any other case-equivalents.  */
1343                   TRANSLATE (inverse, inverse_trt, c);
1344                   /* If so, check if we can use boyer-moore.  */
1345                   if (c != inverse && boyer_moore_ok)
1346                     {
1347                       /* Check if all equivalents belong to the same
1348                          group of characters.  Note that the check of C
1349                          itself is done by the last iteration.  */
1350                       int this_char_base = -1;
1351
1352                       while (boyer_moore_ok)
1353                         {
1354                           if (ASCII_BYTE_P (inverse))
1355                             {
1356                               if (this_char_base > 0)
1357                                 boyer_moore_ok = 0;
1358                               else
1359                                 this_char_base = 0;
1360                             }
1361                           else if (CHAR_BYTE8_P (inverse))
1362                             /* Boyer-moore search can't handle a
1363                                translation of an eight-bit
1364                                character.  */
1365                             boyer_moore_ok = 0;
1366                           else if (this_char_base < 0)
1367                             {
1368                               this_char_base = inverse & ~0x3F;
1369                               if (char_base < 0)
1370                                 char_base = this_char_base;
1371                               else if (this_char_base != char_base)
1372                                 boyer_moore_ok = 0;
1373                             }
1374                           else if ((inverse & ~0x3F) != this_char_base)
1375                             boyer_moore_ok = 0;
1376                           if (c == inverse)
1377                             break;
1378                           TRANSLATE (inverse, inverse_trt, inverse);
1379                         }
1380                     }
1381                 }
1382
1383               /* Store this character into the translated pattern.  */
1384               memcpy (pat, str, charlen);
1385               pat += charlen;
1386               base_pat += in_charlen;
1387               len_byte -= in_charlen;
1388             }
1389
1390           /* If char_base is still negative we didn't find any translated
1391              non-ASCII characters.  */
1392           if (char_base < 0)
1393             char_base = 0;
1394         }
1395       else
1396         {
1397           /* Unibyte buffer.  */
1398           char_base = 0;
1399           while (--len >= 0)
1400             {
1401               int c, translated;
1402
1403               /* If we got here and the RE flag is set, it's because we're
1404                  dealing with a regexp known to be trivial, so the backslash
1405                  just quotes the next character.  */
1406               if (RE && *base_pat == '\\')
1407                 {
1408                   len--;
1409                   raw_pattern_size--;
1410                   base_pat++;
1411                 }
1412               c = *base_pat++;
1413               TRANSLATE (translated, trt, c);
1414               *pat++ = translated;
1415             }
1416         }
1417
1418       len_byte = pat - patbuf;
1419       len = raw_pattern_size;
1420       pat = base_pat = patbuf;
1421
1422       if (boyer_moore_ok)
1423         return boyer_moore (n, pat, len, len_byte, trt, inverse_trt,
1424                             pos, pos_byte, lim, lim_byte,
1425                             char_base);
1426       else
1427         return simple_search (n, pat, len, len_byte, trt,
1428                               pos, pos_byte, lim, lim_byte);
1429     }
1430 }
1431 \f
1432 /* Do a simple string search N times for the string PAT,
1433    whose length is LEN/LEN_BYTE,
1434    from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1435    TRT is the translation table.
1436
1437    Return the character position where the match is found.
1438    Otherwise, if M matches remained to be found, return -M.
1439
1440    This kind of search works regardless of what is in PAT and
1441    regardless of what is in TRT.  It is used in cases where
1442    boyer_moore cannot work.  */
1443
1444 static EMACS_INT
1445 simple_search (EMACS_INT n, unsigned char *pat,
1446                EMACS_INT len, EMACS_INT len_byte, Lisp_Object trt,
1447                EMACS_INT pos, EMACS_INT pos_byte,
1448                EMACS_INT lim, EMACS_INT lim_byte)
1449 {
1450   int multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1451   int forward = n > 0;
1452   /* Number of buffer bytes matched.  Note that this may be different
1453      from len_byte in a multibyte buffer.  */
1454   EMACS_INT match_byte;
1455
1456   if (lim > pos && multibyte)
1457     while (n > 0)
1458       {
1459         while (1)
1460           {
1461             /* Try matching at position POS.  */
1462             EMACS_INT this_pos = pos;
1463             EMACS_INT this_pos_byte = pos_byte;
1464             EMACS_INT this_len = len;
1465             unsigned char *p = pat;
1466             if (pos + len > lim || pos_byte + len_byte > lim_byte)
1467               goto stop;
1468
1469             while (this_len > 0)
1470               {
1471                 int charlen, buf_charlen;
1472                 int pat_ch, buf_ch;
1473
1474                 pat_ch = STRING_CHAR_AND_LENGTH (p, charlen);
1475                 buf_ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (this_pos_byte),
1476                                                  buf_charlen);
1477                 TRANSLATE (buf_ch, trt, buf_ch);
1478
1479                 if (buf_ch != pat_ch)
1480                   break;
1481
1482                 this_len--;
1483                 p += charlen;
1484
1485                 this_pos_byte += buf_charlen;
1486                 this_pos++;
1487               }
1488
1489             if (this_len == 0)
1490               {
1491                 match_byte = this_pos_byte - pos_byte;
1492                 pos += len;
1493                 pos_byte += match_byte;
1494                 break;
1495               }
1496
1497             INC_BOTH (pos, pos_byte);
1498           }
1499
1500         n--;
1501       }
1502   else if (lim > pos)
1503     while (n > 0)
1504       {
1505         while (1)
1506           {
1507             /* Try matching at position POS.  */
1508             EMACS_INT this_pos = pos;
1509             EMACS_INT this_len = len;
1510             unsigned char *p = pat;
1511
1512             if (pos + len > lim)
1513               goto stop;
1514
1515             while (this_len > 0)
1516               {
1517                 int pat_ch = *p++;
1518                 int buf_ch = FETCH_BYTE (this_pos);
1519                 TRANSLATE (buf_ch, trt, buf_ch);
1520
1521                 if (buf_ch != pat_ch)
1522                   break;
1523
1524                 this_len--;
1525                 this_pos++;
1526               }
1527
1528             if (this_len == 0)
1529               {
1530                 match_byte = len;
1531                 pos += len;
1532                 break;
1533               }
1534
1535             pos++;
1536           }
1537
1538         n--;
1539       }
1540   /* Backwards search.  */
1541   else if (lim < pos && multibyte)
1542     while (n < 0)
1543       {
1544         while (1)
1545           {
1546             /* Try matching at position POS.  */
1547             EMACS_INT this_pos = pos;
1548             EMACS_INT this_pos_byte = pos_byte;
1549             EMACS_INT this_len = len;
1550             const unsigned char *p = pat + len_byte;
1551
1552             if (this_pos - len < lim || (pos_byte - len_byte) < lim_byte)
1553               goto stop;
1554
1555             while (this_len > 0)
1556               {
1557                 int charlen;
1558                 int pat_ch, buf_ch;
1559
1560                 DEC_BOTH (this_pos, this_pos_byte);
1561                 PREV_CHAR_BOUNDARY (p, pat);
1562                 pat_ch = STRING_CHAR (p);
1563                 buf_ch = STRING_CHAR (BYTE_POS_ADDR (this_pos_byte));
1564                 TRANSLATE (buf_ch, trt, buf_ch);
1565
1566                 if (buf_ch != pat_ch)
1567                   break;
1568
1569                 this_len--;
1570               }
1571
1572             if (this_len == 0)
1573               {
1574                 match_byte = pos_byte - this_pos_byte;
1575                 pos = this_pos;
1576                 pos_byte = this_pos_byte;
1577                 break;
1578               }
1579
1580             DEC_BOTH (pos, pos_byte);
1581           }
1582
1583         n++;
1584       }
1585   else if (lim < pos)
1586     while (n < 0)
1587       {
1588         while (1)
1589           {
1590             /* Try matching at position POS.  */
1591             EMACS_INT this_pos = pos - len;
1592             EMACS_INT this_len = len;
1593             unsigned char *p = pat;
1594
1595             if (this_pos < lim)
1596               goto stop;
1597
1598             while (this_len > 0)
1599               {
1600                 int pat_ch = *p++;
1601                 int buf_ch = FETCH_BYTE (this_pos);
1602                 TRANSLATE (buf_ch, trt, buf_ch);
1603
1604                 if (buf_ch != pat_ch)
1605                   break;
1606                 this_len--;
1607                 this_pos++;
1608               }
1609
1610             if (this_len == 0)
1611               {
1612                 match_byte = len;
1613                 pos -= len;
1614                 break;
1615               }
1616
1617             pos--;
1618           }
1619
1620         n++;
1621       }
1622
1623  stop:
1624   if (n == 0)
1625     {
1626       if (forward)
1627         set_search_regs ((multibyte ? pos_byte : pos) - match_byte, match_byte);
1628       else
1629         set_search_regs (multibyte ? pos_byte : pos, match_byte);
1630
1631       return pos;
1632     }
1633   else if (n > 0)
1634     return -n;
1635   else
1636     return n;
1637 }
1638 \f
1639 /* Do Boyer-Moore search N times for the string BASE_PAT,
1640    whose length is LEN/LEN_BYTE,
1641    from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
1642    DIRECTION says which direction we search in.
1643    TRT and INVERSE_TRT are translation tables.
1644    Characters in PAT are already translated by TRT.
1645
1646    This kind of search works if all the characters in BASE_PAT that
1647    have nontrivial translation are the same aside from the last byte.
1648    This makes it possible to translate just the last byte of a
1649    character, and do so after just a simple test of the context.
1650    CHAR_BASE is nonzero if there is such a non-ASCII character.
1651
1652    If that criterion is not satisfied, do not call this function.  */
1653
1654 static EMACS_INT
1655 boyer_moore (EMACS_INT n, unsigned char *base_pat,
1656              EMACS_INT len, EMACS_INT len_byte,
1657              Lisp_Object trt, Lisp_Object inverse_trt,
1658              EMACS_INT pos, EMACS_INT pos_byte,
1659              EMACS_INT lim, EMACS_INT lim_byte, int char_base)
1660 {
1661   int direction = ((n > 0) ? 1 : -1);
1662   register EMACS_INT dirlen;
1663   EMACS_INT limit;
1664   int stride_for_teases = 0;
1665   int BM_tab[0400];
1666   register unsigned char *cursor, *p_limit;
1667   register EMACS_INT i;
1668   register int j;
1669   unsigned char *pat, *pat_end;
1670   int multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1671
1672   unsigned char simple_translate[0400];
1673   /* These are set to the preceding bytes of a byte to be translated
1674      if char_base is nonzero.  As the maximum byte length of a
1675      multibyte character is 5, we have to check at most four previous
1676      bytes.  */
1677   int translate_prev_byte1 = 0;
1678   int translate_prev_byte2 = 0;
1679   int translate_prev_byte3 = 0;
1680   int translate_prev_byte4 = 0;
1681
1682   /* The general approach is that we are going to maintain that we know
1683      the first (closest to the present position, in whatever direction
1684      we're searching) character that could possibly be the last
1685      (furthest from present position) character of a valid match.  We
1686      advance the state of our knowledge by looking at that character
1687      and seeing whether it indeed matches the last character of the
1688      pattern.  If it does, we take a closer look.  If it does not, we
1689      move our pointer (to putative last characters) as far as is
1690      logically possible.  This amount of movement, which I call a
1691      stride, will be the length of the pattern if the actual character
1692      appears nowhere in the pattern, otherwise it will be the distance
1693      from the last occurrence of that character to the end of the
1694      pattern.  If the amount is zero we have a possible match.  */
1695
1696   /* Here we make a "mickey mouse" BM table.  The stride of the search
1697      is determined only by the last character of the putative match.
1698      If that character does not match, we will stride the proper
1699      distance to propose a match that superimposes it on the last
1700      instance of a character that matches it (per trt), or misses
1701      it entirely if there is none. */
1702
1703   dirlen = len_byte * direction;
1704
1705   /* Record position after the end of the pattern.  */
1706   pat_end = base_pat + len_byte;
1707   /* BASE_PAT points to a character that we start scanning from.
1708      It is the first character in a forward search,
1709      the last character in a backward search.  */
1710   if (direction < 0)
1711     base_pat = pat_end - 1;
1712
1713   /* A character that does not appear in the pattern induces a
1714      stride equal to the pattern length.  */
1715   for (i = 0; i < 0400; i++)
1716     BM_tab[i] = dirlen;
1717
1718   /* We use this for translation, instead of TRT itself.
1719      We fill this in to handle the characters that actually
1720      occur in the pattern.  Others don't matter anyway!  */
1721   for (i = 0; i < 0400; i++)
1722     simple_translate[i] = i;
1723
1724   if (char_base)
1725     {
1726       /* Setup translate_prev_byte1/2/3/4 from CHAR_BASE.  Only a
1727          byte following them are the target of translation.  */
1728       unsigned char str[MAX_MULTIBYTE_LENGTH];
1729       int len = CHAR_STRING (char_base, str);
1730
1731       translate_prev_byte1 = str[len - 2];
1732       if (len > 2)
1733         {
1734           translate_prev_byte2 = str[len - 3];
1735           if (len > 3)
1736             {
1737               translate_prev_byte3 = str[len - 4];
1738               if (len > 4)
1739                 translate_prev_byte4 = str[len - 5];
1740             }
1741         }
1742     }
1743
1744   i = 0;
1745   while (i != dirlen)
1746     {
1747       unsigned char *ptr = base_pat + i;
1748       i += direction;
1749       if (! NILP (trt))
1750         {
1751           /* If the byte currently looking at is the last of a
1752              character to check case-equivalents, set CH to that
1753              character.  An ASCII character and a non-ASCII character
1754              matching with CHAR_BASE are to be checked.  */
1755           int ch = -1;
1756
1757           if (ASCII_BYTE_P (*ptr) || ! multibyte)
1758             ch = *ptr;
1759           else if (char_base
1760                    && ((pat_end - ptr) == 1 || CHAR_HEAD_P (ptr[1])))
1761             {
1762               unsigned char *charstart = ptr - 1;
1763
1764               while (! (CHAR_HEAD_P (*charstart)))
1765                 charstart--;
1766               ch = STRING_CHAR (charstart);
1767               if (char_base != (ch & ~0x3F))
1768                 ch = -1;
1769             }
1770
1771           if (ch >= 0200)
1772             j = (ch & 0x3F) | 0200;
1773           else
1774             j = *ptr;
1775
1776           if (i == dirlen)
1777             stride_for_teases = BM_tab[j];
1778
1779           BM_tab[j] = dirlen - i;
1780           /* A translation table is accompanied by its inverse -- see */
1781           /* comment following downcase_table for details */
1782           if (ch >= 0)
1783             {
1784               int starting_ch = ch;
1785               int starting_j = j;
1786
1787               while (1)
1788                 {
1789                   TRANSLATE (ch, inverse_trt, ch);
1790                   if (ch >= 0200)
1791                     j = (ch & 0x3F) | 0200;
1792                   else
1793                     j = ch;
1794
1795                   /* For all the characters that map into CH,
1796                      set up simple_translate to map the last byte
1797                      into STARTING_J.  */
1798                   simple_translate[j] = starting_j;
1799                   if (ch == starting_ch)
1800                     break;
1801                   BM_tab[j] = dirlen - i;
1802                 }
1803             }
1804         }
1805       else
1806         {
1807           j = *ptr;
1808
1809           if (i == dirlen)
1810             stride_for_teases = BM_tab[j];
1811           BM_tab[j] = dirlen - i;
1812         }
1813       /* stride_for_teases tells how much to stride if we get a
1814          match on the far character but are subsequently
1815          disappointed, by recording what the stride would have been
1816          for that character if the last character had been
1817          different.  */
1818     }
1819   pos_byte += dirlen - ((direction > 0) ? direction : 0);
1820   /* loop invariant - POS_BYTE points at where last char (first
1821      char if reverse) of pattern would align in a possible match.  */
1822   while (n != 0)
1823     {
1824       EMACS_INT tail_end;
1825       unsigned char *tail_end_ptr;
1826
1827       /* It's been reported that some (broken) compiler thinks that
1828          Boolean expressions in an arithmetic context are unsigned.
1829          Using an explicit ?1:0 prevents this.  */
1830       if ((lim_byte - pos_byte - ((direction > 0) ? 1 : 0)) * direction
1831           < 0)
1832         return (n * (0 - direction));
1833       /* First we do the part we can by pointers (maybe nothing) */
1834       QUIT;
1835       pat = base_pat;
1836       limit = pos_byte - dirlen + direction;
1837       if (direction > 0)
1838         {
1839           limit = BUFFER_CEILING_OF (limit);
1840           /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1841              can take on without hitting edge of buffer or the gap.  */
1842           limit = min (limit, pos_byte + 20000);
1843           limit = min (limit, lim_byte - 1);
1844         }
1845       else
1846         {
1847           limit = BUFFER_FLOOR_OF (limit);
1848           /* LIMIT is now the last (not beyond-last!) value POS_BYTE
1849              can take on without hitting edge of buffer or the gap.  */
1850           limit = max (limit, pos_byte - 20000);
1851           limit = max (limit, lim_byte);
1852         }
1853       tail_end = BUFFER_CEILING_OF (pos_byte) + 1;
1854       tail_end_ptr = BYTE_POS_ADDR (tail_end);
1855
1856       if ((limit - pos_byte) * direction > 20)
1857         {
1858           unsigned char *p2;
1859
1860           p_limit = BYTE_POS_ADDR (limit);
1861           p2 = (cursor = BYTE_POS_ADDR (pos_byte));
1862           /* In this loop, pos + cursor - p2 is the surrogate for pos.  */
1863           while (1)             /* use one cursor setting as long as i can */
1864             {
1865               if (direction > 0) /* worth duplicating */
1866                 {
1867                   while (cursor <= p_limit)
1868                     {
1869                       if (BM_tab[*cursor] == 0)
1870                         goto hit;
1871                       cursor += BM_tab[*cursor];
1872                     }
1873                 }
1874               else
1875                 {
1876                   while (cursor >= p_limit)
1877                     {
1878                       if (BM_tab[*cursor] == 0)
1879                         goto hit;
1880                       cursor += BM_tab[*cursor];
1881                     }
1882                 }
1883               /* If you are here, cursor is beyond the end of the
1884                  searched region.  You fail to match within the
1885                  permitted region and would otherwise try a character
1886                  beyond that region.  */
1887               break;
1888
1889             hit:
1890               i = dirlen - direction;
1891               if (! NILP (trt))
1892                 {
1893                   while ((i -= direction) + direction != 0)
1894                     {
1895                       int ch;
1896                       cursor -= direction;
1897                       /* Translate only the last byte of a character.  */
1898                       if (! multibyte
1899                           || ((cursor == tail_end_ptr
1900                                || CHAR_HEAD_P (cursor[1]))
1901                               && (CHAR_HEAD_P (cursor[0])
1902                                   /* Check if this is the last byte of
1903                                      a translable character.  */
1904                                   || (translate_prev_byte1 == cursor[-1]
1905                                       && (CHAR_HEAD_P (translate_prev_byte1)
1906                                           || (translate_prev_byte2 == cursor[-2]
1907                                               && (CHAR_HEAD_P (translate_prev_byte2)
1908                                                   || (translate_prev_byte3 == cursor[-3]))))))))
1909                         ch = simple_translate[*cursor];
1910                       else
1911                         ch = *cursor;
1912                       if (pat[i] != ch)
1913                         break;
1914                     }
1915                 }
1916               else
1917                 {
1918                   while ((i -= direction) + direction != 0)
1919                     {
1920                       cursor -= direction;
1921                       if (pat[i] != *cursor)
1922                         break;
1923                     }
1924                 }
1925               cursor += dirlen - i - direction; /* fix cursor */
1926               if (i + direction == 0)
1927                 {
1928                   EMACS_INT position, start, end;
1929
1930                   cursor -= direction;
1931
1932                   position = pos_byte + cursor - p2 + ((direction > 0)
1933                                                        ? 1 - len_byte : 0);
1934                   set_search_regs (position, len_byte);
1935
1936                   if (NILP (Vinhibit_changing_match_data))
1937                     {
1938                       start = search_regs.start[0];
1939                       end = search_regs.end[0];
1940                     }
1941                   else
1942                     /* If Vinhibit_changing_match_data is non-nil,
1943                        search_regs will not be changed.  So let's
1944                        compute start and end here.  */
1945                     {
1946                       start = BYTE_TO_CHAR (position);
1947                       end = BYTE_TO_CHAR (position + len_byte);
1948                     }
1949
1950                   if ((n -= direction) != 0)
1951                     cursor += dirlen; /* to resume search */
1952                   else
1953                     return direction > 0 ? end : start;
1954                 }
1955               else
1956                 cursor += stride_for_teases; /* <sigh> we lose -  */
1957             }
1958           pos_byte += cursor - p2;
1959         }
1960       else
1961         /* Now we'll pick up a clump that has to be done the hard
1962            way because it covers a discontinuity.  */
1963         {
1964           limit = ((direction > 0)
1965                    ? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
1966                    : BUFFER_FLOOR_OF (pos_byte - dirlen - 1));
1967           limit = ((direction > 0)
1968                    ? min (limit + len_byte, lim_byte - 1)
1969                    : max (limit - len_byte, lim_byte));
1970           /* LIMIT is now the last value POS_BYTE can have
1971              and still be valid for a possible match.  */
1972           while (1)
1973             {
1974               /* This loop can be coded for space rather than
1975                  speed because it will usually run only once.
1976                  (the reach is at most len + 21, and typically
1977                  does not exceed len).  */
1978               while ((limit - pos_byte) * direction >= 0)
1979                 {
1980                   int ch = FETCH_BYTE (pos_byte);
1981                   if (BM_tab[ch] == 0)
1982                     goto hit2;
1983                   pos_byte += BM_tab[ch];
1984                 }
1985               break;    /* ran off the end */
1986
1987             hit2:
1988               /* Found what might be a match.  */
1989               i = dirlen - direction;
1990               while ((i -= direction) + direction != 0)
1991                 {
1992                   int ch;
1993                   unsigned char *ptr;
1994                   pos_byte -= direction;
1995                   ptr = BYTE_POS_ADDR (pos_byte);
1996                   /* Translate only the last byte of a character.  */
1997                   if (! multibyte
1998                       || ((ptr == tail_end_ptr
1999                            || CHAR_HEAD_P (ptr[1]))
2000                           && (CHAR_HEAD_P (ptr[0])
2001                               /* Check if this is the last byte of a
2002                                  translable character.  */
2003                               || (translate_prev_byte1 == ptr[-1]
2004                                   && (CHAR_HEAD_P (translate_prev_byte1)
2005                                       || (translate_prev_byte2 == ptr[-2]
2006                                           && (CHAR_HEAD_P (translate_prev_byte2)
2007                                               || translate_prev_byte3 == ptr[-3])))))))
2008                     ch = simple_translate[*ptr];
2009                   else
2010                     ch = *ptr;
2011                   if (pat[i] != ch)
2012                     break;
2013                 }
2014               /* Above loop has moved POS_BYTE part or all the way
2015                  back to the first pos (last pos if reverse).
2016                  Set it once again at the last (first if reverse) char.  */
2017               pos_byte += dirlen - i - direction;
2018               if (i + direction == 0)
2019                 {
2020                   EMACS_INT position, start, end;
2021                   pos_byte -= direction;
2022
2023                   position = pos_byte + ((direction > 0) ? 1 - len_byte : 0);
2024                   set_search_regs (position, len_byte);
2025
2026                   if (NILP (Vinhibit_changing_match_data))
2027                     {
2028                       start = search_regs.start[0];
2029                       end = search_regs.end[0];
2030                     }
2031                   else
2032                     /* If Vinhibit_changing_match_data is non-nil,
2033                        search_regs will not be changed.  So let's
2034                        compute start and end here.  */
2035                     {
2036                       start = BYTE_TO_CHAR (position);
2037                       end = BYTE_TO_CHAR (position + len_byte);
2038                     }
2039
2040                   if ((n -= direction) != 0)
2041                     pos_byte += dirlen; /* to resume search */
2042                   else
2043                     return direction > 0 ? end : start;
2044                 }
2045               else
2046                 pos_byte += stride_for_teases;
2047             }
2048           }
2049       /* We have done one clump.  Can we continue? */
2050       if ((lim_byte - pos_byte) * direction < 0)
2051         return ((0 - n) * direction);
2052     }
2053   return BYTE_TO_CHAR (pos_byte);
2054 }
2055
2056 /* Record beginning BEG_BYTE and end BEG_BYTE + NBYTES
2057    for the overall match just found in the current buffer.
2058    Also clear out the match data for registers 1 and up.  */
2059
2060 static void
2061 set_search_regs (EMACS_INT beg_byte, EMACS_INT nbytes)
2062 {
2063   int i;
2064
2065   if (!NILP (Vinhibit_changing_match_data))
2066     return;
2067
2068   /* Make sure we have registers in which to store
2069      the match position.  */
2070   if (search_regs.num_regs == 0)
2071     {
2072       search_regs.start = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2073       search_regs.end = (regoff_t *) xmalloc (2 * sizeof (regoff_t));
2074       search_regs.num_regs = 2;
2075     }
2076
2077   /* Clear out the other registers.  */
2078   for (i = 1; i < search_regs.num_regs; i++)
2079     {
2080       search_regs.start[i] = -1;
2081       search_regs.end[i] = -1;
2082     }
2083
2084   search_regs.start[0] = BYTE_TO_CHAR (beg_byte);
2085   search_regs.end[0] = BYTE_TO_CHAR (beg_byte + nbytes);
2086   XSETBUFFER (last_thing_searched, current_buffer);
2087 }
2088 \f
2089 /* Given STRING, a string of words separated by word delimiters,
2090    compute a regexp that matches those exact words separated by
2091    arbitrary punctuation.  If LAX is nonzero, the end of the string
2092    need not match a word boundary unless it ends in whitespace.  */
2093
2094 static Lisp_Object
2095 wordify (Lisp_Object string, int lax)
2096 {
2097   register unsigned char *p, *o;
2098   register EMACS_INT i, i_byte, len, punct_count = 0, word_count = 0;
2099   Lisp_Object val;
2100   int prev_c = 0;
2101   EMACS_INT adjust;
2102   int whitespace_at_end;
2103
2104   CHECK_STRING (string);
2105   p = SDATA (string);
2106   len = SCHARS (string);
2107
2108   for (i = 0, i_byte = 0; i < len; )
2109     {
2110       int c;
2111
2112       FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2113
2114       if (SYNTAX (c) != Sword)
2115         {
2116           punct_count++;
2117           if (i > 0 && SYNTAX (prev_c) == Sword)
2118             word_count++;
2119         }
2120
2121       prev_c = c;
2122     }
2123
2124   if (SYNTAX (prev_c) == Sword)
2125     {
2126       word_count++;
2127       whitespace_at_end = 0;
2128     }
2129   else
2130     whitespace_at_end = 1;
2131
2132   if (!word_count)
2133     return empty_unibyte_string;
2134
2135   adjust = - punct_count + 5 * (word_count - 1)
2136     + ((lax && !whitespace_at_end) ? 2 : 4);
2137   if (STRING_MULTIBYTE (string))
2138     val = make_uninit_multibyte_string (len + adjust,
2139                                         SBYTES (string)
2140                                         + adjust);
2141   else
2142     val = make_uninit_string (len + adjust);
2143
2144   o = SDATA (val);
2145   *o++ = '\\';
2146   *o++ = 'b';
2147   prev_c = 0;
2148
2149   for (i = 0, i_byte = 0; i < len; )
2150     {
2151       int c;
2152       EMACS_INT i_byte_orig = i_byte;
2153
2154       FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, i, i_byte);
2155
2156       if (SYNTAX (c) == Sword)
2157         {
2158           memcpy (o, SDATA (string) + i_byte_orig, i_byte - i_byte_orig);
2159           o += i_byte - i_byte_orig;
2160         }
2161       else if (i > 0 && SYNTAX (prev_c) == Sword && --word_count)
2162         {
2163           *o++ = '\\';
2164           *o++ = 'W';
2165           *o++ = '\\';
2166           *o++ = 'W';
2167           *o++ = '*';
2168         }
2169
2170       prev_c = c;
2171     }
2172
2173   if (!lax || whitespace_at_end)
2174     {
2175       *o++ = '\\';
2176       *o++ = 'b';
2177     }
2178
2179   return val;
2180 }
2181 \f
2182 DEFUN ("search-backward", Fsearch_backward, Ssearch_backward, 1, 4,
2183        "MSearch backward: ",
2184        doc: /* Search backward from point for STRING.
2185 Set point to the beginning of the occurrence found, and return point.
2186 An optional second argument bounds the search; it is a buffer position.
2187 The match found must not extend before that position.
2188 Optional third argument, if t, means if fail just return nil (no error).
2189  If not nil and not t, position at limit of search and return nil.
2190 Optional fourth argument is repeat count--search for successive occurrences.
2191
2192 Search case-sensitivity is determined by the value of the variable
2193 `case-fold-search', which see.
2194
2195 See also the functions `match-beginning', `match-end' and `replace-match'.  */)
2196   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2197 {
2198   return search_command (string, bound, noerror, count, -1, 0, 0);
2199 }
2200
2201 DEFUN ("search-forward", Fsearch_forward, Ssearch_forward, 1, 4, "MSearch: ",
2202        doc: /* Search forward from point for STRING.
2203 Set point to the end of the occurrence found, and return point.
2204 An optional second argument bounds the search; it is a buffer position.
2205 The match found must not extend after that position.  A value of nil is
2206   equivalent to (point-max).
2207 Optional third argument, if t, means if fail just return nil (no error).
2208   If not nil and not t, move to limit of search and return nil.
2209 Optional fourth argument is repeat count--search for successive occurrences.
2210
2211 Search case-sensitivity is determined by the value of the variable
2212 `case-fold-search', which see.
2213
2214 See also the functions `match-beginning', `match-end' and `replace-match'.  */)
2215   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2216 {
2217   return search_command (string, bound, noerror, count, 1, 0, 0);
2218 }
2219
2220 DEFUN ("word-search-backward", Fword_search_backward, Sword_search_backward, 1, 4,
2221        "sWord search backward: ",
2222        doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2223 Set point to the beginning of the occurrence found, and return point.
2224 An optional second argument bounds the search; it is a buffer position.
2225 The match found must not extend before that position.
2226 Optional third argument, if t, means if fail just return nil (no error).
2227   If not nil and not t, move to limit of search and return nil.
2228 Optional fourth argument is repeat count--search for successive occurrences.  */)
2229   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2230 {
2231   return search_command (wordify (string, 0), bound, noerror, count, -1, 1, 0);
2232 }
2233
2234 DEFUN ("word-search-forward", Fword_search_forward, Sword_search_forward, 1, 4,
2235        "sWord search: ",
2236        doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2237 Set point to the end of the occurrence found, and return point.
2238 An optional second argument bounds the search; it is a buffer position.
2239 The match found must not extend after that position.
2240 Optional third argument, if t, means if fail just return nil (no error).
2241   If not nil and not t, move to limit of search and return nil.
2242 Optional fourth argument is repeat count--search for successive occurrences.  */)
2243   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2244 {
2245   return search_command (wordify (string, 0), bound, noerror, count, 1, 1, 0);
2246 }
2247
2248 DEFUN ("word-search-backward-lax", Fword_search_backward_lax, Sword_search_backward_lax, 1, 4,
2249        "sWord search backward: ",
2250        doc: /* Search backward from point for STRING, ignoring differences in punctuation.
2251 Set point to the beginning of the occurrence found, and return point.
2252
2253 Unlike `word-search-backward', the end of STRING need not match a word
2254 boundary unless it ends in whitespace.
2255
2256 An optional second argument bounds the search; it is a buffer position.
2257 The match found must not extend before that position.
2258 Optional third argument, if t, means if fail just return nil (no error).
2259   If not nil and not t, move to limit of search and return nil.
2260 Optional fourth argument is repeat count--search for successive occurrences.  */)
2261   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2262 {
2263   return search_command (wordify (string, 1), bound, noerror, count, -1, 1, 0);
2264 }
2265
2266 DEFUN ("word-search-forward-lax", Fword_search_forward_lax, Sword_search_forward_lax, 1, 4,
2267        "sWord search: ",
2268        doc: /* Search forward from point for STRING, ignoring differences in punctuation.
2269 Set point to the end of the occurrence found, and return point.
2270
2271 Unlike `word-search-forward', the end of STRING need not match a word
2272 boundary unless it ends in whitespace.
2273
2274 An optional second argument bounds the search; it is a buffer position.
2275 The match found must not extend after that position.
2276 Optional third argument, if t, means if fail just return nil (no error).
2277   If not nil and not t, move to limit of search and return nil.
2278 Optional fourth argument is repeat count--search for successive occurrences.  */)
2279   (Lisp_Object string, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2280 {
2281   return search_command (wordify (string, 1), bound, noerror, count, 1, 1, 0);
2282 }
2283
2284 DEFUN ("re-search-backward", Fre_search_backward, Sre_search_backward, 1, 4,
2285        "sRE search backward: ",
2286        doc: /* Search backward from point for match for regular expression REGEXP.
2287 Set point to the beginning of the match, and return point.
2288 The match found is the one starting last in the buffer
2289 and yet ending before the origin of the search.
2290 An optional second argument bounds the search; it is a buffer position.
2291 The match found must start at or after that position.
2292 Optional third argument, if t, means if fail just return nil (no error).
2293   If not nil and not t, move to limit of search and return nil.
2294 Optional fourth argument is repeat count--search for successive occurrences.
2295 See also the functions `match-beginning', `match-end', `match-string',
2296 and `replace-match'.  */)
2297   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2298 {
2299   return search_command (regexp, bound, noerror, count, -1, 1, 0);
2300 }
2301
2302 DEFUN ("re-search-forward", Fre_search_forward, Sre_search_forward, 1, 4,
2303        "sRE search: ",
2304        doc: /* Search forward from point for regular expression REGEXP.
2305 Set point to the end of the occurrence found, and return point.
2306 An optional second argument bounds the search; it is a buffer position.
2307 The match found must not extend after that position.
2308 Optional third argument, if t, means if fail just return nil (no error).
2309   If not nil and not t, move to limit of search and return nil.
2310 Optional fourth argument is repeat count--search for successive occurrences.
2311 See also the functions `match-beginning', `match-end', `match-string',
2312 and `replace-match'.  */)
2313   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2314 {
2315   return search_command (regexp, bound, noerror, count, 1, 1, 0);
2316 }
2317
2318 DEFUN ("posix-search-backward", Fposix_search_backward, Sposix_search_backward, 1, 4,
2319        "sPosix search backward: ",
2320        doc: /* Search backward from point for match for regular expression REGEXP.
2321 Find the longest match in accord with Posix regular expression rules.
2322 Set point to the beginning of the match, and return point.
2323 The match found is the one starting last in the buffer
2324 and yet ending before the origin of the search.
2325 An optional second argument bounds the search; it is a buffer position.
2326 The match found must start at or after that position.
2327 Optional third argument, if t, means if fail just return nil (no error).
2328   If not nil and not t, move to limit of search and return nil.
2329 Optional fourth argument is repeat count--search for successive occurrences.
2330 See also the functions `match-beginning', `match-end', `match-string',
2331 and `replace-match'.  */)
2332   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2333 {
2334   return search_command (regexp, bound, noerror, count, -1, 1, 1);
2335 }
2336
2337 DEFUN ("posix-search-forward", Fposix_search_forward, Sposix_search_forward, 1, 4,
2338        "sPosix search: ",
2339        doc: /* Search forward from point for regular expression REGEXP.
2340 Find the longest match in accord with Posix regular expression rules.
2341 Set point to the end of the occurrence found, and return point.
2342 An optional second argument bounds the search; it is a buffer position.
2343 The match found must not extend after that position.
2344 Optional third argument, if t, means if fail just return nil (no error).
2345   If not nil and not t, move to limit of search and return nil.
2346 Optional fourth argument is repeat count--search for successive occurrences.
2347 See also the functions `match-beginning', `match-end', `match-string',
2348 and `replace-match'.  */)
2349   (Lisp_Object regexp, Lisp_Object bound, Lisp_Object noerror, Lisp_Object count)
2350 {
2351   return search_command (regexp, bound, noerror, count, 1, 1, 1);
2352 }
2353 \f
2354 DEFUN ("replace-match", Freplace_match, Sreplace_match, 1, 5, 0,
2355        doc: /* Replace text matched by last search with NEWTEXT.
2356 Leave point at the end of the replacement text.
2357
2358 If second arg FIXEDCASE is non-nil, do not alter case of replacement text.
2359 Otherwise maybe capitalize the whole text, or maybe just word initials,
2360 based on the replaced text.
2361 If the replaced text has only capital letters
2362 and has at least one multiletter word, convert NEWTEXT to all caps.
2363 Otherwise if all words are capitalized in the replaced text,
2364 capitalize each word in NEWTEXT.
2365
2366 If third arg LITERAL is non-nil, insert NEWTEXT literally.
2367 Otherwise treat `\\' as special:
2368   `\\&' in NEWTEXT means substitute original matched text.
2369   `\\N' means substitute what matched the Nth `\\(...\\)'.
2370        If Nth parens didn't match, substitute nothing.
2371   `\\\\' means insert one `\\'.
2372 Case conversion does not apply to these substitutions.
2373
2374 FIXEDCASE and LITERAL are optional arguments.
2375
2376 The optional fourth argument STRING can be a string to modify.
2377 This is meaningful when the previous match was done against STRING,
2378 using `string-match'.  When used this way, `replace-match'
2379 creates and returns a new string made by copying STRING and replacing
2380 the part of STRING that was matched.
2381
2382 The optional fifth argument SUBEXP specifies a subexpression;
2383 it says to replace just that subexpression with NEWTEXT,
2384 rather than replacing the entire matched text.
2385 This is, in a vague sense, the inverse of using `\\N' in NEWTEXT;
2386 `\\N' copies subexp N into NEWTEXT, but using N as SUBEXP puts
2387 NEWTEXT in place of subexp N.
2388 This is useful only after a regular expression search or match,
2389 since only regular expressions have distinguished subexpressions.  */)
2390   (Lisp_Object newtext, Lisp_Object fixedcase, Lisp_Object literal, Lisp_Object string, Lisp_Object subexp)
2391 {
2392   enum { nochange, all_caps, cap_initial } case_action;
2393   register EMACS_INT pos, pos_byte;
2394   int some_multiletter_word;
2395   int some_lowercase;
2396   int some_uppercase;
2397   int some_nonuppercase_initial;
2398   register int c, prevc;
2399   int sub;
2400   EMACS_INT opoint, newpoint;
2401
2402   CHECK_STRING (newtext);
2403
2404   if (! NILP (string))
2405     CHECK_STRING (string);
2406
2407   case_action = nochange;       /* We tried an initialization */
2408                                 /* but some C compilers blew it */
2409
2410   if (search_regs.num_regs <= 0)
2411     error ("`replace-match' called before any match found");
2412
2413   if (NILP (subexp))
2414     sub = 0;
2415   else
2416     {
2417       CHECK_NUMBER (subexp);
2418       sub = XINT (subexp);
2419       if (sub < 0 || sub >= search_regs.num_regs)
2420         args_out_of_range (subexp, make_number (search_regs.num_regs));
2421     }
2422
2423   if (NILP (string))
2424     {
2425       if (search_regs.start[sub] < BEGV
2426           || search_regs.start[sub] > search_regs.end[sub]
2427           || search_regs.end[sub] > ZV)
2428         args_out_of_range (make_number (search_regs.start[sub]),
2429                            make_number (search_regs.end[sub]));
2430     }
2431   else
2432     {
2433       if (search_regs.start[sub] < 0
2434           || search_regs.start[sub] > search_regs.end[sub]
2435           || search_regs.end[sub] > SCHARS (string))
2436         args_out_of_range (make_number (search_regs.start[sub]),
2437                            make_number (search_regs.end[sub]));
2438     }
2439
2440   if (NILP (fixedcase))
2441     {
2442       /* Decide how to casify by examining the matched text. */
2443       EMACS_INT last;
2444
2445       pos = search_regs.start[sub];
2446       last = search_regs.end[sub];
2447
2448       if (NILP (string))
2449         pos_byte = CHAR_TO_BYTE (pos);
2450       else
2451         pos_byte = string_char_to_byte (string, pos);
2452
2453       prevc = '\n';
2454       case_action = all_caps;
2455
2456       /* some_multiletter_word is set nonzero if any original word
2457          is more than one letter long. */
2458       some_multiletter_word = 0;
2459       some_lowercase = 0;
2460       some_nonuppercase_initial = 0;
2461       some_uppercase = 0;
2462
2463       while (pos < last)
2464         {
2465           if (NILP (string))
2466             {
2467               c = FETCH_CHAR_AS_MULTIBYTE (pos_byte);
2468               INC_BOTH (pos, pos_byte);
2469             }
2470           else
2471             FETCH_STRING_CHAR_AS_MULTIBYTE_ADVANCE (c, string, pos, pos_byte);
2472
2473           if (LOWERCASEP (c))
2474             {
2475               /* Cannot be all caps if any original char is lower case */
2476
2477               some_lowercase = 1;
2478               if (SYNTAX (prevc) != Sword)
2479                 some_nonuppercase_initial = 1;
2480               else
2481                 some_multiletter_word = 1;
2482             }
2483           else if (UPPERCASEP (c))
2484             {
2485               some_uppercase = 1;
2486               if (SYNTAX (prevc) != Sword)
2487                 ;
2488               else
2489                 some_multiletter_word = 1;
2490             }
2491           else
2492             {
2493               /* If the initial is a caseless word constituent,
2494                  treat that like a lowercase initial.  */
2495               if (SYNTAX (prevc) != Sword)
2496                 some_nonuppercase_initial = 1;
2497             }
2498
2499           prevc = c;
2500         }
2501
2502       /* Convert to all caps if the old text is all caps
2503          and has at least one multiletter word.  */
2504       if (! some_lowercase && some_multiletter_word)
2505         case_action = all_caps;
2506       /* Capitalize each word, if the old text has all capitalized words.  */
2507       else if (!some_nonuppercase_initial && some_multiletter_word)
2508         case_action = cap_initial;
2509       else if (!some_nonuppercase_initial && some_uppercase)
2510         /* Should x -> yz, operating on X, give Yz or YZ?
2511            We'll assume the latter.  */
2512         case_action = all_caps;
2513       else
2514         case_action = nochange;
2515     }
2516
2517   /* Do replacement in a string.  */
2518   if (!NILP (string))
2519     {
2520       Lisp_Object before, after;
2521
2522       before = Fsubstring (string, make_number (0),
2523                            make_number (search_regs.start[sub]));
2524       after = Fsubstring (string, make_number (search_regs.end[sub]), Qnil);
2525
2526       /* Substitute parts of the match into NEWTEXT
2527          if desired.  */
2528       if (NILP (literal))
2529         {
2530           EMACS_INT lastpos = 0;
2531           EMACS_INT lastpos_byte = 0;
2532           /* We build up the substituted string in ACCUM.  */
2533           Lisp_Object accum;
2534           Lisp_Object middle;
2535           int length = SBYTES (newtext);
2536
2537           accum = Qnil;
2538
2539           for (pos_byte = 0, pos = 0; pos_byte < length;)
2540             {
2541               EMACS_INT substart = -1;
2542               EMACS_INT subend = 0;
2543               int delbackslash = 0;
2544
2545               FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2546
2547               if (c == '\\')
2548                 {
2549                   FETCH_STRING_CHAR_ADVANCE (c, newtext, pos, pos_byte);
2550
2551                   if (c == '&')
2552                     {
2553                       substart = search_regs.start[sub];
2554                       subend = search_regs.end[sub];
2555                     }
2556                   else if (c >= '1' && c <= '9')
2557                     {
2558                       if (search_regs.start[c - '0'] >= 0
2559                           && c <= search_regs.num_regs + '0')
2560                         {
2561                           substart = search_regs.start[c - '0'];
2562                           subend = search_regs.end[c - '0'];
2563                         }
2564                       else
2565                         {
2566                           /* If that subexp did not match,
2567                              replace \\N with nothing.  */
2568                           substart = 0;
2569                           subend = 0;
2570                         }
2571                     }
2572                   else if (c == '\\')
2573                     delbackslash = 1;
2574                   else
2575                     error ("Invalid use of `\\' in replacement text");
2576                 }
2577               if (substart >= 0)
2578                 {
2579                   if (pos - 2 != lastpos)
2580                     middle = substring_both (newtext, lastpos,
2581                                              lastpos_byte,
2582                                              pos - 2, pos_byte - 2);
2583                   else
2584                     middle = Qnil;
2585                   accum = concat3 (accum, middle,
2586                                    Fsubstring (string,
2587                                                make_number (substart),
2588                                                make_number (subend)));
2589                   lastpos = pos;
2590                   lastpos_byte = pos_byte;
2591                 }
2592               else if (delbackslash)
2593                 {
2594                   middle = substring_both (newtext, lastpos,
2595                                            lastpos_byte,
2596                                            pos - 1, pos_byte - 1);
2597
2598                   accum = concat2 (accum, middle);
2599                   lastpos = pos;
2600                   lastpos_byte = pos_byte;
2601                 }
2602             }
2603
2604           if (pos != lastpos)
2605             middle = substring_both (newtext, lastpos,
2606                                      lastpos_byte,
2607                                      pos, pos_byte);
2608           else
2609             middle = Qnil;
2610
2611           newtext = concat2 (accum, middle);
2612         }
2613
2614       /* Do case substitution in NEWTEXT if desired.  */
2615       if (case_action == all_caps)
2616         newtext = Fupcase (newtext);
2617       else if (case_action == cap_initial)
2618         newtext = Fupcase_initials (newtext);
2619
2620       return concat3 (before, newtext, after);
2621     }
2622
2623   /* Record point, then move (quietly) to the start of the match.  */
2624   if (PT >= search_regs.end[sub])
2625     opoint = PT - ZV;
2626   else if (PT > search_regs.start[sub])
2627     opoint = search_regs.end[sub] - ZV;
2628   else
2629     opoint = PT;
2630
2631   /* If we want non-literal replacement,
2632      perform substitution on the replacement string.  */
2633   if (NILP (literal))
2634     {
2635       EMACS_INT length = SBYTES (newtext);
2636       unsigned char *substed;
2637       EMACS_INT substed_alloc_size, substed_len;
2638       int buf_multibyte = !NILP (BVAR (current_buffer, enable_multibyte_characters));
2639       int str_multibyte = STRING_MULTIBYTE (newtext);
2640       Lisp_Object rev_tbl;
2641       int really_changed = 0;
2642
2643       rev_tbl = Qnil;
2644
2645       substed_alloc_size = length * 2 + 100;
2646       substed = (unsigned char *) xmalloc (substed_alloc_size + 1);
2647       substed_len = 0;
2648
2649       /* Go thru NEWTEXT, producing the actual text to insert in
2650          SUBSTED while adjusting multibyteness to that of the current
2651          buffer.  */
2652
2653       for (pos_byte = 0, pos = 0; pos_byte < length;)
2654         {
2655           unsigned char str[MAX_MULTIBYTE_LENGTH];
2656           const unsigned char *add_stuff = NULL;
2657           EMACS_INT add_len = 0;
2658           int idx = -1;
2659
2660           if (str_multibyte)
2661             {
2662               FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext, pos, pos_byte);
2663               if (!buf_multibyte)
2664                 c = multibyte_char_to_unibyte (c, rev_tbl);
2665             }
2666           else
2667             {
2668               /* Note that we don't have to increment POS.  */
2669               c = SREF (newtext, pos_byte++);
2670               if (buf_multibyte)
2671                 MAKE_CHAR_MULTIBYTE (c);
2672             }
2673
2674           /* Either set ADD_STUFF and ADD_LEN to the text to put in SUBSTED,
2675              or set IDX to a match index, which means put that part
2676              of the buffer text into SUBSTED.  */
2677
2678           if (c == '\\')
2679             {
2680               really_changed = 1;
2681
2682               if (str_multibyte)
2683                 {
2684                   FETCH_STRING_CHAR_ADVANCE_NO_CHECK (c, newtext,
2685                                                       pos, pos_byte);
2686                   if (!buf_multibyte && !ASCII_CHAR_P (c))
2687                     c = multibyte_char_to_unibyte (c, rev_tbl);
2688                 }
2689               else
2690                 {
2691                   c = SREF (newtext, pos_byte++);
2692                   if (buf_multibyte)
2693                     MAKE_CHAR_MULTIBYTE (c);
2694                 }
2695
2696               if (c == '&')
2697                 idx = sub;
2698               else if (c >= '1' && c <= '9' && c <= search_regs.num_regs + '0')
2699                 {
2700                   if (search_regs.start[c - '0'] >= 1)
2701                     idx = c - '0';
2702                 }
2703               else if (c == '\\')
2704                 add_len = 1, add_stuff = (unsigned char *) "\\";
2705               else
2706                 {
2707                   xfree (substed);
2708                   error ("Invalid use of `\\' in replacement text");
2709                 }
2710             }
2711           else
2712             {
2713               add_len = CHAR_STRING (c, str);
2714               add_stuff = str;
2715             }
2716
2717           /* If we want to copy part of a previous match,
2718              set up ADD_STUFF and ADD_LEN to point to it.  */
2719           if (idx >= 0)
2720             {
2721               EMACS_INT begbyte = CHAR_TO_BYTE (search_regs.start[idx]);
2722               add_len = CHAR_TO_BYTE (search_regs.end[idx]) - begbyte;
2723               if (search_regs.start[idx] < GPT && GPT < search_regs.end[idx])
2724                 move_gap (search_regs.start[idx]);
2725               add_stuff = BYTE_POS_ADDR (begbyte);
2726             }
2727
2728           /* Now the stuff we want to add to SUBSTED
2729              is invariably ADD_LEN bytes starting at ADD_STUFF.  */
2730
2731           /* Make sure SUBSTED is big enough.  */
2732           if (substed_len + add_len >= substed_alloc_size)
2733             {
2734               substed_alloc_size = substed_len + add_len + 500;
2735               substed = (unsigned char *) xrealloc (substed,
2736                                                     substed_alloc_size + 1);
2737             }
2738
2739           /* Now add to the end of SUBSTED.  */
2740           if (add_stuff)
2741             {
2742               memcpy (substed + substed_len, add_stuff, add_len);
2743               substed_len += add_len;
2744             }
2745         }
2746
2747       if (really_changed)
2748         {
2749           if (buf_multibyte)
2750             {
2751               EMACS_INT nchars =
2752                 multibyte_chars_in_text (substed, substed_len);
2753
2754               newtext = make_multibyte_string ((char *) substed, nchars,
2755                                                substed_len);
2756             }
2757           else
2758             newtext = make_unibyte_string ((char *) substed, substed_len);
2759         }
2760       xfree (substed);
2761     }
2762
2763   /* Replace the old text with the new in the cleanest possible way.  */
2764   replace_range (search_regs.start[sub], search_regs.end[sub],
2765                  newtext, 1, 0, 1);
2766   newpoint = search_regs.start[sub] + SCHARS (newtext);
2767
2768   if (case_action == all_caps)
2769     Fupcase_region (make_number (search_regs.start[sub]),
2770                     make_number (newpoint));
2771   else if (case_action == cap_initial)
2772     Fupcase_initials_region (make_number (search_regs.start[sub]),
2773                              make_number (newpoint));
2774
2775   /* Adjust search data for this change.  */
2776   {
2777     EMACS_INT oldend = search_regs.end[sub];
2778     EMACS_INT oldstart = search_regs.start[sub];
2779     EMACS_INT change = newpoint - search_regs.end[sub];
2780     int i;
2781
2782     for (i = 0; i < search_regs.num_regs; i++)
2783       {
2784         if (search_regs.start[i] >= oldend)
2785           search_regs.start[i] += change;
2786         else if (search_regs.start[i] > oldstart)
2787           search_regs.start[i] = oldstart;
2788         if (search_regs.end[i] >= oldend)
2789           search_regs.end[i] += change;
2790         else if (search_regs.end[i] > oldstart)
2791           search_regs.end[i] = oldstart;
2792       }
2793   }
2794
2795   /* Put point back where it was in the text.  */
2796   if (opoint <= 0)
2797     TEMP_SET_PT (opoint + ZV);
2798   else
2799     TEMP_SET_PT (opoint);
2800
2801   /* Now move point "officially" to the start of the inserted replacement.  */
2802   move_if_not_intangible (newpoint);
2803
2804   return Qnil;
2805 }
2806 \f
2807 static Lisp_Object
2808 match_limit (Lisp_Object num, int beginningp)
2809 {
2810   register int n;
2811
2812   CHECK_NUMBER (num);
2813   n = XINT (num);
2814   if (n < 0)
2815     args_out_of_range (num, make_number (0));
2816   if (search_regs.num_regs <= 0)
2817     error ("No match data, because no search succeeded");
2818   if (n >= search_regs.num_regs
2819       || search_regs.start[n] < 0)
2820     return Qnil;
2821   return (make_number ((beginningp) ? search_regs.start[n]
2822                                     : search_regs.end[n]));
2823 }
2824
2825 DEFUN ("match-beginning", Fmatch_beginning, Smatch_beginning, 1, 1, 0,
2826        doc: /* Return position of start of text matched by last search.
2827 SUBEXP, a number, specifies which parenthesized expression in the last
2828   regexp.
2829 Value is nil if SUBEXPth pair didn't match, or there were less than
2830   SUBEXP pairs.
2831 Zero means the entire text matched by the whole regexp or whole string.  */)
2832   (Lisp_Object subexp)
2833 {
2834   return match_limit (subexp, 1);
2835 }
2836
2837 DEFUN ("match-end", Fmatch_end, Smatch_end, 1, 1, 0,
2838        doc: /* Return position of end of text matched by last search.
2839 SUBEXP, a number, specifies which parenthesized expression in the last
2840   regexp.
2841 Value is nil if SUBEXPth pair didn't match, or there were less than
2842   SUBEXP pairs.
2843 Zero means the entire text matched by the whole regexp or whole string.  */)
2844   (Lisp_Object subexp)
2845 {
2846   return match_limit (subexp, 0);
2847 }
2848
2849 DEFUN ("match-data", Fmatch_data, Smatch_data, 0, 3, 0,
2850        doc: /* Return a list containing all info on what the last search matched.
2851 Element 2N is `(match-beginning N)'; element 2N + 1 is `(match-end N)'.
2852 All the elements are markers or nil (nil if the Nth pair didn't match)
2853 if the last match was on a buffer; integers or nil if a string was matched.
2854 Use `set-match-data' to reinstate the data in this list.
2855
2856 If INTEGERS (the optional first argument) is non-nil, always use
2857 integers \(rather than markers) to represent buffer positions.  In
2858 this case, and if the last match was in a buffer, the buffer will get
2859 stored as one additional element at the end of the list.
2860
2861 If REUSE is a list, reuse it as part of the value.  If REUSE is long
2862 enough to hold all the values, and if INTEGERS is non-nil, no consing
2863 is done.
2864
2865 If optional third arg RESEAT is non-nil, any previous markers on the
2866 REUSE list will be modified to point to nowhere.
2867
2868 Return value is undefined if the last search failed.  */)
2869   (Lisp_Object integers, Lisp_Object reuse, Lisp_Object reseat)
2870 {
2871   Lisp_Object tail, prev;
2872   Lisp_Object *data;
2873   int i, len;
2874
2875   if (!NILP (reseat))
2876     for (tail = reuse; CONSP (tail); tail = XCDR (tail))
2877       if (MARKERP (XCAR (tail)))
2878         {
2879           unchain_marker (XMARKER (XCAR (tail)));
2880           XSETCAR (tail, Qnil);
2881         }
2882
2883   if (NILP (last_thing_searched))
2884     return Qnil;
2885
2886   prev = Qnil;
2887
2888   data = (Lisp_Object *) alloca ((2 * search_regs.num_regs + 1)
2889                                  * sizeof (Lisp_Object));
2890
2891   len = 0;
2892   for (i = 0; i < search_regs.num_regs; i++)
2893     {
2894       int start = search_regs.start[i];
2895       if (start >= 0)
2896         {
2897           if (EQ (last_thing_searched, Qt)
2898               || ! NILP (integers))
2899             {
2900               XSETFASTINT (data[2 * i], start);
2901               XSETFASTINT (data[2 * i + 1], search_regs.end[i]);
2902             }
2903           else if (BUFFERP (last_thing_searched))
2904             {
2905               data[2 * i] = Fmake_marker ();
2906               Fset_marker (data[2 * i],
2907                            make_number (start),
2908                            last_thing_searched);
2909               data[2 * i + 1] = Fmake_marker ();
2910               Fset_marker (data[2 * i + 1],
2911                            make_number (search_regs.end[i]),
2912                            last_thing_searched);
2913             }
2914           else
2915             /* last_thing_searched must always be Qt, a buffer, or Qnil.  */
2916             abort ();
2917
2918           len = 2 * i + 2;
2919         }
2920       else
2921         data[2 * i] = data[2 * i + 1] = Qnil;
2922     }
2923
2924   if (BUFFERP (last_thing_searched) && !NILP (integers))
2925     {
2926       data[len] = last_thing_searched;
2927       len++;
2928     }
2929
2930   /* If REUSE is not usable, cons up the values and return them.  */
2931   if (! CONSP (reuse))
2932     return Flist (len, data);
2933
2934   /* If REUSE is a list, store as many value elements as will fit
2935      into the elements of REUSE.  */
2936   for (i = 0, tail = reuse; CONSP (tail);
2937        i++, tail = XCDR (tail))
2938     {
2939       if (i < len)
2940         XSETCAR (tail, data[i]);
2941       else
2942         XSETCAR (tail, Qnil);
2943       prev = tail;
2944     }
2945
2946   /* If we couldn't fit all value elements into REUSE,
2947      cons up the rest of them and add them to the end of REUSE.  */
2948   if (i < len)
2949     XSETCDR (prev, Flist (len - i, data + i));
2950
2951   return reuse;
2952 }
2953
2954 /* We used to have an internal use variant of `reseat' described as:
2955
2956       If RESEAT is `evaporate', put the markers back on the free list
2957       immediately.  No other references to the markers must exist in this
2958       case, so it is used only internally on the unwind stack and
2959       save-match-data from Lisp.
2960
2961    But it was ill-conceived: those supposedly-internal markers get exposed via
2962    the undo-list, so freeing them here is unsafe.  */
2963
2964 DEFUN ("set-match-data", Fset_match_data, Sset_match_data, 1, 2, 0,
2965        doc: /* Set internal data on last search match from elements of LIST.
2966 LIST should have been created by calling `match-data' previously.
2967
2968 If optional arg RESEAT is non-nil, make markers on LIST point nowhere.  */)
2969   (register Lisp_Object list, Lisp_Object reseat)
2970 {
2971   register int i;
2972   register Lisp_Object marker;
2973
2974   if (running_asynch_code)
2975     save_search_regs ();
2976
2977   CHECK_LIST (list);
2978
2979   /* Unless we find a marker with a buffer or an explicit buffer
2980      in LIST, assume that this match data came from a string.  */
2981   last_thing_searched = Qt;
2982
2983   /* Allocate registers if they don't already exist.  */
2984   {
2985     int length = XFASTINT (Flength (list)) / 2;
2986
2987     if (length > search_regs.num_regs)
2988       {
2989         if (search_regs.num_regs == 0)
2990           {
2991             search_regs.start
2992               = (regoff_t *) xmalloc (length * sizeof (regoff_t));
2993             search_regs.end
2994               = (regoff_t *) xmalloc (length * sizeof (regoff_t));
2995           }
2996         else
2997           {
2998             search_regs.start
2999               = (regoff_t *) xrealloc (search_regs.start,
3000                                        length * sizeof (regoff_t));
3001             search_regs.end
3002               = (regoff_t *) xrealloc (search_regs.end,
3003                                        length * sizeof (regoff_t));
3004           }
3005
3006         for (i = search_regs.num_regs; i < length; i++)
3007           search_regs.start[i] = -1;
3008
3009         search_regs.num_regs = length;
3010       }
3011
3012     for (i = 0; CONSP (list); i++)
3013       {
3014         marker = XCAR (list);
3015         if (BUFFERP (marker))
3016           {
3017             last_thing_searched = marker;
3018             break;
3019           }
3020         if (i >= length)
3021           break;
3022         if (NILP (marker))
3023           {
3024             search_regs.start[i] = -1;
3025             list = XCDR (list);
3026           }
3027         else
3028           {
3029             EMACS_INT from;
3030             Lisp_Object m;
3031
3032             m = marker;
3033             if (MARKERP (marker))
3034               {
3035                 if (XMARKER (marker)->buffer == 0)
3036                   XSETFASTINT (marker, 0);
3037                 else
3038                   XSETBUFFER (last_thing_searched, XMARKER (marker)->buffer);
3039               }
3040
3041             CHECK_NUMBER_COERCE_MARKER (marker);
3042             from = XINT (marker);
3043
3044             if (!NILP (reseat) && MARKERP (m))
3045               {
3046                 unchain_marker (XMARKER (m));
3047                 XSETCAR (list, Qnil);
3048               }
3049
3050             if ((list = XCDR (list), !CONSP (list)))
3051               break;
3052
3053             m = marker = XCAR (list);
3054
3055             if (MARKERP (marker) && XMARKER (marker)->buffer == 0)
3056               XSETFASTINT (marker, 0);
3057
3058             CHECK_NUMBER_COERCE_MARKER (marker);
3059             search_regs.start[i] = from;
3060             search_regs.end[i] = XINT (marker);
3061
3062             if (!NILP (reseat) && MARKERP (m))
3063               {
3064                 unchain_marker (XMARKER (m));
3065                 XSETCAR (list, Qnil);
3066               }
3067           }
3068         list = XCDR (list);
3069       }
3070
3071     for (; i < search_regs.num_regs; i++)
3072       search_regs.start[i] = -1;
3073   }
3074
3075   return Qnil;
3076 }
3077
3078 /* If non-zero the match data have been saved in saved_search_regs
3079    during the execution of a sentinel or filter. */
3080 static int search_regs_saved;
3081 static struct re_registers saved_search_regs;
3082 static Lisp_Object saved_last_thing_searched;
3083
3084 /* Called from Flooking_at, Fstring_match, search_buffer, Fstore_match_data
3085    if asynchronous code (filter or sentinel) is running. */
3086 static void
3087 save_search_regs (void)
3088 {
3089   if (!search_regs_saved)
3090     {
3091       saved_search_regs.num_regs = search_regs.num_regs;
3092       saved_search_regs.start = search_regs.start;
3093       saved_search_regs.end = search_regs.end;
3094       saved_last_thing_searched = last_thing_searched;
3095       last_thing_searched = Qnil;
3096       search_regs.num_regs = 0;
3097       search_regs.start = 0;
3098       search_regs.end = 0;
3099
3100       search_regs_saved = 1;
3101     }
3102 }
3103
3104 /* Called upon exit from filters and sentinels. */
3105 void
3106 restore_search_regs (void)
3107 {
3108   if (search_regs_saved)
3109     {
3110       if (search_regs.num_regs > 0)
3111         {
3112           xfree (search_regs.start);
3113           xfree (search_regs.end);
3114         }
3115       search_regs.num_regs = saved_search_regs.num_regs;
3116       search_regs.start = saved_search_regs.start;
3117       search_regs.end = saved_search_regs.end;
3118       last_thing_searched = saved_last_thing_searched;
3119       saved_last_thing_searched = Qnil;
3120       search_regs_saved = 0;
3121     }
3122 }
3123
3124 static Lisp_Object
3125 unwind_set_match_data (Lisp_Object list)
3126 {
3127   /* It is NOT ALWAYS safe to free (evaporate) the markers immediately.  */
3128   return Fset_match_data (list, Qt);
3129 }
3130
3131 /* Called to unwind protect the match data.  */
3132 void
3133 record_unwind_save_match_data (void)
3134 {
3135   record_unwind_protect (unwind_set_match_data,
3136                          Fmatch_data (Qnil, Qnil, Qnil));
3137 }
3138
3139 /* Quote a string to inactivate reg-expr chars */
3140
3141 DEFUN ("regexp-quote", Fregexp_quote, Sregexp_quote, 1, 1, 0,
3142        doc: /* Return a regexp string which matches exactly STRING and nothing else.  */)
3143   (Lisp_Object string)
3144 {
3145   register char *in, *out, *end;
3146   register char *temp;
3147   int backslashes_added = 0;
3148
3149   CHECK_STRING (string);
3150
3151   temp = (char *) alloca (SBYTES (string) * 2);
3152
3153   /* Now copy the data into the new string, inserting escapes. */
3154
3155   in = SSDATA (string);
3156   end = in + SBYTES (string);
3157   out = temp;
3158
3159   for (; in != end; in++)
3160     {
3161       if (*in == '['
3162           || *in == '*' || *in == '.' || *in == '\\'
3163           || *in == '?' || *in == '+'
3164           || *in == '^' || *in == '$')
3165         *out++ = '\\', backslashes_added++;
3166       *out++ = *in;
3167     }
3168
3169   return make_specified_string (temp,
3170                                 SCHARS (string) + backslashes_added,
3171                                 out - temp,
3172                                 STRING_MULTIBYTE (string));
3173 }
3174 \f
3175 void
3176 syms_of_search (void)
3177 {
3178   register int i;
3179
3180   for (i = 0; i < REGEXP_CACHE_SIZE; ++i)
3181     {
3182       searchbufs[i].buf.allocated = 100;
3183       searchbufs[i].buf.buffer = (unsigned char *) xmalloc (100);
3184       searchbufs[i].buf.fastmap = searchbufs[i].fastmap;
3185       searchbufs[i].regexp = Qnil;
3186       searchbufs[i].whitespace_regexp = Qnil;
3187       searchbufs[i].syntax_table = Qnil;
3188       staticpro (&searchbufs[i].regexp);
3189       staticpro (&searchbufs[i].whitespace_regexp);
3190       staticpro (&searchbufs[i].syntax_table);
3191       searchbufs[i].next = (i == REGEXP_CACHE_SIZE-1 ? 0 : &searchbufs[i+1]);
3192     }
3193   searchbuf_head = &searchbufs[0];
3194
3195   Qsearch_failed = intern_c_string ("search-failed");
3196   staticpro (&Qsearch_failed);
3197   Qinvalid_regexp = intern_c_string ("invalid-regexp");
3198   staticpro (&Qinvalid_regexp);
3199
3200   Fput (Qsearch_failed, Qerror_conditions,
3201         pure_cons (Qsearch_failed, pure_cons (Qerror, Qnil)));
3202   Fput (Qsearch_failed, Qerror_message,
3203         make_pure_c_string ("Search failed"));
3204
3205   Fput (Qinvalid_regexp, Qerror_conditions,
3206         pure_cons (Qinvalid_regexp, pure_cons (Qerror, Qnil)));
3207   Fput (Qinvalid_regexp, Qerror_message,
3208         make_pure_c_string ("Invalid regexp"));
3209
3210   last_thing_searched = Qnil;
3211   staticpro (&last_thing_searched);
3212
3213   saved_last_thing_searched = Qnil;
3214   staticpro (&saved_last_thing_searched);
3215
3216   DEFVAR_LISP ("search-spaces-regexp", Vsearch_spaces_regexp,
3217       doc: /* Regexp to substitute for bunches of spaces in regexp search.
3218 Some commands use this for user-specified regexps.
3219 Spaces that occur inside character classes or repetition operators
3220 or other such regexp constructs are not replaced with this.
3221 A value of nil (which is the normal value) means treat spaces literally.  */);
3222   Vsearch_spaces_regexp = Qnil;
3223
3224   DEFVAR_LISP ("inhibit-changing-match-data", Vinhibit_changing_match_data,
3225       doc: /* Internal use only.
3226 If non-nil, the primitive searching and matching functions
3227 such as `looking-at', `string-match', `re-search-forward', etc.,
3228 do not set the match data.  The proper way to use this variable
3229 is to bind it with `let' around a small expression.  */);
3230   Vinhibit_changing_match_data = Qnil;
3231
3232   defsubr (&Slooking_at);
3233   defsubr (&Sposix_looking_at);
3234   defsubr (&Sstring_match);
3235   defsubr (&Sposix_string_match);
3236   defsubr (&Ssearch_forward);
3237   defsubr (&Ssearch_backward);
3238   defsubr (&Sword_search_forward);
3239   defsubr (&Sword_search_backward);
3240   defsubr (&Sword_search_forward_lax);
3241   defsubr (&Sword_search_backward_lax);
3242   defsubr (&Sre_search_forward);
3243   defsubr (&Sre_search_backward);
3244   defsubr (&Sposix_search_forward);
3245   defsubr (&Sposix_search_backward);
3246   defsubr (&Sreplace_match);
3247   defsubr (&Smatch_beginning);
3248   defsubr (&Smatch_end);
3249   defsubr (&Smatch_data);
3250   defsubr (&Sset_match_data);
3251   defsubr (&Sregexp_quote);
3252 }