fix use of RE_W{START,STOP} in wide char version
[nvi.git] / ex / ex_subst.c
blobf77f1e0ae2a509f55525d6cf4043f5810d6f9003
1 /*-
2 * Copyright (c) 1992, 1993, 1994
3 * The Regents of the University of California. All rights reserved.
4 * Copyright (c) 1992, 1993, 1994, 1995, 1996
5 * Keith Bostic. All rights reserved.
7 * See the LICENSE file for redistribution information.
8 */
10 #include "config.h"
12 #ifndef lint
13 static const char sccsid[] = "$Id: ex_subst.c,v 10.46 2000/09/02 13:14:15 skimo Exp $ (Berkeley) $Date: 2000/09/02 13:14:15 $";
14 #endif /* not lint */
16 #include <sys/types.h>
17 #include <sys/queue.h>
18 #include <sys/time.h>
20 #include <bitstring.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
29 #include "../common/common.h"
30 #include "../vi/vi.h"
32 #define SUB_FIRST 0x01 /* The 'r' flag isn't reasonable. */
33 #define SUB_MUSTSETR 0x02 /* The 'r' flag is required. */
35 static int re_conv __P((SCR *, CHAR_T **, size_t *, int *));
36 static int re_cscope_conv __P((SCR *, CHAR_T **, size_t *, int *));
37 static int re_sub __P((SCR *,
38 CHAR_T *, CHAR_T **, size_t *, size_t *, regmatch_t [10]));
39 static int re_tag_conv __P((SCR *, CHAR_T **, size_t *, int *));
40 static int s __P((SCR *, EXCMD *, CHAR_T *, regex_t *, u_int));
43 * ex_s --
44 * [line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
46 * Substitute on lines matching a pattern.
48 * PUBLIC: int ex_s __P((SCR *, EXCMD *));
50 int
51 ex_s(sp, cmdp)
52 SCR *sp;
53 EXCMD *cmdp;
55 regex_t *re;
56 size_t blen, len;
57 u_int flags;
58 int delim;
59 CHAR_T *bp, *p, *ptrn, *rep, *t;
62 * Skip leading white space.
64 * !!!
65 * Historic vi allowed any non-alphanumeric to serve as the
66 * substitution command delimiter.
68 * !!!
69 * If the arguments are empty, it's the same as &, i.e. we
70 * repeat the last substitution.
72 if (cmdp->argc == 0)
73 goto subagain;
74 for (p = cmdp->argv[0]->bp,
75 len = cmdp->argv[0]->len; len > 0; --len, ++p) {
76 if (!isblank(*p))
77 break;
79 if (len == 0)
80 subagain: return (ex_subagain(sp, cmdp));
82 delim = *p++;
83 if (isalnum(delim) || delim == '\\')
84 return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));
87 * !!!
88 * The full-blown substitute command reset the remembered
89 * state of the 'c' and 'g' suffices.
91 sp->c_suffix = sp->g_suffix = 0;
94 * Get the pattern string, toss escaping characters.
96 * !!!
97 * Historic vi accepted any of the following forms:
99 * :s/abc/def/ change "abc" to "def"
100 * :s/abc/def change "abc" to "def"
101 * :s/abc/ delete "abc"
102 * :s/abc delete "abc"
104 * QUOTING NOTE:
106 * Only toss an escaping character if it escapes a delimiter.
107 * This means that "s/A/\\\\f" replaces "A" with "\\f". It
108 * would be nice to be more regular, i.e. for each layer of
109 * escaping a single escaping character is removed, but that's
110 * not how the historic vi worked.
112 for (ptrn = t = p;;) {
113 if (p[0] == '\0' || p[0] == delim) {
114 if (p[0] == delim)
115 ++p;
117 * !!!
118 * Nul terminate the pattern string -- it's passed
119 * to regcomp which doesn't understand anything else.
121 *t = '\0';
122 break;
124 if (p[0] == '\\')
125 if (p[1] == delim)
126 ++p;
127 else if (p[1] == '\\')
128 *t++ = *p++;
129 *t++ = *p++;
133 * If the pattern string is empty, use the last RE (not just the
134 * last substitution RE).
136 if (*ptrn == '\0') {
137 if (sp->re == NULL) {
138 ex_emsg(sp, NULL, EXM_NOPREVRE);
139 return (1);
142 /* Re-compile the RE if necessary. */
143 if (!F_ISSET(sp, SC_RE_SEARCH) &&
144 re_compile(sp, sp->re, sp->re_len,
145 NULL, NULL, &sp->re_c, SEARCH_CSEARCH | SEARCH_MSG))
146 return (1);
147 flags = 0;
148 } else {
150 * !!!
151 * Compile the RE. Historic practice is that substitutes set
152 * the search direction as well as both substitute and search
153 * RE's. We compile the RE twice, as we don't want to bother
154 * ref counting the pattern string and (opaque) structure.
156 if (re_compile(sp, ptrn, t - ptrn, &sp->re,
157 &sp->re_len, &sp->re_c, SEARCH_CSEARCH | SEARCH_MSG))
158 return (1);
159 if (re_compile(sp, ptrn, t - ptrn, &sp->subre,
160 &sp->subre_len, &sp->subre_c, SEARCH_CSUBST | SEARCH_MSG))
161 return (1);
163 flags = SUB_FIRST;
164 sp->searchdir = FORWARD;
166 re = &sp->re_c;
169 * Get the replacement string.
171 * The special character & (\& if O_MAGIC not set) matches the
172 * entire RE. No handling of & is required here, it's done by
173 * re_sub().
175 * The special character ~ (\~ if O_MAGIC not set) inserts the
176 * previous replacement string into this replacement string.
177 * Count ~'s to figure out how much space we need. We could
178 * special case nonexistent last patterns or whether or not
179 * O_MAGIC is set, but it's probably not worth the effort.
181 * QUOTING NOTE:
183 * Only toss an escaping character if it escapes a delimiter or
184 * if O_MAGIC is set and it escapes a tilde.
186 * !!!
187 * If the entire replacement pattern is "%", then use the last
188 * replacement pattern. This semantic was added to vi in System
189 * V and then percolated elsewhere, presumably around the time
190 * that it was added to their version of ed(1).
192 if (p[0] == '\0' || p[0] == delim) {
193 if (p[0] == delim)
194 ++p;
195 if (sp->repl != NULL)
196 free(sp->repl);
197 sp->repl = NULL;
198 sp->repl_len = 0;
199 } else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
200 p += p[1] == delim ? 2 : 1;
201 else {
202 for (rep = p, len = 0;
203 p[0] != '\0' && p[0] != delim; ++p, ++len)
204 if (p[0] == '~')
205 len += sp->repl_len;
206 GET_SPACE_RETW(sp, bp, blen, len);
207 for (t = bp, len = 0, p = rep;;) {
208 if (p[0] == '\0' || p[0] == delim) {
209 if (p[0] == delim)
210 ++p;
211 break;
213 if (p[0] == '\\') {
214 if (p[1] == delim)
215 ++p;
216 else if (p[1] == '\\') {
217 *t++ = *p++;
218 ++len;
219 } else if (p[1] == '~') {
220 ++p;
221 if (!O_ISSET(sp, O_MAGIC))
222 goto tilde;
224 } else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
225 tilde: ++p;
226 MEMCPYW(t, sp->repl, sp->repl_len);
227 t += sp->repl_len;
228 len += sp->repl_len;
229 continue;
231 *t++ = *p++;
232 ++len;
234 if ((sp->repl_len = len) != 0) {
235 if (sp->repl != NULL)
236 free(sp->repl);
237 if ((sp->repl = malloc(len)) == NULL) {
238 msgq(sp, M_SYSERR, NULL);
239 FREE_SPACEW(sp, bp, blen);
240 return (1);
242 MEMCPYW(sp->repl, bp, len);
244 FREE_SPACEW(sp, bp, blen);
246 return (s(sp, cmdp, p, re, flags));
250 * ex_subagain --
251 * [line [,line]] & [cgr] [count] [#lp]]
253 * Substitute using the last substitute RE and replacement pattern.
255 * PUBLIC: int ex_subagain __P((SCR *, EXCMD *));
258 ex_subagain(sp, cmdp)
259 SCR *sp;
260 EXCMD *cmdp;
262 if (sp->subre == NULL) {
263 ex_emsg(sp, NULL, EXM_NOPREVRE);
264 return (1);
266 if (!F_ISSET(sp, SC_RE_SUBST) &&
267 re_compile(sp, sp->subre, sp->subre_len,
268 NULL, NULL, &sp->subre_c, SEARCH_CSUBST | SEARCH_MSG))
269 return (1);
270 return (s(sp,
271 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->subre_c, 0));
275 * ex_subtilde --
276 * [line [,line]] ~ [cgr] [count] [#lp]]
278 * Substitute using the last RE and last substitute replacement pattern.
280 * PUBLIC: int ex_subtilde __P((SCR *, EXCMD *));
283 ex_subtilde(sp, cmdp)
284 SCR *sp;
285 EXCMD *cmdp;
287 if (sp->re == NULL) {
288 ex_emsg(sp, NULL, EXM_NOPREVRE);
289 return (1);
291 if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp, sp->re,
292 sp->re_len, NULL, NULL, &sp->re_c, SEARCH_CSEARCH | SEARCH_MSG))
293 return (1);
294 return (s(sp,
295 cmdp, cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
/*
 * s --
 *	Do the substitution.  This stuff is *really* tricky.  There are lots of
 *	special cases, and general nastiness.  Don't mess with it unless you're
 *	pretty confident.
 *
 * The nasty part of the substitution is what happens when the replacement
 * string contains newlines.  It's a bit tricky -- consider the information
 * that has to be retained for "s/f\(o\)o/^M\1^M\1/".  The solution here is
 * to build a set of newline offsets which we use to break the line up later,
 * when the replacement is done.  Don't change it unless you're *damned*
 * confident.
 *
 * The three helper macros below expand to a brace block, not a
 * do/while(0), because some call sites omit the trailing semicolon.
 * Each one returns 1 from the *enclosing function* when REALLOC leaves
 * the buffer NULL.  BUILD and NEEDSP use the caller's locals lb, lbclen
 * and lblen by name.
 */

/* Grow sp->newl by 25 slots when the newline-offset array is full. */
#define	NEEDNEWLINE(sp) {						\
	if ((sp)->newl_len == (sp)->newl_cnt) {				\
		(sp)->newl_len += 25;					\
		REALLOC((sp), (sp)->newl, size_t *,			\
		    (sp)->newl_len * sizeof(size_t));			\
		if ((sp)->newl == NULL) {				\
			(sp)->newl_len = 0;				\
			return (1);					\
		}							\
	}								\
}

/* Append len characters starting at l to the build buffer lb. */
#define	BUILD(sp, l, len) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAX(lbclen + (len), 256);			\
		REALLOC((sp), lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
	}								\
	MEMCPYW(lb + lbclen, (l), (len));				\
	lbclen += (len);						\
}

/*
 * Make room for len more characters in lb.  pnt is re-pointed at the
 * current end of the buffer only when the buffer was reallocated.
 */
#define	NEEDSP(sp, len, pnt) {						\
	if (lbclen + (len) > lblen) {					\
		lblen += MAX(lbclen + (len), 256);			\
		REALLOC((sp), lb, CHAR_T *, lblen * sizeof(CHAR_T));	\
		if (lb == NULL) {					\
			lbclen = 0;					\
			return (1);					\
		}							\
		(pnt) = lb + lbclen;					\
	}								\
}
/*
 * Worker shared by ex_s(), ex_subagain() and ex_subtilde().
 *
 *	sp	screen; supplies the saved replacement text, the c/g suffix
 *		state and the newline-offset array used by the build.
 *	cmdp	command; addr1/addr2 give the line range and may be rewritten
 *		by a count in the flag string.
 *	s	flag string ("cgr", count, "#lp", "+-"), or NULL for none.
 *		The parameter is reused below as the current line pointer.
 *	re	compiled RE to match with (replaced by sp->re_c on 'r').
 *	flags	SUB_FIRST / SUB_MUSTSETR.
 *
 * Returns 0 on success, 1 on error or when nothing matched outside of a
 * global command.
 */
static int
s(sp, cmdp, s, re, flags)
	SCR *sp;
	EXCMD *cmdp;
	CHAR_T *s;
	regex_t *re;
	u_int flags;
{
	EVENT ev;
	MARK from, to;
	TEXTH tiq;
	db_recno_t elno, lno, slno;
	u_long ul;
	regmatch_t match[10];
	size_t blen, cnt, last, lbclen, lblen, len, llen;
	size_t offset, saved_offset, scno;
	int cflag, lflag, nflag, pflag, rflag;
	int didsub, do_eol_match, eflags, empty_ok, eval;
	int linechanged, matched, quit, rval;
	CHAR_T *p, *lb, *bp;
	enum nresult nret;

	NEEDFILE(sp, cmdp);

	/* Remember where the cursor started, for the final placement. */
	slno = sp->lno;
	scno = sp->cno;

	/*
	 * !!!
	 * Historically, the 'g' and 'c' suffices were always toggled as flags,
	 * so ":s/A/B/" was the same as ":s/A/B/ccgg".  If O_EDCOMPATIBLE was
	 * not set, they were initialized to 0 for all substitute commands.  If
	 * O_EDCOMPATIBLE was set, they were initialized to 0 only if the user
	 * specified substitute/replacement patterns (see ex_s()).
	 */
	if (!O_ISSET(sp, O_EDCOMPATIBLE))
		sp->c_suffix = sp->g_suffix = 0;

	/*
	 * Historic vi permitted the '#', 'l' and 'p' options in vi mode, but
	 * it only displayed the last change.  I'd disallow them, but they are
	 * useful in combination with the [v]global commands.  In the current
	 * model the problem is combining them with the 'c' flag -- the screen
	 * would have to flip back and forth between the confirm screen and the
	 * ex print screen, which would be pretty awful.  We do display all
	 * changes, though, for what that's worth.
	 *
	 * !!!
	 * Historic vi was fairly strict about the order of "options", the
	 * count, and "flags".  I'm somewhat fuzzy on the difference between
	 * options and flags, anyway, so this is a simpler approach, and we
	 * just take it them in whatever order the user gives them.  (The ex
	 * usage statement doesn't reflect this.)
	 */
	cflag = lflag = nflag = pflag = rflag = 0;
	if (s == NULL)
		goto noargs;
	/* lno doubles as "count already seen" while parsing the flags. */
	for (lno = OOBLNO; *s != '\0'; ++s)
		switch (*s) {
		case ' ':
		case '\t':
			continue;
		case '+':
			++cmdp->flagoff;
			break;
		case '-':
			--cmdp->flagoff;
			break;
		case '0': case '1': case '2': case '3': case '4':
		case '5': case '6': case '7': case '8': case '9':
			if (lno != OOBLNO)
				goto usage;
			errno = 0;
			nret = nget_uslong(sp, &ul, s, &s, 10);
			lno = ul;
			if (*s == '\0')		/* Loop increment correction. */
				--s;
			if (nret != NUM_OK) {
				if (nret == NUM_OVER)
					msgq(sp, M_ERR, "153|Count overflow");
				else if (nret == NUM_UNDER)
					msgq(sp, M_ERR, "154|Count underflow");
				else
					msgq(sp, M_SYSERR, NULL);
				return (1);
			}
			/*
			 * In historic vi, the count was inclusive from the
			 * second address.
			 */
			cmdp->addr1.lno = cmdp->addr2.lno;
			cmdp->addr2.lno += lno - 1;
			if (!db_exist(sp, cmdp->addr2.lno) &&
			    db_last(sp, &cmdp->addr2.lno))
				return (1);
			break;
		case '#':
			nflag = 1;
			break;
		case 'c':
			sp->c_suffix = !sp->c_suffix;

			/* Ex text structure initialization. */
			if (F_ISSET(sp, SC_EX)) {
				memset(&tiq, 0, sizeof(TEXTH));
				CIRCLEQ_INIT(&tiq);
			}
			break;
		case 'g':
			sp->g_suffix = !sp->g_suffix;
			break;
		case 'l':
			lflag = 1;
			break;
		case 'p':
			pflag = 1;
			break;
		case 'r':
			if (LF_ISSET(SUB_FIRST)) {
				msgq(sp, M_ERR,
		    "155|Regular expression specified; r flag meaningless");
				return (1);
			}
			if (!F_ISSET(sp, SC_RE_SEARCH)) {
				ex_emsg(sp, NULL, EXM_NOPREVRE);
				return (1);
			}
			rflag = 1;
			re = &sp->re_c;
			break;
		default:
			goto usage;
		}

	if (*s != '\0' || !rflag && LF_ISSET(SUB_MUSTSETR)) {
usage:		ex_emsg(sp, cmdp->cmd->usage, EXM_USAGE);
		return (1);
	}

noargs:	if (F_ISSET(sp, SC_VI) && sp->c_suffix && (lflag || nflag || pflag)) {
		msgq(sp, M_ERR,
"156|The #, l and p flags may not be combined with the c flag in vi mode");
		return (1);
	}

	/*
	 * bp:		if interactive, line cache
	 * blen:	if interactive, line cache length
	 * lb:		build buffer pointer.
	 * lbclen:	current length of built buffer.
	 * lblen;	length of build buffer.
	 */
	bp = lb = NULL;
	blen = lbclen = lblen = 0;

	/* For each line... */
	lno = cmdp->addr1.lno == 0 ? 1 : cmdp->addr1.lno;
	for (matched = quit = 0,
	    elno = cmdp->addr2.lno; !quit && lno <= elno; ++lno) {

		/* Someone's unhappy, time to stop. */
		if (INTERRUPTED(sp))
			break;

		/* Get the line. */
		if (db_get(sp, lno, DBG_FATAL, &s, &llen))
			goto err;

		/*
		 * Make a local copy if doing confirmation -- when calling
		 * the confirm routine we're likely to lose the cached copy.
		 */
		if (sp->c_suffix) {
			if (bp == NULL) {
				GET_SPACE_RETW(sp, bp, blen, llen);
			} else
				ADD_SPACE_RETW(sp, bp, blen, llen);
			MEMCPYW(bp, s, llen);
			s = bp;
		}

		/* Start searching from the beginning. */
		offset = 0;
		len = llen;

		/* Reset the build buffer offset. */
		lbclen = 0;

		/* Reset empty match flag. */
		empty_ok = 1;

		/*
		 * We don't want to have to do a setline if the line didn't
		 * change -- keep track of whether or not this line changed.
		 * If doing confirmations, don't want to keep setting the
		 * line if change is refused -- keep track of substitutions.
		 */
		didsub = linechanged = 0;

		/* New line, do an EOL match. */
		do_eol_match = 1;

		/* It's not nul terminated, but we pretend it is. */
		eflags = REG_STARTEND;

		/*
		 * The search area is from s + offset to the EOL.
		 *
		 * Generally, match[0].rm_so is the offset of the start
		 * of the match from the start of the search, and offset
		 * is the offset of the start of the last search.
		 */
nextmatch:	match[0].rm_so = 0;
		match[0].rm_eo = len;

		/* Get the next match. */
		eval = regexec(re, s + offset, 10, match, eflags);

		/*
		 * There wasn't a match or if there was an error, deal with
		 * it.  If there was a previous match in this line, resolve
		 * the changes into the database.  Otherwise, just move on.
		 */
		if (eval == REG_NOMATCH)
			goto endmatch;
		if (eval != 0) {
			re_error(sp, eval, re);
			goto err;
		}
		matched = 1;

		/* Only the first search can match an anchored expression. */
		eflags |= REG_NOTBOL;

		/*
		 * !!!
		 * It's possible to match 0-length strings -- for example, the
		 * command s;a*;X;, when matched against the string "aabb" will
		 * result in "XbXbX", i.e. the matches are "aa", the space
		 * between the b's and the space between the b's and the end of
		 * the string.  There is a similar space between the beginning
		 * of the string and the a's.  The rule that we use (because vi
		 * historically used it) is that any 0-length match, occurring
		 * immediately after a match, is ignored.  Otherwise, the above
		 * example would have resulted in "XXbXbX".  Another example is
		 * incorrectly using " *" to replace groups of spaces with one
		 * space.
		 *
		 * The way we do this is that if we just had a successful match,
		 * the starting offset does not skip characters, and the match
		 * is empty, ignore the match and move forward.  If there's no
		 * more characters in the string, we were attempting to match
		 * after the last character, so quit.
		 */
		if (!empty_ok && match[0].rm_so == 0 && match[0].rm_eo == 0) {
			empty_ok = 1;
			if (len == 0)
				goto endmatch;
			BUILD(sp, s + offset, 1)
			++offset;
			--len;
			goto nextmatch;
		}

		/* Confirm change. */
		if (sp->c_suffix) {
			/*
			 * Set the cursor position for confirmation.  Note,
			 * if we matched on a '$', the cursor may be past
			 * the end of line.
			 */
			from.lno = to.lno = lno;
			from.cno = match[0].rm_so + offset;
			to.cno = match[0].rm_eo + offset;
			/*
			 * Both ex and vi have to correct for a change before
			 * the first character in the line.
			 */
			if (llen == 0)
				from.cno = to.cno = 0;
			if (F_ISSET(sp, SC_VI)) {
				/*
				 * Only vi has to correct for a change after
				 * the last character in the line.
				 *
				 * XXX
				 * It would be nice to change the vi code so
				 * that we could display a cursor past EOL.
				 *
				 * NOTE(review): llen is a size_t; when llen
				 * is 0 both tests below are true and llen - 1
				 * wraps around -- confirm that the screen
				 * code tolerates that column value.
				 */
				if (to.cno >= llen)
					to.cno = llen - 1;
				if (from.cno >= llen)
					from.cno = llen - 1;

				sp->lno = from.lno;
				sp->cno = from.cno;
				if (vs_refresh(sp, 1))
					goto err;

				vs_update(sp, msg_cat(sp,
				    "169|Confirm change? [n]", NULL), NULL);

				if (v_event_get(sp, &ev, 0, 0))
					goto err;
				switch (ev.e_event) {
				case E_CHARACTER:
					break;
				case E_EOF:
				case E_ERR:
				case E_INTERRUPT:
					goto lquit;
				default:
					v_event_err(sp, &ev);
					goto lquit;
				}
			} else {
				/*
				 * NOTE(review): tiq is initialized only by
				 * the 'c' flag case above.  If c_suffix was
				 * already set on entry (it is only cleared
				 * when O_EDCOMPATIBLE is off) and no 'c'
				 * flag was given, tiq looks uninitialized
				 * here -- confirm.
				 */
				if (ex_print(sp, cmdp, &from, &to, 0) ||
				    ex_scprint(sp, &from, &to))
					goto lquit;
				if (ex_txt(sp, &tiq, 0, TXT_CR))
					goto err;
				ev.e_c = tiq.cqh_first->lb[0];
			}

			switch (ev.e_c) {
			case CH_YES:
				break;
			default:
			case CH_NO:
				/* Refused: copy the text through unchanged. */
				didsub = 0;
				BUILD(sp, s +offset, match[0].rm_eo);
				goto skip;
			case CH_QUIT:
				/* Set the quit/interrupted flags. */
lquit:				quit = 1;
				F_SET(sp->gp, G_INTERRUPTED);

				/*
				 * Resolve any changes, then return to (and
				 * exit from) the main loop.
				 */
				goto endmatch;
			}
		}

		/*
		 * Set the cursor to the last position changed, converting
		 * from 1-based to 0-based.
		 */
		sp->lno = lno;
		sp->cno = match[0].rm_so;

		/* Copy the bytes before the match into the build buffer. */
		BUILD(sp, s + offset, match[0].rm_so);

		/* Substitute the matching bytes. */
		didsub = 1;
		if (re_sub(sp, s + offset, &lb, &lbclen, &lblen, match))
			goto err;

		/* Set the change flag so we know this line was modified. */
		linechanged = 1;

		/* Move past the matched bytes. */
skip:		offset += match[0].rm_eo;
		len -= match[0].rm_eo;

		/* A match cannot be followed by an empty pattern. */
		empty_ok = 0;

		/*
		 * If doing a global change with confirmation, we have to
		 * update the screen.  The basic idea is to store the line
		 * so the screen update routines can find it, and restart.
		 */
		if (didsub && sp->c_suffix && sp->g_suffix) {
			/*
			 * The new search offset will be the end of the
			 * modified line.
			 */
			saved_offset = lbclen;

			/* Copy the rest of the line. */
			if (len)
				BUILD(sp, s + offset, len)

			/* Set the new offset. */
			offset = saved_offset;

			/* Store inserted lines, adjusting the build buffer. */
			last = 0;
			if (sp->newl_cnt) {
				for (cnt = 0;
				    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
					if (db_insert(sp, lno,
					    lb + last, sp->newl[cnt] - last))
						goto err;
					last = sp->newl[cnt] + 1;
					++sp->rptlines[L_ADDED];
				}
				lbclen -= last;
				offset -= last;
				sp->newl_cnt = 0;
			}

			/* Store and retrieve the line. */
			if (db_set(sp, lno, lb + last, lbclen))
				goto err;
			if (db_get(sp, lno, DBG_FATAL, &s, &llen))
				goto err;
			ADD_SPACE_RETW(sp, bp, blen, llen)
			MEMCPYW(bp, s, llen);
			s = bp;
			len = llen - offset;

			/* Restart the build. */
			lbclen = 0;
			BUILD(sp, s, offset);

			/*
			 * If we haven't already done the after-the-string
			 * match, do one.  Set REG_NOTEOL so the '$' pattern
			 * only matches once.
			 */
			if (!do_eol_match)
				goto endmatch;
			if (offset == len) {
				do_eol_match = 0;
				eflags |= REG_NOTEOL;
			}
			goto nextmatch;
		}

		/*
		 * If it's a global:
		 *
		 * If at the end of the string, do a test for the after
		 * the string match.  Set REG_NOTEOL so the '$' pattern
		 * only matches once.
		 */
		if (sp->g_suffix && do_eol_match) {
			if (len == 0) {
				do_eol_match = 0;
				eflags |= REG_NOTEOL;
			}
			goto nextmatch;
		}

endmatch:	if (!linechanged)
			continue;

		/* Copy any remaining bytes into the build buffer. */
		if (len)
			BUILD(sp, s + offset, len)

		/* Store inserted lines, adjusting the build buffer. */
		last = 0;
		if (sp->newl_cnt) {
			for (cnt = 0;
			    cnt < sp->newl_cnt; ++cnt, ++lno, ++elno) {
				if (db_insert(sp,
				    lno, lb + last, sp->newl[cnt] - last))
					goto err;
				last = sp->newl[cnt] + 1;
				++sp->rptlines[L_ADDED];
			}
			lbclen -= last;
			sp->newl_cnt = 0;
		}

		/* Store the changed line. */
		if (db_set(sp, lno, lb + last, lbclen))
			goto err;

		/* Update changed line counter. */
		if (sp->rptlchange != lno) {
			sp->rptlchange = lno;
			++sp->rptlines[L_CHANGED];
		}

		/*
		 * !!!
		 * Display as necessary.  Historic practice is to only
		 * display the last line of a line split into multiple
		 * lines.
		 */
		if (lflag || nflag || pflag) {
			from.lno = to.lno = lno;
			from.cno = to.cno = 0;
			if (lflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_LIST);
			if (nflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_HASH);
			if (pflag)
				(void)ex_print(sp, cmdp, &from, &to, E_C_PRINT);
		}
	}

	/*
	 * !!!
	 * Historically, vi attempted to leave the cursor at the same place if
	 * the substitution was done at the current cursor position.  Otherwise
	 * it moved it to the first non-blank of the last line changed.  There
	 * were some problems: for example, :s/$/foo/ with the cursor on the
	 * last character of the line left the cursor on the last character, or
	 * the & command with multiple occurrences of the matching string in the
	 * line usually left the cursor in a fairly random position.
	 *
	 * We try to do the same thing, with the exception that if the user is
	 * doing substitution with confirmation, we move to the last line about
	 * which the user was consulted, as opposed to the last line that they
	 * actually changed.  This prevents a screen flash if the user doesn't
	 * change many of the possible lines.
	 */
	if (!sp->c_suffix && (sp->lno != slno || sp->cno != scno)) {
		sp->cno = 0;
		(void)nonblank(sp, sp->lno, &sp->cno);
	}

	/*
	 * If not in a global command, and nothing matched, say so.
	 * Else, if none of the lines displayed, put something up.
	 */
	rval = 0;
	if (!matched) {
		if (!F_ISSET(sp, SC_EX_GLOBAL)) {
			msgq(sp, M_ERR, "157|No match found");
			goto err;
		}
	} else if (!lflag && !nflag && !pflag)
		F_SET(cmdp, E_AUTOPRINT);

	/* The err label sits in a dead branch so success falls past it. */
	if (0) {
err:		rval = 1;
	}

	/* Common cleanup for the success and error paths. */
	if (bp != NULL)
		FREE_SPACEW(sp, bp, blen);
	if (lb != NULL)
		free(lb);
	return (rval);
}
892 * re_compile --
893 * Compile the RE.
895 * PUBLIC: int re_compile __P((SCR *,
896 * PUBLIC: CHAR_T *, size_t, CHAR_T **, size_t *, regex_t *, u_int));
899 re_compile(sp, ptrn, plen, ptrnp, lenp, rep, flags)
900 SCR *sp;
901 CHAR_T *ptrn, **ptrnp;
902 size_t plen, *lenp;
903 regex_t *rep;
904 u_int flags;
906 size_t len;
907 int reflags, replaced, rval;
908 CHAR_T *p;
910 /* Set RE flags. */
911 reflags = 0;
912 if (LF_ISSET(SEARCH_EXTEND))
913 reflags |= REG_EXTENDED;
914 if (LF_ISSET(SEARCH_IC))
915 reflags |= REG_ICASE;
916 if (LF_ISSET(SEARCH_LITERAL))
917 reflags |= REG_NOSPEC;
918 if (!LF_ISSET(SEARCH_NOOPT | SEARCH_CSCOPE | SEARCH_TAG)) {
919 if (O_ISSET(sp, O_EXTENDED))
920 reflags |= REG_EXTENDED;
921 if (O_ISSET(sp, O_IGNORECASE))
922 reflags |= REG_ICASE;
923 if (O_ISSET(sp, O_ICLOWER))
924 goto iclower;
926 if (LF_ISSET(SEARCH_ICL)) {
927 iclower: for (p = ptrn, len = plen; len > 0; ++p, --len)
928 if (isupper(*p))
929 break;
930 if (len == 0)
931 reflags |= REG_ICASE;
934 /* If we're replacing a saved value, clear the old one. */
935 if (LF_ISSET(SEARCH_CSEARCH) && F_ISSET(sp, SC_RE_SEARCH)) {
936 regfree(&sp->re_c);
937 F_CLR(sp, SC_RE_SEARCH);
939 if (LF_ISSET(SEARCH_CSUBST) && F_ISSET(sp, SC_RE_SUBST)) {
940 regfree(&sp->subre_c);
941 F_CLR(sp, SC_RE_SUBST);
945 * If we're saving the string, it's a pattern we haven't seen before,
946 * so convert the vi-style RE's to POSIX 1003.2 RE's. Save a copy for
947 * later recompilation. Free any previously saved value.
949 if (ptrnp != NULL) {
950 replaced = 0;
951 if (LF_ISSET(SEARCH_CSCOPE)) {
952 if (re_cscope_conv(sp, &ptrn, &plen, &replaced))
953 return (1);
955 * XXX
956 * Currently, the match-any-<blank> expression used in
957 * re_cscope_conv() requires extended RE's. This may
958 * not be right or safe.
960 reflags |= REG_EXTENDED;
961 } else if (LF_ISSET(SEARCH_TAG)) {
962 if (re_tag_conv(sp, &ptrn, &plen, &replaced))
963 return (1);
964 } else if (!LF_ISSET(SEARCH_LITERAL))
965 if (re_conv(sp, &ptrn, &plen, &replaced))
966 return (1);
968 /* Discard previous pattern. */
969 if (*ptrnp != NULL) {
970 free(*ptrnp);
971 *ptrnp = NULL;
973 if (lenp != NULL)
974 *lenp = plen;
977 * Copy the string into allocated memory.
979 * XXX
980 * Regcomp isn't 8-bit clean, so the pattern is nul-terminated
981 * for now. There's just no other solution.
983 MALLOC(sp, *ptrnp, CHAR_T *, (plen + 1) * sizeof(CHAR_T));
984 if (*ptrnp != NULL) {
985 MEMCPYW(*ptrnp, ptrn, plen);
986 (*ptrnp)[plen] = '\0';
989 /* Free up conversion-routine-allocated memory. */
990 if (replaced)
991 FREE_SPACEW(sp, ptrn, 0);
993 if (*ptrnp == NULL)
994 return (1);
996 ptrn = *ptrnp;
1000 * XXX
1001 * Regcomp isn't 8-bit clean, so we just lost if the pattern
1002 * contained a nul. Bummer!
1004 if ((rval = regcomp(rep, ptrn, /* plen, */ reflags)) != 0) {
1005 if (LF_ISSET(SEARCH_MSG))
1006 re_error(sp, rval, rep);
1007 return (1);
1010 if (LF_ISSET(SEARCH_CSEARCH))
1011 F_SET(sp, SC_RE_SEARCH);
1012 if (LF_ISSET(SEARCH_CSUBST))
1013 F_SET(sp, SC_RE_SUBST);
1015 return (0);
1019 * re_conv --
1020 * Convert vi's regular expressions into something that the
1021 * the POSIX 1003.2 RE functions can handle.
1023 * There are three conversions we make to make vi's RE's (specifically
1024 * the global, search, and substitute patterns) work with POSIX RE's.
1026 * 1: If O_MAGIC is not set, strip backslashes from the magic character
1027 * set (.[*~) that have them, and add them to the ones that don't.
1028 * 2: If O_MAGIC is not set, the string "\~" is replaced with the text
1029 * from the last substitute command's replacement string. If O_MAGIC
1030 * is set, it's the string "~".
1031 * 3: The pattern \<ptrn\> does "word" searches, convert it to use the
1032 * new RE escapes.
1034 * !!!/XXX
1035 * This doesn't exactly match the historic behavior of vi because we do
1036 * the ~ substitution before calling the RE engine, so magic characters
1037 * in the replacement string will be expanded by the RE engine, and they
1038 * weren't historically. It's a bug.
1040 static int
1041 re_conv(sp, ptrnp, plenp, replacedp)
1042 SCR *sp;
1043 CHAR_T **ptrnp;
1044 size_t *plenp;
1045 int *replacedp;
1047 size_t blen, len, needlen;
1048 int magic;
1049 CHAR_T *bp, *p, *t;
1052 * First pass through, we figure out how much space we'll need.
1053 * We do it in two passes, on the grounds that most of the time
1054 * the user is doing a search and won't have magic characters.
1055 * That way we can skip most of the memory allocation and copies.
1057 magic = 0;
1058 for (p = *ptrnp, len = *plenp, needlen = 0; len > 0; ++p, --len)
1059 switch (*p) {
1060 case '\\':
1061 if (len > 1) {
1062 --len;
1063 switch (*++p) {
1064 case '<':
1065 magic = 1;
1066 needlen += RE_WSTART_LEN;
1067 break;
1068 case '>':
1069 magic = 1;
1070 needlen += RE_WSTOP_LEN;
1071 break;
1072 case '~':
1073 if (!O_ISSET(sp, O_MAGIC)) {
1074 magic = 1;
1075 needlen += sp->repl_len;
1077 break;
1078 case '.':
1079 case '[':
1080 case '*':
1081 if (!O_ISSET(sp, O_MAGIC)) {
1082 magic = 1;
1083 needlen += 1;
1085 break;
1086 default:
1087 needlen += 2;
1089 } else
1090 needlen += 1;
1091 break;
1092 case '~':
1093 if (O_ISSET(sp, O_MAGIC)) {
1094 magic = 1;
1095 needlen += sp->repl_len;
1097 break;
1098 case '.':
1099 case '[':
1100 case '*':
1101 if (!O_ISSET(sp, O_MAGIC)) {
1102 magic = 1;
1103 needlen += 2;
1105 break;
1106 default:
1107 needlen += 1;
1108 break;
1111 if (!magic) {
1112 *replacedp = 0;
1113 return (0);
1116 /* Get enough memory to hold the final pattern. */
1117 *replacedp = 1;
1118 GET_SPACE_RETW(sp, bp, blen, needlen);
1120 for (p = *ptrnp, len = *plenp, t = bp; len > 0; ++p, --len)
1121 switch (*p) {
1122 case '\\':
1123 if (len > 1) {
1124 --len;
1125 switch (*++p) {
1126 case '<':
1127 MEMCPY(t,
1128 RE_WSTART, RE_WSTART_LEN - 1);
1129 t += RE_WSTART_LEN - 1;
1130 break;
1131 case '>':
1132 MEMCPY(t,
1133 RE_WSTOP, RE_WSTOP_LEN - 1);
1134 t += RE_WSTOP_LEN - 1;
1135 break;
1136 case '~':
1137 if (O_ISSET(sp, O_MAGIC))
1138 *t++ = '~';
1139 else {
1140 MEMCPYW(t,
1141 sp->repl, sp->repl_len);
1142 t += sp->repl_len;
1144 break;
1145 case '.':
1146 case '[':
1147 case '*':
1148 if (O_ISSET(sp, O_MAGIC))
1149 *t++ = '\\';
1150 *t++ = *p;
1151 break;
1152 default:
1153 *t++ = '\\';
1154 *t++ = *p;
1156 } else
1157 *t++ = '\\';
1158 break;
1159 case '~':
1160 if (O_ISSET(sp, O_MAGIC)) {
1161 MEMCPYW(t, sp->repl, sp->repl_len);
1162 t += sp->repl_len;
1163 } else
1164 *t++ = '~';
1165 break;
1166 case '.':
1167 case '[':
1168 case '*':
1169 if (!O_ISSET(sp, O_MAGIC))
1170 *t++ = '\\';
1171 *t++ = *p;
1172 break;
1173 default:
1174 *t++ = *p;
1175 break;
1178 *ptrnp = bp;
1179 *plenp = t - bp;
1180 return (0);
1184 * re_tag_conv --
1185 * Convert a tags search path into something that the POSIX
1186 * 1003.2 RE functions can handle.
1188 static int
1189 re_tag_conv(sp, ptrnp, plenp, replacedp)
1190 SCR *sp;
1191 CHAR_T **ptrnp;
1192 size_t *plenp;
1193 int *replacedp;
1195 size_t blen, len;
1196 int lastdollar;
1197 CHAR_T *bp, *p, *t;
1199 len = *plenp;
1201 /* Max memory usage is 2 times the length of the string. */
1202 *replacedp = 1;
1203 GET_SPACE_RETW(sp, bp, blen, len * 2);
1205 p = *ptrnp;
1206 t = bp;
1208 /* If the last character is a '/' or '?', we just strip it. */
1209 if (len > 0 && (p[len - 1] == '/' || p[len - 1] == '?'))
1210 --len;
1212 /* If the next-to-last or last character is a '$', it's magic. */
1213 if (len > 0 && p[len - 1] == '$') {
1214 --len;
1215 lastdollar = 1;
1216 } else
1217 lastdollar = 0;
1219 /* If the first character is a '/' or '?', we just strip it. */
1220 if (len > 0 && (p[0] == '/' || p[0] == '?')) {
1221 ++p;
1222 --len;
1225 /* If the first or second character is a '^', it's magic. */
1226 if (p[0] == '^') {
1227 *t++ = *p++;
1228 --len;
1232 * Escape every other magic character we can find, meanwhile stripping
1233 * the backslashes ctags inserts when escaping the search delimiter
1234 * characters.
1236 for (; len > 0; --len) {
1237 if (p[0] == '\\' && (p[1] == '/' || p[1] == '?')) {
1238 ++p;
1239 --len;
1240 } else if (strchr("^.[]$*", p[0]))
1241 *t++ = '\\';
1242 *t++ = *p++;
1244 if (lastdollar)
1245 *t++ = '$';
1247 *ptrnp = bp;
1248 *plenp = t - bp;
1249 return (0);
1253 * re_cscope_conv --
1254 * Convert a cscope search path into something that the POSIX
1255 * 1003.2 RE functions can handle.
1257 static int
1258 re_cscope_conv(sp, ptrnp, plenp, replacedp)
1259 SCR *sp;
1260 CHAR_T **ptrnp;
1261 size_t *plenp;
1262 int *replacedp;
1264 size_t blen, len, nspaces;
1265 CHAR_T *bp, *t;
1266 CHAR_T *p;
1267 CHAR_T *wp;
1268 size_t wlen;
1271 * Each space in the source line printed by cscope represents an
1272 * arbitrary sequence of spaces, tabs, and comments.
1274 #define CSCOPE_RE_SPACE "([ \t]|/\\*([^*]|\\*/)*\\*/)*"
1275 #define CSCOPE_LEN sizeof(CSCOPE_RE_SPACE) - 1
1276 CHAR2INT(sp, CSCOPE_RE_SPACE, CSCOPE_LEN, wp, wlen);
1277 for (nspaces = 0, p = *ptrnp, len = *plenp; len > 0; ++p, --len)
1278 if (*p == ' ')
1279 ++nspaces;
1282 * Allocate plenty of space:
1283 * the string, plus potential escaping characters;
1284 * nspaces + 2 copies of CSCOPE_RE_SPACE;
1285 * ^, $, nul terminator characters.
1287 *replacedp = 1;
1288 len = (p - *ptrnp) * 2 + (nspaces + 2) * sizeof(CSCOPE_RE_SPACE) + 3;
1289 GET_SPACE_RETW(sp, bp, blen, len);
1291 p = *ptrnp;
1292 t = bp;
1294 *t++ = '^';
1295 MEMCPYW(t, wp, wlen);
1296 t += wlen;
1298 for (len = *plenp; len > 0; ++p, --len)
1299 if (*p == ' ') {
1300 MEMCPYW(t, wp, wlen);
1301 t += wlen;
1302 } else {
1303 if (strchr("\\^.[]$*+?()|{}", *p))
1304 *t++ = '\\';
1305 *t++ = *p;
1308 MEMCPYW(t, wp, wlen);
1309 t += wlen;
1310 *t++ = '$';
1312 *ptrnp = bp;
1313 *plenp = t - bp;
1314 return (0);
1318 * re_error --
1319 * Report a regular expression error.
1321 * PUBLIC: void re_error __P((SCR *, int, regex_t *));
1323 void
1324 re_error(sp, errcode, preg)
1325 SCR *sp;
1326 int errcode;
1327 regex_t *preg;
1329 size_t s;
1330 char *oe;
1332 s = regerror(errcode, preg, "", 0);
1333 if ((oe = malloc(s)) == NULL)
1334 msgq(sp, M_SYSERR, NULL);
1335 else {
1336 (void)regerror(errcode, preg, oe, s);
1337 msgq(sp, M_ERR, "RE error: %s", oe);
1338 free(oe);
1343 * re_sub --
1344 * Do the substitution for a regular expression.
1346 static int
1347 re_sub(sp, ip, lbp, lbclenp, lblenp, match)
1348 SCR *sp;
1349 CHAR_T *ip; /* Input line. */
1350 CHAR_T **lbp;
1351 size_t *lbclenp, *lblenp;
1352 regmatch_t match[10];
1354 enum { C_NOTSET, C_LOWER, C_ONELOWER, C_ONEUPPER, C_UPPER } conv;
1355 size_t lbclen, lblen; /* Local copies. */
1356 size_t mlen; /* Match length. */
1357 size_t rpl; /* Remaining replacement length. */
1358 CHAR_T *rp; /* Replacement pointer. */
1359 int ch;
1360 int no; /* Match replacement offset. */
1361 CHAR_T *p, *t; /* Buffer pointers. */
1362 CHAR_T *lb; /* Local copies. */
1364 lb = *lbp; /* Get local copies. */
1365 lbclen = *lbclenp;
1366 lblen = *lblenp;
1369 * QUOTING NOTE:
1371 * There are some special sequences that vi provides in the
1372 * replacement patterns.
1373 * & string the RE matched (\& if nomagic set)
1374 * \# n-th regular subexpression
1375 * \E end \U, \L conversion
1376 * \e end \U, \L conversion
1377 * \l convert the next character to lower-case
1378 * \L convert to lower-case, until \E, \e, or end of replacement
1379 * \u convert the next character to upper-case
1380 * \U convert to upper-case, until \E, \e, or end of replacement
1382 * Otherwise, since this is the lowest level of replacement, discard
1383 * all escaping characters. This (hopefully) matches historic practice.
1385 #define OUTCH(ch, nltrans) { \
1386 CHAR_T __ch = (ch); \
1387 u_int __value = KEY_VAL(sp, __ch); \
1388 if (nltrans && (__value == K_CR || __value == K_NL)) { \
1389 NEEDNEWLINE(sp); \
1390 sp->newl[sp->newl_cnt++] = lbclen; \
1391 } else if (conv != C_NOTSET) { \
1392 switch (conv) { \
1393 case C_ONELOWER: \
1394 conv = C_NOTSET; \
1395 /* FALLTHROUGH */ \
1396 case C_LOWER: \
1397 if (isupper(__ch)) \
1398 __ch = tolower(__ch); \
1399 break; \
1400 case C_ONEUPPER: \
1401 conv = C_NOTSET; \
1402 /* FALLTHROUGH */ \
1403 case C_UPPER: \
1404 if (islower(__ch)) \
1405 __ch = toupper(__ch); \
1406 break; \
1407 default: \
1408 abort(); \
1411 NEEDSP(sp, 1, p); \
1412 *p++ = __ch; \
1413 ++lbclen; \
1415 conv = C_NOTSET;
1416 for (rp = sp->repl, rpl = sp->repl_len, p = lb + lbclen; rpl--;) {
1417 switch (ch = *rp++) {
1418 case '&':
1419 if (O_ISSET(sp, O_MAGIC)) {
1420 no = 0;
1421 goto subzero;
1423 break;
1424 case '\\':
1425 if (rpl == 0)
1426 break;
1427 --rpl;
1428 switch (ch = *rp) {
1429 case '&':
1430 ++rp;
1431 if (!O_ISSET(sp, O_MAGIC)) {
1432 no = 0;
1433 goto subzero;
1435 break;
1436 case '0': case '1': case '2': case '3': case '4':
1437 case '5': case '6': case '7': case '8': case '9':
1438 no = *rp++ - '0';
1439 subzero: if (match[no].rm_so == -1 ||
1440 match[no].rm_eo == -1)
1441 break;
1442 mlen = match[no].rm_eo - match[no].rm_so;
1443 for (t = ip + match[no].rm_so; mlen--; ++t)
1444 OUTCH(*t, 0);
1445 continue;
1446 case 'e':
1447 case 'E':
1448 ++rp;
1449 conv = C_NOTSET;
1450 continue;
1451 case 'l':
1452 ++rp;
1453 conv = C_ONELOWER;
1454 continue;
1455 case 'L':
1456 ++rp;
1457 conv = C_LOWER;
1458 continue;
1459 case 'u':
1460 ++rp;
1461 conv = C_ONEUPPER;
1462 continue;
1463 case 'U':
1464 ++rp;
1465 conv = C_UPPER;
1466 continue;
1467 default:
1468 ++rp;
1469 break;
1472 OUTCH(ch, 1);
1475 *lbp = lb; /* Update caller's information. */
1476 *lbclenp = lbclen;
1477 *lblenp = lblen;
1478 return (0);