1 static const char CVSID
[] = "$Id: regexConvert.c,v 1.5 2001/08/25 15:58:54 amai Exp $";
2 /*------------------------------------------------------------------------*
3 * `CompileRE', `ExecRE', and `ConvertSubstituteRE' -- regular expression parsing
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp'
6 * code adapted for NEdit.
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
25 * | 3. Altered versions must be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
35 #include <X11/Intrinsic.h>
37 #include "regexConvert.h"
40 /* Utility definitions. */
44 #define CONVERT_FAIL(m) {*Error_Ptr = (m); return 0;}
45 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
46 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
48 /* Flags to be passed up and down via function parameters during compile. */
50 #define WORST 0 /* Worst case. No assumptions can be made.*/
51 #define HAS_WIDTH 1 /* Known never to match null string. */
52 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
54 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
55 #define PAREN 1 /* Used for normal capturing parentheses. */
60 /* Global work variables for `ConvertRE'. */
62 static unsigned char *Reg_Parse
; /* Input scan ptr (scans user's regex) */
63 static int Total_Paren
; /* Parentheses, (), counter. */
64 static unsigned long Convert_Size
; /* Address of this used as flag. */
65 static unsigned char *Code_Emit_Ptr
; /* When Code_Emit_Ptr is set to
66 &Compute_Size no code is emitted.
67 Instead, the size of code that WOULD
68 have been generated is accumulated in
69 Convert_Size. Otherwise,
70 Code_Emit_Ptr points to where compiled
71 regex code is to be written. */
72 static unsigned char Compute_Size
;
73 static char **Error_Ptr
; /* Place to store error messages so
74 they can be returned by `ConvertRE' */
75 static char Error_Text
[128];/* Sting to build error messages in. */
77 static unsigned char Meta_Char
[] = ".*+?[(|)^<>$";
79 static unsigned char *Convert_Str
;
81 /* Forward declarations for functions used by `ConvertRE'. */
83 static int alternative (int *flag_param
);
84 static int chunk (int paren
, int *flag_param
);
85 static void emit_convert_byte (unsigned char c
);
86 static unsigned char literal_escape (unsigned char c
, int);
87 static int atom (int *flag_param
);
88 static void reg_error (char *str
);
89 static int piece (int *flag_param
);
91 /*----------------------------------------------------------------------*
94 * Compiles a regular expression into the internal format used by
97 * Beware that the optimization and preparation code in here knows about
98 * some of the structure of the compiled regexp.
99 *----------------------------------------------------------------------*/
101 char * ConvertRE (const char *exp
, char **errorText
, char *cap_parens
) {
103 int flags_local
, pass
;
105 /* Set up `errorText' to receive failure reports. */
107 Error_Ptr
= errorText
;
110 if (exp
== NULL
) CONVERT_FAIL ("NULL argument to `ConvertRE\'");
112 Code_Emit_Ptr
= &Compute_Size
;
115 /* We can't allocate space until we know how big the compiled form will be,
116 but we can't compile it (and thus know how big it is) until we've got a
117 place to put the code. So we cheat: we compile it twice, once with code
118 generation turned off and size counting turned on, and once "for real".
119 This also means that we don't allocate space until we are sure that the
120 thing really will compile successfully, and we never have to move the
121 code and thus invalidate pointers into it. (Note that it has to be in
122 one piece because free() must be able to free it all.) */
124 for (pass
= 1; pass
<= 2; pass
++) {
125 /*-------------------------------------------*
126 * FIRST PASS: Determine size and legality. *
127 * SECOND PASS: Emit converted code. *
128 *-------------------------------------------*/
130 Reg_Parse
= (unsigned char *) exp
;
133 if (chunk (NO_PAREN
, &flags_local
) == 0) return (NULL
); /* Something
135 emit_convert_byte ('\0');
138 /* Allocate memory. */
141 (unsigned char *) XtMalloc (sizeof (unsigned char) * Convert_Size
);
143 if (Convert_Str
== NULL
) {
144 CONVERT_FAIL ("out of memory in `ConvertRE\'");
147 Code_Emit_Ptr
= Convert_Str
;
151 return (char *) Convert_Str
;
154 /*----------------------------------------------------------------------*
157 * Process main body of regex or process a parenthesized "thing". *
159 * Caller must absorb opening parenthesis.
160 *----------------------------------------------------------------------*/
162 static int chunk (int paren
, int *flag_param
) {
164 register int this_branch
;
167 *flag_param
= HAS_WIDTH
; /* Tentatively. */
169 /* Make an OPEN node, if parenthesized. */
171 if (paren
== PAREN
) {
172 if (Total_Paren
>= NSUBEXP
) {
173 sprintf (Error_Text
, "number of ()'s > %d", (int) NSUBEXP
);
174 CONVERT_FAIL (Error_Text
);
180 /* Pick up the branches, linking them together. */
183 this_branch
= alternative (&flags_local
);
185 if (this_branch
== 0) return 0;
187 /* If any alternative could be zero width, consider the whole
188 parenthisized thing to be zero width. */
190 if (!(flags_local
& HAS_WIDTH
)) *flag_param
&= ~HAS_WIDTH
;
192 /* Are there more alternatives to process? */
194 if (*Reg_Parse
!= '|') break;
196 emit_convert_byte ('|');
201 /* Check for proper termination. */
203 if (paren
!= NO_PAREN
&& *Reg_Parse
!= ')') {
204 CONVERT_FAIL ("missing right parenthesis \')\'");
206 } else if (paren
!= NO_PAREN
) {
207 emit_convert_byte (')');
210 } else if (paren
== NO_PAREN
&& *Reg_Parse
!= '\0') {
211 if (*Reg_Parse
== ')') {
212 CONVERT_FAIL ("missing left parenthesis \'(\'");
214 CONVERT_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
221 /*----------------------------------------------------------------------*
222 * alternative - Processes one alternative of an '|' operator.
223 *----------------------------------------------------------------------*/
225 static int alternative (int *flag_param
) {
230 *flag_param
= WORST
; /* Tentatively. */
232 /* Loop until we hit the start of the next alternative, the end of this set
233 of alternatives (end of parentheses), or the end of the regex. */
235 while (*Reg_Parse
!= '|' && *Reg_Parse
!= ')' && *Reg_Parse
!= '\0') {
236 ret_val
= piece (&flags_local
);
238 if (ret_val
== 0) return 0; /* Something went wrong. */
240 *flag_param
|= flags_local
& HAS_WIDTH
;
246 /*----------------------------------------------------------------------*
247 * piece - something followed by possible '*', '+', or '?'.
248 *----------------------------------------------------------------------*/
250 static int piece (int *flag_param
) {
252 register int ret_val
;
253 register unsigned char op_code
;
254 unsigned long min_val
= REG_ZERO
;
257 ret_val
= atom (&flags_local
);
259 if (ret_val
== 0) return 0; /* Something went wrong. */
261 op_code
= *Reg_Parse
;
263 if (!IS_QUANTIFIER (op_code
)) {
264 *flag_param
= flags_local
;
271 if (op_code
== '+') min_val
= REG_ONE
;
273 /* It is dangerous to apply certain quantifiers to a possibly zero width
276 if (!(flags_local
& HAS_WIDTH
) && min_val
> REG_ZERO
) {
277 sprintf (Error_Text
, "%c operand could be empty", op_code
);
279 CONVERT_FAIL (Error_Text
);
282 *flag_param
= (min_val
> REG_ZERO
) ? (WORST
| HAS_WIDTH
) : WORST
;
284 if ( !((op_code
== '*') || (op_code
== '+') || (op_code
== '?')) ) {
285 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
286 with this function. */
288 CONVERT_FAIL ("internal error #2, `piece\'");
291 if (IS_QUANTIFIER (*Reg_Parse
)) {
292 sprintf (Error_Text
, "nested quantifiers, %c%c", op_code
, *Reg_Parse
);
294 CONVERT_FAIL (Error_Text
);
297 emit_convert_byte (op_code
);
302 /*----------------------------------------------------------------------*
303 * atom - Process one regex item at the lowest level
304 *----------------------------------------------------------------------*/
306 static int atom (int *flag_param
) {
311 *flag_param
= WORST
; /* Tentatively. */
313 switch (*Reg_Parse
++) {
315 emit_convert_byte ('^');
319 emit_convert_byte ('$');
323 emit_convert_byte ('<');
327 emit_convert_byte ('>');
331 emit_convert_byte ('.');
333 *flag_param
|= (HAS_WIDTH
| SIMPLE
); break;
336 emit_convert_byte ('(');
338 ret_val
= chunk (PAREN
, &flags_local
);
340 if (ret_val
== 0) return 0; /* Something went wrong. */
342 /* Add HAS_WIDTH flag if it was set by call to chunk. */
344 *flag_param
|= flags_local
& HAS_WIDTH
;
351 CONVERT_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
352 /* caught earlier. */
356 sprintf (Error_Text
, "%c follows nothing", *(Reg_Parse
- 1));
357 CONVERT_FAIL (Error_Text
);
360 emit_convert_byte ('\\'); /* Quote braces. */
361 emit_convert_byte ('{');
367 register unsigned int last_value
;
368 unsigned char last_emit
= 0;
369 unsigned char buffer
[500];
376 int u_score_flag
= 0;
380 /* Handle characters that can only occur at the start of a class. */
382 if (*Reg_Parse
== '^') { /* Complement of range. */
388 if (*Reg_Parse
== ']' || *Reg_Parse
== '-') {
389 /* If '-' or ']' is the first character in a class,
390 it is a literal character in the class. */
392 last_emit
= *Reg_Parse
;
395 CONVERT_FAIL ("too much data in [] to convert.");
398 buffer
[head
++] = '\\'; /* Escape `]' and '-' for clarity. */
399 buffer
[head
++] = *Reg_Parse
;
404 /* Handle the rest of the class characters. */
406 while (*Reg_Parse
!= '\0' && *Reg_Parse
!= ']') {
407 if (*Reg_Parse
== '-') { /* Process a range, e.g [a-z]. */
410 if (*Reg_Parse
== ']' || *Reg_Parse
== '\0') {
411 /* If '-' is the last character in a class it is a literal
412 character. If `Reg_Parse' points to the end of the
413 regex string, an error will be generated later. */
418 CONVERT_FAIL ("too much data in [] to convert.");
421 buffer
[head
++] = '\\'; /* Escape '-' for clarity. */
422 buffer
[head
++] = '-';
425 if (*Reg_Parse
== '\\') {
426 /* Handle escaped characters within a class range. */
430 if ((test
= literal_escape (*Reg_Parse
, 0))) {
432 buffer
[head
++] = '-';
434 if (*Reg_Parse
!= '\"') {
435 emit_convert_byte ('\\');
438 buffer
[head
++] = *Reg_Parse
;
439 last_value
= (unsigned int) test
;
443 "\\%c is an invalid escape sequence(3)",
446 CONVERT_FAIL (Error_Text
);
449 last_value
= U_CHAR_AT (Reg_Parse
);
451 if (last_emit
== '0' && last_value
== '9') {
454 } else if (last_emit
== 'a' && last_value
== 'z') {
457 } else if (last_emit
== 'A' && last_value
== 'Z') {
461 buffer
[head
++] = '-';
463 if ((test
= literal_escape (*Reg_Parse
, 1))) {
464 /* Ordinary character matches an escape sequence;
465 convert it to the escape sequence. */
469 "too much data in [] to convert.");
472 buffer
[head
++] = '\\';
474 if (test
== '0') { /* Make octal escape. */
476 buffer
[head
++] = '0';
477 buffer
[head
++] = ('0' + (test
/ 64));
478 test
-= (test
/ 64) * 64;
479 buffer
[head
++] = ('0' + (test
/ 8));
480 test
-= (test
/ 8) * 8;
481 buffer
[head
++] = ('0' + test
);
483 buffer
[head
++] = test
;
486 buffer
[head
++] = last_value
;
491 if (last_emit
> last_value
) {
492 CONVERT_FAIL ("invalid [] range");
495 last_emit
= (unsigned char) last_value
;
499 } /* End class character range code. */
500 } else if (*Reg_Parse
== '\\') {
503 if ((test
= literal_escape (*Reg_Parse
, 0)) != '\0') {
507 CONVERT_FAIL ("too much data in [] to convert.");
510 if (*Reg_Parse
!= '\"') {
511 buffer
[head
++] = '\\';
514 buffer
[head
++] = *Reg_Parse
;
518 "\\%c is an invalid escape sequence(1)",
521 CONVERT_FAIL (Error_Text
);
526 /* End of class escaped sequence code */
528 last_emit
= *Reg_Parse
;
530 if (*Reg_Parse
== '_') {
531 u_score_flag
= 1; /* Emit later if we can't do `\w'. */
533 } else if ((test
= literal_escape (*Reg_Parse
, 1))) {
534 /* Ordinary character matches an escape sequence;
535 convert it to the escape sequence. */
538 CONVERT_FAIL ("too much data in [] to convert.");
541 buffer
[head
++] = '\\';
543 if (test
== '0') { /* Make octal escape. */
545 buffer
[head
++] = '0';
546 buffer
[head
++] = ('0' + (test
/ 64));
547 test
-= (test
/ 64) * 64;
548 buffer
[head
++] = ('0' + (test
/ 8));
549 test
-= (test
/ 8) * 8;
550 buffer
[head
++] = ('0' + test
);
553 CONVERT_FAIL ("too much data in [] to convert.");
556 buffer
[head
++] = test
;
560 CONVERT_FAIL ("too much data in [] to convert.");
563 buffer
[head
++] = *Reg_Parse
;
568 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
570 if (*Reg_Parse
!= ']') CONVERT_FAIL ("missing right \']\'");
572 buffer
[head
] = '\0';
574 /* NOTE: it is impossible to specify an empty class. This is
575 because [] would be interpreted as "begin character class"
576 followed by a literal ']' character and no "end character class"
577 delimiter (']'). Because of this, it is always safe to assume
578 that a class HAS_WIDTH. */
580 Reg_Parse
++; *flag_param
|= HAS_WIDTH
| SIMPLE
;
583 if (( a_z_flag
&& A_Z_flag
&& zero_nine
&& u_score_flag
) ||
584 ( a_z_flag
&& A_Z_flag
&& !zero_nine
&& !u_score_flag
) ||
585 (!a_z_flag
&& !A_Z_flag
&& zero_nine
&& !u_score_flag
)) {
592 emit_convert_byte ('[');
593 if (negated
) emit_convert_byte ('^');
596 /* Output any shortcut escapes if we can. */
598 while (a_z_flag
|| A_Z_flag
|| zero_nine
|| u_score_flag
) {
599 if (a_z_flag
&& A_Z_flag
&& zero_nine
&& u_score_flag
) {
600 emit_convert_byte ('\\');
602 if (negated
&& !do_brackets
) {
603 emit_convert_byte ('W');
605 emit_convert_byte ('w');
608 a_z_flag
= A_Z_flag
= zero_nine
= u_score_flag
= 0;
609 } else if (a_z_flag
&& A_Z_flag
) {
610 emit_convert_byte ('\\');
612 if (negated
&& !do_brackets
) {
613 emit_convert_byte ('L');
615 emit_convert_byte ('l');
618 a_z_flag
= A_Z_flag
= 0;
619 } else if (zero_nine
) {
620 emit_convert_byte ('\\');
622 if (negated
&& !do_brackets
) {
623 emit_convert_byte ('D');
625 emit_convert_byte ('d');
629 } else if (a_z_flag
) {
630 emit_convert_byte ('a');
631 emit_convert_byte ('-');
632 emit_convert_byte ('z');
635 } else if (A_Z_flag
) {
636 emit_convert_byte ('A');
637 emit_convert_byte ('-');
638 emit_convert_byte ('Z');
641 } else if (u_score_flag
) {
642 emit_convert_byte ('_');
648 /* Output our buffered class characters. */
650 for (head
= 0; buffer
[head
] != '\0'; head
++) {
651 emit_convert_byte (buffer
[head
]);
655 emit_convert_byte (']');
659 break; /* End of character class code. */
661 /* Fall through to Default case to handle literal escapes. */
664 Reg_Parse
--; /* If we fell through from the above code, we are now
665 pointing at the back slash (\) character. */
667 unsigned char *parse_save
, *emit_save
;
668 int emit_diff
, len
= 0;
670 /* Loop until we find a meta character or end of regex string. */
672 for (; *Reg_Parse
!= '\0' &&
673 !strchr ((char *) Meta_Char
, (int) *Reg_Parse
);
676 /* Save where we are in case we have to back
677 this character out. */
679 parse_save
= Reg_Parse
;
680 emit_save
= Code_Emit_Ptr
;
682 if (*Reg_Parse
== '\\') {
683 if ((test
= literal_escape (*(Reg_Parse
+ 1), 0))) {
684 if (*(Reg_Parse
+ 1) != '\"') {
685 emit_convert_byte ('\\');
688 Reg_Parse
++; /* Point to escaped character */
689 emit_convert_byte (*Reg_Parse
);
693 "\\%c is an invalid escape sequence(2)",
696 CONVERT_FAIL (Error_Text
);
701 /* Ordinary character */
703 if ((test
= literal_escape (*Reg_Parse
, 1))) {
704 /* Ordinary character matches an escape sequence;
705 convert it to the escape sequence. */
707 emit_convert_byte ('\\');
711 emit_convert_byte ('0');
712 emit_convert_byte ('0' + (test
/ 64));
713 test
-= (test
/ 64) * 64;
714 emit_convert_byte ('0' + (test
/ 8));
715 test
-= (test
/ 8) * 8;
716 emit_convert_byte ('0' + test
);
718 emit_convert_byte (test
);
721 emit_convert_byte (*Reg_Parse
);
727 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
728 our EXACTLY node so far is more than one character, leave the
729 last character to be made into an EXACTLY node one character
730 wide for the multiplier to act on. For example 'abcd* would
731 have an EXACTLY node with an 'abc' operand followed by a STAR
732 node followed by another EXACTLY node with a 'd' operand. */
734 if (IS_QUANTIFIER (*Reg_Parse
) && len
> 0) {
735 Reg_Parse
= parse_save
; /* Point to previous regex token. */
736 emit_diff
= (Code_Emit_Ptr
- emit_save
);
738 if (Code_Emit_Ptr
== &Compute_Size
) {
739 Convert_Size
-= emit_diff
;
740 } else { /* Write over previously emitted byte. */
741 Code_Emit_Ptr
= emit_save
;
748 if (len
<= 0) CONVERT_FAIL ("internal error #4, `atom\'");
750 *flag_param
|= HAS_WIDTH
;
752 if (len
== 1) *flag_param
|= SIMPLE
;
754 } /* END switch (*Reg_Parse++) */
759 /*----------------------------------------------------------------------*
762 * Emit (if appropriate) a byte of converted code.
763 *----------------------------------------------------------------------*/
765 static void emit_convert_byte (unsigned char c
) {
767 if (Code_Emit_Ptr
== &Compute_Size
) {
770 *Code_Emit_Ptr
++ = c
;
774 /*--------------------------------------------------------------------*
777 * Recognize escaped literal characters (prefixed with backslash),
778 * and translate them into the corresponding character.
780 * Returns the proper character value or NULL if not a valid literal
782 *--------------------------------------------------------------------*/
784 static unsigned char literal_escape (unsigned char c
, int action
) {
786 static unsigned char control_escape
[] = {
789 'f', 'n', 'r', 't', 'v', '\0'
792 static unsigned char control_actual
[] = {
794 #ifdef EBCDIC_CHARSET
795 0x27, /* Escape character in IBM's EBCDIC character set. */
797 0x1B, /* Escape character in ASCII character set. */
799 '\f', '\n', '\r', '\t', '\v', '\0'
802 static unsigned char valid_escape
[] = {
803 'a', 'b', 'f', 'n', 'r', 't', 'v', '(', ')', '[',
804 ']', '<', '>', '.', '\\', '|', '^', '$', '*', '+',
808 static unsigned char value
[] = {
809 '\a', '\b', '\f', '\n', '\r', '\t', '\v', '(', ')', '[',
810 ']', '<', '>', '.', '\\', '|', '^', '$', '*', '+',
817 for (i
= 0; valid_escape
[i
] != '\0'; i
++) {
818 if (c
== valid_escape
[i
]) return value
[i
];
820 } else if (action
== 1) {
821 for (i
= 0; control_actual
[i
] != '\0'; i
++) {
822 if (c
== control_actual
[i
]) {
823 return control_escape
[i
];
830 /* Signal to generate an numeric (octal) escape. */
838 /*----------------------------------------------------------------------*
839 * ConvertSubstituteRE - Perform substitutions after a `regexp' match.
840 *----------------------------------------------------------------------*/
842 void ConvertSubstituteRE (
847 register unsigned char *src
;
848 register unsigned char *dst
;
849 register unsigned char c
;
850 register unsigned char test
;
852 if (source
== NULL
|| dest
== NULL
) {
853 reg_error ("NULL parm to `ConvertSubstituteRE\'");
858 src
= (unsigned char *) source
;
859 dst
= (unsigned char *) dest
;
861 while ((c
= *src
++) != '\0') {
864 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
866 if (*src
== 'u' || *src
== 'U' || *src
== 'l' || *src
== 'L') {
882 } else if (c
== '\\') {
884 /* Convert `\0' to `&' */
888 } else if ('1' <= *src
&& *src
<= '9') {
892 } else if ((test
= literal_escape (*src
, 0)) != '\0') {
896 } else if (*src
== '\0') {
897 /* If '\' is the last character of the replacement string, it is
898 interpreted as a literal backslash. */
902 /* Old regex's allowed any escape sequence. Convert these to
903 unescaped characters that replace themselves; i.e. they don't
904 need to be escaped. */
909 /* Ordinary character. */
911 if (((char *) dst
- (char *) dest
) >= (max
- 1)) {
914 if ((test
= literal_escape (c
, 1))) {
915 /* Ordinary character matches an escape sequence;
916 convert it to the escape sequence. */
920 if (test
== '0') { /* Make octal escape. */
923 *dst
++ = ('0' + (test
/ 64));
924 test
-= (test
/ 64) * 64;
925 *dst
++ = ('0' + (test
/ 8));
926 test
-= (test
/ 8) * 8;
927 *dst
++ = ('0' + test
);
942 /*----------------------------------------------------------------------*
944 *----------------------------------------------------------------------*/
946 static void reg_error (char *str
) {
950 "NEdit: Internal error processing regular expression (%s)\n",