2 * Implements the regexp and regsub commands for Jim
4 * (c) 2008 Steve Bennett <steveb@workware.net.au>
6 * Uses C library regcomp()/regexec() for the matching.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials
17 * provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * The views and conclusions contained in the software and documentation
33 * are those of the authors and should not be interpreted as representing
34 * official policies, either expressed or implied, of the Jim Tcl Project.
36 * Based on code originally from Tcl 6.7:
38 * Copyright 1987-1991 Regents of the University of California
39 * Permission to use, copy, modify, and distribute this
40 * software and its documentation for any purpose and without
41 * fee is hereby granted, provided that the above copyright
42 * notice appear in all copies. The University of California
43 * makes no representations about the suitability of this
44 * software for any purpose. It is provided "as is" without
45 * express or implied warranty.
51 #include "jimautoconf.h"
52 #if defined(JIM_REGEXP)
53 #include "jimregexp.h"
60 static void FreeRegexpInternalRep(Jim_Interp
*interp
, Jim_Obj
*objPtr
)
62 regfree(objPtr
->internalRep
.ptrIntValue
.ptr
);
63 Jim_Free(objPtr
->internalRep
.ptrIntValue
.ptr
);
66 /* internal rep is stored in ptrIntvalue
67 * ptr = compiled regex
70 static const Jim_ObjType regexpObjType
= {
72 FreeRegexpInternalRep
,
78 static regex_t
*SetRegexpFromAny(Jim_Interp
*interp
, Jim_Obj
*objPtr
, unsigned flags
)
84 /* Check if the object is already an uptodate variable */
85 if (objPtr
->typePtr
== ®expObjType
&&
86 objPtr
->internalRep
.ptrIntValue
.ptr
&& objPtr
->internalRep
.ptrIntValue
.int1
== flags
) {
88 return objPtr
->internalRep
.ptrIntValue
.ptr
;
91 /* Not a regexp or the flags do not match */
93 /* Get the string representation */
94 pattern
= Jim_String(objPtr
);
95 compre
= Jim_Alloc(sizeof(regex_t
));
97 if ((ret
= regcomp(compre
, pattern
, REG_EXTENDED
| flags
)) != 0) {
100 regerror(ret
, compre
, buf
, sizeof(buf
));
101 Jim_SetResultFormatted(interp
, "couldn't compile regular expression pattern: %s", buf
);
107 Jim_FreeIntRep(interp
, objPtr
);
109 objPtr
->typePtr
= ®expObjType
;
110 objPtr
->internalRep
.ptrIntValue
.int1
= flags
;
111 objPtr
->internalRep
.ptrIntValue
.ptr
= compre
;
116 int Jim_RegexpCmd(Jim_Interp
*interp
, int argc
, Jim_Obj
*const *argv
)
124 regmatch_t
*pmatch
= NULL
;
128 const char *source_str
;
131 Jim_Obj
*resultListObj
= NULL
;
132 int regcomp_flags
= 0;
136 OPT_INDICES
, OPT_NOCASE
, OPT_LINE
, OPT_ALL
, OPT_INLINE
, OPT_START
, OPT_END
138 static const char * const options
[] = {
139 "-indices", "-nocase", "-line", "-all", "-inline", "-start", "--", NULL
144 Jim_WrongNumArgs(interp
, 1, argv
,
145 "?-switch ...? exp string ?matchVar? ?subMatchVar ...?");
149 for (i
= 1; i
< argc
; i
++) {
150 const char *opt
= Jim_String(argv
[i
]);
155 if (Jim_GetEnum(interp
, argv
[i
], options
, &option
, "switch", JIM_ERRMSG
| JIM_ENUM_ABBREV
) != JIM_OK
) {
158 if (option
== OPT_END
) {
168 regcomp_flags
|= REG_ICASE
;
172 regcomp_flags
|= REG_NEWLINE
;
187 if (Jim_GetIndex(interp
, argv
[i
], &offset
) != JIM_OK
) {
197 regex
= SetRegexpFromAny(interp
, argv
[i
], regcomp_flags
);
202 pattern
= Jim_String(argv
[i
]);
203 source_str
= Jim_GetString(argv
[i
+ 1], &source_len
);
205 num_vars
= argc
- i
- 2;
209 Jim_SetResultString(interp
, "regexp match variables not allowed when using -inline",
214 num_vars
= regex
->re_nsub
+ 1;
217 pmatch
= Jim_Alloc((num_vars
+ 1) * sizeof(*pmatch
));
219 /* If an offset has been specified, adjust for that now.
220 * If it points past the end of the string, point to the terminating null
224 offset
+= source_len
+ 1;
226 if (offset
> source_len
) {
227 source_str
+= source_len
;
229 else if (offset
> 0) {
230 source_str
+= utf8_index(source_str
, offset
);
232 eflags
|= REG_NOTBOL
;
236 resultListObj
= Jim_NewListObj(interp
, NULL
, 0);
240 match
= regexec(regex
, source_str
, num_vars
+ 1, pmatch
, eflags
);
241 if (match
>= REG_BADPAT
) {
244 regerror(match
, regex
, buf
, sizeof(buf
));
245 Jim_SetResultFormatted(interp
, "error while matching pattern: %s", buf
);
250 if (match
== REG_NOMATCH
) {
256 if (opt_all
&& !opt_inline
) {
257 /* Just count the number of matches, so skip the substitution h */
262 * If additional variable names have been specified, return
263 * index information in those variables.
267 for (i
+= 2; opt_inline
? j
< num_vars
: i
< argc
; i
++, j
++) {
271 resultObj
= Jim_NewListObj(interp
, NULL
, 0);
274 resultObj
= Jim_NewStringObj(interp
, "", 0);
277 if (pmatch
[j
].rm_so
== -1) {
279 Jim_ListAppendElement(interp
, resultObj
, Jim_NewIntObj(interp
, -1));
280 Jim_ListAppendElement(interp
, resultObj
, Jim_NewIntObj(interp
, -1));
285 /* rm_so and rm_eo are byte offsets. We need char offsets */
286 int so
= utf8_strlen(source_str
, pmatch
[j
].rm_so
);
287 int eo
= utf8_strlen(source_str
, pmatch
[j
].rm_eo
);
288 Jim_ListAppendElement(interp
, resultObj
, Jim_NewIntObj(interp
, offset
+ so
));
289 Jim_ListAppendElement(interp
, resultObj
, Jim_NewIntObj(interp
, offset
+ eo
- 1));
292 Jim_AppendString(interp
, resultObj
, source_str
+ pmatch
[j
].rm_so
, pmatch
[j
].rm_eo
- pmatch
[j
].rm_so
);
297 Jim_ListAppendElement(interp
, resultListObj
, resultObj
);
300 /* And now set the result variable */
301 result
= Jim_SetVariable(interp
, argv
[i
], resultObj
);
303 if (result
!= JIM_OK
) {
304 Jim_FreeObj(interp
, resultObj
);
311 if (opt_all
&& (pattern
[0] != '^' || (regcomp_flags
& REG_NEWLINE
)) && *source_str
) {
312 if (pmatch
[0].rm_eo
) {
313 offset
+= utf8_strlen(source_str
, pmatch
[0].rm_eo
);
314 source_str
+= pmatch
[0].rm_eo
;
327 if (result
== JIM_OK
) {
329 Jim_SetResult(interp
, resultListObj
);
332 Jim_SetResultInt(interp
, num_matches
);
340 #define MAX_SUB_MATCHES 50
342 int Jim_RegsubCmd(Jim_Interp
*interp
, int argc
, Jim_Obj
*const *argv
)
344 int regcomp_flags
= 0;
345 int regexec_flags
= 0;
351 regmatch_t pmatch
[MAX_SUB_MATCHES
+ 1];
357 const char *source_str
;
359 const char *replace_str
;
364 OPT_NOCASE
, OPT_LINE
, OPT_ALL
, OPT_START
, OPT_END
366 static const char * const options
[] = {
367 "-nocase", "-line", "-all", "-start", "--", NULL
372 Jim_WrongNumArgs(interp
, 1, argv
,
373 "?-switch ...? exp string subSpec ?varName?");
377 for (i
= 1; i
< argc
; i
++) {
378 const char *opt
= Jim_String(argv
[i
]);
383 if (Jim_GetEnum(interp
, argv
[i
], options
, &option
, "switch", JIM_ERRMSG
| JIM_ENUM_ABBREV
) != JIM_OK
) {
386 if (option
== OPT_END
) {
392 regcomp_flags
|= REG_ICASE
;
396 regcomp_flags
|= REG_NEWLINE
;
407 if (Jim_GetIndex(interp
, argv
[i
], &offset
) != JIM_OK
) {
413 if (argc
- i
!= 3 && argc
- i
!= 4) {
417 regex
= SetRegexpFromAny(interp
, argv
[i
], regcomp_flags
);
421 pattern
= Jim_String(argv
[i
]);
423 source_str
= Jim_GetString(argv
[i
+ 1], &source_len
);
424 replace_str
= Jim_GetString(argv
[i
+ 2], &replace_len
);
425 varname
= argv
[i
+ 3];
427 /* Create the result string */
428 resultObj
= Jim_NewStringObj(interp
, "", 0);
430 /* If an offset has been specified, adjust for that now.
431 * If it points past the end of the string, point to the terminating null
435 offset
+= source_len
+ 1;
437 if (offset
> source_len
) {
440 else if (offset
< 0) {
444 /* Convert from character offset to byte offset */
445 offset
= utf8_index(source_str
, offset
);
447 /* Copy the part before -start */
448 Jim_AppendString(interp
, resultObj
, source_str
, offset
);
451 * The following loop is to handle multiple matches within the
452 * same source string; each iteration handles one match and its
453 * corresponding substitution. If "-all" hasn't been specified
454 * then the loop body only gets executed once.
457 n
= source_len
- offset
;
458 p
= source_str
+ offset
;
460 int match
= regexec(regex
, p
, MAX_SUB_MATCHES
, pmatch
, regexec_flags
);
462 if (match
>= REG_BADPAT
) {
465 regerror(match
, regex
, buf
, sizeof(buf
));
466 Jim_SetResultFormatted(interp
, "error while matching pattern: %s", buf
);
469 if (match
== REG_NOMATCH
) {
476 * Copy the portion of the source string before the match to the
479 Jim_AppendString(interp
, resultObj
, p
, pmatch
[0].rm_so
);
482 * Append the subSpec (replace_str) argument to the variable, making appropriate
483 * substitutions. This code is a bit hairy because of the backslash
484 * conventions and because the code saves up ranges of characters in
485 * subSpec to reduce the number of calls to Jim_SetVar.
488 for (j
= 0; j
< replace_len
; j
++) {
490 int c
= replace_str
[j
];
495 else if (c
== '\\' && j
< replace_len
) {
496 c
= replace_str
[++j
];
497 if ((c
>= '0') && (c
<= '9')) {
500 else if ((c
== '\\') || (c
== '&')) {
501 Jim_AppendString(interp
, resultObj
, replace_str
+ j
, 1);
505 /* If the replacement is a trailing backslash, just replace with a backslash, otherwise
506 * with the literal backslash and the following character
508 Jim_AppendString(interp
, resultObj
, replace_str
+ j
- 1, (j
== replace_len
) ? 1 : 2);
513 Jim_AppendString(interp
, resultObj
, replace_str
+ j
, 1);
516 if ((idx
< MAX_SUB_MATCHES
) && pmatch
[idx
].rm_so
!= -1 && pmatch
[idx
].rm_eo
!= -1) {
517 Jim_AppendString(interp
, resultObj
, p
+ pmatch
[idx
].rm_so
,
518 pmatch
[idx
].rm_eo
- pmatch
[idx
].rm_so
);
522 p
+= pmatch
[0].rm_eo
;
523 n
-= pmatch
[0].rm_eo
;
525 /* If -all is not specified, or there is no source left, we are done */
526 if (!opt_all
|| n
== 0) {
530 /* An anchored pattern without -line must be done */
531 if ((regcomp_flags
& REG_NEWLINE
) == 0 && pattern
[0] == '^') {
535 /* If the pattern is empty, need to step forwards */
536 if (pattern
[0] == '\0' && n
) {
537 /* Need to copy the char we are moving over */
538 Jim_AppendString(interp
, resultObj
, p
, 1);
543 regexec_flags
|= REG_NOTBOL
;
547 * Copy the portion of the string after the last match to the
550 Jim_AppendString(interp
, resultObj
, p
, -1);
552 /* And now set or return the result variable */
554 result
= Jim_SetVariable(interp
, varname
, resultObj
);
556 if (result
== JIM_OK
) {
557 Jim_SetResultInt(interp
, num_matches
);
560 Jim_FreeObj(interp
, resultObj
);
564 Jim_SetResult(interp
, resultObj
);
571 int Jim_regexpInit(Jim_Interp
*interp
)
573 if (Jim_PackageProvide(interp
, "regexp", "1.0", JIM_ERRMSG
))
576 Jim_CreateCommand(interp
, "regexp", Jim_RegexpCmd
, NULL
, NULL
);
577 Jim_CreateCommand(interp
, "regsub", Jim_RegsubCmd
, NULL
, NULL
);