configure: allow "--full" options to be explicitly disabled
[jimtcl.git] / jim-regexp.c
blob8eb457d0d9c0be210d7b99244578de7729e279b2
1 /*
2 * Implements the regexp and regsub commands for Jim
4 * (c) 2008 Steve Bennett <steveb@workware.net.au>
6 * Uses C library regcomp()/regexec() for the matching.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials
17 * provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * The views and conclusions contained in the software and documentation
33 * are those of the authors and should not be interpreted as representing
34 * official policies, either expressed or implied, of the Jim Tcl Project.
36 * Based on code originally from Tcl 6.7:
38 * Copyright 1987-1991 Regents of the University of California
39 * Permission to use, copy, modify, and distribute this
40 * software and its documentation for any purpose and without
41 * fee is hereby granted, provided that the above copyright
42 * notice appear in all copies. The University of California
43 * makes no representations about the suitability of this
44 * software for any purpose. It is provided "as is" without
45 * express or implied warranty.
48 #include <stdlib.h>
49 #include <string.h>
51 #include "jimautoconf.h"
52 #if defined(JIM_REGEXP)
53 #include "jimregexp.h"
54 #else
55 #include <regex.h>
56 #endif
57 #include "jim.h"
59 static void FreeRegexpInternalRep(Jim_Interp *interp, Jim_Obj *objPtr)
61 regfree(objPtr->internalRep.regexpValue.compre);
62 Jim_Free(objPtr->internalRep.regexpValue.compre);
65 static const Jim_ObjType regexpObjType = {
66 "regexp",
67 FreeRegexpInternalRep,
68 NULL,
69 NULL,
70 JIM_TYPE_NONE
73 static regex_t *SetRegexpFromAny(Jim_Interp *interp, Jim_Obj *objPtr, unsigned flags)
75 regex_t *compre;
76 const char *pattern;
77 int ret;
79 /* Check if the object is already an uptodate variable */
80 if (objPtr->typePtr == &regexpObjType &&
81 objPtr->internalRep.regexpValue.compre && objPtr->internalRep.regexpValue.flags == flags) {
82 /* nothing to do */
83 return objPtr->internalRep.regexpValue.compre;
86 /* Not a regexp or the flags do not match */
88 /* Get the string representation */
89 pattern = Jim_String(objPtr);
90 compre = Jim_Alloc(sizeof(regex_t));
92 if ((ret = regcomp(compre, pattern, REG_EXTENDED | flags)) != 0) {
93 char buf[100];
95 regerror(ret, compre, buf, sizeof(buf));
96 Jim_SetResultFormatted(interp, "couldn't compile regular expression pattern: %s", buf);
97 regfree(compre);
98 Jim_Free(compre);
99 return NULL;
102 Jim_FreeIntRep(interp, objPtr);
104 objPtr->typePtr = &regexpObjType;
105 objPtr->internalRep.regexpValue.flags = flags;
106 objPtr->internalRep.regexpValue.compre = compre;
108 return compre;
111 int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
113 int opt_indices = 0;
114 int opt_all = 0;
115 int opt_inline = 0;
116 regex_t *regex;
117 int match, i, j;
118 int offset = 0;
119 regmatch_t *pmatch = NULL;
120 int source_len;
121 int result = JIM_OK;
122 const char *pattern;
123 const char *source_str;
124 int num_matches = 0;
125 int num_vars;
126 Jim_Obj *resultListObj = NULL;
127 int regcomp_flags = 0;
128 int eflags = 0;
129 int option;
130 enum {
131 OPT_INDICES, OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_INLINE, OPT_START, OPT_END
133 static const char * const options[] = {
134 "-indices", "-nocase", "-line", "-all", "-inline", "-start", "--", NULL
137 if (argc < 3) {
138 wrongNumArgs:
139 Jim_WrongNumArgs(interp, 1, argv,
140 "?-switch ...? exp string ?matchVar? ?subMatchVar ...?");
141 return JIM_ERR;
144 for (i = 1; i < argc; i++) {
145 const char *opt = Jim_String(argv[i]);
147 if (*opt != '-') {
148 break;
150 if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) {
151 return JIM_ERR;
153 if (option == OPT_END) {
154 i++;
155 break;
157 switch (option) {
158 case OPT_INDICES:
159 opt_indices = 1;
160 break;
162 case OPT_NOCASE:
163 regcomp_flags |= REG_ICASE;
164 break;
166 case OPT_LINE:
167 regcomp_flags |= REG_NEWLINE;
168 break;
170 case OPT_ALL:
171 opt_all = 1;
172 break;
174 case OPT_INLINE:
175 opt_inline = 1;
176 break;
178 case OPT_START:
179 if (++i == argc) {
180 goto wrongNumArgs;
182 if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) {
183 return JIM_ERR;
185 break;
188 if (argc - i < 2) {
189 goto wrongNumArgs;
192 regex = SetRegexpFromAny(interp, argv[i], regcomp_flags);
193 if (!regex) {
194 return JIM_ERR;
197 pattern = Jim_String(argv[i]);
198 source_str = Jim_GetString(argv[i + 1], &source_len);
200 num_vars = argc - i - 2;
202 if (opt_inline) {
203 if (num_vars) {
204 Jim_SetResultString(interp, "regexp match variables not allowed when using -inline",
205 -1);
206 result = JIM_ERR;
207 goto done;
209 num_vars = regex->re_nsub + 1;
212 pmatch = Jim_Alloc((num_vars + 1) * sizeof(*pmatch));
214 /* If an offset has been specified, adjust for that now.
215 * If it points past the end of the string, point to the terminating null
217 if (offset) {
218 if (offset < 0) {
219 offset += source_len + 1;
221 if (offset > source_len) {
222 source_str += source_len;
224 else if (offset > 0) {
225 source_str += offset;
227 eflags |= REG_NOTBOL;
230 if (opt_inline) {
231 resultListObj = Jim_NewListObj(interp, NULL, 0);
234 next_match:
235 match = regexec(regex, source_str, num_vars + 1, pmatch, eflags);
236 if (match >= REG_BADPAT) {
237 char buf[100];
239 regerror(match, regex, buf, sizeof(buf));
240 Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf);
241 result = JIM_ERR;
242 goto done;
245 if (match == REG_NOMATCH) {
246 goto done;
249 num_matches++;
251 if (opt_all && !opt_inline) {
252 /* Just count the number of matches, so skip the substitution h */
253 goto try_next_match;
257 * If additional variable names have been specified, return
258 * index information in those variables.
261 j = 0;
262 for (i += 2; opt_inline ? j < num_vars : i < argc; i++, j++) {
263 Jim_Obj *resultObj;
265 if (opt_indices) {
266 resultObj = Jim_NewListObj(interp, NULL, 0);
268 else {
269 resultObj = Jim_NewStringObj(interp, "", 0);
272 if (pmatch[j].rm_so == -1) {
273 if (opt_indices) {
274 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
275 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
278 else {
279 int len = pmatch[j].rm_eo - pmatch[j].rm_so;
281 if (opt_indices) {
282 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
283 offset + pmatch[j].rm_so));
284 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
285 offset + pmatch[j].rm_so + len - 1));
287 else {
288 Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len);
292 if (opt_inline) {
293 Jim_ListAppendElement(interp, resultListObj, resultObj);
295 else {
296 /* And now set the result variable */
297 result = Jim_SetVariable(interp, argv[i], resultObj);
299 if (result != JIM_OK) {
300 Jim_FreeObj(interp, resultObj);
301 break;
306 try_next_match:
307 if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) {
308 if (pmatch[0].rm_eo) {
309 offset += pmatch[0].rm_eo;
310 source_str += pmatch[0].rm_eo;
312 else {
313 source_str++;
314 offset++;
316 if (*source_str) {
317 eflags = REG_NOTBOL;
318 goto next_match;
322 done:
323 if (result == JIM_OK) {
324 if (opt_inline) {
325 Jim_SetResult(interp, resultListObj);
327 else {
328 Jim_SetResultInt(interp, num_matches);
332 Jim_Free(pmatch);
333 return result;
336 #define MAX_SUB_MATCHES 50
338 int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
340 int regcomp_flags = 0;
341 int regexec_flags = 0;
342 int opt_all = 0;
343 int offset = 0;
344 regex_t *regex;
345 const char *p;
346 int result;
347 regmatch_t pmatch[MAX_SUB_MATCHES + 1];
348 int num_matches = 0;
350 int i, j, n;
351 Jim_Obj *varname;
352 Jim_Obj *resultObj;
353 const char *source_str;
354 int source_len;
355 const char *replace_str;
356 int replace_len;
357 const char *pattern;
358 int option;
359 enum {
360 OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_END
362 static const char * const options[] = {
363 "-nocase", "-line", "-all", "-start", "--", NULL
366 if (argc < 4) {
367 wrongNumArgs:
368 Jim_WrongNumArgs(interp, 1, argv,
369 "?-switch ...? exp string subSpec ?varName?");
370 return JIM_ERR;
373 for (i = 1; i < argc; i++) {
374 const char *opt = Jim_String(argv[i]);
376 if (*opt != '-') {
377 break;
379 if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) {
380 return JIM_ERR;
382 if (option == OPT_END) {
383 i++;
384 break;
386 switch (option) {
387 case OPT_NOCASE:
388 regcomp_flags |= REG_ICASE;
389 break;
391 case OPT_LINE:
392 regcomp_flags |= REG_NEWLINE;
393 break;
395 case OPT_ALL:
396 opt_all = 1;
397 break;
399 case OPT_START:
400 if (++i == argc) {
401 goto wrongNumArgs;
403 if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) {
404 return JIM_ERR;
406 break;
409 if (argc - i != 3 && argc - i != 4) {
410 goto wrongNumArgs;
413 regex = SetRegexpFromAny(interp, argv[i], regcomp_flags);
414 if (!regex) {
415 return JIM_ERR;
417 pattern = Jim_String(argv[i]);
419 source_str = Jim_GetString(argv[i + 1], &source_len);
420 replace_str = Jim_GetString(argv[i + 2], &replace_len);
421 varname = argv[i + 3];
423 /* Create the result string */
424 resultObj = Jim_NewStringObj(interp, "", 0);
426 /* If an offset has been specified, adjust for that now.
427 * If it points past the end of the string, point to the terminating null
429 if (offset) {
430 if (offset < 0) {
431 offset += source_len + 1;
433 if (offset > source_len) {
434 offset = source_len;
436 else if (offset < 0) {
437 offset = 0;
441 /* Copy the part before -start */
442 Jim_AppendString(interp, resultObj, source_str, offset);
445 * The following loop is to handle multiple matches within the
446 * same source string; each iteration handles one match and its
447 * corresponding substitution. If "-all" hasn't been specified
448 * then the loop body only gets executed once.
451 n = source_len - offset;
452 p = source_str + offset;
453 do {
454 int match = regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags);
456 if (match >= REG_BADPAT) {
457 char buf[100];
459 regerror(match, regex, buf, sizeof(buf));
460 Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf);
461 return JIM_ERR;
463 if (match == REG_NOMATCH) {
464 break;
467 num_matches++;
470 * Copy the portion of the source string before the match to the
471 * result variable.
473 Jim_AppendString(interp, resultObj, p, pmatch[0].rm_so);
476 * Append the subSpec (replace_str) argument to the variable, making appropriate
477 * substitutions. This code is a bit hairy because of the backslash
478 * conventions and because the code saves up ranges of characters in
479 * subSpec to reduce the number of calls to Jim_SetVar.
482 for (j = 0; j < replace_len; j++) {
483 int idx;
484 int c = replace_str[j];
486 if (c == '&') {
487 idx = 0;
489 else if (c == '\\' && j < replace_len) {
490 c = replace_str[++j];
491 if ((c >= '0') && (c <= '9')) {
492 idx = c - '0';
494 else if ((c == '\\') || (c == '&')) {
495 Jim_AppendString(interp, resultObj, replace_str + j, 1);
496 continue;
498 else {
499 /* If the replacement is a trailing backslash, just replace with a backslash, otherwise
500 * with the literal backslash and the following character
502 Jim_AppendString(interp, resultObj, replace_str + j - 1, (j == replace_len) ? 1 : 2);
503 continue;
506 else {
507 Jim_AppendString(interp, resultObj, replace_str + j, 1);
508 continue;
510 if ((idx < MAX_SUB_MATCHES) && pmatch[idx].rm_so != -1 && pmatch[idx].rm_eo != -1) {
511 Jim_AppendString(interp, resultObj, p + pmatch[idx].rm_so,
512 pmatch[idx].rm_eo - pmatch[idx].rm_so);
516 p += pmatch[0].rm_eo;
517 n -= pmatch[0].rm_eo;
519 /* If -all is not specified, or there is no source left, we are done */
520 if (!opt_all || n == 0) {
521 break;
524 /* An anchored pattern without -line must be done */
525 if ((regcomp_flags & REG_NEWLINE) == 0 && pattern[0] == '^') {
526 break;
529 /* If the pattern is empty, need to step forwards */
530 if (pattern[0] == '\0' && n) {
531 /* Need to copy the char we are moving over */
532 Jim_AppendString(interp, resultObj, p, 1);
533 p++;
534 n--;
537 regexec_flags |= REG_NOTBOL;
538 } while (n);
541 * Copy the portion of the string after the last match to the
542 * result variable.
544 Jim_AppendString(interp, resultObj, p, -1);
546 /* And now set or return the result variable */
547 if (argc - i == 4) {
548 result = Jim_SetVariable(interp, varname, resultObj);
550 if (result == JIM_OK) {
551 Jim_SetResultInt(interp, num_matches);
553 else {
554 Jim_FreeObj(interp, resultObj);
557 else {
558 Jim_SetResult(interp, resultObj);
559 result = JIM_OK;
562 return result;
565 int Jim_regexpInit(Jim_Interp *interp)
567 if (Jim_PackageProvide(interp, "regexp", "1.0", JIM_ERRMSG))
568 return JIM_ERR;
570 Jim_CreateCommand(interp, "regexp", Jim_RegexpCmd, NULL, NULL);
571 Jim_CreateCommand(interp, "regsub", Jim_RegsubCmd, NULL, NULL);
572 return JIM_OK;