regexp, regsub: -start is a character index
[jimtcl.git] / jim-regexp.c
blob81f320774b4f545a31335f90786af0c74b401c93
1 /*
2 * Implements the regexp and regsub commands for Jim
4 * (c) 2008 Steve Bennett <steveb@workware.net.au>
6 * Uses C library regcomp()/regexec() for the matching.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer in the documentation and/or other materials
17 * provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
20 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
22 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23 * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
30 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 * The views and conclusions contained in the software and documentation
33 * are those of the authors and should not be interpreted as representing
34 * official policies, either expressed or implied, of the Jim Tcl Project.
36 * Based on code originally from Tcl 6.7:
38 * Copyright 1987-1991 Regents of the University of California
39 * Permission to use, copy, modify, and distribute this
40 * software and its documentation for any purpose and without
41 * fee is hereby granted, provided that the above copyright
42 * notice appear in all copies. The University of California
43 * makes no representations about the suitability of this
44 * software for any purpose. It is provided "as is" without
45 * express or implied warranty.
48 #include <stdlib.h>
49 #include <string.h>
51 #include "jimautoconf.h"
52 #if defined(JIM_REGEXP)
53 #include "jimregexp.h"
54 #else
55 #include <regex.h>
56 #endif
57 #include "jim.h"
58 #include "utf8.h"
60 static void FreeRegexpInternalRep(Jim_Interp *interp, Jim_Obj *objPtr)
62 regfree(objPtr->internalRep.ptrIntValue.ptr);
63 Jim_Free(objPtr->internalRep.ptrIntValue.ptr);
66 /* internal rep is stored in ptrIntvalue
67 * ptr = compiled regex
68 * int1 = flags
70 static const Jim_ObjType regexpObjType = {
71 "regexp",
72 FreeRegexpInternalRep,
73 NULL,
74 NULL,
75 JIM_TYPE_NONE
78 static regex_t *SetRegexpFromAny(Jim_Interp *interp, Jim_Obj *objPtr, unsigned flags)
80 regex_t *compre;
81 const char *pattern;
82 int ret;
84 /* Check if the object is already an uptodate variable */
85 if (objPtr->typePtr == &regexpObjType &&
86 objPtr->internalRep.ptrIntValue.ptr && objPtr->internalRep.ptrIntValue.int1 == flags) {
87 /* nothing to do */
88 return objPtr->internalRep.ptrIntValue.ptr;
91 /* Not a regexp or the flags do not match */
93 /* Get the string representation */
94 pattern = Jim_String(objPtr);
95 compre = Jim_Alloc(sizeof(regex_t));
97 if ((ret = regcomp(compre, pattern, REG_EXTENDED | flags)) != 0) {
98 char buf[100];
100 regerror(ret, compre, buf, sizeof(buf));
101 Jim_SetResultFormatted(interp, "couldn't compile regular expression pattern: %s", buf);
102 regfree(compre);
103 Jim_Free(compre);
104 return NULL;
107 Jim_FreeIntRep(interp, objPtr);
109 objPtr->typePtr = &regexpObjType;
110 objPtr->internalRep.ptrIntValue.int1 = flags;
111 objPtr->internalRep.ptrIntValue.ptr = compre;
113 return compre;
116 int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
118 int opt_indices = 0;
119 int opt_all = 0;
120 int opt_inline = 0;
121 regex_t *regex;
122 int match, i, j;
123 int offset = 0;
124 regmatch_t *pmatch = NULL;
125 int source_len;
126 int result = JIM_OK;
127 const char *pattern;
128 const char *source_str;
129 int num_matches = 0;
130 int num_vars;
131 Jim_Obj *resultListObj = NULL;
132 int regcomp_flags = 0;
133 int eflags = 0;
134 int option;
135 enum {
136 OPT_INDICES, OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_INLINE, OPT_START, OPT_END
138 static const char * const options[] = {
139 "-indices", "-nocase", "-line", "-all", "-inline", "-start", "--", NULL
142 if (argc < 3) {
143 wrongNumArgs:
144 Jim_WrongNumArgs(interp, 1, argv,
145 "?-switch ...? exp string ?matchVar? ?subMatchVar ...?");
146 return JIM_ERR;
149 for (i = 1; i < argc; i++) {
150 const char *opt = Jim_String(argv[i]);
152 if (*opt != '-') {
153 break;
155 if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) {
156 return JIM_ERR;
158 if (option == OPT_END) {
159 i++;
160 break;
162 switch (option) {
163 case OPT_INDICES:
164 opt_indices = 1;
165 break;
167 case OPT_NOCASE:
168 regcomp_flags |= REG_ICASE;
169 break;
171 case OPT_LINE:
172 regcomp_flags |= REG_NEWLINE;
173 break;
175 case OPT_ALL:
176 opt_all = 1;
177 break;
179 case OPT_INLINE:
180 opt_inline = 1;
181 break;
183 case OPT_START:
184 if (++i == argc) {
185 goto wrongNumArgs;
187 if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) {
188 return JIM_ERR;
190 break;
193 if (argc - i < 2) {
194 goto wrongNumArgs;
197 regex = SetRegexpFromAny(interp, argv[i], regcomp_flags);
198 if (!regex) {
199 return JIM_ERR;
202 pattern = Jim_String(argv[i]);
203 source_str = Jim_GetString(argv[i + 1], &source_len);
205 num_vars = argc - i - 2;
207 if (opt_inline) {
208 if (num_vars) {
209 Jim_SetResultString(interp, "regexp match variables not allowed when using -inline",
210 -1);
211 result = JIM_ERR;
212 goto done;
214 num_vars = regex->re_nsub + 1;
217 pmatch = Jim_Alloc((num_vars + 1) * sizeof(*pmatch));
219 /* If an offset has been specified, adjust for that now.
220 * If it points past the end of the string, point to the terminating null
222 if (offset) {
223 if (offset < 0) {
224 offset += source_len + 1;
226 if (offset > source_len) {
227 source_str += source_len;
229 else if (offset > 0) {
230 source_str += utf8_index(source_str, offset);
232 eflags |= REG_NOTBOL;
235 if (opt_inline) {
236 resultListObj = Jim_NewListObj(interp, NULL, 0);
239 next_match:
240 match = regexec(regex, source_str, num_vars + 1, pmatch, eflags);
241 if (match >= REG_BADPAT) {
242 char buf[100];
244 regerror(match, regex, buf, sizeof(buf));
245 Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf);
246 result = JIM_ERR;
247 goto done;
250 if (match == REG_NOMATCH) {
251 goto done;
254 num_matches++;
256 if (opt_all && !opt_inline) {
257 /* Just count the number of matches, so skip the substitution h */
258 goto try_next_match;
262 * If additional variable names have been specified, return
263 * index information in those variables.
266 j = 0;
267 for (i += 2; opt_inline ? j < num_vars : i < argc; i++, j++) {
268 Jim_Obj *resultObj;
270 if (opt_indices) {
271 resultObj = Jim_NewListObj(interp, NULL, 0);
273 else {
274 resultObj = Jim_NewStringObj(interp, "", 0);
277 if (pmatch[j].rm_so == -1) {
278 if (opt_indices) {
279 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
280 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
283 else {
284 int len = pmatch[j].rm_eo - pmatch[j].rm_so;
286 if (opt_indices) {
287 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
288 offset + pmatch[j].rm_so));
289 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp,
290 offset + pmatch[j].rm_so + len - 1));
292 else {
293 Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len);
297 if (opt_inline) {
298 Jim_ListAppendElement(interp, resultListObj, resultObj);
300 else {
301 /* And now set the result variable */
302 result = Jim_SetVariable(interp, argv[i], resultObj);
304 if (result != JIM_OK) {
305 Jim_FreeObj(interp, resultObj);
306 break;
311 try_next_match:
312 if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) {
313 if (pmatch[0].rm_eo) {
314 offset += pmatch[0].rm_eo;
315 source_str += pmatch[0].rm_eo;
317 else {
318 source_str++;
319 offset++;
321 if (*source_str) {
322 eflags = REG_NOTBOL;
323 goto next_match;
327 done:
328 if (result == JIM_OK) {
329 if (opt_inline) {
330 Jim_SetResult(interp, resultListObj);
332 else {
333 Jim_SetResultInt(interp, num_matches);
337 Jim_Free(pmatch);
338 return result;
341 #define MAX_SUB_MATCHES 50
343 int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
345 int regcomp_flags = 0;
346 int regexec_flags = 0;
347 int opt_all = 0;
348 int offset = 0;
349 regex_t *regex;
350 const char *p;
351 int result;
352 regmatch_t pmatch[MAX_SUB_MATCHES + 1];
353 int num_matches = 0;
355 int i, j, n;
356 Jim_Obj *varname;
357 Jim_Obj *resultObj;
358 const char *source_str;
359 int source_len;
360 const char *replace_str;
361 int replace_len;
362 const char *pattern;
363 int option;
364 enum {
365 OPT_NOCASE, OPT_LINE, OPT_ALL, OPT_START, OPT_END
367 static const char * const options[] = {
368 "-nocase", "-line", "-all", "-start", "--", NULL
371 if (argc < 4) {
372 wrongNumArgs:
373 Jim_WrongNumArgs(interp, 1, argv,
374 "?-switch ...? exp string subSpec ?varName?");
375 return JIM_ERR;
378 for (i = 1; i < argc; i++) {
379 const char *opt = Jim_String(argv[i]);
381 if (*opt != '-') {
382 break;
384 if (Jim_GetEnum(interp, argv[i], options, &option, "switch", JIM_ERRMSG | JIM_ENUM_ABBREV) != JIM_OK) {
385 return JIM_ERR;
387 if (option == OPT_END) {
388 i++;
389 break;
391 switch (option) {
392 case OPT_NOCASE:
393 regcomp_flags |= REG_ICASE;
394 break;
396 case OPT_LINE:
397 regcomp_flags |= REG_NEWLINE;
398 break;
400 case OPT_ALL:
401 opt_all = 1;
402 break;
404 case OPT_START:
405 if (++i == argc) {
406 goto wrongNumArgs;
408 if (Jim_GetIndex(interp, argv[i], &offset) != JIM_OK) {
409 return JIM_ERR;
411 break;
414 if (argc - i != 3 && argc - i != 4) {
415 goto wrongNumArgs;
418 regex = SetRegexpFromAny(interp, argv[i], regcomp_flags);
419 if (!regex) {
420 return JIM_ERR;
422 pattern = Jim_String(argv[i]);
424 source_str = Jim_GetString(argv[i + 1], &source_len);
425 replace_str = Jim_GetString(argv[i + 2], &replace_len);
426 varname = argv[i + 3];
428 /* Create the result string */
429 resultObj = Jim_NewStringObj(interp, "", 0);
431 /* If an offset has been specified, adjust for that now.
432 * If it points past the end of the string, point to the terminating null
434 if (offset) {
435 if (offset < 0) {
436 offset += source_len + 1;
438 if (offset > source_len) {
439 offset = source_len;
441 else if (offset < 0) {
442 offset = 0;
445 /* Convert from character offset to byte offset */
446 offset = utf8_index(source_str, offset);
448 /* Copy the part before -start */
449 Jim_AppendString(interp, resultObj, source_str, offset);
452 * The following loop is to handle multiple matches within the
453 * same source string; each iteration handles one match and its
454 * corresponding substitution. If "-all" hasn't been specified
455 * then the loop body only gets executed once.
458 n = source_len - offset;
459 p = source_str + offset;
460 do {
461 int match = regexec(regex, p, MAX_SUB_MATCHES, pmatch, regexec_flags);
463 if (match >= REG_BADPAT) {
464 char buf[100];
466 regerror(match, regex, buf, sizeof(buf));
467 Jim_SetResultFormatted(interp, "error while matching pattern: %s", buf);
468 return JIM_ERR;
470 if (match == REG_NOMATCH) {
471 break;
474 num_matches++;
477 * Copy the portion of the source string before the match to the
478 * result variable.
480 Jim_AppendString(interp, resultObj, p, pmatch[0].rm_so);
483 * Append the subSpec (replace_str) argument to the variable, making appropriate
484 * substitutions. This code is a bit hairy because of the backslash
485 * conventions and because the code saves up ranges of characters in
486 * subSpec to reduce the number of calls to Jim_SetVar.
489 for (j = 0; j < replace_len; j++) {
490 int idx;
491 int c = replace_str[j];
493 if (c == '&') {
494 idx = 0;
496 else if (c == '\\' && j < replace_len) {
497 c = replace_str[++j];
498 if ((c >= '0') && (c <= '9')) {
499 idx = c - '0';
501 else if ((c == '\\') || (c == '&')) {
502 Jim_AppendString(interp, resultObj, replace_str + j, 1);
503 continue;
505 else {
506 /* If the replacement is a trailing backslash, just replace with a backslash, otherwise
507 * with the literal backslash and the following character
509 Jim_AppendString(interp, resultObj, replace_str + j - 1, (j == replace_len) ? 1 : 2);
510 continue;
513 else {
514 Jim_AppendString(interp, resultObj, replace_str + j, 1);
515 continue;
517 if ((idx < MAX_SUB_MATCHES) && pmatch[idx].rm_so != -1 && pmatch[idx].rm_eo != -1) {
518 Jim_AppendString(interp, resultObj, p + pmatch[idx].rm_so,
519 pmatch[idx].rm_eo - pmatch[idx].rm_so);
523 p += pmatch[0].rm_eo;
524 n -= pmatch[0].rm_eo;
526 /* If -all is not specified, or there is no source left, we are done */
527 if (!opt_all || n == 0) {
528 break;
531 /* An anchored pattern without -line must be done */
532 if ((regcomp_flags & REG_NEWLINE) == 0 && pattern[0] == '^') {
533 break;
536 /* If the pattern is empty, need to step forwards */
537 if (pattern[0] == '\0' && n) {
538 /* Need to copy the char we are moving over */
539 Jim_AppendString(interp, resultObj, p, 1);
540 p++;
541 n--;
544 regexec_flags |= REG_NOTBOL;
545 } while (n);
548 * Copy the portion of the string after the last match to the
549 * result variable.
551 Jim_AppendString(interp, resultObj, p, -1);
553 /* And now set or return the result variable */
554 if (argc - i == 4) {
555 result = Jim_SetVariable(interp, varname, resultObj);
557 if (result == JIM_OK) {
558 Jim_SetResultInt(interp, num_matches);
560 else {
561 Jim_FreeObj(interp, resultObj);
564 else {
565 Jim_SetResult(interp, resultObj);
566 result = JIM_OK;
569 return result;
572 int Jim_regexpInit(Jim_Interp *interp)
574 if (Jim_PackageProvide(interp, "regexp", "1.0", JIM_ERRMSG))
575 return JIM_ERR;
577 Jim_CreateCommand(interp, "regexp", Jim_RegexpCmd, NULL, NULL);
578 Jim_CreateCommand(interp, "regsub", Jim_RegsubCmd, NULL, NULL);
579 return JIM_OK;