Make manual page rendering easier.
[jimtcl/wkoszek.git] / jim-regexp.c
blob72d2f1f4faa1449b27f2c35e6b1c15a96e2a7d97
1 /*
2 * (c) 2008 Steve Bennett <steveb@workware.net.au>
4 * Implements the regexp and regsub commands for Jim
6 * Uses C library regcomp()/regexec() for the matching.
8 * The FreeBSD license
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above
17 * copyright notice, this list of conditions and the following
18 * disclaimer in the documentation and/or other materials
19 * provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE JIM TCL PROJECT ``AS IS'' AND ANY
22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * JIM TCL PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
26 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
30 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
32 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 * The views and conclusions contained in the software and documentation
35 * are those of the authors and should not be interpreted as representing
36 * official policies, either expressed or implied, of the Jim Tcl Project.
38 * Based on code originally from Tcl 6.7:
40 * Copyright 1987-1991 Regents of the University of California
41 * Permission to use, copy, modify, and distribute this
42 * software and its documentation for any purpose and without
43 * fee is hereby granted, provided that the above copyright
44 * notice appear in all copies. The University of California
45 * makes no representations about the suitability of this
46 * software for any purpose. It is provided "as is" without
47 * express or implied warranty.
50 #include <regex.h>
51 #include <string.h>
53 #define JIM_EXTENSION
54 #include "jim.h"
56 /* REVISIT: Would be useful in jim.h */
57 static void Jim_SetIntResult(Jim_Interp *interp, jim_wide wide)
59 Jim_SetResult(interp, Jim_NewIntObj(interp, wide));
62 /**
63 * REVISIT: Should cache a number of compiled regexps for performance reasons.
65 static regex_t *
66 compile_regexp(Jim_Interp *interp, const char *pattern, int flags)
68 int ret;
70 regex_t *result = (regex_t *)Jim_Alloc(sizeof(*result));
72 if ((ret = regcomp(result, pattern, REG_EXTENDED | flags)) != 0) {
73 char buf[100];
74 regerror(ret, result, buf, sizeof(buf));
75 Jim_SetResult(interp, Jim_NewEmptyStringObj(interp));
76 Jim_AppendStrings(interp, Jim_GetResult(interp), "couldn't compile regular expression pattern: ", buf, NULL);
77 Jim_Free(result);
78 return NULL;
80 return result;
83 int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
85 int opt_indices = 0;
86 int opt_all = 0;
87 int opt_inline = 0;
88 regex_t *regex;
89 int match, i, j;
90 long offset = 0;
91 regmatch_t *pmatch = NULL;
92 int source_len;
93 int result = JIM_OK;
94 const char *pattern;
95 const char *source_str;
96 int num_matches = 0;
97 int num_vars;
98 Jim_Obj *resultListObj = NULL;
99 int regcomp_flags = 0;
101 if (argc < 3) {
102 wrongNumArgs:
103 Jim_WrongNumArgs(interp, 1, argv, "?-nocase? ?-line? ?-indices? ?-start offset? ?-all? ?-inline? exp string ?matchVar? ?subMatchVar ...?");
104 return JIM_ERR;
107 for (i = 1; i < argc; i++) {
108 if (Jim_CompareStringImmediate(interp, argv[i], "-indices")) {
109 opt_indices = 1;
111 else if (Jim_CompareStringImmediate(interp, argv[i], "-nocase")) {
112 regcomp_flags |= REG_ICASE;
114 else if (Jim_CompareStringImmediate(interp, argv[i], "-line")) {
115 regcomp_flags |= REG_NEWLINE;
117 else if (Jim_CompareStringImmediate(interp, argv[i], "-all")) {
118 opt_all = 1;
120 else if (Jim_CompareStringImmediate(interp, argv[i], "-inline")) {
121 opt_inline = 1;
123 else if (Jim_CompareStringImmediate(interp, argv[i], "-start")) {
124 if (++i == argc) {
125 goto wrongNumArgs;
127 if (Jim_GetLong(interp, argv[i], &offset) != JIM_OK) {
128 return JIM_ERR;
131 else if (Jim_CompareStringImmediate(interp, argv[i], "--")) {
132 i++;
133 break;
135 else {
136 const char *opt = Jim_GetString(argv[i], NULL);
137 if (*opt == '-') {
138 /* Bad option */
139 goto wrongNumArgs;
141 break;
144 if (argc - i < 2) {
145 goto wrongNumArgs;
148 pattern = Jim_GetString(argv[i], NULL);
149 regex = compile_regexp(interp, pattern, regcomp_flags);
150 if (regex == NULL) {
151 return JIM_ERR;
154 source_str = Jim_GetString(argv[i + 1], &source_len);
156 num_vars = argc - i - 2;
158 if (opt_inline) {
159 if (num_vars) {
160 Jim_SetResultString(interp, "regexp match variables not allowed when using -inline", -1);
161 result = JIM_ERR;
162 goto done;
164 /* REVISIT: Ugly! */
165 num_vars = 100;
168 pmatch = Jim_Alloc((num_vars + 1) * sizeof(*pmatch));
170 /* If an offset has been specified, adjust for that now.
171 * If it points past the end of the string, point to the terminating null
173 if (offset) {
174 if (offset > source_len) {
175 source_str += source_len;
176 } else if (offset > 0) {
177 source_str += offset;
181 if (opt_inline) {
182 resultListObj = Jim_NewListObj(interp, NULL, 0);
185 next_match:
186 match = regexec(regex, source_str, num_vars + 1, pmatch, 0);
187 if (match >= REG_BADPAT) {
188 char buf[100];
189 regerror(match, regex, buf, sizeof(buf));
190 Jim_SetResultString(interp, "", 0);
191 Jim_AppendStrings(interp, Jim_GetResult(interp), "error while matching pattern: ", buf, NULL);
192 result = JIM_ERR;
193 goto done;
196 if (match == REG_NOMATCH) {
197 goto done;
200 num_matches++;
202 if (opt_all && !opt_inline) {
203 /* Just count the number of matches, so skip the substitution h*/
204 goto try_next_match;
208 * If additional variable names have been specified, return
209 * index information in those variables.
212 //fprintf(stderr, "source_str=%s, [0].rm_eo=%d\n", source_str, pmatch[0].rm_eo);
214 j = 0;
215 for (i += 2; opt_inline ? pmatch[j].rm_so != -1 : i < argc; i++, j++) {
216 Jim_Obj *resultObj;
218 if (opt_indices) {
219 resultObj = Jim_NewListObj(interp, NULL, 0);
221 else {
222 resultObj = Jim_NewStringObj(interp, "", 0);
225 if (pmatch[j].rm_so == -1) {
226 if (opt_indices) {
227 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
228 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, -1));
230 } else {
231 int len = pmatch[j].rm_eo - pmatch[j].rm_so;
232 if (opt_indices) {
233 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + pmatch[j].rm_so));
234 Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + pmatch[j].rm_so + len - 1));
235 } else {
236 Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len);
240 if (opt_inline) {
241 Jim_ListAppendElement(interp, resultListObj, resultObj);
243 else {
244 /* And now set the result variable */
245 result = Jim_SetVariable(interp, argv[i], resultObj);
247 if (result != JIM_OK) {
248 Jim_SetResult(interp, Jim_NewEmptyStringObj(interp));
249 Jim_AppendStrings(interp, Jim_GetResult(interp), "couldn't set variable \"", Jim_GetString(argv[i], NULL), "\"", NULL);
250 Jim_FreeObj(interp, resultObj);
251 break;
256 try_next_match:
257 if (opt_all && pattern[0] != '^' && *source_str) {
258 if (pmatch[0].rm_eo) {
259 source_str += pmatch[0].rm_eo;
261 else {
262 source_str++;
264 if (*source_str) {
265 goto next_match;
269 done:
270 if (result == JIM_OK) {
271 if (opt_inline) {
272 Jim_SetResult(interp, resultListObj);
274 else {
275 Jim_SetIntResult(interp, num_matches);
279 Jim_Free(pmatch);
280 regfree(regex);
281 Jim_Free(regex);
282 return result;
285 #define MAX_SUB_MATCHES 10
287 int Jim_RegsubCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv)
289 int regcomp_flags = 0;
290 int opt_all = 0;
291 long offset = 0;
292 regex_t *regex;
293 const char *p;
294 int result = JIM_ERR;
295 regmatch_t pmatch[MAX_SUB_MATCHES + 1];
296 int num_matches = 0;
298 int i;
299 Jim_Obj *varname;
300 Jim_Obj *resultObj;
301 const char *source_str;
302 int source_len;
303 const char *replace_str;
304 const char *pattern;
306 if (argc < 5) {
307 wrongNumArgs:
308 Jim_WrongNumArgs(interp, 1, argv, "?-nocase? ?-all? exp string subSpec varName");
309 return JIM_ERR;
312 for (i = 1; i < argc; i++) {
313 if (Jim_CompareStringImmediate(interp, argv[i], "-nocase")) {
314 regcomp_flags |= REG_ICASE;
316 else if (Jim_CompareStringImmediate(interp, argv[i], "-line")) {
317 regcomp_flags |= REG_NEWLINE;
319 else if (Jim_CompareStringImmediate(interp, argv[i], "-all")) {
320 opt_all = 1;
322 else if (Jim_CompareStringImmediate(interp, argv[i], "-start")) {
323 if (++i == argc) {
324 goto wrongNumArgs;
326 if (Jim_GetLong(interp, argv[i], &offset) != JIM_OK) {
327 return JIM_ERR;
330 else if (Jim_CompareStringImmediate(interp, argv[i], "--")) {
331 i++;
332 break;
334 else {
335 const char *opt = Jim_GetString(argv[i], NULL);
336 if (*opt == '-') {
337 /* Bad option */
338 goto wrongNumArgs;
340 break;
343 if (argc - i != 4) {
344 goto wrongNumArgs;
347 pattern = Jim_GetString(argv[i], NULL);
348 regex = compile_regexp(interp, pattern, regcomp_flags);
349 if (regex == NULL) {
350 return JIM_ERR;
353 source_str = Jim_GetString(argv[i + 1], &source_len);
354 replace_str = Jim_GetString(argv[i + 2], NULL);
355 varname = argv[i + 3];
357 /* Create the result string */
358 resultObj = Jim_NewStringObj(interp, "", 0);
360 /* If an offset has been specified, adjust for that now.
361 * If it points past the end of the string, point to the terminating null
363 if (offset) {
364 if (offset > source_len) {
365 offset = source_len;
366 } else if (offset < 0) {
367 offset = 0;
371 /* Copy the part before -start */
372 Jim_AppendString(interp, resultObj, source_str, offset);
375 * The following loop is to handle multiple matches within the
376 * same source string; each iteration handles one match and its
377 * corresponding substitution. If "-all" hasn't been specified
378 * then the loop body only gets executed once.
381 for (p = source_str + offset; *p != 0; ) {
382 const char *src;
383 int match = regexec(regex, p, MAX_SUB_MATCHES, pmatch, 0);
384 if (match >= REG_BADPAT) {
385 char buf[100];
386 regerror(match, regex, buf, sizeof(buf));
387 Jim_SetResultString(interp, "", 0);
388 Jim_AppendStrings(interp, Jim_GetResult(interp), "error while matching pattern: ", buf, NULL);
389 goto done;
391 if (match == REG_NOMATCH) {
392 break;
395 num_matches++;
398 * Copy the portion of the source string before the match to the
399 * result variable.
401 Jim_AppendString(interp, resultObj, p, pmatch[0].rm_so);
404 * Append the subSpec (replace_str) argument to the variable, making appropriate
405 * substitutions. This code is a bit hairy because of the backslash
406 * conventions and because the code saves up ranges of characters in
407 * subSpec to reduce the number of calls to Jim_SetVar.
410 for (src = replace_str; *src; src++) {
411 int index;
412 int c = *src;
414 if (c == '&') {
415 index = 0;
417 else if (c == '\\') {
418 c = *++src;
419 if ((c >= '0') && (c <= '9')) {
420 index = c - '0';
422 else if ((c == '\\') || (c == '&')) {
423 Jim_AppendString(interp, resultObj, src, 1);
424 continue;
426 else {
427 Jim_AppendString(interp, resultObj, src - 1, 2);
428 continue;
431 else {
432 Jim_AppendString(interp, resultObj, src, 1);
433 continue;
435 if ((index < MAX_SUB_MATCHES) && pmatch[index].rm_so != -1 && pmatch[index].rm_eo != -1) {
436 Jim_AppendString(interp, resultObj, p + pmatch[index].rm_so, pmatch[index].rm_eo - pmatch[index].rm_so);
440 p += pmatch[0].rm_eo;
442 if (!opt_all || pmatch[0].rm_eo == 0 || pattern[0] == '^') {
443 /* If we are doing a single match, or we haven't moved with this match
444 * or this is an anchored match, we stop */
445 break;
450 * Copy the portion of the string after the last match to the
451 * result variable.
453 Jim_AppendString(interp, resultObj, p, -1);
455 /* And now set the result variable */
456 result = Jim_SetVariable(interp, varname, resultObj);
458 if (result == JIM_OK) {
459 Jim_SetIntResult(interp, num_matches);
461 else {
462 Jim_SetResult(interp, Jim_NewEmptyStringObj(interp));
463 Jim_AppendStrings(interp, Jim_GetResult(interp), "couldn't set variable \"", Jim_GetString(varname, NULL), "\"", NULL);
464 Jim_FreeObj(interp, resultObj);
467 done:
468 regfree(regex);
469 Jim_Free(regex);
470 return result;
473 int Jim_OnLoad(Jim_Interp *interp)
475 Jim_InitExtension(interp);
476 if (Jim_PackageProvide(interp, "regexp", "1.0", JIM_ERRMSG) != JIM_OK) {
477 return JIM_ERR;
479 Jim_CreateCommand(interp, "regexp", Jim_RegexpCmd, NULL, NULL);
480 Jim_CreateCommand(interp, "regsub", Jim_RegsubCmd, NULL, NULL);
481 return JIM_OK;