Added information on function name added to LogRecord, and the 'extra' keyword parameter.
[python.git] / Modules / regexmodule.c
blobd44993262ea6b67163504300a4caf3b3111bfe4a
1 /*
2 XXX support range parameter on search
3 XXX support mstop parameter on search
4 */
7 /* Regular expression objects */
8 /* This uses Tatu Ylonen's copyleft-free reimplementation of
9 GNU regular expressions */
11 #include "Python.h"
13 #include <ctype.h>
15 #include "regexpr.h"
17 static PyObject *RegexError; /* Exception */
19 typedef struct {
20 PyObject_HEAD
21 struct re_pattern_buffer re_patbuf; /* The compiled expression */
22 struct re_registers re_regs; /* The registers from the last match */
23 char re_fastmap[256]; /* Storage for fastmap */
24 PyObject *re_translate; /* String object for translate table */
25 PyObject *re_lastok; /* String object last matched/searched */
26 PyObject *re_groupindex; /* Group name to index dictionary */
27 PyObject *re_givenpat; /* Pattern with symbolic groups */
28 PyObject *re_realpat; /* Pattern without symbolic groups */
29 } regexobject;
31 /* Regex object methods */
33 static void
34 reg_dealloc(regexobject *re)
36 if (re->re_patbuf.buffer)
37 free(re->re_patbuf.buffer);
38 Py_XDECREF(re->re_translate);
39 Py_XDECREF(re->re_lastok);
40 Py_XDECREF(re->re_groupindex);
41 Py_XDECREF(re->re_givenpat);
42 Py_XDECREF(re->re_realpat);
43 PyObject_Del(re);
46 static PyObject *
47 makeresult(struct re_registers *regs)
49 PyObject *v;
50 int i;
51 static PyObject *filler = NULL;
53 if (filler == NULL) {
54 filler = Py_BuildValue("(ii)", -1, -1);
55 if (filler == NULL)
56 return NULL;
58 v = PyTuple_New(RE_NREGS);
59 if (v == NULL)
60 return NULL;
62 for (i = 0; i < RE_NREGS; i++) {
63 int lo = regs->start[i];
64 int hi = regs->end[i];
65 PyObject *w;
66 if (lo == -1 && hi == -1) {
67 w = filler;
68 Py_INCREF(w);
70 else
71 w = Py_BuildValue("(ii)", lo, hi);
72 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
73 Py_DECREF(v);
74 return NULL;
77 return v;
80 static PyObject *
81 regobj_match(regexobject *re, PyObject *args)
83 PyObject *argstring;
84 char *buffer;
85 int size;
86 int offset = 0;
87 int result;
89 if (!PyArg_ParseTuple(args, "O|i:match", &argstring, &offset))
90 return NULL;
91 if (!PyArg_Parse(argstring, "t#", &buffer, &size))
92 return NULL;
94 if (offset < 0 || offset > size) {
95 PyErr_SetString(RegexError, "match offset out of range");
96 return NULL;
98 Py_XDECREF(re->re_lastok);
99 re->re_lastok = NULL;
100 result = _Py_re_match(&re->re_patbuf, (unsigned char *)buffer, size, offset,
101 &re->re_regs);
102 if (result < -1) {
103 /* Serious failure of some sort; if re_match didn't
104 set an exception, raise a generic error */
105 if (!PyErr_Occurred())
106 PyErr_SetString(RegexError, "match failure");
107 return NULL;
109 if (result >= 0) {
110 Py_INCREF(argstring);
111 re->re_lastok = argstring;
113 return PyInt_FromLong((long)result); /* Length of the match or -1 */
116 static PyObject *
117 regobj_search(regexobject *re, PyObject *args)
119 PyObject *argstring;
120 char *buffer;
121 int size;
122 int offset = 0;
123 int range;
124 int result;
126 if (!PyArg_ParseTuple(args, "O|i:search", &argstring, &offset))
127 return NULL;
128 if (!PyArg_Parse(argstring, "t#:search", &buffer, &size))
129 return NULL;
131 if (offset < 0 || offset > size) {
132 PyErr_SetString(RegexError, "search offset out of range");
133 return NULL;
135 /* NB: In Emacs 18.57, the documentation for re_search[_2] and
136 the implementation don't match: the documentation states that
137 |range| positions are tried, while the code tries |range|+1
138 positions. It seems more productive to believe the code! */
139 range = size - offset;
140 Py_XDECREF(re->re_lastok);
141 re->re_lastok = NULL;
142 result = _Py_re_search(&re->re_patbuf, (unsigned char *)buffer, size, offset, range,
143 &re->re_regs);
144 if (result < -1) {
145 /* Serious failure of some sort; if re_match didn't
146 set an exception, raise a generic error */
147 if (!PyErr_Occurred())
148 PyErr_SetString(RegexError, "match failure");
149 return NULL;
151 if (result >= 0) {
152 Py_INCREF(argstring);
153 re->re_lastok = argstring;
155 return PyInt_FromLong((long)result); /* Position of the match or -1 */
158 /* get the group from the regex where index can be a string (group name) or
159 an integer index [0 .. 99]
161 static PyObject*
162 group_from_index(regexobject *re, PyObject *index)
164 int i, a, b;
165 char *v;
167 if (PyString_Check(index))
168 if (re->re_groupindex == NULL ||
169 !(index = PyDict_GetItem(re->re_groupindex, index)))
171 PyErr_SetString(RegexError,
172 "group() group name doesn't exist");
173 return NULL;
176 i = PyInt_AsLong(index);
177 if (i == -1 && PyErr_Occurred())
178 return NULL;
180 if (i < 0 || i >= RE_NREGS) {
181 PyErr_SetString(RegexError, "group() index out of range");
182 return NULL;
184 if (re->re_lastok == NULL) {
185 PyErr_SetString(RegexError,
186 "group() only valid after successful match/search");
187 return NULL;
189 a = re->re_regs.start[i];
190 b = re->re_regs.end[i];
191 if (a < 0 || b < 0) {
192 Py_INCREF(Py_None);
193 return Py_None;
196 if (!(v = PyString_AsString(re->re_lastok)))
197 return NULL;
199 return PyString_FromStringAndSize(v+a, b-a);
203 static PyObject *
204 regobj_group(regexobject *re, PyObject *args)
206 int n = PyTuple_Size(args);
207 int i;
208 PyObject *res = NULL;
210 if (n < 0)
211 return NULL;
212 if (n == 0) {
213 PyErr_SetString(PyExc_TypeError, "not enough arguments");
214 return NULL;
216 if (n == 1) {
217 /* return value is a single string */
218 PyObject *index = PyTuple_GetItem(args, 0);
219 if (!index)
220 return NULL;
222 return group_from_index(re, index);
225 /* return value is a tuple */
226 if (!(res = PyTuple_New(n)))
227 return NULL;
229 for (i = 0; i < n; i++) {
230 PyObject *index = PyTuple_GetItem(args, i);
231 PyObject *group = NULL;
233 if (!index)
234 goto finally;
235 if (!(group = group_from_index(re, index)))
236 goto finally;
237 if (PyTuple_SetItem(res, i, group) < 0)
238 goto finally;
240 return res;
242 finally:
243 Py_DECREF(res);
244 return NULL;
248 static struct PyMethodDef reg_methods[] = {
249 {"match", (PyCFunction)regobj_match, METH_VARARGS},
250 {"search", (PyCFunction)regobj_search, METH_VARARGS},
251 {"group", (PyCFunction)regobj_group, METH_VARARGS},
252 {NULL, NULL} /* sentinel */
257 static char* members[] = {
258 "last", "regs", "translate",
259 "groupindex", "realpat", "givenpat",
260 NULL
264 static PyObject *
265 regobj_getattr(regexobject *re, char *name)
267 if (strcmp(name, "regs") == 0) {
268 if (re->re_lastok == NULL) {
269 Py_INCREF(Py_None);
270 return Py_None;
272 return makeresult(&re->re_regs);
274 if (strcmp(name, "last") == 0) {
275 if (re->re_lastok == NULL) {
276 Py_INCREF(Py_None);
277 return Py_None;
279 Py_INCREF(re->re_lastok);
280 return re->re_lastok;
282 if (strcmp(name, "translate") == 0) {
283 if (re->re_translate == NULL) {
284 Py_INCREF(Py_None);
285 return Py_None;
287 Py_INCREF(re->re_translate);
288 return re->re_translate;
290 if (strcmp(name, "groupindex") == 0) {
291 if (re->re_groupindex == NULL) {
292 Py_INCREF(Py_None);
293 return Py_None;
295 Py_INCREF(re->re_groupindex);
296 return re->re_groupindex;
298 if (strcmp(name, "realpat") == 0) {
299 if (re->re_realpat == NULL) {
300 Py_INCREF(Py_None);
301 return Py_None;
303 Py_INCREF(re->re_realpat);
304 return re->re_realpat;
306 if (strcmp(name, "givenpat") == 0) {
307 if (re->re_givenpat == NULL) {
308 Py_INCREF(Py_None);
309 return Py_None;
311 Py_INCREF(re->re_givenpat);
312 return re->re_givenpat;
314 if (strcmp(name, "__members__") == 0) {
315 int i = 0;
316 PyObject *list = NULL;
318 /* okay, so it's unlikely this list will change that often.
319 still, it's easier to change it in just one place.
321 while (members[i])
322 i++;
323 if (!(list = PyList_New(i)))
324 return NULL;
326 i = 0;
327 while (members[i]) {
328 PyObject* v = PyString_FromString(members[i]);
329 if (!v || PyList_SetItem(list, i, v) < 0) {
330 Py_DECREF(list);
331 return NULL;
333 i++;
335 return list;
337 return Py_FindMethod(reg_methods, (PyObject *)re, name);
340 static PyTypeObject Regextype = {
341 PyObject_HEAD_INIT(NULL)
342 0, /*ob_size*/
343 "regex.regex", /*tp_name*/
344 sizeof(regexobject), /*tp_size*/
345 0, /*tp_itemsize*/
346 /* methods */
347 (destructor)reg_dealloc, /*tp_dealloc*/
348 0, /*tp_print*/
349 (getattrfunc)regobj_getattr, /*tp_getattr*/
350 0, /*tp_setattr*/
351 0, /*tp_compare*/
352 0, /*tp_repr*/
355 /* reference counting invariants:
356 pattern: borrowed
357 translate: borrowed
358 givenpat: borrowed
359 groupindex: transferred
361 static PyObject *
362 newregexobject(PyObject *pattern, PyObject *translate, PyObject *givenpat, PyObject *groupindex)
364 regexobject *re;
365 char *pat;
366 int size;
368 if (!PyArg_Parse(pattern, "t#", &pat, &size))
369 return NULL;
371 if (translate != NULL && PyString_Size(translate) != 256) {
372 PyErr_SetString(RegexError,
373 "translation table must be 256 bytes");
374 return NULL;
376 re = PyObject_New(regexobject, &Regextype);
377 if (re != NULL) {
378 char *error;
379 re->re_patbuf.buffer = NULL;
380 re->re_patbuf.allocated = 0;
381 re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
382 if (translate) {
383 re->re_patbuf.translate = (unsigned char *)PyString_AsString(translate);
384 if (!re->re_patbuf.translate)
385 goto finally;
386 Py_INCREF(translate);
388 else
389 re->re_patbuf.translate = NULL;
390 re->re_translate = translate;
391 re->re_lastok = NULL;
392 re->re_groupindex = groupindex;
393 Py_INCREF(pattern);
394 re->re_realpat = pattern;
395 Py_INCREF(givenpat);
396 re->re_givenpat = givenpat;
397 error = _Py_re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
398 if (error != NULL) {
399 PyErr_SetString(RegexError, error);
400 goto finally;
403 return (PyObject *)re;
404 finally:
405 Py_DECREF(re);
406 return NULL;
409 static PyObject *
410 regex_compile(PyObject *self, PyObject *args)
412 PyObject *pat = NULL;
413 PyObject *tran = NULL;
415 if (!PyArg_ParseTuple(args, "S|S:compile", &pat, &tran))
416 return NULL;
417 return newregexobject(pat, tran, pat, NULL);
420 static PyObject *
421 symcomp(PyObject *pattern, PyObject *gdict)
423 char *opat, *oend, *o, *n, *g, *v;
424 int group_count = 0;
425 int sz;
426 int escaped = 0;
427 char name_buf[128];
428 PyObject *npattern;
429 int require_escape = re_syntax & RE_NO_BK_PARENS ? 0 : 1;
431 if (!(opat = PyString_AsString(pattern)))
432 return NULL;
434 if ((sz = PyString_Size(pattern)) < 0)
435 return NULL;
437 oend = opat + sz;
438 o = opat;
440 if (oend == opat) {
441 Py_INCREF(pattern);
442 return pattern;
445 if (!(npattern = PyString_FromStringAndSize((char*)NULL, sz)) ||
446 !(n = PyString_AsString(npattern)))
447 return NULL;
449 while (o < oend) {
450 if (*o == '(' && escaped == require_escape) {
451 char *backtrack;
452 escaped = 0;
453 ++group_count;
454 *n++ = *o;
455 if (++o >= oend || *o != '<')
456 continue;
457 /* *o == '<' */
458 if (o+1 < oend && *(o+1) == '>')
459 continue;
460 backtrack = o;
461 g = name_buf;
462 for (++o; o < oend;) {
463 if (*o == '>') {
464 PyObject *group_name = NULL;
465 PyObject *group_index = NULL;
466 *g++ = '\0';
467 group_name = PyString_FromString(name_buf);
468 group_index = PyInt_FromLong(group_count);
469 if (group_name == NULL ||
470 group_index == NULL ||
471 PyDict_SetItem(gdict, group_name,
472 group_index) != 0)
474 Py_XDECREF(group_name);
475 Py_XDECREF(group_index);
476 Py_XDECREF(npattern);
477 return NULL;
479 Py_DECREF(group_name);
480 Py_DECREF(group_index);
481 ++o; /* eat the '>' */
482 break;
484 if (!isalnum(Py_CHARMASK(*o)) && *o != '_') {
485 o = backtrack;
486 break;
488 *g++ = *o++;
491 else if (*o == '[' && !escaped) {
492 *n++ = *o;
493 ++o; /* eat the char following '[' */
494 *n++ = *o;
495 while (o < oend && *o != ']') {
496 ++o;
497 *n++ = *o;
499 if (o < oend)
500 ++o;
502 else if (*o == '\\') {
503 escaped = 1;
504 *n++ = *o;
505 ++o;
507 else {
508 escaped = 0;
509 *n++ = *o;
510 ++o;
514 if (!(v = PyString_AsString(npattern))) {
515 Py_DECREF(npattern);
516 return NULL;
518 /* _PyString_Resize() decrements npattern on failure */
519 _PyString_Resize(&npattern, n - v);
520 return npattern;
524 static PyObject *
525 regex_symcomp(PyObject *self, PyObject *args)
527 PyObject *pattern;
528 PyObject *tran = NULL;
529 PyObject *gdict = NULL;
530 PyObject *npattern;
531 PyObject *retval = NULL;
533 if (!PyArg_ParseTuple(args, "S|S:symcomp", &pattern, &tran))
534 return NULL;
536 gdict = PyDict_New();
537 if (gdict == NULL || (npattern = symcomp(pattern, gdict)) == NULL) {
538 Py_DECREF(gdict);
539 Py_DECREF(pattern);
540 return NULL;
542 retval = newregexobject(npattern, tran, pattern, gdict);
543 Py_DECREF(npattern);
544 return retval;
548 static PyObject *cache_pat;
549 static PyObject *cache_prog;
551 static int
552 update_cache(PyObject *pat)
554 PyObject *tuple = PyTuple_Pack(1, pat);
555 int status = 0;
557 if (!tuple)
558 return -1;
560 if (pat != cache_pat) {
561 Py_XDECREF(cache_pat);
562 cache_pat = NULL;
563 Py_XDECREF(cache_prog);
564 cache_prog = regex_compile((PyObject *)NULL, tuple);
565 if (cache_prog == NULL) {
566 status = -1;
567 goto finally;
569 cache_pat = pat;
570 Py_INCREF(cache_pat);
572 finally:
573 Py_DECREF(tuple);
574 return status;
577 static PyObject *
578 regex_match(PyObject *self, PyObject *args)
580 PyObject *pat, *string;
581 PyObject *tuple, *v;
583 if (!PyArg_ParseTuple(args, "SS:match", &pat, &string))
584 return NULL;
585 if (update_cache(pat) < 0)
586 return NULL;
588 if (!(tuple = Py_BuildValue("(S)", string)))
589 return NULL;
590 v = regobj_match((regexobject *)cache_prog, tuple);
591 Py_DECREF(tuple);
592 return v;
595 static PyObject *
596 regex_search(PyObject *self, PyObject *args)
598 PyObject *pat, *string;
599 PyObject *tuple, *v;
601 if (!PyArg_ParseTuple(args, "SS:search", &pat, &string))
602 return NULL;
603 if (update_cache(pat) < 0)
604 return NULL;
606 if (!(tuple = Py_BuildValue("(S)", string)))
607 return NULL;
608 v = regobj_search((regexobject *)cache_prog, tuple);
609 Py_DECREF(tuple);
610 return v;
613 static PyObject *
614 regex_set_syntax(PyObject *self, PyObject *args)
616 int syntax;
617 if (!PyArg_ParseTuple(args, "i:set_syntax", &syntax))
618 return NULL;
619 syntax = re_set_syntax(syntax);
620 /* wipe the global pattern cache */
621 Py_XDECREF(cache_pat);
622 cache_pat = NULL;
623 Py_XDECREF(cache_prog);
624 cache_prog = NULL;
625 return PyInt_FromLong((long)syntax);
628 static PyObject *
629 regex_get_syntax(PyObject *self)
631 return PyInt_FromLong((long)re_syntax);
635 static struct PyMethodDef regex_global_methods[] = {
636 {"compile", regex_compile, METH_VARARGS},
637 {"symcomp", regex_symcomp, METH_VARARGS},
638 {"match", regex_match, METH_VARARGS},
639 {"search", regex_search, METH_VARARGS},
640 {"set_syntax", regex_set_syntax, METH_VARARGS},
641 {"get_syntax", (PyCFunction)regex_get_syntax, METH_NOARGS},
642 {NULL, NULL} /* sentinel */
645 PyMODINIT_FUNC
646 initregex(void)
648 PyObject *m, *d, *v;
649 int i;
650 char *s;
652 /* Initialize object type */
653 Regextype.ob_type = &PyType_Type;
655 m = Py_InitModule("regex", regex_global_methods);
656 if (m == NULL)
657 return;
658 d = PyModule_GetDict(m);
660 if (PyErr_Warn(PyExc_DeprecationWarning,
661 "the regex module is deprecated; "
662 "please use the re module") < 0)
663 return;
665 /* Initialize regex.error exception */
666 v = RegexError = PyErr_NewException("regex.error", NULL, NULL);
667 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
668 goto finally;
670 /* Initialize regex.casefold constant */
671 if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
672 goto finally;
674 if (!(s = PyString_AsString(v)))
675 goto finally;
677 for (i = 0; i < 256; i++) {
678 if (isupper(i))
679 s[i] = tolower(i);
680 else
681 s[i] = i;
683 if (PyDict_SetItemString(d, "casefold", v) < 0)
684 goto finally;
685 Py_DECREF(v);
687 if (!PyErr_Occurred())
688 return;
689 finally:
690 /* Nothing */ ;