winegstreamer: Pass desired input queue length to wg_transform_create.
[wine.git] / libs / xml2 / xmlregexp.c
bloba2a36c499183064621457d8d6175aa5722214840
1 /*
2 * regexp.c: generic and extensible Regular Expression engine
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
12 * See Copyright for the status of this software.
14 * Daniel Veillard <veillard@redhat.com>
17 #define IN_LIBXML
18 #include "libxml.h"
20 #ifdef LIBXML_REGEXP_ENABLED
22 /* #define DEBUG_ERR */
24 #include <stdio.h>
25 #include <string.h>
26 #include <limits.h>
28 #include <libxml/tree.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/xmlregexp.h>
31 #include <libxml/xmlautomata.h>
32 #include <libxml/xmlunicode.h>
34 #ifndef SIZE_MAX
35 #define SIZE_MAX ((size_t) -1)
36 #endif
38 /* #define DEBUG_REGEXP_GRAPH */
39 /* #define DEBUG_REGEXP_EXEC */
40 /* #define DEBUG_PUSH */
41 /* #define DEBUG_COMPACTION */
43 #define MAX_PUSH 10000000
45 #ifdef ERROR
46 #undef ERROR
47 #endif
48 #define ERROR(str) \
49 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
50 xmlRegexpErrCompile(ctxt, str);
51 #define NEXT ctxt->cur++
52 #define CUR (*(ctxt->cur))
53 #define NXT(index) (ctxt->cur[index])
55 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
56 #define NEXTL(l) ctxt->cur += l;
57 #define XML_REG_STRING_SEPARATOR '|'
59 * Need PREV to check on a '-' within a Character Group. May only be used
60 * when it's guaranteed that cur is not at the beginning of ctxt->string!
62 #define PREV (ctxt->cur[-1])
64 /**
65 * TODO:
67 * macro to flag unimplemented blocks
69 #define TODO \
70 xmlGenericError(xmlGenericErrorContext, \
71 "Unimplemented block at %s:%d\n", \
72 __FILE__, __LINE__);
74 /************************************************************************
75 * *
76 * Datatypes and structures *
77 * *
78 ************************************************************************/
81 * Note: the order of the enums below is significant, do not shuffle
83 typedef enum {
84 XML_REGEXP_EPSILON = 1,
85 XML_REGEXP_CHARVAL,
86 XML_REGEXP_RANGES,
87 XML_REGEXP_SUBREG, /* used for () sub regexps */
88 XML_REGEXP_STRING,
89 XML_REGEXP_ANYCHAR, /* . */
90 XML_REGEXP_ANYSPACE, /* \s */
91 XML_REGEXP_NOTSPACE, /* \S */
92 XML_REGEXP_INITNAME, /* \l */
93 XML_REGEXP_NOTINITNAME, /* \L */
94 XML_REGEXP_NAMECHAR, /* \c */
95 XML_REGEXP_NOTNAMECHAR, /* \C */
96 XML_REGEXP_DECIMAL, /* \d */
97 XML_REGEXP_NOTDECIMAL, /* \D */
98 XML_REGEXP_REALCHAR, /* \w */
99 XML_REGEXP_NOTREALCHAR, /* \W */
100 XML_REGEXP_LETTER = 100,
101 XML_REGEXP_LETTER_UPPERCASE,
102 XML_REGEXP_LETTER_LOWERCASE,
103 XML_REGEXP_LETTER_TITLECASE,
104 XML_REGEXP_LETTER_MODIFIER,
105 XML_REGEXP_LETTER_OTHERS,
106 XML_REGEXP_MARK,
107 XML_REGEXP_MARK_NONSPACING,
108 XML_REGEXP_MARK_SPACECOMBINING,
109 XML_REGEXP_MARK_ENCLOSING,
110 XML_REGEXP_NUMBER,
111 XML_REGEXP_NUMBER_DECIMAL,
112 XML_REGEXP_NUMBER_LETTER,
113 XML_REGEXP_NUMBER_OTHERS,
114 XML_REGEXP_PUNCT,
115 XML_REGEXP_PUNCT_CONNECTOR,
116 XML_REGEXP_PUNCT_DASH,
117 XML_REGEXP_PUNCT_OPEN,
118 XML_REGEXP_PUNCT_CLOSE,
119 XML_REGEXP_PUNCT_INITQUOTE,
120 XML_REGEXP_PUNCT_FINQUOTE,
121 XML_REGEXP_PUNCT_OTHERS,
122 XML_REGEXP_SEPAR,
123 XML_REGEXP_SEPAR_SPACE,
124 XML_REGEXP_SEPAR_LINE,
125 XML_REGEXP_SEPAR_PARA,
126 XML_REGEXP_SYMBOL,
127 XML_REGEXP_SYMBOL_MATH,
128 XML_REGEXP_SYMBOL_CURRENCY,
129 XML_REGEXP_SYMBOL_MODIFIER,
130 XML_REGEXP_SYMBOL_OTHERS,
131 XML_REGEXP_OTHER,
132 XML_REGEXP_OTHER_CONTROL,
133 XML_REGEXP_OTHER_FORMAT,
134 XML_REGEXP_OTHER_PRIVATE,
135 XML_REGEXP_OTHER_NA,
136 XML_REGEXP_BLOCK_NAME
137 } xmlRegAtomType;
139 typedef enum {
140 XML_REGEXP_QUANT_EPSILON = 1,
141 XML_REGEXP_QUANT_ONCE,
142 XML_REGEXP_QUANT_OPT,
143 XML_REGEXP_QUANT_MULT,
144 XML_REGEXP_QUANT_PLUS,
145 XML_REGEXP_QUANT_ONCEONLY,
146 XML_REGEXP_QUANT_ALL,
147 XML_REGEXP_QUANT_RANGE
148 } xmlRegQuantType;
150 typedef enum {
151 XML_REGEXP_START_STATE = 1,
152 XML_REGEXP_FINAL_STATE,
153 XML_REGEXP_TRANS_STATE,
154 XML_REGEXP_SINK_STATE,
155 XML_REGEXP_UNREACH_STATE
156 } xmlRegStateType;
158 typedef enum {
159 XML_REGEXP_MARK_NORMAL = 0,
160 XML_REGEXP_MARK_START,
161 XML_REGEXP_MARK_VISITED
162 } xmlRegMarkedType;
164 typedef struct _xmlRegRange xmlRegRange;
165 typedef xmlRegRange *xmlRegRangePtr;
167 struct _xmlRegRange {
168 int neg; /* 0 normal, 1 not, 2 exclude */
169 xmlRegAtomType type;
170 int start;
171 int end;
172 xmlChar *blockName;
175 typedef struct _xmlRegAtom xmlRegAtom;
176 typedef xmlRegAtom *xmlRegAtomPtr;
178 typedef struct _xmlAutomataState xmlRegState;
179 typedef xmlRegState *xmlRegStatePtr;
181 struct _xmlRegAtom {
182 int no;
183 xmlRegAtomType type;
184 xmlRegQuantType quant;
185 int min;
186 int max;
188 void *valuep;
189 void *valuep2;
190 int neg;
191 int codepoint;
192 xmlRegStatePtr start;
193 xmlRegStatePtr start0;
194 xmlRegStatePtr stop;
195 int maxRanges;
196 int nbRanges;
197 xmlRegRangePtr *ranges;
198 void *data;
201 typedef struct _xmlRegCounter xmlRegCounter;
202 typedef xmlRegCounter *xmlRegCounterPtr;
204 struct _xmlRegCounter {
205 int min;
206 int max;
209 typedef struct _xmlRegTrans xmlRegTrans;
210 typedef xmlRegTrans *xmlRegTransPtr;
212 struct _xmlRegTrans {
213 xmlRegAtomPtr atom;
214 int to;
215 int counter;
216 int count;
217 int nd;
220 struct _xmlAutomataState {
221 xmlRegStateType type;
222 xmlRegMarkedType mark;
223 xmlRegMarkedType markd;
224 xmlRegMarkedType reached;
225 int no;
226 int maxTrans;
227 int nbTrans;
228 xmlRegTrans *trans;
229 /* knowing states pointing to us can speed things up */
230 int maxTransTo;
231 int nbTransTo;
232 int *transTo;
235 typedef struct _xmlAutomata xmlRegParserCtxt;
236 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
238 #define AM_AUTOMATA_RNG 1
240 struct _xmlAutomata {
241 xmlChar *string;
242 xmlChar *cur;
244 int error;
245 int neg;
247 xmlRegStatePtr start;
248 xmlRegStatePtr end;
249 xmlRegStatePtr state;
251 xmlRegAtomPtr atom;
253 int maxAtoms;
254 int nbAtoms;
255 xmlRegAtomPtr *atoms;
257 int maxStates;
258 int nbStates;
259 xmlRegStatePtr *states;
261 int maxCounters;
262 int nbCounters;
263 xmlRegCounter *counters;
265 int determinist;
266 int negs;
267 int flags;
269 int depth;
272 struct _xmlRegexp {
273 xmlChar *string;
274 int nbStates;
275 xmlRegStatePtr *states;
276 int nbAtoms;
277 xmlRegAtomPtr *atoms;
278 int nbCounters;
279 xmlRegCounter *counters;
280 int determinist;
281 int flags;
283 * That's the compact form for determinists automatas
285 int nbstates;
286 int *compact;
287 void **transdata;
288 int nbstrings;
289 xmlChar **stringMap;
292 typedef struct _xmlRegExecRollback xmlRegExecRollback;
293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
295 struct _xmlRegExecRollback {
296 xmlRegStatePtr state;/* the current state */
297 int index; /* the index in the input stack */
298 int nextbranch; /* the next transition to explore in that state */
299 int *counts; /* save the automata state if it has some */
302 typedef struct _xmlRegInputToken xmlRegInputToken;
303 typedef xmlRegInputToken *xmlRegInputTokenPtr;
305 struct _xmlRegInputToken {
306 xmlChar *value;
307 void *data;
310 struct _xmlRegExecCtxt {
311 int status; /* execution status != 0 indicate an error */
312 int determinist; /* did we find an indeterministic behaviour */
313 xmlRegexpPtr comp; /* the compiled regexp */
314 xmlRegExecCallbacks callback;
315 void *data;
317 xmlRegStatePtr state;/* the current state */
318 int transno; /* the current transition on that state */
319 int transcount; /* the number of chars in char counted transitions */
322 * A stack of rollback states
324 int maxRollbacks;
325 int nbRollbacks;
326 xmlRegExecRollback *rollbacks;
329 * The state of the automata if any
331 int *counts;
334 * The input stack
336 int inputStackMax;
337 int inputStackNr;
338 int index;
339 int *charStack;
340 const xmlChar *inputString; /* when operating on characters */
341 xmlRegInputTokenPtr inputStack;/* when operating on strings */
344 * error handling
346 int errStateNo; /* the error state number */
347 xmlRegStatePtr errState; /* the error state */
348 xmlChar *errString; /* the string raising the error */
349 int *errCounts; /* counters at the error state */
350 int nbPush;
353 #define REGEXP_ALL_COUNTER 0x123456
354 #define REGEXP_ALL_LAX_COUNTER 0x123457
356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
357 static void xmlRegFreeState(xmlRegStatePtr state);
358 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362 int neg, int start, int end, const xmlChar *blockName);
364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
366 /************************************************************************
368 * Regexp memory error handler *
370 ************************************************************************/
372 * xmlRegexpErrMemory:
373 * @extra: extra information
375 * Handle an out of memory condition
377 static void
378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
380 const char *regexp = NULL;
381 if (ctxt != NULL) {
382 regexp = (const char *) ctxt->string;
383 ctxt->error = XML_ERR_NO_MEMORY;
385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
387 regexp, NULL, 0, 0,
388 "Memory allocation failed : %s\n", extra);
392 * xmlRegexpErrCompile:
393 * @extra: extra information
395 * Handle a compilation failure
397 static void
398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
400 const char *regexp = NULL;
401 int idx = 0;
403 if (ctxt != NULL) {
404 regexp = (const char *) ctxt->string;
405 idx = ctxt->cur - ctxt->string;
406 ctxt->error = XML_REGEXP_COMPILE_ERROR;
408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
410 regexp, NULL, idx, 0,
411 "failed to compile: %s\n", extra);
414 /************************************************************************
416 * Allocation/Deallocation *
418 ************************************************************************/
420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
423 * xmlRegCalloc2:
424 * @dim1: size of first dimension
425 * @dim2: size of second dimension
426 * @elemSize: size of element
428 * Allocate a two-dimensional array and set all elements to zero.
430 * Returns the new array or NULL in case of error.
432 static void*
433 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
434 size_t totalSize;
435 void *ret;
437 /* Check for overflow */
438 if (dim1 > SIZE_MAX / dim2 / elemSize)
439 return (NULL);
440 totalSize = dim1 * dim2 * elemSize;
441 ret = xmlMalloc(totalSize);
442 if (ret != NULL)
443 memset(ret, 0, totalSize);
444 return (ret);
448 * xmlRegEpxFromParse:
449 * @ctxt: the parser context used to build it
451 * Allocate a new regexp and fill it with the result from the parser
453 * Returns the new regexp or NULL in case of error
455 static xmlRegexpPtr
456 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
457 xmlRegexpPtr ret;
459 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
460 if (ret == NULL) {
461 xmlRegexpErrMemory(ctxt, "compiling regexp");
462 return(NULL);
464 memset(ret, 0, sizeof(xmlRegexp));
465 ret->string = ctxt->string;
466 ret->nbStates = ctxt->nbStates;
467 ret->states = ctxt->states;
468 ret->nbAtoms = ctxt->nbAtoms;
469 ret->atoms = ctxt->atoms;
470 ret->nbCounters = ctxt->nbCounters;
471 ret->counters = ctxt->counters;
472 ret->determinist = ctxt->determinist;
473 ret->flags = ctxt->flags;
474 if (ret->determinist == -1) {
475 xmlRegexpIsDeterminist(ret);
478 if ((ret->determinist != 0) &&
479 (ret->nbCounters == 0) &&
480 (ctxt->negs == 0) &&
481 (ret->atoms != NULL) &&
482 (ret->atoms[0] != NULL) &&
483 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
484 int i, j, nbstates = 0, nbatoms = 0;
485 int *stateRemap;
486 int *stringRemap;
487 int *transitions;
488 void **transdata;
489 xmlChar **stringMap;
490 xmlChar *value;
493 * Switch to a compact representation
494 * 1/ counting the effective number of states left
495 * 2/ counting the unique number of atoms, and check that
496 * they are all of the string type
497 * 3/ build a table state x atom for the transitions
500 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
501 if (stateRemap == NULL) {
502 xmlRegexpErrMemory(ctxt, "compiling regexp");
503 xmlFree(ret);
504 return(NULL);
506 for (i = 0;i < ret->nbStates;i++) {
507 if (ret->states[i] != NULL) {
508 stateRemap[i] = nbstates;
509 nbstates++;
510 } else {
511 stateRemap[i] = -1;
514 #ifdef DEBUG_COMPACTION
515 printf("Final: %d states\n", nbstates);
516 #endif
517 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
518 if (stringMap == NULL) {
519 xmlRegexpErrMemory(ctxt, "compiling regexp");
520 xmlFree(stateRemap);
521 xmlFree(ret);
522 return(NULL);
524 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
525 if (stringRemap == NULL) {
526 xmlRegexpErrMemory(ctxt, "compiling regexp");
527 xmlFree(stringMap);
528 xmlFree(stateRemap);
529 xmlFree(ret);
530 return(NULL);
532 for (i = 0;i < ret->nbAtoms;i++) {
533 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
534 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
535 value = ret->atoms[i]->valuep;
536 for (j = 0;j < nbatoms;j++) {
537 if (xmlStrEqual(stringMap[j], value)) {
538 stringRemap[i] = j;
539 break;
542 if (j >= nbatoms) {
543 stringRemap[i] = nbatoms;
544 stringMap[nbatoms] = xmlStrdup(value);
545 if (stringMap[nbatoms] == NULL) {
546 for (i = 0;i < nbatoms;i++)
547 xmlFree(stringMap[i]);
548 xmlFree(stringRemap);
549 xmlFree(stringMap);
550 xmlFree(stateRemap);
551 xmlFree(ret);
552 return(NULL);
554 nbatoms++;
556 } else {
557 xmlFree(stateRemap);
558 xmlFree(stringRemap);
559 for (i = 0;i < nbatoms;i++)
560 xmlFree(stringMap[i]);
561 xmlFree(stringMap);
562 xmlFree(ret);
563 return(NULL);
566 #ifdef DEBUG_COMPACTION
567 printf("Final: %d atoms\n", nbatoms);
568 #endif
569 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
570 sizeof(int));
571 if (transitions == NULL) {
572 xmlFree(stateRemap);
573 xmlFree(stringRemap);
574 for (i = 0;i < nbatoms;i++)
575 xmlFree(stringMap[i]);
576 xmlFree(stringMap);
577 xmlFree(ret);
578 return(NULL);
582 * Allocate the transition table. The first entry for each
583 * state corresponds to the state type.
585 transdata = NULL;
587 for (i = 0;i < ret->nbStates;i++) {
588 int stateno, atomno, targetno, prev;
589 xmlRegStatePtr state;
590 xmlRegTransPtr trans;
592 stateno = stateRemap[i];
593 if (stateno == -1)
594 continue;
595 state = ret->states[i];
597 transitions[stateno * (nbatoms + 1)] = state->type;
599 for (j = 0;j < state->nbTrans;j++) {
600 trans = &(state->trans[j]);
601 if ((trans->to == -1) || (trans->atom == NULL))
602 continue;
603 atomno = stringRemap[trans->atom->no];
604 if ((trans->atom->data != NULL) && (transdata == NULL)) {
605 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
606 sizeof(void *));
607 if (transdata == NULL) {
608 xmlRegexpErrMemory(ctxt, "compiling regexp");
609 break;
612 targetno = stateRemap[trans->to];
614 * if the same atom can generate transitions to 2 different
615 * states then it means the automata is not deterministic and
616 * the compact form can't be used !
618 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
619 if (prev != 0) {
620 if (prev != targetno + 1) {
621 ret->determinist = 0;
622 #ifdef DEBUG_COMPACTION
623 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
624 i, j, trans->atom->no, trans->to, atomno, targetno);
625 printf(" previous to is %d\n", prev);
626 #endif
627 if (transdata != NULL)
628 xmlFree(transdata);
629 xmlFree(transitions);
630 xmlFree(stateRemap);
631 xmlFree(stringRemap);
632 for (i = 0;i < nbatoms;i++)
633 xmlFree(stringMap[i]);
634 xmlFree(stringMap);
635 goto not_determ;
637 } else {
638 #if 0
639 printf("State %d trans %d: atom %d to %d : %d to %d\n",
640 i, j, trans->atom->no, trans->to, atomno, targetno);
641 #endif
642 transitions[stateno * (nbatoms + 1) + atomno + 1] =
643 targetno + 1; /* to avoid 0 */
644 if (transdata != NULL)
645 transdata[stateno * nbatoms + atomno] =
646 trans->atom->data;
650 ret->determinist = 1;
651 #ifdef DEBUG_COMPACTION
653 * Debug
655 for (i = 0;i < nbstates;i++) {
656 for (j = 0;j < nbatoms + 1;j++) {
657 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
659 printf("\n");
661 printf("\n");
662 #endif
664 * Cleanup of the old data
666 if (ret->states != NULL) {
667 for (i = 0;i < ret->nbStates;i++)
668 xmlRegFreeState(ret->states[i]);
669 xmlFree(ret->states);
671 ret->states = NULL;
672 ret->nbStates = 0;
673 if (ret->atoms != NULL) {
674 for (i = 0;i < ret->nbAtoms;i++)
675 xmlRegFreeAtom(ret->atoms[i]);
676 xmlFree(ret->atoms);
678 ret->atoms = NULL;
679 ret->nbAtoms = 0;
681 ret->compact = transitions;
682 ret->transdata = transdata;
683 ret->stringMap = stringMap;
684 ret->nbstrings = nbatoms;
685 ret->nbstates = nbstates;
686 xmlFree(stateRemap);
687 xmlFree(stringRemap);
689 not_determ:
690 ctxt->string = NULL;
691 ctxt->nbStates = 0;
692 ctxt->states = NULL;
693 ctxt->nbAtoms = 0;
694 ctxt->atoms = NULL;
695 ctxt->nbCounters = 0;
696 ctxt->counters = NULL;
697 return(ret);
701 * xmlRegNewParserCtxt:
702 * @string: the string to parse
704 * Allocate a new regexp parser context
706 * Returns the new context or NULL in case of error
708 static xmlRegParserCtxtPtr
709 xmlRegNewParserCtxt(const xmlChar *string) {
710 xmlRegParserCtxtPtr ret;
712 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
713 if (ret == NULL)
714 return(NULL);
715 memset(ret, 0, sizeof(xmlRegParserCtxt));
716 if (string != NULL)
717 ret->string = xmlStrdup(string);
718 ret->cur = ret->string;
719 ret->neg = 0;
720 ret->negs = 0;
721 ret->error = 0;
722 ret->determinist = -1;
723 return(ret);
727 * xmlRegNewRange:
728 * @ctxt: the regexp parser context
729 * @neg: is that negative
730 * @type: the type of range
731 * @start: the start codepoint
732 * @end: the end codepoint
734 * Allocate a new regexp range
736 * Returns the new range or NULL in case of error
738 static xmlRegRangePtr
739 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
740 int neg, xmlRegAtomType type, int start, int end) {
741 xmlRegRangePtr ret;
743 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
744 if (ret == NULL) {
745 xmlRegexpErrMemory(ctxt, "allocating range");
746 return(NULL);
748 ret->neg = neg;
749 ret->type = type;
750 ret->start = start;
751 ret->end = end;
752 return(ret);
756 * xmlRegFreeRange:
757 * @range: the regexp range
759 * Free a regexp range
761 static void
762 xmlRegFreeRange(xmlRegRangePtr range) {
763 if (range == NULL)
764 return;
766 if (range->blockName != NULL)
767 xmlFree(range->blockName);
768 xmlFree(range);
772 * xmlRegCopyRange:
773 * @range: the regexp range
775 * Copy a regexp range
777 * Returns the new copy or NULL in case of error.
779 static xmlRegRangePtr
780 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
781 xmlRegRangePtr ret;
783 if (range == NULL)
784 return(NULL);
786 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
787 range->end);
788 if (ret == NULL)
789 return(NULL);
790 if (range->blockName != NULL) {
791 ret->blockName = xmlStrdup(range->blockName);
792 if (ret->blockName == NULL) {
793 xmlRegexpErrMemory(ctxt, "allocating range");
794 xmlRegFreeRange(ret);
795 return(NULL);
798 return(ret);
802 * xmlRegNewAtom:
803 * @ctxt: the regexp parser context
804 * @type: the type of atom
806 * Allocate a new atom
808 * Returns the new atom or NULL in case of error
810 static xmlRegAtomPtr
811 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
812 xmlRegAtomPtr ret;
814 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
815 if (ret == NULL) {
816 xmlRegexpErrMemory(ctxt, "allocating atom");
817 return(NULL);
819 memset(ret, 0, sizeof(xmlRegAtom));
820 ret->type = type;
821 ret->quant = XML_REGEXP_QUANT_ONCE;
822 ret->min = 0;
823 ret->max = 0;
824 return(ret);
828 * xmlRegFreeAtom:
829 * @atom: the regexp atom
831 * Free a regexp atom
833 static void
834 xmlRegFreeAtom(xmlRegAtomPtr atom) {
835 int i;
837 if (atom == NULL)
838 return;
840 for (i = 0;i < atom->nbRanges;i++)
841 xmlRegFreeRange(atom->ranges[i]);
842 if (atom->ranges != NULL)
843 xmlFree(atom->ranges);
844 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
845 xmlFree(atom->valuep);
846 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
847 xmlFree(atom->valuep2);
848 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
849 xmlFree(atom->valuep);
850 xmlFree(atom);
854 * xmlRegCopyAtom:
855 * @ctxt: the regexp parser context
856 * @atom: the original atom
858 * Allocate a new regexp range
860 * Returns the new atom or NULL in case of error
862 static xmlRegAtomPtr
863 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
864 xmlRegAtomPtr ret;
866 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
867 if (ret == NULL) {
868 xmlRegexpErrMemory(ctxt, "copying atom");
869 return(NULL);
871 memset(ret, 0, sizeof(xmlRegAtom));
872 ret->type = atom->type;
873 ret->quant = atom->quant;
874 ret->min = atom->min;
875 ret->max = atom->max;
876 if (atom->nbRanges > 0) {
877 int i;
879 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
880 atom->nbRanges);
881 if (ret->ranges == NULL) {
882 xmlRegexpErrMemory(ctxt, "copying atom");
883 goto error;
885 for (i = 0;i < atom->nbRanges;i++) {
886 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
887 if (ret->ranges[i] == NULL)
888 goto error;
889 ret->nbRanges = i + 1;
892 return(ret);
894 error:
895 xmlRegFreeAtom(ret);
896 return(NULL);
899 static xmlRegStatePtr
900 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
901 xmlRegStatePtr ret;
903 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
904 if (ret == NULL) {
905 xmlRegexpErrMemory(ctxt, "allocating state");
906 return(NULL);
908 memset(ret, 0, sizeof(xmlRegState));
909 ret->type = XML_REGEXP_TRANS_STATE;
910 ret->mark = XML_REGEXP_MARK_NORMAL;
911 return(ret);
915 * xmlRegFreeState:
916 * @state: the regexp state
918 * Free a regexp state
920 static void
921 xmlRegFreeState(xmlRegStatePtr state) {
922 if (state == NULL)
923 return;
925 if (state->trans != NULL)
926 xmlFree(state->trans);
927 if (state->transTo != NULL)
928 xmlFree(state->transTo);
929 xmlFree(state);
933 * xmlRegFreeParserCtxt:
934 * @ctxt: the regexp parser context
936 * Free a regexp parser context
938 static void
939 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
940 int i;
941 if (ctxt == NULL)
942 return;
944 if (ctxt->string != NULL)
945 xmlFree(ctxt->string);
946 if (ctxt->states != NULL) {
947 for (i = 0;i < ctxt->nbStates;i++)
948 xmlRegFreeState(ctxt->states[i]);
949 xmlFree(ctxt->states);
951 if (ctxt->atoms != NULL) {
952 for (i = 0;i < ctxt->nbAtoms;i++)
953 xmlRegFreeAtom(ctxt->atoms[i]);
954 xmlFree(ctxt->atoms);
956 if (ctxt->counters != NULL)
957 xmlFree(ctxt->counters);
958 xmlFree(ctxt);
961 /************************************************************************
963 * Display of Data structures *
965 ************************************************************************/
967 static void
968 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
969 switch (type) {
970 case XML_REGEXP_EPSILON:
971 fprintf(output, "epsilon "); break;
972 case XML_REGEXP_CHARVAL:
973 fprintf(output, "charval "); break;
974 case XML_REGEXP_RANGES:
975 fprintf(output, "ranges "); break;
976 case XML_REGEXP_SUBREG:
977 fprintf(output, "subexpr "); break;
978 case XML_REGEXP_STRING:
979 fprintf(output, "string "); break;
980 case XML_REGEXP_ANYCHAR:
981 fprintf(output, "anychar "); break;
982 case XML_REGEXP_ANYSPACE:
983 fprintf(output, "anyspace "); break;
984 case XML_REGEXP_NOTSPACE:
985 fprintf(output, "notspace "); break;
986 case XML_REGEXP_INITNAME:
987 fprintf(output, "initname "); break;
988 case XML_REGEXP_NOTINITNAME:
989 fprintf(output, "notinitname "); break;
990 case XML_REGEXP_NAMECHAR:
991 fprintf(output, "namechar "); break;
992 case XML_REGEXP_NOTNAMECHAR:
993 fprintf(output, "notnamechar "); break;
994 case XML_REGEXP_DECIMAL:
995 fprintf(output, "decimal "); break;
996 case XML_REGEXP_NOTDECIMAL:
997 fprintf(output, "notdecimal "); break;
998 case XML_REGEXP_REALCHAR:
999 fprintf(output, "realchar "); break;
1000 case XML_REGEXP_NOTREALCHAR:
1001 fprintf(output, "notrealchar "); break;
1002 case XML_REGEXP_LETTER:
1003 fprintf(output, "LETTER "); break;
1004 case XML_REGEXP_LETTER_UPPERCASE:
1005 fprintf(output, "LETTER_UPPERCASE "); break;
1006 case XML_REGEXP_LETTER_LOWERCASE:
1007 fprintf(output, "LETTER_LOWERCASE "); break;
1008 case XML_REGEXP_LETTER_TITLECASE:
1009 fprintf(output, "LETTER_TITLECASE "); break;
1010 case XML_REGEXP_LETTER_MODIFIER:
1011 fprintf(output, "LETTER_MODIFIER "); break;
1012 case XML_REGEXP_LETTER_OTHERS:
1013 fprintf(output, "LETTER_OTHERS "); break;
1014 case XML_REGEXP_MARK:
1015 fprintf(output, "MARK "); break;
1016 case XML_REGEXP_MARK_NONSPACING:
1017 fprintf(output, "MARK_NONSPACING "); break;
1018 case XML_REGEXP_MARK_SPACECOMBINING:
1019 fprintf(output, "MARK_SPACECOMBINING "); break;
1020 case XML_REGEXP_MARK_ENCLOSING:
1021 fprintf(output, "MARK_ENCLOSING "); break;
1022 case XML_REGEXP_NUMBER:
1023 fprintf(output, "NUMBER "); break;
1024 case XML_REGEXP_NUMBER_DECIMAL:
1025 fprintf(output, "NUMBER_DECIMAL "); break;
1026 case XML_REGEXP_NUMBER_LETTER:
1027 fprintf(output, "NUMBER_LETTER "); break;
1028 case XML_REGEXP_NUMBER_OTHERS:
1029 fprintf(output, "NUMBER_OTHERS "); break;
1030 case XML_REGEXP_PUNCT:
1031 fprintf(output, "PUNCT "); break;
1032 case XML_REGEXP_PUNCT_CONNECTOR:
1033 fprintf(output, "PUNCT_CONNECTOR "); break;
1034 case XML_REGEXP_PUNCT_DASH:
1035 fprintf(output, "PUNCT_DASH "); break;
1036 case XML_REGEXP_PUNCT_OPEN:
1037 fprintf(output, "PUNCT_OPEN "); break;
1038 case XML_REGEXP_PUNCT_CLOSE:
1039 fprintf(output, "PUNCT_CLOSE "); break;
1040 case XML_REGEXP_PUNCT_INITQUOTE:
1041 fprintf(output, "PUNCT_INITQUOTE "); break;
1042 case XML_REGEXP_PUNCT_FINQUOTE:
1043 fprintf(output, "PUNCT_FINQUOTE "); break;
1044 case XML_REGEXP_PUNCT_OTHERS:
1045 fprintf(output, "PUNCT_OTHERS "); break;
1046 case XML_REGEXP_SEPAR:
1047 fprintf(output, "SEPAR "); break;
1048 case XML_REGEXP_SEPAR_SPACE:
1049 fprintf(output, "SEPAR_SPACE "); break;
1050 case XML_REGEXP_SEPAR_LINE:
1051 fprintf(output, "SEPAR_LINE "); break;
1052 case XML_REGEXP_SEPAR_PARA:
1053 fprintf(output, "SEPAR_PARA "); break;
1054 case XML_REGEXP_SYMBOL:
1055 fprintf(output, "SYMBOL "); break;
1056 case XML_REGEXP_SYMBOL_MATH:
1057 fprintf(output, "SYMBOL_MATH "); break;
1058 case XML_REGEXP_SYMBOL_CURRENCY:
1059 fprintf(output, "SYMBOL_CURRENCY "); break;
1060 case XML_REGEXP_SYMBOL_MODIFIER:
1061 fprintf(output, "SYMBOL_MODIFIER "); break;
1062 case XML_REGEXP_SYMBOL_OTHERS:
1063 fprintf(output, "SYMBOL_OTHERS "); break;
1064 case XML_REGEXP_OTHER:
1065 fprintf(output, "OTHER "); break;
1066 case XML_REGEXP_OTHER_CONTROL:
1067 fprintf(output, "OTHER_CONTROL "); break;
1068 case XML_REGEXP_OTHER_FORMAT:
1069 fprintf(output, "OTHER_FORMAT "); break;
1070 case XML_REGEXP_OTHER_PRIVATE:
1071 fprintf(output, "OTHER_PRIVATE "); break;
1072 case XML_REGEXP_OTHER_NA:
1073 fprintf(output, "OTHER_NA "); break;
1074 case XML_REGEXP_BLOCK_NAME:
1075 fprintf(output, "BLOCK "); break;
1079 static void
1080 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1081 switch (type) {
1082 case XML_REGEXP_QUANT_EPSILON:
1083 fprintf(output, "epsilon "); break;
1084 case XML_REGEXP_QUANT_ONCE:
1085 fprintf(output, "once "); break;
1086 case XML_REGEXP_QUANT_OPT:
1087 fprintf(output, "? "); break;
1088 case XML_REGEXP_QUANT_MULT:
1089 fprintf(output, "* "); break;
1090 case XML_REGEXP_QUANT_PLUS:
1091 fprintf(output, "+ "); break;
1092 case XML_REGEXP_QUANT_RANGE:
1093 fprintf(output, "range "); break;
1094 case XML_REGEXP_QUANT_ONCEONLY:
1095 fprintf(output, "onceonly "); break;
1096 case XML_REGEXP_QUANT_ALL:
1097 fprintf(output, "all "); break;
1100 static void
1101 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1102 fprintf(output, " range: ");
1103 if (range->neg)
1104 fprintf(output, "negative ");
1105 xmlRegPrintAtomType(output, range->type);
1106 fprintf(output, "%c - %c\n", range->start, range->end);
1109 static void
1110 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1111 fprintf(output, " atom: ");
1112 if (atom == NULL) {
1113 fprintf(output, "NULL\n");
1114 return;
1116 if (atom->neg)
1117 fprintf(output, "not ");
1118 xmlRegPrintAtomType(output, atom->type);
1119 xmlRegPrintQuantType(output, atom->quant);
1120 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1121 fprintf(output, "%d-%d ", atom->min, atom->max);
1122 if (atom->type == XML_REGEXP_STRING)
1123 fprintf(output, "'%s' ", (char *) atom->valuep);
1124 if (atom->type == XML_REGEXP_CHARVAL)
1125 fprintf(output, "char %c\n", atom->codepoint);
1126 else if (atom->type == XML_REGEXP_RANGES) {
1127 int i;
1128 fprintf(output, "%d entries\n", atom->nbRanges);
1129 for (i = 0; i < atom->nbRanges;i++)
1130 xmlRegPrintRange(output, atom->ranges[i]);
1131 } else if (atom->type == XML_REGEXP_SUBREG) {
1132 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1133 } else {
1134 fprintf(output, "\n");
1138 static void
1139 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1140 fprintf(output, " trans: ");
1141 if (trans == NULL) {
1142 fprintf(output, "NULL\n");
1143 return;
1145 if (trans->to < 0) {
1146 fprintf(output, "removed\n");
1147 return;
1149 if (trans->nd != 0) {
1150 if (trans->nd == 2)
1151 fprintf(output, "last not determinist, ");
1152 else
1153 fprintf(output, "not determinist, ");
1155 if (trans->counter >= 0) {
1156 fprintf(output, "counted %d, ", trans->counter);
1158 if (trans->count == REGEXP_ALL_COUNTER) {
1159 fprintf(output, "all transition, ");
1160 } else if (trans->count >= 0) {
1161 fprintf(output, "count based %d, ", trans->count);
1163 if (trans->atom == NULL) {
1164 fprintf(output, "epsilon to %d\n", trans->to);
1165 return;
1167 if (trans->atom->type == XML_REGEXP_CHARVAL)
1168 fprintf(output, "char %c ", trans->atom->codepoint);
1169 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1172 static void
1173 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1174 int i;
1176 fprintf(output, " state: ");
1177 if (state == NULL) {
1178 fprintf(output, "NULL\n");
1179 return;
1181 if (state->type == XML_REGEXP_START_STATE)
1182 fprintf(output, "START ");
1183 if (state->type == XML_REGEXP_FINAL_STATE)
1184 fprintf(output, "FINAL ");
1186 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1187 for (i = 0;i < state->nbTrans; i++) {
1188 xmlRegPrintTrans(output, &(state->trans[i]));
1192 #ifdef DEBUG_REGEXP_GRAPH
1193 static void
1194 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1195 int i;
1197 fprintf(output, " ctxt: ");
1198 if (ctxt == NULL) {
1199 fprintf(output, "NULL\n");
1200 return;
1202 fprintf(output, "'%s' ", ctxt->string);
1203 if (ctxt->error)
1204 fprintf(output, "error ");
1205 if (ctxt->neg)
1206 fprintf(output, "neg ");
1207 fprintf(output, "\n");
1208 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1209 for (i = 0;i < ctxt->nbAtoms; i++) {
1210 fprintf(output, " %02d ", i);
1211 xmlRegPrintAtom(output, ctxt->atoms[i]);
1213 if (ctxt->atom != NULL) {
1214 fprintf(output, "current atom:\n");
1215 xmlRegPrintAtom(output, ctxt->atom);
1217 fprintf(output, "%d states:", ctxt->nbStates);
1218 if (ctxt->start != NULL)
1219 fprintf(output, " start: %d", ctxt->start->no);
1220 if (ctxt->end != NULL)
1221 fprintf(output, " end: %d", ctxt->end->no);
1222 fprintf(output, "\n");
1223 for (i = 0;i < ctxt->nbStates; i++) {
1224 xmlRegPrintState(output, ctxt->states[i]);
1226 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1227 for (i = 0;i < ctxt->nbCounters; i++) {
1228 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1229 ctxt->counters[i].max);
1232 #endif
1234 /************************************************************************
1236 * Finite Automata structures manipulations *
1238 ************************************************************************/
1240 static void
1241 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1242 int neg, xmlRegAtomType type, int start, int end,
1243 xmlChar *blockName) {
1244 xmlRegRangePtr range;
1246 if (atom == NULL) {
1247 ERROR("add range: atom is NULL");
1248 return;
1250 if (atom->type != XML_REGEXP_RANGES) {
1251 ERROR("add range: atom is not ranges");
1252 return;
1254 if (atom->maxRanges == 0) {
1255 atom->maxRanges = 4;
1256 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1257 sizeof(xmlRegRangePtr));
1258 if (atom->ranges == NULL) {
1259 xmlRegexpErrMemory(ctxt, "adding ranges");
1260 atom->maxRanges = 0;
1261 return;
1263 } else if (atom->nbRanges >= atom->maxRanges) {
1264 xmlRegRangePtr *tmp;
1265 atom->maxRanges *= 2;
1266 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1267 sizeof(xmlRegRangePtr));
1268 if (tmp == NULL) {
1269 xmlRegexpErrMemory(ctxt, "adding ranges");
1270 atom->maxRanges /= 2;
1271 return;
1273 atom->ranges = tmp;
1275 range = xmlRegNewRange(ctxt, neg, type, start, end);
1276 if (range == NULL)
1277 return;
1278 range->blockName = blockName;
1279 atom->ranges[atom->nbRanges++] = range;
1283 static int
1284 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1285 if (ctxt->maxCounters == 0) {
1286 ctxt->maxCounters = 4;
1287 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1288 sizeof(xmlRegCounter));
1289 if (ctxt->counters == NULL) {
1290 xmlRegexpErrMemory(ctxt, "allocating counter");
1291 ctxt->maxCounters = 0;
1292 return(-1);
1294 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1295 xmlRegCounter *tmp;
1296 ctxt->maxCounters *= 2;
1297 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1298 sizeof(xmlRegCounter));
1299 if (tmp == NULL) {
1300 xmlRegexpErrMemory(ctxt, "allocating counter");
1301 ctxt->maxCounters /= 2;
1302 return(-1);
1304 ctxt->counters = tmp;
1306 ctxt->counters[ctxt->nbCounters].min = -1;
1307 ctxt->counters[ctxt->nbCounters].max = -1;
1308 return(ctxt->nbCounters++);
1311 static int
1312 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1313 if (atom == NULL) {
1314 ERROR("atom push: atom is NULL");
1315 return(-1);
1317 if (ctxt->maxAtoms == 0) {
1318 ctxt->maxAtoms = 4;
1319 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1320 sizeof(xmlRegAtomPtr));
1321 if (ctxt->atoms == NULL) {
1322 xmlRegexpErrMemory(ctxt, "pushing atom");
1323 ctxt->maxAtoms = 0;
1324 return(-1);
1326 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1327 xmlRegAtomPtr *tmp;
1328 ctxt->maxAtoms *= 2;
1329 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1330 sizeof(xmlRegAtomPtr));
1331 if (tmp == NULL) {
1332 xmlRegexpErrMemory(ctxt, "allocating counter");
1333 ctxt->maxAtoms /= 2;
1334 return(-1);
1336 ctxt->atoms = tmp;
1338 atom->no = ctxt->nbAtoms;
1339 ctxt->atoms[ctxt->nbAtoms++] = atom;
1340 return(0);
1343 static void
1344 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1345 int from) {
1346 if (target->maxTransTo == 0) {
1347 target->maxTransTo = 8;
1348 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1349 sizeof(int));
1350 if (target->transTo == NULL) {
1351 xmlRegexpErrMemory(ctxt, "adding transition");
1352 target->maxTransTo = 0;
1353 return;
1355 } else if (target->nbTransTo >= target->maxTransTo) {
1356 int *tmp;
1357 target->maxTransTo *= 2;
1358 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1359 sizeof(int));
1360 if (tmp == NULL) {
1361 xmlRegexpErrMemory(ctxt, "adding transition");
1362 target->maxTransTo /= 2;
1363 return;
1365 target->transTo = tmp;
1367 target->transTo[target->nbTransTo] = from;
1368 target->nbTransTo++;
1371 static void
1372 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1373 xmlRegAtomPtr atom, xmlRegStatePtr target,
1374 int counter, int count) {
1376 int nrtrans;
1378 if (state == NULL) {
1379 ERROR("add state: state is NULL");
1380 return;
1382 if (target == NULL) {
1383 ERROR("add state: target is NULL");
1384 return;
1387 * Other routines follow the philosophy 'When in doubt, add a transition'
1388 * so we check here whether such a transition is already present and, if
1389 * so, silently ignore this request.
1392 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1393 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1394 if ((trans->atom == atom) &&
1395 (trans->to == target->no) &&
1396 (trans->counter == counter) &&
1397 (trans->count == count)) {
1398 #ifdef DEBUG_REGEXP_GRAPH
1399 printf("Ignoring duplicate transition from %d to %d\n",
1400 state->no, target->no);
1401 #endif
1402 return;
1406 if (state->maxTrans == 0) {
1407 state->maxTrans = 8;
1408 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1409 sizeof(xmlRegTrans));
1410 if (state->trans == NULL) {
1411 xmlRegexpErrMemory(ctxt, "adding transition");
1412 state->maxTrans = 0;
1413 return;
1415 } else if (state->nbTrans >= state->maxTrans) {
1416 xmlRegTrans *tmp;
1417 state->maxTrans *= 2;
1418 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1419 sizeof(xmlRegTrans));
1420 if (tmp == NULL) {
1421 xmlRegexpErrMemory(ctxt, "adding transition");
1422 state->maxTrans /= 2;
1423 return;
1425 state->trans = tmp;
1427 #ifdef DEBUG_REGEXP_GRAPH
1428 printf("Add trans from %d to %d ", state->no, target->no);
1429 if (count == REGEXP_ALL_COUNTER)
1430 printf("all transition\n");
1431 else if (count >= 0)
1432 printf("count based %d\n", count);
1433 else if (counter >= 0)
1434 printf("counted %d\n", counter);
1435 else if (atom == NULL)
1436 printf("epsilon transition\n");
1437 else if (atom != NULL)
1438 xmlRegPrintAtom(stdout, atom);
1439 #endif
1441 state->trans[state->nbTrans].atom = atom;
1442 state->trans[state->nbTrans].to = target->no;
1443 state->trans[state->nbTrans].counter = counter;
1444 state->trans[state->nbTrans].count = count;
1445 state->trans[state->nbTrans].nd = 0;
1446 state->nbTrans++;
1447 xmlRegStateAddTransTo(ctxt, target, state->no);
1450 static int
1451 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1452 if (state == NULL) return(-1);
1453 if (ctxt->maxStates == 0) {
1454 ctxt->maxStates = 4;
1455 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1456 sizeof(xmlRegStatePtr));
1457 if (ctxt->states == NULL) {
1458 xmlRegexpErrMemory(ctxt, "adding state");
1459 ctxt->maxStates = 0;
1460 return(-1);
1462 } else if (ctxt->nbStates >= ctxt->maxStates) {
1463 xmlRegStatePtr *tmp;
1464 ctxt->maxStates *= 2;
1465 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1466 sizeof(xmlRegStatePtr));
1467 if (tmp == NULL) {
1468 xmlRegexpErrMemory(ctxt, "adding state");
1469 ctxt->maxStates /= 2;
1470 return(-1);
1472 ctxt->states = tmp;
1474 state->no = ctxt->nbStates;
1475 ctxt->states[ctxt->nbStates++] = state;
1476 return(0);
1480 * xmlFAGenerateAllTransition:
1481 * @ctxt: a regexp parser context
1482 * @from: the from state
1483 * @to: the target state or NULL for building a new one
1484 * @lax:
1487 static void
1488 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1489 xmlRegStatePtr from, xmlRegStatePtr to,
1490 int lax) {
1491 if (to == NULL) {
1492 to = xmlRegNewState(ctxt);
1493 xmlRegStatePush(ctxt, to);
1494 ctxt->state = to;
1496 if (lax)
1497 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1498 else
1499 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1503 * xmlFAGenerateEpsilonTransition:
1504 * @ctxt: a regexp parser context
1505 * @from: the from state
1506 * @to: the target state or NULL for building a new one
1509 static void
1510 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1511 xmlRegStatePtr from, xmlRegStatePtr to) {
1512 if (to == NULL) {
1513 to = xmlRegNewState(ctxt);
1514 xmlRegStatePush(ctxt, to);
1515 ctxt->state = to;
1517 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1521 * xmlFAGenerateCountedEpsilonTransition:
1522 * @ctxt: a regexp parser context
1523 * @from: the from state
1524 * @to: the target state or NULL for building a new one
1525 * counter: the counter for that transition
1528 static void
1529 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1530 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1531 if (to == NULL) {
1532 to = xmlRegNewState(ctxt);
1533 xmlRegStatePush(ctxt, to);
1534 ctxt->state = to;
1536 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1540 * xmlFAGenerateCountedTransition:
1541 * @ctxt: a regexp parser context
1542 * @from: the from state
1543 * @to: the target state or NULL for building a new one
1544 * counter: the counter for that transition
1547 static void
1548 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1549 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1550 if (to == NULL) {
1551 to = xmlRegNewState(ctxt);
1552 xmlRegStatePush(ctxt, to);
1553 ctxt->state = to;
1555 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1559 * xmlFAGenerateTransitions:
1560 * @ctxt: a regexp parser context
1561 * @from: the from state
1562 * @to: the target state or NULL for building a new one
1563 * @atom: the atom generating the transition
1565 * Returns 0 if success and -1 in case of error.
1567 static int
1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1569 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1570 xmlRegStatePtr end;
1571 int nullable = 0;
1573 if (atom == NULL) {
1574 ERROR("generate transition: atom == NULL");
1575 return(-1);
1577 if (atom->type == XML_REGEXP_SUBREG) {
1579 * this is a subexpression handling one should not need to
1580 * create a new node except for XML_REGEXP_QUANT_RANGE.
1582 if (xmlRegAtomPush(ctxt, atom) < 0) {
1583 return(-1);
1585 if ((to != NULL) && (atom->stop != to) &&
1586 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1588 * Generate an epsilon transition to link to the target
1590 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1591 #ifdef DV
1592 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1593 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1594 to = xmlRegNewState(ctxt);
1595 xmlRegStatePush(ctxt, to);
1596 ctxt->state = to;
1597 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1598 #endif
1600 switch (atom->quant) {
1601 case XML_REGEXP_QUANT_OPT:
1602 atom->quant = XML_REGEXP_QUANT_ONCE;
1604 * transition done to the state after end of atom.
1605 * 1. set transition from atom start to new state
1606 * 2. set transition from atom end to this state.
1608 if (to == NULL) {
1609 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1610 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1611 ctxt->state);
1612 } else {
1613 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1615 break;
1616 case XML_REGEXP_QUANT_MULT:
1617 atom->quant = XML_REGEXP_QUANT_ONCE;
1618 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1619 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1620 break;
1621 case XML_REGEXP_QUANT_PLUS:
1622 atom->quant = XML_REGEXP_QUANT_ONCE;
1623 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1624 break;
1625 case XML_REGEXP_QUANT_RANGE: {
1626 int counter;
1627 xmlRegStatePtr inter, newstate;
1630 * create the final state now if needed
1632 if (to != NULL) {
1633 newstate = to;
1634 } else {
1635 newstate = xmlRegNewState(ctxt);
1636 xmlRegStatePush(ctxt, newstate);
1640 * The principle here is to use counted transition
1641 * to avoid explosion in the number of states in the
1642 * graph. This is clearly more complex but should not
1643 * be exploitable at runtime.
1645 if ((atom->min == 0) && (atom->start0 == NULL)) {
1646 xmlRegAtomPtr copy;
1648 * duplicate a transition based on atom to count next
1649 * occurrences after 1. We cannot loop to atom->start
1650 * directly because we need an epsilon transition to
1651 * newstate.
1653 /* ???? For some reason it seems we never reach that
1654 case, I suppose this got optimized out before when
1655 building the automata */
1656 copy = xmlRegCopyAtom(ctxt, atom);
1657 if (copy == NULL)
1658 return(-1);
1659 copy->quant = XML_REGEXP_QUANT_ONCE;
1660 copy->min = 0;
1661 copy->max = 0;
1663 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1664 < 0)
1665 return(-1);
1666 inter = ctxt->state;
1667 counter = xmlRegGetCounter(ctxt);
1668 ctxt->counters[counter].min = atom->min - 1;
1669 ctxt->counters[counter].max = atom->max - 1;
1670 /* count the number of times we see it again */
1671 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1672 atom->stop, counter);
1673 /* allow a way out based on the count */
1674 xmlFAGenerateCountedTransition(ctxt, inter,
1675 newstate, counter);
1676 /* and also allow a direct exit for 0 */
1677 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1678 newstate);
1679 } else {
1681 * either we need the atom at least once or there
1682 * is an atom->start0 allowing to easily plug the
1683 * epsilon transition.
1685 counter = xmlRegGetCounter(ctxt);
1686 ctxt->counters[counter].min = atom->min - 1;
1687 ctxt->counters[counter].max = atom->max - 1;
1688 /* allow a way out based on the count */
1689 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1690 newstate, counter);
1691 /* count the number of times we see it again */
1692 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1693 atom->start, counter);
1694 /* and if needed allow a direct exit for 0 */
1695 if (atom->min == 0)
1696 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1697 newstate);
1700 atom->min = 0;
1701 atom->max = 0;
1702 atom->quant = XML_REGEXP_QUANT_ONCE;
1703 ctxt->state = newstate;
1705 default:
1706 break;
1708 return(0);
1710 if ((atom->min == 0) && (atom->max == 0) &&
1711 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1713 * we can discard the atom and generate an epsilon transition instead
1715 if (to == NULL) {
1716 to = xmlRegNewState(ctxt);
1717 if (to != NULL)
1718 xmlRegStatePush(ctxt, to);
1719 else {
1720 return(-1);
1723 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1724 ctxt->state = to;
1725 xmlRegFreeAtom(atom);
1726 return(0);
1728 if (to == NULL) {
1729 to = xmlRegNewState(ctxt);
1730 if (to != NULL)
1731 xmlRegStatePush(ctxt, to);
1732 else {
1733 return(-1);
1736 end = to;
1737 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1738 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1740 * Do not pollute the target state by adding transitions from
1741 * it as it is likely to be the shared target of multiple branches.
1742 * So isolate with an epsilon transition.
1744 xmlRegStatePtr tmp;
1746 tmp = xmlRegNewState(ctxt);
1747 if (tmp != NULL)
1748 xmlRegStatePush(ctxt, tmp);
1749 else {
1750 return(-1);
1752 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1753 to = tmp;
1755 if (xmlRegAtomPush(ctxt, atom) < 0) {
1756 return(-1);
1758 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1759 (atom->min == 0) && (atom->max > 0)) {
1760 nullable = 1;
1761 atom->min = 1;
1762 if (atom->max == 1)
1763 atom->quant = XML_REGEXP_QUANT_OPT;
1765 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1766 ctxt->state = end;
1767 switch (atom->quant) {
1768 case XML_REGEXP_QUANT_OPT:
1769 atom->quant = XML_REGEXP_QUANT_ONCE;
1770 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1771 break;
1772 case XML_REGEXP_QUANT_MULT:
1773 atom->quant = XML_REGEXP_QUANT_ONCE;
1774 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1775 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1776 break;
1777 case XML_REGEXP_QUANT_PLUS:
1778 atom->quant = XML_REGEXP_QUANT_ONCE;
1779 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1780 break;
1781 case XML_REGEXP_QUANT_RANGE:
1782 if (nullable)
1783 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1784 break;
1785 default:
1786 break;
1788 return(0);
1792 * xmlFAReduceEpsilonTransitions:
1793 * @ctxt: a regexp parser context
1794 * @fromnr: the from state
1795 * @tonr: the to state
1796 * @counter: should that transition be associated to a counted
1799 static void
1800 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1801 int tonr, int counter) {
1802 int transnr;
1803 xmlRegStatePtr from;
1804 xmlRegStatePtr to;
1806 #ifdef DEBUG_REGEXP_GRAPH
1807 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1808 #endif
1809 from = ctxt->states[fromnr];
1810 if (from == NULL)
1811 return;
1812 to = ctxt->states[tonr];
1813 if (to == NULL)
1814 return;
1815 if ((to->mark == XML_REGEXP_MARK_START) ||
1816 (to->mark == XML_REGEXP_MARK_VISITED))
1817 return;
1819 to->mark = XML_REGEXP_MARK_VISITED;
1820 if (to->type == XML_REGEXP_FINAL_STATE) {
1821 #ifdef DEBUG_REGEXP_GRAPH
1822 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1823 #endif
1824 from->type = XML_REGEXP_FINAL_STATE;
1826 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1827 if (to->trans[transnr].to < 0)
1828 continue;
1829 if (to->trans[transnr].atom == NULL) {
1831 * Don't remove counted transitions
1832 * Don't loop either
1834 if (to->trans[transnr].to != fromnr) {
1835 if (to->trans[transnr].count >= 0) {
1836 int newto = to->trans[transnr].to;
1838 xmlRegStateAddTrans(ctxt, from, NULL,
1839 ctxt->states[newto],
1840 -1, to->trans[transnr].count);
1841 } else {
1842 #ifdef DEBUG_REGEXP_GRAPH
1843 printf("Found epsilon trans %d from %d to %d\n",
1844 transnr, tonr, to->trans[transnr].to);
1845 #endif
1846 if (to->trans[transnr].counter >= 0) {
1847 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1848 to->trans[transnr].to,
1849 to->trans[transnr].counter);
1850 } else {
1851 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1852 to->trans[transnr].to,
1853 counter);
1857 } else {
1858 int newto = to->trans[transnr].to;
1860 if (to->trans[transnr].counter >= 0) {
1861 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1862 ctxt->states[newto],
1863 to->trans[transnr].counter, -1);
1864 } else {
1865 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1866 ctxt->states[newto], counter, -1);
1870 to->mark = XML_REGEXP_MARK_NORMAL;
1874 * xmlFAEliminateSimpleEpsilonTransitions:
1875 * @ctxt: a regexp parser context
1877 * Eliminating general epsilon transitions can get costly in the general
1878 * algorithm due to the large amount of generated new transitions and
1879 * associated comparisons. However for simple epsilon transition used just
1880 * to separate building blocks when generating the automata this can be
1881 * reduced to state elimination:
1882 * - if there exists an epsilon from X to Y
1883 * - if there is no other transition from X
1884 * then X and Y are semantically equivalent and X can be eliminated
1885 * If X is the start state then make Y the start state, else replace the
1886 * target of all transitions to X by transitions to Y.
1888 * If X is a final state, skip it.
1889 * Otherwise it would be necessary to manipulate counters for this case when
1890 * eliminating state 2:
1891 * State 1 has a transition with an atom to state 2.
1892 * State 2 is final and has an epsilon transition to state 1.
1894 static void
1895 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1896 int statenr, i, j, newto;
1897 xmlRegStatePtr state, tmp;
1899 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1900 state = ctxt->states[statenr];
1901 if (state == NULL)
1902 continue;
1903 if (state->nbTrans != 1)
1904 continue;
1905 if (state->type == XML_REGEXP_UNREACH_STATE ||
1906 state->type == XML_REGEXP_FINAL_STATE)
1907 continue;
1908 /* is the only transition out a basic transition */
1909 if ((state->trans[0].atom == NULL) &&
1910 (state->trans[0].to >= 0) &&
1911 (state->trans[0].to != statenr) &&
1912 (state->trans[0].counter < 0) &&
1913 (state->trans[0].count < 0)) {
1914 newto = state->trans[0].to;
1916 if (state->type == XML_REGEXP_START_STATE) {
1917 #ifdef DEBUG_REGEXP_GRAPH
1918 printf("Found simple epsilon trans from start %d to %d\n",
1919 statenr, newto);
1920 #endif
1921 } else {
1922 #ifdef DEBUG_REGEXP_GRAPH
1923 printf("Found simple epsilon trans from %d to %d\n",
1924 statenr, newto);
1925 #endif
1926 for (i = 0;i < state->nbTransTo;i++) {
1927 tmp = ctxt->states[state->transTo[i]];
1928 for (j = 0;j < tmp->nbTrans;j++) {
1929 if (tmp->trans[j].to == statenr) {
1930 #ifdef DEBUG_REGEXP_GRAPH
1931 printf("Changed transition %d on %d to go to %d\n",
1932 j, tmp->no, newto);
1933 #endif
1934 tmp->trans[j].to = -1;
1935 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1936 ctxt->states[newto],
1937 tmp->trans[j].counter,
1938 tmp->trans[j].count);
1942 if (state->type == XML_REGEXP_FINAL_STATE)
1943 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1944 /* eliminate the transition completely */
1945 state->nbTrans = 0;
1947 state->type = XML_REGEXP_UNREACH_STATE;
1955 * xmlFAEliminateEpsilonTransitions:
1956 * @ctxt: a regexp parser context
1959 static void
1960 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1961 int statenr, transnr;
1962 xmlRegStatePtr state;
1963 int has_epsilon;
1965 if (ctxt->states == NULL) return;
1968 * Eliminate simple epsilon transition and the associated unreachable
1969 * states.
1971 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1972 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1973 state = ctxt->states[statenr];
1974 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1975 #ifdef DEBUG_REGEXP_GRAPH
1976 printf("Removed unreachable state %d\n", statenr);
1977 #endif
1978 xmlRegFreeState(state);
1979 ctxt->states[statenr] = NULL;
1983 has_epsilon = 0;
1986 * Build the completed transitions bypassing the epsilons
1987 * Use a marking algorithm to avoid loops
1988 * Mark sink states too.
1989 * Process from the latest states backward to the start when
1990 * there is long cascading epsilon chains this minimize the
1991 * recursions and transition compares when adding the new ones
1993 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1994 state = ctxt->states[statenr];
1995 if (state == NULL)
1996 continue;
1997 if ((state->nbTrans == 0) &&
1998 (state->type != XML_REGEXP_FINAL_STATE)) {
1999 state->type = XML_REGEXP_SINK_STATE;
2001 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2002 if ((state->trans[transnr].atom == NULL) &&
2003 (state->trans[transnr].to >= 0)) {
2004 if (state->trans[transnr].to == statenr) {
2005 state->trans[transnr].to = -1;
2006 #ifdef DEBUG_REGEXP_GRAPH
2007 printf("Removed loopback epsilon trans %d on %d\n",
2008 transnr, statenr);
2009 #endif
2010 } else if (state->trans[transnr].count < 0) {
2011 int newto = state->trans[transnr].to;
2013 #ifdef DEBUG_REGEXP_GRAPH
2014 printf("Found epsilon trans %d from %d to %d\n",
2015 transnr, statenr, newto);
2016 #endif
2017 has_epsilon = 1;
2018 state->trans[transnr].to = -2;
2019 state->mark = XML_REGEXP_MARK_START;
2020 xmlFAReduceEpsilonTransitions(ctxt, statenr,
2021 newto, state->trans[transnr].counter);
2022 state->mark = XML_REGEXP_MARK_NORMAL;
2023 #ifdef DEBUG_REGEXP_GRAPH
2024 } else {
2025 printf("Found counted transition %d on %d\n",
2026 transnr, statenr);
2027 #endif
2033 * Eliminate the epsilon transitions
2035 if (has_epsilon) {
2036 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2037 state = ctxt->states[statenr];
2038 if (state == NULL)
2039 continue;
2040 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2041 xmlRegTransPtr trans = &(state->trans[transnr]);
2042 if ((trans->atom == NULL) &&
2043 (trans->count < 0) &&
2044 (trans->to >= 0)) {
2045 trans->to = -1;
2052 * Use this pass to detect unreachable states too
2054 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2055 state = ctxt->states[statenr];
2056 if (state != NULL)
2057 state->reached = XML_REGEXP_MARK_NORMAL;
2059 state = ctxt->states[0];
2060 if (state != NULL)
2061 state->reached = XML_REGEXP_MARK_START;
2062 while (state != NULL) {
2063 xmlRegStatePtr target = NULL;
2064 state->reached = XML_REGEXP_MARK_VISITED;
2066 * Mark all states reachable from the current reachable state
2068 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2069 if ((state->trans[transnr].to >= 0) &&
2070 ((state->trans[transnr].atom != NULL) ||
2071 (state->trans[transnr].count >= 0))) {
2072 int newto = state->trans[transnr].to;
2074 if (ctxt->states[newto] == NULL)
2075 continue;
2076 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2077 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2078 target = ctxt->states[newto];
2084 * find the next accessible state not explored
2086 if (target == NULL) {
2087 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2088 state = ctxt->states[statenr];
2089 if ((state != NULL) && (state->reached ==
2090 XML_REGEXP_MARK_START)) {
2091 target = state;
2092 break;
2096 state = target;
2098 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2099 state = ctxt->states[statenr];
2100 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2101 #ifdef DEBUG_REGEXP_GRAPH
2102 printf("Removed unreachable state %d\n", statenr);
2103 #endif
2104 xmlRegFreeState(state);
2105 ctxt->states[statenr] = NULL;
2111 static int
2112 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2113 int ret = 0;
2115 if ((range1->type == XML_REGEXP_RANGES) ||
2116 (range2->type == XML_REGEXP_RANGES) ||
2117 (range2->type == XML_REGEXP_SUBREG) ||
2118 (range1->type == XML_REGEXP_SUBREG) ||
2119 (range1->type == XML_REGEXP_STRING) ||
2120 (range2->type == XML_REGEXP_STRING))
2121 return(-1);
2123 /* put them in order */
2124 if (range1->type > range2->type) {
2125 xmlRegRangePtr tmp;
2127 tmp = range1;
2128 range1 = range2;
2129 range2 = tmp;
2131 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2132 (range2->type == XML_REGEXP_ANYCHAR)) {
2133 ret = 1;
2134 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2135 (range2->type == XML_REGEXP_EPSILON)) {
2136 return(0);
2137 } else if (range1->type == range2->type) {
2138 if (range1->type != XML_REGEXP_CHARVAL)
2139 ret = 1;
2140 else if ((range1->end < range2->start) ||
2141 (range2->end < range1->start))
2142 ret = 0;
2143 else
2144 ret = 1;
2145 } else if (range1->type == XML_REGEXP_CHARVAL) {
2146 int codepoint;
2147 int neg = 0;
2150 * just check all codepoints in the range for acceptance,
2151 * this is usually way cheaper since done only once at
2152 * compilation than testing over and over at runtime or
2153 * pushing too many states when evaluating.
2155 if (((range1->neg == 0) && (range2->neg != 0)) ||
2156 ((range1->neg != 0) && (range2->neg == 0)))
2157 neg = 1;
2159 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2160 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2161 0, range2->start, range2->end,
2162 range2->blockName);
2163 if (ret < 0)
2164 return(-1);
2165 if (((neg == 1) && (ret == 0)) ||
2166 ((neg == 0) && (ret == 1)))
2167 return(1);
2169 return(0);
2170 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2171 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2172 if (range1->type == range2->type) {
2173 ret = xmlStrEqual(range1->blockName, range2->blockName);
2174 } else {
2176 * comparing a block range with anything else is way
2177 * too costly, and maintaining the table is like too much
2178 * memory too, so let's force the automata to save state
2179 * here.
2181 return(1);
2183 } else if ((range1->type < XML_REGEXP_LETTER) ||
2184 (range2->type < XML_REGEXP_LETTER)) {
2185 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2186 (range2->type == XML_REGEXP_NOTSPACE))
2187 ret = 0;
2188 else if ((range1->type == XML_REGEXP_INITNAME) &&
2189 (range2->type == XML_REGEXP_NOTINITNAME))
2190 ret = 0;
2191 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2192 (range2->type == XML_REGEXP_NOTNAMECHAR))
2193 ret = 0;
2194 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2195 (range2->type == XML_REGEXP_NOTDECIMAL))
2196 ret = 0;
2197 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2198 (range2->type == XML_REGEXP_NOTREALCHAR))
2199 ret = 0;
2200 else {
2201 /* same thing to limit complexity */
2202 return(1);
2204 } else {
2205 ret = 0;
2206 /* range1->type < range2->type here */
2207 switch (range1->type) {
2208 case XML_REGEXP_LETTER:
2209 /* all disjoint except in the subgroups */
2210 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2211 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2212 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2213 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2214 (range2->type == XML_REGEXP_LETTER_OTHERS))
2215 ret = 1;
2216 break;
2217 case XML_REGEXP_MARK:
2218 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2219 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2220 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2221 ret = 1;
2222 break;
2223 case XML_REGEXP_NUMBER:
2224 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2225 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2226 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2227 ret = 1;
2228 break;
2229 case XML_REGEXP_PUNCT:
2230 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2231 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2232 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2233 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2234 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2235 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2236 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2237 ret = 1;
2238 break;
2239 case XML_REGEXP_SEPAR:
2240 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2241 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2242 (range2->type == XML_REGEXP_SEPAR_PARA))
2243 ret = 1;
2244 break;
2245 case XML_REGEXP_SYMBOL:
2246 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2247 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2248 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2249 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2250 ret = 1;
2251 break;
2252 case XML_REGEXP_OTHER:
2253 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2254 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2255 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2256 ret = 1;
2257 break;
2258 default:
2259 if ((range2->type >= XML_REGEXP_LETTER) &&
2260 (range2->type < XML_REGEXP_BLOCK_NAME))
2261 ret = 0;
2262 else {
2263 /* safety net ! */
2264 return(1);
2268 if (((range1->neg == 0) && (range2->neg != 0)) ||
2269 ((range1->neg != 0) && (range2->neg == 0)))
2270 ret = !ret;
2271 return(ret);
2275 * xmlFACompareAtomTypes:
2276 * @type1: an atom type
2277 * @type2: an atom type
2279 * Compares two atoms type to check whether they intersect in some ways,
2280 * this is used by xmlFACompareAtoms only
2282 * Returns 1 if they may intersect and 0 otherwise
2284 static int
2285 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2286 if ((type1 == XML_REGEXP_EPSILON) ||
2287 (type1 == XML_REGEXP_CHARVAL) ||
2288 (type1 == XML_REGEXP_RANGES) ||
2289 (type1 == XML_REGEXP_SUBREG) ||
2290 (type1 == XML_REGEXP_STRING) ||
2291 (type1 == XML_REGEXP_ANYCHAR))
2292 return(1);
2293 if ((type2 == XML_REGEXP_EPSILON) ||
2294 (type2 == XML_REGEXP_CHARVAL) ||
2295 (type2 == XML_REGEXP_RANGES) ||
2296 (type2 == XML_REGEXP_SUBREG) ||
2297 (type2 == XML_REGEXP_STRING) ||
2298 (type2 == XML_REGEXP_ANYCHAR))
2299 return(1);
2301 if (type1 == type2) return(1);
2303 /* simplify subsequent compares by making sure type1 < type2 */
2304 if (type1 > type2) {
2305 xmlRegAtomType tmp = type1;
2306 type1 = type2;
2307 type2 = tmp;
2309 switch (type1) {
2310 case XML_REGEXP_ANYSPACE: /* \s */
2311 /* can't be a letter, number, mark, punctuation, symbol */
2312 if ((type2 == XML_REGEXP_NOTSPACE) ||
2313 ((type2 >= XML_REGEXP_LETTER) &&
2314 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2315 ((type2 >= XML_REGEXP_NUMBER) &&
2316 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2317 ((type2 >= XML_REGEXP_MARK) &&
2318 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2319 ((type2 >= XML_REGEXP_PUNCT) &&
2320 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2321 ((type2 >= XML_REGEXP_SYMBOL) &&
2322 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2323 ) return(0);
2324 break;
2325 case XML_REGEXP_NOTSPACE: /* \S */
2326 break;
2327 case XML_REGEXP_INITNAME: /* \l */
2328 /* can't be a number, mark, separator, punctuation, symbol or other */
2329 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2330 ((type2 >= XML_REGEXP_NUMBER) &&
2331 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2332 ((type2 >= XML_REGEXP_MARK) &&
2333 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2334 ((type2 >= XML_REGEXP_SEPAR) &&
2335 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2336 ((type2 >= XML_REGEXP_PUNCT) &&
2337 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2338 ((type2 >= XML_REGEXP_SYMBOL) &&
2339 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2340 ((type2 >= XML_REGEXP_OTHER) &&
2341 (type2 <= XML_REGEXP_OTHER_NA))
2342 ) return(0);
2343 break;
2344 case XML_REGEXP_NOTINITNAME: /* \L */
2345 break;
2346 case XML_REGEXP_NAMECHAR: /* \c */
2347 /* can't be a mark, separator, punctuation, symbol or other */
2348 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2349 ((type2 >= XML_REGEXP_MARK) &&
2350 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2351 ((type2 >= XML_REGEXP_PUNCT) &&
2352 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2353 ((type2 >= XML_REGEXP_SEPAR) &&
2354 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2355 ((type2 >= XML_REGEXP_SYMBOL) &&
2356 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2357 ((type2 >= XML_REGEXP_OTHER) &&
2358 (type2 <= XML_REGEXP_OTHER_NA))
2359 ) return(0);
2360 break;
2361 case XML_REGEXP_NOTNAMECHAR: /* \C */
2362 break;
2363 case XML_REGEXP_DECIMAL: /* \d */
2364 /* can't be a letter, mark, separator, punctuation, symbol or other */
2365 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2366 (type2 == XML_REGEXP_REALCHAR) ||
2367 ((type2 >= XML_REGEXP_LETTER) &&
2368 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2369 ((type2 >= XML_REGEXP_MARK) &&
2370 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2371 ((type2 >= XML_REGEXP_PUNCT) &&
2372 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2373 ((type2 >= XML_REGEXP_SEPAR) &&
2374 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2375 ((type2 >= XML_REGEXP_SYMBOL) &&
2376 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2377 ((type2 >= XML_REGEXP_OTHER) &&
2378 (type2 <= XML_REGEXP_OTHER_NA))
2379 )return(0);
2380 break;
2381 case XML_REGEXP_NOTDECIMAL: /* \D */
2382 break;
2383 case XML_REGEXP_REALCHAR: /* \w */
2384 /* can't be a mark, separator, punctuation, symbol or other */
2385 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2386 ((type2 >= XML_REGEXP_MARK) &&
2387 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2388 ((type2 >= XML_REGEXP_PUNCT) &&
2389 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2390 ((type2 >= XML_REGEXP_SEPAR) &&
2391 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2392 ((type2 >= XML_REGEXP_SYMBOL) &&
2393 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2394 ((type2 >= XML_REGEXP_OTHER) &&
2395 (type2 <= XML_REGEXP_OTHER_NA))
2396 )return(0);
2397 break;
2398 case XML_REGEXP_NOTREALCHAR: /* \W */
2399 break;
2401 * at that point we know both type 1 and type2 are from
2402 * character categories are ordered and are different,
2403 * it becomes simple because this is a partition
2405 case XML_REGEXP_LETTER:
2406 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2407 return(1);
2408 return(0);
2409 case XML_REGEXP_LETTER_UPPERCASE:
2410 case XML_REGEXP_LETTER_LOWERCASE:
2411 case XML_REGEXP_LETTER_TITLECASE:
2412 case XML_REGEXP_LETTER_MODIFIER:
2413 case XML_REGEXP_LETTER_OTHERS:
2414 return(0);
2415 case XML_REGEXP_MARK:
2416 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2417 return(1);
2418 return(0);
2419 case XML_REGEXP_MARK_NONSPACING:
2420 case XML_REGEXP_MARK_SPACECOMBINING:
2421 case XML_REGEXP_MARK_ENCLOSING:
2422 return(0);
2423 case XML_REGEXP_NUMBER:
2424 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2425 return(1);
2426 return(0);
2427 case XML_REGEXP_NUMBER_DECIMAL:
2428 case XML_REGEXP_NUMBER_LETTER:
2429 case XML_REGEXP_NUMBER_OTHERS:
2430 return(0);
2431 case XML_REGEXP_PUNCT:
2432 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2433 return(1);
2434 return(0);
2435 case XML_REGEXP_PUNCT_CONNECTOR:
2436 case XML_REGEXP_PUNCT_DASH:
2437 case XML_REGEXP_PUNCT_OPEN:
2438 case XML_REGEXP_PUNCT_CLOSE:
2439 case XML_REGEXP_PUNCT_INITQUOTE:
2440 case XML_REGEXP_PUNCT_FINQUOTE:
2441 case XML_REGEXP_PUNCT_OTHERS:
2442 return(0);
2443 case XML_REGEXP_SEPAR:
2444 if (type2 <= XML_REGEXP_SEPAR_PARA)
2445 return(1);
2446 return(0);
2447 case XML_REGEXP_SEPAR_SPACE:
2448 case XML_REGEXP_SEPAR_LINE:
2449 case XML_REGEXP_SEPAR_PARA:
2450 return(0);
2451 case XML_REGEXP_SYMBOL:
2452 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2453 return(1);
2454 return(0);
2455 case XML_REGEXP_SYMBOL_MATH:
2456 case XML_REGEXP_SYMBOL_CURRENCY:
2457 case XML_REGEXP_SYMBOL_MODIFIER:
2458 case XML_REGEXP_SYMBOL_OTHERS:
2459 return(0);
2460 case XML_REGEXP_OTHER:
2461 if (type2 <= XML_REGEXP_OTHER_NA)
2462 return(1);
2463 return(0);
2464 case XML_REGEXP_OTHER_CONTROL:
2465 case XML_REGEXP_OTHER_FORMAT:
2466 case XML_REGEXP_OTHER_PRIVATE:
2467 case XML_REGEXP_OTHER_NA:
2468 return(0);
2469 default:
2470 break;
2472 return(1);
2476 * xmlFAEqualAtoms:
2477 * @atom1: an atom
2478 * @atom2: an atom
2479 * @deep: if not set only compare string pointers
2481 * Compares two atoms to check whether they are the same exactly
2482 * this is used to remove equivalent transitions
2484 * Returns 1 if same and 0 otherwise
2486 static int
2487 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2488 int ret = 0;
2490 if (atom1 == atom2)
2491 return(1);
2492 if ((atom1 == NULL) || (atom2 == NULL))
2493 return(0);
2495 if (atom1->type != atom2->type)
2496 return(0);
2497 switch (atom1->type) {
2498 case XML_REGEXP_EPSILON:
2499 ret = 0;
2500 break;
2501 case XML_REGEXP_STRING:
2502 if (!deep)
2503 ret = (atom1->valuep == atom2->valuep);
2504 else
2505 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2506 (xmlChar *)atom2->valuep);
2507 break;
2508 case XML_REGEXP_CHARVAL:
2509 ret = (atom1->codepoint == atom2->codepoint);
2510 break;
2511 case XML_REGEXP_RANGES:
2512 /* too hard to do in the general case */
2513 ret = 0;
2514 default:
2515 break;
2517 return(ret);
2521 * xmlFACompareAtoms:
2522 * @atom1: an atom
2523 * @atom2: an atom
2524 * @deep: if not set only compare string pointers
2526 * Compares two atoms to check whether they intersect in some ways,
2527 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2529 * Returns 1 if yes and 0 otherwise
2531 static int
2532 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2533 int ret = 1;
2535 if (atom1 == atom2)
2536 return(1);
2537 if ((atom1 == NULL) || (atom2 == NULL))
2538 return(0);
2540 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2541 (atom2->type == XML_REGEXP_ANYCHAR))
2542 return(1);
2544 if (atom1->type > atom2->type) {
2545 xmlRegAtomPtr tmp;
2546 tmp = atom1;
2547 atom1 = atom2;
2548 atom2 = tmp;
2550 if (atom1->type != atom2->type) {
2551 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2552 /* if they can't intersect at the type level break now */
2553 if (ret == 0)
2554 return(0);
2556 switch (atom1->type) {
2557 case XML_REGEXP_STRING:
2558 if (!deep)
2559 ret = (atom1->valuep != atom2->valuep);
2560 else {
2561 xmlChar *val1 = (xmlChar *)atom1->valuep;
2562 xmlChar *val2 = (xmlChar *)atom2->valuep;
2563 int compound1 = (xmlStrchr(val1, '|') != NULL);
2564 int compound2 = (xmlStrchr(val2, '|') != NULL);
2566 /* Ignore negative match flag for ##other namespaces */
2567 if (compound1 != compound2)
2568 return(0);
2570 ret = xmlRegStrEqualWildcard(val1, val2);
2572 break;
2573 case XML_REGEXP_EPSILON:
2574 goto not_determinist;
2575 case XML_REGEXP_CHARVAL:
2576 if (atom2->type == XML_REGEXP_CHARVAL) {
2577 ret = (atom1->codepoint == atom2->codepoint);
2578 } else {
2579 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2580 if (ret < 0)
2581 ret = 1;
2583 break;
2584 case XML_REGEXP_RANGES:
2585 if (atom2->type == XML_REGEXP_RANGES) {
2586 int i, j, res;
2587 xmlRegRangePtr r1, r2;
2590 * need to check that none of the ranges eventually matches
2592 for (i = 0;i < atom1->nbRanges;i++) {
2593 for (j = 0;j < atom2->nbRanges;j++) {
2594 r1 = atom1->ranges[i];
2595 r2 = atom2->ranges[j];
2596 res = xmlFACompareRanges(r1, r2);
2597 if (res == 1) {
2598 ret = 1;
2599 goto done;
2603 ret = 0;
2605 break;
2606 default:
2607 goto not_determinist;
2609 done:
2610 if (atom1->neg != atom2->neg) {
2611 ret = !ret;
2613 if (ret == 0)
2614 return(0);
2615 not_determinist:
2616 return(1);
2620 * xmlFARecurseDeterminism:
2621 * @ctxt: a regexp parser context
2623 * Check whether the associated regexp is determinist,
2624 * should be called after xmlFAEliminateEpsilonTransitions()
2627 static int
2628 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2629 int to, xmlRegAtomPtr atom) {
2630 int ret = 1;
2631 int res;
2632 int transnr, nbTrans;
2633 xmlRegTransPtr t1;
2634 int deep = 1;
2636 if (state == NULL)
2637 return(ret);
2638 if (state->markd == XML_REGEXP_MARK_VISITED)
2639 return(ret);
2641 if (ctxt->flags & AM_AUTOMATA_RNG)
2642 deep = 0;
2645 * don't recurse on transitions potentially added in the course of
2646 * the elimination.
2648 nbTrans = state->nbTrans;
2649 for (transnr = 0;transnr < nbTrans;transnr++) {
2650 t1 = &(state->trans[transnr]);
2652 * check transitions conflicting with the one looked at
2654 if (t1->atom == NULL) {
2655 if (t1->to < 0)
2656 continue;
2657 state->markd = XML_REGEXP_MARK_VISITED;
2658 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2659 to, atom);
2660 if (res == 0) {
2661 ret = 0;
2662 /* t1->nd = 1; */
2664 continue;
2666 if (t1->to != to)
2667 continue;
2668 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2669 ret = 0;
2670 /* mark the transition as non-deterministic */
2671 t1->nd = 1;
2674 return(ret);
2678 * xmlFAFinishRecurseDeterminism:
2679 * @ctxt: a regexp parser context
2681 * Reset flags after checking determinism.
2683 static void
2684 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2685 int transnr, nbTrans;
2687 if (state == NULL)
2688 return;
2689 if (state->markd != XML_REGEXP_MARK_VISITED)
2690 return;
2691 state->markd = 0;
2693 nbTrans = state->nbTrans;
2694 for (transnr = 0; transnr < nbTrans; transnr++) {
2695 xmlRegTransPtr t1 = &state->trans[transnr];
2696 if ((t1->atom == NULL) && (t1->to >= 0))
2697 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2702 * xmlFAComputesDeterminism:
2703 * @ctxt: a regexp parser context
2705 * Check whether the associated regexp is determinist,
2706 * should be called after xmlFAEliminateEpsilonTransitions()
2709 static int
2710 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2711 int statenr, transnr;
2712 xmlRegStatePtr state;
2713 xmlRegTransPtr t1, t2, last;
2714 int i;
2715 int ret = 1;
2716 int deep = 1;
2718 #ifdef DEBUG_REGEXP_GRAPH
2719 printf("xmlFAComputesDeterminism\n");
2720 xmlRegPrintCtxt(stdout, ctxt);
2721 #endif
2722 if (ctxt->determinist != -1)
2723 return(ctxt->determinist);
2725 if (ctxt->flags & AM_AUTOMATA_RNG)
2726 deep = 0;
2729 * First cleanup the automata removing cancelled transitions
2731 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2732 state = ctxt->states[statenr];
2733 if (state == NULL)
2734 continue;
2735 if (state->nbTrans < 2)
2736 continue;
2737 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2738 t1 = &(state->trans[transnr]);
2740 * Determinism checks in case of counted or all transitions
2741 * will have to be handled separately
2743 if (t1->atom == NULL) {
2744 /* t1->nd = 1; */
2745 continue;
2747 if (t1->to == -1) /* eliminated */
2748 continue;
2749 for (i = 0;i < transnr;i++) {
2750 t2 = &(state->trans[i]);
2751 if (t2->to == -1) /* eliminated */
2752 continue;
2753 if (t2->atom != NULL) {
2754 if (t1->to == t2->to) {
2756 * Here we use deep because we want to keep the
2757 * transitions which indicate a conflict
2759 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2760 (t1->counter == t2->counter) &&
2761 (t1->count == t2->count))
2762 t2->to = -1; /* eliminated */
2770 * Check for all states that there aren't 2 transitions
2771 * with the same atom and a different target.
2773 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2774 state = ctxt->states[statenr];
2775 if (state == NULL)
2776 continue;
2777 if (state->nbTrans < 2)
2778 continue;
2779 last = NULL;
2780 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2781 t1 = &(state->trans[transnr]);
2783 * Determinism checks in case of counted or all transitions
2784 * will have to be handled separately
2786 if (t1->atom == NULL) {
2787 continue;
2789 if (t1->to == -1) /* eliminated */
2790 continue;
2791 for (i = 0;i < transnr;i++) {
2792 t2 = &(state->trans[i]);
2793 if (t2->to == -1) /* eliminated */
2794 continue;
2795 if (t2->atom != NULL) {
2797 * But here we don't use deep because we want to
2798 * find transitions which indicate a conflict
2800 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2801 ret = 0;
2802 /* mark the transitions as non-deterministic ones */
2803 t1->nd = 1;
2804 t2->nd = 1;
2805 last = t1;
2807 } else if (t1->to != -1) {
2809 * do the closure in case of remaining specific
2810 * epsilon transitions like choices or all
2812 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2813 t2->to, t2->atom);
2814 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2815 /* don't shortcut the computation so all non deterministic
2816 transition get marked down
2817 if (ret == 0)
2818 return(0);
2820 if (ret == 0) {
2821 t1->nd = 1;
2822 /* t2->nd = 1; */
2823 last = t1;
2827 /* don't shortcut the computation so all non deterministic
2828 transition get marked down
2829 if (ret == 0)
2830 break; */
2834 * mark specifically the last non-deterministic transition
2835 * from a state since there is no need to set-up rollback
2836 * from it
2838 if (last != NULL) {
2839 last->nd = 2;
2842 /* don't shortcut the computation so all non deterministic
2843 transition get marked down
2844 if (ret == 0)
2845 break; */
2848 ctxt->determinist = ret;
2849 return(ret);
2852 /************************************************************************
2854 * Routines to check input against transition atoms *
2856 ************************************************************************/
2858 static int
2859 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2860 int start, int end, const xmlChar *blockName) {
2861 int ret = 0;
2863 switch (type) {
2864 case XML_REGEXP_STRING:
2865 case XML_REGEXP_SUBREG:
2866 case XML_REGEXP_RANGES:
2867 case XML_REGEXP_EPSILON:
2868 return(-1);
2869 case XML_REGEXP_ANYCHAR:
2870 ret = ((codepoint != '\n') && (codepoint != '\r'));
2871 break;
2872 case XML_REGEXP_CHARVAL:
2873 ret = ((codepoint >= start) && (codepoint <= end));
2874 break;
2875 case XML_REGEXP_NOTSPACE:
2876 neg = !neg;
2877 /* Falls through. */
2878 case XML_REGEXP_ANYSPACE:
2879 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2880 (codepoint == '\t') || (codepoint == ' '));
2881 break;
2882 case XML_REGEXP_NOTINITNAME:
2883 neg = !neg;
2884 /* Falls through. */
2885 case XML_REGEXP_INITNAME:
2886 ret = (IS_LETTER(codepoint) ||
2887 (codepoint == '_') || (codepoint == ':'));
2888 break;
2889 case XML_REGEXP_NOTNAMECHAR:
2890 neg = !neg;
2891 /* Falls through. */
2892 case XML_REGEXP_NAMECHAR:
2893 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2894 (codepoint == '.') || (codepoint == '-') ||
2895 (codepoint == '_') || (codepoint == ':') ||
2896 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2897 break;
2898 case XML_REGEXP_NOTDECIMAL:
2899 neg = !neg;
2900 /* Falls through. */
2901 case XML_REGEXP_DECIMAL:
2902 ret = xmlUCSIsCatNd(codepoint);
2903 break;
2904 case XML_REGEXP_REALCHAR:
2905 neg = !neg;
2906 /* Falls through. */
2907 case XML_REGEXP_NOTREALCHAR:
2908 ret = xmlUCSIsCatP(codepoint);
2909 if (ret == 0)
2910 ret = xmlUCSIsCatZ(codepoint);
2911 if (ret == 0)
2912 ret = xmlUCSIsCatC(codepoint);
2913 break;
2914 case XML_REGEXP_LETTER:
2915 ret = xmlUCSIsCatL(codepoint);
2916 break;
2917 case XML_REGEXP_LETTER_UPPERCASE:
2918 ret = xmlUCSIsCatLu(codepoint);
2919 break;
2920 case XML_REGEXP_LETTER_LOWERCASE:
2921 ret = xmlUCSIsCatLl(codepoint);
2922 break;
2923 case XML_REGEXP_LETTER_TITLECASE:
2924 ret = xmlUCSIsCatLt(codepoint);
2925 break;
2926 case XML_REGEXP_LETTER_MODIFIER:
2927 ret = xmlUCSIsCatLm(codepoint);
2928 break;
2929 case XML_REGEXP_LETTER_OTHERS:
2930 ret = xmlUCSIsCatLo(codepoint);
2931 break;
2932 case XML_REGEXP_MARK:
2933 ret = xmlUCSIsCatM(codepoint);
2934 break;
2935 case XML_REGEXP_MARK_NONSPACING:
2936 ret = xmlUCSIsCatMn(codepoint);
2937 break;
2938 case XML_REGEXP_MARK_SPACECOMBINING:
2939 ret = xmlUCSIsCatMc(codepoint);
2940 break;
2941 case XML_REGEXP_MARK_ENCLOSING:
2942 ret = xmlUCSIsCatMe(codepoint);
2943 break;
2944 case XML_REGEXP_NUMBER:
2945 ret = xmlUCSIsCatN(codepoint);
2946 break;
2947 case XML_REGEXP_NUMBER_DECIMAL:
2948 ret = xmlUCSIsCatNd(codepoint);
2949 break;
2950 case XML_REGEXP_NUMBER_LETTER:
2951 ret = xmlUCSIsCatNl(codepoint);
2952 break;
2953 case XML_REGEXP_NUMBER_OTHERS:
2954 ret = xmlUCSIsCatNo(codepoint);
2955 break;
2956 case XML_REGEXP_PUNCT:
2957 ret = xmlUCSIsCatP(codepoint);
2958 break;
2959 case XML_REGEXP_PUNCT_CONNECTOR:
2960 ret = xmlUCSIsCatPc(codepoint);
2961 break;
2962 case XML_REGEXP_PUNCT_DASH:
2963 ret = xmlUCSIsCatPd(codepoint);
2964 break;
2965 case XML_REGEXP_PUNCT_OPEN:
2966 ret = xmlUCSIsCatPs(codepoint);
2967 break;
2968 case XML_REGEXP_PUNCT_CLOSE:
2969 ret = xmlUCSIsCatPe(codepoint);
2970 break;
2971 case XML_REGEXP_PUNCT_INITQUOTE:
2972 ret = xmlUCSIsCatPi(codepoint);
2973 break;
2974 case XML_REGEXP_PUNCT_FINQUOTE:
2975 ret = xmlUCSIsCatPf(codepoint);
2976 break;
2977 case XML_REGEXP_PUNCT_OTHERS:
2978 ret = xmlUCSIsCatPo(codepoint);
2979 break;
2980 case XML_REGEXP_SEPAR:
2981 ret = xmlUCSIsCatZ(codepoint);
2982 break;
2983 case XML_REGEXP_SEPAR_SPACE:
2984 ret = xmlUCSIsCatZs(codepoint);
2985 break;
2986 case XML_REGEXP_SEPAR_LINE:
2987 ret = xmlUCSIsCatZl(codepoint);
2988 break;
2989 case XML_REGEXP_SEPAR_PARA:
2990 ret = xmlUCSIsCatZp(codepoint);
2991 break;
2992 case XML_REGEXP_SYMBOL:
2993 ret = xmlUCSIsCatS(codepoint);
2994 break;
2995 case XML_REGEXP_SYMBOL_MATH:
2996 ret = xmlUCSIsCatSm(codepoint);
2997 break;
2998 case XML_REGEXP_SYMBOL_CURRENCY:
2999 ret = xmlUCSIsCatSc(codepoint);
3000 break;
3001 case XML_REGEXP_SYMBOL_MODIFIER:
3002 ret = xmlUCSIsCatSk(codepoint);
3003 break;
3004 case XML_REGEXP_SYMBOL_OTHERS:
3005 ret = xmlUCSIsCatSo(codepoint);
3006 break;
3007 case XML_REGEXP_OTHER:
3008 ret = xmlUCSIsCatC(codepoint);
3009 break;
3010 case XML_REGEXP_OTHER_CONTROL:
3011 ret = xmlUCSIsCatCc(codepoint);
3012 break;
3013 case XML_REGEXP_OTHER_FORMAT:
3014 ret = xmlUCSIsCatCf(codepoint);
3015 break;
3016 case XML_REGEXP_OTHER_PRIVATE:
3017 ret = xmlUCSIsCatCo(codepoint);
3018 break;
3019 case XML_REGEXP_OTHER_NA:
3020 /* ret = xmlUCSIsCatCn(codepoint); */
3021 /* Seems it doesn't exist anymore in recent Unicode releases */
3022 ret = 0;
3023 break;
3024 case XML_REGEXP_BLOCK_NAME:
3025 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3026 break;
3028 if (neg)
3029 return(!ret);
3030 return(ret);
3033 static int
3034 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3035 int i, ret = 0;
3036 xmlRegRangePtr range;
3038 if ((atom == NULL) || (!IS_CHAR(codepoint)))
3039 return(-1);
3041 switch (atom->type) {
3042 case XML_REGEXP_SUBREG:
3043 case XML_REGEXP_EPSILON:
3044 return(-1);
3045 case XML_REGEXP_CHARVAL:
3046 return(codepoint == atom->codepoint);
3047 case XML_REGEXP_RANGES: {
3048 int accept = 0;
3050 for (i = 0;i < atom->nbRanges;i++) {
3051 range = atom->ranges[i];
3052 if (range->neg == 2) {
3053 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3054 0, range->start, range->end,
3055 range->blockName);
3056 if (ret != 0)
3057 return(0); /* excluded char */
3058 } else if (range->neg) {
3059 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3060 0, range->start, range->end,
3061 range->blockName);
3062 if (ret == 0)
3063 accept = 1;
3064 else
3065 return(0);
3066 } else {
3067 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3068 0, range->start, range->end,
3069 range->blockName);
3070 if (ret != 0)
3071 accept = 1; /* might still be excluded */
3074 return(accept);
3076 case XML_REGEXP_STRING:
3077 printf("TODO: XML_REGEXP_STRING\n");
3078 return(-1);
3079 case XML_REGEXP_ANYCHAR:
3080 case XML_REGEXP_ANYSPACE:
3081 case XML_REGEXP_NOTSPACE:
3082 case XML_REGEXP_INITNAME:
3083 case XML_REGEXP_NOTINITNAME:
3084 case XML_REGEXP_NAMECHAR:
3085 case XML_REGEXP_NOTNAMECHAR:
3086 case XML_REGEXP_DECIMAL:
3087 case XML_REGEXP_NOTDECIMAL:
3088 case XML_REGEXP_REALCHAR:
3089 case XML_REGEXP_NOTREALCHAR:
3090 case XML_REGEXP_LETTER:
3091 case XML_REGEXP_LETTER_UPPERCASE:
3092 case XML_REGEXP_LETTER_LOWERCASE:
3093 case XML_REGEXP_LETTER_TITLECASE:
3094 case XML_REGEXP_LETTER_MODIFIER:
3095 case XML_REGEXP_LETTER_OTHERS:
3096 case XML_REGEXP_MARK:
3097 case XML_REGEXP_MARK_NONSPACING:
3098 case XML_REGEXP_MARK_SPACECOMBINING:
3099 case XML_REGEXP_MARK_ENCLOSING:
3100 case XML_REGEXP_NUMBER:
3101 case XML_REGEXP_NUMBER_DECIMAL:
3102 case XML_REGEXP_NUMBER_LETTER:
3103 case XML_REGEXP_NUMBER_OTHERS:
3104 case XML_REGEXP_PUNCT:
3105 case XML_REGEXP_PUNCT_CONNECTOR:
3106 case XML_REGEXP_PUNCT_DASH:
3107 case XML_REGEXP_PUNCT_OPEN:
3108 case XML_REGEXP_PUNCT_CLOSE:
3109 case XML_REGEXP_PUNCT_INITQUOTE:
3110 case XML_REGEXP_PUNCT_FINQUOTE:
3111 case XML_REGEXP_PUNCT_OTHERS:
3112 case XML_REGEXP_SEPAR:
3113 case XML_REGEXP_SEPAR_SPACE:
3114 case XML_REGEXP_SEPAR_LINE:
3115 case XML_REGEXP_SEPAR_PARA:
3116 case XML_REGEXP_SYMBOL:
3117 case XML_REGEXP_SYMBOL_MATH:
3118 case XML_REGEXP_SYMBOL_CURRENCY:
3119 case XML_REGEXP_SYMBOL_MODIFIER:
3120 case XML_REGEXP_SYMBOL_OTHERS:
3121 case XML_REGEXP_OTHER:
3122 case XML_REGEXP_OTHER_CONTROL:
3123 case XML_REGEXP_OTHER_FORMAT:
3124 case XML_REGEXP_OTHER_PRIVATE:
3125 case XML_REGEXP_OTHER_NA:
3126 case XML_REGEXP_BLOCK_NAME:
3127 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3128 (const xmlChar *)atom->valuep);
3129 if (atom->neg)
3130 ret = !ret;
3131 break;
3133 return(ret);
3136 /************************************************************************
3138 * Saving and restoring state of an execution context *
3140 ************************************************************************/
3142 #ifdef DEBUG_REGEXP_EXEC
3143 static void
3144 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3145 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3146 if (exec->inputStack != NULL) {
3147 int i;
3148 printf(": ");
3149 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3150 printf("%s ", (const char *)
3151 exec->inputStack[exec->inputStackNr - (i + 1)].value);
3152 } else {
3153 printf(": %s", &(exec->inputString[exec->index]));
3155 printf("\n");
3157 #endif
3159 static void
3160 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3161 #ifdef DEBUG_REGEXP_EXEC
3162 printf("saving ");
3163 exec->transno++;
3164 xmlFARegDebugExec(exec);
3165 exec->transno--;
3166 #endif
3167 #ifdef MAX_PUSH
3168 if (exec->nbPush > MAX_PUSH) {
3169 return;
3171 exec->nbPush++;
3172 #endif
3174 if (exec->maxRollbacks == 0) {
3175 exec->maxRollbacks = 4;
3176 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3177 sizeof(xmlRegExecRollback));
3178 if (exec->rollbacks == NULL) {
3179 xmlRegexpErrMemory(NULL, "saving regexp");
3180 exec->maxRollbacks = 0;
3181 return;
3183 memset(exec->rollbacks, 0,
3184 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3185 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3186 xmlRegExecRollback *tmp;
3187 int len = exec->maxRollbacks;
3189 exec->maxRollbacks *= 2;
3190 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3191 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3192 if (tmp == NULL) {
3193 xmlRegexpErrMemory(NULL, "saving regexp");
3194 exec->maxRollbacks /= 2;
3195 return;
3197 exec->rollbacks = tmp;
3198 tmp = &exec->rollbacks[len];
3199 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3201 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3202 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3203 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3204 if (exec->comp->nbCounters > 0) {
3205 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3206 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3207 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3208 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3209 xmlRegexpErrMemory(NULL, "saving regexp");
3210 exec->status = -5;
3211 return;
3214 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3215 exec->comp->nbCounters * sizeof(int));
3217 exec->nbRollbacks++;
3220 static void
3221 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3222 if (exec->nbRollbacks <= 0) {
3223 exec->status = -1;
3224 #ifdef DEBUG_REGEXP_EXEC
3225 printf("rollback failed on empty stack\n");
3226 #endif
3227 return;
3229 exec->nbRollbacks--;
3230 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3231 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3232 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3233 if (exec->comp->nbCounters > 0) {
3234 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3235 fprintf(stderr, "exec save: allocation failed");
3236 exec->status = -6;
3237 return;
3239 if (exec->counts) {
3240 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3241 exec->comp->nbCounters * sizeof(int));
3245 #ifdef DEBUG_REGEXP_EXEC
3246 printf("restored ");
3247 xmlFARegDebugExec(exec);
3248 #endif
3251 /************************************************************************
3253 * Verifier, running an input against a compiled regexp *
3255 ************************************************************************/
3257 static int
3258 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3259 xmlRegExecCtxt execval;
3260 xmlRegExecCtxtPtr exec = &execval;
3261 int ret, codepoint = 0, len, deter;
3263 exec->inputString = content;
3264 exec->index = 0;
3265 exec->nbPush = 0;
3266 exec->determinist = 1;
3267 exec->maxRollbacks = 0;
3268 exec->nbRollbacks = 0;
3269 exec->rollbacks = NULL;
3270 exec->status = 0;
3271 exec->comp = comp;
3272 exec->state = comp->states[0];
3273 exec->transno = 0;
3274 exec->transcount = 0;
3275 exec->inputStack = NULL;
3276 exec->inputStackMax = 0;
3277 if (comp->nbCounters > 0) {
3278 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3279 if (exec->counts == NULL) {
3280 xmlRegexpErrMemory(NULL, "running regexp");
3281 return(-1);
3283 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3284 } else
3285 exec->counts = NULL;
3286 while ((exec->status == 0) && (exec->state != NULL) &&
3287 ((exec->inputString[exec->index] != 0) ||
3288 ((exec->state != NULL) &&
3289 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3290 xmlRegTransPtr trans;
3291 xmlRegAtomPtr atom;
3294 * If end of input on non-terminal state, rollback, however we may
3295 * still have epsilon like transition for counted transitions
3296 * on counters, in that case don't break too early. Additionally,
3297 * if we are working on a range like "AB{0,2}", where B is not present,
3298 * we don't want to break.
3300 len = 1;
3301 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3303 * if there is a transition, we must check if
3304 * atom allows minOccurs of 0
3306 if (exec->transno < exec->state->nbTrans) {
3307 trans = &exec->state->trans[exec->transno];
3308 if (trans->to >=0) {
3309 atom = trans->atom;
3310 if (!((atom->min == 0) && (atom->max > 0)))
3311 goto rollback;
3313 } else
3314 goto rollback;
3317 exec->transcount = 0;
3318 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3319 trans = &exec->state->trans[exec->transno];
3320 if (trans->to < 0)
3321 continue;
3322 atom = trans->atom;
3323 ret = 0;
3324 deter = 1;
3325 if (trans->count >= 0) {
3326 int count;
3327 xmlRegCounterPtr counter;
3329 if (exec->counts == NULL) {
3330 exec->status = -1;
3331 goto error;
3334 * A counted transition.
3337 count = exec->counts[trans->count];
3338 counter = &exec->comp->counters[trans->count];
3339 #ifdef DEBUG_REGEXP_EXEC
3340 printf("testing count %d: val %d, min %d, max %d\n",
3341 trans->count, count, counter->min, counter->max);
3342 #endif
3343 ret = ((count >= counter->min) && (count <= counter->max));
3344 if ((ret) && (counter->min != counter->max))
3345 deter = 0;
3346 } else if (atom == NULL) {
3347 fprintf(stderr, "epsilon transition left at runtime\n");
3348 exec->status = -2;
3349 break;
3350 } else if (exec->inputString[exec->index] != 0) {
3351 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3352 ret = xmlRegCheckCharacter(atom, codepoint);
3353 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3354 xmlRegStatePtr to = comp->states[trans->to];
3357 * this is a multiple input sequence
3358 * If there is a counter associated increment it now.
3359 * do not increment if the counter is already over the
3360 * maximum limit in which case get to next transition
3362 if (trans->counter >= 0) {
3363 xmlRegCounterPtr counter;
3365 if ((exec->counts == NULL) ||
3366 (exec->comp == NULL) ||
3367 (exec->comp->counters == NULL)) {
3368 exec->status = -1;
3369 goto error;
3371 counter = &exec->comp->counters[trans->counter];
3372 if (exec->counts[trans->counter] >= counter->max)
3373 continue; /* for loop on transitions */
3375 /* Save before incrementing */
3376 if (exec->state->nbTrans > exec->transno + 1) {
3377 xmlFARegExecSave(exec);
3379 if (trans->counter >= 0) {
3380 #ifdef DEBUG_REGEXP_EXEC
3381 printf("Increasing count %d\n", trans->counter);
3382 #endif
3383 exec->counts[trans->counter]++;
3385 exec->transcount = 1;
3386 do {
3388 * Try to progress as much as possible on the input
3390 if (exec->transcount == atom->max) {
3391 break;
3393 exec->index += len;
3395 * End of input: stop here
3397 if (exec->inputString[exec->index] == 0) {
3398 exec->index -= len;
3399 break;
3401 if (exec->transcount >= atom->min) {
3402 int transno = exec->transno;
3403 xmlRegStatePtr state = exec->state;
3406 * The transition is acceptable save it
3408 exec->transno = -1; /* trick */
3409 exec->state = to;
3410 xmlFARegExecSave(exec);
3411 exec->transno = transno;
3412 exec->state = state;
3414 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3415 len);
3416 ret = xmlRegCheckCharacter(atom, codepoint);
3417 exec->transcount++;
3418 } while (ret == 1);
3419 if (exec->transcount < atom->min)
3420 ret = 0;
3423 * If the last check failed but one transition was found
3424 * possible, rollback
3426 if (ret < 0)
3427 ret = 0;
3428 if (ret == 0) {
3429 goto rollback;
3431 if (trans->counter >= 0) {
3432 if (exec->counts == NULL) {
3433 exec->status = -1;
3434 goto error;
3436 #ifdef DEBUG_REGEXP_EXEC
3437 printf("Decreasing count %d\n", trans->counter);
3438 #endif
3439 exec->counts[trans->counter]--;
3441 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3443 * we don't match on the codepoint, but minOccurs of 0
3444 * says that's ok. Setting len to 0 inhibits stepping
3445 * over the codepoint.
3447 exec->transcount = 1;
3448 len = 0;
3449 ret = 1;
3451 } else if ((atom->min == 0) && (atom->max > 0)) {
3452 /* another spot to match when minOccurs is 0 */
3453 exec->transcount = 1;
3454 len = 0;
3455 ret = 1;
3457 if (ret == 1) {
3458 if ((trans->nd == 1) ||
3459 ((trans->count >= 0) && (deter == 0) &&
3460 (exec->state->nbTrans > exec->transno + 1))) {
3461 #ifdef DEBUG_REGEXP_EXEC
3462 if (trans->nd == 1)
3463 printf("Saving on nd transition atom %d for %c at %d\n",
3464 trans->atom->no, codepoint, exec->index);
3465 else
3466 printf("Saving on counted transition count %d for %c at %d\n",
3467 trans->count, codepoint, exec->index);
3468 #endif
3469 xmlFARegExecSave(exec);
3471 if (trans->counter >= 0) {
3472 xmlRegCounterPtr counter;
3474 /* make sure we don't go over the counter maximum value */
3475 if ((exec->counts == NULL) ||
3476 (exec->comp == NULL) ||
3477 (exec->comp->counters == NULL)) {
3478 exec->status = -1;
3479 goto error;
3481 counter = &exec->comp->counters[trans->counter];
3482 if (exec->counts[trans->counter] >= counter->max)
3483 continue; /* for loop on transitions */
3484 #ifdef DEBUG_REGEXP_EXEC
3485 printf("Increasing count %d\n", trans->counter);
3486 #endif
3487 exec->counts[trans->counter]++;
3489 if ((trans->count >= 0) &&
3490 (trans->count < REGEXP_ALL_COUNTER)) {
3491 if (exec->counts == NULL) {
3492 exec->status = -1;
3493 goto error;
3495 #ifdef DEBUG_REGEXP_EXEC
3496 printf("resetting count %d on transition\n",
3497 trans->count);
3498 #endif
3499 exec->counts[trans->count] = 0;
3501 #ifdef DEBUG_REGEXP_EXEC
3502 printf("entering state %d\n", trans->to);
3503 #endif
3504 exec->state = comp->states[trans->to];
3505 exec->transno = 0;
3506 if (trans->atom != NULL) {
3507 exec->index += len;
3509 goto progress;
3510 } else if (ret < 0) {
3511 exec->status = -4;
3512 break;
3515 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3516 rollback:
3518 * Failed to find a way out
3520 exec->determinist = 0;
3521 #ifdef DEBUG_REGEXP_EXEC
3522 printf("rollback from state %d on %d:%c\n", exec->state->no,
3523 codepoint,codepoint);
3524 #endif
3525 xmlFARegExecRollBack(exec);
3527 progress:
3528 continue;
3530 error:
3531 if (exec->rollbacks != NULL) {
3532 if (exec->counts != NULL) {
3533 int i;
3535 for (i = 0;i < exec->maxRollbacks;i++)
3536 if (exec->rollbacks[i].counts != NULL)
3537 xmlFree(exec->rollbacks[i].counts);
3539 xmlFree(exec->rollbacks);
3541 if (exec->state == NULL)
3542 return(-1);
3543 if (exec->counts != NULL)
3544 xmlFree(exec->counts);
3545 if (exec->status == 0)
3546 return(1);
3547 if (exec->status == -1) {
3548 if (exec->nbPush > MAX_PUSH)
3549 return(-1);
3550 return(0);
3552 return(exec->status);
3555 /************************************************************************
3557 * Progressive interface to the verifier one atom at a time *
3559 ************************************************************************/
3560 #ifdef DEBUG_ERR
3561 static void testerr(xmlRegExecCtxtPtr exec);
3562 #endif
3565 * xmlRegNewExecCtxt:
3566 * @comp: a precompiled regular expression
3567 * @callback: a callback function used for handling progresses in the
3568 * automata matching phase
3569 * @data: the context data associated to the callback in this context
3571 * Build a context used for progressive evaluation of a regexp.
3573 * Returns the new context
3575 xmlRegExecCtxtPtr
3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3577 xmlRegExecCtxtPtr exec;
3579 if (comp == NULL)
3580 return(NULL);
3581 if ((comp->compact == NULL) && (comp->states == NULL))
3582 return(NULL);
3583 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3584 if (exec == NULL) {
3585 xmlRegexpErrMemory(NULL, "creating execution context");
3586 return(NULL);
3588 memset(exec, 0, sizeof(xmlRegExecCtxt));
3589 exec->inputString = NULL;
3590 exec->index = 0;
3591 exec->determinist = 1;
3592 exec->maxRollbacks = 0;
3593 exec->nbRollbacks = 0;
3594 exec->rollbacks = NULL;
3595 exec->status = 0;
3596 exec->comp = comp;
3597 if (comp->compact == NULL)
3598 exec->state = comp->states[0];
3599 exec->transno = 0;
3600 exec->transcount = 0;
3601 exec->callback = callback;
3602 exec->data = data;
3603 if (comp->nbCounters > 0) {
3605 * For error handling, exec->counts is allocated twice the size
3606 * the second half is used to store the data in case of rollback
3608 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3609 * 2);
3610 if (exec->counts == NULL) {
3611 xmlRegexpErrMemory(NULL, "creating execution context");
3612 xmlFree(exec);
3613 return(NULL);
3615 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3616 exec->errCounts = &exec->counts[comp->nbCounters];
3617 } else {
3618 exec->counts = NULL;
3619 exec->errCounts = NULL;
3621 exec->inputStackMax = 0;
3622 exec->inputStackNr = 0;
3623 exec->inputStack = NULL;
3624 exec->errStateNo = -1;
3625 exec->errString = NULL;
3626 exec->nbPush = 0;
3627 return(exec);
3631 * xmlRegFreeExecCtxt:
3632 * @exec: a regular expression evaluation context
3634 * Free the structures associated to a regular expression evaluation context.
3636 void
3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3638 if (exec == NULL)
3639 return;
3641 if (exec->rollbacks != NULL) {
3642 if (exec->counts != NULL) {
3643 int i;
3645 for (i = 0;i < exec->maxRollbacks;i++)
3646 if (exec->rollbacks[i].counts != NULL)
3647 xmlFree(exec->rollbacks[i].counts);
3649 xmlFree(exec->rollbacks);
3651 if (exec->counts != NULL)
3652 xmlFree(exec->counts);
3653 if (exec->inputStack != NULL) {
3654 int i;
3656 for (i = 0;i < exec->inputStackNr;i++) {
3657 if (exec->inputStack[i].value != NULL)
3658 xmlFree(exec->inputStack[i].value);
3660 xmlFree(exec->inputStack);
3662 if (exec->errString != NULL)
3663 xmlFree(exec->errString);
3664 xmlFree(exec);
3667 static void
3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3669 void *data) {
3670 #ifdef DEBUG_PUSH
3671 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3672 #endif
3673 if (exec->inputStackMax == 0) {
3674 exec->inputStackMax = 4;
3675 exec->inputStack = (xmlRegInputTokenPtr)
3676 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3677 if (exec->inputStack == NULL) {
3678 xmlRegexpErrMemory(NULL, "pushing input string");
3679 exec->inputStackMax = 0;
3680 return;
3682 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3683 xmlRegInputTokenPtr tmp;
3685 exec->inputStackMax *= 2;
3686 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3687 exec->inputStackMax * sizeof(xmlRegInputToken));
3688 if (tmp == NULL) {
3689 xmlRegexpErrMemory(NULL, "pushing input string");
3690 exec->inputStackMax /= 2;
3691 return;
3693 exec->inputStack = tmp;
3695 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3696 exec->inputStack[exec->inputStackNr].data = data;
3697 exec->inputStackNr++;
3698 exec->inputStack[exec->inputStackNr].value = NULL;
3699 exec->inputStack[exec->inputStackNr].data = NULL;
3703 * xmlRegStrEqualWildcard:
3704 * @expStr: the string to be evaluated
3705 * @valStr: the validation string
3707 * Checks if both strings are equal or have the same content. "*"
3708 * can be used as a wildcard in @valStr; "|" is used as a separator of
3709 * substrings in both @expStr and @valStr.
3711 * Returns 1 if the comparison is satisfied and the number of substrings
3712 * is equal, 0 otherwise.
3715 static int
3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3717 if (expStr == valStr) return(1);
3718 if (expStr == NULL) return(0);
3719 if (valStr == NULL) return(0);
3720 do {
3722 * Eval if we have a wildcard for the current item.
3724 if (*expStr != *valStr) {
3725 /* if one of them starts with a wildcard make valStr be it */
3726 if (*valStr == '*') {
3727 const xmlChar *tmp;
3729 tmp = valStr;
3730 valStr = expStr;
3731 expStr = tmp;
3733 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3734 do {
3735 if (*valStr == XML_REG_STRING_SEPARATOR)
3736 break;
3737 valStr++;
3738 } while (*valStr != 0);
3739 continue;
3740 } else
3741 return(0);
3743 expStr++;
3744 valStr++;
3745 } while (*valStr != 0);
3746 if (*expStr != 0)
3747 return (0);
3748 else
3749 return (1);
3753 * xmlRegCompactPushString:
3754 * @exec: a regexp execution context
3755 * @comp: the precompiled exec with a compact table
3756 * @value: a string token input
3757 * @data: data associated to the token to reuse in callbacks
3759 * Push one input token in the execution context
3761 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3762 * a negative value in case of error.
3764 static int
3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3766 xmlRegexpPtr comp,
3767 const xmlChar *value,
3768 void *data) {
3769 int state = exec->index;
3770 int i, target;
3772 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3773 return(-1);
3775 if (value == NULL) {
3777 * are we at a final state ?
3779 if (comp->compact[state * (comp->nbstrings + 1)] ==
3780 XML_REGEXP_FINAL_STATE)
3781 return(1);
3782 return(0);
3785 #ifdef DEBUG_PUSH
3786 printf("value pushed: %s\n", value);
3787 #endif
3790 * Examine all outside transitions from current state
3792 for (i = 0;i < comp->nbstrings;i++) {
3793 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3794 if ((target > 0) && (target <= comp->nbstates)) {
3795 target--; /* to avoid 0 */
3796 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3797 exec->index = target;
3798 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3799 exec->callback(exec->data, value,
3800 comp->transdata[state * comp->nbstrings + i], data);
3802 #ifdef DEBUG_PUSH
3803 printf("entering state %d\n", target);
3804 #endif
3805 if (comp->compact[target * (comp->nbstrings + 1)] ==
3806 XML_REGEXP_SINK_STATE)
3807 goto error;
3809 if (comp->compact[target * (comp->nbstrings + 1)] ==
3810 XML_REGEXP_FINAL_STATE)
3811 return(1);
3812 return(0);
3817 * Failed to find an exit transition out from current state for the
3818 * current token
3820 #ifdef DEBUG_PUSH
3821 printf("failed to find a transition for %s on state %d\n", value, state);
3822 #endif
3823 error:
3824 if (exec->errString != NULL)
3825 xmlFree(exec->errString);
3826 exec->errString = xmlStrdup(value);
3827 exec->errStateNo = state;
3828 exec->status = -1;
3829 #ifdef DEBUG_ERR
3830 testerr(exec);
3831 #endif
3832 return(-1);
3836 * xmlRegExecPushStringInternal:
3837 * @exec: a regexp execution context or NULL to indicate the end
3838 * @value: a string token input
3839 * @data: data associated to the token to reuse in callbacks
3840 * @compound: value was assembled from 2 strings
3842 * Push one input token in the execution context
3844 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3845 * a negative value in case of error.
3847 static int
3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3849 void *data, int compound) {
3850 xmlRegTransPtr trans;
3851 xmlRegAtomPtr atom;
3852 int ret;
3853 int final = 0;
3854 int progress = 1;
3856 if (exec == NULL)
3857 return(-1);
3858 if (exec->comp == NULL)
3859 return(-1);
3860 if (exec->status != 0)
3861 return(exec->status);
3863 if (exec->comp->compact != NULL)
3864 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3866 if (value == NULL) {
3867 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3868 return(1);
3869 final = 1;
3872 #ifdef DEBUG_PUSH
3873 printf("value pushed: %s\n", value);
3874 #endif
3876 * If we have an active rollback stack push the new value there
3877 * and get back to where we were left
3879 if ((value != NULL) && (exec->inputStackNr > 0)) {
3880 xmlFARegExecSaveInputString(exec, value, data);
3881 value = exec->inputStack[exec->index].value;
3882 data = exec->inputStack[exec->index].data;
3883 #ifdef DEBUG_PUSH
3884 printf("value loaded: %s\n", value);
3885 #endif
3888 while ((exec->status == 0) &&
3889 ((value != NULL) ||
3890 ((final == 1) &&
3891 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3894 * End of input on non-terminal state, rollback, however we may
3895 * still have epsilon like transition for counted transitions
3896 * on counters, in that case don't break too early.
3898 if ((value == NULL) && (exec->counts == NULL))
3899 goto rollback;
3901 exec->transcount = 0;
3902 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3903 trans = &exec->state->trans[exec->transno];
3904 if (trans->to < 0)
3905 continue;
3906 atom = trans->atom;
3907 ret = 0;
3908 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3909 int i;
3910 int count;
3911 xmlRegTransPtr t;
3912 xmlRegCounterPtr counter;
3914 ret = 0;
3916 #ifdef DEBUG_PUSH
3917 printf("testing all lax %d\n", trans->count);
3918 #endif
3920 * Check all counted transitions from the current state
3922 if ((value == NULL) && (final)) {
3923 ret = 1;
3924 } else if (value != NULL) {
3925 for (i = 0;i < exec->state->nbTrans;i++) {
3926 t = &exec->state->trans[i];
3927 if ((t->counter < 0) || (t == trans))
3928 continue;
3929 counter = &exec->comp->counters[t->counter];
3930 count = exec->counts[t->counter];
3931 if ((count < counter->max) &&
3932 (t->atom != NULL) &&
3933 (xmlStrEqual(value, t->atom->valuep))) {
3934 ret = 0;
3935 break;
3937 if ((count >= counter->min) &&
3938 (count < counter->max) &&
3939 (t->atom != NULL) &&
3940 (xmlStrEqual(value, t->atom->valuep))) {
3941 ret = 1;
3942 break;
3946 } else if (trans->count == REGEXP_ALL_COUNTER) {
3947 int i;
3948 int count;
3949 xmlRegTransPtr t;
3950 xmlRegCounterPtr counter;
3952 ret = 1;
3954 #ifdef DEBUG_PUSH
3955 printf("testing all %d\n", trans->count);
3956 #endif
3958 * Check all counted transitions from the current state
3960 for (i = 0;i < exec->state->nbTrans;i++) {
3961 t = &exec->state->trans[i];
3962 if ((t->counter < 0) || (t == trans))
3963 continue;
3964 counter = &exec->comp->counters[t->counter];
3965 count = exec->counts[t->counter];
3966 if ((count < counter->min) || (count > counter->max)) {
3967 ret = 0;
3968 break;
3971 } else if (trans->count >= 0) {
3972 int count;
3973 xmlRegCounterPtr counter;
3976 * A counted transition.
3979 count = exec->counts[trans->count];
3980 counter = &exec->comp->counters[trans->count];
3981 #ifdef DEBUG_PUSH
3982 printf("testing count %d: val %d, min %d, max %d\n",
3983 trans->count, count, counter->min, counter->max);
3984 #endif
3985 ret = ((count >= counter->min) && (count <= counter->max));
3986 } else if (atom == NULL) {
3987 fprintf(stderr, "epsilon transition left at runtime\n");
3988 exec->status = -2;
3989 break;
3990 } else if (value != NULL) {
3991 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3992 if (atom->neg) {
3993 ret = !ret;
3994 if (!compound)
3995 ret = 0;
3997 if ((ret == 1) && (trans->counter >= 0)) {
3998 xmlRegCounterPtr counter;
3999 int count;
4001 count = exec->counts[trans->counter];
4002 counter = &exec->comp->counters[trans->counter];
4003 if (count >= counter->max)
4004 ret = 0;
4007 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4008 xmlRegStatePtr to = exec->comp->states[trans->to];
4011 * this is a multiple input sequence
4013 if (exec->state->nbTrans > exec->transno + 1) {
4014 if (exec->inputStackNr <= 0) {
4015 xmlFARegExecSaveInputString(exec, value, data);
4017 xmlFARegExecSave(exec);
4019 exec->transcount = 1;
4020 do {
4022 * Try to progress as much as possible on the input
4024 if (exec->transcount == atom->max) {
4025 break;
4027 exec->index++;
4028 value = exec->inputStack[exec->index].value;
4029 data = exec->inputStack[exec->index].data;
4030 #ifdef DEBUG_PUSH
4031 printf("value loaded: %s\n", value);
4032 #endif
4035 * End of input: stop here
4037 if (value == NULL) {
4038 exec->index --;
4039 break;
4041 if (exec->transcount >= atom->min) {
4042 int transno = exec->transno;
4043 xmlRegStatePtr state = exec->state;
4046 * The transition is acceptable save it
4048 exec->transno = -1; /* trick */
4049 exec->state = to;
4050 if (exec->inputStackNr <= 0) {
4051 xmlFARegExecSaveInputString(exec, value, data);
4053 xmlFARegExecSave(exec);
4054 exec->transno = transno;
4055 exec->state = state;
4057 ret = xmlStrEqual(value, atom->valuep);
4058 exec->transcount++;
4059 } while (ret == 1);
4060 if (exec->transcount < atom->min)
4061 ret = 0;
4064 * If the last check failed but one transition was found
4065 * possible, rollback
4067 if (ret < 0)
4068 ret = 0;
4069 if (ret == 0) {
4070 goto rollback;
4074 if (ret == 1) {
4075 if ((exec->callback != NULL) && (atom != NULL) &&
4076 (data != NULL)) {
4077 exec->callback(exec->data, atom->valuep,
4078 atom->data, data);
4080 if (exec->state->nbTrans > exec->transno + 1) {
4081 if (exec->inputStackNr <= 0) {
4082 xmlFARegExecSaveInputString(exec, value, data);
4084 xmlFARegExecSave(exec);
4086 if (trans->counter >= 0) {
4087 #ifdef DEBUG_PUSH
4088 printf("Increasing count %d\n", trans->counter);
4089 #endif
4090 exec->counts[trans->counter]++;
4092 if ((trans->count >= 0) &&
4093 (trans->count < REGEXP_ALL_COUNTER)) {
4094 #ifdef DEBUG_REGEXP_EXEC
4095 printf("resetting count %d on transition\n",
4096 trans->count);
4097 #endif
4098 exec->counts[trans->count] = 0;
4100 #ifdef DEBUG_PUSH
4101 printf("entering state %d\n", trans->to);
4102 #endif
4103 if ((exec->comp->states[trans->to] != NULL) &&
4104 (exec->comp->states[trans->to]->type ==
4105 XML_REGEXP_SINK_STATE)) {
4107 * entering a sink state, save the current state as error
4108 * state.
4110 if (exec->errString != NULL)
4111 xmlFree(exec->errString);
4112 exec->errString = xmlStrdup(value);
4113 exec->errState = exec->state;
4114 memcpy(exec->errCounts, exec->counts,
4115 exec->comp->nbCounters * sizeof(int));
4117 exec->state = exec->comp->states[trans->to];
4118 exec->transno = 0;
4119 if (trans->atom != NULL) {
4120 if (exec->inputStack != NULL) {
4121 exec->index++;
4122 if (exec->index < exec->inputStackNr) {
4123 value = exec->inputStack[exec->index].value;
4124 data = exec->inputStack[exec->index].data;
4125 #ifdef DEBUG_PUSH
4126 printf("value loaded: %s\n", value);
4127 #endif
4128 } else {
4129 value = NULL;
4130 data = NULL;
4131 #ifdef DEBUG_PUSH
4132 printf("end of input\n");
4133 #endif
4135 } else {
4136 value = NULL;
4137 data = NULL;
4138 #ifdef DEBUG_PUSH
4139 printf("end of input\n");
4140 #endif
4143 goto progress;
4144 } else if (ret < 0) {
4145 exec->status = -4;
4146 break;
4149 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4150 rollback:
4152 * if we didn't yet rollback on the current input
4153 * store the current state as the error state.
4155 if ((progress) && (exec->state != NULL) &&
4156 (exec->state->type != XML_REGEXP_SINK_STATE)) {
4157 progress = 0;
4158 if (exec->errString != NULL)
4159 xmlFree(exec->errString);
4160 exec->errString = xmlStrdup(value);
4161 exec->errState = exec->state;
4162 if (exec->comp->nbCounters)
4163 memcpy(exec->errCounts, exec->counts,
4164 exec->comp->nbCounters * sizeof(int));
4168 * Failed to find a way out
4170 exec->determinist = 0;
4171 xmlFARegExecRollBack(exec);
4172 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4173 value = exec->inputStack[exec->index].value;
4174 data = exec->inputStack[exec->index].data;
4175 #ifdef DEBUG_PUSH
4176 printf("value loaded: %s\n", value);
4177 #endif
4180 continue;
4181 progress:
4182 progress = 1;
4183 continue;
4185 if (exec->status == 0) {
4186 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4188 #ifdef DEBUG_ERR
4189 if (exec->status < 0) {
4190 testerr(exec);
4192 #endif
4193 return(exec->status);
4197 * xmlRegExecPushString:
4198 * @exec: a regexp execution context or NULL to indicate the end
4199 * @value: a string token input
4200 * @data: data associated to the token to reuse in callbacks
4202 * Push one input token in the execution context
4204 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4205 * a negative value in case of error.
4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4209 void *data) {
4210 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4214 * xmlRegExecPushString2:
4215 * @exec: a regexp execution context or NULL to indicate the end
4216 * @value: the first string token input
4217 * @value2: the second string token input
4218 * @data: data associated to the token to reuse in callbacks
4220 * Push one input token in the execution context
4222 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4223 * a negative value in case of error.
4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4227 const xmlChar *value2, void *data) {
4228 xmlChar buf[150];
4229 int lenn, lenp, ret;
4230 xmlChar *str;
4232 if (exec == NULL)
4233 return(-1);
4234 if (exec->comp == NULL)
4235 return(-1);
4236 if (exec->status != 0)
4237 return(exec->status);
4239 if (value2 == NULL)
4240 return(xmlRegExecPushString(exec, value, data));
4242 lenn = strlen((char *) value2);
4243 lenp = strlen((char *) value);
4245 if (150 < lenn + lenp + 2) {
4246 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4247 if (str == NULL) {
4248 exec->status = -1;
4249 return(-1);
4251 } else {
4252 str = buf;
4254 memcpy(&str[0], value, lenp);
4255 str[lenp] = XML_REG_STRING_SEPARATOR;
4256 memcpy(&str[lenp + 1], value2, lenn);
4257 str[lenn + lenp + 1] = 0;
4259 if (exec->comp->compact != NULL)
4260 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4261 else
4262 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4264 if (str != buf)
4265 xmlFree(str);
4266 return(ret);
4270 * xmlRegExecGetValues:
4271 * @exec: a regexp execution context
4272 * @err: error extraction or normal one
4273 * @nbval: pointer to the number of accepted values IN/OUT
4274 * @nbneg: return number of negative transitions
4275 * @values: pointer to the array of acceptable values
4276 * @terminal: return value if this was a terminal state
4278 * Extract information from the regexp execution, internal routine to
4279 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4281 * Returns: 0 in case of success or -1 in case of error.
4283 static int
4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4285 int *nbval, int *nbneg,
4286 xmlChar **values, int *terminal) {
4287 int maxval;
4288 int nb = 0;
4290 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4291 (values == NULL) || (*nbval <= 0))
4292 return(-1);
4294 maxval = *nbval;
4295 *nbval = 0;
4296 *nbneg = 0;
4297 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4298 xmlRegexpPtr comp;
4299 int target, i, state;
4301 comp = exec->comp;
4303 if (err) {
4304 if (exec->errStateNo == -1) return(-1);
4305 state = exec->errStateNo;
4306 } else {
4307 state = exec->index;
4309 if (terminal != NULL) {
4310 if (comp->compact[state * (comp->nbstrings + 1)] ==
4311 XML_REGEXP_FINAL_STATE)
4312 *terminal = 1;
4313 else
4314 *terminal = 0;
4316 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4317 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4318 if ((target > 0) && (target <= comp->nbstates) &&
4319 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4320 XML_REGEXP_SINK_STATE)) {
4321 values[nb++] = comp->stringMap[i];
4322 (*nbval)++;
4325 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4326 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4327 if ((target > 0) && (target <= comp->nbstates) &&
4328 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4329 XML_REGEXP_SINK_STATE)) {
4330 values[nb++] = comp->stringMap[i];
4331 (*nbneg)++;
4334 } else {
4335 int transno;
4336 xmlRegTransPtr trans;
4337 xmlRegAtomPtr atom;
4338 xmlRegStatePtr state;
4340 if (terminal != NULL) {
4341 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4342 *terminal = 1;
4343 else
4344 *terminal = 0;
4347 if (err) {
4348 if (exec->errState == NULL) return(-1);
4349 state = exec->errState;
4350 } else {
4351 if (exec->state == NULL) return(-1);
4352 state = exec->state;
4354 for (transno = 0;
4355 (transno < state->nbTrans) && (nb < maxval);
4356 transno++) {
4357 trans = &state->trans[transno];
4358 if (trans->to < 0)
4359 continue;
4360 atom = trans->atom;
4361 if ((atom == NULL) || (atom->valuep == NULL))
4362 continue;
4363 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4364 /* this should not be reached but ... */
4365 TODO;
4366 } else if (trans->count == REGEXP_ALL_COUNTER) {
4367 /* this should not be reached but ... */
4368 TODO;
4369 } else if (trans->counter >= 0) {
4370 xmlRegCounterPtr counter = NULL;
4371 int count;
4373 if (err)
4374 count = exec->errCounts[trans->counter];
4375 else
4376 count = exec->counts[trans->counter];
4377 if (exec->comp != NULL)
4378 counter = &exec->comp->counters[trans->counter];
4379 if ((counter == NULL) || (count < counter->max)) {
4380 if (atom->neg)
4381 values[nb++] = (xmlChar *) atom->valuep2;
4382 else
4383 values[nb++] = (xmlChar *) atom->valuep;
4384 (*nbval)++;
4386 } else {
4387 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4388 (exec->comp->states[trans->to]->type !=
4389 XML_REGEXP_SINK_STATE)) {
4390 if (atom->neg)
4391 values[nb++] = (xmlChar *) atom->valuep2;
4392 else
4393 values[nb++] = (xmlChar *) atom->valuep;
4394 (*nbval)++;
4398 for (transno = 0;
4399 (transno < state->nbTrans) && (nb < maxval);
4400 transno++) {
4401 trans = &state->trans[transno];
4402 if (trans->to < 0)
4403 continue;
4404 atom = trans->atom;
4405 if ((atom == NULL) || (atom->valuep == NULL))
4406 continue;
4407 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4408 continue;
4409 } else if (trans->count == REGEXP_ALL_COUNTER) {
4410 continue;
4411 } else if (trans->counter >= 0) {
4412 continue;
4413 } else {
4414 if ((exec->comp->states[trans->to] != NULL) &&
4415 (exec->comp->states[trans->to]->type ==
4416 XML_REGEXP_SINK_STATE)) {
4417 if (atom->neg)
4418 values[nb++] = (xmlChar *) atom->valuep2;
4419 else
4420 values[nb++] = (xmlChar *) atom->valuep;
4421 (*nbneg)++;
4426 return(0);
4430 * xmlRegExecNextValues:
4431 * @exec: a regexp execution context
4432 * @nbval: pointer to the number of accepted values IN/OUT
4433 * @nbneg: return number of negative transitions
4434 * @values: pointer to the array of acceptable values
4435 * @terminal: return value if this was a terminal state
4437 * Extract information from the regexp execution,
4438 * the parameter @values must point to an array of @nbval string pointers
4439 * on return nbval will contain the number of possible strings in that
4440 * state and the @values array will be updated with them. The string values
4441 * returned will be freed with the @exec context and don't need to be
4442 * deallocated.
4444 * Returns: 0 in case of success or -1 in case of error.
4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4448 xmlChar **values, int *terminal) {
4449 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4453 * xmlRegExecErrInfo:
4454 * @exec: a regexp execution context generating an error
4455 * @string: return value for the error string
4456 * @nbval: pointer to the number of accepted values IN/OUT
4457 * @nbneg: return number of negative transitions
4458 * @values: pointer to the array of acceptable values
4459 * @terminal: return value if this was a terminal state
4461 * Extract error information from the regexp execution, the parameter
4462 * @string will be updated with the value pushed and not accepted,
4463 * the parameter @values must point to an array of @nbval string pointers
4464 * on return nbval will contain the number of possible strings in that
4465 * state and the @values array will be updated with them. The string values
4466 * returned will be freed with the @exec context and don't need to be
4467 * deallocated.
4469 * Returns: 0 in case of success or -1 in case of error.
4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4473 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4474 if (exec == NULL)
4475 return(-1);
4476 if (string != NULL) {
4477 if (exec->status != 0)
4478 *string = exec->errString;
4479 else
4480 *string = NULL;
4482 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4485 #ifdef DEBUG_ERR
4486 static void testerr(xmlRegExecCtxtPtr exec) {
4487 const xmlChar *string;
4488 xmlChar *values[5];
4489 int nb = 5;
4490 int nbneg;
4491 int terminal;
4492 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4494 #endif
4496 #if 0
4497 static int
4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4499 xmlRegTransPtr trans;
4500 xmlRegAtomPtr atom;
4501 int ret;
4502 int codepoint, len;
4504 if (exec == NULL)
4505 return(-1);
4506 if (exec->status != 0)
4507 return(exec->status);
4509 while ((exec->status == 0) &&
4510 ((exec->inputString[exec->index] != 0) ||
4511 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4514 * End of input on non-terminal state, rollback, however we may
4515 * still have epsilon like transition for counted transitions
4516 * on counters, in that case don't break too early.
4518 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4519 goto rollback;
4521 exec->transcount = 0;
4522 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4523 trans = &exec->state->trans[exec->transno];
4524 if (trans->to < 0)
4525 continue;
4526 atom = trans->atom;
4527 ret = 0;
4528 if (trans->count >= 0) {
4529 int count;
4530 xmlRegCounterPtr counter;
4533 * A counted transition.
4536 count = exec->counts[trans->count];
4537 counter = &exec->comp->counters[trans->count];
4538 #ifdef DEBUG_REGEXP_EXEC
4539 printf("testing count %d: val %d, min %d, max %d\n",
4540 trans->count, count, counter->min, counter->max);
4541 #endif
4542 ret = ((count >= counter->min) && (count <= counter->max));
4543 } else if (atom == NULL) {
4544 fprintf(stderr, "epsilon transition left at runtime\n");
4545 exec->status = -2;
4546 break;
4547 } else if (exec->inputString[exec->index] != 0) {
4548 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4549 ret = xmlRegCheckCharacter(atom, codepoint);
4550 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4551 xmlRegStatePtr to = exec->comp->states[trans->to];
4554 * this is a multiple input sequence
4556 if (exec->state->nbTrans > exec->transno + 1) {
4557 xmlFARegExecSave(exec);
4559 exec->transcount = 1;
4560 do {
4562 * Try to progress as much as possible on the input
4564 if (exec->transcount == atom->max) {
4565 break;
4567 exec->index += len;
4569 * End of input: stop here
4571 if (exec->inputString[exec->index] == 0) {
4572 exec->index -= len;
4573 break;
4575 if (exec->transcount >= atom->min) {
4576 int transno = exec->transno;
4577 xmlRegStatePtr state = exec->state;
4580 * The transition is acceptable save it
4582 exec->transno = -1; /* trick */
4583 exec->state = to;
4584 xmlFARegExecSave(exec);
4585 exec->transno = transno;
4586 exec->state = state;
4588 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4589 len);
4590 ret = xmlRegCheckCharacter(atom, codepoint);
4591 exec->transcount++;
4592 } while (ret == 1);
4593 if (exec->transcount < atom->min)
4594 ret = 0;
4597 * If the last check failed but one transition was found
4598 * possible, rollback
4600 if (ret < 0)
4601 ret = 0;
4602 if (ret == 0) {
4603 goto rollback;
4607 if (ret == 1) {
4608 if (exec->state->nbTrans > exec->transno + 1) {
4609 xmlFARegExecSave(exec);
4612 * restart count for expressions like this ((abc){2})*
4614 if (trans->count >= 0) {
4615 #ifdef DEBUG_REGEXP_EXEC
4616 printf("Reset count %d\n", trans->count);
4617 #endif
4618 exec->counts[trans->count] = 0;
4620 if (trans->counter >= 0) {
4621 #ifdef DEBUG_REGEXP_EXEC
4622 printf("Increasing count %d\n", trans->counter);
4623 #endif
4624 exec->counts[trans->counter]++;
4626 #ifdef DEBUG_REGEXP_EXEC
4627 printf("entering state %d\n", trans->to);
4628 #endif
4629 exec->state = exec->comp->states[trans->to];
4630 exec->transno = 0;
4631 if (trans->atom != NULL) {
4632 exec->index += len;
4634 goto progress;
4635 } else if (ret < 0) {
4636 exec->status = -4;
4637 break;
4640 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4641 rollback:
4643 * Failed to find a way out
4645 exec->determinist = 0;
4646 xmlFARegExecRollBack(exec);
4648 progress:
4649 continue;
4652 #endif
4653 /************************************************************************
4655 * Parser for the Schemas Datatype Regular Expressions *
4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4658 ************************************************************************/
4661 * xmlFAIsChar:
4662 * @ctxt: a regexp parser context
4664 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4666 static int
4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4668 int cur;
4669 int len;
4671 cur = CUR_SCHAR(ctxt->cur, len);
4672 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4673 (cur == '*') || (cur == '+') || (cur == '(') ||
4674 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4675 (cur == 0x5D) || (cur == 0))
4676 return(-1);
4677 return(cur);
4681 * xmlFAParseCharProp:
4682 * @ctxt: a regexp parser context
4684 * [27] charProp ::= IsCategory | IsBlock
4685 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4686 * Separators | Symbols | Others
4687 * [29] Letters ::= 'L' [ultmo]?
4688 * [30] Marks ::= 'M' [nce]?
4689 * [31] Numbers ::= 'N' [dlo]?
4690 * [32] Punctuation ::= 'P' [cdseifo]?
4691 * [33] Separators ::= 'Z' [slp]?
4692 * [34] Symbols ::= 'S' [mcko]?
4693 * [35] Others ::= 'C' [cfon]?
4694 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4696 static void
4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4698 int cur;
4699 xmlRegAtomType type = (xmlRegAtomType) 0;
4700 xmlChar *blockName = NULL;
4702 cur = CUR;
4703 if (cur == 'L') {
4704 NEXT;
4705 cur = CUR;
4706 if (cur == 'u') {
4707 NEXT;
4708 type = XML_REGEXP_LETTER_UPPERCASE;
4709 } else if (cur == 'l') {
4710 NEXT;
4711 type = XML_REGEXP_LETTER_LOWERCASE;
4712 } else if (cur == 't') {
4713 NEXT;
4714 type = XML_REGEXP_LETTER_TITLECASE;
4715 } else if (cur == 'm') {
4716 NEXT;
4717 type = XML_REGEXP_LETTER_MODIFIER;
4718 } else if (cur == 'o') {
4719 NEXT;
4720 type = XML_REGEXP_LETTER_OTHERS;
4721 } else {
4722 type = XML_REGEXP_LETTER;
4724 } else if (cur == 'M') {
4725 NEXT;
4726 cur = CUR;
4727 if (cur == 'n') {
4728 NEXT;
4729 /* nonspacing */
4730 type = XML_REGEXP_MARK_NONSPACING;
4731 } else if (cur == 'c') {
4732 NEXT;
4733 /* spacing combining */
4734 type = XML_REGEXP_MARK_SPACECOMBINING;
4735 } else if (cur == 'e') {
4736 NEXT;
4737 /* enclosing */
4738 type = XML_REGEXP_MARK_ENCLOSING;
4739 } else {
4740 /* all marks */
4741 type = XML_REGEXP_MARK;
4743 } else if (cur == 'N') {
4744 NEXT;
4745 cur = CUR;
4746 if (cur == 'd') {
4747 NEXT;
4748 /* digital */
4749 type = XML_REGEXP_NUMBER_DECIMAL;
4750 } else if (cur == 'l') {
4751 NEXT;
4752 /* letter */
4753 type = XML_REGEXP_NUMBER_LETTER;
4754 } else if (cur == 'o') {
4755 NEXT;
4756 /* other */
4757 type = XML_REGEXP_NUMBER_OTHERS;
4758 } else {
4759 /* all numbers */
4760 type = XML_REGEXP_NUMBER;
4762 } else if (cur == 'P') {
4763 NEXT;
4764 cur = CUR;
4765 if (cur == 'c') {
4766 NEXT;
4767 /* connector */
4768 type = XML_REGEXP_PUNCT_CONNECTOR;
4769 } else if (cur == 'd') {
4770 NEXT;
4771 /* dash */
4772 type = XML_REGEXP_PUNCT_DASH;
4773 } else if (cur == 's') {
4774 NEXT;
4775 /* open */
4776 type = XML_REGEXP_PUNCT_OPEN;
4777 } else if (cur == 'e') {
4778 NEXT;
4779 /* close */
4780 type = XML_REGEXP_PUNCT_CLOSE;
4781 } else if (cur == 'i') {
4782 NEXT;
4783 /* initial quote */
4784 type = XML_REGEXP_PUNCT_INITQUOTE;
4785 } else if (cur == 'f') {
4786 NEXT;
4787 /* final quote */
4788 type = XML_REGEXP_PUNCT_FINQUOTE;
4789 } else if (cur == 'o') {
4790 NEXT;
4791 /* other */
4792 type = XML_REGEXP_PUNCT_OTHERS;
4793 } else {
4794 /* all punctuation */
4795 type = XML_REGEXP_PUNCT;
4797 } else if (cur == 'Z') {
4798 NEXT;
4799 cur = CUR;
4800 if (cur == 's') {
4801 NEXT;
4802 /* space */
4803 type = XML_REGEXP_SEPAR_SPACE;
4804 } else if (cur == 'l') {
4805 NEXT;
4806 /* line */
4807 type = XML_REGEXP_SEPAR_LINE;
4808 } else if (cur == 'p') {
4809 NEXT;
4810 /* paragraph */
4811 type = XML_REGEXP_SEPAR_PARA;
4812 } else {
4813 /* all separators */
4814 type = XML_REGEXP_SEPAR;
4816 } else if (cur == 'S') {
4817 NEXT;
4818 cur = CUR;
4819 if (cur == 'm') {
4820 NEXT;
4821 type = XML_REGEXP_SYMBOL_MATH;
4822 /* math */
4823 } else if (cur == 'c') {
4824 NEXT;
4825 type = XML_REGEXP_SYMBOL_CURRENCY;
4826 /* currency */
4827 } else if (cur == 'k') {
4828 NEXT;
4829 type = XML_REGEXP_SYMBOL_MODIFIER;
4830 /* modifiers */
4831 } else if (cur == 'o') {
4832 NEXT;
4833 type = XML_REGEXP_SYMBOL_OTHERS;
4834 /* other */
4835 } else {
4836 /* all symbols */
4837 type = XML_REGEXP_SYMBOL;
4839 } else if (cur == 'C') {
4840 NEXT;
4841 cur = CUR;
4842 if (cur == 'c') {
4843 NEXT;
4844 /* control */
4845 type = XML_REGEXP_OTHER_CONTROL;
4846 } else if (cur == 'f') {
4847 NEXT;
4848 /* format */
4849 type = XML_REGEXP_OTHER_FORMAT;
4850 } else if (cur == 'o') {
4851 NEXT;
4852 /* private use */
4853 type = XML_REGEXP_OTHER_PRIVATE;
4854 } else if (cur == 'n') {
4855 NEXT;
4856 /* not assigned */
4857 type = XML_REGEXP_OTHER_NA;
4858 } else {
4859 /* all others */
4860 type = XML_REGEXP_OTHER;
4862 } else if (cur == 'I') {
4863 const xmlChar *start;
4864 NEXT;
4865 cur = CUR;
4866 if (cur != 's') {
4867 ERROR("IsXXXX expected");
4868 return;
4870 NEXT;
4871 start = ctxt->cur;
4872 cur = CUR;
4873 if (((cur >= 'a') && (cur <= 'z')) ||
4874 ((cur >= 'A') && (cur <= 'Z')) ||
4875 ((cur >= '0') && (cur <= '9')) ||
4876 (cur == 0x2D)) {
4877 NEXT;
4878 cur = CUR;
4879 while (((cur >= 'a') && (cur <= 'z')) ||
4880 ((cur >= 'A') && (cur <= 'Z')) ||
4881 ((cur >= '0') && (cur <= '9')) ||
4882 (cur == 0x2D)) {
4883 NEXT;
4884 cur = CUR;
4887 type = XML_REGEXP_BLOCK_NAME;
4888 blockName = xmlStrndup(start, ctxt->cur - start);
4889 } else {
4890 ERROR("Unknown char property");
4891 return;
4893 if (ctxt->atom == NULL) {
4894 ctxt->atom = xmlRegNewAtom(ctxt, type);
4895 if (ctxt->atom != NULL)
4896 ctxt->atom->valuep = blockName;
4897 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899 type, 0, 0, blockName);
4903 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4905 int val = 0, i, cur;
4906 for (i = 0; i < 4; i++) {
4907 NEXT;
4908 val *= 16;
4909 cur = CUR;
4910 if (cur >= '0' && cur <= '9') {
4911 val += cur - '0';
4912 } else if (cur >= 'A' && cur <= 'F') {
4913 val += cur - 'A' + 10;
4914 } else if (cur >= 'a' && cur <= 'f') {
4915 val += cur - 'a' + 10;
4916 } else {
4917 ERROR("Expecting hex digit");
4918 return -1;
4921 return val;
4924 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4926 int val = parse_escaped_codeunit(ctxt);
4927 if (0xD800 <= val && val <= 0xDBFF) {
4928 NEXT;
4929 if (CUR == '\\') {
4930 NEXT;
4931 if (CUR == 'u') {
4932 int low = parse_escaped_codeunit(ctxt);
4933 if (0xDC00 <= low && low <= 0xDFFF) {
4934 return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4938 ERROR("Invalid low surrogate pair code unit");
4939 val = -1;
4941 return val;
4945 * xmlFAParseCharClassEsc:
4946 * @ctxt: a regexp parser context
4948 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4949 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4950 * [25] catEsc ::= '\p{' charProp '}'
4951 * [26] complEsc ::= '\P{' charProp '}'
4952 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4954 static void
4955 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4956 int cur;
4958 if (CUR == '.') {
4959 if (ctxt->atom == NULL) {
4960 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4961 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4962 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4963 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4965 NEXT;
4966 return;
4968 if (CUR != '\\') {
4969 ERROR("Escaped sequence: expecting \\");
4970 return;
4972 NEXT;
4973 cur = CUR;
4974 if (cur == 'p') {
4975 NEXT;
4976 if (CUR != '{') {
4977 ERROR("Expecting '{'");
4978 return;
4980 NEXT;
4981 xmlFAParseCharProp(ctxt);
4982 if (CUR != '}') {
4983 ERROR("Expecting '}'");
4984 return;
4986 NEXT;
4987 } else if (cur == 'P') {
4988 NEXT;
4989 if (CUR != '{') {
4990 ERROR("Expecting '{'");
4991 return;
4993 NEXT;
4994 xmlFAParseCharProp(ctxt);
4995 if (ctxt->atom != NULL)
4996 ctxt->atom->neg = 1;
4997 if (CUR != '}') {
4998 ERROR("Expecting '}'");
4999 return;
5001 NEXT;
5002 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
5003 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
5004 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
5005 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
5006 (cur == 0x5E) ||
5008 /* Non-standard escape sequences:
5009 * Java 1.8|.NET Core 3.1|MSXML 6 */
5010 (cur == '!') || /* + | + | + */
5011 (cur == '"') || /* + | + | + */
5012 (cur == '#') || /* + | + | + */
5013 (cur == '$') || /* + | + | + */
5014 (cur == '%') || /* + | + | + */
5015 (cur == ',') || /* + | + | + */
5016 (cur == '/') || /* + | + | + */
5017 (cur == ':') || /* + | + | + */
5018 (cur == ';') || /* + | + | + */
5019 (cur == '=') || /* + | + | + */
5020 (cur == '>') || /* | + | + */
5021 (cur == '@') || /* + | + | + */
5022 (cur == '`') || /* + | + | + */
5023 (cur == '~') || /* + | + | + */
5024 (cur == 'u')) { /* | + | + */
5025 if (ctxt->atom == NULL) {
5026 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5027 if (ctxt->atom != NULL) {
5028 switch (cur) {
5029 case 'n':
5030 ctxt->atom->codepoint = '\n';
5031 break;
5032 case 'r':
5033 ctxt->atom->codepoint = '\r';
5034 break;
5035 case 't':
5036 ctxt->atom->codepoint = '\t';
5037 break;
5038 case 'u':
5039 cur = parse_escaped_codepoint(ctxt);
5040 if (cur < 0) {
5041 return;
5043 ctxt->atom->codepoint = cur;
5044 break;
5045 default:
5046 ctxt->atom->codepoint = cur;
5049 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5050 switch (cur) {
5051 case 'n':
5052 cur = '\n';
5053 break;
5054 case 'r':
5055 cur = '\r';
5056 break;
5057 case 't':
5058 cur = '\t';
5059 break;
5061 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5062 XML_REGEXP_CHARVAL, cur, cur, NULL);
5064 NEXT;
5065 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5066 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5067 (cur == 'w') || (cur == 'W')) {
5068 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5070 switch (cur) {
5071 case 's':
5072 type = XML_REGEXP_ANYSPACE;
5073 break;
5074 case 'S':
5075 type = XML_REGEXP_NOTSPACE;
5076 break;
5077 case 'i':
5078 type = XML_REGEXP_INITNAME;
5079 break;
5080 case 'I':
5081 type = XML_REGEXP_NOTINITNAME;
5082 break;
5083 case 'c':
5084 type = XML_REGEXP_NAMECHAR;
5085 break;
5086 case 'C':
5087 type = XML_REGEXP_NOTNAMECHAR;
5088 break;
5089 case 'd':
5090 type = XML_REGEXP_DECIMAL;
5091 break;
5092 case 'D':
5093 type = XML_REGEXP_NOTDECIMAL;
5094 break;
5095 case 'w':
5096 type = XML_REGEXP_REALCHAR;
5097 break;
5098 case 'W':
5099 type = XML_REGEXP_NOTREALCHAR;
5100 break;
5102 NEXT;
5103 if (ctxt->atom == NULL) {
5104 ctxt->atom = xmlRegNewAtom(ctxt, type);
5105 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5106 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5107 type, 0, 0, NULL);
5109 } else {
5110 ERROR("Wrong escape sequence, misuse of character '\\'");
5115 * xmlFAParseCharRange:
5116 * @ctxt: a regexp parser context
5118 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
5119 * [18] seRange ::= charOrEsc '-' charOrEsc
5120 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5121 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5122 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5124 static void
5125 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5126 int cur, len;
5127 int start = -1;
5128 int end = -1;
5130 if (CUR == '\0') {
5131 ERROR("Expecting ']'");
5132 return;
5135 cur = CUR;
5136 if (cur == '\\') {
5137 NEXT;
5138 cur = CUR;
5139 switch (cur) {
5140 case 'n': start = 0xA; break;
5141 case 'r': start = 0xD; break;
5142 case 't': start = 0x9; break;
5143 case '\\': case '|': case '.': case '-': case '^': case '?':
5144 case '*': case '+': case '{': case '}': case '(': case ')':
5145 case '[': case ']':
5146 start = cur; break;
5147 default:
5148 ERROR("Invalid escape value");
5149 return;
5151 end = start;
5152 len = 1;
5153 } else if ((cur != 0x5B) && (cur != 0x5D)) {
5154 end = start = CUR_SCHAR(ctxt->cur, len);
5155 } else {
5156 ERROR("Expecting a char range");
5157 return;
5160 * Since we are "inside" a range, we can assume ctxt->cur is past
5161 * the start of ctxt->string, and PREV should be safe
5163 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5164 NEXTL(len);
5165 return;
5167 NEXTL(len);
5168 cur = CUR;
5169 if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5170 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5171 XML_REGEXP_CHARVAL, start, end, NULL);
5172 return;
5174 NEXT;
5175 cur = CUR;
5176 if (cur == '\\') {
5177 NEXT;
5178 cur = CUR;
5179 switch (cur) {
5180 case 'n': end = 0xA; break;
5181 case 'r': end = 0xD; break;
5182 case 't': end = 0x9; break;
5183 case '\\': case '|': case '.': case '-': case '^': case '?':
5184 case '*': case '+': case '{': case '}': case '(': case ')':
5185 case '[': case ']':
5186 end = cur; break;
5187 default:
5188 ERROR("Invalid escape value");
5189 return;
5191 len = 1;
5192 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5193 end = CUR_SCHAR(ctxt->cur, len);
5194 } else {
5195 ERROR("Expecting the end of a char range");
5196 return;
5199 /* TODO check that the values are acceptable character ranges for XML */
5200 if (end < start) {
5201 ERROR("End of range is before start of range");
5202 } else {
5203 NEXTL(len);
5204 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5205 XML_REGEXP_CHARVAL, start, end, NULL);
5207 return;
5211 * xmlFAParsePosCharGroup:
5212 * @ctxt: a regexp parser context
5214 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5216 static void
5217 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5218 do {
5219 if (CUR == '\\') {
5220 xmlFAParseCharClassEsc(ctxt);
5221 } else {
5222 xmlFAParseCharRange(ctxt);
5224 } while ((CUR != ']') && (CUR != '-') &&
5225 (CUR != 0) && (ctxt->error == 0));
5229 * xmlFAParseCharGroup:
5230 * @ctxt: a regexp parser context
5232 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5233 * [15] negCharGroup ::= '^' posCharGroup
5234 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5235 * [12] charClassExpr ::= '[' charGroup ']'
5237 static void
5238 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5239 int neg = ctxt->neg;
5241 if (CUR == '^') {
5242 NEXT;
5243 ctxt->neg = !ctxt->neg;
5244 xmlFAParsePosCharGroup(ctxt);
5245 ctxt->neg = neg;
5247 while ((CUR != ']') && (ctxt->error == 0)) {
5248 if ((CUR == '-') && (NXT(1) == '[')) {
5249 NEXT; /* eat the '-' */
5250 NEXT; /* eat the '[' */
5251 ctxt->neg = 2;
5252 xmlFAParseCharGroup(ctxt);
5253 ctxt->neg = neg;
5254 if (CUR == ']') {
5255 NEXT;
5256 } else {
5257 ERROR("charClassExpr: ']' expected");
5259 break;
5260 } else {
5261 xmlFAParsePosCharGroup(ctxt);
5267 * xmlFAParseCharClass:
5268 * @ctxt: a regexp parser context
5270 * [11] charClass ::= charClassEsc | charClassExpr
5271 * [12] charClassExpr ::= '[' charGroup ']'
5273 static void
5274 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5275 if (CUR == '[') {
5276 NEXT;
5277 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5278 if (ctxt->atom == NULL)
5279 return;
5280 xmlFAParseCharGroup(ctxt);
5281 if (CUR == ']') {
5282 NEXT;
5283 } else {
5284 ERROR("xmlFAParseCharClass: ']' expected");
5286 } else {
5287 xmlFAParseCharClassEsc(ctxt);
5292 * xmlFAParseQuantExact:
5293 * @ctxt: a regexp parser context
5295 * [8] QuantExact ::= [0-9]+
5297 * Returns 0 if success or -1 in case of error
5299 static int
5300 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5301 int ret = 0;
5302 int ok = 0;
5303 int overflow = 0;
5305 while ((CUR >= '0') && (CUR <= '9')) {
5306 if (ret > INT_MAX / 10) {
5307 overflow = 1;
5308 } else {
5309 int digit = CUR - '0';
5311 ret *= 10;
5312 if (ret > INT_MAX - digit)
5313 overflow = 1;
5314 else
5315 ret += digit;
5317 ok = 1;
5318 NEXT;
5320 if ((ok != 1) || (overflow == 1)) {
5321 return(-1);
5323 return(ret);
5327 * xmlFAParseQuantifier:
5328 * @ctxt: a regexp parser context
5330 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5331 * [5] quantity ::= quantRange | quantMin | QuantExact
5332 * [6] quantRange ::= QuantExact ',' QuantExact
5333 * [7] quantMin ::= QuantExact ','
5334 * [8] QuantExact ::= [0-9]+
5336 static int
5337 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5338 int cur;
5340 cur = CUR;
5341 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5342 if (ctxt->atom != NULL) {
5343 if (cur == '?')
5344 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5345 else if (cur == '*')
5346 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5347 else if (cur == '+')
5348 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5350 NEXT;
5351 return(1);
5353 if (cur == '{') {
5354 int min = 0, max = 0;
5356 NEXT;
5357 cur = xmlFAParseQuantExact(ctxt);
5358 if (cur >= 0)
5359 min = cur;
5360 else {
5361 ERROR("Improper quantifier");
5363 if (CUR == ',') {
5364 NEXT;
5365 if (CUR == '}')
5366 max = INT_MAX;
5367 else {
5368 cur = xmlFAParseQuantExact(ctxt);
5369 if (cur >= 0)
5370 max = cur;
5371 else {
5372 ERROR("Improper quantifier");
5376 if (CUR == '}') {
5377 NEXT;
5378 } else {
5379 ERROR("Unterminated quantifier");
5381 if (max == 0)
5382 max = min;
5383 if (ctxt->atom != NULL) {
5384 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5385 ctxt->atom->min = min;
5386 ctxt->atom->max = max;
5388 return(1);
5390 return(0);
5394 * xmlFAParseAtom:
5395 * @ctxt: a regexp parser context
5397 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5399 static int
5400 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5401 int codepoint, len;
5403 codepoint = xmlFAIsChar(ctxt);
5404 if (codepoint > 0) {
5405 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5406 if (ctxt->atom == NULL)
5407 return(-1);
5408 codepoint = CUR_SCHAR(ctxt->cur, len);
5409 ctxt->atom->codepoint = codepoint;
5410 NEXTL(len);
5411 return(1);
5412 } else if (CUR == '|') {
5413 return(0);
5414 } else if (CUR == 0) {
5415 return(0);
5416 } else if (CUR == ')') {
5417 return(0);
5418 } else if (CUR == '(') {
5419 xmlRegStatePtr start, oldend, start0;
5421 NEXT;
5422 if (ctxt->depth >= 50) {
5423 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5424 return(-1);
5427 * this extra Epsilon transition is needed if we count with 0 allowed
5428 * unfortunately this can't be known at that point
5430 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5431 start0 = ctxt->state;
5432 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5433 start = ctxt->state;
5434 oldend = ctxt->end;
5435 ctxt->end = NULL;
5436 ctxt->atom = NULL;
5437 ctxt->depth++;
5438 xmlFAParseRegExp(ctxt, 0);
5439 ctxt->depth--;
5440 if (CUR == ')') {
5441 NEXT;
5442 } else {
5443 ERROR("xmlFAParseAtom: expecting ')'");
5445 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5446 if (ctxt->atom == NULL)
5447 return(-1);
5448 ctxt->atom->start = start;
5449 ctxt->atom->start0 = start0;
5450 ctxt->atom->stop = ctxt->state;
5451 ctxt->end = oldend;
5452 return(1);
5453 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5454 xmlFAParseCharClass(ctxt);
5455 return(1);
5457 return(0);
5461 * xmlFAParsePiece:
5462 * @ctxt: a regexp parser context
5464 * [3] piece ::= atom quantifier?
5466 static int
5467 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5468 int ret;
5470 ctxt->atom = NULL;
5471 ret = xmlFAParseAtom(ctxt);
5472 if (ret == 0)
5473 return(0);
5474 if (ctxt->atom == NULL) {
5475 ERROR("internal: no atom generated");
5477 xmlFAParseQuantifier(ctxt);
5478 return(1);
5482 * xmlFAParseBranch:
5483 * @ctxt: a regexp parser context
5484 * @to: optional target to the end of the branch
5486 * @to is used to optimize by removing duplicate path in automata
5487 * in expressions like (a|b)(c|d)
5489 * [2] branch ::= piece*
5491 static int
5492 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5493 xmlRegStatePtr previous;
5494 int ret;
5496 previous = ctxt->state;
5497 ret = xmlFAParsePiece(ctxt);
5498 if (ret == 0) {
5499 /* Empty branch */
5500 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5501 } else {
5502 if (xmlFAGenerateTransitions(ctxt, previous,
5503 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5504 return(-1);
5505 previous = ctxt->state;
5506 ctxt->atom = NULL;
5508 while ((ret != 0) && (ctxt->error == 0)) {
5509 ret = xmlFAParsePiece(ctxt);
5510 if (ret != 0) {
5511 if (xmlFAGenerateTransitions(ctxt, previous,
5512 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5513 ctxt->atom) < 0)
5514 return(-1);
5515 previous = ctxt->state;
5516 ctxt->atom = NULL;
5519 return(0);
5523 * xmlFAParseRegExp:
5524 * @ctxt: a regexp parser context
5525 * @top: is this the top-level expression ?
5527 * [1] regExp ::= branch ( '|' branch )*
5529 static void
5530 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5531 xmlRegStatePtr start, end;
5533 /* if not top start should have been generated by an epsilon trans */
5534 start = ctxt->state;
5535 ctxt->end = NULL;
5536 xmlFAParseBranch(ctxt, NULL);
5537 if (top) {
5538 #ifdef DEBUG_REGEXP_GRAPH
5539 printf("State %d is final\n", ctxt->state->no);
5540 #endif
5541 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5543 if (CUR != '|') {
5544 ctxt->end = ctxt->state;
5545 return;
5547 end = ctxt->state;
5548 while ((CUR == '|') && (ctxt->error == 0)) {
5549 NEXT;
5550 ctxt->state = start;
5551 ctxt->end = NULL;
5552 xmlFAParseBranch(ctxt, end);
5554 if (!top) {
5555 ctxt->state = end;
5556 ctxt->end = end;
5560 /************************************************************************
5562 * The basic API *
5564 ************************************************************************/
5567 * xmlRegexpPrint:
5568 * @output: the file for the output debug
5569 * @regexp: the compiled regexp
5571 * Print the content of the compiled regular expression
5573 void
5574 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5575 int i;
5577 if (output == NULL)
5578 return;
5579 fprintf(output, " regexp: ");
5580 if (regexp == NULL) {
5581 fprintf(output, "NULL\n");
5582 return;
5584 fprintf(output, "'%s' ", regexp->string);
5585 fprintf(output, "\n");
5586 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5587 for (i = 0;i < regexp->nbAtoms; i++) {
5588 fprintf(output, " %02d ", i);
5589 xmlRegPrintAtom(output, regexp->atoms[i]);
5591 fprintf(output, "%d states:", regexp->nbStates);
5592 fprintf(output, "\n");
5593 for (i = 0;i < regexp->nbStates; i++) {
5594 xmlRegPrintState(output, regexp->states[i]);
5596 fprintf(output, "%d counters:\n", regexp->nbCounters);
5597 for (i = 0;i < regexp->nbCounters; i++) {
5598 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5599 regexp->counters[i].max);
5604 * xmlRegexpCompile:
5605 * @regexp: a regular expression string
5607 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5608 * Appendix F and builds an automata suitable for testing strings against
5609 * that regular expression
5611 * Returns the compiled expression or NULL in case of error
5613 xmlRegexpPtr
5614 xmlRegexpCompile(const xmlChar *regexp) {
5615 xmlRegexpPtr ret;
5616 xmlRegParserCtxtPtr ctxt;
5618 ctxt = xmlRegNewParserCtxt(regexp);
5619 if (ctxt == NULL)
5620 return(NULL);
5622 /* initialize the parser */
5623 ctxt->end = NULL;
5624 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5625 xmlRegStatePush(ctxt, ctxt->start);
5627 /* parse the expression building an automata */
5628 xmlFAParseRegExp(ctxt, 1);
5629 if (CUR != 0) {
5630 ERROR("xmlFAParseRegExp: extra characters");
5632 if (ctxt->error != 0) {
5633 xmlRegFreeParserCtxt(ctxt);
5634 return(NULL);
5636 ctxt->end = ctxt->state;
5637 ctxt->start->type = XML_REGEXP_START_STATE;
5638 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5640 /* remove the Epsilon except for counted transitions */
5641 xmlFAEliminateEpsilonTransitions(ctxt);
5644 if (ctxt->error != 0) {
5645 xmlRegFreeParserCtxt(ctxt);
5646 return(NULL);
5648 ret = xmlRegEpxFromParse(ctxt);
5649 xmlRegFreeParserCtxt(ctxt);
5650 return(ret);
5654 * xmlRegexpExec:
5655 * @comp: the compiled regular expression
5656 * @content: the value to check against the regular expression
5658 * Check if the regular expression generates the value
5660 * Returns 1 if it matches, 0 if not and a negative value in case of error
5663 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5664 if ((comp == NULL) || (content == NULL))
5665 return(-1);
5666 return(xmlFARegExec(comp, content));
5670 * xmlRegexpIsDeterminist:
5671 * @comp: the compiled regular expression
5673 * Check if the regular expression is determinist
5675 * Returns 1 if it yes, 0 if not and a negative value in case of error
5678 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5679 xmlAutomataPtr am;
5680 int ret;
5682 if (comp == NULL)
5683 return(-1);
5684 if (comp->determinist != -1)
5685 return(comp->determinist);
5687 am = xmlNewAutomata();
5688 if (am == NULL)
5689 return(-1);
5690 if (am->states != NULL) {
5691 int i;
5693 for (i = 0;i < am->nbStates;i++)
5694 xmlRegFreeState(am->states[i]);
5695 xmlFree(am->states);
5697 am->nbAtoms = comp->nbAtoms;
5698 am->atoms = comp->atoms;
5699 am->nbStates = comp->nbStates;
5700 am->states = comp->states;
5701 am->determinist = -1;
5702 am->flags = comp->flags;
5703 ret = xmlFAComputesDeterminism(am);
5704 am->atoms = NULL;
5705 am->states = NULL;
5706 xmlFreeAutomata(am);
5707 comp->determinist = ret;
5708 return(ret);
5712 * xmlRegFreeRegexp:
5713 * @regexp: the regexp
5715 * Free a regexp
5717 void
5718 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5719 int i;
5720 if (regexp == NULL)
5721 return;
5723 if (regexp->string != NULL)
5724 xmlFree(regexp->string);
5725 if (regexp->states != NULL) {
5726 for (i = 0;i < regexp->nbStates;i++)
5727 xmlRegFreeState(regexp->states[i]);
5728 xmlFree(regexp->states);
5730 if (regexp->atoms != NULL) {
5731 for (i = 0;i < regexp->nbAtoms;i++)
5732 xmlRegFreeAtom(regexp->atoms[i]);
5733 xmlFree(regexp->atoms);
5735 if (regexp->counters != NULL)
5736 xmlFree(regexp->counters);
5737 if (regexp->compact != NULL)
5738 xmlFree(regexp->compact);
5739 if (regexp->transdata != NULL)
5740 xmlFree(regexp->transdata);
5741 if (regexp->stringMap != NULL) {
5742 for (i = 0; i < regexp->nbstrings;i++)
5743 xmlFree(regexp->stringMap[i]);
5744 xmlFree(regexp->stringMap);
5747 xmlFree(regexp);
5750 #ifdef LIBXML_AUTOMATA_ENABLED
5751 /************************************************************************
5753 * The Automata interface *
5755 ************************************************************************/
5758 * xmlNewAutomata:
5760 * Create a new automata
5762 * Returns the new object or NULL in case of failure
5764 xmlAutomataPtr
5765 xmlNewAutomata(void) {
5766 xmlAutomataPtr ctxt;
5768 ctxt = xmlRegNewParserCtxt(NULL);
5769 if (ctxt == NULL)
5770 return(NULL);
5772 /* initialize the parser */
5773 ctxt->end = NULL;
5774 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5775 if (ctxt->start == NULL) {
5776 xmlFreeAutomata(ctxt);
5777 return(NULL);
5779 ctxt->start->type = XML_REGEXP_START_STATE;
5780 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5781 xmlRegFreeState(ctxt->start);
5782 xmlFreeAutomata(ctxt);
5783 return(NULL);
5785 ctxt->flags = 0;
5787 return(ctxt);
5791 * xmlFreeAutomata:
5792 * @am: an automata
5794 * Free an automata
5796 void
5797 xmlFreeAutomata(xmlAutomataPtr am) {
5798 if (am == NULL)
5799 return;
5800 xmlRegFreeParserCtxt(am);
5804 * xmlAutomataSetFlags:
5805 * @am: an automata
5806 * @flags: a set of internal flags
5808 * Set some flags on the automata
5810 void
5811 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5812 if (am == NULL)
5813 return;
5814 am->flags |= flags;
5818 * xmlAutomataGetInitState:
5819 * @am: an automata
5821 * Initial state lookup
5823 * Returns the initial state of the automata
5825 xmlAutomataStatePtr
5826 xmlAutomataGetInitState(xmlAutomataPtr am) {
5827 if (am == NULL)
5828 return(NULL);
5829 return(am->start);
5833 * xmlAutomataSetFinalState:
5834 * @am: an automata
5835 * @state: a state in this automata
5837 * Makes that state a final state
5839 * Returns 0 or -1 in case of error
5842 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5843 if ((am == NULL) || (state == NULL))
5844 return(-1);
5845 state->type = XML_REGEXP_FINAL_STATE;
5846 return(0);
5850 * xmlAutomataNewTransition:
5851 * @am: an automata
5852 * @from: the starting point of the transition
5853 * @to: the target point of the transition or NULL
5854 * @token: the input string associated to that transition
5855 * @data: data passed to the callback function if the transition is activated
5857 * If @to is NULL, this creates first a new target state in the automata
5858 * and then adds a transition from the @from state to the target state
5859 * activated by the value of @token
5861 * Returns the target state or NULL in case of error
5863 xmlAutomataStatePtr
5864 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5865 xmlAutomataStatePtr to, const xmlChar *token,
5866 void *data) {
5867 xmlRegAtomPtr atom;
5869 if ((am == NULL) || (from == NULL) || (token == NULL))
5870 return(NULL);
5871 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5872 if (atom == NULL)
5873 return(NULL);
5874 atom->data = data;
5875 atom->valuep = xmlStrdup(token);
5877 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5878 xmlRegFreeAtom(atom);
5879 return(NULL);
5881 if (to == NULL)
5882 return(am->state);
5883 return(to);
5887 * xmlAutomataNewTransition2:
5888 * @am: an automata
5889 * @from: the starting point of the transition
5890 * @to: the target point of the transition or NULL
5891 * @token: the first input string associated to that transition
5892 * @token2: the second input string associated to that transition
5893 * @data: data passed to the callback function if the transition is activated
5895 * If @to is NULL, this creates first a new target state in the automata
5896 * and then adds a transition from the @from state to the target state
5897 * activated by the value of @token
5899 * Returns the target state or NULL in case of error
5901 xmlAutomataStatePtr
5902 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5903 xmlAutomataStatePtr to, const xmlChar *token,
5904 const xmlChar *token2, void *data) {
5905 xmlRegAtomPtr atom;
5907 if ((am == NULL) || (from == NULL) || (token == NULL))
5908 return(NULL);
5909 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5910 if (atom == NULL)
5911 return(NULL);
5912 atom->data = data;
5913 if ((token2 == NULL) || (*token2 == 0)) {
5914 atom->valuep = xmlStrdup(token);
5915 } else {
5916 int lenn, lenp;
5917 xmlChar *str;
5919 lenn = strlen((char *) token2);
5920 lenp = strlen((char *) token);
5922 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5923 if (str == NULL) {
5924 xmlRegFreeAtom(atom);
5925 return(NULL);
5927 memcpy(&str[0], token, lenp);
5928 str[lenp] = '|';
5929 memcpy(&str[lenp + 1], token2, lenn);
5930 str[lenn + lenp + 1] = 0;
5932 atom->valuep = str;
5935 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5936 xmlRegFreeAtom(atom);
5937 return(NULL);
5939 if (to == NULL)
5940 return(am->state);
5941 return(to);
5945 * xmlAutomataNewNegTrans:
5946 * @am: an automata
5947 * @from: the starting point of the transition
5948 * @to: the target point of the transition or NULL
5949 * @token: the first input string associated to that transition
5950 * @token2: the second input string associated to that transition
5951 * @data: data passed to the callback function if the transition is activated
5953 * If @to is NULL, this creates first a new target state in the automata
5954 * and then adds a transition from the @from state to the target state
5955 * activated by any value except (@token,@token2)
5956 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5957 # the semantic of XSD ##other
5959 * Returns the target state or NULL in case of error
5961 xmlAutomataStatePtr
5962 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5963 xmlAutomataStatePtr to, const xmlChar *token,
5964 const xmlChar *token2, void *data) {
5965 xmlRegAtomPtr atom;
5966 xmlChar err_msg[200];
5968 if ((am == NULL) || (from == NULL) || (token == NULL))
5969 return(NULL);
5970 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5971 if (atom == NULL)
5972 return(NULL);
5973 atom->data = data;
5974 atom->neg = 1;
5975 if ((token2 == NULL) || (*token2 == 0)) {
5976 atom->valuep = xmlStrdup(token);
5977 } else {
5978 int lenn, lenp;
5979 xmlChar *str;
5981 lenn = strlen((char *) token2);
5982 lenp = strlen((char *) token);
5984 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5985 if (str == NULL) {
5986 xmlRegFreeAtom(atom);
5987 return(NULL);
5989 memcpy(&str[0], token, lenp);
5990 str[lenp] = '|';
5991 memcpy(&str[lenp + 1], token2, lenn);
5992 str[lenn + lenp + 1] = 0;
5994 atom->valuep = str;
5996 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5997 err_msg[199] = 0;
5998 atom->valuep2 = xmlStrdup(err_msg);
6000 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
6001 xmlRegFreeAtom(atom);
6002 return(NULL);
6004 am->negs++;
6005 if (to == NULL)
6006 return(am->state);
6007 return(to);
6011 * xmlAutomataNewCountTrans2:
6012 * @am: an automata
6013 * @from: the starting point of the transition
6014 * @to: the target point of the transition or NULL
6015 * @token: the input string associated to that transition
6016 * @token2: the second input string associated to that transition
6017 * @min: the minimum successive occurrences of token
6018 * @max: the maximum successive occurrences of token
6019 * @data: data associated to the transition
6021 * If @to is NULL, this creates first a new target state in the automata
6022 * and then adds a transition from the @from state to the target state
6023 * activated by a succession of input of value @token and @token2 and
6024 * whose number is between @min and @max
6026 * Returns the target state or NULL in case of error
6028 xmlAutomataStatePtr
6029 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6030 xmlAutomataStatePtr to, const xmlChar *token,
6031 const xmlChar *token2,
6032 int min, int max, void *data) {
6033 xmlRegAtomPtr atom;
6034 int counter;
6036 if ((am == NULL) || (from == NULL) || (token == NULL))
6037 return(NULL);
6038 if (min < 0)
6039 return(NULL);
6040 if ((max < min) || (max < 1))
6041 return(NULL);
6042 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6043 if (atom == NULL)
6044 return(NULL);
6045 if ((token2 == NULL) || (*token2 == 0)) {
6046 atom->valuep = xmlStrdup(token);
6047 } else {
6048 int lenn, lenp;
6049 xmlChar *str;
6051 lenn = strlen((char *) token2);
6052 lenp = strlen((char *) token);
6054 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6055 if (str == NULL) {
6056 xmlRegFreeAtom(atom);
6057 return(NULL);
6059 memcpy(&str[0], token, lenp);
6060 str[lenp] = '|';
6061 memcpy(&str[lenp + 1], token2, lenn);
6062 str[lenn + lenp + 1] = 0;
6064 atom->valuep = str;
6066 atom->data = data;
6067 if (min == 0)
6068 atom->min = 1;
6069 else
6070 atom->min = min;
6071 atom->max = max;
6074 * associate a counter to the transition.
6076 counter = xmlRegGetCounter(am);
6077 am->counters[counter].min = min;
6078 am->counters[counter].max = max;
6080 /* xmlFAGenerateTransitions(am, from, to, atom); */
6081 if (to == NULL) {
6082 to = xmlRegNewState(am);
6083 xmlRegStatePush(am, to);
6085 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6086 xmlRegAtomPush(am, atom);
6087 am->state = to;
6089 if (to == NULL)
6090 to = am->state;
6091 if (to == NULL)
6092 return(NULL);
6093 if (min == 0)
6094 xmlFAGenerateEpsilonTransition(am, from, to);
6095 return(to);
6099 * xmlAutomataNewCountTrans:
6100 * @am: an automata
6101 * @from: the starting point of the transition
6102 * @to: the target point of the transition or NULL
6103 * @token: the input string associated to that transition
6104 * @min: the minimum successive occurrences of token
6105 * @max: the maximum successive occurrences of token
6106 * @data: data associated to the transition
6108 * If @to is NULL, this creates first a new target state in the automata
6109 * and then adds a transition from the @from state to the target state
6110 * activated by a succession of input of value @token and whose number
6111 * is between @min and @max
6113 * Returns the target state or NULL in case of error
6115 xmlAutomataStatePtr
6116 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6117 xmlAutomataStatePtr to, const xmlChar *token,
6118 int min, int max, void *data) {
6119 xmlRegAtomPtr atom;
6120 int counter;
6122 if ((am == NULL) || (from == NULL) || (token == NULL))
6123 return(NULL);
6124 if (min < 0)
6125 return(NULL);
6126 if ((max < min) || (max < 1))
6127 return(NULL);
6128 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6129 if (atom == NULL)
6130 return(NULL);
6131 atom->valuep = xmlStrdup(token);
6132 atom->data = data;
6133 if (min == 0)
6134 atom->min = 1;
6135 else
6136 atom->min = min;
6137 atom->max = max;
6140 * associate a counter to the transition.
6142 counter = xmlRegGetCounter(am);
6143 am->counters[counter].min = min;
6144 am->counters[counter].max = max;
6146 /* xmlFAGenerateTransitions(am, from, to, atom); */
6147 if (to == NULL) {
6148 to = xmlRegNewState(am);
6149 xmlRegStatePush(am, to);
6151 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6152 xmlRegAtomPush(am, atom);
6153 am->state = to;
6155 if (to == NULL)
6156 to = am->state;
6157 if (to == NULL)
6158 return(NULL);
6159 if (min == 0)
6160 xmlFAGenerateEpsilonTransition(am, from, to);
6161 return(to);
6165 * xmlAutomataNewOnceTrans2:
6166 * @am: an automata
6167 * @from: the starting point of the transition
6168 * @to: the target point of the transition or NULL
6169 * @token: the input string associated to that transition
6170 * @token2: the second input string associated to that transition
6171 * @min: the minimum successive occurrences of token
6172 * @max: the maximum successive occurrences of token
6173 * @data: data associated to the transition
6175 * If @to is NULL, this creates first a new target state in the automata
6176 * and then adds a transition from the @from state to the target state
6177 * activated by a succession of input of value @token and @token2 and whose
6178 * number is between @min and @max, moreover that transition can only be
6179 * crossed once.
6181 * Returns the target state or NULL in case of error
6183 xmlAutomataStatePtr
6184 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6185 xmlAutomataStatePtr to, const xmlChar *token,
6186 const xmlChar *token2,
6187 int min, int max, void *data) {
6188 xmlRegAtomPtr atom;
6189 int counter;
6191 if ((am == NULL) || (from == NULL) || (token == NULL))
6192 return(NULL);
6193 if (min < 1)
6194 return(NULL);
6195 if (max < min)
6196 return(NULL);
6197 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6198 if (atom == NULL)
6199 return(NULL);
6200 if ((token2 == NULL) || (*token2 == 0)) {
6201 atom->valuep = xmlStrdup(token);
6202 } else {
6203 int lenn, lenp;
6204 xmlChar *str;
6206 lenn = strlen((char *) token2);
6207 lenp = strlen((char *) token);
6209 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6210 if (str == NULL) {
6211 xmlRegFreeAtom(atom);
6212 return(NULL);
6214 memcpy(&str[0], token, lenp);
6215 str[lenp] = '|';
6216 memcpy(&str[lenp + 1], token2, lenn);
6217 str[lenn + lenp + 1] = 0;
6219 atom->valuep = str;
6221 atom->data = data;
6222 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6223 atom->min = min;
6224 atom->max = max;
6226 * associate a counter to the transition.
6228 counter = xmlRegGetCounter(am);
6229 am->counters[counter].min = 1;
6230 am->counters[counter].max = 1;
6232 /* xmlFAGenerateTransitions(am, from, to, atom); */
6233 if (to == NULL) {
6234 to = xmlRegNewState(am);
6235 xmlRegStatePush(am, to);
6237 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6238 xmlRegAtomPush(am, atom);
6239 am->state = to;
6240 return(to);
6246 * xmlAutomataNewOnceTrans:
6247 * @am: an automata
6248 * @from: the starting point of the transition
6249 * @to: the target point of the transition or NULL
6250 * @token: the input string associated to that transition
6251 * @min: the minimum successive occurrences of token
6252 * @max: the maximum successive occurrences of token
6253 * @data: data associated to the transition
6255 * If @to is NULL, this creates first a new target state in the automata
6256 * and then adds a transition from the @from state to the target state
6257 * activated by a succession of input of value @token and whose number
6258 * is between @min and @max, moreover that transition can only be crossed
6259 * once.
6261 * Returns the target state or NULL in case of error
6263 xmlAutomataStatePtr
6264 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6265 xmlAutomataStatePtr to, const xmlChar *token,
6266 int min, int max, void *data) {
6267 xmlRegAtomPtr atom;
6268 int counter;
6270 if ((am == NULL) || (from == NULL) || (token == NULL))
6271 return(NULL);
6272 if (min < 1)
6273 return(NULL);
6274 if (max < min)
6275 return(NULL);
6276 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6277 if (atom == NULL)
6278 return(NULL);
6279 atom->valuep = xmlStrdup(token);
6280 atom->data = data;
6281 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6282 atom->min = min;
6283 atom->max = max;
6285 * associate a counter to the transition.
6287 counter = xmlRegGetCounter(am);
6288 am->counters[counter].min = 1;
6289 am->counters[counter].max = 1;
6291 /* xmlFAGenerateTransitions(am, from, to, atom); */
6292 if (to == NULL) {
6293 to = xmlRegNewState(am);
6294 xmlRegStatePush(am, to);
6296 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6297 xmlRegAtomPush(am, atom);
6298 am->state = to;
6299 return(to);
6303 * xmlAutomataNewState:
6304 * @am: an automata
6306 * Create a new disconnected state in the automata
6308 * Returns the new state or NULL in case of error
6310 xmlAutomataStatePtr
6311 xmlAutomataNewState(xmlAutomataPtr am) {
6312 xmlAutomataStatePtr to;
6314 if (am == NULL)
6315 return(NULL);
6316 to = xmlRegNewState(am);
6317 xmlRegStatePush(am, to);
6318 return(to);
6322 * xmlAutomataNewEpsilon:
6323 * @am: an automata
6324 * @from: the starting point of the transition
6325 * @to: the target point of the transition or NULL
6327 * If @to is NULL, this creates first a new target state in the automata
6328 * and then adds an epsilon transition from the @from state to the
6329 * target state
6331 * Returns the target state or NULL in case of error
6333 xmlAutomataStatePtr
6334 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6335 xmlAutomataStatePtr to) {
6336 if ((am == NULL) || (from == NULL))
6337 return(NULL);
6338 xmlFAGenerateEpsilonTransition(am, from, to);
6339 if (to == NULL)
6340 return(am->state);
6341 return(to);
6345 * xmlAutomataNewAllTrans:
6346 * @am: an automata
6347 * @from: the starting point of the transition
6348 * @to: the target point of the transition or NULL
6349 * @lax: allow to transition if not all all transitions have been activated
6351 * If @to is NULL, this creates first a new target state in the automata
6352 * and then adds a an ALL transition from the @from state to the
6353 * target state. That transition is an epsilon transition allowed only when
6354 * all transitions from the @from node have been activated.
6356 * Returns the target state or NULL in case of error
6358 xmlAutomataStatePtr
6359 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6360 xmlAutomataStatePtr to, int lax) {
6361 if ((am == NULL) || (from == NULL))
6362 return(NULL);
6363 xmlFAGenerateAllTransition(am, from, to, lax);
6364 if (to == NULL)
6365 return(am->state);
6366 return(to);
6370 * xmlAutomataNewCounter:
6371 * @am: an automata
6372 * @min: the minimal value on the counter
6373 * @max: the maximal value on the counter
6375 * Create a new counter
6377 * Returns the counter number or -1 in case of error
6380 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6381 int ret;
6383 if (am == NULL)
6384 return(-1);
6386 ret = xmlRegGetCounter(am);
6387 if (ret < 0)
6388 return(-1);
6389 am->counters[ret].min = min;
6390 am->counters[ret].max = max;
6391 return(ret);
6395 * xmlAutomataNewCountedTrans:
6396 * @am: an automata
6397 * @from: the starting point of the transition
6398 * @to: the target point of the transition or NULL
6399 * @counter: the counter associated to that transition
6401 * If @to is NULL, this creates first a new target state in the automata
6402 * and then adds an epsilon transition from the @from state to the target state
6403 * which will increment the counter provided
6405 * Returns the target state or NULL in case of error
6407 xmlAutomataStatePtr
6408 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6409 xmlAutomataStatePtr to, int counter) {
6410 if ((am == NULL) || (from == NULL) || (counter < 0))
6411 return(NULL);
6412 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6413 if (to == NULL)
6414 return(am->state);
6415 return(to);
6419 * xmlAutomataNewCounterTrans:
6420 * @am: an automata
6421 * @from: the starting point of the transition
6422 * @to: the target point of the transition or NULL
6423 * @counter: the counter associated to that transition
6425 * If @to is NULL, this creates first a new target state in the automata
6426 * and then adds an epsilon transition from the @from state to the target state
6427 * which will be allowed only if the counter is within the right range.
6429 * Returns the target state or NULL in case of error
6431 xmlAutomataStatePtr
6432 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6433 xmlAutomataStatePtr to, int counter) {
6434 if ((am == NULL) || (from == NULL) || (counter < 0))
6435 return(NULL);
6436 xmlFAGenerateCountedTransition(am, from, to, counter);
6437 if (to == NULL)
6438 return(am->state);
6439 return(to);
6443 * xmlAutomataCompile:
6444 * @am: an automata
6446 * Compile the automata into a Reg Exp ready for being executed.
6447 * The automata should be free after this point.
6449 * Returns the compiled regexp or NULL in case of error
6451 xmlRegexpPtr
6452 xmlAutomataCompile(xmlAutomataPtr am) {
6453 xmlRegexpPtr ret;
6455 if ((am == NULL) || (am->error != 0)) return(NULL);
6456 xmlFAEliminateEpsilonTransitions(am);
6457 /* xmlFAComputesDeterminism(am); */
6458 ret = xmlRegEpxFromParse(am);
6460 return(ret);
6464 * xmlAutomataIsDeterminist:
6465 * @am: an automata
6467 * Checks if an automata is determinist.
6469 * Returns 1 if true, 0 if not, and -1 in case of error
6472 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6473 int ret;
6475 if (am == NULL)
6476 return(-1);
6478 ret = xmlFAComputesDeterminism(am);
6479 return(ret);
6481 #endif /* LIBXML_AUTOMATA_ENABLED */
6483 #ifdef LIBXML_EXPR_ENABLED
6484 /************************************************************************
6486 * Formal Expression handling code *
6488 ************************************************************************/
6489 /************************************************************************
6491 * Expression handling context *
6493 ************************************************************************/
6495 struct _xmlExpCtxt {
6496 xmlDictPtr dict;
6497 xmlExpNodePtr *table;
6498 int size;
6499 int nbElems;
6500 int nb_nodes;
6501 int maxNodes;
6502 const char *expr;
6503 const char *cur;
6504 int nb_cons;
6505 int tabSize;
6509 * xmlExpNewCtxt:
6510 * @maxNodes: the maximum number of nodes
6511 * @dict: optional dictionary to use internally
6513 * Creates a new context for manipulating expressions
6515 * Returns the context or NULL in case of error
6517 xmlExpCtxtPtr
6518 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6519 xmlExpCtxtPtr ret;
6520 int size = 256;
6522 if (maxNodes <= 4096)
6523 maxNodes = 4096;
6525 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6526 if (ret == NULL)
6527 return(NULL);
6528 memset(ret, 0, sizeof(xmlExpCtxt));
6529 ret->size = size;
6530 ret->nbElems = 0;
6531 ret->maxNodes = maxNodes;
6532 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6533 if (ret->table == NULL) {
6534 xmlFree(ret);
6535 return(NULL);
6537 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6538 if (dict == NULL) {
6539 ret->dict = xmlDictCreate();
6540 if (ret->dict == NULL) {
6541 xmlFree(ret->table);
6542 xmlFree(ret);
6543 return(NULL);
6545 } else {
6546 ret->dict = dict;
6547 xmlDictReference(ret->dict);
6549 return(ret);
6553 * xmlExpFreeCtxt:
6554 * @ctxt: an expression context
6556 * Free an expression context
6558 void
6559 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6560 if (ctxt == NULL)
6561 return;
6562 xmlDictFree(ctxt->dict);
6563 if (ctxt->table != NULL)
6564 xmlFree(ctxt->table);
6565 xmlFree(ctxt);
6568 /************************************************************************
6570 * Structure associated to an expression node *
6572 ************************************************************************/
6573 #define MAX_NODES 10000
6575 /* #define DEBUG_DERIV */
6578 * TODO:
6579 * - Wildcards
6580 * - public API for creation
6582 * Started
6583 * - regression testing
6585 * Done
6586 * - split into module and test tool
6587 * - memleaks
6590 typedef enum {
6591 XML_EXP_NILABLE = (1 << 0)
6592 } xmlExpNodeInfo;
6594 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6596 struct _xmlExpNode {
6597 unsigned char type;/* xmlExpNodeType */
6598 unsigned char info;/* OR of xmlExpNodeInfo */
6599 unsigned short key; /* the hash key */
6600 unsigned int ref; /* The number of references */
6601 int c_max; /* the maximum length it can consume */
6602 xmlExpNodePtr exp_left;
6603 xmlExpNodePtr next;/* the next node in the hash table or free list */
6604 union {
6605 struct {
6606 int f_min;
6607 int f_max;
6608 } count;
6609 struct {
6610 xmlExpNodePtr f_right;
6611 } children;
6612 const xmlChar *f_str;
6613 } field;
6616 #define exp_min field.count.f_min
6617 #define exp_max field.count.f_max
6618 /* #define exp_left field.children.f_left */
6619 #define exp_right field.children.f_right
6620 #define exp_str field.f_str
6622 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6623 static xmlExpNode forbiddenExpNode = {
6624 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6626 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6627 static xmlExpNode emptyExpNode = {
6628 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6630 xmlExpNodePtr emptyExp = &emptyExpNode;
6632 /************************************************************************
6634 * The custom hash table for unicity and canonicalization *
6635 * of sub-expressions pointers *
6637 ************************************************************************/
6639 * xmlExpHashNameComputeKey:
6640 * Calculate the hash key for a token
6642 static unsigned short
6643 xmlExpHashNameComputeKey(const xmlChar *name) {
6644 unsigned short value = 0L;
6645 char ch;
6647 if (name != NULL) {
6648 value += 30 * (*name);
6649 while ((ch = *name++) != 0) {
6650 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6653 return (value);
6657 * xmlExpHashComputeKey:
6658 * Calculate the hash key for a compound expression
6660 static unsigned short
6661 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6662 xmlExpNodePtr right) {
6663 unsigned long value;
6664 unsigned short ret;
6666 switch (type) {
6667 case XML_EXP_SEQ:
6668 value = left->key;
6669 value += right->key;
6670 value *= 3;
6671 ret = (unsigned short) value;
6672 break;
6673 case XML_EXP_OR:
6674 value = left->key;
6675 value += right->key;
6676 value *= 7;
6677 ret = (unsigned short) value;
6678 break;
6679 case XML_EXP_COUNT:
6680 value = left->key;
6681 value += right->key;
6682 ret = (unsigned short) value;
6683 break;
6684 default:
6685 ret = 0;
6687 return(ret);
6691 static xmlExpNodePtr
6692 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6693 xmlExpNodePtr ret;
6695 if (ctxt->nb_nodes >= MAX_NODES)
6696 return(NULL);
6697 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6698 if (ret == NULL)
6699 return(NULL);
6700 memset(ret, 0, sizeof(xmlExpNode));
6701 ret->type = type;
6702 ret->next = NULL;
6703 ctxt->nb_nodes++;
6704 ctxt->nb_cons++;
6705 return(ret);
6709 * xmlExpHashGetEntry:
6710 * @table: the hash table
6712 * Get the unique entry from the hash table. The entry is created if
6713 * needed. @left and @right are consumed, i.e. their ref count will
6714 * be decremented by the operation.
6716 * Returns the pointer or NULL in case of error
6718 static xmlExpNodePtr
6719 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6720 xmlExpNodePtr left, xmlExpNodePtr right,
6721 const xmlChar *name, int min, int max) {
6722 unsigned short kbase, key;
6723 xmlExpNodePtr entry;
6724 xmlExpNodePtr insert;
6726 if (ctxt == NULL)
6727 return(NULL);
6730 * Check for duplicate and insertion location.
6732 if (type == XML_EXP_ATOM) {
6733 kbase = xmlExpHashNameComputeKey(name);
6734 } else if (type == XML_EXP_COUNT) {
6735 /* COUNT reduction rule 1 */
6736 /* a{1} -> a */
6737 if (min == max) {
6738 if (min == 1) {
6739 return(left);
6741 if (min == 0) {
6742 xmlExpFree(ctxt, left);
6743 return(emptyExp);
6746 if (min < 0) {
6747 xmlExpFree(ctxt, left);
6748 return(forbiddenExp);
6750 if (max == -1)
6751 kbase = min + 79;
6752 else
6753 kbase = max - min;
6754 kbase += left->key;
6755 } else if (type == XML_EXP_OR) {
6756 /* Forbid reduction rules */
6757 if (left->type == XML_EXP_FORBID) {
6758 xmlExpFree(ctxt, left);
6759 return(right);
6761 if (right->type == XML_EXP_FORBID) {
6762 xmlExpFree(ctxt, right);
6763 return(left);
6766 /* OR reduction rule 1 */
6767 /* a | a reduced to a */
6768 if (left == right) {
6769 xmlExpFree(ctxt, right);
6770 return(left);
6772 /* OR canonicalization rule 1 */
6773 /* linearize (a | b) | c into a | (b | c) */
6774 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6775 xmlExpNodePtr tmp = left;
6776 left = right;
6777 right = tmp;
6779 /* OR reduction rule 2 */
6780 /* a | (a | b) and b | (a | b) are reduced to a | b */
6781 if (right->type == XML_EXP_OR) {
6782 if ((left == right->exp_left) ||
6783 (left == right->exp_right)) {
6784 xmlExpFree(ctxt, left);
6785 return(right);
6788 /* OR canonicalization rule 2 */
6789 /* linearize (a | b) | c into a | (b | c) */
6790 if (left->type == XML_EXP_OR) {
6791 xmlExpNodePtr tmp;
6793 /* OR canonicalization rule 2 */
6794 if ((left->exp_right->type != XML_EXP_OR) &&
6795 (left->exp_right->key < left->exp_left->key)) {
6796 tmp = left->exp_right;
6797 left->exp_right = left->exp_left;
6798 left->exp_left = tmp;
6800 left->exp_right->ref++;
6801 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6802 NULL, 0, 0);
6803 left->exp_left->ref++;
6804 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6805 NULL, 0, 0);
6807 xmlExpFree(ctxt, left);
6808 return(tmp);
6810 if (right->type == XML_EXP_OR) {
6811 /* Ordering in the tree */
6812 /* C | (A | B) -> A | (B | C) */
6813 if (left->key > right->exp_right->key) {
6814 xmlExpNodePtr tmp;
6815 right->exp_right->ref++;
6816 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6817 left, NULL, 0, 0);
6818 right->exp_left->ref++;
6819 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6820 tmp, NULL, 0, 0);
6821 xmlExpFree(ctxt, right);
6822 return(tmp);
6824 /* Ordering in the tree */
6825 /* B | (A | C) -> A | (B | C) */
6826 if (left->key > right->exp_left->key) {
6827 xmlExpNodePtr tmp;
6828 right->exp_right->ref++;
6829 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6830 right->exp_right, NULL, 0, 0);
6831 right->exp_left->ref++;
6832 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6833 tmp, NULL, 0, 0);
6834 xmlExpFree(ctxt, right);
6835 return(tmp);
6838 /* we know both types are != XML_EXP_OR here */
6839 else if (left->key > right->key) {
6840 xmlExpNodePtr tmp = left;
6841 left = right;
6842 right = tmp;
6844 kbase = xmlExpHashComputeKey(type, left, right);
6845 } else if (type == XML_EXP_SEQ) {
6846 /* Forbid reduction rules */
6847 if (left->type == XML_EXP_FORBID) {
6848 xmlExpFree(ctxt, right);
6849 return(left);
6851 if (right->type == XML_EXP_FORBID) {
6852 xmlExpFree(ctxt, left);
6853 return(right);
6855 /* Empty reduction rules */
6856 if (right->type == XML_EXP_EMPTY) {
6857 return(left);
6859 if (left->type == XML_EXP_EMPTY) {
6860 return(right);
6862 kbase = xmlExpHashComputeKey(type, left, right);
6863 } else
6864 return(NULL);
6866 key = kbase % ctxt->size;
6867 if (ctxt->table[key] != NULL) {
6868 for (insert = ctxt->table[key]; insert != NULL;
6869 insert = insert->next) {
6870 if ((insert->key == kbase) &&
6871 (insert->type == type)) {
6872 if (type == XML_EXP_ATOM) {
6873 if (name == insert->exp_str) {
6874 insert->ref++;
6875 return(insert);
6877 } else if (type == XML_EXP_COUNT) {
6878 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6879 (insert->exp_left == left)) {
6880 insert->ref++;
6881 left->ref--;
6882 return(insert);
6884 } else if ((insert->exp_left == left) &&
6885 (insert->exp_right == right)) {
6886 insert->ref++;
6887 left->ref--;
6888 right->ref--;
6889 return(insert);
6895 entry = xmlExpNewNode(ctxt, type);
6896 if (entry == NULL)
6897 return(NULL);
6898 entry->key = kbase;
6899 if (type == XML_EXP_ATOM) {
6900 entry->exp_str = name;
6901 entry->c_max = 1;
6902 } else if (type == XML_EXP_COUNT) {
6903 entry->exp_min = min;
6904 entry->exp_max = max;
6905 entry->exp_left = left;
6906 if ((min == 0) || (IS_NILLABLE(left)))
6907 entry->info |= XML_EXP_NILABLE;
6908 if (max < 0)
6909 entry->c_max = -1;
6910 else
6911 entry->c_max = max * entry->exp_left->c_max;
6912 } else {
6913 entry->exp_left = left;
6914 entry->exp_right = right;
6915 if (type == XML_EXP_OR) {
6916 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6917 entry->info |= XML_EXP_NILABLE;
6918 if ((entry->exp_left->c_max == -1) ||
6919 (entry->exp_right->c_max == -1))
6920 entry->c_max = -1;
6921 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6922 entry->c_max = entry->exp_left->c_max;
6923 else
6924 entry->c_max = entry->exp_right->c_max;
6925 } else {
6926 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6927 entry->info |= XML_EXP_NILABLE;
6928 if ((entry->exp_left->c_max == -1) ||
6929 (entry->exp_right->c_max == -1))
6930 entry->c_max = -1;
6931 else
6932 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6935 entry->ref = 1;
6936 if (ctxt->table[key] != NULL)
6937 entry->next = ctxt->table[key];
6939 ctxt->table[key] = entry;
6940 ctxt->nbElems++;
6942 return(entry);
6946 * xmlExpFree:
6947 * @ctxt: the expression context
6948 * @exp: the expression
6950 * Dereference the expression
6952 void
6953 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6954 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6955 return;
6956 exp->ref--;
6957 if (exp->ref == 0) {
6958 unsigned short key;
6960 /* Unlink it first from the hash table */
6961 key = exp->key % ctxt->size;
6962 if (ctxt->table[key] == exp) {
6963 ctxt->table[key] = exp->next;
6964 } else {
6965 xmlExpNodePtr tmp;
6967 tmp = ctxt->table[key];
6968 while (tmp != NULL) {
6969 if (tmp->next == exp) {
6970 tmp->next = exp->next;
6971 break;
6973 tmp = tmp->next;
6977 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6978 xmlExpFree(ctxt, exp->exp_left);
6979 xmlExpFree(ctxt, exp->exp_right);
6980 } else if (exp->type == XML_EXP_COUNT) {
6981 xmlExpFree(ctxt, exp->exp_left);
6983 xmlFree(exp);
6984 ctxt->nb_nodes--;
6989 * xmlExpRef:
6990 * @exp: the expression
6992 * Increase the reference count of the expression
6994 void
6995 xmlExpRef(xmlExpNodePtr exp) {
6996 if (exp != NULL)
6997 exp->ref++;
7001 * xmlExpNewAtom:
7002 * @ctxt: the expression context
7003 * @name: the atom name
7004 * @len: the atom name length in byte (or -1);
7006 * Get the atom associated to this name from that context
7008 * Returns the node or NULL in case of error
7010 xmlExpNodePtr
7011 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
7012 if ((ctxt == NULL) || (name == NULL))
7013 return(NULL);
7014 name = xmlDictLookup(ctxt->dict, name, len);
7015 if (name == NULL)
7016 return(NULL);
7017 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
7021 * xmlExpNewOr:
7022 * @ctxt: the expression context
7023 * @left: left expression
7024 * @right: right expression
7026 * Get the atom associated to the choice @left | @right
7027 * Note that @left and @right are consumed in the operation, to keep
7028 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7029 * this is true even in case of failure (unless ctxt == NULL).
7031 * Returns the node or NULL in case of error
7033 xmlExpNodePtr
7034 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7035 if (ctxt == NULL)
7036 return(NULL);
7037 if ((left == NULL) || (right == NULL)) {
7038 xmlExpFree(ctxt, left);
7039 xmlExpFree(ctxt, right);
7040 return(NULL);
7042 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
7046 * xmlExpNewSeq:
7047 * @ctxt: the expression context
7048 * @left: left expression
7049 * @right: right expression
7051 * Get the atom associated to the sequence @left , @right
7052 * Note that @left and @right are consumed in the operation, to keep
7053 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7054 * this is true even in case of failure (unless ctxt == NULL).
7056 * Returns the node or NULL in case of error
7058 xmlExpNodePtr
7059 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7060 if (ctxt == NULL)
7061 return(NULL);
7062 if ((left == NULL) || (right == NULL)) {
7063 xmlExpFree(ctxt, left);
7064 xmlExpFree(ctxt, right);
7065 return(NULL);
7067 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7071 * xmlExpNewRange:
7072 * @ctxt: the expression context
7073 * @subset: the expression to be repeated
7074 * @min: the lower bound for the repetition
7075 * @max: the upper bound for the repetition, -1 means infinite
7077 * Get the atom associated to the range (@subset){@min, @max}
7078 * Note that @subset is consumed in the operation, to keep
7079 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7080 * this is true even in case of failure (unless ctxt == NULL).
7082 * Returns the node or NULL in case of error
7084 xmlExpNodePtr
7085 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
7086 if (ctxt == NULL)
7087 return(NULL);
7088 if ((subset == NULL) || (min < 0) || (max < -1) ||
7089 ((max >= 0) && (min > max))) {
7090 xmlExpFree(ctxt, subset);
7091 return(NULL);
7093 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7094 NULL, NULL, min, max));
7097 /************************************************************************
7099 * Public API for operations on expressions *
7101 ************************************************************************/
7103 static int
7104 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7105 const xmlChar**list, int len, int nb) {
7106 int tmp, tmp2;
7107 tail:
7108 switch (exp->type) {
7109 case XML_EXP_EMPTY:
7110 return(0);
7111 case XML_EXP_ATOM:
7112 for (tmp = 0;tmp < nb;tmp++)
7113 if (list[tmp] == exp->exp_str)
7114 return(0);
7115 if (nb >= len)
7116 return(-2);
7117 list[nb] = exp->exp_str;
7118 return(1);
7119 case XML_EXP_COUNT:
7120 exp = exp->exp_left;
7121 goto tail;
7122 case XML_EXP_SEQ:
7123 case XML_EXP_OR:
7124 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7125 if (tmp < 0)
7126 return(tmp);
7127 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7128 nb + tmp);
7129 if (tmp2 < 0)
7130 return(tmp2);
7131 return(tmp + tmp2);
7133 return(-1);
7137 * xmlExpGetLanguage:
7138 * @ctxt: the expression context
7139 * @exp: the expression
7140 * @langList: where to store the tokens
7141 * @len: the allocated length of @list
7143 * Find all the strings used in @exp and store them in @list
7145 * Returns the number of unique strings found, -1 in case of errors and
7146 * -2 if there is more than @len strings
7149 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7150 const xmlChar**langList, int len) {
7151 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7152 return(-1);
7153 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7156 static int
7157 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7158 const xmlChar**list, int len, int nb) {
7159 int tmp, tmp2;
7160 tail:
7161 switch (exp->type) {
7162 case XML_EXP_FORBID:
7163 return(0);
7164 case XML_EXP_EMPTY:
7165 return(0);
7166 case XML_EXP_ATOM:
7167 for (tmp = 0;tmp < nb;tmp++)
7168 if (list[tmp] == exp->exp_str)
7169 return(0);
7170 if (nb >= len)
7171 return(-2);
7172 list[nb] = exp->exp_str;
7173 return(1);
7174 case XML_EXP_COUNT:
7175 exp = exp->exp_left;
7176 goto tail;
7177 case XML_EXP_SEQ:
7178 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7179 if (tmp < 0)
7180 return(tmp);
7181 if (IS_NILLABLE(exp->exp_left)) {
7182 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7183 nb + tmp);
7184 if (tmp2 < 0)
7185 return(tmp2);
7186 tmp += tmp2;
7188 return(tmp);
7189 case XML_EXP_OR:
7190 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7191 if (tmp < 0)
7192 return(tmp);
7193 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7194 nb + tmp);
7195 if (tmp2 < 0)
7196 return(tmp2);
7197 return(tmp + tmp2);
7199 return(-1);
7203 * xmlExpGetStart:
7204 * @ctxt: the expression context
7205 * @exp: the expression
7206 * @tokList: where to store the tokens
7207 * @len: the allocated length of @list
7209 * Find all the strings that appears at the start of the languages
7210 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7211 * it will return the list [a, c]
7213 * Returns the number of unique strings found, -1 in case of errors and
7214 * -2 if there is more than @len strings
7217 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7218 const xmlChar**tokList, int len) {
7219 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7220 return(-1);
7221 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7225 * xmlExpIsNillable:
7226 * @exp: the expression
7228 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7230 * Returns 1 if nillable, 0 if not and -1 in case of error
7233 xmlExpIsNillable(xmlExpNodePtr exp) {
7234 if (exp == NULL)
7235 return(-1);
7236 return(IS_NILLABLE(exp) != 0);
7239 static xmlExpNodePtr
7240 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7242 xmlExpNodePtr ret;
7244 switch (exp->type) {
7245 case XML_EXP_EMPTY:
7246 return(forbiddenExp);
7247 case XML_EXP_FORBID:
7248 return(forbiddenExp);
7249 case XML_EXP_ATOM:
7250 if (exp->exp_str == str) {
7251 #ifdef DEBUG_DERIV
7252 printf("deriv atom: equal => Empty\n");
7253 #endif
7254 ret = emptyExp;
7255 } else {
7256 #ifdef DEBUG_DERIV
7257 printf("deriv atom: mismatch => forbid\n");
7258 #endif
7259 /* TODO wildcards here */
7260 ret = forbiddenExp;
7262 return(ret);
7263 case XML_EXP_OR: {
7264 xmlExpNodePtr tmp;
7266 #ifdef DEBUG_DERIV
7267 printf("deriv or: => or(derivs)\n");
7268 #endif
7269 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7270 if (tmp == NULL) {
7271 return(NULL);
7273 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7274 if (ret == NULL) {
7275 xmlExpFree(ctxt, tmp);
7276 return(NULL);
7278 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7279 NULL, 0, 0);
7280 return(ret);
7282 case XML_EXP_SEQ:
7283 #ifdef DEBUG_DERIV
7284 printf("deriv seq: starting with left\n");
7285 #endif
7286 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7287 if (ret == NULL) {
7288 return(NULL);
7289 } else if (ret == forbiddenExp) {
7290 if (IS_NILLABLE(exp->exp_left)) {
7291 #ifdef DEBUG_DERIV
7292 printf("deriv seq: left failed but nillable\n");
7293 #endif
7294 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7296 } else {
7297 #ifdef DEBUG_DERIV
7298 printf("deriv seq: left match => sequence\n");
7299 #endif
7300 exp->exp_right->ref++;
7301 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7302 NULL, 0, 0);
7304 return(ret);
7305 case XML_EXP_COUNT: {
7306 int min, max;
7307 xmlExpNodePtr tmp;
7309 if (exp->exp_max == 0)
7310 return(forbiddenExp);
7311 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7312 if (ret == NULL)
7313 return(NULL);
7314 if (ret == forbiddenExp) {
7315 #ifdef DEBUG_DERIV
7316 printf("deriv count: pattern mismatch => forbid\n");
7317 #endif
7318 return(ret);
7320 if (exp->exp_max == 1)
7321 return(ret);
7322 if (exp->exp_max < 0) /* unbounded */
7323 max = -1;
7324 else
7325 max = exp->exp_max - 1;
7326 if (exp->exp_min > 0)
7327 min = exp->exp_min - 1;
7328 else
7329 min = 0;
7330 exp->exp_left->ref++;
7331 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7332 NULL, min, max);
7333 if (ret == emptyExp) {
7334 #ifdef DEBUG_DERIV
7335 printf("deriv count: match to empty => new count\n");
7336 #endif
7337 return(tmp);
7339 #ifdef DEBUG_DERIV
7340 printf("deriv count: match => sequence with new count\n");
7341 #endif
7342 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7343 NULL, 0, 0));
7346 return(NULL);
7350 * xmlExpStringDerive:
7351 * @ctxt: the expression context
7352 * @exp: the expression
7353 * @str: the string
7354 * @len: the string len in bytes if available
7356 * Do one step of Brzozowski derivation of the expression @exp with
7357 * respect to the input string
7359 * Returns the resulting expression or NULL in case of internal error
7361 xmlExpNodePtr
7362 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7363 const xmlChar *str, int len) {
7364 const xmlChar *input;
7366 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7367 return(NULL);
7370 * check the string is in the dictionary, if yes use an interned
7371 * copy, otherwise we know it's not an acceptable input
7373 input = xmlDictExists(ctxt->dict, str, len);
7374 if (input == NULL) {
7375 return(forbiddenExp);
7377 return(xmlExpStringDeriveInt(ctxt, exp, input));
7380 static int
7381 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7382 int ret = 1;
7384 if (sub->c_max == -1) {
7385 if (exp->c_max != -1)
7386 ret = 0;
7387 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7388 ret = 0;
7390 #if 0
7391 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7392 ret = 0;
7393 #endif
7394 return(ret);
7397 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7398 xmlExpNodePtr sub);
7400 * xmlExpDivide:
7401 * @ctxt: the expressions context
7402 * @exp: the englobing expression
7403 * @sub: the subexpression
7404 * @mult: the multiple expression
7405 * @remain: the remain from the derivation of the multiple
7407 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7408 * so that sub{n} subsume exp
7410 * Returns the multiple value if successful, 0 if it is not a multiple
7411 * and -1 in case of internal error.
7414 static int
7415 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7416 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7417 int i;
7418 xmlExpNodePtr tmp, tmp2;
7420 if (mult != NULL) *mult = NULL;
7421 if (remain != NULL) *remain = NULL;
7422 if (exp->c_max == -1) return(0);
7423 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7425 for (i = 1;i <= exp->c_max;i++) {
7426 sub->ref++;
7427 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7428 sub, NULL, NULL, i, i);
7429 if (tmp == NULL) {
7430 return(-1);
7432 if (!xmlExpCheckCard(tmp, exp)) {
7433 xmlExpFree(ctxt, tmp);
7434 continue;
7436 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7437 if (tmp2 == NULL) {
7438 xmlExpFree(ctxt, tmp);
7439 return(-1);
7441 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7442 if (remain != NULL)
7443 *remain = tmp2;
7444 else
7445 xmlExpFree(ctxt, tmp2);
7446 if (mult != NULL)
7447 *mult = tmp;
7448 else
7449 xmlExpFree(ctxt, tmp);
7450 #ifdef DEBUG_DERIV
7451 printf("Divide succeeded %d\n", i);
7452 #endif
7453 return(i);
7455 xmlExpFree(ctxt, tmp);
7456 xmlExpFree(ctxt, tmp2);
7458 #ifdef DEBUG_DERIV
7459 printf("Divide failed\n");
7460 #endif
7461 return(0);
7465 * xmlExpExpDeriveInt:
7466 * @ctxt: the expressions context
7467 * @exp: the englobing expression
7468 * @sub: the subexpression
7470 * Try to do a step of Brzozowski derivation but at a higher level
7471 * the input being a subexpression.
7473 * Returns the resulting expression or NULL in case of internal error
7475 static xmlExpNodePtr
7476 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7477 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7478 const xmlChar **tab;
7479 int len, i;
7482 * In case of equality and if the expression can only consume a finite
7483 * amount, then the derivation is empty
7485 if ((exp == sub) && (exp->c_max >= 0)) {
7486 #ifdef DEBUG_DERIV
7487 printf("Equal(exp, sub) and finite -> Empty\n");
7488 #endif
7489 return(emptyExp);
7492 * decompose sub sequence first
7494 if (sub->type == XML_EXP_EMPTY) {
7495 #ifdef DEBUG_DERIV
7496 printf("Empty(sub) -> Empty\n");
7497 #endif
7498 exp->ref++;
7499 return(exp);
7501 if (sub->type == XML_EXP_SEQ) {
7502 #ifdef DEBUG_DERIV
7503 printf("Seq(sub) -> decompose\n");
7504 #endif
7505 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7506 if (tmp == NULL)
7507 return(NULL);
7508 if (tmp == forbiddenExp)
7509 return(tmp);
7510 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7511 xmlExpFree(ctxt, tmp);
7512 return(ret);
7514 if (sub->type == XML_EXP_OR) {
7515 #ifdef DEBUG_DERIV
7516 printf("Or(sub) -> decompose\n");
7517 #endif
7518 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7519 if (tmp == forbiddenExp)
7520 return(tmp);
7521 if (tmp == NULL)
7522 return(NULL);
7523 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7524 if ((ret == NULL) || (ret == forbiddenExp)) {
7525 xmlExpFree(ctxt, tmp);
7526 return(ret);
7528 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7530 if (!xmlExpCheckCard(exp, sub)) {
7531 #ifdef DEBUG_DERIV
7532 printf("CheckCard(exp, sub) failed -> Forbid\n");
7533 #endif
7534 return(forbiddenExp);
7536 switch (exp->type) {
7537 case XML_EXP_EMPTY:
7538 if (sub == emptyExp)
7539 return(emptyExp);
7540 #ifdef DEBUG_DERIV
7541 printf("Empty(exp) -> Forbid\n");
7542 #endif
7543 return(forbiddenExp);
7544 case XML_EXP_FORBID:
7545 #ifdef DEBUG_DERIV
7546 printf("Forbid(exp) -> Forbid\n");
7547 #endif
7548 return(forbiddenExp);
7549 case XML_EXP_ATOM:
7550 if (sub->type == XML_EXP_ATOM) {
7551 /* TODO: handle wildcards */
7552 if (exp->exp_str == sub->exp_str) {
7553 #ifdef DEBUG_DERIV
7554 printf("Atom match -> Empty\n");
7555 #endif
7556 return(emptyExp);
7558 #ifdef DEBUG_DERIV
7559 printf("Atom mismatch -> Forbid\n");
7560 #endif
7561 return(forbiddenExp);
7563 if ((sub->type == XML_EXP_COUNT) &&
7564 (sub->exp_max == 1) &&
7565 (sub->exp_left->type == XML_EXP_ATOM)) {
7566 /* TODO: handle wildcards */
7567 if (exp->exp_str == sub->exp_left->exp_str) {
7568 #ifdef DEBUG_DERIV
7569 printf("Atom match -> Empty\n");
7570 #endif
7571 return(emptyExp);
7573 #ifdef DEBUG_DERIV
7574 printf("Atom mismatch -> Forbid\n");
7575 #endif
7576 return(forbiddenExp);
7578 #ifdef DEBUG_DERIV
7579 printf("Complex exp vs Atom -> Forbid\n");
7580 #endif
7581 return(forbiddenExp);
7582 case XML_EXP_SEQ:
7583 /* try to get the sequence consumed only if possible */
7584 if (xmlExpCheckCard(exp->exp_left, sub)) {
7585 /* See if the sequence can be consumed directly */
7586 #ifdef DEBUG_DERIV
7587 printf("Seq trying left only\n");
7588 #endif
7589 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7590 if ((ret != forbiddenExp) && (ret != NULL)) {
7591 #ifdef DEBUG_DERIV
7592 printf("Seq trying left only worked\n");
7593 #endif
7595 * TODO: assumption here that we are determinist
7596 * i.e. we won't get to a nillable exp left
7597 * subset which could be matched by the right
7598 * part too.
7599 * e.g.: (a | b)+,(a | c) and 'a+,a'
7601 exp->exp_right->ref++;
7602 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7603 exp->exp_right, NULL, 0, 0));
7605 #ifdef DEBUG_DERIV
7606 } else {
7607 printf("Seq: left too short\n");
7608 #endif
7610 /* Try instead to decompose */
7611 if (sub->type == XML_EXP_COUNT) {
7612 int min, max;
7614 #ifdef DEBUG_DERIV
7615 printf("Seq: sub is a count\n");
7616 #endif
7617 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7618 if (ret == NULL)
7619 return(NULL);
7620 if (ret != forbiddenExp) {
7621 #ifdef DEBUG_DERIV
7622 printf("Seq , Count match on left\n");
7623 #endif
7624 if (sub->exp_max < 0)
7625 max = -1;
7626 else
7627 max = sub->exp_max -1;
7628 if (sub->exp_min > 0)
7629 min = sub->exp_min -1;
7630 else
7631 min = 0;
7632 exp->exp_right->ref++;
7633 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7634 exp->exp_right, NULL, 0, 0);
7635 if (tmp == NULL)
7636 return(NULL);
7638 sub->exp_left->ref++;
7639 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7640 sub->exp_left, NULL, NULL, min, max);
7641 if (tmp2 == NULL) {
7642 xmlExpFree(ctxt, tmp);
7643 return(NULL);
7645 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7646 xmlExpFree(ctxt, tmp);
7647 xmlExpFree(ctxt, tmp2);
7648 return(ret);
7651 /* we made no progress on structured operations */
7652 break;
7653 case XML_EXP_OR:
7654 #ifdef DEBUG_DERIV
7655 printf("Or , trying both side\n");
7656 #endif
7657 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7658 if (ret == NULL)
7659 return(NULL);
7660 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7661 if (tmp == NULL) {
7662 xmlExpFree(ctxt, ret);
7663 return(NULL);
7665 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7666 case XML_EXP_COUNT: {
7667 int min, max;
7669 if (sub->type == XML_EXP_COUNT) {
7671 * Try to see if the loop is completely subsumed
7673 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7674 if (tmp == NULL)
7675 return(NULL);
7676 if (tmp == forbiddenExp) {
7677 int mult;
7679 #ifdef DEBUG_DERIV
7680 printf("Count, Count inner don't subsume\n");
7681 #endif
7682 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7683 NULL, &tmp);
7684 if (mult <= 0) {
7685 #ifdef DEBUG_DERIV
7686 printf("Count, Count not multiple => forbidden\n");
7687 #endif
7688 return(forbiddenExp);
7690 if (sub->exp_max == -1) {
7691 max = -1;
7692 if (exp->exp_max == -1) {
7693 if (exp->exp_min <= sub->exp_min * mult)
7694 min = 0;
7695 else
7696 min = exp->exp_min - sub->exp_min * mult;
7697 } else {
7698 #ifdef DEBUG_DERIV
7699 printf("Count, Count finite can't subsume infinite\n");
7700 #endif
7701 xmlExpFree(ctxt, tmp);
7702 return(forbiddenExp);
7704 } else {
7705 if (exp->exp_max == -1) {
7706 #ifdef DEBUG_DERIV
7707 printf("Infinite loop consume mult finite loop\n");
7708 #endif
7709 if (exp->exp_min > sub->exp_min * mult) {
7710 max = -1;
7711 min = exp->exp_min - sub->exp_min * mult;
7712 } else {
7713 max = -1;
7714 min = 0;
7716 } else {
7717 if (exp->exp_max < sub->exp_max * mult) {
7718 #ifdef DEBUG_DERIV
7719 printf("loops max mult mismatch => forbidden\n");
7720 #endif
7721 xmlExpFree(ctxt, tmp);
7722 return(forbiddenExp);
7724 if (sub->exp_max * mult > exp->exp_min)
7725 min = 0;
7726 else
7727 min = exp->exp_min - sub->exp_max * mult;
7728 max = exp->exp_max - sub->exp_max * mult;
7731 } else if (!IS_NILLABLE(tmp)) {
7733 * TODO: loop here to try to grow if working on finite
7734 * blocks.
7736 #ifdef DEBUG_DERIV
7737 printf("Count, Count remain not nillable => forbidden\n");
7738 #endif
7739 xmlExpFree(ctxt, tmp);
7740 return(forbiddenExp);
7741 } else if (sub->exp_max == -1) {
7742 if (exp->exp_max == -1) {
7743 if (exp->exp_min <= sub->exp_min) {
7744 #ifdef DEBUG_DERIV
7745 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7746 #endif
7747 max = -1;
7748 min = 0;
7749 } else {
7750 #ifdef DEBUG_DERIV
7751 printf("Infinite loops min => Count(X,Inf)\n");
7752 #endif
7753 max = -1;
7754 min = exp->exp_min - sub->exp_min;
7756 } else if (exp->exp_min > sub->exp_min) {
7757 #ifdef DEBUG_DERIV
7758 printf("loops min mismatch 1 => forbidden ???\n");
7759 #endif
7760 xmlExpFree(ctxt, tmp);
7761 return(forbiddenExp);
7762 } else {
7763 max = -1;
7764 min = 0;
7766 } else {
7767 if (exp->exp_max == -1) {
7768 #ifdef DEBUG_DERIV
7769 printf("Infinite loop consume finite loop\n");
7770 #endif
7771 if (exp->exp_min > sub->exp_min) {
7772 max = -1;
7773 min = exp->exp_min - sub->exp_min;
7774 } else {
7775 max = -1;
7776 min = 0;
7778 } else {
7779 if (exp->exp_max < sub->exp_max) {
7780 #ifdef DEBUG_DERIV
7781 printf("loops max mismatch => forbidden\n");
7782 #endif
7783 xmlExpFree(ctxt, tmp);
7784 return(forbiddenExp);
7786 if (sub->exp_max > exp->exp_min)
7787 min = 0;
7788 else
7789 min = exp->exp_min - sub->exp_max;
7790 max = exp->exp_max - sub->exp_max;
7793 #ifdef DEBUG_DERIV
7794 printf("loops match => SEQ(COUNT())\n");
7795 #endif
7796 exp->exp_left->ref++;
7797 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7798 NULL, NULL, min, max);
7799 if (tmp2 == NULL) {
7800 return(NULL);
7802 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7803 NULL, 0, 0);
7804 return(ret);
7806 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7807 if (tmp == NULL)
7808 return(NULL);
7809 if (tmp == forbiddenExp) {
7810 #ifdef DEBUG_DERIV
7811 printf("loop mismatch => forbidden\n");
7812 #endif
7813 return(forbiddenExp);
7815 if (exp->exp_min > 0)
7816 min = exp->exp_min - 1;
7817 else
7818 min = 0;
7819 if (exp->exp_max < 0)
7820 max = -1;
7821 else
7822 max = exp->exp_max - 1;
7824 #ifdef DEBUG_DERIV
7825 printf("loop match => SEQ(COUNT())\n");
7826 #endif
7827 exp->exp_left->ref++;
7828 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7829 NULL, NULL, min, max);
7830 if (tmp2 == NULL)
7831 return(NULL);
7832 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7833 NULL, 0, 0);
7834 return(ret);
7838 #ifdef DEBUG_DERIV
7839 printf("Fallback to derivative\n");
7840 #endif
7841 if (IS_NILLABLE(sub)) {
7842 if (!(IS_NILLABLE(exp)))
7843 return(forbiddenExp);
7844 else
7845 ret = emptyExp;
7846 } else
7847 ret = NULL;
7849 * here the structured derivation made no progress so
7850 * we use the default token based derivation to force one more step
7852 if (ctxt->tabSize == 0)
7853 ctxt->tabSize = 40;
7855 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7856 sizeof(const xmlChar *));
7857 if (tab == NULL) {
7858 return(NULL);
7862 * collect all the strings accepted by the subexpression on input
7864 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7865 while (len < 0) {
7866 const xmlChar **temp;
7867 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7868 sizeof(const xmlChar *));
7869 if (temp == NULL) {
7870 xmlFree((xmlChar **) tab);
7871 return(NULL);
7873 tab = temp;
7874 ctxt->tabSize *= 2;
7875 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7877 for (i = 0;i < len;i++) {
7878 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7879 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7880 xmlExpFree(ctxt, ret);
7881 xmlFree((xmlChar **) tab);
7882 return(tmp);
7884 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7885 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7886 xmlExpFree(ctxt, tmp);
7887 xmlExpFree(ctxt, ret);
7888 xmlFree((xmlChar **) tab);
7889 return(tmp);
7891 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7892 xmlExpFree(ctxt, tmp);
7893 xmlExpFree(ctxt, tmp2);
7895 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7896 xmlExpFree(ctxt, ret);
7897 xmlFree((xmlChar **) tab);
7898 return(tmp3);
7901 if (ret == NULL)
7902 ret = tmp3;
7903 else {
7904 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7905 if (ret == NULL) {
7906 xmlFree((xmlChar **) tab);
7907 return(NULL);
7911 xmlFree((xmlChar **) tab);
7912 return(ret);
7916 * xmlExpExpDerive:
7917 * @ctxt: the expressions context
7918 * @exp: the englobing expression
7919 * @sub: the subexpression
7921 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7922 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7923 * it usually takes less than linear time and can handle expressions generating
7924 * infinite languages.
7926 * Returns the resulting expression or NULL in case of internal error, the
7927 * result must be freed
7929 xmlExpNodePtr
7930 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7931 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7932 return(NULL);
7935 * O(1) speedups
7937 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7938 #ifdef DEBUG_DERIV
7939 printf("Sub nillable and not exp : can't subsume\n");
7940 #endif
7941 return(forbiddenExp);
7943 if (xmlExpCheckCard(exp, sub) == 0) {
7944 #ifdef DEBUG_DERIV
7945 printf("sub generate longer sequences than exp : can't subsume\n");
7946 #endif
7947 return(forbiddenExp);
7949 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7953 * xmlExpSubsume:
7954 * @ctxt: the expressions context
7955 * @exp: the englobing expression
7956 * @sub: the subexpression
7958 * Check whether @exp accepts all the languages accepted by @sub
7959 * the input being a subexpression.
7961 * Returns 1 if true 0 if false and -1 in case of failure.
7964 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7965 xmlExpNodePtr tmp;
7967 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7968 return(-1);
7971 * TODO: speedup by checking the language of sub is a subset of the
7972 * language of exp
7975 * O(1) speedups
7977 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7978 #ifdef DEBUG_DERIV
7979 printf("Sub nillable and not exp : can't subsume\n");
7980 #endif
7981 return(0);
7983 if (xmlExpCheckCard(exp, sub) == 0) {
7984 #ifdef DEBUG_DERIV
7985 printf("sub generate longer sequences than exp : can't subsume\n");
7986 #endif
7987 return(0);
7989 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7990 #ifdef DEBUG_DERIV
7991 printf("Result derivation :\n");
7992 PRINT_EXP(tmp);
7993 #endif
7994 if (tmp == NULL)
7995 return(-1);
7996 if (tmp == forbiddenExp)
7997 return(0);
7998 if (tmp == emptyExp)
7999 return(1);
8000 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
8001 xmlExpFree(ctxt, tmp);
8002 return(1);
8004 xmlExpFree(ctxt, tmp);
8005 return(0);
8008 /************************************************************************
8010 * Parsing expression *
8012 ************************************************************************/
8014 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
8016 #undef CUR
8017 #define CUR (*ctxt->cur)
8018 #undef NEXT
8019 #define NEXT ctxt->cur++;
8020 #undef IS_BLANK
8021 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
8022 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
8024 static int
8025 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
8026 int ret = 0;
8028 SKIP_BLANKS
8029 if (CUR == '*') {
8030 NEXT
8031 return(-1);
8033 if ((CUR < '0') || (CUR > '9'))
8034 return(-1);
8035 while ((CUR >= '0') && (CUR <= '9')) {
8036 ret = ret * 10 + (CUR - '0');
8037 NEXT
8039 return(ret);
8042 static xmlExpNodePtr
8043 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
8044 const char *base;
8045 xmlExpNodePtr ret;
8046 const xmlChar *val;
8048 SKIP_BLANKS
8049 base = ctxt->cur;
8050 if (*ctxt->cur == '(') {
8051 NEXT
8052 ret = xmlExpParseExpr(ctxt);
8053 SKIP_BLANKS
8054 if (*ctxt->cur != ')') {
8055 fprintf(stderr, "unbalanced '(' : %s\n", base);
8056 xmlExpFree(ctxt, ret);
8057 return(NULL);
8059 NEXT;
8060 SKIP_BLANKS
8061 goto parse_quantifier;
8063 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
8064 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
8065 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8066 NEXT;
8067 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8068 if (val == NULL)
8069 return(NULL);
8070 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8071 if (ret == NULL)
8072 return(NULL);
8073 SKIP_BLANKS
8074 parse_quantifier:
8075 if (CUR == '{') {
8076 int min, max;
8078 NEXT
8079 min = xmlExpParseNumber(ctxt);
8080 if (min < 0) {
8081 xmlExpFree(ctxt, ret);
8082 return(NULL);
8084 SKIP_BLANKS
8085 if (CUR == ',') {
8086 NEXT
8087 max = xmlExpParseNumber(ctxt);
8088 SKIP_BLANKS
8089 } else
8090 max = min;
8091 if (CUR != '}') {
8092 xmlExpFree(ctxt, ret);
8093 return(NULL);
8095 NEXT
8096 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8097 min, max);
8098 SKIP_BLANKS
8099 } else if (CUR == '?') {
8100 NEXT
8101 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8102 0, 1);
8103 SKIP_BLANKS
8104 } else if (CUR == '+') {
8105 NEXT
8106 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8107 1, -1);
8108 SKIP_BLANKS
8109 } else if (CUR == '*') {
8110 NEXT
8111 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8112 0, -1);
8113 SKIP_BLANKS
8115 return(ret);
8119 static xmlExpNodePtr
8120 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8121 xmlExpNodePtr ret, right;
8123 ret = xmlExpParseOr(ctxt);
8124 SKIP_BLANKS
8125 while (CUR == '|') {
8126 NEXT
8127 right = xmlExpParseOr(ctxt);
8128 if (right == NULL) {
8129 xmlExpFree(ctxt, ret);
8130 return(NULL);
8132 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8133 if (ret == NULL)
8134 return(NULL);
8136 return(ret);
8139 static xmlExpNodePtr
8140 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8141 xmlExpNodePtr ret, right;
8143 ret = xmlExpParseSeq(ctxt);
8144 SKIP_BLANKS
8145 while (CUR == ',') {
8146 NEXT
8147 right = xmlExpParseSeq(ctxt);
8148 if (right == NULL) {
8149 xmlExpFree(ctxt, ret);
8150 return(NULL);
8152 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8153 if (ret == NULL)
8154 return(NULL);
8156 return(ret);
8160 * xmlExpParse:
8161 * @ctxt: the expressions context
8162 * @expr: the 0 terminated string
8164 * Minimal parser for regexps, it understand the following constructs
8165 * - string terminals
8166 * - choice operator |
8167 * - sequence operator ,
8168 * - subexpressions (...)
8169 * - usual cardinality operators + * and ?
8170 * - finite sequences { min, max }
8171 * - infinite sequences { min, * }
8172 * There is minimal checkings made especially no checking on strings values
8174 * Returns a new expression or NULL in case of failure
8176 xmlExpNodePtr
8177 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8178 xmlExpNodePtr ret;
8180 ctxt->expr = expr;
8181 ctxt->cur = expr;
8183 ret = xmlExpParseExpr(ctxt);
8184 SKIP_BLANKS
8185 if (*ctxt->cur != 0) {
8186 xmlExpFree(ctxt, ret);
8187 return(NULL);
8189 return(ret);
8192 static void
8193 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8194 xmlExpNodePtr c;
8196 if (expr == NULL) return;
8197 if (glob) xmlBufferWriteChar(buf, "(");
8198 switch (expr->type) {
8199 case XML_EXP_EMPTY:
8200 xmlBufferWriteChar(buf, "empty");
8201 break;
8202 case XML_EXP_FORBID:
8203 xmlBufferWriteChar(buf, "forbidden");
8204 break;
8205 case XML_EXP_ATOM:
8206 xmlBufferWriteCHAR(buf, expr->exp_str);
8207 break;
8208 case XML_EXP_SEQ:
8209 c = expr->exp_left;
8210 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8211 xmlExpDumpInt(buf, c, 1);
8212 else
8213 xmlExpDumpInt(buf, c, 0);
8214 xmlBufferWriteChar(buf, " , ");
8215 c = expr->exp_right;
8216 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8217 xmlExpDumpInt(buf, c, 1);
8218 else
8219 xmlExpDumpInt(buf, c, 0);
8220 break;
8221 case XML_EXP_OR:
8222 c = expr->exp_left;
8223 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8224 xmlExpDumpInt(buf, c, 1);
8225 else
8226 xmlExpDumpInt(buf, c, 0);
8227 xmlBufferWriteChar(buf, " | ");
8228 c = expr->exp_right;
8229 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8230 xmlExpDumpInt(buf, c, 1);
8231 else
8232 xmlExpDumpInt(buf, c, 0);
8233 break;
8234 case XML_EXP_COUNT: {
8235 char rep[40];
8237 c = expr->exp_left;
8238 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8239 xmlExpDumpInt(buf, c, 1);
8240 else
8241 xmlExpDumpInt(buf, c, 0);
8242 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8243 rep[0] = '?';
8244 rep[1] = 0;
8245 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8246 rep[0] = '*';
8247 rep[1] = 0;
8248 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8249 rep[0] = '+';
8250 rep[1] = 0;
8251 } else if (expr->exp_max == expr->exp_min) {
8252 snprintf(rep, 39, "{%d}", expr->exp_min);
8253 } else if (expr->exp_max < 0) {
8254 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8255 } else {
8256 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8258 rep[39] = 0;
8259 xmlBufferWriteChar(buf, rep);
8260 break;
8262 default:
8263 fprintf(stderr, "Error in tree\n");
8265 if (glob)
8266 xmlBufferWriteChar(buf, ")");
8269 * xmlExpDump:
8270 * @buf: a buffer to receive the output
8271 * @expr: the compiled expression
8273 * Serialize the expression as compiled to the buffer
8275 void
8276 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8277 if ((buf == NULL) || (expr == NULL))
8278 return;
8279 xmlExpDumpInt(buf, expr, 0);
8283 * xmlExpMaxToken:
8284 * @expr: a compiled expression
8286 * Indicate the maximum number of input a expression can accept
8288 * Returns the maximum length or -1 in case of error
8291 xmlExpMaxToken(xmlExpNodePtr expr) {
8292 if (expr == NULL)
8293 return(-1);
8294 return(expr->c_max);
8298 * xmlExpCtxtNbNodes:
8299 * @ctxt: an expression context
8301 * Debugging facility provides the number of allocated nodes at a that point
8303 * Returns the number of nodes in use or -1 in case of error
8306 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8307 if (ctxt == NULL)
8308 return(-1);
8309 return(ctxt->nb_nodes);
8313 * xmlExpCtxtNbCons:
8314 * @ctxt: an expression context
8316 * Debugging facility provides the number of allocated nodes over lifetime
8318 * Returns the number of nodes ever allocated or -1 in case of error
8321 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8322 if (ctxt == NULL)
8323 return(-1);
8324 return(ctxt->nb_cons);
8327 #endif /* LIBXML_EXPR_ENABLED */
8329 #endif /* LIBXML_REGEXP_ENABLED */