2 * regexp.c: generic and extensible Regular Expression engine
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
12 * See Copyright for the status of this software.
14 * Daniel Veillard <veillard@redhat.com>
20 #ifdef LIBXML_REGEXP_ENABLED
22 /* #define DEBUG_ERR */
28 #include <libxml/tree.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/xmlregexp.h>
31 #include <libxml/xmlautomata.h>
32 #include <libxml/xmlunicode.h>
35 #define SIZE_MAX ((size_t) -1)
38 /* #define DEBUG_REGEXP_GRAPH */
39 /* #define DEBUG_REGEXP_EXEC */
40 /* #define DEBUG_PUSH */
41 /* #define DEBUG_COMPACTION */
43 #define MAX_PUSH 10000000
49 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
50 xmlRegexpErrCompile(ctxt, str);
51 #define NEXT ctxt->cur++
52 #define CUR (*(ctxt->cur))
53 #define NXT(index) (ctxt->cur[index])
55 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
56 #define NEXTL(l) ctxt->cur += l;
57 #define XML_REG_STRING_SEPARATOR '|'
59 * Need PREV to check on a '-' within a Character Group. May only be used
60 * when it's guaranteed that cur is not at the beginning of ctxt->string!
62 #define PREV (ctxt->cur[-1])
67 * macro to flag unimplemented blocks
70 xmlGenericError(xmlGenericErrorContext, \
71 "Unimplemented block at %s:%d\n", \
74 /************************************************************************
76 * Datatypes and structures *
78 ************************************************************************/
81 * Note: the order of the enums below is significant, do not shuffle
84 XML_REGEXP_EPSILON
= 1,
87 XML_REGEXP_SUBREG
, /* used for () sub regexps */
89 XML_REGEXP_ANYCHAR
, /* . */
90 XML_REGEXP_ANYSPACE
, /* \s */
91 XML_REGEXP_NOTSPACE
, /* \S */
92 XML_REGEXP_INITNAME
, /* \l */
93 XML_REGEXP_NOTINITNAME
, /* \L */
94 XML_REGEXP_NAMECHAR
, /* \c */
95 XML_REGEXP_NOTNAMECHAR
, /* \C */
96 XML_REGEXP_DECIMAL
, /* \d */
97 XML_REGEXP_NOTDECIMAL
, /* \D */
98 XML_REGEXP_REALCHAR
, /* \w */
99 XML_REGEXP_NOTREALCHAR
, /* \W */
100 XML_REGEXP_LETTER
= 100,
101 XML_REGEXP_LETTER_UPPERCASE
,
102 XML_REGEXP_LETTER_LOWERCASE
,
103 XML_REGEXP_LETTER_TITLECASE
,
104 XML_REGEXP_LETTER_MODIFIER
,
105 XML_REGEXP_LETTER_OTHERS
,
107 XML_REGEXP_MARK_NONSPACING
,
108 XML_REGEXP_MARK_SPACECOMBINING
,
109 XML_REGEXP_MARK_ENCLOSING
,
111 XML_REGEXP_NUMBER_DECIMAL
,
112 XML_REGEXP_NUMBER_LETTER
,
113 XML_REGEXP_NUMBER_OTHERS
,
115 XML_REGEXP_PUNCT_CONNECTOR
,
116 XML_REGEXP_PUNCT_DASH
,
117 XML_REGEXP_PUNCT_OPEN
,
118 XML_REGEXP_PUNCT_CLOSE
,
119 XML_REGEXP_PUNCT_INITQUOTE
,
120 XML_REGEXP_PUNCT_FINQUOTE
,
121 XML_REGEXP_PUNCT_OTHERS
,
123 XML_REGEXP_SEPAR_SPACE
,
124 XML_REGEXP_SEPAR_LINE
,
125 XML_REGEXP_SEPAR_PARA
,
127 XML_REGEXP_SYMBOL_MATH
,
128 XML_REGEXP_SYMBOL_CURRENCY
,
129 XML_REGEXP_SYMBOL_MODIFIER
,
130 XML_REGEXP_SYMBOL_OTHERS
,
132 XML_REGEXP_OTHER_CONTROL
,
133 XML_REGEXP_OTHER_FORMAT
,
134 XML_REGEXP_OTHER_PRIVATE
,
136 XML_REGEXP_BLOCK_NAME
140 XML_REGEXP_QUANT_EPSILON
= 1,
141 XML_REGEXP_QUANT_ONCE
,
142 XML_REGEXP_QUANT_OPT
,
143 XML_REGEXP_QUANT_MULT
,
144 XML_REGEXP_QUANT_PLUS
,
145 XML_REGEXP_QUANT_ONCEONLY
,
146 XML_REGEXP_QUANT_ALL
,
147 XML_REGEXP_QUANT_RANGE
151 XML_REGEXP_START_STATE
= 1,
152 XML_REGEXP_FINAL_STATE
,
153 XML_REGEXP_TRANS_STATE
,
154 XML_REGEXP_SINK_STATE
,
155 XML_REGEXP_UNREACH_STATE
159 XML_REGEXP_MARK_NORMAL
= 0,
160 XML_REGEXP_MARK_START
,
161 XML_REGEXP_MARK_VISITED
164 typedef struct _xmlRegRange xmlRegRange
;
165 typedef xmlRegRange
*xmlRegRangePtr
;
167 struct _xmlRegRange
{
168 int neg
; /* 0 normal, 1 not, 2 exclude */
175 typedef struct _xmlRegAtom xmlRegAtom
;
176 typedef xmlRegAtom
*xmlRegAtomPtr
;
178 typedef struct _xmlAutomataState xmlRegState
;
179 typedef xmlRegState
*xmlRegStatePtr
;
184 xmlRegQuantType quant
;
192 xmlRegStatePtr start
;
193 xmlRegStatePtr start0
;
197 xmlRegRangePtr
*ranges
;
201 typedef struct _xmlRegCounter xmlRegCounter
;
202 typedef xmlRegCounter
*xmlRegCounterPtr
;
204 struct _xmlRegCounter
{
209 typedef struct _xmlRegTrans xmlRegTrans
;
210 typedef xmlRegTrans
*xmlRegTransPtr
;
212 struct _xmlRegTrans
{
220 struct _xmlAutomataState
{
221 xmlRegStateType type
;
222 xmlRegMarkedType mark
;
223 xmlRegMarkedType markd
;
224 xmlRegMarkedType reached
;
229 /* knowing states pointing to us can speed things up */
235 typedef struct _xmlAutomata xmlRegParserCtxt
;
236 typedef xmlRegParserCtxt
*xmlRegParserCtxtPtr
;
238 #define AM_AUTOMATA_RNG 1
240 struct _xmlAutomata
{
247 xmlRegStatePtr start
;
249 xmlRegStatePtr state
;
255 xmlRegAtomPtr
*atoms
;
259 xmlRegStatePtr
*states
;
263 xmlRegCounter
*counters
;
275 xmlRegStatePtr
*states
;
277 xmlRegAtomPtr
*atoms
;
279 xmlRegCounter
*counters
;
283 * That's the compact form for determinists automatas
292 typedef struct _xmlRegExecRollback xmlRegExecRollback
;
293 typedef xmlRegExecRollback
*xmlRegExecRollbackPtr
;
295 struct _xmlRegExecRollback
{
296 xmlRegStatePtr state
;/* the current state */
297 int index
; /* the index in the input stack */
298 int nextbranch
; /* the next transition to explore in that state */
299 int *counts
; /* save the automata state if it has some */
302 typedef struct _xmlRegInputToken xmlRegInputToken
;
303 typedef xmlRegInputToken
*xmlRegInputTokenPtr
;
305 struct _xmlRegInputToken
{
310 struct _xmlRegExecCtxt
{
311 int status
; /* execution status != 0 indicate an error */
312 int determinist
; /* did we find an indeterministic behaviour */
313 xmlRegexpPtr comp
; /* the compiled regexp */
314 xmlRegExecCallbacks callback
;
317 xmlRegStatePtr state
;/* the current state */
318 int transno
; /* the current transition on that state */
319 int transcount
; /* the number of chars in char counted transitions */
322 * A stack of rollback states
326 xmlRegExecRollback
*rollbacks
;
329 * The state of the automata if any
340 const xmlChar
*inputString
; /* when operating on characters */
341 xmlRegInputTokenPtr inputStack
;/* when operating on strings */
346 int errStateNo
; /* the error state number */
347 xmlRegStatePtr errState
; /* the error state */
348 xmlChar
*errString
; /* the string raising the error */
349 int *errCounts
; /* counters at the error state */
353 #define REGEXP_ALL_COUNTER 0x123456
354 #define REGEXP_ALL_LAX_COUNTER 0x123457
356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
);
357 static void xmlRegFreeState(xmlRegStatePtr state
);
358 static void xmlRegFreeAtom(xmlRegAtomPtr atom
);
359 static int xmlRegStrEqualWildcard(const xmlChar
*expStr
, const xmlChar
*valStr
);
360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom
, int codepoint
);
361 static int xmlRegCheckCharacterRange(xmlRegAtomType type
, int codepoint
,
362 int neg
, int start
, int end
, const xmlChar
*blockName
);
364 void xmlAutomataSetFlags(xmlAutomataPtr am
, int flags
);
366 /************************************************************************
368 * Regexp memory error handler *
370 ************************************************************************/
372 * xmlRegexpErrMemory:
373 * @extra: extra information
375 * Handle an out of memory condition
378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt
, const char *extra
)
380 const char *regexp
= NULL
;
382 regexp
= (const char *) ctxt
->string
;
383 ctxt
->error
= XML_ERR_NO_MEMORY
;
385 __xmlRaiseError(NULL
, NULL
, NULL
, NULL
, NULL
, XML_FROM_REGEXP
,
386 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, extra
,
388 "Memory allocation failed : %s\n", extra
);
392 * xmlRegexpErrCompile:
393 * @extra: extra information
395 * Handle a compilation failure
398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt
, const char *extra
)
400 const char *regexp
= NULL
;
404 regexp
= (const char *) ctxt
->string
;
405 idx
= ctxt
->cur
- ctxt
->string
;
406 ctxt
->error
= XML_REGEXP_COMPILE_ERROR
;
408 __xmlRaiseError(NULL
, NULL
, NULL
, NULL
, NULL
, XML_FROM_REGEXP
,
409 XML_REGEXP_COMPILE_ERROR
, XML_ERR_FATAL
, NULL
, 0, extra
,
410 regexp
, NULL
, idx
, 0,
411 "failed to compile: %s\n", extra
);
414 /************************************************************************
416 * Allocation/Deallocation *
418 ************************************************************************/
420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
);
424 * @dim1: size of first dimension
425 * @dim2: size of second dimension
426 * @elemSize: size of element
428 * Allocate a two-dimensional array and set all elements to zero.
430 * Returns the new array or NULL in case of error.
433 xmlRegCalloc2(size_t dim1
, size_t dim2
, size_t elemSize
) {
437 /* Check for overflow */
438 if (dim1
> SIZE_MAX
/ dim2
/ elemSize
)
440 totalSize
= dim1
* dim2
* elemSize
;
441 ret
= xmlMalloc(totalSize
);
443 memset(ret
, 0, totalSize
);
448 * xmlRegEpxFromParse:
449 * @ctxt: the parser context used to build it
451 * Allocate a new regexp and fill it with the result from the parser
453 * Returns the new regexp or NULL in case of error
456 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt
) {
459 ret
= (xmlRegexpPtr
) xmlMalloc(sizeof(xmlRegexp
));
461 xmlRegexpErrMemory(ctxt
, "compiling regexp");
464 memset(ret
, 0, sizeof(xmlRegexp
));
465 ret
->string
= ctxt
->string
;
466 ret
->nbStates
= ctxt
->nbStates
;
467 ret
->states
= ctxt
->states
;
468 ret
->nbAtoms
= ctxt
->nbAtoms
;
469 ret
->atoms
= ctxt
->atoms
;
470 ret
->nbCounters
= ctxt
->nbCounters
;
471 ret
->counters
= ctxt
->counters
;
472 ret
->determinist
= ctxt
->determinist
;
473 ret
->flags
= ctxt
->flags
;
474 if (ret
->determinist
== -1) {
475 xmlRegexpIsDeterminist(ret
);
478 if ((ret
->determinist
!= 0) &&
479 (ret
->nbCounters
== 0) &&
481 (ret
->atoms
!= NULL
) &&
482 (ret
->atoms
[0] != NULL
) &&
483 (ret
->atoms
[0]->type
== XML_REGEXP_STRING
)) {
484 int i
, j
, nbstates
= 0, nbatoms
= 0;
493 * Switch to a compact representation
494 * 1/ counting the effective number of states left
495 * 2/ counting the unique number of atoms, and check that
496 * they are all of the string type
497 * 3/ build a table state x atom for the transitions
500 stateRemap
= xmlMalloc(ret
->nbStates
* sizeof(int));
501 if (stateRemap
== NULL
) {
502 xmlRegexpErrMemory(ctxt
, "compiling regexp");
506 for (i
= 0;i
< ret
->nbStates
;i
++) {
507 if (ret
->states
[i
] != NULL
) {
508 stateRemap
[i
] = nbstates
;
514 #ifdef DEBUG_COMPACTION
515 printf("Final: %d states\n", nbstates
);
517 stringMap
= xmlMalloc(ret
->nbAtoms
* sizeof(char *));
518 if (stringMap
== NULL
) {
519 xmlRegexpErrMemory(ctxt
, "compiling regexp");
524 stringRemap
= xmlMalloc(ret
->nbAtoms
* sizeof(int));
525 if (stringRemap
== NULL
) {
526 xmlRegexpErrMemory(ctxt
, "compiling regexp");
532 for (i
= 0;i
< ret
->nbAtoms
;i
++) {
533 if ((ret
->atoms
[i
]->type
== XML_REGEXP_STRING
) &&
534 (ret
->atoms
[i
]->quant
== XML_REGEXP_QUANT_ONCE
)) {
535 value
= ret
->atoms
[i
]->valuep
;
536 for (j
= 0;j
< nbatoms
;j
++) {
537 if (xmlStrEqual(stringMap
[j
], value
)) {
543 stringRemap
[i
] = nbatoms
;
544 stringMap
[nbatoms
] = xmlStrdup(value
);
545 if (stringMap
[nbatoms
] == NULL
) {
546 for (i
= 0;i
< nbatoms
;i
++)
547 xmlFree(stringMap
[i
]);
548 xmlFree(stringRemap
);
558 xmlFree(stringRemap
);
559 for (i
= 0;i
< nbatoms
;i
++)
560 xmlFree(stringMap
[i
]);
566 #ifdef DEBUG_COMPACTION
567 printf("Final: %d atoms\n", nbatoms
);
569 transitions
= (int *) xmlRegCalloc2(nbstates
+ 1, nbatoms
+ 1,
571 if (transitions
== NULL
) {
573 xmlFree(stringRemap
);
574 for (i
= 0;i
< nbatoms
;i
++)
575 xmlFree(stringMap
[i
]);
582 * Allocate the transition table. The first entry for each
583 * state corresponds to the state type.
587 for (i
= 0;i
< ret
->nbStates
;i
++) {
588 int stateno
, atomno
, targetno
, prev
;
589 xmlRegStatePtr state
;
590 xmlRegTransPtr trans
;
592 stateno
= stateRemap
[i
];
595 state
= ret
->states
[i
];
597 transitions
[stateno
* (nbatoms
+ 1)] = state
->type
;
599 for (j
= 0;j
< state
->nbTrans
;j
++) {
600 trans
= &(state
->trans
[j
]);
601 if ((trans
->to
== -1) || (trans
->atom
== NULL
))
603 atomno
= stringRemap
[trans
->atom
->no
];
604 if ((trans
->atom
->data
!= NULL
) && (transdata
== NULL
)) {
605 transdata
= (void **) xmlRegCalloc2(nbstates
, nbatoms
,
607 if (transdata
== NULL
) {
608 xmlRegexpErrMemory(ctxt
, "compiling regexp");
612 targetno
= stateRemap
[trans
->to
];
614 * if the same atom can generate transitions to 2 different
615 * states then it means the automata is not deterministic and
616 * the compact form can't be used !
618 prev
= transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1];
620 if (prev
!= targetno
+ 1) {
621 ret
->determinist
= 0;
622 #ifdef DEBUG_COMPACTION
623 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
624 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
625 printf(" previous to is %d\n", prev
);
627 if (transdata
!= NULL
)
629 xmlFree(transitions
);
631 xmlFree(stringRemap
);
632 for (i
= 0;i
< nbatoms
;i
++)
633 xmlFree(stringMap
[i
]);
639 printf("State %d trans %d: atom %d to %d : %d to %d\n",
640 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
642 transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1] =
643 targetno
+ 1; /* to avoid 0 */
644 if (transdata
!= NULL
)
645 transdata
[stateno
* nbatoms
+ atomno
] =
650 ret
->determinist
= 1;
651 #ifdef DEBUG_COMPACTION
655 for (i
= 0;i
< nbstates
;i
++) {
656 for (j
= 0;j
< nbatoms
+ 1;j
++) {
657 printf("%02d ", transitions
[i
* (nbatoms
+ 1) + j
]);
664 * Cleanup of the old data
666 if (ret
->states
!= NULL
) {
667 for (i
= 0;i
< ret
->nbStates
;i
++)
668 xmlRegFreeState(ret
->states
[i
]);
669 xmlFree(ret
->states
);
673 if (ret
->atoms
!= NULL
) {
674 for (i
= 0;i
< ret
->nbAtoms
;i
++)
675 xmlRegFreeAtom(ret
->atoms
[i
]);
681 ret
->compact
= transitions
;
682 ret
->transdata
= transdata
;
683 ret
->stringMap
= stringMap
;
684 ret
->nbstrings
= nbatoms
;
685 ret
->nbstates
= nbstates
;
687 xmlFree(stringRemap
);
695 ctxt
->nbCounters
= 0;
696 ctxt
->counters
= NULL
;
701 * xmlRegNewParserCtxt:
702 * @string: the string to parse
704 * Allocate a new regexp parser context
706 * Returns the new context or NULL in case of error
708 static xmlRegParserCtxtPtr
709 xmlRegNewParserCtxt(const xmlChar
*string
) {
710 xmlRegParserCtxtPtr ret
;
712 ret
= (xmlRegParserCtxtPtr
) xmlMalloc(sizeof(xmlRegParserCtxt
));
715 memset(ret
, 0, sizeof(xmlRegParserCtxt
));
717 ret
->string
= xmlStrdup(string
);
718 ret
->cur
= ret
->string
;
722 ret
->determinist
= -1;
728 * @ctxt: the regexp parser context
729 * @neg: is that negative
730 * @type: the type of range
731 * @start: the start codepoint
732 * @end: the end codepoint
734 * Allocate a new regexp range
736 * Returns the new range or NULL in case of error
738 static xmlRegRangePtr
739 xmlRegNewRange(xmlRegParserCtxtPtr ctxt
,
740 int neg
, xmlRegAtomType type
, int start
, int end
) {
743 ret
= (xmlRegRangePtr
) xmlMalloc(sizeof(xmlRegRange
));
745 xmlRegexpErrMemory(ctxt
, "allocating range");
757 * @range: the regexp range
759 * Free a regexp range
762 xmlRegFreeRange(xmlRegRangePtr range
) {
766 if (range
->blockName
!= NULL
)
767 xmlFree(range
->blockName
);
773 * @range: the regexp range
775 * Copy a regexp range
777 * Returns the new copy or NULL in case of error.
779 static xmlRegRangePtr
780 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt
, xmlRegRangePtr range
) {
786 ret
= xmlRegNewRange(ctxt
, range
->neg
, range
->type
, range
->start
,
790 if (range
->blockName
!= NULL
) {
791 ret
->blockName
= xmlStrdup(range
->blockName
);
792 if (ret
->blockName
== NULL
) {
793 xmlRegexpErrMemory(ctxt
, "allocating range");
794 xmlRegFreeRange(ret
);
803 * @ctxt: the regexp parser context
804 * @type: the type of atom
806 * Allocate a new atom
808 * Returns the new atom or NULL in case of error
811 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt
, xmlRegAtomType type
) {
814 ret
= (xmlRegAtomPtr
) xmlMalloc(sizeof(xmlRegAtom
));
816 xmlRegexpErrMemory(ctxt
, "allocating atom");
819 memset(ret
, 0, sizeof(xmlRegAtom
));
821 ret
->quant
= XML_REGEXP_QUANT_ONCE
;
829 * @atom: the regexp atom
834 xmlRegFreeAtom(xmlRegAtomPtr atom
) {
840 for (i
= 0;i
< atom
->nbRanges
;i
++)
841 xmlRegFreeRange(atom
->ranges
[i
]);
842 if (atom
->ranges
!= NULL
)
843 xmlFree(atom
->ranges
);
844 if ((atom
->type
== XML_REGEXP_STRING
) && (atom
->valuep
!= NULL
))
845 xmlFree(atom
->valuep
);
846 if ((atom
->type
== XML_REGEXP_STRING
) && (atom
->valuep2
!= NULL
))
847 xmlFree(atom
->valuep2
);
848 if ((atom
->type
== XML_REGEXP_BLOCK_NAME
) && (atom
->valuep
!= NULL
))
849 xmlFree(atom
->valuep
);
855 * @ctxt: the regexp parser context
856 * @atom: the original atom
858 * Allocate a new regexp range
860 * Returns the new atom or NULL in case of error
863 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
) {
866 ret
= (xmlRegAtomPtr
) xmlMalloc(sizeof(xmlRegAtom
));
868 xmlRegexpErrMemory(ctxt
, "copying atom");
871 memset(ret
, 0, sizeof(xmlRegAtom
));
872 ret
->type
= atom
->type
;
873 ret
->quant
= atom
->quant
;
874 ret
->min
= atom
->min
;
875 ret
->max
= atom
->max
;
876 if (atom
->nbRanges
> 0) {
879 ret
->ranges
= (xmlRegRangePtr
*) xmlMalloc(sizeof(xmlRegRangePtr
) *
881 if (ret
->ranges
== NULL
) {
882 xmlRegexpErrMemory(ctxt
, "copying atom");
885 for (i
= 0;i
< atom
->nbRanges
;i
++) {
886 ret
->ranges
[i
] = xmlRegCopyRange(ctxt
, atom
->ranges
[i
]);
887 if (ret
->ranges
[i
] == NULL
)
889 ret
->nbRanges
= i
+ 1;
899 static xmlRegStatePtr
900 xmlRegNewState(xmlRegParserCtxtPtr ctxt
) {
903 ret
= (xmlRegStatePtr
) xmlMalloc(sizeof(xmlRegState
));
905 xmlRegexpErrMemory(ctxt
, "allocating state");
908 memset(ret
, 0, sizeof(xmlRegState
));
909 ret
->type
= XML_REGEXP_TRANS_STATE
;
910 ret
->mark
= XML_REGEXP_MARK_NORMAL
;
916 * @state: the regexp state
918 * Free a regexp state
921 xmlRegFreeState(xmlRegStatePtr state
) {
925 if (state
->trans
!= NULL
)
926 xmlFree(state
->trans
);
927 if (state
->transTo
!= NULL
)
928 xmlFree(state
->transTo
);
933 * xmlRegFreeParserCtxt:
934 * @ctxt: the regexp parser context
936 * Free a regexp parser context
939 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt
) {
944 if (ctxt
->string
!= NULL
)
945 xmlFree(ctxt
->string
);
946 if (ctxt
->states
!= NULL
) {
947 for (i
= 0;i
< ctxt
->nbStates
;i
++)
948 xmlRegFreeState(ctxt
->states
[i
]);
949 xmlFree(ctxt
->states
);
951 if (ctxt
->atoms
!= NULL
) {
952 for (i
= 0;i
< ctxt
->nbAtoms
;i
++)
953 xmlRegFreeAtom(ctxt
->atoms
[i
]);
954 xmlFree(ctxt
->atoms
);
956 if (ctxt
->counters
!= NULL
)
957 xmlFree(ctxt
->counters
);
961 /************************************************************************
963 * Display of Data structures *
965 ************************************************************************/
968 xmlRegPrintAtomType(FILE *output
, xmlRegAtomType type
) {
970 case XML_REGEXP_EPSILON
:
971 fprintf(output
, "epsilon "); break;
972 case XML_REGEXP_CHARVAL
:
973 fprintf(output
, "charval "); break;
974 case XML_REGEXP_RANGES
:
975 fprintf(output
, "ranges "); break;
976 case XML_REGEXP_SUBREG
:
977 fprintf(output
, "subexpr "); break;
978 case XML_REGEXP_STRING
:
979 fprintf(output
, "string "); break;
980 case XML_REGEXP_ANYCHAR
:
981 fprintf(output
, "anychar "); break;
982 case XML_REGEXP_ANYSPACE
:
983 fprintf(output
, "anyspace "); break;
984 case XML_REGEXP_NOTSPACE
:
985 fprintf(output
, "notspace "); break;
986 case XML_REGEXP_INITNAME
:
987 fprintf(output
, "initname "); break;
988 case XML_REGEXP_NOTINITNAME
:
989 fprintf(output
, "notinitname "); break;
990 case XML_REGEXP_NAMECHAR
:
991 fprintf(output
, "namechar "); break;
992 case XML_REGEXP_NOTNAMECHAR
:
993 fprintf(output
, "notnamechar "); break;
994 case XML_REGEXP_DECIMAL
:
995 fprintf(output
, "decimal "); break;
996 case XML_REGEXP_NOTDECIMAL
:
997 fprintf(output
, "notdecimal "); break;
998 case XML_REGEXP_REALCHAR
:
999 fprintf(output
, "realchar "); break;
1000 case XML_REGEXP_NOTREALCHAR
:
1001 fprintf(output
, "notrealchar "); break;
1002 case XML_REGEXP_LETTER
:
1003 fprintf(output
, "LETTER "); break;
1004 case XML_REGEXP_LETTER_UPPERCASE
:
1005 fprintf(output
, "LETTER_UPPERCASE "); break;
1006 case XML_REGEXP_LETTER_LOWERCASE
:
1007 fprintf(output
, "LETTER_LOWERCASE "); break;
1008 case XML_REGEXP_LETTER_TITLECASE
:
1009 fprintf(output
, "LETTER_TITLECASE "); break;
1010 case XML_REGEXP_LETTER_MODIFIER
:
1011 fprintf(output
, "LETTER_MODIFIER "); break;
1012 case XML_REGEXP_LETTER_OTHERS
:
1013 fprintf(output
, "LETTER_OTHERS "); break;
1014 case XML_REGEXP_MARK
:
1015 fprintf(output
, "MARK "); break;
1016 case XML_REGEXP_MARK_NONSPACING
:
1017 fprintf(output
, "MARK_NONSPACING "); break;
1018 case XML_REGEXP_MARK_SPACECOMBINING
:
1019 fprintf(output
, "MARK_SPACECOMBINING "); break;
1020 case XML_REGEXP_MARK_ENCLOSING
:
1021 fprintf(output
, "MARK_ENCLOSING "); break;
1022 case XML_REGEXP_NUMBER
:
1023 fprintf(output
, "NUMBER "); break;
1024 case XML_REGEXP_NUMBER_DECIMAL
:
1025 fprintf(output
, "NUMBER_DECIMAL "); break;
1026 case XML_REGEXP_NUMBER_LETTER
:
1027 fprintf(output
, "NUMBER_LETTER "); break;
1028 case XML_REGEXP_NUMBER_OTHERS
:
1029 fprintf(output
, "NUMBER_OTHERS "); break;
1030 case XML_REGEXP_PUNCT
:
1031 fprintf(output
, "PUNCT "); break;
1032 case XML_REGEXP_PUNCT_CONNECTOR
:
1033 fprintf(output
, "PUNCT_CONNECTOR "); break;
1034 case XML_REGEXP_PUNCT_DASH
:
1035 fprintf(output
, "PUNCT_DASH "); break;
1036 case XML_REGEXP_PUNCT_OPEN
:
1037 fprintf(output
, "PUNCT_OPEN "); break;
1038 case XML_REGEXP_PUNCT_CLOSE
:
1039 fprintf(output
, "PUNCT_CLOSE "); break;
1040 case XML_REGEXP_PUNCT_INITQUOTE
:
1041 fprintf(output
, "PUNCT_INITQUOTE "); break;
1042 case XML_REGEXP_PUNCT_FINQUOTE
:
1043 fprintf(output
, "PUNCT_FINQUOTE "); break;
1044 case XML_REGEXP_PUNCT_OTHERS
:
1045 fprintf(output
, "PUNCT_OTHERS "); break;
1046 case XML_REGEXP_SEPAR
:
1047 fprintf(output
, "SEPAR "); break;
1048 case XML_REGEXP_SEPAR_SPACE
:
1049 fprintf(output
, "SEPAR_SPACE "); break;
1050 case XML_REGEXP_SEPAR_LINE
:
1051 fprintf(output
, "SEPAR_LINE "); break;
1052 case XML_REGEXP_SEPAR_PARA
:
1053 fprintf(output
, "SEPAR_PARA "); break;
1054 case XML_REGEXP_SYMBOL
:
1055 fprintf(output
, "SYMBOL "); break;
1056 case XML_REGEXP_SYMBOL_MATH
:
1057 fprintf(output
, "SYMBOL_MATH "); break;
1058 case XML_REGEXP_SYMBOL_CURRENCY
:
1059 fprintf(output
, "SYMBOL_CURRENCY "); break;
1060 case XML_REGEXP_SYMBOL_MODIFIER
:
1061 fprintf(output
, "SYMBOL_MODIFIER "); break;
1062 case XML_REGEXP_SYMBOL_OTHERS
:
1063 fprintf(output
, "SYMBOL_OTHERS "); break;
1064 case XML_REGEXP_OTHER
:
1065 fprintf(output
, "OTHER "); break;
1066 case XML_REGEXP_OTHER_CONTROL
:
1067 fprintf(output
, "OTHER_CONTROL "); break;
1068 case XML_REGEXP_OTHER_FORMAT
:
1069 fprintf(output
, "OTHER_FORMAT "); break;
1070 case XML_REGEXP_OTHER_PRIVATE
:
1071 fprintf(output
, "OTHER_PRIVATE "); break;
1072 case XML_REGEXP_OTHER_NA
:
1073 fprintf(output
, "OTHER_NA "); break;
1074 case XML_REGEXP_BLOCK_NAME
:
1075 fprintf(output
, "BLOCK "); break;
1080 xmlRegPrintQuantType(FILE *output
, xmlRegQuantType type
) {
1082 case XML_REGEXP_QUANT_EPSILON
:
1083 fprintf(output
, "epsilon "); break;
1084 case XML_REGEXP_QUANT_ONCE
:
1085 fprintf(output
, "once "); break;
1086 case XML_REGEXP_QUANT_OPT
:
1087 fprintf(output
, "? "); break;
1088 case XML_REGEXP_QUANT_MULT
:
1089 fprintf(output
, "* "); break;
1090 case XML_REGEXP_QUANT_PLUS
:
1091 fprintf(output
, "+ "); break;
1092 case XML_REGEXP_QUANT_RANGE
:
1093 fprintf(output
, "range "); break;
1094 case XML_REGEXP_QUANT_ONCEONLY
:
1095 fprintf(output
, "onceonly "); break;
1096 case XML_REGEXP_QUANT_ALL
:
1097 fprintf(output
, "all "); break;
1101 xmlRegPrintRange(FILE *output
, xmlRegRangePtr range
) {
1102 fprintf(output
, " range: ");
1104 fprintf(output
, "negative ");
1105 xmlRegPrintAtomType(output
, range
->type
);
1106 fprintf(output
, "%c - %c\n", range
->start
, range
->end
);
1110 xmlRegPrintAtom(FILE *output
, xmlRegAtomPtr atom
) {
1111 fprintf(output
, " atom: ");
1113 fprintf(output
, "NULL\n");
1117 fprintf(output
, "not ");
1118 xmlRegPrintAtomType(output
, atom
->type
);
1119 xmlRegPrintQuantType(output
, atom
->quant
);
1120 if (atom
->quant
== XML_REGEXP_QUANT_RANGE
)
1121 fprintf(output
, "%d-%d ", atom
->min
, atom
->max
);
1122 if (atom
->type
== XML_REGEXP_STRING
)
1123 fprintf(output
, "'%s' ", (char *) atom
->valuep
);
1124 if (atom
->type
== XML_REGEXP_CHARVAL
)
1125 fprintf(output
, "char %c\n", atom
->codepoint
);
1126 else if (atom
->type
== XML_REGEXP_RANGES
) {
1128 fprintf(output
, "%d entries\n", atom
->nbRanges
);
1129 for (i
= 0; i
< atom
->nbRanges
;i
++)
1130 xmlRegPrintRange(output
, atom
->ranges
[i
]);
1131 } else if (atom
->type
== XML_REGEXP_SUBREG
) {
1132 fprintf(output
, "start %d end %d\n", atom
->start
->no
, atom
->stop
->no
);
1134 fprintf(output
, "\n");
1139 xmlRegPrintTrans(FILE *output
, xmlRegTransPtr trans
) {
1140 fprintf(output
, " trans: ");
1141 if (trans
== NULL
) {
1142 fprintf(output
, "NULL\n");
1145 if (trans
->to
< 0) {
1146 fprintf(output
, "removed\n");
1149 if (trans
->nd
!= 0) {
1151 fprintf(output
, "last not determinist, ");
1153 fprintf(output
, "not determinist, ");
1155 if (trans
->counter
>= 0) {
1156 fprintf(output
, "counted %d, ", trans
->counter
);
1158 if (trans
->count
== REGEXP_ALL_COUNTER
) {
1159 fprintf(output
, "all transition, ");
1160 } else if (trans
->count
>= 0) {
1161 fprintf(output
, "count based %d, ", trans
->count
);
1163 if (trans
->atom
== NULL
) {
1164 fprintf(output
, "epsilon to %d\n", trans
->to
);
1167 if (trans
->atom
->type
== XML_REGEXP_CHARVAL
)
1168 fprintf(output
, "char %c ", trans
->atom
->codepoint
);
1169 fprintf(output
, "atom %d, to %d\n", trans
->atom
->no
, trans
->to
);
1173 xmlRegPrintState(FILE *output
, xmlRegStatePtr state
) {
1176 fprintf(output
, " state: ");
1177 if (state
== NULL
) {
1178 fprintf(output
, "NULL\n");
1181 if (state
->type
== XML_REGEXP_START_STATE
)
1182 fprintf(output
, "START ");
1183 if (state
->type
== XML_REGEXP_FINAL_STATE
)
1184 fprintf(output
, "FINAL ");
1186 fprintf(output
, "%d, %d transitions:\n", state
->no
, state
->nbTrans
);
1187 for (i
= 0;i
< state
->nbTrans
; i
++) {
1188 xmlRegPrintTrans(output
, &(state
->trans
[i
]));
1192 #ifdef DEBUG_REGEXP_GRAPH
1194 xmlRegPrintCtxt(FILE *output
, xmlRegParserCtxtPtr ctxt
) {
1197 fprintf(output
, " ctxt: ");
1199 fprintf(output
, "NULL\n");
1202 fprintf(output
, "'%s' ", ctxt
->string
);
1204 fprintf(output
, "error ");
1206 fprintf(output
, "neg ");
1207 fprintf(output
, "\n");
1208 fprintf(output
, "%d atoms:\n", ctxt
->nbAtoms
);
1209 for (i
= 0;i
< ctxt
->nbAtoms
; i
++) {
1210 fprintf(output
, " %02d ", i
);
1211 xmlRegPrintAtom(output
, ctxt
->atoms
[i
]);
1213 if (ctxt
->atom
!= NULL
) {
1214 fprintf(output
, "current atom:\n");
1215 xmlRegPrintAtom(output
, ctxt
->atom
);
1217 fprintf(output
, "%d states:", ctxt
->nbStates
);
1218 if (ctxt
->start
!= NULL
)
1219 fprintf(output
, " start: %d", ctxt
->start
->no
);
1220 if (ctxt
->end
!= NULL
)
1221 fprintf(output
, " end: %d", ctxt
->end
->no
);
1222 fprintf(output
, "\n");
1223 for (i
= 0;i
< ctxt
->nbStates
; i
++) {
1224 xmlRegPrintState(output
, ctxt
->states
[i
]);
1226 fprintf(output
, "%d counters:\n", ctxt
->nbCounters
);
1227 for (i
= 0;i
< ctxt
->nbCounters
; i
++) {
1228 fprintf(output
, " %d: min %d max %d\n", i
, ctxt
->counters
[i
].min
,
1229 ctxt
->counters
[i
].max
);
1234 /************************************************************************
1236 * Finite Automata structures manipulations *
1238 ************************************************************************/
1241 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
,
1242 int neg
, xmlRegAtomType type
, int start
, int end
,
1243 xmlChar
*blockName
) {
1244 xmlRegRangePtr range
;
1247 ERROR("add range: atom is NULL");
1250 if (atom
->type
!= XML_REGEXP_RANGES
) {
1251 ERROR("add range: atom is not ranges");
1254 if (atom
->maxRanges
== 0) {
1255 atom
->maxRanges
= 4;
1256 atom
->ranges
= (xmlRegRangePtr
*) xmlMalloc(atom
->maxRanges
*
1257 sizeof(xmlRegRangePtr
));
1258 if (atom
->ranges
== NULL
) {
1259 xmlRegexpErrMemory(ctxt
, "adding ranges");
1260 atom
->maxRanges
= 0;
1263 } else if (atom
->nbRanges
>= atom
->maxRanges
) {
1264 xmlRegRangePtr
*tmp
;
1265 atom
->maxRanges
*= 2;
1266 tmp
= (xmlRegRangePtr
*) xmlRealloc(atom
->ranges
, atom
->maxRanges
*
1267 sizeof(xmlRegRangePtr
));
1269 xmlRegexpErrMemory(ctxt
, "adding ranges");
1270 atom
->maxRanges
/= 2;
1275 range
= xmlRegNewRange(ctxt
, neg
, type
, start
, end
);
1278 range
->blockName
= blockName
;
1279 atom
->ranges
[atom
->nbRanges
++] = range
;
1284 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt
) {
1285 if (ctxt
->maxCounters
== 0) {
1286 ctxt
->maxCounters
= 4;
1287 ctxt
->counters
= (xmlRegCounter
*) xmlMalloc(ctxt
->maxCounters
*
1288 sizeof(xmlRegCounter
));
1289 if (ctxt
->counters
== NULL
) {
1290 xmlRegexpErrMemory(ctxt
, "allocating counter");
1291 ctxt
->maxCounters
= 0;
1294 } else if (ctxt
->nbCounters
>= ctxt
->maxCounters
) {
1296 ctxt
->maxCounters
*= 2;
1297 tmp
= (xmlRegCounter
*) xmlRealloc(ctxt
->counters
, ctxt
->maxCounters
*
1298 sizeof(xmlRegCounter
));
1300 xmlRegexpErrMemory(ctxt
, "allocating counter");
1301 ctxt
->maxCounters
/= 2;
1304 ctxt
->counters
= tmp
;
1306 ctxt
->counters
[ctxt
->nbCounters
].min
= -1;
1307 ctxt
->counters
[ctxt
->nbCounters
].max
= -1;
1308 return(ctxt
->nbCounters
++);
1312 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
) {
1314 ERROR("atom push: atom is NULL");
1317 if (ctxt
->maxAtoms
== 0) {
1319 ctxt
->atoms
= (xmlRegAtomPtr
*) xmlMalloc(ctxt
->maxAtoms
*
1320 sizeof(xmlRegAtomPtr
));
1321 if (ctxt
->atoms
== NULL
) {
1322 xmlRegexpErrMemory(ctxt
, "pushing atom");
1326 } else if (ctxt
->nbAtoms
>= ctxt
->maxAtoms
) {
1328 ctxt
->maxAtoms
*= 2;
1329 tmp
= (xmlRegAtomPtr
*) xmlRealloc(ctxt
->atoms
, ctxt
->maxAtoms
*
1330 sizeof(xmlRegAtomPtr
));
1332 xmlRegexpErrMemory(ctxt
, "allocating counter");
1333 ctxt
->maxAtoms
/= 2;
1338 atom
->no
= ctxt
->nbAtoms
;
1339 ctxt
->atoms
[ctxt
->nbAtoms
++] = atom
;
1344 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr target
,
1346 if (target
->maxTransTo
== 0) {
1347 target
->maxTransTo
= 8;
1348 target
->transTo
= (int *) xmlMalloc(target
->maxTransTo
*
1350 if (target
->transTo
== NULL
) {
1351 xmlRegexpErrMemory(ctxt
, "adding transition");
1352 target
->maxTransTo
= 0;
1355 } else if (target
->nbTransTo
>= target
->maxTransTo
) {
1357 target
->maxTransTo
*= 2;
1358 tmp
= (int *) xmlRealloc(target
->transTo
, target
->maxTransTo
*
1361 xmlRegexpErrMemory(ctxt
, "adding transition");
1362 target
->maxTransTo
/= 2;
1365 target
->transTo
= tmp
;
1367 target
->transTo
[target
->nbTransTo
] = from
;
1368 target
->nbTransTo
++;
1372 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
1373 xmlRegAtomPtr atom
, xmlRegStatePtr target
,
1374 int counter
, int count
) {
1378 if (state
== NULL
) {
1379 ERROR("add state: state is NULL");
1382 if (target
== NULL
) {
1383 ERROR("add state: target is NULL");
1387 * Other routines follow the philosophy 'When in doubt, add a transition'
1388 * so we check here whether such a transition is already present and, if
1389 * so, silently ignore this request.
1392 for (nrtrans
= state
->nbTrans
- 1; nrtrans
>= 0; nrtrans
--) {
1393 xmlRegTransPtr trans
= &(state
->trans
[nrtrans
]);
1394 if ((trans
->atom
== atom
) &&
1395 (trans
->to
== target
->no
) &&
1396 (trans
->counter
== counter
) &&
1397 (trans
->count
== count
)) {
1398 #ifdef DEBUG_REGEXP_GRAPH
1399 printf("Ignoring duplicate transition from %d to %d\n",
1400 state
->no
, target
->no
);
1406 if (state
->maxTrans
== 0) {
1407 state
->maxTrans
= 8;
1408 state
->trans
= (xmlRegTrans
*) xmlMalloc(state
->maxTrans
*
1409 sizeof(xmlRegTrans
));
1410 if (state
->trans
== NULL
) {
1411 xmlRegexpErrMemory(ctxt
, "adding transition");
1412 state
->maxTrans
= 0;
1415 } else if (state
->nbTrans
>= state
->maxTrans
) {
1417 state
->maxTrans
*= 2;
1418 tmp
= (xmlRegTrans
*) xmlRealloc(state
->trans
, state
->maxTrans
*
1419 sizeof(xmlRegTrans
));
1421 xmlRegexpErrMemory(ctxt
, "adding transition");
1422 state
->maxTrans
/= 2;
1427 #ifdef DEBUG_REGEXP_GRAPH
1428 printf("Add trans from %d to %d ", state
->no
, target
->no
);
1429 if (count
== REGEXP_ALL_COUNTER
)
1430 printf("all transition\n");
1431 else if (count
>= 0)
1432 printf("count based %d\n", count
);
1433 else if (counter
>= 0)
1434 printf("counted %d\n", counter
);
1435 else if (atom
== NULL
)
1436 printf("epsilon transition\n");
1437 else if (atom
!= NULL
)
1438 xmlRegPrintAtom(stdout
, atom
);
1441 state
->trans
[state
->nbTrans
].atom
= atom
;
1442 state
->trans
[state
->nbTrans
].to
= target
->no
;
1443 state
->trans
[state
->nbTrans
].counter
= counter
;
1444 state
->trans
[state
->nbTrans
].count
= count
;
1445 state
->trans
[state
->nbTrans
].nd
= 0;
1447 xmlRegStateAddTransTo(ctxt
, target
, state
->no
);
1451 xmlRegStatePush(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
) {
1452 if (state
== NULL
) return(-1);
1453 if (ctxt
->maxStates
== 0) {
1454 ctxt
->maxStates
= 4;
1455 ctxt
->states
= (xmlRegStatePtr
*) xmlMalloc(ctxt
->maxStates
*
1456 sizeof(xmlRegStatePtr
));
1457 if (ctxt
->states
== NULL
) {
1458 xmlRegexpErrMemory(ctxt
, "adding state");
1459 ctxt
->maxStates
= 0;
1462 } else if (ctxt
->nbStates
>= ctxt
->maxStates
) {
1463 xmlRegStatePtr
*tmp
;
1464 ctxt
->maxStates
*= 2;
1465 tmp
= (xmlRegStatePtr
*) xmlRealloc(ctxt
->states
, ctxt
->maxStates
*
1466 sizeof(xmlRegStatePtr
));
1468 xmlRegexpErrMemory(ctxt
, "adding state");
1469 ctxt
->maxStates
/= 2;
1474 state
->no
= ctxt
->nbStates
;
1475 ctxt
->states
[ctxt
->nbStates
++] = state
;
1480 * xmlFAGenerateAllTransition:
1481 * @ctxt: a regexp parser context
1482 * @from: the from state
1483 * @to: the target state or NULL for building a new one
1488 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt
,
1489 xmlRegStatePtr from
, xmlRegStatePtr to
,
1492 to
= xmlRegNewState(ctxt
);
1493 xmlRegStatePush(ctxt
, to
);
1497 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_LAX_COUNTER
);
1499 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_COUNTER
);
1503 * xmlFAGenerateEpsilonTransition:
1504 * @ctxt: a regexp parser context
1505 * @from: the from state
1506 * @to: the target state or NULL for building a new one
1510 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1511 xmlRegStatePtr from
, xmlRegStatePtr to
) {
1513 to
= xmlRegNewState(ctxt
);
1514 xmlRegStatePush(ctxt
, to
);
1517 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, -1);
1521 * xmlFAGenerateCountedEpsilonTransition:
1522 * @ctxt: a regexp parser context
1523 * @from: the from state
1524 * @to: the target state or NULL for building a new one
1525 * counter: the counter for that transition
1529 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1530 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1532 to
= xmlRegNewState(ctxt
);
1533 xmlRegStatePush(ctxt
, to
);
1536 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, counter
, -1);
1540 * xmlFAGenerateCountedTransition:
1541 * @ctxt: a regexp parser context
1542 * @from: the from state
1543 * @to: the target state or NULL for building a new one
1544 * counter: the counter for that transition
1548 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt
,
1549 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1551 to
= xmlRegNewState(ctxt
);
1552 xmlRegStatePush(ctxt
, to
);
1555 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, counter
);
1559 * xmlFAGenerateTransitions:
1560 * @ctxt: a regexp parser context
1561 * @from: the from state
1562 * @to: the target state or NULL for building a new one
1563 * @atom: the atom generating the transition
1565 * Returns 0 if success and -1 in case of error.
1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr from
,
1569 xmlRegStatePtr to
, xmlRegAtomPtr atom
) {
1574 ERROR("generate transition: atom == NULL");
1577 if (atom
->type
== XML_REGEXP_SUBREG
) {
1579 * this is a subexpression handling one should not need to
1580 * create a new node except for XML_REGEXP_QUANT_RANGE.
1582 if (xmlRegAtomPush(ctxt
, atom
) < 0) {
1585 if ((to
!= NULL
) && (atom
->stop
!= to
) &&
1586 (atom
->quant
!= XML_REGEXP_QUANT_RANGE
)) {
1588 * Generate an epsilon transition to link to the target
1590 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, to
);
1592 } else if ((to
== NULL
) && (atom
->quant
!= XML_REGEXP_QUANT_RANGE
) &&
1593 (atom
->quant
!= XML_REGEXP_QUANT_ONCE
)) {
1594 to
= xmlRegNewState(ctxt
);
1595 xmlRegStatePush(ctxt
, to
);
1597 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, to
);
1600 switch (atom
->quant
) {
1601 case XML_REGEXP_QUANT_OPT
:
1602 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1604 * transition done to the state after end of atom.
1605 * 1. set transition from atom start to new state
1606 * 2. set transition from atom end to this state.
1609 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, 0);
1610 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
,
1613 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, to
);
1616 case XML_REGEXP_QUANT_MULT
:
1617 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1618 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, atom
->stop
);
1619 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1621 case XML_REGEXP_QUANT_PLUS
:
1622 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1623 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1625 case XML_REGEXP_QUANT_RANGE
: {
1627 xmlRegStatePtr inter
, newstate
;
1630 * create the final state now if needed
1635 newstate
= xmlRegNewState(ctxt
);
1636 xmlRegStatePush(ctxt
, newstate
);
1640 * The principle here is to use counted transition
1641 * to avoid explosion in the number of states in the
1642 * graph. This is clearly more complex but should not
1643 * be exploitable at runtime.
1645 if ((atom
->min
== 0) && (atom
->start0
== NULL
)) {
1648 * duplicate a transition based on atom to count next
1649 * occurrences after 1. We cannot loop to atom->start
1650 * directly because we need an epsilon transition to
1653 /* ???? For some reason it seems we never reach that
1654 case, I suppose this got optimized out before when
1655 building the automata */
1656 copy
= xmlRegCopyAtom(ctxt
, atom
);
1659 copy
->quant
= XML_REGEXP_QUANT_ONCE
;
1663 if (xmlFAGenerateTransitions(ctxt
, atom
->start
, NULL
, copy
)
1666 inter
= ctxt
->state
;
1667 counter
= xmlRegGetCounter(ctxt
);
1668 ctxt
->counters
[counter
].min
= atom
->min
- 1;
1669 ctxt
->counters
[counter
].max
= atom
->max
- 1;
1670 /* count the number of times we see it again */
1671 xmlFAGenerateCountedEpsilonTransition(ctxt
, inter
,
1672 atom
->stop
, counter
);
1673 /* allow a way out based on the count */
1674 xmlFAGenerateCountedTransition(ctxt
, inter
,
1676 /* and also allow a direct exit for 0 */
1677 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
,
1681 * either we need the atom at least once or there
1682 * is an atom->start0 allowing to easily plug the
1683 * epsilon transition.
1685 counter
= xmlRegGetCounter(ctxt
);
1686 ctxt
->counters
[counter
].min
= atom
->min
- 1;
1687 ctxt
->counters
[counter
].max
= atom
->max
- 1;
1688 /* allow a way out based on the count */
1689 xmlFAGenerateCountedTransition(ctxt
, atom
->stop
,
1691 /* count the number of times we see it again */
1692 xmlFAGenerateCountedEpsilonTransition(ctxt
, atom
->stop
,
1693 atom
->start
, counter
);
1694 /* and if needed allow a direct exit for 0 */
1696 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start0
,
1702 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1703 ctxt
->state
= newstate
;
1710 if ((atom
->min
== 0) && (atom
->max
== 0) &&
1711 (atom
->quant
== XML_REGEXP_QUANT_RANGE
)) {
1713 * we can discard the atom and generate an epsilon transition instead
1716 to
= xmlRegNewState(ctxt
);
1718 xmlRegStatePush(ctxt
, to
);
1723 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1725 xmlRegFreeAtom(atom
);
1729 to
= xmlRegNewState(ctxt
);
1731 xmlRegStatePush(ctxt
, to
);
1737 if ((atom
->quant
== XML_REGEXP_QUANT_MULT
) ||
1738 (atom
->quant
== XML_REGEXP_QUANT_PLUS
)) {
1740 * Do not pollute the target state by adding transitions from
1741 * it as it is likely to be the shared target of multiple branches.
1742 * So isolate with an epsilon transition.
1746 tmp
= xmlRegNewState(ctxt
);
1748 xmlRegStatePush(ctxt
, tmp
);
1752 xmlFAGenerateEpsilonTransition(ctxt
, tmp
, to
);
1755 if (xmlRegAtomPush(ctxt
, atom
) < 0) {
1758 if ((atom
->quant
== XML_REGEXP_QUANT_RANGE
) &&
1759 (atom
->min
== 0) && (atom
->max
> 0)) {
1763 atom
->quant
= XML_REGEXP_QUANT_OPT
;
1765 xmlRegStateAddTrans(ctxt
, from
, atom
, to
, -1, -1);
1767 switch (atom
->quant
) {
1768 case XML_REGEXP_QUANT_OPT
:
1769 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1770 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1772 case XML_REGEXP_QUANT_MULT
:
1773 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1774 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1775 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1777 case XML_REGEXP_QUANT_PLUS
:
1778 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1779 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1781 case XML_REGEXP_QUANT_RANGE
:
1783 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1792 * xmlFAReduceEpsilonTransitions:
1793 * @ctxt: a regexp parser context
1794 * @fromnr: the from state
1795 * @tonr: the to state
1796 * @counter: should that transition be associated to a counted
1800 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt
, int fromnr
,
1801 int tonr
, int counter
) {
1803 xmlRegStatePtr from
;
1806 #ifdef DEBUG_REGEXP_GRAPH
1807 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr
, tonr
);
1809 from
= ctxt
->states
[fromnr
];
1812 to
= ctxt
->states
[tonr
];
1815 if ((to
->mark
== XML_REGEXP_MARK_START
) ||
1816 (to
->mark
== XML_REGEXP_MARK_VISITED
))
1819 to
->mark
= XML_REGEXP_MARK_VISITED
;
1820 if (to
->type
== XML_REGEXP_FINAL_STATE
) {
1821 #ifdef DEBUG_REGEXP_GRAPH
1822 printf("State %d is final, so %d becomes final\n", tonr
, fromnr
);
1824 from
->type
= XML_REGEXP_FINAL_STATE
;
1826 for (transnr
= 0;transnr
< to
->nbTrans
;transnr
++) {
1827 if (to
->trans
[transnr
].to
< 0)
1829 if (to
->trans
[transnr
].atom
== NULL
) {
1831 * Don't remove counted transitions
1834 if (to
->trans
[transnr
].to
!= fromnr
) {
1835 if (to
->trans
[transnr
].count
>= 0) {
1836 int newto
= to
->trans
[transnr
].to
;
1838 xmlRegStateAddTrans(ctxt
, from
, NULL
,
1839 ctxt
->states
[newto
],
1840 -1, to
->trans
[transnr
].count
);
1842 #ifdef DEBUG_REGEXP_GRAPH
1843 printf("Found epsilon trans %d from %d to %d\n",
1844 transnr
, tonr
, to
->trans
[transnr
].to
);
1846 if (to
->trans
[transnr
].counter
>= 0) {
1847 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1848 to
->trans
[transnr
].to
,
1849 to
->trans
[transnr
].counter
);
1851 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1852 to
->trans
[transnr
].to
,
1858 int newto
= to
->trans
[transnr
].to
;
1860 if (to
->trans
[transnr
].counter
>= 0) {
1861 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1862 ctxt
->states
[newto
],
1863 to
->trans
[transnr
].counter
, -1);
1865 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1866 ctxt
->states
[newto
], counter
, -1);
1870 to
->mark
= XML_REGEXP_MARK_NORMAL
;
1874 * xmlFAEliminateSimpleEpsilonTransitions:
1875 * @ctxt: a regexp parser context
1877 * Eliminating general epsilon transitions can get costly in the general
1878 * algorithm due to the large amount of generated new transitions and
1879 * associated comparisons. However for simple epsilon transition used just
1880 * to separate building blocks when generating the automata this can be
1881 * reduced to state elimination:
1882 * - if there exists an epsilon from X to Y
1883 * - if there is no other transition from X
1884 * then X and Y are semantically equivalent and X can be eliminated
1885 * If X is the start state then make Y the start state, else replace the
1886 * target of all transitions to X by transitions to Y.
1888 * If X is a final state, skip it.
1889 * Otherwise it would be necessary to manipulate counters for this case when
1890 * eliminating state 2:
1891 * State 1 has a transition with an atom to state 2.
1892 * State 2 is final and has an epsilon transition to state 1.
1895 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt
) {
1896 int statenr
, i
, j
, newto
;
1897 xmlRegStatePtr state
, tmp
;
1899 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1900 state
= ctxt
->states
[statenr
];
1903 if (state
->nbTrans
!= 1)
1905 if (state
->type
== XML_REGEXP_UNREACH_STATE
||
1906 state
->type
== XML_REGEXP_FINAL_STATE
)
1908 /* is the only transition out a basic transition */
1909 if ((state
->trans
[0].atom
== NULL
) &&
1910 (state
->trans
[0].to
>= 0) &&
1911 (state
->trans
[0].to
!= statenr
) &&
1912 (state
->trans
[0].counter
< 0) &&
1913 (state
->trans
[0].count
< 0)) {
1914 newto
= state
->trans
[0].to
;
1916 if (state
->type
== XML_REGEXP_START_STATE
) {
1917 #ifdef DEBUG_REGEXP_GRAPH
1918 printf("Found simple epsilon trans from start %d to %d\n",
1922 #ifdef DEBUG_REGEXP_GRAPH
1923 printf("Found simple epsilon trans from %d to %d\n",
1926 for (i
= 0;i
< state
->nbTransTo
;i
++) {
1927 tmp
= ctxt
->states
[state
->transTo
[i
]];
1928 for (j
= 0;j
< tmp
->nbTrans
;j
++) {
1929 if (tmp
->trans
[j
].to
== statenr
) {
1930 #ifdef DEBUG_REGEXP_GRAPH
1931 printf("Changed transition %d on %d to go to %d\n",
1934 tmp
->trans
[j
].to
= -1;
1935 xmlRegStateAddTrans(ctxt
, tmp
, tmp
->trans
[j
].atom
,
1936 ctxt
->states
[newto
],
1937 tmp
->trans
[j
].counter
,
1938 tmp
->trans
[j
].count
);
1942 if (state
->type
== XML_REGEXP_FINAL_STATE
)
1943 ctxt
->states
[newto
]->type
= XML_REGEXP_FINAL_STATE
;
1944 /* eliminate the transition completely */
1947 state
->type
= XML_REGEXP_UNREACH_STATE
;
1955 * xmlFAEliminateEpsilonTransitions:
1956 * @ctxt: a regexp parser context
1960 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt
) {
1961 int statenr
, transnr
;
1962 xmlRegStatePtr state
;
1965 if (ctxt
->states
== NULL
) return;
1968 * Eliminate simple epsilon transition and the associated unreachable
1971 xmlFAEliminateSimpleEpsilonTransitions(ctxt
);
1972 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1973 state
= ctxt
->states
[statenr
];
1974 if ((state
!= NULL
) && (state
->type
== XML_REGEXP_UNREACH_STATE
)) {
1975 #ifdef DEBUG_REGEXP_GRAPH
1976 printf("Removed unreachable state %d\n", statenr
);
1978 xmlRegFreeState(state
);
1979 ctxt
->states
[statenr
] = NULL
;
1986 * Build the completed transitions bypassing the epsilons
1987 * Use a marking algorithm to avoid loops
1988 * Mark sink states too.
1989 * Process from the latest states backward to the start when
1990 * there is long cascading epsilon chains this minimize the
1991 * recursions and transition compares when adding the new ones
1993 for (statenr
= ctxt
->nbStates
- 1;statenr
>= 0;statenr
--) {
1994 state
= ctxt
->states
[statenr
];
1997 if ((state
->nbTrans
== 0) &&
1998 (state
->type
!= XML_REGEXP_FINAL_STATE
)) {
1999 state
->type
= XML_REGEXP_SINK_STATE
;
2001 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2002 if ((state
->trans
[transnr
].atom
== NULL
) &&
2003 (state
->trans
[transnr
].to
>= 0)) {
2004 if (state
->trans
[transnr
].to
== statenr
) {
2005 state
->trans
[transnr
].to
= -1;
2006 #ifdef DEBUG_REGEXP_GRAPH
2007 printf("Removed loopback epsilon trans %d on %d\n",
2010 } else if (state
->trans
[transnr
].count
< 0) {
2011 int newto
= state
->trans
[transnr
].to
;
2013 #ifdef DEBUG_REGEXP_GRAPH
2014 printf("Found epsilon trans %d from %d to %d\n",
2015 transnr
, statenr
, newto
);
2018 state
->trans
[transnr
].to
= -2;
2019 state
->mark
= XML_REGEXP_MARK_START
;
2020 xmlFAReduceEpsilonTransitions(ctxt
, statenr
,
2021 newto
, state
->trans
[transnr
].counter
);
2022 state
->mark
= XML_REGEXP_MARK_NORMAL
;
2023 #ifdef DEBUG_REGEXP_GRAPH
2025 printf("Found counted transition %d on %d\n",
2033 * Eliminate the epsilon transitions
2036 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2037 state
= ctxt
->states
[statenr
];
2040 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2041 xmlRegTransPtr trans
= &(state
->trans
[transnr
]);
2042 if ((trans
->atom
== NULL
) &&
2043 (trans
->count
< 0) &&
2052 * Use this pass to detect unreachable states too
2054 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2055 state
= ctxt
->states
[statenr
];
2057 state
->reached
= XML_REGEXP_MARK_NORMAL
;
2059 state
= ctxt
->states
[0];
2061 state
->reached
= XML_REGEXP_MARK_START
;
2062 while (state
!= NULL
) {
2063 xmlRegStatePtr target
= NULL
;
2064 state
->reached
= XML_REGEXP_MARK_VISITED
;
2066 * Mark all states reachable from the current reachable state
2068 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2069 if ((state
->trans
[transnr
].to
>= 0) &&
2070 ((state
->trans
[transnr
].atom
!= NULL
) ||
2071 (state
->trans
[transnr
].count
>= 0))) {
2072 int newto
= state
->trans
[transnr
].to
;
2074 if (ctxt
->states
[newto
] == NULL
)
2076 if (ctxt
->states
[newto
]->reached
== XML_REGEXP_MARK_NORMAL
) {
2077 ctxt
->states
[newto
]->reached
= XML_REGEXP_MARK_START
;
2078 target
= ctxt
->states
[newto
];
2084 * find the next accessible state not explored
2086 if (target
== NULL
) {
2087 for (statenr
= 1;statenr
< ctxt
->nbStates
;statenr
++) {
2088 state
= ctxt
->states
[statenr
];
2089 if ((state
!= NULL
) && (state
->reached
==
2090 XML_REGEXP_MARK_START
)) {
2098 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2099 state
= ctxt
->states
[statenr
];
2100 if ((state
!= NULL
) && (state
->reached
== XML_REGEXP_MARK_NORMAL
)) {
2101 #ifdef DEBUG_REGEXP_GRAPH
2102 printf("Removed unreachable state %d\n", statenr
);
2104 xmlRegFreeState(state
);
2105 ctxt
->states
[statenr
] = NULL
;
2112 xmlFACompareRanges(xmlRegRangePtr range1
, xmlRegRangePtr range2
) {
2115 if ((range1
->type
== XML_REGEXP_RANGES
) ||
2116 (range2
->type
== XML_REGEXP_RANGES
) ||
2117 (range2
->type
== XML_REGEXP_SUBREG
) ||
2118 (range1
->type
== XML_REGEXP_SUBREG
) ||
2119 (range1
->type
== XML_REGEXP_STRING
) ||
2120 (range2
->type
== XML_REGEXP_STRING
))
2123 /* put them in order */
2124 if (range1
->type
> range2
->type
) {
2131 if ((range1
->type
== XML_REGEXP_ANYCHAR
) ||
2132 (range2
->type
== XML_REGEXP_ANYCHAR
)) {
2134 } else if ((range1
->type
== XML_REGEXP_EPSILON
) ||
2135 (range2
->type
== XML_REGEXP_EPSILON
)) {
2137 } else if (range1
->type
== range2
->type
) {
2138 if (range1
->type
!= XML_REGEXP_CHARVAL
)
2140 else if ((range1
->end
< range2
->start
) ||
2141 (range2
->end
< range1
->start
))
2145 } else if (range1
->type
== XML_REGEXP_CHARVAL
) {
2150 * just check all codepoints in the range for acceptance,
2151 * this is usually way cheaper since done only once at
2152 * compilation than testing over and over at runtime or
2153 * pushing too many states when evaluating.
2155 if (((range1
->neg
== 0) && (range2
->neg
!= 0)) ||
2156 ((range1
->neg
!= 0) && (range2
->neg
== 0)))
2159 for (codepoint
= range1
->start
;codepoint
<= range1
->end
;codepoint
++) {
2160 ret
= xmlRegCheckCharacterRange(range2
->type
, codepoint
,
2161 0, range2
->start
, range2
->end
,
2165 if (((neg
== 1) && (ret
== 0)) ||
2166 ((neg
== 0) && (ret
== 1)))
2170 } else if ((range1
->type
== XML_REGEXP_BLOCK_NAME
) ||
2171 (range2
->type
== XML_REGEXP_BLOCK_NAME
)) {
2172 if (range1
->type
== range2
->type
) {
2173 ret
= xmlStrEqual(range1
->blockName
, range2
->blockName
);
2176 * comparing a block range with anything else is way
2177 * too costly, and maintaining the table is like too much
2178 * memory too, so let's force the automata to save state
2183 } else if ((range1
->type
< XML_REGEXP_LETTER
) ||
2184 (range2
->type
< XML_REGEXP_LETTER
)) {
2185 if ((range1
->type
== XML_REGEXP_ANYSPACE
) &&
2186 (range2
->type
== XML_REGEXP_NOTSPACE
))
2188 else if ((range1
->type
== XML_REGEXP_INITNAME
) &&
2189 (range2
->type
== XML_REGEXP_NOTINITNAME
))
2191 else if ((range1
->type
== XML_REGEXP_NAMECHAR
) &&
2192 (range2
->type
== XML_REGEXP_NOTNAMECHAR
))
2194 else if ((range1
->type
== XML_REGEXP_DECIMAL
) &&
2195 (range2
->type
== XML_REGEXP_NOTDECIMAL
))
2197 else if ((range1
->type
== XML_REGEXP_REALCHAR
) &&
2198 (range2
->type
== XML_REGEXP_NOTREALCHAR
))
2201 /* same thing to limit complexity */
2206 /* range1->type < range2->type here */
2207 switch (range1
->type
) {
2208 case XML_REGEXP_LETTER
:
2209 /* all disjoint except in the subgroups */
2210 if ((range2
->type
== XML_REGEXP_LETTER_UPPERCASE
) ||
2211 (range2
->type
== XML_REGEXP_LETTER_LOWERCASE
) ||
2212 (range2
->type
== XML_REGEXP_LETTER_TITLECASE
) ||
2213 (range2
->type
== XML_REGEXP_LETTER_MODIFIER
) ||
2214 (range2
->type
== XML_REGEXP_LETTER_OTHERS
))
2217 case XML_REGEXP_MARK
:
2218 if ((range2
->type
== XML_REGEXP_MARK_NONSPACING
) ||
2219 (range2
->type
== XML_REGEXP_MARK_SPACECOMBINING
) ||
2220 (range2
->type
== XML_REGEXP_MARK_ENCLOSING
))
2223 case XML_REGEXP_NUMBER
:
2224 if ((range2
->type
== XML_REGEXP_NUMBER_DECIMAL
) ||
2225 (range2
->type
== XML_REGEXP_NUMBER_LETTER
) ||
2226 (range2
->type
== XML_REGEXP_NUMBER_OTHERS
))
2229 case XML_REGEXP_PUNCT
:
2230 if ((range2
->type
== XML_REGEXP_PUNCT_CONNECTOR
) ||
2231 (range2
->type
== XML_REGEXP_PUNCT_DASH
) ||
2232 (range2
->type
== XML_REGEXP_PUNCT_OPEN
) ||
2233 (range2
->type
== XML_REGEXP_PUNCT_CLOSE
) ||
2234 (range2
->type
== XML_REGEXP_PUNCT_INITQUOTE
) ||
2235 (range2
->type
== XML_REGEXP_PUNCT_FINQUOTE
) ||
2236 (range2
->type
== XML_REGEXP_PUNCT_OTHERS
))
2239 case XML_REGEXP_SEPAR
:
2240 if ((range2
->type
== XML_REGEXP_SEPAR_SPACE
) ||
2241 (range2
->type
== XML_REGEXP_SEPAR_LINE
) ||
2242 (range2
->type
== XML_REGEXP_SEPAR_PARA
))
2245 case XML_REGEXP_SYMBOL
:
2246 if ((range2
->type
== XML_REGEXP_SYMBOL_MATH
) ||
2247 (range2
->type
== XML_REGEXP_SYMBOL_CURRENCY
) ||
2248 (range2
->type
== XML_REGEXP_SYMBOL_MODIFIER
) ||
2249 (range2
->type
== XML_REGEXP_SYMBOL_OTHERS
))
2252 case XML_REGEXP_OTHER
:
2253 if ((range2
->type
== XML_REGEXP_OTHER_CONTROL
) ||
2254 (range2
->type
== XML_REGEXP_OTHER_FORMAT
) ||
2255 (range2
->type
== XML_REGEXP_OTHER_PRIVATE
))
2259 if ((range2
->type
>= XML_REGEXP_LETTER
) &&
2260 (range2
->type
< XML_REGEXP_BLOCK_NAME
))
2268 if (((range1
->neg
== 0) && (range2
->neg
!= 0)) ||
2269 ((range1
->neg
!= 0) && (range2
->neg
== 0)))
2275 * xmlFACompareAtomTypes:
2276 * @type1: an atom type
2277 * @type2: an atom type
2279 * Compares two atoms type to check whether they intersect in some ways,
2280 * this is used by xmlFACompareAtoms only
2282 * Returns 1 if they may intersect and 0 otherwise
2285 xmlFACompareAtomTypes(xmlRegAtomType type1
, xmlRegAtomType type2
) {
2286 if ((type1
== XML_REGEXP_EPSILON
) ||
2287 (type1
== XML_REGEXP_CHARVAL
) ||
2288 (type1
== XML_REGEXP_RANGES
) ||
2289 (type1
== XML_REGEXP_SUBREG
) ||
2290 (type1
== XML_REGEXP_STRING
) ||
2291 (type1
== XML_REGEXP_ANYCHAR
))
2293 if ((type2
== XML_REGEXP_EPSILON
) ||
2294 (type2
== XML_REGEXP_CHARVAL
) ||
2295 (type2
== XML_REGEXP_RANGES
) ||
2296 (type2
== XML_REGEXP_SUBREG
) ||
2297 (type2
== XML_REGEXP_STRING
) ||
2298 (type2
== XML_REGEXP_ANYCHAR
))
2301 if (type1
== type2
) return(1);
2303 /* simplify subsequent compares by making sure type1 < type2 */
2304 if (type1
> type2
) {
2305 xmlRegAtomType tmp
= type1
;
2310 case XML_REGEXP_ANYSPACE
: /* \s */
2311 /* can't be a letter, number, mark, punctuation, symbol */
2312 if ((type2
== XML_REGEXP_NOTSPACE
) ||
2313 ((type2
>= XML_REGEXP_LETTER
) &&
2314 (type2
<= XML_REGEXP_LETTER_OTHERS
)) ||
2315 ((type2
>= XML_REGEXP_NUMBER
) &&
2316 (type2
<= XML_REGEXP_NUMBER_OTHERS
)) ||
2317 ((type2
>= XML_REGEXP_MARK
) &&
2318 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2319 ((type2
>= XML_REGEXP_PUNCT
) &&
2320 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2321 ((type2
>= XML_REGEXP_SYMBOL
) &&
2322 (type2
<= XML_REGEXP_SYMBOL_OTHERS
))
2325 case XML_REGEXP_NOTSPACE
: /* \S */
2327 case XML_REGEXP_INITNAME
: /* \l */
2328 /* can't be a number, mark, separator, punctuation, symbol or other */
2329 if ((type2
== XML_REGEXP_NOTINITNAME
) ||
2330 ((type2
>= XML_REGEXP_NUMBER
) &&
2331 (type2
<= XML_REGEXP_NUMBER_OTHERS
)) ||
2332 ((type2
>= XML_REGEXP_MARK
) &&
2333 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2334 ((type2
>= XML_REGEXP_SEPAR
) &&
2335 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2336 ((type2
>= XML_REGEXP_PUNCT
) &&
2337 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2338 ((type2
>= XML_REGEXP_SYMBOL
) &&
2339 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2340 ((type2
>= XML_REGEXP_OTHER
) &&
2341 (type2
<= XML_REGEXP_OTHER_NA
))
2344 case XML_REGEXP_NOTINITNAME
: /* \L */
2346 case XML_REGEXP_NAMECHAR
: /* \c */
2347 /* can't be a mark, separator, punctuation, symbol or other */
2348 if ((type2
== XML_REGEXP_NOTNAMECHAR
) ||
2349 ((type2
>= XML_REGEXP_MARK
) &&
2350 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2351 ((type2
>= XML_REGEXP_PUNCT
) &&
2352 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2353 ((type2
>= XML_REGEXP_SEPAR
) &&
2354 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2355 ((type2
>= XML_REGEXP_SYMBOL
) &&
2356 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2357 ((type2
>= XML_REGEXP_OTHER
) &&
2358 (type2
<= XML_REGEXP_OTHER_NA
))
2361 case XML_REGEXP_NOTNAMECHAR
: /* \C */
2363 case XML_REGEXP_DECIMAL
: /* \d */
2364 /* can't be a letter, mark, separator, punctuation, symbol or other */
2365 if ((type2
== XML_REGEXP_NOTDECIMAL
) ||
2366 (type2
== XML_REGEXP_REALCHAR
) ||
2367 ((type2
>= XML_REGEXP_LETTER
) &&
2368 (type2
<= XML_REGEXP_LETTER_OTHERS
)) ||
2369 ((type2
>= XML_REGEXP_MARK
) &&
2370 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2371 ((type2
>= XML_REGEXP_PUNCT
) &&
2372 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2373 ((type2
>= XML_REGEXP_SEPAR
) &&
2374 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2375 ((type2
>= XML_REGEXP_SYMBOL
) &&
2376 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2377 ((type2
>= XML_REGEXP_OTHER
) &&
2378 (type2
<= XML_REGEXP_OTHER_NA
))
2381 case XML_REGEXP_NOTDECIMAL
: /* \D */
2383 case XML_REGEXP_REALCHAR
: /* \w */
2384 /* can't be a mark, separator, punctuation, symbol or other */
2385 if ((type2
== XML_REGEXP_NOTDECIMAL
) ||
2386 ((type2
>= XML_REGEXP_MARK
) &&
2387 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2388 ((type2
>= XML_REGEXP_PUNCT
) &&
2389 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2390 ((type2
>= XML_REGEXP_SEPAR
) &&
2391 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2392 ((type2
>= XML_REGEXP_SYMBOL
) &&
2393 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2394 ((type2
>= XML_REGEXP_OTHER
) &&
2395 (type2
<= XML_REGEXP_OTHER_NA
))
2398 case XML_REGEXP_NOTREALCHAR
: /* \W */
2401 * at that point we know both type 1 and type2 are from
2402 * character categories are ordered and are different,
2403 * it becomes simple because this is a partition
2405 case XML_REGEXP_LETTER
:
2406 if (type2
<= XML_REGEXP_LETTER_OTHERS
)
2409 case XML_REGEXP_LETTER_UPPERCASE
:
2410 case XML_REGEXP_LETTER_LOWERCASE
:
2411 case XML_REGEXP_LETTER_TITLECASE
:
2412 case XML_REGEXP_LETTER_MODIFIER
:
2413 case XML_REGEXP_LETTER_OTHERS
:
2415 case XML_REGEXP_MARK
:
2416 if (type2
<= XML_REGEXP_MARK_ENCLOSING
)
2419 case XML_REGEXP_MARK_NONSPACING
:
2420 case XML_REGEXP_MARK_SPACECOMBINING
:
2421 case XML_REGEXP_MARK_ENCLOSING
:
2423 case XML_REGEXP_NUMBER
:
2424 if (type2
<= XML_REGEXP_NUMBER_OTHERS
)
2427 case XML_REGEXP_NUMBER_DECIMAL
:
2428 case XML_REGEXP_NUMBER_LETTER
:
2429 case XML_REGEXP_NUMBER_OTHERS
:
2431 case XML_REGEXP_PUNCT
:
2432 if (type2
<= XML_REGEXP_PUNCT_OTHERS
)
2435 case XML_REGEXP_PUNCT_CONNECTOR
:
2436 case XML_REGEXP_PUNCT_DASH
:
2437 case XML_REGEXP_PUNCT_OPEN
:
2438 case XML_REGEXP_PUNCT_CLOSE
:
2439 case XML_REGEXP_PUNCT_INITQUOTE
:
2440 case XML_REGEXP_PUNCT_FINQUOTE
:
2441 case XML_REGEXP_PUNCT_OTHERS
:
2443 case XML_REGEXP_SEPAR
:
2444 if (type2
<= XML_REGEXP_SEPAR_PARA
)
2447 case XML_REGEXP_SEPAR_SPACE
:
2448 case XML_REGEXP_SEPAR_LINE
:
2449 case XML_REGEXP_SEPAR_PARA
:
2451 case XML_REGEXP_SYMBOL
:
2452 if (type2
<= XML_REGEXP_SYMBOL_OTHERS
)
2455 case XML_REGEXP_SYMBOL_MATH
:
2456 case XML_REGEXP_SYMBOL_CURRENCY
:
2457 case XML_REGEXP_SYMBOL_MODIFIER
:
2458 case XML_REGEXP_SYMBOL_OTHERS
:
2460 case XML_REGEXP_OTHER
:
2461 if (type2
<= XML_REGEXP_OTHER_NA
)
2464 case XML_REGEXP_OTHER_CONTROL
:
2465 case XML_REGEXP_OTHER_FORMAT
:
2466 case XML_REGEXP_OTHER_PRIVATE
:
2467 case XML_REGEXP_OTHER_NA
:
2479 * @deep: if not set only compare string pointers
2481 * Compares two atoms to check whether they are the same exactly
2482 * this is used to remove equivalent transitions
2484 * Returns 1 if same and 0 otherwise
2487 xmlFAEqualAtoms(xmlRegAtomPtr atom1
, xmlRegAtomPtr atom2
, int deep
) {
2492 if ((atom1
== NULL
) || (atom2
== NULL
))
2495 if (atom1
->type
!= atom2
->type
)
2497 switch (atom1
->type
) {
2498 case XML_REGEXP_EPSILON
:
2501 case XML_REGEXP_STRING
:
2503 ret
= (atom1
->valuep
== atom2
->valuep
);
2505 ret
= xmlStrEqual((xmlChar
*)atom1
->valuep
,
2506 (xmlChar
*)atom2
->valuep
);
2508 case XML_REGEXP_CHARVAL
:
2509 ret
= (atom1
->codepoint
== atom2
->codepoint
);
2511 case XML_REGEXP_RANGES
:
2512 /* too hard to do in the general case */
2521 * xmlFACompareAtoms:
2524 * @deep: if not set only compare string pointers
2526 * Compares two atoms to check whether they intersect in some ways,
2527 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2529 * Returns 1 if yes and 0 otherwise
2532 xmlFACompareAtoms(xmlRegAtomPtr atom1
, xmlRegAtomPtr atom2
, int deep
) {
2537 if ((atom1
== NULL
) || (atom2
== NULL
))
2540 if ((atom1
->type
== XML_REGEXP_ANYCHAR
) ||
2541 (atom2
->type
== XML_REGEXP_ANYCHAR
))
2544 if (atom1
->type
> atom2
->type
) {
2550 if (atom1
->type
!= atom2
->type
) {
2551 ret
= xmlFACompareAtomTypes(atom1
->type
, atom2
->type
);
2552 /* if they can't intersect at the type level break now */
2556 switch (atom1
->type
) {
2557 case XML_REGEXP_STRING
:
2559 ret
= (atom1
->valuep
!= atom2
->valuep
);
2561 xmlChar
*val1
= (xmlChar
*)atom1
->valuep
;
2562 xmlChar
*val2
= (xmlChar
*)atom2
->valuep
;
2563 int compound1
= (xmlStrchr(val1
, '|') != NULL
);
2564 int compound2
= (xmlStrchr(val2
, '|') != NULL
);
2566 /* Ignore negative match flag for ##other namespaces */
2567 if (compound1
!= compound2
)
2570 ret
= xmlRegStrEqualWildcard(val1
, val2
);
2573 case XML_REGEXP_EPSILON
:
2574 goto not_determinist
;
2575 case XML_REGEXP_CHARVAL
:
2576 if (atom2
->type
== XML_REGEXP_CHARVAL
) {
2577 ret
= (atom1
->codepoint
== atom2
->codepoint
);
2579 ret
= xmlRegCheckCharacter(atom2
, atom1
->codepoint
);
2584 case XML_REGEXP_RANGES
:
2585 if (atom2
->type
== XML_REGEXP_RANGES
) {
2587 xmlRegRangePtr r1
, r2
;
2590 * need to check that none of the ranges eventually matches
2592 for (i
= 0;i
< atom1
->nbRanges
;i
++) {
2593 for (j
= 0;j
< atom2
->nbRanges
;j
++) {
2594 r1
= atom1
->ranges
[i
];
2595 r2
= atom2
->ranges
[j
];
2596 res
= xmlFACompareRanges(r1
, r2
);
2607 goto not_determinist
;
2610 if (atom1
->neg
!= atom2
->neg
) {
2620 * xmlFARecurseDeterminism:
2621 * @ctxt: a regexp parser context
2623 * Check whether the associated regexp is determinist,
2624 * should be called after xmlFAEliminateEpsilonTransitions()
2628 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
2629 int to
, xmlRegAtomPtr atom
) {
2632 int transnr
, nbTrans
;
2638 if (state
->markd
== XML_REGEXP_MARK_VISITED
)
2641 if (ctxt
->flags
& AM_AUTOMATA_RNG
)
2645 * don't recurse on transitions potentially added in the course of
2648 nbTrans
= state
->nbTrans
;
2649 for (transnr
= 0;transnr
< nbTrans
;transnr
++) {
2650 t1
= &(state
->trans
[transnr
]);
2652 * check transitions conflicting with the one looked at
2654 if (t1
->atom
== NULL
) {
2657 state
->markd
= XML_REGEXP_MARK_VISITED
;
2658 res
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
2668 if (xmlFACompareAtoms(t1
->atom
, atom
, deep
)) {
2670 /* mark the transition as non-deterministic */
2678 * xmlFAFinishRecurseDeterminism:
2679 * @ctxt: a regexp parser context
2681 * Reset flags after checking determinism.
2684 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
) {
2685 int transnr
, nbTrans
;
2689 if (state
->markd
!= XML_REGEXP_MARK_VISITED
)
2693 nbTrans
= state
->nbTrans
;
2694 for (transnr
= 0; transnr
< nbTrans
; transnr
++) {
2695 xmlRegTransPtr t1
= &state
->trans
[transnr
];
2696 if ((t1
->atom
== NULL
) && (t1
->to
>= 0))
2697 xmlFAFinishRecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
]);
2702 * xmlFAComputesDeterminism:
2703 * @ctxt: a regexp parser context
2705 * Check whether the associated regexp is determinist,
2706 * should be called after xmlFAEliminateEpsilonTransitions()
2710 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
) {
2711 int statenr
, transnr
;
2712 xmlRegStatePtr state
;
2713 xmlRegTransPtr t1
, t2
, last
;
2718 #ifdef DEBUG_REGEXP_GRAPH
2719 printf("xmlFAComputesDeterminism\n");
2720 xmlRegPrintCtxt(stdout
, ctxt
);
2722 if (ctxt
->determinist
!= -1)
2723 return(ctxt
->determinist
);
2725 if (ctxt
->flags
& AM_AUTOMATA_RNG
)
2729 * First cleanup the automata removing cancelled transitions
2731 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2732 state
= ctxt
->states
[statenr
];
2735 if (state
->nbTrans
< 2)
2737 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2738 t1
= &(state
->trans
[transnr
]);
2740 * Determinism checks in case of counted or all transitions
2741 * will have to be handled separately
2743 if (t1
->atom
== NULL
) {
2747 if (t1
->to
== -1) /* eliminated */
2749 for (i
= 0;i
< transnr
;i
++) {
2750 t2
= &(state
->trans
[i
]);
2751 if (t2
->to
== -1) /* eliminated */
2753 if (t2
->atom
!= NULL
) {
2754 if (t1
->to
== t2
->to
) {
2756 * Here we use deep because we want to keep the
2757 * transitions which indicate a conflict
2759 if (xmlFAEqualAtoms(t1
->atom
, t2
->atom
, deep
) &&
2760 (t1
->counter
== t2
->counter
) &&
2761 (t1
->count
== t2
->count
))
2762 t2
->to
= -1; /* eliminated */
2770 * Check for all states that there aren't 2 transitions
2771 * with the same atom and a different target.
2773 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2774 state
= ctxt
->states
[statenr
];
2777 if (state
->nbTrans
< 2)
2780 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2781 t1
= &(state
->trans
[transnr
]);
2783 * Determinism checks in case of counted or all transitions
2784 * will have to be handled separately
2786 if (t1
->atom
== NULL
) {
2789 if (t1
->to
== -1) /* eliminated */
2791 for (i
= 0;i
< transnr
;i
++) {
2792 t2
= &(state
->trans
[i
]);
2793 if (t2
->to
== -1) /* eliminated */
2795 if (t2
->atom
!= NULL
) {
2797 * But here we don't use deep because we want to
2798 * find transitions which indicate a conflict
2800 if (xmlFACompareAtoms(t1
->atom
, t2
->atom
, 1)) {
2802 /* mark the transitions as non-deterministic ones */
2807 } else if (t1
->to
!= -1) {
2809 * do the closure in case of remaining specific
2810 * epsilon transitions like choices or all
2812 ret
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
2814 xmlFAFinishRecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
]);
2815 /* don't shortcut the computation so all non deterministic
2816 transition get marked down
2827 /* don't shortcut the computation so all non deterministic
2828 transition get marked down
2834 * mark specifically the last non-deterministic transition
2835 * from a state since there is no need to set-up rollback
2842 /* don't shortcut the computation so all non deterministic
2843 transition get marked down
2848 ctxt
->determinist
= ret
;
2852 /************************************************************************
2854 * Routines to check input against transition atoms *
2856 ************************************************************************/
2859 xmlRegCheckCharacterRange(xmlRegAtomType type
, int codepoint
, int neg
,
2860 int start
, int end
, const xmlChar
*blockName
) {
2864 case XML_REGEXP_STRING
:
2865 case XML_REGEXP_SUBREG
:
2866 case XML_REGEXP_RANGES
:
2867 case XML_REGEXP_EPSILON
:
2869 case XML_REGEXP_ANYCHAR
:
2870 ret
= ((codepoint
!= '\n') && (codepoint
!= '\r'));
2872 case XML_REGEXP_CHARVAL
:
2873 ret
= ((codepoint
>= start
) && (codepoint
<= end
));
2875 case XML_REGEXP_NOTSPACE
:
2877 /* Falls through. */
2878 case XML_REGEXP_ANYSPACE
:
2879 ret
= ((codepoint
== '\n') || (codepoint
== '\r') ||
2880 (codepoint
== '\t') || (codepoint
== ' '));
2882 case XML_REGEXP_NOTINITNAME
:
2884 /* Falls through. */
2885 case XML_REGEXP_INITNAME
:
2886 ret
= (IS_LETTER(codepoint
) ||
2887 (codepoint
== '_') || (codepoint
== ':'));
2889 case XML_REGEXP_NOTNAMECHAR
:
2891 /* Falls through. */
2892 case XML_REGEXP_NAMECHAR
:
2893 ret
= (IS_LETTER(codepoint
) || IS_DIGIT(codepoint
) ||
2894 (codepoint
== '.') || (codepoint
== '-') ||
2895 (codepoint
== '_') || (codepoint
== ':') ||
2896 IS_COMBINING(codepoint
) || IS_EXTENDER(codepoint
));
2898 case XML_REGEXP_NOTDECIMAL
:
2900 /* Falls through. */
2901 case XML_REGEXP_DECIMAL
:
2902 ret
= xmlUCSIsCatNd(codepoint
);
2904 case XML_REGEXP_REALCHAR
:
2906 /* Falls through. */
2907 case XML_REGEXP_NOTREALCHAR
:
2908 ret
= xmlUCSIsCatP(codepoint
);
2910 ret
= xmlUCSIsCatZ(codepoint
);
2912 ret
= xmlUCSIsCatC(codepoint
);
2914 case XML_REGEXP_LETTER
:
2915 ret
= xmlUCSIsCatL(codepoint
);
2917 case XML_REGEXP_LETTER_UPPERCASE
:
2918 ret
= xmlUCSIsCatLu(codepoint
);
2920 case XML_REGEXP_LETTER_LOWERCASE
:
2921 ret
= xmlUCSIsCatLl(codepoint
);
2923 case XML_REGEXP_LETTER_TITLECASE
:
2924 ret
= xmlUCSIsCatLt(codepoint
);
2926 case XML_REGEXP_LETTER_MODIFIER
:
2927 ret
= xmlUCSIsCatLm(codepoint
);
2929 case XML_REGEXP_LETTER_OTHERS
:
2930 ret
= xmlUCSIsCatLo(codepoint
);
2932 case XML_REGEXP_MARK
:
2933 ret
= xmlUCSIsCatM(codepoint
);
2935 case XML_REGEXP_MARK_NONSPACING
:
2936 ret
= xmlUCSIsCatMn(codepoint
);
2938 case XML_REGEXP_MARK_SPACECOMBINING
:
2939 ret
= xmlUCSIsCatMc(codepoint
);
2941 case XML_REGEXP_MARK_ENCLOSING
:
2942 ret
= xmlUCSIsCatMe(codepoint
);
2944 case XML_REGEXP_NUMBER
:
2945 ret
= xmlUCSIsCatN(codepoint
);
2947 case XML_REGEXP_NUMBER_DECIMAL
:
2948 ret
= xmlUCSIsCatNd(codepoint
);
2950 case XML_REGEXP_NUMBER_LETTER
:
2951 ret
= xmlUCSIsCatNl(codepoint
);
2953 case XML_REGEXP_NUMBER_OTHERS
:
2954 ret
= xmlUCSIsCatNo(codepoint
);
2956 case XML_REGEXP_PUNCT
:
2957 ret
= xmlUCSIsCatP(codepoint
);
2959 case XML_REGEXP_PUNCT_CONNECTOR
:
2960 ret
= xmlUCSIsCatPc(codepoint
);
2962 case XML_REGEXP_PUNCT_DASH
:
2963 ret
= xmlUCSIsCatPd(codepoint
);
2965 case XML_REGEXP_PUNCT_OPEN
:
2966 ret
= xmlUCSIsCatPs(codepoint
);
2968 case XML_REGEXP_PUNCT_CLOSE
:
2969 ret
= xmlUCSIsCatPe(codepoint
);
2971 case XML_REGEXP_PUNCT_INITQUOTE
:
2972 ret
= xmlUCSIsCatPi(codepoint
);
2974 case XML_REGEXP_PUNCT_FINQUOTE
:
2975 ret
= xmlUCSIsCatPf(codepoint
);
2977 case XML_REGEXP_PUNCT_OTHERS
:
2978 ret
= xmlUCSIsCatPo(codepoint
);
2980 case XML_REGEXP_SEPAR
:
2981 ret
= xmlUCSIsCatZ(codepoint
);
2983 case XML_REGEXP_SEPAR_SPACE
:
2984 ret
= xmlUCSIsCatZs(codepoint
);
2986 case XML_REGEXP_SEPAR_LINE
:
2987 ret
= xmlUCSIsCatZl(codepoint
);
2989 case XML_REGEXP_SEPAR_PARA
:
2990 ret
= xmlUCSIsCatZp(codepoint
);
2992 case XML_REGEXP_SYMBOL
:
2993 ret
= xmlUCSIsCatS(codepoint
);
2995 case XML_REGEXP_SYMBOL_MATH
:
2996 ret
= xmlUCSIsCatSm(codepoint
);
2998 case XML_REGEXP_SYMBOL_CURRENCY
:
2999 ret
= xmlUCSIsCatSc(codepoint
);
3001 case XML_REGEXP_SYMBOL_MODIFIER
:
3002 ret
= xmlUCSIsCatSk(codepoint
);
3004 case XML_REGEXP_SYMBOL_OTHERS
:
3005 ret
= xmlUCSIsCatSo(codepoint
);
3007 case XML_REGEXP_OTHER
:
3008 ret
= xmlUCSIsCatC(codepoint
);
3010 case XML_REGEXP_OTHER_CONTROL
:
3011 ret
= xmlUCSIsCatCc(codepoint
);
3013 case XML_REGEXP_OTHER_FORMAT
:
3014 ret
= xmlUCSIsCatCf(codepoint
);
3016 case XML_REGEXP_OTHER_PRIVATE
:
3017 ret
= xmlUCSIsCatCo(codepoint
);
3019 case XML_REGEXP_OTHER_NA
:
3020 /* ret = xmlUCSIsCatCn(codepoint); */
3021 /* Seems it doesn't exist anymore in recent Unicode releases */
3024 case XML_REGEXP_BLOCK_NAME
:
3025 ret
= xmlUCSIsBlock(codepoint
, (const char *) blockName
);
3034 xmlRegCheckCharacter(xmlRegAtomPtr atom
, int codepoint
) {
3036 xmlRegRangePtr range
;
3038 if ((atom
== NULL
) || (!IS_CHAR(codepoint
)))
3041 switch (atom
->type
) {
3042 case XML_REGEXP_SUBREG
:
3043 case XML_REGEXP_EPSILON
:
3045 case XML_REGEXP_CHARVAL
:
3046 return(codepoint
== atom
->codepoint
);
3047 case XML_REGEXP_RANGES
: {
3050 for (i
= 0;i
< atom
->nbRanges
;i
++) {
3051 range
= atom
->ranges
[i
];
3052 if (range
->neg
== 2) {
3053 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3054 0, range
->start
, range
->end
,
3057 return(0); /* excluded char */
3058 } else if (range
->neg
) {
3059 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3060 0, range
->start
, range
->end
,
3067 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3068 0, range
->start
, range
->end
,
3071 accept
= 1; /* might still be excluded */
3076 case XML_REGEXP_STRING
:
3077 printf("TODO: XML_REGEXP_STRING\n");
3079 case XML_REGEXP_ANYCHAR
:
3080 case XML_REGEXP_ANYSPACE
:
3081 case XML_REGEXP_NOTSPACE
:
3082 case XML_REGEXP_INITNAME
:
3083 case XML_REGEXP_NOTINITNAME
:
3084 case XML_REGEXP_NAMECHAR
:
3085 case XML_REGEXP_NOTNAMECHAR
:
3086 case XML_REGEXP_DECIMAL
:
3087 case XML_REGEXP_NOTDECIMAL
:
3088 case XML_REGEXP_REALCHAR
:
3089 case XML_REGEXP_NOTREALCHAR
:
3090 case XML_REGEXP_LETTER
:
3091 case XML_REGEXP_LETTER_UPPERCASE
:
3092 case XML_REGEXP_LETTER_LOWERCASE
:
3093 case XML_REGEXP_LETTER_TITLECASE
:
3094 case XML_REGEXP_LETTER_MODIFIER
:
3095 case XML_REGEXP_LETTER_OTHERS
:
3096 case XML_REGEXP_MARK
:
3097 case XML_REGEXP_MARK_NONSPACING
:
3098 case XML_REGEXP_MARK_SPACECOMBINING
:
3099 case XML_REGEXP_MARK_ENCLOSING
:
3100 case XML_REGEXP_NUMBER
:
3101 case XML_REGEXP_NUMBER_DECIMAL
:
3102 case XML_REGEXP_NUMBER_LETTER
:
3103 case XML_REGEXP_NUMBER_OTHERS
:
3104 case XML_REGEXP_PUNCT
:
3105 case XML_REGEXP_PUNCT_CONNECTOR
:
3106 case XML_REGEXP_PUNCT_DASH
:
3107 case XML_REGEXP_PUNCT_OPEN
:
3108 case XML_REGEXP_PUNCT_CLOSE
:
3109 case XML_REGEXP_PUNCT_INITQUOTE
:
3110 case XML_REGEXP_PUNCT_FINQUOTE
:
3111 case XML_REGEXP_PUNCT_OTHERS
:
3112 case XML_REGEXP_SEPAR
:
3113 case XML_REGEXP_SEPAR_SPACE
:
3114 case XML_REGEXP_SEPAR_LINE
:
3115 case XML_REGEXP_SEPAR_PARA
:
3116 case XML_REGEXP_SYMBOL
:
3117 case XML_REGEXP_SYMBOL_MATH
:
3118 case XML_REGEXP_SYMBOL_CURRENCY
:
3119 case XML_REGEXP_SYMBOL_MODIFIER
:
3120 case XML_REGEXP_SYMBOL_OTHERS
:
3121 case XML_REGEXP_OTHER
:
3122 case XML_REGEXP_OTHER_CONTROL
:
3123 case XML_REGEXP_OTHER_FORMAT
:
3124 case XML_REGEXP_OTHER_PRIVATE
:
3125 case XML_REGEXP_OTHER_NA
:
3126 case XML_REGEXP_BLOCK_NAME
:
3127 ret
= xmlRegCheckCharacterRange(atom
->type
, codepoint
, 0, 0, 0,
3128 (const xmlChar
*)atom
->valuep
);
3136 /************************************************************************
3138 * Saving and restoring state of an execution context *
3140 ************************************************************************/
3142 #ifdef DEBUG_REGEXP_EXEC
3144 xmlFARegDebugExec(xmlRegExecCtxtPtr exec
) {
3145 printf("state: %d:%d:idx %d", exec
->state
->no
, exec
->transno
, exec
->index
);
3146 if (exec
->inputStack
!= NULL
) {
3149 for (i
= 0;(i
< 3) && (i
< exec
->inputStackNr
);i
++)
3150 printf("%s ", (const char *)
3151 exec
->inputStack
[exec
->inputStackNr
- (i
+ 1)].value
);
3153 printf(": %s", &(exec
->inputString
[exec
->index
]));
3160 xmlFARegExecSave(xmlRegExecCtxtPtr exec
) {
3161 #ifdef DEBUG_REGEXP_EXEC
3164 xmlFARegDebugExec(exec
);
3168 if (exec
->nbPush
> MAX_PUSH
) {
3174 if (exec
->maxRollbacks
== 0) {
3175 exec
->maxRollbacks
= 4;
3176 exec
->rollbacks
= (xmlRegExecRollback
*) xmlMalloc(exec
->maxRollbacks
*
3177 sizeof(xmlRegExecRollback
));
3178 if (exec
->rollbacks
== NULL
) {
3179 xmlRegexpErrMemory(NULL
, "saving regexp");
3180 exec
->maxRollbacks
= 0;
3183 memset(exec
->rollbacks
, 0,
3184 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
3185 } else if (exec
->nbRollbacks
>= exec
->maxRollbacks
) {
3186 xmlRegExecRollback
*tmp
;
3187 int len
= exec
->maxRollbacks
;
3189 exec
->maxRollbacks
*= 2;
3190 tmp
= (xmlRegExecRollback
*) xmlRealloc(exec
->rollbacks
,
3191 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
3193 xmlRegexpErrMemory(NULL
, "saving regexp");
3194 exec
->maxRollbacks
/= 2;
3197 exec
->rollbacks
= tmp
;
3198 tmp
= &exec
->rollbacks
[len
];
3199 memset(tmp
, 0, (exec
->maxRollbacks
- len
) * sizeof(xmlRegExecRollback
));
3201 exec
->rollbacks
[exec
->nbRollbacks
].state
= exec
->state
;
3202 exec
->rollbacks
[exec
->nbRollbacks
].index
= exec
->index
;
3203 exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
= exec
->transno
+ 1;
3204 if (exec
->comp
->nbCounters
> 0) {
3205 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3206 exec
->rollbacks
[exec
->nbRollbacks
].counts
= (int *)
3207 xmlMalloc(exec
->comp
->nbCounters
* sizeof(int));
3208 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3209 xmlRegexpErrMemory(NULL
, "saving regexp");
3214 memcpy(exec
->rollbacks
[exec
->nbRollbacks
].counts
, exec
->counts
,
3215 exec
->comp
->nbCounters
* sizeof(int));
3217 exec
->nbRollbacks
++;
3221 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec
) {
3222 if (exec
->nbRollbacks
<= 0) {
3224 #ifdef DEBUG_REGEXP_EXEC
3225 printf("rollback failed on empty stack\n");
3229 exec
->nbRollbacks
--;
3230 exec
->state
= exec
->rollbacks
[exec
->nbRollbacks
].state
;
3231 exec
->index
= exec
->rollbacks
[exec
->nbRollbacks
].index
;
3232 exec
->transno
= exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
;
3233 if (exec
->comp
->nbCounters
> 0) {
3234 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3235 fprintf(stderr
, "exec save: allocation failed");
3240 memcpy(exec
->counts
, exec
->rollbacks
[exec
->nbRollbacks
].counts
,
3241 exec
->comp
->nbCounters
* sizeof(int));
3245 #ifdef DEBUG_REGEXP_EXEC
3246 printf("restored ");
3247 xmlFARegDebugExec(exec
);
3251 /************************************************************************
3253 * Verifier, running an input against a compiled regexp *
3255 ************************************************************************/
3258 xmlFARegExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
3259 xmlRegExecCtxt execval
;
3260 xmlRegExecCtxtPtr exec
= &execval
;
3261 int ret
, codepoint
= 0, len
, deter
;
3263 exec
->inputString
= content
;
3266 exec
->determinist
= 1;
3267 exec
->maxRollbacks
= 0;
3268 exec
->nbRollbacks
= 0;
3269 exec
->rollbacks
= NULL
;
3272 exec
->state
= comp
->states
[0];
3274 exec
->transcount
= 0;
3275 exec
->inputStack
= NULL
;
3276 exec
->inputStackMax
= 0;
3277 if (comp
->nbCounters
> 0) {
3278 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int));
3279 if (exec
->counts
== NULL
) {
3280 xmlRegexpErrMemory(NULL
, "running regexp");
3283 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int));
3285 exec
->counts
= NULL
;
3286 while ((exec
->status
== 0) && (exec
->state
!= NULL
) &&
3287 ((exec
->inputString
[exec
->index
] != 0) ||
3288 ((exec
->state
!= NULL
) &&
3289 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
)))) {
3290 xmlRegTransPtr trans
;
3294 * If end of input on non-terminal state, rollback, however we may
3295 * still have epsilon like transition for counted transitions
3296 * on counters, in that case don't break too early. Additionally,
3297 * if we are working on a range like "AB{0,2}", where B is not present,
3298 * we don't want to break.
3301 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
)) {
3303 * if there is a transition, we must check if
3304 * atom allows minOccurs of 0
3306 if (exec
->transno
< exec
->state
->nbTrans
) {
3307 trans
= &exec
->state
->trans
[exec
->transno
];
3308 if (trans
->to
>=0) {
3310 if (!((atom
->min
== 0) && (atom
->max
> 0)))
3317 exec
->transcount
= 0;
3318 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
3319 trans
= &exec
->state
->trans
[exec
->transno
];
3325 if (trans
->count
>= 0) {
3327 xmlRegCounterPtr counter
;
3329 if (exec
->counts
== NULL
) {
3334 * A counted transition.
3337 count
= exec
->counts
[trans
->count
];
3338 counter
= &exec
->comp
->counters
[trans
->count
];
3339 #ifdef DEBUG_REGEXP_EXEC
3340 printf("testing count %d: val %d, min %d, max %d\n",
3341 trans
->count
, count
, counter
->min
, counter
->max
);
3343 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
3344 if ((ret
) && (counter
->min
!= counter
->max
))
3346 } else if (atom
== NULL
) {
3347 fprintf(stderr
, "epsilon transition left at runtime\n");
3350 } else if (exec
->inputString
[exec
->index
] != 0) {
3351 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
3352 ret
= xmlRegCheckCharacter(atom
, codepoint
);
3353 if ((ret
== 1) && (atom
->min
>= 0) && (atom
->max
> 0)) {
3354 xmlRegStatePtr to
= comp
->states
[trans
->to
];
3357 * this is a multiple input sequence
3358 * If there is a counter associated increment it now.
3359 * do not increment if the counter is already over the
3360 * maximum limit in which case get to next transition
3362 if (trans
->counter
>= 0) {
3363 xmlRegCounterPtr counter
;
3365 if ((exec
->counts
== NULL
) ||
3366 (exec
->comp
== NULL
) ||
3367 (exec
->comp
->counters
== NULL
)) {
3371 counter
= &exec
->comp
->counters
[trans
->counter
];
3372 if (exec
->counts
[trans
->counter
] >= counter
->max
)
3373 continue; /* for loop on transitions */
3375 /* Save before incrementing */
3376 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
3377 xmlFARegExecSave(exec
);
3379 if (trans
->counter
>= 0) {
3380 #ifdef DEBUG_REGEXP_EXEC
3381 printf("Increasing count %d\n", trans
->counter
);
3383 exec
->counts
[trans
->counter
]++;
3385 exec
->transcount
= 1;
3388 * Try to progress as much as possible on the input
3390 if (exec
->transcount
== atom
->max
) {
3395 * End of input: stop here
3397 if (exec
->inputString
[exec
->index
] == 0) {
3401 if (exec
->transcount
>= atom
->min
) {
3402 int transno
= exec
->transno
;
3403 xmlRegStatePtr state
= exec
->state
;
3406 * The transition is acceptable save it
3408 exec
->transno
= -1; /* trick */
3410 xmlFARegExecSave(exec
);
3411 exec
->transno
= transno
;
3412 exec
->state
= state
;
3414 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
3416 ret
= xmlRegCheckCharacter(atom
, codepoint
);
3419 if (exec
->transcount
< atom
->min
)
3423 * If the last check failed but one transition was found
3424 * possible, rollback
3431 if (trans
->counter
>= 0) {
3432 if (exec
->counts
== NULL
) {
3436 #ifdef DEBUG_REGEXP_EXEC
3437 printf("Decreasing count %d\n", trans
->counter
);
3439 exec
->counts
[trans
->counter
]--;
3441 } else if ((ret
== 0) && (atom
->min
== 0) && (atom
->max
> 0)) {
3443 * we don't match on the codepoint, but minOccurs of 0
3444 * says that's ok. Setting len to 0 inhibits stepping
3445 * over the codepoint.
3447 exec
->transcount
= 1;
3451 } else if ((atom
->min
== 0) && (atom
->max
> 0)) {
3452 /* another spot to match when minOccurs is 0 */
3453 exec
->transcount
= 1;
3458 if ((trans
->nd
== 1) ||
3459 ((trans
->count
>= 0) && (deter
== 0) &&
3460 (exec
->state
->nbTrans
> exec
->transno
+ 1))) {
3461 #ifdef DEBUG_REGEXP_EXEC
3463 printf("Saving on nd transition atom %d for %c at %d\n",
3464 trans
->atom
->no
, codepoint
, exec
->index
);
3466 printf("Saving on counted transition count %d for %c at %d\n",
3467 trans
->count
, codepoint
, exec
->index
);
3469 xmlFARegExecSave(exec
);
3471 if (trans
->counter
>= 0) {
3472 xmlRegCounterPtr counter
;
3474 /* make sure we don't go over the counter maximum value */
3475 if ((exec
->counts
== NULL
) ||
3476 (exec
->comp
== NULL
) ||
3477 (exec
->comp
->counters
== NULL
)) {
3481 counter
= &exec
->comp
->counters
[trans
->counter
];
3482 if (exec
->counts
[trans
->counter
] >= counter
->max
)
3483 continue; /* for loop on transitions */
3484 #ifdef DEBUG_REGEXP_EXEC
3485 printf("Increasing count %d\n", trans
->counter
);
3487 exec
->counts
[trans
->counter
]++;
3489 if ((trans
->count
>= 0) &&
3490 (trans
->count
< REGEXP_ALL_COUNTER
)) {
3491 if (exec
->counts
== NULL
) {
3495 #ifdef DEBUG_REGEXP_EXEC
3496 printf("resetting count %d on transition\n",
3499 exec
->counts
[trans
->count
] = 0;
3501 #ifdef DEBUG_REGEXP_EXEC
3502 printf("entering state %d\n", trans
->to
);
3504 exec
->state
= comp
->states
[trans
->to
];
3506 if (trans
->atom
!= NULL
) {
3510 } else if (ret
< 0) {
3515 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
3518 * Failed to find a way out
3520 exec
->determinist
= 0;
3521 #ifdef DEBUG_REGEXP_EXEC
3522 printf("rollback from state %d on %d:%c\n", exec
->state
->no
,
3523 codepoint
,codepoint
);
3525 xmlFARegExecRollBack(exec
);
3531 if (exec
->rollbacks
!= NULL
) {
3532 if (exec
->counts
!= NULL
) {
3535 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
3536 if (exec
->rollbacks
[i
].counts
!= NULL
)
3537 xmlFree(exec
->rollbacks
[i
].counts
);
3539 xmlFree(exec
->rollbacks
);
3541 if (exec
->state
== NULL
)
3543 if (exec
->counts
!= NULL
)
3544 xmlFree(exec
->counts
);
3545 if (exec
->status
== 0)
3547 if (exec
->status
== -1) {
3548 if (exec
->nbPush
> MAX_PUSH
)
3552 return(exec
->status
);
3555 /************************************************************************
3557 * Progressive interface to the verifier one atom at a time *
3559 ************************************************************************/
3561 static void testerr(xmlRegExecCtxtPtr exec
);
3565 * xmlRegNewExecCtxt:
3566 * @comp: a precompiled regular expression
3567 * @callback: a callback function used for handling progresses in the
3568 * automata matching phase
3569 * @data: the context data associated to the callback in this context
3571 * Build a context used for progressive evaluation of a regexp.
3573 * Returns the new context
3576 xmlRegNewExecCtxt(xmlRegexpPtr comp
, xmlRegExecCallbacks callback
, void *data
) {
3577 xmlRegExecCtxtPtr exec
;
3581 if ((comp
->compact
== NULL
) && (comp
->states
== NULL
))
3583 exec
= (xmlRegExecCtxtPtr
) xmlMalloc(sizeof(xmlRegExecCtxt
));
3585 xmlRegexpErrMemory(NULL
, "creating execution context");
3588 memset(exec
, 0, sizeof(xmlRegExecCtxt
));
3589 exec
->inputString
= NULL
;
3591 exec
->determinist
= 1;
3592 exec
->maxRollbacks
= 0;
3593 exec
->nbRollbacks
= 0;
3594 exec
->rollbacks
= NULL
;
3597 if (comp
->compact
== NULL
)
3598 exec
->state
= comp
->states
[0];
3600 exec
->transcount
= 0;
3601 exec
->callback
= callback
;
3603 if (comp
->nbCounters
> 0) {
3605 * For error handling, exec->counts is allocated twice the size
3606 * the second half is used to store the data in case of rollback
3608 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int)
3610 if (exec
->counts
== NULL
) {
3611 xmlRegexpErrMemory(NULL
, "creating execution context");
3615 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int) * 2);
3616 exec
->errCounts
= &exec
->counts
[comp
->nbCounters
];
3618 exec
->counts
= NULL
;
3619 exec
->errCounts
= NULL
;
3621 exec
->inputStackMax
= 0;
3622 exec
->inputStackNr
= 0;
3623 exec
->inputStack
= NULL
;
3624 exec
->errStateNo
= -1;
3625 exec
->errString
= NULL
;
3631 * xmlRegFreeExecCtxt:
3632 * @exec: a regular expression evaluation context
3634 * Free the structures associated to a regular expression evaluation context.
3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec
) {
3641 if (exec
->rollbacks
!= NULL
) {
3642 if (exec
->counts
!= NULL
) {
3645 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
3646 if (exec
->rollbacks
[i
].counts
!= NULL
)
3647 xmlFree(exec
->rollbacks
[i
].counts
);
3649 xmlFree(exec
->rollbacks
);
3651 if (exec
->counts
!= NULL
)
3652 xmlFree(exec
->counts
);
3653 if (exec
->inputStack
!= NULL
) {
3656 for (i
= 0;i
< exec
->inputStackNr
;i
++) {
3657 if (exec
->inputStack
[i
].value
!= NULL
)
3658 xmlFree(exec
->inputStack
[i
].value
);
3660 xmlFree(exec
->inputStack
);
3662 if (exec
->errString
!= NULL
)
3663 xmlFree(exec
->errString
);
3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
3671 printf("saving value: %d:%s\n", exec
->inputStackNr
, value
);
3673 if (exec
->inputStackMax
== 0) {
3674 exec
->inputStackMax
= 4;
3675 exec
->inputStack
= (xmlRegInputTokenPtr
)
3676 xmlMalloc(exec
->inputStackMax
* sizeof(xmlRegInputToken
));
3677 if (exec
->inputStack
== NULL
) {
3678 xmlRegexpErrMemory(NULL
, "pushing input string");
3679 exec
->inputStackMax
= 0;
3682 } else if (exec
->inputStackNr
+ 1 >= exec
->inputStackMax
) {
3683 xmlRegInputTokenPtr tmp
;
3685 exec
->inputStackMax
*= 2;
3686 tmp
= (xmlRegInputTokenPtr
) xmlRealloc(exec
->inputStack
,
3687 exec
->inputStackMax
* sizeof(xmlRegInputToken
));
3689 xmlRegexpErrMemory(NULL
, "pushing input string");
3690 exec
->inputStackMax
/= 2;
3693 exec
->inputStack
= tmp
;
3695 exec
->inputStack
[exec
->inputStackNr
].value
= xmlStrdup(value
);
3696 exec
->inputStack
[exec
->inputStackNr
].data
= data
;
3697 exec
->inputStackNr
++;
3698 exec
->inputStack
[exec
->inputStackNr
].value
= NULL
;
3699 exec
->inputStack
[exec
->inputStackNr
].data
= NULL
;
3703 * xmlRegStrEqualWildcard:
3704 * @expStr: the string to be evaluated
3705 * @valStr: the validation string
3707 * Checks if both strings are equal or have the same content. "*"
3708 * can be used as a wildcard in @valStr; "|" is used as a separator of
3709 * substrings in both @expStr and @valStr.
3711 * Returns 1 if the comparison is satisfied and the number of substrings
3712 * is equal, 0 otherwise.
3716 xmlRegStrEqualWildcard(const xmlChar
*expStr
, const xmlChar
*valStr
) {
3717 if (expStr
== valStr
) return(1);
3718 if (expStr
== NULL
) return(0);
3719 if (valStr
== NULL
) return(0);
3722 * Eval if we have a wildcard for the current item.
3724 if (*expStr
!= *valStr
) {
3725 /* if one of them starts with a wildcard make valStr be it */
3726 if (*valStr
== '*') {
3733 if ((*valStr
!= 0) && (*expStr
!= 0) && (*expStr
++ == '*')) {
3735 if (*valStr
== XML_REG_STRING_SEPARATOR
)
3738 } while (*valStr
!= 0);
3745 } while (*valStr
!= 0);
3753 * xmlRegCompactPushString:
3754 * @exec: a regexp execution context
3755 * @comp: the precompiled exec with a compact table
3756 * @value: a string token input
3757 * @data: data associated to the token to reuse in callbacks
3759 * Push one input token in the execution context
3761 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3762 * a negative value in case of error.
3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec
,
3767 const xmlChar
*value
,
3769 int state
= exec
->index
;
3772 if ((comp
== NULL
) || (comp
->compact
== NULL
) || (comp
->stringMap
== NULL
))
3775 if (value
== NULL
) {
3777 * are we at a final state ?
3779 if (comp
->compact
[state
* (comp
->nbstrings
+ 1)] ==
3780 XML_REGEXP_FINAL_STATE
)
3786 printf("value pushed: %s\n", value
);
3790 * Examine all outside transitions from current state
3792 for (i
= 0;i
< comp
->nbstrings
;i
++) {
3793 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
3794 if ((target
> 0) && (target
<= comp
->nbstates
)) {
3795 target
--; /* to avoid 0 */
3796 if (xmlRegStrEqualWildcard(comp
->stringMap
[i
], value
)) {
3797 exec
->index
= target
;
3798 if ((exec
->callback
!= NULL
) && (comp
->transdata
!= NULL
)) {
3799 exec
->callback(exec
->data
, value
,
3800 comp
->transdata
[state
* comp
->nbstrings
+ i
], data
);
3803 printf("entering state %d\n", target
);
3805 if (comp
->compact
[target
* (comp
->nbstrings
+ 1)] ==
3806 XML_REGEXP_SINK_STATE
)
3809 if (comp
->compact
[target
* (comp
->nbstrings
+ 1)] ==
3810 XML_REGEXP_FINAL_STATE
)
3817 * Failed to find an exit transition out from current state for the
3821 printf("failed to find a transition for %s on state %d\n", value
, state
);
3824 if (exec
->errString
!= NULL
)
3825 xmlFree(exec
->errString
);
3826 exec
->errString
= xmlStrdup(value
);
3827 exec
->errStateNo
= state
;
3836 * xmlRegExecPushStringInternal:
3837 * @exec: a regexp execution context or NULL to indicate the end
3838 * @value: a string token input
3839 * @data: data associated to the token to reuse in callbacks
3840 * @compound: value was assembled from 2 strings
3842 * Push one input token in the execution context
3844 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3845 * a negative value in case of error.
3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
3849 void *data
, int compound
) {
3850 xmlRegTransPtr trans
;
3858 if (exec
->comp
== NULL
)
3860 if (exec
->status
!= 0)
3861 return(exec
->status
);
3863 if (exec
->comp
->compact
!= NULL
)
3864 return(xmlRegCompactPushString(exec
, exec
->comp
, value
, data
));
3866 if (value
== NULL
) {
3867 if (exec
->state
->type
== XML_REGEXP_FINAL_STATE
)
3873 printf("value pushed: %s\n", value
);
3876 * If we have an active rollback stack push the new value there
3877 * and get back to where we were left
3879 if ((value
!= NULL
) && (exec
->inputStackNr
> 0)) {
3880 xmlFARegExecSaveInputString(exec
, value
, data
);
3881 value
= exec
->inputStack
[exec
->index
].value
;
3882 data
= exec
->inputStack
[exec
->index
].data
;
3884 printf("value loaded: %s\n", value
);
3888 while ((exec
->status
== 0) &&
3891 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
)))) {
3894 * End of input on non-terminal state, rollback, however we may
3895 * still have epsilon like transition for counted transitions
3896 * on counters, in that case don't break too early.
3898 if ((value
== NULL
) && (exec
->counts
== NULL
))
3901 exec
->transcount
= 0;
3902 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
3903 trans
= &exec
->state
->trans
[exec
->transno
];
3908 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
3912 xmlRegCounterPtr counter
;
3917 printf("testing all lax %d\n", trans
->count
);
3920 * Check all counted transitions from the current state
3922 if ((value
== NULL
) && (final
)) {
3924 } else if (value
!= NULL
) {
3925 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
3926 t
= &exec
->state
->trans
[i
];
3927 if ((t
->counter
< 0) || (t
== trans
))
3929 counter
= &exec
->comp
->counters
[t
->counter
];
3930 count
= exec
->counts
[t
->counter
];
3931 if ((count
< counter
->max
) &&
3932 (t
->atom
!= NULL
) &&
3933 (xmlStrEqual(value
, t
->atom
->valuep
))) {
3937 if ((count
>= counter
->min
) &&
3938 (count
< counter
->max
) &&
3939 (t
->atom
!= NULL
) &&
3940 (xmlStrEqual(value
, t
->atom
->valuep
))) {
3946 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
3950 xmlRegCounterPtr counter
;
3955 printf("testing all %d\n", trans
->count
);
3958 * Check all counted transitions from the current state
3960 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
3961 t
= &exec
->state
->trans
[i
];
3962 if ((t
->counter
< 0) || (t
== trans
))
3964 counter
= &exec
->comp
->counters
[t
->counter
];
3965 count
= exec
->counts
[t
->counter
];
3966 if ((count
< counter
->min
) || (count
> counter
->max
)) {
3971 } else if (trans
->count
>= 0) {
3973 xmlRegCounterPtr counter
;
3976 * A counted transition.
3979 count
= exec
->counts
[trans
->count
];
3980 counter
= &exec
->comp
->counters
[trans
->count
];
3982 printf("testing count %d: val %d, min %d, max %d\n",
3983 trans
->count
, count
, counter
->min
, counter
->max
);
3985 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
3986 } else if (atom
== NULL
) {
3987 fprintf(stderr
, "epsilon transition left at runtime\n");
3990 } else if (value
!= NULL
) {
3991 ret
= xmlRegStrEqualWildcard(atom
->valuep
, value
);
3997 if ((ret
== 1) && (trans
->counter
>= 0)) {
3998 xmlRegCounterPtr counter
;
4001 count
= exec
->counts
[trans
->counter
];
4002 counter
= &exec
->comp
->counters
[trans
->counter
];
4003 if (count
>= counter
->max
)
4007 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
4008 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
4011 * this is a multiple input sequence
4013 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4014 if (exec
->inputStackNr
<= 0) {
4015 xmlFARegExecSaveInputString(exec
, value
, data
);
4017 xmlFARegExecSave(exec
);
4019 exec
->transcount
= 1;
4022 * Try to progress as much as possible on the input
4024 if (exec
->transcount
== atom
->max
) {
4028 value
= exec
->inputStack
[exec
->index
].value
;
4029 data
= exec
->inputStack
[exec
->index
].data
;
4031 printf("value loaded: %s\n", value
);
4035 * End of input: stop here
4037 if (value
== NULL
) {
4041 if (exec
->transcount
>= atom
->min
) {
4042 int transno
= exec
->transno
;
4043 xmlRegStatePtr state
= exec
->state
;
4046 * The transition is acceptable save it
4048 exec
->transno
= -1; /* trick */
4050 if (exec
->inputStackNr
<= 0) {
4051 xmlFARegExecSaveInputString(exec
, value
, data
);
4053 xmlFARegExecSave(exec
);
4054 exec
->transno
= transno
;
4055 exec
->state
= state
;
4057 ret
= xmlStrEqual(value
, atom
->valuep
);
4060 if (exec
->transcount
< atom
->min
)
4064 * If the last check failed but one transition was found
4065 * possible, rollback
4075 if ((exec
->callback
!= NULL
) && (atom
!= NULL
) &&
4077 exec
->callback(exec
->data
, atom
->valuep
,
4080 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4081 if (exec
->inputStackNr
<= 0) {
4082 xmlFARegExecSaveInputString(exec
, value
, data
);
4084 xmlFARegExecSave(exec
);
4086 if (trans
->counter
>= 0) {
4088 printf("Increasing count %d\n", trans
->counter
);
4090 exec
->counts
[trans
->counter
]++;
4092 if ((trans
->count
>= 0) &&
4093 (trans
->count
< REGEXP_ALL_COUNTER
)) {
4094 #ifdef DEBUG_REGEXP_EXEC
4095 printf("resetting count %d on transition\n",
4098 exec
->counts
[trans
->count
] = 0;
4101 printf("entering state %d\n", trans
->to
);
4103 if ((exec
->comp
->states
[trans
->to
] != NULL
) &&
4104 (exec
->comp
->states
[trans
->to
]->type
==
4105 XML_REGEXP_SINK_STATE
)) {
4107 * entering a sink state, save the current state as error
4110 if (exec
->errString
!= NULL
)
4111 xmlFree(exec
->errString
);
4112 exec
->errString
= xmlStrdup(value
);
4113 exec
->errState
= exec
->state
;
4114 memcpy(exec
->errCounts
, exec
->counts
,
4115 exec
->comp
->nbCounters
* sizeof(int));
4117 exec
->state
= exec
->comp
->states
[trans
->to
];
4119 if (trans
->atom
!= NULL
) {
4120 if (exec
->inputStack
!= NULL
) {
4122 if (exec
->index
< exec
->inputStackNr
) {
4123 value
= exec
->inputStack
[exec
->index
].value
;
4124 data
= exec
->inputStack
[exec
->index
].data
;
4126 printf("value loaded: %s\n", value
);
4132 printf("end of input\n");
4139 printf("end of input\n");
4144 } else if (ret
< 0) {
4149 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
4152 * if we didn't yet rollback on the current input
4153 * store the current state as the error state.
4155 if ((progress
) && (exec
->state
!= NULL
) &&
4156 (exec
->state
->type
!= XML_REGEXP_SINK_STATE
)) {
4158 if (exec
->errString
!= NULL
)
4159 xmlFree(exec
->errString
);
4160 exec
->errString
= xmlStrdup(value
);
4161 exec
->errState
= exec
->state
;
4162 if (exec
->comp
->nbCounters
)
4163 memcpy(exec
->errCounts
, exec
->counts
,
4164 exec
->comp
->nbCounters
* sizeof(int));
4168 * Failed to find a way out
4170 exec
->determinist
= 0;
4171 xmlFARegExecRollBack(exec
);
4172 if ((exec
->inputStack
!= NULL
) && (exec
->status
== 0)) {
4173 value
= exec
->inputStack
[exec
->index
].value
;
4174 data
= exec
->inputStack
[exec
->index
].data
;
4176 printf("value loaded: %s\n", value
);
4185 if (exec
->status
== 0) {
4186 return(exec
->state
->type
== XML_REGEXP_FINAL_STATE
);
4189 if (exec
->status
< 0) {
4193 return(exec
->status
);
4197 * xmlRegExecPushString:
4198 * @exec: a regexp execution context or NULL to indicate the end
4199 * @value: a string token input
4200 * @data: data associated to the token to reuse in callbacks
4202 * Push one input token in the execution context
4204 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4205 * a negative value in case of error.
4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
4210 return(xmlRegExecPushStringInternal(exec
, value
, data
, 0));
4214 * xmlRegExecPushString2:
4215 * @exec: a regexp execution context or NULL to indicate the end
4216 * @value: the first string token input
4217 * @value2: the second string token input
4218 * @data: data associated to the token to reuse in callbacks
4220 * Push one input token in the execution context
4222 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4223 * a negative value in case of error.
4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
4227 const xmlChar
*value2
, void *data
) {
4229 int lenn
, lenp
, ret
;
4234 if (exec
->comp
== NULL
)
4236 if (exec
->status
!= 0)
4237 return(exec
->status
);
4240 return(xmlRegExecPushString(exec
, value
, data
));
4242 lenn
= strlen((char *) value2
);
4243 lenp
= strlen((char *) value
);
4245 if (150 < lenn
+ lenp
+ 2) {
4246 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
4254 memcpy(&str
[0], value
, lenp
);
4255 str
[lenp
] = XML_REG_STRING_SEPARATOR
;
4256 memcpy(&str
[lenp
+ 1], value2
, lenn
);
4257 str
[lenn
+ lenp
+ 1] = 0;
4259 if (exec
->comp
->compact
!= NULL
)
4260 ret
= xmlRegCompactPushString(exec
, exec
->comp
, str
, data
);
4262 ret
= xmlRegExecPushStringInternal(exec
, str
, data
, 1);
4270 * xmlRegExecGetValues:
4271 * @exec: a regexp execution context
4272 * @err: error extraction or normal one
4273 * @nbval: pointer to the number of accepted values IN/OUT
4274 * @nbneg: return number of negative transitions
4275 * @values: pointer to the array of acceptable values
4276 * @terminal: return value if this was a terminal state
4278 * Extract information from the regexp execution, internal routine to
4279 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4281 * Returns: 0 in case of success or -1 in case of error.
4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec
, int err
,
4285 int *nbval
, int *nbneg
,
4286 xmlChar
**values
, int *terminal
) {
4290 if ((exec
== NULL
) || (nbval
== NULL
) || (nbneg
== NULL
) ||
4291 (values
== NULL
) || (*nbval
<= 0))
4297 if ((exec
->comp
!= NULL
) && (exec
->comp
->compact
!= NULL
)) {
4299 int target
, i
, state
;
4304 if (exec
->errStateNo
== -1) return(-1);
4305 state
= exec
->errStateNo
;
4307 state
= exec
->index
;
4309 if (terminal
!= NULL
) {
4310 if (comp
->compact
[state
* (comp
->nbstrings
+ 1)] ==
4311 XML_REGEXP_FINAL_STATE
)
4316 for (i
= 0;(i
< comp
->nbstrings
) && (nb
< maxval
);i
++) {
4317 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
4318 if ((target
> 0) && (target
<= comp
->nbstates
) &&
4319 (comp
->compact
[(target
- 1) * (comp
->nbstrings
+ 1)] !=
4320 XML_REGEXP_SINK_STATE
)) {
4321 values
[nb
++] = comp
->stringMap
[i
];
4325 for (i
= 0;(i
< comp
->nbstrings
) && (nb
< maxval
);i
++) {
4326 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
4327 if ((target
> 0) && (target
<= comp
->nbstates
) &&
4328 (comp
->compact
[(target
- 1) * (comp
->nbstrings
+ 1)] ==
4329 XML_REGEXP_SINK_STATE
)) {
4330 values
[nb
++] = comp
->stringMap
[i
];
4336 xmlRegTransPtr trans
;
4338 xmlRegStatePtr state
;
4340 if (terminal
!= NULL
) {
4341 if (exec
->state
->type
== XML_REGEXP_FINAL_STATE
)
4348 if (exec
->errState
== NULL
) return(-1);
4349 state
= exec
->errState
;
4351 if (exec
->state
== NULL
) return(-1);
4352 state
= exec
->state
;
4355 (transno
< state
->nbTrans
) && (nb
< maxval
);
4357 trans
= &state
->trans
[transno
];
4361 if ((atom
== NULL
) || (atom
->valuep
== NULL
))
4363 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
4364 /* this should not be reached but ... */
4366 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
4367 /* this should not be reached but ... */
4369 } else if (trans
->counter
>= 0) {
4370 xmlRegCounterPtr counter
= NULL
;
4374 count
= exec
->errCounts
[trans
->counter
];
4376 count
= exec
->counts
[trans
->counter
];
4377 if (exec
->comp
!= NULL
)
4378 counter
= &exec
->comp
->counters
[trans
->counter
];
4379 if ((counter
== NULL
) || (count
< counter
->max
)) {
4381 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4383 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4387 if ((exec
->comp
!= NULL
) && (exec
->comp
->states
[trans
->to
] != NULL
) &&
4388 (exec
->comp
->states
[trans
->to
]->type
!=
4389 XML_REGEXP_SINK_STATE
)) {
4391 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4393 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4399 (transno
< state
->nbTrans
) && (nb
< maxval
);
4401 trans
= &state
->trans
[transno
];
4405 if ((atom
== NULL
) || (atom
->valuep
== NULL
))
4407 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
4409 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
4411 } else if (trans
->counter
>= 0) {
4414 if ((exec
->comp
->states
[trans
->to
] != NULL
) &&
4415 (exec
->comp
->states
[trans
->to
]->type
==
4416 XML_REGEXP_SINK_STATE
)) {
4418 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4420 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4430 * xmlRegExecNextValues:
4431 * @exec: a regexp execution context
4432 * @nbval: pointer to the number of accepted values IN/OUT
4433 * @nbneg: return number of negative transitions
4434 * @values: pointer to the array of acceptable values
4435 * @terminal: return value if this was a terminal state
4437 * Extract information from the regexp execution,
4438 * the parameter @values must point to an array of @nbval string pointers
4439 * on return nbval will contain the number of possible strings in that
4440 * state and the @values array will be updated with them. The string values
4441 * returned will be freed with the @exec context and don't need to be
4444 * Returns: 0 in case of success or -1 in case of error.
4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec
, int *nbval
, int *nbneg
,
4448 xmlChar
**values
, int *terminal
) {
4449 return(xmlRegExecGetValues(exec
, 0, nbval
, nbneg
, values
, terminal
));
4453 * xmlRegExecErrInfo:
4454 * @exec: a regexp execution context generating an error
4455 * @string: return value for the error string
4456 * @nbval: pointer to the number of accepted values IN/OUT
4457 * @nbneg: return number of negative transitions
4458 * @values: pointer to the array of acceptable values
4459 * @terminal: return value if this was a terminal state
4461 * Extract error information from the regexp execution, the parameter
4462 * @string will be updated with the value pushed and not accepted,
4463 * the parameter @values must point to an array of @nbval string pointers
4464 * on return nbval will contain the number of possible strings in that
4465 * state and the @values array will be updated with them. The string values
4466 * returned will be freed with the @exec context and don't need to be
4469 * Returns: 0 in case of success or -1 in case of error.
4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec
, const xmlChar
**string
,
4473 int *nbval
, int *nbneg
, xmlChar
**values
, int *terminal
) {
4476 if (string
!= NULL
) {
4477 if (exec
->status
!= 0)
4478 *string
= exec
->errString
;
4482 return(xmlRegExecGetValues(exec
, 1, nbval
, nbneg
, values
, terminal
));
4486 static void testerr(xmlRegExecCtxtPtr exec
) {
4487 const xmlChar
*string
;
4492 xmlRegExecErrInfo(exec
, &string
, &nb
, &nbneg
, &values
[0], &terminal
);
4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec
, int UCS
) {
4499 xmlRegTransPtr trans
;
4506 if (exec
->status
!= 0)
4507 return(exec
->status
);
4509 while ((exec
->status
== 0) &&
4510 ((exec
->inputString
[exec
->index
] != 0) ||
4511 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
))) {
4514 * End of input on non-terminal state, rollback, however we may
4515 * still have epsilon like transition for counted transitions
4516 * on counters, in that case don't break too early.
4518 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
))
4521 exec
->transcount
= 0;
4522 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
4523 trans
= &exec
->state
->trans
[exec
->transno
];
4528 if (trans
->count
>= 0) {
4530 xmlRegCounterPtr counter
;
4533 * A counted transition.
4536 count
= exec
->counts
[trans
->count
];
4537 counter
= &exec
->comp
->counters
[trans
->count
];
4538 #ifdef DEBUG_REGEXP_EXEC
4539 printf("testing count %d: val %d, min %d, max %d\n",
4540 trans
->count
, count
, counter
->min
, counter
->max
);
4542 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
4543 } else if (atom
== NULL
) {
4544 fprintf(stderr
, "epsilon transition left at runtime\n");
4547 } else if (exec
->inputString
[exec
->index
] != 0) {
4548 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
4549 ret
= xmlRegCheckCharacter(atom
, codepoint
);
4550 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
4551 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
4554 * this is a multiple input sequence
4556 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4557 xmlFARegExecSave(exec
);
4559 exec
->transcount
= 1;
4562 * Try to progress as much as possible on the input
4564 if (exec
->transcount
== atom
->max
) {
4569 * End of input: stop here
4571 if (exec
->inputString
[exec
->index
] == 0) {
4575 if (exec
->transcount
>= atom
->min
) {
4576 int transno
= exec
->transno
;
4577 xmlRegStatePtr state
= exec
->state
;
4580 * The transition is acceptable save it
4582 exec
->transno
= -1; /* trick */
4584 xmlFARegExecSave(exec
);
4585 exec
->transno
= transno
;
4586 exec
->state
= state
;
4588 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
4590 ret
= xmlRegCheckCharacter(atom
, codepoint
);
4593 if (exec
->transcount
< atom
->min
)
4597 * If the last check failed but one transition was found
4598 * possible, rollback
4608 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4609 xmlFARegExecSave(exec
);
4612 * restart count for expressions like this ((abc){2})*
4614 if (trans
->count
>= 0) {
4615 #ifdef DEBUG_REGEXP_EXEC
4616 printf("Reset count %d\n", trans
->count
);
4618 exec
->counts
[trans
->count
] = 0;
4620 if (trans
->counter
>= 0) {
4621 #ifdef DEBUG_REGEXP_EXEC
4622 printf("Increasing count %d\n", trans
->counter
);
4624 exec
->counts
[trans
->counter
]++;
4626 #ifdef DEBUG_REGEXP_EXEC
4627 printf("entering state %d\n", trans
->to
);
4629 exec
->state
= exec
->comp
->states
[trans
->to
];
4631 if (trans
->atom
!= NULL
) {
4635 } else if (ret
< 0) {
4640 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
4643 * Failed to find a way out
4645 exec
->determinist
= 0;
4646 xmlFARegExecRollBack(exec
);
4653 /************************************************************************
4655 * Parser for the Schemas Datatype Regular Expressions *
4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4658 ************************************************************************/
4662 * @ctxt: a regexp parser context
4664 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt
) {
4671 cur
= CUR_SCHAR(ctxt
->cur
, len
);
4672 if ((cur
== '.') || (cur
== '\\') || (cur
== '?') ||
4673 (cur
== '*') || (cur
== '+') || (cur
== '(') ||
4674 (cur
== ')') || (cur
== '|') || (cur
== 0x5B) ||
4675 (cur
== 0x5D) || (cur
== 0))
4681 * xmlFAParseCharProp:
4682 * @ctxt: a regexp parser context
4684 * [27] charProp ::= IsCategory | IsBlock
4685 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4686 * Separators | Symbols | Others
4687 * [29] Letters ::= 'L' [ultmo]?
4688 * [30] Marks ::= 'M' [nce]?
4689 * [31] Numbers ::= 'N' [dlo]?
4690 * [32] Punctuation ::= 'P' [cdseifo]?
4691 * [33] Separators ::= 'Z' [slp]?
4692 * [34] Symbols ::= 'S' [mcko]?
4693 * [35] Others ::= 'C' [cfon]?
4694 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt
) {
4699 xmlRegAtomType type
= (xmlRegAtomType
) 0;
4700 xmlChar
*blockName
= NULL
;
4708 type
= XML_REGEXP_LETTER_UPPERCASE
;
4709 } else if (cur
== 'l') {
4711 type
= XML_REGEXP_LETTER_LOWERCASE
;
4712 } else if (cur
== 't') {
4714 type
= XML_REGEXP_LETTER_TITLECASE
;
4715 } else if (cur
== 'm') {
4717 type
= XML_REGEXP_LETTER_MODIFIER
;
4718 } else if (cur
== 'o') {
4720 type
= XML_REGEXP_LETTER_OTHERS
;
4722 type
= XML_REGEXP_LETTER
;
4724 } else if (cur
== 'M') {
4730 type
= XML_REGEXP_MARK_NONSPACING
;
4731 } else if (cur
== 'c') {
4733 /* spacing combining */
4734 type
= XML_REGEXP_MARK_SPACECOMBINING
;
4735 } else if (cur
== 'e') {
4738 type
= XML_REGEXP_MARK_ENCLOSING
;
4741 type
= XML_REGEXP_MARK
;
4743 } else if (cur
== 'N') {
4749 type
= XML_REGEXP_NUMBER_DECIMAL
;
4750 } else if (cur
== 'l') {
4753 type
= XML_REGEXP_NUMBER_LETTER
;
4754 } else if (cur
== 'o') {
4757 type
= XML_REGEXP_NUMBER_OTHERS
;
4760 type
= XML_REGEXP_NUMBER
;
4762 } else if (cur
== 'P') {
4768 type
= XML_REGEXP_PUNCT_CONNECTOR
;
4769 } else if (cur
== 'd') {
4772 type
= XML_REGEXP_PUNCT_DASH
;
4773 } else if (cur
== 's') {
4776 type
= XML_REGEXP_PUNCT_OPEN
;
4777 } else if (cur
== 'e') {
4780 type
= XML_REGEXP_PUNCT_CLOSE
;
4781 } else if (cur
== 'i') {
4784 type
= XML_REGEXP_PUNCT_INITQUOTE
;
4785 } else if (cur
== 'f') {
4788 type
= XML_REGEXP_PUNCT_FINQUOTE
;
4789 } else if (cur
== 'o') {
4792 type
= XML_REGEXP_PUNCT_OTHERS
;
4794 /* all punctuation */
4795 type
= XML_REGEXP_PUNCT
;
4797 } else if (cur
== 'Z') {
4803 type
= XML_REGEXP_SEPAR_SPACE
;
4804 } else if (cur
== 'l') {
4807 type
= XML_REGEXP_SEPAR_LINE
;
4808 } else if (cur
== 'p') {
4811 type
= XML_REGEXP_SEPAR_PARA
;
4813 /* all separators */
4814 type
= XML_REGEXP_SEPAR
;
4816 } else if (cur
== 'S') {
4821 type
= XML_REGEXP_SYMBOL_MATH
;
4823 } else if (cur
== 'c') {
4825 type
= XML_REGEXP_SYMBOL_CURRENCY
;
4827 } else if (cur
== 'k') {
4829 type
= XML_REGEXP_SYMBOL_MODIFIER
;
4831 } else if (cur
== 'o') {
4833 type
= XML_REGEXP_SYMBOL_OTHERS
;
4837 type
= XML_REGEXP_SYMBOL
;
4839 } else if (cur
== 'C') {
4845 type
= XML_REGEXP_OTHER_CONTROL
;
4846 } else if (cur
== 'f') {
4849 type
= XML_REGEXP_OTHER_FORMAT
;
4850 } else if (cur
== 'o') {
4853 type
= XML_REGEXP_OTHER_PRIVATE
;
4854 } else if (cur
== 'n') {
4857 type
= XML_REGEXP_OTHER_NA
;
4860 type
= XML_REGEXP_OTHER
;
4862 } else if (cur
== 'I') {
4863 const xmlChar
*start
;
4867 ERROR("IsXXXX expected");
4873 if (((cur
>= 'a') && (cur
<= 'z')) ||
4874 ((cur
>= 'A') && (cur
<= 'Z')) ||
4875 ((cur
>= '0') && (cur
<= '9')) ||
4879 while (((cur
>= 'a') && (cur
<= 'z')) ||
4880 ((cur
>= 'A') && (cur
<= 'Z')) ||
4881 ((cur
>= '0') && (cur
<= '9')) ||
4887 type
= XML_REGEXP_BLOCK_NAME
;
4888 blockName
= xmlStrndup(start
, ctxt
->cur
- start
);
4890 ERROR("Unknown char property");
4893 if (ctxt
->atom
== NULL
) {
4894 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
4895 if (ctxt
->atom
!= NULL
)
4896 ctxt
->atom
->valuep
= blockName
;
4897 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
4898 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
4899 type
, 0, 0, blockName
);
4903 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt
)
4905 int val
= 0, i
, cur
;
4906 for (i
= 0; i
< 4; i
++) {
4910 if (cur
>= '0' && cur
<= '9') {
4912 } else if (cur
>= 'A' && cur
<= 'F') {
4913 val
+= cur
- 'A' + 10;
4914 } else if (cur
>= 'a' && cur
<= 'f') {
4915 val
+= cur
- 'a' + 10;
4917 ERROR("Expecting hex digit");
4924 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt
)
4926 int val
= parse_escaped_codeunit(ctxt
);
4927 if (0xD800 <= val
&& val
<= 0xDBFF) {
4932 int low
= parse_escaped_codeunit(ctxt
);
4933 if (0xDC00 <= low
&& low
<= 0xDFFF) {
4934 return (val
- 0xD800) * 0x400 + (low
- 0xDC00) + 0x10000;
4938 ERROR("Invalid low surrogate pair code unit");
4945 * xmlFAParseCharClassEsc:
4946 * @ctxt: a regexp parser context
4948 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4949 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4950 * [25] catEsc ::= '\p{' charProp '}'
4951 * [26] complEsc ::= '\P{' charProp '}'
4952 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4955 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt
) {
4959 if (ctxt
->atom
== NULL
) {
4960 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_ANYCHAR
);
4961 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
4962 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
4963 XML_REGEXP_ANYCHAR
, 0, 0, NULL
);
4969 ERROR("Escaped sequence: expecting \\");
4977 ERROR("Expecting '{'");
4981 xmlFAParseCharProp(ctxt
);
4983 ERROR("Expecting '}'");
4987 } else if (cur
== 'P') {
4990 ERROR("Expecting '{'");
4994 xmlFAParseCharProp(ctxt
);
4995 if (ctxt
->atom
!= NULL
)
4996 ctxt
->atom
->neg
= 1;
4998 ERROR("Expecting '}'");
5002 } else if ((cur
== 'n') || (cur
== 'r') || (cur
== 't') || (cur
== '\\') ||
5003 (cur
== '|') || (cur
== '.') || (cur
== '?') || (cur
== '*') ||
5004 (cur
== '+') || (cur
== '(') || (cur
== ')') || (cur
== '{') ||
5005 (cur
== '}') || (cur
== 0x2D) || (cur
== 0x5B) || (cur
== 0x5D) ||
5008 /* Non-standard escape sequences:
5009 * Java 1.8|.NET Core 3.1|MSXML 6 */
5010 (cur
== '!') || /* + | + | + */
5011 (cur
== '"') || /* + | + | + */
5012 (cur
== '#') || /* + | + | + */
5013 (cur
== '$') || /* + | + | + */
5014 (cur
== '%') || /* + | + | + */
5015 (cur
== ',') || /* + | + | + */
5016 (cur
== '/') || /* + | + | + */
5017 (cur
== ':') || /* + | + | + */
5018 (cur
== ';') || /* + | + | + */
5019 (cur
== '=') || /* + | + | + */
5020 (cur
== '>') || /* | + | + */
5021 (cur
== '@') || /* + | + | + */
5022 (cur
== '`') || /* + | + | + */
5023 (cur
== '~') || /* + | + | + */
5024 (cur
== 'u')) { /* | + | + */
5025 if (ctxt
->atom
== NULL
) {
5026 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
5027 if (ctxt
->atom
!= NULL
) {
5030 ctxt
->atom
->codepoint
= '\n';
5033 ctxt
->atom
->codepoint
= '\r';
5036 ctxt
->atom
->codepoint
= '\t';
5039 cur
= parse_escaped_codepoint(ctxt
);
5043 ctxt
->atom
->codepoint
= cur
;
5046 ctxt
->atom
->codepoint
= cur
;
5049 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
5061 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5062 XML_REGEXP_CHARVAL
, cur
, cur
, NULL
);
5065 } else if ((cur
== 's') || (cur
== 'S') || (cur
== 'i') || (cur
== 'I') ||
5066 (cur
== 'c') || (cur
== 'C') || (cur
== 'd') || (cur
== 'D') ||
5067 (cur
== 'w') || (cur
== 'W')) {
5068 xmlRegAtomType type
= XML_REGEXP_ANYSPACE
;
5072 type
= XML_REGEXP_ANYSPACE
;
5075 type
= XML_REGEXP_NOTSPACE
;
5078 type
= XML_REGEXP_INITNAME
;
5081 type
= XML_REGEXP_NOTINITNAME
;
5084 type
= XML_REGEXP_NAMECHAR
;
5087 type
= XML_REGEXP_NOTNAMECHAR
;
5090 type
= XML_REGEXP_DECIMAL
;
5093 type
= XML_REGEXP_NOTDECIMAL
;
5096 type
= XML_REGEXP_REALCHAR
;
5099 type
= XML_REGEXP_NOTREALCHAR
;
5103 if (ctxt
->atom
== NULL
) {
5104 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
5105 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
5106 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5110 ERROR("Wrong escape sequence, misuse of character '\\'");
5115 * xmlFAParseCharRange:
5116 * @ctxt: a regexp parser context
5118 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
5119 * [18] seRange ::= charOrEsc '-' charOrEsc
5120 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5121 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5122 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5125 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt
) {
5131 ERROR("Expecting ']'");
5140 case 'n': start
= 0xA; break;
5141 case 'r': start
= 0xD; break;
5142 case 't': start
= 0x9; break;
5143 case '\\': case '|': case '.': case '-': case '^': case '?':
5144 case '*': case '+': case '{': case '}': case '(': case ')':
5148 ERROR("Invalid escape value");
5153 } else if ((cur
!= 0x5B) && (cur
!= 0x5D)) {
5154 end
= start
= CUR_SCHAR(ctxt
->cur
, len
);
5156 ERROR("Expecting a char range");
5160 * Since we are "inside" a range, we can assume ctxt->cur is past
5161 * the start of ctxt->string, and PREV should be safe
5163 if ((start
== '-') && (NXT(1) != ']') && (PREV
!= '[') && (PREV
!= '^')) {
5169 if ((cur
!= '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5170 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5171 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
5180 case 'n': end
= 0xA; break;
5181 case 'r': end
= 0xD; break;
5182 case 't': end
= 0x9; break;
5183 case '\\': case '|': case '.': case '-': case '^': case '?':
5184 case '*': case '+': case '{': case '}': case '(': case ')':
5188 ERROR("Invalid escape value");
5192 } else if ((cur
!= '\0') && (cur
!= 0x5B) && (cur
!= 0x5D)) {
5193 end
= CUR_SCHAR(ctxt
->cur
, len
);
5195 ERROR("Expecting the end of a char range");
5199 /* TODO check that the values are acceptable character ranges for XML */
5201 ERROR("End of range is before start of range");
5204 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5205 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
5211 * xmlFAParsePosCharGroup:
5212 * @ctxt: a regexp parser context
5214 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5217 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt
) {
5220 xmlFAParseCharClassEsc(ctxt
);
5222 xmlFAParseCharRange(ctxt
);
5224 } while ((CUR
!= ']') && (CUR
!= '-') &&
5225 (CUR
!= 0) && (ctxt
->error
== 0));
5229 * xmlFAParseCharGroup:
5230 * @ctxt: a regexp parser context
5232 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5233 * [15] negCharGroup ::= '^' posCharGroup
5234 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5235 * [12] charClassExpr ::= '[' charGroup ']'
5238 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt
) {
5239 int neg
= ctxt
->neg
;
5243 ctxt
->neg
= !ctxt
->neg
;
5244 xmlFAParsePosCharGroup(ctxt
);
5247 while ((CUR
!= ']') && (ctxt
->error
== 0)) {
5248 if ((CUR
== '-') && (NXT(1) == '[')) {
5249 NEXT
; /* eat the '-' */
5250 NEXT
; /* eat the '[' */
5252 xmlFAParseCharGroup(ctxt
);
5257 ERROR("charClassExpr: ']' expected");
5261 xmlFAParsePosCharGroup(ctxt
);
5267 * xmlFAParseCharClass:
5268 * @ctxt: a regexp parser context
5270 * [11] charClass ::= charClassEsc | charClassExpr
5271 * [12] charClassExpr ::= '[' charGroup ']'
5274 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt
) {
5277 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_RANGES
);
5278 if (ctxt
->atom
== NULL
)
5280 xmlFAParseCharGroup(ctxt
);
5284 ERROR("xmlFAParseCharClass: ']' expected");
5287 xmlFAParseCharClassEsc(ctxt
);
5292 * xmlFAParseQuantExact:
5293 * @ctxt: a regexp parser context
5295 * [8] QuantExact ::= [0-9]+
5297 * Returns 0 if success or -1 in case of error
5300 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt
) {
5305 while ((CUR
>= '0') && (CUR
<= '9')) {
5306 if (ret
> INT_MAX
/ 10) {
5309 int digit
= CUR
- '0';
5312 if (ret
> INT_MAX
- digit
)
5320 if ((ok
!= 1) || (overflow
== 1)) {
5327 * xmlFAParseQuantifier:
5328 * @ctxt: a regexp parser context
5330 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5331 * [5] quantity ::= quantRange | quantMin | QuantExact
5332 * [6] quantRange ::= QuantExact ',' QuantExact
5333 * [7] quantMin ::= QuantExact ','
5334 * [8] QuantExact ::= [0-9]+
5337 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt
) {
5341 if ((cur
== '?') || (cur
== '*') || (cur
== '+')) {
5342 if (ctxt
->atom
!= NULL
) {
5344 ctxt
->atom
->quant
= XML_REGEXP_QUANT_OPT
;
5345 else if (cur
== '*')
5346 ctxt
->atom
->quant
= XML_REGEXP_QUANT_MULT
;
5347 else if (cur
== '+')
5348 ctxt
->atom
->quant
= XML_REGEXP_QUANT_PLUS
;
5354 int min
= 0, max
= 0;
5357 cur
= xmlFAParseQuantExact(ctxt
);
5361 ERROR("Improper quantifier");
5368 cur
= xmlFAParseQuantExact(ctxt
);
5372 ERROR("Improper quantifier");
5379 ERROR("Unterminated quantifier");
5383 if (ctxt
->atom
!= NULL
) {
5384 ctxt
->atom
->quant
= XML_REGEXP_QUANT_RANGE
;
5385 ctxt
->atom
->min
= min
;
5386 ctxt
->atom
->max
= max
;
5395 * @ctxt: a regexp parser context
5397 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5400 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt
) {
5403 codepoint
= xmlFAIsChar(ctxt
);
5404 if (codepoint
> 0) {
5405 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
5406 if (ctxt
->atom
== NULL
)
5408 codepoint
= CUR_SCHAR(ctxt
->cur
, len
);
5409 ctxt
->atom
->codepoint
= codepoint
;
5412 } else if (CUR
== '|') {
5414 } else if (CUR
== 0) {
5416 } else if (CUR
== ')') {
5418 } else if (CUR
== '(') {
5419 xmlRegStatePtr start
, oldend
, start0
;
5422 if (ctxt
->depth
>= 50) {
5423 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5427 * this extra Epsilon transition is needed if we count with 0 allowed
5428 * unfortunately this can't be known at that point
5430 xmlFAGenerateEpsilonTransition(ctxt
, ctxt
->state
, NULL
);
5431 start0
= ctxt
->state
;
5432 xmlFAGenerateEpsilonTransition(ctxt
, ctxt
->state
, NULL
);
5433 start
= ctxt
->state
;
5438 xmlFAParseRegExp(ctxt
, 0);
5443 ERROR("xmlFAParseAtom: expecting ')'");
5445 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_SUBREG
);
5446 if (ctxt
->atom
== NULL
)
5448 ctxt
->atom
->start
= start
;
5449 ctxt
->atom
->start0
= start0
;
5450 ctxt
->atom
->stop
= ctxt
->state
;
5453 } else if ((CUR
== '[') || (CUR
== '\\') || (CUR
== '.')) {
5454 xmlFAParseCharClass(ctxt
);
5462 * @ctxt: a regexp parser context
5464 * [3] piece ::= atom quantifier?
5467 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt
) {
5471 ret
= xmlFAParseAtom(ctxt
);
5474 if (ctxt
->atom
== NULL
) {
5475 ERROR("internal: no atom generated");
5477 xmlFAParseQuantifier(ctxt
);
5483 * @ctxt: a regexp parser context
5484 * @to: optional target to the end of the branch
5486 * @to is used to optimize by removing duplicate path in automata
5487 * in expressions like (a|b)(c|d)
5489 * [2] branch ::= piece*
5492 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr to
) {
5493 xmlRegStatePtr previous
;
5496 previous
= ctxt
->state
;
5497 ret
= xmlFAParsePiece(ctxt
);
5500 xmlFAGenerateEpsilonTransition(ctxt
, previous
, to
);
5502 if (xmlFAGenerateTransitions(ctxt
, previous
,
5503 (CUR
=='|' || CUR
==')' || CUR
==0) ? to
: NULL
, ctxt
->atom
) < 0)
5505 previous
= ctxt
->state
;
5508 while ((ret
!= 0) && (ctxt
->error
== 0)) {
5509 ret
= xmlFAParsePiece(ctxt
);
5511 if (xmlFAGenerateTransitions(ctxt
, previous
,
5512 (CUR
=='|' || CUR
==')' || CUR
==0) ? to
: NULL
,
5515 previous
= ctxt
->state
;
5524 * @ctxt: a regexp parser context
5525 * @top: is this the top-level expression ?
5527 * [1] regExp ::= branch ( '|' branch )*
5530 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
) {
5531 xmlRegStatePtr start
, end
;
5533 /* if not top start should have been generated by an epsilon trans */
5534 start
= ctxt
->state
;
5536 xmlFAParseBranch(ctxt
, NULL
);
5538 #ifdef DEBUG_REGEXP_GRAPH
5539 printf("State %d is final\n", ctxt
->state
->no
);
5541 ctxt
->state
->type
= XML_REGEXP_FINAL_STATE
;
5544 ctxt
->end
= ctxt
->state
;
5548 while ((CUR
== '|') && (ctxt
->error
== 0)) {
5550 ctxt
->state
= start
;
5552 xmlFAParseBranch(ctxt
, end
);
5560 /************************************************************************
5564 ************************************************************************/
5568 * @output: the file for the output debug
5569 * @regexp: the compiled regexp
5571 * Print the content of the compiled regular expression
5574 xmlRegexpPrint(FILE *output
, xmlRegexpPtr regexp
) {
5579 fprintf(output
, " regexp: ");
5580 if (regexp
== NULL
) {
5581 fprintf(output
, "NULL\n");
5584 fprintf(output
, "'%s' ", regexp
->string
);
5585 fprintf(output
, "\n");
5586 fprintf(output
, "%d atoms:\n", regexp
->nbAtoms
);
5587 for (i
= 0;i
< regexp
->nbAtoms
; i
++) {
5588 fprintf(output
, " %02d ", i
);
5589 xmlRegPrintAtom(output
, regexp
->atoms
[i
]);
5591 fprintf(output
, "%d states:", regexp
->nbStates
);
5592 fprintf(output
, "\n");
5593 for (i
= 0;i
< regexp
->nbStates
; i
++) {
5594 xmlRegPrintState(output
, regexp
->states
[i
]);
5596 fprintf(output
, "%d counters:\n", regexp
->nbCounters
);
5597 for (i
= 0;i
< regexp
->nbCounters
; i
++) {
5598 fprintf(output
, " %d: min %d max %d\n", i
, regexp
->counters
[i
].min
,
5599 regexp
->counters
[i
].max
);
5605 * @regexp: a regular expression string
5607 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5608 * Appendix F and builds an automata suitable for testing strings against
5609 * that regular expression
5611 * Returns the compiled expression or NULL in case of error
5614 xmlRegexpCompile(const xmlChar
*regexp
) {
5616 xmlRegParserCtxtPtr ctxt
;
5618 ctxt
= xmlRegNewParserCtxt(regexp
);
5622 /* initialize the parser */
5624 ctxt
->start
= ctxt
->state
= xmlRegNewState(ctxt
);
5625 xmlRegStatePush(ctxt
, ctxt
->start
);
5627 /* parse the expression building an automata */
5628 xmlFAParseRegExp(ctxt
, 1);
5630 ERROR("xmlFAParseRegExp: extra characters");
5632 if (ctxt
->error
!= 0) {
5633 xmlRegFreeParserCtxt(ctxt
);
5636 ctxt
->end
= ctxt
->state
;
5637 ctxt
->start
->type
= XML_REGEXP_START_STATE
;
5638 ctxt
->end
->type
= XML_REGEXP_FINAL_STATE
;
5640 /* remove the Epsilon except for counted transitions */
5641 xmlFAEliminateEpsilonTransitions(ctxt
);
5644 if (ctxt
->error
!= 0) {
5645 xmlRegFreeParserCtxt(ctxt
);
5648 ret
= xmlRegEpxFromParse(ctxt
);
5649 xmlRegFreeParserCtxt(ctxt
);
5655 * @comp: the compiled regular expression
5656 * @content: the value to check against the regular expression
5658 * Check if the regular expression generates the value
5660 * Returns 1 if it matches, 0 if not and a negative value in case of error
5663 xmlRegexpExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
5664 if ((comp
== NULL
) || (content
== NULL
))
5666 return(xmlFARegExec(comp
, content
));
5670 * xmlRegexpIsDeterminist:
5671 * @comp: the compiled regular expression
5673 * Check if the regular expression is determinist
5675 * Returns 1 if it yes, 0 if not and a negative value in case of error
5678 xmlRegexpIsDeterminist(xmlRegexpPtr comp
) {
5684 if (comp
->determinist
!= -1)
5685 return(comp
->determinist
);
5687 am
= xmlNewAutomata();
5690 if (am
->states
!= NULL
) {
5693 for (i
= 0;i
< am
->nbStates
;i
++)
5694 xmlRegFreeState(am
->states
[i
]);
5695 xmlFree(am
->states
);
5697 am
->nbAtoms
= comp
->nbAtoms
;
5698 am
->atoms
= comp
->atoms
;
5699 am
->nbStates
= comp
->nbStates
;
5700 am
->states
= comp
->states
;
5701 am
->determinist
= -1;
5702 am
->flags
= comp
->flags
;
5703 ret
= xmlFAComputesDeterminism(am
);
5706 xmlFreeAutomata(am
);
5707 comp
->determinist
= ret
;
5713 * @regexp: the regexp
5718 xmlRegFreeRegexp(xmlRegexpPtr regexp
) {
5723 if (regexp
->string
!= NULL
)
5724 xmlFree(regexp
->string
);
5725 if (regexp
->states
!= NULL
) {
5726 for (i
= 0;i
< regexp
->nbStates
;i
++)
5727 xmlRegFreeState(regexp
->states
[i
]);
5728 xmlFree(regexp
->states
);
5730 if (regexp
->atoms
!= NULL
) {
5731 for (i
= 0;i
< regexp
->nbAtoms
;i
++)
5732 xmlRegFreeAtom(regexp
->atoms
[i
]);
5733 xmlFree(regexp
->atoms
);
5735 if (regexp
->counters
!= NULL
)
5736 xmlFree(regexp
->counters
);
5737 if (regexp
->compact
!= NULL
)
5738 xmlFree(regexp
->compact
);
5739 if (regexp
->transdata
!= NULL
)
5740 xmlFree(regexp
->transdata
);
5741 if (regexp
->stringMap
!= NULL
) {
5742 for (i
= 0; i
< regexp
->nbstrings
;i
++)
5743 xmlFree(regexp
->stringMap
[i
]);
5744 xmlFree(regexp
->stringMap
);
5750 #ifdef LIBXML_AUTOMATA_ENABLED
5751 /************************************************************************
5753 * The Automata interface *
5755 ************************************************************************/
5760 * Create a new automata
5762 * Returns the new object or NULL in case of failure
5765 xmlNewAutomata(void) {
5766 xmlAutomataPtr ctxt
;
5768 ctxt
= xmlRegNewParserCtxt(NULL
);
5772 /* initialize the parser */
5774 ctxt
->start
= ctxt
->state
= xmlRegNewState(ctxt
);
5775 if (ctxt
->start
== NULL
) {
5776 xmlFreeAutomata(ctxt
);
5779 ctxt
->start
->type
= XML_REGEXP_START_STATE
;
5780 if (xmlRegStatePush(ctxt
, ctxt
->start
) < 0) {
5781 xmlRegFreeState(ctxt
->start
);
5782 xmlFreeAutomata(ctxt
);
5797 xmlFreeAutomata(xmlAutomataPtr am
) {
5800 xmlRegFreeParserCtxt(am
);
5804 * xmlAutomataSetFlags:
5806 * @flags: a set of internal flags
5808 * Set some flags on the automata
5811 xmlAutomataSetFlags(xmlAutomataPtr am
, int flags
) {
5818 * xmlAutomataGetInitState:
5821 * Initial state lookup
5823 * Returns the initial state of the automata
5826 xmlAutomataGetInitState(xmlAutomataPtr am
) {
5833 * xmlAutomataSetFinalState:
5835 * @state: a state in this automata
5837 * Makes that state a final state
5839 * Returns 0 or -1 in case of error
5842 xmlAutomataSetFinalState(xmlAutomataPtr am
, xmlAutomataStatePtr state
) {
5843 if ((am
== NULL
) || (state
== NULL
))
5845 state
->type
= XML_REGEXP_FINAL_STATE
;
5850 * xmlAutomataNewTransition:
5852 * @from: the starting point of the transition
5853 * @to: the target point of the transition or NULL
5854 * @token: the input string associated to that transition
5855 * @data: data passed to the callback function if the transition is activated
5857 * If @to is NULL, this creates first a new target state in the automata
5858 * and then adds a transition from the @from state to the target state
5859 * activated by the value of @token
5861 * Returns the target state or NULL in case of error
5864 xmlAutomataNewTransition(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5865 xmlAutomataStatePtr to
, const xmlChar
*token
,
5869 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5871 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5875 atom
->valuep
= xmlStrdup(token
);
5877 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
5878 xmlRegFreeAtom(atom
);
5887 * xmlAutomataNewTransition2:
5889 * @from: the starting point of the transition
5890 * @to: the target point of the transition or NULL
5891 * @token: the first input string associated to that transition
5892 * @token2: the second input string associated to that transition
5893 * @data: data passed to the callback function if the transition is activated
5895 * If @to is NULL, this creates first a new target state in the automata
5896 * and then adds a transition from the @from state to the target state
5897 * activated by the value of @token
5899 * Returns the target state or NULL in case of error
5902 xmlAutomataNewTransition2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5903 xmlAutomataStatePtr to
, const xmlChar
*token
,
5904 const xmlChar
*token2
, void *data
) {
5907 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5909 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5913 if ((token2
== NULL
) || (*token2
== 0)) {
5914 atom
->valuep
= xmlStrdup(token
);
5919 lenn
= strlen((char *) token2
);
5920 lenp
= strlen((char *) token
);
5922 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
5924 xmlRegFreeAtom(atom
);
5927 memcpy(&str
[0], token
, lenp
);
5929 memcpy(&str
[lenp
+ 1], token2
, lenn
);
5930 str
[lenn
+ lenp
+ 1] = 0;
5935 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
5936 xmlRegFreeAtom(atom
);
5945 * xmlAutomataNewNegTrans:
5947 * @from: the starting point of the transition
5948 * @to: the target point of the transition or NULL
5949 * @token: the first input string associated to that transition
5950 * @token2: the second input string associated to that transition
5951 * @data: data passed to the callback function if the transition is activated
5953 * If @to is NULL, this creates first a new target state in the automata
5954 * and then adds a transition from the @from state to the target state
5955 * activated by any value except (@token,@token2)
5956 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5957 # the semantic of XSD ##other
5959 * Returns the target state or NULL in case of error
5962 xmlAutomataNewNegTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5963 xmlAutomataStatePtr to
, const xmlChar
*token
,
5964 const xmlChar
*token2
, void *data
) {
5966 xmlChar err_msg
[200];
5968 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5970 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5975 if ((token2
== NULL
) || (*token2
== 0)) {
5976 atom
->valuep
= xmlStrdup(token
);
5981 lenn
= strlen((char *) token2
);
5982 lenp
= strlen((char *) token
);
5984 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
5986 xmlRegFreeAtom(atom
);
5989 memcpy(&str
[0], token
, lenp
);
5991 memcpy(&str
[lenp
+ 1], token2
, lenn
);
5992 str
[lenn
+ lenp
+ 1] = 0;
5996 snprintf((char *) err_msg
, 199, "not %s", (const char *) atom
->valuep
);
5998 atom
->valuep2
= xmlStrdup(err_msg
);
6000 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
6001 xmlRegFreeAtom(atom
);
6011 * xmlAutomataNewCountTrans2:
6013 * @from: the starting point of the transition
6014 * @to: the target point of the transition or NULL
6015 * @token: the input string associated to that transition
6016 * @token2: the second input string associated to that transition
6017 * @min: the minimum successive occurrences of token
6018 * @max: the maximum successive occurrences of token
6019 * @data: data associated to the transition
6021 * If @to is NULL, this creates first a new target state in the automata
6022 * and then adds a transition from the @from state to the target state
6023 * activated by a succession of input of value @token and @token2 and
6024 * whose number is between @min and @max
6026 * Returns the target state or NULL in case of error
6029 xmlAutomataNewCountTrans2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6030 xmlAutomataStatePtr to
, const xmlChar
*token
,
6031 const xmlChar
*token2
,
6032 int min
, int max
, void *data
) {
6036 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6040 if ((max
< min
) || (max
< 1))
6042 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6045 if ((token2
== NULL
) || (*token2
== 0)) {
6046 atom
->valuep
= xmlStrdup(token
);
6051 lenn
= strlen((char *) token2
);
6052 lenp
= strlen((char *) token
);
6054 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
6056 xmlRegFreeAtom(atom
);
6059 memcpy(&str
[0], token
, lenp
);
6061 memcpy(&str
[lenp
+ 1], token2
, lenn
);
6062 str
[lenn
+ lenp
+ 1] = 0;
6074 * associate a counter to the transition.
6076 counter
= xmlRegGetCounter(am
);
6077 am
->counters
[counter
].min
= min
;
6078 am
->counters
[counter
].max
= max
;
6080 /* xmlFAGenerateTransitions(am, from, to, atom); */
6082 to
= xmlRegNewState(am
);
6083 xmlRegStatePush(am
, to
);
6085 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6086 xmlRegAtomPush(am
, atom
);
6094 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6099 * xmlAutomataNewCountTrans:
6101 * @from: the starting point of the transition
6102 * @to: the target point of the transition or NULL
6103 * @token: the input string associated to that transition
6104 * @min: the minimum successive occurrences of token
6105 * @max: the maximum successive occurrences of token
6106 * @data: data associated to the transition
6108 * If @to is NULL, this creates first a new target state in the automata
6109 * and then adds a transition from the @from state to the target state
6110 * activated by a succession of input of value @token and whose number
6111 * is between @min and @max
6113 * Returns the target state or NULL in case of error
6116 xmlAutomataNewCountTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6117 xmlAutomataStatePtr to
, const xmlChar
*token
,
6118 int min
, int max
, void *data
) {
6122 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6126 if ((max
< min
) || (max
< 1))
6128 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6131 atom
->valuep
= xmlStrdup(token
);
6140 * associate a counter to the transition.
6142 counter
= xmlRegGetCounter(am
);
6143 am
->counters
[counter
].min
= min
;
6144 am
->counters
[counter
].max
= max
;
6146 /* xmlFAGenerateTransitions(am, from, to, atom); */
6148 to
= xmlRegNewState(am
);
6149 xmlRegStatePush(am
, to
);
6151 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6152 xmlRegAtomPush(am
, atom
);
6160 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6165 * xmlAutomataNewOnceTrans2:
6167 * @from: the starting point of the transition
6168 * @to: the target point of the transition or NULL
6169 * @token: the input string associated to that transition
6170 * @token2: the second input string associated to that transition
6171 * @min: the minimum successive occurrences of token
6172 * @max: the maximum successive occurrences of token
6173 * @data: data associated to the transition
6175 * If @to is NULL, this creates first a new target state in the automata
6176 * and then adds a transition from the @from state to the target state
6177 * activated by a succession of input of value @token and @token2 and whose
6178 * number is between @min and @max, moreover that transition can only be
6181 * Returns the target state or NULL in case of error
6184 xmlAutomataNewOnceTrans2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6185 xmlAutomataStatePtr to
, const xmlChar
*token
,
6186 const xmlChar
*token2
,
6187 int min
, int max
, void *data
) {
6191 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6197 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6200 if ((token2
== NULL
) || (*token2
== 0)) {
6201 atom
->valuep
= xmlStrdup(token
);
6206 lenn
= strlen((char *) token2
);
6207 lenp
= strlen((char *) token
);
6209 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
6211 xmlRegFreeAtom(atom
);
6214 memcpy(&str
[0], token
, lenp
);
6216 memcpy(&str
[lenp
+ 1], token2
, lenn
);
6217 str
[lenn
+ lenp
+ 1] = 0;
6222 atom
->quant
= XML_REGEXP_QUANT_ONCEONLY
;
6226 * associate a counter to the transition.
6228 counter
= xmlRegGetCounter(am
);
6229 am
->counters
[counter
].min
= 1;
6230 am
->counters
[counter
].max
= 1;
6232 /* xmlFAGenerateTransitions(am, from, to, atom); */
6234 to
= xmlRegNewState(am
);
6235 xmlRegStatePush(am
, to
);
6237 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6238 xmlRegAtomPush(am
, atom
);
6246 * xmlAutomataNewOnceTrans:
6248 * @from: the starting point of the transition
6249 * @to: the target point of the transition or NULL
6250 * @token: the input string associated to that transition
6251 * @min: the minimum successive occurrences of token
6252 * @max: the maximum successive occurrences of token
6253 * @data: data associated to the transition
6255 * If @to is NULL, this creates first a new target state in the automata
6256 * and then adds a transition from the @from state to the target state
6257 * activated by a succession of input of value @token and whose number
6258 * is between @min and @max, moreover that transition can only be crossed
6261 * Returns the target state or NULL in case of error
6264 xmlAutomataNewOnceTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6265 xmlAutomataStatePtr to
, const xmlChar
*token
,
6266 int min
, int max
, void *data
) {
6270 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6276 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6279 atom
->valuep
= xmlStrdup(token
);
6281 atom
->quant
= XML_REGEXP_QUANT_ONCEONLY
;
6285 * associate a counter to the transition.
6287 counter
= xmlRegGetCounter(am
);
6288 am
->counters
[counter
].min
= 1;
6289 am
->counters
[counter
].max
= 1;
6291 /* xmlFAGenerateTransitions(am, from, to, atom); */
6293 to
= xmlRegNewState(am
);
6294 xmlRegStatePush(am
, to
);
6296 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6297 xmlRegAtomPush(am
, atom
);
6303 * xmlAutomataNewState:
6306 * Create a new disconnected state in the automata
6308 * Returns the new state or NULL in case of error
6311 xmlAutomataNewState(xmlAutomataPtr am
) {
6312 xmlAutomataStatePtr to
;
6316 to
= xmlRegNewState(am
);
6317 xmlRegStatePush(am
, to
);
6322 * xmlAutomataNewEpsilon:
6324 * @from: the starting point of the transition
6325 * @to: the target point of the transition or NULL
6327 * If @to is NULL, this creates first a new target state in the automata
6328 * and then adds an epsilon transition from the @from state to the
6331 * Returns the target state or NULL in case of error
6334 xmlAutomataNewEpsilon(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6335 xmlAutomataStatePtr to
) {
6336 if ((am
== NULL
) || (from
== NULL
))
6338 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6345 * xmlAutomataNewAllTrans:
6347 * @from: the starting point of the transition
6348 * @to: the target point of the transition or NULL
6349 * @lax: allow to transition if not all all transitions have been activated
6351 * If @to is NULL, this creates first a new target state in the automata
6352 * and then adds a an ALL transition from the @from state to the
6353 * target state. That transition is an epsilon transition allowed only when
6354 * all transitions from the @from node have been activated.
6356 * Returns the target state or NULL in case of error
6359 xmlAutomataNewAllTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6360 xmlAutomataStatePtr to
, int lax
) {
6361 if ((am
== NULL
) || (from
== NULL
))
6363 xmlFAGenerateAllTransition(am
, from
, to
, lax
);
6370 * xmlAutomataNewCounter:
6372 * @min: the minimal value on the counter
6373 * @max: the maximal value on the counter
6375 * Create a new counter
6377 * Returns the counter number or -1 in case of error
6380 xmlAutomataNewCounter(xmlAutomataPtr am
, int min
, int max
) {
6386 ret
= xmlRegGetCounter(am
);
6389 am
->counters
[ret
].min
= min
;
6390 am
->counters
[ret
].max
= max
;
6395 * xmlAutomataNewCountedTrans:
6397 * @from: the starting point of the transition
6398 * @to: the target point of the transition or NULL
6399 * @counter: the counter associated to that transition
6401 * If @to is NULL, this creates first a new target state in the automata
6402 * and then adds an epsilon transition from the @from state to the target state
6403 * which will increment the counter provided
6405 * Returns the target state or NULL in case of error
6408 xmlAutomataNewCountedTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6409 xmlAutomataStatePtr to
, int counter
) {
6410 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
6412 xmlFAGenerateCountedEpsilonTransition(am
, from
, to
, counter
);
6419 * xmlAutomataNewCounterTrans:
6421 * @from: the starting point of the transition
6422 * @to: the target point of the transition or NULL
6423 * @counter: the counter associated to that transition
6425 * If @to is NULL, this creates first a new target state in the automata
6426 * and then adds an epsilon transition from the @from state to the target state
6427 * which will be allowed only if the counter is within the right range.
6429 * Returns the target state or NULL in case of error
6432 xmlAutomataNewCounterTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6433 xmlAutomataStatePtr to
, int counter
) {
6434 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
6436 xmlFAGenerateCountedTransition(am
, from
, to
, counter
);
6443 * xmlAutomataCompile:
6446 * Compile the automata into a Reg Exp ready for being executed.
6447 * The automata should be free after this point.
6449 * Returns the compiled regexp or NULL in case of error
6452 xmlAutomataCompile(xmlAutomataPtr am
) {
6455 if ((am
== NULL
) || (am
->error
!= 0)) return(NULL
);
6456 xmlFAEliminateEpsilonTransitions(am
);
6457 /* xmlFAComputesDeterminism(am); */
6458 ret
= xmlRegEpxFromParse(am
);
6464 * xmlAutomataIsDeterminist:
6467 * Checks if an automata is determinist.
6469 * Returns 1 if true, 0 if not, and -1 in case of error
6472 xmlAutomataIsDeterminist(xmlAutomataPtr am
) {
6478 ret
= xmlFAComputesDeterminism(am
);
6481 #endif /* LIBXML_AUTOMATA_ENABLED */
6483 #ifdef LIBXML_EXPR_ENABLED
6484 /************************************************************************
6486 * Formal Expression handling code *
6488 ************************************************************************/
6489 /************************************************************************
6491 * Expression handling context *
6493 ************************************************************************/
6495 struct _xmlExpCtxt
{
6497 xmlExpNodePtr
*table
;
6510 * @maxNodes: the maximum number of nodes
6511 * @dict: optional dictionary to use internally
6513 * Creates a new context for manipulating expressions
6515 * Returns the context or NULL in case of error
6518 xmlExpNewCtxt(int maxNodes
, xmlDictPtr dict
) {
6522 if (maxNodes
<= 4096)
6525 ret
= (xmlExpCtxtPtr
) xmlMalloc(sizeof(xmlExpCtxt
));
6528 memset(ret
, 0, sizeof(xmlExpCtxt
));
6531 ret
->maxNodes
= maxNodes
;
6532 ret
->table
= xmlMalloc(size
* sizeof(xmlExpNodePtr
));
6533 if (ret
->table
== NULL
) {
6537 memset(ret
->table
, 0, size
* sizeof(xmlExpNodePtr
));
6539 ret
->dict
= xmlDictCreate();
6540 if (ret
->dict
== NULL
) {
6541 xmlFree(ret
->table
);
6547 xmlDictReference(ret
->dict
);
6554 * @ctxt: an expression context
6556 * Free an expression context
6559 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt
) {
6562 xmlDictFree(ctxt
->dict
);
6563 if (ctxt
->table
!= NULL
)
6564 xmlFree(ctxt
->table
);
6568 /************************************************************************
6570 * Structure associated to an expression node *
6572 ************************************************************************/
6573 #define MAX_NODES 10000
6575 /* #define DEBUG_DERIV */
6580 * - public API for creation
6583 * - regression testing
6586 * - split into module and test tool
6591 XML_EXP_NILABLE
= (1 << 0)
6594 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6596 struct _xmlExpNode
{
6597 unsigned char type
;/* xmlExpNodeType */
6598 unsigned char info
;/* OR of xmlExpNodeInfo */
6599 unsigned short key
; /* the hash key */
6600 unsigned int ref
; /* The number of references */
6601 int c_max
; /* the maximum length it can consume */
6602 xmlExpNodePtr exp_left
;
6603 xmlExpNodePtr next
;/* the next node in the hash table or free list */
6610 xmlExpNodePtr f_right
;
6612 const xmlChar
*f_str
;
6616 #define exp_min field.count.f_min
6617 #define exp_max field.count.f_max
6618 /* #define exp_left field.children.f_left */
6619 #define exp_right field.children.f_right
6620 #define exp_str field.f_str
6622 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
);
6623 static xmlExpNode forbiddenExpNode
= {
6624 XML_EXP_FORBID
, 0, 0, 0, 0, NULL
, NULL
, {{ 0, 0}}
6626 xmlExpNodePtr forbiddenExp
= &forbiddenExpNode
;
6627 static xmlExpNode emptyExpNode
= {
6628 XML_EXP_EMPTY
, 1, 0, 0, 0, NULL
, NULL
, {{ 0, 0}}
6630 xmlExpNodePtr emptyExp
= &emptyExpNode
;
6632 /************************************************************************
6634 * The custom hash table for unicity and canonicalization *
6635 * of sub-expressions pointers *
6637 ************************************************************************/
6639 * xmlExpHashNameComputeKey:
6640 * Calculate the hash key for a token
6642 static unsigned short
6643 xmlExpHashNameComputeKey(const xmlChar
*name
) {
6644 unsigned short value
= 0L;
6648 value
+= 30 * (*name
);
6649 while ((ch
= *name
++) != 0) {
6650 value
= value
^ ((value
<< 5) + (value
>> 3) + (unsigned long)ch
);
6657 * xmlExpHashComputeKey:
6658 * Calculate the hash key for a compound expression
6660 static unsigned short
6661 xmlExpHashComputeKey(xmlExpNodeType type
, xmlExpNodePtr left
,
6662 xmlExpNodePtr right
) {
6663 unsigned long value
;
6669 value
+= right
->key
;
6671 ret
= (unsigned short) value
;
6675 value
+= right
->key
;
6677 ret
= (unsigned short) value
;
6681 value
+= right
->key
;
6682 ret
= (unsigned short) value
;
6691 static xmlExpNodePtr
6692 xmlExpNewNode(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
) {
6695 if (ctxt
->nb_nodes
>= MAX_NODES
)
6697 ret
= (xmlExpNodePtr
) xmlMalloc(sizeof(xmlExpNode
));
6700 memset(ret
, 0, sizeof(xmlExpNode
));
6709 * xmlExpHashGetEntry:
6710 * @table: the hash table
6712 * Get the unique entry from the hash table. The entry is created if
6713 * needed. @left and @right are consumed, i.e. their ref count will
6714 * be decremented by the operation.
6716 * Returns the pointer or NULL in case of error
6718 static xmlExpNodePtr
6719 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
,
6720 xmlExpNodePtr left
, xmlExpNodePtr right
,
6721 const xmlChar
*name
, int min
, int max
) {
6722 unsigned short kbase
, key
;
6723 xmlExpNodePtr entry
;
6724 xmlExpNodePtr insert
;
6730 * Check for duplicate and insertion location.
6732 if (type
== XML_EXP_ATOM
) {
6733 kbase
= xmlExpHashNameComputeKey(name
);
6734 } else if (type
== XML_EXP_COUNT
) {
6735 /* COUNT reduction rule 1 */
6742 xmlExpFree(ctxt
, left
);
6747 xmlExpFree(ctxt
, left
);
6748 return(forbiddenExp
);
6755 } else if (type
== XML_EXP_OR
) {
6756 /* Forbid reduction rules */
6757 if (left
->type
== XML_EXP_FORBID
) {
6758 xmlExpFree(ctxt
, left
);
6761 if (right
->type
== XML_EXP_FORBID
) {
6762 xmlExpFree(ctxt
, right
);
6766 /* OR reduction rule 1 */
6767 /* a | a reduced to a */
6768 if (left
== right
) {
6769 xmlExpFree(ctxt
, right
);
6772 /* OR canonicalization rule 1 */
6773 /* linearize (a | b) | c into a | (b | c) */
6774 if ((left
->type
== XML_EXP_OR
) && (right
->type
!= XML_EXP_OR
)) {
6775 xmlExpNodePtr tmp
= left
;
6779 /* OR reduction rule 2 */
6780 /* a | (a | b) and b | (a | b) are reduced to a | b */
6781 if (right
->type
== XML_EXP_OR
) {
6782 if ((left
== right
->exp_left
) ||
6783 (left
== right
->exp_right
)) {
6784 xmlExpFree(ctxt
, left
);
6788 /* OR canonicalization rule 2 */
6789 /* linearize (a | b) | c into a | (b | c) */
6790 if (left
->type
== XML_EXP_OR
) {
6793 /* OR canonicalization rule 2 */
6794 if ((left
->exp_right
->type
!= XML_EXP_OR
) &&
6795 (left
->exp_right
->key
< left
->exp_left
->key
)) {
6796 tmp
= left
->exp_right
;
6797 left
->exp_right
= left
->exp_left
;
6798 left
->exp_left
= tmp
;
6800 left
->exp_right
->ref
++;
6801 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
->exp_right
, right
,
6803 left
->exp_left
->ref
++;
6804 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
->exp_left
, tmp
,
6807 xmlExpFree(ctxt
, left
);
6810 if (right
->type
== XML_EXP_OR
) {
6811 /* Ordering in the tree */
6812 /* C | (A | B) -> A | (B | C) */
6813 if (left
->key
> right
->exp_right
->key
) {
6815 right
->exp_right
->ref
++;
6816 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_right
,
6818 right
->exp_left
->ref
++;
6819 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_left
,
6821 xmlExpFree(ctxt
, right
);
6824 /* Ordering in the tree */
6825 /* B | (A | C) -> A | (B | C) */
6826 if (left
->key
> right
->exp_left
->key
) {
6828 right
->exp_right
->ref
++;
6829 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
,
6830 right
->exp_right
, NULL
, 0, 0);
6831 right
->exp_left
->ref
++;
6832 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_left
,
6834 xmlExpFree(ctxt
, right
);
6838 /* we know both types are != XML_EXP_OR here */
6839 else if (left
->key
> right
->key
) {
6840 xmlExpNodePtr tmp
= left
;
6844 kbase
= xmlExpHashComputeKey(type
, left
, right
);
6845 } else if (type
== XML_EXP_SEQ
) {
6846 /* Forbid reduction rules */
6847 if (left
->type
== XML_EXP_FORBID
) {
6848 xmlExpFree(ctxt
, right
);
6851 if (right
->type
== XML_EXP_FORBID
) {
6852 xmlExpFree(ctxt
, left
);
6855 /* Empty reduction rules */
6856 if (right
->type
== XML_EXP_EMPTY
) {
6859 if (left
->type
== XML_EXP_EMPTY
) {
6862 kbase
= xmlExpHashComputeKey(type
, left
, right
);
6866 key
= kbase
% ctxt
->size
;
6867 if (ctxt
->table
[key
] != NULL
) {
6868 for (insert
= ctxt
->table
[key
]; insert
!= NULL
;
6869 insert
= insert
->next
) {
6870 if ((insert
->key
== kbase
) &&
6871 (insert
->type
== type
)) {
6872 if (type
== XML_EXP_ATOM
) {
6873 if (name
== insert
->exp_str
) {
6877 } else if (type
== XML_EXP_COUNT
) {
6878 if ((insert
->exp_min
== min
) && (insert
->exp_max
== max
) &&
6879 (insert
->exp_left
== left
)) {
6884 } else if ((insert
->exp_left
== left
) &&
6885 (insert
->exp_right
== right
)) {
6895 entry
= xmlExpNewNode(ctxt
, type
);
6899 if (type
== XML_EXP_ATOM
) {
6900 entry
->exp_str
= name
;
6902 } else if (type
== XML_EXP_COUNT
) {
6903 entry
->exp_min
= min
;
6904 entry
->exp_max
= max
;
6905 entry
->exp_left
= left
;
6906 if ((min
== 0) || (IS_NILLABLE(left
)))
6907 entry
->info
|= XML_EXP_NILABLE
;
6911 entry
->c_max
= max
* entry
->exp_left
->c_max
;
6913 entry
->exp_left
= left
;
6914 entry
->exp_right
= right
;
6915 if (type
== XML_EXP_OR
) {
6916 if ((IS_NILLABLE(left
)) || (IS_NILLABLE(right
)))
6917 entry
->info
|= XML_EXP_NILABLE
;
6918 if ((entry
->exp_left
->c_max
== -1) ||
6919 (entry
->exp_right
->c_max
== -1))
6921 else if (entry
->exp_left
->c_max
> entry
->exp_right
->c_max
)
6922 entry
->c_max
= entry
->exp_left
->c_max
;
6924 entry
->c_max
= entry
->exp_right
->c_max
;
6926 if ((IS_NILLABLE(left
)) && (IS_NILLABLE(right
)))
6927 entry
->info
|= XML_EXP_NILABLE
;
6928 if ((entry
->exp_left
->c_max
== -1) ||
6929 (entry
->exp_right
->c_max
== -1))
6932 entry
->c_max
= entry
->exp_left
->c_max
+ entry
->exp_right
->c_max
;
6936 if (ctxt
->table
[key
] != NULL
)
6937 entry
->next
= ctxt
->table
[key
];
6939 ctxt
->table
[key
] = entry
;
6947 * @ctxt: the expression context
6948 * @exp: the expression
6950 * Dereference the expression
6953 xmlExpFree(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
) {
6954 if ((exp
== NULL
) || (exp
== forbiddenExp
) || (exp
== emptyExp
))
6957 if (exp
->ref
== 0) {
6960 /* Unlink it first from the hash table */
6961 key
= exp
->key
% ctxt
->size
;
6962 if (ctxt
->table
[key
] == exp
) {
6963 ctxt
->table
[key
] = exp
->next
;
6967 tmp
= ctxt
->table
[key
];
6968 while (tmp
!= NULL
) {
6969 if (tmp
->next
== exp
) {
6970 tmp
->next
= exp
->next
;
6977 if ((exp
->type
== XML_EXP_SEQ
) || (exp
->type
== XML_EXP_OR
)) {
6978 xmlExpFree(ctxt
, exp
->exp_left
);
6979 xmlExpFree(ctxt
, exp
->exp_right
);
6980 } else if (exp
->type
== XML_EXP_COUNT
) {
6981 xmlExpFree(ctxt
, exp
->exp_left
);
6990 * @exp: the expression
6992 * Increase the reference count of the expression
6995 xmlExpRef(xmlExpNodePtr exp
) {
7002 * @ctxt: the expression context
7003 * @name: the atom name
7004 * @len: the atom name length in byte (or -1);
7006 * Get the atom associated to this name from that context
7008 * Returns the node or NULL in case of error
7011 xmlExpNewAtom(xmlExpCtxtPtr ctxt
, const xmlChar
*name
, int len
) {
7012 if ((ctxt
== NULL
) || (name
== NULL
))
7014 name
= xmlDictLookup(ctxt
->dict
, name
, len
);
7017 return(xmlExpHashGetEntry(ctxt
, XML_EXP_ATOM
, NULL
, NULL
, name
, 0, 0));
7022 * @ctxt: the expression context
7023 * @left: left expression
7024 * @right: right expression
7026 * Get the atom associated to the choice @left | @right
7027 * Note that @left and @right are consumed in the operation, to keep
7028 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7029 * this is true even in case of failure (unless ctxt == NULL).
7031 * Returns the node or NULL in case of error
7034 xmlExpNewOr(xmlExpCtxtPtr ctxt
, xmlExpNodePtr left
, xmlExpNodePtr right
) {
7037 if ((left
== NULL
) || (right
== NULL
)) {
7038 xmlExpFree(ctxt
, left
);
7039 xmlExpFree(ctxt
, right
);
7042 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
, right
, NULL
, 0, 0));
7047 * @ctxt: the expression context
7048 * @left: left expression
7049 * @right: right expression
7051 * Get the atom associated to the sequence @left , @right
7052 * Note that @left and @right are consumed in the operation, to keep
7053 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7054 * this is true even in case of failure (unless ctxt == NULL).
7056 * Returns the node or NULL in case of error
7059 xmlExpNewSeq(xmlExpCtxtPtr ctxt
, xmlExpNodePtr left
, xmlExpNodePtr right
) {
7062 if ((left
== NULL
) || (right
== NULL
)) {
7063 xmlExpFree(ctxt
, left
);
7064 xmlExpFree(ctxt
, right
);
7067 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, left
, right
, NULL
, 0, 0));
7072 * @ctxt: the expression context
7073 * @subset: the expression to be repeated
7074 * @min: the lower bound for the repetition
7075 * @max: the upper bound for the repetition, -1 means infinite
7077 * Get the atom associated to the range (@subset){@min, @max}
7078 * Note that @subset is consumed in the operation, to keep
7079 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7080 * this is true even in case of failure (unless ctxt == NULL).
7082 * Returns the node or NULL in case of error
7085 xmlExpNewRange(xmlExpCtxtPtr ctxt
, xmlExpNodePtr subset
, int min
, int max
) {
7088 if ((subset
== NULL
) || (min
< 0) || (max
< -1) ||
7089 ((max
>= 0) && (min
> max
))) {
7090 xmlExpFree(ctxt
, subset
);
7093 return(xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, subset
,
7094 NULL
, NULL
, min
, max
));
7097 /************************************************************************
7099 * Public API for operations on expressions *
7101 ************************************************************************/
7104 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7105 const xmlChar
**list
, int len
, int nb
) {
7108 switch (exp
->type
) {
7112 for (tmp
= 0;tmp
< nb
;tmp
++)
7113 if (list
[tmp
] == exp
->exp_str
)
7117 list
[nb
] = exp
->exp_str
;
7120 exp
= exp
->exp_left
;
7124 tmp
= xmlExpGetLanguageInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7127 tmp2
= xmlExpGetLanguageInt(ctxt
, exp
->exp_right
, list
, len
,
7137 * xmlExpGetLanguage:
7138 * @ctxt: the expression context
7139 * @exp: the expression
7140 * @langList: where to store the tokens
7141 * @len: the allocated length of @list
7143 * Find all the strings used in @exp and store them in @list
7145 * Returns the number of unique strings found, -1 in case of errors and
7146 * -2 if there is more than @len strings
7149 xmlExpGetLanguage(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7150 const xmlChar
**langList
, int len
) {
7151 if ((ctxt
== NULL
) || (exp
== NULL
) || (langList
== NULL
) || (len
<= 0))
7153 return(xmlExpGetLanguageInt(ctxt
, exp
, langList
, len
, 0));
7157 xmlExpGetStartInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7158 const xmlChar
**list
, int len
, int nb
) {
7161 switch (exp
->type
) {
7162 case XML_EXP_FORBID
:
7167 for (tmp
= 0;tmp
< nb
;tmp
++)
7168 if (list
[tmp
] == exp
->exp_str
)
7172 list
[nb
] = exp
->exp_str
;
7175 exp
= exp
->exp_left
;
7178 tmp
= xmlExpGetStartInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7181 if (IS_NILLABLE(exp
->exp_left
)) {
7182 tmp2
= xmlExpGetStartInt(ctxt
, exp
->exp_right
, list
, len
,
7190 tmp
= xmlExpGetStartInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7193 tmp2
= xmlExpGetStartInt(ctxt
, exp
->exp_right
, list
, len
,
7204 * @ctxt: the expression context
7205 * @exp: the expression
7206 * @tokList: where to store the tokens
7207 * @len: the allocated length of @list
7209 * Find all the strings that appears at the start of the languages
7210 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7211 * it will return the list [a, c]
7213 * Returns the number of unique strings found, -1 in case of errors and
7214 * -2 if there is more than @len strings
7217 xmlExpGetStart(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7218 const xmlChar
**tokList
, int len
) {
7219 if ((ctxt
== NULL
) || (exp
== NULL
) || (tokList
== NULL
) || (len
<= 0))
7221 return(xmlExpGetStartInt(ctxt
, exp
, tokList
, len
, 0));
7226 * @exp: the expression
7228 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7230 * Returns 1 if nillable, 0 if not and -1 in case of error
7233 xmlExpIsNillable(xmlExpNodePtr exp
) {
7236 return(IS_NILLABLE(exp
) != 0);
7239 static xmlExpNodePtr
7240 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, const xmlChar
*str
)
7244 switch (exp
->type
) {
7246 return(forbiddenExp
);
7247 case XML_EXP_FORBID
:
7248 return(forbiddenExp
);
7250 if (exp
->exp_str
== str
) {
7252 printf("deriv atom: equal => Empty\n");
7257 printf("deriv atom: mismatch => forbid\n");
7259 /* TODO wildcards here */
7267 printf("deriv or: => or(derivs)\n");
7269 tmp
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7273 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_right
, str
);
7275 xmlExpFree(ctxt
, tmp
);
7278 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, tmp
, ret
,
7284 printf("deriv seq: starting with left\n");
7286 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7289 } else if (ret
== forbiddenExp
) {
7290 if (IS_NILLABLE(exp
->exp_left
)) {
7292 printf("deriv seq: left failed but nillable\n");
7294 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_right
, str
);
7298 printf("deriv seq: left match => sequence\n");
7300 exp
->exp_right
->ref
++;
7301 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, exp
->exp_right
,
7305 case XML_EXP_COUNT
: {
7309 if (exp
->exp_max
== 0)
7310 return(forbiddenExp
);
7311 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7314 if (ret
== forbiddenExp
) {
7316 printf("deriv count: pattern mismatch => forbid\n");
7320 if (exp
->exp_max
== 1)
7322 if (exp
->exp_max
< 0) /* unbounded */
7325 max
= exp
->exp_max
- 1;
7326 if (exp
->exp_min
> 0)
7327 min
= exp
->exp_min
- 1;
7330 exp
->exp_left
->ref
++;
7331 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
, NULL
,
7333 if (ret
== emptyExp
) {
7335 printf("deriv count: match to empty => new count\n");
7340 printf("deriv count: match => sequence with new count\n");
7342 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, tmp
,
7350 * xmlExpStringDerive:
7351 * @ctxt: the expression context
7352 * @exp: the expression
7354 * @len: the string len in bytes if available
7356 * Do one step of Brzozowski derivation of the expression @exp with
7357 * respect to the input string
7359 * Returns the resulting expression or NULL in case of internal error
7362 xmlExpStringDerive(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7363 const xmlChar
*str
, int len
) {
7364 const xmlChar
*input
;
7366 if ((exp
== NULL
) || (ctxt
== NULL
) || (str
== NULL
)) {
7370 * check the string is in the dictionary, if yes use an interned
7371 * copy, otherwise we know it's not an acceptable input
7373 input
= xmlDictExists(ctxt
->dict
, str
, len
);
7374 if (input
== NULL
) {
7375 return(forbiddenExp
);
7377 return(xmlExpStringDeriveInt(ctxt
, exp
, input
));
7381 xmlExpCheckCard(xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7384 if (sub
->c_max
== -1) {
7385 if (exp
->c_max
!= -1)
7387 } else if ((exp
->c_max
>= 0) && (exp
->c_max
< sub
->c_max
)) {
7391 if ((IS_NILLABLE(sub
)) && (!IS_NILLABLE(exp
)))
7397 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7401 * @ctxt: the expressions context
7402 * @exp: the englobing expression
7403 * @sub: the subexpression
7404 * @mult: the multiple expression
7405 * @remain: the remain from the derivation of the multiple
7407 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7408 * so that sub{n} subsume exp
7410 * Returns the multiple value if successful, 0 if it is not a multiple
7411 * and -1 in case of internal error.
7415 xmlExpDivide(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
,
7416 xmlExpNodePtr
*mult
, xmlExpNodePtr
*remain
) {
7418 xmlExpNodePtr tmp
, tmp2
;
7420 if (mult
!= NULL
) *mult
= NULL
;
7421 if (remain
!= NULL
) *remain
= NULL
;
7422 if (exp
->c_max
== -1) return(0);
7423 if (IS_NILLABLE(exp
) && (!IS_NILLABLE(sub
))) return(0);
7425 for (i
= 1;i
<= exp
->c_max
;i
++) {
7427 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
,
7428 sub
, NULL
, NULL
, i
, i
);
7432 if (!xmlExpCheckCard(tmp
, exp
)) {
7433 xmlExpFree(ctxt
, tmp
);
7436 tmp2
= xmlExpExpDeriveInt(ctxt
, tmp
, exp
);
7438 xmlExpFree(ctxt
, tmp
);
7441 if ((tmp2
!= forbiddenExp
) && (IS_NILLABLE(tmp2
))) {
7445 xmlExpFree(ctxt
, tmp2
);
7449 xmlExpFree(ctxt
, tmp
);
7451 printf("Divide succeeded %d\n", i
);
7455 xmlExpFree(ctxt
, tmp
);
7456 xmlExpFree(ctxt
, tmp2
);
7459 printf("Divide failed\n");
7465 * xmlExpExpDeriveInt:
7466 * @ctxt: the expressions context
7467 * @exp: the englobing expression
7468 * @sub: the subexpression
7470 * Try to do a step of Brzozowski derivation but at a higher level
7471 * the input being a subexpression.
7473 * Returns the resulting expression or NULL in case of internal error
7475 static xmlExpNodePtr
7476 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7477 xmlExpNodePtr ret
, tmp
, tmp2
, tmp3
;
7478 const xmlChar
**tab
;
7482 * In case of equality and if the expression can only consume a finite
7483 * amount, then the derivation is empty
7485 if ((exp
== sub
) && (exp
->c_max
>= 0)) {
7487 printf("Equal(exp, sub) and finite -> Empty\n");
7492 * decompose sub sequence first
7494 if (sub
->type
== XML_EXP_EMPTY
) {
7496 printf("Empty(sub) -> Empty\n");
7501 if (sub
->type
== XML_EXP_SEQ
) {
7503 printf("Seq(sub) -> decompose\n");
7505 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_left
);
7508 if (tmp
== forbiddenExp
)
7510 ret
= xmlExpExpDeriveInt(ctxt
, tmp
, sub
->exp_right
);
7511 xmlExpFree(ctxt
, tmp
);
7514 if (sub
->type
== XML_EXP_OR
) {
7516 printf("Or(sub) -> decompose\n");
7518 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_left
);
7519 if (tmp
== forbiddenExp
)
7523 ret
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_right
);
7524 if ((ret
== NULL
) || (ret
== forbiddenExp
)) {
7525 xmlExpFree(ctxt
, tmp
);
7528 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, tmp
, ret
, NULL
, 0, 0));
7530 if (!xmlExpCheckCard(exp
, sub
)) {
7532 printf("CheckCard(exp, sub) failed -> Forbid\n");
7534 return(forbiddenExp
);
7536 switch (exp
->type
) {
7538 if (sub
== emptyExp
)
7541 printf("Empty(exp) -> Forbid\n");
7543 return(forbiddenExp
);
7544 case XML_EXP_FORBID
:
7546 printf("Forbid(exp) -> Forbid\n");
7548 return(forbiddenExp
);
7550 if (sub
->type
== XML_EXP_ATOM
) {
7551 /* TODO: handle wildcards */
7552 if (exp
->exp_str
== sub
->exp_str
) {
7554 printf("Atom match -> Empty\n");
7559 printf("Atom mismatch -> Forbid\n");
7561 return(forbiddenExp
);
7563 if ((sub
->type
== XML_EXP_COUNT
) &&
7564 (sub
->exp_max
== 1) &&
7565 (sub
->exp_left
->type
== XML_EXP_ATOM
)) {
7566 /* TODO: handle wildcards */
7567 if (exp
->exp_str
== sub
->exp_left
->exp_str
) {
7569 printf("Atom match -> Empty\n");
7574 printf("Atom mismatch -> Forbid\n");
7576 return(forbiddenExp
);
7579 printf("Complex exp vs Atom -> Forbid\n");
7581 return(forbiddenExp
);
7583 /* try to get the sequence consumed only if possible */
7584 if (xmlExpCheckCard(exp
->exp_left
, sub
)) {
7585 /* See if the sequence can be consumed directly */
7587 printf("Seq trying left only\n");
7589 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7590 if ((ret
!= forbiddenExp
) && (ret
!= NULL
)) {
7592 printf("Seq trying left only worked\n");
7595 * TODO: assumption here that we are determinist
7596 * i.e. we won't get to a nillable exp left
7597 * subset which could be matched by the right
7599 * e.g.: (a | b)+,(a | c) and 'a+,a'
7601 exp
->exp_right
->ref
++;
7602 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
,
7603 exp
->exp_right
, NULL
, 0, 0));
7607 printf("Seq: left too short\n");
7610 /* Try instead to decompose */
7611 if (sub
->type
== XML_EXP_COUNT
) {
7615 printf("Seq: sub is a count\n");
7617 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
->exp_left
);
7620 if (ret
!= forbiddenExp
) {
7622 printf("Seq , Count match on left\n");
7624 if (sub
->exp_max
< 0)
7627 max
= sub
->exp_max
-1;
7628 if (sub
->exp_min
> 0)
7629 min
= sub
->exp_min
-1;
7632 exp
->exp_right
->ref
++;
7633 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
,
7634 exp
->exp_right
, NULL
, 0, 0);
7638 sub
->exp_left
->ref
++;
7639 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
,
7640 sub
->exp_left
, NULL
, NULL
, min
, max
);
7642 xmlExpFree(ctxt
, tmp
);
7645 ret
= xmlExpExpDeriveInt(ctxt
, tmp
, tmp2
);
7646 xmlExpFree(ctxt
, tmp
);
7647 xmlExpFree(ctxt
, tmp2
);
7651 /* we made no progress on structured operations */
7655 printf("Or , trying both side\n");
7657 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7660 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_right
, sub
);
7662 xmlExpFree(ctxt
, ret
);
7665 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, tmp
, NULL
, 0, 0));
7666 case XML_EXP_COUNT
: {
7669 if (sub
->type
== XML_EXP_COUNT
) {
7671 * Try to see if the loop is completely subsumed
7673 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
->exp_left
);
7676 if (tmp
== forbiddenExp
) {
7680 printf("Count, Count inner don't subsume\n");
7682 mult
= xmlExpDivide(ctxt
, sub
->exp_left
, exp
->exp_left
,
7686 printf("Count, Count not multiple => forbidden\n");
7688 return(forbiddenExp
);
7690 if (sub
->exp_max
== -1) {
7692 if (exp
->exp_max
== -1) {
7693 if (exp
->exp_min
<= sub
->exp_min
* mult
)
7696 min
= exp
->exp_min
- sub
->exp_min
* mult
;
7699 printf("Count, Count finite can't subsume infinite\n");
7701 xmlExpFree(ctxt
, tmp
);
7702 return(forbiddenExp
);
7705 if (exp
->exp_max
== -1) {
7707 printf("Infinite loop consume mult finite loop\n");
7709 if (exp
->exp_min
> sub
->exp_min
* mult
) {
7711 min
= exp
->exp_min
- sub
->exp_min
* mult
;
7717 if (exp
->exp_max
< sub
->exp_max
* mult
) {
7719 printf("loops max mult mismatch => forbidden\n");
7721 xmlExpFree(ctxt
, tmp
);
7722 return(forbiddenExp
);
7724 if (sub
->exp_max
* mult
> exp
->exp_min
)
7727 min
= exp
->exp_min
- sub
->exp_max
* mult
;
7728 max
= exp
->exp_max
- sub
->exp_max
* mult
;
7731 } else if (!IS_NILLABLE(tmp
)) {
7733 * TODO: loop here to try to grow if working on finite
7737 printf("Count, Count remain not nillable => forbidden\n");
7739 xmlExpFree(ctxt
, tmp
);
7740 return(forbiddenExp
);
7741 } else if (sub
->exp_max
== -1) {
7742 if (exp
->exp_max
== -1) {
7743 if (exp
->exp_min
<= sub
->exp_min
) {
7745 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7751 printf("Infinite loops min => Count(X,Inf)\n");
7754 min
= exp
->exp_min
- sub
->exp_min
;
7756 } else if (exp
->exp_min
> sub
->exp_min
) {
7758 printf("loops min mismatch 1 => forbidden ???\n");
7760 xmlExpFree(ctxt
, tmp
);
7761 return(forbiddenExp
);
7767 if (exp
->exp_max
== -1) {
7769 printf("Infinite loop consume finite loop\n");
7771 if (exp
->exp_min
> sub
->exp_min
) {
7773 min
= exp
->exp_min
- sub
->exp_min
;
7779 if (exp
->exp_max
< sub
->exp_max
) {
7781 printf("loops max mismatch => forbidden\n");
7783 xmlExpFree(ctxt
, tmp
);
7784 return(forbiddenExp
);
7786 if (sub
->exp_max
> exp
->exp_min
)
7789 min
= exp
->exp_min
- sub
->exp_max
;
7790 max
= exp
->exp_max
- sub
->exp_max
;
7794 printf("loops match => SEQ(COUNT())\n");
7796 exp
->exp_left
->ref
++;
7797 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
,
7798 NULL
, NULL
, min
, max
);
7802 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, tmp
, tmp2
,
7806 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7809 if (tmp
== forbiddenExp
) {
7811 printf("loop mismatch => forbidden\n");
7813 return(forbiddenExp
);
7815 if (exp
->exp_min
> 0)
7816 min
= exp
->exp_min
- 1;
7819 if (exp
->exp_max
< 0)
7822 max
= exp
->exp_max
- 1;
7825 printf("loop match => SEQ(COUNT())\n");
7827 exp
->exp_left
->ref
++;
7828 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
,
7829 NULL
, NULL
, min
, max
);
7832 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, tmp
, tmp2
,
7839 printf("Fallback to derivative\n");
7841 if (IS_NILLABLE(sub
)) {
7842 if (!(IS_NILLABLE(exp
)))
7843 return(forbiddenExp
);
7849 * here the structured derivation made no progress so
7850 * we use the default token based derivation to force one more step
7852 if (ctxt
->tabSize
== 0)
7855 tab
= (const xmlChar
**) xmlMalloc(ctxt
->tabSize
*
7856 sizeof(const xmlChar
*));
7862 * collect all the strings accepted by the subexpression on input
7864 len
= xmlExpGetStartInt(ctxt
, sub
, tab
, ctxt
->tabSize
, 0);
7866 const xmlChar
**temp
;
7867 temp
= (const xmlChar
**) xmlRealloc((xmlChar
**) tab
, ctxt
->tabSize
* 2 *
7868 sizeof(const xmlChar
*));
7870 xmlFree((xmlChar
**) tab
);
7875 len
= xmlExpGetStartInt(ctxt
, sub
, tab
, ctxt
->tabSize
, 0);
7877 for (i
= 0;i
< len
;i
++) {
7878 tmp
= xmlExpStringDeriveInt(ctxt
, exp
, tab
[i
]);
7879 if ((tmp
== NULL
) || (tmp
== forbiddenExp
)) {
7880 xmlExpFree(ctxt
, ret
);
7881 xmlFree((xmlChar
**) tab
);
7884 tmp2
= xmlExpStringDeriveInt(ctxt
, sub
, tab
[i
]);
7885 if ((tmp2
== NULL
) || (tmp2
== forbiddenExp
)) {
7886 xmlExpFree(ctxt
, tmp
);
7887 xmlExpFree(ctxt
, ret
);
7888 xmlFree((xmlChar
**) tab
);
7891 tmp3
= xmlExpExpDeriveInt(ctxt
, tmp
, tmp2
);
7892 xmlExpFree(ctxt
, tmp
);
7893 xmlExpFree(ctxt
, tmp2
);
7895 if ((tmp3
== NULL
) || (tmp3
== forbiddenExp
)) {
7896 xmlExpFree(ctxt
, ret
);
7897 xmlFree((xmlChar
**) tab
);
7904 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, tmp3
, NULL
, 0, 0);
7906 xmlFree((xmlChar
**) tab
);
7911 xmlFree((xmlChar
**) tab
);
7917 * @ctxt: the expressions context
7918 * @exp: the englobing expression
7919 * @sub: the subexpression
7921 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7922 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7923 * it usually takes less than linear time and can handle expressions generating
7924 * infinite languages.
7926 * Returns the resulting expression or NULL in case of internal error, the
7927 * result must be freed
7930 xmlExpExpDerive(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7931 if ((exp
== NULL
) || (ctxt
== NULL
) || (sub
== NULL
))
7937 if (IS_NILLABLE(sub
) && (!IS_NILLABLE(exp
))) {
7939 printf("Sub nillable and not exp : can't subsume\n");
7941 return(forbiddenExp
);
7943 if (xmlExpCheckCard(exp
, sub
) == 0) {
7945 printf("sub generate longer sequences than exp : can't subsume\n");
7947 return(forbiddenExp
);
7949 return(xmlExpExpDeriveInt(ctxt
, exp
, sub
));
7954 * @ctxt: the expressions context
7955 * @exp: the englobing expression
7956 * @sub: the subexpression
7958 * Check whether @exp accepts all the languages accepted by @sub
7959 * the input being a subexpression.
7961 * Returns 1 if true 0 if false and -1 in case of failure.
7964 xmlExpSubsume(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7967 if ((exp
== NULL
) || (ctxt
== NULL
) || (sub
== NULL
))
7971 * TODO: speedup by checking the language of sub is a subset of the
7977 if (IS_NILLABLE(sub
) && (!IS_NILLABLE(exp
))) {
7979 printf("Sub nillable and not exp : can't subsume\n");
7983 if (xmlExpCheckCard(exp
, sub
) == 0) {
7985 printf("sub generate longer sequences than exp : can't subsume\n");
7989 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
);
7991 printf("Result derivation :\n");
7996 if (tmp
== forbiddenExp
)
7998 if (tmp
== emptyExp
)
8000 if ((tmp
!= NULL
) && (IS_NILLABLE(tmp
))) {
8001 xmlExpFree(ctxt
, tmp
);
8004 xmlExpFree(ctxt
, tmp
);
8008 /************************************************************************
8010 * Parsing expression *
8012 ************************************************************************/
8014 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt
);
8017 #define CUR (*ctxt->cur)
8019 #define NEXT ctxt->cur++;
8021 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
8022 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
8025 xmlExpParseNumber(xmlExpCtxtPtr ctxt
) {
8033 if ((CUR
< '0') || (CUR
> '9'))
8035 while ((CUR
>= '0') && (CUR
<= '9')) {
8036 ret
= ret
* 10 + (CUR
- '0');
8042 static xmlExpNodePtr
8043 xmlExpParseOr(xmlExpCtxtPtr ctxt
) {
8050 if (*ctxt
->cur
== '(') {
8052 ret
= xmlExpParseExpr(ctxt
);
8054 if (*ctxt
->cur
!= ')') {
8055 fprintf(stderr
, "unbalanced '(' : %s\n", base
);
8056 xmlExpFree(ctxt
, ret
);
8061 goto parse_quantifier
;
8063 while ((CUR
!= 0) && (!(IS_BLANK(CUR
))) && (CUR
!= '(') &&
8064 (CUR
!= ')') && (CUR
!= '|') && (CUR
!= ',') && (CUR
!= '{') &&
8065 (CUR
!= '*') && (CUR
!= '+') && (CUR
!= '?') && (CUR
!= '}'))
8067 val
= xmlDictLookup(ctxt
->dict
, BAD_CAST base
, ctxt
->cur
- base
);
8070 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_ATOM
, NULL
, NULL
, val
, 0, 0);
8079 min
= xmlExpParseNumber(ctxt
);
8081 xmlExpFree(ctxt
, ret
);
8087 max
= xmlExpParseNumber(ctxt
);
8092 xmlExpFree(ctxt
, ret
);
8096 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8099 } else if (CUR
== '?') {
8101 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8104 } else if (CUR
== '+') {
8106 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8109 } else if (CUR
== '*') {
8111 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8119 static xmlExpNodePtr
8120 xmlExpParseSeq(xmlExpCtxtPtr ctxt
) {
8121 xmlExpNodePtr ret
, right
;
8123 ret
= xmlExpParseOr(ctxt
);
8125 while (CUR
== '|') {
8127 right
= xmlExpParseOr(ctxt
);
8128 if (right
== NULL
) {
8129 xmlExpFree(ctxt
, ret
);
8132 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, right
, NULL
, 0, 0);
8139 static xmlExpNodePtr
8140 xmlExpParseExpr(xmlExpCtxtPtr ctxt
) {
8141 xmlExpNodePtr ret
, right
;
8143 ret
= xmlExpParseSeq(ctxt
);
8145 while (CUR
== ',') {
8147 right
= xmlExpParseSeq(ctxt
);
8148 if (right
== NULL
) {
8149 xmlExpFree(ctxt
, ret
);
8152 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, right
, NULL
, 0, 0);
8161 * @ctxt: the expressions context
8162 * @expr: the 0 terminated string
8164 * Minimal parser for regexps, it understand the following constructs
8165 * - string terminals
8166 * - choice operator |
8167 * - sequence operator ,
8168 * - subexpressions (...)
8169 * - usual cardinality operators + * and ?
8170 * - finite sequences { min, max }
8171 * - infinite sequences { min, * }
8172 * There is minimal checkings made especially no checking on strings values
8174 * Returns a new expression or NULL in case of failure
8177 xmlExpParse(xmlExpCtxtPtr ctxt
, const char *expr
) {
8183 ret
= xmlExpParseExpr(ctxt
);
8185 if (*ctxt
->cur
!= 0) {
8186 xmlExpFree(ctxt
, ret
);
8193 xmlExpDumpInt(xmlBufferPtr buf
, xmlExpNodePtr expr
, int glob
) {
8196 if (expr
== NULL
) return;
8197 if (glob
) xmlBufferWriteChar(buf
, "(");
8198 switch (expr
->type
) {
8200 xmlBufferWriteChar(buf
, "empty");
8202 case XML_EXP_FORBID
:
8203 xmlBufferWriteChar(buf
, "forbidden");
8206 xmlBufferWriteCHAR(buf
, expr
->exp_str
);
8210 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8211 xmlExpDumpInt(buf
, c
, 1);
8213 xmlExpDumpInt(buf
, c
, 0);
8214 xmlBufferWriteChar(buf
, " , ");
8215 c
= expr
->exp_right
;
8216 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8217 xmlExpDumpInt(buf
, c
, 1);
8219 xmlExpDumpInt(buf
, c
, 0);
8223 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8224 xmlExpDumpInt(buf
, c
, 1);
8226 xmlExpDumpInt(buf
, c
, 0);
8227 xmlBufferWriteChar(buf
, " | ");
8228 c
= expr
->exp_right
;
8229 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8230 xmlExpDumpInt(buf
, c
, 1);
8232 xmlExpDumpInt(buf
, c
, 0);
8234 case XML_EXP_COUNT
: {
8238 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8239 xmlExpDumpInt(buf
, c
, 1);
8241 xmlExpDumpInt(buf
, c
, 0);
8242 if ((expr
->exp_min
== 0) && (expr
->exp_max
== 1)) {
8245 } else if ((expr
->exp_min
== 0) && (expr
->exp_max
== -1)) {
8248 } else if ((expr
->exp_min
== 1) && (expr
->exp_max
== -1)) {
8251 } else if (expr
->exp_max
== expr
->exp_min
) {
8252 snprintf(rep
, 39, "{%d}", expr
->exp_min
);
8253 } else if (expr
->exp_max
< 0) {
8254 snprintf(rep
, 39, "{%d,inf}", expr
->exp_min
);
8256 snprintf(rep
, 39, "{%d,%d}", expr
->exp_min
, expr
->exp_max
);
8259 xmlBufferWriteChar(buf
, rep
);
8263 fprintf(stderr
, "Error in tree\n");
8266 xmlBufferWriteChar(buf
, ")");
8270 * @buf: a buffer to receive the output
8271 * @expr: the compiled expression
8273 * Serialize the expression as compiled to the buffer
8276 xmlExpDump(xmlBufferPtr buf
, xmlExpNodePtr expr
) {
8277 if ((buf
== NULL
) || (expr
== NULL
))
8279 xmlExpDumpInt(buf
, expr
, 0);
8284 * @expr: a compiled expression
8286 * Indicate the maximum number of input a expression can accept
8288 * Returns the maximum length or -1 in case of error
8291 xmlExpMaxToken(xmlExpNodePtr expr
) {
8294 return(expr
->c_max
);
8298 * xmlExpCtxtNbNodes:
8299 * @ctxt: an expression context
8301 * Debugging facility provides the number of allocated nodes at a that point
8303 * Returns the number of nodes in use or -1 in case of error
8306 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt
) {
8309 return(ctxt
->nb_nodes
);
8314 * @ctxt: an expression context
8316 * Debugging facility provides the number of allocated nodes over lifetime
8318 * Returns the number of nodes ever allocated or -1 in case of error
8321 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt
) {
8324 return(ctxt
->nb_cons
);
8327 #endif /* LIBXML_EXPR_ENABLED */
8329 #endif /* LIBXML_REGEXP_ENABLED */