2 * regexp.c: generic and extensible Regular Expression engine
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
12 * See Copyright for the status of this software.
14 * Daniel Veillard <veillard@redhat.com>
20 #ifdef LIBXML_REGEXP_ENABLED
22 /* #define DEBUG_ERR */
28 #include <libxml/tree.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/xmlregexp.h>
31 #include <libxml/xmlautomata.h>
32 #include <libxml/xmlunicode.h>
34 #include "private/error.h"
35 #include "private/regexp.h"
38 #define SIZE_MAX ((size_t) -1)
41 /* #define DEBUG_REGEXP_GRAPH */
42 /* #define DEBUG_REGEXP_EXEC */
43 /* #define DEBUG_PUSH */
44 /* #define DEBUG_COMPACTION */
46 #define MAX_PUSH 10000000
52 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
53 xmlRegexpErrCompile(ctxt, str);
54 #define NEXT ctxt->cur++
55 #define CUR (*(ctxt->cur))
56 #define NXT(index) (ctxt->cur[index])
58 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
59 #define NEXTL(l) ctxt->cur += l;
60 #define XML_REG_STRING_SEPARATOR '|'
62 * Need PREV to check on a '-' within a Character Group. May only be used
63 * when it's guaranteed that cur is not at the beginning of ctxt->string!
65 #define PREV (ctxt->cur[-1])
70 * macro to flag unimplemented blocks
73 xmlGenericError(xmlGenericErrorContext, \
74 "Unimplemented block at %s:%d\n", \
77 /************************************************************************
79 * Datatypes and structures *
81 ************************************************************************/
84 * Note: the order of the enums below is significant, do not shuffle
87 XML_REGEXP_EPSILON
= 1,
90 XML_REGEXP_SUBREG
, /* used for () sub regexps */
92 XML_REGEXP_ANYCHAR
, /* . */
93 XML_REGEXP_ANYSPACE
, /* \s */
94 XML_REGEXP_NOTSPACE
, /* \S */
95 XML_REGEXP_INITNAME
, /* \l */
96 XML_REGEXP_NOTINITNAME
, /* \L */
97 XML_REGEXP_NAMECHAR
, /* \c */
98 XML_REGEXP_NOTNAMECHAR
, /* \C */
99 XML_REGEXP_DECIMAL
, /* \d */
100 XML_REGEXP_NOTDECIMAL
, /* \D */
101 XML_REGEXP_REALCHAR
, /* \w */
102 XML_REGEXP_NOTREALCHAR
, /* \W */
103 XML_REGEXP_LETTER
= 100,
104 XML_REGEXP_LETTER_UPPERCASE
,
105 XML_REGEXP_LETTER_LOWERCASE
,
106 XML_REGEXP_LETTER_TITLECASE
,
107 XML_REGEXP_LETTER_MODIFIER
,
108 XML_REGEXP_LETTER_OTHERS
,
110 XML_REGEXP_MARK_NONSPACING
,
111 XML_REGEXP_MARK_SPACECOMBINING
,
112 XML_REGEXP_MARK_ENCLOSING
,
114 XML_REGEXP_NUMBER_DECIMAL
,
115 XML_REGEXP_NUMBER_LETTER
,
116 XML_REGEXP_NUMBER_OTHERS
,
118 XML_REGEXP_PUNCT_CONNECTOR
,
119 XML_REGEXP_PUNCT_DASH
,
120 XML_REGEXP_PUNCT_OPEN
,
121 XML_REGEXP_PUNCT_CLOSE
,
122 XML_REGEXP_PUNCT_INITQUOTE
,
123 XML_REGEXP_PUNCT_FINQUOTE
,
124 XML_REGEXP_PUNCT_OTHERS
,
126 XML_REGEXP_SEPAR_SPACE
,
127 XML_REGEXP_SEPAR_LINE
,
128 XML_REGEXP_SEPAR_PARA
,
130 XML_REGEXP_SYMBOL_MATH
,
131 XML_REGEXP_SYMBOL_CURRENCY
,
132 XML_REGEXP_SYMBOL_MODIFIER
,
133 XML_REGEXP_SYMBOL_OTHERS
,
135 XML_REGEXP_OTHER_CONTROL
,
136 XML_REGEXP_OTHER_FORMAT
,
137 XML_REGEXP_OTHER_PRIVATE
,
139 XML_REGEXP_BLOCK_NAME
143 XML_REGEXP_QUANT_EPSILON
= 1,
144 XML_REGEXP_QUANT_ONCE
,
145 XML_REGEXP_QUANT_OPT
,
146 XML_REGEXP_QUANT_MULT
,
147 XML_REGEXP_QUANT_PLUS
,
148 XML_REGEXP_QUANT_ONCEONLY
,
149 XML_REGEXP_QUANT_ALL
,
150 XML_REGEXP_QUANT_RANGE
154 XML_REGEXP_START_STATE
= 1,
155 XML_REGEXP_FINAL_STATE
,
156 XML_REGEXP_TRANS_STATE
,
157 XML_REGEXP_SINK_STATE
,
158 XML_REGEXP_UNREACH_STATE
162 XML_REGEXP_MARK_NORMAL
= 0,
163 XML_REGEXP_MARK_START
,
164 XML_REGEXP_MARK_VISITED
167 typedef struct _xmlRegRange xmlRegRange
;
168 typedef xmlRegRange
*xmlRegRangePtr
;
170 struct _xmlRegRange
{
171 int neg
; /* 0 normal, 1 not, 2 exclude */
178 typedef struct _xmlRegAtom xmlRegAtom
;
179 typedef xmlRegAtom
*xmlRegAtomPtr
;
181 typedef struct _xmlAutomataState xmlRegState
;
182 typedef xmlRegState
*xmlRegStatePtr
;
187 xmlRegQuantType quant
;
195 xmlRegStatePtr start
;
196 xmlRegStatePtr start0
;
200 xmlRegRangePtr
*ranges
;
204 typedef struct _xmlRegCounter xmlRegCounter
;
205 typedef xmlRegCounter
*xmlRegCounterPtr
;
207 struct _xmlRegCounter
{
212 typedef struct _xmlRegTrans xmlRegTrans
;
213 typedef xmlRegTrans
*xmlRegTransPtr
;
215 struct _xmlRegTrans
{
223 struct _xmlAutomataState
{
224 xmlRegStateType type
;
225 xmlRegMarkedType mark
;
226 xmlRegMarkedType markd
;
227 xmlRegMarkedType reached
;
232 /* knowing states pointing to us can speed things up */
238 typedef struct _xmlAutomata xmlRegParserCtxt
;
239 typedef xmlRegParserCtxt
*xmlRegParserCtxtPtr
;
241 #define AM_AUTOMATA_RNG 1
243 struct _xmlAutomata
{
250 xmlRegStatePtr start
;
252 xmlRegStatePtr state
;
258 xmlRegAtomPtr
*atoms
;
262 xmlRegStatePtr
*states
;
266 xmlRegCounter
*counters
;
278 xmlRegStatePtr
*states
;
280 xmlRegAtomPtr
*atoms
;
282 xmlRegCounter
*counters
;
286 * That's the compact form for determinists automatas
295 typedef struct _xmlRegExecRollback xmlRegExecRollback
;
296 typedef xmlRegExecRollback
*xmlRegExecRollbackPtr
;
298 struct _xmlRegExecRollback
{
299 xmlRegStatePtr state
;/* the current state */
300 int index
; /* the index in the input stack */
301 int nextbranch
; /* the next transition to explore in that state */
302 int *counts
; /* save the automata state if it has some */
305 typedef struct _xmlRegInputToken xmlRegInputToken
;
306 typedef xmlRegInputToken
*xmlRegInputTokenPtr
;
308 struct _xmlRegInputToken
{
313 struct _xmlRegExecCtxt
{
314 int status
; /* execution status != 0 indicate an error */
315 int determinist
; /* did we find an indeterministic behaviour */
316 xmlRegexpPtr comp
; /* the compiled regexp */
317 xmlRegExecCallbacks callback
;
320 xmlRegStatePtr state
;/* the current state */
321 int transno
; /* the current transition on that state */
322 int transcount
; /* the number of chars in char counted transitions */
325 * A stack of rollback states
329 xmlRegExecRollback
*rollbacks
;
332 * The state of the automata if any
343 const xmlChar
*inputString
; /* when operating on characters */
344 xmlRegInputTokenPtr inputStack
;/* when operating on strings */
349 int errStateNo
; /* the error state number */
350 xmlRegStatePtr errState
; /* the error state */
351 xmlChar
*errString
; /* the string raising the error */
352 int *errCounts
; /* counters at the error state */
356 #define REGEXP_ALL_COUNTER 0x123456
357 #define REGEXP_ALL_LAX_COUNTER 0x123457
359 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
);
360 static void xmlRegFreeState(xmlRegStatePtr state
);
361 static void xmlRegFreeAtom(xmlRegAtomPtr atom
);
362 static int xmlRegStrEqualWildcard(const xmlChar
*expStr
, const xmlChar
*valStr
);
363 static int xmlRegCheckCharacter(xmlRegAtomPtr atom
, int codepoint
);
364 static int xmlRegCheckCharacterRange(xmlRegAtomType type
, int codepoint
,
365 int neg
, int start
, int end
, const xmlChar
*blockName
);
367 /************************************************************************
369 * Regexp memory error handler *
371 ************************************************************************/
373 * xmlRegexpErrMemory:
374 * @extra: extra information
376 * Handle an out of memory condition
379 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt
, const char *extra
)
381 const char *regexp
= NULL
;
383 regexp
= (const char *) ctxt
->string
;
384 ctxt
->error
= XML_ERR_NO_MEMORY
;
386 __xmlRaiseError(NULL
, NULL
, NULL
, NULL
, NULL
, XML_FROM_REGEXP
,
387 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, extra
,
389 "Memory allocation failed : %s\n", extra
);
393 * xmlRegexpErrCompile:
394 * @extra: extra information
396 * Handle a compilation failure
399 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt
, const char *extra
)
401 const char *regexp
= NULL
;
405 regexp
= (const char *) ctxt
->string
;
406 idx
= ctxt
->cur
- ctxt
->string
;
407 ctxt
->error
= XML_REGEXP_COMPILE_ERROR
;
409 __xmlRaiseError(NULL
, NULL
, NULL
, NULL
, NULL
, XML_FROM_REGEXP
,
410 XML_REGEXP_COMPILE_ERROR
, XML_ERR_FATAL
, NULL
, 0, extra
,
411 regexp
, NULL
, idx
, 0,
412 "failed to compile: %s\n", extra
);
415 /************************************************************************
417 * Allocation/Deallocation *
419 ************************************************************************/
421 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
);
425 * @dim1: size of first dimension
426 * @dim2: size of second dimension
427 * @elemSize: size of element
429 * Allocate a two-dimensional array and set all elements to zero.
431 * Returns the new array or NULL in case of error.
434 xmlRegCalloc2(size_t dim1
, size_t dim2
, size_t elemSize
) {
438 /* Check for overflow */
439 if ((dim2
== 0) || (elemSize
== 0) ||
440 (dim1
> SIZE_MAX
/ dim2
/ elemSize
))
442 totalSize
= dim1
* dim2
* elemSize
;
443 ret
= xmlMalloc(totalSize
);
445 memset(ret
, 0, totalSize
);
450 * xmlRegEpxFromParse:
451 * @ctxt: the parser context used to build it
453 * Allocate a new regexp and fill it with the result from the parser
455 * Returns the new regexp or NULL in case of error
458 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt
) {
461 ret
= (xmlRegexpPtr
) xmlMalloc(sizeof(xmlRegexp
));
463 xmlRegexpErrMemory(ctxt
, "compiling regexp");
466 memset(ret
, 0, sizeof(xmlRegexp
));
467 ret
->string
= ctxt
->string
;
468 ret
->nbStates
= ctxt
->nbStates
;
469 ret
->states
= ctxt
->states
;
470 ret
->nbAtoms
= ctxt
->nbAtoms
;
471 ret
->atoms
= ctxt
->atoms
;
472 ret
->nbCounters
= ctxt
->nbCounters
;
473 ret
->counters
= ctxt
->counters
;
474 ret
->determinist
= ctxt
->determinist
;
475 ret
->flags
= ctxt
->flags
;
476 if (ret
->determinist
== -1) {
477 xmlRegexpIsDeterminist(ret
);
480 if ((ret
->determinist
!= 0) &&
481 (ret
->nbCounters
== 0) &&
483 (ret
->atoms
!= NULL
) &&
484 (ret
->atoms
[0] != NULL
) &&
485 (ret
->atoms
[0]->type
== XML_REGEXP_STRING
)) {
486 int i
, j
, nbstates
= 0, nbatoms
= 0;
495 * Switch to a compact representation
496 * 1/ counting the effective number of states left
497 * 2/ counting the unique number of atoms, and check that
498 * they are all of the string type
499 * 3/ build a table state x atom for the transitions
502 stateRemap
= xmlMalloc(ret
->nbStates
* sizeof(int));
503 if (stateRemap
== NULL
) {
504 xmlRegexpErrMemory(ctxt
, "compiling regexp");
508 for (i
= 0;i
< ret
->nbStates
;i
++) {
509 if (ret
->states
[i
] != NULL
) {
510 stateRemap
[i
] = nbstates
;
516 #ifdef DEBUG_COMPACTION
517 printf("Final: %d states\n", nbstates
);
519 stringMap
= xmlMalloc(ret
->nbAtoms
* sizeof(char *));
520 if (stringMap
== NULL
) {
521 xmlRegexpErrMemory(ctxt
, "compiling regexp");
526 stringRemap
= xmlMalloc(ret
->nbAtoms
* sizeof(int));
527 if (stringRemap
== NULL
) {
528 xmlRegexpErrMemory(ctxt
, "compiling regexp");
534 for (i
= 0;i
< ret
->nbAtoms
;i
++) {
535 if ((ret
->atoms
[i
]->type
== XML_REGEXP_STRING
) &&
536 (ret
->atoms
[i
]->quant
== XML_REGEXP_QUANT_ONCE
)) {
537 value
= ret
->atoms
[i
]->valuep
;
538 for (j
= 0;j
< nbatoms
;j
++) {
539 if (xmlStrEqual(stringMap
[j
], value
)) {
545 stringRemap
[i
] = nbatoms
;
546 stringMap
[nbatoms
] = xmlStrdup(value
);
547 if (stringMap
[nbatoms
] == NULL
) {
548 for (i
= 0;i
< nbatoms
;i
++)
549 xmlFree(stringMap
[i
]);
550 xmlFree(stringRemap
);
560 xmlFree(stringRemap
);
561 for (i
= 0;i
< nbatoms
;i
++)
562 xmlFree(stringMap
[i
]);
568 #ifdef DEBUG_COMPACTION
569 printf("Final: %d atoms\n", nbatoms
);
571 transitions
= (int *) xmlRegCalloc2(nbstates
+ 1, nbatoms
+ 1,
573 if (transitions
== NULL
) {
575 xmlFree(stringRemap
);
576 for (i
= 0;i
< nbatoms
;i
++)
577 xmlFree(stringMap
[i
]);
584 * Allocate the transition table. The first entry for each
585 * state corresponds to the state type.
589 for (i
= 0;i
< ret
->nbStates
;i
++) {
590 int stateno
, atomno
, targetno
, prev
;
591 xmlRegStatePtr state
;
592 xmlRegTransPtr trans
;
594 stateno
= stateRemap
[i
];
597 state
= ret
->states
[i
];
599 transitions
[stateno
* (nbatoms
+ 1)] = state
->type
;
601 for (j
= 0;j
< state
->nbTrans
;j
++) {
602 trans
= &(state
->trans
[j
]);
603 if ((trans
->to
== -1) || (trans
->atom
== NULL
))
605 atomno
= stringRemap
[trans
->atom
->no
];
606 if ((trans
->atom
->data
!= NULL
) && (transdata
== NULL
)) {
607 transdata
= (void **) xmlRegCalloc2(nbstates
, nbatoms
,
609 if (transdata
== NULL
) {
610 xmlRegexpErrMemory(ctxt
, "compiling regexp");
614 targetno
= stateRemap
[trans
->to
];
616 * if the same atom can generate transitions to 2 different
617 * states then it means the automata is not deterministic and
618 * the compact form can't be used !
620 prev
= transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1];
622 if (prev
!= targetno
+ 1) {
623 ret
->determinist
= 0;
624 #ifdef DEBUG_COMPACTION
625 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
626 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
627 printf(" previous to is %d\n", prev
);
629 if (transdata
!= NULL
)
631 xmlFree(transitions
);
633 xmlFree(stringRemap
);
634 for (i
= 0;i
< nbatoms
;i
++)
635 xmlFree(stringMap
[i
]);
641 printf("State %d trans %d: atom %d to %d : %d to %d\n",
642 i
, j
, trans
->atom
->no
, trans
->to
, atomno
, targetno
);
644 transitions
[stateno
* (nbatoms
+ 1) + atomno
+ 1] =
645 targetno
+ 1; /* to avoid 0 */
646 if (transdata
!= NULL
)
647 transdata
[stateno
* nbatoms
+ atomno
] =
652 ret
->determinist
= 1;
653 #ifdef DEBUG_COMPACTION
657 for (i
= 0;i
< nbstates
;i
++) {
658 for (j
= 0;j
< nbatoms
+ 1;j
++) {
659 printf("%02d ", transitions
[i
* (nbatoms
+ 1) + j
]);
666 * Cleanup of the old data
668 if (ret
->states
!= NULL
) {
669 for (i
= 0;i
< ret
->nbStates
;i
++)
670 xmlRegFreeState(ret
->states
[i
]);
671 xmlFree(ret
->states
);
675 if (ret
->atoms
!= NULL
) {
676 for (i
= 0;i
< ret
->nbAtoms
;i
++)
677 xmlRegFreeAtom(ret
->atoms
[i
]);
683 ret
->compact
= transitions
;
684 ret
->transdata
= transdata
;
685 ret
->stringMap
= stringMap
;
686 ret
->nbstrings
= nbatoms
;
687 ret
->nbstates
= nbstates
;
689 xmlFree(stringRemap
);
697 ctxt
->nbCounters
= 0;
698 ctxt
->counters
= NULL
;
703 * xmlRegNewParserCtxt:
704 * @string: the string to parse
706 * Allocate a new regexp parser context
708 * Returns the new context or NULL in case of error
710 static xmlRegParserCtxtPtr
711 xmlRegNewParserCtxt(const xmlChar
*string
) {
712 xmlRegParserCtxtPtr ret
;
714 ret
= (xmlRegParserCtxtPtr
) xmlMalloc(sizeof(xmlRegParserCtxt
));
717 memset(ret
, 0, sizeof(xmlRegParserCtxt
));
719 ret
->string
= xmlStrdup(string
);
720 ret
->cur
= ret
->string
;
724 ret
->determinist
= -1;
730 * @ctxt: the regexp parser context
731 * @neg: is that negative
732 * @type: the type of range
733 * @start: the start codepoint
734 * @end: the end codepoint
736 * Allocate a new regexp range
738 * Returns the new range or NULL in case of error
740 static xmlRegRangePtr
741 xmlRegNewRange(xmlRegParserCtxtPtr ctxt
,
742 int neg
, xmlRegAtomType type
, int start
, int end
) {
745 ret
= (xmlRegRangePtr
) xmlMalloc(sizeof(xmlRegRange
));
747 xmlRegexpErrMemory(ctxt
, "allocating range");
759 * @range: the regexp range
761 * Free a regexp range
764 xmlRegFreeRange(xmlRegRangePtr range
) {
768 if (range
->blockName
!= NULL
)
769 xmlFree(range
->blockName
);
775 * @range: the regexp range
777 * Copy a regexp range
779 * Returns the new copy or NULL in case of error.
781 static xmlRegRangePtr
782 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt
, xmlRegRangePtr range
) {
788 ret
= xmlRegNewRange(ctxt
, range
->neg
, range
->type
, range
->start
,
792 if (range
->blockName
!= NULL
) {
793 ret
->blockName
= xmlStrdup(range
->blockName
);
794 if (ret
->blockName
== NULL
) {
795 xmlRegexpErrMemory(ctxt
, "allocating range");
796 xmlRegFreeRange(ret
);
805 * @ctxt: the regexp parser context
806 * @type: the type of atom
808 * Allocate a new atom
810 * Returns the new atom or NULL in case of error
813 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt
, xmlRegAtomType type
) {
816 ret
= (xmlRegAtomPtr
) xmlMalloc(sizeof(xmlRegAtom
));
818 xmlRegexpErrMemory(ctxt
, "allocating atom");
821 memset(ret
, 0, sizeof(xmlRegAtom
));
823 ret
->quant
= XML_REGEXP_QUANT_ONCE
;
831 * @atom: the regexp atom
836 xmlRegFreeAtom(xmlRegAtomPtr atom
) {
842 for (i
= 0;i
< atom
->nbRanges
;i
++)
843 xmlRegFreeRange(atom
->ranges
[i
]);
844 if (atom
->ranges
!= NULL
)
845 xmlFree(atom
->ranges
);
846 if ((atom
->type
== XML_REGEXP_STRING
) && (atom
->valuep
!= NULL
))
847 xmlFree(atom
->valuep
);
848 if ((atom
->type
== XML_REGEXP_STRING
) && (atom
->valuep2
!= NULL
))
849 xmlFree(atom
->valuep2
);
850 if ((atom
->type
== XML_REGEXP_BLOCK_NAME
) && (atom
->valuep
!= NULL
))
851 xmlFree(atom
->valuep
);
857 * @ctxt: the regexp parser context
858 * @atom: the original atom
860 * Allocate a new regexp range
862 * Returns the new atom or NULL in case of error
865 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
) {
868 ret
= (xmlRegAtomPtr
) xmlMalloc(sizeof(xmlRegAtom
));
870 xmlRegexpErrMemory(ctxt
, "copying atom");
873 memset(ret
, 0, sizeof(xmlRegAtom
));
874 ret
->type
= atom
->type
;
875 ret
->quant
= atom
->quant
;
876 ret
->min
= atom
->min
;
877 ret
->max
= atom
->max
;
878 if (atom
->nbRanges
> 0) {
881 ret
->ranges
= (xmlRegRangePtr
*) xmlMalloc(sizeof(xmlRegRangePtr
) *
883 if (ret
->ranges
== NULL
) {
884 xmlRegexpErrMemory(ctxt
, "copying atom");
887 for (i
= 0;i
< atom
->nbRanges
;i
++) {
888 ret
->ranges
[i
] = xmlRegCopyRange(ctxt
, atom
->ranges
[i
]);
889 if (ret
->ranges
[i
] == NULL
)
891 ret
->nbRanges
= i
+ 1;
901 static xmlRegStatePtr
902 xmlRegNewState(xmlRegParserCtxtPtr ctxt
) {
905 ret
= (xmlRegStatePtr
) xmlMalloc(sizeof(xmlRegState
));
907 xmlRegexpErrMemory(ctxt
, "allocating state");
910 memset(ret
, 0, sizeof(xmlRegState
));
911 ret
->type
= XML_REGEXP_TRANS_STATE
;
912 ret
->mark
= XML_REGEXP_MARK_NORMAL
;
918 * @state: the regexp state
920 * Free a regexp state
923 xmlRegFreeState(xmlRegStatePtr state
) {
927 if (state
->trans
!= NULL
)
928 xmlFree(state
->trans
);
929 if (state
->transTo
!= NULL
)
930 xmlFree(state
->transTo
);
935 * xmlRegFreeParserCtxt:
936 * @ctxt: the regexp parser context
938 * Free a regexp parser context
941 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt
) {
946 if (ctxt
->string
!= NULL
)
947 xmlFree(ctxt
->string
);
948 if (ctxt
->states
!= NULL
) {
949 for (i
= 0;i
< ctxt
->nbStates
;i
++)
950 xmlRegFreeState(ctxt
->states
[i
]);
951 xmlFree(ctxt
->states
);
953 if (ctxt
->atoms
!= NULL
) {
954 for (i
= 0;i
< ctxt
->nbAtoms
;i
++)
955 xmlRegFreeAtom(ctxt
->atoms
[i
]);
956 xmlFree(ctxt
->atoms
);
958 if (ctxt
->counters
!= NULL
)
959 xmlFree(ctxt
->counters
);
963 /************************************************************************
965 * Display of Data structures *
967 ************************************************************************/
970 xmlRegPrintAtomType(FILE *output
, xmlRegAtomType type
) {
972 case XML_REGEXP_EPSILON
:
973 fprintf(output
, "epsilon "); break;
974 case XML_REGEXP_CHARVAL
:
975 fprintf(output
, "charval "); break;
976 case XML_REGEXP_RANGES
:
977 fprintf(output
, "ranges "); break;
978 case XML_REGEXP_SUBREG
:
979 fprintf(output
, "subexpr "); break;
980 case XML_REGEXP_STRING
:
981 fprintf(output
, "string "); break;
982 case XML_REGEXP_ANYCHAR
:
983 fprintf(output
, "anychar "); break;
984 case XML_REGEXP_ANYSPACE
:
985 fprintf(output
, "anyspace "); break;
986 case XML_REGEXP_NOTSPACE
:
987 fprintf(output
, "notspace "); break;
988 case XML_REGEXP_INITNAME
:
989 fprintf(output
, "initname "); break;
990 case XML_REGEXP_NOTINITNAME
:
991 fprintf(output
, "notinitname "); break;
992 case XML_REGEXP_NAMECHAR
:
993 fprintf(output
, "namechar "); break;
994 case XML_REGEXP_NOTNAMECHAR
:
995 fprintf(output
, "notnamechar "); break;
996 case XML_REGEXP_DECIMAL
:
997 fprintf(output
, "decimal "); break;
998 case XML_REGEXP_NOTDECIMAL
:
999 fprintf(output
, "notdecimal "); break;
1000 case XML_REGEXP_REALCHAR
:
1001 fprintf(output
, "realchar "); break;
1002 case XML_REGEXP_NOTREALCHAR
:
1003 fprintf(output
, "notrealchar "); break;
1004 case XML_REGEXP_LETTER
:
1005 fprintf(output
, "LETTER "); break;
1006 case XML_REGEXP_LETTER_UPPERCASE
:
1007 fprintf(output
, "LETTER_UPPERCASE "); break;
1008 case XML_REGEXP_LETTER_LOWERCASE
:
1009 fprintf(output
, "LETTER_LOWERCASE "); break;
1010 case XML_REGEXP_LETTER_TITLECASE
:
1011 fprintf(output
, "LETTER_TITLECASE "); break;
1012 case XML_REGEXP_LETTER_MODIFIER
:
1013 fprintf(output
, "LETTER_MODIFIER "); break;
1014 case XML_REGEXP_LETTER_OTHERS
:
1015 fprintf(output
, "LETTER_OTHERS "); break;
1016 case XML_REGEXP_MARK
:
1017 fprintf(output
, "MARK "); break;
1018 case XML_REGEXP_MARK_NONSPACING
:
1019 fprintf(output
, "MARK_NONSPACING "); break;
1020 case XML_REGEXP_MARK_SPACECOMBINING
:
1021 fprintf(output
, "MARK_SPACECOMBINING "); break;
1022 case XML_REGEXP_MARK_ENCLOSING
:
1023 fprintf(output
, "MARK_ENCLOSING "); break;
1024 case XML_REGEXP_NUMBER
:
1025 fprintf(output
, "NUMBER "); break;
1026 case XML_REGEXP_NUMBER_DECIMAL
:
1027 fprintf(output
, "NUMBER_DECIMAL "); break;
1028 case XML_REGEXP_NUMBER_LETTER
:
1029 fprintf(output
, "NUMBER_LETTER "); break;
1030 case XML_REGEXP_NUMBER_OTHERS
:
1031 fprintf(output
, "NUMBER_OTHERS "); break;
1032 case XML_REGEXP_PUNCT
:
1033 fprintf(output
, "PUNCT "); break;
1034 case XML_REGEXP_PUNCT_CONNECTOR
:
1035 fprintf(output
, "PUNCT_CONNECTOR "); break;
1036 case XML_REGEXP_PUNCT_DASH
:
1037 fprintf(output
, "PUNCT_DASH "); break;
1038 case XML_REGEXP_PUNCT_OPEN
:
1039 fprintf(output
, "PUNCT_OPEN "); break;
1040 case XML_REGEXP_PUNCT_CLOSE
:
1041 fprintf(output
, "PUNCT_CLOSE "); break;
1042 case XML_REGEXP_PUNCT_INITQUOTE
:
1043 fprintf(output
, "PUNCT_INITQUOTE "); break;
1044 case XML_REGEXP_PUNCT_FINQUOTE
:
1045 fprintf(output
, "PUNCT_FINQUOTE "); break;
1046 case XML_REGEXP_PUNCT_OTHERS
:
1047 fprintf(output
, "PUNCT_OTHERS "); break;
1048 case XML_REGEXP_SEPAR
:
1049 fprintf(output
, "SEPAR "); break;
1050 case XML_REGEXP_SEPAR_SPACE
:
1051 fprintf(output
, "SEPAR_SPACE "); break;
1052 case XML_REGEXP_SEPAR_LINE
:
1053 fprintf(output
, "SEPAR_LINE "); break;
1054 case XML_REGEXP_SEPAR_PARA
:
1055 fprintf(output
, "SEPAR_PARA "); break;
1056 case XML_REGEXP_SYMBOL
:
1057 fprintf(output
, "SYMBOL "); break;
1058 case XML_REGEXP_SYMBOL_MATH
:
1059 fprintf(output
, "SYMBOL_MATH "); break;
1060 case XML_REGEXP_SYMBOL_CURRENCY
:
1061 fprintf(output
, "SYMBOL_CURRENCY "); break;
1062 case XML_REGEXP_SYMBOL_MODIFIER
:
1063 fprintf(output
, "SYMBOL_MODIFIER "); break;
1064 case XML_REGEXP_SYMBOL_OTHERS
:
1065 fprintf(output
, "SYMBOL_OTHERS "); break;
1066 case XML_REGEXP_OTHER
:
1067 fprintf(output
, "OTHER "); break;
1068 case XML_REGEXP_OTHER_CONTROL
:
1069 fprintf(output
, "OTHER_CONTROL "); break;
1070 case XML_REGEXP_OTHER_FORMAT
:
1071 fprintf(output
, "OTHER_FORMAT "); break;
1072 case XML_REGEXP_OTHER_PRIVATE
:
1073 fprintf(output
, "OTHER_PRIVATE "); break;
1074 case XML_REGEXP_OTHER_NA
:
1075 fprintf(output
, "OTHER_NA "); break;
1076 case XML_REGEXP_BLOCK_NAME
:
1077 fprintf(output
, "BLOCK "); break;
1082 xmlRegPrintQuantType(FILE *output
, xmlRegQuantType type
) {
1084 case XML_REGEXP_QUANT_EPSILON
:
1085 fprintf(output
, "epsilon "); break;
1086 case XML_REGEXP_QUANT_ONCE
:
1087 fprintf(output
, "once "); break;
1088 case XML_REGEXP_QUANT_OPT
:
1089 fprintf(output
, "? "); break;
1090 case XML_REGEXP_QUANT_MULT
:
1091 fprintf(output
, "* "); break;
1092 case XML_REGEXP_QUANT_PLUS
:
1093 fprintf(output
, "+ "); break;
1094 case XML_REGEXP_QUANT_RANGE
:
1095 fprintf(output
, "range "); break;
1096 case XML_REGEXP_QUANT_ONCEONLY
:
1097 fprintf(output
, "onceonly "); break;
1098 case XML_REGEXP_QUANT_ALL
:
1099 fprintf(output
, "all "); break;
1103 xmlRegPrintRange(FILE *output
, xmlRegRangePtr range
) {
1104 fprintf(output
, " range: ");
1106 fprintf(output
, "negative ");
1107 xmlRegPrintAtomType(output
, range
->type
);
1108 fprintf(output
, "%c - %c\n", range
->start
, range
->end
);
1112 xmlRegPrintAtom(FILE *output
, xmlRegAtomPtr atom
) {
1113 fprintf(output
, " atom: ");
1115 fprintf(output
, "NULL\n");
1119 fprintf(output
, "not ");
1120 xmlRegPrintAtomType(output
, atom
->type
);
1121 xmlRegPrintQuantType(output
, atom
->quant
);
1122 if (atom
->quant
== XML_REGEXP_QUANT_RANGE
)
1123 fprintf(output
, "%d-%d ", atom
->min
, atom
->max
);
1124 if (atom
->type
== XML_REGEXP_STRING
)
1125 fprintf(output
, "'%s' ", (char *) atom
->valuep
);
1126 if (atom
->type
== XML_REGEXP_CHARVAL
)
1127 fprintf(output
, "char %c\n", atom
->codepoint
);
1128 else if (atom
->type
== XML_REGEXP_RANGES
) {
1130 fprintf(output
, "%d entries\n", atom
->nbRanges
);
1131 for (i
= 0; i
< atom
->nbRanges
;i
++)
1132 xmlRegPrintRange(output
, atom
->ranges
[i
]);
1133 } else if (atom
->type
== XML_REGEXP_SUBREG
) {
1134 fprintf(output
, "start %d end %d\n", atom
->start
->no
, atom
->stop
->no
);
1136 fprintf(output
, "\n");
1141 xmlRegPrintTrans(FILE *output
, xmlRegTransPtr trans
) {
1142 fprintf(output
, " trans: ");
1143 if (trans
== NULL
) {
1144 fprintf(output
, "NULL\n");
1147 if (trans
->to
< 0) {
1148 fprintf(output
, "removed\n");
1151 if (trans
->nd
!= 0) {
1153 fprintf(output
, "last not determinist, ");
1155 fprintf(output
, "not determinist, ");
1157 if (trans
->counter
>= 0) {
1158 fprintf(output
, "counted %d, ", trans
->counter
);
1160 if (trans
->count
== REGEXP_ALL_COUNTER
) {
1161 fprintf(output
, "all transition, ");
1162 } else if (trans
->count
>= 0) {
1163 fprintf(output
, "count based %d, ", trans
->count
);
1165 if (trans
->atom
== NULL
) {
1166 fprintf(output
, "epsilon to %d\n", trans
->to
);
1169 if (trans
->atom
->type
== XML_REGEXP_CHARVAL
)
1170 fprintf(output
, "char %c ", trans
->atom
->codepoint
);
1171 fprintf(output
, "atom %d, to %d\n", trans
->atom
->no
, trans
->to
);
1175 xmlRegPrintState(FILE *output
, xmlRegStatePtr state
) {
1178 fprintf(output
, " state: ");
1179 if (state
== NULL
) {
1180 fprintf(output
, "NULL\n");
1183 if (state
->type
== XML_REGEXP_START_STATE
)
1184 fprintf(output
, "START ");
1185 if (state
->type
== XML_REGEXP_FINAL_STATE
)
1186 fprintf(output
, "FINAL ");
1188 fprintf(output
, "%d, %d transitions:\n", state
->no
, state
->nbTrans
);
1189 for (i
= 0;i
< state
->nbTrans
; i
++) {
1190 xmlRegPrintTrans(output
, &(state
->trans
[i
]));
1194 #ifdef DEBUG_REGEXP_GRAPH
1196 xmlRegPrintCtxt(FILE *output
, xmlRegParserCtxtPtr ctxt
) {
1199 fprintf(output
, " ctxt: ");
1201 fprintf(output
, "NULL\n");
1204 fprintf(output
, "'%s' ", ctxt
->string
);
1206 fprintf(output
, "error ");
1208 fprintf(output
, "neg ");
1209 fprintf(output
, "\n");
1210 fprintf(output
, "%d atoms:\n", ctxt
->nbAtoms
);
1211 for (i
= 0;i
< ctxt
->nbAtoms
; i
++) {
1212 fprintf(output
, " %02d ", i
);
1213 xmlRegPrintAtom(output
, ctxt
->atoms
[i
]);
1215 if (ctxt
->atom
!= NULL
) {
1216 fprintf(output
, "current atom:\n");
1217 xmlRegPrintAtom(output
, ctxt
->atom
);
1219 fprintf(output
, "%d states:", ctxt
->nbStates
);
1220 if (ctxt
->start
!= NULL
)
1221 fprintf(output
, " start: %d", ctxt
->start
->no
);
1222 if (ctxt
->end
!= NULL
)
1223 fprintf(output
, " end: %d", ctxt
->end
->no
);
1224 fprintf(output
, "\n");
1225 for (i
= 0;i
< ctxt
->nbStates
; i
++) {
1226 xmlRegPrintState(output
, ctxt
->states
[i
]);
1228 fprintf(output
, "%d counters:\n", ctxt
->nbCounters
);
1229 for (i
= 0;i
< ctxt
->nbCounters
; i
++) {
1230 fprintf(output
, " %d: min %d max %d\n", i
, ctxt
->counters
[i
].min
,
1231 ctxt
->counters
[i
].max
);
1236 /************************************************************************
1238 * Finite Automata structures manipulations *
1240 ************************************************************************/
1242 static xmlRegRangePtr
1243 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
,
1244 int neg
, xmlRegAtomType type
, int start
, int end
,
1245 xmlChar
*blockName
) {
1246 xmlRegRangePtr range
;
1249 ERROR("add range: atom is NULL");
1252 if (atom
->type
!= XML_REGEXP_RANGES
) {
1253 ERROR("add range: atom is not ranges");
1256 if (atom
->maxRanges
== 0) {
1257 atom
->maxRanges
= 4;
1258 atom
->ranges
= (xmlRegRangePtr
*) xmlMalloc(atom
->maxRanges
*
1259 sizeof(xmlRegRangePtr
));
1260 if (atom
->ranges
== NULL
) {
1261 xmlRegexpErrMemory(ctxt
, "adding ranges");
1262 atom
->maxRanges
= 0;
1265 } else if (atom
->nbRanges
>= atom
->maxRanges
) {
1266 xmlRegRangePtr
*tmp
;
1267 atom
->maxRanges
*= 2;
1268 tmp
= (xmlRegRangePtr
*) xmlRealloc(atom
->ranges
, atom
->maxRanges
*
1269 sizeof(xmlRegRangePtr
));
1271 xmlRegexpErrMemory(ctxt
, "adding ranges");
1272 atom
->maxRanges
/= 2;
1277 range
= xmlRegNewRange(ctxt
, neg
, type
, start
, end
);
1280 range
->blockName
= blockName
;
1281 atom
->ranges
[atom
->nbRanges
++] = range
;
1287 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt
) {
1288 if (ctxt
->maxCounters
== 0) {
1289 ctxt
->maxCounters
= 4;
1290 ctxt
->counters
= (xmlRegCounter
*) xmlMalloc(ctxt
->maxCounters
*
1291 sizeof(xmlRegCounter
));
1292 if (ctxt
->counters
== NULL
) {
1293 xmlRegexpErrMemory(ctxt
, "allocating counter");
1294 ctxt
->maxCounters
= 0;
1297 } else if (ctxt
->nbCounters
>= ctxt
->maxCounters
) {
1299 ctxt
->maxCounters
*= 2;
1300 tmp
= (xmlRegCounter
*) xmlRealloc(ctxt
->counters
, ctxt
->maxCounters
*
1301 sizeof(xmlRegCounter
));
1303 xmlRegexpErrMemory(ctxt
, "allocating counter");
1304 ctxt
->maxCounters
/= 2;
1307 ctxt
->counters
= tmp
;
1309 ctxt
->counters
[ctxt
->nbCounters
].min
= -1;
1310 ctxt
->counters
[ctxt
->nbCounters
].max
= -1;
1311 return(ctxt
->nbCounters
++);
1315 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt
, xmlRegAtomPtr atom
) {
1317 ERROR("atom push: atom is NULL");
1320 if (ctxt
->nbAtoms
>= ctxt
->maxAtoms
) {
1321 size_t newSize
= ctxt
->maxAtoms
? ctxt
->maxAtoms
* 2 : 4;
1324 tmp
= xmlRealloc(ctxt
->atoms
, newSize
* sizeof(xmlRegAtomPtr
));
1326 xmlRegexpErrMemory(ctxt
, "allocating counter");
1330 ctxt
->maxAtoms
= newSize
;
1332 atom
->no
= ctxt
->nbAtoms
;
1333 ctxt
->atoms
[ctxt
->nbAtoms
++] = atom
;
1338 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr target
,
1340 if (target
->maxTransTo
== 0) {
1341 target
->maxTransTo
= 8;
1342 target
->transTo
= (int *) xmlMalloc(target
->maxTransTo
*
1344 if (target
->transTo
== NULL
) {
1345 xmlRegexpErrMemory(ctxt
, "adding transition");
1346 target
->maxTransTo
= 0;
1349 } else if (target
->nbTransTo
>= target
->maxTransTo
) {
1351 target
->maxTransTo
*= 2;
1352 tmp
= (int *) xmlRealloc(target
->transTo
, target
->maxTransTo
*
1355 xmlRegexpErrMemory(ctxt
, "adding transition");
1356 target
->maxTransTo
/= 2;
1359 target
->transTo
= tmp
;
1361 target
->transTo
[target
->nbTransTo
] = from
;
1362 target
->nbTransTo
++;
1366 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
1367 xmlRegAtomPtr atom
, xmlRegStatePtr target
,
1368 int counter
, int count
) {
1372 if (state
== NULL
) {
1373 ERROR("add state: state is NULL");
1376 if (target
== NULL
) {
1377 ERROR("add state: target is NULL");
1381 * Other routines follow the philosophy 'When in doubt, add a transition'
1382 * so we check here whether such a transition is already present and, if
1383 * so, silently ignore this request.
1386 for (nrtrans
= state
->nbTrans
- 1; nrtrans
>= 0; nrtrans
--) {
1387 xmlRegTransPtr trans
= &(state
->trans
[nrtrans
]);
1388 if ((trans
->atom
== atom
) &&
1389 (trans
->to
== target
->no
) &&
1390 (trans
->counter
== counter
) &&
1391 (trans
->count
== count
)) {
1392 #ifdef DEBUG_REGEXP_GRAPH
1393 printf("Ignoring duplicate transition from %d to %d\n",
1394 state
->no
, target
->no
);
1400 if (state
->maxTrans
== 0) {
1401 state
->maxTrans
= 8;
1402 state
->trans
= (xmlRegTrans
*) xmlMalloc(state
->maxTrans
*
1403 sizeof(xmlRegTrans
));
1404 if (state
->trans
== NULL
) {
1405 xmlRegexpErrMemory(ctxt
, "adding transition");
1406 state
->maxTrans
= 0;
1409 } else if (state
->nbTrans
>= state
->maxTrans
) {
1411 state
->maxTrans
*= 2;
1412 tmp
= (xmlRegTrans
*) xmlRealloc(state
->trans
, state
->maxTrans
*
1413 sizeof(xmlRegTrans
));
1415 xmlRegexpErrMemory(ctxt
, "adding transition");
1416 state
->maxTrans
/= 2;
1421 #ifdef DEBUG_REGEXP_GRAPH
1422 printf("Add trans from %d to %d ", state
->no
, target
->no
);
1423 if (count
== REGEXP_ALL_COUNTER
)
1424 printf("all transition\n");
1425 else if (count
>= 0)
1426 printf("count based %d\n", count
);
1427 else if (counter
>= 0)
1428 printf("counted %d\n", counter
);
1429 else if (atom
== NULL
)
1430 printf("epsilon transition\n");
1431 else if (atom
!= NULL
)
1432 xmlRegPrintAtom(stdout
, atom
);
1435 state
->trans
[state
->nbTrans
].atom
= atom
;
1436 state
->trans
[state
->nbTrans
].to
= target
->no
;
1437 state
->trans
[state
->nbTrans
].counter
= counter
;
1438 state
->trans
[state
->nbTrans
].count
= count
;
1439 state
->trans
[state
->nbTrans
].nd
= 0;
1441 xmlRegStateAddTransTo(ctxt
, target
, state
->no
);
1444 static xmlRegStatePtr
1445 xmlRegStatePush(xmlRegParserCtxtPtr ctxt
) {
1446 xmlRegStatePtr state
;
1448 if (ctxt
->nbStates
>= ctxt
->maxStates
) {
1449 size_t newSize
= ctxt
->maxStates
? ctxt
->maxStates
* 2 : 4;
1450 xmlRegStatePtr
*tmp
;
1452 tmp
= xmlRealloc(ctxt
->states
, newSize
* sizeof(tmp
[0]));
1454 xmlRegexpErrMemory(ctxt
, "adding state");
1458 ctxt
->maxStates
= newSize
;
1461 state
= xmlRegNewState(ctxt
);
1465 state
->no
= ctxt
->nbStates
;
1466 ctxt
->states
[ctxt
->nbStates
++] = state
;
1472 * xmlFAGenerateAllTransition:
1473 * @ctxt: a regexp parser context
1474 * @from: the from state
1475 * @to: the target state or NULL for building a new one
1480 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt
,
1481 xmlRegStatePtr from
, xmlRegStatePtr to
,
1484 to
= xmlRegStatePush(ctxt
);
1490 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_LAX_COUNTER
);
1492 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, REGEXP_ALL_COUNTER
);
1497 * xmlFAGenerateEpsilonTransition:
1498 * @ctxt: a regexp parser context
1499 * @from: the from state
1500 * @to: the target state or NULL for building a new one
1504 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1505 xmlRegStatePtr from
, xmlRegStatePtr to
) {
1507 to
= xmlRegStatePush(ctxt
);
1512 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, -1);
1517 * xmlFAGenerateCountedEpsilonTransition:
1518 * @ctxt: a regexp parser context
1519 * @from: the from state
1520 * @to: the target state or NULL for building a new one
1521 * counter: the counter for that transition
1525 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt
,
1526 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1528 to
= xmlRegStatePush(ctxt
);
1533 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, counter
, -1);
1538 * xmlFAGenerateCountedTransition:
1539 * @ctxt: a regexp parser context
1540 * @from: the from state
1541 * @to: the target state or NULL for building a new one
1542 * counter: the counter for that transition
1546 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt
,
1547 xmlRegStatePtr from
, xmlRegStatePtr to
, int counter
) {
1549 to
= xmlRegStatePush(ctxt
);
1554 xmlRegStateAddTrans(ctxt
, from
, NULL
, to
, -1, counter
);
1559 * xmlFAGenerateTransitions:
1560 * @ctxt: a regexp parser context
1561 * @from: the from state
1562 * @to: the target state or NULL for building a new one
1563 * @atom: the atom generating the transition
1565 * Returns 0 if success and -1 in case of error.
1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr from
,
1569 xmlRegStatePtr to
, xmlRegAtomPtr atom
) {
1574 ERROR("generate transition: atom == NULL");
1577 if (atom
->type
== XML_REGEXP_SUBREG
) {
1579 * this is a subexpression handling one should not need to
1580 * create a new node except for XML_REGEXP_QUANT_RANGE.
1582 if ((to
!= NULL
) && (atom
->stop
!= to
) &&
1583 (atom
->quant
!= XML_REGEXP_QUANT_RANGE
)) {
1585 * Generate an epsilon transition to link to the target
1587 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, to
);
1589 } else if ((to
== NULL
) && (atom
->quant
!= XML_REGEXP_QUANT_RANGE
) &&
1590 (atom
->quant
!= XML_REGEXP_QUANT_ONCE
)) {
1591 to
= xmlRegStatePush(ctxt
, to
);
1595 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, to
);
1598 switch (atom
->quant
) {
1599 case XML_REGEXP_QUANT_OPT
:
1600 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1602 * transition done to the state after end of atom.
1603 * 1. set transition from atom start to new state
1604 * 2. set transition from atom end to this state.
1607 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, 0);
1608 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
,
1611 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, to
);
1614 case XML_REGEXP_QUANT_MULT
:
1615 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1616 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
, atom
->stop
);
1617 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1619 case XML_REGEXP_QUANT_PLUS
:
1620 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1621 xmlFAGenerateEpsilonTransition(ctxt
, atom
->stop
, atom
->start
);
1623 case XML_REGEXP_QUANT_RANGE
: {
1625 xmlRegStatePtr inter
, newstate
;
1628 * create the final state now if needed
1633 newstate
= xmlRegStatePush(ctxt
);
1634 if (newstate
== NULL
)
1639 * The principle here is to use counted transition
1640 * to avoid explosion in the number of states in the
1641 * graph. This is clearly more complex but should not
1642 * be exploitable at runtime.
1644 if ((atom
->min
== 0) && (atom
->start0
== NULL
)) {
1647 * duplicate a transition based on atom to count next
1648 * occurrences after 1. We cannot loop to atom->start
1649 * directly because we need an epsilon transition to
1652 /* ???? For some reason it seems we never reach that
1653 case, I suppose this got optimized out before when
1654 building the automata */
1655 copy
= xmlRegCopyAtom(ctxt
, atom
);
1658 copy
->quant
= XML_REGEXP_QUANT_ONCE
;
1662 if (xmlFAGenerateTransitions(ctxt
, atom
->start
, NULL
, copy
)
1664 xmlRegFreeAtom(copy
);
1667 inter
= ctxt
->state
;
1668 counter
= xmlRegGetCounter(ctxt
);
1671 ctxt
->counters
[counter
].min
= atom
->min
- 1;
1672 ctxt
->counters
[counter
].max
= atom
->max
- 1;
1673 /* count the number of times we see it again */
1674 xmlFAGenerateCountedEpsilonTransition(ctxt
, inter
,
1675 atom
->stop
, counter
);
1676 /* allow a way out based on the count */
1677 xmlFAGenerateCountedTransition(ctxt
, inter
,
1679 /* and also allow a direct exit for 0 */
1680 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start
,
1684 * either we need the atom at least once or there
1685 * is an atom->start0 allowing to easily plug the
1686 * epsilon transition.
1688 counter
= xmlRegGetCounter(ctxt
);
1691 ctxt
->counters
[counter
].min
= atom
->min
- 1;
1692 ctxt
->counters
[counter
].max
= atom
->max
- 1;
1693 /* allow a way out based on the count */
1694 xmlFAGenerateCountedTransition(ctxt
, atom
->stop
,
1696 /* count the number of times we see it again */
1697 xmlFAGenerateCountedEpsilonTransition(ctxt
, atom
->stop
,
1698 atom
->start
, counter
);
1699 /* and if needed allow a direct exit for 0 */
1701 xmlFAGenerateEpsilonTransition(ctxt
, atom
->start0
,
1707 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1708 ctxt
->state
= newstate
;
1713 if (xmlRegAtomPush(ctxt
, atom
) < 0)
1717 if ((atom
->min
== 0) && (atom
->max
== 0) &&
1718 (atom
->quant
== XML_REGEXP_QUANT_RANGE
)) {
1720 * we can discard the atom and generate an epsilon transition instead
1723 to
= xmlRegStatePush(ctxt
);
1727 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1729 xmlRegFreeAtom(atom
);
1733 to
= xmlRegStatePush(ctxt
);
1738 if ((atom
->quant
== XML_REGEXP_QUANT_MULT
) ||
1739 (atom
->quant
== XML_REGEXP_QUANT_PLUS
)) {
1741 * Do not pollute the target state by adding transitions from
1742 * it as it is likely to be the shared target of multiple branches.
1743 * So isolate with an epsilon transition.
1747 tmp
= xmlRegStatePush(ctxt
);
1750 xmlFAGenerateEpsilonTransition(ctxt
, tmp
, to
);
1753 if ((atom
->quant
== XML_REGEXP_QUANT_RANGE
) &&
1754 (atom
->min
== 0) && (atom
->max
> 0)) {
1758 atom
->quant
= XML_REGEXP_QUANT_OPT
;
1760 xmlRegStateAddTrans(ctxt
, from
, atom
, to
, -1, -1);
1762 switch (atom
->quant
) {
1763 case XML_REGEXP_QUANT_OPT
:
1764 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1765 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1767 case XML_REGEXP_QUANT_MULT
:
1768 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1769 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1770 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1772 case XML_REGEXP_QUANT_PLUS
:
1773 atom
->quant
= XML_REGEXP_QUANT_ONCE
;
1774 xmlRegStateAddTrans(ctxt
, to
, atom
, to
, -1, -1);
1776 case XML_REGEXP_QUANT_RANGE
:
1778 xmlFAGenerateEpsilonTransition(ctxt
, from
, to
);
1783 if (xmlRegAtomPush(ctxt
, atom
) < 0)
1789 * xmlFAReduceEpsilonTransitions:
1790 * @ctxt: a regexp parser context
1791 * @fromnr: the from state
1792 * @tonr: the to state
1793 * @counter: should that transition be associated to a counted
1797 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt
, int fromnr
,
1798 int tonr
, int counter
) {
1800 xmlRegStatePtr from
;
1803 #ifdef DEBUG_REGEXP_GRAPH
1804 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr
, tonr
);
1806 from
= ctxt
->states
[fromnr
];
1809 to
= ctxt
->states
[tonr
];
1812 if ((to
->mark
== XML_REGEXP_MARK_START
) ||
1813 (to
->mark
== XML_REGEXP_MARK_VISITED
))
1816 to
->mark
= XML_REGEXP_MARK_VISITED
;
1817 if (to
->type
== XML_REGEXP_FINAL_STATE
) {
1818 #ifdef DEBUG_REGEXP_GRAPH
1819 printf("State %d is final, so %d becomes final\n", tonr
, fromnr
);
1821 from
->type
= XML_REGEXP_FINAL_STATE
;
1823 for (transnr
= 0;transnr
< to
->nbTrans
;transnr
++) {
1824 if (to
->trans
[transnr
].to
< 0)
1826 if (to
->trans
[transnr
].atom
== NULL
) {
1828 * Don't remove counted transitions
1831 if (to
->trans
[transnr
].to
!= fromnr
) {
1832 if (to
->trans
[transnr
].count
>= 0) {
1833 int newto
= to
->trans
[transnr
].to
;
1835 xmlRegStateAddTrans(ctxt
, from
, NULL
,
1836 ctxt
->states
[newto
],
1837 -1, to
->trans
[transnr
].count
);
1839 #ifdef DEBUG_REGEXP_GRAPH
1840 printf("Found epsilon trans %d from %d to %d\n",
1841 transnr
, tonr
, to
->trans
[transnr
].to
);
1843 if (to
->trans
[transnr
].counter
>= 0) {
1844 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1845 to
->trans
[transnr
].to
,
1846 to
->trans
[transnr
].counter
);
1848 xmlFAReduceEpsilonTransitions(ctxt
, fromnr
,
1849 to
->trans
[transnr
].to
,
1855 int newto
= to
->trans
[transnr
].to
;
1857 if (to
->trans
[transnr
].counter
>= 0) {
1858 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1859 ctxt
->states
[newto
],
1860 to
->trans
[transnr
].counter
, -1);
1862 xmlRegStateAddTrans(ctxt
, from
, to
->trans
[transnr
].atom
,
1863 ctxt
->states
[newto
], counter
, -1);
1867 to
->mark
= XML_REGEXP_MARK_NORMAL
;
1871 * xmlFAEliminateSimpleEpsilonTransitions:
1872 * @ctxt: a regexp parser context
1874 * Eliminating general epsilon transitions can get costly in the general
1875 * algorithm due to the large amount of generated new transitions and
1876 * associated comparisons. However for simple epsilon transition used just
1877 * to separate building blocks when generating the automata this can be
1878 * reduced to state elimination:
1879 * - if there exists an epsilon from X to Y
1880 * - if there is no other transition from X
1881 * then X and Y are semantically equivalent and X can be eliminated
1882 * If X is the start state then make Y the start state, else replace the
1883 * target of all transitions to X by transitions to Y.
1885 * If X is a final state, skip it.
1886 * Otherwise it would be necessary to manipulate counters for this case when
1887 * eliminating state 2:
1888 * State 1 has a transition with an atom to state 2.
1889 * State 2 is final and has an epsilon transition to state 1.
1892 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt
) {
1893 int statenr
, i
, j
, newto
;
1894 xmlRegStatePtr state
, tmp
;
1896 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1897 state
= ctxt
->states
[statenr
];
1900 if (state
->nbTrans
!= 1)
1902 if (state
->type
== XML_REGEXP_UNREACH_STATE
||
1903 state
->type
== XML_REGEXP_FINAL_STATE
)
1905 /* is the only transition out a basic transition */
1906 if ((state
->trans
[0].atom
== NULL
) &&
1907 (state
->trans
[0].to
>= 0) &&
1908 (state
->trans
[0].to
!= statenr
) &&
1909 (state
->trans
[0].counter
< 0) &&
1910 (state
->trans
[0].count
< 0)) {
1911 newto
= state
->trans
[0].to
;
1913 if (state
->type
== XML_REGEXP_START_STATE
) {
1914 #ifdef DEBUG_REGEXP_GRAPH
1915 printf("Found simple epsilon trans from start %d to %d\n",
1919 #ifdef DEBUG_REGEXP_GRAPH
1920 printf("Found simple epsilon trans from %d to %d\n",
1923 for (i
= 0;i
< state
->nbTransTo
;i
++) {
1924 tmp
= ctxt
->states
[state
->transTo
[i
]];
1925 for (j
= 0;j
< tmp
->nbTrans
;j
++) {
1926 if (tmp
->trans
[j
].to
== statenr
) {
1927 #ifdef DEBUG_REGEXP_GRAPH
1928 printf("Changed transition %d on %d to go to %d\n",
1931 tmp
->trans
[j
].to
= -1;
1932 xmlRegStateAddTrans(ctxt
, tmp
, tmp
->trans
[j
].atom
,
1933 ctxt
->states
[newto
],
1934 tmp
->trans
[j
].counter
,
1935 tmp
->trans
[j
].count
);
1939 if (state
->type
== XML_REGEXP_FINAL_STATE
)
1940 ctxt
->states
[newto
]->type
= XML_REGEXP_FINAL_STATE
;
1941 /* eliminate the transition completely */
1944 state
->type
= XML_REGEXP_UNREACH_STATE
;
1952 * xmlFAEliminateEpsilonTransitions:
1953 * @ctxt: a regexp parser context
1957 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt
) {
1958 int statenr
, transnr
;
1959 xmlRegStatePtr state
;
1962 if (ctxt
->states
== NULL
) return;
1965 * Eliminate simple epsilon transition and the associated unreachable
1968 xmlFAEliminateSimpleEpsilonTransitions(ctxt
);
1969 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
1970 state
= ctxt
->states
[statenr
];
1971 if ((state
!= NULL
) && (state
->type
== XML_REGEXP_UNREACH_STATE
)) {
1972 #ifdef DEBUG_REGEXP_GRAPH
1973 printf("Removed unreachable state %d\n", statenr
);
1975 xmlRegFreeState(state
);
1976 ctxt
->states
[statenr
] = NULL
;
1983 * Build the completed transitions bypassing the epsilons
1984 * Use a marking algorithm to avoid loops
1985 * Mark sink states too.
1986 * Process from the latest states backward to the start when
1987 * there is long cascading epsilon chains this minimize the
1988 * recursions and transition compares when adding the new ones
1990 for (statenr
= ctxt
->nbStates
- 1;statenr
>= 0;statenr
--) {
1991 state
= ctxt
->states
[statenr
];
1994 if ((state
->nbTrans
== 0) &&
1995 (state
->type
!= XML_REGEXP_FINAL_STATE
)) {
1996 state
->type
= XML_REGEXP_SINK_STATE
;
1998 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
1999 if ((state
->trans
[transnr
].atom
== NULL
) &&
2000 (state
->trans
[transnr
].to
>= 0)) {
2001 if (state
->trans
[transnr
].to
== statenr
) {
2002 state
->trans
[transnr
].to
= -1;
2003 #ifdef DEBUG_REGEXP_GRAPH
2004 printf("Removed loopback epsilon trans %d on %d\n",
2007 } else if (state
->trans
[transnr
].count
< 0) {
2008 int newto
= state
->trans
[transnr
].to
;
2010 #ifdef DEBUG_REGEXP_GRAPH
2011 printf("Found epsilon trans %d from %d to %d\n",
2012 transnr
, statenr
, newto
);
2015 state
->trans
[transnr
].to
= -2;
2016 state
->mark
= XML_REGEXP_MARK_START
;
2017 xmlFAReduceEpsilonTransitions(ctxt
, statenr
,
2018 newto
, state
->trans
[transnr
].counter
);
2019 state
->mark
= XML_REGEXP_MARK_NORMAL
;
2020 #ifdef DEBUG_REGEXP_GRAPH
2022 printf("Found counted transition %d on %d\n",
2030 * Eliminate the epsilon transitions
2033 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2034 state
= ctxt
->states
[statenr
];
2037 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2038 xmlRegTransPtr trans
= &(state
->trans
[transnr
]);
2039 if ((trans
->atom
== NULL
) &&
2040 (trans
->count
< 0) &&
2049 * Use this pass to detect unreachable states too
2051 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2052 state
= ctxt
->states
[statenr
];
2054 state
->reached
= XML_REGEXP_MARK_NORMAL
;
2056 state
= ctxt
->states
[0];
2058 state
->reached
= XML_REGEXP_MARK_START
;
2059 while (state
!= NULL
) {
2060 xmlRegStatePtr target
= NULL
;
2061 state
->reached
= XML_REGEXP_MARK_VISITED
;
2063 * Mark all states reachable from the current reachable state
2065 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2066 if ((state
->trans
[transnr
].to
>= 0) &&
2067 ((state
->trans
[transnr
].atom
!= NULL
) ||
2068 (state
->trans
[transnr
].count
>= 0))) {
2069 int newto
= state
->trans
[transnr
].to
;
2071 if (ctxt
->states
[newto
] == NULL
)
2073 if (ctxt
->states
[newto
]->reached
== XML_REGEXP_MARK_NORMAL
) {
2074 ctxt
->states
[newto
]->reached
= XML_REGEXP_MARK_START
;
2075 target
= ctxt
->states
[newto
];
2081 * find the next accessible state not explored
2083 if (target
== NULL
) {
2084 for (statenr
= 1;statenr
< ctxt
->nbStates
;statenr
++) {
2085 state
= ctxt
->states
[statenr
];
2086 if ((state
!= NULL
) && (state
->reached
==
2087 XML_REGEXP_MARK_START
)) {
2095 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2096 state
= ctxt
->states
[statenr
];
2097 if ((state
!= NULL
) && (state
->reached
== XML_REGEXP_MARK_NORMAL
)) {
2098 #ifdef DEBUG_REGEXP_GRAPH
2099 printf("Removed unreachable state %d\n", statenr
);
2101 xmlRegFreeState(state
);
2102 ctxt
->states
[statenr
] = NULL
;
2109 xmlFACompareRanges(xmlRegRangePtr range1
, xmlRegRangePtr range2
) {
2112 if ((range1
->type
== XML_REGEXP_RANGES
) ||
2113 (range2
->type
== XML_REGEXP_RANGES
) ||
2114 (range2
->type
== XML_REGEXP_SUBREG
) ||
2115 (range1
->type
== XML_REGEXP_SUBREG
) ||
2116 (range1
->type
== XML_REGEXP_STRING
) ||
2117 (range2
->type
== XML_REGEXP_STRING
))
2120 /* put them in order */
2121 if (range1
->type
> range2
->type
) {
2128 if ((range1
->type
== XML_REGEXP_ANYCHAR
) ||
2129 (range2
->type
== XML_REGEXP_ANYCHAR
)) {
2131 } else if ((range1
->type
== XML_REGEXP_EPSILON
) ||
2132 (range2
->type
== XML_REGEXP_EPSILON
)) {
2134 } else if (range1
->type
== range2
->type
) {
2135 if (range1
->type
!= XML_REGEXP_CHARVAL
)
2137 else if ((range1
->end
< range2
->start
) ||
2138 (range2
->end
< range1
->start
))
2142 } else if (range1
->type
== XML_REGEXP_CHARVAL
) {
2147 * just check all codepoints in the range for acceptance,
2148 * this is usually way cheaper since done only once at
2149 * compilation than testing over and over at runtime or
2150 * pushing too many states when evaluating.
2152 if (((range1
->neg
== 0) && (range2
->neg
!= 0)) ||
2153 ((range1
->neg
!= 0) && (range2
->neg
== 0)))
2156 for (codepoint
= range1
->start
;codepoint
<= range1
->end
;codepoint
++) {
2157 ret
= xmlRegCheckCharacterRange(range2
->type
, codepoint
,
2158 0, range2
->start
, range2
->end
,
2162 if (((neg
== 1) && (ret
== 0)) ||
2163 ((neg
== 0) && (ret
== 1)))
2167 } else if ((range1
->type
== XML_REGEXP_BLOCK_NAME
) ||
2168 (range2
->type
== XML_REGEXP_BLOCK_NAME
)) {
2169 if (range1
->type
== range2
->type
) {
2170 ret
= xmlStrEqual(range1
->blockName
, range2
->blockName
);
2173 * comparing a block range with anything else is way
2174 * too costly, and maintaining the table is like too much
2175 * memory too, so let's force the automata to save state
2180 } else if ((range1
->type
< XML_REGEXP_LETTER
) ||
2181 (range2
->type
< XML_REGEXP_LETTER
)) {
2182 if ((range1
->type
== XML_REGEXP_ANYSPACE
) &&
2183 (range2
->type
== XML_REGEXP_NOTSPACE
))
2185 else if ((range1
->type
== XML_REGEXP_INITNAME
) &&
2186 (range2
->type
== XML_REGEXP_NOTINITNAME
))
2188 else if ((range1
->type
== XML_REGEXP_NAMECHAR
) &&
2189 (range2
->type
== XML_REGEXP_NOTNAMECHAR
))
2191 else if ((range1
->type
== XML_REGEXP_DECIMAL
) &&
2192 (range2
->type
== XML_REGEXP_NOTDECIMAL
))
2194 else if ((range1
->type
== XML_REGEXP_REALCHAR
) &&
2195 (range2
->type
== XML_REGEXP_NOTREALCHAR
))
2198 /* same thing to limit complexity */
2203 /* range1->type < range2->type here */
2204 switch (range1
->type
) {
2205 case XML_REGEXP_LETTER
:
2206 /* all disjoint except in the subgroups */
2207 if ((range2
->type
== XML_REGEXP_LETTER_UPPERCASE
) ||
2208 (range2
->type
== XML_REGEXP_LETTER_LOWERCASE
) ||
2209 (range2
->type
== XML_REGEXP_LETTER_TITLECASE
) ||
2210 (range2
->type
== XML_REGEXP_LETTER_MODIFIER
) ||
2211 (range2
->type
== XML_REGEXP_LETTER_OTHERS
))
2214 case XML_REGEXP_MARK
:
2215 if ((range2
->type
== XML_REGEXP_MARK_NONSPACING
) ||
2216 (range2
->type
== XML_REGEXP_MARK_SPACECOMBINING
) ||
2217 (range2
->type
== XML_REGEXP_MARK_ENCLOSING
))
2220 case XML_REGEXP_NUMBER
:
2221 if ((range2
->type
== XML_REGEXP_NUMBER_DECIMAL
) ||
2222 (range2
->type
== XML_REGEXP_NUMBER_LETTER
) ||
2223 (range2
->type
== XML_REGEXP_NUMBER_OTHERS
))
2226 case XML_REGEXP_PUNCT
:
2227 if ((range2
->type
== XML_REGEXP_PUNCT_CONNECTOR
) ||
2228 (range2
->type
== XML_REGEXP_PUNCT_DASH
) ||
2229 (range2
->type
== XML_REGEXP_PUNCT_OPEN
) ||
2230 (range2
->type
== XML_REGEXP_PUNCT_CLOSE
) ||
2231 (range2
->type
== XML_REGEXP_PUNCT_INITQUOTE
) ||
2232 (range2
->type
== XML_REGEXP_PUNCT_FINQUOTE
) ||
2233 (range2
->type
== XML_REGEXP_PUNCT_OTHERS
))
2236 case XML_REGEXP_SEPAR
:
2237 if ((range2
->type
== XML_REGEXP_SEPAR_SPACE
) ||
2238 (range2
->type
== XML_REGEXP_SEPAR_LINE
) ||
2239 (range2
->type
== XML_REGEXP_SEPAR_PARA
))
2242 case XML_REGEXP_SYMBOL
:
2243 if ((range2
->type
== XML_REGEXP_SYMBOL_MATH
) ||
2244 (range2
->type
== XML_REGEXP_SYMBOL_CURRENCY
) ||
2245 (range2
->type
== XML_REGEXP_SYMBOL_MODIFIER
) ||
2246 (range2
->type
== XML_REGEXP_SYMBOL_OTHERS
))
2249 case XML_REGEXP_OTHER
:
2250 if ((range2
->type
== XML_REGEXP_OTHER_CONTROL
) ||
2251 (range2
->type
== XML_REGEXP_OTHER_FORMAT
) ||
2252 (range2
->type
== XML_REGEXP_OTHER_PRIVATE
))
2256 if ((range2
->type
>= XML_REGEXP_LETTER
) &&
2257 (range2
->type
< XML_REGEXP_BLOCK_NAME
))
2265 if (((range1
->neg
== 0) && (range2
->neg
!= 0)) ||
2266 ((range1
->neg
!= 0) && (range2
->neg
== 0)))
2272 * xmlFACompareAtomTypes:
2273 * @type1: an atom type
2274 * @type2: an atom type
2276 * Compares two atoms type to check whether they intersect in some ways,
2277 * this is used by xmlFACompareAtoms only
2279 * Returns 1 if they may intersect and 0 otherwise
2282 xmlFACompareAtomTypes(xmlRegAtomType type1
, xmlRegAtomType type2
) {
2283 if ((type1
== XML_REGEXP_EPSILON
) ||
2284 (type1
== XML_REGEXP_CHARVAL
) ||
2285 (type1
== XML_REGEXP_RANGES
) ||
2286 (type1
== XML_REGEXP_SUBREG
) ||
2287 (type1
== XML_REGEXP_STRING
) ||
2288 (type1
== XML_REGEXP_ANYCHAR
))
2290 if ((type2
== XML_REGEXP_EPSILON
) ||
2291 (type2
== XML_REGEXP_CHARVAL
) ||
2292 (type2
== XML_REGEXP_RANGES
) ||
2293 (type2
== XML_REGEXP_SUBREG
) ||
2294 (type2
== XML_REGEXP_STRING
) ||
2295 (type2
== XML_REGEXP_ANYCHAR
))
2298 if (type1
== type2
) return(1);
2300 /* simplify subsequent compares by making sure type1 < type2 */
2301 if (type1
> type2
) {
2302 xmlRegAtomType tmp
= type1
;
2307 case XML_REGEXP_ANYSPACE
: /* \s */
2308 /* can't be a letter, number, mark, punctuation, symbol */
2309 if ((type2
== XML_REGEXP_NOTSPACE
) ||
2310 ((type2
>= XML_REGEXP_LETTER
) &&
2311 (type2
<= XML_REGEXP_LETTER_OTHERS
)) ||
2312 ((type2
>= XML_REGEXP_NUMBER
) &&
2313 (type2
<= XML_REGEXP_NUMBER_OTHERS
)) ||
2314 ((type2
>= XML_REGEXP_MARK
) &&
2315 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2316 ((type2
>= XML_REGEXP_PUNCT
) &&
2317 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2318 ((type2
>= XML_REGEXP_SYMBOL
) &&
2319 (type2
<= XML_REGEXP_SYMBOL_OTHERS
))
2322 case XML_REGEXP_NOTSPACE
: /* \S */
2324 case XML_REGEXP_INITNAME
: /* \l */
2325 /* can't be a number, mark, separator, punctuation, symbol or other */
2326 if ((type2
== XML_REGEXP_NOTINITNAME
) ||
2327 ((type2
>= XML_REGEXP_NUMBER
) &&
2328 (type2
<= XML_REGEXP_NUMBER_OTHERS
)) ||
2329 ((type2
>= XML_REGEXP_MARK
) &&
2330 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2331 ((type2
>= XML_REGEXP_SEPAR
) &&
2332 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2333 ((type2
>= XML_REGEXP_PUNCT
) &&
2334 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2335 ((type2
>= XML_REGEXP_SYMBOL
) &&
2336 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2337 ((type2
>= XML_REGEXP_OTHER
) &&
2338 (type2
<= XML_REGEXP_OTHER_NA
))
2341 case XML_REGEXP_NOTINITNAME
: /* \L */
2343 case XML_REGEXP_NAMECHAR
: /* \c */
2344 /* can't be a mark, separator, punctuation, symbol or other */
2345 if ((type2
== XML_REGEXP_NOTNAMECHAR
) ||
2346 ((type2
>= XML_REGEXP_MARK
) &&
2347 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2348 ((type2
>= XML_REGEXP_PUNCT
) &&
2349 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2350 ((type2
>= XML_REGEXP_SEPAR
) &&
2351 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2352 ((type2
>= XML_REGEXP_SYMBOL
) &&
2353 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2354 ((type2
>= XML_REGEXP_OTHER
) &&
2355 (type2
<= XML_REGEXP_OTHER_NA
))
2358 case XML_REGEXP_NOTNAMECHAR
: /* \C */
2360 case XML_REGEXP_DECIMAL
: /* \d */
2361 /* can't be a letter, mark, separator, punctuation, symbol or other */
2362 if ((type2
== XML_REGEXP_NOTDECIMAL
) ||
2363 (type2
== XML_REGEXP_REALCHAR
) ||
2364 ((type2
>= XML_REGEXP_LETTER
) &&
2365 (type2
<= XML_REGEXP_LETTER_OTHERS
)) ||
2366 ((type2
>= XML_REGEXP_MARK
) &&
2367 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2368 ((type2
>= XML_REGEXP_PUNCT
) &&
2369 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2370 ((type2
>= XML_REGEXP_SEPAR
) &&
2371 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2372 ((type2
>= XML_REGEXP_SYMBOL
) &&
2373 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2374 ((type2
>= XML_REGEXP_OTHER
) &&
2375 (type2
<= XML_REGEXP_OTHER_NA
))
2378 case XML_REGEXP_NOTDECIMAL
: /* \D */
2380 case XML_REGEXP_REALCHAR
: /* \w */
2381 /* can't be a mark, separator, punctuation, symbol or other */
2382 if ((type2
== XML_REGEXP_NOTDECIMAL
) ||
2383 ((type2
>= XML_REGEXP_MARK
) &&
2384 (type2
<= XML_REGEXP_MARK_ENCLOSING
)) ||
2385 ((type2
>= XML_REGEXP_PUNCT
) &&
2386 (type2
<= XML_REGEXP_PUNCT_OTHERS
)) ||
2387 ((type2
>= XML_REGEXP_SEPAR
) &&
2388 (type2
<= XML_REGEXP_SEPAR_PARA
)) ||
2389 ((type2
>= XML_REGEXP_SYMBOL
) &&
2390 (type2
<= XML_REGEXP_SYMBOL_OTHERS
)) ||
2391 ((type2
>= XML_REGEXP_OTHER
) &&
2392 (type2
<= XML_REGEXP_OTHER_NA
))
2395 case XML_REGEXP_NOTREALCHAR
: /* \W */
2398 * at that point we know both type 1 and type2 are from
2399 * character categories are ordered and are different,
2400 * it becomes simple because this is a partition
2402 case XML_REGEXP_LETTER
:
2403 if (type2
<= XML_REGEXP_LETTER_OTHERS
)
2406 case XML_REGEXP_LETTER_UPPERCASE
:
2407 case XML_REGEXP_LETTER_LOWERCASE
:
2408 case XML_REGEXP_LETTER_TITLECASE
:
2409 case XML_REGEXP_LETTER_MODIFIER
:
2410 case XML_REGEXP_LETTER_OTHERS
:
2412 case XML_REGEXP_MARK
:
2413 if (type2
<= XML_REGEXP_MARK_ENCLOSING
)
2416 case XML_REGEXP_MARK_NONSPACING
:
2417 case XML_REGEXP_MARK_SPACECOMBINING
:
2418 case XML_REGEXP_MARK_ENCLOSING
:
2420 case XML_REGEXP_NUMBER
:
2421 if (type2
<= XML_REGEXP_NUMBER_OTHERS
)
2424 case XML_REGEXP_NUMBER_DECIMAL
:
2425 case XML_REGEXP_NUMBER_LETTER
:
2426 case XML_REGEXP_NUMBER_OTHERS
:
2428 case XML_REGEXP_PUNCT
:
2429 if (type2
<= XML_REGEXP_PUNCT_OTHERS
)
2432 case XML_REGEXP_PUNCT_CONNECTOR
:
2433 case XML_REGEXP_PUNCT_DASH
:
2434 case XML_REGEXP_PUNCT_OPEN
:
2435 case XML_REGEXP_PUNCT_CLOSE
:
2436 case XML_REGEXP_PUNCT_INITQUOTE
:
2437 case XML_REGEXP_PUNCT_FINQUOTE
:
2438 case XML_REGEXP_PUNCT_OTHERS
:
2440 case XML_REGEXP_SEPAR
:
2441 if (type2
<= XML_REGEXP_SEPAR_PARA
)
2444 case XML_REGEXP_SEPAR_SPACE
:
2445 case XML_REGEXP_SEPAR_LINE
:
2446 case XML_REGEXP_SEPAR_PARA
:
2448 case XML_REGEXP_SYMBOL
:
2449 if (type2
<= XML_REGEXP_SYMBOL_OTHERS
)
2452 case XML_REGEXP_SYMBOL_MATH
:
2453 case XML_REGEXP_SYMBOL_CURRENCY
:
2454 case XML_REGEXP_SYMBOL_MODIFIER
:
2455 case XML_REGEXP_SYMBOL_OTHERS
:
2457 case XML_REGEXP_OTHER
:
2458 if (type2
<= XML_REGEXP_OTHER_NA
)
2461 case XML_REGEXP_OTHER_CONTROL
:
2462 case XML_REGEXP_OTHER_FORMAT
:
2463 case XML_REGEXP_OTHER_PRIVATE
:
2464 case XML_REGEXP_OTHER_NA
:
2476 * @deep: if not set only compare string pointers
2478 * Compares two atoms to check whether they are the same exactly
2479 * this is used to remove equivalent transitions
2481 * Returns 1 if same and 0 otherwise
2484 xmlFAEqualAtoms(xmlRegAtomPtr atom1
, xmlRegAtomPtr atom2
, int deep
) {
2489 if ((atom1
== NULL
) || (atom2
== NULL
))
2492 if (atom1
->type
!= atom2
->type
)
2494 switch (atom1
->type
) {
2495 case XML_REGEXP_EPSILON
:
2498 case XML_REGEXP_STRING
:
2500 ret
= (atom1
->valuep
== atom2
->valuep
);
2502 ret
= xmlStrEqual((xmlChar
*)atom1
->valuep
,
2503 (xmlChar
*)atom2
->valuep
);
2505 case XML_REGEXP_CHARVAL
:
2506 ret
= (atom1
->codepoint
== atom2
->codepoint
);
2508 case XML_REGEXP_RANGES
:
2509 /* too hard to do in the general case */
2518 * xmlFACompareAtoms:
2521 * @deep: if not set only compare string pointers
2523 * Compares two atoms to check whether they intersect in some ways,
2524 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2526 * Returns 1 if yes and 0 otherwise
2529 xmlFACompareAtoms(xmlRegAtomPtr atom1
, xmlRegAtomPtr atom2
, int deep
) {
2534 if ((atom1
== NULL
) || (atom2
== NULL
))
2537 if ((atom1
->type
== XML_REGEXP_ANYCHAR
) ||
2538 (atom2
->type
== XML_REGEXP_ANYCHAR
))
2541 if (atom1
->type
> atom2
->type
) {
2547 if (atom1
->type
!= atom2
->type
) {
2548 ret
= xmlFACompareAtomTypes(atom1
->type
, atom2
->type
);
2549 /* if they can't intersect at the type level break now */
2553 switch (atom1
->type
) {
2554 case XML_REGEXP_STRING
:
2556 ret
= (atom1
->valuep
!= atom2
->valuep
);
2558 xmlChar
*val1
= (xmlChar
*)atom1
->valuep
;
2559 xmlChar
*val2
= (xmlChar
*)atom2
->valuep
;
2560 int compound1
= (xmlStrchr(val1
, '|') != NULL
);
2561 int compound2
= (xmlStrchr(val2
, '|') != NULL
);
2563 /* Ignore negative match flag for ##other namespaces */
2564 if (compound1
!= compound2
)
2567 ret
= xmlRegStrEqualWildcard(val1
, val2
);
2570 case XML_REGEXP_EPSILON
:
2571 goto not_determinist
;
2572 case XML_REGEXP_CHARVAL
:
2573 if (atom2
->type
== XML_REGEXP_CHARVAL
) {
2574 ret
= (atom1
->codepoint
== atom2
->codepoint
);
2576 ret
= xmlRegCheckCharacter(atom2
, atom1
->codepoint
);
2581 case XML_REGEXP_RANGES
:
2582 if (atom2
->type
== XML_REGEXP_RANGES
) {
2584 xmlRegRangePtr r1
, r2
;
2587 * need to check that none of the ranges eventually matches
2589 for (i
= 0;i
< atom1
->nbRanges
;i
++) {
2590 for (j
= 0;j
< atom2
->nbRanges
;j
++) {
2591 r1
= atom1
->ranges
[i
];
2592 r2
= atom2
->ranges
[j
];
2593 res
= xmlFACompareRanges(r1
, r2
);
2604 goto not_determinist
;
2607 if (atom1
->neg
!= atom2
->neg
) {
2617 * xmlFARecurseDeterminism:
2618 * @ctxt: a regexp parser context
2620 * Check whether the associated regexp is determinist,
2621 * should be called after xmlFAEliminateEpsilonTransitions()
2625 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
,
2626 int to
, xmlRegAtomPtr atom
) {
2629 int transnr
, nbTrans
;
2635 if (state
->markd
== XML_REGEXP_MARK_VISITED
)
2638 if (ctxt
->flags
& AM_AUTOMATA_RNG
)
2642 * don't recurse on transitions potentially added in the course of
2645 nbTrans
= state
->nbTrans
;
2646 for (transnr
= 0;transnr
< nbTrans
;transnr
++) {
2647 t1
= &(state
->trans
[transnr
]);
2649 * check transitions conflicting with the one looked at
2651 if (t1
->atom
== NULL
) {
2654 state
->markd
= XML_REGEXP_MARK_VISITED
;
2655 res
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
2665 if (xmlFACompareAtoms(t1
->atom
, atom
, deep
)) {
2667 /* mark the transition as non-deterministic */
2675 * xmlFAFinishRecurseDeterminism:
2676 * @ctxt: a regexp parser context
2678 * Reset flags after checking determinism.
2681 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr state
) {
2682 int transnr
, nbTrans
;
2686 if (state
->markd
!= XML_REGEXP_MARK_VISITED
)
2690 nbTrans
= state
->nbTrans
;
2691 for (transnr
= 0; transnr
< nbTrans
; transnr
++) {
2692 xmlRegTransPtr t1
= &state
->trans
[transnr
];
2693 if ((t1
->atom
== NULL
) && (t1
->to
>= 0))
2694 xmlFAFinishRecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
]);
2699 * xmlFAComputesDeterminism:
2700 * @ctxt: a regexp parser context
2702 * Check whether the associated regexp is determinist,
2703 * should be called after xmlFAEliminateEpsilonTransitions()
2707 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt
) {
2708 int statenr
, transnr
;
2709 xmlRegStatePtr state
;
2710 xmlRegTransPtr t1
, t2
, last
;
2715 #ifdef DEBUG_REGEXP_GRAPH
2716 printf("xmlFAComputesDeterminism\n");
2717 xmlRegPrintCtxt(stdout
, ctxt
);
2719 if (ctxt
->determinist
!= -1)
2720 return(ctxt
->determinist
);
2722 if (ctxt
->flags
& AM_AUTOMATA_RNG
)
2726 * First cleanup the automata removing cancelled transitions
2728 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2729 state
= ctxt
->states
[statenr
];
2732 if (state
->nbTrans
< 2)
2734 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2735 t1
= &(state
->trans
[transnr
]);
2737 * Determinism checks in case of counted or all transitions
2738 * will have to be handled separately
2740 if (t1
->atom
== NULL
) {
2744 if (t1
->to
== -1) /* eliminated */
2746 for (i
= 0;i
< transnr
;i
++) {
2747 t2
= &(state
->trans
[i
]);
2748 if (t2
->to
== -1) /* eliminated */
2750 if (t2
->atom
!= NULL
) {
2751 if (t1
->to
== t2
->to
) {
2753 * Here we use deep because we want to keep the
2754 * transitions which indicate a conflict
2756 if (xmlFAEqualAtoms(t1
->atom
, t2
->atom
, deep
) &&
2757 (t1
->counter
== t2
->counter
) &&
2758 (t1
->count
== t2
->count
))
2759 t2
->to
= -1; /* eliminated */
2767 * Check for all states that there aren't 2 transitions
2768 * with the same atom and a different target.
2770 for (statenr
= 0;statenr
< ctxt
->nbStates
;statenr
++) {
2771 state
= ctxt
->states
[statenr
];
2774 if (state
->nbTrans
< 2)
2777 for (transnr
= 0;transnr
< state
->nbTrans
;transnr
++) {
2778 t1
= &(state
->trans
[transnr
]);
2780 * Determinism checks in case of counted or all transitions
2781 * will have to be handled separately
2783 if (t1
->atom
== NULL
) {
2786 if (t1
->to
== -1) /* eliminated */
2788 for (i
= 0;i
< transnr
;i
++) {
2789 t2
= &(state
->trans
[i
]);
2790 if (t2
->to
== -1) /* eliminated */
2792 if (t2
->atom
!= NULL
) {
2794 * But here we don't use deep because we want to
2795 * find transitions which indicate a conflict
2797 if (xmlFACompareAtoms(t1
->atom
, t2
->atom
, 1)) {
2799 /* mark the transitions as non-deterministic ones */
2804 } else if (t1
->to
!= -1) {
2806 * do the closure in case of remaining specific
2807 * epsilon transitions like choices or all
2809 ret
= xmlFARecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
],
2811 xmlFAFinishRecurseDeterminism(ctxt
, ctxt
->states
[t1
->to
]);
2812 /* don't shortcut the computation so all non deterministic
2813 transition get marked down
2824 /* don't shortcut the computation so all non deterministic
2825 transition get marked down
2831 * mark specifically the last non-deterministic transition
2832 * from a state since there is no need to set-up rollback
2839 /* don't shortcut the computation so all non deterministic
2840 transition get marked down
2845 ctxt
->determinist
= ret
;
2849 /************************************************************************
2851 * Routines to check input against transition atoms *
2853 ************************************************************************/
2856 xmlRegCheckCharacterRange(xmlRegAtomType type
, int codepoint
, int neg
,
2857 int start
, int end
, const xmlChar
*blockName
) {
2861 case XML_REGEXP_STRING
:
2862 case XML_REGEXP_SUBREG
:
2863 case XML_REGEXP_RANGES
:
2864 case XML_REGEXP_EPSILON
:
2866 case XML_REGEXP_ANYCHAR
:
2867 ret
= ((codepoint
!= '\n') && (codepoint
!= '\r'));
2869 case XML_REGEXP_CHARVAL
:
2870 ret
= ((codepoint
>= start
) && (codepoint
<= end
));
2872 case XML_REGEXP_NOTSPACE
:
2874 /* Falls through. */
2875 case XML_REGEXP_ANYSPACE
:
2876 ret
= ((codepoint
== '\n') || (codepoint
== '\r') ||
2877 (codepoint
== '\t') || (codepoint
== ' '));
2879 case XML_REGEXP_NOTINITNAME
:
2881 /* Falls through. */
2882 case XML_REGEXP_INITNAME
:
2883 ret
= (IS_LETTER(codepoint
) ||
2884 (codepoint
== '_') || (codepoint
== ':'));
2886 case XML_REGEXP_NOTNAMECHAR
:
2888 /* Falls through. */
2889 case XML_REGEXP_NAMECHAR
:
2890 ret
= (IS_LETTER(codepoint
) || IS_DIGIT(codepoint
) ||
2891 (codepoint
== '.') || (codepoint
== '-') ||
2892 (codepoint
== '_') || (codepoint
== ':') ||
2893 IS_COMBINING(codepoint
) || IS_EXTENDER(codepoint
));
2895 case XML_REGEXP_NOTDECIMAL
:
2897 /* Falls through. */
2898 case XML_REGEXP_DECIMAL
:
2899 ret
= xmlUCSIsCatNd(codepoint
);
2901 case XML_REGEXP_REALCHAR
:
2903 /* Falls through. */
2904 case XML_REGEXP_NOTREALCHAR
:
2905 ret
= xmlUCSIsCatP(codepoint
);
2907 ret
= xmlUCSIsCatZ(codepoint
);
2909 ret
= xmlUCSIsCatC(codepoint
);
2911 case XML_REGEXP_LETTER
:
2912 ret
= xmlUCSIsCatL(codepoint
);
2914 case XML_REGEXP_LETTER_UPPERCASE
:
2915 ret
= xmlUCSIsCatLu(codepoint
);
2917 case XML_REGEXP_LETTER_LOWERCASE
:
2918 ret
= xmlUCSIsCatLl(codepoint
);
2920 case XML_REGEXP_LETTER_TITLECASE
:
2921 ret
= xmlUCSIsCatLt(codepoint
);
2923 case XML_REGEXP_LETTER_MODIFIER
:
2924 ret
= xmlUCSIsCatLm(codepoint
);
2926 case XML_REGEXP_LETTER_OTHERS
:
2927 ret
= xmlUCSIsCatLo(codepoint
);
2929 case XML_REGEXP_MARK
:
2930 ret
= xmlUCSIsCatM(codepoint
);
2932 case XML_REGEXP_MARK_NONSPACING
:
2933 ret
= xmlUCSIsCatMn(codepoint
);
2935 case XML_REGEXP_MARK_SPACECOMBINING
:
2936 ret
= xmlUCSIsCatMc(codepoint
);
2938 case XML_REGEXP_MARK_ENCLOSING
:
2939 ret
= xmlUCSIsCatMe(codepoint
);
2941 case XML_REGEXP_NUMBER
:
2942 ret
= xmlUCSIsCatN(codepoint
);
2944 case XML_REGEXP_NUMBER_DECIMAL
:
2945 ret
= xmlUCSIsCatNd(codepoint
);
2947 case XML_REGEXP_NUMBER_LETTER
:
2948 ret
= xmlUCSIsCatNl(codepoint
);
2950 case XML_REGEXP_NUMBER_OTHERS
:
2951 ret
= xmlUCSIsCatNo(codepoint
);
2953 case XML_REGEXP_PUNCT
:
2954 ret
= xmlUCSIsCatP(codepoint
);
2956 case XML_REGEXP_PUNCT_CONNECTOR
:
2957 ret
= xmlUCSIsCatPc(codepoint
);
2959 case XML_REGEXP_PUNCT_DASH
:
2960 ret
= xmlUCSIsCatPd(codepoint
);
2962 case XML_REGEXP_PUNCT_OPEN
:
2963 ret
= xmlUCSIsCatPs(codepoint
);
2965 case XML_REGEXP_PUNCT_CLOSE
:
2966 ret
= xmlUCSIsCatPe(codepoint
);
2968 case XML_REGEXP_PUNCT_INITQUOTE
:
2969 ret
= xmlUCSIsCatPi(codepoint
);
2971 case XML_REGEXP_PUNCT_FINQUOTE
:
2972 ret
= xmlUCSIsCatPf(codepoint
);
2974 case XML_REGEXP_PUNCT_OTHERS
:
2975 ret
= xmlUCSIsCatPo(codepoint
);
2977 case XML_REGEXP_SEPAR
:
2978 ret
= xmlUCSIsCatZ(codepoint
);
2980 case XML_REGEXP_SEPAR_SPACE
:
2981 ret
= xmlUCSIsCatZs(codepoint
);
2983 case XML_REGEXP_SEPAR_LINE
:
2984 ret
= xmlUCSIsCatZl(codepoint
);
2986 case XML_REGEXP_SEPAR_PARA
:
2987 ret
= xmlUCSIsCatZp(codepoint
);
2989 case XML_REGEXP_SYMBOL
:
2990 ret
= xmlUCSIsCatS(codepoint
);
2992 case XML_REGEXP_SYMBOL_MATH
:
2993 ret
= xmlUCSIsCatSm(codepoint
);
2995 case XML_REGEXP_SYMBOL_CURRENCY
:
2996 ret
= xmlUCSIsCatSc(codepoint
);
2998 case XML_REGEXP_SYMBOL_MODIFIER
:
2999 ret
= xmlUCSIsCatSk(codepoint
);
3001 case XML_REGEXP_SYMBOL_OTHERS
:
3002 ret
= xmlUCSIsCatSo(codepoint
);
3004 case XML_REGEXP_OTHER
:
3005 ret
= xmlUCSIsCatC(codepoint
);
3007 case XML_REGEXP_OTHER_CONTROL
:
3008 ret
= xmlUCSIsCatCc(codepoint
);
3010 case XML_REGEXP_OTHER_FORMAT
:
3011 ret
= xmlUCSIsCatCf(codepoint
);
3013 case XML_REGEXP_OTHER_PRIVATE
:
3014 ret
= xmlUCSIsCatCo(codepoint
);
3016 case XML_REGEXP_OTHER_NA
:
3017 /* ret = xmlUCSIsCatCn(codepoint); */
3018 /* Seems it doesn't exist anymore in recent Unicode releases */
3021 case XML_REGEXP_BLOCK_NAME
:
3022 ret
= xmlUCSIsBlock(codepoint
, (const char *) blockName
);
3031 xmlRegCheckCharacter(xmlRegAtomPtr atom
, int codepoint
) {
3033 xmlRegRangePtr range
;
3035 if ((atom
== NULL
) || (!IS_CHAR(codepoint
)))
3038 switch (atom
->type
) {
3039 case XML_REGEXP_SUBREG
:
3040 case XML_REGEXP_EPSILON
:
3042 case XML_REGEXP_CHARVAL
:
3043 return(codepoint
== atom
->codepoint
);
3044 case XML_REGEXP_RANGES
: {
3047 for (i
= 0;i
< atom
->nbRanges
;i
++) {
3048 range
= atom
->ranges
[i
];
3049 if (range
->neg
== 2) {
3050 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3051 0, range
->start
, range
->end
,
3054 return(0); /* excluded char */
3055 } else if (range
->neg
) {
3056 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3057 0, range
->start
, range
->end
,
3064 ret
= xmlRegCheckCharacterRange(range
->type
, codepoint
,
3065 0, range
->start
, range
->end
,
3068 accept
= 1; /* might still be excluded */
3073 case XML_REGEXP_STRING
:
3074 printf("TODO: XML_REGEXP_STRING\n");
3076 case XML_REGEXP_ANYCHAR
:
3077 case XML_REGEXP_ANYSPACE
:
3078 case XML_REGEXP_NOTSPACE
:
3079 case XML_REGEXP_INITNAME
:
3080 case XML_REGEXP_NOTINITNAME
:
3081 case XML_REGEXP_NAMECHAR
:
3082 case XML_REGEXP_NOTNAMECHAR
:
3083 case XML_REGEXP_DECIMAL
:
3084 case XML_REGEXP_NOTDECIMAL
:
3085 case XML_REGEXP_REALCHAR
:
3086 case XML_REGEXP_NOTREALCHAR
:
3087 case XML_REGEXP_LETTER
:
3088 case XML_REGEXP_LETTER_UPPERCASE
:
3089 case XML_REGEXP_LETTER_LOWERCASE
:
3090 case XML_REGEXP_LETTER_TITLECASE
:
3091 case XML_REGEXP_LETTER_MODIFIER
:
3092 case XML_REGEXP_LETTER_OTHERS
:
3093 case XML_REGEXP_MARK
:
3094 case XML_REGEXP_MARK_NONSPACING
:
3095 case XML_REGEXP_MARK_SPACECOMBINING
:
3096 case XML_REGEXP_MARK_ENCLOSING
:
3097 case XML_REGEXP_NUMBER
:
3098 case XML_REGEXP_NUMBER_DECIMAL
:
3099 case XML_REGEXP_NUMBER_LETTER
:
3100 case XML_REGEXP_NUMBER_OTHERS
:
3101 case XML_REGEXP_PUNCT
:
3102 case XML_REGEXP_PUNCT_CONNECTOR
:
3103 case XML_REGEXP_PUNCT_DASH
:
3104 case XML_REGEXP_PUNCT_OPEN
:
3105 case XML_REGEXP_PUNCT_CLOSE
:
3106 case XML_REGEXP_PUNCT_INITQUOTE
:
3107 case XML_REGEXP_PUNCT_FINQUOTE
:
3108 case XML_REGEXP_PUNCT_OTHERS
:
3109 case XML_REGEXP_SEPAR
:
3110 case XML_REGEXP_SEPAR_SPACE
:
3111 case XML_REGEXP_SEPAR_LINE
:
3112 case XML_REGEXP_SEPAR_PARA
:
3113 case XML_REGEXP_SYMBOL
:
3114 case XML_REGEXP_SYMBOL_MATH
:
3115 case XML_REGEXP_SYMBOL_CURRENCY
:
3116 case XML_REGEXP_SYMBOL_MODIFIER
:
3117 case XML_REGEXP_SYMBOL_OTHERS
:
3118 case XML_REGEXP_OTHER
:
3119 case XML_REGEXP_OTHER_CONTROL
:
3120 case XML_REGEXP_OTHER_FORMAT
:
3121 case XML_REGEXP_OTHER_PRIVATE
:
3122 case XML_REGEXP_OTHER_NA
:
3123 case XML_REGEXP_BLOCK_NAME
:
3124 ret
= xmlRegCheckCharacterRange(atom
->type
, codepoint
, 0, 0, 0,
3125 (const xmlChar
*)atom
->valuep
);
3133 /************************************************************************
3135 * Saving and restoring state of an execution context *
3137 ************************************************************************/
3139 #ifdef DEBUG_REGEXP_EXEC
3141 xmlFARegDebugExec(xmlRegExecCtxtPtr exec
) {
3142 printf("state: %d:%d:idx %d", exec
->state
->no
, exec
->transno
, exec
->index
);
3143 if (exec
->inputStack
!= NULL
) {
3146 for (i
= 0;(i
< 3) && (i
< exec
->inputStackNr
);i
++)
3147 printf("%s ", (const char *)
3148 exec
->inputStack
[exec
->inputStackNr
- (i
+ 1)].value
);
3150 printf(": %s", &(exec
->inputString
[exec
->index
]));
3157 xmlFARegExecSave(xmlRegExecCtxtPtr exec
) {
3158 #ifdef DEBUG_REGEXP_EXEC
3161 xmlFARegDebugExec(exec
);
3165 if (exec
->nbPush
> MAX_PUSH
) {
3171 if (exec
->maxRollbacks
== 0) {
3172 exec
->maxRollbacks
= 4;
3173 exec
->rollbacks
= (xmlRegExecRollback
*) xmlMalloc(exec
->maxRollbacks
*
3174 sizeof(xmlRegExecRollback
));
3175 if (exec
->rollbacks
== NULL
) {
3176 xmlRegexpErrMemory(NULL
, "saving regexp");
3177 exec
->maxRollbacks
= 0;
3180 memset(exec
->rollbacks
, 0,
3181 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
3182 } else if (exec
->nbRollbacks
>= exec
->maxRollbacks
) {
3183 xmlRegExecRollback
*tmp
;
3184 int len
= exec
->maxRollbacks
;
3186 exec
->maxRollbacks
*= 2;
3187 tmp
= (xmlRegExecRollback
*) xmlRealloc(exec
->rollbacks
,
3188 exec
->maxRollbacks
* sizeof(xmlRegExecRollback
));
3190 xmlRegexpErrMemory(NULL
, "saving regexp");
3191 exec
->maxRollbacks
/= 2;
3194 exec
->rollbacks
= tmp
;
3195 tmp
= &exec
->rollbacks
[len
];
3196 memset(tmp
, 0, (exec
->maxRollbacks
- len
) * sizeof(xmlRegExecRollback
));
3198 exec
->rollbacks
[exec
->nbRollbacks
].state
= exec
->state
;
3199 exec
->rollbacks
[exec
->nbRollbacks
].index
= exec
->index
;
3200 exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
= exec
->transno
+ 1;
3201 if (exec
->comp
->nbCounters
> 0) {
3202 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3203 exec
->rollbacks
[exec
->nbRollbacks
].counts
= (int *)
3204 xmlMalloc(exec
->comp
->nbCounters
* sizeof(int));
3205 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3206 xmlRegexpErrMemory(NULL
, "saving regexp");
3211 memcpy(exec
->rollbacks
[exec
->nbRollbacks
].counts
, exec
->counts
,
3212 exec
->comp
->nbCounters
* sizeof(int));
3214 exec
->nbRollbacks
++;
3218 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec
) {
3219 if (exec
->nbRollbacks
<= 0) {
3221 #ifdef DEBUG_REGEXP_EXEC
3222 printf("rollback failed on empty stack\n");
3226 exec
->nbRollbacks
--;
3227 exec
->state
= exec
->rollbacks
[exec
->nbRollbacks
].state
;
3228 exec
->index
= exec
->rollbacks
[exec
->nbRollbacks
].index
;
3229 exec
->transno
= exec
->rollbacks
[exec
->nbRollbacks
].nextbranch
;
3230 if (exec
->comp
->nbCounters
> 0) {
3231 if (exec
->rollbacks
[exec
->nbRollbacks
].counts
== NULL
) {
3232 fprintf(stderr
, "exec save: allocation failed");
3237 memcpy(exec
->counts
, exec
->rollbacks
[exec
->nbRollbacks
].counts
,
3238 exec
->comp
->nbCounters
* sizeof(int));
3242 #ifdef DEBUG_REGEXP_EXEC
3243 printf("restored ");
3244 xmlFARegDebugExec(exec
);
3248 /************************************************************************
3250 * Verifier, running an input against a compiled regexp *
3252 ************************************************************************/
3255 xmlFARegExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
3256 xmlRegExecCtxt execval
;
3257 xmlRegExecCtxtPtr exec
= &execval
;
3258 int ret
, codepoint
= 0, len
, deter
;
3260 exec
->inputString
= content
;
3263 exec
->determinist
= 1;
3264 exec
->maxRollbacks
= 0;
3265 exec
->nbRollbacks
= 0;
3266 exec
->rollbacks
= NULL
;
3269 exec
->state
= comp
->states
[0];
3271 exec
->transcount
= 0;
3272 exec
->inputStack
= NULL
;
3273 exec
->inputStackMax
= 0;
3274 if (comp
->nbCounters
> 0) {
3275 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int));
3276 if (exec
->counts
== NULL
) {
3277 xmlRegexpErrMemory(NULL
, "running regexp");
3280 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int));
3282 exec
->counts
= NULL
;
3283 while ((exec
->status
== 0) && (exec
->state
!= NULL
) &&
3284 ((exec
->inputString
[exec
->index
] != 0) ||
3285 ((exec
->state
!= NULL
) &&
3286 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
)))) {
3287 xmlRegTransPtr trans
;
3291 * If end of input on non-terminal state, rollback, however we may
3292 * still have epsilon like transition for counted transitions
3293 * on counters, in that case don't break too early. Additionally,
3294 * if we are working on a range like "AB{0,2}", where B is not present,
3295 * we don't want to break.
3298 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
)) {
3300 * if there is a transition, we must check if
3301 * atom allows minOccurs of 0
3303 if (exec
->transno
< exec
->state
->nbTrans
) {
3304 trans
= &exec
->state
->trans
[exec
->transno
];
3305 if (trans
->to
>=0) {
3307 if (!((atom
->min
== 0) && (atom
->max
> 0)))
3314 exec
->transcount
= 0;
3315 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
3316 trans
= &exec
->state
->trans
[exec
->transno
];
3322 if (trans
->count
>= 0) {
3324 xmlRegCounterPtr counter
;
3326 if (exec
->counts
== NULL
) {
3331 * A counted transition.
3334 count
= exec
->counts
[trans
->count
];
3335 counter
= &exec
->comp
->counters
[trans
->count
];
3336 #ifdef DEBUG_REGEXP_EXEC
3337 printf("testing count %d: val %d, min %d, max %d\n",
3338 trans
->count
, count
, counter
->min
, counter
->max
);
3340 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
3341 if ((ret
) && (counter
->min
!= counter
->max
))
3343 } else if (atom
== NULL
) {
3344 fprintf(stderr
, "epsilon transition left at runtime\n");
3347 } else if (exec
->inputString
[exec
->index
] != 0) {
3348 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
3349 ret
= xmlRegCheckCharacter(atom
, codepoint
);
3350 if ((ret
== 1) && (atom
->min
>= 0) && (atom
->max
> 0)) {
3351 xmlRegStatePtr to
= comp
->states
[trans
->to
];
3354 * this is a multiple input sequence
3355 * If there is a counter associated increment it now.
3356 * do not increment if the counter is already over the
3357 * maximum limit in which case get to next transition
3359 if (trans
->counter
>= 0) {
3360 xmlRegCounterPtr counter
;
3362 if ((exec
->counts
== NULL
) ||
3363 (exec
->comp
== NULL
) ||
3364 (exec
->comp
->counters
== NULL
)) {
3368 counter
= &exec
->comp
->counters
[trans
->counter
];
3369 if (exec
->counts
[trans
->counter
] >= counter
->max
)
3370 continue; /* for loop on transitions */
3372 /* Save before incrementing */
3373 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
3374 xmlFARegExecSave(exec
);
3376 if (trans
->counter
>= 0) {
3377 #ifdef DEBUG_REGEXP_EXEC
3378 printf("Increasing count %d\n", trans
->counter
);
3380 exec
->counts
[trans
->counter
]++;
3382 exec
->transcount
= 1;
3385 * Try to progress as much as possible on the input
3387 if (exec
->transcount
== atom
->max
) {
3392 * End of input: stop here
3394 if (exec
->inputString
[exec
->index
] == 0) {
3398 if (exec
->transcount
>= atom
->min
) {
3399 int transno
= exec
->transno
;
3400 xmlRegStatePtr state
= exec
->state
;
3403 * The transition is acceptable save it
3405 exec
->transno
= -1; /* trick */
3407 xmlFARegExecSave(exec
);
3408 exec
->transno
= transno
;
3409 exec
->state
= state
;
3411 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
3413 ret
= xmlRegCheckCharacter(atom
, codepoint
);
3416 if (exec
->transcount
< atom
->min
)
3420 * If the last check failed but one transition was found
3421 * possible, rollback
3428 if (trans
->counter
>= 0) {
3429 if (exec
->counts
== NULL
) {
3433 #ifdef DEBUG_REGEXP_EXEC
3434 printf("Decreasing count %d\n", trans
->counter
);
3436 exec
->counts
[trans
->counter
]--;
3438 } else if ((ret
== 0) && (atom
->min
== 0) && (atom
->max
> 0)) {
3440 * we don't match on the codepoint, but minOccurs of 0
3441 * says that's ok. Setting len to 0 inhibits stepping
3442 * over the codepoint.
3444 exec
->transcount
= 1;
3448 } else if ((atom
->min
== 0) && (atom
->max
> 0)) {
3449 /* another spot to match when minOccurs is 0 */
3450 exec
->transcount
= 1;
3455 if ((trans
->nd
== 1) ||
3456 ((trans
->count
>= 0) && (deter
== 0) &&
3457 (exec
->state
->nbTrans
> exec
->transno
+ 1))) {
3458 #ifdef DEBUG_REGEXP_EXEC
3460 printf("Saving on nd transition atom %d for %c at %d\n",
3461 trans
->atom
->no
, codepoint
, exec
->index
);
3463 printf("Saving on counted transition count %d for %c at %d\n",
3464 trans
->count
, codepoint
, exec
->index
);
3466 xmlFARegExecSave(exec
);
3468 if (trans
->counter
>= 0) {
3469 xmlRegCounterPtr counter
;
3471 /* make sure we don't go over the counter maximum value */
3472 if ((exec
->counts
== NULL
) ||
3473 (exec
->comp
== NULL
) ||
3474 (exec
->comp
->counters
== NULL
)) {
3478 counter
= &exec
->comp
->counters
[trans
->counter
];
3479 if (exec
->counts
[trans
->counter
] >= counter
->max
)
3480 continue; /* for loop on transitions */
3481 #ifdef DEBUG_REGEXP_EXEC
3482 printf("Increasing count %d\n", trans
->counter
);
3484 exec
->counts
[trans
->counter
]++;
3486 if ((trans
->count
>= 0) &&
3487 (trans
->count
< REGEXP_ALL_COUNTER
)) {
3488 if (exec
->counts
== NULL
) {
3492 #ifdef DEBUG_REGEXP_EXEC
3493 printf("resetting count %d on transition\n",
3496 exec
->counts
[trans
->count
] = 0;
3498 #ifdef DEBUG_REGEXP_EXEC
3499 printf("entering state %d\n", trans
->to
);
3501 exec
->state
= comp
->states
[trans
->to
];
3503 if (trans
->atom
!= NULL
) {
3507 } else if (ret
< 0) {
3512 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
3515 * Failed to find a way out
3517 exec
->determinist
= 0;
3518 #ifdef DEBUG_REGEXP_EXEC
3519 printf("rollback from state %d on %d:%c\n", exec
->state
->no
,
3520 codepoint
,codepoint
);
3522 xmlFARegExecRollBack(exec
);
3528 if (exec
->rollbacks
!= NULL
) {
3529 if (exec
->counts
!= NULL
) {
3532 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
3533 if (exec
->rollbacks
[i
].counts
!= NULL
)
3534 xmlFree(exec
->rollbacks
[i
].counts
);
3536 xmlFree(exec
->rollbacks
);
3538 if (exec
->state
== NULL
)
3540 if (exec
->counts
!= NULL
)
3541 xmlFree(exec
->counts
);
3542 if (exec
->status
== 0)
3544 if (exec
->status
== -1) {
3545 if (exec
->nbPush
> MAX_PUSH
)
3549 return(exec
->status
);
3552 /************************************************************************
3554 * Progressive interface to the verifier one atom at a time *
3556 ************************************************************************/
3558 static void testerr(xmlRegExecCtxtPtr exec
);
3562 * xmlRegNewExecCtxt:
3563 * @comp: a precompiled regular expression
3564 * @callback: a callback function used for handling progresses in the
3565 * automata matching phase
3566 * @data: the context data associated to the callback in this context
3568 * Build a context used for progressive evaluation of a regexp.
3570 * Returns the new context
3573 xmlRegNewExecCtxt(xmlRegexpPtr comp
, xmlRegExecCallbacks callback
, void *data
) {
3574 xmlRegExecCtxtPtr exec
;
3578 if ((comp
->compact
== NULL
) && (comp
->states
== NULL
))
3580 exec
= (xmlRegExecCtxtPtr
) xmlMalloc(sizeof(xmlRegExecCtxt
));
3582 xmlRegexpErrMemory(NULL
, "creating execution context");
3585 memset(exec
, 0, sizeof(xmlRegExecCtxt
));
3586 exec
->inputString
= NULL
;
3588 exec
->determinist
= 1;
3589 exec
->maxRollbacks
= 0;
3590 exec
->nbRollbacks
= 0;
3591 exec
->rollbacks
= NULL
;
3594 if (comp
->compact
== NULL
)
3595 exec
->state
= comp
->states
[0];
3597 exec
->transcount
= 0;
3598 exec
->callback
= callback
;
3600 if (comp
->nbCounters
> 0) {
3602 * For error handling, exec->counts is allocated twice the size
3603 * the second half is used to store the data in case of rollback
3605 exec
->counts
= (int *) xmlMalloc(comp
->nbCounters
* sizeof(int)
3607 if (exec
->counts
== NULL
) {
3608 xmlRegexpErrMemory(NULL
, "creating execution context");
3612 memset(exec
->counts
, 0, comp
->nbCounters
* sizeof(int) * 2);
3613 exec
->errCounts
= &exec
->counts
[comp
->nbCounters
];
3615 exec
->counts
= NULL
;
3616 exec
->errCounts
= NULL
;
3618 exec
->inputStackMax
= 0;
3619 exec
->inputStackNr
= 0;
3620 exec
->inputStack
= NULL
;
3621 exec
->errStateNo
= -1;
3622 exec
->errString
= NULL
;
3628 * xmlRegFreeExecCtxt:
3629 * @exec: a regular expression evaluation context
3631 * Free the structures associated to a regular expression evaluation context.
3634 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec
) {
3638 if (exec
->rollbacks
!= NULL
) {
3639 if (exec
->counts
!= NULL
) {
3642 for (i
= 0;i
< exec
->maxRollbacks
;i
++)
3643 if (exec
->rollbacks
[i
].counts
!= NULL
)
3644 xmlFree(exec
->rollbacks
[i
].counts
);
3646 xmlFree(exec
->rollbacks
);
3648 if (exec
->counts
!= NULL
)
3649 xmlFree(exec
->counts
);
3650 if (exec
->inputStack
!= NULL
) {
3653 for (i
= 0;i
< exec
->inputStackNr
;i
++) {
3654 if (exec
->inputStack
[i
].value
!= NULL
)
3655 xmlFree(exec
->inputStack
[i
].value
);
3657 xmlFree(exec
->inputStack
);
3659 if (exec
->errString
!= NULL
)
3660 xmlFree(exec
->errString
);
3665 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
3668 printf("saving value: %d:%s\n", exec
->inputStackNr
, value
);
3670 if (exec
->inputStackMax
== 0) {
3671 exec
->inputStackMax
= 4;
3672 exec
->inputStack
= (xmlRegInputTokenPtr
)
3673 xmlMalloc(exec
->inputStackMax
* sizeof(xmlRegInputToken
));
3674 if (exec
->inputStack
== NULL
) {
3675 xmlRegexpErrMemory(NULL
, "pushing input string");
3676 exec
->inputStackMax
= 0;
3679 } else if (exec
->inputStackNr
+ 1 >= exec
->inputStackMax
) {
3680 xmlRegInputTokenPtr tmp
;
3682 exec
->inputStackMax
*= 2;
3683 tmp
= (xmlRegInputTokenPtr
) xmlRealloc(exec
->inputStack
,
3684 exec
->inputStackMax
* sizeof(xmlRegInputToken
));
3686 xmlRegexpErrMemory(NULL
, "pushing input string");
3687 exec
->inputStackMax
/= 2;
3690 exec
->inputStack
= tmp
;
3692 exec
->inputStack
[exec
->inputStackNr
].value
= xmlStrdup(value
);
3693 exec
->inputStack
[exec
->inputStackNr
].data
= data
;
3694 exec
->inputStackNr
++;
3695 exec
->inputStack
[exec
->inputStackNr
].value
= NULL
;
3696 exec
->inputStack
[exec
->inputStackNr
].data
= NULL
;
3700 * xmlRegStrEqualWildcard:
3701 * @expStr: the string to be evaluated
3702 * @valStr: the validation string
3704 * Checks if both strings are equal or have the same content. "*"
3705 * can be used as a wildcard in @valStr; "|" is used as a separator of
3706 * substrings in both @expStr and @valStr.
3708 * Returns 1 if the comparison is satisfied and the number of substrings
3709 * is equal, 0 otherwise.
3713 xmlRegStrEqualWildcard(const xmlChar
*expStr
, const xmlChar
*valStr
) {
3714 if (expStr
== valStr
) return(1);
3715 if (expStr
== NULL
) return(0);
3716 if (valStr
== NULL
) return(0);
3719 * Eval if we have a wildcard for the current item.
3721 if (*expStr
!= *valStr
) {
3722 /* if one of them starts with a wildcard make valStr be it */
3723 if (*valStr
== '*') {
3730 if ((*valStr
!= 0) && (*expStr
!= 0) && (*expStr
++ == '*')) {
3732 if (*valStr
== XML_REG_STRING_SEPARATOR
)
3735 } while (*valStr
!= 0);
3742 } while (*valStr
!= 0);
3750 * xmlRegCompactPushString:
3751 * @exec: a regexp execution context
3752 * @comp: the precompiled exec with a compact table
3753 * @value: a string token input
3754 * @data: data associated to the token to reuse in callbacks
3756 * Push one input token in the execution context
3758 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3759 * a negative value in case of error.
3762 xmlRegCompactPushString(xmlRegExecCtxtPtr exec
,
3764 const xmlChar
*value
,
3766 int state
= exec
->index
;
3769 if ((comp
== NULL
) || (comp
->compact
== NULL
) || (comp
->stringMap
== NULL
))
3772 if (value
== NULL
) {
3774 * are we at a final state ?
3776 if (comp
->compact
[state
* (comp
->nbstrings
+ 1)] ==
3777 XML_REGEXP_FINAL_STATE
)
3783 printf("value pushed: %s\n", value
);
3787 * Examine all outside transitions from current state
3789 for (i
= 0;i
< comp
->nbstrings
;i
++) {
3790 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
3791 if ((target
> 0) && (target
<= comp
->nbstates
)) {
3792 target
--; /* to avoid 0 */
3793 if (xmlRegStrEqualWildcard(comp
->stringMap
[i
], value
)) {
3794 exec
->index
= target
;
3795 if ((exec
->callback
!= NULL
) && (comp
->transdata
!= NULL
)) {
3796 exec
->callback(exec
->data
, value
,
3797 comp
->transdata
[state
* comp
->nbstrings
+ i
], data
);
3800 printf("entering state %d\n", target
);
3802 if (comp
->compact
[target
* (comp
->nbstrings
+ 1)] ==
3803 XML_REGEXP_SINK_STATE
)
3806 if (comp
->compact
[target
* (comp
->nbstrings
+ 1)] ==
3807 XML_REGEXP_FINAL_STATE
)
3814 * Failed to find an exit transition out from current state for the
3818 printf("failed to find a transition for %s on state %d\n", value
, state
);
3821 if (exec
->errString
!= NULL
)
3822 xmlFree(exec
->errString
);
3823 exec
->errString
= xmlStrdup(value
);
3824 exec
->errStateNo
= state
;
3833 * xmlRegExecPushStringInternal:
3834 * @exec: a regexp execution context or NULL to indicate the end
3835 * @value: a string token input
3836 * @data: data associated to the token to reuse in callbacks
3837 * @compound: value was assembled from 2 strings
3839 * Push one input token in the execution context
3841 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3842 * a negative value in case of error.
3845 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
3846 void *data
, int compound
) {
3847 xmlRegTransPtr trans
;
3855 if (exec
->comp
== NULL
)
3857 if (exec
->status
!= 0)
3858 return(exec
->status
);
3860 if (exec
->comp
->compact
!= NULL
)
3861 return(xmlRegCompactPushString(exec
, exec
->comp
, value
, data
));
3863 if (value
== NULL
) {
3864 if (exec
->state
->type
== XML_REGEXP_FINAL_STATE
)
3870 printf("value pushed: %s\n", value
);
3873 * If we have an active rollback stack push the new value there
3874 * and get back to where we were left
3876 if ((value
!= NULL
) && (exec
->inputStackNr
> 0)) {
3877 xmlFARegExecSaveInputString(exec
, value
, data
);
3878 value
= exec
->inputStack
[exec
->index
].value
;
3879 data
= exec
->inputStack
[exec
->index
].data
;
3881 printf("value loaded: %s\n", value
);
3885 while ((exec
->status
== 0) &&
3888 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
)))) {
3891 * End of input on non-terminal state, rollback, however we may
3892 * still have epsilon like transition for counted transitions
3893 * on counters, in that case don't break too early.
3895 if ((value
== NULL
) && (exec
->counts
== NULL
))
3898 exec
->transcount
= 0;
3899 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
3900 trans
= &exec
->state
->trans
[exec
->transno
];
3905 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
3909 xmlRegCounterPtr counter
;
3914 printf("testing all lax %d\n", trans
->count
);
3917 * Check all counted transitions from the current state
3919 if ((value
== NULL
) && (final
)) {
3921 } else if (value
!= NULL
) {
3922 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
3923 t
= &exec
->state
->trans
[i
];
3924 if ((t
->counter
< 0) || (t
== trans
))
3926 counter
= &exec
->comp
->counters
[t
->counter
];
3927 count
= exec
->counts
[t
->counter
];
3928 if ((count
< counter
->max
) &&
3929 (t
->atom
!= NULL
) &&
3930 (xmlStrEqual(value
, t
->atom
->valuep
))) {
3934 if ((count
>= counter
->min
) &&
3935 (count
< counter
->max
) &&
3936 (t
->atom
!= NULL
) &&
3937 (xmlStrEqual(value
, t
->atom
->valuep
))) {
3943 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
3947 xmlRegCounterPtr counter
;
3952 printf("testing all %d\n", trans
->count
);
3955 * Check all counted transitions from the current state
3957 for (i
= 0;i
< exec
->state
->nbTrans
;i
++) {
3958 t
= &exec
->state
->trans
[i
];
3959 if ((t
->counter
< 0) || (t
== trans
))
3961 counter
= &exec
->comp
->counters
[t
->counter
];
3962 count
= exec
->counts
[t
->counter
];
3963 if ((count
< counter
->min
) || (count
> counter
->max
)) {
3968 } else if (trans
->count
>= 0) {
3970 xmlRegCounterPtr counter
;
3973 * A counted transition.
3976 count
= exec
->counts
[trans
->count
];
3977 counter
= &exec
->comp
->counters
[trans
->count
];
3979 printf("testing count %d: val %d, min %d, max %d\n",
3980 trans
->count
, count
, counter
->min
, counter
->max
);
3982 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
3983 } else if (atom
== NULL
) {
3984 fprintf(stderr
, "epsilon transition left at runtime\n");
3987 } else if (value
!= NULL
) {
3988 ret
= xmlRegStrEqualWildcard(atom
->valuep
, value
);
3994 if ((ret
== 1) && (trans
->counter
>= 0)) {
3995 xmlRegCounterPtr counter
;
3998 count
= exec
->counts
[trans
->counter
];
3999 counter
= &exec
->comp
->counters
[trans
->counter
];
4000 if (count
>= counter
->max
)
4004 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
4005 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
4008 * this is a multiple input sequence
4010 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4011 if (exec
->inputStackNr
<= 0) {
4012 xmlFARegExecSaveInputString(exec
, value
, data
);
4014 xmlFARegExecSave(exec
);
4016 exec
->transcount
= 1;
4019 * Try to progress as much as possible on the input
4021 if (exec
->transcount
== atom
->max
) {
4025 value
= exec
->inputStack
[exec
->index
].value
;
4026 data
= exec
->inputStack
[exec
->index
].data
;
4028 printf("value loaded: %s\n", value
);
4032 * End of input: stop here
4034 if (value
== NULL
) {
4038 if (exec
->transcount
>= atom
->min
) {
4039 int transno
= exec
->transno
;
4040 xmlRegStatePtr state
= exec
->state
;
4043 * The transition is acceptable save it
4045 exec
->transno
= -1; /* trick */
4047 if (exec
->inputStackNr
<= 0) {
4048 xmlFARegExecSaveInputString(exec
, value
, data
);
4050 xmlFARegExecSave(exec
);
4051 exec
->transno
= transno
;
4052 exec
->state
= state
;
4054 ret
= xmlStrEqual(value
, atom
->valuep
);
4057 if (exec
->transcount
< atom
->min
)
4061 * If the last check failed but one transition was found
4062 * possible, rollback
4072 if ((exec
->callback
!= NULL
) && (atom
!= NULL
) &&
4074 exec
->callback(exec
->data
, atom
->valuep
,
4077 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4078 if (exec
->inputStackNr
<= 0) {
4079 xmlFARegExecSaveInputString(exec
, value
, data
);
4081 xmlFARegExecSave(exec
);
4083 if (trans
->counter
>= 0) {
4085 printf("Increasing count %d\n", trans
->counter
);
4087 exec
->counts
[trans
->counter
]++;
4089 if ((trans
->count
>= 0) &&
4090 (trans
->count
< REGEXP_ALL_COUNTER
)) {
4091 #ifdef DEBUG_REGEXP_EXEC
4092 printf("resetting count %d on transition\n",
4095 exec
->counts
[trans
->count
] = 0;
4098 printf("entering state %d\n", trans
->to
);
4100 if ((exec
->comp
->states
[trans
->to
] != NULL
) &&
4101 (exec
->comp
->states
[trans
->to
]->type
==
4102 XML_REGEXP_SINK_STATE
)) {
4104 * entering a sink state, save the current state as error
4107 if (exec
->errString
!= NULL
)
4108 xmlFree(exec
->errString
);
4109 exec
->errString
= xmlStrdup(value
);
4110 exec
->errState
= exec
->state
;
4111 memcpy(exec
->errCounts
, exec
->counts
,
4112 exec
->comp
->nbCounters
* sizeof(int));
4114 exec
->state
= exec
->comp
->states
[trans
->to
];
4116 if (trans
->atom
!= NULL
) {
4117 if (exec
->inputStack
!= NULL
) {
4119 if (exec
->index
< exec
->inputStackNr
) {
4120 value
= exec
->inputStack
[exec
->index
].value
;
4121 data
= exec
->inputStack
[exec
->index
].data
;
4123 printf("value loaded: %s\n", value
);
4129 printf("end of input\n");
4136 printf("end of input\n");
4141 } else if (ret
< 0) {
4146 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
4149 * if we didn't yet rollback on the current input
4150 * store the current state as the error state.
4152 if ((progress
) && (exec
->state
!= NULL
) &&
4153 (exec
->state
->type
!= XML_REGEXP_SINK_STATE
)) {
4155 if (exec
->errString
!= NULL
)
4156 xmlFree(exec
->errString
);
4157 exec
->errString
= xmlStrdup(value
);
4158 exec
->errState
= exec
->state
;
4159 if (exec
->comp
->nbCounters
)
4160 memcpy(exec
->errCounts
, exec
->counts
,
4161 exec
->comp
->nbCounters
* sizeof(int));
4165 * Failed to find a way out
4167 exec
->determinist
= 0;
4168 xmlFARegExecRollBack(exec
);
4169 if ((exec
->inputStack
!= NULL
) && (exec
->status
== 0)) {
4170 value
= exec
->inputStack
[exec
->index
].value
;
4171 data
= exec
->inputStack
[exec
->index
].data
;
4173 printf("value loaded: %s\n", value
);
4182 if (exec
->status
== 0) {
4183 return(exec
->state
->type
== XML_REGEXP_FINAL_STATE
);
4186 if (exec
->status
< 0) {
4190 return(exec
->status
);
4194 * xmlRegExecPushString:
4195 * @exec: a regexp execution context or NULL to indicate the end
4196 * @value: a string token input
4197 * @data: data associated to the token to reuse in callbacks
4199 * Push one input token in the execution context
4201 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4202 * a negative value in case of error.
4205 xmlRegExecPushString(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
4207 return(xmlRegExecPushStringInternal(exec
, value
, data
, 0));
4211 * xmlRegExecPushString2:
4212 * @exec: a regexp execution context or NULL to indicate the end
4213 * @value: the first string token input
4214 * @value2: the second string token input
4215 * @data: data associated to the token to reuse in callbacks
4217 * Push one input token in the execution context
4219 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4220 * a negative value in case of error.
4223 xmlRegExecPushString2(xmlRegExecCtxtPtr exec
, const xmlChar
*value
,
4224 const xmlChar
*value2
, void *data
) {
4226 int lenn
, lenp
, ret
;
4231 if (exec
->comp
== NULL
)
4233 if (exec
->status
!= 0)
4234 return(exec
->status
);
4237 return(xmlRegExecPushString(exec
, value
, data
));
4239 lenn
= strlen((char *) value2
);
4240 lenp
= strlen((char *) value
);
4242 if (150 < lenn
+ lenp
+ 2) {
4243 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
4251 memcpy(&str
[0], value
, lenp
);
4252 str
[lenp
] = XML_REG_STRING_SEPARATOR
;
4253 memcpy(&str
[lenp
+ 1], value2
, lenn
);
4254 str
[lenn
+ lenp
+ 1] = 0;
4256 if (exec
->comp
->compact
!= NULL
)
4257 ret
= xmlRegCompactPushString(exec
, exec
->comp
, str
, data
);
4259 ret
= xmlRegExecPushStringInternal(exec
, str
, data
, 1);
4267 * xmlRegExecGetValues:
4268 * @exec: a regexp execution context
4269 * @err: error extraction or normal one
4270 * @nbval: pointer to the number of accepted values IN/OUT
4271 * @nbneg: return number of negative transitions
4272 * @values: pointer to the array of acceptable values
4273 * @terminal: return value if this was a terminal state
4275 * Extract information from the regexp execution, internal routine to
4276 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4278 * Returns: 0 in case of success or -1 in case of error.
4281 xmlRegExecGetValues(xmlRegExecCtxtPtr exec
, int err
,
4282 int *nbval
, int *nbneg
,
4283 xmlChar
**values
, int *terminal
) {
4287 if ((exec
== NULL
) || (nbval
== NULL
) || (nbneg
== NULL
) ||
4288 (values
== NULL
) || (*nbval
<= 0))
4294 if ((exec
->comp
!= NULL
) && (exec
->comp
->compact
!= NULL
)) {
4296 int target
, i
, state
;
4301 if (exec
->errStateNo
== -1) return(-1);
4302 state
= exec
->errStateNo
;
4304 state
= exec
->index
;
4306 if (terminal
!= NULL
) {
4307 if (comp
->compact
[state
* (comp
->nbstrings
+ 1)] ==
4308 XML_REGEXP_FINAL_STATE
)
4313 for (i
= 0;(i
< comp
->nbstrings
) && (nb
< maxval
);i
++) {
4314 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
4315 if ((target
> 0) && (target
<= comp
->nbstates
) &&
4316 (comp
->compact
[(target
- 1) * (comp
->nbstrings
+ 1)] !=
4317 XML_REGEXP_SINK_STATE
)) {
4318 values
[nb
++] = comp
->stringMap
[i
];
4322 for (i
= 0;(i
< comp
->nbstrings
) && (nb
< maxval
);i
++) {
4323 target
= comp
->compact
[state
* (comp
->nbstrings
+ 1) + i
+ 1];
4324 if ((target
> 0) && (target
<= comp
->nbstates
) &&
4325 (comp
->compact
[(target
- 1) * (comp
->nbstrings
+ 1)] ==
4326 XML_REGEXP_SINK_STATE
)) {
4327 values
[nb
++] = comp
->stringMap
[i
];
4333 xmlRegTransPtr trans
;
4335 xmlRegStatePtr state
;
4337 if (terminal
!= NULL
) {
4338 if (exec
->state
->type
== XML_REGEXP_FINAL_STATE
)
4345 if (exec
->errState
== NULL
) return(-1);
4346 state
= exec
->errState
;
4348 if (exec
->state
== NULL
) return(-1);
4349 state
= exec
->state
;
4352 (transno
< state
->nbTrans
) && (nb
< maxval
);
4354 trans
= &state
->trans
[transno
];
4358 if ((atom
== NULL
) || (atom
->valuep
== NULL
))
4360 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
4361 /* this should not be reached but ... */
4363 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
4364 /* this should not be reached but ... */
4366 } else if (trans
->counter
>= 0) {
4367 xmlRegCounterPtr counter
= NULL
;
4371 count
= exec
->errCounts
[trans
->counter
];
4373 count
= exec
->counts
[trans
->counter
];
4374 if (exec
->comp
!= NULL
)
4375 counter
= &exec
->comp
->counters
[trans
->counter
];
4376 if ((counter
== NULL
) || (count
< counter
->max
)) {
4378 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4380 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4384 if ((exec
->comp
!= NULL
) && (exec
->comp
->states
[trans
->to
] != NULL
) &&
4385 (exec
->comp
->states
[trans
->to
]->type
!=
4386 XML_REGEXP_SINK_STATE
)) {
4388 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4390 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4396 (transno
< state
->nbTrans
) && (nb
< maxval
);
4398 trans
= &state
->trans
[transno
];
4402 if ((atom
== NULL
) || (atom
->valuep
== NULL
))
4404 if (trans
->count
== REGEXP_ALL_LAX_COUNTER
) {
4406 } else if (trans
->count
== REGEXP_ALL_COUNTER
) {
4408 } else if (trans
->counter
>= 0) {
4411 if ((exec
->comp
->states
[trans
->to
] != NULL
) &&
4412 (exec
->comp
->states
[trans
->to
]->type
==
4413 XML_REGEXP_SINK_STATE
)) {
4415 values
[nb
++] = (xmlChar
*) atom
->valuep2
;
4417 values
[nb
++] = (xmlChar
*) atom
->valuep
;
4427 * xmlRegExecNextValues:
4428 * @exec: a regexp execution context
4429 * @nbval: pointer to the number of accepted values IN/OUT
4430 * @nbneg: return number of negative transitions
4431 * @values: pointer to the array of acceptable values
4432 * @terminal: return value if this was a terminal state
4434 * Extract information from the regexp execution,
4435 * the parameter @values must point to an array of @nbval string pointers
4436 * on return nbval will contain the number of possible strings in that
4437 * state and the @values array will be updated with them. The string values
4438 * returned will be freed with the @exec context and don't need to be
4441 * Returns: 0 in case of success or -1 in case of error.
4444 xmlRegExecNextValues(xmlRegExecCtxtPtr exec
, int *nbval
, int *nbneg
,
4445 xmlChar
**values
, int *terminal
) {
4446 return(xmlRegExecGetValues(exec
, 0, nbval
, nbneg
, values
, terminal
));
4450 * xmlRegExecErrInfo:
4451 * @exec: a regexp execution context generating an error
4452 * @string: return value for the error string
4453 * @nbval: pointer to the number of accepted values IN/OUT
4454 * @nbneg: return number of negative transitions
4455 * @values: pointer to the array of acceptable values
4456 * @terminal: return value if this was a terminal state
4458 * Extract error information from the regexp execution, the parameter
4459 * @string will be updated with the value pushed and not accepted,
4460 * the parameter @values must point to an array of @nbval string pointers
4461 * on return nbval will contain the number of possible strings in that
4462 * state and the @values array will be updated with them. The string values
4463 * returned will be freed with the @exec context and don't need to be
4466 * Returns: 0 in case of success or -1 in case of error.
4469 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec
, const xmlChar
**string
,
4470 int *nbval
, int *nbneg
, xmlChar
**values
, int *terminal
) {
4473 if (string
!= NULL
) {
4474 if (exec
->status
!= 0)
4475 *string
= exec
->errString
;
4479 return(xmlRegExecGetValues(exec
, 1, nbval
, nbneg
, values
, terminal
));
4483 static void testerr(xmlRegExecCtxtPtr exec
) {
4484 const xmlChar
*string
;
4489 xmlRegExecErrInfo(exec
, &string
, &nb
, &nbneg
, &values
[0], &terminal
);
4495 xmlRegExecPushChar(xmlRegExecCtxtPtr exec
, int UCS
) {
4496 xmlRegTransPtr trans
;
4503 if (exec
->status
!= 0)
4504 return(exec
->status
);
4506 while ((exec
->status
== 0) &&
4507 ((exec
->inputString
[exec
->index
] != 0) ||
4508 (exec
->state
->type
!= XML_REGEXP_FINAL_STATE
))) {
4511 * End of input on non-terminal state, rollback, however we may
4512 * still have epsilon like transition for counted transitions
4513 * on counters, in that case don't break too early.
4515 if ((exec
->inputString
[exec
->index
] == 0) && (exec
->counts
== NULL
))
4518 exec
->transcount
= 0;
4519 for (;exec
->transno
< exec
->state
->nbTrans
;exec
->transno
++) {
4520 trans
= &exec
->state
->trans
[exec
->transno
];
4525 if (trans
->count
>= 0) {
4527 xmlRegCounterPtr counter
;
4530 * A counted transition.
4533 count
= exec
->counts
[trans
->count
];
4534 counter
= &exec
->comp
->counters
[trans
->count
];
4535 #ifdef DEBUG_REGEXP_EXEC
4536 printf("testing count %d: val %d, min %d, max %d\n",
4537 trans
->count
, count
, counter
->min
, counter
->max
);
4539 ret
= ((count
>= counter
->min
) && (count
<= counter
->max
));
4540 } else if (atom
== NULL
) {
4541 fprintf(stderr
, "epsilon transition left at runtime\n");
4544 } else if (exec
->inputString
[exec
->index
] != 0) {
4545 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]), len
);
4546 ret
= xmlRegCheckCharacter(atom
, codepoint
);
4547 if ((ret
== 1) && (atom
->min
> 0) && (atom
->max
> 0)) {
4548 xmlRegStatePtr to
= exec
->comp
->states
[trans
->to
];
4551 * this is a multiple input sequence
4553 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4554 xmlFARegExecSave(exec
);
4556 exec
->transcount
= 1;
4559 * Try to progress as much as possible on the input
4561 if (exec
->transcount
== atom
->max
) {
4566 * End of input: stop here
4568 if (exec
->inputString
[exec
->index
] == 0) {
4572 if (exec
->transcount
>= atom
->min
) {
4573 int transno
= exec
->transno
;
4574 xmlRegStatePtr state
= exec
->state
;
4577 * The transition is acceptable save it
4579 exec
->transno
= -1; /* trick */
4581 xmlFARegExecSave(exec
);
4582 exec
->transno
= transno
;
4583 exec
->state
= state
;
4585 codepoint
= CUR_SCHAR(&(exec
->inputString
[exec
->index
]),
4587 ret
= xmlRegCheckCharacter(atom
, codepoint
);
4590 if (exec
->transcount
< atom
->min
)
4594 * If the last check failed but one transition was found
4595 * possible, rollback
4605 if (exec
->state
->nbTrans
> exec
->transno
+ 1) {
4606 xmlFARegExecSave(exec
);
4609 * restart count for expressions like this ((abc){2})*
4611 if (trans
->count
>= 0) {
4612 #ifdef DEBUG_REGEXP_EXEC
4613 printf("Reset count %d\n", trans
->count
);
4615 exec
->counts
[trans
->count
] = 0;
4617 if (trans
->counter
>= 0) {
4618 #ifdef DEBUG_REGEXP_EXEC
4619 printf("Increasing count %d\n", trans
->counter
);
4621 exec
->counts
[trans
->counter
]++;
4623 #ifdef DEBUG_REGEXP_EXEC
4624 printf("entering state %d\n", trans
->to
);
4626 exec
->state
= exec
->comp
->states
[trans
->to
];
4628 if (trans
->atom
!= NULL
) {
4632 } else if (ret
< 0) {
4637 if ((exec
->transno
!= 0) || (exec
->state
->nbTrans
== 0)) {
4640 * Failed to find a way out
4642 exec
->determinist
= 0;
4643 xmlFARegExecRollBack(exec
);
4650 /************************************************************************
4652 * Parser for the Schemas Datatype Regular Expressions *
4653 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4655 ************************************************************************/
4659 * @ctxt: a regexp parser context
4661 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4664 xmlFAIsChar(xmlRegParserCtxtPtr ctxt
) {
4668 cur
= CUR_SCHAR(ctxt
->cur
, len
);
4669 if ((cur
== '.') || (cur
== '\\') || (cur
== '?') ||
4670 (cur
== '*') || (cur
== '+') || (cur
== '(') ||
4671 (cur
== ')') || (cur
== '|') || (cur
== 0x5B) ||
4672 (cur
== 0x5D) || (cur
== 0))
4678 * xmlFAParseCharProp:
4679 * @ctxt: a regexp parser context
4681 * [27] charProp ::= IsCategory | IsBlock
4682 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4683 * Separators | Symbols | Others
4684 * [29] Letters ::= 'L' [ultmo]?
4685 * [30] Marks ::= 'M' [nce]?
4686 * [31] Numbers ::= 'N' [dlo]?
4687 * [32] Punctuation ::= 'P' [cdseifo]?
4688 * [33] Separators ::= 'Z' [slp]?
4689 * [34] Symbols ::= 'S' [mcko]?
4690 * [35] Others ::= 'C' [cfon]?
4691 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4694 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt
) {
4696 xmlRegAtomType type
= (xmlRegAtomType
) 0;
4697 xmlChar
*blockName
= NULL
;
4705 type
= XML_REGEXP_LETTER_UPPERCASE
;
4706 } else if (cur
== 'l') {
4708 type
= XML_REGEXP_LETTER_LOWERCASE
;
4709 } else if (cur
== 't') {
4711 type
= XML_REGEXP_LETTER_TITLECASE
;
4712 } else if (cur
== 'm') {
4714 type
= XML_REGEXP_LETTER_MODIFIER
;
4715 } else if (cur
== 'o') {
4717 type
= XML_REGEXP_LETTER_OTHERS
;
4719 type
= XML_REGEXP_LETTER
;
4721 } else if (cur
== 'M') {
4727 type
= XML_REGEXP_MARK_NONSPACING
;
4728 } else if (cur
== 'c') {
4730 /* spacing combining */
4731 type
= XML_REGEXP_MARK_SPACECOMBINING
;
4732 } else if (cur
== 'e') {
4735 type
= XML_REGEXP_MARK_ENCLOSING
;
4738 type
= XML_REGEXP_MARK
;
4740 } else if (cur
== 'N') {
4746 type
= XML_REGEXP_NUMBER_DECIMAL
;
4747 } else if (cur
== 'l') {
4750 type
= XML_REGEXP_NUMBER_LETTER
;
4751 } else if (cur
== 'o') {
4754 type
= XML_REGEXP_NUMBER_OTHERS
;
4757 type
= XML_REGEXP_NUMBER
;
4759 } else if (cur
== 'P') {
4765 type
= XML_REGEXP_PUNCT_CONNECTOR
;
4766 } else if (cur
== 'd') {
4769 type
= XML_REGEXP_PUNCT_DASH
;
4770 } else if (cur
== 's') {
4773 type
= XML_REGEXP_PUNCT_OPEN
;
4774 } else if (cur
== 'e') {
4777 type
= XML_REGEXP_PUNCT_CLOSE
;
4778 } else if (cur
== 'i') {
4781 type
= XML_REGEXP_PUNCT_INITQUOTE
;
4782 } else if (cur
== 'f') {
4785 type
= XML_REGEXP_PUNCT_FINQUOTE
;
4786 } else if (cur
== 'o') {
4789 type
= XML_REGEXP_PUNCT_OTHERS
;
4791 /* all punctuation */
4792 type
= XML_REGEXP_PUNCT
;
4794 } else if (cur
== 'Z') {
4800 type
= XML_REGEXP_SEPAR_SPACE
;
4801 } else if (cur
== 'l') {
4804 type
= XML_REGEXP_SEPAR_LINE
;
4805 } else if (cur
== 'p') {
4808 type
= XML_REGEXP_SEPAR_PARA
;
4810 /* all separators */
4811 type
= XML_REGEXP_SEPAR
;
4813 } else if (cur
== 'S') {
4818 type
= XML_REGEXP_SYMBOL_MATH
;
4820 } else if (cur
== 'c') {
4822 type
= XML_REGEXP_SYMBOL_CURRENCY
;
4824 } else if (cur
== 'k') {
4826 type
= XML_REGEXP_SYMBOL_MODIFIER
;
4828 } else if (cur
== 'o') {
4830 type
= XML_REGEXP_SYMBOL_OTHERS
;
4834 type
= XML_REGEXP_SYMBOL
;
4836 } else if (cur
== 'C') {
4842 type
= XML_REGEXP_OTHER_CONTROL
;
4843 } else if (cur
== 'f') {
4846 type
= XML_REGEXP_OTHER_FORMAT
;
4847 } else if (cur
== 'o') {
4850 type
= XML_REGEXP_OTHER_PRIVATE
;
4851 } else if (cur
== 'n') {
4854 type
= XML_REGEXP_OTHER_NA
;
4857 type
= XML_REGEXP_OTHER
;
4859 } else if (cur
== 'I') {
4860 const xmlChar
*start
;
4864 ERROR("IsXXXX expected");
4870 if (((cur
>= 'a') && (cur
<= 'z')) ||
4871 ((cur
>= 'A') && (cur
<= 'Z')) ||
4872 ((cur
>= '0') && (cur
<= '9')) ||
4876 while (((cur
>= 'a') && (cur
<= 'z')) ||
4877 ((cur
>= 'A') && (cur
<= 'Z')) ||
4878 ((cur
>= '0') && (cur
<= '9')) ||
4884 type
= XML_REGEXP_BLOCK_NAME
;
4885 blockName
= xmlStrndup(start
, ctxt
->cur
- start
);
4887 ERROR("Unknown char property");
4890 if (ctxt
->atom
== NULL
) {
4891 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
4892 if (ctxt
->atom
== NULL
) {
4896 ctxt
->atom
->valuep
= blockName
;
4897 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
4898 if (xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
4899 type
, 0, 0, blockName
) == NULL
) {
4905 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt
)
4907 int val
= 0, i
, cur
;
4908 for (i
= 0; i
< 4; i
++) {
4912 if (cur
>= '0' && cur
<= '9') {
4914 } else if (cur
>= 'A' && cur
<= 'F') {
4915 val
+= cur
- 'A' + 10;
4916 } else if (cur
>= 'a' && cur
<= 'f') {
4917 val
+= cur
- 'a' + 10;
4919 ERROR("Expecting hex digit");
4926 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt
)
4928 int val
= parse_escaped_codeunit(ctxt
);
4929 if (0xD800 <= val
&& val
<= 0xDBFF) {
4934 int low
= parse_escaped_codeunit(ctxt
);
4935 if (0xDC00 <= low
&& low
<= 0xDFFF) {
4936 return (val
- 0xD800) * 0x400 + (low
- 0xDC00) + 0x10000;
4940 ERROR("Invalid low surrogate pair code unit");
4947 * xmlFAParseCharClassEsc:
4948 * @ctxt: a regexp parser context
4950 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4951 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4952 * [25] catEsc ::= '\p{' charProp '}'
4953 * [26] complEsc ::= '\P{' charProp '}'
4954 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4957 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt
) {
4961 if (ctxt
->atom
== NULL
) {
4962 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_ANYCHAR
);
4963 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
4964 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
4965 XML_REGEXP_ANYCHAR
, 0, 0, NULL
);
4971 ERROR("Escaped sequence: expecting \\");
4979 ERROR("Expecting '{'");
4983 xmlFAParseCharProp(ctxt
);
4985 ERROR("Expecting '}'");
4989 } else if (cur
== 'P') {
4992 ERROR("Expecting '{'");
4996 xmlFAParseCharProp(ctxt
);
4997 if (ctxt
->atom
!= NULL
)
4998 ctxt
->atom
->neg
= 1;
5000 ERROR("Expecting '}'");
5004 } else if ((cur
== 'n') || (cur
== 'r') || (cur
== 't') || (cur
== '\\') ||
5005 (cur
== '|') || (cur
== '.') || (cur
== '?') || (cur
== '*') ||
5006 (cur
== '+') || (cur
== '(') || (cur
== ')') || (cur
== '{') ||
5007 (cur
== '}') || (cur
== 0x2D) || (cur
== 0x5B) || (cur
== 0x5D) ||
5010 /* Non-standard escape sequences:
5011 * Java 1.8|.NET Core 3.1|MSXML 6 */
5012 (cur
== '!') || /* + | + | + */
5013 (cur
== '"') || /* + | + | + */
5014 (cur
== '#') || /* + | + | + */
5015 (cur
== '$') || /* + | + | + */
5016 (cur
== '%') || /* + | + | + */
5017 (cur
== ',') || /* + | + | + */
5018 (cur
== '/') || /* + | + | + */
5019 (cur
== ':') || /* + | + | + */
5020 (cur
== ';') || /* + | + | + */
5021 (cur
== '=') || /* + | + | + */
5022 (cur
== '>') || /* | + | + */
5023 (cur
== '@') || /* + | + | + */
5024 (cur
== '`') || /* + | + | + */
5025 (cur
== '~') || /* + | + | + */
5026 (cur
== 'u')) { /* | + | + */
5027 if (ctxt
->atom
== NULL
) {
5028 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
5029 if (ctxt
->atom
!= NULL
) {
5032 ctxt
->atom
->codepoint
= '\n';
5035 ctxt
->atom
->codepoint
= '\r';
5038 ctxt
->atom
->codepoint
= '\t';
5041 cur
= parse_escaped_codepoint(ctxt
);
5045 ctxt
->atom
->codepoint
= cur
;
5048 ctxt
->atom
->codepoint
= cur
;
5051 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
5063 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5064 XML_REGEXP_CHARVAL
, cur
, cur
, NULL
);
5067 } else if ((cur
== 's') || (cur
== 'S') || (cur
== 'i') || (cur
== 'I') ||
5068 (cur
== 'c') || (cur
== 'C') || (cur
== 'd') || (cur
== 'D') ||
5069 (cur
== 'w') || (cur
== 'W')) {
5070 xmlRegAtomType type
= XML_REGEXP_ANYSPACE
;
5074 type
= XML_REGEXP_ANYSPACE
;
5077 type
= XML_REGEXP_NOTSPACE
;
5080 type
= XML_REGEXP_INITNAME
;
5083 type
= XML_REGEXP_NOTINITNAME
;
5086 type
= XML_REGEXP_NAMECHAR
;
5089 type
= XML_REGEXP_NOTNAMECHAR
;
5092 type
= XML_REGEXP_DECIMAL
;
5095 type
= XML_REGEXP_NOTDECIMAL
;
5098 type
= XML_REGEXP_REALCHAR
;
5101 type
= XML_REGEXP_NOTREALCHAR
;
5105 if (ctxt
->atom
== NULL
) {
5106 ctxt
->atom
= xmlRegNewAtom(ctxt
, type
);
5107 } else if (ctxt
->atom
->type
== XML_REGEXP_RANGES
) {
5108 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5112 ERROR("Wrong escape sequence, misuse of character '\\'");
5117 * xmlFAParseCharRange:
5118 * @ctxt: a regexp parser context
5120 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
5121 * [18] seRange ::= charOrEsc '-' charOrEsc
5122 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5123 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5124 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5127 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt
) {
5133 ERROR("Expecting ']'");
5142 case 'n': start
= 0xA; break;
5143 case 'r': start
= 0xD; break;
5144 case 't': start
= 0x9; break;
5145 case '\\': case '|': case '.': case '-': case '^': case '?':
5146 case '*': case '+': case '{': case '}': case '(': case ')':
5150 ERROR("Invalid escape value");
5155 } else if ((cur
!= 0x5B) && (cur
!= 0x5D)) {
5156 end
= start
= CUR_SCHAR(ctxt
->cur
, len
);
5158 ERROR("Expecting a char range");
5162 * Since we are "inside" a range, we can assume ctxt->cur is past
5163 * the start of ctxt->string, and PREV should be safe
5165 if ((start
== '-') && (NXT(1) != ']') && (PREV
!= '[') && (PREV
!= '^')) {
5171 if ((cur
!= '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5172 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5173 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
5182 case 'n': end
= 0xA; break;
5183 case 'r': end
= 0xD; break;
5184 case 't': end
= 0x9; break;
5185 case '\\': case '|': case '.': case '-': case '^': case '?':
5186 case '*': case '+': case '{': case '}': case '(': case ')':
5190 ERROR("Invalid escape value");
5194 } else if ((cur
!= '\0') && (cur
!= 0x5B) && (cur
!= 0x5D)) {
5195 end
= CUR_SCHAR(ctxt
->cur
, len
);
5197 ERROR("Expecting the end of a char range");
5201 /* TODO check that the values are acceptable character ranges for XML */
5203 ERROR("End of range is before start of range");
5206 xmlRegAtomAddRange(ctxt
, ctxt
->atom
, ctxt
->neg
,
5207 XML_REGEXP_CHARVAL
, start
, end
, NULL
);
5213 * xmlFAParsePosCharGroup:
5214 * @ctxt: a regexp parser context
5216 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5219 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt
) {
5222 xmlFAParseCharClassEsc(ctxt
);
5224 xmlFAParseCharRange(ctxt
);
5226 } while ((CUR
!= ']') && (CUR
!= '-') &&
5227 (CUR
!= 0) && (ctxt
->error
== 0));
5231 * xmlFAParseCharGroup:
5232 * @ctxt: a regexp parser context
5234 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5235 * [15] negCharGroup ::= '^' posCharGroup
5236 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5237 * [12] charClassExpr ::= '[' charGroup ']'
5240 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt
) {
5241 int neg
= ctxt
->neg
;
5245 ctxt
->neg
= !ctxt
->neg
;
5246 xmlFAParsePosCharGroup(ctxt
);
5249 while ((CUR
!= ']') && (ctxt
->error
== 0)) {
5250 if ((CUR
== '-') && (NXT(1) == '[')) {
5251 NEXT
; /* eat the '-' */
5252 NEXT
; /* eat the '[' */
5254 xmlFAParseCharGroup(ctxt
);
5259 ERROR("charClassExpr: ']' expected");
5263 xmlFAParsePosCharGroup(ctxt
);
5269 * xmlFAParseCharClass:
5270 * @ctxt: a regexp parser context
5272 * [11] charClass ::= charClassEsc | charClassExpr
5273 * [12] charClassExpr ::= '[' charGroup ']'
5276 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt
) {
5279 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_RANGES
);
5280 if (ctxt
->atom
== NULL
)
5282 xmlFAParseCharGroup(ctxt
);
5286 ERROR("xmlFAParseCharClass: ']' expected");
5289 xmlFAParseCharClassEsc(ctxt
);
5294 * xmlFAParseQuantExact:
5295 * @ctxt: a regexp parser context
5297 * [8] QuantExact ::= [0-9]+
5299 * Returns 0 if success or -1 in case of error
5302 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt
) {
5307 while ((CUR
>= '0') && (CUR
<= '9')) {
5308 if (ret
> INT_MAX
/ 10) {
5311 int digit
= CUR
- '0';
5314 if (ret
> INT_MAX
- digit
)
5322 if ((ok
!= 1) || (overflow
== 1)) {
5329 * xmlFAParseQuantifier:
5330 * @ctxt: a regexp parser context
5332 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5333 * [5] quantity ::= quantRange | quantMin | QuantExact
5334 * [6] quantRange ::= QuantExact ',' QuantExact
5335 * [7] quantMin ::= QuantExact ','
5336 * [8] QuantExact ::= [0-9]+
5339 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt
) {
5343 if ((cur
== '?') || (cur
== '*') || (cur
== '+')) {
5344 if (ctxt
->atom
!= NULL
) {
5346 ctxt
->atom
->quant
= XML_REGEXP_QUANT_OPT
;
5347 else if (cur
== '*')
5348 ctxt
->atom
->quant
= XML_REGEXP_QUANT_MULT
;
5349 else if (cur
== '+')
5350 ctxt
->atom
->quant
= XML_REGEXP_QUANT_PLUS
;
5356 int min
= 0, max
= 0;
5359 cur
= xmlFAParseQuantExact(ctxt
);
5363 ERROR("Improper quantifier");
5370 cur
= xmlFAParseQuantExact(ctxt
);
5374 ERROR("Improper quantifier");
5381 ERROR("Unterminated quantifier");
5385 if (ctxt
->atom
!= NULL
) {
5386 ctxt
->atom
->quant
= XML_REGEXP_QUANT_RANGE
;
5387 ctxt
->atom
->min
= min
;
5388 ctxt
->atom
->max
= max
;
5397 * @ctxt: a regexp parser context
5399 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5402 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt
) {
5405 codepoint
= xmlFAIsChar(ctxt
);
5406 if (codepoint
> 0) {
5407 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_CHARVAL
);
5408 if (ctxt
->atom
== NULL
)
5410 codepoint
= CUR_SCHAR(ctxt
->cur
, len
);
5411 ctxt
->atom
->codepoint
= codepoint
;
5414 } else if (CUR
== '|') {
5416 } else if (CUR
== 0) {
5418 } else if (CUR
== ')') {
5420 } else if (CUR
== '(') {
5421 xmlRegStatePtr start
, oldend
, start0
;
5424 if (ctxt
->depth
>= 50) {
5425 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5429 * this extra Epsilon transition is needed if we count with 0 allowed
5430 * unfortunately this can't be known at that point
5432 xmlFAGenerateEpsilonTransition(ctxt
, ctxt
->state
, NULL
);
5433 start0
= ctxt
->state
;
5434 xmlFAGenerateEpsilonTransition(ctxt
, ctxt
->state
, NULL
);
5435 start
= ctxt
->state
;
5440 xmlFAParseRegExp(ctxt
, 0);
5445 ERROR("xmlFAParseAtom: expecting ')'");
5447 ctxt
->atom
= xmlRegNewAtom(ctxt
, XML_REGEXP_SUBREG
);
5448 if (ctxt
->atom
== NULL
)
5450 ctxt
->atom
->start
= start
;
5451 ctxt
->atom
->start0
= start0
;
5452 ctxt
->atom
->stop
= ctxt
->state
;
5455 } else if ((CUR
== '[') || (CUR
== '\\') || (CUR
== '.')) {
5456 xmlFAParseCharClass(ctxt
);
5464 * @ctxt: a regexp parser context
5466 * [3] piece ::= atom quantifier?
5469 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt
) {
5473 ret
= xmlFAParseAtom(ctxt
);
5476 if (ctxt
->atom
== NULL
) {
5477 ERROR("internal: no atom generated");
5479 xmlFAParseQuantifier(ctxt
);
5485 * @ctxt: a regexp parser context
5486 * @to: optional target to the end of the branch
5488 * @to is used to optimize by removing duplicate path in automata
5489 * in expressions like (a|b)(c|d)
5491 * [2] branch ::= piece*
5494 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt
, xmlRegStatePtr to
) {
5495 xmlRegStatePtr previous
;
5498 previous
= ctxt
->state
;
5499 ret
= xmlFAParsePiece(ctxt
);
5502 xmlFAGenerateEpsilonTransition(ctxt
, previous
, to
);
5504 if (xmlFAGenerateTransitions(ctxt
, previous
,
5505 (CUR
=='|' || CUR
==')' || CUR
==0) ? to
: NULL
,
5507 xmlRegFreeAtom(ctxt
->atom
);
5511 previous
= ctxt
->state
;
5514 while ((ret
!= 0) && (ctxt
->error
== 0)) {
5515 ret
= xmlFAParsePiece(ctxt
);
5517 if (xmlFAGenerateTransitions(ctxt
, previous
,
5518 (CUR
=='|' || CUR
==')' || CUR
==0) ? to
: NULL
,
5520 xmlRegFreeAtom(ctxt
->atom
);
5524 previous
= ctxt
->state
;
5533 * @ctxt: a regexp parser context
5534 * @top: is this the top-level expression ?
5536 * [1] regExp ::= branch ( '|' branch )*
5539 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt
, int top
) {
5540 xmlRegStatePtr start
, end
;
5542 /* if not top start should have been generated by an epsilon trans */
5543 start
= ctxt
->state
;
5545 xmlFAParseBranch(ctxt
, NULL
);
5547 #ifdef DEBUG_REGEXP_GRAPH
5548 printf("State %d is final\n", ctxt
->state
->no
);
5550 ctxt
->state
->type
= XML_REGEXP_FINAL_STATE
;
5553 ctxt
->end
= ctxt
->state
;
5557 while ((CUR
== '|') && (ctxt
->error
== 0)) {
5559 ctxt
->state
= start
;
5561 xmlFAParseBranch(ctxt
, end
);
5569 /************************************************************************
5573 ************************************************************************/
5577 * @output: the file for the output debug
5578 * @regexp: the compiled regexp
5580 * Print the content of the compiled regular expression
5583 xmlRegexpPrint(FILE *output
, xmlRegexpPtr regexp
) {
5588 fprintf(output
, " regexp: ");
5589 if (regexp
== NULL
) {
5590 fprintf(output
, "NULL\n");
5593 fprintf(output
, "'%s' ", regexp
->string
);
5594 fprintf(output
, "\n");
5595 fprintf(output
, "%d atoms:\n", regexp
->nbAtoms
);
5596 for (i
= 0;i
< regexp
->nbAtoms
; i
++) {
5597 fprintf(output
, " %02d ", i
);
5598 xmlRegPrintAtom(output
, regexp
->atoms
[i
]);
5600 fprintf(output
, "%d states:", regexp
->nbStates
);
5601 fprintf(output
, "\n");
5602 for (i
= 0;i
< regexp
->nbStates
; i
++) {
5603 xmlRegPrintState(output
, regexp
->states
[i
]);
5605 fprintf(output
, "%d counters:\n", regexp
->nbCounters
);
5606 for (i
= 0;i
< regexp
->nbCounters
; i
++) {
5607 fprintf(output
, " %d: min %d max %d\n", i
, regexp
->counters
[i
].min
,
5608 regexp
->counters
[i
].max
);
5614 * @regexp: a regular expression string
5616 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5617 * Appendix F and builds an automata suitable for testing strings against
5618 * that regular expression
5620 * Returns the compiled expression or NULL in case of error
5623 xmlRegexpCompile(const xmlChar
*regexp
) {
5624 xmlRegexpPtr ret
= NULL
;
5625 xmlRegParserCtxtPtr ctxt
;
5627 ctxt
= xmlRegNewParserCtxt(regexp
);
5631 /* initialize the parser */
5632 ctxt
->state
= xmlRegStatePush(ctxt
);
5633 if (ctxt
->state
== NULL
)
5635 ctxt
->start
= ctxt
->state
;
5638 /* parse the expression building an automata */
5639 xmlFAParseRegExp(ctxt
, 1);
5641 ERROR("xmlFAParseRegExp: extra characters");
5643 if (ctxt
->error
!= 0)
5645 ctxt
->end
= ctxt
->state
;
5646 ctxt
->start
->type
= XML_REGEXP_START_STATE
;
5647 ctxt
->end
->type
= XML_REGEXP_FINAL_STATE
;
5649 /* remove the Epsilon except for counted transitions */
5650 xmlFAEliminateEpsilonTransitions(ctxt
);
5653 if (ctxt
->error
!= 0)
5655 ret
= xmlRegEpxFromParse(ctxt
);
5658 xmlRegFreeParserCtxt(ctxt
);
5664 * @comp: the compiled regular expression
5665 * @content: the value to check against the regular expression
5667 * Check if the regular expression generates the value
5669 * Returns 1 if it matches, 0 if not and a negative value in case of error
5672 xmlRegexpExec(xmlRegexpPtr comp
, const xmlChar
*content
) {
5673 if ((comp
== NULL
) || (content
== NULL
))
5675 return(xmlFARegExec(comp
, content
));
5679 * xmlRegexpIsDeterminist:
5680 * @comp: the compiled regular expression
5682 * Check if the regular expression is determinist
5684 * Returns 1 if it yes, 0 if not and a negative value in case of error
5687 xmlRegexpIsDeterminist(xmlRegexpPtr comp
) {
5693 if (comp
->determinist
!= -1)
5694 return(comp
->determinist
);
5696 am
= xmlNewAutomata();
5699 if (am
->states
!= NULL
) {
5702 for (i
= 0;i
< am
->nbStates
;i
++)
5703 xmlRegFreeState(am
->states
[i
]);
5704 xmlFree(am
->states
);
5706 am
->nbAtoms
= comp
->nbAtoms
;
5707 am
->atoms
= comp
->atoms
;
5708 am
->nbStates
= comp
->nbStates
;
5709 am
->states
= comp
->states
;
5710 am
->determinist
= -1;
5711 am
->flags
= comp
->flags
;
5712 ret
= xmlFAComputesDeterminism(am
);
5715 xmlFreeAutomata(am
);
5716 comp
->determinist
= ret
;
5722 * @regexp: the regexp
5727 xmlRegFreeRegexp(xmlRegexpPtr regexp
) {
5732 if (regexp
->string
!= NULL
)
5733 xmlFree(regexp
->string
);
5734 if (regexp
->states
!= NULL
) {
5735 for (i
= 0;i
< regexp
->nbStates
;i
++)
5736 xmlRegFreeState(regexp
->states
[i
]);
5737 xmlFree(regexp
->states
);
5739 if (regexp
->atoms
!= NULL
) {
5740 for (i
= 0;i
< regexp
->nbAtoms
;i
++)
5741 xmlRegFreeAtom(regexp
->atoms
[i
]);
5742 xmlFree(regexp
->atoms
);
5744 if (regexp
->counters
!= NULL
)
5745 xmlFree(regexp
->counters
);
5746 if (regexp
->compact
!= NULL
)
5747 xmlFree(regexp
->compact
);
5748 if (regexp
->transdata
!= NULL
)
5749 xmlFree(regexp
->transdata
);
5750 if (regexp
->stringMap
!= NULL
) {
5751 for (i
= 0; i
< regexp
->nbstrings
;i
++)
5752 xmlFree(regexp
->stringMap
[i
]);
5753 xmlFree(regexp
->stringMap
);
5759 #ifdef LIBXML_AUTOMATA_ENABLED
5760 /************************************************************************
5762 * The Automata interface *
5764 ************************************************************************/
5769 * Create a new automata
5771 * Returns the new object or NULL in case of failure
5774 xmlNewAutomata(void) {
5775 xmlAutomataPtr ctxt
;
5777 ctxt
= xmlRegNewParserCtxt(NULL
);
5781 /* initialize the parser */
5782 ctxt
->state
= xmlRegStatePush(ctxt
);
5783 if (ctxt
->state
== NULL
) {
5784 xmlFreeAutomata(ctxt
);
5787 ctxt
->start
= ctxt
->state
;
5790 ctxt
->start
->type
= XML_REGEXP_START_STATE
;
5803 xmlFreeAutomata(xmlAutomataPtr am
) {
5806 xmlRegFreeParserCtxt(am
);
5810 * xmlAutomataSetFlags:
5812 * @flags: a set of internal flags
5814 * Set some flags on the automata
5817 xmlAutomataSetFlags(xmlAutomataPtr am
, int flags
) {
5824 * xmlAutomataGetInitState:
5827 * Initial state lookup
5829 * Returns the initial state of the automata
5832 xmlAutomataGetInitState(xmlAutomataPtr am
) {
5839 * xmlAutomataSetFinalState:
5841 * @state: a state in this automata
5843 * Makes that state a final state
5845 * Returns 0 or -1 in case of error
5848 xmlAutomataSetFinalState(xmlAutomataPtr am
, xmlAutomataStatePtr state
) {
5849 if ((am
== NULL
) || (state
== NULL
))
5851 state
->type
= XML_REGEXP_FINAL_STATE
;
5856 * xmlAutomataNewTransition:
5858 * @from: the starting point of the transition
5859 * @to: the target point of the transition or NULL
5860 * @token: the input string associated to that transition
5861 * @data: data passed to the callback function if the transition is activated
5863 * If @to is NULL, this creates first a new target state in the automata
5864 * and then adds a transition from the @from state to the target state
5865 * activated by the value of @token
5867 * Returns the target state or NULL in case of error
5870 xmlAutomataNewTransition(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5871 xmlAutomataStatePtr to
, const xmlChar
*token
,
5875 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5877 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5881 atom
->valuep
= xmlStrdup(token
);
5883 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
5884 xmlRegFreeAtom(atom
);
5893 * xmlAutomataNewTransition2:
5895 * @from: the starting point of the transition
5896 * @to: the target point of the transition or NULL
5897 * @token: the first input string associated to that transition
5898 * @token2: the second input string associated to that transition
5899 * @data: data passed to the callback function if the transition is activated
5901 * If @to is NULL, this creates first a new target state in the automata
5902 * and then adds a transition from the @from state to the target state
5903 * activated by the value of @token
5905 * Returns the target state or NULL in case of error
5908 xmlAutomataNewTransition2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5909 xmlAutomataStatePtr to
, const xmlChar
*token
,
5910 const xmlChar
*token2
, void *data
) {
5913 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5915 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5919 if ((token2
== NULL
) || (*token2
== 0)) {
5920 atom
->valuep
= xmlStrdup(token
);
5925 lenn
= strlen((char *) token2
);
5926 lenp
= strlen((char *) token
);
5928 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
5930 xmlRegFreeAtom(atom
);
5933 memcpy(&str
[0], token
, lenp
);
5935 memcpy(&str
[lenp
+ 1], token2
, lenn
);
5936 str
[lenn
+ lenp
+ 1] = 0;
5941 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
5942 xmlRegFreeAtom(atom
);
5951 * xmlAutomataNewNegTrans:
5953 * @from: the starting point of the transition
5954 * @to: the target point of the transition or NULL
5955 * @token: the first input string associated to that transition
5956 * @token2: the second input string associated to that transition
5957 * @data: data passed to the callback function if the transition is activated
5959 * If @to is NULL, this creates first a new target state in the automata
5960 * and then adds a transition from the @from state to the target state
5961 * activated by any value except (@token,@token2)
5962 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5963 # the semantic of XSD ##other
5965 * Returns the target state or NULL in case of error
5968 xmlAutomataNewNegTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
5969 xmlAutomataStatePtr to
, const xmlChar
*token
,
5970 const xmlChar
*token2
, void *data
) {
5972 xmlChar err_msg
[200];
5974 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
5976 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
5981 if ((token2
== NULL
) || (*token2
== 0)) {
5982 atom
->valuep
= xmlStrdup(token
);
5987 lenn
= strlen((char *) token2
);
5988 lenp
= strlen((char *) token
);
5990 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
5992 xmlRegFreeAtom(atom
);
5995 memcpy(&str
[0], token
, lenp
);
5997 memcpy(&str
[lenp
+ 1], token2
, lenn
);
5998 str
[lenn
+ lenp
+ 1] = 0;
6002 snprintf((char *) err_msg
, 199, "not %s", (const char *) atom
->valuep
);
6004 atom
->valuep2
= xmlStrdup(err_msg
);
6006 if (xmlFAGenerateTransitions(am
, from
, to
, atom
) < 0) {
6007 xmlRegFreeAtom(atom
);
6017 * xmlAutomataNewCountTrans2:
6019 * @from: the starting point of the transition
6020 * @to: the target point of the transition or NULL
6021 * @token: the input string associated to that transition
6022 * @token2: the second input string associated to that transition
6023 * @min: the minimum successive occurrences of token
6024 * @max: the maximum successive occurrences of token
6025 * @data: data associated to the transition
6027 * If @to is NULL, this creates first a new target state in the automata
6028 * and then adds a transition from the @from state to the target state
6029 * activated by a succession of input of value @token and @token2 and
6030 * whose number is between @min and @max
6032 * Returns the target state or NULL in case of error
6035 xmlAutomataNewCountTrans2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6036 xmlAutomataStatePtr to
, const xmlChar
*token
,
6037 const xmlChar
*token2
,
6038 int min
, int max
, void *data
) {
6042 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6046 if ((max
< min
) || (max
< 1))
6048 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6051 if ((token2
== NULL
) || (*token2
== 0)) {
6052 atom
->valuep
= xmlStrdup(token
);
6053 if (atom
->valuep
== NULL
)
6059 lenn
= strlen((char *) token2
);
6060 lenp
= strlen((char *) token
);
6062 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
6065 memcpy(&str
[0], token
, lenp
);
6067 memcpy(&str
[lenp
+ 1], token2
, lenn
);
6068 str
[lenn
+ lenp
+ 1] = 0;
6080 * associate a counter to the transition.
6082 counter
= xmlRegGetCounter(am
);
6085 am
->counters
[counter
].min
= min
;
6086 am
->counters
[counter
].max
= max
;
6088 /* xmlFAGenerateTransitions(am, from, to, atom); */
6090 to
= xmlRegStatePush(am
);
6094 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6095 if (xmlRegAtomPush(am
, atom
) < 0)
6104 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6108 xmlRegFreeAtom(atom
);
6113 * xmlAutomataNewCountTrans:
6115 * @from: the starting point of the transition
6116 * @to: the target point of the transition or NULL
6117 * @token: the input string associated to that transition
6118 * @min: the minimum successive occurrences of token
6119 * @max: the maximum successive occurrences of token
6120 * @data: data associated to the transition
6122 * If @to is NULL, this creates first a new target state in the automata
6123 * and then adds a transition from the @from state to the target state
6124 * activated by a succession of input of value @token and whose number
6125 * is between @min and @max
6127 * Returns the target state or NULL in case of error
6130 xmlAutomataNewCountTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6131 xmlAutomataStatePtr to
, const xmlChar
*token
,
6132 int min
, int max
, void *data
) {
6136 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6140 if ((max
< min
) || (max
< 1))
6142 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6145 atom
->valuep
= xmlStrdup(token
);
6146 if (atom
->valuep
== NULL
)
6156 * associate a counter to the transition.
6158 counter
= xmlRegGetCounter(am
);
6161 am
->counters
[counter
].min
= min
;
6162 am
->counters
[counter
].max
= max
;
6164 /* xmlFAGenerateTransitions(am, from, to, atom); */
6166 to
= xmlRegStatePush(am
);
6170 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6171 if (xmlRegAtomPush(am
, atom
) < 0)
6180 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6184 xmlRegFreeAtom(atom
);
6189 * xmlAutomataNewOnceTrans2:
6191 * @from: the starting point of the transition
6192 * @to: the target point of the transition or NULL
6193 * @token: the input string associated to that transition
6194 * @token2: the second input string associated to that transition
6195 * @min: the minimum successive occurrences of token
6196 * @max: the maximum successive occurrences of token
6197 * @data: data associated to the transition
6199 * If @to is NULL, this creates first a new target state in the automata
6200 * and then adds a transition from the @from state to the target state
6201 * activated by a succession of input of value @token and @token2 and whose
6202 * number is between @min and @max, moreover that transition can only be
6205 * Returns the target state or NULL in case of error
6208 xmlAutomataNewOnceTrans2(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6209 xmlAutomataStatePtr to
, const xmlChar
*token
,
6210 const xmlChar
*token2
,
6211 int min
, int max
, void *data
) {
6215 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6221 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6224 if ((token2
== NULL
) || (*token2
== 0)) {
6225 atom
->valuep
= xmlStrdup(token
);
6226 if (atom
->valuep
== NULL
)
6232 lenn
= strlen((char *) token2
);
6233 lenp
= strlen((char *) token
);
6235 str
= (xmlChar
*) xmlMallocAtomic(lenn
+ lenp
+ 2);
6238 memcpy(&str
[0], token
, lenp
);
6240 memcpy(&str
[lenp
+ 1], token2
, lenn
);
6241 str
[lenn
+ lenp
+ 1] = 0;
6246 atom
->quant
= XML_REGEXP_QUANT_ONCEONLY
;
6250 * associate a counter to the transition.
6252 counter
= xmlRegGetCounter(am
);
6255 am
->counters
[counter
].min
= 1;
6256 am
->counters
[counter
].max
= 1;
6258 /* xmlFAGenerateTransitions(am, from, to, atom); */
6260 to
= xmlRegStatePush(am
);
6264 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6265 if (xmlRegAtomPush(am
, atom
) < 0)
6271 xmlRegFreeAtom(atom
);
6278 * xmlAutomataNewOnceTrans:
6280 * @from: the starting point of the transition
6281 * @to: the target point of the transition or NULL
6282 * @token: the input string associated to that transition
6283 * @min: the minimum successive occurrences of token
6284 * @max: the maximum successive occurrences of token
6285 * @data: data associated to the transition
6287 * If @to is NULL, this creates first a new target state in the automata
6288 * and then adds a transition from the @from state to the target state
6289 * activated by a succession of input of value @token and whose number
6290 * is between @min and @max, moreover that transition can only be crossed
6293 * Returns the target state or NULL in case of error
6296 xmlAutomataNewOnceTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6297 xmlAutomataStatePtr to
, const xmlChar
*token
,
6298 int min
, int max
, void *data
) {
6302 if ((am
== NULL
) || (from
== NULL
) || (token
== NULL
))
6308 atom
= xmlRegNewAtom(am
, XML_REGEXP_STRING
);
6311 atom
->valuep
= xmlStrdup(token
);
6313 atom
->quant
= XML_REGEXP_QUANT_ONCEONLY
;
6317 * associate a counter to the transition.
6319 counter
= xmlRegGetCounter(am
);
6322 am
->counters
[counter
].min
= 1;
6323 am
->counters
[counter
].max
= 1;
6325 /* xmlFAGenerateTransitions(am, from, to, atom); */
6327 to
= xmlRegStatePush(am
);
6331 xmlRegStateAddTrans(am
, from
, atom
, to
, counter
, -1);
6332 if (xmlRegAtomPush(am
, atom
) < 0)
6338 xmlRegFreeAtom(atom
);
6343 * xmlAutomataNewState:
6346 * Create a new disconnected state in the automata
6348 * Returns the new state or NULL in case of error
6351 xmlAutomataNewState(xmlAutomataPtr am
) {
6354 return(xmlRegStatePush(am
));
6358 * xmlAutomataNewEpsilon:
6360 * @from: the starting point of the transition
6361 * @to: the target point of the transition or NULL
6363 * If @to is NULL, this creates first a new target state in the automata
6364 * and then adds an epsilon transition from the @from state to the
6367 * Returns the target state or NULL in case of error
6370 xmlAutomataNewEpsilon(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6371 xmlAutomataStatePtr to
) {
6372 if ((am
== NULL
) || (from
== NULL
))
6374 xmlFAGenerateEpsilonTransition(am
, from
, to
);
6381 * xmlAutomataNewAllTrans:
6383 * @from: the starting point of the transition
6384 * @to: the target point of the transition or NULL
6385 * @lax: allow to transition if not all all transitions have been activated
6387 * If @to is NULL, this creates first a new target state in the automata
6388 * and then adds a an ALL transition from the @from state to the
6389 * target state. That transition is an epsilon transition allowed only when
6390 * all transitions from the @from node have been activated.
6392 * Returns the target state or NULL in case of error
6395 xmlAutomataNewAllTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6396 xmlAutomataStatePtr to
, int lax
) {
6397 if ((am
== NULL
) || (from
== NULL
))
6399 xmlFAGenerateAllTransition(am
, from
, to
, lax
);
6406 * xmlAutomataNewCounter:
6408 * @min: the minimal value on the counter
6409 * @max: the maximal value on the counter
6411 * Create a new counter
6413 * Returns the counter number or -1 in case of error
6416 xmlAutomataNewCounter(xmlAutomataPtr am
, int min
, int max
) {
6422 ret
= xmlRegGetCounter(am
);
6425 am
->counters
[ret
].min
= min
;
6426 am
->counters
[ret
].max
= max
;
6431 * xmlAutomataNewCountedTrans:
6433 * @from: the starting point of the transition
6434 * @to: the target point of the transition or NULL
6435 * @counter: the counter associated to that transition
6437 * If @to is NULL, this creates first a new target state in the automata
6438 * and then adds an epsilon transition from the @from state to the target state
6439 * which will increment the counter provided
6441 * Returns the target state or NULL in case of error
6444 xmlAutomataNewCountedTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6445 xmlAutomataStatePtr to
, int counter
) {
6446 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
6448 xmlFAGenerateCountedEpsilonTransition(am
, from
, to
, counter
);
6455 * xmlAutomataNewCounterTrans:
6457 * @from: the starting point of the transition
6458 * @to: the target point of the transition or NULL
6459 * @counter: the counter associated to that transition
6461 * If @to is NULL, this creates first a new target state in the automata
6462 * and then adds an epsilon transition from the @from state to the target state
6463 * which will be allowed only if the counter is within the right range.
6465 * Returns the target state or NULL in case of error
6468 xmlAutomataNewCounterTrans(xmlAutomataPtr am
, xmlAutomataStatePtr from
,
6469 xmlAutomataStatePtr to
, int counter
) {
6470 if ((am
== NULL
) || (from
== NULL
) || (counter
< 0))
6472 xmlFAGenerateCountedTransition(am
, from
, to
, counter
);
6479 * xmlAutomataCompile:
6482 * Compile the automata into a Reg Exp ready for being executed.
6483 * The automata should be free after this point.
6485 * Returns the compiled regexp or NULL in case of error
6488 xmlAutomataCompile(xmlAutomataPtr am
) {
6491 if ((am
== NULL
) || (am
->error
!= 0)) return(NULL
);
6492 xmlFAEliminateEpsilonTransitions(am
);
6493 /* xmlFAComputesDeterminism(am); */
6494 ret
= xmlRegEpxFromParse(am
);
6500 * xmlAutomataIsDeterminist:
6503 * Checks if an automata is determinist.
6505 * Returns 1 if true, 0 if not, and -1 in case of error
6508 xmlAutomataIsDeterminist(xmlAutomataPtr am
) {
6514 ret
= xmlFAComputesDeterminism(am
);
6517 #endif /* LIBXML_AUTOMATA_ENABLED */
6519 #ifdef LIBXML_EXPR_ENABLED
6520 /************************************************************************
6522 * Formal Expression handling code *
6524 ************************************************************************/
6525 /************************************************************************
6527 * Expression handling context *
6529 ************************************************************************/
6531 struct _xmlExpCtxt
{
6533 xmlExpNodePtr
*table
;
6546 * @maxNodes: the maximum number of nodes
6547 * @dict: optional dictionary to use internally
6549 * Creates a new context for manipulating expressions
6551 * Returns the context or NULL in case of error
6554 xmlExpNewCtxt(int maxNodes
, xmlDictPtr dict
) {
6558 if (maxNodes
<= 4096)
6561 ret
= (xmlExpCtxtPtr
) xmlMalloc(sizeof(xmlExpCtxt
));
6564 memset(ret
, 0, sizeof(xmlExpCtxt
));
6567 ret
->maxNodes
= maxNodes
;
6568 ret
->table
= xmlMalloc(size
* sizeof(xmlExpNodePtr
));
6569 if (ret
->table
== NULL
) {
6573 memset(ret
->table
, 0, size
* sizeof(xmlExpNodePtr
));
6575 ret
->dict
= xmlDictCreate();
6576 if (ret
->dict
== NULL
) {
6577 xmlFree(ret
->table
);
6583 xmlDictReference(ret
->dict
);
6590 * @ctxt: an expression context
6592 * Free an expression context
6595 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt
) {
6598 xmlDictFree(ctxt
->dict
);
6599 if (ctxt
->table
!= NULL
)
6600 xmlFree(ctxt
->table
);
6604 /************************************************************************
6606 * Structure associated to an expression node *
6608 ************************************************************************/
6609 #define MAX_NODES 10000
6611 /* #define DEBUG_DERIV */
6616 * - public API for creation
6619 * - regression testing
6622 * - split into module and test tool
6627 XML_EXP_NILABLE
= (1 << 0)
6630 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6632 struct _xmlExpNode
{
6633 unsigned char type
;/* xmlExpNodeType */
6634 unsigned char info
;/* OR of xmlExpNodeInfo */
6635 unsigned short key
; /* the hash key */
6636 unsigned int ref
; /* The number of references */
6637 int c_max
; /* the maximum length it can consume */
6638 xmlExpNodePtr exp_left
;
6639 xmlExpNodePtr next
;/* the next node in the hash table or free list */
6646 xmlExpNodePtr f_right
;
6648 const xmlChar
*f_str
;
6652 #define exp_min field.count.f_min
6653 #define exp_max field.count.f_max
6654 /* #define exp_left field.children.f_left */
6655 #define exp_right field.children.f_right
6656 #define exp_str field.f_str
6658 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
);
6659 static xmlExpNode forbiddenExpNode
= {
6660 XML_EXP_FORBID
, 0, 0, 0, 0, NULL
, NULL
, {{ 0, 0}}
6662 xmlExpNodePtr forbiddenExp
= &forbiddenExpNode
;
6663 static xmlExpNode emptyExpNode
= {
6664 XML_EXP_EMPTY
, 1, 0, 0, 0, NULL
, NULL
, {{ 0, 0}}
6666 xmlExpNodePtr emptyExp
= &emptyExpNode
;
6668 /************************************************************************
6670 * The custom hash table for unicity and canonicalization *
6671 * of sub-expressions pointers *
6673 ************************************************************************/
6675 * xmlExpHashNameComputeKey:
6676 * Calculate the hash key for a token
6678 static unsigned short
6679 xmlExpHashNameComputeKey(const xmlChar
*name
) {
6680 unsigned short value
= 0L;
6684 value
+= 30 * (*name
);
6685 while ((ch
= *name
++) != 0) {
6686 value
= value
^ ((value
<< 5) + (value
>> 3) + (unsigned long)ch
);
6693 * xmlExpHashComputeKey:
6694 * Calculate the hash key for a compound expression
6696 static unsigned short
6697 xmlExpHashComputeKey(xmlExpNodeType type
, xmlExpNodePtr left
,
6698 xmlExpNodePtr right
) {
6699 unsigned long value
;
6705 value
+= right
->key
;
6707 ret
= (unsigned short) value
;
6711 value
+= right
->key
;
6713 ret
= (unsigned short) value
;
6717 value
+= right
->key
;
6718 ret
= (unsigned short) value
;
6727 static xmlExpNodePtr
6728 xmlExpNewNode(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
) {
6731 if (ctxt
->nb_nodes
>= MAX_NODES
)
6733 ret
= (xmlExpNodePtr
) xmlMalloc(sizeof(xmlExpNode
));
6736 memset(ret
, 0, sizeof(xmlExpNode
));
6745 * xmlExpHashGetEntry:
6746 * @table: the hash table
6748 * Get the unique entry from the hash table. The entry is created if
6749 * needed. @left and @right are consumed, i.e. their ref count will
6750 * be decremented by the operation.
6752 * Returns the pointer or NULL in case of error
6754 static xmlExpNodePtr
6755 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt
, xmlExpNodeType type
,
6756 xmlExpNodePtr left
, xmlExpNodePtr right
,
6757 const xmlChar
*name
, int min
, int max
) {
6758 unsigned short kbase
, key
;
6759 xmlExpNodePtr entry
;
6760 xmlExpNodePtr insert
;
6766 * Check for duplicate and insertion location.
6768 if (type
== XML_EXP_ATOM
) {
6769 kbase
= xmlExpHashNameComputeKey(name
);
6770 } else if (type
== XML_EXP_COUNT
) {
6771 /* COUNT reduction rule 1 */
6778 xmlExpFree(ctxt
, left
);
6783 xmlExpFree(ctxt
, left
);
6784 return(forbiddenExp
);
6791 } else if (type
== XML_EXP_OR
) {
6792 /* Forbid reduction rules */
6793 if (left
->type
== XML_EXP_FORBID
) {
6794 xmlExpFree(ctxt
, left
);
6797 if (right
->type
== XML_EXP_FORBID
) {
6798 xmlExpFree(ctxt
, right
);
6802 /* OR reduction rule 1 */
6803 /* a | a reduced to a */
6804 if (left
== right
) {
6805 xmlExpFree(ctxt
, right
);
6808 /* OR canonicalization rule 1 */
6809 /* linearize (a | b) | c into a | (b | c) */
6810 if ((left
->type
== XML_EXP_OR
) && (right
->type
!= XML_EXP_OR
)) {
6811 xmlExpNodePtr tmp
= left
;
6815 /* OR reduction rule 2 */
6816 /* a | (a | b) and b | (a | b) are reduced to a | b */
6817 if (right
->type
== XML_EXP_OR
) {
6818 if ((left
== right
->exp_left
) ||
6819 (left
== right
->exp_right
)) {
6820 xmlExpFree(ctxt
, left
);
6824 /* OR canonicalization rule 2 */
6825 /* linearize (a | b) | c into a | (b | c) */
6826 if (left
->type
== XML_EXP_OR
) {
6829 /* OR canonicalization rule 2 */
6830 if ((left
->exp_right
->type
!= XML_EXP_OR
) &&
6831 (left
->exp_right
->key
< left
->exp_left
->key
)) {
6832 tmp
= left
->exp_right
;
6833 left
->exp_right
= left
->exp_left
;
6834 left
->exp_left
= tmp
;
6836 left
->exp_right
->ref
++;
6837 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
->exp_right
, right
,
6839 left
->exp_left
->ref
++;
6840 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
->exp_left
, tmp
,
6843 xmlExpFree(ctxt
, left
);
6846 if (right
->type
== XML_EXP_OR
) {
6847 /* Ordering in the tree */
6848 /* C | (A | B) -> A | (B | C) */
6849 if (left
->key
> right
->exp_right
->key
) {
6851 right
->exp_right
->ref
++;
6852 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_right
,
6854 right
->exp_left
->ref
++;
6855 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_left
,
6857 xmlExpFree(ctxt
, right
);
6860 /* Ordering in the tree */
6861 /* B | (A | C) -> A | (B | C) */
6862 if (left
->key
> right
->exp_left
->key
) {
6864 right
->exp_right
->ref
++;
6865 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
,
6866 right
->exp_right
, NULL
, 0, 0);
6867 right
->exp_left
->ref
++;
6868 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, right
->exp_left
,
6870 xmlExpFree(ctxt
, right
);
6874 /* we know both types are != XML_EXP_OR here */
6875 else if (left
->key
> right
->key
) {
6876 xmlExpNodePtr tmp
= left
;
6880 kbase
= xmlExpHashComputeKey(type
, left
, right
);
6881 } else if (type
== XML_EXP_SEQ
) {
6882 /* Forbid reduction rules */
6883 if (left
->type
== XML_EXP_FORBID
) {
6884 xmlExpFree(ctxt
, right
);
6887 if (right
->type
== XML_EXP_FORBID
) {
6888 xmlExpFree(ctxt
, left
);
6891 /* Empty reduction rules */
6892 if (right
->type
== XML_EXP_EMPTY
) {
6895 if (left
->type
== XML_EXP_EMPTY
) {
6898 kbase
= xmlExpHashComputeKey(type
, left
, right
);
6902 key
= kbase
% ctxt
->size
;
6903 if (ctxt
->table
[key
] != NULL
) {
6904 for (insert
= ctxt
->table
[key
]; insert
!= NULL
;
6905 insert
= insert
->next
) {
6906 if ((insert
->key
== kbase
) &&
6907 (insert
->type
== type
)) {
6908 if (type
== XML_EXP_ATOM
) {
6909 if (name
== insert
->exp_str
) {
6913 } else if (type
== XML_EXP_COUNT
) {
6914 if ((insert
->exp_min
== min
) && (insert
->exp_max
== max
) &&
6915 (insert
->exp_left
== left
)) {
6920 } else if ((insert
->exp_left
== left
) &&
6921 (insert
->exp_right
== right
)) {
6931 entry
= xmlExpNewNode(ctxt
, type
);
6935 if (type
== XML_EXP_ATOM
) {
6936 entry
->exp_str
= name
;
6938 } else if (type
== XML_EXP_COUNT
) {
6939 entry
->exp_min
= min
;
6940 entry
->exp_max
= max
;
6941 entry
->exp_left
= left
;
6942 if ((min
== 0) || (IS_NILLABLE(left
)))
6943 entry
->info
|= XML_EXP_NILABLE
;
6947 entry
->c_max
= max
* entry
->exp_left
->c_max
;
6949 entry
->exp_left
= left
;
6950 entry
->exp_right
= right
;
6951 if (type
== XML_EXP_OR
) {
6952 if ((IS_NILLABLE(left
)) || (IS_NILLABLE(right
)))
6953 entry
->info
|= XML_EXP_NILABLE
;
6954 if ((entry
->exp_left
->c_max
== -1) ||
6955 (entry
->exp_right
->c_max
== -1))
6957 else if (entry
->exp_left
->c_max
> entry
->exp_right
->c_max
)
6958 entry
->c_max
= entry
->exp_left
->c_max
;
6960 entry
->c_max
= entry
->exp_right
->c_max
;
6962 if ((IS_NILLABLE(left
)) && (IS_NILLABLE(right
)))
6963 entry
->info
|= XML_EXP_NILABLE
;
6964 if ((entry
->exp_left
->c_max
== -1) ||
6965 (entry
->exp_right
->c_max
== -1))
6968 entry
->c_max
= entry
->exp_left
->c_max
+ entry
->exp_right
->c_max
;
6972 if (ctxt
->table
[key
] != NULL
)
6973 entry
->next
= ctxt
->table
[key
];
6975 ctxt
->table
[key
] = entry
;
6983 * @ctxt: the expression context
6984 * @exp: the expression
6986 * Dereference the expression
6989 xmlExpFree(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
) {
6990 if ((exp
== NULL
) || (exp
== forbiddenExp
) || (exp
== emptyExp
))
6993 if (exp
->ref
== 0) {
6996 /* Unlink it first from the hash table */
6997 key
= exp
->key
% ctxt
->size
;
6998 if (ctxt
->table
[key
] == exp
) {
6999 ctxt
->table
[key
] = exp
->next
;
7003 tmp
= ctxt
->table
[key
];
7004 while (tmp
!= NULL
) {
7005 if (tmp
->next
== exp
) {
7006 tmp
->next
= exp
->next
;
7013 if ((exp
->type
== XML_EXP_SEQ
) || (exp
->type
== XML_EXP_OR
)) {
7014 xmlExpFree(ctxt
, exp
->exp_left
);
7015 xmlExpFree(ctxt
, exp
->exp_right
);
7016 } else if (exp
->type
== XML_EXP_COUNT
) {
7017 xmlExpFree(ctxt
, exp
->exp_left
);
7026 * @exp: the expression
7028 * Increase the reference count of the expression
7031 xmlExpRef(xmlExpNodePtr exp
) {
7038 * @ctxt: the expression context
7039 * @name: the atom name
7040 * @len: the atom name length in byte (or -1);
7042 * Get the atom associated to this name from that context
7044 * Returns the node or NULL in case of error
7047 xmlExpNewAtom(xmlExpCtxtPtr ctxt
, const xmlChar
*name
, int len
) {
7048 if ((ctxt
== NULL
) || (name
== NULL
))
7050 name
= xmlDictLookup(ctxt
->dict
, name
, len
);
7053 return(xmlExpHashGetEntry(ctxt
, XML_EXP_ATOM
, NULL
, NULL
, name
, 0, 0));
7058 * @ctxt: the expression context
7059 * @left: left expression
7060 * @right: right expression
7062 * Get the atom associated to the choice @left | @right
7063 * Note that @left and @right are consumed in the operation, to keep
7064 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7065 * this is true even in case of failure (unless ctxt == NULL).
7067 * Returns the node or NULL in case of error
7070 xmlExpNewOr(xmlExpCtxtPtr ctxt
, xmlExpNodePtr left
, xmlExpNodePtr right
) {
7073 if ((left
== NULL
) || (right
== NULL
)) {
7074 xmlExpFree(ctxt
, left
);
7075 xmlExpFree(ctxt
, right
);
7078 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, left
, right
, NULL
, 0, 0));
7083 * @ctxt: the expression context
7084 * @left: left expression
7085 * @right: right expression
7087 * Get the atom associated to the sequence @left , @right
7088 * Note that @left and @right are consumed in the operation, to keep
7089 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7090 * this is true even in case of failure (unless ctxt == NULL).
7092 * Returns the node or NULL in case of error
7095 xmlExpNewSeq(xmlExpCtxtPtr ctxt
, xmlExpNodePtr left
, xmlExpNodePtr right
) {
7098 if ((left
== NULL
) || (right
== NULL
)) {
7099 xmlExpFree(ctxt
, left
);
7100 xmlExpFree(ctxt
, right
);
7103 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, left
, right
, NULL
, 0, 0));
7108 * @ctxt: the expression context
7109 * @subset: the expression to be repeated
7110 * @min: the lower bound for the repetition
7111 * @max: the upper bound for the repetition, -1 means infinite
7113 * Get the atom associated to the range (@subset){@min, @max}
7114 * Note that @subset is consumed in the operation, to keep
7115 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7116 * this is true even in case of failure (unless ctxt == NULL).
7118 * Returns the node or NULL in case of error
7121 xmlExpNewRange(xmlExpCtxtPtr ctxt
, xmlExpNodePtr subset
, int min
, int max
) {
7124 if ((subset
== NULL
) || (min
< 0) || (max
< -1) ||
7125 ((max
>= 0) && (min
> max
))) {
7126 xmlExpFree(ctxt
, subset
);
7129 return(xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, subset
,
7130 NULL
, NULL
, min
, max
));
7133 /************************************************************************
7135 * Public API for operations on expressions *
7137 ************************************************************************/
7140 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7141 const xmlChar
**list
, int len
, int nb
) {
7144 switch (exp
->type
) {
7148 for (tmp
= 0;tmp
< nb
;tmp
++)
7149 if (list
[tmp
] == exp
->exp_str
)
7153 list
[nb
] = exp
->exp_str
;
7156 exp
= exp
->exp_left
;
7160 tmp
= xmlExpGetLanguageInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7163 tmp2
= xmlExpGetLanguageInt(ctxt
, exp
->exp_right
, list
, len
,
7173 * xmlExpGetLanguage:
7174 * @ctxt: the expression context
7175 * @exp: the expression
7176 * @langList: where to store the tokens
7177 * @len: the allocated length of @list
7179 * Find all the strings used in @exp and store them in @list
7181 * Returns the number of unique strings found, -1 in case of errors and
7182 * -2 if there is more than @len strings
7185 xmlExpGetLanguage(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7186 const xmlChar
**langList
, int len
) {
7187 if ((ctxt
== NULL
) || (exp
== NULL
) || (langList
== NULL
) || (len
<= 0))
7189 return(xmlExpGetLanguageInt(ctxt
, exp
, langList
, len
, 0));
7193 xmlExpGetStartInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7194 const xmlChar
**list
, int len
, int nb
) {
7197 switch (exp
->type
) {
7198 case XML_EXP_FORBID
:
7203 for (tmp
= 0;tmp
< nb
;tmp
++)
7204 if (list
[tmp
] == exp
->exp_str
)
7208 list
[nb
] = exp
->exp_str
;
7211 exp
= exp
->exp_left
;
7214 tmp
= xmlExpGetStartInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7217 if (IS_NILLABLE(exp
->exp_left
)) {
7218 tmp2
= xmlExpGetStartInt(ctxt
, exp
->exp_right
, list
, len
,
7226 tmp
= xmlExpGetStartInt(ctxt
, exp
->exp_left
, list
, len
, nb
);
7229 tmp2
= xmlExpGetStartInt(ctxt
, exp
->exp_right
, list
, len
,
7240 * @ctxt: the expression context
7241 * @exp: the expression
7242 * @tokList: where to store the tokens
7243 * @len: the allocated length of @list
7245 * Find all the strings that appears at the start of the languages
7246 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7247 * it will return the list [a, c]
7249 * Returns the number of unique strings found, -1 in case of errors and
7250 * -2 if there is more than @len strings
7253 xmlExpGetStart(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7254 const xmlChar
**tokList
, int len
) {
7255 if ((ctxt
== NULL
) || (exp
== NULL
) || (tokList
== NULL
) || (len
<= 0))
7257 return(xmlExpGetStartInt(ctxt
, exp
, tokList
, len
, 0));
7262 * @exp: the expression
7264 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7266 * Returns 1 if nillable, 0 if not and -1 in case of error
7269 xmlExpIsNillable(xmlExpNodePtr exp
) {
7272 return(IS_NILLABLE(exp
) != 0);
7275 static xmlExpNodePtr
7276 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, const xmlChar
*str
)
7280 switch (exp
->type
) {
7282 return(forbiddenExp
);
7283 case XML_EXP_FORBID
:
7284 return(forbiddenExp
);
7286 if (exp
->exp_str
== str
) {
7288 printf("deriv atom: equal => Empty\n");
7293 printf("deriv atom: mismatch => forbid\n");
7295 /* TODO wildcards here */
7303 printf("deriv or: => or(derivs)\n");
7305 tmp
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7309 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_right
, str
);
7311 xmlExpFree(ctxt
, tmp
);
7314 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, tmp
, ret
,
7320 printf("deriv seq: starting with left\n");
7322 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7325 } else if (ret
== forbiddenExp
) {
7326 if (IS_NILLABLE(exp
->exp_left
)) {
7328 printf("deriv seq: left failed but nillable\n");
7330 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_right
, str
);
7334 printf("deriv seq: left match => sequence\n");
7336 exp
->exp_right
->ref
++;
7337 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, exp
->exp_right
,
7341 case XML_EXP_COUNT
: {
7345 if (exp
->exp_max
== 0)
7346 return(forbiddenExp
);
7347 ret
= xmlExpStringDeriveInt(ctxt
, exp
->exp_left
, str
);
7350 if (ret
== forbiddenExp
) {
7352 printf("deriv count: pattern mismatch => forbid\n");
7356 if (exp
->exp_max
== 1)
7358 if (exp
->exp_max
< 0) /* unbounded */
7361 max
= exp
->exp_max
- 1;
7362 if (exp
->exp_min
> 0)
7363 min
= exp
->exp_min
- 1;
7366 exp
->exp_left
->ref
++;
7367 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
, NULL
,
7369 if (ret
== emptyExp
) {
7371 printf("deriv count: match to empty => new count\n");
7376 printf("deriv count: match => sequence with new count\n");
7378 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, tmp
,
7386 * xmlExpStringDerive:
7387 * @ctxt: the expression context
7388 * @exp: the expression
7390 * @len: the string len in bytes if available
7392 * Do one step of Brzozowski derivation of the expression @exp with
7393 * respect to the input string
7395 * Returns the resulting expression or NULL in case of internal error
7398 xmlExpStringDerive(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7399 const xmlChar
*str
, int len
) {
7400 const xmlChar
*input
;
7402 if ((exp
== NULL
) || (ctxt
== NULL
) || (str
== NULL
)) {
7406 * check the string is in the dictionary, if yes use an interned
7407 * copy, otherwise we know it's not an acceptable input
7409 input
= xmlDictExists(ctxt
->dict
, str
, len
);
7410 if (input
== NULL
) {
7411 return(forbiddenExp
);
7413 return(xmlExpStringDeriveInt(ctxt
, exp
, input
));
7417 xmlExpCheckCard(xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7420 if (sub
->c_max
== -1) {
7421 if (exp
->c_max
!= -1)
7423 } else if ((exp
->c_max
>= 0) && (exp
->c_max
< sub
->c_max
)) {
7427 if ((IS_NILLABLE(sub
)) && (!IS_NILLABLE(exp
)))
7433 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
,
7437 * @ctxt: the expressions context
7438 * @exp: the englobing expression
7439 * @sub: the subexpression
7440 * @mult: the multiple expression
7441 * @remain: the remain from the derivation of the multiple
7443 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7444 * so that sub{n} subsume exp
7446 * Returns the multiple value if successful, 0 if it is not a multiple
7447 * and -1 in case of internal error.
7451 xmlExpDivide(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
,
7452 xmlExpNodePtr
*mult
, xmlExpNodePtr
*remain
) {
7454 xmlExpNodePtr tmp
, tmp2
;
7456 if (mult
!= NULL
) *mult
= NULL
;
7457 if (remain
!= NULL
) *remain
= NULL
;
7458 if (exp
->c_max
== -1) return(0);
7459 if (IS_NILLABLE(exp
) && (!IS_NILLABLE(sub
))) return(0);
7461 for (i
= 1;i
<= exp
->c_max
;i
++) {
7463 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
,
7464 sub
, NULL
, NULL
, i
, i
);
7468 if (!xmlExpCheckCard(tmp
, exp
)) {
7469 xmlExpFree(ctxt
, tmp
);
7472 tmp2
= xmlExpExpDeriveInt(ctxt
, tmp
, exp
);
7474 xmlExpFree(ctxt
, tmp
);
7477 if ((tmp2
!= forbiddenExp
) && (IS_NILLABLE(tmp2
))) {
7481 xmlExpFree(ctxt
, tmp2
);
7485 xmlExpFree(ctxt
, tmp
);
7487 printf("Divide succeeded %d\n", i
);
7491 xmlExpFree(ctxt
, tmp
);
7492 xmlExpFree(ctxt
, tmp2
);
7495 printf("Divide failed\n");
7501 * xmlExpExpDeriveInt:
7502 * @ctxt: the expressions context
7503 * @exp: the englobing expression
7504 * @sub: the subexpression
7506 * Try to do a step of Brzozowski derivation but at a higher level
7507 * the input being a subexpression.
7509 * Returns the resulting expression or NULL in case of internal error
7511 static xmlExpNodePtr
7512 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7513 xmlExpNodePtr ret
, tmp
, tmp2
, tmp3
;
7514 const xmlChar
**tab
;
7518 * In case of equality and if the expression can only consume a finite
7519 * amount, then the derivation is empty
7521 if ((exp
== sub
) && (exp
->c_max
>= 0)) {
7523 printf("Equal(exp, sub) and finite -> Empty\n");
7528 * decompose sub sequence first
7530 if (sub
->type
== XML_EXP_EMPTY
) {
7532 printf("Empty(sub) -> Empty\n");
7537 if (sub
->type
== XML_EXP_SEQ
) {
7539 printf("Seq(sub) -> decompose\n");
7541 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_left
);
7544 if (tmp
== forbiddenExp
)
7546 ret
= xmlExpExpDeriveInt(ctxt
, tmp
, sub
->exp_right
);
7547 xmlExpFree(ctxt
, tmp
);
7550 if (sub
->type
== XML_EXP_OR
) {
7552 printf("Or(sub) -> decompose\n");
7554 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_left
);
7555 if (tmp
== forbiddenExp
)
7559 ret
= xmlExpExpDeriveInt(ctxt
, exp
, sub
->exp_right
);
7560 if ((ret
== NULL
) || (ret
== forbiddenExp
)) {
7561 xmlExpFree(ctxt
, tmp
);
7564 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, tmp
, ret
, NULL
, 0, 0));
7566 if (!xmlExpCheckCard(exp
, sub
)) {
7568 printf("CheckCard(exp, sub) failed -> Forbid\n");
7570 return(forbiddenExp
);
7572 switch (exp
->type
) {
7574 if (sub
== emptyExp
)
7577 printf("Empty(exp) -> Forbid\n");
7579 return(forbiddenExp
);
7580 case XML_EXP_FORBID
:
7582 printf("Forbid(exp) -> Forbid\n");
7584 return(forbiddenExp
);
7586 if (sub
->type
== XML_EXP_ATOM
) {
7587 /* TODO: handle wildcards */
7588 if (exp
->exp_str
== sub
->exp_str
) {
7590 printf("Atom match -> Empty\n");
7595 printf("Atom mismatch -> Forbid\n");
7597 return(forbiddenExp
);
7599 if ((sub
->type
== XML_EXP_COUNT
) &&
7600 (sub
->exp_max
== 1) &&
7601 (sub
->exp_left
->type
== XML_EXP_ATOM
)) {
7602 /* TODO: handle wildcards */
7603 if (exp
->exp_str
== sub
->exp_left
->exp_str
) {
7605 printf("Atom match -> Empty\n");
7610 printf("Atom mismatch -> Forbid\n");
7612 return(forbiddenExp
);
7615 printf("Complex exp vs Atom -> Forbid\n");
7617 return(forbiddenExp
);
7619 /* try to get the sequence consumed only if possible */
7620 if (xmlExpCheckCard(exp
->exp_left
, sub
)) {
7621 /* See if the sequence can be consumed directly */
7623 printf("Seq trying left only\n");
7625 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7626 if ((ret
!= forbiddenExp
) && (ret
!= NULL
)) {
7628 printf("Seq trying left only worked\n");
7631 * TODO: assumption here that we are determinist
7632 * i.e. we won't get to a nillable exp left
7633 * subset which could be matched by the right
7635 * e.g.: (a | b)+,(a | c) and 'a+,a'
7637 exp
->exp_right
->ref
++;
7638 return(xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
,
7639 exp
->exp_right
, NULL
, 0, 0));
7643 printf("Seq: left too short\n");
7646 /* Try instead to decompose */
7647 if (sub
->type
== XML_EXP_COUNT
) {
7651 printf("Seq: sub is a count\n");
7653 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
->exp_left
);
7656 if (ret
!= forbiddenExp
) {
7658 printf("Seq , Count match on left\n");
7660 if (sub
->exp_max
< 0)
7663 max
= sub
->exp_max
-1;
7664 if (sub
->exp_min
> 0)
7665 min
= sub
->exp_min
-1;
7668 exp
->exp_right
->ref
++;
7669 tmp
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
,
7670 exp
->exp_right
, NULL
, 0, 0);
7674 sub
->exp_left
->ref
++;
7675 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
,
7676 sub
->exp_left
, NULL
, NULL
, min
, max
);
7678 xmlExpFree(ctxt
, tmp
);
7681 ret
= xmlExpExpDeriveInt(ctxt
, tmp
, tmp2
);
7682 xmlExpFree(ctxt
, tmp
);
7683 xmlExpFree(ctxt
, tmp2
);
7687 /* we made no progress on structured operations */
7691 printf("Or , trying both side\n");
7693 ret
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7696 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_right
, sub
);
7698 xmlExpFree(ctxt
, ret
);
7701 return(xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, tmp
, NULL
, 0, 0));
7702 case XML_EXP_COUNT
: {
7705 if (sub
->type
== XML_EXP_COUNT
) {
7707 * Try to see if the loop is completely subsumed
7709 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
->exp_left
);
7712 if (tmp
== forbiddenExp
) {
7716 printf("Count, Count inner don't subsume\n");
7718 mult
= xmlExpDivide(ctxt
, sub
->exp_left
, exp
->exp_left
,
7722 printf("Count, Count not multiple => forbidden\n");
7724 return(forbiddenExp
);
7726 if (sub
->exp_max
== -1) {
7728 if (exp
->exp_max
== -1) {
7729 if (exp
->exp_min
<= sub
->exp_min
* mult
)
7732 min
= exp
->exp_min
- sub
->exp_min
* mult
;
7735 printf("Count, Count finite can't subsume infinite\n");
7737 xmlExpFree(ctxt
, tmp
);
7738 return(forbiddenExp
);
7741 if (exp
->exp_max
== -1) {
7743 printf("Infinite loop consume mult finite loop\n");
7745 if (exp
->exp_min
> sub
->exp_min
* mult
) {
7747 min
= exp
->exp_min
- sub
->exp_min
* mult
;
7753 if (exp
->exp_max
< sub
->exp_max
* mult
) {
7755 printf("loops max mult mismatch => forbidden\n");
7757 xmlExpFree(ctxt
, tmp
);
7758 return(forbiddenExp
);
7760 if (sub
->exp_max
* mult
> exp
->exp_min
)
7763 min
= exp
->exp_min
- sub
->exp_max
* mult
;
7764 max
= exp
->exp_max
- sub
->exp_max
* mult
;
7767 } else if (!IS_NILLABLE(tmp
)) {
7769 * TODO: loop here to try to grow if working on finite
7773 printf("Count, Count remain not nillable => forbidden\n");
7775 xmlExpFree(ctxt
, tmp
);
7776 return(forbiddenExp
);
7777 } else if (sub
->exp_max
== -1) {
7778 if (exp
->exp_max
== -1) {
7779 if (exp
->exp_min
<= sub
->exp_min
) {
7781 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7787 printf("Infinite loops min => Count(X,Inf)\n");
7790 min
= exp
->exp_min
- sub
->exp_min
;
7792 } else if (exp
->exp_min
> sub
->exp_min
) {
7794 printf("loops min mismatch 1 => forbidden ???\n");
7796 xmlExpFree(ctxt
, tmp
);
7797 return(forbiddenExp
);
7803 if (exp
->exp_max
== -1) {
7805 printf("Infinite loop consume finite loop\n");
7807 if (exp
->exp_min
> sub
->exp_min
) {
7809 min
= exp
->exp_min
- sub
->exp_min
;
7815 if (exp
->exp_max
< sub
->exp_max
) {
7817 printf("loops max mismatch => forbidden\n");
7819 xmlExpFree(ctxt
, tmp
);
7820 return(forbiddenExp
);
7822 if (sub
->exp_max
> exp
->exp_min
)
7825 min
= exp
->exp_min
- sub
->exp_max
;
7826 max
= exp
->exp_max
- sub
->exp_max
;
7830 printf("loops match => SEQ(COUNT())\n");
7832 exp
->exp_left
->ref
++;
7833 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
,
7834 NULL
, NULL
, min
, max
);
7838 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, tmp
, tmp2
,
7842 tmp
= xmlExpExpDeriveInt(ctxt
, exp
->exp_left
, sub
);
7845 if (tmp
== forbiddenExp
) {
7847 printf("loop mismatch => forbidden\n");
7849 return(forbiddenExp
);
7851 if (exp
->exp_min
> 0)
7852 min
= exp
->exp_min
- 1;
7855 if (exp
->exp_max
< 0)
7858 max
= exp
->exp_max
- 1;
7861 printf("loop match => SEQ(COUNT())\n");
7863 exp
->exp_left
->ref
++;
7864 tmp2
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, exp
->exp_left
,
7865 NULL
, NULL
, min
, max
);
7868 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, tmp
, tmp2
,
7875 printf("Fallback to derivative\n");
7877 if (IS_NILLABLE(sub
)) {
7878 if (!(IS_NILLABLE(exp
)))
7879 return(forbiddenExp
);
7885 * here the structured derivation made no progress so
7886 * we use the default token based derivation to force one more step
7888 if (ctxt
->tabSize
== 0)
7891 tab
= (const xmlChar
**) xmlMalloc(ctxt
->tabSize
*
7892 sizeof(const xmlChar
*));
7898 * collect all the strings accepted by the subexpression on input
7900 len
= xmlExpGetStartInt(ctxt
, sub
, tab
, ctxt
->tabSize
, 0);
7902 const xmlChar
**temp
;
7903 temp
= (const xmlChar
**) xmlRealloc((xmlChar
**) tab
, ctxt
->tabSize
* 2 *
7904 sizeof(const xmlChar
*));
7906 xmlFree((xmlChar
**) tab
);
7911 len
= xmlExpGetStartInt(ctxt
, sub
, tab
, ctxt
->tabSize
, 0);
7913 for (i
= 0;i
< len
;i
++) {
7914 tmp
= xmlExpStringDeriveInt(ctxt
, exp
, tab
[i
]);
7915 if ((tmp
== NULL
) || (tmp
== forbiddenExp
)) {
7916 xmlExpFree(ctxt
, ret
);
7917 xmlFree((xmlChar
**) tab
);
7920 tmp2
= xmlExpStringDeriveInt(ctxt
, sub
, tab
[i
]);
7921 if ((tmp2
== NULL
) || (tmp2
== forbiddenExp
)) {
7922 xmlExpFree(ctxt
, tmp
);
7923 xmlExpFree(ctxt
, ret
);
7924 xmlFree((xmlChar
**) tab
);
7927 tmp3
= xmlExpExpDeriveInt(ctxt
, tmp
, tmp2
);
7928 xmlExpFree(ctxt
, tmp
);
7929 xmlExpFree(ctxt
, tmp2
);
7931 if ((tmp3
== NULL
) || (tmp3
== forbiddenExp
)) {
7932 xmlExpFree(ctxt
, ret
);
7933 xmlFree((xmlChar
**) tab
);
7940 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, tmp3
, NULL
, 0, 0);
7942 xmlFree((xmlChar
**) tab
);
7947 xmlFree((xmlChar
**) tab
);
7953 * @ctxt: the expressions context
7954 * @exp: the englobing expression
7955 * @sub: the subexpression
7957 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7958 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7959 * it usually takes less than linear time and can handle expressions generating
7960 * infinite languages.
7962 * Returns the resulting expression or NULL in case of internal error, the
7963 * result must be freed
7966 xmlExpExpDerive(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
7967 if ((exp
== NULL
) || (ctxt
== NULL
) || (sub
== NULL
))
7973 if (IS_NILLABLE(sub
) && (!IS_NILLABLE(exp
))) {
7975 printf("Sub nillable and not exp : can't subsume\n");
7977 return(forbiddenExp
);
7979 if (xmlExpCheckCard(exp
, sub
) == 0) {
7981 printf("sub generate longer sequences than exp : can't subsume\n");
7983 return(forbiddenExp
);
7985 return(xmlExpExpDeriveInt(ctxt
, exp
, sub
));
7990 * @ctxt: the expressions context
7991 * @exp: the englobing expression
7992 * @sub: the subexpression
7994 * Check whether @exp accepts all the languages accepted by @sub
7995 * the input being a subexpression.
7997 * Returns 1 if true 0 if false and -1 in case of failure.
8000 xmlExpSubsume(xmlExpCtxtPtr ctxt
, xmlExpNodePtr exp
, xmlExpNodePtr sub
) {
8003 if ((exp
== NULL
) || (ctxt
== NULL
) || (sub
== NULL
))
8007 * TODO: speedup by checking the language of sub is a subset of the
8013 if (IS_NILLABLE(sub
) && (!IS_NILLABLE(exp
))) {
8015 printf("Sub nillable and not exp : can't subsume\n");
8019 if (xmlExpCheckCard(exp
, sub
) == 0) {
8021 printf("sub generate longer sequences than exp : can't subsume\n");
8025 tmp
= xmlExpExpDeriveInt(ctxt
, exp
, sub
);
8027 printf("Result derivation :\n");
8032 if (tmp
== forbiddenExp
)
8034 if (tmp
== emptyExp
)
8036 if ((tmp
!= NULL
) && (IS_NILLABLE(tmp
))) {
8037 xmlExpFree(ctxt
, tmp
);
8040 xmlExpFree(ctxt
, tmp
);
8044 /************************************************************************
8046 * Parsing expression *
8048 ************************************************************************/
8050 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt
);
8053 #define CUR (*ctxt->cur)
8055 #define NEXT ctxt->cur++;
8057 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
8058 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
8061 xmlExpParseNumber(xmlExpCtxtPtr ctxt
) {
8069 if ((CUR
< '0') || (CUR
> '9'))
8071 while ((CUR
>= '0') && (CUR
<= '9')) {
8072 ret
= ret
* 10 + (CUR
- '0');
8078 static xmlExpNodePtr
8079 xmlExpParseOr(xmlExpCtxtPtr ctxt
) {
8086 if (*ctxt
->cur
== '(') {
8088 ret
= xmlExpParseExpr(ctxt
);
8090 if (*ctxt
->cur
!= ')') {
8091 fprintf(stderr
, "unbalanced '(' : %s\n", base
);
8092 xmlExpFree(ctxt
, ret
);
8097 goto parse_quantifier
;
8099 while ((CUR
!= 0) && (!(IS_BLANK(CUR
))) && (CUR
!= '(') &&
8100 (CUR
!= ')') && (CUR
!= '|') && (CUR
!= ',') && (CUR
!= '{') &&
8101 (CUR
!= '*') && (CUR
!= '+') && (CUR
!= '?') && (CUR
!= '}'))
8103 val
= xmlDictLookup(ctxt
->dict
, BAD_CAST base
, ctxt
->cur
- base
);
8106 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_ATOM
, NULL
, NULL
, val
, 0, 0);
8115 min
= xmlExpParseNumber(ctxt
);
8117 xmlExpFree(ctxt
, ret
);
8123 max
= xmlExpParseNumber(ctxt
);
8128 xmlExpFree(ctxt
, ret
);
8132 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8135 } else if (CUR
== '?') {
8137 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8140 } else if (CUR
== '+') {
8142 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8145 } else if (CUR
== '*') {
8147 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_COUNT
, ret
, NULL
, NULL
,
8155 static xmlExpNodePtr
8156 xmlExpParseSeq(xmlExpCtxtPtr ctxt
) {
8157 xmlExpNodePtr ret
, right
;
8159 ret
= xmlExpParseOr(ctxt
);
8161 while (CUR
== '|') {
8163 right
= xmlExpParseOr(ctxt
);
8164 if (right
== NULL
) {
8165 xmlExpFree(ctxt
, ret
);
8168 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_OR
, ret
, right
, NULL
, 0, 0);
8175 static xmlExpNodePtr
8176 xmlExpParseExpr(xmlExpCtxtPtr ctxt
) {
8177 xmlExpNodePtr ret
, right
;
8179 ret
= xmlExpParseSeq(ctxt
);
8181 while (CUR
== ',') {
8183 right
= xmlExpParseSeq(ctxt
);
8184 if (right
== NULL
) {
8185 xmlExpFree(ctxt
, ret
);
8188 ret
= xmlExpHashGetEntry(ctxt
, XML_EXP_SEQ
, ret
, right
, NULL
, 0, 0);
8197 * @ctxt: the expressions context
8198 * @expr: the 0 terminated string
8200 * Minimal parser for regexps, it understand the following constructs
8201 * - string terminals
8202 * - choice operator |
8203 * - sequence operator ,
8204 * - subexpressions (...)
8205 * - usual cardinality operators + * and ?
8206 * - finite sequences { min, max }
8207 * - infinite sequences { min, * }
8208 * There is minimal checkings made especially no checking on strings values
8210 * Returns a new expression or NULL in case of failure
8213 xmlExpParse(xmlExpCtxtPtr ctxt
, const char *expr
) {
8219 ret
= xmlExpParseExpr(ctxt
);
8221 if (*ctxt
->cur
!= 0) {
8222 xmlExpFree(ctxt
, ret
);
8229 xmlExpDumpInt(xmlBufferPtr buf
, xmlExpNodePtr expr
, int glob
) {
8232 if (expr
== NULL
) return;
8233 if (glob
) xmlBufferWriteChar(buf
, "(");
8234 switch (expr
->type
) {
8236 xmlBufferWriteChar(buf
, "empty");
8238 case XML_EXP_FORBID
:
8239 xmlBufferWriteChar(buf
, "forbidden");
8242 xmlBufferWriteCHAR(buf
, expr
->exp_str
);
8246 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8247 xmlExpDumpInt(buf
, c
, 1);
8249 xmlExpDumpInt(buf
, c
, 0);
8250 xmlBufferWriteChar(buf
, " , ");
8251 c
= expr
->exp_right
;
8252 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8253 xmlExpDumpInt(buf
, c
, 1);
8255 xmlExpDumpInt(buf
, c
, 0);
8259 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8260 xmlExpDumpInt(buf
, c
, 1);
8262 xmlExpDumpInt(buf
, c
, 0);
8263 xmlBufferWriteChar(buf
, " | ");
8264 c
= expr
->exp_right
;
8265 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8266 xmlExpDumpInt(buf
, c
, 1);
8268 xmlExpDumpInt(buf
, c
, 0);
8270 case XML_EXP_COUNT
: {
8274 if ((c
->type
== XML_EXP_SEQ
) || (c
->type
== XML_EXP_OR
))
8275 xmlExpDumpInt(buf
, c
, 1);
8277 xmlExpDumpInt(buf
, c
, 0);
8278 if ((expr
->exp_min
== 0) && (expr
->exp_max
== 1)) {
8281 } else if ((expr
->exp_min
== 0) && (expr
->exp_max
== -1)) {
8284 } else if ((expr
->exp_min
== 1) && (expr
->exp_max
== -1)) {
8287 } else if (expr
->exp_max
== expr
->exp_min
) {
8288 snprintf(rep
, 39, "{%d}", expr
->exp_min
);
8289 } else if (expr
->exp_max
< 0) {
8290 snprintf(rep
, 39, "{%d,inf}", expr
->exp_min
);
8292 snprintf(rep
, 39, "{%d,%d}", expr
->exp_min
, expr
->exp_max
);
8295 xmlBufferWriteChar(buf
, rep
);
8299 fprintf(stderr
, "Error in tree\n");
8302 xmlBufferWriteChar(buf
, ")");
8306 * @buf: a buffer to receive the output
8307 * @expr: the compiled expression
8309 * Serialize the expression as compiled to the buffer
8312 xmlExpDump(xmlBufferPtr buf
, xmlExpNodePtr expr
) {
8313 if ((buf
== NULL
) || (expr
== NULL
))
8315 xmlExpDumpInt(buf
, expr
, 0);
8320 * @expr: a compiled expression
8322 * Indicate the maximum number of input a expression can accept
8324 * Returns the maximum length or -1 in case of error
8327 xmlExpMaxToken(xmlExpNodePtr expr
) {
8330 return(expr
->c_max
);
8334 * xmlExpCtxtNbNodes:
8335 * @ctxt: an expression context
8337 * Debugging facility provides the number of allocated nodes at a that point
8339 * Returns the number of nodes in use or -1 in case of error
8342 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt
) {
8345 return(ctxt
->nb_nodes
);
8350 * @ctxt: an expression context
8352 * Debugging facility provides the number of allocated nodes over lifetime
8354 * Returns the number of nodes ever allocated or -1 in case of error
8357 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt
) {
8360 return(ctxt
->nb_cons
);
8363 #endif /* LIBXML_EXPR_ENABLED */
8365 #endif /* LIBXML_REGEXP_ENABLED */