3 Copyright (C
) 2003 Elwood C
. Downey
5 This library is free software
; you can redistribute it
and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation
; either
8 version
2.1 of the License
, or (at your option
) any later version
.
10 This library is distributed in the hope that it will be useful
,
11 but WITHOUT ANY WARRANTY
; without even the implied warranty of
12 MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE
. See the GNU
13 Lesser General Public License
for more details
.
15 You should have received a copy of the GNU Lesser General Public
16 License along with
this library
; if not, write to the Free Software
17 Foundation
, Inc
., 59 Temple Place
, Suite
330, Boston
, MA
02111-1307 USA
21 /* little DOM-style XML parser.
22 * only handles elements, attributes and pcdata content.
23 * <! ... > and <? ... > are silently ignored.
24 * pcdata is collected into one string, sans leading whitespace first line.
26 * #define MAIN_TST to create standalone test program
37 static int oneXMLchar (LilXML
*lp
, int c
, char errmsg
[]);
38 static void initParser(LilXML
*lp
);
39 static void pushXMLEle(LilXML
*lp
);
40 static void popXMLEle(LilXML
*lp
);
41 static void resetEndTag(LilXML
*lp
);
42 static void addAttr(LilXML
*lp
);
43 static void delAttr (XMLAtt
*a
);
44 static int isTokenChar (int start
, int c
);
45 static void growString (char **sp
, int c
);
46 static void growPCData (XMLEle
*ep
, int c
);
47 static char *newString (void);
48 static void *moremem (void *old
, int n
);
49 static void freemem (void *m
);
/* parsing states; these are the case labels oneXMLchar() switches on.
 * LOOK4START is explicitly 0; the remaining values follow in order.
 */
typedef enum {
    LOOK4START = 0,		/* looking for first element start */
    LOOK4TAG,			/* looking for element tag */
    INTAG,			/* reading tag */
    LOOK4ATTRN,			/* looking for attr name, > or / */
    INATTRN,			/* reading attr name */
    LOOK4ATTRV,			/* looking for attr value */
    SAWSLASH,			/* saw / in element opening */
    INATTRV,			/* in attr value */
    LOOK4CON,			/* skipping leading content whitespace */
    INCON,			/* reading content */
    LTINCON,			/* saw < in content */
    LOOK4CLOSETAG,		/* looking for closing tag after < */
    INCLOSETAG			/* reading closing tag */
} State;			/* parsing states */
67 /* maintain state while parsing */
69 State cs
; /* current state */
70 int ln
; /* line number for diags */
71 XMLEle
*ce
; /* current element being built */
72 char *endtag
; /* to check for match with opening tag*/
73 int delim
; /* attribute value delimiter */
74 int lastc
; /* last char (just used with skipping) */
75 int skipping
; /* in comment or declaration */
78 /* internal representation of a (possibly nested) XML element */
80 char *tag
; /* element tag */
81 struct _xml_ele
*pe
; /* parent element, or NULL if root */
82 XMLAtt
**at
; /* list of attributes */
83 int nat
; /* number of attributes */
84 int ait
; /* used to iterate over at[] */
85 struct _xml_ele
**el
; /* list of child elements */
86 int nel
; /* number of child elements */
87 int eit
; /* used to iterate over el[] */
88 char *pcdata
; /* character data in this element */
89 int pcdatal
; /* handy length sans \0 (tends to be big) */
92 /* internal representation of an attribute */
94 char *name
; /* name */
95 char *valu
; /* value */
96 struct _xml_ele
*ce
; /* containing element */
99 /* pass back a fresh handle for use with our other functions */
103 LilXML
*lp
= (LilXML
*) moremem (NULL
, sizeof(LilXML
));
110 delLilXML (LilXML
*lp
)
115 /* delete ep and all its children */
117 delXMLEle (XMLEle
*ep
)
125 /* delete all parts of ep */
127 freemem (ep
->pcdata
);
129 for (i
= 0; i
< ep
->nat
; i
++)
134 for (i
= 0; i
< ep
->nel
; i
++)
135 delXMLEle (ep
->el
[i
]);
139 /* delete ep itself */
143 /* process one more character of an XML file.
144 * when find closure with outer element return root of complete tree.
145 * when find error return NULL with reason in errmsg[].
146 * when need more return NULL with errmsg[0] = '\0'.
147 * N.B. it is up to the caller to delete the tree with delXMLEle().
150 readXMLEle (LilXML
*lp
, int newc
, char errmsg
[])
155 /* start optimistic */
160 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: XML EOF", lp
->ln
);
169 /* skip comments and declarations. requires 1 char history */
170 if (!lp
->skipping
&& lp
->lastc
== '<' && (newc
== '?' || newc
== '!')) {
186 /* do a pending '<' first then newc */
187 if (lp
->lastc
== '<') {
188 if (oneXMLchar (lp
, '<', errmsg
) < 0) {
192 /* N.B. we assume '<' will never result in closure */
195 /* process newc (at last!) */
196 s
= oneXMLchar (lp
, newc
, errmsg
);
206 /* Ok! return ce and we start over.
207 * N.B. up to caller to call delXMLEle with what we return.
215 /* search ep for an attribute with given name.
216 * return NULL if not found.
219 findXMLAtt (XMLEle
*ep
, const char *name
)
223 for (i
= 0; i
< ep
->nat
; i
++)
224 if (!strcmp (ep
->at
[i
]->name
, name
))
229 /* search ep for an element with given tag.
230 * return NULL if not found.
233 findXMLEle (XMLEle
*ep
, const char *tag
)
237 for (i
= 0; i
< ep
->nel
; i
++)
238 if (!strcmp (ep
->el
[i
]->tag
, tag
))
243 /* iterate over each child element of ep.
244 * call first time with init set to 1, then 0 from then on.
245 * returns NULL when no more or err
248 nextXMLEle (XMLEle
*ep
, int init
)
256 if (eit
< 0 || eit
>= ep
->nel
)
258 return (ep
->el
[eit
]);
261 /* iterate over each attribute of ep.
262 * call first time with init set to 1, then 0 from then on.
263 * returns NULL when no more or err
266 nextXMLAtt (XMLEle
*ep
, int init
)
274 if (ait
< 0 || ait
>= ep
->nat
)
276 return (ep
->at
[ait
]);
279 /* return parent of given XMLEle */
281 parentXMLEle (XMLEle
*ep
)
286 /* return parent element of given XMLAtt */
288 parentXMLAtt (XMLAtt
*ap
)
293 /* access functions */
295 /* return the tag name of the given element */
297 tagXMLEle (XMLEle
*ep
)
302 /* return the pcdata portion of the given element */
304 pcdataXMLEle (XMLEle
*ep
)
309 /* return the number of characters in the pcdata portion of the given element */
311 pcdatalenXMLEle (XMLEle
*ep
)
313 return (ep
->pcdatal
);
316 /* return the name of the given attribute */
318 nameXMLAtt (XMLAtt
*ap
)
323 /* return the value of the given attribute */
325 valuXMLAtt (XMLAtt
*ap
)
330 /* return the number of child elements of the given element */
337 /* return the number of attributes in the given element */
345 /* search ep for an attribute with the given name and return its value.
346 * return "" if not found.
349 findXMLAttValu (XMLEle
*ep
, char *name
)
351 XMLAtt
*a
= findXMLAtt (ep
, name
);
352 return (a
? a
->valu
: "");
355 /* handy wrapper to read one xml file.
356 * return root element else NULL with report in errmsg[]
359 readXMLFile (FILE *fp
, LilXML
*lp
, char errmsg
[])
363 while ((c
= fgetc(fp
)) != EOF
) {
364 XMLEle
*root
= readXMLEle (lp
, c
, errmsg
);
365 if (root
|| errmsg
[0])
372 /* sample print ep to fp
373 * N.B. set level = 0 on first call
375 #define PRINDENT 4 /* sample print indent each level */
377 prXMLEle (FILE *fp
, XMLEle
*ep
, int level
)
379 int indent
= level
*PRINDENT
;
382 fprintf (fp
, "%*s<%s", indent
, "", ep
->tag
);
383 for (i
= 0; i
< ep
->nat
; i
++)
384 fprintf (fp
, " %s=\"%s\"", ep
->at
[i
]->name
, ep
->at
[i
]->valu
);
387 for (i
= 0; i
< ep
->nel
; i
++)
388 prXMLEle (fp
, ep
->el
[i
], level
+1);
394 /* indent if none or one line */
395 nl
= strpbrk (ep
->pcdata
, "\n\r");
396 if (!nl
|| nl
== &ep
->pcdata
[ep
->pcdatal
-1])
397 fprintf (fp
, "%*s", indent
+PRINDENT
, "");
398 fprintf (fp
, "%s", ep
->pcdata
);
402 if (ep
->nel
> 0 || ep
->pcdata
[0])
403 fprintf (fp
, "%*s</%s>\n", indent
, "", ep
->tag
);
405 fprintf (fp
, "/>\n");
410 /* process one more char in XML file.
411 * if find final closure, return 1 and tree is in ce.
412 * if need more, return 0.
413 * if real trouble, return -1 and put reason in errmsg.
416 oneXMLchar (LilXML
*lp
, int c
, char errmsg
[])
419 case LOOK4START
: /* looking for first element start */
424 /* silently ignore until resync */
427 case LOOK4TAG
: /* looking for element tag */
428 if (isTokenChar (1, c
)) {
429 growString (&lp
->ce
->tag
, c
);
431 } else if (!isspace(c
)) {
432 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus tag char %c", lp
->ln
, c
);
437 case INTAG
: /* reading tag */
438 if (isTokenChar (0, c
))
439 growString (&lp
->ce
->tag
, c
);
448 case LOOK4ATTRN
: /* looking for attr name, > or / */
453 else if (isTokenChar (1, c
)) {
455 growString (&lp
->ce
->at
[lp
->ce
->nat
-1]->name
, c
);
457 } else if (!isspace(c
)) {
458 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus leading attr name char: %c",
464 case SAWSLASH
: /* saw / in element opening */
467 return(1); /* root has no content */
471 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus char %c before >", lp
->ln
, c
);
476 case INATTRN
: /* reading attr name */
477 if (isTokenChar (0, c
))
478 growString (&lp
->ce
->at
[lp
->ce
->nat
-1]->name
, c
);
479 else if (isspace(c
) || c
== '=')
482 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus attr name char: %c", lp
->ln
,c
);
487 case LOOK4ATTRV
: /* looking for attr value */
488 if (c
== '\'' || c
== '"') {
490 growString (&lp
->ce
->at
[lp
->ce
->nat
-1]->valu
, '\0');
492 } else if (!(isspace(c
) || c
== '=')) {
493 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: No value for attribute %.100s", lp
->ln
,
494 lp
->ce
->at
[lp
->ce
->nat
-1]->name
);
499 case INATTRV
: /* in attr value */
502 else if (!iscntrl(c
))
503 growString (&lp
->ce
->at
[lp
->ce
->nat
-1]->valu
, c
);
506 case LOOK4CON
: /* skipping leading content whitespace*/
509 else if (!isspace(c
)) {
510 growPCData (lp
->ce
, c
);
515 case INCON
: /* reading content */
517 /* if text contains a nl trim trailing blanks.
518 * chomp trailing nl if only one.
520 char *nl
= strpbrk (lp
->ce
->pcdata
, "\n\r");
522 while (lp
->ce
->pcdatal
> 0 &&
523 lp
->ce
->pcdata
[lp
->ce
->pcdatal
-1] == ' ')
524 lp
->ce
->pcdata
[--lp
->ce
->pcdatal
] = '\0';
525 if (nl
== &lp
->ce
->pcdata
[lp
->ce
->pcdatal
-1])
526 lp
->ce
->pcdata
[--lp
->ce
->pcdatal
] = '\0'; /* safe! */
529 growPCData (lp
->ce
, c
);
532 case LTINCON
: /* saw < in content */
535 lp
->cs
= LOOK4CLOSETAG
;
538 if (isTokenChar(1,c
)) {
539 growString (&lp
->ce
->tag
, c
);
546 case LOOK4CLOSETAG
: /* looking for closing tag after < */
547 if (isTokenChar (1, c
)) {
548 growString (&lp
->endtag
, c
);
550 } else if (!isspace(c
)) {
551 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus preend tag char %c", lp
->ln
,c
);
556 case INCLOSETAG
: /* reading closing tag */
557 if (isTokenChar(0, c
))
558 growString (&lp
->endtag
, c
);
560 if (strcmp (lp
->ce
->tag
, lp
->endtag
)) {
561 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: closing tag %.64s does not match %.64s",
562 lp
->ln
, lp
->endtag
, lp
->ce
->tag
);
564 } else if (lp
->ce
->pe
) {
566 lp
->cs
= LOOK4CON
; /* back to content after nested elem */
568 return (1); /* yes! */
569 } else if (!isspace(c
)) {
570 snprintf (errmsg
, ERRMSG_SIZE
, "Line %d: Bogus end tag char %c", lp
->ln
, c
);
579 /* set up for a fresh start */
581 initParser(LilXML
*lp
)
583 memset (lp
, 0, sizeof(*lp
));
593 /* start a new XMLEle.
594 * if ce already set up, add to its list of child elements.
595 * point ce to a new XMLEle.
596 * endtag no longer valid.
599 pushXMLEle(LilXML
*lp
)
601 XMLEle
*newe
= (XMLEle
*) moremem (NULL
, sizeof(XMLEle
));
604 memset (newe
, 0, sizeof(*newe
));
605 newe
->tag
= newString();
606 newe
->pcdata
= newString();
610 ce
->el
= (XMLEle
**) moremem (ce
->el
, (ce
->nel
+1)*sizeof(XMLEle
*));
611 ce
->el
[ce
->nel
++] = newe
;
617 /* point ce to parent of current ce.
618 * endtag no longer valid.
621 popXMLEle(LilXML
*lp
)
627 /* add one new XMLAtt to the current element */
631 XMLAtt
*newa
= (XMLAtt
*) moremem (NULL
, sizeof(XMLAtt
));
634 memset (newa
, 0, sizeof(*newa
));
635 newa
->name
= newString();
636 newa
->valu
= newString();
639 ce
->at
= (XMLAtt
**) moremem (ce
->at
, (ce
->nat
+1)*sizeof(XMLAtt
*));
640 ce
->at
[ce
->nat
++] = newa
;
643 /* delete a and all it holds */
656 /* delete endtag if appropriate */
658 resetEndTag(LilXML
*lp
)
661 freemem (lp
->endtag
);
/* 1 if c is a valid token character, else 0.
 * letters and '_' are always valid; digits are valid only when start is 0,
 * i.e. anywhere but the first character of a token.
 */
static int
isTokenChar (int start, int c)
{
	/* the <ctype.h> classifiers are only defined for EOF and values
	 * representable as unsigned char. anything else (e.g. a negative
	 * value from a sign-extended char) can never be a token char, so
	 * reject it here instead of passing it on.
	 */
	if (c != (int)(unsigned char)c)
	    return (0);

	return (isalpha(c) || c == '_' || (!start && isdigit(c)));
}
675 /* grow the malloced string at *sp to append c */
677 growString (char **sp
, int c
)
679 int l
= *sp
? strlen(*sp
) : 0;
680 *sp
= (char *) moremem (*sp
, l
+2); /* c + '\0' */
681 (*sp
)[l
++] = (char)c
;
685 /* special fast version of growString just for ep->pcdata that avoids all the
686 * strlens and tiny increments in allocated mem
689 growPCData (XMLEle
*ep
, int c
)
691 int l
= ep
->pcdatal
++;
693 int nm
= 32*(l
/32+1) + 2; /* c + '\0' */
694 ep
->pcdata
= (char *) moremem (ep
->pcdata
, nm
);
696 ep
->pcdata
[l
++] = (char)c
;
697 ep
->pcdata
[l
] = '\0';
700 /* return a malloced string of one '\0' */
706 *(str
= (char *)moremem(NULL
, 16)) = '\0'; /* expect more */
/* get n bytes of memory: a fresh block from malloc() when old is NULL,
 * otherwise the block at old resized with realloc().
 * returns whatever the allocator returns, so NULL on failure; no check is
 * made here.
 */
static void *
moremem (void *old, int n)
{
	size_t nbytes = (size_t)n;

	if (old)
	    return (realloc (old, nbytes));
	return (malloc (nbytes));
}
722 #if defined(MAIN_TST)
724 main (int ac
, char *av
[])
726 LilXML
*lp
= newLilXML();
727 char errmsg
[ERRMSG_SIZE
];
730 root
= readXMLFile (stdin
, lp
, errmsg
);
732 fprintf (stderr
, "::::::::::::: %s\n", tagXMLEle(root
));
733 prXMLEle (stdout
, root
, 0);
735 } else if (errmsg
[0]) {
736 fprintf (stderr
, "Error: %s\n", errmsg
);