some updates
[iv.d.git] / saxy.d
blobcf2b7e8a8e3fcbbcf8a570ea5a108fac6c2700a5
1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 * 2. Altered source versions must be plainly marked as such, and must not be
16 * misrepresented as being the original software.
17 * 3. This notice may not be removed or altered from any source distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // SAX style xml parser
24 module iv.saxy /*is aliced*/;
26 import std.encoding;
27 import std.range;
29 import iv.alice;
30 import iv.strex;
31 import iv.vfs;
34 // ////////////////////////////////////////////////////////////////////////// //
35 //*WARNING*: attr keys are *NOT* strings!
/// SAX-style XML parser: pulls bytes from `fl` one at a time and reports
/// tags and text through the supplied delegates.
///
/// Params:
///   fl = the input: either a readable stream (read with `rawRead` in 32KB
///        chunks) or an input range whose elements are `char`
///   tagStart = called for every opening tag, including self-closed tags,
///              `<?...?>` tags (name starts with '?') and `<!...>` tags
///              (name starts with '!'); may be null
///   tagEnd = called for every closing tag, and right after `tagStart` for
///            self-closed, `<?...?>` and `<!...>` tags; may be null
///   content = called with text and CDATA found while at least one tag is
///             open; may be null
///
/// Every slice handed to a delegate (tag name, attribute keys and values,
/// text) points into an internal buffer that is reused for the next token and
/// destroyed on return, so callers must copy whatever they want to keep.
/// Attribute keys are such slices cast to `string`: they are NOT immutable.
///
/// Throws: Exception on malformed input ("invalid xml...") or when a single
///         token would need `int.max` or more bytes of buffer.
void xmparse(ST) (auto ref ST fl,
  scope void delegate (char[] name, char[][string] attrs) tagStart,
  scope void delegate (char[] name) tagEnd,
  scope void delegate (char[] text) content,
) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) {
  char[] buf; // token accumulator; only buf[0..bufpos] is meaningful
  uint bufpos;
  char[][string] attrs; // attributes of the tag being parsed; keys and values are slices of buf
  scope(exit) {
    attrs.destroy;
    buf.destroy;
  }

  // characters accepted in tag and attribute names (ASCII only)
  static bool isValidNameChar() (char ch) {
    pragma(inline, true);
    return
      (ch >= '0' && ch <= '9') ||
      (ch >= 'A' && ch <= 'Z') ||
      (ch >= 'a' && ch <= 'z') ||
      ch == '_' || ch == '-' || ch == ':';
  }

  int tagLevel = 0; // number of currently open tags

  // append chars to buf, growing it in 1KB-rounded steps
  void bufPut (const(char)[] chars...) {
    if (/*tagLevel &&*/ chars.length) {
      if (chars.length+bufpos > buf.length) {
        if (chars.length+bufpos >= int.max) throw new Exception("out of memory in xml parser");
        buf.assumeSafeAppend;
        buf.length = ((chars.length+bufpos)|0x3ff)+1;
      }
      buf[bufpos..bufpos+chars.length] = chars[];
      bufpos += chars.length;
    }
  }

  void clearBuf () {
    bufpos = 0;
  }

  char curCh; // current input char; not valid until the first skipChar() call
  bool eof;

  static if (isReadableStream!ST) {
    char[] rdbuf; // read-ahead buffer for stream input
    scope(exit) rdbuf.destroy;
    uint rdbufpos, rdbufused;
  }

  // advance to the next input char; at end of input set eof and curCh = 0
  void skipChar () {
    if (!eof) {
      static if (isReadableStream!ST) {
        // buffer more bytes
        if (rdbufpos >= rdbufused) {
          if (rdbuf.length == 0) rdbuf.length = 32*1024;
          auto rd = fl.rawRead(rdbuf[]);
          if (rd.length == 0) { eof = true; curCh = 0; return; }
          rdbufpos = 0;
          rdbufused = cast(uint)rd.length;
        }
        curCh = rdbuf.ptr[rdbufpos++];
      } else {
        if (fl.empty) { eof = true; curCh = 0; return; }
        curCh = fl.front;
        fl.popFront;
      }
      // NUL in the input is turned into a space
      if (curCh == 0) curCh = ' ';
    }
  }

  // curCh is '&'
  // Copies the candidate entity into buf and, if it is one of the five
  // predefined entities or a numeric character reference, replaces it in
  // place with the decoded text. Anything else is left in buf verbatim.
  void parseEntity (bool inattr) {
    assert(curCh == '&');
    bufPut(curCh);
    auto xpos = bufpos; // first char after '&'
    skipChar();
    // collect at most 9 chars of entity body
    if (inattr) {
      // quotes never occur inside an entity; stopping on them keeps a bare
      // '&' from swallowing the quote that closes the attribute value
      while (!eof && curCh != '/' && curCh != '>' && curCh != '?' && curCh != ';' &&
             curCh != '"' && curCh != '\'' && bufpos-xpos < 9)
      {
        bufPut(curCh);
        skipChar();
      }
    } else {
      while (!eof && curCh != '<' && curCh != ';' && bufpos-xpos < 9) {
        bufPut(curCh);
        skipChar();
      }
    }
    if (!eof && curCh == ';' && bufpos > xpos) {
      import std.utf : encode, UseReplacementDchar;
      char[4] ubuf = void; // utf buffer
      switch (buf[xpos..bufpos]) {
        // xpos-1 is the position of '&': the decoded char overwrites the entity
        case "lt": bufpos = xpos-1; bufPut('<'); break;
        case "gt": bufpos = xpos-1; bufPut('>'); break;
        case "amp": bufpos = xpos-1; bufPut('&'); break;
        case "quot": bufpos = xpos-1; bufPut('"'); break;
        case "apos": bufpos = xpos-1; bufPut('\''); break;
        default:
          bufPut(curCh); // first put ';', so an unknown entity stays as written
          // "#x<hex>;" or "#<dec>;" (lengths below include the ';')
          uint base = 0;
          uint pos = xpos;
          if (bufpos-xpos > 3 && buf.ptr[xpos] == '#' && buf.ptr[xpos+1] == 'x') {
            base = 16;
            pos = xpos+2;
          } else if (bufpos-xpos > 2 && buf.ptr[xpos] == '#') {
            base = 10;
            pos = xpos+1;
          }
          if (base != 0) {
            uint n = 0;
            while (pos < bufpos-1) {
              immutable char ch = buf.ptr[pos++];
              uint digit = uint.max;
              if (ch >= '0' && ch <= '9') digit = ch-'0';
              else if (base == 16 && ch >= 'A' && ch <= 'F') digit = ch-'A'+10;
              else if (base == 16 && ch >= 'a' && ch <= 'f') digit = ch-'a'+10;
              if (digit == uint.max) { n = uint.max; break; } // invalid digit
              n = n*base+digit;
              if (n > dchar.max) break; // invalid char
            }
            if (n <= dchar.max) {
              // char 1 is the internal end-of-value marker in attributes
              if (n == 1) n = 32;
              bufpos = xpos-1;
              immutable sz = encode!(UseReplacementDchar.yes)(ubuf, cast(dchar)n);
              bufPut(ubuf[0..sz]);
            }
          }
          break;
      }
      skipChar(); // skip ';'
    }
  }

  // curCh is the first char after "<![CDATA["
  // Reads up to and including "]]>" and reports the text before it.
  void parseCData () {
    clearBuf();
    bool terminated = false;
    for (;;) {
      // test before the eof check, so "]]>" at the very end of input is seen
      if (bufpos >= 3 && buf.ptr[bufpos-1] == '>' && buf.ptr[bufpos-2] == ']' && buf.ptr[bufpos-3] == ']') {
        bufpos -= 3;
        terminated = true;
        break;
      }
      if (eof) break;
      bufPut(curCh);
      skipChar();
    }
    // unterminated section: reject it, same as an unterminated comment
    if (!terminated) throw new Exception("invalid xml");
    if (tagLevel && bufpos > 0 && content !is null) content(buf[0..bufpos]);
    clearBuf();
  }

  // collect text up to the next '<' (or end of input), decoding entities
  void parseContent () {
    clearBuf();
    while (!eof) {
      if (curCh == '<') break;
      if (curCh != '&') {
        bufPut(curCh);
        skipChar();
      } else {
        parseEntity(false);
      }
    }
    if (tagLevel && bufpos > 0 && content !is null) content(buf[0..bufpos]);
    clearBuf();
  }

  // curCh is '<'
  // Parses one tag, comment or CDATA section and fires the callbacks.
  void parseTag () {
    assert(!eof && curCh == '<');
    clearBuf();
    skipChar();
    if (eof) throw new Exception("invalid xml");
    bool inlineClose = false, closeTag = false;
    if (curCh == '!') {
      // either CDATA, or comment-like
      skipChar();
      if (curCh == '[') {
        // this *must* be CDATA
        foreach (immutable char expected; "CDATA[") {
          skipChar();
          if (curCh != expected) throw new Exception("invalid xml");
        }
        skipChar();
        clearBuf();
        parseCData();
        return;
      } else if (curCh == '-') {
        // comment
        skipChar();
        if (curCh != '-') throw new Exception("invalid xml");
        skipChar();
        // the comment ends at the first '>' preceded by at least two dashes;
        // counting the run also handles "--->"
        uint dashes = 0;
        for (;;) {
          if (eof) throw new Exception("invalid xml");
          immutable char ch = curCh;
          skipChar();
          if (ch == '-') {
            ++dashes;
          } else if (ch == '>' && dashes >= 2) {
            break;
          } else {
            dashes = 0;
          }
        }
        clearBuf();
        return;
      } else {
        // !tag
        bufPut('!');
      }
    } else {
      if (curCh == '/') { closeTag = true; skipChar(); }
      if (curCh == '?') { bufPut(curCh); skipChar(); }
    }
    if (eof || !isValidNameChar(curCh)) throw new Exception("invalid xml");
    while (isValidNameChar(curCh)) {
      bufPut(curCh);
      skipChar();
    }
    //{ import std.stdio; writeln("TAG: ", buf[0..bufpos].quote); }
    // now parse attributes
    scope(exit) attrs.clear();
    while (!eof && curCh <= ' ') skipChar();
    // closing tag?
    auto tagnameend = bufpos;
    if (!closeTag) {
      // attr=["]name["]
      // read the whole tag, so we can add AA items without anchoring stale memory
      if (eof) throw new Exception("invalid xml");
      if (curCh != '/' && curCh != '>' && curCh != '?') {
        bufPut(' ');
        auto stpos = bufpos;
        char qch = 0; // active quote char, or 0 when outside a quoted value
        // pass 1: copy the raw attribute text into buf; every quote that
        // delimits a value is stored as char 1, entities are decoded
        for (;;) {
          if (eof) throw new Exception("invalid xml");
          if (qch) {
            if (curCh == qch) {
              qch = 0;
              curCh = 1;
            } else if (curCh == 1) {
              // a raw char 1 must not be mistaken for the end-of-value marker
              curCh = 32;
            } else if (curCh == '&') {
              parseEntity(true);
              continue;
            }
          } else {
            if (curCh == '/' || curCh == '>' || curCh == '?') break;
            if (curCh == '"' || curCh == '\'') {
              qch = curCh;
              curCh = 1;
            } else if (curCh == 1) {
              curCh = 32;
            }
          }
          bufPut(curCh);
          skipChar();
        }
        // pass 2: split the collected text into name/value pairs
        //{ import std.stdio; writeln(": ", buf[stpos..bufpos].quote); }
        while (stpos < bufpos) {
          while (stpos < bufpos && buf.ptr[stpos] <= ' ' && buf.ptr[stpos] != 1) ++stpos;
          if (stpos >= bufpos) break;
          //{ import std.stdio; writeln(": ", buf[stpos..bufpos].quote); }
          if (!isValidNameChar(buf.ptr[stpos])) throw new Exception("invalid xml: "~buf[stpos..bufpos].quote);
          auto nst = stpos;
          while (stpos < bufpos && isValidNameChar(buf.ptr[stpos])) ++stpos;
          string aname = cast(string)(buf[nst..stpos]); // unsafe cast, but meh...
          while (stpos < bufpos && buf.ptr[stpos] <= ' ' && buf.ptr[stpos] != 1) ++stpos;
          if (stpos >= bufpos) { attrs[aname] = null; break; } // no value
          if (buf.ptr[stpos] != '=') { attrs[aname] = null; continue; } // no value
          ++stpos;
          while (stpos < bufpos && buf.ptr[stpos] <= ' ' && buf.ptr[stpos] != 1) ++stpos;
          if (stpos >= bufpos) { attrs[aname] = buf[bufpos..bufpos]; break; }
          if (buf.ptr[stpos] == 1) {
            // quoted value: runs up to the next marker
            auto ech = buf.ptr[stpos];
            nst = ++stpos;
            while (stpos < bufpos && buf.ptr[stpos] != ech) ++stpos;
            if (stpos >= bufpos) throw new Exception("invalid xml");
            attrs[aname] = buf[nst..stpos];
            ++stpos;
          } else {
            // unquoted value: runs up to the next blank
            nst = stpos;
            while (stpos < bufpos && buf.ptr[stpos] > ' ') ++stpos;
            attrs[aname] = buf[nst..stpos];
          }
        }
      }
    }
    if (curCh == '?') {
      // "?>" may only close a tag that was opened with "<?"
      if (buf.ptr[0] != '?') throw new Exception("invalid xml");
      skipChar();
      inlineClose = true;
    } else if (buf.ptr[0] != '!') {
      if (curCh == '/') { inlineClose = true; skipChar(); }
    } else {
      // "<!...>" tags have no closing counterpart
      inlineClose = true;
    }
    if (curCh != '>') throw new Exception("invalid xml");
    skipChar();
    if (closeTag) {
      if (inlineClose) throw new Exception("invalid xml");
      if (tagEnd !is null) tagEnd(buf[0..tagnameend]);
      --tagLevel;
    } else {
      ++tagLevel;
      if (tagStart !is null) tagStart(buf[0..tagnameend], attrs);
      if (inlineClose) {
        if (tagEnd !is null) tagEnd(buf[0..tagnameend]);
        --tagLevel;
      }
    }
  }

  // The first parseContent() call starts with curCh still at its initial
  // value; that char goes into buf and is dropped, because content is only
  // reported while tagLevel is non-zero.
  while (!eof) {
    //writeln("*** ", tagLevel, " ***");
    parseContent();
    if (eof) break;
    if (curCh == '<') {
      parseTag();
      if (tagLevel < 0) throw new Exception("invalid xml");
    }
  }

  if (tagLevel != 0) throw new Exception("invalid xml");
}
372 // ////////////////////////////////////////////////////////////////////////// //
373 // you can use "quantifiers" in paths, like this:
374 // "/a/b/c*/d+/*"
375 // that means "any number of 'c' tags", "one or more 'd' tags", "any number of any tags"
376 // the last is useful to parse things like "bold" tag inside "p" tag, for example
/// Callback front-end for `xmparse`: register delegates for tag paths with
/// `onOpen`/`onClose`/`onContent`, then parse with `load` or `loadStream`.
///
/// A path is a '/'-separated list of tag names matched against the stack of
/// currently open tags, outermost first. A name may end with '*' (any number
/// of that tag) or '+' (one or more); a bare "*" or "+" stands for any tags.
/// An empty path matches everything.
final class SaxyEx {
private import std.range;
public:
  alias TagOpenCB = void delegate (char[] name, char[][string] attrs);
  alias TagOpenCBNA = void delegate (char[] name);
  alias TagCloseCB = void delegate (char[] name);
  alias TagContentCB = void delegate (char[] text);

private:
  // one parsed path component
  static struct PathElement {
    string name; // empty: any tag
    char quant = 0; // '+', '*', 0
  }

  // one registered callback
  static struct TagCB {
    enum Type { Open, Close, Content }
    Type type;
    PathElement[] path;
    bool pathHasQuants; // use faster algo if there are no quantifiers
    bool openNoAttr; // Open only: the delegate is a TagOpenCBNA stored in `close`
    union {
      TagOpenCB open;
      TagCloseCB close;
      TagContentCB content;
    }
  }

private:
  TagCB[] callbacksOpen;
  TagCB[] callbacksClose;
  TagCB[] callbacksContent;

public:
  this () {}

  /// Parse the file `filename` (opened as a `VFile`).
  void load (const(char)[] filename) { loadFile(VFile(filename)); }

  /// Parse from a readable stream or an input range of `char`.
  void loadStream(ST) (auto ref ST st) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) { loadFile(st); }

  /// Call `cb` with the tag name and attributes when a tag matching `path` opens.
  void onOpen(ST : const(char)[]) (ST path, TagOpenCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"open"(path);
    tcb.open = cb;
    tcb.openNoAttr = false;
  }

  /// Call `cb` with the tag name only when a tag matching `path` opens.
  void onOpen(ST : const(char)[]) (ST path, TagOpenCBNA cb) {
    assert(cb !is null);
    auto tcb = newCallback!"open"(path);
    tcb.close = cb; // lucky me: TagOpenCBNA and TagCloseCB are the same type
    tcb.openNoAttr = true;
  }

  /// Call `cb` when a tag matching `path` closes.
  void onClose(ST : const(char)[]) (ST path, TagCloseCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"close"(path);
    tcb.close = cb;
  }

  /// Call `cb` with text found while the open-tag stack matches `path`.
  void onContent(ST : const(char)[]) (ST path, TagContentCB cb) {
    assert(cb !is null);
    auto tcb = newCallback!"content"(path);
    tcb.content = cb;
  }

private:
  // Parse `path`, append a new TagCB to the list selected by `type` and
  // return a pointer to it so the caller can fill in the delegate.
  // The pointer is only valid until that list grows again.
  // Throws Exception("invalid callback path") for a path made only of '/'.
  TagCB* newCallback(string type, ST : const(char)[]) (ST path) {
    // work on a plain slice: this also covers `null` and static arrays
    const(char)[] rest = path;
    bool hasQuants = false;
    PathElement[] pth;
    if (rest.length) {
      while (rest.length != 0) {
        while (rest.length != 0 && rest.ptr[0] == '/') rest = rest[1..$];
        if (rest.length == 0) break;
        usize e = 0;
        while (e < rest.length && rest.ptr[e] != '/') ++e;
        // e >= 1 here, because rest[0] is not '/'
        if (rest.ptr[e-1] == '+' || rest.ptr[e-1] == '*') {
          pth ~= PathElement(rest[0..e-1].idup, rest.ptr[e-1]);
          hasQuants = true;
        } else {
          pth ~= PathElement(rest[0..e].idup, 0);
        }
        rest = rest[e..$];
      }
      if (pth.length == 0) throw new Exception("invalid callback path");
    } else {
      // empty path: any number of any tags
      hasQuants = true;
      pth ~= PathElement(null, '*');
    }
    TagCB* res;
    static if (type == "open") {
      callbacksOpen.length += 1;
      res = &callbacksOpen[$-1];
      res.type = TagCB.Type.Open;
    } else static if (type == "close") {
      callbacksClose.length += 1;
      res = &callbacksClose[$-1];
      res.type = TagCB.Type.Close;
    } else static if (type == "content") {
      callbacksContent.length += 1;
      res = &callbacksContent[$-1];
      res.type = TagCB.Type.Content;
    } else {
      static assert(0, "wtf?!");
    }
    res.path = pth;
    res.pathHasQuants = hasQuants;
    return res;
  }

  // Does the stack of open tags (outermost first) match `path`?
  // `hasQuants` must be true if any element of `path` has a quantifier.
  // Named quantifiers are greedy and do not backtrack.
  // yes, i can make it faster with some more preprocessing, but why should i bother?
  static bool pathHit (const(char)[][] tagStack, PathElement[] path, bool hasQuants) {
    version(none) {
      import std.stdio;
      writeln("tagStack: ", tagStack[]);
      foreach (const ref PathElement pe; path) {
        write((pe.quant ? pe.quant : ' '), pe.name);
      }
      writeln;
    }
    if (!hasQuants) {
      // easy case: element-by-element comparison
      if (tagStack.length != path.length) return false;
      foreach_reverse (immutable idx, const ref PathElement pe; path) {
        if (tagStack.ptr[idx] != pe.name) return false;
      }
      return true;
    }

    static bool hasQ (PathElement[] p) {
      foreach (const ref PathElement pe; p) if (pe.quant) return true;
      return false;
    }

    while (path.length > 0) {
      auto pe = &path[0];
      path = path[1..$];
      if (pe.quant == '*' || pe.quant == '+') {
        if (pe.name.length == 0) {
          // any number of any tags; '+' has to consume at least one
          if (pe.quant == '+') {
            if (tagStack.length == 0) return false;
            tagStack = tagStack[1..$];
          }
          if (path.length == 0) return true;
          // try the rest of the path at every position, the empty tail included
          immutable bool restHasQ = hasQ(path);
          for (;;) {
            if (pathHit(tagStack, path, restHasQ)) return true;
            if (tagStack.length == 0) return false;
            tagStack = tagStack[1..$];
          }
        } else {
          // any number of given tag; '+' needs at least one
          if (pe.quant == '+' && (tagStack.length == 0 || tagStack.ptr[0] != pe.name)) return false;
          // skip this tag and continue
          while (tagStack.length && tagStack.ptr[0] == pe.name) tagStack = tagStack[1..$];
        }
      } else if (pe.name.length != 0) {
        // named tag
        if (tagStack.length == 0) return false;
        if (pe.name != tagStack.ptr[0]) return false;
        tagStack = tagStack[1..$];
      } else {
        // any single tag
        if (tagStack.length == 0) return false;
        tagStack = tagStack[1..$];
      }
    }
    return (tagStack.length == 0);
  }

private:
  // Run xmparse over `fl`, keep the stack of open tag names, and dispatch
  // to the registered callbacks.
  // Throws Exception if a tag appears before the "?xml" tag, if "?xml" is
  // repeated, or if a closing tag does not match the innermost open tag.
  void loadFile(ST) (auto ref ST fl) if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char))) {
    bool seenXML;
    bool tagStackLastWasAppend = true;
    const(char)[][] tagStack; // names are copies stored in tagStackBuf
    char[] tagStackBuf;
    scope(exit) tagStackBuf.destroy;
    uint tagStackBufPos;
    EncodingScheme efrom, eto; // both null unless the document declares a non-utf-8 encoding
    scope(exit) { efrom.destroy; eto.destroy; }
    char[] recbuf; // recode buffer
    usize rcpos; // for recode buffer
    scope(exit) recbuf.destroy;

    // copy `s` into tagStackBuf and push that copy onto tagStack
    void pushTag (const(char)[] s) {
      if (s.length) {
        if (tagStackBufPos+s.length >= tagStackBuf.length) {
          if (tagStackBufPos >= int.max/2) throw new Exception("too many tags");
          tagStackBuf.length = ((tagStackBufPos+s.length)|0x3ff)+1;
        }
        tagStackBuf[tagStackBufPos..tagStackBufPos+s.length] = s[];
        if (!tagStackLastWasAppend) { tagStack.assumeSafeAppend; tagStackLastWasAppend = true; }
        tagStack ~= tagStackBuf[tagStackBufPos..tagStackBufPos+s.length];
        tagStackBufPos += s.length;
      } else {
        if (!tagStackLastWasAppend) { tagStack.assumeSafeAppend; tagStackLastWasAppend = true; }
        tagStack ~= "";
      }
    }

    // drop the innermost tag and release its bytes in tagStackBuf
    void popTag () {
      tagStack.length -= 1;
      auto idx = tagStack.length;
      // the removed slot is still readable through .ptr
      tagStackBufPos -= tagStack.ptr[idx].length;
      tagStackLastWasAppend = false;
    }

    // Convert `text` from the document encoding to utf-8.
    // Returns `text` itself when no conversion is needed, otherwise a slice
    // of recbuf. With doreset == false the result is appended after earlier
    // results instead of overwriting them.
    char[] nrecode(bool doreset=true) (char[] text) {
      if (efrom is null) return text; // nothing to do
      static if (doreset) rcpos = 0;
      // pure ASCII text is returned untouched
      bool needRecode = false;
      foreach (char ch; text) if (ch >= 0x80) { needRecode = true; break; }
      if (!needRecode) return text;
      auto stpos = rcpos;
      ubyte[16] buf;
      auto ub = cast(const(ubyte)[])text;
      while (ub.length > 0) {
        dchar dc = efrom.safeDecode(ub);
        if (dc == INVALID_SEQUENCE) dc = '?';
        auto len = eto.encode(dc, buf);
        if (rcpos+len > recbuf.length) {
          recbuf.assumeSafeAppend; // the user is expected to copy data
          recbuf.length = ((rcpos+len)|0x3ff)+1;
        }
        recbuf[rcpos..rcpos+len] = cast(char[])buf[0..len];
        rcpos += len;
      }
      return recbuf[stpos..rcpos];
    }

    xmparse(fl,
      // tag opened
      (char[] name, char[][string] attrs) {
        if (name == "?xml") {
          if (seenXML) throw new Exception("duplicate '?xml?' tag");
          seenXML = true;
          if (auto ec = "encoding" in attrs) {
            // lowercase the encoding name in place
            foreach (ref char ch; *ec) {
              import std.ascii : toLower;
              ch = ch.toLower;
            }
            if ((*ec).length && *ec != "utf-8") {
              efrom = EncodingScheme.create(cast(string)(*ec)); // let's hope that it is safe...
              eto = EncodingScheme.create("utf-8");
            }
          }
          return;
        }
        if (!seenXML) throw new Exception("no '?xml?' tag");
        pushTag(name);
        bool attrsRecoded = (efrom is null);
        foreach (ref TagCB tcb; callbacksOpen) {
          if (tcb.type == TagCB.Type.Open && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            if (tcb.openNoAttr) {
              tcb.close(name);
            } else {
              // recode attrs (once per tag) and call the callback
              if (!attrsRecoded) {
                rcpos = 0; // reset recode
                foreach (ref v; attrs.byValue) v = nrecode!false(v);
                attrsRecoded = true;
              }
              tcb.open(name, attrs);
            }
          }
        }
      },
      // tag closed
      (char[] name) {
        if (name == "?xml") return;
        if (tagStack.length == 0 || tagStack[$-1] != name) throw new Exception("unbalanced xml tags");
        // callbacks run while the closing tag is still on the stack
        foreach (ref TagCB tcb; callbacksClose) {
          if (tcb.type == TagCB.Type.Close && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            // call the callback
            tcb.close(name);
          }
        }
        popTag();
      },
      // text
      (char[] text) {
        bool textRecoded = (efrom is null);
        foreach (ref TagCB tcb; callbacksContent) {
          if (tcb.type == TagCB.Type.Content && pathHit(tagStack, tcb.path, tcb.pathHasQuants)) {
            // recode text (once) and call the callback
            if (!textRecoded) {
              text = nrecode(text);
              textRecoded = true;
            }
            tcb.content(text);
          }
        }
      },
    );
  }
}