1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 * 2. Altered source versions must be plainly marked as such, and must not be
16 * misrepresented as being the original software.
17 * 3. This notice may not be removed or altered from any source distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // SAX style xml parser
24 module iv
.saxy
/*is aliced*/;
34 // ////////////////////////////////////////////////////////////////////////// //
35 //*WARNING*: attr keys are *NOT* strings!
36 void xmparse(ST
) (auto ref ST fl
,
37 scope void delegate (char[] name
, char[][string
] attrs
) tagStart
,
38 scope void delegate (char[] name
) tagEnd
,
39 scope void delegate (char[] text
) content
,
40 ) if (isReadableStream
!ST ||
(isInputRange
!ST
&& is(ElementEncodingType
!ST
== char))) {
49 static bool isValidNameChar() (char ch
) {
52 (ch
>= '0' && ch
<= '9') ||
53 (ch
>= 'A' && ch
<= 'Z') ||
54 (ch
>= 'a' && ch
<= 'z') ||
55 ch
== '_' || ch
== '-' || ch
== ':';
60 void bufPut (const(char)[] chars
...) {
61 if (/*tagLevel &&*/ chars
.length
) {
62 if (chars
.length
+bufpos
> buf
.length
) {
63 if (chars
.length
+bufpos
>= int.max
) throw new Exception("out of memory in xml parser");
65 buf
.length
= ((chars
.length
+bufpos
)|
0x3ff)+1;
67 buf
[bufpos
..bufpos
+chars
.length
] = chars
[];
68 bufpos
+= chars
.length
;
79 static if (isReadableStream
!ST
) {
81 scope(exit
) rdbuf
.destroy
;
82 uint rdbufpos
, rdbufused
;
87 static if (isReadableStream
!ST
) {
89 if (rdbufpos
>= rdbufused
) {
90 if (rdbuf
.length
== 0) rdbuf
.length
= 32*1024;
91 auto rd
= fl
.rawRead(rdbuf
[]);
92 if (rd
.length
== 0) { eof
= true; curCh
= 0; return; }
94 rdbufused
= cast(uint)rd
.length
;
96 curCh
= rdbuf
.ptr
[rdbufpos
++];
98 if (fl
.empty
) { eof
= true; curCh
= 0; return; }
102 if (curCh
== 0) curCh
= ' ';
107 void parseEntity (bool inattr
) {
108 assert(curCh
== '&');
113 while (!eof
&& curCh
!= '/' && curCh
!= '>' && curCh
!= '?' && curCh
!= ';' && bufpos
-xpos
< 9) {
118 while (!eof
&& curCh
!= '<' && curCh
!= ';' && bufpos
-xpos
< 9) {
123 if (!eof
&& curCh
== ';' && bufpos
> xpos
) {
124 import std
.utf
: encode
, UseReplacementDchar
;
125 char[4] ubuf
= void; // utf buffer
126 switch (buf
[xpos
..bufpos
]) {
127 case "lt": bufpos
= xpos
-1; bufPut('<'); break;
128 case "gt": bufpos
= xpos
-1; bufPut('>'); break;
129 case "amp": bufpos
= xpos
-1; bufPut('&'); break;
130 case "quot": bufpos
= xpos
-1; bufPut('"'); break;
131 case "apos": bufpos
= xpos
-1; bufPut('\''); break;
133 bufPut(curCh
); // first put ';'
134 if (bufpos
-xpos
> 3 && buf
.ptr
[xpos
] == '#' && buf
.ptr
[xpos
+1] == 'x') {
135 // should be hex code
138 while (pos
< bufpos
-1) {
139 char ch
= buf
.ptr
[pos
++];
140 if (ch
>= '0' && ch
<= '9') n
= n
*16+ch
-'0';
141 else if (ch
>= 'A' && ch
<= 'F') n
= n
*16+ch
-'A'+10;
142 else if (ch
>= 'a' && ch
<= 'f') n
= n
*16+ch
-'a'+10;
143 else { n
= uint.max
; break; } // invalid digit
144 if (n
> dchar.max
) break; // invalid char
146 if (n
<= dchar.max
) {
149 auto sz
= encode
!(UseReplacementDchar
.yes
)(ubuf
, cast(dchar)n
);
150 foreach (immutable char ch
; ubuf
[0..sz
]) bufPut(ch
);
152 } else if (bufpos
-xpos
> 2 && buf
.ptr
[xpos
] == '#') {
153 // should be decimal code
156 while (pos
< bufpos
-1) {
157 char ch
= buf
.ptr
[pos
++];
158 if (ch
>= '0' && ch
<= '9') n
= n
*10+ch
-'0';
159 else { n
= uint.max
; break; } // invalid digit
160 if (n
> dchar.max
) break; // invalid char
162 if (n
<= dchar.max
) {
165 auto sz
= encode
!(UseReplacementDchar
.yes
)(ubuf
, cast(dchar)n
);
166 foreach (immutable char ch
; ubuf
[0..sz
]) bufPut(ch
);
178 if (bufpos
>= 3 && buf
.ptr
[bufpos
-1] == '>' && buf
.ptr
[bufpos
-2] == ']' && buf
.ptr
[bufpos
-3] == ']') {
185 if (tagLevel
&& bufpos
> 0 && content
!is null) content(buf
[0..bufpos
]);
189 void parseContent () {
192 if (curCh
== '<') break;
200 if (tagLevel
&& bufpos
> 0 && content
!is null) content(buf
[0..bufpos
]);
205 assert(!eof
&& curCh
== '<');
208 if (eof
) throw new Exception("invalid xml");
209 bool inlineClose
= false, closeTag
= false;
211 // either CDATA, or comment-like
214 // this *must* be CDATA
216 if (curCh
!= 'C') throw new Exception("invalid xml");
218 if (curCh
!= 'D') throw new Exception("invalid xml");
220 if (curCh
!= 'A') throw new Exception("invalid xml");
222 if (curCh
!= 'T') throw new Exception("invalid xml");
224 if (curCh
!= 'A') throw new Exception("invalid xml");
226 if (curCh
!= '[') throw new Exception("invalid xml");
231 } else if (curCh
== '-') {
234 if (curCh
!= '-') throw new Exception("invalid xml");
237 if (eof
) throw new Exception("invalid xml");
258 if (curCh
== '/') { closeTag
= true; skipChar(); }
259 if (curCh
== '?') { bufPut(curCh
); skipChar(); }
261 if (eof ||
!isValidNameChar(curCh
)) throw new Exception("invalid xml");
262 while (isValidNameChar(curCh
)) {
266 //{ import std.stdio; writeln("TAG: ", buf[0..bufpos].quote); }
267 // now parse attributes
268 scope(exit
) attrs
.clear();
269 while (!eof
&& curCh
<= ' ') skipChar();
271 auto tagnameend
= bufpos
;
274 // read the whole tag, so we can add AA items without anchoring stale memory
275 if (eof
) throw new Exception("invalid xml");
276 if (curCh
!= '/' && curCh
!= '>' && curCh
!= '?') {
281 if (eof
) throw new Exception("invalid xml");
283 if (curCh
== qch
) { qch
= 0; curCh
= 1; }
289 if (curCh
== '/' || curCh
== '>' || curCh
== '?') break;
290 if (curCh
== '"' || curCh
== '\'') {
293 } else if (curCh
== 1) {
300 // now parse attributes
301 //{ import std.stdio; writeln(": ", buf[stpos..bufpos].quote); }
302 while (stpos
< bufpos
) {
303 while (stpos
< bufpos
&& buf
.ptr
[stpos
] <= ' ' && buf
.ptr
[stpos
] != 1) ++stpos
;
304 if (stpos
>= bufpos
) break;
305 //{ import std.stdio; writeln(": ", buf[stpos..bufpos].quote); }
306 if (!isValidNameChar(buf
.ptr
[stpos
])) throw new Exception("invalid xml: "~buf
[stpos
..bufpos
].quote
);
308 while (stpos
< bufpos
&& isValidNameChar(buf
.ptr
[stpos
])) ++stpos
;
309 string aname
= cast(string
)(buf
[nst
..stpos
]); // unsafe cast, but meh...
310 while (stpos
< bufpos
&& buf
.ptr
[stpos
] <= ' ' && buf
.ptr
[stpos
] != 1) ++stpos
;
311 if (stpos
>= bufpos
) { attrs
[aname
] = null; break; } // no value
312 if (buf
.ptr
[stpos
] != '=') { attrs
[aname
] = null; continue; } // no value
314 while (stpos
< bufpos
&& buf
.ptr
[stpos
] <= ' ' && buf
.ptr
[stpos
] != 1) ++stpos
;
315 if (stpos
>= bufpos
) { attrs
[aname
] = buf
[bufpos
..bufpos
]; break; }
316 //if (buf.ptr[stpos] == '"' || buf.ptr[stpos] == '\'')
317 if (buf
.ptr
[stpos
] == 1)
319 auto ech
= buf
.ptr
[stpos
];
321 while (stpos
< bufpos
&& buf
.ptr
[stpos
] != ech
) ++stpos
;
322 if (stpos
>= bufpos
) throw new Exception("invalid xml");
323 attrs
[aname
] = buf
[nst
..stpos
];
327 while (stpos
< bufpos
&& buf
.ptr
[stpos
] > ' ') ++stpos
;
328 attrs
[aname
] = buf
[nst
..stpos
];
334 if (buf
.ptr
[0] != '?') throw new Exception("invalid xml");
337 } else if (buf
.ptr
[0] != '!') {
338 if (curCh
== '/') { inlineClose
= true; skipChar(); }
342 if (curCh
!= '>') throw new Exception("invalid xml");
345 if (inlineClose
) throw new Exception("invalid xml");
346 if (tagEnd
!is null) tagEnd(buf
[0..tagnameend
]);
350 if (tagStart
!is null) tagStart(buf
[0..tagnameend
], attrs
);
352 if (tagEnd
!is null) tagEnd(buf
[0..tagnameend
]);
359 //writeln("*** ", tagLevel, " ***");
364 if (tagLevel
< 0) throw new Exception("invalid xml");
368 if (tagLevel
!= 0) throw new Exception("invalid xml");
372 // ////////////////////////////////////////////////////////////////////////// //
373 // you can use "quantifiers" in paths, like this:
375 // that means "any number of 'c' tags", "one or more 'd' tags", "any number of any tags"
376 // the last is useful to parse things like "bold" tag inside "p" tag, for example
378 private import std
.range
;
// Delegate types accepted by the onOpen/onClose/onContent registration methods.
//
// Opening-tag callback: receives the tag name and an associative array of
// attribute values indexed by attribute name.
// NOTE(review): the `name`/`attrs` data presumably aliases the parser's internal
// buffer (xmparse builds attribute keys with an unsafe cast of a buffer slice),
// so copy anything that must outlive the call -- confirm against the caller.
380 alias TagOpenCB
= void delegate (char[] name
, char[][string
] attrs
);
// Opening-tag callback that only wants the tag name (no attributes).
381 alias TagOpenCBNA
= void delegate (char[] name
);
// Closing-tag callback: receives the tag name.
382 alias TagCloseCB
= void delegate (char[] name
);
// Text-content callback: receives the text.
383 alias TagContentCB
= void delegate (char[] text
);
386 static struct PathElement
{
387 string name
; // empty: any tag
388 char quant
= 0; // '+', '*', 0
391 static struct TagCB
{
392 enum Type
{ Open
, Close
, Content
}
395 bool pathHasQuants
; // use faster algo if there are no quantifiers
400 TagContentCB content
;
// Registered callbacks, kept in one list per event kind. newCallback appends a
// fresh TagCB to the list selected by its `type` argument ("open", "close" or
// "content"); during loading each list is scanned and every entry whose path
// matches the current tag stack (see pathHit) is considered for invocation.
405 TagCB
[] callbacksOpen
;
// Callbacks for closing tags.
406 TagCB
[] callbacksClose
;
// Callbacks for text content.
407 TagCB
[] callbacksContent
;
// Convenience entry point: wraps `filename` in a VFile (declared outside this
// view) and hands it to loadFile, which does the actual parsing.
// Params:
//   filename = name passed to the VFile constructor
412 void load (const(char)[] filename
) { loadFile(VFile(filename
)); }
// Parses from an already-available source by forwarding `st` to loadFile.
// The template constraint is the same one loadFile declares: ST must satisfy
// isReadableStream, or be an input range whose ElementEncodingType is char.
// Params:
//   st = stream or char input range to read from (taken by `auto ref`)
414 void loadStream(ST
) (auto ref ST st
) if (isReadableStream
!ST ||
(isInputRange
!ST
&& is(ElementEncodingType
!ST
== char))) { loadFile(st
); }
416 void onOpen(ST
: const(char)[]) (ST path
, TagOpenCB cb
) {
418 auto tcb
= newCallback
!"open"(path
);
420 tcb
.openNoAttr
= false;
423 void onOpen(ST
: const(char)[]) (ST path
, TagOpenCBNA cb
) {
425 auto tcb
= newCallback
!"open"(path
);
426 tcb
.close
= cb
; // lucky me
427 tcb
.openNoAttr
= true;
430 void onClose(ST
: const(char)[]) (ST path
, TagCloseCB cb
) {
432 auto tcb
= newCallback
!"close"(path
);
436 void onContent(ST
: const(char)[]) (ST path
, TagContentCB cb
) {
438 auto tcb
= newCallback
!"content"(path
);
443 TagCB
* newCallback(string type
, ST
: const(char)[]) (ST path
) {
444 static if (is(ST
== typeof(null))) {
445 return newCallback("");
448 bool hasQuants
= false;
451 while (path
.length
!= 0) {
452 while (path
.length
!= 0 && path
.ptr
[0] == '/') path
= path
[1..$];
453 if (path
.length
== 0) break;
455 while (e
< path
.length
&& path
.ptr
[e
] != '/') ++e
;
456 //if (e == 1 && path.ptr[0] == '+') throw new Exception("invalid callback path");
457 if (path
.ptr
[e
-1] == '+' || path
.ptr
[e
-1] == '*') {
458 pth
~= PathElement(path
[0..e
-1].idup
, path
.ptr
[e
-1]);
461 pth
~= PathElement(path
[0..e
].idup
, 0);
465 if (pth
.length
== 0) throw new Exception("invalid callback path");
468 pth
~= PathElement(null, '*');
471 static if (type
== "open") {
472 callbacksOpen
.length
+= 1;
473 res
= &callbacksOpen
[$-1];
474 res
.type
= TagCB
.Type
.Open
;
475 } else static if (type
== "close") {
476 callbacksClose
.length
+= 1;
477 res
= &callbacksClose
[$-1];
478 res
.type
= TagCB
.Type
.Close
;
479 } else static if (type
== "content") {
480 callbacksContent
.length
+= 1;
481 res
= &callbacksContent
[$-1];
482 res
.type
= TagCB
.Type
.Content
;
484 static assert(0, "wtf?!");
487 res
.pathHasQuants
= hasQuants
;
492 // yes, i can make it faster with some more preprocessing, but why should i bother?
493 static bool pathHit (const(char)[][] tagStack
, PathElement
[] path
, bool hasQuants
) {
496 writeln("tagStack: ", tagStack
[]);
497 foreach (const ref PathElement pe
; path
) {
498 write((pe
.quant ? pe
.quant
: ' '), pe
.name
);
504 if (tagStack
.length
!= path
.length
) return false;
505 foreach_reverse (immutable idx
, const ref PathElement pe
; path
) {
506 if (tagStack
.ptr
[idx
] != pe
.name
) return false;
511 static bool hasQ (PathElement
[] path
) {
512 foreach (const ref PathElement pe
; path
) if (pe
.quant
) return true;
516 while (path
.length
> 0) {
519 if (pe
.quant
== '*') {
520 if (pe
.name
.length
== 0) {
521 // any number of any tag, including zero
522 if (path
.length
== 0) return true;
523 while (tagStack
.length
> 0) {
524 if (pathHit(tagStack
, path
, hasQ(path
))) return true;
525 tagStack
= tagStack
[1..$];
529 // any number of given tag, including zero
530 // skip this tag and continue
531 while (tagStack
.length
&& tagStack
.ptr
[0] == pe
.name
) tagStack
= tagStack
[1..$];
533 } else if (pe
.quant
== '+') {
534 if (pe
.name
.length
== 0) {
535 // any number of any tag, not including zero
536 if (path
.length
== 0) return (tagStack
.length
> 0);
537 while (tagStack
.length
> 0) {
538 if (pathHit(tagStack
, path
, hasQ(path
))) return true;
539 tagStack
= tagStack
[1..$];
543 // any number of given tag, not including zero
544 if (tagStack
.length
== 0 || tagStack
.ptr
[0] != pe
.name
) return false;
545 // skip this tag and continue
546 while (tagStack
.length
&& tagStack
.ptr
[0] == pe
.name
) tagStack
= tagStack
[1..$];
548 } else if (pe
.name
.length
!= 0) {
550 if (tagStack
.length
== 0) return false;
551 if (pe
.name
!= tagStack
.ptr
[0]) return false;
552 tagStack
= tagStack
[1..$];
555 tagStack
= tagStack
[1..$];
558 return (tagStack
.length
== 0);
562 void loadFile(ST
) (auto ref ST fl
) if (isReadableStream
!ST ||
(isInputRange
!ST
&& is(ElementEncodingType
!ST
== char))) {
564 bool tagStackLastWasAppend
= true;
565 const(char)[][] tagStack
; // all data is in tagStackBuf
567 scope(exit
) tagStackBuf
.destroy
;
569 EncodingScheme efrom
, eto
;
570 scope(exit
) { efrom
.destroy
; eto
.destroy
; }
571 char[] recbuf
; // recode buffer
572 usize rcpos
; // for recode buffer
573 scope(exit
) recbuf
.destroy
;
575 void pushTag (const(char)[] s
) {
577 if (tagStackBufPos
+s
.length
>= tagStackBuf
.length
) {
578 if (tagStackBufPos
>= int.max
/2) throw new Exception("too many tags");
579 tagStackBuf
.length
= ((tagStackBufPos
+s
.length
)|
0x3ff)+1;
581 tagStackBuf
[tagStackBufPos
..tagStackBufPos
+s
.length
] = s
[];
582 if (!tagStackLastWasAppend
) { tagStack
.assumeSafeAppend
; tagStackLastWasAppend
= true; }
583 tagStack
~= tagStackBuf
[tagStackBufPos
..tagStackBufPos
+s
.length
];
584 tagStackBufPos
+= s
.length
;
586 if (!tagStackLastWasAppend
) { tagStack
.assumeSafeAppend
; tagStackLastWasAppend
= true; }
592 tagStack
.length
-= 1;
593 auto idx
= tagStack
.length
;
594 tagStackBufPos
-= tagStack
.ptr
[idx
].length
;
595 tagStackLastWasAppend
= false;
598 char[] nrecode(bool doreset
=true) (char[] text
) {
599 if (efrom
is null) return text
; // nothing to do
600 static if (doreset
) rcpos
= 0;
601 bool needRecode
= false;
602 foreach (char ch
; text
) if (ch
>= 0x80) { needRecode
= true; break; }
603 if (!needRecode
) return text
;
606 auto ub
= cast(const(ubyte)[])text
;
607 while (ub
.length
> 0) {
608 dchar dc
= efrom
.safeDecode(ub
);
609 if (dc
== INVALID_SEQUENCE
) dc
= '?';
610 auto len
= eto
.encode(dc
, buf
);
611 if (rcpos
+len
> recbuf
.length
) {
612 recbuf
.assumeSafeAppend
; // the user is expected to copy data
613 recbuf
.length
= ((rcpos
+len
)|
0x3ff)+1;
615 recbuf
[rcpos
..rcpos
+len
] = cast(char[])buf
[0..len
];
618 return recbuf
[stpos
..rcpos
];
622 (char[] name
, char[][string
] attrs
) {
623 if (name
== "?xml") {
624 if (seenXML
) throw new Exception("duplicate '?xml?' tag");
626 if (auto ec
= "encoding" in attrs
) {
627 foreach (ref char ch
; *ec
) {
628 import std
.ascii
: toLower
;
631 if ((*ec
).length
&& *ec
!= "utf-8") {
632 efrom
= EncodingScheme
.create(cast(string
)(*ec
)); // let's hope that it is safe...
633 eto
= EncodingScheme
.create("utf-8");
638 if (!seenXML
) throw new Exception("no '?xml?' tag");
640 bool attrsRecoded
= (efrom
is null);
641 foreach (ref TagCB tcb
; callbacksOpen
) {
642 if (tcb
.type
== TagCB
.Type
.Open
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
643 if (tcb
.openNoAttr
) {
646 // recode attrs and call the callback
648 rcpos
= 0; // reset recode
649 foreach (ref v
; attrs
.byValue
) v
= nrecode
!false(v
);
652 tcb
.open(name
, attrs
);
658 if (name
== "?xml") return;
659 if (tagStack
.length
== 0 || tagStack
[$-1] != name
) throw new Exception("unbalanced xml tags");
660 foreach (ref TagCB tcb
; callbacksClose
) {
661 if (tcb
.type
== TagCB
.Type
.Close
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
669 bool textRecoded
= (efrom
is null);
670 foreach (ref TagCB tcb
; callbacksContent
) {
671 if (tcb
.type
== TagCB
.Type
.Content
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
672 // recode text and call the callback
674 text
= nrecode(text
);