1 /* Invisible Vector Library
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 * Permission is granted to anyone to use this software for any purpose,
9 * including commercial applications, and to alter it and redistribute it
10 * freely, subject to the following restrictions:
11 * 1. The origin of this software must not be misrepresented; you must not
12 * claim that you wrote the original software. If you use this software
13 * in a product, an acknowledgment in the product documentation would be
14 * appreciated but is not required.
15 * 2. Altered source versions must be plainly marked as such, and must not be
16 * misrepresented as being the original software.
17 * 3. This notice may not be removed or altered from any source distribution.
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 // SAX style xml parser
24 module iv
.saxy
/*is aliced*/;
34 // ////////////////////////////////////////////////////////////////////////// //
35 //*WARNING*: attr keys are *NOT* strings!
36 void xmparse(ST
) (auto ref ST fl
,
37 scope void delegate (char[] name
, char[][string
] attrs
) tagStart
,
38 scope void delegate (char[] name
) tagEnd
,
39 scope void delegate (char[] text
) content
,
40 ) if (isReadableStream
!ST ||
(isInputRange
!ST
&& is(ElementEncodingType
!ST
== char))) {
// Tells whether `ch` may appear in a tag or attribute name.
//
// Accepted characters: ASCII digits, ASCII letters, and '_', '-', ':', '.'.
// The dot is a legal name character in XML 1.0, so names such as "a.b"
// must not be rejected. Bytes >= 0x80 (non-ASCII names) are still refused.
//
// Params:
//   ch = the byte to classify
// Returns: true if `ch` is an accepted name character
static bool isValidNameChar() (char ch) {
  return
    (ch >= '0' && ch <= '9') ||
    (ch >= 'A' && ch <= 'Z') ||
    (ch >= 'a' && ch <= 'z') ||
    ch == '_' || ch == '-' || ch == ':' || ch == '.';
}
// Appends `chars` to the accumulation buffer `buf`, starting at `bufpos`,
// and advances `bufpos` past the appended data.
//
// An empty argument is a no-op. When the data does not fit, `buf` is grown
// to the smallest multiple of 1024 bytes that is strictly greater than the
// required size.
//
// Params:
//   chars = bytes to append (typesafe variadic: a single char may be passed)
// Throws: Exception if the required size would reach int.max
void bufPut (const(char)[] chars...) {
  immutable addLen = chars.length;
  if (addLen == 0) return; // nothing to append

  immutable needed = addLen+bufpos;
  if (needed > buf.length) {
    if (needed >= int.max) throw new Exception("out of memory in xml parser");
    buf.length = (needed|0x3ff)+1; // round up to the next 1KB boundary
  }

  buf[bufpos..bufpos+addLen] = chars[];
  bufpos += addLen;
}
79 static if (isReadableStream
!ST
) {
81 scope(exit
) rdbuf
.destroy
;
82 uint rdbufpos
, rdbufused
;
87 static if (isReadableStream
!ST
) {
89 if (rdbufpos
>= rdbufused
) {
90 if (rdbuf
.length
== 0) rdbuf
.length
= 32*1024;
91 auto rd
= fl
.rawRead(rdbuf
[]);
92 if (rd
.length
== 0) { eof
= true; curCh
= 0; return; }
94 rdbufused
= cast(uint)rd
.length
;
96 curCh
= rdbuf
.ptr
[rdbufpos
++];
98 if (fl
.empty
) { eof
= true; curCh
= 0; return; }
102 if (curCh
== 0) curCh
= ' ';
107 void parseEntity (bool inattr
) {
108 assert(curCh
== '&');
113 while (!eof
&& curCh
!= '/' && curCh
!= '>' && curCh
!= '?' && curCh
!= ';' && bufpos
-xpos
< 9) {
118 while (!eof
&& curCh
!= '<' && curCh
!= ';' && bufpos
-xpos
< 9) {
123 if (!eof
&& curCh
== ';' && bufpos
> xpos
) {
124 import std
.utf
: encode
, UseReplacementDchar
;
125 char[4] ubuf
= void; // utf buffer
126 switch (buf
[xpos
..bufpos
]) {
127 case "lt": bufpos
= xpos
-1; bufPut('<'); break;
128 case "gt": bufpos
= xpos
-1; bufPut('>'); break;
129 case "amp": bufpos
= xpos
-1; bufPut('&'); break;
130 case "quot": bufpos
= xpos
-1; bufPut('"'); break;
131 case "apos": bufpos
= xpos
-1; bufPut('\''); break;
133 bufPut(curCh
); // first put ';'
134 if (bufpos
-xpos
> 3 && buf
.ptr
[xpos
] == '#' && buf
.ptr
[xpos
+1] == 'x') {
135 // should be hex code
138 while (pos
< bufpos
-1) {
139 char ch
= buf
.ptr
[pos
++];
140 if (ch
>= '0' && ch
<= '9') n
= n
*16+ch
-'0';
141 else if (ch
>= 'A' && ch
<= 'F') n
= n
*16+ch
-'A'+10;
142 else if (ch
>= 'a' && ch
<= 'f') n
= n
*16+ch
-'a'+10;
143 else { n
= uint.max
; break; } // invalid digit
144 if (n
> dchar.max
) break; // invalid char
146 if (n
<= dchar.max
) {
148 auto sz
= encode
!(UseReplacementDchar
.yes
)(ubuf
, cast(dchar)n
);
149 foreach (immutable char ch
; ubuf
[0..sz
]) bufPut(ch
);
151 } else if (bufpos
-xpos
> 2 && buf
.ptr
[xpos
] == '#') {
// should be decimal code
155 while (pos
< bufpos
-1) {
156 char ch
= buf
.ptr
[pos
++];
157 if (ch
>= '0' && ch
<= '9') n
= n
*10+ch
-'0';
158 else { n
= uint.max
; break; } // invalid digit
159 if (n
> dchar.max
) break; // invalid char
161 if (n
<= dchar.max
) {
163 auto sz
= encode
!(UseReplacementDchar
.yes
)(ubuf
, cast(dchar)n
);
164 foreach (immutable char ch
; ubuf
[0..sz
]) bufPut(ch
);
176 if (bufpos
>= 3 && buf
.ptr
[bufpos
-1] == '>' && buf
.ptr
[bufpos
-2] == ']' && buf
.ptr
[bufpos
-3] == ']') {
183 if (tagLevel
&& bufpos
> 0 && content
!is null) content(buf
[0..bufpos
]);
187 void parseContent () {
190 if (curCh
== '<') break;
198 if (tagLevel
&& bufpos
> 0 && content
!is null) content(buf
[0..bufpos
]);
203 assert(!eof
&& curCh
== '<');
206 if (eof
) throw new Exception("invalid xml");
207 bool inlineClose
= false, closeTag
= false;
209 // either CDATA, or comment-like
212 // this *must* be CDATA
214 if (curCh
!= 'C') throw new Exception("invalid xml");
216 if (curCh
!= 'D') throw new Exception("invalid xml");
218 if (curCh
!= 'A') throw new Exception("invalid xml");
220 if (curCh
!= 'T') throw new Exception("invalid xml");
222 if (curCh
!= 'A') throw new Exception("invalid xml");
224 if (curCh
!= '[') throw new Exception("invalid xml");
229 } else if (curCh
== '-') {
232 if (curCh
!= '-') throw new Exception("invalid xml");
235 if (eof
) throw new Exception("invalid xml");
256 if (curCh
== '/') { closeTag
= true; skipChar(); }
257 if (curCh
== '?') { bufPut(curCh
); skipChar(); }
259 if (eof ||
!isValidNameChar(curCh
)) throw new Exception("invalid xml");
260 while (isValidNameChar(curCh
)) {
264 //{ import std.stdio; writeln("TAG: ", buf[0..bufpos].quote); }
265 // now parse attributes
266 scope(exit
) attrs
.clear();
267 while (!eof
&& curCh
<= ' ') skipChar();
269 auto tagnameend
= bufpos
;
272 // read the whole tag, so we can add AA items without anchoring stale memory
273 if (eof
) throw new Exception("invalid xml");
274 if (curCh
!= '/' && curCh
!= '>' && curCh
!= '?') {
279 if (eof
) throw new Exception("invalid xml");
281 if (curCh
== qch
) qch
= 0;
287 if (curCh
== '/' || curCh
== '>' || curCh
== '?') break;
288 if (curCh
== '"' || curCh
== '\'') qch
= curCh
;
293 // now parse attributes
294 while (stpos
< bufpos
) {
295 while (stpos
< bufpos
&& buf
.ptr
[stpos
] <= ' ') ++stpos
;
296 if (stpos
>= bufpos
) break;
297 //{ import std.stdio; writeln(": ", buf[stpos..bufpos].quote); }
298 if (!isValidNameChar(buf
.ptr
[stpos
])) throw new Exception("invalid xml");
300 while (stpos
< bufpos
&& isValidNameChar(buf
.ptr
[stpos
])) ++stpos
;
301 string aname
= cast(string
)(buf
[nst
..stpos
]); // unsafe cast, but meh...
302 while (stpos
< bufpos
&& buf
.ptr
[stpos
] <= ' ') ++stpos
;
303 if (stpos
>= bufpos
) { attrs
[aname
] = null; break; } // no value
304 if (buf
.ptr
[stpos
] != '=') { attrs
[aname
] = null; continue; } // no value
306 if (stpos
>= bufpos
) { attrs
[aname
] = buf
[bufpos
..bufpos
]; break; }
307 if (buf
.ptr
[stpos
] == '"' || buf
.ptr
[stpos
] == '\'') {
308 auto ech
= buf
.ptr
[stpos
];
310 while (stpos
< bufpos
&& buf
.ptr
[stpos
] != ech
) ++stpos
;
311 if (stpos
>= bufpos
) throw new Exception("invalid xml");
312 attrs
[aname
] = buf
[nst
..stpos
];
316 while (stpos
< bufpos
&& buf
.ptr
[stpos
] > ' ') ++stpos
;
317 attrs
[aname
] = buf
[nst
..stpos
];
323 if (buf
.ptr
[0] != '?') throw new Exception("invalid xml");
326 } else if (buf
.ptr
[0] != '!') {
327 if (curCh
== '/') { inlineClose
= true; skipChar(); }
331 if (curCh
!= '>') throw new Exception("invalid xml");
334 if (inlineClose
) throw new Exception("invalid xml");
335 if (tagEnd
!is null) tagEnd(buf
[0..tagnameend
]);
339 if (tagStart
!is null) tagStart(buf
[0..tagnameend
], attrs
);
341 if (tagEnd
!is null) tagEnd(buf
[0..tagnameend
]);
348 //writeln("*** ", tagLevel, " ***");
353 if (tagLevel
< 0) throw new Exception("invalid xml");
357 if (tagLevel
!= 0) throw new Exception("invalid xml");
361 // ////////////////////////////////////////////////////////////////////////// //
// you can use "quantifiers" in paths, like this:
//   "/c*/d+/*"
364 // that means "any number of 'c' tags", "one or more 'd' tags", "any number of any tags"
365 // the last is useful to parse things like "bold" tag inside "p" tag, for example
367 private import std
.range
;
369 alias TagOpenCB
= void delegate (char[] name
, char[][string
] attrs
);
370 alias TagOpenCBNA
= void delegate (char[] name
);
371 alias TagCloseCB
= void delegate (char[] name
);
372 alias TagContentCB
= void delegate (char[] text
);
// One step of a callback path: a tag name plus an optional quantifier.
static struct PathElement {
  string name;       // tag name to match; empty means "any tag"
  char quant = '\0'; // quantifier: '+', '*', or 0 when there is none
}
380 static struct TagCB
{
381 enum Type
{ Open
, Close
, Content
}
384 bool pathHasQuants
; // use faster algo if there are no quantifiers
389 TagContentCB content
;
394 TagCB
[] callbacksOpen
;
395 TagCB
[] callbacksClose
;
396 TagCB
[] callbacksContent
;
// Wraps `filename` in a VFile and hands it to loadFile.
//
// Params:
//   filename = name passed to the VFile constructor
void load (const(char)[] filename) {
  auto fl = VFile(filename);
  loadFile(fl);
}
// Feeds `st` to loadFile.
//
// `ST` must satisfy `isReadableStream`, or be an input range whose
// element encoding type is `char`.
//
// Params:
//   st = the source to parse; forwarded to loadFile unchanged
void loadStream(ST) (auto ref ST st)
if (isReadableStream!ST || (isInputRange!ST && is(ElementEncodingType!ST == char)))
{
  loadFile(st);
}
405 void onOpen(ST
: const(char)[]) (ST path
, TagOpenCB cb
) {
407 auto tcb
= newCallback
!"open"(path
);
409 tcb
.openNoAttr
= false;
412 void onOpen(ST
: const(char)[]) (ST path
, TagOpenCBNA cb
) {
414 auto tcb
= newCallback
!"open"(path
);
415 tcb
.close
= cb
; // lucky me
416 tcb
.openNoAttr
= true;
419 void onClose(ST
: const(char)[]) (ST path
, TagCloseCB cb
) {
421 auto tcb
= newCallback
!"close"(path
);
425 void onContent(ST
: const(char)[]) (ST path
, TagContentCB cb
) {
427 auto tcb
= newCallback
!"content"(path
);
432 TagCB
* newCallback(string type
, ST
: const(char)[]) (ST path
) {
433 static if (is(ST
== typeof(null))) {
434 return newCallback("");
437 bool hasQuants
= false;
440 while (path
.length
!= 0) {
441 while (path
.length
!= 0 && path
.ptr
[0] == '/') path
= path
[1..$];
442 if (path
.length
== 0) break;
444 while (e
< path
.length
&& path
.ptr
[e
] != '/') ++e
;
445 //if (e == 1 && path.ptr[0] == '+') throw new Exception("invalid callback path");
446 if (path
.ptr
[e
-1] == '+' || path
.ptr
[e
-1] == '*') {
447 pth
~= PathElement(path
[0..e
-1].idup
, path
.ptr
[e
-1]);
450 pth
~= PathElement(path
[0..e
].idup
, 0);
454 if (pth
.length
== 0) throw new Exception("invalid callback path");
457 pth
~= PathElement(null, '*');
460 static if (type
== "open") {
461 callbacksOpen
.length
+= 1;
462 res
= &callbacksOpen
[$-1];
463 res
.type
= TagCB
.Type
.Open
;
464 } else static if (type
== "close") {
465 callbacksClose
.length
+= 1;
466 res
= &callbacksClose
[$-1];
467 res
.type
= TagCB
.Type
.Close
;
468 } else static if (type
== "content") {
469 callbacksContent
.length
+= 1;
470 res
= &callbacksContent
[$-1];
471 res
.type
= TagCB
.Type
.Content
;
473 static assert(0, "wtf?!");
476 res
.pathHasQuants
= hasQuants
;
481 // yes, i can make it faster with some more preprocessing, but why should i bother?
482 static bool pathHit (const(char)[][] tagStack
, PathElement
[] path
, bool hasQuants
) {
485 writeln("tagStack: ", tagStack
[]);
486 foreach (const ref PathElement pe
; path
) {
487 write((pe
.quant ? pe
.quant
: ' '), pe
.name
);
493 if (tagStack
.length
!= path
.length
) return false;
494 foreach_reverse (immutable idx
, const ref PathElement pe
; path
) {
495 if (tagStack
.ptr
[idx
] != pe
.name
) return false;
500 static bool hasQ (PathElement
[] path
) {
501 foreach (const ref PathElement pe
; path
) if (pe
.quant
) return true;
505 while (path
.length
> 0) {
508 if (pe
.quant
== '*') {
509 if (pe
.name
.length
== 0) {
510 // any number of any tag, including zero
511 if (path
.length
== 0) return true;
512 while (tagStack
.length
> 0) {
513 if (pathHit(tagStack
, path
, hasQ(path
))) return true;
514 tagStack
= tagStack
[1..$];
518 // any number of given tag, including zero
519 // skip this tag and continue
520 while (tagStack
.length
&& tagStack
.ptr
[0] == pe
.name
) tagStack
= tagStack
[1..$];
522 } else if (pe
.quant
== '+') {
523 if (pe
.name
.length
== 0) {
524 // any number of any tag, not including zero
525 if (path
.length
== 0) return (tagStack
.length
> 0);
526 while (tagStack
.length
> 0) {
527 if (pathHit(tagStack
, path
, hasQ(path
))) return true;
528 tagStack
= tagStack
[1..$];
532 // any number of given tag, not including zero
533 if (tagStack
.length
== 0 || tagStack
.ptr
[0] != pe
.name
) return false;
534 // skip this tag and continue
535 while (tagStack
.length
&& tagStack
.ptr
[0] == pe
.name
) tagStack
= tagStack
[1..$];
537 } else if (pe
.name
.length
!= 0) {
539 if (tagStack
.length
== 0) return false;
540 if (pe
.name
!= tagStack
.ptr
[0]) return false;
541 tagStack
= tagStack
[1..$];
544 tagStack
= tagStack
[1..$];
547 return (tagStack
.length
== 0);
551 void loadFile(ST
) (auto ref ST fl
) if (isReadableStream
!ST ||
(isInputRange
!ST
&& is(ElementEncodingType
!ST
== char))) {
553 bool tagStackLastWasAppend
= true;
554 const(char)[][] tagStack
; // all data is in tagStackBuf
556 scope(exit
) tagStackBuf
.destroy
;
558 EncodingScheme efrom
, eto
;
559 scope(exit
) { efrom
.destroy
; eto
.destroy
; }
560 char[] recbuf
; // recode buffer
561 usize rcpos
; // for recode buffer
562 scope(exit
) recbuf
.destroy
;
564 void pushTag (const(char)[] s
) {
566 if (tagStackBufPos
+s
.length
>= tagStackBuf
.length
) {
567 if (tagStackBufPos
>= int.max
/2) throw new Exception("too many tags");
568 tagStackBuf
.length
= ((tagStackBufPos
+s
.length
)|
0x3ff)+1;
570 tagStackBuf
[tagStackBufPos
..tagStackBufPos
+s
.length
] = s
[];
571 if (!tagStackLastWasAppend
) { tagStack
.assumeSafeAppend
; tagStackLastWasAppend
= true; }
572 tagStack
~= tagStackBuf
[tagStackBufPos
..tagStackBufPos
+s
.length
];
573 tagStackBufPos
+= s
.length
;
575 if (!tagStackLastWasAppend
) { tagStack
.assumeSafeAppend
; tagStackLastWasAppend
= true; }
581 tagStack
.length
-= 1;
582 auto idx
= tagStack
.length
;
583 tagStackBufPos
-= tagStack
.ptr
[idx
].length
;
584 tagStackLastWasAppend
= false;
587 char[] nrecode(bool doreset
=true) (char[] text
) {
588 if (efrom
is null) return text
; // nothing to do
589 static if (doreset
) rcpos
= 0;
590 bool needRecode
= false;
591 foreach (char ch
; text
) if (ch
>= 0x80) { needRecode
= true; break; }
592 if (!needRecode
) return text
;
595 auto ub
= cast(const(ubyte)[])text
;
596 while (ub
.length
> 0) {
597 dchar dc
= efrom
.safeDecode(ub
);
598 if (dc
== INVALID_SEQUENCE
) dc
= '?';
599 auto len
= eto
.encode(dc
, buf
);
600 if (rcpos
+len
> recbuf
.length
) {
601 recbuf
.assumeSafeAppend
; // the user is expected to copy data
602 recbuf
.length
= ((rcpos
+len
)|
0x3ff)+1;
604 recbuf
[rcpos
..rcpos
+len
] = cast(char[])buf
[0..len
];
607 return recbuf
[stpos
..rcpos
];
611 (char[] name
, char[][string
] attrs
) {
612 if (name
== "?xml") {
613 if (seenXML
) throw new Exception("duplicate '?xml?' tag");
615 if (auto ec
= "encoding" in attrs
) {
616 foreach (ref char ch
; *ec
) {
617 import std
.ascii
: toLower
;
620 if ((*ec
).length
&& *ec
!= "utf-8") {
621 efrom
= EncodingScheme
.create(cast(string
)(*ec
)); // let's hope that it is safe...
622 eto
= EncodingScheme
.create("utf-8");
627 if (!seenXML
) throw new Exception("no '?xml?' tag");
629 bool attrsRecoded
= (efrom
is null);
630 foreach (ref TagCB tcb
; callbacksOpen
) {
631 if (tcb
.type
== TagCB
.Type
.Open
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
632 if (tcb
.openNoAttr
) {
635 // recode attrs and call the callback
637 rcpos
= 0; // reset recode
638 foreach (ref v
; attrs
.byValue
) v
= nrecode
!false(v
);
641 tcb
.open(name
, attrs
);
647 if (name
== "?xml") return;
648 if (tagStack
.length
== 0 || tagStack
[$-1] != name
) throw new Exception("unbalanced xml tags");
649 foreach (ref TagCB tcb
; callbacksClose
) {
650 if (tcb
.type
== TagCB
.Type
.Close
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
658 bool textRecoded
= (efrom
is null);
659 foreach (ref TagCB tcb
; callbacksContent
) {
660 if (tcb
.type
== TagCB
.Type
.Content
&& pathHit(tagStack
, tcb
.path
, tcb
.pathHasQuants
)) {
661 // recode text and call the callback
663 text
= nrecode(text
);