2010-04-06 Jb Evain <jbevain@novell.com>
[mcs.git] / class / Commons.Xml.Relaxng / Commons.Xml.Relaxng.Rnc / RncTokenizer.cs
blob7908f696a955d2a3a48b192e38d440c6fdb54ab7
1 //
2 // RELAX NG Compact Syntax parser
3 //
4 // Author:
5 // Atsushi Enomoto <ginga@kit.hi-ho.ne.jp>
6 //
7 // (C)2003 Atsushi Enomoto
8 // (C)2004 Novell Inc.
9 //
12 // Permission is hereby granted, free of charge, to any person obtaining
13 // a copy of this software and associated documentation files (the
14 // "Software"), to deal in the Software without restriction, including
15 // without limitation the rights to use, copy, modify, merge, publish,
16 // distribute, sublicense, and/or sell copies of the Software, and to
17 // permit persons to whom the Software is furnished to do so, subject to
18 // the following conditions:
19 //
20 // The above copyright notice and this permission notice shall be
21 // included in all copies or substantial portions of the Software.
22 //
23 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
27 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
28 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
29 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 using System;
33 using System.Collections;
34 using System.IO;
35 using System.Xml;
36 using Commons.Xml.Relaxng;
38 namespace Commons.Xml.Relaxng.Rnc
40 internal class RncTokenizer : Commons.Xml.Relaxng.Rnc.yyParser.yyInput
42 TextReader source;
44 int currentToken;
45 object tokenValue;
46 int peekChar;
47 string peekString;
48 bool isElement;
49 bool isLiteralNsUri;
51 int line = 1;
52 int column;
53 int savedLineNumber = 1;
54 int savedLinePosition;
55 bool nextIncrementLine;
56 string baseUri;
// Creates a tokenizer that pulls RELAX NG compact syntax text from
// 'source'. 'baseUri' is stored unchanged and exposed through BaseUri.
public RncTokenizer (TextReader source, string baseUri)
{
	this.baseUri = baseUri;
	this.source = source;
}
// Set to true when the unquoted keyword "element" is tokenized and to
// false when the unquoted keyword "attribute" is tokenized; false until
// either has been seen.
public bool IsElement {
	get {
		return this.isElement;
	}
}
// Line counter value captured by the last call to advance ().
public int Line {
	get {
		return this.savedLineNumber;
	}
}
// Column counter value captured by the last call to advance ().
public int Column {
	get {
		return this.savedLinePosition;
	}
}
// The base URI string that was passed to the constructor.
public string BaseUri {
	get {
		return this.baseUri;
	}
}
80 // jay interface implementation
// Returns the token code produced by the most recent advance () call.
public int token ()
{
	return this.currentToken;
}
// Reads the next token and records it together with the line/column
// counters as they stand once the token has been read.
// Returns false when the token is Token.EOF, true otherwise.
public bool advance ()
{
	// Must be cleared first: ParseToken () stores the new value, if any.
	tokenValue = null;

	int next = ParseToken (false);
	currentToken = next;
	savedLineNumber = line;
	savedLinePosition = column;

	return next != Token.EOF;
}
// Returns the value attached to the current token, or null when the
// last ParseToken () call did not set one.
public object value ()
{
	return this.tokenValue;
}
101 // private methods
// Reads hexadecimal digits (0-9, A-F, a-f) straight from the underlying
// reader and folds them into 'current' (current * 16 + digit for each
// digit). The first non-digit read, which may be -1, is stored in
// peekChar and the accumulated number is returned.
private int ReadEscapedHexNumber (int current)
{
	for (;;) {
		int ch = source.Read ();
		int digit;

		if (ch >= '0' && ch <= '9')
			digit = ch - '0';
		else if (ch >= 'A' && ch <= 'F')
			digit = ch - 'A' + 10;
		else if (ch >= 'a' && ch <= 'f')
			digit = ch - 'a' + 10;
		else {
			// Not a hex digit: hand it back to the caller via peekChar.
			peekChar = ch;
			return current;
		}

		current = current * 16 + digit;
	}
}
// Reads one logical character from the underlying reader, decoding
// "\x{HEX}" escapes (one or more 'x' characters are accepted between
// the backslash and the opening brace).
//
// Returns the character read, the decoded escape value, or whatever
// source.Read () reports at the end of input (-1 for TextReader).
// When a backslash does not introduce an escape, the backslash itself
// is returned and the characters consumed after it are queued in
// peekString so that later reads deliver them.
private int ReadFromStream ()
{
	int ret = source.Read ();
	if (ret != '\\')
		return ret;

	ret = source.Read ();
	if (ret < 0) {
		// The backslash is the last character of the input. Casting
		// -1 to char would queue a bogus U+FFFF (or overflow in a
		// checked build), so queue nothing and let the next read
		// report the end of input by itself.
		return '\\';
	}
	if (ret != 'x') {
		// Not an escape: deliver the backslash now, the character later.
		peekString = new string ((char) ret, 1);
		return '\\';
	}

	// Swallow the whole run of 'x' characters.
	int tmp;
	int xcount = 0;
	do {
		xcount++;
		tmp = source.Read ();
	} while (tmp == 'x');

	if (tmp != '{') {
		// "\xxx" without a brace is not an escape either: replay the
		// 'x' run and the character that ended it (unless end of input).
		peekString = new string ('x', xcount);
		if (tmp >= 0)
			peekString += (char) tmp;
		return '\\';
	}

	ret = ReadEscapedHexNumber (0);
	if (peekChar == '}') {
		// The helper leaves the terminator in peekChar; consume the brace.
		peekChar = 0;
		return ret;
	}

	// NOTE(review): unterminated escape. Behavior is unchanged from the
	// previous code: the partially decoded value (not the original digit
	// text) is queued and the terminating character stays in peekChar.
	// Confirm whether this should raise an error instead.
	peekString = new string ((char) ret, 1);
	return '\\';
}
// Returns the next character without consuming it, caching it in
// peekChar. A peekChar of 0 means "nothing cached"; in that case the
// character is taken from the head of peekString when one is queued,
// otherwise from ReadFromStream ().
private int PeekChar ()
{
	if (peekChar != 0)
		return peekChar;

	if (peekString == null) {
		peekChar = ReadFromStream ();
		return peekChar;
	}

	// Move the first queued character into the one-character cache.
	peekChar = peekString [0];
	if (peekString.Length == 1)
		peekString = null;
	else
		peekString = peekString.Substring (1);
	return peekChar;
}
// Consumes and returns the next character, taking it from peekChar,
// then peekString, then ReadFromStream (), and updates the line/column
// counters.
//
// The line counter is bumped lazily: reading '\n' only sets
// nextIncrementLine, and the increment happens when the following
// character is read. '\r' does not advance the column.
private int ReadChar ()
{
	int ret;
	if (peekChar != 0) {
		ret = peekChar;
		peekChar = 0;
	}
	else if (peekString != null) {
		ret = peekString [0];
		peekString = peekString.Length == 1 ?
			null : peekString.Substring (1);
	}
	else
		ret = ReadFromStream ();

	if (nextIncrementLine) {
		line++;
		// Reset to 0, the same value the column field starts with, so
		// that the increment below numbers the first character of every
		// line identically. Resetting to 1 made the first character of
		// each later line column 2 while it is column 1 on the first line.
		column = 0;
		nextIncrementLine = false;
	}
	switch (ret) {
	case '\r':
		break;
	case '\n':
		nextIncrementLine = true;
		goto default;
	default:
		column++;
		break;
	}

	return ret;
}
// Consumes characters for as long as the next one is a space, tab,
// carriage return or line feed. The first other character (or the end
// of input) is left unread.
private void SkipWhitespaces ()
{
	for (int next = PeekChar ();
	     next == ' ' || next == '\t' || next == '\r' || next == '\n';
	     next = PeekChar ())
		ReadChar ();
}
235 char [] nameBuffer = new char [30];
// Reads the body of a quoted literal whose opening quote has already
// been consumed, up to and including the matching closing quote, and
// returns the characters in between. The other quote character is
// treated as ordinary content.
//
// Throws RelaxngException when the input ends before the closing quote
// or when XmlChar.IsInvalid () rejects a character.
private string ReadQuoted (char quoteChar)
{
	int length = 0;
	for (;;) {
		int ch = ReadChar ();

		// Only a real quote character can terminate the literal.
		bool isQuote = ch == '\'' || ch == '\"';
		if (isQuote && ch == quoteChar)
			break;

		if (ch < 0)
			throw new RelaxngException ("Unterminated quoted literal.");
		if (XmlChar.IsInvalid (ch))
			throw new RelaxngException ("Invalid character in literal.");
		AppendNameChar (ch, ref length);
	}

	return new string (nameBuffer, 0, length);
}
// Appends the code point 'c' to nameBuffer at position 'index' and
// advances 'index' past what was written. The buffer is doubled when
// it is full. Code points at or above 0x10000 do not fit in one char
// and are written as a UTF-16 surrogate pair.
private void AppendNameChar (int c, ref int index)
{
	// '>=' rather than '>': U+10000 itself needs a surrogate pair too;
	// a plain (char) cast would truncate it to U+0000.
	if (c >= 0x10000) {
		int offset = c - 0x10000;
		AppendNameChar (offset / 0x400 + 0xD800, ref index);
		AppendNameChar (offset % 0x400 + 0xDC00, ref index);
		return;
	}

	if (nameBuffer.Length == index) {
		char [] arr = new char [index * 2];
		Array.Copy (nameBuffer, arr, index);
		nameBuffer = arr;
	}
	nameBuffer [index++] = (char) c;
}
// Reads the body of a triple-quoted literal whose three opening quotes
// have already been consumed. Reading stops after three consecutive
// 'quoteChar' characters; one or two consecutive quote characters that
// are followed by something else are kept as content.
//
// Throws RelaxngException when the input ends before the terminator or
// when XmlChar.IsInvalid () rejects a character.
private string ReadTripleQuoted (char quoteChar)
{
	int index = 0;
	while (true) {
		int c = ReadChar ();
		if (c < 0)
			throw new RelaxngException ("Unterminated triple-quoted literal.");

		if (c != quoteChar) {
			if (XmlChar.IsInvalid (c))
				throw new RelaxngException ("Invalid character in literal.");
			AppendNameChar (c, ref index);
			continue;
		}

		// One quote consumed. If the next character is not a quote,
		// keep this one as content. The peeked character is NOT
		// appended here: it has not been consumed and the next loop
		// iteration reads (and validates) it. Appending it here as
		// well made it appear twice in the result.
		if (PeekChar () != quoteChar) {
			AppendNameChar (quoteChar, ref index);
			continue;
		}
		ReadChar ();

		// Two quotes consumed; a third one ends the literal.
		if (PeekChar () != quoteChar) {
			AppendNameChar (quoteChar, ref index);
			AppendNameChar (quoteChar, ref index);
			continue;
		}
		ReadChar ();
		break;
	}

	return new string (nameBuffer, 0, index);
}
// Reads one NCName starting at the next (peeked) character and returns
// it. Reading stops at the first character XmlChar.IsNCNameChar ()
// rejects, which is left unread, except that a terminating space, tab,
// CR, LF or end-of-input marker is consumed.
//
// Throws RelaxngException when the first character is not accepted by
// both XmlChar.IsFirstNameChar () and XmlChar.IsNCNameChar ().
private string ReadOneName ()
{
	int index = 0;
	int c = PeekChar ();
	if (!XmlChar.IsFirstNameChar (c) || !XmlChar.IsNCNameChar (c))
		throw new RelaxngException (String.Format ("Invalid NCName start character: {0}", c));

	while (true) {
		c = PeekChar ();
		if (c == -1 || c == ' ' || c == '\t' || c == '\r' || c == '\n') {
			ReadChar ();
			break;
		}
		if (!XmlChar.IsNCNameChar (c))
			break;

		ReadChar ();
		// Use the shared helper instead of a private copy of the buffer
		// growth code; unlike a bare (char) cast it does not truncate
		// code points that need two UTF-16 units.
		AppendNameChar (c, ref index);
	}

	return new string (nameBuffer, 0, index);
}
// Reads the rest of the current line directly from the underlying
// reader (bypassing peekChar and peekString), advances the line counter
// and returns what source.ReadLine () returned.
private string ReadLine ()
{
	string s = source.ReadLine ();
	line++;
	// No character of the new line has been read yet, so go back to 0,
	// the value the column field starts with. ReadChar () increments the
	// column for each character it returns, so resetting to 1 numbered
	// the first character after a skipped line as column 2.
	column = 0;
	return s;
}
// Skips leading whitespace, reads one token and returns its Token code.
// For literal segments and names the text is stored in tokenValue.
//
// 'backslashed' is true when the token was preceded by a backslash; in
// that case a plain name is returned as Token.QuotedIdentifier instead
// of being matched against the keyword table, and a second backslash or
// the end of input yields Token.ERROR.
//
// '#' starts a comment: the rest of the line is discarded and the next
// token is returned instead.
//
// Throws RelaxngException for a character that cannot start a token and
// for literal content rejected by XmlChar.IndexOfInvalid ().
private int ParseToken (bool backslashed)
{
	SkipWhitespaces ();
	int c = ReadChar ();
	string name;
	switch (c) {
	case -1:
		// A backslash must be followed by an identifier; reaching the
		// end of input instead is an error, not a clean EOF.
		return backslashed ? Token.ERROR : Token.EOF;
	case '=':
		return Token.Equal;
	case '~':
		return Token.Tilde;
	case ',':
		return Token.Comma;
	case '{':
		return Token.OpenCurly;
	case '}':
		return Token.CloseCurly;
	case '(':
		return Token.OpenParen;
	case ')':
		return Token.CloseParen;
	case '[':
		return Token.OpenBracket;
	case ']':
		return Token.CloseBracket;
	case '&':
		if (PeekChar () != '=')
			return Token.Amp;
		ReadChar ();
		return Token.AndEquals;
	case '|':
		if (PeekChar () != '=')
			return Token.Bar;
		ReadChar ();
		return Token.OrEquals;
	case '?':
		return Token.Question;
	case '*':
		// See also ':' for NsName
		return Token.Asterisk;
	case '\\':
		if (backslashed)
			return Token.ERROR;
		return ParseToken (true);
	case '+':
		return Token.Plus;
	case '-':
		return Token.Minus;
	case '>':
		if (PeekChar () == '>') {
			ReadChar ();
			return Token.TwoGreaters;
		}
		// A single '>' is handed to the default branch, which is
		// expected to reject it as a name character.
		peekChar = '>';
		goto default;
	case '#':
		// Comments are skipped; documentation comments are not
		// reported as tokens.
		ReadLine ();
		return ParseToken (false);
	case '\'':
	case '\"':
		if (PeekChar () != c)
			name = ReadQuoted ((char) c);
		else {
			ReadChar ();
			if (PeekChar () == c) {
				ReadChar ();
				name = ReadTripleQuoted ((char) c);
			}
			else {
				// '' or "": the empty literal. This must be an else
				// branch; assigning unconditionally discarded the
				// content of every triple-quoted literal.
				name = String.Empty;
			}
		}
		int invidx = XmlChar.IndexOfInvalid (name, true);
		if (invidx >= 0)
			throw new RelaxngException (String.Format ("Invalid XML character in compact syntax literal segment at {0:X}", (int) name [invidx]));
		tokenValue = name;
		return Token.LiteralSegment;
	default:
		if (!XmlChar.IsNCNameChar (c))
			throw new RelaxngException ("Invalid NCName character.");
		// Push the character back so ReadOneName () sees the whole name.
		peekChar = c;
		name = ReadOneName ();
		if (PeekChar () == ':') {
			ReadChar ();
			if (PeekChar () == '*') {
				// "prefix:*"
				ReadChar ();
				tokenValue = name;
				return Token.NsName;
			}
			// "prefix:local"
			tokenValue = name + ":" + ReadOneName ();
			return Token.CName;
		}
		tokenValue = name;
		if (backslashed)
			return Token.QuotedIdentifier;
		switch (name) {
		case "attribute":
			isElement = false;
			return Token.KeywordAttribute;
		case "element":
			isElement = true;
			return Token.KeywordElement;
		case "datatypes":
			return Token.KeywordDatatypes;
		case "default":
			return Token.KeywordDefault;
		case "div":
			return Token.KeywordDiv;
		case "empty":
			return Token.KeywordEmpty;
		case "external":
			return Token.KeywordExternal;
		case "grammar":
			return Token.KeywordGrammar;
		case "include":
			return Token.KeywordInclude;
		case "inherit":
			return Token.KeywordInherit;
		case "list":
			return Token.KeywordList;
		case "mixed":
			return Token.KeywordMixed;
		case "namespace":
			return Token.KeywordNamespace;
		case "notAllowed":
			return Token.KeywordNotAllowed;
		case "parent":
			return Token.KeywordParent;
		case "start":
			return Token.KeywordStart;
		case "string":
			return Token.KeywordString;
		case "text":
			return Token.KeywordText;
		case "token":
			return Token.KeywordToken;
		default:
			return Token.NCName;
		}
	}
}