// **** Merged from MCS ****
// [mono-project.git] / mcs / class / System.XML / System.Xml.XPath / Tokenizer.cs
// blob 9c1677006240a616ed3db136e8acb16f8d693597
//
// System.Xml.XPath.Tokenizer
//
// Author:
//   Piers Haken (piersh@friskit.com)
//
// (C) 2002 Piers Haken
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
using System;
using System.Collections;
using System.Globalization;
using System.IO;
using System.Text;
using Mono.Xml.XPath;
using Mono.Xml.XPath.yyParser;

namespace System.Xml.XPath
{
    /// <summary>
    /// Splits an XPath expression string into tokens and exposes them to the
    /// parser through the <c>yyInput</c> members <c>advance</c>,
    /// <c>token</c> and <c>value</c>.
    /// </summary>
    internal class Tokenizer : Mono.Xml.XPath.yyParser.yyInput
    {
        // Expression text being tokenized.
        private string m_rgchInput;
        // Index of the next unread character in m_rgchInput.
        private int m_ich;
        // Length of m_rgchInput.
        private int m_cch;
        // Token code produced by the most recent advance ().
        private int m_iToken;
        // Token code produced by the advance () before that; stays Token.EOF
        // until a first token has been read (see IsFirstToken).
        private int m_iTokenPrev = Token.EOF;
        // Value attached to the current token (boxed double, string or
        // XmlQualifiedName), or null when the token carries none.
        private Object m_objToken;
        // True when the previous token was flagged as an operator.
        private bool m_fPrevWasOperator = false;
        // Set by ParseToken when the token being produced is flagged as an operator.
        private bool m_fThisIsOperator = false;

        // Keyword text -> token code, filled once from s_rgTokenMap.
        private static readonly Hashtable s_mapTokens = new Hashtable ();

        // Flat list of (token code, keyword text) pairs.
        private static readonly Object [] s_rgTokenMap =
        {
            Token.AND, "and",
            Token.OR, "or",
            Token.DIV, "div",
            Token.MOD, "mod",
            Token.ANCESTOR, "ancestor",
            Token.ANCESTOR_OR_SELF, "ancestor-or-self",
            Token.ATTRIBUTE, "attribute",
            Token.CHILD, "child",
            Token.DESCENDANT, "descendant",
            Token.DESCENDANT_OR_SELF, "descendant-or-self",
            Token.FOLLOWING, "following",
            Token.FOLLOWING_SIBLING, "following-sibling",
            Token.NAMESPACE, "namespace",
            Token.PARENT, "parent",
            Token.PRECEDING, "preceding",
            Token.PRECEDING_SIBLING, "preceding-sibling",
            Token.SELF, "self",
            Token.COMMENT, "comment",
            Token.TEXT, "text",
            Token.PROCESSING_INSTRUCTION, "processing-instruction",
            Token.NODE, "node",
        };

        // Sentinel returned by Peek/GetChar once the input is exhausted.
        // A literal NUL inside the expression is therefore treated as end of input.
        private const char EOL = '\0';

        /// <summary>Builds the keyword lookup table from the flat pair list.</summary>
        static Tokenizer ()
        {
            // Entries come in pairs: [i] is the token code, [i + 1] the keyword.
            for (int i = 0; i < s_rgTokenMap.Length; i += 2)
            {
                s_mapTokens.Add (s_rgTokenMap [i + 1], s_rgTokenMap [i]);
            }
        }

        /// <summary>
        /// Creates a tokenizer over <paramref name="strInput"/> and skips any
        /// leading whitespace.
        /// </summary>
        /// <param name="strInput">XPath expression text; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="strInput"/> is null.</exception>
        public Tokenizer (string strInput)
        {
            if (strInput == null)
            {
                throw new ArgumentNullException ("strInput");
            }

            m_rgchInput = strInput;
            m_ich = 0;
            m_cch = strInput.Length;
            SkipWhitespace ();
        }

        /// <summary>
        /// Returns the character <paramref name="iOffset"/> positions past the
        /// read position without consuming it, or <c>EOL</c> past the end.
        /// </summary>
        private char Peek (int iOffset)
        {
            int ich = m_ich + iOffset;
            if (ich >= m_cch)
            {
                return EOL;
            }
            return m_rgchInput [ich];
        }

        /// <summary>Returns the next unread character without consuming it.</summary>
        private char Peek ()
        {
            return Peek (0);
        }

        /// <summary>
        /// Consumes and returns the next character, or returns <c>EOL</c>
        /// (consuming nothing) once the input is exhausted.
        /// </summary>
        private char GetChar ()
        {
            if (m_ich >= m_cch)
            {
                return EOL;
            }
            return m_rgchInput [m_ich++];
        }

        /// <summary>
        /// Moves the read position back one character and returns the
        /// character now at that position.
        /// </summary>
        /// <exception cref="XPathException">The read position is already at the start.</exception>
        private char PutBack ()
        {
            if (m_ich == 0)
            {
                throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
            }
            return m_rgchInput [--m_ich];
        }

        /// <summary>Consumes a run of whitespace characters.</summary>
        /// <returns>true if any whitespace was skipped.</returns>
        private bool SkipWhitespace ()
        {
            if (!IsWhitespace (Peek ()))
            {
                return false;
            }

            while (IsWhitespace (Peek ()))
            {
                GetChar ();
            }

            return true;
        }

        /// <summary>
        /// Reads digits, optionally followed by '.' and more digits, stores
        /// the parsed double in the token value and returns <c>Token.NUMBER</c>.
        /// </summary>
        private int ParseNumber ()
        {
            StringBuilder sb = new StringBuilder ();

            while (IsDigit (Peek ()))
            {
                sb.Append (GetChar ());
            }

            // don't handle '3.' as an error case (it is not. XPath 3.7 syntax [30])
            if (Peek () == '.')
            {
                sb.Append (GetChar ());
                while (IsDigit (Peek ()))
                {
                    sb.Append (GetChar ());
                }
            }

            // Parse culture-independently so '.' is always the decimal separator.
            m_objToken = Double.Parse (sb.ToString (), NumberFormatInfo.InvariantInfo);
            return Token.NUMBER;
        }

        /// <summary>
        /// Reads a quoted literal. The opening quote character determines the
        /// closing one; the text between them (quotes excluded) becomes the
        /// token value.
        /// </summary>
        /// <exception cref="XPathException">The input ends before the closing quote.</exception>
        private int ParseLiteral ()
        {
            StringBuilder sb = new StringBuilder ();

            char chInit = GetChar ();
            char ch;
            while ((ch = Peek ()) != chInit)
            {
                if (ch == EOL)
                {
                    throw new XPathException ("unmatched "+chInit+" in expression");
                }
                sb.Append (GetChar ());
            }

            // Consume the closing quote.
            GetChar ();
            m_objToken = sb.ToString ();
            return Token.LITERAL;
        }

        /// <summary>
        /// Reads an identifier: a letter or '_' followed by any run of
        /// letters, digits, '_', '-' or '.'. Whitespace after the identifier
        /// is skipped as well.
        /// </summary>
        /// <returns>
        /// The identifier text, or null (with nothing consumed) when the next
        /// character cannot start an identifier.
        /// </returns>
        private string ReadIdentifier ()
        {
            StringBuilder sb = new StringBuilder ();

            char ch = Peek ();
            if (!Char.IsLetter (ch) && ch != '_')
            {
                return null;
            }

            sb.Append (GetChar ());

            while ((ch = Peek ()) == '_' || ch == '-' || ch == '.' || Char.IsLetterOrDigit (ch))
            {
                sb.Append (GetChar ());
            }

            SkipWhitespace ();
            return sb.ToString ();
        }

        /// <summary>
        /// Reads a name and classifies it as an axis name, operator name,
        /// node type, function name or QName, using the characters that
        /// follow it and the kind of the preceding token.
        /// </summary>
        /// <returns>The token code; the token value is set accordingly.</returns>
        /// <exception cref="XPathException">
        /// The name is not valid in the position where it appears.
        /// </exception>
        private int ParseIdentifier ()
        {
            string strToken = ReadIdentifier ();
            Object objToken = s_mapTokens [strToken];

            int iToken = (objToken != null) ? (int) objToken : Token.QName;
            m_objToken = strToken;

            char ch = Peek ();
            if (ch == ':')
            {
                if (Peek (1) == ':')
                {
                    // If the two characters following an NCName (possibly
                    // after intervening ExprWhitespace) are ::, then the
                    // token must be recognized as an AxisName.
                    if (objToken == null || !IsAxisName (iToken))
                    {
                        throw new XPathException ("invalid axis name: '"+strToken+"'");
                    }
                    // The "::" itself is left for the next ParseToken call.
                    return iToken;
                }

                // Single ':' - strToken is a prefix; consume the colon.
                GetChar ();
                SkipWhitespace ();
                ch = Peek ();

                if (ch == '*')
                {
                    // "prefix:*" - empty local name, prefix in the second argument.
                    GetChar ();
                    m_objToken = new XmlQualifiedName ("", strToken);
                    return Token.QName;
                }
                string strToken2 = ReadIdentifier ();
                if (strToken2 == null)
                {
                    throw new XPathException ("invalid QName: "+strToken+":"+(char)ch);
                }

                ch = Peek ();
                m_objToken = new XmlQualifiedName (strToken2, strToken);
                if (ch == '(')
                {
                    return Token.FUNCTION_NAME;
                }
                return Token.QName;
            }

            // If there is a preceding token and the preceding
            // token is not one of @, ::, (, [, , or an Operator,
            // then a * must be recognized as a MultiplyOperator
            // and an NCName must be recognized as an OperatorName.
            if (!IsFirstToken && !m_fPrevWasOperator)
            {
                if (objToken == null || !IsOperatorName (iToken))
                {
                    throw new XPathException ("invalid operator name: '"+strToken+"'");
                }
                return iToken;
            }

            if (ch == '(')
            {
                // If the character following an NCName (possibly
                // after intervening ExprWhitespace) is (, then the
                // token must be recognized as a NodeType or a FunctionName.
                if (objToken == null)
                {
                    m_objToken = new XmlQualifiedName (strToken, "");
                    return Token.FUNCTION_NAME;
                }
                if (IsNodeType (iToken))
                {
                    return iToken;
                }
                throw new XPathException ("invalid function name: '"+strToken+"'");
            }

            m_objToken = new XmlQualifiedName (strToken, "");
            return Token.QName;
        }

        /// <summary>True for space, tab, line feed and carriage return only.</summary>
        private static bool IsWhitespace (char ch)
        {
            // Deliberately narrower than Char.IsWhiteSpace: only these four
            // characters are accepted.
            return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
        }

        /// <summary>True for the ASCII digits '0' through '9' only.</summary>
        private static bool IsDigit (char ch)
        {
            // Deliberately narrower than Char.IsDigit: only ASCII digits are accepted.
            return ch >= '0' && ch <= '9';
        }

        /// <summary>
        /// Reads one token starting at the current read position. Sets
        /// <c>m_fThisIsOperator</c> for tokens treated as operators and
        /// <c>m_objToken</c> for tokens that carry a value.
        /// </summary>
        /// <returns>The token code, or <c>Token.EOF</c> at end of input.</returns>
        /// <exception cref="XPathException">The input cannot start any token.</exception>
        private int ParseToken ()
        {
            char ch = Peek ();
            switch (ch)
            {
                case EOL:
                    return Token.EOF;

                case '/':
                    m_fThisIsOperator = true;
                    GetChar ();
                    if (Peek () == '/')
                    {
                        GetChar ();
                        return Token.SLASH2;
                    }
                    return Token.SLASH;

                case '.':
                    GetChar ();
                    if (Peek () == '.')
                    {
                        GetChar ();
                        return Token.DOT2;
                    }
                    else if (IsDigit (Peek ()))
                    {
                        // ".5" style number: give the '.' back so ParseNumber sees it.
                        PutBack ();
                        return ParseNumber ();
                    }
                    return Token.DOT;

                case ':':
                    GetChar ();
                    if (Peek () == ':')
                    {
                        m_fThisIsOperator = true;
                        GetChar ();
                        return Token.COLON2;
                    }
                    // A lone ':' is reported to the parser as an error token.
                    return Token.ERROR;

                case ',':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.COMMA;

                case '@':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.AT;

                case '[':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.BRACKET_OPEN;

                case ']':
                    GetChar ();
                    return Token.BRACKET_CLOSE;

                case '(':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.PAREN_OPEN;

                case ')':
                    GetChar ();
                    return Token.PAREN_CLOSE;

                case '+':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.PLUS;

                case '-':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.MINUS;

                case '*':
                    GetChar ();
                    // '*' is multiplication only when it follows a non-operator
                    // token; otherwise it is the wildcard name test.
                    if (!IsFirstToken && !m_fPrevWasOperator)
                    {
                        m_fThisIsOperator = true;
                        return Token.MULTIPLY;
                    }
                    return Token.ASTERISK;

                case '$':
                    GetChar ();
                    m_fThisIsOperator = true;
                    return Token.DOLLAR;

                case '|':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.BAR;

                case '=':
                    m_fThisIsOperator = true;
                    GetChar ();
                    return Token.EQ;

                case '!':
                    GetChar ();
                    if (Peek () == '=')
                    {
                        m_fThisIsOperator = true;
                        GetChar ();
                        return Token.NE;
                    }
                    // '!' on its own is invalid; fall out to the throw below.
                    break;

                case '>':
                    m_fThisIsOperator = true;
                    GetChar ();
                    if (Peek () == '=')
                    {
                        GetChar ();
                        return Token.GE;
                    }
                    return Token.GT;

                case '<':
                    m_fThisIsOperator = true;
                    GetChar ();
                    if (Peek () == '=')
                    {
                        GetChar ();
                        return Token.LE;
                    }
                    return Token.LT;

                case '\'':
                    return ParseLiteral ();

                case '\"':
                    return ParseLiteral ();

                default:
                    if (IsDigit (ch))
                    {
                        return ParseNumber ();
                    }
                    else if (Char.IsLetter (ch) || ch == '_') // NCName
                    {
                        int iToken = ParseIdentifier ();
                        if (IsOperatorName (iToken))
                        {
                            m_fThisIsOperator = true;
                        }
                        return iToken;
                    }
                    break;
            }
            throw new XPathException ("invalid token: '"+ch+"'");
        }

        ///////////////////////////
        // yyParser.yyInput methods
        ///////////////////////////

        /// <summary>
        /// Moves on to the next token, skips the whitespace after it and
        /// records the token as the "previous" one for the next call.
        /// </summary>
        /// <returns>false if positioned beyond the last token.</returns>
        /// <exception cref="XPathException">The input does not form a valid token.</exception>
        public bool advance ()
        {
            m_fThisIsOperator = false;
            m_objToken = null;
            // ParseToken still sees the previous token's state here; the
            // "previous" fields are only updated after it returns.
            m_iToken = ParseToken ();
            SkipWhitespace ();
            m_iTokenPrev = m_iToken;
            m_fPrevWasOperator = m_fThisIsOperator;
            return (m_iToken != Token.EOF);
        }

        /// <summary>
        /// Classifies the current token.
        /// Should not be called if advance () returned false.
        /// </summary>
        /// <returns>The current token code.</returns>
        public int token ()
        {
            return m_iToken;
        }

        /// <summary>
        /// Value associated with the current token.
        /// Should not be called if advance () returned false.
        /// </summary>
        /// <returns>The value for token (), or null if it has none.</returns>
        public Object value ()
        {
            return m_objToken;
        }

        // True until a token other than Token.EOF has been recorded as the previous token.
        private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; } }

        /// <summary>True if the token code is one of the node-type keywords.</summary>
        private static bool IsNodeType (int iToken)
        {
            switch (iToken)
            {
                case Token.COMMENT:
                case Token.TEXT:
                case Token.PROCESSING_INSTRUCTION:
                case Token.NODE:
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>True if the token code is one of and, or, mod, div.</summary>
        private static bool IsOperatorName (int iToken)
        {
            switch (iToken)
            {
                case Token.AND:
                case Token.OR:
                case Token.MOD:
                case Token.DIV:
                    return true;
                default:
                    return false;
            }
        }

        /// <summary>True if the token code is one of the axis-name keywords.</summary>
        private static bool IsAxisName (int iToken)
        {
            switch (iToken)
            {
                case Token.ATTRIBUTE:
                case Token.ANCESTOR:
                case Token.ANCESTOR_OR_SELF:
                case Token.CHILD:
                case Token.DESCENDANT:
                case Token.DESCENDANT_OR_SELF:
                case Token.FOLLOWING:
                case Token.FOLLOWING_SIBLING:
                case Token.NAMESPACE:
                case Token.PARENT:
                case Token.PRECEDING:
                case Token.PRECEDING_SIBLING:
                case Token.SELF:
                    return true;
                default:
                    return false;
            }
        }
    }
}