2 // System.Xml.XPath.Tokenizer
5 // Piers Haken (piersh@friskit.com)
7 // (C) 2002 Piers Haken
11 // Permission is hereby granted, free of charge, to any person obtaining
12 // a copy of this software and associated documentation files (the
13 // "Software"), to deal in the Software without restriction, including
14 // without limitation the rights to use, copy, modify, merge, publish,
15 // distribute, sublicense, and/or sell copies of the Software, and to
16 // permit persons to whom the Software is furnished to do so, subject to
17 // the following conditions:
19 // The above copyright notice and this permission notice shall be
20 // included in all copies or substantial portions of the Software.
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
23 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
24 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
25 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
26 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
27 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
28 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 using System
.Globalization
;
34 using System
.Collections
;
36 using Mono
.Xml
.XPath
.yyParser
;
38 namespace System
.Xml
.XPath
// Tokenizer for XPath expressions. It is declared against
// Mono.Xml.XPath.yyParser.yyInput and supplies that type's methods
// (advance, value) so the parser can pull tokens from it.
40 internal class Tokenizer
: Mono
.Xml
.XPath
.yyParser
.yyInput
// The expression text being tokenized (assigned in the constructor).
42 private string m_rgchInput
;
// Token code recorded by the previous advance(); it starts out as Token.EOF,
// which is the value IsFirstToken tests for.
46 private int m_iTokenPrev
= Token
.EOF
;
// Value attached to the current token: a double, a string or an
// XmlQualifiedName, depending on which Parse* method assigned it.
47 private Object m_objToken
;
// Operator flags. m_fThisIsOperator is cleared at the start of advance() and
// set while the current token is parsed; advance() then copies it into
// m_fPrevWasOperator for use when classifying the next token.
48 private bool m_fPrevWasOperator
= false;
49 private bool m_fThisIsOperator
= false;
// Lookup from keyword text to token code, filled from s_rgTokenMap.
50 private static readonly Hashtable s_mapTokens
= new Hashtable ();
// Flat table of keyword tokens laid out as alternating (token code, keyword
// text) entries. The static initialization loop reads it two entries at a
// time to populate s_mapTokens.
// NOTE(review): the embedded numbering skips lines inside this table, so the
// entries visible here may not be the complete list - confirm against the
// full file.
51 private static readonly Object
[] s_rgTokenMap
=
57 Token
.ANCESTOR
, "ancestor",
58 Token
.ANCESTOR_OR_SELF
, "ancestor-or-self",
59 Token
.ATTRIBUTE
, "attribute",
61 Token
.DESCENDANT
, "descendant",
62 Token
.DESCENDANT_OR_SELF
, "descendant-or-self",
63 Token
.FOLLOWING
, "following",
64 Token
.FOLLOWING_SIBLING
, "following-sibling",
65 Token
.NAMESPACE
, "namespace",
66 Token
.PARENT
, "parent",
67 Token
.PRECEDING
, "preceding",
68 Token
.PRECEDING_SIBLING
, "preceding-sibling",
70 Token
.COMMENT
, "comment",
72 Token
.PROCESSING_INSTRUCTION
, "processing-instruction",
// Sentinel character with the value '\0'.
// NOTE(review): presumably what Peek yields once the input is exhausted; the
// line that would show this is not visible here - confirm.
75 private const char EOL
= '\0';
// Walks s_rgTokenMap in steps of two and registers each pair in s_mapTokens,
// using the keyword text (odd index) as the key and the token code (even
// index) as the value.
// NOTE(review): the header of the enclosing member is not visible in this
// view; presumably this is the static constructor - confirm.
79 for (int i
= 0; i
< s_rgTokenMap
.Length
; i
+= 2)
80 s_mapTokens
.Add (s_rgTokenMap
[i
+ 1], s_rgTokenMap
[i
]);
// Creates a tokenizer over strInput: keeps the text in m_rgchInput and caches
// its length in m_cch.
// strInput.Length is read directly, so a null argument fails here; no null
// check is visible in this view.
83 public Tokenizer (string strInput
)
85 //Console.WriteLine ("Tokenizing: " + strInput);
86 m_rgchInput
= strInput
;
88 m_cch
= strInput
.Length
;
// Looks ahead without consuming: returns the character iOffset positions past
// the current index m_ich.
// The bounds test covers positions at or beyond the input length m_cch.
// NOTE(review): the statement executed when that test is true is not visible
// here; presumably it returns EOL - confirm.
92 private char Peek (int iOffset
)
94 if (m_ich
+ iOffset
>= m_cch
)
96 return m_rgchInput
[m_ich
+ iOffset
];
// Consumes one character: returns the character at m_ich and then moves m_ich
// forward by one (post-increment).
// NOTE(review): no end-of-input guard is visible in this view - confirm
// whether one exists on the lines not shown.
104 private char GetChar ()
108 return m_rgchInput
[m_ich
++];
// Un-reads one character: moves m_ich back by one (pre-decrement) and returns
// the character now at that index.
// NOTE(review): the condition guarding the throw is not visible in this view;
// presumably it rejects stepping back from the start of the input - confirm.
111 private char PutBack ()
114 throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
115 return m_rgchInput
[--m_ich
];
// Skips a run of whitespace (as defined by IsWhitespace) at the current
// position. The first test handles the case where there is nothing to skip;
// the loop then runs for as long as the next character is whitespace.
// NOTE(review): the loop body and the return statements are not visible in
// this view - confirm.
118 private bool SkipWhitespace () // returns true if any whitespace was skipped
120 if (!IsWhitespace (Peek ()))
123 while (IsWhitespace (Peek ()))
// Reads a numeric token: accumulates a run of digits, then one further
// character (presumably the decimal point) followed by another run of digits,
// and stores the parsed value in m_objToken as a double.
// Parsing uses NumberFormatInfo.InvariantInfo, so the result does not depend
// on the current culture.
// NOTE(review): the test that guards the second part and the returned token
// code are on lines not visible here - confirm.
129 private int ParseNumber ()
131 StringBuilder sb
= new StringBuilder ();
133 while (IsDigit (Peek ()))
134 sb
.Append ((char) GetChar ());
136 // don't handle '3.' as an error case (it is not. XPath 3.7 syntax [30])
139 sb
.Append ((char) GetChar ());
140 while (IsDigit (Peek ()))
141 sb
.Append ((char) GetChar ());
143 m_objToken
= Double
.Parse (sb
.ToString (), NumberFormatInfo
.InvariantInfo
);
// Reads a quoted string literal. The first character consumed is the opening
// quote (chInit); characters are then appended until Peek() yields that same
// quote character again. The accumulated text becomes m_objToken and
// Token.LITERAL is returned.
// NOTE(review): the test guarding the "unmatched" exception is not visible in
// this view; presumably it fires when the input ends before the closing
// quote - confirm. Consumption of the closing quote is not visible either.
147 private int ParseLiteral ()
149 StringBuilder sb
= new StringBuilder ();
151 char chInit
= GetChar ();
153 while ((ch
= Peek ()) != chInit
)
156 throw new XPathException ("unmatched "+chInit
+" in expression");
157 sb
.Append ((char) GetChar ());
160 m_objToken
= sb
.ToString ();
161 return Token
.LITERAL
;
// Reads a name at the current position and returns it as a string.
// The first character must be a letter or '_'; after that, letters, digits,
// '_', '-' and '.' are accepted.
// NOTE(review): what is returned when the first character does not qualify is
// not visible here; ParseIdentifier compares the result with null, so
// presumably null - confirm.
164 private string ReadIdentifier ()
166 StringBuilder sb
= new StringBuilder ();
169 if (!Char
.IsLetter (ch
) && ch
!= '_')
172 sb
.Append ((char) GetChar ());
174 while ((ch
= Peek ()) == '_' || ch
== '-' || ch
== '.' || Char
.IsLetterOrDigit (ch
))
175 sb
.Append ((char) GetChar ());
178 return sb
.ToString ();
// Reads an identifier and classifies it. The text is looked up in
// s_mapTokens: a hit yields that keyword's token code, a miss yields
// Token.QName, and m_objToken is first set to the raw text.
// The branches below apply the XPath disambiguation rules quoted in the
// inline comments (axis name, operator name, node type / function name).
// They throw XPathException when the name is not valid in that position and
// wrap names in XmlQualifiedName in several branches.
// NOTE(review): the conditions that select each branch are on lines not
// visible in this view - confirm against the full file.
181 private int ParseIdentifier ()
183 string strToken
= ReadIdentifier ();
184 Object objToken
= s_mapTokens
[strToken
];
186 int iToken
= (objToken
!= null) ? (int) objToken
: Token
.QName
;
187 m_objToken
= strToken
;
194 // If the two characters following an NCName (possibly
195 // after intervening ExprWhitespace) are ::, then the
196 // token must be recognized as an AxisName.
197 if (objToken
== null || !IsAxisName (iToken
))
198 throw new XPathException ("invalid axis name: '"+strToken
+"'");
209 m_objToken
= new XmlQualifiedName ("", strToken
);
212 string strToken2
= ReadIdentifier ();
213 if (strToken2
== null)
214 throw new XPathException ("invalid QName: "+strToken
+":"+(char)ch
);
// The name read second is passed as the name argument and the name read first
// as the namespace argument (presumably the prefix before ':' - confirm).
217 m_objToken
= new XmlQualifiedName (strToken2
, strToken
);
219 return Token
.FUNCTION_NAME
;
223 // If there is a preceding token and the preceding
224 // token is not one of @, ::, (, [, , or an Operator,
225 // then a * must be recognized as a MultiplyOperator
226 // and an NCName must be recognized as an OperatorName.
227 if (!IsFirstToken
&& !m_fPrevWasOperator
)
229 if (objToken
== null || !IsOperatorName (iToken
))
230 throw new XPathException ("invalid operator name: '"+strToken
+"'");
236 // If the character following an NCName (possibly
237 // after intervening ExprWhitespace) is (, then the
238 // token must be recognized as a NodeType or a FunctionName.
239 if (objToken
== null)
241 m_objToken
= new XmlQualifiedName (strToken
, "");
242 return Token
.FUNCTION_NAME
;
244 if (IsNodeType (iToken
))
246 throw new XPathException ("invalid function name: '"+strToken
+"'");
249 m_objToken
= new XmlQualifiedName (strToken
, "");
// True for the four characters tested explicitly: space, tab, line feed and
// carriage return. This is narrower than Char.IsWhiteSpace, which is kept
// below only as a commented-out alternative.
253 private static bool IsWhitespace (char ch
)
255 // return Char.IsWhiteSpace (ch);
256 return (ch
== ' ' || ch
== '\t' || ch
== '\n' || ch
== '\r');
// True only for the ASCII digits '0' through '9'. Char.IsDigit (commented out
// below) would also accept other Unicode decimal digits.
259 private static bool IsDigit (char ch
)
261 // return Char.IsDigit (ch);
262 return ch
>= '0' && ch
<= '9';
// Fragment of the main token dispatch, which advance() calls as ParseToken.
// The member header and the per-character branch conditions are on lines not
// visible in this view.
// What the visible lines show:
//  - many branches set m_fThisIsOperator before returning, including the
//    BRACKET_OPEN and PAREN_OPEN branches; no flag assignment is visible
//    before the BRACKET_CLOSE and PAREN_CLOSE returns;
//  - numeric input is handed to ParseNumber and quoted input to ParseLiteral;
//  - a letter or '_' starts an NCName, which is handed to ParseIdentifier;
//  - anything that matches no branch ends in the "invalid token" exception.
275 m_fThisIsOperator
= true;
291 else if (IsDigit (Peek ()))
294 return ParseNumber ();
302 m_fThisIsOperator
= true;
309 m_fThisIsOperator
= true;
314 m_fThisIsOperator
= true;
319 m_fThisIsOperator
= true;
321 return Token
.BRACKET_OPEN
;
325 return Token
.BRACKET_CLOSE
;
328 m_fThisIsOperator
= true;
330 return Token
.PAREN_OPEN
;
334 return Token
.PAREN_CLOSE
;
337 m_fThisIsOperator
= true;
342 m_fThisIsOperator
= true;
// MULTIPLY is chosen (and flagged as an operator) only when a previous token
// exists and that token was not itself an operator; otherwise ASTERISK is
// returned.
348 if (!IsFirstToken
&& !m_fPrevWasOperator
)
350 m_fThisIsOperator
= true;
351 return Token
.MULTIPLY
;
353 return Token
.ASTERISK
;
357 m_fThisIsOperator
= true;
361 m_fThisIsOperator
= true;
366 m_fThisIsOperator
= true;
374 m_fThisIsOperator
= true;
381 m_fThisIsOperator
= true;
391 m_fThisIsOperator
= true;
401 return ParseLiteral ();
404 return ParseLiteral ();
409 return ParseNumber ();
// An identifier that ParseIdentifier classifies as an operator name also
// marks the current token as an operator.
411 else if (Char
.IsLetter (ch
) || ch
== '_') // NCName
413 int iToken
= ParseIdentifier ();
414 if (IsOperatorName (iToken
))
415 m_fThisIsOperator
= true;
420 throw new XPathException ("invalid token: '"+ch
+"'");
423 ///////////////////////////
424 // yyParser.yyInput methods
425 ///////////////////////////
427 /** move on to next token.
428 @return false if positioned beyond tokens.
429 @throws IOException on input error.
// Moves to the next token: clears the operator flag, parses one token into
// m_iToken, skips the whitespace that follows it, then records the token code
// and its operator flag as the "previous" state used to classify the next
// token. Returns false once the token just read is Token.EOF.
// fWhitespace is assigned but not read on any line visible in this view.
431 public bool advance ()
433 m_fThisIsOperator
= false;
435 m_iToken
= ParseToken ();
436 bool fWhitespace
= SkipWhitespace ();
437 m_iTokenPrev
= m_iToken
;
438 m_fPrevWasOperator
= m_fThisIsOperator
;
439 return (m_iToken
!= Token
.EOF
);
442 /** classifies current token.
443 Should not be called if advance() returned false.
444 @return current %token or single character.
451 /** associated with current token.
452 Should not be called if advance() returned false.
453 @return value for token().
// yyInput accessor for the value associated with the current token.
// NOTE(review): the body is not visible in this view; presumably it returns
// m_objToken - confirm.
455 public Object
value ()
// True when the previous token code is Token.EOF, which is the initial value
// of m_iTokenPrev, i.e. before advance() has recorded a first token.
459 private bool IsFirstToken { get { return m_iTokenPrev == Token.EOF; }
}
// Classifies a token code as a node-type keyword using case labels.
// NOTE(review): only the PROCESSING_INSTRUCTION label is visible in this view;
// the remaining labels and the return statements are on lines not shown.
461 private bool IsNodeType (int iToken
)
467 case Token
.PROCESSING_INSTRUCTION
:
// Tells whether a token code is an operator name; called from ParseIdentifier
// and from the NCName branch of the token dispatch.
// NOTE(review): the body is not visible in this view.
474 private bool IsOperatorName (int iToken
)
// Classifies a token code as an axis name using case labels; ParseIdentifier
// throws "invalid axis name" when this rejects a keyword used as an axis.
// NOTE(review): the embedded numbering skips lines between the labels and the
// member continues past the last visible line, so the label list shown here
// may be incomplete and the return statements are not visible - confirm.
487 private bool IsAxisName (int iToken
)
491 case Token
.ATTRIBUTE
:
493 case Token
.ANCESTOR_OR_SELF
:
495 case Token
.DESCENDANT
:
496 case Token
.DESCENDANT_OR_SELF
:
497 case Token
.FOLLOWING
:
498 case Token
.FOLLOWING_SIBLING
:
499 case Token
.NAMESPACE
:
501 case Token
.PRECEDING
:
502 case Token
.PRECEDING_SIBLING
: