Merge from mainline (gomp-merge-2005-02-26).
[official-gcc.git] / libjava / java / io / StreamTokenizer.java
blobb5bd38f2d78f58b54c67fb1cc621d8e7599354bf
1 /* StreamTokenizer.java -- parses streams of characters into tokens
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package java.io;
/**
 * This class parses streams of characters into tokens.  There are a
 * number of flags and per-character attributes that can be set to
 * control the parsing, as described under the various method headings.
 *
 * @author Warren Levy (warrenl@cygnus.com)
 * @date October 25, 1998.
 */
/* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
 * "The Java Language Specification", ISBN 0-201-63451-1
 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
 * Status:  Believed complete and correct.
 */

public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /**
   * Contains the type of the token read resulting from a call to nextToken.
   * The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the
   *     value of that character.</li>
   * <li>For a quoted string, this is the value of the quote character</li>
   * <li>For a word, this is TT_WORD</li>
   * <li>For a number, this is TT_NUMBER</li>
   * <li>For the end of the line, this is TT_EOL</li>
   * <li>For the end of the stream, this is TT_EOF</li>
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables of each byte from 0x00 to 0xFF. */
  private boolean[] whitespace = new boolean[256];
  private boolean[] alphabetic = new boolean[256];
  private boolean[] numeric = new boolean[256];
  private boolean[] quote = new boolean[256];
  private boolean[] comment = new boolean[256];

  /* The Reader associated with this class. */
  private PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader. */
  private int lineNumber = 1;

  /**
   * This method reads bytes from an <code>InputStream</code> and tokenizes
   * them.  The stream is wrapped in an <code>InputStreamReader</code>; for
   * details on the default parsing tables, see
   * <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * This method initializes a new <code>StreamTokenizer</code> to read
   * characters from a <code>Reader</code> and parse them.  Only characters
   * in the range 0x0000 to 0x00FF have configurable attributes; any
   * character above 0x00FF is always treated as alphabetic.
   * <p>
   * This constructor sets up the parsing table to parse the stream in the
   * following manner:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are initialized as alphabetic</li>
   * <li>The values 0x00 through 0x20 are initialized as whitespace</li>
   * <li>The values '\'' and '"' are initialized as quote characters</li>
   * <li>'/' is a comment character</li>
   * <li>Numbers will be parsed</li>
   * <li>EOL is not treated as significant</li>
   * <li>C and C++ (//) comments are not recognized</li>
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method sets the comment attribute on the specified
   * character.  Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as an int
   */
  public void commentChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        comment[ch] = true;
        whitespace[ch] = false;
        alphabetic[ch] = false;
        numeric[ch] = false;
        quote[ch] = false;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence terminates and is a token.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> if EOL is significant, <code>false</code>
   *             otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number.  Note that if the
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not alphabetic
   * tokens that are returned should be converted to lower case.
   *
   * @param flag <code>true</code> to convert to lower case,
   *             <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* Attribute lookups.  All of them are safe to call with TT_EOF (-1). */

  private boolean isWhitespace(int ch)
  {
    return (ch >= 0 && ch <= 255 && whitespace[ch]);
  }

  /* Characters above 0xFF have no table entry and always count as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
  }

  private boolean isNumeric(int ch)
  {
    return (ch >= 0 && ch <= 255 && numeric[ch]);
  }

  private boolean isQuote(int ch)
  {
    return (ch >= 0 && ch <= 255 && quote[ch]);
  }

  private boolean isComment(int ch)
  {
    return (ch >= 0 && ch <= 255 && comment[ch]);
  }

  /**
   * This method reads the next token from the stream.  It sets the
   * <code>ttype</code> variable to the appropriate token type and
   * returns it.  It also can set <code>sval</code> or <code>nval</code>
   * as described below.  The parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters.</li>
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value.  Leading '-' characters indicate a numeric only if followed by
   * another non-'-' numeric.  The value of the numeric token is terminated
   * by either the first non-numeric encountered, or the second occurrence of
   * '-' or '.'.  The token type returned is TT_NUMBER and <code>nval</code>
   * is set to the value parsed.  If the collected characters do not form
   * a valid number (for example a lone '.'), <code>nval</code> is 0.</li>
   * <li>If an alphabetic character is parsed, all subsequent characters
   * are read until the first non-alphabetic and non-numeric character is
   * encountered.  The token type returned is TT_WORD and the value parsed
   * is stored in <code>sval</code>.  If lower case mode is set, the token
   * stored in <code>sval</code> is converted to lower case.  Any character
   * with a non-alphabetic and non-numeric attribute (such as white space,
   * a quote, or a comment) terminates the word.</li>
   * <li>If a comment character is parsed, then all remaining characters on
   * the current line are skipped and another token is parsed.  Any EOL or
   * EOF's encountered are not discarded, but rather terminate the comment.</li>
   * <li>If a quote character is parsed, then all characters up to the
   * second occurrence of the same quote character (or the end of the line
   * or stream) are parsed into a <code>String</code>.  This
   * <code>String</code> is stored as <code>sval</code>, but is not converted
   * to lower case, even if lower case mode is enabled.  The token type
   * returned is the value of the quote character encountered.  The escape
   * sequences \a (bell), \b (backspace), \t (HTAB), \n (linefeed),
   * \v (VTAB), \f (form feed), \r (carriage return) and \XXX (octal escape)
   * are converted to the appropriate char values.  For any other escaped
   * character the backslash is dropped and the character itself is kept.
   * Unicode escapes like ('\ u0000') are not recognized.</li>
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.</li>
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence are discarded and another
   * token is parsed.</li>
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself.  The char encountered
   * is returned as the token type.</li>
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        // A push back before the first read has nothing to replay.
        if (ttype != TT_NONE)
          return ttype;
      }

    sval = null;
    int ch;

    // Skip whitespace.  Deal with EOL along the way.
    while (isWhitespace(ch = in.read()))
      if (ch == '\n' || ch == '\r')
        {
          consumeLineTerminator(ch);
          if (eolSignificant)
            return (ttype = TT_EOL);
        }

    if (ch == '/')
      {
        int next = in.read();
        if (next == '/' && slashSlash)
          {
            skipToEndOfLine();
            return nextToken(); // Recursive, but not too deep in normal cases
          }
        else if (next == '*' && slashStar)
          {
            skipBlockComment();
            return nextToken(); // Recursive, but not too deep in normal cases
          }
        else if (next != TT_EOF)
          {
            // Not a comment opener: hand the look-ahead back and treat the
            // '/' according to its table attributes below.
            in.unread(next);
          }
      }

    if (ch == TT_EOF)
      ttype = TT_EOF;
    else if (isNumeric(ch))
      ttype = parseNumber(ch);
    else if (isAlphabetic(ch))
      ttype = parseWord(ch);
    else if (isComment(ch))
      {
        skipToEndOfLine();
        return nextToken(); // Recursive, but not too deep in normal cases.
      }
    else if (isQuote(ch))
      {
        // ttype must be set first: it is the terminator parseQuoted looks for.
        ttype = ch;
        parseQuoted();
      }
    else
      ttype = ch;

    return ttype;
  }

  /* Counts a line break whose first character CH ('\n' or '\r') has been
     read, and swallows the '\n' of a "\r\n" pair so it is counted once. */
  private void consumeLineTerminator(int ch) throws IOException
  {
    lineNumber++;
    if (ch == '\r')
      {
        int next = in.read();
        if (next != '\n' && next != TT_EOF)
          in.unread(next);
      }
  }

  /* Discards characters up to, but not including, the next '\n', '\r'
     or the end of the stream. */
  private void skipToEndOfLine() throws IOException
  {
    int ch;
    while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF)
      ;
    if (ch != TT_EOF)
      in.unread(ch);
  }

  /* Discards the body of a C style comment whose opening sequence has
     already been read, keeping the line count up to date.  An unterminated
     comment ends at the end of the stream. */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == TT_EOF)
          return;
        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              return;
            // Re-examine the look-ahead: it may be another '*' or an EOL.
            if (next != TT_EOF)
              in.unread(next);
          }
        else if (ch == '\n' || ch == '\r')
          consumeLineTerminator(ch);
      }
  }

  /* Parses a number whose first (numeric) character CH has been read and
     stores it in nval.  Returns TT_NUMBER, or '-' when CH is a '-' that
     does not start a number. */
  private int parseNumber(int ch) throws IOException
  {
    boolean isNegative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              in.unread(ch);
            return '-';
          }
        isNegative = true;
      }

    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);

    // A leading '.' already is the number's one permitted decimal point.
    int decCount = (ch == '.') ? 1 : 0;
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.' && decCount++ > 0)
          break;
        tokbuf.append((char) ch);
      }

    if (ch != TT_EOF)
      in.unread(ch);

    try
      {
        nval = Double.valueOf(tokbuf.toString()).doubleValue();
      }
    catch (NumberFormatException ignored)
      {
        // Deliberate: text such as a lone "." is reported as the number 0.
        nval = 0.0;
      }
    if (isNegative)
      nval = -nval;
    return TT_NUMBER;
  }

  /* Parses a word whose first character CH has been read and stores it
     in sval, lower-cased if lower case mode is on.  Returns TT_WORD. */
  private int parseWord(int ch) throws IOException
  {
    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);
    while (isAlphabetic(ch = in.read()) || isNumeric(ch))
      tokbuf.append((char) ch);
    if (ch != TT_EOF)
      in.unread(ch);

    sval = tokbuf.toString();
    if (lowerCase)
      sval = sval.toLowerCase();
    return TT_WORD;
  }

  /* Parses the body of a quoted string into sval.  The opening quote has
     been read and stored in ttype.  The string ends at the matching quote
     (which is consumed), or at an unescaped '\n', '\r' or end of stream
     (which is left for the next token). */
  private void parseQuoted() throws IOException
  {
    StringBuffer tokbuf = new StringBuffer();
    int ch;
    while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          ch = readEscape();
        // NOTE: a backslash directly followed by end of stream appends
        // (char) -1 here, as the original implementation did.
        tokbuf.append((char) ch);
      }

    // Throw away matching quote char.
    if (ch != ttype && ch != TT_EOF)
      in.unread(ch);

    sval = tokbuf.toString();
  }

  /* Reads the remainder of an escape sequence (the backslash has been
     consumed) and returns the character value it denotes. */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      default:
        break;
      }

    // Anything that is not an octal digit stands for itself.  Nothing has
    // been read ahead in this case, so nothing may be pushed back.
    if (ch < '0' || ch > '7')
      return ch;

    // Octal escape: up to three digits, the third only if the first is
    // in '0'..'3' (allows \377 but not \477).
    int first = ch;
    int value = ch - '0';
    int next = in.read();
    if (next >= '0' && next <= '7')
      {
        value = value * 8 + next - '0';
        next = in.read();
        if (next >= '0' && next <= '7' && first <= '3')
          {
            value = value * 8 + next - '0';
            next = in.read();
          }
      }

    // NEXT is the first character after the escape; give it back.
    if (next != TT_EOF)
      in.unread(next);
    return value;
  }

  /* Clears every attribute of CH, which must be in the range 0 to 255. */
  private void resetChar(int ch)
  {
    whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
      false;
  }

  /**
   * This method makes the specified character an ordinary character.  This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character.  This character will
   * parse as its own token.  Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      resetChar(ch);
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary.  This means that none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range.  This makes each character in this
   * range parse as its own token.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      resetChar(i);
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'.  Other attributes of these characters are
   * left unchanged.
   */
  public void parseNumbers()
  {
    for (int i = 0; i <= 9; i++)
      numeric['0' + i] = true;

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method sets the quote attribute on the specified character.
   * Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an int.
   */
  public void quoteChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        quote[ch] = true;
        comment[ch] = false;
        whitespace[ch] = false;
        alphabetic[ch] = false;
        numeric[ch] = false;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters.  It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language style
   * comments ("//" comments through EOL ) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C++" style
   *             comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language style
   * comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C" style comments,
   *             <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number and
   * 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"</li>
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li>
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li>
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.</li>
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code></li>
   * <li>For all other cases, 'x' is <code>ttype</code> as a character
   * enclosed in single quotes</li>
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      tempstr = "EOF";
    else if (ttype == TT_EOL)
      tempstr = "EOL";
    else if (ttype == TT_WORD)
      tempstr = sval;
    else if (ttype == TT_NUMBER)
      tempstr = "n=" + nval;
    else if (ttype == TT_NONE)
      tempstr = "NOTHING";
    else if (isQuote(ttype))
      tempstr = sval; // a quoted string token reports its contents
    else // must be an ordinary char.
      tempstr = "\'" + (char) ttype + "\'";

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included.  All other attributes of
   * those characters are cleared.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to set the whitespace
   *            attribute for
   * @param hi The high end of the range of values to set the whitespace
   *           attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included.  Other attributes of
   * those characters are left unchanged.  The range is clipped to
   * 0 through 255.
   *
   * @param low The low end of the range of values to set the alphabetic
   *            attribute for
   * @param hi The high end of the range of values to set the alphabetic
   *           attribute for
   */
  public void wordChars(int low, int hi)
  {
    if (low < 0)
      low = 0;
    if (hi > 255)
      hi = 255;
    for (int i = low; i <= hi; i++)
      alphabetic[i] = true;
  }
}