1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
11 * The scanner is a low-level service class that knows
12 * how to consume characters out of an (internal) stream.
13 * This class also offers a series of utility methods
14 * that most tokenizers want, such as ReadUntil()
15 * and SkipWhitespace().
24 #include "nsIParser.h"
25 #include "nsIUnicodeDecoder.h"
26 #include "nsScannerString.h"
// Describes the set of terminating characters handed to the ReadUntil()
// overloads that take an aEndCondition argument.
// NOTE(review): this extract does not show the access specifiers, any further
// members, or the closing brace of the class - confirm against the full header.
30 class nsReadEndCondition
{
// Pointer to the terminating characters; presumably the aTerminateChars value
// given to the constructor - TODO confirm in the implementation.
32 const char16_t
*mChars
;
// explicit: a raw char16_t* does not convert implicitly into an end condition.
34 explicit nsReadEndCondition(const char16_t
* aTerminateChars
);
// Copying and assigning an nsReadEndCondition is not supported. The two
// operations are deleted outright, rather than declared and left undefined,
// so that an accidental copy is rejected by the compiler with a clear
// diagnostic instead of surfacing later as an unresolved symbol at link time.
36 nsReadEndCondition(const nsReadEndCondition& aOther) = delete; // No copying
37 void operator=(const nsReadEndCondition& aOther) = delete; // No assigning
44 * Use this constructor for the XML fragment parsing case
// Constructs a scanner over an in-memory string. The accompanying note says
// this is the constructor to use for the XML fragment parsing case.
46 explicit nsScanner(const nsAString
& anHTMLString
);
49 * Use this constructor if you want i/o to be based on
50 * a file (therefore a stream) or just data you provide via Append().
// Constructs a scanner whose i/o is based on a file (a stream) or on data
// supplied later through Append().
// NOTE(review): aCreateStream presumably selects whether a stream is opened
// for aFilename - confirm in the implementation.
52 nsScanner(nsString
& aFilename
, bool aCreateStream
);
57 * retrieve next char from internal input stream
59 * @update gess 3/25/98
60 * @param ch is the char to accept new value
61 * @return error code reflecting read status
// Retrieves the next char from the internal input stream into ch.
// Returns an error code reflecting the read status.
63 nsresult
GetChar(char16_t
& ch
);
66 * peek ahead at the next char in the scanner's internal buffer
69 * @update gess 3/25/98
70 * @param ch is the char to accept new value
71 * @return error code reflecting read status
// Peeks at a char in the scanner and stores it in ch; returns an error code
// reflecting the read status.
// NOTE(review): aOffset (default 0) is presumably counted from the current
// read position - confirm in the implementation.
73 nsresult
Peek(char16_t
& ch
, uint32_t aOffset
=0);
// String overload of Peek(); not documented in this extract.
// NOTE(review): presumably places aNumChars chars, starting aOffset chars
// ahead (default 0), into aStr - confirm in the implementation.
75 nsresult
Peek(nsAString
& aStr
, int32_t aNumChars
, int32_t aOffset
= 0);
78 * Skip over chars as long as they equal given char
80 * @update gess 3/25/98
81 * @param aSkipChar the char to be skipped
// Skips over input chars for as long as they equal aSkipChar.
84 nsresult
SkipOver(char16_t aSkipChar
);
87 * Skip whitespace on scanner input stream
89 * @update gess 3/25/98
90 * @return error status
// Skips whitespace on the scanner input stream and returns an error status.
// NOTE(review): aNewlinesSkipped presumably receives the number of newlines
// that were skipped - confirm in the implementation.
92 nsresult
SkipWhitespace(int32_t& aNewlinesSkipped
);
95 * Consume characters until you run into space, a '<', a '>', or a '/'.
97 * @param aString - receives new data from stream
// Consumes chars until a space, a '<', a '>' or a '/' is reached.
// aString receives the new data from the stream.
100 nsresult
ReadTagIdentifier(nsScannerSharedSubstring
& aString
);
103 * Consume characters until you run into a char that's not valid in an entity identifier.
106 * @param aString - receives new data from stream
// Consumes chars until one is reached that is not valid in what is being
// read (the original description is cut off in this extract; going by the
// method name, an entity identifier). aString receives the new data.
109 nsresult
ReadEntityIdentifier(nsString
& aString
);
// NOTE(review): undocumented in this extract. Presumably reads a number
// written in base aBase into aString - confirm in the implementation.
110 nsresult
ReadNumber(nsString
& aString
,int32_t aBase
);
// NOTE(review): this declaration is cut off in the extract (the parameter list
// never closes); consult the full header before relying on its signature.
111 nsresult
ReadWhitespace(nsScannerSharedSubstring
& aString
,
112 int32_t& aNewlinesSkipped
,
// Iterator overload of ReadWhitespace(); not documented in this extract.
// NOTE(review): presumably aStart/aEnd receive the bounds of the whitespace
// run and aNewlinesSkipped the number of newlines in it - confirm.
114 nsresult
ReadWhitespace(nsScannerIterator
& aStart
,
115 nsScannerIterator
& aEnd
,
116 int32_t& aNewlinesSkipped
);
119 * Consume characters until you find the terminal char
121 * @update gess 3/25/98
122 * @param aString receives new data from stream
123 * @param aTerminal contains terminating char
124 * @param addTerminal tells us whether to append terminal to aString
// Consumes chars until the terminal char is found; aString receives the new
// data from the stream.
// NOTE(review): the declaration is cut off in this extract - the remaining
// parameters (described nearby as aTerminal and addTerminal) are not visible.
127 nsresult
ReadUntil(nsAString
& aString
,
132 * Consume characters until you find one contained in the given set of terminating chars.
135 * @update gess 3/25/98
136 * @param aString receives new data from stream
137 * @param aEndCondition contains set of terminating chars
138 * @param addTerminal tells us whether to append terminal to aString
// Consumes chars until one contained in aEndCondition's set of terminating
// chars is found; aString receives the new data from the stream.
// NOTE(review): the declaration is cut off in this extract - any parameter
// following aEndCondition is not visible.
141 nsresult
ReadUntil(nsAString
& aString
,
142 const nsReadEndCondition
& aEndCondition
,
// ReadUntil() overload whose result is received by an nsScannerSharedSubstring.
// NOTE(review): the declaration is cut off in this extract - any parameter
// following aEndCondition is not visible.
145 nsresult
ReadUntil(nsScannerSharedSubstring
& aString
,
146 const nsReadEndCondition
& aEndCondition
,
// ReadUntil() overload that works with a pair of scanner iterators.
// NOTE(review): presumably aStart/aEnd receive the bounds of the consumed
// range; the declaration is cut off in this extract - confirm in the header.
149 nsresult
ReadUntil(nsScannerIterator
& aStart
,
150 nsScannerIterator
& aEnd
,
151 const nsReadEndCondition
& aEndCondition
,
155 * Records current offset position in input stream. This allows us
156 * to back up to this point if the need should arise, such as when
157 * tokenization gets interrupted.
159 * @update gess 5/12/98
166 * Resets current offset position of input stream to marked position.
167 * This allows us to back up to this point if the need should arise,
168 * such as when tokenization gets interrupted.
169 * NOTE: IT IS REALLY BAD FORM TO CALL RewindToMark() WITHOUT CALLING MARK FIRST!
171 * @update gess 5/12/98
// Resets the current offset position of the input stream to the marked
// position, so the scanner can back up, e.g. when tokenization is interrupted.
175 void RewindToMark(void);
181 * @update harishd 01/12/99
// NOTE(review): the description is not visible in this extract. Presumably
// puts aBuffer back into the scanner so it is read again - confirm.
185 bool UngetReadable(const nsAString
& aBuffer
);
190 * @update gess 5/13/98
// Supplies data to the scanner (the constructor notes mention providing data
// via Append()).
// NOTE(review): the detailed description is not visible in this extract.
194 nsresult
Append(const nsAString
& aBuffer
);
199 * @update gess 5/21/98
// Raw char-buffer overload of Append().
// NOTE(review): presumably aBuffer holds aLen undecoded bytes; the role of
// aRequest is not visible in this extract - confirm in the implementation.
203 nsresult
Append(const char* aBuffer
, uint32_t aLen
,
204 nsIRequest
*aRequest
);
207 * Call this to copy bytes out of the scanner that have not yet been consumed
208 * by the tokenization process.
210 * @update gess 5/12/98
211 * @param aCopyBuffer is where the scanner buffer will be copied to
// Copies out of the scanner the data that has not yet been consumed by the
// tokenization process; aCopyBuffer is where that data is copied to.
214 void CopyUnusedData(nsString
& aCopyBuffer
);
217 * Retrieve the name of the file that the scanner is reading from.
218 * In some cases, it's just a given name, because the scanner isn't
219 * really reading from a file.
221 * @update gess 5/12/98
// Retrieves the name of the file the scanner is reading from. In some cases
// it is just a given name, because the scanner is not really reading a file.
224 nsString
& GetFilename(void);
// NOTE(review): undocumented in this extract; presumably a built-in self test
// of the scanner - confirm in the implementation.
226 static void SelfTest();
229 * Use this setter to change the scanner's unicode decoder
231 * @update ftang 3/02/99
232 * @param aCharset a normalized (alias resolved) charset name
233 * @param aSource where the charset info came from
// Changes the scanner's unicode decoder.
// aCharset is a normalized (alias resolved) charset name; aSource says where
// the charset info came from.
236 nsresult
SetDocumentCharset(const nsACString
& aCharset
, int32_t aSource
);
// NOTE(review): undocumented in this extract. Presumably makes aSubstring
// refer to the scanner data between aStart and aEnd - confirm.
238 void BindSubstring(nsScannerSubstring
& aSubstring
, const nsScannerIterator
& aStart
, const nsScannerIterator
& aEnd
);
// NOTE(review): undocumented in this extract. Presumably stores the scanner's
// current read position (mCurrentPosition) into aPosition - confirm.
239 void CurrentPosition(nsScannerIterator
& aPosition
);
// NOTE(review): undocumented in this extract. Presumably stores the current
// end of the scanner buffer (mEndPosition) into aPosition - confirm.
240 void EndReading(nsScannerIterator
& aPosition
);
// NOTE(review): undocumented in this extract. Presumably moves the scanner's
// read position to aPosition; the meaning of aTruncate and aReverse (both
// default to false) is not visible here - confirm in the implementation.
241 void SetPosition(nsScannerIterator
& aPosition
,
242 bool aTruncate
= false,
243 bool aReverse
= false);
// NOTE(review): this declaration is cut off in the extract (only the aPosition
// parameter is visible); consult the full header for the complete signature.
244 void ReplaceCharacter(nsScannerIterator
& aPosition
,
248 * Internal method used to cause the internal buffer to
249 * be filled with data.
// Reports the current value of the mIncremental flag.
253 bool IsIncremental() { return mIncremental; }
// Stores anIncrValue as the new value of the mIncremental flag.
254 void SetIncremental(bool anIncrValue) { mIncremental = anIncrValue; }
257 * Return the position of the first non-whitespace
258 * character. This is only reliable before consumers start
259 * reading from this scanner.
// Returns mFirstNonWhitespacePosition, the position of the first
// non-whitespace character. Only reliable before consumers start reading
// from this scanner.
// NOTE(review): the braces of this inline body are missing from the extract.
261 int32_t FirstNonWhitespacePosition()
263 return mFirstNonWhitespacePosition
;
267 * Override replacement character used by nsIUnicodeDecoder.
268 * Default behavior is that it uses nsIUnicodeDecoder's mapping.
270 * @param aReplacementCharacter the replacement character
271 * XML (expat) parser uses 0xffff
// Overrides the replacement character used by nsIUnicodeDecoder; by default
// the decoder's own mapping is used. The XML (expat) parser uses 0xffff.
273 void OverrideReplacementCharacter(char16_t aReplacementCharacter
);
// Appends an already-allocated nsScannerString::Buffer to the scanner.
// NOTE(review): aErrorPos defaults to -1; presumably that means "no invalid
// character position", and aRequest's role is not visible here - confirm.
277 bool AppendToBuffer(nsScannerString::Buffer
*, nsIRequest
*aRequest
, int32_t aErrorPos
= -1);
// Convenience overload for string data: allocates a buffer from aStr with
// nsScannerString::AllocBufferFromString() and forwards it to the Buffer*
// overload with a null request (aErrorPos keeps its default).
// NOTE(review): the braces and some statements of this inline body (such as
// its return statements) are missing from the extract.
278 bool AppendToBuffer(const nsAString
& aStr
)
280 nsScannerString::Buffer
* buf
= nsScannerString::AllocBufferFromString(aStr
);
283 AppendToBuffer(buf
, nullptr);
// Scanner state.
// The scanner's buffer object; presumably what the iterators below point
// into - TODO confirm.
287 nsScannerString
* mSlidingBuffer
;
288 nsScannerIterator mCurrentPosition
; // The position we will next read from in the scanner buffer
289 nsScannerIterator mMarkPosition
; // The position last marked (we may rewind to here)
290 nsScannerIterator mEndPosition
; // The current end of the scanner buffer
291 nsScannerIterator mFirstInvalidPosition
; // The position of the first invalid character that was detected
293 uint32_t mCountRemaining
; // The number of bytes still to be read
294 // from the scanner buffer
// Presumably set once an invalid character has been detected (see
// mFirstInvalidPosition) - TODO confirm.
296 bool mHasInvalidCharacter
;
// Replacement character; see OverrideReplacementCharacter().
297 char16_t mReplacementCharacter
;
// Value returned by FirstNonWhitespacePosition().
298 int32_t mFirstNonWhitespacePosition
;
// Presumably the aSource value given to SetDocumentCharset() - TODO confirm.
299 int32_t mCharsetSource
;
// The unicode decoder that SetDocumentCharset() changes.
301 nsCOMPtr
<nsIUnicodeDecoder
> mUnicodeDecoder
;
// Assigning one nsScanner to another is not supported. The operator is
// deleted outright, rather than declared and left undefined, so that an
// accidental assignment is rejected by the compiler instead of surfacing
// later as an unresolved symbol at link time.
304 nsScanner& operator=(const nsScanner&) = delete; // Assignment is not supported.