1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=78: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 //#define __INCREMENTAL 1
11 #include "mozilla/Attributes.h"
12 #include "mozilla/DebugOnly.h"
13 #include "mozilla/Encoding.h"
15 #include "nsReadableUtils.h"
16 #include "nsUTF8Utils.h" // for LossyConvertEncoding
19 #include "nsCharsetSource.h"
21 nsReadEndCondition::nsReadEndCondition(const char16_t
* aTerminateChars
)
22 : mChars(aTerminateChars
),
23 mFilter(char16_t(~0)) // All bits set
25 // Build filter that will be used to filter out characters with
26 // bits that none of the terminal chars have. This works very well
27 // because terminal chars often have only the last 4-6 bits set and
28 // normal ascii letters have bit 7 set. Other letters have even higher
32 const char16_t
* current
= aTerminateChars
;
33 char16_t terminalChar
= *current
;
34 while (terminalChar
) {
35 mFilter
&= ~terminalChar
;
37 terminalChar
= *current
;
42 * Use this constructor if you want i/o to be based on
43 * a single string you hand in during construction.
44 * This short cut was added for Javascript.
46 * @update gess 5/12/98
47 * @param aMode represents the parser mode (nav, other)
50 nsScanner::nsScanner(const nsAString
& anHTMLString
) {
51 MOZ_COUNT_CTOR(nsScanner
);
53 mSlidingBuffer
= nullptr;
54 if (AppendToBuffer(anHTMLString
)) {
55 mSlidingBuffer
->BeginReading(mCurrentPosition
);
57 /* XXX see hack below, re: bug 182067 */
58 memset(&mCurrentPosition
, 0, sizeof(mCurrentPosition
));
59 mEndPosition
= mCurrentPosition
;
61 mMarkPosition
= mCurrentPosition
;
63 mUnicodeDecoder
= nullptr;
64 mCharsetSource
= kCharsetUninitialized
;
68 * Use this constructor if you want i/o to be based on strings
69 * the scanner receives. If you pass a null filename, you
70 * can still provide data to the scanner via append.
72 nsScanner::nsScanner(nsString
& aFilename
, bool aCreateStream
)
73 : mFilename(aFilename
) {
74 MOZ_COUNT_CTOR(nsScanner
);
75 NS_ASSERTION(!aCreateStream
, "This is always true.");
77 mSlidingBuffer
= nullptr;
79 // XXX This is a big hack. We need to initialize the iterators to something.
80 // What matters is that mCurrentPosition == mEndPosition, so that our methods
81 // believe that we are at EOF (see bug 182067). We null out mCurrentPosition
82 // so that we have some hope of catching null pointer dereferences associated
83 // with this hack. --darin
84 memset(&mCurrentPosition
, 0, sizeof(mCurrentPosition
));
85 mMarkPosition
= mCurrentPosition
;
86 mEndPosition
= mCurrentPosition
;
90 mUnicodeDecoder
= nullptr;
91 mCharsetSource
= kCharsetUninitialized
;
92 // XML defaults to UTF-8 and about:blank is UTF-8, too.
93 SetDocumentCharset(UTF_8_ENCODING
, kCharsetFromDocTypeDefault
);
96 nsresult
nsScanner::SetDocumentCharset(NotNull
<const Encoding
*> aEncoding
,
98 if (aSource
< mCharsetSource
) // priority is lower than the current one
101 mCharsetSource
= aSource
;
102 nsCString charsetName
;
103 aEncoding
->Name(charsetName
);
104 if (!mCharset
.IsEmpty() && charsetName
.Equals(mCharset
)) {
105 return NS_OK
; // no difference, don't change it
108 // different, need to change it
110 mCharset
.Assign(charsetName
);
112 mUnicodeDecoder
= aEncoding
->NewDecoderWithBOMRemoval();
120 * @update gess 3/25/98
124 nsScanner::~nsScanner() {
125 delete mSlidingBuffer
;
127 MOZ_COUNT_DTOR(nsScanner
);
131 * Resets current offset position of input stream to marked position.
132 * This allows us to back up to this point if the need should arise,
133 * such as when tokenization gets interrupted.
134 * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
136 * @update gess 5/12/98
140 void nsScanner::RewindToMark(void) {
141 if (mSlidingBuffer
) {
142 mCurrentPosition
= mMarkPosition
;
147 * Records current offset position in input stream. This allows us
148 * to back up to this point if the need should arise, such as when
149 * tokenization gets interrupted.
151 * @update gess 7/29/98
155 int32_t nsScanner::Mark() {
156 int32_t distance
= 0;
157 if (mSlidingBuffer
) {
158 nsScannerIterator oldStart
;
159 mSlidingBuffer
->BeginReading(oldStart
);
161 distance
= Distance(oldStart
, mCurrentPosition
);
163 mSlidingBuffer
->DiscardPrefix(mCurrentPosition
);
164 mSlidingBuffer
->BeginReading(mCurrentPosition
);
165 mMarkPosition
= mCurrentPosition
;
172 * Insert data to our underlying input buffer as
173 * if it were read from an input stream.
175 * @update harishd 01/12/99
178 bool nsScanner::UngetReadable(const nsAString
& aBuffer
) {
179 if (!mSlidingBuffer
) {
183 mSlidingBuffer
->UngetReadable(aBuffer
, mCurrentPosition
);
184 mSlidingBuffer
->BeginReading(
185 mCurrentPosition
); // Insertion invalidated our iterators
186 mSlidingBuffer
->EndReading(mEndPosition
);
192 * Append data to our underlying input buffer as
193 * if it were read from an input stream.
198 nsresult
nsScanner::Append(const nsAString
& aBuffer
) {
199 if (!AppendToBuffer(aBuffer
)) return NS_ERROR_OUT_OF_MEMORY
;
206 * @update gess 5/21/98
210 nsresult
nsScanner::Append(const char* aBuffer
, uint32_t aLen
) {
211 nsresult res
= NS_OK
;
212 if (mUnicodeDecoder
) {
213 CheckedInt
<size_t> needed
= mUnicodeDecoder
->MaxUTF16BufferLength(aLen
);
214 if (!needed
.isValid()) {
215 return NS_ERROR_OUT_OF_MEMORY
;
217 CheckedInt
<uint32_t> allocLen(1); // null terminator due to legacy sadness
218 allocLen
+= needed
.value();
219 if (!allocLen
.isValid()) {
220 return NS_ERROR_OUT_OF_MEMORY
;
222 nsScannerString::Buffer
* buffer
=
223 nsScannerString::AllocBuffer(allocLen
.value());
224 NS_ENSURE_TRUE(buffer
, NS_ERROR_OUT_OF_MEMORY
);
225 char16_t
* unichars
= buffer
->DataStart();
230 Tie(result
, read
, written
) =
231 mUnicodeDecoder
->DecodeToUTF16WithoutReplacement(
232 AsBytes(Span(aBuffer
, aLen
)), Span(unichars
, needed
.value()),
233 false); // Retain bug about failure to handle EOF
234 MOZ_ASSERT(result
!= kOutputFull
);
235 MOZ_ASSERT(read
<= aLen
);
236 MOZ_ASSERT(written
<= needed
.value());
237 if (result
!= kInputEmpty
) {
238 // Since about:blank is empty, this line runs only for XML. Use a
239 // character that's illegal in XML instead of U+FFFD in order to make
240 // expat flag the error. There is no need to loop and convert more, since
241 // expat will stop here anyway.
242 unichars
[written
++] = 0xFFFF;
244 buffer
->SetDataLength(written
);
245 // Don't propagate return code of unicode decoder
246 // since it doesn't reflect on our success or failure
249 if (!AppendToBuffer(buffer
)) res
= NS_ERROR_OUT_OF_MEMORY
;
251 NS_WARNING("No decoder found.");
252 res
= NS_ERROR_FAILURE
;
259 * retrieve next char from scanners internal input stream
261 * @update gess 3/25/98
263 * @return error code reflecting read status
265 nsresult
nsScanner::GetChar(char16_t
& aChar
) {
266 if (!mSlidingBuffer
|| mCurrentPosition
== mEndPosition
) {
268 return NS_ERROR_HTMLPARSER_EOF
;
271 aChar
= *mCurrentPosition
++;
276 void nsScanner::BindSubstring(nsScannerSubstring
& aSubstring
,
277 const nsScannerIterator
& aStart
,
278 const nsScannerIterator
& aEnd
) {
279 aSubstring
.Rebind(*mSlidingBuffer
, aStart
, aEnd
);
282 void nsScanner::CurrentPosition(nsScannerIterator
& aPosition
) {
283 aPosition
= mCurrentPosition
;
286 void nsScanner::EndReading(nsScannerIterator
& aPosition
) {
287 aPosition
= mEndPosition
;
290 void nsScanner::SetPosition(nsScannerIterator
& aPosition
, bool aTerminate
) {
291 if (mSlidingBuffer
) {
292 mCurrentPosition
= aPosition
;
293 if (aTerminate
&& (mCurrentPosition
== mEndPosition
)) {
294 mMarkPosition
= mCurrentPosition
;
295 mSlidingBuffer
->DiscardPrefix(mCurrentPosition
);
300 bool nsScanner::AppendToBuffer(nsScannerString::Buffer
* aBuf
) {
301 if (!mSlidingBuffer
) {
302 mSlidingBuffer
= new nsScannerString(aBuf
);
303 if (!mSlidingBuffer
) return false;
304 mSlidingBuffer
->BeginReading(mCurrentPosition
);
305 mMarkPosition
= mCurrentPosition
;
306 mSlidingBuffer
->EndReading(mEndPosition
);
308 mSlidingBuffer
->AppendBuffer(aBuf
);
309 if (mCurrentPosition
== mEndPosition
) {
310 mSlidingBuffer
->BeginReading(mCurrentPosition
);
312 mSlidingBuffer
->EndReading(mEndPosition
);
319 * call this to copy bytes out of the scanner that have not yet been consumed
320 * by the tokenization process.
322 * @update gess 5/12/98
323 * @param aCopyBuffer is where the scanner buffer will be copied to
324 * @return true if OK or false on OOM
326 bool nsScanner::CopyUnusedData(nsString
& aCopyBuffer
) {
327 if (!mSlidingBuffer
) {
328 aCopyBuffer
.Truncate();
332 nsScannerIterator start
, end
;
333 start
= mCurrentPosition
;
336 return CopyUnicodeTo(start
, end
, aCopyBuffer
);
340 * Retrieve the name of the file that the scanner is reading from.
341 * In some cases, it's just a given name, because the scanner isn't
342 * really reading from a file.
344 * @update gess 5/12/98
347 nsString
& nsScanner::GetFilename(void) { return mFilename
; }
350 * Conduct self test. Actually, selftesting for this class
351 * occurs in the parser selftest.
353 * @update gess 3/25/98
358 void nsScanner::SelfTest(void) {