1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=78: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 // #define __INCREMENTAL 1
11 #include "mozilla/Attributes.h"
12 #include "mozilla/DebugOnly.h"
13 #include "mozilla/Encoding.h"
14 #include "mozilla/UniquePtr.h"
16 #include "nsReadableUtils.h"
17 #include "nsUTF8Utils.h" // for LossyConvertEncoding
20 #include "nsCharsetSource.h"
22 nsReadEndCondition::nsReadEndCondition(const char16_t
* aTerminateChars
)
23 : mChars(aTerminateChars
),
24 mFilter(char16_t(~0)) // All bits set
26 // Build filter that will be used to filter out characters with
27 // bits that none of the terminal chars have. This works very well
28 // because terminal chars often have only the last 4-6 bits set and
29 // normal ascii letters have bit 7 set. Other letters have even higher
33 const char16_t
* current
= aTerminateChars
;
34 char16_t terminalChar
= *current
;
35 while (terminalChar
) {
36 mFilter
&= ~terminalChar
;
38 terminalChar
= *current
;
43 * Use this constructor if you want i/o to be based on
44 * a single string you hand in during construction.
45 * This short cut was added for Javascript.
47 * @update gess 5/12/98
48 * @param aMode represents the parser mode (nav, other)
51 nsScanner::nsScanner(const nsAString
& anHTMLString
, bool aIncremental
)
52 : mIncremental(aIncremental
) {
53 MOZ_COUNT_CTOR(nsScanner
);
55 AppendToBuffer(anHTMLString
);
56 MOZ_ASSERT(mMarkPosition
== mCurrentPosition
);
60 * Use this constructor if you want i/o to be based on strings
61 * the scanner receives. If you pass a null filename, you
62 * can still provide data to the scanner via append.
64 nsScanner::nsScanner(nsIURI
* aURI
) : mURI(aURI
), mIncremental(true) {
65 MOZ_COUNT_CTOR(nsScanner
);
67 // XXX This is a big hack. We need to initialize the iterators to something.
68 // What matters is that mCurrentPosition == mEndPosition, so that our methods
69 // believe that we are at EOF (see bug 182067). We null out mCurrentPosition
70 // so that we have some hope of catching null pointer dereferences associated
71 // with this hack. --darin
72 memset(&mCurrentPosition
, 0, sizeof(mCurrentPosition
));
73 mMarkPosition
= mCurrentPosition
;
74 mEndPosition
= mCurrentPosition
;
76 // XML defaults to UTF-8 and about:blank is UTF-8, too.
77 SetDocumentCharset(UTF_8_ENCODING
, kCharsetFromDocTypeDefault
);
80 nsresult
nsScanner::SetDocumentCharset(NotNull
<const Encoding
*> aEncoding
,
82 if (aSource
< mCharsetSource
) // priority is lower than the current one
85 mCharsetSource
= aSource
;
86 nsCString charsetName
;
87 aEncoding
->Name(charsetName
);
88 if (!mCharset
.IsEmpty() && charsetName
.Equals(mCharset
)) {
89 return NS_OK
; // no difference, don't change it
92 // different, need to change it
94 mCharset
.Assign(charsetName
);
96 mUnicodeDecoder
= aEncoding
->NewDecoderWithBOMRemoval();
104 * @update gess 3/25/98
108 nsScanner::~nsScanner() { MOZ_COUNT_DTOR(nsScanner
); }
111 * Resets current offset position of input stream to marked position.
112 * This allows us to back up to this point if the need should arise,
113 * such as when tokenization gets interrupted.
114 * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
116 * @update gess 5/12/98
120 void nsScanner::RewindToMark(void) {
121 if (mSlidingBuffer
) {
122 mCurrentPosition
= mMarkPosition
;
127 * Records current offset position in input stream. This allows us
128 * to back up to this point if the need should arise, such as when
129 * tokenization gets interrupted.
131 * @update gess 7/29/98
135 int32_t nsScanner::Mark() {
136 int32_t distance
= 0;
137 if (mSlidingBuffer
) {
138 nsScannerIterator oldStart
;
139 mSlidingBuffer
->BeginReading(oldStart
);
141 distance
= Distance(oldStart
, mCurrentPosition
);
143 mSlidingBuffer
->DiscardPrefix(mCurrentPosition
);
144 mSlidingBuffer
->BeginReading(mCurrentPosition
);
145 mMarkPosition
= mCurrentPosition
;
152 * Insert data to our underlying input buffer as
153 * if it were read from an input stream.
155 * @update harishd 01/12/99
158 bool nsScanner::UngetReadable(const nsAString
& aBuffer
) {
159 if (!mSlidingBuffer
) {
163 mSlidingBuffer
->UngetReadable(aBuffer
, mCurrentPosition
);
164 mSlidingBuffer
->BeginReading(
165 mCurrentPosition
); // Insertion invalidated our iterators
166 mSlidingBuffer
->EndReading(mEndPosition
);
172 * Append data to our underlying input buffer as
173 * if it were read from an input stream.
178 nsresult
nsScanner::Append(const nsAString
& aBuffer
) {
179 if (!AppendToBuffer(aBuffer
)) return NS_ERROR_OUT_OF_MEMORY
;
186 * @update gess 5/21/98
190 nsresult
nsScanner::Append(const char* aBuffer
, uint32_t aLen
) {
191 nsresult res
= NS_OK
;
192 if (mUnicodeDecoder
) {
193 mozilla::CheckedInt
<size_t> needed
=
194 mUnicodeDecoder
->MaxUTF16BufferLength(aLen
);
195 if (!needed
.isValid()) {
196 return NS_ERROR_OUT_OF_MEMORY
;
198 mozilla::CheckedInt
<uint32_t> allocLen(
199 1); // null terminator due to legacy sadness
200 allocLen
+= needed
.value();
201 if (!allocLen
.isValid()) {
202 return NS_ERROR_OUT_OF_MEMORY
;
204 nsScannerString::Buffer
* buffer
=
205 nsScannerString::AllocBuffer(allocLen
.value());
206 NS_ENSURE_TRUE(buffer
, NS_ERROR_OUT_OF_MEMORY
);
207 char16_t
* unichars
= buffer
->DataStart();
212 // Do not use structured binding lest deal with [-Werror=unused-variable]
213 std::tie(result
, read
, written
) =
214 mUnicodeDecoder
->DecodeToUTF16WithoutReplacement(
215 AsBytes(mozilla::Span(aBuffer
, aLen
)),
216 mozilla::Span(unichars
, needed
.value()),
217 false); // Retain bug about failure to handle EOF
218 MOZ_ASSERT(result
!= mozilla::kOutputFull
);
219 MOZ_ASSERT(read
<= aLen
);
220 MOZ_ASSERT(written
<= needed
.value());
221 if (result
!= mozilla::kInputEmpty
) {
222 // Since about:blank is empty, this line runs only for XML. Use a
223 // character that's illegal in XML instead of U+FFFD in order to make
224 // expat flag the error. There is no need to loop and convert more, since
225 // expat will stop here anyway.
226 unichars
[written
++] = 0xFFFF;
228 buffer
->SetDataLength(written
);
229 // Don't propagate return code of unicode decoder
230 // since it doesn't reflect on our success or failure
233 AppendToBuffer(buffer
);
235 NS_WARNING("No decoder found.");
236 res
= NS_ERROR_FAILURE
;
243 * retrieve next char from scanners internal input stream
245 * @update gess 3/25/98
247 * @return error code reflecting read status
249 nsresult
nsScanner::GetChar(char16_t
& aChar
) {
250 if (!mSlidingBuffer
|| mCurrentPosition
== mEndPosition
) {
252 return NS_ERROR_HTMLPARSER_EOF
;
255 aChar
= *mCurrentPosition
++;
260 void nsScanner::BindSubstring(nsScannerSubstring
& aSubstring
,
261 const nsScannerIterator
& aStart
,
262 const nsScannerIterator
& aEnd
) {
263 aSubstring
.Rebind(*mSlidingBuffer
, aStart
, aEnd
);
266 void nsScanner::CurrentPosition(nsScannerIterator
& aPosition
) {
267 aPosition
= mCurrentPosition
;
270 void nsScanner::EndReading(nsScannerIterator
& aPosition
) {
271 aPosition
= mEndPosition
;
274 void nsScanner::SetPosition(nsScannerIterator
& aPosition
, bool aTerminate
) {
275 if (mSlidingBuffer
) {
276 mCurrentPosition
= aPosition
;
277 if (aTerminate
&& (mCurrentPosition
== mEndPosition
)) {
278 mMarkPosition
= mCurrentPosition
;
279 mSlidingBuffer
->DiscardPrefix(mCurrentPosition
);
284 void nsScanner::AppendToBuffer(nsScannerString::Buffer
* aBuf
) {
285 if (!mSlidingBuffer
) {
286 mSlidingBuffer
= mozilla::MakeUnique
<nsScannerString
>(aBuf
);
287 mSlidingBuffer
->BeginReading(mCurrentPosition
);
288 mMarkPosition
= mCurrentPosition
;
290 mSlidingBuffer
->AppendBuffer(aBuf
);
291 if (mCurrentPosition
== mEndPosition
) {
292 mSlidingBuffer
->BeginReading(mCurrentPosition
);
295 mSlidingBuffer
->EndReading(mEndPosition
);
299 * call this to copy bytes out of the scanner that have not yet been consumed
300 * by the tokenization process.
302 * @update gess 5/12/98
303 * @param aCopyBuffer is where the scanner buffer will be copied to
304 * @return true if OK or false on OOM
306 bool nsScanner::CopyUnusedData(nsString
& aCopyBuffer
) {
307 if (!mSlidingBuffer
) {
308 aCopyBuffer
.Truncate();
312 nsScannerIterator start
, end
;
313 start
= mCurrentPosition
;
316 return CopyUnicodeTo(start
, end
, aCopyBuffer
);
320 * Conduct self test. Actually, selftesting for this class
321 * occurs in the parser selftest.
323 * @update gess 3/25/98
328 void nsScanner::SelfTest(void) {