Bug 1700051: part 33) Move `AdjustSoftBeginAndBuildSoftText` to `SoftText`. r=smaug
[gecko.git] / parser / htmlparser / nsScanner.cpp
blob7095ef9392caef2c127490eda85e95f4e7803626
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=2 sw=2 et tw=78: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 //#define __INCREMENTAL 1
9 #include "nsScanner.h"
11 #include "mozilla/Attributes.h"
12 #include "mozilla/DebugOnly.h"
13 #include "mozilla/Encoding.h"
14 #include "nsDebug.h"
15 #include "nsReadableUtils.h"
16 #include "nsUTF8Utils.h" // for LossyConvertEncoding
17 #include "nsCRT.h"
18 #include "nsParser.h"
19 #include "nsCharsetSource.h"
21 nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars)
22 : mChars(aTerminateChars),
23 mFilter(char16_t(~0)) // All bits set
25 // Build filter that will be used to filter out characters with
26 // bits that none of the terminal chars have. This works very well
27 // because terminal chars often have only the last 4-6 bits set and
28 // normal ascii letters have bit 7 set. Other letters have even higher
29 // bits set.
31 // Calculate filter
32 const char16_t* current = aTerminateChars;
33 char16_t terminalChar = *current;
34 while (terminalChar) {
35 mFilter &= ~terminalChar;
36 ++current;
37 terminalChar = *current;
41 /**
42 * Use this constructor if you want i/o to be based on
43 * a single string you hand in during construction.
44 * This short cut was added for Javascript.
46 * @update gess 5/12/98
47 * @param aMode represents the parser mode (nav, other)
48 * @return
50 nsScanner::nsScanner(const nsAString& anHTMLString) {
51 MOZ_COUNT_CTOR(nsScanner);
53 mSlidingBuffer = nullptr;
54 if (AppendToBuffer(anHTMLString)) {
55 mSlidingBuffer->BeginReading(mCurrentPosition);
56 } else {
57 /* XXX see hack below, re: bug 182067 */
58 memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
59 mEndPosition = mCurrentPosition;
61 mMarkPosition = mCurrentPosition;
62 mIncremental = false;
63 mUnicodeDecoder = nullptr;
64 mCharsetSource = kCharsetUninitialized;
67 /**
68 * Use this constructor if you want i/o to be based on strings
69 * the scanner receives. If you pass a null filename, you
70 * can still provide data to the scanner via append.
72 nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
73 : mFilename(aFilename) {
74 MOZ_COUNT_CTOR(nsScanner);
75 NS_ASSERTION(!aCreateStream, "This is always true.");
77 mSlidingBuffer = nullptr;
79 // XXX This is a big hack. We need to initialize the iterators to something.
80 // What matters is that mCurrentPosition == mEndPosition, so that our methods
81 // believe that we are at EOF (see bug 182067). We null out mCurrentPosition
82 // so that we have some hope of catching null pointer dereferences associated
83 // with this hack. --darin
84 memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
85 mMarkPosition = mCurrentPosition;
86 mEndPosition = mCurrentPosition;
88 mIncremental = true;
90 mUnicodeDecoder = nullptr;
91 mCharsetSource = kCharsetUninitialized;
92 // XML defaults to UTF-8 and about:blank is UTF-8, too.
93 SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
96 nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
97 int32_t aSource) {
98 if (aSource < mCharsetSource) // priority is lower than the current one
99 return NS_OK;
101 mCharsetSource = aSource;
102 nsCString charsetName;
103 aEncoding->Name(charsetName);
104 if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) {
105 return NS_OK; // no difference, don't change it
108 // different, need to change it
110 mCharset.Assign(charsetName);
112 mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
114 return NS_OK;
118 * default destructor
120 * @update gess 3/25/98
121 * @param
122 * @return
124 nsScanner::~nsScanner() {
125 delete mSlidingBuffer;
127 MOZ_COUNT_DTOR(nsScanner);
131 * Resets current offset position of input stream to marked position.
132 * This allows us to back up to this point if the need should arise,
133 * such as when tokenization gets interrupted.
134 * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
136 * @update gess 5/12/98
137 * @param
138 * @return
140 void nsScanner::RewindToMark(void) {
141 if (mSlidingBuffer) {
142 mCurrentPosition = mMarkPosition;
147 * Records current offset position in input stream. This allows us
148 * to back up to this point if the need should arise, such as when
149 * tokenization gets interrupted.
151 * @update gess 7/29/98
152 * @param
153 * @return
155 int32_t nsScanner::Mark() {
156 int32_t distance = 0;
157 if (mSlidingBuffer) {
158 nsScannerIterator oldStart;
159 mSlidingBuffer->BeginReading(oldStart);
161 distance = Distance(oldStart, mCurrentPosition);
163 mSlidingBuffer->DiscardPrefix(mCurrentPosition);
164 mSlidingBuffer->BeginReading(mCurrentPosition);
165 mMarkPosition = mCurrentPosition;
168 return distance;
172 * Insert data to our underlying input buffer as
173 * if it were read from an input stream.
175 * @update harishd 01/12/99
176 * @return error code
178 bool nsScanner::UngetReadable(const nsAString& aBuffer) {
179 if (!mSlidingBuffer) {
180 return false;
183 mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);
184 mSlidingBuffer->BeginReading(
185 mCurrentPosition); // Insertion invalidated our iterators
186 mSlidingBuffer->EndReading(mEndPosition);
188 return true;
192 * Append data to our underlying input buffer as
193 * if it were read from an input stream.
195 * @update gess4/3/98
196 * @return error code
198 nsresult nsScanner::Append(const nsAString& aBuffer) {
199 if (!AppendToBuffer(aBuffer)) return NS_ERROR_OUT_OF_MEMORY;
200 return NS_OK;
206 * @update gess 5/21/98
207 * @param
208 * @return
210 nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) {
211 nsresult res = NS_OK;
212 if (mUnicodeDecoder) {
213 CheckedInt<size_t> needed = mUnicodeDecoder->MaxUTF16BufferLength(aLen);
214 if (!needed.isValid()) {
215 return NS_ERROR_OUT_OF_MEMORY;
217 CheckedInt<uint32_t> allocLen(1); // null terminator due to legacy sadness
218 allocLen += needed.value();
219 if (!allocLen.isValid()) {
220 return NS_ERROR_OUT_OF_MEMORY;
222 nsScannerString::Buffer* buffer =
223 nsScannerString::AllocBuffer(allocLen.value());
224 NS_ENSURE_TRUE(buffer, NS_ERROR_OUT_OF_MEMORY);
225 char16_t* unichars = buffer->DataStart();
227 uint32_t result;
228 size_t read;
229 size_t written;
230 Tie(result, read, written) =
231 mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
232 AsBytes(Span(aBuffer, aLen)), Span(unichars, needed.value()),
233 false); // Retain bug about failure to handle EOF
234 MOZ_ASSERT(result != kOutputFull);
235 MOZ_ASSERT(read <= aLen);
236 MOZ_ASSERT(written <= needed.value());
237 if (result != kInputEmpty) {
238 // Since about:blank is empty, this line runs only for XML. Use a
239 // character that's illegal in XML instead of U+FFFD in order to make
240 // expat flag the error. There is no need to loop and convert more, since
241 // expat will stop here anyway.
242 unichars[written++] = 0xFFFF;
244 buffer->SetDataLength(written);
245 // Don't propagate return code of unicode decoder
246 // since it doesn't reflect on our success or failure
247 // - Ref. bug 87110
248 res = NS_OK;
249 if (!AppendToBuffer(buffer)) res = NS_ERROR_OUT_OF_MEMORY;
250 } else {
251 NS_WARNING("No decoder found.");
252 res = NS_ERROR_FAILURE;
255 return res;
259 * retrieve next char from scanners internal input stream
261 * @update gess 3/25/98
262 * @param
263 * @return error code reflecting read status
265 nsresult nsScanner::GetChar(char16_t& aChar) {
266 if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
267 aChar = 0;
268 return NS_ERROR_HTMLPARSER_EOF;
271 aChar = *mCurrentPosition++;
273 return NS_OK;
276 void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
277 const nsScannerIterator& aStart,
278 const nsScannerIterator& aEnd) {
279 aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
282 void nsScanner::CurrentPosition(nsScannerIterator& aPosition) {
283 aPosition = mCurrentPosition;
286 void nsScanner::EndReading(nsScannerIterator& aPosition) {
287 aPosition = mEndPosition;
290 void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) {
291 if (mSlidingBuffer) {
292 mCurrentPosition = aPosition;
293 if (aTerminate && (mCurrentPosition == mEndPosition)) {
294 mMarkPosition = mCurrentPosition;
295 mSlidingBuffer->DiscardPrefix(mCurrentPosition);
300 bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) {
301 if (!mSlidingBuffer) {
302 mSlidingBuffer = new nsScannerString(aBuf);
303 if (!mSlidingBuffer) return false;
304 mSlidingBuffer->BeginReading(mCurrentPosition);
305 mMarkPosition = mCurrentPosition;
306 mSlidingBuffer->EndReading(mEndPosition);
307 } else {
308 mSlidingBuffer->AppendBuffer(aBuf);
309 if (mCurrentPosition == mEndPosition) {
310 mSlidingBuffer->BeginReading(mCurrentPosition);
312 mSlidingBuffer->EndReading(mEndPosition);
315 return true;
319 * call this to copy bytes out of the scanner that have not yet been consumed
320 * by the tokenization process.
322 * @update gess 5/12/98
323 * @param aCopyBuffer is where the scanner buffer will be copied to
324 * @return true if OK or false on OOM
326 bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
327 if (!mSlidingBuffer) {
328 aCopyBuffer.Truncate();
329 return true;
332 nsScannerIterator start, end;
333 start = mCurrentPosition;
334 end = mEndPosition;
336 return CopyUnicodeTo(start, end, aCopyBuffer);
340 * Retrieve the name of the file that the scanner is reading from.
341 * In some cases, it's just a given name, because the scanner isn't
342 * really reading from a file.
344 * @update gess 5/12/98
345 * @return
347 nsString& nsScanner::GetFilename(void) { return mFilename; }
350 * Conduct self test. Actually, selftesting for this class
351 * occurs in the parser selftest.
353 * @update gess 3/25/98
354 * @param
355 * @return
358 void nsScanner::SelfTest(void) {
359 #ifdef _DEBUG
360 #endif