parser/htmlparser/nsScanner.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=2 sw=2 et tw=78: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 //#define __INCREMENTAL 1
   8
   9 #include "nsScanner.h"
  10
  11 #include "mozilla/Attributes.h"
  12 #include "mozilla/DebugOnly.h"
  13 #include "mozilla/Encoding.h"
  14 #include "nsDebug.h"
  15 #include "nsReadableUtils.h"
  16 #include "nsUTF8Utils.h"  // for LossyConvertEncoding
  17 #include "nsCRT.h"
  18 #include "nsParser.h"
  19 #include "nsCharsetSource.h"
  20
  21 nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars)
  22     : mChars(aTerminateChars),
  23       mFilter(char16_t(~0))  // All bits set
  24 {
  25   // Build filter that will be used to filter out characters with
  26   // bits that none of the terminal chars have. This works very well
  27   // because terminal chars often have only the last 4-6 bits set and
  28   // normal ascii letters have bit 7 set. Other letters have even higher
  29   // bits set.
  30
  31   // Calculate filter
  32   const char16_t* current = aTerminateChars;
  33   char16_t terminalChar = *current;
  34   while (terminalChar) {
  35     mFilter &= ~terminalChar;
  36     ++current;
  37     terminalChar = *current;
  38   }
  39 }
  40
  41 /**
  42  *  Use this constructor if you want i/o to be based on
  43  *  a single string you hand in during construction.
  44  *  This short cut was added for Javascript.
  45  *
  46  *  @update  gess 5/12/98
  47  *  @param   aMode represents the parser mode (nav, other)
  48  *  @return
  49  */
  50 nsScanner::nsScanner(const nsAString& anHTMLString) {
  51   MOZ_COUNT_CTOR(nsScanner);
  52
  53   mSlidingBuffer = nullptr;
  54   if (AppendToBuffer(anHTMLString)) {
  55     mSlidingBuffer->BeginReading(mCurrentPosition);
  56   } else {
  57     /* XXX see hack below, re: bug 182067 */
  58     memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
  59     mEndPosition = mCurrentPosition;
  60   }
  61   mMarkPosition = mCurrentPosition;
  62   mIncremental = false;
  63   mUnicodeDecoder = nullptr;
  64   mCharsetSource = kCharsetUninitialized;
  65 }
  66
  67 /**
  68  *  Use this constructor if you want i/o to be based on strings
  69  *  the scanner receives. If you pass a null filename, you
  70  *  can still provide data to the scanner via append.
  71  */
  72 nsScanner::nsScanner(nsString& aFilename, bool aCreateStream)
  73     : mFilename(aFilename) {
  74   MOZ_COUNT_CTOR(nsScanner);
  75   NS_ASSERTION(!aCreateStream, "This is always true.");
  76
  77   mSlidingBuffer = nullptr;
  78
  79   // XXX This is a big hack.  We need to initialize the iterators to something.
  80   // What matters is that mCurrentPosition == mEndPosition, so that our methods
  81   // believe that we are at EOF (see bug 182067).  We null out mCurrentPosition
  82   // so that we have some hope of catching null pointer dereferences associated
  83   // with this hack. --darin
  84   memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
  85   mMarkPosition = mCurrentPosition;
  86   mEndPosition = mCurrentPosition;
  87
  88   mIncremental = true;
  89
  90   mUnicodeDecoder = nullptr;
  91   mCharsetSource = kCharsetUninitialized;
  92   // XML defaults to UTF-8 and about:blank is UTF-8, too.
  93   SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
  94 }
  95
  96 nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
  97                                        int32_t aSource) {
  98   if (aSource < mCharsetSource)  // priority is lower than the current one
  99     return NS_OK;
 100
 101   mCharsetSource = aSource;
 102   nsCString charsetName;
 103   aEncoding->Name(charsetName);
 104   if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) {
 105     return NS_OK;  // no difference, don't change it
 106   }
 107
 108   // different, need to change it
 109
 110   mCharset.Assign(charsetName);
 111
 112   mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
 113
 114   return NS_OK;
 115 }
 116
 117 /**
 118  *  default destructor
 119  *
 120  *  @update  gess 3/25/98
 121  *  @param
 122  *  @return
 123  */
 124 nsScanner::~nsScanner() {
 125   delete mSlidingBuffer;
 126
 127   MOZ_COUNT_DTOR(nsScanner);
 128 }
 129
 130 /**
 131  *  Resets current offset position of input stream to marked position.
 132  *  This allows us to back up to this point if the need should arise,
 133  *  such as when tokenization gets interrupted.
 134  *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
 135  *
 136  *  @update  gess 5/12/98
 137  *  @param
 138  *  @return
 139  */
 140 void nsScanner::RewindToMark(void) {
 141   if (mSlidingBuffer) {
 142     mCurrentPosition = mMarkPosition;
 143   }
 144 }
 145
 146 /**
 147  *  Records current offset position in input stream. This allows us
 148  *  to back up to this point if the need should arise, such as when
 149  *  tokenization gets interrupted.
 150  *
 151  *  @update  gess 7/29/98
 152  *  @param
 153  *  @return
 154  */
 155 int32_t nsScanner::Mark() {
 156   int32_t distance = 0;
 157   if (mSlidingBuffer) {
 158     nsScannerIterator oldStart;
 159     mSlidingBuffer->BeginReading(oldStart);
 160
 161     distance = Distance(oldStart, mCurrentPosition);
 162
 163     mSlidingBuffer->DiscardPrefix(mCurrentPosition);
 164     mSlidingBuffer->BeginReading(mCurrentPosition);
 165     mMarkPosition = mCurrentPosition;
 166   }
 167
 168   return distance;
 169 }
 170
 171 /**
 172  * Insert data to our underlying input buffer as
 173  * if it were read from an input stream.
 174  *
 175  * @update  harishd 01/12/99
 176  * @return  error code
 177  */
 178 bool nsScanner::UngetReadable(const nsAString& aBuffer) {
 179   if (!mSlidingBuffer) {
 180     return false;
 181   }
 182
 183   mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);
 184   mSlidingBuffer->BeginReading(
 185       mCurrentPosition);  // Insertion invalidated our iterators
 186   mSlidingBuffer->EndReading(mEndPosition);
 187
 188   return true;
 189 }
 190
 191 /**
 192  * Append data to our underlying input buffer as
 193  * if it were read from an input stream.
 194  *
 195  * @update  gess4/3/98
 196  * @return  error code
 197  */
 198 nsresult nsScanner::Append(const nsAString& aBuffer) {
 199   if (!AppendToBuffer(aBuffer)) return NS_ERROR_OUT_OF_MEMORY;
 200   return NS_OK;
 201 }
 202
 203 /**
 204  *
 205  *
 206  *  @update  gess 5/21/98
 207  *  @param
 208  *  @return
 209  */
 210 nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) {
 211   nsresult res = NS_OK;
 212   if (mUnicodeDecoder) {
 213     CheckedInt<size_t> needed = mUnicodeDecoder->MaxUTF16BufferLength(aLen);
 214     if (!needed.isValid()) {
 215       return NS_ERROR_OUT_OF_MEMORY;
 216     }
 217     CheckedInt<uint32_t> allocLen(1);  // null terminator due to legacy sadness
 218     allocLen += needed.value();
 219     if (!allocLen.isValid()) {
 220       return NS_ERROR_OUT_OF_MEMORY;
 221     }
 222     nsScannerString::Buffer* buffer =
 223         nsScannerString::AllocBuffer(allocLen.value());
 224     NS_ENSURE_TRUE(buffer, NS_ERROR_OUT_OF_MEMORY);
 225     char16_t* unichars = buffer->DataStart();
 226
 227     uint32_t result;
 228     size_t read;
 229     size_t written;
 230     Tie(result, read, written) =
 231         mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
 232             AsBytes(Span(aBuffer, aLen)), Span(unichars, needed.value()),
 233             false);  // Retain bug about failure to handle EOF
 234     MOZ_ASSERT(result != kOutputFull);
 235     MOZ_ASSERT(read <= aLen);
 236     MOZ_ASSERT(written <= needed.value());
 237     if (result != kInputEmpty) {
 238       // Since about:blank is empty, this line runs only for XML. Use a
 239       // character that's illegal in XML instead of U+FFFD in order to make
 240       // expat flag the error. There is no need to loop and convert more, since
 241       // expat will stop here anyway.
 242       unichars[written++] = 0xFFFF;
 243     }
 244     buffer->SetDataLength(written);
 245     // Don't propagate return code of unicode decoder
 246     // since it doesn't reflect on our success or failure
 247     // - Ref. bug 87110
 248     res = NS_OK;
 249     if (!AppendToBuffer(buffer)) res = NS_ERROR_OUT_OF_MEMORY;
 250   } else {
 251     NS_WARNING("No decoder found.");
 252     res = NS_ERROR_FAILURE;
 253   }
 254
 255   return res;
 256 }
 257
 258 /**
 259  *  retrieve next char from scanners internal input stream
 260  *
 261  *  @update  gess 3/25/98
 262  *  @param
 263  *  @return  error code reflecting read status
 264  */
 265 nsresult nsScanner::GetChar(char16_t& aChar) {
 266   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
 267     aChar = 0;
 268     return NS_ERROR_HTMLPARSER_EOF;
 269   }
 270
 271   aChar = *mCurrentPosition++;
 272
 273   return NS_OK;
 274 }
 275
 276 void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
 277                               const nsScannerIterator& aStart,
 278                               const nsScannerIterator& aEnd) {
 279   aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
 280 }
 281
 282 void nsScanner::CurrentPosition(nsScannerIterator& aPosition) {
 283   aPosition = mCurrentPosition;
 284 }
 285
 286 void nsScanner::EndReading(nsScannerIterator& aPosition) {
 287   aPosition = mEndPosition;
 288 }
 289
 290 void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) {
 291   if (mSlidingBuffer) {
 292     mCurrentPosition = aPosition;
 293     if (aTerminate && (mCurrentPosition == mEndPosition)) {
 294       mMarkPosition = mCurrentPosition;
 295       mSlidingBuffer->DiscardPrefix(mCurrentPosition);
 296     }
 297   }
 298 }
 299
 300 bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) {
 301   if (!mSlidingBuffer) {
 302     mSlidingBuffer = new nsScannerString(aBuf);
 303     if (!mSlidingBuffer) return false;
 304     mSlidingBuffer->BeginReading(mCurrentPosition);
 305     mMarkPosition = mCurrentPosition;
 306     mSlidingBuffer->EndReading(mEndPosition);
 307   } else {
 308     mSlidingBuffer->AppendBuffer(aBuf);
 309     if (mCurrentPosition == mEndPosition) {
 310       mSlidingBuffer->BeginReading(mCurrentPosition);
 311     }
 312     mSlidingBuffer->EndReading(mEndPosition);
 313   }
 314
 315   return true;
 316 }
 317
 318 /**
 319  *  call this to copy bytes out of the scanner that have not yet been consumed
 320  *  by the tokenization process.
 321  *
 322  *  @update  gess 5/12/98
 323  *  @param   aCopyBuffer is where the scanner buffer will be copied to
 324  *  @return  true if OK or false on OOM
 325  */
 326 bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
 327   if (!mSlidingBuffer) {
 328     aCopyBuffer.Truncate();
 329     return true;
 330   }
 331
 332   nsScannerIterator start, end;
 333   start = mCurrentPosition;
 334   end = mEndPosition;
 335
 336   return CopyUnicodeTo(start, end, aCopyBuffer);
 337 }
 338
 339 /**
 340  *  Retrieve the name of the file that the scanner is reading from.
 341  *  In some cases, it's just a given name, because the scanner isn't
 342  *  really reading from a file.
 343  *
 344  *  @update  gess 5/12/98
 345  *  @return
 346  */
 347 nsString& nsScanner::GetFilename(void) { return mFilename; }
 348
 349 /**
 350  *  Conduct self test. Actually, selftesting for this class
 351  *  occurs in the parser selftest.
 352  *
 353  *  @update  gess 3/25/98
 354  *  @param
 355  *  @return
 356  */
 357
 358 void nsScanner::SelfTest(void) {
 359 #ifdef _DEBUG
 360 #endif
 361 }