parser/htmlparser/nsScanner.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=2 sw=2 et tw=78: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 // #define __INCREMENTAL 1
   8
   9 #include "nsScanner.h"
  10
  11 #include "mozilla/Attributes.h"
  12 #include "mozilla/DebugOnly.h"
  13 #include "mozilla/Encoding.h"
  14 #include "mozilla/UniquePtr.h"
  15 #include "nsDebug.h"
  16 #include "nsReadableUtils.h"
  17 #include "nsUTF8Utils.h"  // for LossyConvertEncoding
  18 #include "nsCRT.h"
  19 #include "nsParser.h"
  20 #include "nsCharsetSource.h"
  21
  22 nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars)
  23     : mChars(aTerminateChars),
  24       mFilter(char16_t(~0))  // All bits set
  25 {
  26   // Build filter that will be used to filter out characters with
  27   // bits that none of the terminal chars have. This works very well
  28   // because terminal chars often have only the last 4-6 bits set and
  29   // normal ascii letters have bit 7 set. Other letters have even higher
  30   // bits set.
  31
  32   // Calculate filter
  33   const char16_t* current = aTerminateChars;
  34   char16_t terminalChar = *current;
  35   while (terminalChar) {
  36     mFilter &= ~terminalChar;
  37     ++current;
  38     terminalChar = *current;
  39   }
  40 }
  41
  42 /**
  43  *  Use this constructor if you want i/o to be based on
  44  *  a single string you hand in during construction.
  45  *  This short cut was added for Javascript.
  46  *
  47  *  @update  gess 5/12/98
  48  *  @param   aMode represents the parser mode (nav, other)
  49  *  @return
  50  */
  51 nsScanner::nsScanner(const nsAString& anHTMLString, bool aIncremental)
  52     : mIncremental(aIncremental) {
  53   MOZ_COUNT_CTOR(nsScanner);
  54
  55   AppendToBuffer(anHTMLString);
  56   MOZ_ASSERT(mMarkPosition == mCurrentPosition);
  57 }
  58
  59 /**
  60  *  Use this constructor if you want i/o to be based on strings
  61  *  the scanner receives. If you pass a null filename, you
  62  *  can still provide data to the scanner via append.
  63  */
  64 nsScanner::nsScanner(nsIURI* aURI) : mURI(aURI), mIncremental(true) {
  65   MOZ_COUNT_CTOR(nsScanner);
  66
  67   // XXX This is a big hack.  We need to initialize the iterators to something.
  68   // What matters is that mCurrentPosition == mEndPosition, so that our methods
  69   // believe that we are at EOF (see bug 182067).  We null out mCurrentPosition
  70   // so that we have some hope of catching null pointer dereferences associated
  71   // with this hack. --darin
  72   memset(&mCurrentPosition, 0, sizeof(mCurrentPosition));
  73   mMarkPosition = mCurrentPosition;
  74   mEndPosition = mCurrentPosition;
  75
  76   // XML defaults to UTF-8 and about:blank is UTF-8, too.
  77   SetDocumentCharset(UTF_8_ENCODING, kCharsetFromDocTypeDefault);
  78 }
  79
  80 nsresult nsScanner::SetDocumentCharset(NotNull<const Encoding*> aEncoding,
  81                                        int32_t aSource) {
  82   if (aSource < mCharsetSource)  // priority is lower than the current one
  83     return NS_OK;
  84
  85   mCharsetSource = aSource;
  86   nsCString charsetName;
  87   aEncoding->Name(charsetName);
  88   if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) {
  89     return NS_OK;  // no difference, don't change it
  90   }
  91
  92   // different, need to change it
  93
  94   mCharset.Assign(charsetName);
  95
  96   mUnicodeDecoder = aEncoding->NewDecoderWithBOMRemoval();
  97
  98   return NS_OK;
  99 }
 100
 101 /**
 102  *  default destructor
 103  *
 104  *  @update  gess 3/25/98
 105  *  @param
 106  *  @return
 107  */
 108 nsScanner::~nsScanner() { MOZ_COUNT_DTOR(nsScanner); }
 109
 110 /**
 111  *  Resets current offset position of input stream to marked position.
 112  *  This allows us to back up to this point if the need should arise,
 113  *  such as when tokenization gets interrupted.
 114  *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
 115  *
 116  *  @update  gess 5/12/98
 117  *  @param
 118  *  @return
 119  */
 120 void nsScanner::RewindToMark(void) {
 121   if (mSlidingBuffer) {
 122     mCurrentPosition = mMarkPosition;
 123   }
 124 }
 125
 126 /**
 127  *  Records current offset position in input stream. This allows us
 128  *  to back up to this point if the need should arise, such as when
 129  *  tokenization gets interrupted.
 130  *
 131  *  @update  gess 7/29/98
 132  *  @param
 133  *  @return
 134  */
 135 int32_t nsScanner::Mark() {
 136   int32_t distance = 0;
 137   if (mSlidingBuffer) {
 138     nsScannerIterator oldStart;
 139     mSlidingBuffer->BeginReading(oldStart);
 140
 141     distance = Distance(oldStart, mCurrentPosition);
 142
 143     mSlidingBuffer->DiscardPrefix(mCurrentPosition);
 144     mSlidingBuffer->BeginReading(mCurrentPosition);
 145     mMarkPosition = mCurrentPosition;
 146   }
 147
 148   return distance;
 149 }
 150
 151 /**
 152  * Insert data to our underlying input buffer as
 153  * if it were read from an input stream.
 154  *
 155  * @update  harishd 01/12/99
 156  * @return  error code
 157  */
 158 bool nsScanner::UngetReadable(const nsAString& aBuffer) {
 159   if (!mSlidingBuffer) {
 160     return false;
 161   }
 162
 163   mSlidingBuffer->UngetReadable(aBuffer, mCurrentPosition);
 164   mSlidingBuffer->BeginReading(
 165       mCurrentPosition);  // Insertion invalidated our iterators
 166   mSlidingBuffer->EndReading(mEndPosition);
 167
 168   return true;
 169 }
 170
 171 /**
 172  * Append data to our underlying input buffer as
 173  * if it were read from an input stream.
 174  *
 175  * @update  gess4/3/98
 176  * @return  error code
 177  */
 178 nsresult nsScanner::Append(const nsAString& aBuffer) {
 179   if (!AppendToBuffer(aBuffer)) return NS_ERROR_OUT_OF_MEMORY;
 180   return NS_OK;
 181 }
 182
 183 /**
 184  *
 185  *
 186  *  @update  gess 5/21/98
 187  *  @param
 188  *  @return
 189  */
 190 nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) {
 191   nsresult res = NS_OK;
 192   if (mUnicodeDecoder) {
 193     mozilla::CheckedInt<size_t> needed =
 194         mUnicodeDecoder->MaxUTF16BufferLength(aLen);
 195     if (!needed.isValid()) {
 196       return NS_ERROR_OUT_OF_MEMORY;
 197     }
 198     mozilla::CheckedInt<uint32_t> allocLen(
 199         1);  // null terminator due to legacy sadness
 200     allocLen += needed.value();
 201     if (!allocLen.isValid()) {
 202       return NS_ERROR_OUT_OF_MEMORY;
 203     }
 204     nsScannerString::Buffer* buffer =
 205         nsScannerString::AllocBuffer(allocLen.value());
 206     NS_ENSURE_TRUE(buffer, NS_ERROR_OUT_OF_MEMORY);
 207     char16_t* unichars = buffer->DataStart();
 208
 209     uint32_t result;
 210     size_t read;
 211     size_t written;
 212     // Do not use structured binding lest deal with [-Werror=unused-variable]
 213     std::tie(result, read, written) =
 214         mUnicodeDecoder->DecodeToUTF16WithoutReplacement(
 215             AsBytes(mozilla::Span(aBuffer, aLen)),
 216             mozilla::Span(unichars, needed.value()),
 217             false);  // Retain bug about failure to handle EOF
 218     MOZ_ASSERT(result != mozilla::kOutputFull);
 219     MOZ_ASSERT(read <= aLen);
 220     MOZ_ASSERT(written <= needed.value());
 221     if (result != mozilla::kInputEmpty) {
 222       // Since about:blank is empty, this line runs only for XML. Use a
 223       // character that's illegal in XML instead of U+FFFD in order to make
 224       // expat flag the error. There is no need to loop and convert more, since
 225       // expat will stop here anyway.
 226       unichars[written++] = 0xFFFF;
 227     }
 228     buffer->SetDataLength(written);
 229     // Don't propagate return code of unicode decoder
 230     // since it doesn't reflect on our success or failure
 231     // - Ref. bug 87110
 232     res = NS_OK;
 233     AppendToBuffer(buffer);
 234   } else {
 235     NS_WARNING("No decoder found.");
 236     res = NS_ERROR_FAILURE;
 237   }
 238
 239   return res;
 240 }
 241
 242 /**
 243  *  retrieve next char from scanners internal input stream
 244  *
 245  *  @update  gess 3/25/98
 246  *  @param
 247  *  @return  error code reflecting read status
 248  */
 249 nsresult nsScanner::GetChar(char16_t& aChar) {
 250   if (!mSlidingBuffer || mCurrentPosition == mEndPosition) {
 251     aChar = 0;
 252     return NS_ERROR_HTMLPARSER_EOF;
 253   }
 254
 255   aChar = *mCurrentPosition++;
 256
 257   return NS_OK;
 258 }
 259
 260 void nsScanner::BindSubstring(nsScannerSubstring& aSubstring,
 261                               const nsScannerIterator& aStart,
 262                               const nsScannerIterator& aEnd) {
 263   aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd);
 264 }
 265
 266 void nsScanner::CurrentPosition(nsScannerIterator& aPosition) {
 267   aPosition = mCurrentPosition;
 268 }
 269
 270 void nsScanner::EndReading(nsScannerIterator& aPosition) {
 271   aPosition = mEndPosition;
 272 }
 273
 274 void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) {
 275   if (mSlidingBuffer) {
 276     mCurrentPosition = aPosition;
 277     if (aTerminate && (mCurrentPosition == mEndPosition)) {
 278       mMarkPosition = mCurrentPosition;
 279       mSlidingBuffer->DiscardPrefix(mCurrentPosition);
 280     }
 281   }
 282 }
 283
 284 void nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) {
 285   if (!mSlidingBuffer) {
 286     mSlidingBuffer = mozilla::MakeUnique<nsScannerString>(aBuf);
 287     mSlidingBuffer->BeginReading(mCurrentPosition);
 288     mMarkPosition = mCurrentPosition;
 289   } else {
 290     mSlidingBuffer->AppendBuffer(aBuf);
 291     if (mCurrentPosition == mEndPosition) {
 292       mSlidingBuffer->BeginReading(mCurrentPosition);
 293     }
 294   }
 295   mSlidingBuffer->EndReading(mEndPosition);
 296 }
 297
 298 /**
 299  *  call this to copy bytes out of the scanner that have not yet been consumed
 300  *  by the tokenization process.
 301  *
 302  *  @update  gess 5/12/98
 303  *  @param   aCopyBuffer is where the scanner buffer will be copied to
 304  *  @return  true if OK or false on OOM
 305  */
 306 bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) {
 307   if (!mSlidingBuffer) {
 308     aCopyBuffer.Truncate();
 309     return true;
 310   }
 311
 312   nsScannerIterator start, end;
 313   start = mCurrentPosition;
 314   end = mEndPosition;
 315
 316   return CopyUnicodeTo(start, end, aCopyBuffer);
 317 }
 318
 319 /**
 320  *  Conduct self test. Actually, selftesting for this class
 321  *  occurs in the parser selftest.
 322  *
 323  *  @update  gess 3/25/98
 324  *  @param
 325  *  @return
 326  */
 327
 328 void nsScanner::SelfTest(void) {
 329 #ifdef _DEBUG
 330 #endif
 331 }