parser/htmlparser/nsScanner.h

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5
   6
   7 /**
   8  * MODULE NOTES:
   9  * @update  gess 4/1/98
  10  *
  11  * The scanner is a low-level service class that knows
  12  * how to consume characters out of an (internal) stream.
  13  * This class also offers a series of utility methods
  14  * that most tokenizers want, such as readUntil()
  15  * and SkipWhitespace().
  16  */
  17
  18
  19 #ifndef SCANNER
  20 #define SCANNER
  21
  22 #include "nsCOMPtr.h"
  23 #include "nsString.h"
  24 #include "nsIParser.h"
  25 #include "nsIUnicodeDecoder.h"
  26 #include "nsScannerString.h"
  27
  28 class nsParser;
  29
  30 class nsReadEndCondition {
  31 public:
  32   const char16_t *mChars;
  33   char16_t mFilter;
  34   explicit nsReadEndCondition(const char16_t* aTerminateChars);
  35 private:
  36   nsReadEndCondition(const nsReadEndCondition& aOther); // No copying
  37   void operator=(const nsReadEndCondition& aOther); // No assigning
  38 };
  39
  40 class nsScanner {
  41   public:
  42
  43       /**
  44        *  Use this constructor for the XML fragment parsing case
  45        */
  46       explicit nsScanner(const nsAString& anHTMLString);
  47
  48       /**
  49        *  Use this constructor if you want i/o to be based on
  50        *  a file (therefore a stream) or just data you provide via Append().
  51        */
  52       nsScanner(nsString& aFilename, bool aCreateStream);
  53
  54       ~nsScanner();
  55
  56       /**
  57        *  retrieve next char from internal input stream
  58        *
  59        *  @update  gess 3/25/98
  60        *  @param   ch is the char to accept new value
  61        *  @return  error code reflecting read status
  62        */
  63       nsresult GetChar(char16_t& ch);
  64
  65       /**
  66        *  peek ahead to consume next char from scanner's internal
  67        *  input buffer
  68        *
  69        *  @update  gess 3/25/98
  70        *  @param   ch is the char to accept new value
  71        *  @return  error code reflecting read status
  72        */
  73       nsresult Peek(char16_t& ch, uint32_t aOffset=0);
  74
  75       nsresult Peek(nsAString& aStr, int32_t aNumChars, int32_t aOffset = 0);
  76
  77       /**
  78        *  Skip over chars as long as they equal given char
  79        *
  80        *  @update  gess 3/25/98
  81        *  @param   char to be skipped
  82        *  @return  error code
  83        */
  84       nsresult SkipOver(char16_t aSkipChar);
  85
  86       /**
  87        *  Skip whitespace on scanner input stream
  88        *
  89        *  @update  gess 3/25/98
  90        *  @return  error status
  91        */
  92       nsresult SkipWhitespace(int32_t& aNewlinesSkipped);
  93
  94       /**
  95        *  Consume characters until you run into space, a '<', a '>', or a '/'.
  96        *
  97        *  @param   aString - receives new data from stream
  98        *  @return  error code
  99        */
 100       nsresult ReadTagIdentifier(nsScannerSharedSubstring& aString);
 101
 102       /**
 103        *  Consume characters until you run into a char that's not valid in an
 104        *  entity name
 105        *
 106        *  @param   aString - receives new data from stream
 107        *  @return  error code
 108        */
 109       nsresult ReadEntityIdentifier(nsString& aString);
 110       nsresult ReadNumber(nsString& aString,int32_t aBase);
 111       nsresult ReadWhitespace(nsScannerSharedSubstring& aString,
 112                               int32_t& aNewlinesSkipped,
 113                               bool& aHaveCR);
 114       nsresult ReadWhitespace(nsScannerIterator& aStart,
 115                               nsScannerIterator& aEnd,
 116                               int32_t& aNewlinesSkipped);
 117
 118       /**
 119        *  Consume characters until you find the terminal char
 120        *
 121        *  @update  gess 3/25/98
 122        *  @param   aString receives new data from stream
 123        *  @param   aTerminal contains terminating char
 124        *  @param   addTerminal tells us whether to append terminal to aString
 125        *  @return  error code
 126        */
 127       nsresult ReadUntil(nsAString& aString,
 128                          char16_t aTerminal,
 129                          bool addTerminal);
 130
 131       /**
 132        *  Consume characters until you find one contained in given
 133        *  terminal set.
 134        *
 135        *  @update  gess 3/25/98
 136        *  @param   aString receives new data from stream
 137        *  @param   aTermSet contains set of terminating chars
 138        *  @param   addTerminal tells us whether to append terminal to aString
 139        *  @return  error code
 140        */
 141       nsresult ReadUntil(nsAString& aString,
 142                          const nsReadEndCondition& aEndCondition,
 143                          bool addTerminal);
 144
 145       nsresult ReadUntil(nsScannerSharedSubstring& aString,
 146                          const nsReadEndCondition& aEndCondition,
 147                          bool addTerminal);
 148
 149       nsresult ReadUntil(nsScannerIterator& aStart,
 150                          nsScannerIterator& aEnd,
 151                          const nsReadEndCondition& aEndCondition,
 152                          bool addTerminal);
 153
 154       /**
 155        *  Records current offset position in input stream. This allows us
 156        *  to back up to this point if the need should arise, such as when
 157        *  tokenization gets interrupted.
 158        *
 159        *  @update  gess 5/12/98
 160        *  @param
 161        *  @return
 162        */
 163       int32_t Mark(void);
 164
 165       /**
 166        *  Resets current offset position of input stream to marked position.
 167        *  This allows us to back up to this point if the need should arise,
 168        *  such as when tokenization gets interrupted.
 169        *  NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST!
 170        *
 171        *  @update  gess 5/12/98
 172        *  @param
 173        *  @return
 174        */
 175       void RewindToMark(void);
 176
 177
 178       /**
 179        *
 180        *
 181        *  @update  harishd 01/12/99
 182        *  @param
 183        *  @return
 184        */
 185       bool UngetReadable(const nsAString& aBuffer);
 186
 187       /**
 188        *
 189        *
 190        *  @update  gess 5/13/98
 191        *  @param
 192        *  @return
 193        */
 194       nsresult Append(const nsAString& aBuffer);
 195
 196       /**
 197        *
 198        *
 199        *  @update  gess 5/21/98
 200        *  @param
 201        *  @return
 202        */
 203       nsresult Append(const char* aBuffer, uint32_t aLen,
 204                       nsIRequest *aRequest);
 205
 206       /**
 207        *  Call this to copy bytes out of the scanner that have not yet been consumed
 208        *  by the tokenization process.
 209        *
 210        *  @update  gess 5/12/98
 211        *  @param   aCopyBuffer is where the scanner buffer will be copied to
 212        *  @return  nada
 213        */
 214       void CopyUnusedData(nsString& aCopyBuffer);
 215
 216       /**
 217        *  Retrieve the name of the file that the scanner is reading from.
 218        *  In some cases, it's just a given name, because the scanner isn't
 219        *  really reading from a file.
 220        *
 221        *  @update  gess 5/12/98
 222        *  @return
 223        */
 224       nsString& GetFilename(void);
 225
 226       static void SelfTest();
 227
 228       /**
 229        *  Use this setter to change the scanner's unicode decoder
 230        *
 231        *  @update  ftang 3/02/99
 232        *  @param   aCharset a normalized (alias resolved) charset name
 233        *  @param   aCharsetSource- where the charset info came from
 234        *  @return
 235        */
 236       nsresult SetDocumentCharset(const nsACString& aCharset, int32_t aSource);
 237
 238       void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd);
 239       void CurrentPosition(nsScannerIterator& aPosition);
 240       void EndReading(nsScannerIterator& aPosition);
 241       void SetPosition(nsScannerIterator& aPosition,
 242                        bool aTruncate = false,
 243                        bool aReverse = false);
 244       void ReplaceCharacter(nsScannerIterator& aPosition,
 245                             char16_t aChar);
 246
 247       /**
 248        * Internal method used to cause the internal buffer to
 249        * be filled with data.
 250        *
 251        * @update  gess4/3/98
 252        */
 253       bool      IsIncremental(void) {return mIncremental;}
 254       void      SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;}
 255
 256       /**
 257        * Return the position of the first non-whitespace
 258        * character. This is only reliable before consumers start
 259        * reading from this scanner.
 260        */
 261       int32_t FirstNonWhitespacePosition()
 262       {
 263         return mFirstNonWhitespacePosition;
 264       }
 265
 266       /**
 267        * Override replacement character used by nsIUnicodeDecoder.
 268        * Default behavior is that it uses nsIUnicodeDecoder's mapping.
 269        *
 270        * @param aReplacementCharacter the replacement character
 271        *        XML (expat) parser uses 0xffff
 272        */
 273       void OverrideReplacementCharacter(char16_t aReplacementCharacter);
 274
 275   protected:
 276
 277       bool AppendToBuffer(nsScannerString::Buffer *, nsIRequest *aRequest, int32_t aErrorPos = -1);
 278       bool AppendToBuffer(const nsAString& aStr)
 279       {
 280         nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr);
 281         if (!buf)
 282           return false;
 283         AppendToBuffer(buf, nullptr);
 284         return true;
 285       }
 286
 287       nsScannerString*             mSlidingBuffer;
 288       nsScannerIterator            mCurrentPosition; // The position we will next read from in the scanner buffer
 289       nsScannerIterator            mMarkPosition;    // The position last marked (we may rewind to here)
 290       nsScannerIterator            mEndPosition;     // The current end of the scanner buffer
 291       nsScannerIterator            mFirstInvalidPosition; // The position of the first invalid character that was detected
 292       nsString        mFilename;
 293       uint32_t        mCountRemaining; // The number of bytes still to be read
 294                                        // from the scanner buffer
 295       bool            mIncremental;
 296       bool            mHasInvalidCharacter;
 297       char16_t       mReplacementCharacter;
 298       int32_t         mFirstNonWhitespacePosition;
 299       int32_t         mCharsetSource;
 300       nsCString       mCharset;
 301       nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
 302
 303   private:
 304       nsScanner &operator =(const nsScanner &); // Not implemented.
 305 };
 306
 307 #endif
 308
 309