Bug 1866777 - Disable test_race_cache_with_network.js on windows opt for frequent...
[gecko.git] / xpcom / ds / Tokenizer.h
blob713b63f2696071765e0502538788036d64633715
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef Tokenizer_h__
8 #define Tokenizer_h__
10 #include <type_traits>
12 #include "nsString.h"
13 #include "mozilla/CheckedInt.h"
14 #include "mozilla/ScopeExit.h"
15 #include "mozilla/UniquePtr.h"
16 #include "nsTArray.h"
18 namespace mozilla {
20 template <typename TChar>
21 class TokenizerBase {
22 public:
23 typedef nsTSubstring<TChar> TAString;
24 typedef nsTString<TChar> TString;
25 typedef nsTDependentString<TChar> TDependentString;
26 typedef nsTDependentSubstring<TChar> TDependentSubstring;
28 static TChar const sWhitespaces[];
30 /**
31 * The analyzer works with elements in the input cut into a sequence of tokens
32 * where each token has an elementary type.
34 enum TokenType : uint32_t {
35 TOKEN_UNKNOWN,
36 TOKEN_RAW,
37 TOKEN_ERROR,
38 TOKEN_INTEGER,
39 TOKEN_WORD,
40 TOKEN_CHAR,
41 TOKEN_WS,
42 TOKEN_EOL,
43 TOKEN_EOF,
44 TOKEN_CUSTOM0 = 1000
47 enum ECaseSensitivity { CASE_SENSITIVE, CASE_INSENSITIVE };
49 /**
50 * Class holding the type and the value of a token. It can be manually
51 * created to allow checks against it via methods of TTokenizer, or it can be
52 * the result of some of the TTokenizer's methods.
54 class Token {
55 TokenType mType;
56 TDependentSubstring mWord;
57 TString mCustom;
58 TChar mChar;
59 uint64_t mInteger;
60 ECaseSensitivity mCustomCaseInsensitivity;
61 bool mCustomEnabled;
63 // If this token is a result of the parsing process, this member is
64 // referencing a sub-string in the input buffer. If this is externally
65 // created Token this member is left an empty string.
66 TDependentSubstring mFragment;
68 friend class TokenizerBase<TChar>;
69 void AssignFragment(typename TAString::const_char_iterator begin,
70 typename TAString::const_char_iterator end);
72 static Token Raw();
74 public:
75 Token();
76 Token(const Token& aOther);
77 Token& operator=(const Token& aOther);
79 // Static constructors of tokens by type and value
80 static Token Word(TAString const& aWord);
81 static Token Char(TChar const aChar);
82 static Token Number(uint64_t const aNumber);
83 static Token Whitespace();
84 static Token NewLine();
85 static Token EndOfFile();
86 static Token Error();
88 // Compares the two tokens, type must be identical and value
89 // of one of the tokens must be 'any' or equal.
90 bool Equals(const Token& aOther) const;
92 TokenType Type() const { return mType; }
93 TChar AsChar() const;
94 TDependentSubstring AsString() const;
95 uint64_t AsInteger() const;
97 TDependentSubstring Fragment() const { return mFragment; }
101 * Consumers may register a custom string that, when found in the input, is
102 * considered a token and returned by Next*() and accepted by Check*()
103 * methods. AddCustomToken() returns a reference to a token that can then be
104 * compared using Token::Equals() against the output from Next*() or be passed
105 * to Check*().
107 Token AddCustomToken(const TAString& aValue,
108 ECaseSensitivity aCaseInsensitivity,
109 bool aEnabled = true);
110 template <uint32_t N>
111 Token AddCustomToken(const TChar (&aValue)[N],
112 ECaseSensitivity aCaseInsensitivity,
113 bool aEnabled = true) {
114 return AddCustomToken(TDependentSubstring(aValue, N - 1),
115 aCaseInsensitivity, aEnabled);
117 void RemoveCustomToken(Token& aToken);
119 * Only applies to a custom type of a Token (see AddCustomToken above.)
120 * This turns on and off token recognition. When a custom token is disabled,
121 * it's ignored as if it had never been added as a custom token.
123 void EnableCustomToken(Token const& aToken, bool aEnable);
126 * Mode of tokenization.
127 * FULL tokenization, the default, recognizes built-in tokens and any custom
128 * tokens, if added. CUSTOM_ONLY will only recognize custom tokens, the rest
129 * is seen as 'raw'. This mode can be understood as a 'binary' mode.
131 enum class Mode { FULL, CUSTOM_ONLY };
132 void SetTokenizingMode(Mode aMode);
135 * Return true iff the last Check*() call has returned false or when we've
136 * read past the end of the input string.
138 [[nodiscard]] bool HasFailed() const;
140 protected:
141 explicit TokenizerBase(const TChar* aWhitespaces = nullptr,
142 const TChar* aAdditionalWordChars = nullptr);
144 // false if we have already read the EOF token.
145 bool HasInput() const;
146 // Main parsing function, it doesn't shift the read cursor, just returns the
147 // next token position.
148 typename TAString::const_char_iterator Parse(Token& aToken) const;
149 // Is read cursor at the end?
150 bool IsEnd(const typename TAString::const_char_iterator& caret) const;
151 // True, when we are at the end of the input data, but it has not been marked
152 // as complete yet. In that case we cannot proceed with providing a
153 // multi-TChar token.
154 bool IsPending(const typename TAString::const_char_iterator& caret) const;
155 // Is read cursor on a character that is a word start?
156 bool IsWordFirst(const TChar aInput) const;
157 // Is read cursor on a character that is an in-word letter?
158 bool IsWord(const TChar aInput) const;
159 // Is read cursor on a character that is a valid number?
160 // TODO - support multiple radix
161 bool IsNumber(const TChar aInput) const;
162 // Is equal to the given custom token?
163 bool IsCustom(const typename TAString::const_char_iterator& caret,
164 const Token& aCustomToken, uint32_t* aLongest = nullptr) const;
166 // Friendly helper to assign a fragment on a Token
167 static void AssignFragment(Token& aToken,
168 typename TAString::const_char_iterator begin,
169 typename TAString::const_char_iterator end);
171 #ifdef DEBUG
172 // This is called from inside Tokenizer methods to make sure the token is
173 // valid.
174 void Validate(Token const& aToken);
175 #endif
177 // true iff we have already read the EOF token
178 bool mPastEof;
179 // true iff the last Check*() call has returned false, reverts to false on
180 // Rollback() call
181 bool mHasFailed;
182 // true if the input string is final (finished), false when we expect more
183 // data yet to be fed to the tokenizer (see IncrementalTokenizer derived
184 // class).
185 bool mInputFinished;
186 // custom only vs full tokenizing mode, see the Parse() method
187 Mode mMode;
188 // minimal raw data chunked delivery during incremental feed
189 uint32_t mMinRawDelivery;
191 // Customizable list of whitespaces
192 const TChar* mWhitespaces;
193 // Additional custom word characters
194 const TChar* mAdditionalWordChars;
196 // All these point to the original buffer passed to the constructor or to the
197 // incremental buffer after FeedInput.
198 typename TAString::const_char_iterator
199 mCursor; // Position of the current (actually next to read) token start
200 typename TAString::const_char_iterator mEnd; // End of the input position
202 // This is the list of tokens user has registered with AddCustomToken()
203 nsTArray<UniquePtr<Token>> mCustomTokens;
204 uint32_t mNextCustomTokenID;
206 private:
207 TokenizerBase() = delete;
208 TokenizerBase(const TokenizerBase&) = delete;
209 TokenizerBase(TokenizerBase&&) = delete;
210 TokenizerBase(const TokenizerBase&&) = delete;
211 TokenizerBase& operator=(const TokenizerBase&) = delete;
215 * This is a simple implementation of a lexical analyzer or maybe better
216 * called a tokenizer.
218 * Please use Tokenizer or Tokenizer16 classes, that are specializations
219 * of this template class. Tokenizer is for ASCII input, Tokenizer16 may
220 * handle char16_t input, but doesn't recognize whitespaces or numbers
221 * other than standard `char` specialized Tokenizer class.
223 template <typename TChar>
224 class TTokenizer : public TokenizerBase<TChar> {
225 public:
226 typedef TokenizerBase<TChar> base;
229 * @param aSource
230 * The string to parse.
231 * IMPORTANT NOTE: TTokenizer doesn't ensure the input string buffer
232 * lifetime. It's up to the consumer to make sure the string's buffer outlives
233 * the TTokenizer!
234 * @param aWhitespaces
235 * If non-null TTokenizer will use this custom set of whitespaces for
236 * CheckWhite() and SkipWhites() calls. By default the list consists of space
237 * and tab.
238 * @param aAdditionalWordChars
239 * If non-null it will be added to the list of characters that constitute a
240 * word. This is useful when you want to accept e.g. '-' in HTTP headers. By
241 * default a word character is considered any character for which upper case
242 * is different from lower case.
244 * If there is an overlap between aWhitespaces and aAdditionalWordChars, the
245 * check for word characters is made first.
247 explicit TTokenizer(const typename base::TAString& aSource,
248 const TChar* aWhitespaces = nullptr,
249 const TChar* aAdditionalWordChars = nullptr);
250 explicit TTokenizer(const TChar* aSource, const TChar* aWhitespaces = nullptr,
251 const TChar* aAdditionalWordChars = nullptr);
254 * When there is still anything to read from the input, tokenize it, store the
255 * token type and value to aToken result and shift the cursor past this just
256 * parsed token. Each call to Next() reads another token from the input and
257 * shifts the cursor. Returns false if we have passed the end of the input.
259 [[nodiscard]] bool Next(typename base::Token& aToken);
262 * Parse the token on the input read cursor position, check its type is equal
263 * to aTokenType and if so, put it into aResult, shift the cursor and return
264 * true. Otherwise, leave the input read cursor position intact and return
265 * false.
267 [[nodiscard]] bool Check(const typename base::TokenType aTokenType,
268 typename base::Token& aResult);
270 * Same as above method, just compares both token type and token value passed
271 * in aToken. When both the type and the value equals, shift the cursor and
272 * return true. Otherwise return false.
274 [[nodiscard]] bool Check(const typename base::Token& aToken);
277 * SkipWhites method (below) may also skip new line characters automatically.
279 enum WhiteSkipping {
281 * SkipWhites will only skip what is defined as a white space (default).
283 DONT_INCLUDE_NEW_LINE = 0,
285 * SkipWhites will skip defined white spaces as well as new lines
286 * automatically.
288 INCLUDE_NEW_LINE = 1
292 * Skips any occurrence of whitespaces specified in mWhitespaces member,
293 * optionally skip also new lines.
295 void SkipWhites(WhiteSkipping aIncludeNewLines = DONT_INCLUDE_NEW_LINE);
298 * Skips all tokens until the given one is found or EOF is hit. The token
299 * or EOF are next to read.
301 void SkipUntil(typename base::Token const& aToken);
303 // These are mostly shortcuts for the Check() methods above.
306 * Check whitespace character is present.
308 [[nodiscard]] bool CheckWhite() { return Check(base::Token::Whitespace()); }
310 * Check there is a single character on the read cursor position. If so,
311 * shift the read cursor position and return true. Otherwise false.
313 [[nodiscard]] bool CheckChar(const TChar aChar) {
314 return Check(base::Token::Char(aChar));
317 * This is a customizable version of CheckChar. aClassifier is a function
318 * called with value of the character on the current input read position. If
319 * this user function returns true, read cursor is shifted and true returned.
320 * Otherwise false. The user classification function is not called when we are
321 * at or past the end and false is immediately returned.
323 [[nodiscard]] bool CheckChar(bool (*aClassifier)(const TChar aChar));
325 * Check for a whole expected word.
327 [[nodiscard]] bool CheckWord(const typename base::TAString& aWord) {
328 return Check(base::Token::Word(aWord));
331 * Shortcut for literal const word check with compile time length calculation.
333 template <uint32_t N>
334 [[nodiscard]] bool CheckWord(const TChar (&aWord)[N]) {
335 return Check(
336 base::Token::Word(typename base::TDependentString(aWord, N - 1)));
339 * Helper to check for a string compound of multiple tokens like "foo bar".
340 * The match is binary-exact, a white space or a delimiter character in the
341 * phrase must match exactly the characters in the input.
343 [[nodiscard]] bool CheckPhrase(const typename base::TAString& aPhrase);
344 template <uint32_t N>
345 [[nodiscard]] bool CheckPhrase(const TChar (&aPhrase)[N]) {
346 return CheckPhrase(typename base::TDependentString(aPhrase, N - 1));
349 * Checks \r, \n or \r\n.
351 [[nodiscard]] bool CheckEOL() { return Check(base::Token::NewLine()); }
353 * Checks we are at the end of the input string reading. If so, shift past
354 * the end and returns true. Otherwise does nothing and returns false.
356 [[nodiscard]] bool CheckEOF() { return Check(base::Token::EndOfFile()); }
359 * These are shortcuts to obtain the value immediately when the token type
360 * matches.
362 [[nodiscard]] bool ReadChar(TChar* aValue);
363 [[nodiscard]] bool ReadChar(bool (*aClassifier)(const TChar aChar),
364 TChar* aValue);
365 [[nodiscard]] bool ReadWord(typename base::TAString& aValue);
366 [[nodiscard]] bool ReadWord(typename base::TDependentSubstring& aValue);
369 * This is an integer read helper. It returns false and doesn't move the read
370 * cursor when any of the following happens:
371 * - the token at the read cursor is not an integer
372 * - the final number doesn't fit the T type
373 * Otherwise true is returned, aValue is filled with the integral number
374 * and the cursor is moved forward.
376 template <typename T>
377 [[nodiscard]] bool ReadInteger(T* aValue) {
378 MOZ_RELEASE_ASSERT(aValue);
380 typename base::TAString::const_char_iterator rollback = mRollback;
381 typename base::TAString::const_char_iterator cursor = base::mCursor;
382 typename base::Token t;
383 if (!Check(base::TOKEN_INTEGER, t)) {
384 return false;
387 mozilla::CheckedInt<T> checked(t.AsInteger());
388 if (!checked.isValid()) {
389 // Move to a state as if Check() call has failed
390 mRollback = rollback;
391 base::mCursor = cursor;
392 base::mHasFailed = true;
393 return false;
396 *aValue = checked.value();
397 return true;
401 * Same as above, but accepts an integer with an optional minus sign.
403 template <typename T, typename V = std::enable_if_t<
404 std::is_signed_v<std::remove_pointer_t<T>>,
405 std::remove_pointer_t<T>>>
406 [[nodiscard]] bool ReadSignedInteger(T* aValue) {
407 MOZ_RELEASE_ASSERT(aValue);
409 typename base::TAString::const_char_iterator rollback = mRollback;
410 typename base::TAString::const_char_iterator cursor = base::mCursor;
411 auto revert = MakeScopeExit([&] {
412 // Move to a state as if Check() call has failed
413 mRollback = rollback;
414 base::mCursor = cursor;
415 base::mHasFailed = true;
418 // Using functional raw access because '-' could be part of the word set
419 // making CheckChar('-') not work.
420 bool minus = CheckChar([](const TChar aChar) { return aChar == '-'; });
422 typename base::Token t;
423 if (!Check(base::TOKEN_INTEGER, t)) {
424 return false;
427 mozilla::CheckedInt<T> checked(t.AsInteger());
428 if (minus) {
429 checked *= -1;
432 if (!checked.isValid()) {
433 return false;
436 *aValue = checked.value();
437 revert.release();
438 return true;
442 * Returns the read cursor position back as it was before the last call of any
443 * parsing method of TTokenizer (Next, Check*, Skip*, Read*) so that the last
444 * operation can be repeated. Rollback cannot be used multiple times, it only
445 * reverts the last successful parse operation. It also cannot be used
446 * before any parsing operation has been called on the TTokenizer.
448 void Rollback();
451 * Record() and Claim() are collecting the input as it is being parsed to
452 * obtain a substring between particular syntax boundaries defined by any
453 * recursive descent parser or simple parser the TTokenizer is used to read
454 * the input for. Inclusion of a token that has just been parsed can be
455 * controlled using an argument.
457 enum ClaimInclusion {
459 * Include resulting (or passed) token of the last lexical analyzer
460 * operation in the result.
462 INCLUDE_LAST,
464 * Do not include it.
466 EXCLUDE_LAST
470 * Start the process of recording. Based on aInclude value the beginning of
471 * the recorded sub-string is at the current position (EXCLUDE_LAST) or at the
472 * position before the last parsed token (INCLUDE_LAST).
474 void Record(ClaimInclusion aInclude = EXCLUDE_LAST);
476 * Claim result of the record started with Record() call before. Depending on
477 * aInclude the ending of the sub-string result includes or excludes the last
478 * parsed or checked token.
480 void Claim(typename base::TAString& aResult,
481 ClaimInclusion aInclude = EXCLUDE_LAST);
482 void Claim(typename base::TDependentSubstring& aResult,
483 ClaimInclusion aInclude = EXCLUDE_LAST);
486 * If aToken is found, aResult is set to the substring between the current
487 * position and the position of aToken, potentially including aToken depending
488 * on aInclude.
489 * If aToken isn't found aResult is set to the substring between the current
490 * position and the end of the string.
491 * If aToken is found, the method returns true. Otherwise it returns false.
493 * Calling Rollback() after ReadUntil() will return the read cursor to the
494 * position it had before ReadUntil was called.
496 [[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
497 typename base::TDependentSubstring& aResult,
498 ClaimInclusion aInclude = EXCLUDE_LAST);
499 [[nodiscard]] bool ReadUntil(typename base::Token const& aToken,
500 typename base::TAString& aResult,
501 ClaimInclusion aInclude = EXCLUDE_LAST);
503 protected:
504 // All these point to the original buffer passed to the TTokenizer's
505 // constructor
506 typename base::TAString::const_char_iterator
507 mRecord; // Position where the recorded sub-string for Claim() is
508 typename base::TAString::const_char_iterator
509 mRollback; // Position of the previous token start
511 private:
512 TTokenizer() = delete;
513 TTokenizer(const TTokenizer&) = delete;
514 TTokenizer(TTokenizer&&) = delete;
515 TTokenizer(const TTokenizer&&) = delete;
516 TTokenizer& operator=(const TTokenizer&) = delete;
519 typedef TTokenizer<char> Tokenizer;
520 typedef TTokenizer<char16_t> Tokenizer16;
522 } // namespace mozilla
524 #endif // Tokenizer_h__