Bug 1854550 - pt 12. Allow inlining between mozjemalloc and PHC r=glandium
[gecko.git] / xpcom / ds / Tokenizer.cpp
blob3b0f6b02dd23282455a5e3b9ef893aa90d4daba3
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "Tokenizer.h"
9 #include "nsUnicharUtils.h"
10 #include <algorithm>
12 namespace mozilla {
14 template <>
15 char const TokenizerBase<char>::sWhitespaces[] = {' ', '\t', 0};
16 template <>
17 char16_t const TokenizerBase<char16_t>::sWhitespaces[3] = {' ', '\t', 0};
// Returns true when |needle| occurs in the zero-terminated character list
// |list|.  The terminating zero itself is never matched.
template <typename TChar>
static bool contains(TChar const* const list, TChar const needle) {
  TChar const* probe = list;
  while (*probe) {
    if (*probe == needle) {
      return true;
    }
    ++probe;
  }

  return false;
}
29 template <typename TChar>
30 TTokenizer<TChar>::TTokenizer(const typename base::TAString& aSource,
31 const TChar* aWhitespaces,
32 const TChar* aAdditionalWordChars)
33 : TokenizerBase<TChar>(aWhitespaces, aAdditionalWordChars) {
34 base::mInputFinished = true;
35 aSource.BeginReading(base::mCursor);
36 mRecord = mRollback = base::mCursor;
37 aSource.EndReading(base::mEnd);
40 template <typename TChar>
41 TTokenizer<TChar>::TTokenizer(const TChar* aSource, const TChar* aWhitespaces,
42 const TChar* aAdditionalWordChars)
43 : TTokenizer(typename base::TDependentString(aSource), aWhitespaces,
44 aAdditionalWordChars) {}
46 template <typename TChar>
47 bool TTokenizer<TChar>::Next(typename base::Token& aToken) {
48 if (!base::HasInput()) {
49 base::mHasFailed = true;
50 return false;
53 mRollback = base::mCursor;
54 base::mCursor = base::Parse(aToken);
56 base::AssignFragment(aToken, mRollback, base::mCursor);
58 base::mPastEof = aToken.Type() == base::TOKEN_EOF;
59 base::mHasFailed = false;
60 return true;
63 template <typename TChar>
64 bool TTokenizer<TChar>::Check(const typename base::TokenType aTokenType,
65 typename base::Token& aResult) {
66 if (!base::HasInput()) {
67 base::mHasFailed = true;
68 return false;
71 typename base::TAString::const_char_iterator next = base::Parse(aResult);
72 if (aTokenType != aResult.Type()) {
73 base::mHasFailed = true;
74 return false;
77 mRollback = base::mCursor;
78 base::mCursor = next;
80 base::AssignFragment(aResult, mRollback, base::mCursor);
82 base::mPastEof = aResult.Type() == base::TOKEN_EOF;
83 base::mHasFailed = false;
84 return true;
87 template <typename TChar>
88 bool TTokenizer<TChar>::Check(const typename base::Token& aToken) {
89 #ifdef DEBUG
90 base::Validate(aToken);
91 #endif
93 if (!base::HasInput()) {
94 base::mHasFailed = true;
95 return false;
98 typename base::Token parsed;
99 typename base::TAString::const_char_iterator next = base::Parse(parsed);
100 if (!aToken.Equals(parsed)) {
101 base::mHasFailed = true;
102 return false;
105 mRollback = base::mCursor;
106 base::mCursor = next;
107 base::mPastEof = parsed.Type() == base::TOKEN_EOF;
108 base::mHasFailed = false;
109 return true;
112 template <typename TChar>
113 void TTokenizer<TChar>::SkipWhites(WhiteSkipping aIncludeNewLines) {
114 if (!CheckWhite() &&
115 (aIncludeNewLines == DONT_INCLUDE_NEW_LINE || !CheckEOL())) {
116 return;
119 typename base::TAString::const_char_iterator rollback = mRollback;
120 while (CheckWhite() || (aIncludeNewLines == INCLUDE_NEW_LINE && CheckEOL())) {
123 base::mHasFailed = false;
124 mRollback = rollback;
127 template <typename TChar>
128 void TTokenizer<TChar>::SkipUntil(typename base::Token const& aToken) {
129 typename base::TAString::const_char_iterator rollback = base::mCursor;
130 const typename base::Token eof = base::Token::EndOfFile();
132 typename base::Token t;
133 while (Next(t)) {
134 if (aToken.Equals(t) || eof.Equals(t)) {
135 Rollback();
136 break;
140 mRollback = rollback;
143 template <typename TChar>
144 bool TTokenizer<TChar>::CheckChar(bool (*aClassifier)(const TChar aChar)) {
145 if (!aClassifier) {
146 MOZ_ASSERT(false);
147 return false;
150 if (!base::HasInput() || base::mCursor == base::mEnd) {
151 base::mHasFailed = true;
152 return false;
155 if (!aClassifier(*base::mCursor)) {
156 base::mHasFailed = true;
157 return false;
160 mRollback = base::mCursor;
161 ++base::mCursor;
162 base::mHasFailed = false;
163 return true;
166 template <typename TChar>
167 bool TTokenizer<TChar>::CheckPhrase(const typename base::TAString& aPhrase) {
168 if (!base::HasInput()) {
169 return false;
172 typedef typename base::TAString::const_char_iterator Cursor;
174 TTokenizer<TChar> pattern(aPhrase);
175 MOZ_ASSERT(!pattern.CheckEOF(),
176 "This will return true but won't shift the Tokenizer's cursor");
178 return [&](Cursor cursor, Cursor rollback) mutable {
179 while (true) {
180 if (pattern.CheckEOF()) {
181 base::mHasFailed = false;
182 mRollback = cursor;
183 return true;
186 typename base::Token t1, t2;
187 Unused << Next(t1);
188 Unused << pattern.Next(t2);
189 if (t1.Type() == t2.Type() && t1.Fragment().Equals(t2.Fragment())) {
190 continue;
193 break;
196 base::mHasFailed = true;
197 base::mPastEof = false;
198 base::mCursor = cursor;
199 mRollback = rollback;
200 return false;
201 }(base::mCursor, mRollback);
204 template <typename TChar>
205 bool TTokenizer<TChar>::ReadChar(TChar* aValue) {
206 MOZ_RELEASE_ASSERT(aValue);
208 typename base::Token t;
209 if (!Check(base::TOKEN_CHAR, t)) {
210 return false;
213 *aValue = t.AsChar();
214 return true;
217 template <typename TChar>
218 bool TTokenizer<TChar>::ReadChar(bool (*aClassifier)(const TChar aChar),
219 TChar* aValue) {
220 MOZ_RELEASE_ASSERT(aValue);
222 if (!CheckChar(aClassifier)) {
223 return false;
226 *aValue = *mRollback;
227 return true;
230 template <typename TChar>
231 bool TTokenizer<TChar>::ReadWord(typename base::TAString& aValue) {
232 typename base::Token t;
233 if (!Check(base::TOKEN_WORD, t)) {
234 return false;
237 aValue.Assign(t.AsString());
238 return true;
241 template <typename TChar>
242 bool TTokenizer<TChar>::ReadWord(typename base::TDependentSubstring& aValue) {
243 typename base::Token t;
244 if (!Check(base::TOKEN_WORD, t)) {
245 return false;
248 aValue.Rebind(t.AsString().BeginReading(), t.AsString().Length());
249 return true;
252 template <typename TChar>
253 bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken,
254 typename base::TAString& aResult,
255 ClaimInclusion aInclude) {
256 typename base::TDependentSubstring substring;
257 bool rv = ReadUntil(aToken, substring, aInclude);
258 aResult.Assign(substring);
259 return rv;
262 template <typename TChar>
263 bool TTokenizer<TChar>::ReadUntil(typename base::Token const& aToken,
264 typename base::TDependentSubstring& aResult,
265 ClaimInclusion aInclude) {
266 typename base::TAString::const_char_iterator record = mRecord;
267 Record();
268 typename base::TAString::const_char_iterator rollback = mRollback =
269 base::mCursor;
271 bool found = false;
272 typename base::Token t;
273 while (Next(t)) {
274 if (aToken.Equals(t)) {
275 found = true;
276 break;
278 if (t.Equals(base::Token::EndOfFile())) {
279 // We don't want to eat it.
280 Rollback();
281 break;
285 Claim(aResult, aInclude);
286 mRollback = rollback;
287 mRecord = record;
288 return found;
291 template <typename TChar>
292 void TTokenizer<TChar>::Rollback() {
293 MOZ_ASSERT(base::mCursor > mRollback || base::mPastEof, "TODO!!!");
295 base::mPastEof = false;
296 base::mHasFailed = false;
297 base::mCursor = mRollback;
300 template <typename TChar>
301 void TTokenizer<TChar>::Record(ClaimInclusion aInclude) {
302 mRecord = aInclude == INCLUDE_LAST ? mRollback : base::mCursor;
305 template <typename TChar>
306 void TTokenizer<TChar>::Claim(typename base::TAString& aResult,
307 ClaimInclusion aInclusion) {
308 typename base::TAString::const_char_iterator close =
309 aInclusion == EXCLUDE_LAST ? mRollback : base::mCursor;
310 aResult.Assign(Substring(mRecord, close));
313 template <typename TChar>
314 void TTokenizer<TChar>::Claim(typename base::TDependentSubstring& aResult,
315 ClaimInclusion aInclusion) {
316 typename base::TAString::const_char_iterator close =
317 aInclusion == EXCLUDE_LAST ? mRollback : base::mCursor;
319 MOZ_RELEASE_ASSERT(close >= mRecord, "Overflow!");
320 aResult.Rebind(mRecord, close - mRecord);
323 // TokenizerBase
325 template <typename TChar>
326 TokenizerBase<TChar>::TokenizerBase(const TChar* aWhitespaces,
327 const TChar* aAdditionalWordChars)
328 : mPastEof(false),
329 mHasFailed(false),
330 mInputFinished(true),
331 mMode(Mode::FULL),
332 mMinRawDelivery(1024),
333 mWhitespaces(aWhitespaces ? aWhitespaces : sWhitespaces),
334 mAdditionalWordChars(aAdditionalWordChars),
335 mCursor(nullptr),
336 mEnd(nullptr),
337 mNextCustomTokenID(TOKEN_CUSTOM0) {}
339 template <typename TChar>
340 auto TokenizerBase<TChar>::AddCustomToken(const TAString& aValue,
341 ECaseSensitivity aCaseInsensitivity,
342 bool aEnabled) -> Token {
343 MOZ_ASSERT(!aValue.IsEmpty());
345 UniquePtr<Token>& t = *mCustomTokens.AppendElement();
346 t = MakeUnique<Token>();
348 t->mType = static_cast<TokenType>(++mNextCustomTokenID);
349 t->mCustomCaseInsensitivity = aCaseInsensitivity;
350 t->mCustomEnabled = aEnabled;
351 t->mCustom.Assign(aValue);
352 return *t;
355 template <typename TChar>
356 void TokenizerBase<TChar>::RemoveCustomToken(Token& aToken) {
357 if (aToken.mType == TOKEN_UNKNOWN) {
358 // Already removed
359 return;
362 for (UniquePtr<Token> const& custom : mCustomTokens) {
363 if (custom->mType == aToken.mType) {
364 mCustomTokens.RemoveElement(custom);
365 aToken.mType = TOKEN_UNKNOWN;
366 return;
370 MOZ_ASSERT(false, "Token to remove not found");
373 template <typename TChar>
374 void TokenizerBase<TChar>::EnableCustomToken(Token const& aToken,
375 bool aEnabled) {
376 if (aToken.mType == TOKEN_UNKNOWN) {
377 // Already removed
378 return;
381 for (UniquePtr<Token> const& custom : mCustomTokens) {
382 if (custom->Type() == aToken.Type()) {
383 // This effectively destroys the token instance.
384 custom->mCustomEnabled = aEnabled;
385 return;
389 MOZ_ASSERT(false, "Token to change not found");
392 template <typename TChar>
393 void TokenizerBase<TChar>::SetTokenizingMode(Mode aMode) {
394 mMode = aMode;
397 template <typename TChar>
398 bool TokenizerBase<TChar>::HasFailed() const {
399 return mHasFailed;
402 template <typename TChar>
403 bool TokenizerBase<TChar>::HasInput() const {
404 return !mPastEof;
407 template <typename TChar>
408 auto TokenizerBase<TChar>::Parse(Token& aToken) const ->
409 typename TAString::const_char_iterator {
410 if (mCursor == mEnd) {
411 if (!mInputFinished) {
412 return mCursor;
415 aToken = Token::EndOfFile();
416 return mEnd;
419 MOZ_RELEASE_ASSERT(mEnd >= mCursor, "Overflow!");
420 typename TAString::size_type available = mEnd - mCursor;
422 uint32_t longestCustom = 0;
423 for (UniquePtr<Token> const& custom : mCustomTokens) {
424 if (IsCustom(mCursor, *custom, &longestCustom)) {
425 aToken = *custom;
426 return mCursor + custom->mCustom.Length();
430 if (!mInputFinished && available < longestCustom) {
431 // Not enough data to deterministically decide.
432 return mCursor;
435 typename TAString::const_char_iterator next = mCursor;
437 if (mMode == Mode::CUSTOM_ONLY) {
438 // We have to do a brute-force search for all of the enabled custom
439 // tokens.
440 while (next < mEnd) {
441 ++next;
442 for (UniquePtr<Token> const& custom : mCustomTokens) {
443 if (IsCustom(next, *custom)) {
444 aToken = Token::Raw();
445 return next;
450 if (mInputFinished) {
451 // End of the data reached.
452 aToken = Token::Raw();
453 return next;
456 if (longestCustom < available && available > mMinRawDelivery) {
457 // We can return some data w/o waiting for either a custom token
458 // or call to FinishData() when we leave the tail where all the
459 // custom tokens potentially fit, so we can't lose only partially
460 // delivered tokens. This preserves reasonable granularity.
461 aToken = Token::Raw();
462 return mEnd - longestCustom + 1;
465 // Not enough data to deterministically decide.
466 return mCursor;
469 enum State {
470 PARSE_INTEGER,
471 PARSE_WORD,
472 PARSE_CRLF,
473 PARSE_LF,
474 PARSE_WS,
475 PARSE_CHAR,
476 } state;
478 if (IsWordFirst(*next)) {
479 state = PARSE_WORD;
480 } else if (IsNumber(*next)) {
481 state = PARSE_INTEGER;
482 } else if (contains(mWhitespaces, *next)) { // not UTF-8 friendly?
483 state = PARSE_WS;
484 } else if (*next == '\r') {
485 state = PARSE_CRLF;
486 } else if (*next == '\n') {
487 state = PARSE_LF;
488 } else {
489 state = PARSE_CHAR;
492 mozilla::CheckedUint64 resultingNumber = 0;
494 while (next < mEnd) {
495 switch (state) {
496 case PARSE_INTEGER:
497 // Keep it simple for now
498 resultingNumber *= 10;
499 resultingNumber += static_cast<uint64_t>(*next - '0');
501 ++next;
502 if (IsPending(next)) {
503 break;
505 if (IsEnd(next) || !IsNumber(*next)) {
506 if (!resultingNumber.isValid()) {
507 aToken = Token::Error();
508 } else {
509 aToken = Token::Number(resultingNumber.value());
511 return next;
513 break;
515 case PARSE_WORD:
516 ++next;
517 if (IsPending(next)) {
518 break;
520 if (IsEnd(next) || !IsWord(*next)) {
521 aToken = Token::Word(Substring(mCursor, next));
522 return next;
524 break;
526 case PARSE_CRLF:
527 ++next;
528 if (IsPending(next)) {
529 break;
531 if (!IsEnd(next) && *next == '\n') { // LF is optional
532 ++next;
534 aToken = Token::NewLine();
535 return next;
537 case PARSE_LF:
538 ++next;
539 aToken = Token::NewLine();
540 return next;
542 case PARSE_WS:
543 ++next;
544 aToken = Token::Whitespace();
545 return next;
547 case PARSE_CHAR:
548 ++next;
549 aToken = Token::Char(*mCursor);
550 return next;
551 } // switch (state)
552 } // while (next < end)
554 MOZ_ASSERT(!mInputFinished);
555 return mCursor;
558 template <typename TChar>
559 bool TokenizerBase<TChar>::IsEnd(
560 const typename TAString::const_char_iterator& caret) const {
561 return caret == mEnd;
564 template <typename TChar>
565 bool TokenizerBase<TChar>::IsPending(
566 const typename TAString::const_char_iterator& caret) const {
567 return IsEnd(caret) && !mInputFinished;
570 template <typename TChar>
571 bool TokenizerBase<TChar>::IsWordFirst(const TChar aInput) const {
572 // TODO: make this fully work with unicode
573 return (ToLowerCase(static_cast<uint32_t>(aInput)) !=
574 ToUpperCase(static_cast<uint32_t>(aInput))) ||
575 '_' == aInput ||
576 (mAdditionalWordChars ? contains(mAdditionalWordChars, aInput)
577 : false);
580 template <typename TChar>
581 bool TokenizerBase<TChar>::IsWord(const TChar aInput) const {
582 return IsWordFirst(aInput) || IsNumber(aInput);
585 template <typename TChar>
586 bool TokenizerBase<TChar>::IsNumber(const TChar aInput) const {
587 // TODO: are there unicode numbers?
588 return aInput >= '0' && aInput <= '9';
591 template <typename TChar>
592 bool TokenizerBase<TChar>::IsCustom(
593 const typename TAString::const_char_iterator& caret,
594 const Token& aCustomToken, uint32_t* aLongest) const {
595 MOZ_ASSERT(aCustomToken.mType > TOKEN_CUSTOM0);
596 if (!aCustomToken.mCustomEnabled) {
597 return false;
600 if (aLongest) {
601 *aLongest = std::max<uint32_t>(*aLongest, aCustomToken.mCustom.Length());
604 // This is not very likely to happen according to how we call this method
605 // and since it's on a hot path, it's just a diagnostic assert,
606 // not a release assert.
607 MOZ_DIAGNOSTIC_ASSERT(mEnd >= caret, "Overflow?");
608 uint32_t inputLength = mEnd - caret;
609 if (aCustomToken.mCustom.Length() > inputLength) {
610 return false;
613 TDependentSubstring inputFragment(caret, aCustomToken.mCustom.Length());
614 if (aCustomToken.mCustomCaseInsensitivity == CASE_INSENSITIVE) {
615 if constexpr (std::is_same_v<TChar, char>) {
616 return inputFragment.Equals(aCustomToken.mCustom,
617 nsCaseInsensitiveUTF8StringComparator);
618 } else {
619 return inputFragment.Equals(aCustomToken.mCustom,
620 nsCaseInsensitiveStringComparator);
623 return inputFragment.Equals(aCustomToken.mCustom);
626 template <typename TChar>
627 void TokenizerBase<TChar>::AssignFragment(
628 Token& aToken, typename TAString::const_char_iterator begin,
629 typename TAString::const_char_iterator end) {
630 aToken.AssignFragment(begin, end);
#ifdef DEBUG

// Debug-only sanity check of a word token handed in by a caller: the first
// character must be able to start a word and every following character must
// be a word character.  Tokens of any other type are not checked.
template <typename TChar>
void TokenizerBase<TChar>::Validate(Token const& aToken) {
  if (aToken.Type() != TOKEN_WORD) {
    return;
  }

  typename TAString::const_char_iterator current =
      aToken.AsString().BeginReading();
  typename TAString::const_char_iterator const last =
      aToken.AsString().EndReading();

  if (!(current < last)) {
    // Empty word, nothing to verify.
    return;
  }

  MOZ_ASSERT(IsWordFirst(*current));
  for (++current; current < last; ++current) {
    MOZ_ASSERT(IsWord(*current));
  }
}

#endif
652 // TokenizerBase::Token
654 template <typename TChar>
655 TokenizerBase<TChar>::Token::Token()
656 : mType(TOKEN_UNKNOWN),
657 mChar(0),
658 mInteger(0),
659 mCustomCaseInsensitivity(CASE_SENSITIVE),
660 mCustomEnabled(false) {}
662 template <typename TChar>
663 TokenizerBase<TChar>::Token::Token(const Token& aOther)
664 : mType(aOther.mType),
665 mCustom(aOther.mCustom),
666 mChar(aOther.mChar),
667 mInteger(aOther.mInteger),
668 mCustomCaseInsensitivity(aOther.mCustomCaseInsensitivity),
669 mCustomEnabled(aOther.mCustomEnabled) {
670 if (mType == TOKEN_WORD || mType > TOKEN_CUSTOM0) {
671 mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
675 template <typename TChar>
676 auto TokenizerBase<TChar>::Token::operator=(const Token& aOther) -> Token& {
677 mType = aOther.mType;
678 mCustom = aOther.mCustom;
679 mChar = aOther.mChar;
680 mWord.Rebind(aOther.mWord.BeginReading(), aOther.mWord.Length());
681 mInteger = aOther.mInteger;
682 mCustomCaseInsensitivity = aOther.mCustomCaseInsensitivity;
683 mCustomEnabled = aOther.mCustomEnabled;
684 return *this;
687 template <typename TChar>
688 void TokenizerBase<TChar>::Token::AssignFragment(
689 typename TAString::const_char_iterator begin,
690 typename TAString::const_char_iterator end) {
691 MOZ_RELEASE_ASSERT(end >= begin, "Overflow!");
692 mFragment.Rebind(begin, end - begin);
695 // static
696 template <typename TChar>
697 auto TokenizerBase<TChar>::Token::Raw() -> Token {
698 Token t;
699 t.mType = TOKEN_RAW;
700 return t;
703 // static
704 template <typename TChar>
705 auto TokenizerBase<TChar>::Token::Word(TAString const& aValue) -> Token {
706 Token t;
707 t.mType = TOKEN_WORD;
708 t.mWord.Rebind(aValue.BeginReading(), aValue.Length());
709 return t;
712 // static
713 template <typename TChar>
714 auto TokenizerBase<TChar>::Token::Char(TChar const aValue) -> Token {
715 Token t;
716 t.mType = TOKEN_CHAR;
717 t.mChar = aValue;
718 return t;
721 // static
722 template <typename TChar>
723 auto TokenizerBase<TChar>::Token::Number(uint64_t const aValue) -> Token {
724 Token t;
725 t.mType = TOKEN_INTEGER;
726 t.mInteger = aValue;
727 return t;
730 // static
731 template <typename TChar>
732 auto TokenizerBase<TChar>::Token::Whitespace() -> Token {
733 Token t;
734 t.mType = TOKEN_WS;
735 t.mChar = '\0';
736 return t;
739 // static
740 template <typename TChar>
741 auto TokenizerBase<TChar>::Token::NewLine() -> Token {
742 Token t;
743 t.mType = TOKEN_EOL;
744 return t;
747 // static
748 template <typename TChar>
749 auto TokenizerBase<TChar>::Token::EndOfFile() -> Token {
750 Token t;
751 t.mType = TOKEN_EOF;
752 return t;
755 // static
756 template <typename TChar>
757 auto TokenizerBase<TChar>::Token::Error() -> Token {
758 Token t;
759 t.mType = TOKEN_ERROR;
760 return t;
763 template <typename TChar>
764 bool TokenizerBase<TChar>::Token::Equals(const Token& aOther) const {
765 if (mType != aOther.mType) {
766 return false;
769 switch (mType) {
770 case TOKEN_INTEGER:
771 return AsInteger() == aOther.AsInteger();
772 case TOKEN_WORD:
773 return AsString() == aOther.AsString();
774 case TOKEN_CHAR:
775 return AsChar() == aOther.AsChar();
776 default:
777 return true;
781 template <typename TChar>
782 TChar TokenizerBase<TChar>::Token::AsChar() const {
783 MOZ_ASSERT(mType == TOKEN_CHAR || mType == TOKEN_WS);
784 return mChar;
787 template <typename TChar>
788 auto TokenizerBase<TChar>::Token::AsString() const -> TDependentSubstring {
789 MOZ_ASSERT(mType == TOKEN_WORD);
790 return mWord;
793 template <typename TChar>
794 uint64_t TokenizerBase<TChar>::Token::AsInteger() const {
795 MOZ_ASSERT(mType == TOKEN_INTEGER);
796 return mInteger;
// The member templates above are defined in this source file, so both class
// templates are explicitly instantiated here for char and char16_t.
template class TokenizerBase<char>;
template class TokenizerBase<char16_t>;

template class TTokenizer<char>;
template class TTokenizer<char16_t>;
805 } // namespace mozilla