1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
9 #include "nsUnicharUtils.h"
// Whitespace set for the char tokenizer: space, tab, and a terminating 0.
// The TokenizerBase constructor falls back to this array when it is given a
// null aWhitespaces pointer.
15 char const TokenizerBase
<char>::sWhitespaces
[] = {' ', '\t', 0};
// Whitespace set for the char16_t tokenizer: space, tab, and a terminating 0
// (three elements, as the explicit array bound states).
17 char16_t
const TokenizerBase
<char16_t
>::sWhitespaces
[3] = {' ', '\t', 0};
19 template <typename TChar
>
20 static bool contains(TChar
const* const list
, TChar
const needle
) {
21 for (TChar
const* c
= list
; *c
; ++c
) {
29 template <typename TChar
>
30 TTokenizer
<TChar
>::TTokenizer(const typename
base::TAString
& aSource
,
31 const TChar
* aWhitespaces
,
32 const TChar
* aAdditionalWordChars
)
33 : TokenizerBase
<TChar
>(aWhitespaces
, aAdditionalWordChars
) {
34 base::mInputFinished
= true;
35 aSource
.BeginReading(base::mCursor
);
36 mRecord
= mRollback
= base::mCursor
;
37 aSource
.EndReading(base::mEnd
);
40 template <typename TChar
>
41 TTokenizer
<TChar
>::TTokenizer(const TChar
* aSource
, const TChar
* aWhitespaces
,
42 const TChar
* aAdditionalWordChars
)
43 : TTokenizer(typename
base::TDependentString(aSource
), aWhitespaces
,
44 aAdditionalWordChars
) {}
46 template <typename TChar
>
47 bool TTokenizer
<TChar
>::Next(typename
base::Token
& aToken
) {
48 if (!base::HasInput()) {
49 base::mHasFailed
= true;
53 mRollback
= base::mCursor
;
54 base::mCursor
= base::Parse(aToken
);
56 base::AssignFragment(aToken
, mRollback
, base::mCursor
);
58 base::mPastEof
= aToken
.Type() == base::TOKEN_EOF
;
59 base::mHasFailed
= false;
63 template <typename TChar
>
64 bool TTokenizer
<TChar
>::Check(const typename
base::TokenType aTokenType
,
65 typename
base::Token
& aResult
) {
66 if (!base::HasInput()) {
67 base::mHasFailed
= true;
71 typename
base::TAString::const_char_iterator next
= base::Parse(aResult
);
72 if (aTokenType
!= aResult
.Type()) {
73 base::mHasFailed
= true;
77 mRollback
= base::mCursor
;
80 base::AssignFragment(aResult
, mRollback
, base::mCursor
);
82 base::mPastEof
= aResult
.Type() == base::TOKEN_EOF
;
83 base::mHasFailed
= false;
87 template <typename TChar
>
88 bool TTokenizer
<TChar
>::Check(const typename
base::Token
& aToken
) {
90 base::Validate(aToken
);
93 if (!base::HasInput()) {
94 base::mHasFailed
= true;
98 typename
base::Token parsed
;
99 typename
base::TAString::const_char_iterator next
= base::Parse(parsed
);
100 if (!aToken
.Equals(parsed
)) {
101 base::mHasFailed
= true;
105 mRollback
= base::mCursor
;
106 base::mCursor
= next
;
107 base::mPastEof
= parsed
.Type() == base::TOKEN_EOF
;
108 base::mHasFailed
= false;
112 template <typename TChar
>
113 void TTokenizer
<TChar
>::SkipWhites(WhiteSkipping aIncludeNewLines
) {
115 (aIncludeNewLines
== DONT_INCLUDE_NEW_LINE
|| !CheckEOL())) {
119 typename
base::TAString::const_char_iterator rollback
= mRollback
;
120 while (CheckWhite() || (aIncludeNewLines
== INCLUDE_NEW_LINE
&& CheckEOL())) {
123 base::mHasFailed
= false;
124 mRollback
= rollback
;
127 template <typename TChar
>
128 void TTokenizer
<TChar
>::SkipUntil(typename
base::Token
const& aToken
) {
129 typename
base::TAString::const_char_iterator rollback
= base::mCursor
;
130 const typename
base::Token eof
= base::Token::EndOfFile();
132 typename
base::Token t
;
134 if (aToken
.Equals(t
) || eof
.Equals(t
)) {
140 mRollback
= rollback
;
143 template <typename TChar
>
144 bool TTokenizer
<TChar
>::CheckChar(bool (*aClassifier
)(const TChar aChar
)) {
150 if (!base::HasInput() || base::mCursor
== base::mEnd
) {
151 base::mHasFailed
= true;
155 if (!aClassifier(*base::mCursor
)) {
156 base::mHasFailed
= true;
160 mRollback
= base::mCursor
;
162 base::mHasFailed
= false;
166 template <typename TChar
>
167 bool TTokenizer
<TChar
>::CheckPhrase(const typename
base::TAString
& aPhrase
) {
168 if (!base::HasInput()) {
172 typedef typename
base::TAString::const_char_iterator Cursor
;
174 TTokenizer
<TChar
> pattern(aPhrase
);
175 MOZ_ASSERT(!pattern
.CheckEOF(),
176 "This will return true but won't shift the Tokenizer's cursor");
178 return [&](Cursor cursor
, Cursor rollback
) mutable {
180 if (pattern
.CheckEOF()) {
181 base::mHasFailed
= false;
186 typename
base::Token t1
, t2
;
188 Unused
<< pattern
.Next(t2
);
189 if (t1
.Type() == t2
.Type() && t1
.Fragment().Equals(t2
.Fragment())) {
196 base::mHasFailed
= true;
197 base::mPastEof
= false;
198 base::mCursor
= cursor
;
199 mRollback
= rollback
;
201 }(base::mCursor
, mRollback
);
204 template <typename TChar
>
205 bool TTokenizer
<TChar
>::ReadChar(TChar
* aValue
) {
206 MOZ_RELEASE_ASSERT(aValue
);
208 typename
base::Token t
;
209 if (!Check(base::TOKEN_CHAR
, t
)) {
213 *aValue
= t
.AsChar();
217 template <typename TChar
>
218 bool TTokenizer
<TChar
>::ReadChar(bool (*aClassifier
)(const TChar aChar
),
220 MOZ_RELEASE_ASSERT(aValue
);
222 if (!CheckChar(aClassifier
)) {
226 *aValue
= *mRollback
;
230 template <typename TChar
>
231 bool TTokenizer
<TChar
>::ReadWord(typename
base::TAString
& aValue
) {
232 typename
base::Token t
;
233 if (!Check(base::TOKEN_WORD
, t
)) {
237 aValue
.Assign(t
.AsString());
241 template <typename TChar
>
242 bool TTokenizer
<TChar
>::ReadWord(typename
base::TDependentSubstring
& aValue
) {
243 typename
base::Token t
;
244 if (!Check(base::TOKEN_WORD
, t
)) {
248 aValue
.Rebind(t
.AsString().BeginReading(), t
.AsString().Length());
252 template <typename TChar
>
253 bool TTokenizer
<TChar
>::ReadUntil(typename
base::Token
const& aToken
,
254 typename
base::TAString
& aResult
,
255 ClaimInclusion aInclude
) {
256 typename
base::TDependentSubstring substring
;
257 bool rv
= ReadUntil(aToken
, substring
, aInclude
);
258 aResult
.Assign(substring
);
262 template <typename TChar
>
263 bool TTokenizer
<TChar
>::ReadUntil(typename
base::Token
const& aToken
,
264 typename
base::TDependentSubstring
& aResult
,
265 ClaimInclusion aInclude
) {
266 typename
base::TAString::const_char_iterator record
= mRecord
;
268 typename
base::TAString::const_char_iterator rollback
= mRollback
=
272 typename
base::Token t
;
274 if (aToken
.Equals(t
)) {
278 if (t
.Equals(base::Token::EndOfFile())) {
279 // We don't want to eat it.
285 Claim(aResult
, aInclude
);
286 mRollback
= rollback
;
291 template <typename TChar
>
292 void TTokenizer
<TChar
>::Rollback() {
293 MOZ_ASSERT(base::mCursor
> mRollback
|| base::mPastEof
, "TODO!!!");
295 base::mPastEof
= false;
296 base::mHasFailed
= false;
297 base::mCursor
= mRollback
;
300 template <typename TChar
>
301 void TTokenizer
<TChar
>::Record(ClaimInclusion aInclude
) {
302 mRecord
= aInclude
== INCLUDE_LAST
? mRollback
: base::mCursor
;
305 template <typename TChar
>
306 void TTokenizer
<TChar
>::Claim(typename
base::TAString
& aResult
,
307 ClaimInclusion aInclusion
) {
308 typename
base::TAString::const_char_iterator close
=
309 aInclusion
== EXCLUDE_LAST
? mRollback
: base::mCursor
;
310 aResult
.Assign(Substring(mRecord
, close
));
313 template <typename TChar
>
314 void TTokenizer
<TChar
>::Claim(typename
base::TDependentSubstring
& aResult
,
315 ClaimInclusion aInclusion
) {
316 typename
base::TAString::const_char_iterator close
=
317 aInclusion
== EXCLUDE_LAST
? mRollback
: base::mCursor
;
319 MOZ_RELEASE_ASSERT(close
>= mRecord
, "Overflow!");
320 aResult
.Rebind(mRecord
, close
- mRecord
);
325 template <typename TChar
>
326 TokenizerBase
<TChar
>::TokenizerBase(const TChar
* aWhitespaces
,
327 const TChar
* aAdditionalWordChars
)
330 mInputFinished(true),
332 mMinRawDelivery(1024),
333 mWhitespaces(aWhitespaces
? aWhitespaces
: sWhitespaces
),
334 mAdditionalWordChars(aAdditionalWordChars
),
337 mNextCustomTokenID(TOKEN_CUSTOM0
) {}
339 template <typename TChar
>
340 auto TokenizerBase
<TChar
>::AddCustomToken(const TAString
& aValue
,
341 ECaseSensitivity aCaseInsensitivity
,
342 bool aEnabled
) -> Token
{
343 MOZ_ASSERT(!aValue
.IsEmpty());
345 UniquePtr
<Token
>& t
= *mCustomTokens
.AppendElement();
346 t
= MakeUnique
<Token
>();
348 t
->mType
= static_cast<TokenType
>(++mNextCustomTokenID
);
349 t
->mCustomCaseInsensitivity
= aCaseInsensitivity
;
350 t
->mCustomEnabled
= aEnabled
;
351 t
->mCustom
.Assign(aValue
);
355 template <typename TChar
>
356 void TokenizerBase
<TChar
>::RemoveCustomToken(Token
& aToken
) {
357 if (aToken
.mType
== TOKEN_UNKNOWN
) {
362 for (UniquePtr
<Token
> const& custom
: mCustomTokens
) {
363 if (custom
->mType
== aToken
.mType
) {
364 mCustomTokens
.RemoveElement(custom
);
365 aToken
.mType
= TOKEN_UNKNOWN
;
370 MOZ_ASSERT(false, "Token to remove not found");
373 template <typename TChar
>
374 void TokenizerBase
<TChar
>::EnableCustomToken(Token
const& aToken
,
376 if (aToken
.mType
== TOKEN_UNKNOWN
) {
381 for (UniquePtr
<Token
> const& custom
: mCustomTokens
) {
382 if (custom
->Type() == aToken
.Type()) {
383 // Only toggles the enabled flag; the token itself stays registered.
384 custom
->mCustomEnabled
= aEnabled
;
389 MOZ_ASSERT(false, "Token to change not found");
392 template <typename TChar
>
393 void TokenizerBase
<TChar
>::SetTokenizingMode(Mode aMode
) {
397 template <typename TChar
>
398 bool TokenizerBase
<TChar
>::HasFailed() const {
402 template <typename TChar
>
403 bool TokenizerBase
<TChar
>::HasInput() const {
407 template <typename TChar
>
408 auto TokenizerBase
<TChar
>::Parse(Token
& aToken
) const ->
409 typename
TAString::const_char_iterator
{
410 if (mCursor
== mEnd
) {
411 if (!mInputFinished
) {
415 aToken
= Token::EndOfFile();
419 MOZ_RELEASE_ASSERT(mEnd
>= mCursor
, "Overflow!");
420 typename
TAString::size_type available
= mEnd
- mCursor
;
422 uint32_t longestCustom
= 0;
423 for (UniquePtr
<Token
> const& custom
: mCustomTokens
) {
424 if (IsCustom(mCursor
, *custom
, &longestCustom
)) {
426 return mCursor
+ custom
->mCustom
.Length();
430 if (!mInputFinished
&& available
< longestCustom
) {
431 // Not enough data to deterministically decide.
435 typename
TAString::const_char_iterator next
= mCursor
;
437 if (mMode
== Mode::CUSTOM_ONLY
) {
438 // We have to do a brute-force search for all of the enabled custom
440 while (next
< mEnd
) {
442 for (UniquePtr
<Token
> const& custom
: mCustomTokens
) {
443 if (IsCustom(next
, *custom
)) {
444 aToken
= Token::Raw();
450 if (mInputFinished
) {
451 // End of the data reached.
452 aToken
= Token::Raw();
456 if (longestCustom
< available
&& available
> mMinRawDelivery
) {
457 // We can deliver some raw data now, without waiting for a custom token
458 // or a call to FinishData(), as long as we hold back a tail in which any
459 // custom token could still fit; a token that has only partially arrived
460 // is then never lost. This preserves reasonable granularity.
461 aToken
= Token::Raw();
462 return mEnd
- longestCustom
+ 1;
465 // Not enough data to deterministically decide.
478 if (IsWordFirst(*next
)) {
480 } else if (IsNumber(*next
)) {
481 state
= PARSE_INTEGER
;
482 } else if (contains(mWhitespaces
, *next
)) { // not UTF-8 friendly?
484 } else if (*next
== '\r') {
486 } else if (*next
== '\n') {
492 mozilla::CheckedUint64 resultingNumber
= 0;
494 while (next
< mEnd
) {
497 // Keep it simple for now
498 resultingNumber
*= 10;
499 resultingNumber
+= static_cast<uint64_t>(*next
- '0');
502 if (IsPending(next
)) {
505 if (IsEnd(next
) || !IsNumber(*next
)) {
506 if (!resultingNumber
.isValid()) {
507 aToken
= Token::Error();
509 aToken
= Token::Number(resultingNumber
.value());
517 if (IsPending(next
)) {
520 if (IsEnd(next
) || !IsWord(*next
)) {
521 aToken
= Token::Word(Substring(mCursor
, next
));
528 if (IsPending(next
)) {
531 if (!IsEnd(next
) && *next
== '\n') { // LF is optional
534 aToken
= Token::NewLine();
539 aToken
= Token::NewLine();
544 aToken
= Token::Whitespace();
549 aToken
= Token::Char(*mCursor
);
552 } // while (next < mEnd)
554 MOZ_ASSERT(!mInputFinished
);
558 template <typename TChar
>
559 bool TokenizerBase
<TChar
>::IsEnd(
560 const typename
TAString::const_char_iterator
& caret
) const {
561 return caret
== mEnd
;
564 template <typename TChar
>
565 bool TokenizerBase
<TChar
>::IsPending(
566 const typename
TAString::const_char_iterator
& caret
) const {
567 return IsEnd(caret
) && !mInputFinished
;
570 template <typename TChar
>
571 bool TokenizerBase
<TChar
>::IsWordFirst(const TChar aInput
) const {
572 // TODO: make this fully work with unicode
573 return (ToLowerCase(static_cast<uint32_t>(aInput
)) !=
574 ToUpperCase(static_cast<uint32_t>(aInput
))) ||
576 (mAdditionalWordChars
? contains(mAdditionalWordChars
, aInput
)
580 template <typename TChar
>
581 bool TokenizerBase
<TChar
>::IsWord(const TChar aInput
) const {
582 return IsWordFirst(aInput
) || IsNumber(aInput
);
585 template <typename TChar
>
586 bool TokenizerBase
<TChar
>::IsNumber(const TChar aInput
) const {
587 // TODO: are there unicode numbers?
588 return aInput
>= '0' && aInput
<= '9';
591 template <typename TChar
>
592 bool TokenizerBase
<TChar
>::IsCustom(
593 const typename
TAString::const_char_iterator
& caret
,
594 const Token
& aCustomToken
, uint32_t* aLongest
) const {
595 MOZ_ASSERT(aCustomToken
.mType
> TOKEN_CUSTOM0
);
596 if (!aCustomToken
.mCustomEnabled
) {
601 *aLongest
= std::max
<uint32_t>(*aLongest
, aCustomToken
.mCustom
.Length());
604 // This is not very likely to happen according to how we call this method
605 // and since it's on a hot path, it's just a diagnostic assert,
606 // not a release assert.
607 MOZ_DIAGNOSTIC_ASSERT(mEnd
>= caret
, "Overflow?");
608 uint32_t inputLength
= mEnd
- caret
;
609 if (aCustomToken
.mCustom
.Length() > inputLength
) {
613 TDependentSubstring
inputFragment(caret
, aCustomToken
.mCustom
.Length());
614 if (aCustomToken
.mCustomCaseInsensitivity
== CASE_INSENSITIVE
) {
615 if constexpr (std::is_same_v
<TChar
, char>) {
616 return inputFragment
.Equals(aCustomToken
.mCustom
,
617 nsCaseInsensitiveUTF8StringComparator
);
619 return inputFragment
.Equals(aCustomToken
.mCustom
,
620 nsCaseInsensitiveStringComparator
);
623 return inputFragment
.Equals(aCustomToken
.mCustom
);
626 template <typename TChar
>
627 void TokenizerBase
<TChar
>::AssignFragment(
628 Token
& aToken
, typename
TAString::const_char_iterator begin
,
629 typename
TAString::const_char_iterator end
) {
630 aToken
.AssignFragment(begin
, end
);
635 template <typename TChar
>
636 void TokenizerBase
<TChar
>::Validate(Token
const& aToken
) {
637 if (aToken
.Type() == TOKEN_WORD
) {
638 typename
TAString::const_char_iterator c
= aToken
.AsString().BeginReading();
639 typename
TAString::const_char_iterator e
= aToken
.AsString().EndReading();
642 MOZ_ASSERT(IsWordFirst(*c
));
644 MOZ_ASSERT(IsWord(*c
));
652 // TokenizerBase::Token
654 template <typename TChar
>
655 TokenizerBase
<TChar
>::Token::Token()
656 : mType(TOKEN_UNKNOWN
),
659 mCustomCaseInsensitivity(CASE_SENSITIVE
),
660 mCustomEnabled(false) {}
662 template <typename TChar
>
663 TokenizerBase
<TChar
>::Token::Token(const Token
& aOther
)
664 : mType(aOther
.mType
),
665 mCustom(aOther
.mCustom
),
667 mInteger(aOther
.mInteger
),
668 mCustomCaseInsensitivity(aOther
.mCustomCaseInsensitivity
),
669 mCustomEnabled(aOther
.mCustomEnabled
) {
670 if (mType
== TOKEN_WORD
|| mType
> TOKEN_CUSTOM0
) {
671 mWord
.Rebind(aOther
.mWord
.BeginReading(), aOther
.mWord
.Length());
675 template <typename TChar
>
676 auto TokenizerBase
<TChar
>::Token::operator=(const Token
& aOther
) -> Token
& {
677 mType
= aOther
.mType
;
678 mCustom
= aOther
.mCustom
;
679 mChar
= aOther
.mChar
;
680 mWord
.Rebind(aOther
.mWord
.BeginReading(), aOther
.mWord
.Length());
681 mInteger
= aOther
.mInteger
;
682 mCustomCaseInsensitivity
= aOther
.mCustomCaseInsensitivity
;
683 mCustomEnabled
= aOther
.mCustomEnabled
;
687 template <typename TChar
>
688 void TokenizerBase
<TChar
>::Token::AssignFragment(
689 typename
TAString::const_char_iterator begin
,
690 typename
TAString::const_char_iterator end
) {
691 MOZ_RELEASE_ASSERT(end
>= begin
, "Overflow!");
692 mFragment
.Rebind(begin
, end
- begin
);
696 template <typename TChar
>
697 auto TokenizerBase
<TChar
>::Token::Raw() -> Token
{
704 template <typename TChar
>
705 auto TokenizerBase
<TChar
>::Token::Word(TAString
const& aValue
) -> Token
{
707 t
.mType
= TOKEN_WORD
;
708 t
.mWord
.Rebind(aValue
.BeginReading(), aValue
.Length());
713 template <typename TChar
>
714 auto TokenizerBase
<TChar
>::Token::Char(TChar
const aValue
) -> Token
{
716 t
.mType
= TOKEN_CHAR
;
722 template <typename TChar
>
723 auto TokenizerBase
<TChar
>::Token::Number(uint64_t const aValue
) -> Token
{
725 t
.mType
= TOKEN_INTEGER
;
731 template <typename TChar
>
732 auto TokenizerBase
<TChar
>::Token::Whitespace() -> Token
{
740 template <typename TChar
>
741 auto TokenizerBase
<TChar
>::Token::NewLine() -> Token
{
748 template <typename TChar
>
749 auto TokenizerBase
<TChar
>::Token::EndOfFile() -> Token
{
756 template <typename TChar
>
757 auto TokenizerBase
<TChar
>::Token::Error() -> Token
{
759 t
.mType
= TOKEN_ERROR
;
763 template <typename TChar
>
764 bool TokenizerBase
<TChar
>::Token::Equals(const Token
& aOther
) const {
765 if (mType
!= aOther
.mType
) {
771 return AsInteger() == aOther
.AsInteger();
773 return AsString() == aOther
.AsString();
775 return AsChar() == aOther
.AsChar();
781 template <typename TChar
>
782 TChar TokenizerBase
<TChar
>::Token::AsChar() const {
783 MOZ_ASSERT(mType
== TOKEN_CHAR
|| mType
== TOKEN_WS
);
787 template <typename TChar
>
788 auto TokenizerBase
<TChar
>::Token::AsString() const -> TDependentSubstring
{
789 MOZ_ASSERT(mType
== TOKEN_WORD
);
793 template <typename TChar
>
794 uint64_t TokenizerBase
<TChar
>::Token::AsInteger() const {
795 MOZ_ASSERT(mType
== TOKEN_INTEGER
);
799 template class TokenizerBase
<char>;
800 template class TokenizerBase
<char16_t
>;
802 template class TTokenizer
<char>;
803 template class TTokenizer
<char16_t
>;
805 } // namespace mozilla