Bug 1825336 - Make toolkit/components/url-classifier/ buildable outside of a unified...
[gecko.git] / xpcom / ds / IncrementalTokenizer.h
blobc2647052c9a87ddfc85ff406cc4398d64c348182
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #ifndef INCREMENTAL_TOKENIZER_H__
8 #define INCREMENTAL_TOKENIZER_H__
10 #include "mozilla/Tokenizer.h"
12 #include "nsError.h"
13 #include <functional>
15 class nsIInputStream;
17 namespace mozilla {
// A tokenizer built on TokenizerBase<char> that is fed its input in pieces
// (FeedInput/FinishInput) and hands every token it finds to a Consumer
// callback supplied at construction.
19 class IncrementalTokenizer : public TokenizerBase<char> {
20 public:
21 /**
22 * The consumer callback. The function is called for every single token
23 * found in the input. A failure result returned by this callback stops
24 * the tokenization immediately and becomes the result of
24 * FeedInput/FinishInput.
26 * Fragment()s of consumed tokens are ensured to remain valid until the next
27 * call to FeedInput/FinishInput and point into a single linear buffer. Hence,
28 * they can be safely used to accumulate the data for processing after
29 * FeedInput/FinishInput has returned.
31 typedef std::function<nsresult(Token const&, IncrementalTokenizer& i)>
32 Consumer;
34 /**
35 * For aWhitespaces and aAdditionalWordChars arguments see TokenizerBase.
37 * @param aConsumer
38 * A mandatory non-null argument, a function that consumes the tokens as
39 * they come when the tokenizer is fed.
40 * @param aRawMinBuffered
41 * When we have buffered at least aRawMinBuffered data, but no custom token
42 * has been found so far because the incremental feed chunks were too small,
43 * deliver the raw data to preserve streaming and to save memory. This only
44 * has an effect in OnlyCustomTokenizing mode.
46 explicit IncrementalTokenizer(Consumer&& aConsumer,
47 const char* aWhitespaces = nullptr,
48 const char* aAdditionalWordChars = nullptr,
49 uint32_t aRawMinBuffered = 1024);
51 /**
52 * Pushes the input to be tokenized. These directly call the Consumer
53 * callback on every found token. The result of the Consumer callback is
54 * returned here.
56 * The tokenizer must be initialized with a valid consumer prior to calling
57 * these methods. It's not allowed to call FeedInput/FinishInput from inside
58 * the Consumer callback.
60 nsresult FeedInput(const nsACString& aInput);
61 nsresult FeedInput(nsIInputStream* aInput, uint32_t aCount);
62 nsresult FinishInput();
64 /**
65 * Can only be called from inside the consumer callback.
67 * When there is still anything to read from the input, tokenize it, store
68 * the token type and value in the aToken result and shift the cursor past
69 * the just parsed token. Each call to Next() reads another token from
70 * the input and shifts the cursor.
72 * Returns false if there is not enough data to deterministically recognize
73 * tokens or when the last returned token was EOF.
75 [[nodiscard]] bool Next(Token& aToken);
77 /**
78 * Can only be called from inside the consumer callback.
80 * Tells the tokenizer to revert the cursor and stop the async parsing until
81 * the next feed of the input. This is useful when more than one token is
82 * needed to decide on the syntax but there is not enough input to get a next
83 * token (Next() returned false.)
85 void NeedMoreInput();
87 /**
88 * Can only be called from inside the consumer callback.
90 * This makes the consumer callback be called again, parsing the input
91 * from the previous cursor position once more. This is useful when
92 * the tokenizer state (custom tokens, tokenization mode) has changed and
93 * we want to re-parse the input.
95 void Rollback();
97 private:
98 // Loops over the input with TokenizerBase::Parse and calls the Consumer
99 // callback.
100 nsresult Process();
102 #ifdef DEBUG
103 // True when inside the consumer callback, used only for assertions.
104 bool mConsuming;
105 #endif // DEBUG
106 // Modifiable only from the Consumer callback, tells the parser to break,
107 // roll back and wait for more input.
108 bool mNeedMoreInput;
109 // Modifiable only from the Consumer callback, tells the parser to roll back
110 // and parse the input again, with (if modified) new settings of the
111 // tokenizer.
112 bool mRollback;
113 // The input buffer. Updated with each call to FeedInput/FinishInput.
114 nsCString mInput;
115 // Numerical index pointing at the current cursor position. We don't keep
116 // a direct reference into the string buffer since the buffer often gets
117 // reallocated.
118 nsCString::index_type mInputCursor;
119 // The consumer function (a std::function object held by value).
120 Consumer mConsumer;
123 } // namespace mozilla
125 #endif