1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // All data that is passed through a WebSocket with type "Text" needs to be
6 // validated as UTF8. Since this is done on the IO thread, it needs to be
9 // We are only interested in the performance on valid UTF8. Invalid UTF8 will
10 // result in a connection failure, so is unlikely to become a source of
11 // performance issues.
13 #include "base/i18n/streaming_utf8_validator.h"
17 #include "base/basictypes.h"
18 #include "base/bind.h"
19 #include "base/callback.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/stringprintf.h"
22 #include "base/test/perf_time_logger.h"
23 #include "testing/gtest/include/gtest/gtest.h"
28 // We want to test ranges of valid UTF-8 sequences. These ranges are inclusive.
29 // They are intended to be large enough that the validator needs to do
30 // meaningful work while being in some sense "realistic" (e.g. control characters
// Inclusive range of 1-byte UTF-8 sequences: U+0020 (space) through U+007E
// (tilde), i.e. the printable ASCII characters.
32 const char kOneByteSeqRangeStart
[] = " "; // U+0020
33 const char kOneByteSeqRangeEnd
[] = "~"; // U+007E
// Inclusive range of 2-byte UTF-8 sequences: U+00A0 through U+024F. Each
// literal is the raw UTF-8 encoding of the code point named in its comment.
35 const char kTwoByteSeqRangeStart
[] = "\xc2\xa0"; // U+00A0 non-breaking space
36 const char kTwoByteSeqRangeEnd
[] = "\xc9\x8f"; // U+024F small y with stroke
// Inclusive range of 3-byte UTF-8 sequences: U+3042 through U+9FC3. Each
// literal is the raw UTF-8 encoding of the code point named in its comment.
38 const char kThreeByteSeqRangeStart
[] = "\xe3\x81\x82"; // U+3042 Hiragana "a"
39 const char kThreeByteSeqRangeEnd
[] = "\xe9\xbf\x83"; // U+9FC3 "to blink"
// Inclusive range of 4-byte UTF-8 sequences: U+2000B through U+2A6B2. Each
// literal is the raw UTF-8 encoding of the code point named in its comment.
41 const char kFourByteSeqRangeStart
[] = "\xf0\xa0\x80\x8b"; // U+2000B
42 const char kFourByteSeqRangeEnd
[] = "\xf0\xaa\x9a\xb2"; // U+2A6B2
44 // The different lengths of strings to test, in bytes. Each entry is handed to
// the string-construction callback as the requested |length|; the string that
// comes back may be longer than requested, so the actual length of the
// constructed string is what is used when computing the iteration count.
45 const size_t kTestLengths
[] = {1, 32, 256, 32768, 1 << 20};
47 // Simplest possible byte-at-a-time validator, to provide a baseline
48 // for comparison. This is only tried on 1-byte UTF-8 sequences, as
49 // the results will not be meaningful with sequences containing
51 bool IsString7Bit(const std::string
& s
) {
52 for (std::string::const_iterator it
= s
.begin(); it
!= s
.end(); ++it
) {
59 // Assumes that |previous| is a valid UTF-8 sequence, and attempts to return
60 // the next one. Is just barely smart enough to iterate through the ranges
62 std::string
NextUtf8Sequence(const std::string
& previous
) {
63 DCHECK(StreamingUtf8Validator::Validate(previous
));
64 std::string next
= previous
;
65 for (int i
= static_cast<int>(previous
.length() - 1); i
>= 0; --i
) {
66 // All bytes in a UTF-8 sequence except the first one are
67 // constrained to the range 0x80 to 0xbf, inclusive. When we
68 // increment past 0xbf, we carry into the previous byte.
69 if (i
> 0 && next
[i
] == '\xbf') {
76 DCHECK(StreamingUtf8Validator::Validate(next
))
77 << "Result \"" << next
<< "\" failed validation";
// Pointer to a function that takes the string to check by const reference and
// returns a bool. This is the type of the |target| argument of RunTest() and
// of the |function| member of TestFunctionDescription.
81 typedef bool (*TestTargetType
)(const std::string
&);
83 // Run function |target| over |test_string| |times| times, and report the results
84 // using |description|.
85 bool RunTest(const std::string
& description
,
86 TestTargetType target
,
87 const std::string
& test_string
,
89 base::PerfTimeLogger
timer(description
.c_str());
91 for (int i
= 0; i
< times
; ++i
) {
92 result
= target(test_string
) && result
;
98 // Construct a string by repeating |input| enough times to equal or exceed
100 std::string
ConstructRepeatedTestString(const std::string
& input
,
102 std::string output
= input
;
103 while (output
.length() * 2 < length
) {
106 if (output
.length() < length
) {
107 output
+= ConstructRepeatedTestString(input
, length
- output
.length());
112 // Construct a string by expanding the range of UTF-8 sequences
113 // between |input_start| and |input_end|, inclusive, and then
114 // repeating the resulting string until it equals or exceeds |length|
115 // bytes. |input_start| and |input_end| must be valid UTF-8
117 std::string
ConstructRangedTestString(const std::string
& input_start
,
118 const std::string
& input_end
,
120 std::string output
= input_start
;
121 std::string input
= input_start
;
122 while (output
.length() < length
&& input
!= input_end
) {
123 input
= NextUtf8Sequence(input
);
126 if (output
.length() < length
) {
127 output
= ConstructRepeatedTestString(output
, length
);
132 struct TestFunctionDescription
{
133 TestTargetType function
;
134 const char* function_name
;
137 bool IsStringUTF8(const std::string
& str
) {
138 return base::IsStringUTF8(base::StringPiece(str
));
141 // IsString7Bit is intentionally placed last so it can be excluded easily.
// Each entry pairs a function to be measured with a human-readable name for
// it. When an entry is run, the name is formatted into the description string
// under which that function's results are reported.
142 const TestFunctionDescription kTestFunctions
[] = {
143 {&StreamingUtf8Validator::Validate
, "StreamingUtf8Validator"},
144 {&IsStringUTF8
, "IsStringUTF8"}, {&IsString7Bit
, "IsString7Bit"}};
146 // Construct a test string from |construct_test_string| for each of the lengths
147 // in |kTestLengths| in turn. For each string, run each test in |test_functions|
148 // for a number of iterations such that the total number of bytes validated
152 base::Callback
<std::string(size_t length
)> construct_test_string
,
153 const TestFunctionDescription
* test_functions
,
155 for (size_t i
= 0; i
< arraysize(kTestLengths
); ++i
) {
156 const size_t length
= kTestLengths
[i
];
157 const std::string test_string
= construct_test_string
.Run(length
);
158 const int real_length
= static_cast<int>(test_string
.length());
159 const int times
= (1 << 24) / real_length
;
160 for (size_t test_index
= 0; test_index
< test_count
; ++test_index
) {
161 EXPECT_TRUE(RunTest(StringPrintf(format
,
162 test_functions
[test_index
].function_name
,
165 test_functions
[test_index
].function
,
172 TEST(StreamingUtf8ValidatorPerfTest
, OneByteRepeated
) {
173 RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
174 base::Bind(ConstructRepeatedTestString
, kOneByteSeqRangeStart
),
179 TEST(StreamingUtf8ValidatorPerfTest
, OneByteRange
) {
180 RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
181 base::Bind(ConstructRangedTestString
,
182 kOneByteSeqRangeStart
,
183 kOneByteSeqRangeEnd
),
188 TEST(StreamingUtf8ValidatorPerfTest
, TwoByteRepeated
) {
189 RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
190 base::Bind(ConstructRepeatedTestString
, kTwoByteSeqRangeStart
),
195 TEST(StreamingUtf8ValidatorPerfTest
, TwoByteRange
) {
196 RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
197 base::Bind(ConstructRangedTestString
,
198 kTwoByteSeqRangeStart
,
199 kTwoByteSeqRangeEnd
),
204 TEST(StreamingUtf8ValidatorPerfTest
, ThreeByteRepeated
) {
206 "%s: bytes=3 repeated length=%d repeat=%d",
207 base::Bind(ConstructRepeatedTestString
, kThreeByteSeqRangeStart
),
212 TEST(StreamingUtf8ValidatorPerfTest
, ThreeByteRange
) {
213 RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
214 base::Bind(ConstructRangedTestString
,
215 kThreeByteSeqRangeStart
,
216 kThreeByteSeqRangeEnd
),
221 TEST(StreamingUtf8ValidatorPerfTest
, FourByteRepeated
) {
222 RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
223 base::Bind(ConstructRepeatedTestString
, kFourByteSeqRangeStart
),
228 TEST(StreamingUtf8ValidatorPerfTest
, FourByteRange
) {
229 RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
230 base::Bind(ConstructRangedTestString
,
231 kFourByteSeqRangeStart
,
232 kFourByteSeqRangeEnd
),