1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 // These memory-resident streams are used for serializing data into a sequential
10 // Streams are divided into SourceStreams for reading and SinkStreams for
11 // writing. Streams are aggregated into Sets which allows several streams to be
12 // used at once. Example: we can write A1, B1, A2, B2 but achieve the memory
13 // layout A1 A2 B1 B2 by writing 'A's to one stream and 'B's to another.
15 // The aggregated streams are important to Courgette's compression efficiency,
16 // we use it to cluster similar kinds of data which helps to generate longer
17 // common subsequences and repeated sequences.
19 #include "courgette/streams.h"
23 #include "base/basictypes.h"
24 #include "base/logging.h"
28 // Update this version number if the serialization format of a StreamSet
30 static const unsigned int kStreamsSerializationFormatVersion
= 20090218;
33 // This is a cut down Varint implementation, implementing only what we use for
38 // Maximum lengths of varint encoding of uint32
39 static const int kMax32
= 5;
41 // Parses a Varint32 encoded value from |source| and stores it in |output|,
42 // and returns a pointer to the following byte. Returns NULL if a valid
43 // varint value was not found before |limit|.
44 static const uint8
* Parse32WithLimit(const uint8
* source
, const uint8
* limit
,
47 // Writes the Varint32 encoded representation of |value| to buffer
48 // |destination|. |destination| must have sufficient length to hold kMax32
49 // bytes. Returns a pointer to the byte just past the last encoded byte.
50 static uint8
* Encode32(uint8
* destination
, uint32 value
);
53 // Parses a Varint32 encoded unsigned number from |source|. The Varint32
54 // encoding is a little-endian sequence of bytes containing base-128 digits,
55 // with the high order bit set to indicate if there are more digits.
57 // For each byte, we mask out the digit and 'or' it into the right place in the
60 // The digit loop is unrolled for performance. It usually exits after the first
62 const uint8
* Varint::Parse32WithLimit(const uint8
* source
,
78 result
|= (digit
& 127) << 7;
87 result
|= (digit
& 127) << 14;
96 result
|= (digit
& 127) << 21;
105 result
|= (digit
& 127) << 28;
111 return NULL
; // Value is too long to be a Varint32.
114 // Write the base-128 digits in little-endian order. All except the last digit
115 // have the high bit set to indicate more digits.
116 inline uint8
* Varint::Encode32(uint8
* destination
, uint32 value
) {
117 while (value
>= 128) {
118 *(destination
++) = static_cast<uint8
>(value
) | 128;
121 *(destination
++) = static_cast<uint8
>(value
);
125 void SourceStream::Init(const SinkStream
& sink
) {
126 Init(sink
.Buffer(), sink
.Length());
129 bool SourceStream::Read(void* destination
, size_t count
) {
130 if (current_
+ count
> end_
)
132 memcpy(destination
, current_
, count
);
137 bool SourceStream::ReadVarint32(uint32
* output_value
) {
138 const uint8
* after
= Varint::Parse32WithLimit(current_
, end_
, output_value
);
145 bool SourceStream::ReadVarint32Signed(int32
* output_value
) {
146 // Signed numbers are encoded as unsigned numbers so that numbers nearer zero
147 // have shorter varint encoding.
148 // 0000xxxx encoded as 000xxxx0.
149 // 1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx.
150 uint32 unsigned_value
;
151 if (!ReadVarint32(&unsigned_value
))
153 if (unsigned_value
& 1)
154 *output_value
= ~static_cast<int32
>(unsigned_value
>> 1);
156 *output_value
= (unsigned_value
>> 1);
160 bool SourceStream::ShareSubstream(size_t offset
, size_t length
,
161 SourceStream
* substream
) {
162 if (offset
> Remaining())
164 if (length
> Remaining() - offset
)
166 substream
->Init(current_
+ offset
, length
);
170 bool SourceStream::ReadSubstream(size_t length
, SourceStream
* substream
) {
171 if (!ShareSubstream(0, length
, substream
))
177 bool SourceStream::Skip(size_t byte_count
) {
178 if (current_
+ byte_count
> end_
)
180 current_
+= byte_count
;
184 CheckBool
SinkStream::Write(const void* data
, size_t byte_count
) {
185 return buffer_
.append(static_cast<const char*>(data
), byte_count
);
188 CheckBool
SinkStream::WriteVarint32(uint32 value
) {
189 uint8 buffer
[Varint::kMax32
];
190 uint8
* end
= Varint::Encode32(buffer
, value
);
191 return Write(buffer
, end
- buffer
);
194 CheckBool
SinkStream::WriteVarint32Signed(int32 value
) {
195 // Encode signed numbers so that numbers nearer zero have shorter
197 // 0000xxxx encoded as 000xxxx0.
198 // 1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx.
201 ret
= WriteVarint32(~value
* 2 + 1);
203 ret
= WriteVarint32(value
* 2);
207 CheckBool
SinkStream::WriteSizeVarint32(size_t value
) {
208 uint32 narrowed_value
= static_cast<uint32
>(value
);
209 // On 32-bit, the compiler should figure out this test always fails.
210 LOG_ASSERT(value
== narrowed_value
);
211 return WriteVarint32(narrowed_value
);
214 CheckBool
SinkStream::Append(SinkStream
* other
) {
215 bool ret
= Write(other
->buffer_
.data(), other
->buffer_
.size());
221 void SinkStream::Retire() {
225 ////////////////////////////////////////////////////////////////////////////////
227 SourceStreamSet::SourceStreamSet()
228 : count_(kMaxStreams
) {
231 SourceStreamSet::~SourceStreamSet() {
235 // Initializes from |source|.
236 // The stream set for N streams is serialized as a header
237 // <version><N><length1><length2>...<lengthN>
238 // followed by the stream contents
239 // <bytes1><bytes2>...<bytesN>
241 bool SourceStreamSet::Init(const void* source
, size_t byte_count
) {
242 const uint8
* start
= static_cast<const uint8
*>(source
);
243 const uint8
* end
= start
+ byte_count
;
245 unsigned int version
;
246 const uint8
* finger
= Varint::Parse32WithLimit(start
, end
, &version
);
249 if (version
!= kStreamsSerializationFormatVersion
)
253 finger
= Varint::Parse32WithLimit(finger
, end
, &count
);
256 if (count
> kMaxStreams
)
261 unsigned int lengths
[kMaxStreams
];
262 size_t accumulated_length
= 0;
264 for (size_t i
= 0; i
< count_
; ++i
) {
265 finger
= Varint::Parse32WithLimit(finger
, end
, &lengths
[i
]);
268 accumulated_length
+= lengths
[i
];
271 // Remaining bytes should add up to sum of lengths.
272 if (static_cast<size_t>(end
- finger
) != accumulated_length
)
275 accumulated_length
= finger
- start
;
276 for (size_t i
= 0; i
< count_
; ++i
) {
277 stream(i
)->Init(start
+ accumulated_length
, lengths
[i
]);
278 accumulated_length
+= lengths
[i
];
284 bool SourceStreamSet::Init(SourceStream
* source
) {
285 // TODO(sra): consume the rest of |source|.
286 return Init(source
->Buffer(), source
->Remaining());
289 bool SourceStreamSet::ReadSet(SourceStreamSet
* set
) {
290 uint32 stream_count
= 0;
291 SourceStream
* control_stream
= this->stream(0);
292 if (!control_stream
->ReadVarint32(&stream_count
))
295 uint32 lengths
[kMaxStreams
] = {}; // i.e. all zero.
297 for (size_t i
= 0; i
< stream_count
; ++i
) {
298 if (!control_stream
->ReadVarint32(&lengths
[i
]))
302 for (size_t i
= 0; i
< stream_count
; ++i
) {
303 if (!this->stream(i
)->ReadSubstream(lengths
[i
], set
->stream(i
)))
309 bool SourceStreamSet::Empty() const {
310 for (size_t i
= 0; i
< count_
; ++i
) {
311 if (streams_
[i
].Remaining() != 0)
317 ////////////////////////////////////////////////////////////////////////////////
319 SinkStreamSet::SinkStreamSet()
320 : count_(kMaxStreams
) {
323 SinkStreamSet::~SinkStreamSet() {
326 void SinkStreamSet::Init(size_t stream_index_limit
) {
327 count_
= stream_index_limit
;
330 // The header for a stream set for N streams is serialized as
331 // <version><N><length1><length2>...<lengthN>
332 CheckBool
SinkStreamSet::CopyHeaderTo(SinkStream
* header
) {
333 bool ret
= header
->WriteVarint32(kStreamsSerializationFormatVersion
);
335 ret
= header
->WriteSizeVarint32(count_
);
336 for (size_t i
= 0; ret
&& i
< count_
; ++i
) {
337 ret
= header
->WriteSizeVarint32(stream(i
)->Length());
343 // Writes |this| to |combined_stream|. See SourceStreamSet::Init for the layout
344 // of the stream metadata and contents.
345 CheckBool
SinkStreamSet::CopyTo(SinkStream
*combined_stream
) {
347 bool ret
= CopyHeaderTo(&header
);
351 // Reserve the correct amount of storage.
352 size_t length
= header
.Length();
353 for (size_t i
= 0; i
< count_
; ++i
) {
354 length
+= stream(i
)->Length();
356 ret
= combined_stream
->Reserve(length
);
358 ret
= combined_stream
->Append(&header
);
359 for (size_t i
= 0; ret
&& i
< count_
; ++i
) {
360 ret
= combined_stream
->Append(stream(i
));
366 CheckBool
SinkStreamSet::WriteSet(SinkStreamSet
* set
) {
367 uint32 lengths
[kMaxStreams
];
368 // 'stream_count' includes all non-empty streams and all empty stream numbered
369 // lower than a non-empty stream.
370 size_t stream_count
= 0;
371 for (size_t i
= 0; i
< kMaxStreams
; ++i
) {
372 SinkStream
* stream
= set
->stream(i
);
373 lengths
[i
] = static_cast<uint32
>(stream
->Length());
375 stream_count
= i
+ 1;
378 SinkStream
* control_stream
= this->stream(0);
379 bool ret
= control_stream
->WriteSizeVarint32(stream_count
);
380 for (size_t i
= 0; ret
&& i
< stream_count
; ++i
) {
381 ret
= control_stream
->WriteSizeVarint32(lengths
[i
]);
384 for (size_t i
= 0; ret
&& i
< stream_count
; ++i
) {
385 ret
= this->stream(i
)->Append(set
->stream(i
));