6 #define MAXRUN 16843009
10 const char* hexes
= "0123456789ABCDEF";
13 rrdata_set::instance::instance() throw()
15 memset(bytes
, 0, RRDATA_BYTES
);
18 rrdata_set::instance::instance(const unsigned char* b
) throw()
20 memcpy(bytes
, b
, RRDATA_BYTES
);
23 rrdata_set::instance::instance(const std::string
& id
) throw()
25 memset(bytes
, 0, RRDATA_BYTES
);
26 for(unsigned i
= 0; i
< id
.length() && i
< 2 * RRDATA_BYTES
; i
++) {
29 if(ch
>= '0' && ch
<= '9')
31 else if(ch
>= 'A' && ch
<= 'F')
33 else if(ch
>= 'a' && ch
<= 'f')
35 bytes
[i
/ 2] = bytes
[i
/ 2] * 16 + h
;
39 bool rrdata_set::instance::operator<(const struct instance
& i
) const throw()
41 for(unsigned j
= 0; j
< RRDATA_BYTES
; j
++)
42 if(bytes
[j
] < i
.bytes
[j
])
44 else if(bytes
[j
] > i
.bytes
[j
])
49 bool rrdata_set::instance::operator==(const struct instance
& i
) const throw()
51 for(unsigned j
= 0; j
< RRDATA_BYTES
; j
++)
52 if(bytes
[j
] != i
.bytes
[j
])
57 const struct rrdata_set::instance
rrdata_set::instance::operator++(int) throw()
64 struct rrdata_set::instance
& rrdata_set::instance::operator++() throw()
67 for(unsigned i
= RRDATA_BYTES
- 1; i
< RRDATA_BYTES
; i
--) {
68 unsigned newcarry
= (bytes
[i
] == 255 && carry
);
75 struct rrdata_set::instance
rrdata_set::instance::operator+(unsigned inc
) const throw()
77 rrdata_set::instance n
= *this;
79 for(unsigned i
= RRDATA_BYTES
- 1; i
< RRDATA_BYTES
; i
--) {
80 unsigned newcarry
= ((unsigned)n
.bytes
[i
] + carry
) >> 8;
81 if(newcarry
== 0 && carry
> 255)
82 newcarry
= (1U << (8 * sizeof(unsigned) - 8));
89 unsigned rrdata_set::instance::operator-(const struct instance
& m
) const throw()
92 uint8_t diff
[RRDATA_BYTES
] = {0};
94 for(unsigned i
= RRDATA_BYTES
- 1; i
< RRDATA_BYTES
; i
--) {
95 diff
[i
] = bytes
[i
] - m
.bytes
[i
] - borrow
;
96 borrow
= ((unsigned)m
.bytes
[i
] + borrow
> (unsigned)bytes
[i
]) ? 1 : 0;
98 for(unsigned i
= 0; i
< RRDATA_BYTES
; i
++) {
99 if((result
<< 8 >> 8) != result
)
100 return std::numeric_limits
<unsigned>::max();
107 rrdata_set::rrdata_set() throw()
114 void rrdata_set::read_base(const std::string
& projectfile
, bool lazy
) throw(std::bad_alloc
)
116 if(projectfile
== current_projectfile
&& (!lazy_mode
|| lazy
))
119 std::set
<std::pair
<instance
, instance
>> new_rrset
;
121 current_projectfile
= projectfile
;
128 std::set
<std::pair
<instance
, instance
>> new_rrset
;
129 uint64_t new_count
= 0;
130 if(projectfile
== current_projectfile
) {
134 std::string filename
= projectfile
;
139 std::ifstream
ihandle(filename
.c_str(), std::ios_base::in
| std::ios_base::binary
);
141 unsigned char bytes
[RRDATA_BYTES
];
142 ihandle
.read(reinterpret_cast<char*>(bytes
), RRDATA_BYTES
);
144 //std::cerr << "Loaded symbol: " << k << std::endl;
145 _add(k
, k
+ 1, new_rrset
, new_count
);
148 ohandle
.open(filename
.c_str(), std::ios_base::out
| std::ios_base::app
| std::ios_base::binary
);
151 if(projectfile
== current_projectfile
&& lazy_mode
&& !lazy
) {
152 //Finish the project creation, write all.
154 instance tmp
= i
.first
;
155 while(tmp
!= i
.second
) {
156 ohandle
.write(reinterpret_cast<const char*>(tmp
.bytes
), RRDATA_BYTES
);
164 current_projectfile
= projectfile
;
168 void rrdata_set::close() throw()
170 current_projectfile
= "";
176 void rrdata_set::add(const struct rrdata_set::instance
& i
) throw(std::bad_alloc
)
178 if(_add(i
) && handle_open
) {
179 //std::cerr << "New symbol: " << i << std::endl;
180 ohandle
.write(reinterpret_cast<const char*>(i
.bytes
), RRDATA_BYTES
);
185 void rrdata_set::add_internal() throw(std::bad_alloc
)
192 void flush_symbol(std::vector
<char>& strm
, const rrdata_set::instance
& base
,
193 const rrdata_set::instance
& predicted
, unsigned count
)
196 char buf1
[RRDATA_BYTES
+ 4];
202 } else if(count
< 258) {
205 } else if(count
< 65794) {
213 for(j
= 0; j
< 31; j
++)
214 if(base
.bytes
[j
] != predicted
.bytes
[j
])
218 memcpy(buf1
+ 1, base
.bytes
+ j
, RRDATA_BYTES
- j
);
219 buf2
[0] = (count
- bias
) >> 16;
220 buf2
[1] = (count
- bias
) >> 8;
221 buf2
[2] = (count
- bias
);
222 memcpy(buf1
+ (RRDATA_BYTES
- j
+ 1), buf2
+ (3 - (opcode
>> 5)), opcode
>> 5);
223 for(size_t s
= 0; s
< (RRDATA_BYTES
- j
+ 1) + (opcode
>> 5); s
++)
224 strm
.push_back(buf1
[s
]);
225 //std::cerr << "Encoding " << count << " symbols starting from " << base << std::endl;
228 uint64_t symbols_in_interval(const rrdata_set::instance
& b
, const rrdata_set::instance
& e
) throw()
231 rrdata_set::instance x
= b
;
233 unsigned diff
= e
- x
;
241 uint64_t rrdata_set::write(std::vector
<char>& strm
) throw(std::bad_alloc
)
245 instance last_encode_end
;
246 memset(last_encode_end
.bytes
, 0, RRDATA_BYTES
);
249 instance encode_base
;
250 unsigned encode_count
= 0;
252 //std::cerr << "Considering " << *i << std::endl;
253 encode_base
= i
.first
;
254 while(encode_base
!= i
.second
) {
255 unsigned syms
= i
.second
- encode_base
;
258 flush_symbol(strm
, encode_base
, predicted
, syms
);
260 encode_base
= encode_base
+ syms
;
261 predicted
= encode_base
;
270 uint64_t rrdata_set::read(std::vector
<char>& strm
, bool dummy
) throw(std::bad_alloc
)
275 memset(decoding
.bytes
, 0, RRDATA_BYTES
);
276 while(ptr
< strm
.size()) {
278 unsigned char buf1
[RRDATA_BYTES
];
279 unsigned char buf2
[3];
280 opcode
= strm
[ptr
++];
281 unsigned validbytes
= (opcode
& 0x1F);
282 unsigned lengthbytes
= (opcode
& 0x60) >> 5;
284 memcpy(buf1
, &strm
[ptr
], RRDATA_BYTES
- validbytes
);
285 ptr
+= (RRDATA_BYTES
- validbytes
);
286 memcpy(decoding
.bytes
+ validbytes
, buf1
, RRDATA_BYTES
- validbytes
);
287 if(lengthbytes
> 0) {
288 memcpy(buf2
, &strm
[ptr
], lengthbytes
);
292 repeat
= 2 + static_cast<unsigned>(buf2
[0]);
294 repeat
= 258 + static_cast<unsigned>(buf2
[0]) * 256 + buf2
[1];
296 repeat
= 65794 + static_cast<unsigned>(buf2
[0]) * 65536 + static_cast<unsigned>(buf2
[1]) *
298 //std::cerr << "Decoding " << repeat << " symbols starting from " << decoding << std::endl;
301 if(!_in_set(decoding
, decoding
+ repeat
))
302 for(unsigned i
= 0; i
< repeat
; i
++) {
303 //TODO: Optimize this.
304 instance n
= decoding
+ i
;
305 if(!_in_set(n
) && handle_open
) {
306 ohandle
.write(reinterpret_cast<const char*>(n
.bytes
), RRDATA_BYTES
);
312 rrdata_set::_add(decoding
, decoding
+ repeat
);
314 decoding
= decoding
+ repeat
;
323 uint64_t rrdata_set::count(std::vector
<char>& strm
) throw(std::bad_alloc
)
325 return read(strm
, true);
328 uint64_t rrdata_set::count() throw()
337 void rrdata_set::set_internal(const instance
& b
) throw()
342 std::ostream
& operator<<(std::ostream
& os
, const struct rrdata_set::instance
& j
)
344 for(unsigned i
= 0; i
< 32; i
++) {
345 os
<< hexes
[j
.bytes
[i
] / 16] << hexes
[j
.bytes
[i
] % 16];
350 bool rrdata_set::_add(const instance
& b
)
354 return (c
!= rcount
);
357 void rrdata_set::_add(const instance
& b
, const instance
& e
)
359 _add(b
, e
, data
, rcount
);
362 void rrdata_set::_add(const instance
& b
, const instance
& e
, std::set
<std::pair
<instance
, instance
>>& set
,
365 //Special case: Nothing.
367 set
.insert(std::make_pair(b
, e
));
368 cnt
+= symbols_in_interval(b
, e
);
372 auto itr
= set
.lower_bound(std::make_pair(b
, e
));
373 if(itr
!= set
.end() && itr
->first
== b
&& itr
->second
== e
)
375 set
.insert(std::make_pair(b
, e
));
376 cnt
+= symbols_in_interval(b
, e
);
377 itr
= set
.lower_bound(std::make_pair(b
, e
));
380 if(itr1
!= set
.begin()) itr1
--;
382 bool have1
= (itr1
!= itr
);
383 instance rangebase
= b
;
384 //If the thing is entierely in itr1, undo the add.
385 if(have1
&& b
>= itr1
->first
&& e
<= itr1
->second
) {
386 cnt
-= symbols_in_interval(b
, e
);
390 //Attach the thing to itr1 if appropriate.
391 if(have1
&& b
<= itr1
->second
) {
392 cnt
-= symbols_in_interval(b
, itr1
->second
);
393 rangebase
= itr1
->first
;
394 set
.insert(std::make_pair(itr1
->first
, e
));
395 auto tmp
= set
.lower_bound(std::make_pair(itr1
->first
, e
));
401 while(itr2
!= set
.end()) {
403 break; //Nothing to merge anymore.
404 if(e
>= itr2
->second
&& (rangebase
!= itr2
->first
|| e
!= itr2
->second
)) {
405 //This entiere range is subsumed.
406 cnt
-= symbols_in_interval(itr2
->first
, itr2
->second
);
410 } else if(e
< itr2
->second
) {
411 //Combines with range.
412 cnt
-= symbols_in_interval(itr2
->first
, e
);
413 if(rangebase
!= itr2
->first
) {
414 set
.insert(std::make_pair(rangebase
, itr2
->second
));
423 bool rrdata_set::_in_set(const instance
& b
, const instance
& e
)
429 auto itr
= data
.lower_bound(std::make_pair(b
, e
));
430 if(itr
== data
.end()) {
431 //If there is anything, it must be the last node.
432 auto r
= *data
.rbegin();
433 return (r
.first
<= b
&& r
.second
>= e
);
435 //It may be this node or the previous one.
436 if(itr
->first
<= b
&& itr
->second
>= e
)
439 return (itr
->first
<= b
&& itr
->second
>= e
);
443 std::string
rrdata_set::debug_dump()
445 std::ostringstream x
;
448 x
<< "{" << i
.first
<< "," << i
.second
<< "}";
453 uint64_t rrdata_set::debug_nodecount(std::set
<std::pair
<instance
, instance
>>& set
)
457 x
+= symbols_in_interval(i
.first
, i
.second
);