1 # Library for storing and accessing arbitrary chunks of compressed data.
2 # By Stian Haklev (shaklev@gmail.com), 2007
3 # Released under MIT and GPL licenses
7 # archive = ZArchive::Writer.new('eo.zdump')
8 # index = File.read('index.html')
9 # archive.add('index.html', index)
10 # archive.add_hardlink('index.htm', 'index.html')
13 # archive = ZArchive::Reader.new('eo.zdump')
14 # puts(archive.get('index.html'))
16 %w(sha1 zutil).each {|x| require x}
23 # methods are bz2 and zlib
# Compressor: thin wrapper selecting bz2 or zlib at construction time so the
# rest of the library is compression-agnostic.
26 def initialize(method)
# Lazily require only the library matching the chosen method.
# NOTE(review): this tests @method, but only the `method` parameter is visibly
# assigned here — @method is presumably set on an elided line; confirm.
28 require (@method == METHOD_BZ2 ? 'bz2' : 'zlib')
# Decompress one self-contained chunk back to plain text.
# NOTE(review): the one-line `when x : y` colon form is Ruby 1.8 syntax; it
# must be written `when x then y` to parse under Ruby >= 1.9.
33 when METHOD_BZ2 : BZ2::Reader.new(txt).read
34 when METHOD_ZLIB : Zlib::Inflate.new.inflate(txt)
38 # compresses a textchunk, that is able to be uncompressed independently
# Zlib::FINISH closes the deflate stream per chunk, which is what lets each
# block be inflated on its own by the Reader.
41 when METHOD_BZ2 : (BZ2::Writer.new << txt).flush
42 when METHOD_ZLIB : Zlib::Deflate.new.deflate(txt, Zlib::FINISH)
# Reader: parses the fixed 12-byte header written by Writer#finish.
# Layout (little-endian): V index start, V meta location, C compress method,
# C index-prefix size. 'VVCC' consumes 10 of the 12 bytes read.
52 zdump = File.open(@file, 'r')
# NOTE(review): this assigns the *local* idx_size, yet get_location later
# reads @idx_size — the ivar looks unassigned from what is visible; confirm
# against the elided lines (likely a bug: should be @idx_size here).
53 @zindex_loc, @meta_loc, @compress, idx_size = zdump.read(12).unpack('VVCC')
55 @compressor = Compressor.new(@compress)
59 # we open this on each request, because otherwise it gets messy with threading
# get(url): look up url's coordinates in the on-disk index and return the
# stored text, or nil when the url is not present.
# NOTE(review): handles opened here are never visibly closed — presumably
# closed on elided lines or left to GC; confirm.
60 zdump = File.open(@file, 'r')
62 loc = get_location(url, zdump, @zindex_loc)
# loc is [block_offset, block_size, offset, size] — splatted into get_text.
63 return loc ? get_text(zdump, *loc) : nil
# Reads one compressed block and slices the requested entry out of it:
# block_offset/block_size locate the block on disk, offset/size locate the
# entry inside the decompressed block.
66 def get_text(zdump, block_offset, block_size, offset, size)
67 text_compr = readloc( zdump, block_size, block_offset )
68 text_uncompr = @compressor.uncompress( text_compr )
69 return text_uncompr[offset, size]
# Loads the marshalled metadata appended after the index.
# NOTE(review): Marshal.load is unsafe on untrusted archives — only open
# trusted files with this library.
73 zdump = File.open(@file, 'r')
75 Marshal.load(zdump.read)
# Maps a url to its [block_offset, block_size, offset, size] coordinates by
# hashing the url and scanning the matching index bucket on disk.
78 def get_location(url, zdump, zindex_loc)
# sha1 is the full hex digest; firstfour is the integer prefix used to pick
# the bucket. NOTE(review): @idx_size is not visibly assigned in Reader's
# initialize (a local idx_size is) — confirm it is set on an elided line.
79 sha1, firstfour = sha1_w_sub(url, @idx_size)
81 # uses this number to calculate the location of the metaindex entry
# Each metaindex slot is 8 bytes: V start + V size of the bucket.
82 loc = (firstfour * 8) + zindex_loc
84 # finds the location of the index entry
85 start, size = readloc(zdump, 8, loc).unpack('V2')
86 idx = readloc(zdump, size, start)
88 # the index consists of a number of 36 byte entries. it sorts through
89 # until it finds the right one.
# 36 bytes = 20-byte sha1 ('H40') + four 4-byte coordinates ('V4').
# NOTE(review): readloc presumably returns a String, which has no #pop —
# this must rely on an elided monkey-patch or an Array conversion; also the
# idx.nil? guard never fires from #pop alone, so termination on a miss
# depends on behavior not visible here. Confirm both against full source.
92 hex, *coordinates = idx.pop(36).unpack('H40V4') until ( hex == sha1 || idx.nil? )
# nil is returned implicitly when no entry matched.
93 return coordinates if hex == sha1
99 attr_reader :location, :hardlinks
# Record types: one index entry per stored uri, one record per compressed
# block. NOTE(review): @@ class variables are shared across any subclasses;
# class-instance variables or constants would be safer, but changing them
# requires seeing every use site.
101 @@entry = Struct.new(:uri, :block, :buflocation, :size, :sha1)
102 @@block = Struct.new(:number, :start, :size, :pages)
104 # the uri to open, the minimum size of blocks, and zlib or bz2
# blocksize default 900_000 matches bzip2's block granularity.
105 def initialize(file, method = METHOD_BZ2, idx_size = 4, blocksize = 900000)
106 @compressor = Compressor.new(method)
107 @blocksize = blocksize
# NOTE(review): text mode "w" — binary data is written here, so "wb" would
# be required for correctness on Windows; confirm intended platforms.
108 @file = File.open(file, "w")
# cur_block: index of the block being filled; buflocation: offset of the
# next entry within the uncompressed buffer; size: running total.
110 @cur_block, @buflocation, @size = 0, 0, 0
# First 12 bytes of the file are reserved for the header that finish()
# writes (index location + meta location + method + idx_size).
112 @location = 12 # (to hold start of index)
118 # adds a blob of text that will be accessible through a certain uri
120 # if redirect, add to index and keep going
# sha1 member is filled in just below; the entry records which block the
# text lands in and its offset/size inside that (uncompressed) block.
121 entry = @@entry.new(uri, @cur_block, @buflocation, text.size)
123 # calculate the sha1 code, use the first four characters as the index
124 entry.sha1, firstfour = sha1_w_sub(entry.uri, @idx_size)
126 # add this entry to the index in the right place
# Buckets are created lazily; each bucket collects every entry whose hash
# shares the same firstfour prefix.
127 @index[firstfour] ||= []
128 @index[firstfour] << entry
130 # add to the buffer, and update the counter
132 @buflocation += text.size
# Emit a compressed block once the pending buffer exceeds the target size.
134 flush_block if @buffer.size > @blocksize
137 # hardlinks the contents of one uri to that of another
# Only records the alias here; the actual index entry for uri is created
# from targeturi's coordinates later, in process_hardlinks.
138 def add_hardlink(uri, targeturi)
139 @hardlinks[uri] = targeturi
146 # finish up, process hardlinks, and write index to file
# Flush any partially filled block before laying out the index.
148 flush_block unless @buffer.empty?
151 # writing the location of the archive (it's after the dump data)
# Header byte 0-3: little-endian start-of-index pointer.
152 writeloc(@file, [@location].pack('V'), 0)
# The metaindex (fixed 8-byte slots) sits at indexloc; the variable-size
# buckets start right after the last possible slot, computed from the max
# hash prefix 'FFFFFFFFFF' truncated to @idx_size characters.
155 location = (sha1subset('FFFFFFFFFF', @idx_size) * 8) + indexloc
156 # p = File.open("zlog", "w")
157 each_entry_with_index do |entry, idx|
# Slot idx points at this bucket: V start + V size.
160 writeloc(@file, [location, entry.size].pack('V2'), (idx * 8) + indexloc)
161 writeloc(@file, entry, location)
163 # p << "*" * 80 << "\n"
164 # p << "seek #{(idx * 8) + indexloc} location #{location} size #{entry.size}" << "\n"
165 # p << unpack(entry).join(":") << "\n"
# Buckets are packed back to back.
167 location += entry.size
# Header bytes 4-9: V meta location, C compress method, C idx_size —
# together with the V at offset 0 this matches Reader's 'VVCC' unpack.
# NOTE(review): @compressor.method must be an attr reader on Compressor
# (it shadows Kernel#method, which requires an argument) — confirm.
171 writeloc(@file, [location, @compressor.method, @idx_size].pack('VCC'), 4)
# Optional marshalled metadata goes after everything else.
173 writeloc(@file, Marshal.dump(@meta), @location) if defined?(@meta)
179 # yields an entry that is ready to be written to the index
# Yields, per bucket, the packed binary bucket contents plus its slot index.
180 def each_entry_with_index
# NOTE(review): for |hash, idx| to be bucket-then-slot, @index must be an
# Array indexed by the integer hash prefix (sparse, with nil holes); the
# nil-hole and `entry` initialization handling is on elided lines — confirm.
181 @index.each_with_index do |hash, idx|
# Each packed record: sha1 + block start/size on disk + offset/size of the
# text inside the decompressed block — the coordinates Reader expects.
184 hash.each {|x| entry << pack(x.sha1, @block_ary[x.block].start, @block_ary[x.block].size, x.buflocation, x.size) }
189 # must be run after all the uris have been added, so their coordinates are known
190 # adds entries for the hardlinks into the main index
191 def process_hardlinks
193 @hardlinks.each do |file, target|
196 # in case of recursive redirects, which shouldn't happen, but alas
# Follow alias chains at most 3 deep; `recursion` is maintained on elided
# lines.
198 while @hardlinks[target] && recursion < 3
200 target = @hardlinks[target]
203 # we'll just traverse the index and fetch the coords of the target
# NOTE(review): every other call passes @idx_size as the second argument;
# this one relies on a default — confirm sha1_w_sub's default matches
# @idx_size, otherwise link and target can land in different buckets.
204 sha1, firstfour = sha1_w_sub(file)
205 sha1_target, firstfour_target = sha1_w_sub(target)
207 entries = @index[firstfour_target]
# `target` (the uri string) is reassigned here to the matching entry list.
210 target = entries.select {|entry| entry.sha1 == sha1_target}
212 # it really shouldn't be empty... if it is - the redirect is useless
# dup so the alias entry can carry its own sha1 (rewritten on an elided
# line) without clobbering the original entry's.
215 entry = target[0].dup # so we don't overwrite the original
217 # we just reuse the same entry, rewrite the sha1, and add it to the index
219 @index[firstfour] ||= []
220 @index[firstfour] << entry
224 @hardlinks = nil # clean up some memory
227 # output the block in buffer to file, store the coords, and clean the buffer
229 bf_compr = @compressor.compress(@buffer)
# Blocks are appended sequentially; @location always points at the next
# free byte in the output file.
230 writeloc(@file, bf_compr, @location)
# NOTE(review): @@block has a fourth member (:pages) that is never passed
# here, so it stays nil — confirm nothing downstream reads it.
231 @block_ary[@cur_block] = @@block.new(@cur_block, @location, bf_compr.size)
236 @location += bf_compr.size