1 # Library for storing and accessing arbitrary chunks of compressed data.
2 # By Stian Haklev (shaklev@gmail.com), 2007
3 # Released under MIT and GPL licenses
7 # archive = ZArchive::Writer.new('eo.zdump')
8 # index = File.read('index.html')
9 # archive.add('index.html', index)
10 # archive.add_hardlink('index.htm', 'index.html')
13 # archive = ZArchive::Reader.new('eo.zdump')
14 # puts(archive.get('index.html'))
# Load local helper libraries: 'sha1' (SHA-1 digests) and 'zutil'
# (presumably supplies sha1_w_sub/sha1subset/readloc/writeloc used
# below — TODO confirm against zutil.rb).
16 %w(sha1 zutil).each {|x| require x}
23 # methods are bz2 and zlib
# NOTE(review): this is an elided view of the Compressor class; lines are
# missing between the fragments below — do not assume they are adjacent.
# Stores the chosen compression method and lazily requires only the
# matching library (METHOD_BZ2 -> 'bz2', otherwise 'zlib').
26 def initialize(method)
# Lazy require so only one of the two compression libraries is loaded.
# NOTE(review): references @method, whose assignment is on an elided line.
28 require (@method == METHOD_BZ2 ? 'bz2' : 'zlib')
# Decompression branches (pre-1.9 `when x : expr` one-line case syntax);
# the surrounding case/end lines are elided from this view.
33 when METHOD_BZ2 : BZ2::Reader.new(txt).read
34 when METHOD_ZLIB : Zlib::Inflate.new.inflate(txt)
38 # compresses a textchunk, that is able to be uncompressed independently
# Compression branches: bz2 writer flushed to bytes, or a single-shot
# zlib deflate finished with Zlib::FINISH so the chunk is self-contained.
41 when METHOD_BZ2 : (BZ2::Writer.new << txt).flush
42 when METHOD_ZLIB : Zlib::Deflate.new.deflate(txt, Zlib::FINISH)
# NOTE(review): elided fragments of two Reader methods (initialize and,
# presumably, a size/lookup method); their def/end lines are not visible.
# Reads the 12-byte file header: index offset, meta offset (both 32-bit
# little-endian), then compression method and index-prefix size (1 byte
# each per 'VVCC' — only 10 bytes are consumed of the 12 read).
# NOTE(review): the handle is not visibly closed in this view — confirm
# the elided lines close it, otherwise this leaks a file descriptor.
52 zdump = File.open(@file, 'r')
53 @zindex_loc, @meta_loc, @compress, idx_size = zdump.read(12).unpack('VVCC')
# Build the decompressor matching the method recorded in the header.
55 @compressor = Compressor.new(@compress)
# Second fragment: re-reads the index offset from the file start, then
# looks up the entry for `url`; loc[3] appears to be the stored size
# (0 when the url is absent) — TODO confirm against the entry layout.
59 zdump = File.open(@file, 'r')
61 zindex_loc = zdump.read(4).unpack('V')[0]
62 loc = get_location(url, zdump, zindex_loc)
63 return loc ? loc[3] : 0
# Fetches and decompresses the text stored under `url`, or nil if absent.
# (def/end lines for this method are elided from this view.)
67 # we open this on each request, because otherwise it gets messy with threading
68 zdump = File.open(@file, 'r')
# loc is the coordinate tuple from the index; splatted into get_text
# as (block_offset, block_size, offset, size).
70 loc = get_location(url, zdump, @zindex_loc)
71 return loc ? get_text(zdump, *loc) : nil
# Extracts one stored document: reads the compressed block of
# `block_size` bytes at `block_offset`, decompresses the whole block,
# then slices out the document at [offset, size] within the plain text.
# (The closing `end` is on an elided line.)
74 def get_text(zdump, block_offset, block_size, offset, size)
75 text_compr = readloc( zdump, block_size, block_offset )
76 text_uncompr = @compressor.uncompress( text_compr )
77 return text_uncompr[offset, size]
# Fragment of the metadata accessor: loads the Marshal-dumped meta hash
# written by Writer#finish. An elided line presumably seeks to @meta_loc
# first — TODO confirm.
# NOTE(review): Marshal.load on file contents is unsafe if the archive
# can come from an untrusted source.
81 zdump = File.open(@file, 'r')
83 Marshal.load(zdump.read)
# Resolves `url` to its coordinate tuple [block_offset, block_size,
# offset, size] via the two-level on-disk index, or nil when not found.
# (Some interior lines and the closing `end` are elided from this view.)
86 def get_location(url, zdump, zindex_loc)
# sha1 is the full hex digest; firstfour is an integer derived from its
# first @idx_size hex chars, used as the bucket number.
87 sha1, firstfour = sha1_w_sub(url, @idx_size)
89 # uses this number to calculate the location of the metaindex entry
# Each metaindex slot is 8 bytes: two little-endian 32-bit values.
90 loc = (firstfour * 8) + zindex_loc
92 # finds the location of the index entry
93 start, size = readloc(zdump, 8, loc).unpack('V2')
94 idx = readloc(zdump, size, start)
96 # the index consists of a number of 36 byte entries. it sorts through
97 # until it finds the right one.
# Each 36-byte entry = 20-byte sha1 ('H40') + four 32-bit coords ('V4').
# NOTE(review): String#pop(36) is not core Ruby — presumably added by
# zutil; assumed to remove and return 36 bytes and leave idx nil/empty
# when exhausted. Modifier-until tests the condition first, so `hex`
# is nil (not equal to sha1) on the initial check.
100 hex, *coordinates = idx.pop(36).unpack('H40V4') until ( hex == sha1 || idx.nil? )
101 return coordinates if hex == sha1
107 attr_reader :location, :hardlinks
# Shared record types (class variables): one entry per stored uri, one
# record per compressed block written to disk.
109 @@entry = Struct.new(:uri, :block, :buflocation, :size, :sha1)
110 @@block = Struct.new(:number, :start, :size, :pages)
112 # the uri to open, the minimum size of blocks, and zlib or bz2
# blocksize defaults to 900_000 — presumably chosen to match bzip2's
# block size; idx_size is the number of sha1 hex chars used for bucketing.
113 def initialize(file, method = METHOD_BZ2, idx_size = 4, blocksize = 900000)
114 @compressor = Compressor.new(method)
115 @blocksize = blocksize
# NOTE(review): opened in text mode "w"; on Windows binary output would
# need "wb" — confirm intended platforms.
116 @file = File.open(file, "w")
# Running counters: current block number, offset within the uncompressed
# buffer, and total size. (Some initializer lines are elided here.)
118 @cur_block, @buflocation, @size = 0, 0, 0
# Write position starts after the 12-byte header reserved for offsets.
120 @location = 12 # (to hold start of index)
126 # adds a blob of text that will be accessible through a certain uri
# (The `def add(uri, text, ...)` line itself is elided from this view.)
128 # if redirect, add to index and keep going
# Record where this document will live: current block number and its
# offset within the uncompressed buffer; sha1 is filled in just below.
129 entry = @@entry.new(uri, @cur_block, @buflocation, text.size)
131 # calculate the sha1 code, use the first four characters as the index
132 entry.sha1, firstfour = sha1_w_sub(entry.uri, @idx_size)
134 # add this entry to the index in the right place
135 @index[firstfour] ||= []
136 @index[firstfour] << entry
138 # add to the buffer, and update the counter
# (The line appending `text` to @buffer is elided.)
140 @buflocation += text.size
# Compress and write out the buffer once it exceeds the target blocksize.
142 flush_block if @buffer.size > @blocksize
145 # hardlinks the contents of one uri to that of another
# Just records the alias; the actual index entry is created later by
# process_hardlinks, once the target's coordinates are known.
# (The closing `end` is on an elided line.)
146 def add_hardlink(uri, targeturi)
147 @hardlinks[uri] = targeturi
154 # finish up, process hardlinks, and write index to file
# (Elided view: the def line, hardlink processing, and several interior
# lines are missing; fragments below are not necessarily adjacent.)
156 flush_block unless @buffer.empty?
159 # writing the location of the archive (it's after the dump data)
# Bytes 0..3 of the file: little-endian offset of the index section.
160 writeloc(@file, [@location].pack('V'), 0)
# Compute where bucket data starts: past the metaindex slot table, whose
# slot count is derived from the widest possible sha1 prefix.
163 location = (sha1subset('FFFFFFFFFF', @idx_size) * 8) + indexloc
164 # p = File.open("zlog", "w")
165 each_entry_with_index do |entry, idx|
# For each bucket: write its (start, size) slot into the metaindex
# table, then the packed entries themselves at `location`.
168 writeloc(@file, [location, entry.size].pack('V2'), (idx * 8) + indexloc)
169 writeloc(@file, entry, location)
171 # p << "*" * 80 << "\n"
172 # p << "seek #{(idx * 8) + indexloc} location #{location} size #{entry.size}" << "\n"
173 # p << unpack(entry).join(":") << "\n"
175 location += entry.size
# Bytes 4..9: meta offset (V) plus compression method and idx size (CC)
# — matches the 'VVCC' header the Reader unpacks.
179 writeloc(@file, [location, @compressor.method, @idx_size].pack('VCC'), 4)
# Optional Marshal-dumped metadata appended after the index.
181 writeloc(@file, Marshal.dump(@meta), @location) if defined?(@meta)
187 # yields an entry that is ready to be written to the index
# Iterates the bucket table, packing each bucket's entries into one
# binary string. NOTE(review): `hash.each {|x| ...}` with x.sha1 implies
# @index is an Array of entry-lists keyed by integer bucket — confirm;
# the lines building/yielding `entry` and the closing ends are elided.
188 def each_entry_with_index
189 @index.each_with_index do |hash, idx|
192 hash.each {|x| entry << pack(x.sha1, @block_ary[x.block].start, @block_ary[x.block].size, x.buflocation, x.size) }
197 # must be run after all the uris have been added, so their coordinates are known
198 # adds entries for the hardlinks into the main index
# (Elided view: several interior lines and end keywords are missing.)
199 def process_hardlinks
201 @hardlinks.each do |file, target|
204 # in case of recursive redirects, which shouldn't happen, but alas
# Follow chained links at most 3 hops (recursion counter initialized
# on an elided line).
206 while @hardlinks[target] && recursion < 3
208 target = @hardlinks[target]
211 # we'll just traverse the index and fetch the coords of the target
# NOTE(review): called without @idx_size here, unlike the other
# sha1_w_sub call sites — confirm the helper's default matches @idx_size,
# otherwise link buckets could disagree with entry buckets.
212 sha1, firstfour = sha1_w_sub(file)
213 sha1_target, firstfour_target = sha1_w_sub(target)
215 entries = @index[firstfour_target]
# Find the target's entry within its bucket by full sha1.
218 target = entries.select {|entry| entry.sha1 == sha1_target}
220 # it really shouldn't be empty... if it is - the redirect is useless
223 entry = target[0].dup # so we don't overwrite the original
225 # we just reuse the same entry, rewrite the sha1, and add it to the index
227 @index[firstfour] ||= []
228 @index[firstfour] << entry
232 @hardlinks = nil # clean up some memory
235 # output the block in buffer to file, store the coords, and clean the buffer
# (The def line, buffer reset, and block-counter increment are elided.)
237 bf_compr = @compressor.compress(@buffer)
238 writeloc(@file, bf_compr, @location)
# Remember where this block landed so index entries can reference it.
239 @block_ary[@cur_block] = @@block.new(@cur_block, @location, bf_compr.size)
244 @location += bf_compr.size