2 %w(md5 zcompress find htmlshrinker zcompress).each {|x| require x}
# Decodes a string of packed index records. The 'H32V4' template is repeated
# once per 32 bytes of input: H32 reads 32 hex digits (16 bytes) and V4 reads
# four 32-bit little-endian unsigned integers (16 bytes), so each record is
# 32 bytes. The result is one flat array with five values per record.
# NOTE(review): the enclosing `def` line is not visible in this excerpt.
5 return string.unpack('H32V4' * (string.size/32))
# Encodes one index record as a 32-byte binary string.
#
# md5    - digest given as 32 hex digits (packed high nibble first by 'H32').
# bstart, bsize, start, size - integers, each packed as a 32-bit
#          little-endian unsigned value ('V').
#
# Returns the packed String.
8 def pack(md5, bstart, bsize, start, size)
9 return [md5, bstart, bsize, start, size].pack('H32V4')
# Takes the first four characters of `four`, prefixes them with "0x", lets
# sprintf("%d") convert that hex literal to a decimal string, then converts
# the string to an Integer. For four hex digits the result is 0..65535.
# NOTE(review): presumably the body of `md5subset` (called further down when
# the index is built); the `def` line is not visible here -- confirm.
13 sprintf("%d", "0x" + four[0..3]).to_i
# Body not visible in this excerpt. Call sites invoke it on the output File as
# writeloc(data, offset); presumably it writes `text` at byte position
# `offset` of the receiver -- TODO confirm against the full definition.
17 def writeloc(text, offset)
# Single HTMLShrinker instance used to compress every page's text.
# It is constructed from the second command-line argument (ARGV[1]), the same
# argument the main script uses as the output file name.
23 HTMLSHRINKER = HTMLShrinker.new(ARGV[1])
# Read-only accessors for the per-page record.
# NOTE(review): the enclosing class header is not visible; the main loop
# builds these objects with Webpage.new(file, block, buflocation), so this is
# presumably class Webpage -- confirm.
26 attr_reader :text, :compressed, :size, :compressed_size, :filename, :index_content, :block, :buflocation
# filename    - path of the file to read.
# block       - value supplied by the caller (the main loop passes its
#               current block counter).
# buflocation - value supplied by the caller (the main loop passes its running
#               buffer offset).
28 def initialize(filename, block, buflocation)
# The whole file is read and passed through HTMLSHRINKER.compress; the result
# is what `text` exposes.
31 @text = HTMLSHRINKER.compress(File.read(filename))
33 @buflocation = buflocation
# Describes one block. The main script fills in number, start and size when it
# creates a Block and leaves `pages` unset (nil).
42 Block = Struct.new(:number, :start, :size, :pages)
# Main indexing pass: walk ARGV[0], wrap each file in a Webpage, and compress
# accumulated text into blocks.
# NOTE(review): `t`, `buffer`, `block_ary`, the counter increments and the
# statements that write blocks to the file are not visible in this excerpt.

# All six running counters start at 0.
46 uncompr_size, compr_size, cur_block, counter, buflocation, size = *[0] * 6
# Block data starts at offset 4; the first four bytes are later filled with
# the packed index position (a single 'V' value written at offset 0).
48 location = 4 # (to hold start of index)
# Output file name: the second command-line argument, or "default".
50 name = (ARGV[1] ? ARGV[1] : "default")
53 puts "Indexing files in #{ARGV[0]}/ and writing the file #{name}"
# NOTE(review): opened in mode "w" (not "wb") although packed binary data is
# written to it -- confirm this is safe on the target platforms.
54 zdump = File.open("#{name}", "w")
# Paths matching this pattern are skipped; the third command-line argument
# overrides the built-in default.
57 ignore = ARGV[2] ? Regexp.new(ARGV[2]) : /^(Bilde~|Bruker|Pembicaraan_Pengguna~)/
59 Find.find(ARGV[0]) do |newfile|
# Skip directories and anything that cannot be read.
60 next if File.directory?(newfile) || !File.readable?(newfile)
# NOTE(review): the pattern is matched against the path exactly as Find
# yields it, not against the base name -- confirm the `^` anchor behaves as
# intended for the default pattern.
61 next if newfile =~ ignore
# True when the float quotient equals the integer quotient, i.e. when counter
# is a whole multiple of 1000 (counter starts as an Integer above).
64 if counter.to_i / 1000.0 == counter / 1000
# Progress report: elapsed time since `t`, files per second, and the
# compressed/uncompressed size ratio so far.
65 puts "#{counter} files indexed in #{Time.now - t}, average #{counter.to_f / (Time.now - t)} files per second. #{uncompr_size} data compressed to #{compr_size}, compression ratio #{compr_size.to_f / uncompr_size.to_f}."
# Build the page record, then advance the running offset by the size of its
# shrunk text.
68 wf = Webpage.new(newfile, cur_block, buflocation)
69 buflocation += wf.text.size
# Keep accumulating until the buffer reaches 900000; only then is a block
# emitted.
72 next if buffer.size < 900000
# Compress the accumulated buffer and add its size to the compressed total.
74 bf_compr = ZCompress.compress(buffer)
75 compr_size += bf_compr.size
# Record where this block starts in the output and how large it is.
77 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
79 uncompr_size += buflocation
# The next block starts immediately after this one.
82 location += bf_compr.size
83 puts "Writing block no #{cur_block}"
87 # to ensure last part of buffer is written
# Same compress call as in the loop (`::` and `.` both invoke the method);
# handles whatever is left in the buffer after the directory walk.
88 bf_compr = ZCompress::compress(buffer)
90 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
91 location += bf_compr.size
# Index pass: record where the index begins, group packed entries into
# buckets keyed by a value derived from each file name's MD5, then write an
# 8-byte-per-slot lookup table followed by the bucket contents.
# NOTE(review): `index`, `subindex`, `indexloc` and `block_ary` are defined in
# lines not visible in this excerpt.
93 # writing start of index
# `location` now points just past the last block; it is stored at offset 0 as
# one 32-bit little-endian unsigned integer.
94 zdump.writeloc([location].pack('V'), 0)
95 puts "location #{location}"
96 puts "Finished, writing index. #{Time.now - t}"
# NOTE(review): each_pair is called with a single block parameter, and `file`
# is then used as a page record (filename, block, buflocation, size) --
# confirm what `index` yields.
99 index.each_pair do |file|
# The bucket key comes from the hex MD5 of the file name.
100 md5 = Digest::MD5.hexdigest( file.filename )
101 firstfour = md5subset( md5 )
# One 32-byte record: digest, start and size of the containing block, then the
# page's buflocation and size.
102 entry = pack(md5, block_ary[file.block].start, block_ary[file.block].size, file.buflocation, file.size)
# Append the record to its bucket, creating the bucket string on first use.
103 subindex[firstfour] ||= ""
104 subindex[firstfour] << entry
107 puts "Sorted one time. #{Time.now - t}"
# Bucket data starts after the lookup table, which is addressed below as
# indexloc + idx * 8.
# NOTE(review): four hex digits give 65536 possible keys (0x0000..0xFFFF);
# 65535*8 leaves room for 65535 slots -- confirm this is not off by one.
110 subidxloc = (65535*8) + indexloc # 65535 = 0xFFFF
# NOTE(review): if `subindex` is a Hash, `entry` is a [key, value] pair and
# `idx` is the enumeration position rather than the bucket key -- confirm.
112 subindex.each_with_index do |entry, idx|
# Table slot for idx: two 32-bit little-endian values (a position and the
# entry size).
# NOTE(review): the position written is `location`, which does not change in
# the visible loop, while the entry itself is written at `subidxloc` --
# confirm which one readers of the file expect.
114 zdump.writeloc( [location, entry.size].pack('V2'), (idx * 8) + indexloc )
# Write the bucket contents, then advance to the next free position.
115 zdump.writeloc(entry, subidxloc)
116 subidxloc += entry.size
120 puts "Finished. #{Time.now - t}"