Initial import
[zip-doc.git] / zdump.rb
blob9b68924d8de57ebd15195631efc932723db410c1
1 #!/usr/bin/ruby
2 %w(md5 zcompress find htmlshrinker zcompress).each {|x| require x}
3          
# Decodes a run of packed index entries.
#
# Each entry occupies 32 bytes in the 'H32V4' layout: a 16-byte digest read
# back as 32 hex characters, followed by four 32-bit little-endian unsigned
# integers. A trailing partial entry (fewer than 32 bytes) is ignored.
#
# @param string [String] zero or more concatenated 32-byte entries
# @return [Array] flat array holding five values per entry, in entry order
def unpack(string)
  # Count entries in bytes: String#size counts characters on Ruby 1.9+, which
  # under-counts when binary data happens to carry a multibyte encoding tag.
  string.unpack('H32V4' * (string.bytesize / 32))
end
7   
# Encodes one index entry as a 32-byte binary record ('H32V4').
#
# @param md5 [String] 32 hex characters, stored as 16 raw bytes
# @param bstart [Integer] first 32-bit little-endian field
# @param bsize [Integer] second 32-bit little-endian field
# @param start [Integer] third 32-bit little-endian field
# @param size [Integer] fourth 32-bit little-endian field
# @return [String] the packed 32-byte record
def pack(md5, bstart, bsize, start, size)
  [md5, bstart, bsize, start, size].pack('H32V4')
end
# Reads the first four characters of a hex string as an integer.
#
# With four hex digits the result lies in 0..0xFFFF.
#
# @param four [String] hex string; only its first four characters are used
# @return [Integer] numeric value of those leading hex digits
# @raise [ArgumentError] if the leading characters are not valid hex digits
def md5subset(four)
  # Convert directly instead of formatting to a decimal string and re-parsing it.
  Integer("0x" + four[0..3])
end
15                                
# Extension of the core IO class used by this script for positioned writes.
class IO
  # Moves to an absolute position in the stream and writes +text+ there.
  # The stream position is left just after the written data.
  #
  # @param text [String] data to write
  # @param offset [Integer] absolute position to seek to before writing
  # @return [Integer] whatever IO#write returns (the number of bytes written)
  def writeloc(text, offset)
    seek(offset)
    write(text)
  end
end
22                                                  
# Shared shrinker instance used by Webpage to compress page text.
# NOTE(review): it is constructed with ARGV[1], the same argument the script
# uses as the output file name -- confirm what HTMLShrinker does with it.
HTMLSHRINKER = HTMLShrinker.new(ARGV[1])
# One source file prepared for the dump: its shrunk text plus the bookkeeping
# needed to find that text again (block number and offset inside the block).
class Webpage
  # +compressed+ and +compressed_size+ are exposed but never assigned in this
  # class, so they read as nil; +index_content+ is only set by #empty!.
  attr_reader :text, :compressed, :size, :compressed_size, :filename, :index_content, :block, :buflocation

  # Reads +filename+ and shrinks its contents through HTMLSHRINKER.
  #
  # @param filename [String] path of the file to read
  # @param block [Integer] number of the block this page is assigned to
  # @param buflocation [Integer] offset of this page inside that block's buffer
  def initialize(filename, block, buflocation)
    @filename = filename
    @block = block
    @buflocation = buflocation
    @text = HTMLSHRINKER.compress(File.read(filename))
    # Record the length in bytes: it is stored next to byte offsets, and
    # String#size counts characters on Ruby 1.9+.
    @size = @text.bytesize
  end

  # Drops the page text so the object can be kept cheaply; +size+ is retained.
  def empty!
    @text = ''
    @index_content = ''
  end
end
41             
# Main script: packs every readable file under ARGV[0] into a single dump file.
#
# Usage: zdump.rb SOURCE_DIR [OUTPUT_NAME] [IGNORE_REGEX]
#
# Layout produced by the code below:
#   bytes 0..3    offset of the bucket table (32-bit little-endian)
#   bytes 4..     ZCompress-compressed blocks of shrunk page text
#   bucket table  one 8-byte slot per bucket: offset and byte length ('V2')
#   bucket data   per bucket, the concatenated 32-byte entries built by pack

# One compressed block: its number, its offset in the dump and its compressed
# length. (:pages is declared but never assigned in this script.)
Block = Struct.new(:number, :start, :size, :pages)

abort "usage: zdump.rb SOURCE_DIR [OUTPUT_NAME] [IGNORE_REGEX]" unless ARGV[0]

index = []
block_ary = []
uncompr_size = compr_size = cur_block = counter = buflocation = 0
buffer = ''
location = 4 # (to hold start of index)

block_threshold = 900_000 # uncompressed bytes collected before a block is written
bucket_count = 0x10000    # md5subset yields 0..0xFFFF, so 65536 table slots are needed
slot_size = 8             # each slot is two 32-bit values

name = ARGV[1] || "default"

t = Time.now
puts "Indexing files in #{ARGV[0]}/ and writing the file #{name}"
# Binary mode: the dump holds packed integers and compressed data.
zdump = File.open(name, "wb")
zdump.seek(location)

ignore = ARGV[2] ? Regexp.new(ARGV[2]) : /^(Bilde~|Bruker|Pembicaraan_Pengguna~)/

Find.find(ARGV[0]) do |newfile|
  next if File.directory?(newfile) || !File.readable?(newfile)
  # NOTE(review): the pattern is matched against the full path yielded by Find,
  # so the default ^-anchored pattern only matches when the path itself starts
  # with one of these prefixes -- confirm whether the basename was intended.
  next if newfile =~ ignore

  counter += 1
  if counter % 1000 == 0
    elapsed = Time.now - t
    puts "#{counter} files indexed in #{elapsed}, average #{counter.to_f / elapsed} files per second. #{uncompr_size} data compressed to #{compr_size}, compression ratio #{compr_size.to_f / uncompr_size.to_f}."
  end

  wf = Webpage.new(newfile, cur_block, buflocation)
  # Collect the page text before it is discarded; without this the buffer
  # stays empty and no page data ever reaches the dump.
  buffer << wf.text
  buflocation += wf.text.bytesize
  wf.empty!
  index << wf
  next if buffer.bytesize < block_threshold

  # The buffer is large enough: compress it and start a new block.
  bf_compr = ZCompress.compress(buffer)
  compr_size += bf_compr.bytesize
  zdump.write(bf_compr)
  block_ary[cur_block] = Block.new(cur_block, location, bf_compr.bytesize)
  buffer = ''
  uncompr_size += buflocation
  buflocation = 0
  cur_block += 1
  location += bf_compr.bytesize
  puts "Writing block no #{cur_block}"
end

# to ensure last part of buffer is written
bf_compr = ZCompress.compress(buffer)
zdump.write(bf_compr)
block_ary[cur_block] = Block.new(cur_block, location, bf_compr.bytesize)
location += bf_compr.bytesize

# writing start of index
zdump.writeloc([location].pack('V'), 0)
puts "location #{location}"
puts "Finished, writing index. #{Time.now - t}"

# Group the packed entries into buckets keyed by the first four hex digits of
# the MD5 of each file name.
subindex = []
index.each do |file|
  md5 = Digest::MD5.hexdigest(file.filename)
  firstfour = md5subset(md5)
  blk = block_ary[file.block]
  entry = pack(md5, blk.start, blk.size, file.buflocation, file.size)
  subindex[firstfour] ||= ""
  subindex[firstfour] << entry
end
puts "Sorted one time. #{Time.now - t}"

indexloc = location
# Bucket data starts right after the complete table, so the last slot cannot
# overlap the first bucket's data.
subidxloc = indexloc + (bucket_count * slot_size)

subindex.each_with_index do |entry, idx|
  next if entry.nil?
  # Each slot points at its own bucket's data, not at the start of the table.
  zdump.writeloc([subidxloc, entry.bytesize].pack('V2'), (idx * slot_size) + indexloc)
  zdump.writeloc(entry, subidxloc)
  subidxloc += entry.bytesize
end

zdump.close
puts "Finished. #{Time.now - t}"
120 puts "Finished. #{Time.now - t}"