zdump-7z.rb

   1 #!/usr/bin/ruby
%w(md5 zcompress find htmlshrinker shellwords).each {|x| require x}
   3
   4 def unpack(string)
   5   return string.unpack('H32V4' * (string.size/32))
   6 end
   7
   8 def pack(md5, bstart, bsize, start, size)
   9   return [md5, bstart, bsize, start, size].pack('H32V4')
  10 end
  11
  12 def md5subset(four)
  13   sprintf("%d", "0x" + four[0..3]).to_i
  14 end
  15
#ZFERRET = Ferret::Index::Index.new(:path => "#{ARGV[1]}.zferret")
# Shared HTMLShrinker instance, constructed from the second command-line
# argument. Webpage#initialize passes each file's contents through its
# #compress method.
# NOTE(review): unlike `name` further down there is no "default" fallback
# here, so ARGV[1] may be nil -- confirm HTMLShrinker.new accepts that.
HTMLSHRINKER = HTMLShrinker.new(ARGV[1])
  18
  19 # File lib/dipus/util.rb, line 229
  20 class String
  21   def rsplit(*args, &block)
  22     reverse.split(*args, &block).reverse.map{|s| s.reverse }
  23   end
  24 end
  25
  26
  27 class Webpage
  28   attr_reader :text, :compressed, :size, :compressed_size, :filename, :index_content, :block, :buflocation
  29
  30   def initialize(filename, block, buflocation)
  31     @filename = filename
  32     @block = block
  33     @text = HTMLSHRINKER.compress(File.read(filename))
  34     @size = @text.size
  35 #    @index_content = index_content
  36     @buflocation = buflocation
  37   end
  38
  39   def empty!
  40     @text = ''
  41     @index_content = ''
  42   end
  43
  44   def index_content
  45     content = ""
  46     case @filename
  47       when /.txt$/i
  48         content = @text
  49
  50       when /.htm$|.html$/i        # get the file, strip all <> tags
  51         content = @text.gsub(/\<head>.*?\<\/head>/im,"").gsub(/\<.*?\>/m, " ")
  52     end
  53     return content.strip
  54   end
  55 end
  56
  57 class Block
  58   attr_reader :number, :start, :size
  59   def initialize(number, start, size)
  60     @number = number
  61     @start = start
  62     @size = size
  63   end
  64 end
  65
  66 index = []
  67 block_ary = []
  68 cur_block, counter, buflocation, location, size, buffer = 0, 0, 0, 0, 0, ""
  69 name = (ARGV[1] ? ARGV[1] : "default")
  70
  71 t = Time.now
  72 puts "Indexing files in #{ARGV[0]}/ and writing the file #{name}.zindex and directory #{name}.zferret."
  73 zdump = File.open("#{name}.zdump", "w")
  74 ignore = ARGV[2] ? Regexp.new(ARGV[2]) : /^(Bilde~|Bruker|Pembicaraan_Pengguna~)/
  75 filelist = []
  76 `7za l #{ARGV[0]}`.each do |line|
  77   a, b = line.rsplit(" ", 2)
  78   next if b.nil?
  79   filelist << b if b.match("/") && b.match(".")
  80 end
  81 p filelist
  82 #Find.find(ARGV[0]) do |newfile|
  83 filelist.each do |newfile|
  84   puts newfile
  85   puts "7za x -y #{ARGV[0]} #{newfile}"
  86   `7za x #{ARGV[0]} #{newfile}`
  87 #  next if File.directory?(newfile) || !File.readable?(newfile)
  88   next if newfile =~ ignore
  89   wf = Webpage.new(newfile, cur_block, buflocation)
  90   puts "#{counter} files indexed." if counter.to_i / 100.0 == counter / 100
  91
  92   buffer << wf.text
  93   buflocation += wf.text.size
  94   wf.empty!
  95   counter += 1
  96   index << wf
  97   next if buffer.size < 900000
  98
  99   bf_compr = ZCompress::compress(buffer)
 100   zdump.write(bf_compr)
 101   block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
 102   buffer = ''
 103   buflocation = 0
 104   cur_block += 1
 105   location += bf_compr.size
 106   puts "Writing block no #{cur_block}"
 107
 108 #  ZFERRET << {:filename => wf.filename, :content => wf.index_content, :offset => location, :size => wf.compressed_size }
 109 #  location += wf.compressed_size
 110
 111 end
 112
 113 # to ensure last part of buffer is written
 114 bf_compr = ZCompress::compress(buffer)
 115 zdump.write(bf_compr)
 116 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
 117
 118 zdump.close
 119 puts "Finished, writing index. #{Time.now - t}"
 120
 121 pages = {}
 122 index.each do |file|
 123   pages[file.filename] = {:block_start => block_ary[file.block].start,
 124                           :block_size => block_ary[file.block].size,
 125                           :start => file.buflocation,
 126                           :size => file.size}
 127 end
 128 subindex = []
 129
 130 puts "Sorted onetime. #{Time.now - t}"
 131 pages.each_pair do |x, y|
 132   md5 = MD5.md5(x).hexdigest
 133   entry = pack(md5, y[:block_start], y[:block_size], y[:start], y[:size])
 134   firstfour = md5subset(md5)
 135   subindex[firstfour] = "" if subindex[firstfour].nil?
 136   subindex[firstfour] << entry
 137 end
 138
 139 puts "Sorted another time. #{Time.now - t}"
 140
 141 newindex = File.open(name +".zindex",'w+')
 142 location = (65535*8)
 143 # p = File.open(name + ".zlog",'w')
 144 subindex.each_with_index do |entry, idx|
 145   next if entry.nil?
 146   newindex.seek(idx*8)
 147   newindex.print([location, entry.size].pack('V2'))
 148   newindex.seek(location)
 149   newindex.print(entry)
 150
 151   # p << "*" * 80 << "\n"
 152   # p << "seek #{idx*8} location #{location} size #{entry.size}" << "\n"
 153   # p << unpack(entry).join(":") << "\n"
 154
 155   location += entry.size
 156 end
 157 puts "Finished. #{Time.now - t}"
 158 newindex.close
 159 # p.close