#!/usr/bin/ruby
# zdump.rb
#
# Program that packs a directory tree into a zdump file.
# By Stian Haklev (shaklev@gmail.com), 2007
# Released under MIT and GPL licenses
#
# Usage: ruby zdump.rb [options] <directory> <output file>
#
# The page template is taken from index.html inside <directory> unless
# --templatefile names another file.

%w(sha1 rubygems zarchive find htmlshrinker zutil cgi trollop).each {|x| require x}
include ZUtil

# Flush every write immediately so progress output is not held in a buffer.
STDOUT.sync = true

# do commandline parsing
opts = Trollop::options do
  version "zip-doc 0.1 (c) 2007 Stian Haklev (MIT/GPL)"
  banner <<-EOS
zdump.rb is part of the zip-doc suite. It basicallly processes the contents of a wikipedia-*-html.7z file (that has already been unextracted), and generates a .zdump file that can be used with mongrel-web.rb.

Usage:
       ruby zdump.rb [options] <path> <filename>
       (for example ruby zdump.rb ../Downloads/id id.zdump)
where [options] are:
EOS

  opt :ignore, "Comma-separated list of file patterns to ignore, f. ex: ^User%talk,Discussion. ^ means begins at the start of a line, and % matches anything", :type => :string
  opt :idxsize, "Size of index, recommend 2 for small collections and 4 for Wikipedia", :type => :integer, :default => 4
  opt :zlib, "Use zlib instead of bzip2"
  opt :suffix, "No of letters to remove from path (default is usually good enough)", :type => :integer
  opt :blocksize, "Blocksize for compression in kb, defaults to 900", :type => :integer, :default => 900
  opt :templatefile, "Name of template file (defaults to index.html in given directory)", :type => :string
end

# Validate option values. Each check is skipped when the option is unset.
if opts[:idxsize] && !opts[:idxsize].between?(1, 7)
  Trollop::die :idxsize, "out of range, must be between 1 and 7"
end
# The range applies to the block size itself (the original tested :idxsize here).
if opts[:blocksize] && !opts[:blocksize].between?(1, 10000)
  Trollop::die :blocksize, "out of range, must be between 1 and 10000"
end
if opts[:templatefile] && !File.exist?(opts[:templatefile])
  Trollop::die :templatefile, "does not exist"
end

# check the rest of the arguments
Trollop::die "Wrong number of arguments" unless ARGV.size == 2
dir, name = ARGV

source_ok = File.directory?(dir) && File.readable?(dir)
Trollop::die "Directory #{dir} does not exist or is not readable" unless source_ok

output_dir = File.dirname(name)
Trollop::die "Directory #{output_dir} does not exist" unless File.directory?(output_dir)

# transform ignore to regexp: '%' becomes a non-greedy wildcard and the
# comma-separated patterns become alternatives. nil when --ignore is not given.
ignore = nil
if opts[:ignore]
  ignore = Regexp.new(opts[:ignore].gsub('%', '.*?').split(',').join('|'))
end

shrinker = HTMLShrinker.new

t = Time.now
base = File.join(dir, "/")

# Resolve the template up front so a missing template is reported before the
# archive writer is created.
template_path = opts[:templatefile] || (base + "index.html")
Trollop::die "Template file #{template_path} does not exist" unless File.exist?(template_path)

puts "Indexing files in #{base} and writing the file #{name}"
# Number of leading characters removed from each path before it is stored.
to_strip = opts[:suffix] || base.size
compr = opts[:zlib] ? ZArchive::METHOD_ZLIB : ZArchive::METHOD_BZ2
# --blocksize is given in kb; it is multiplied by 1000 before being passed on.
archive = ZArchive::Writer.new(name, compr, opts[:idxsize], opts[:blocksize] * 1000)

template = shrinker.extract_template(File.read(template_path))
archive.add("__Zdump_Template__", template)

# NOTE(review): both counters start at 1, as in the original; confirm whether
# that is meant to account for the template entry or is an off-by-one.
no_of_files = 1
all_counter = 1
puts "Reading filelist."
filelist = []
Find.find(base) do |newfile|
  all_counter += 1
  next if File.directory?(newfile) || !File.readable?(newfile)
  # NOTE(review): the pattern is tested against the full path as yielded by
  # Find (including the base directory), so a pattern anchored with ^ only
  # matches when the directory argument itself starts with it -- confirm
  # whether the relative path or the basename was intended.
  next if ignore && newfile =~ ignore
  filelist << newfile
  no_of_files += 1
end

puts "Filelist read, selected #{no_of_files} out of #{all_counter}, making up #{npp(100 * no_of_files.to_f / all_counter.to_f)}%."
puts "Beginning to compress."
t2 = Time.now
filelist.each_with_index do |newfile, counter|
  # Report progress once every 1000 files.
  if counter % 1000 == 0
    # Nothing has been processed on the very first iteration, so the rates
    # below would be divisions by zero; skip the statistics in that case.
    if counter > 0
      page_per_sec = counter.to_f / (Time.now - t2).to_f
      puts "\n#{counter} pages indexed in #{npp(Time.now - t)} seconds, average #{npp(page_per_sec)} files per second. #{archive.hardlinks.size} redirects, #{npp(archive.hardlinks.size.to_f * 100 / counter.to_f)} percentage of all pages."
      puts "Estimated time left: #{npp(((no_of_files - counter).to_f / page_per_sec) /60)} minutes."
    end
    STDOUT.print "Writing block: "
  end

  text = shrinker.compress(File.read(newfile))
  if text[0..2] == "#R "
    # Output starting with "#R " is stored as a hardlink to the text that
    # follows the marker (reported as a "redirect" in the progress line).
    # NOTE(review): the full path is passed here, while regular entries are
    # stored under the stripped path -- confirm this asymmetry is intended.
    archive.add_hardlink(newfile, text[3..-1])
  else
    archive.add(newfile[to_strip..-1], text)
  end
end
filelist = nil # memory cleanup

puts "\n\nFinished, flushing index/processing redirects. #{npp(Time.now - t)}"
archive.flush # to make sure all blocks have been written