2 %w(md5 zcompress find htmlshrinker zcompress).each {|x| require x}
# Decodes a string of concatenated index records: the 'H32V4' format is repeated
# once per 32 bytes of input, so the result is one flat array holding five
# values per record (a 32-digit hex string followed by four 32-bit
# little-endian unsigned integers).
# NOTE(review): string.size/32 is integer division, so any trailing partial
# record is silently ignored -- confirm that is intended.
5 return string.unpack('H32V4' * (string.size/32))
# Serialises one index entry into a fixed-width 32-byte binary record.
#
# md5    - hex digest string; 'H32' stores its first 32 hex digits as 16 bytes.
# bstart, bsize, start, size - integers, each stored by 'V' as a 32-bit
#          little-endian unsigned value (values above 2**32 - 1 will not fit).
#
# Returns the packed String.
8 def pack(md5, bstart, bsize, start, size)
9 return [md5, bstart, bsize, start, size].pack('H32V4')
# Interprets the first four characters of `four` as a hexadecimal number:
# "0x" is prepended, sprintf("%d") converts that string to its decimal text,
# and to_i turns the text into an Integer (0..65535 for four hex digits).
# NOTE(review): four[0..3].hex would give the same number directly -- confirm
# before simplifying.
13 sprintf("%d", "0x" + four[0..3]).to_i
16 #ZFERRET = Ferret::Index::Index.new(:path => "#{ARGV[1]}.zferret")
# Single HTMLShrinker instance built from the second command-line argument;
# HTMLSHRINKER.compress is what page text is run through when a file is read.
# NOTE(review): ARGV[1] is passed through unchecked here, while the script
# further down substitutes "default" when it is missing -- confirm that
# HTMLShrinker.new copes with nil or apply the same default.
17 HTMLSHRINKER = HTMLShrinker.new(ARGV[1])
19 # File lib/dipus/util.rb, line 229
# Right-hand split: reverses the receiver, splits it with the given arguments,
# then puts the pieces back in their original order and un-reverses each one.
# With a limit, the unsplit remainder therefore ends up at the front, e.g. a
# limit of 2 yields [everything-before-the-last-field, last-field].
# NOTE(review): the separator itself is not reversed, so a multi-character
# string or a regexp separator is matched against reversed text; only
# symmetric separators such as the " " used by this script are clearly safe.
# NOTE(review): relies on an implicit receiver with #reverse and #split --
# presumably defined inside class String; the class header is not in view.
21 def rsplit(*args, &block)
22 reverse.split(*args, &block).reverse.map{|s| s.reverse }
# Read-only accessors for the page's state.
# NOTE(review): only the assignments to @text and @buflocation are visible in
# this excerpt; confirm the remaining readers are actually populated.
28 attr_reader :text, :compressed, :size, :compressed_size, :filename, :index_content, :block, :buflocation
# filename    - path of the file to read.
# block       - block value supplied by the caller (the script passes its
#               current block number).
# buflocation - offset supplied by the caller (the script passes its running
#               total of text sizes for the current buffer).
30 def initialize(filename, block, buflocation)
# Whole file is read into memory and passed through the shared HTMLSHRINKER.
33 @text = HTMLSHRINKER.compress(File.read(filename))
35 # @index_content = index_content
36 @buflocation = buflocation
# NOTE(review): the dots in this pattern are unescaped, so they match any
# character (e.g. "xhtm"), and $ anchors at end of line rather than end of
# string; /\.html?\z/i is probably what was meant -- confirm before changing.
50 when /.htm$|.html$/i # get the file, strip all <> tags
# Removes the whole <head>...</head> section (case-insensitive, spanning
# newlines), then replaces every remaining <...> tag with a single space.
51 content = @text.gsub(/\<head>.*?\<\/head>/im,"").gsub(/\<.*?\>/m, " ")
# Read-only accessors describing one compressed block.
58 attr_reader :number, :start, :size
# As called by the script: number is the block's index, start is the running
# sum of earlier compressed block sizes (the block's offset in the dump), and
# size is the length of the compressed block.
59 def initialize(number, start, size)
# Running state for the indexing loop:
#   cur_block   - index of the block currently being filled
#   counter     - number used in the "files indexed" progress message
#   buflocation - running total of page text sizes in the current buffer
#   location    - running total of compressed block sizes written so far
#   buffer      - text awaiting compression into the next block
# NOTE(review): no use of `size` is visible in this excerpt.
68 cur_block, counter, buflocation, location, size, buffer = 0, 0, 0, 0, 0, ""
# Base name for all output files; falls back to "default" without ARGV[1].
69 name = (ARGV[1] ? ARGV[1] : "default")
72 puts "Indexing files in #{ARGV[0]}/ and writing the file #{name}.zindex and directory #{name}.zferret."
# NOTE(review): opened without a block and no close is visible in this
# excerpt; also "w" is text mode, which matters for binary data on Windows
# ("wb" would be safer) -- confirm.
73 zdump = File.open("#{name}.zdump", "w")
# File names matching this pattern are skipped; ARGV[2] overrides the default.
74 ignore = ARGV[2] ? Regexp.new(ARGV[2]) : /^(Bilde~|Bruker|Pembicaraan_Pengguna~)/
# Lists the archive named by ARGV[0] with `7za l` and walks the output line by
# line, collecting entries that look like file paths.
# NOTE(review): ARGV[0] is interpolated into the shell command unescaped, so
# spaces or shell metacharacters in the path break (or subvert) the command.
# NOTE(review): String#each exists only in Ruby 1.8; later versions need
# each_line.
76 `7za l #{ARGV[0]}`.each do |line|
# b is the last whitespace-separated field of the line, a is everything before it.
77 a, b = line.rsplit(" ", 2)
# NOTE(review): String#match turns a String argument into a Regexp, so
# match(".") matches any character and is true for every non-empty b; the
# intent looks like "contains a dot" (b.include?(".")) -- confirm.
# NOTE(review): b is nil for a line with a single field; unless a guard sits
# on a line not shown here, b.match would raise NoMethodError.
79 filelist << b if b.match("/") && b.match(".")
# Main loop: extract each listed file from the archive, wrap it in a Webpage,
# append to the buffer, and flush the buffer to the dump as a compressed block
# once it reaches 900000 bytes.
82 #Find.find(ARGV[0]) do |newfile|
83 filelist.each do |newfile|
# NOTE(review): the command printed here includes -y but the one executed on
# the next line does not, so 7za may stop to prompt on overwrite -- confirm
# which is intended.
85 puts "7za x -y #{ARGV[0]} #{newfile}"
# NOTE(review): both values are interpolated into the shell unescaped.
86 `7za x #{ARGV[0]} #{newfile}`
87 # next if File.directory?(newfile) || !File.readable?(newfile)
# NOTE(review): the ignore test runs after extraction, so ignored files are
# still unpacked to disk first.
88 next if newfile =~ ignore
89 wf = Webpage.new(newfile, cur_block, buflocation)
# True exactly when counter is a multiple of 100 (float quotient equals the
# integer quotient).
90 puts "#{counter} files indexed." if counter.to_i / 100.0 == counter / 100
93 buflocation += wf.text.size
# Keep accumulating until the buffer holds at least 900000 bytes.
97 next if buffer.size < 900000
99 bf_compr = ZCompress::compress(buffer)
100 zdump.write(bf_compr)
# Record where this block starts in the dump and how long it is compressed.
101 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
105 location += bf_compr.size
106 puts "Writing block no #{cur_block}"
108 # ZFERRET << {:filename => wf.filename, :content => wf.index_content, :offset => location, :size => wf.compressed_size }
109 # location += wf.compressed_size
113 # to ensure last part of buffer is written
# Same compress / write / record sequence as inside the loop, applied to
# whatever is left in the buffer.
# NOTE(review): this runs even if the buffer is empty -- confirm that
# ZCompress::compress("") and a zero-page final block are acceptable.
114 bf_compr = ZCompress::compress(buffer)
115 zdump.write(bf_compr)
116 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
119 puts "Finished, writing index. #{Time.now - t}"
# One entry per page, keyed by file name: the start offset and compressed size
# of the block the page belongs to (looked up in block_ary by file.block), and
# the page's own offset as recorded in file.buflocation.
123 pages[file.filename] = {:block_start => block_ary[file.block].start,
124 :block_size => block_ary[file.block].size,
125 :start => file.buflocation,
# NOTE(review): no sorting is visible in this excerpt despite the message.
130 puts "Sorted onetime. #{Time.now - t}"
# Groups the packed records into buckets: each page name is hashed with MD5,
# its record is packed, and the record is appended to the bucket selected by
# md5subset of the digest.
131 pages.each_pair do |x, y|
# NOTE(review): MD5.md5 comes from the Ruby 1.8 "md5" library; on later
# versions the equivalent is Digest::MD5.hexdigest(x).
132 md5 = MD5.md5(x).hexdigest
133 entry = pack(md5, y[:block_start], y[:block_size], y[:start], y[:size])
134 firstfour = md5subset(md5)
# Buckets are created lazily as empty strings and grown in place.
135 subindex[firstfour] = "" if subindex[firstfour].nil?
136 subindex[firstfour] << entry
139 puts "Sorted another time. #{Time.now - t}"
# Writes the .zindex file: for every bucket, an 8-byte (offset, size) pair of
# 32-bit little-endian integers is printed, then the bucket's packed records
# are written at that offset.
# NOTE(review): 'w+' is text mode and no close is visible in this excerpt;
# binary mode ('wb+') and a block form would be safer -- confirm.
141 newindex = File.open(name +".zindex",'w+')
143 # p = File.open(name + ".zlog",'w')
# NOTE(review): entry.size below treats each element as a String, which fits
# an Array of buckets; if subindex is a Hash, each_with_index would yield
# [key, value] pairs instead, and if it is a sparse Array, unused slots would
# be nil -- handling for either is not visible here.
144 subindex.each_with_index do |entry, idx|
# Header slot for this bucket; presumably preceded by a seek to idx*8 on a
# line not shown (the commented-out log below refers to it).
147 newindex.print([location, entry.size].pack('V2'))
# Jump to the data area and write the bucket's records there.
148 newindex.seek(location)
149 newindex.print(entry)
151 # p << "*" * 80 << "\n"
152 # p << "seek #{idx*8} location #{location} size #{entry.size}" << "\n"
153 # p << unpack(entry).join(":") << "\n"
# Next bucket's records follow immediately after this one's.
155 location += entry.size
157 puts "Finished. #{Time.now - t}"