htmlshrinker.rb

   1 #!/usr/bin/ruby
   2 # Program to replace commonly used HTML with short tokens and to extract the
   3 # top and bottom parts of pages, which are roughly similar, so that pages can be recomposed at the other end
   4 # By Stian Haklev (shaklev@gmail.com), 2007
   5 # Released under MIT and GPL licenses
   6
   7 require 'htmlshrinker-data'
   8
   9 class HTMLExpander
  10   attr_accessor :before, :after
  11   def initialize(template, archive)
  12     @before, @after = template.split(20.chr)
  13     @before.sub!(/\<title>(.*?)\<\/title>/,'<title>TITLE</title>')
  14     @before.gsub!('./', '/')
  15     @after.gsub!(/href="([^\/])/, 'href="/\1')
  16 #    @before.gsub!(/href="[^.]/, 'href="/\1')
  17     @before.sub!(/\<h1 class\=\"firstHeading\">(.*?)\<\/h1>/, '<h1 class="firstHeading">TITLE</h1>')
  18     @after.sub!(/\<li id="f-credits">(.*?)\<\/li>/, '')
  19   end
  20
  21   def uncompress(text)
  22     title, languages, text = text.split("\n", 3)
  23 #    p languages.split(":")
  24     HTMLShrinker_data::Replacements.each {|x, y| text.gsub!(y, x)}
  25     #gsub(/TITLE/, title).gsub("POINTER", @csstext + @jstext)
  26     return @before.gsub('TITLE', title) + text + @after
  27   end
  28 end
  29
  30 class HTMLShrinker
  31   def compress(text)
  32     if text =~ /\<meta http-equiv=\"Refresh\" content=\"0\;url=(.*?)\" \/\>/
  33       url = url_unescape(Regexp::last_match[1].gsub('../', ''))
  34       return "#R #{url}"
  35     end
  36     title = (text.match(/"firstHeading">(.*?)\<\/h1>/m) ? Regexp::last_match[1] : "Unnamed")
  37     languages = ''
  38     # if text.match(/<div id="p-lang" class="portlet">(.*?)\<\/div>/)
  39     #   languages = Regexp::last_match[1]
  40     #   langs = {}
  41     #   languages.scan(/<a href="(.*?)">/) do |match|
  42     #     match = match[0].gsub("../", "")
  43     #     lang, url = match.split("/",2)
  44     #     langs[lang] = url
  45     #   end
  46     #   languages = langs.to_a.join(":")
  47     #   p languages
  48     # end
  49     text = Regexp::last_match[1] if text.match(/ start content -->(.*?)\<\!-- end content /m)
  50     HTMLShrinker_data::Replacements.each {|x, y| text.gsub!(x, y) }
  51     ZUtil::strip_whitespace(text)
  52     text.gsub!(/<img src=(.*?)>/, "")
  53     return [title, languages, text].join("\n")
  54   end
  55
  56   # takes an example html file, extracts the top and bottom, does some replacements
  57   # - this can later be stored and handed to HTMLShrinker at initialization
  58   def extract_template(text)
  59     before = Regexp::last_match.pre_match if text.match(/<\!-- start content -->/)
  60     after = Regexp::last_match.post_match if text.match(/<\!-- end content -->/)
  61     return [before, after].join(20.chr)
  62   end
  63 end
  64