backup scripts
[slim.amamou.scripts.git] / crawlinx.rb
#!/usr/bin/ruby1.8

require 'net/http'
require 'uri'
require 'rexml/document'

# the page the crawl starts from
initial_url = URI.parse ARGV[0]
# the URL prefix the crawl is confined to
limit_url = URI.parse ARGV[1]

# breadth-first queue of pages left to fetch, seeded with the start page
urls = Array.new
urls << initial_url
# pages already fetched, so links back to them are not queued twice
visited = Array.new
begin
	current_url = urls.shift
	visited << current_url

	# stay on the site: skip pages that are not under the limit URL
	next unless current_url.to_s.index(limit_url.to_s) == 0

	begin
		page = REXML::Document.new(Net::HTTP.get(current_url))
	rescue REXML::ParseException
		puts "XML error in " + current_url.to_s
		next
	end

	# extract the links and queue every URL not seen before
	page.elements.each('//a[@href]') do |anchor|
		begin
			url = URI.parse(anchor.attributes['href'])
		rescue URI::InvalidURIError
			next
		end
		# resolve relative links against the current page
		url = current_url + url if url.relative?
		unless visited.include?(url) or urls.include?(url)
			# print each discovered URL exactly once
			puts url
			urls << url
		end
	end
end while not urls.empty?
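
A plausible invocation, assuming the first argument is the start page and the second the URL prefix the crawl is confined to (both read from ARGV above; the hostname is illustrative):

	ruby1.8 crawlinx.rb http://example.org/index.xhtml http://example.org/

Since pages are parsed with REXML, this works best against sites serving well-formed XHTML; tag-soup HTML will land in the ParseException branch and be skipped.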