updated git and svn scripts
[xrzperl.git] / sinabook_parsesearch
blob b6303196bec59146ce8eff6770fee8468574a4cb
#!/usr/bin/perl -w
###APPNAME: sinabook_parsesearch
###APPAUTHOR: xiaoranzzz
###APPDATE: Tue Mar 11 16:56:49 2008
###APPVER: 0.1
###APPDESC: [cmd] default as sinabook_justtext
###APPUSAGE: URL
###APPOPTION:
use strict;
use warnings;
use URI::Escape;
use HTML::TreeBuilder;
use Data::Dumper;

# Fetches a book-search result page (and every further result page it links
# to), parses the HTML, and prints the href of each <a> found inside a
# <div class="des"> element, one per line, de-duplicated through `sort -u`.
#
# Usage: sinabook_parsesearch URL
#        sinabook_parsesearch -h | --help
#
# External commands used: plhelp (help output), netcat (invoked with the URL
# as its only argument), iconv (gb2312 -> utf8) and sort.

# Quotes $text as a single POSIX-shell word, so that a URL containing quotes
# or other shell metacharacters cannot alter the command line.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Downloads $url through `netcat | iconv` and parses it line by line.
# Returns the finished HTML::TreeBuilder tree and the highest page number
# found in any booksearch.php pagination link (0 when there is none).
# Dies when the pipeline cannot be started.
sub fetch_page {
    my ($url) = @_;

    my $command = "netcat " . shell_quote($url) . " | iconv -f gb2312 -t utf8";
    open my $in, "-|", $command
        or die("Unable to fork netcat | iconv: $!\n");

    my $tree      = HTML::TreeBuilder->new();
    my $last_page = 0;
    while (my $line = <$in>) {
        $tree->parse($line);

        # A single line may hold several pagination links; keep the largest
        # page number instead of whichever link happened to come last.
        while ($line =~ m{/booksearch/booksearch\.php\?page=([0-9]+)&[^"' ]+}g) {
            $last_page = $1 if $1 > $last_page;
        }
    }

    # A failed download is reported but not fatal: whatever was received has
    # already been parsed and is still used.
    close $in or warn("netcat | iconv did not finish cleanly for $url\n");
    $tree->eof;

    return ($tree, $last_page);
}

# With no arguments at all, behave as if help had been requested.
$ARGV[0] = "-h" unless @ARGV;
foreach my $arg (@ARGV) {
    next unless $arg eq "-h" || $arg eq "--help";

    my $status = system("plhelp", $0, @ARGV);

    # system() returns the raw wait status, not the exit code: the code is
    # in the high byte, the terminating signal (if any) in the low 7 bits.
    exit(255) if $status == -1;                 # plhelp could not be started
    exit(($status >> 8) || ($status & 127 ? 1 : 0));
}

my $URL = shift;

# Page 1 is the URL itself; its pagination links tell how many pages exist.
my ($first_tree, $pages) = fetch_page($URL);
my @trees = ($first_tree);
for my $page (2 .. $pages) {
    my ($tree) = fetch_page("$URL&page=$page");
    push @trees, $tree;
}

# All pages are fetched before anything is printed, so a fatal fetch error
# never leaves partial output behind.
open my $out, "|-", "sort -u"
    or die("Unable to fork sort: $!\n");
foreach my $tree (@trees) {
    foreach my $div ($tree->look_down("_tag", "div", "class", "des")) {
        foreach my $node ($div->look_down("_tag", "a")) {
            my $href = $node->attr("href");
            next unless defined $href;          # <a> without href: nothing to print
            print {$out} $href, "\n";
        }
    }

    # HTML::Element trees are explicitly deleted once they have been used.
    $tree->delete();
}
close $out or warn("sort -u did not finish cleanly\n");