updated git and svn scripts
[xrzperl.git] / thread_persist_content
blob57f8ba7560d505a6d5fbfdfd6f2b843451e04d13
1 #!/usr/bin/perl -w
2 ###APPNAME: thread_persist_content
3 ###APPAUTHOR: duel
4 ###APPDATE: 2009-01-29 02:50:08
5 ###APPVER: 0.1
6 ###APPDESC: thread_persist_content
7 ###APPUSAGE: [urls|files]
8 ###APPEXAMPLE: thread_persist_content
9 ###APPOPTION:
10 use strict;
12 #ENV variable MUST be defined somewhere,
13 #FOR perl to search modules from,
14 #OR nothing will work
15 use lib $ENV{XR_PERL_MODULE_DIR};
17 use MyPlace::Script::Usage qw/help_required help_even_empty/;
18 exit 0 if(help_required($0,@ARGV));
19 #exit 0 if(help_even_empty($0,@ARGV));
21 use HTML::TreeBuilder;
22 use MyPlace::HTML::Convertor;
23 use Encode qw/decode/;
24 binmode STDERR,"utf8";
26 my $body_id = shift;
27 my $user = shift;
28 my $pass = shift;
29 my @files = @ARGV;
30 unless(@files) {while(<>){chomp;push @files,$_;}}
32 die("No files specified\n") unless(@files);
34 sub get_title {
35 my $tree = shift;
36 my ($title) = $tree->look_down("_tag","title");
37 if($title) {
38 return $title->as_text();
40 return undef
43 sub uniq_filename($$) {
44 my $base=shift;
45 my $ext=shift;
46 my $inc = "";
47 while(-f "$base$inc$ext") {
48 $inc = $inc ? $inc + 1 : 1;
50 return "$base$inc$ext";
53 sub download_url {
54 my($url,$file) = @_;
55 return 1 if(-f $file);
56 open FI,"-|","netcat_autologin",$url,$user,$pass or return undef;
57 open FO,">",$file or return undef;
58 print FO <FI>;
59 close FO;
60 return 1;
63 my %dir_check;
64 sub mkdir_check {
65 my $dir = shift;
66 return 1 if($dir_check{$dir});
67 unless(-d $dir) {
68 unless(mkdir $dir) {
69 print STDERR "$!\n";return 0;
71 $dir_check{$dir}=1;
73 else {
74 $dir_check{$dir}=1;
76 return 1;
79 my $idx = 1;
80 my $count = @files;
81 foreach my $file (@files) {
82 print STDERR "[$idx/$count] Process $file ...\n";
83 $idx++;
84 if($file =~ /^http:\/\//i) {
85 my $filename = $file;
86 $filename =~ s/^.*\///;
87 $filename ||= "index.html";
88 mkdir_check("src") or die();
89 $filename = "src/$filename";
90 if(-f $filename) {
91 goto download_end;
93 print STDERR "Downloading $file->$filename ...\n";
94 if(download_url($file,$filename)) {
95 # print STDERR "[OK]\n";
97 else {
98 unlink $filename if(-f $filename);
99 # print STDERR "[Failed]\n";
101 download_end:
102 $file = $filename;
104 unless (-f $file) {
105 print STDERR "File not exists : $file\t[Skipped]\n";
106 next;
108 my @data;
109 open FI,"<",$file;
110 while(<FI>) {
111 push @data,decode("gbk",$_);
113 close FI;
114 my $tree=HTML::TreeBuilder->new_from_content(@data);
115 my $title = get_title($tree);
116 $title =~ s/^\s+//;
117 $title =~ s/\s+$//;
118 unless($title) {
119 $title = $file;
120 $title =~ s/.*\///;
121 $title =~ s/\..*$//g;
123 $title =~ s/\[.*$//g;
124 $title =~ s/5u.*//g;
125 $title =~ s/[\/\\\!\*\+]//g;
126 my $body;
127 if($body_id) {
128 ($body) = $tree->look_down("id",$body_id);
129 ($body) = $tree->look_down("class",$body_id) unless($body);
131 $body = $tree unless($body);
132 print STDERR "Found Content Title: $title ";
133 my @images = $body->look_down(_tag=>"img",src=>qr/\.jpg$/i);
134 if(@images) {
135 print STDERR "Persist As Images\n";
136 mkdir_check("img") or die();
137 my $imgd = "img/$title";
138 mkdir_check($imgd) or next;
139 chdir $imgd;
140 open FO,"|-","batchget" or die("$!\n");
141 print FO $_->attr('src'),"\n" foreach(@images);
142 close FO;
143 chdir "../../";
145 else {
146 mkdir_check("txt") or die();
147 my $dst = "txt/$title.txt";
148 print STDERR "Persist As Text -> $dst ...\t";
149 if(-f $dst) {
150 print STDERR "[Skipped(File exists)]\n";
151 next;
153 open FO,">:utf8",$dst or die("$!\n");
154 my $text = text_from_node($body);
155 print FO $title,"\n","\n";
156 print FO @{$text},"\n" if($text);
157 close FO;
158 print STDERR "[OK]\n";
160 next unless($tree);
161 $tree->delete();