Added clivescan utility. Works for most parts (youtube and gvideo).
[clive-utils.git] / clivescan
blob409ee202f05b17a9d86e8036c5bcf7a2010b1e37
1 #!/usr/bin/env perl
2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
5 # Copyright (C) 2008 Toni Gundogdu.
7 # This file is part of clive-utils.
9 # clivescan is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
14 # clivescan is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with clivescan. If not, see <http://www.gnu.org/licenses/>.
21 ###########################################################################
23 # Keep it simple.
25 use warnings;
26 use strict;
28 binmode(STDOUT, ":utf8");
30 use HTML::TokeParser;
31 use WWW::Curl::Easy;
32 use Tk::FontDialog;
33 use Tk::DialogBox;
34 use Config::Tiny;
35 use Tk::Tree;
36 use Tk;
38 # Core modules:
39 use Getopt::Long qw(:config bundling);
40 use Digest::SHA qw(sha1_hex);
41 use File::Spec;
42 use File::Find;
43 use File::Path;
44 use Pod::Usage;
45 use Encode;
46 use Cwd;
48 # Non-essentials
49 my %opted_mods = (Clipboard => 1);
50 eval "use Clipboard"; $opted_mods{Clipboard}=0 if $@;
52 my $VERSION = "2.0beta2";
53 my $CONFIGDIR = $ENV{CLIVESCAN_CONFIGDIR}
54 || File::Spec->catfile($ENV{HOME}, ".config/clivescan");
55 my $CONFIGFILE = File::Spec->catfile($CONFIGDIR, "config");
56 my $PREFSFILE = File::Spec->catfile($CONFIGDIR, "prefs");
58 my %opts; # Holds the options
59 my @queue; # Holds the current URL queue
60 my %found_queue;# Holds the results of the scanned video page links
61 my $curl; # Holds the curl handle (reused throught lifespan)
62 my $mw; # Holds the main window handle (GUI)
63 my $pwmain; # Holds the handle to the main paned window
64 my $pwtop; # Holds the handle to the top paned window
65 my $pwbottom; # Holds the handle to the bottom paned window
66 my $lbtlink; # Holds the handle to the listbox tree of found links
67 my $lbtqueue; # Holds the handle to the listbox tree of queued links
69 # Parse config
70 my $conf = Config::Tiny->read($CONFIGFILE);
71 my $prefs = Config::Tiny->read($PREFSFILE);
72 %opts = (
73 clive => $conf->{clive}->{path},
74 opts => $conf->{clive}->{opts},
75 agent => $conf->{http}->{agent},
76 proxy => $conf->{http}->{proxy},
78 geometry=> $prefs->{gui}->{geometry},
79 pwmain => $prefs->{gui}->{pwmain},
80 pwtop => $prefs->{gui}->{pwtop},
81 pwbottom=> $prefs->{gui}->{pwbottom},
82 mainfont=> $prefs->{gui}->{mainfont},
85 # Parse cmdline
86 # Define those not read from config, init with defaults
87 $opts{quiet} = 0;
88 $opts{paste} = 0;
89 $opts{all} = 0;
90 $opts{debug} = 0;
91 $opts{help} = 0;
92 $opts{manual} = 0;
93 $opts{version} = 0;
94 $opts{mainfont} = $opts{mainfont} || "{helvetica} -12 bold";
96 GetOptions(\%opts,
97 'debug|d', 'help|h', 'manual|m', 'version|v', 'all|a',
98 'paste|x', 'quiet|q', 'clive|c=s', 'opts|o=s', 'agent|U=s',
99 'proxy|y=s',
100 # Workaround since '$longopt|shortopt' is a no-no.
101 'noproxy|X' => sub { $opts{proxy} = ""; },
102 ) or pod2usage(1);
104 # Since 'version|v' => \&print_version and exit cannot tango with tk
105 print_version(0) if $opts{version};
106 pod2usage(-exitstatus => 0, -verbose => 1) if $opts{help};
107 pod2usage(-exitstatus => 0, -verbose => 2) if $opts{manual};
109 $opts{clive} = $opts{clive} || $ENV{CLIVE_PATH};
110 find_clive() unless $opts{clive};
112 get_queue();
114 select STDERR; $| = 1; # => unbuffered
115 select STDOUT; $| = 1;
117 process_queue();
119 unless ( $opts{all} ) { init_gui(); }
120 else { grab_all(); }
123 ## Subroutines: Connection
125 sub init_curl {
126 $curl = WWW::Curl::Easy->new;
127 $curl->setopt(CURLOPT_USERAGENT, $opts{agent} || "Mozilla/5.0");
128 $curl->setopt(CURLOPT_PROXY, $opts{proxy}) if defined $opts{proxy};
129 $curl->setopt(CURLOPT_VERBOSE, 1) if $opts{debug};
130 $curl->setopt(CURLOPT_FOLLOWLOCATION, 1);
131 $curl->setopt(CURLOPT_AUTOREFERER, 1);
132 $curl->setopt(CURLOPT_HEADER, 0);
133 $curl->setopt(CURLOPT_NOBODY, 0);
136 sub fetch_page {
137 my ($url, $resp, $rc) = (shift, 0, 0);
138 open my $fh, ">", \$resp;
140 $curl->setopt(CURLOPT_URL, $url);
141 $curl->setopt(CURLOPT_ENCODING, "");
142 $curl->setopt(CURLOPT_WRITEDATA, $fh);
143 $rc = $curl->perform;
145 return ($rc, $fh, $resp);
149 ## Subroutines: Queue
151 sub get_queue {
152 if ( $opts{paste} ) {
153 print STDERR "error: Clipboard module not found" and exit
154 unless $opted_mods{Clipboard};
155 my $data = Clipboard->paste();
156 if ( $data ) {
157 parse_input($_) foreach split/\n/,$data;
161 parse_input($_) foreach @ARGV;
162 unless ( @queue ) { parse_input($_) while ( <STDIN> ); }
164 my %h = map {$_,1} @queue; # Remove duplicates
165 @queue = keys %h;
168 sub process_queue {
169 init_curl();
170 foreach ( @queue ) {
171 print "Fetching $_ ..." unless $opts{quiet};
172 my ($rc, $fh, $resp, $errmsg) = fetch_page($_);
173 if ( $rc == 0 ) {
174 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);
175 if ( $rc == 0 or $rc == 200 ) {
176 scan_page($_, \$resp);
177 } else {
178 $errmsg = $curl->strerror($rc)." (http/$rc)";
180 } else {
181 $errmsg = $curl->strerror($rc)." (http/$rc)";
183 close $fh;
184 print STDERR "\n==> error: $errmsg\n" if $errmsg;
188 sub scan_page {
189 my ($scanurl, $pageref) = @_;
190 print "done.\n" unless $opts{quiet};
191 $$pageref =~ tr{\n}//d;
193 my $p = HTML::TokeParser->new($pageref);
194 $p->get_tag("title");
195 my $pagetitle = $p->get_trimmed_text;
197 my %re = (
198 Youtube => {
199 url_prefix => "http://youtube.com/watch?v=",
200 search_for => qr|\Q/watch?v=\E(.*?)["< &]|i,
202 YoutubeEmbed => {
203 url_prefix => "http://youtube.com/watch?v=",
204 search_for => qr|\Qyoutube.com/v/\E(.*?)/|i,
206 GVideo => { # NOTE: Ignores original TLD, uses .com for extraction
207 url_prefix => "http://video.google.com/videoplay?docid=",
208 search_for => qr|\Q/googleplayer.swf?docid=\E(.*?)["< &]|i,
210 GVideoEmbed => { # NOTE: Ditto.
211 url_prefix => "http://video.google.com/videoplay?docid=",
212 search_for => qr|\Q/videoplay?docid=\E(.*?)["< &]|i,
216 print "=> Scanning page for links " unless $opts{quiet};
218 sub _scan_progress {
219 my ($linksref, $link) = @_;
220 push @$linksref,$link;
221 if ( scalar (@$linksref) % 5 == 0 ) { print scalar (@$linksref); }
222 else { print "."; }
225 my @links;
226 while ( my $host = each( %re ) ) {
227 _scan_progress(\@links, "$re{$host}{url_prefix}$1")
228 while ( $$pageref =~ /$re{$host}{search_for}/g );
231 my %h = map { $_, 1 } @links; # Weed out duplicates
232 @links = keys %h;
233 #print "$_\n" foreach(@links);
235 print "\n=> Found " .scalar @links. " links after removing duplicates.\n"
236 unless $opts{quiet};
238 my %verified_links;
239 foreach my $link ( @links ) {
240 print "==> Fetching $link ..." unless $opts{quiet};
241 my ($rc, $fh, $resp, $errmsg) = fetch_page($link);
242 if ( $rc == 0 ) {
243 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);
244 if ( $rc == 0 or $rc == 200 ) {
245 print "done.\n" unless $opts{quiet};
246 # Grab title
247 $p = HTML::TokeParser->new(\$resp);
248 $p->get_tag("title");
249 my $title = $p->get_trimmed_text;
250 # Store, prevent link duplicates
251 my $sha1 = sha1_hex($link);
252 $verified_links{$sha1} = {link => $link, title => $title}
253 unless defined $verified_links{$sha1};
254 } else {
255 $errmsg = $curl->strerror($rc)." (http/$rc)";
257 } else {
258 $errmsg = $curl->strerror($rc)." (http/$rc)";
260 close $fh;
261 print STDERR "\n==> error: $errmsg\n" if $errmsg;
264 $found_queue{ sha1_hex($scanurl) } =
265 { title => $pagetitle, url => $scanurl, videos => {%verified_links} };
268 sub grab_all {
269 my @q;
270 for my $i ( keys %found_queue ) {
271 my %videos = %{$found_queue{$i}{videos}};
272 for my $j ( keys %videos ) {
273 push @q, $videos{$j}{link};
276 run_clive(@q);
280 ## Subroutines: Helpers
282 sub parse_input {
283 my $url = shift;
285 return if $url =~ /^$/;
286 chomp $url;
288 $url = "http://$url" if $url !~ m!^http://!i;
289 push @queue, $url;
292 sub find_clive {
293 print "Trying to locate 'clive' ...";
295 find ( sub { $opts{clive} = $File::Find::name if ( $_ eq 'clive' ) },
296 split /:/, $ENV{PATH} || getcwd);
298 if ( $opts{clive} ) { print "$opts{clive}\n"; }
299 else { print STDERR "error: not found, use --clive=path\n"; exit; }
302 sub run_clive {
303 my (@q) = @_;
304 system "$opts{clive} $opts{opts} " . join(' ', @q);
307 sub print_version {
308 my $noexit = shift;
309 my $perl_v = sprintf "%vd", $^V;
310 my $clipb_v = $opted_mods{Clipboard} ? $Clipboard::VERSION : "-";
311 my $s = sprintf
312 "clivescan version $VERSION. Copyright (C) 2008 Toni Gundogdu.
314 Perl: $perl_v ($^O)
315 Modules:
316 * Config::Tiny/$Config::Tiny::VERSION\t\t* WWW::Curl/$WWW::Curl::VERSION
317 * Tk/$Tk::VERSION\t\t\t* Tk::Tree/$Tk::Tree::VERSION
318 * Tk::DialogBox/$Tk::DialogBox::VERSION\t\t* Clipboard/$clipb_v
319 * Tk::FontDialog/$Tk::FontDialog::VERSION
320 Core modules:
321 * Getopt::Long/$Getopt::Long::VERSION\t\t* Digest::SHA/$Digest::SHA::VERSION
322 * File::Spec/$File::Spec::VERSION\t\t* File::Find/$File::Find::VERSION
323 * File::Path/$File::Path::VERSION\t\t* Encode/$Encode::VERSION
324 * Pod::Usage/$Pod::Usage::VERSION\t\t* Cwd/$Cwd::VERSION
326 This program comes with ABSOLUTELY NO WARRANTY. You may redistribute copies of
327 clivescan under the terms of the GNU General Public License as published by the
328 Free Software Foundation, either version 3 of the License, or (at your option)
329 any later version. You should have received a copy of the General Public License
330 along with this program. If not, see http://www.gnu.org/licenses/.
332 return $s if $noexit;
333 print $s; exit;
337 # GUI:
339 sub init_gui {
340 return unless %found_queue;
342 $mw = MainWindow->new;
343 $mw->geometry($opts{geometry}) if defined $opts{geometry};
344 $mw->title('clivescan');
345 $mw->protocol('WM_DELETE_WINDOW', sub { save_prefs(); exit; });
347 # Menubar
348 my $mb = $mw->Menu;
349 $mw->configure(-menu => $mb);
351 # Menu: File
352 my $file = $mb->cascade(-label => 'File', -underline => 0, -tearoff => 0);
353 $file->command(-label => 'Extract videos in queue...',
354 -underline => 0, -command => \&on_extract);
355 $file->separator;
356 $file->command(-label => 'Quit', -underline => 0,
357 -command => sub { save_prefs(); exit; } );
359 # Menu: Edit
360 my $edit = $mb->cascade(-label => 'Edit', -underline => 0, -tearoff => 0);
361 $edit->command(-label => 'Preferences...',
362 -underline => 0, -command => \&on_prefs);
364 # Menu: Help
365 my $help = $mb->cascade(-label => 'Help', -underline => 0, -tearoff => 0);
366 $help->command(-label => 'About...',
367 -underline => 0, -command => \&on_about);
369 # The GUI has an upper and a lower part
370 $pwmain = $mw->Panedwindow(-orient => 'v', -opaqueresize => 0);
372 # Upper part
373 $pwtop = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
375 # Upper: Channels
376 my $lbar = $pwtop->Frame;
378 $lbtlink = $lbar->Scrolled('Tree',
379 -scrollbars => 'osoe',
380 -itemtype => 'text',
381 -selectmode => 'extended',
382 -indicator => 1,
383 -drawbranch => 1,
384 )->pack(-side => 'top', -expand => 1, -fill => 'both');
386 for my $i ( keys %found_queue ) {
387 my $scantitle = $found_queue{$i}{title};
388 $scantitle =~ tr{.}//d;
390 $lbtlink->add($scantitle);
391 $lbtlink->itemCreate($scantitle, 0, -text => $scantitle, -itemtype => 'text');
393 for my $j ( keys %{$found_queue{$i}{videos}} ) {
394 my %video = %{$found_queue{$i}{videos}{$j}};
396 my $title = $video{title};
397 $title =~ tr{.}//d;
399 my $path = "$scantitle.$title";
401 $lbtlink->add($path, -data => {%video});
402 $lbtlink->itemCreate($path, 0,
403 -text => $title, -itemtype => 'text');
406 $lbtlink->autosetmode;
407 $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );
409 my $rbar = $pwtop->Frame; # Button toolbar
410 $rbar->Button(-text => 'Grab', -command => \&on_grab
411 )->pack(-fill => 'x');
413 $rbar->Button(-text => 'Grab everything', -command => \&on_grab_all
414 )->pack(-fill => 'x');
416 $pwtop->add($lbar, $rbar, -width => $opts{pwtop} || 200);
418 # Lower part
419 $pwbottom = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
421 $lbtqueue = $pwbottom->Scrolled('Tree',
422 -scrollbars => 'osoe',
423 -itemtype => 'text',
424 -selectmode => 'extended',
425 -indicator => 1,
426 -drawbranch => 1,
429 my $bar = $pwbottom->Frame; # Button toolbar
431 $bar->Button(-text => 'Remove', -command => \&on_remove
432 )->pack(-fill => 'x');
434 $bar->Button(-text => 'Clear', -command => \&on_clear
435 )->pack(-fill => 'x');
437 $bar->Button(-text => 'Extract videos...', -command => \&on_extract
438 )->pack(-fill => 'x', -side => 'bottom');
440 $pwbottom->add($lbtqueue, $bar, -width => $opts{pwbottom} || 200);
442 # Add upper and lower parts to main paned window
443 $pwmain->add($pwtop, $pwbottom, -height => $opts{pwmain} || 200);
445 $mw->RefontTree(-font => $opts{mainfont});
446 $pwmain->pack(-expand => 1, -fill => 'both');
448 MainLoop;
451 sub save_prefs {
452 mkpath( [$CONFIGDIR], 1, 0700 );
454 my $c = Config::Tiny->new;
455 $c->{gui}->{geometry} = $mw->geometry();
456 $c->{gui}->{pwmain} = ($pwmain->sashCoord(0))[1]-7;
457 $c->{gui}->{pwtop} = ($pwtop->sashCoord(0))[0]-7;
458 $c->{gui}->{pwbottom} = ($pwbottom->sashCoord(0))[0]-7;
459 $c->{gui}->{mainfont} = $opts{mainfont};
461 $c->write($PREFSFILE);
464 sub on_prefs_ok {
465 ($opts{mainfont}) = @_;
466 $mw->RefontTree(-font => $opts{mainfont});
467 save_prefs();
470 sub queue_item {
471 my $path = shift;
472 return if $path !~ /\./;
473 return if $lbtqueue->infoExists($path);
475 my %video = %{$lbtlink->infoData($path)};
476 my ($link) = split /\./, $path;
478 unless ( $lbtqueue->infoExists($link) ) {
479 $lbtqueue->add($link);
480 $lbtqueue->itemCreate($link, 0,
481 -text => $link, -itemtype => 'text');
484 $lbtqueue->add($path, -data => {%video});
485 $lbtqueue->itemCreate($path, 0,
486 -text => $video{title}, -itemtype => 'text');
489 sub on_grab {
490 queue_item($_) foreach ( $lbtlink->infoSelection );
491 $lbtqueue->autosetmode;
494 sub on_grab_all {
495 foreach ( $lbtlink->infoChildren("") ) {
496 my ($parent) = split /\./;
497 queue_item($_)
498 foreach ($lbtlink->infoChildren($parent) );
500 $lbtqueue->autosetmode;
503 sub on_remove {
504 $lbtqueue->deleteEntry($_)
505 foreach ( $lbtqueue->infoSelection );
508 sub on_clear {
509 $lbtqueue->deleteAll;
512 sub on_about {
513 my $dlg = $mw->DialogBox(-title => 'About', -buttons => ['OK']);
514 my $txt = $dlg->add('Text')->pack;
515 $txt->insert('end', print_version(1));
516 $dlg->Show;
519 sub change_font {
520 my ($top, $lblv, $lbl) = @_;
521 my $font = $top->FontDialog(-initfont => $$lblv)->Show;
523 if ( defined $font ) {
524 my $descr = $top->FontDialog->GetDescriptiveFontName($font);
525 $lbl->configure(-font => $descr);
526 $$lblv = $descr;
530 sub on_prefs {
531 my $dlg = $mw->DialogBox(-title => 'clivescan preferences',
532 -buttons => ['OK','Cancel']);
534 $dlg->add('Label', -text => 'Fonts: press to choose'
535 )->grid(-sticky => 'w', -pady => 10);
537 my ($mainfont) = ($opts{mainfont});
538 my $mainfontl = $dlg->Label(-textvariable => \$mainfont);
540 $dlg->add('Button', -text => 'Main font',
541 -command => sub { change_font($dlg, \$mainfont, $mainfontl) }
542 )->grid($mainfontl, -sticky => 'w', -padx => '5');
544 on_prefs_ok($mainfont) if $dlg->Show eq 'OK';
547 sub on_extract {
548 my @q;
549 foreach ( $lbtqueue->infoChildren('') ) {
550 foreach ( $lbtqueue->infoChildren($_) ) {
551 my %video = %{$lbtqueue->infoData($_)};
552 push @q, $video{link};
555 return unless @q;
557 # Prompt for clive(1) options
558 my $dlg = $mw->DialogBox(-title => 'clive(1) options',
559 -buttons => ['OK','Cancel']);
561 $dlg->add('Label', -text => 'Path to clive'
562 )->grid(my $clivepath = $dlg->Entry(-width => 25),
563 -sticky => 'w', -padx => '5');
565 $dlg->add('Label', -text => 'Runtime options'
566 )->grid(my $cliveopts = $dlg->Entry(-width => 25),
567 -sticky => 'w', -padx => '5');
569 $clivepath->insert('end', $opts{clive});
570 $cliveopts->insert('end', $opts{opts});
572 if ( $dlg->Show() eq 'OK' ) {
573 $opts{clive} = $clivepath->get;
574 $opts{opts} = $cliveopts->get;
575 $mw->destroy;
576 run_clive(@q);