2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
5 # Copyright (C) 2008 Toni Gundogdu.
7 # This file is part of clive-utils.
9 # clivescan is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
14 # clivescan is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with clivescan. If not, see <http://www.gnu.org/licenses/>.
21 ###########################################################################
28 binmode(STDOUT
, ":utf8");
39 use Getopt
::Long
qw(:config bundling);
40 use Digest
::SHA
qw(sha1_hex);
49 my %opted_mods = (Clipboard
=> 1);
50 eval "use Clipboard"; $opted_mods{Clipboard
}=0 if $@
;
52 my $VERSION = "2.0beta2";
53 my $CONFIGDIR = $ENV{CLIVESCAN_CONFIGDIR
}
54 || File
::Spec
->catfile($ENV{HOME
}, ".config/clivescan");
55 my $CONFIGFILE = File
::Spec
->catfile($CONFIGDIR, "config");
56 my $PREFSFILE = File
::Spec
->catfile($CONFIGDIR, "prefs");
58 my %opts; # Holds the options
59 my @queue; # Holds the current URL queue
60 my %found_queue;# Holds the results of the scanned video page links
61 my $curl; # Holds the curl handle (reused throught lifespan)
62 my $mw; # Holds the main window handle (GUI)
63 my $pwmain; # Holds the handle to the main paned window
64 my $pwtop; # Holds the handle to the top paned window
65 my $pwbottom; # Holds the handle to the bottom paned window
66 my $lbtlink; # Holds the handle to the listbox tree of found links
67 my $lbtqueue; # Holds the handle to the listbox tree of queued links
70 my $conf = Config
::Tiny
->read($CONFIGFILE);
71 my $prefs = Config
::Tiny
->read($PREFSFILE);
73 clive
=> $conf->{clive
}->{path
},
74 opts
=> $conf->{clive
}->{opts
},
75 agent
=> $conf->{http
}->{agent
},
76 proxy
=> $conf->{http
}->{proxy
},
78 geometry
=> $prefs->{gui
}->{geometry
},
79 pwmain
=> $prefs->{gui
}->{pwmain
},
80 pwtop
=> $prefs->{gui
}->{pwtop
},
81 pwbottom
=> $prefs->{gui
}->{pwbottom
},
82 mainfont
=> $prefs->{gui
}->{mainfont
},
86 # Define those not read from config, init with defaults
94 $opts{mainfont
} = $opts{mainfont
} || "{helvetica} -12 bold";
97 'debug|d', 'help|h', 'manual|m', 'version|v', 'all|a',
98 'paste|x', 'quiet|q', 'clive|c=s', 'opts|o=s', 'agent|U=s',
100 # Workaround since '$longopt|shortopt' is a no-no.
101 'noproxy|X' => sub { $opts{proxy
} = ""; },
104 # Since 'version|v' => \&print_version and exit cannot tango with tk
105 print_version
(0) if $opts{version
};
106 pod2usage
(-exitstatus
=> 0, -verbose
=> 1) if $opts{help
};
107 pod2usage
(-exitstatus
=> 0, -verbose
=> 2) if $opts{manual
};
109 $opts{clive
} = $opts{clive
} || $ENV{CLIVE_PATH
};
110 find_clive
() unless $opts{clive
};
114 select STDERR
; $| = 1; # => unbuffered
115 select STDOUT
; $| = 1;
119 unless ( $opts{all
} ) { init_gui
(); }
123 ## Subroutines: Connection
126 $curl = WWW
::Curl
::Easy
->new;
127 $curl->setopt(CURLOPT_USERAGENT
, $opts{agent
} || "Mozilla/5.0");
128 $curl->setopt(CURLOPT_PROXY
, $opts{proxy
}) if defined $opts{proxy
};
129 $curl->setopt(CURLOPT_VERBOSE
, 1) if $opts{debug
};
130 $curl->setopt(CURLOPT_FOLLOWLOCATION
, 1);
131 $curl->setopt(CURLOPT_AUTOREFERER
, 1);
132 $curl->setopt(CURLOPT_HEADER
, 0);
133 $curl->setopt(CURLOPT_NOBODY
, 0);
137 my ($url, $resp, $rc) = (shift, 0, 0);
138 open my $fh, ">", \
$resp;
140 $curl->setopt(CURLOPT_URL
, $url);
141 $curl->setopt(CURLOPT_ENCODING
, "");
142 $curl->setopt(CURLOPT_WRITEDATA
, $fh);
143 $rc = $curl->perform;
145 return ($rc, $fh, $resp);
149 ## Subroutines: Queue
152 if ( $opts{paste
} ) {
153 print STDERR
"error: Clipboard module not found" and exit
154 unless $opted_mods{Clipboard
};
155 my $data = Clipboard
->paste();
157 parse_input
($_) foreach split/\n/,$data;
161 parse_input
($_) foreach @ARGV;
162 unless ( @queue ) { parse_input
($_) while ( <STDIN
> ); }
164 my %h = map {$_,1} @queue; # Remove duplicates
171 print "Fetching $_ ..." unless $opts{quiet
};
172 my ($rc, $fh, $resp, $errmsg) = fetch_page
($_);
174 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE
);
175 if ( $rc == 0 or $rc == 200 ) {
176 scan_page
($_, \
$resp);
178 $errmsg = $curl->strerror($rc)." (http/$rc)";
181 $errmsg = $curl->strerror($rc)." (http/$rc)";
184 print STDERR
"\n==> error: $errmsg\n" if $errmsg;
189 my ($scanurl, $pageref) = @_;
190 print "done.\n" unless $opts{quiet
};
191 $$pageref =~ tr
{\n}//d;
193 my $p = HTML
::TokeParser
->new($pageref);
194 $p->get_tag("title");
195 my $pagetitle = $p->get_trimmed_text;
199 url_prefix
=> "http://youtube.com/watch?v=",
200 search_for
=> qr
|\Q
/watch?v
=\E
(.*?
)["< &]|i,
203 url_prefix => "http
://youtube
.com
/watch?v
=",
204 search_for => qr|\Qyoutube.com/v/\E(.*?)$|i,
206 GVideo => { # NOTE: Ignores original TLD, uses .com for extraction
207 url_prefix => "http
://video
.google
.com
/videoplay?docid
=",
208 search_for => qr|\Q/googleplayer.swf?docid=\E(.*?)["< &]|i
,
210 GVideoEmbed
=> { # NOTE: Ditto.
211 url_prefix
=> "http://video.google.com/videoplay?docid=",
212 search_for
=> qr
|\Q
/videoplay?docid
=\E
(.*?
)$|i
,
214 Metacafe
=> { # NOTE: metacafe.com/watch/$id is enough for redirect
215 url_prefix
=> "http://metacafe.com/watch/",
216 search_for
=> qr
|\Q
/watch/\E
(.*?
)/|i
,
219 url_prefix
=> "http://metacafe.com/watch/",
220 search_for
=> qr
|\Qmetacafe
.com
/fplayer
/\E
(.*?
)/|i
,
222 SevenLoad
=> { # NOTE: Ditto. Subdomain can be ignored.
223 url_prefix
=> "http://sevenload.com/videos/",
224 search_for
=> qr
|\Q
/videos/\E
(.*?
)\
-|i
,
227 url_prefix
=> "http://sevenload.com/videos/",
228 search_for
=> qr
|\Qsevenload
.com
/pl
/\E
(.*?
)/|i
,
231 url_prefix
=> "http://break.com/index/",
232 search_for
=> qr
|\Q
/index/\E
(.*?
)["< &]|i,
234 # TODO: add BreakEmbed, e.g.:
235 # Page URL: http://break.com/index/if-all-movies-had-cell-phones.html
236 # Embed URL: http://embed.break.com/600081
239 print "=> Scanning page
for links
" unless $opts{quiet};
242 my ($linksref, $link) = @_;
243 push @$linksref,$link;
244 if ( scalar (@$linksref) % 5 == 0 ) { print scalar (@$linksref); }
249 while ( my $host = each( %re ) ) {
250 _scan_progress(\@links, "$re{$host}{url_prefix
}$1")
251 while ( $$pageref =~ /$re{$host}{search_for}/g );
254 my %h = map { $_, 1 } @links; # Weed out duplicates
256 #print "$_\n" foreach(@links);
258 print "\n=> Found
" .scalar @links. " links after removing duplicates
.\n"
262 foreach my $link ( @links ) {
263 print "==> Fetching
$link ..." unless $opts{quiet};
264 my ($rc, $fh, $resp, $errmsg) = fetch_page($link);
266 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);
267 if ( $rc == 0 or $rc == 200 ) {
268 print "done
.\n" unless $opts{quiet};
270 $p = HTML::TokeParser->new(\$resp);
271 $p->get_tag("title
");
272 my $title = $p->get_trimmed_text;
273 # Store, prevent link duplicates
274 my $sha1 = sha1_hex($link);
275 $verified_links{$sha1} = {link => $link, title => $title}
276 unless defined $verified_links{$sha1};
278 $errmsg = $curl->strerror($rc)." (http
/$rc)";
281 $errmsg = $curl->strerror($rc)." (http
/$rc)";
284 print STDERR "\n==> error
: $errmsg\n" if $errmsg;
287 $found_queue{ sha1_hex($scanurl) } =
288 { title => $pagetitle, url => $scanurl, videos => {%verified_links} };
293 for my $i ( keys %found_queue ) {
294 my %videos = %{$found_queue{$i}{videos}};
295 for my $j ( keys %videos ) {
296 push @q, $videos{$j}{link};
303 ## Subroutines: Helpers
308 return if $url =~ /^$/;
311 $url = "http
://$url" if $url !~ m!^http://!i;
316 print "Trying to locate
'clive' ...";
318 find ( sub { $opts{clive} = $File::Find::name if ( $_ eq 'clive' ) },
319 split /:/, $ENV{PATH} || getcwd);
321 if ( $opts{clive} ) { print "$opts{clive
}\n"; }
322 else { print STDERR "error
: not found
, use --clive
=path
\n"; exit; }
327 system "$opts{clive
} $opts{opts
} " . join(' ', @q);
332 my $perl_v = sprintf "%vd", $^V;
333 my $clipb_v = $opted_mods{Clipboard} ? $Clipboard::VERSION : "-";
335 "clivescan version
$VERSION. Copyright
(C
) 2008 Toni Gundogdu
.
339 * Config
::Tiny
/$Config::Tiny::VERSION\t\t* WWW::Curl/$WWW::Curl
::VERSION
340 * Tk
/$Tk::VERSION\t\t\t* Tk::Tree/$Tk::Tree
::VERSION
341 * Tk
::DialogBox
/$Tk::DialogBox::VERSION\t\t* Clipboard/$clipb_v
342 * Tk
::FontDialog
/$Tk::FontDialog
::VERSION
344 * Getopt
::Long
/$Getopt::Long::VERSION\t\t* Digest::SHA/$Digest::SHA
::VERSION
345 * File
::Spec
/$File::Spec::VERSION\t\t* File::Find/$File::Find
::VERSION
346 * File
::Path
/$File::Path::VERSION\t\t* Encode/$Encode::VERSION
347 * Pod
::Usage
/$Pod::Usage::VERSION\t\t* Cwd/$Cwd::VERSION
349 This program comes with ABSOLUTELY NO WARRANTY
. You may redistribute copies of
350 clivescan under the terms of the GNU General Public License as published by the
351 Free Software Foundation
, either version
3 of the License
, or (at your option
)
352 any later version
. You should have received a copy of the General Public License
353 along with this program
. If
not, see http
://www
.gnu
.org
/licenses/.
355 return $s if $noexit;
363 return unless %found_queue;
365 $mw = MainWindow->new;
366 $mw->geometry($opts{geometry}) if defined $opts{geometry};
367 $mw->title('clivescan');
368 $mw->protocol('WM_DELETE_WINDOW', sub { save_prefs(); exit; });
372 $mw->configure(-menu => $mb);
375 my $file = $mb->cascade(-label => 'File', -underline => 0, -tearoff => 0);
376 $file->command(-label => 'Extract videos in queue...',
377 -underline => 0, -command => \&on_extract);
379 $file->command(-label => 'Quit', -underline => 0,
380 -command => sub { save_prefs(); exit; } );
383 my $edit = $mb->cascade(-label => 'Edit', -underline => 0, -tearoff => 0);
384 $edit->command(-label => 'Preferences...',
385 -underline => 0, -command => \&on_prefs);
388 my $help = $mb->cascade(-label => 'Help', -underline => 0, -tearoff => 0);
389 $help->command(-label => 'About...',
390 -underline => 0, -command => \&on_about);
392 # The GUI has an upper and a lower part
393 $pwmain = $mw->Panedwindow(-orient => 'v', -opaqueresize => 0);
396 $pwtop = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
399 my $lbar = $pwtop->Frame;
401 $lbtlink = $lbar->Scrolled('Tree',
402 -scrollbars => 'osoe',
404 -selectmode => 'extended',
407 )->pack(-side => 'top', -expand => 1, -fill => 'both');
409 for my $i ( keys %found_queue ) {
410 my $scantitle = $found_queue{$i}{title};
411 $scantitle =~ tr{.}//d;
413 $lbtlink->add($scantitle);
414 $lbtlink->itemCreate($scantitle, 0, -text => $scantitle, -itemtype => 'text');
416 for my $j ( keys %{$found_queue{$i}{videos}} ) {
417 my %video = %{$found_queue{$i}{videos}{$j}};
419 my $title = $video{title};
422 my $path = "$scantitle.$title";
424 $lbtlink->add($path, -data => {%video});
425 $lbtlink->itemCreate($path, 0,
426 -text => $title, -itemtype => 'text');
429 $lbtlink->autosetmode;
430 $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );
432 my $rbar = $pwtop->Frame; # Button toolbar
433 $rbar->Button(-text => 'Grab', -command => \&on_grab
434 )->pack(-fill => 'x');
436 $rbar->Button(-text => 'Grab everything', -command => \&on_grab_all
437 )->pack(-fill => 'x');
439 $pwtop->add($lbar, $rbar, -width => $opts{pwtop} || 200);
442 $pwbottom = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
444 $lbtqueue = $pwbottom->Scrolled('Tree',
445 -scrollbars => 'osoe',
447 -selectmode => 'extended',
452 my $bar = $pwbottom->Frame; # Button toolbar
454 $bar->Button(-text => 'Remove', -command => \&on_remove
455 )->pack(-fill => 'x');
457 $bar->Button(-text => 'Clear', -command => \&on_clear
458 )->pack(-fill => 'x');
460 $bar->Button(-text => 'Extract videos...', -command => \&on_extract
461 )->pack(-fill => 'x', -side => 'bottom');
463 $pwbottom->add($lbtqueue, $bar, -width => $opts{pwbottom} || 200);
465 # Add upper and lower parts to main paned window
466 $pwmain->add($pwtop, $pwbottom, -height => $opts{pwmain} || 200);
468 $mw->RefontTree(-font => $opts{mainfont});
469 $pwmain->pack(-expand => 1, -fill => 'both');
475 mkpath( [$CONFIGDIR], 1, 0700 );
477 my $c = Config::Tiny->new;
478 $c->{gui}->{geometry} = $mw->geometry();
479 $c->{gui}->{pwmain} = ($pwmain->sashCoord(0))[1]-7;
480 $c->{gui}->{pwtop} = ($pwtop->sashCoord(0))[0]-7;
481 $c->{gui}->{pwbottom} = ($pwbottom->sashCoord(0))[0]-7;
482 $c->{gui}->{mainfont} = $opts{mainfont};
484 $c->write($PREFSFILE);
488 ($opts{mainfont}) = @_;
489 $mw->RefontTree(-font => $opts{mainfont});
495 return if $path !~ /\./;
496 return if $lbtqueue->infoExists($path);
498 my %video = %{$lbtlink->infoData($path)};
499 my ($link) = split /\./, $path;
501 unless ( $lbtqueue->infoExists($link) ) {
502 $lbtqueue->add($link);
503 $lbtqueue->itemCreate($link, 0,
504 -text => $link, -itemtype => 'text');
507 $lbtqueue->add($path, -data => {%video});
508 $lbtqueue->itemCreate($path, 0,
509 -text => $video{title}, -itemtype => 'text');
513 queue_item($_) foreach ( $lbtlink->infoSelection );
514 $lbtqueue->autosetmode;
518 foreach ( $lbtlink->infoChildren("") ) {
519 my ($parent) = split /\./;
521 foreach ($lbtlink->infoChildren($parent) );
523 $lbtqueue->autosetmode;
527 $lbtqueue->deleteEntry($_)
528 foreach ( $lbtqueue->infoSelection );
532 $lbtqueue->deleteAll;
536 my $dlg = $mw->DialogBox(-title => 'About', -buttons => ['OK']);
537 my $txt = $dlg->add('Text')->pack;
538 $txt->insert('end', print_version(1));
543 my ($top, $lblv, $lbl) = @_;
544 my $font = $top->FontDialog(-initfont => $$lblv)->Show;
546 if ( defined $font ) {
547 my $descr = $top->FontDialog->GetDescriptiveFontName($font);
548 $lbl->configure(-font => $descr);
554 my $dlg = $mw->DialogBox(-title => 'clivescan preferences',
555 -buttons => ['OK','Cancel']);
557 $dlg->add('Label', -text => 'Fonts: press to choose'
558 )->grid(-sticky => 'w', -pady => 10);
560 my ($mainfont) = ($opts{mainfont});
561 my $mainfontl = $dlg->Label(-textvariable => \$mainfont);
563 $dlg->add('Button', -text => 'Main font',
564 -command => sub { change_font($dlg, \$mainfont, $mainfontl) }
565 )->grid($mainfontl, -sticky => 'w', -padx => '5');
567 on_prefs_ok($mainfont) if $dlg->Show eq 'OK';
572 foreach ( $lbtqueue->infoChildren('') ) {
573 foreach ( $lbtqueue->infoChildren($_) ) {
574 my %video = %{$lbtqueue->infoData($_)};
575 push @q, $video{link};
580 # Prompt for clive(1) options
581 my $dlg = $mw->DialogBox(-title => 'clive(1) options',
582 -buttons => ['OK','Cancel']);
584 $dlg->add('Label', -text => 'Path to clive'
585 )->grid(my $clivepath = $dlg->Entry(-width => 60),
586 -sticky => 'w', -padx => '5');
588 $dlg->add('Label', -text => 'Runtime options'
589 )->grid(my $cliveopts = $dlg->Entry(-width => 60),
590 -sticky => 'w', -padx => '5');
592 $clivepath->insert('end', $opts{clive});
593 $cliveopts->insert('end', $opts{opts});
595 if ( $dlg->Show() eq 'OK' ) {
596 $opts{clive} = $clivepath->get;
597 $opts{opts} = $cliveopts->get;
607 clivescan - the video link scanning utility for clive
611 clivescan [option]... [URL]...
615 clivescan is an utility that scans video pages for video links and
616 uses L<clive(1)> to extract them. The utility scans for video page
617 and embedded video links.
619 Historically, the video link scanning function was part of L<clive(1)>
620 and it was written in Python/Newt. The clivescan utility was written
621 in Perl/Tk to replace the feature that was removed in clive 2.0. This
622 utility is part of the B<clive-utils> project.
626 You may freely specify options after the command-line arguments. For example:
628 clivescan -a URL --opts=--noextract
640 Show version and exit.
642 =item B<--clive=>I<path>
644 I<path> to L<clive(1)> command. If unspecified, clivescan will attempt to
645 locate it in the $PATH. Additionally, the B<CLIVE_PATH> environment variable
646 can be used. See also L</CONFIG>.
648 =item B<--opts=>I<opts>
650 I<opts> to append to clive call. See L<clive(1)> for more on the available
655 Grab all videos without prompting the GUI.
663 =item B<-U --agent=>I<string>
665 Identify as I<string> to the HTTP server. Defaults to "Mozilla
/5.0".
667 =item B<-y --proxy=>I<address>
669 Use I<address> for HTTP proxy, e.g. http://foo:1234. If http_proxy
670 environment variable is defined, it will be used.
672 =item B<-X --noproxy>
674 Do not use the defined HTTP proxy (B<--proxy>, config or http_proxy).
682 =item % clivescan youtube.com video.google.com
684 Scans both, Youtube and GoogleVideo front pages for video links.
686 =item % cat E<gt>E<gt> url.lst
688 http://video.google.com
689 http://youtube.com/communitychannel
693 =item % cat url.lst | clivescan
695 Reads input from UNIX pipe.
697 =item % clivescan --opts="-f mp4
"
699 Appends the I<opts> to the L<clive(1)> call.
701 =item % clivescan --all http://youtube.com
703 Grabs all found videos from the Youtube front page.
709 By default, clivescan searches the ~/.config/clivescan directory for the
710 config file. The B<CLIVESCAN_CONFIGDIR> environment variable can be used
711 to override this behaviour.
715 =item ~/.config/clivescan/config
719 =item ~/.config/clivescan/prefs
721 GUI preferences (e.g. fonts, window position, sash coords, ...).
727 ## Example config file for clivescan.
730 path = /usr/local/bin/clive
735 proxy = http://foo:1234
739 L<clive(1)> <clivefeed(1)>
743 Project: http://googlecode.com/p/clive-utils/
745 A clive-utils development repository can be obtained from:
747 % git clone git://repo.or.cz/clive-utils.git
753 Written by Toni Gundogdu <legatvs@gmail.com>