2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
5 # Copyright (C) 2008 Toni Gundogdu.
7 # This file is part of clive-utils.
9 # clivescan is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
14 # clivescan is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with clivescan. If not, see <http://www.gnu.org/licenses/>.
21 ###########################################################################
28 binmode(STDOUT
, ":utf8");
39 use Getopt
::Long
qw(:config bundling);
40 use Digest
::SHA
qw(sha1_hex);
49 my %opted_mods = (Clipboard
=> 1);
50 eval "use Clipboard"; $opted_mods{Clipboard
}=0 if $@
;
52 my $VERSION = "2.0beta2";
53 my $CONFIGDIR = $ENV{CLIVESCAN_CONFIGDIR
}
54 || File
::Spec
->catfile($ENV{HOME
}, ".config/clivescan");
55 my $CONFIGFILE = File
::Spec
->catfile($CONFIGDIR, "config");
56 my $PREFSFILE = File
::Spec
->catfile($CONFIGDIR, "prefs");
58 my %opts; # Holds the options
59 my @queue; # Holds the current URL queue
60 my %found_queue;# Holds the results of the scanned video page links
61 my $curl; # Holds the curl handle (reused throught lifespan)
62 my $mw; # Holds the main window handle (GUI)
63 my $pwmain; # Holds the handle to the main paned window
64 my $pwtop; # Holds the handle to the top paned window
65 my $pwbottom; # Holds the handle to the bottom paned window
66 my $lbtlink; # Holds the handle to the listbox tree of found links
67 my $lbtqueue; # Holds the handle to the listbox tree of queued links
70 my $conf = Config
::Tiny
->read($CONFIGFILE);
71 my $prefs = Config
::Tiny
->read($PREFSFILE);
73 clive
=> $conf->{clive
}->{path
},
74 opts
=> $conf->{clive
}->{opts
},
75 agent
=> $conf->{http
}->{agent
},
76 proxy
=> $conf->{http
}->{proxy
},
78 geometry
=> $prefs->{gui
}->{geometry
},
79 pwmain
=> $prefs->{gui
}->{pwmain
},
80 pwtop
=> $prefs->{gui
}->{pwtop
},
81 pwbottom
=> $prefs->{gui
}->{pwbottom
},
82 mainfont
=> $prefs->{gui
}->{mainfont
},
86 # Define those not read from config, init with defaults
95 $opts{mainfont
} = $opts{mainfont
} || "{helvetica} -12 bold";
98 'debug|d', 'help|h', 'manual|m', 'version|v', 'all|a',
99 'paste|x', 'quiet|q', 'clive|c=s', 'opts|o=s', 'agent|U=s',
101 # Workaround since '$longopt|shortopt' is a no-no.
102 'noproxy|X' => sub { $opts{proxy
} = "" },
103 'nostrict|S' => sub { $opts{strict
} = 0 },
106 # Since 'version|v' => \&print_version and exit cannot tango with tk
107 print_version
(0) if $opts{version
};
108 pod2usage
(-exitstatus
=> 0, -verbose
=> 1) if $opts{help
};
109 pod2usage
(-exitstatus
=> 0, -verbose
=> 2) if $opts{manual
};
111 $opts{clive
} = $opts{clive
} || $ENV{CLIVE_PATH
};
112 find_clive
() unless $opts{clive
};
116 select STDERR
; $| = 1; # => unbuffered
117 select STDOUT
; $| = 1;
121 unless ( $opts{all
} ) { init_gui
(); }
125 ## Subroutines: Connection
128 $curl = WWW
::Curl
::Easy
->new;
129 $curl->setopt(CURLOPT_USERAGENT
, $opts{agent
} || "Mozilla/5.0");
130 $curl->setopt(CURLOPT_PROXY
, $opts{proxy
}) if defined $opts{proxy
};
131 $curl->setopt(CURLOPT_VERBOSE
, 1) if $opts{debug
};
132 $curl->setopt(CURLOPT_FOLLOWLOCATION
, 1);
133 $curl->setopt(CURLOPT_AUTOREFERER
, 1);
134 $curl->setopt(CURLOPT_HEADER
, 0);
135 $curl->setopt(CURLOPT_NOBODY
, 0);
139 my ($url, $resp, $rc) = (shift, 0, 0);
140 open my $fh, ">", \
$resp;
142 $curl->setopt(CURLOPT_URL
, $url);
143 $curl->setopt(CURLOPT_ENCODING
, "");
144 $curl->setopt(CURLOPT_WRITEDATA
, $fh);
145 $rc = $curl->perform;
147 return ($rc, $fh, $resp);
151 ## Subroutines: Queue
154 if ( $opts{paste
} ) {
155 print STDERR
"error: Clipboard module not found" and exit
156 unless $opted_mods{Clipboard
};
157 my $data = Clipboard
->paste();
159 parse_input
($_) foreach split/\n/,$data;
163 parse_input
($_) foreach @ARGV;
164 unless ( @queue ) { parse_input
($_) while ( <STDIN
> ); }
166 my %h = map {$_,1} @queue; # Remove duplicates
173 print "Fetching $_ ..." unless $opts{quiet
};
174 my ($rc, $fh, $resp, $errmsg) = fetch_page
($_);
176 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE
);
177 if ( $rc == 0 or $rc == 200 ) {
178 scan_page
($_, \
$resp);
180 $errmsg = $curl->strerror($rc)." (http/$rc)";
183 $errmsg = $curl->strerror($rc)." (http/$rc)";
186 print STDERR
"\n==> error: $errmsg\n" if $errmsg;
191 my ($scanurl, $pageref) = @_;
192 print "done.\n" unless $opts{quiet
};
193 $$pageref =~ tr
{\n}//d;
195 my $p = HTML
::TokeParser
->new($pageref);
196 $p->get_tag("title");
197 my $pagetitle = $p->get_trimmed_text;
200 # in_scanurl: regex used to bind this search pattern to specified
201 # domain. Undefined for embedded link searches. See clivescan(1).
202 # search_for: regex used to grab the video ID
203 # url_prefix: combined with video ID to construct video page URL
205 in_scanurl
=> qr
|\Qyoutube
.com\E
|i
,
206 search_for
=> qr
|\Q
/watch?v
=\E
(.*?
)["< &]|i,
207 url_prefix => "http
://youtube
.com
/watch?v
=",
211 search_for => qr|\Qyoutube.com/v/\E(.*?)$|i,
212 url_prefix => "http
://youtube
.com
/watch?v
=",
214 GVideo => { # NOTE: Ignores original TLD, uses .com for extraction
215 in_scanurl => qr|\Qvideo.google.\E|i,
216 search_for => qr|\Q/videoplay?docid=\E(.*?)["< &]|i
,
217 url_prefix
=> "http://video.google.com/videoplay?docid=",
219 GVideoEmbed
=> { # NOTE: Ditto.
221 search_for
=> qr
|\Q
/googleplayer
.swf?docid
=\E
(.*?
)|i
,
222 url_prefix
=> "http://video.google.com/videoplay?docid=",
224 Metacafe
=> { # NOTE: metacafe.com/watch/$id is enough for redirect
225 in_scanurl
=> qr
|\Qmetacafe
.com\E
|i
,
226 search_for
=> qr
|\Q
/watch/\E
(.*?
)/|i
,
227 url_prefix
=> "http://metacafe.com/watch/",
231 search_for
=> qr
|\Qmetacafe
.com
/fplayer
/\E
(.*?
)/|i
,
232 url_prefix
=> "http://metacafe.com/watch/",
234 SevenLoad
=> { # NOTE: Ditto. Subdomain can be ignored.
235 in_scanurl
=> qr
|\Qsevenload
.com\E
|i
,
236 search_for
=> qr
|\Q
/videos/\E
(.*?
)\
-|i
,
237 url_prefix
=> "http://sevenload.com/videos/",
241 search_for
=> qr
|\Qsevenload
.com
/pl
/\E
(.*?
)/|i
,
242 url_prefix
=> "http://sevenload.com/videos/",
245 in_scanurl
=> qr
|\Qbreak
.com\E
|i
,
246 search_for
=> qr
|\Q
/index/\E
(.*?
)["< &]|i,
247 url_prefix => "http
://break.com
/index/",
249 # TODO: add BreakEmbed, e.g.:
250 # Page URL: http://break.com/index/if-all-movies-had-cell-phones.html
251 # Embed URL: http://embed.break.com/600081
254 print "=> Scanning page
for links
" unless $opts{quiet};
257 my ($linksref, $link) = @_;
258 push @$linksref,$link;
259 if ( scalar (@$linksref) % 5 == 0 ) { print scalar (@$linksref); }
264 while ( my $host = each( %re ) ) {
265 if ( defined $re{$host}{in_scanurl} and $opts{strict} ) {
266 next unless $scanurl =~ /$re{$host}{in_scanurl}/;
268 _scan_progress(\@links, "$re{$host}{url_prefix
}$1")
269 while ( $$pageref =~ /$re{$host}{search_for}/g );
272 my %h = map { $_, 1 } @links; # Weed out duplicates
275 print "\n=> Found
" .scalar @links. " links after removing duplicates
.\n"
279 foreach my $link ( @links ) {
280 print "==> Fetching
$link ..." unless $opts{quiet};
281 my ($rc, $fh, $resp, $errmsg) = fetch_page($link);
283 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);
284 if ( $rc == 0 or $rc == 200 ) {
285 print "done
.\n" unless $opts{quiet};
287 $p = HTML::TokeParser->new(\$resp);
288 $p->get_tag("title
");
289 my $title = $p->get_trimmed_text;
290 # Store, prevent link duplicates
291 my $sha1 = sha1_hex($link);
292 $verified_links{$sha1} = {link => $link, title => $title}
293 unless defined $verified_links{$sha1};
295 $errmsg = $curl->strerror($rc)." (http
/$rc)";
298 $errmsg = $curl->strerror($rc)." (http
/$rc)";
301 print STDERR "\n==> error
: $errmsg\n" if $errmsg;
304 $found_queue{ sha1_hex($scanurl) } =
305 { title => $pagetitle, url => $scanurl, videos => {%verified_links} };
310 for my $i ( keys %found_queue ) {
311 my %videos = %{$found_queue{$i}{videos}};
312 for my $j ( keys %videos ) {
313 push @q, $videos{$j}{link};
320 ## Subroutines: Helpers
325 return if $url =~ /^$/;
328 $url = "http
://$url" if $url !~ m!^http://!i;
333 print "Trying to locate
'clive' ...";
335 find ( sub { $opts{clive} = $File::Find::name if ( $_ eq 'clive' ) },
336 split /:/, $ENV{PATH} || getcwd);
338 if ( $opts{clive} ) { print "$opts{clive
}\n"; }
339 else { print STDERR "error
: not found
, use --clive
=path
\n"; exit; }
344 system "$opts{clive
} $opts{opts
} " . join(' ', @q);
349 my $perl_v = sprintf "%vd", $^V;
350 my $clipb_v = $opted_mods{Clipboard} ? $Clipboard::VERSION : "-";
352 "clivescan version
$VERSION. Copyright
(C
) 2008 Toni Gundogdu
.
356 * Config
::Tiny
/$Config::Tiny::VERSION\t\t* WWW::Curl/$WWW::Curl
::VERSION
357 * Tk
/$Tk::VERSION\t\t\t* Tk::Tree/$Tk::Tree
::VERSION
358 * Tk
::DialogBox
/$Tk::DialogBox::VERSION\t\t* Clipboard/$clipb_v
359 * Tk
::FontDialog
/$Tk::FontDialog
::VERSION
361 * Getopt
::Long
/$Getopt::Long::VERSION\t\t* Digest::SHA/$Digest::SHA
::VERSION
362 * File
::Spec
/$File::Spec::VERSION\t\t* File::Find/$File::Find
::VERSION
363 * File
::Path
/$File::Path::VERSION\t\t* Encode/$Encode::VERSION
364 * Pod
::Usage
/$Pod::Usage::VERSION\t\t* Cwd/$Cwd::VERSION
366 This program comes with ABSOLUTELY NO WARRANTY
. You may redistribute copies of
367 clivescan under the terms of the GNU General Public License as published by the
368 Free Software Foundation
, either version
3 of the License
, or (at your option
)
369 any later version
. You should have received a copy of the General Public License
370 along with this program
. If
not, see http
://www
.gnu
.org
/licenses/.
372 return $s if $noexit;
380 return unless keys ( %found_queue ) == 0;
382 $mw = MainWindow->new;
383 $mw->geometry($opts{geometry}) if defined $opts{geometry};
384 $mw->title('clivescan');
385 $mw->protocol('WM_DELETE_WINDOW', sub { save_prefs(); exit; });
389 $mw->configure(-menu => $mb);
392 my $file = $mb->cascade(-label => 'File', -underline => 0, -tearoff => 0);
393 $file->command(-label => 'Extract videos in queue...',
394 -underline => 0, -command => \&on_extract);
396 $file->command(-label => 'Quit', -underline => 0,
397 -command => sub { save_prefs(); exit; } );
400 my $edit = $mb->cascade(-label => 'Edit', -underline => 0, -tearoff => 0);
401 $edit->command(-label => 'Preferences...',
402 -underline => 0, -command => \&on_prefs);
405 my $help = $mb->cascade(-label => 'Help', -underline => 0, -tearoff => 0);
406 $help->command(-label => 'About...',
407 -underline => 0, -command => \&on_about);
409 # The GUI has an upper and a lower part
410 $pwmain = $mw->Panedwindow(-orient => 'v', -opaqueresize => 0);
413 $pwtop = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
416 my $lbar = $pwtop->Frame;
418 $lbtlink = $lbar->Scrolled('Tree',
419 -scrollbars => 'osoe',
421 -selectmode => 'extended',
424 )->pack(-side => 'top', -expand => 1, -fill => 'both');
426 for my $i ( keys %found_queue ) {
427 my $scantitle = $found_queue{$i}{title};
428 $scantitle =~ tr{.}//d;
430 $lbtlink->add($scantitle);
431 $lbtlink->itemCreate($scantitle, 0, -text => $scantitle, -itemtype => 'text');
433 for my $j ( keys %{$found_queue{$i}{videos}} ) {
434 my %video = %{$found_queue{$i}{videos}{$j}};
436 my $title = $video{title};
439 my $path = "$scantitle.$title";
441 $lbtlink->add($path, -data => {%video});
442 $lbtlink->itemCreate($path, 0,
443 -text => $title, -itemtype => 'text');
446 $lbtlink->autosetmode;
447 $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );
449 my $rbar = $pwtop->Frame; # Button toolbar
450 $rbar->Button(-text => 'Grab', -command => \&on_grab
451 )->pack(-fill => 'x');
453 $rbar->Button(-text => 'Grab everything', -command => \&on_grab_all
454 )->pack(-fill => 'x');
456 $pwtop->add($lbar, $rbar, -width => $opts{pwtop} || 200);
459 $pwbottom = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
461 $lbtqueue = $pwbottom->Scrolled('Tree',
462 -scrollbars => 'osoe',
464 -selectmode => 'extended',
469 my $bar = $pwbottom->Frame; # Button toolbar
471 $bar->Button(-text => 'Remove', -command => \&on_remove
472 )->pack(-fill => 'x');
474 $bar->Button(-text => 'Clear', -command => \&on_clear
475 )->pack(-fill => 'x');
477 $bar->Button(-text => 'Extract videos...', -command => \&on_extract
478 )->pack(-fill => 'x', -side => 'bottom');
480 $pwbottom->add($lbtqueue, $bar, -width => $opts{pwbottom} || 200);
482 # Add upper and lower parts to main paned window
483 $pwmain->add($pwtop, $pwbottom, -height => $opts{pwmain} || 200);
485 $mw->RefontTree(-font => $opts{mainfont});
486 $pwmain->pack(-expand => 1, -fill => 'both');
492 mkpath( [$CONFIGDIR], 1, 0700 );
494 my $c = Config::Tiny->new;
495 $c->{gui}->{geometry} = $mw->geometry();
496 $c->{gui}->{pwmain} = ($pwmain->sashCoord(0))[1]-7;
497 $c->{gui}->{pwtop} = ($pwtop->sashCoord(0))[0]-7;
498 $c->{gui}->{pwbottom} = ($pwbottom->sashCoord(0))[0]-7;
499 $c->{gui}->{mainfont} = $opts{mainfont};
501 $c->write($PREFSFILE);
505 ($opts{mainfont}) = @_;
506 $mw->RefontTree(-font => $opts{mainfont});
512 return if $path !~ /\./;
513 return if $lbtqueue->infoExists($path);
515 my %video = %{$lbtlink->infoData($path)};
516 my ($link) = split /\./, $path;
518 unless ( $lbtqueue->infoExists($link) ) {
519 $lbtqueue->add($link);
520 $lbtqueue->itemCreate($link, 0,
521 -text => $link, -itemtype => 'text');
524 $lbtqueue->add($path, -data => {%video});
525 $lbtqueue->itemCreate($path, 0,
526 -text => $video{title}, -itemtype => 'text');
530 queue_item($_) foreach ( $lbtlink->infoSelection );
531 $lbtqueue->autosetmode;
535 foreach ( $lbtlink->infoChildren("") ) {
536 my ($parent) = split /\./;
538 foreach ($lbtlink->infoChildren($parent) );
540 $lbtqueue->autosetmode;
544 $lbtqueue->deleteEntry($_)
545 foreach ( $lbtqueue->infoSelection );
549 $lbtqueue->deleteAll;
553 my $dlg = $mw->DialogBox(-title => 'About', -buttons => ['OK']);
554 my $txt = $dlg->add('Text')->pack;
555 $txt->insert('end', print_version(1));
560 my ($top, $lblv, $lbl) = @_;
561 my $font = $top->FontDialog(-initfont => $$lblv)->Show;
563 if ( defined $font ) {
564 my $descr = $top->FontDialog->GetDescriptiveFontName($font);
565 $lbl->configure(-font => $descr);
571 my $dlg = $mw->DialogBox(-title => 'clivescan preferences',
572 -buttons => ['OK','Cancel']);
574 $dlg->add('Label', -text => 'Fonts: press to choose'
575 )->grid(-sticky => 'w', -pady => 10);
577 my ($mainfont) = ($opts{mainfont});
578 my $mainfontl = $dlg->Label(-textvariable => \$mainfont);
580 $dlg->add('Button', -text => 'Main font',
581 -command => sub { change_font($dlg, \$mainfont, $mainfontl) }
582 )->grid($mainfontl, -sticky => 'w', -padx => '5');
584 on_prefs_ok($mainfont) if $dlg->Show eq 'OK';
589 foreach ( $lbtqueue->infoChildren('') ) {
590 foreach ( $lbtqueue->infoChildren($_) ) {
591 my %video = %{$lbtqueue->infoData($_)};
592 push @q, $video{link};
597 # Prompt for clive(1) options
598 my $dlg = $mw->DialogBox(-title => 'clive(1) options',
599 -buttons => ['OK','Cancel']);
601 $dlg->add('Label', -text => 'Path to clive'
602 )->grid(my $clivepath = $dlg->Entry(-width => 60),
603 -sticky => 'w', -padx => '5');
605 $dlg->add('Label', -text => 'Runtime options'
606 )->grid(my $cliveopts = $dlg->Entry(-width => 60),
607 -sticky => 'w', -padx => '5');
609 $clivepath->insert('end', $opts{clive});
610 $cliveopts->insert('end', $opts{opts});
612 if ( $dlg->Show() eq 'OK' ) {
613 $opts{clive} = $clivepath->get;
614 $opts{opts} = $cliveopts->get;
624 clivescan - the video link scanning utility for clive
628 clivescan [option]... [URL]...
632 clivescan is an utility that scans video pages for video links and
633 uses L<clive(1)> to extract them. The utility scans for video page
634 and embedded video links.
636 Historically, the video link scanning function was part of L<clive(1)>
637 and it was written in Python/Newt. The clivescan utility was written
638 in Perl/Tk to replace the feature that was removed in clive 2.0. This
639 utility is part of the B<clive-utils> project.
643 You may freely specify options after the command-line arguments. For example:
645 clivescan -a URL --opts=--noextract
657 Show version and exit.
659 =item B<--clive=>I<path>
661 I<path> to L<clive(1)> command. If unspecified, clivescan will attempt to
662 locate it in the $PATH. Additionally, the B<CLIVE_PATH> environment variable
663 can be used. See also L</CONFIG>.
665 =item B<--opts=>I<opts>
667 I<opts> to append to clive call. See L<clive(1)> for more on the available
672 Grab all videos without prompting the GUI.
674 =item B<-S --nostrict>
676 This may come as a shock but clivescan is B<not> a perfect utility. This
677 option was added as a workaround for some search pattern issues that are
678 known to occur in some cases.
680 When searching for break.com videos, for example, clivescan looks for the
681 "/index/" pattern. This is, unfortunately, a fairly common string to be
682 found anywhere in the web -- like in the Youtube front page. To demonstrate
685 % clivescan --nostrict "http
://youtube
.com
"
687 Let's dwell deeper into this. Observe.
689 % clivescan "http
://video
.google
.com
/videosearch?q
=inurl
%3Abreak"
691 Returns no break.com videos even though it should. clivescan defaults to
692 "strict
" host binding, meaning that it makes sure that the B<scan URL>
693 contains a hard-coded B<domain pattern>, or "break.com
" in this case.
695 % clivescan "http
://break.com
"
697 Hence the above works as expected. To fix the I<original> issue with the
698 video.google.com URL, you need to use the B<--nostrict> option:
700 % clivescan -S "http
://video
.google
.com
/videosearch?q
=inurl
%3Abreak"
702 This causes clivescan to ignore the B<domain pattern>.
704 Using domains in the search patterns would have been a sound idea but this
705 has been made impossible by the video hosts that often refer to their
706 video pages using local paths (e.g. href="/watch?v
=$id").
708 It is obviously not a elegant solution, and is even likely to cause other issues
709 when scanning multiple sources at once. Should someone come up with a better
710 solution, please see L</AUTHOR> for contact details.
718 =item B<-U --agent=>I<string>
720 Identify as I<string> to the HTTP server. Defaults to "Mozilla
/5.0".
722 =item B<-y --proxy=>I<address>
724 Use I<address> for HTTP proxy, e.g. http://foo:1234. If http_proxy
725 environment variable is defined, it will be used.
727 =item B<-X --noproxy>
729 Do not use the defined HTTP proxy (B<--proxy>, config or http_proxy).
737 =item % clivescan youtube.com video.google.com
739 Scans both, Youtube and GoogleVideo front pages for video links.
741 =item % cat E<gt>E<gt> url.lst
743 http://video.google.com
744 http://youtube.com/communitychannel
748 =item % cat url.lst | clivescan
750 Reads input from UNIX pipe.
752 =item % clivescan --opts="-f mp4
"
754 Appends the I<opts> to the L<clive(1)> call.
756 =item % clivescan --all http://youtube.com
758 Grabs all found videos from the Youtube front page.
764 By default, clivescan searches the ~/.config/clivescan directory for the
765 config file. The B<CLIVESCAN_CONFIGDIR> environment variable can be used
766 to override this behaviour.
770 =item ~/.config/clivescan/config
774 =item ~/.config/clivescan/prefs
776 GUI preferences (e.g. fonts, window position, sash coords, ...).
782 ## Example config file for clivescan.
785 path = /usr/local/bin/clive
790 proxy = http://foo:1234
794 L<clive(1)> <clivefeed(1)>
798 Project: http://googlecode.com/p/clive-utils/
800 A clive-utils development repository can be obtained from:
802 % git clone git://repo.or.cz/clive-utils.git
808 Written by Toni Gundogdu <legatvs@gmail.com>