2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
5 # Copyright (C) 2008 Toni Gundogdu.
7 # This file is part of clive-utils.
9 # clivescan is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
14 # clivescan is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with clivescan. If not, see <http://www.gnu.org/licenses/>.
21 ###########################################################################
28 binmode(STDOUT
, ":utf8");
39 use Getopt
::Long
qw(:config bundling);
40 use Digest
::SHA
qw(sha1_hex);
49 my %opted_mods = (Clipboard
=> 1);
50 eval "use Clipboard"; $opted_mods{Clipboard
}=0 if $@
;
52 my $VERSION = "2.0beta2";
53 my $CONFIGDIR = $ENV{CLIVESCAN_CONFIGDIR
}
54 || File
::Spec
->catfile($ENV{HOME
}, ".config/clivescan");
55 my $CONFIGFILE = File
::Spec
->catfile($CONFIGDIR, "config");
56 my $PREFSFILE = File
::Spec
->catfile($CONFIGDIR, "prefs");
58 my %opts; # Holds the options
59 my @queue; # Holds the current URL queue
60 my %found_queue;# Holds the results of the scanned video page links
61 my $curl; # Holds the curl handle (reused throught lifespan)
62 my $mw; # Holds the main window handle (GUI)
63 my $pwmain; # Holds the handle to the main paned window
64 my $pwtop; # Holds the handle to the top paned window
65 my $pwbottom; # Holds the handle to the bottom paned window
66 my $lbtlink; # Holds the handle to the listbox tree of found links
67 my $lbtqueue; # Holds the handle to the listbox tree of queued links
70 my $conf = Config
::Tiny
->read($CONFIGFILE);
71 my $prefs = Config
::Tiny
->read($PREFSFILE);
73 clive
=> $conf->{clive
}->{path
},
74 opts
=> $conf->{clive
}->{opts
},
75 agent
=> $conf->{http
}->{agent
},
76 proxy
=> $conf->{http
}->{proxy
},
78 geometry
=> $prefs->{gui
}->{geometry
},
79 pwmain
=> $prefs->{gui
}->{pwmain
},
80 pwtop
=> $prefs->{gui
}->{pwtop
},
81 pwbottom
=> $prefs->{gui
}->{pwbottom
},
82 mainfont
=> $prefs->{gui
}->{mainfont
},
86 # Define those not read from config, init with defaults
95 $opts{mainfont
} = $opts{mainfont
} || "{helvetica} -12 bold";
98 'debug|d', 'help|h', 'manual|m', 'version|v', 'all|a',
99 'paste|x', 'quiet|q', 'clive|c=s', 'opts|o=s', 'agent|U=s',
101 # Workaround since '$longopt|shortopt' is a no-no.
102 'noproxy|X' => sub { $opts{proxy
} = "" },
103 'nostrict|S' => sub { $opts{strict
} = 0 },
106 # Since 'version|v' => \&print_version and exit cannot tango with tk
107 print_version
(0) if $opts{version
};
108 pod2usage
(-exitstatus
=> 0, -verbose
=> 1) if $opts{help
};
109 pod2usage
(-exitstatus
=> 0, -verbose
=> 2) if $opts{manual
};
111 $opts{clive
} = $opts{clive
} || $ENV{CLIVE_PATH
};
112 find_clive
() unless $opts{clive
};
116 select STDERR
; $| = 1; # => unbuffered
117 select STDOUT
; $| = 1;
121 unless ( $opts{all
} ) { init_gui
(); }
125 ## Subroutines: Connection
128 $curl = WWW
::Curl
::Easy
->new;
129 $curl->setopt(CURLOPT_USERAGENT
, $opts{agent
} || "Mozilla/5.0");
130 $curl->setopt(CURLOPT_PROXY
, $opts{proxy
}) if defined $opts{proxy
};
131 $curl->setopt(CURLOPT_VERBOSE
, 1) if $opts{debug
};
132 $curl->setopt(CURLOPT_FOLLOWLOCATION
, 1);
133 $curl->setopt(CURLOPT_AUTOREFERER
, 1);
134 $curl->setopt(CURLOPT_HEADER
, 0);
135 $curl->setopt(CURLOPT_NOBODY
, 0);
139 my ($url, $resp, $rc) = (shift, 0, 0);
140 open my $fh, ">", \
$resp;
142 $curl->setopt(CURLOPT_URL
, $url);
143 $curl->setopt(CURLOPT_ENCODING
, "");
144 $curl->setopt(CURLOPT_WRITEDATA
, $fh);
145 $rc = $curl->perform;
147 return ($rc, $fh, $resp);
151 ## Subroutines: Queue
154 if ( $opts{paste
} ) {
155 print STDERR
"error: Clipboard module not found" and exit
156 unless $opted_mods{Clipboard
};
157 my $data = Clipboard
->paste();
159 parse_input
($_) foreach split/\n/,$data;
163 parse_input
($_) foreach @ARGV;
164 unless ( @queue ) { parse_input
($_) while ( <STDIN
> ); }
166 my %h = map {$_,1} @queue; # Remove duplicates
173 print "Fetching $_ ..." unless $opts{quiet
};
174 my ($rc, $fh, $resp, $errmsg) = fetch_page
($_);
176 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE
);
177 if ( $rc == 0 or $rc == 200 ) {
178 scan_page
($_, \
$resp);
180 $errmsg = $curl->strerror($rc)." (http/$rc)";
183 $errmsg = $curl->strerror($rc)." (http/$rc)";
186 print STDERR
"\n==> error: $errmsg\n" if $errmsg;
191 my ($scanurl, $pageref) = @_;
192 print "done.\n" unless $opts{quiet
};
193 $$pageref =~ tr
{\n}//d;
195 my $p = HTML
::TokeParser
->new($pageref);
196 $p->get_tag("title");
197 my $pagetitle = $p->get_trimmed_text;
200 # in_scanurl: regex used to bind this search pattern to specified
201 # domain. Undefined for embedded link searches. See clivescan(1).
202 # search_for: regex used to grab the video ID
203 # url_prefix: combined with video ID to construct video page URL
205 # NOTE: We're not using domains in the search patterns because
206 # most of the supported hosts refer to their videos using local
207 # paths, e.g. <a href="/watch?v=...">.
209 in_scanurl
=> qr
|\Qyoutube
.com\E
|i
,
210 search_for
=> qr
|\Q
/watch?v
=\E
(.*?
)["< &]|i,
211 url_prefix => "http
://youtube
.com
/watch?v
=",
215 search_for => qr|\Qyoutube.com/v/\E(.*?)["< &]|i
,
216 url_prefix
=> "http://youtube.com/watch?v=",
218 GVideo
=> { # NOTE: Ignores original TLD, uses .com for extraction
219 in_scanurl
=> qr
|\Qvideo
.google
.\E
|i
,
220 search_for
=> qr
|\Q
/videoplay?docid
=\E
(.*?
)["< &]|i,
221 url_prefix => "http
://video
.google
.com
/videoplay?docid
=",
223 GVideoEmbed => { # NOTE: Ditto.
225 search_for => qr|\Q/googleplayer.swf?docid=\E(.*?)["< &]|i
,
226 url_prefix
=> "http://video.google.com/videoplay?docid=",
228 # Metacafe => { # NOTE: metacafe.com/watch/$id is enough for redirect
229 # in_scanurl => qr|\Qmetacafe.com\E|i,
230 # search_for => qr|\Q/watch/\E(.*?)/|i,
231 # url_prefix => "http://metacafe.com/watch/",
234 # in_scanurl => undef,
235 # search_for => qr|\Qmetacafe.com/fplayer/\E(.*?)/|i,
236 # url_prefix => "http://metacafe.com/watch/",
238 SevenLoad
=> { # NOTE: Ditto. Subdomain can be ignored.
239 in_scanurl
=> qr
|\Qsevenload
.com\E
|i
,
240 search_for
=> qr
|\Q
/videos/\E
(.*?
)\
-|i
,
241 url_prefix
=> "http://sevenload.com/videos/",
245 search_for
=> qr
|\Qsevenload
.com
/pl
/\E
(.*?
)/|i
,
246 url_prefix
=> "http://sevenload.com/videos/",
249 in_scanurl
=> qr
|\Qbreak
.com\E
|i
,
250 search_for
=> qr
|\Q
/index/\E
(.*?
)["< &]|i,
251 url_prefix => "http
://break.com
/index/",
253 # TODO: add BreakEmbed, e.g.:
254 # Page URL: http://break.com/index/if-all-movies-had-cell-phones.html
255 # Embed URL: http://embed.break.com/600081
258 print "=> Scanning page
for links
" unless $opts{quiet};
261 my ($linksref, $link) = @_;
262 push @$linksref,$link;
263 unless ( $opts{quiet} ) {
264 if ( scalar (@$linksref) % 5 == 0 ) { print scalar (@$linksref); }
270 while ( my $host = each( %re ) ) {
271 if ( defined $re{$host}{in_scanurl} and $opts{strict} ) {
272 next unless $scanurl =~ /$re{$host}{in_scanurl}/;
274 _scan_progress(\@links, "$re{$host}{url_prefix
}$1")
275 while ( $$pageref =~ /$re{$host}{search_for}/g );
278 my %h = map { $_, 1 } @links; # Weed out duplicates
281 print "\n=> Found
" .scalar @links. " links after removing duplicates
.\n"
285 foreach my $link ( @links ) {
286 print "==> Fetching
$link ..." unless $opts{quiet};
287 my ($rc, $fh, $resp, $errmsg) = fetch_page($link);
289 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);
290 if ( $rc == 0 or $rc == 200 ) {
291 print "done
.\n" unless $opts{quiet};
293 $p = HTML::TokeParser->new(\$resp);
294 $p->get_tag("title
");
295 my $title = $p->get_trimmed_text;
296 # Store, prevent link duplicates
297 my $sha1 = sha1_hex($link);
298 $verified_links{$sha1} = {link => $link, title => $title}
299 unless defined $verified_links{$sha1};
301 $errmsg = $curl->strerror($rc)." (http
/$rc)";
304 $errmsg = $curl->strerror($rc)." (http
/$rc)";
307 print STDERR "\n==> error
: $errmsg\n" if $errmsg;
310 $found_queue{ sha1_hex($scanurl) } =
311 { title => $pagetitle, url => $scanurl, videos => {%verified_links} };
316 for my $i ( keys %found_queue ) {
317 my %videos = %{$found_queue{$i}{videos}};
318 for my $j ( keys %videos ) {
319 push @q, $videos{$j}{link};
326 ## Subroutines: Helpers
331 return if $url =~ /^$/;
334 $url = "http
://$url" if $url !~ m!^http://!i;
339 print "Trying to locate
'clive' ..." unless $opts{quiet};
341 find ( sub { $opts{clive} = $File::Find::name if ( $_ eq 'clive' ) },
342 split /:/, $ENV{PATH} || getcwd);
344 if ( $opts{clive} ) { print "$opts{clive
}\n" unless $opts{quiet}; }
345 else { print STDERR "error
: not found
, use --clive
=path
\n"; exit; }
350 system "$opts{clive
} $opts{opts
} " . join(' ', @q);
355 my $perl_v = sprintf "%vd", $^V;
356 my $clipb_v = $opted_mods{Clipboard} ? $Clipboard::VERSION : "-";
358 "clivescan version
$VERSION. Copyright
(C
) 2008 Toni Gundogdu
.
362 * Config
::Tiny
/$Config::Tiny::VERSION\t\t* WWW::Curl/$WWW::Curl
::VERSION
363 * Tk
/$Tk::VERSION\t\t\t* Tk::Tree/$Tk::Tree
::VERSION
364 * Tk
::DialogBox
/$Tk::DialogBox::VERSION\t\t* Clipboard/$clipb_v
365 * Tk
::FontDialog
/$Tk::FontDialog::VERSION\t\t* HTML::TokeParser/$HTML::TokeParser
::VERSION
367 * Getopt
::Long
/$Getopt::Long::VERSION\t\t* Digest::SHA/$Digest::SHA
::VERSION
368 * File
::Spec
/$File::Spec::VERSION\t\t* File::Find/$File::Find
::VERSION
369 * File
::Path
/$File::Path::VERSION\t\t* Encode/$Encode::VERSION
370 * Pod
::Usage
/$Pod::Usage::VERSION\t\t* Cwd/$Cwd::VERSION
372 This program comes with ABSOLUTELY NO WARRANTY
. You may redistribute copies of
373 clivescan under the terms of the GNU General Public License as published by the
374 Free Software Foundation
, either version
3 of the License
, or (at your option
)
375 any later version
. You should have received a copy of the General Public License
376 along with this program
. If
not, see http
://www
.gnu
.org
/licenses/.
378 return $s if $noexit;
386 return if keys %found_queue == 0;
388 $mw = MainWindow->new;
389 $mw->geometry($opts{geometry}) if defined $opts{geometry};
390 $mw->title('clivescan');
391 $mw->protocol('WM_DELETE_WINDOW', sub { save_prefs(); exit; });
395 $mw->configure(-menu => $mb);
398 my $file = $mb->cascade(-label => 'File', -underline => 0, -tearoff => 0);
399 $file->command(-label => 'Extract videos in queue...',
400 -underline => 0, -command => \&on_extract);
402 $file->command(-label => 'Quit', -underline => 0,
403 -command => sub { save_prefs(); exit; } );
406 my $edit = $mb->cascade(-label => 'Edit', -underline => 0, -tearoff => 0);
407 $edit->command(-label => 'Preferences...',
408 -underline => 0, -command => \&on_prefs);
411 my $help = $mb->cascade(-label => 'Help', -underline => 0, -tearoff => 0);
412 $help->command(-label => 'About...',
413 -underline => 0, -command => \&on_about);
415 # The GUI has an upper and a lower part
416 $pwmain = $mw->Panedwindow(-orient => 'v', -opaqueresize => 0);
419 $pwtop = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);
422 my $lbar = $pwtop->Frame;
424 $lbtlink = $lbar->Scrolled('Tree',
425 -scrollbars => 'osoe',
427 -selectmode => 'extended',
430 )->pack(-side => 'top', -expand => 1, -fill => 'both');
432 for my $i ( keys %found_queue ) {
433 my $scantitle = $found_queue{$i}{title};
434 $scantitle =~ tr{.}//d;
436 $lbtlink->add($scantitle);
437 $lbtlink->itemCreate($scantitle, 0, -text => $scantitle, -itemtype => 'text');
439 for my $j ( keys %{$found_queue{$i}{videos}} ) {
440 my %video = %{$found_queue{$i}{videos}{$j}};
442 my $title = $video{title};
446 for ( my $k=0;; ++$k ) {
447 $path = "$scantitle.$title (#$k)";
448 last unless $lbtlink->infoExists($path);
451 $lbtlink->add($path, -data
=> {%video});
452 $lbtlink->itemCreate($path, 0,
453 -text
=> $title, -itemtype
=> 'text');
456 $lbtlink->autosetmode;
457 $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );
459 my $rbar = $pwtop->Frame; # Button toolbar
460 $rbar->Button(-text
=> 'Grab', -command
=> \
&on_grab
461 )->pack(-fill
=> 'x');
463 $rbar->Button(-text
=> 'Grab everything', -command
=> \
&on_grab_all
464 )->pack(-fill
=> 'x');
466 $pwtop->add($lbar, $rbar, -width
=> $opts{pwtop
} || 200);
469 $pwbottom = $pwmain->Panedwindow(-orient
=> 'h', -opaqueresize
=> 0);
471 $lbtqueue = $pwbottom->Scrolled('Tree',
472 -scrollbars
=> 'osoe',
474 -selectmode
=> 'extended',
479 my $bar = $pwbottom->Frame; # Button toolbar
481 $bar->Button(-text
=> 'Remove', -command
=> \
&on_remove
482 )->pack(-fill
=> 'x');
484 $bar->Button(-text
=> 'Clear', -command
=> \
&on_clear
485 )->pack(-fill
=> 'x');
487 $bar->Button(-text
=> 'Extract videos...', -command
=> \
&on_extract
488 )->pack(-fill
=> 'x', -side
=> 'bottom');
490 $pwbottom->add($lbtqueue, $bar, -width
=> $opts{pwbottom
} || 200);
492 # Add upper and lower parts to main paned window
493 $pwmain->add($pwtop, $pwbottom, -height
=> $opts{pwmain
} || 200);
495 $mw->RefontTree(-font
=> $opts{mainfont
});
496 $pwmain->pack(-expand
=> 1, -fill
=> 'both');
502 mkpath
( [$CONFIGDIR], 1, 0700 );
504 my $c = Config
::Tiny
->new;
505 $c->{gui
}->{geometry
} = $mw->geometry();
506 $c->{gui
}->{pwmain
} = ($pwmain->sashCoord(0))[1]-7;
507 $c->{gui
}->{pwtop
} = ($pwtop->sashCoord(0))[0]-7;
508 $c->{gui
}->{pwbottom
} = ($pwbottom->sashCoord(0))[0]-7;
509 $c->{gui
}->{mainfont
} = $opts{mainfont
};
511 $c->write($PREFSFILE);
515 ($opts{mainfont
}) = @_;
516 $mw->RefontTree(-font
=> $opts{mainfont
});
522 return if $path !~ /\./;
523 return if $lbtqueue->infoExists($path);
525 my %video = %{$lbtlink->infoData($path)};
526 my ($link) = split /\./, $path;
528 unless ( $lbtqueue->infoExists($link) ) {
529 $lbtqueue->add($link);
530 $lbtqueue->itemCreate($link, 0,
531 -text
=> $link, -itemtype
=> 'text');
534 $lbtqueue->add($path, -data
=> {%video});
535 $lbtqueue->itemCreate($path, 0,
536 -text
=> $video{title
}, -itemtype
=> 'text');
540 queue_item
($_) foreach ( $lbtlink->infoSelection );
541 $lbtqueue->autosetmode;
545 foreach ( $lbtlink->infoChildren("") ) {
546 my ($parent) = split /\./;
548 foreach ($lbtlink->infoChildren($parent) );
550 $lbtqueue->autosetmode;
554 $lbtqueue->deleteEntry($_)
555 foreach ( $lbtqueue->infoSelection );
559 $lbtqueue->deleteAll;
563 my $dlg = $mw->DialogBox(-title
=> 'About', -buttons
=> ['OK']);
564 my $txt = $dlg->add('Text')->pack;
565 $txt->insert('end', print_version
(1));
570 my ($top, $lblv, $lbl) = @_;
571 my $font = $top->FontDialog(-initfont
=> $$lblv)->Show;
573 if ( defined $font ) {
574 my $descr = $top->FontDialog->GetDescriptiveFontName($font);
575 $lbl->configure(-font
=> $descr);
581 my $dlg = $mw->DialogBox(-title
=> 'clivescan preferences',
582 -buttons
=> ['OK','Cancel']);
584 $dlg->add('Label', -text
=> 'Fonts: press to choose'
585 )->grid(-sticky
=> 'w', -pady
=> 10);
587 my ($mainfont) = ($opts{mainfont
});
588 my $mainfontl = $dlg->Label(-textvariable
=> \
$mainfont);
590 $dlg->add('Button', -text
=> 'Main font',
591 -command
=> sub { change_font
($dlg, \
$mainfont, $mainfontl) }
592 )->grid($mainfontl, -sticky
=> 'w', -padx
=> '5');
594 on_prefs_ok
($mainfont) if $dlg->Show eq 'OK';
599 foreach ( $lbtqueue->infoChildren('') ) {
600 foreach ( $lbtqueue->infoChildren($_) ) {
601 my %video = %{$lbtqueue->infoData($_)};
602 push @q, $video{link};
607 # Prompt for clive(1) options
608 my $dlg = $mw->DialogBox(-title
=> 'clive(1) options',
609 -buttons
=> ['OK','Cancel']);
611 $dlg->add('Label', -text
=> 'Path to clive'
612 )->grid(my $clivepath = $dlg->Entry(-width
=> 60),
613 -sticky
=> 'w', -padx
=> '5');
615 $dlg->add('Label', -text
=> 'Runtime options'
616 )->grid(my $cliveopts = $dlg->Entry(-width
=> 60),
617 -sticky
=> 'w', -padx
=> '5');
619 $clivepath->insert('end', $opts{clive
});
620 $cliveopts->insert('end', $opts{opts
});
622 if ( $dlg->Show() eq 'OK' ) {
623 $opts{clive
} = $clivepath->get;
624 $opts{opts
} = $cliveopts->get;
634 clivescan - the video link scanning utility for clive
638 clivescan [option]... [URL]...
642 clivescan is an utility that scans video pages for video links and
643 uses L<clive(1)> to extract them. The utility scans for video page
644 and embedded video links.
646 Historically, the video link scanning function was part of L<clive(1)>
647 and it was written in Python/Newt. The clivescan utility was written
648 in Perl/Tk to replace the feature that was removed in clive 2.0. This
649 utility is part of the B<clive-utils> project.
653 You may freely specify options after the command-line arguments. For example:
655 % clivescan -a URL --opts=--noextract
665 =item B<-v --version>
667 Show version and exit.
669 =item B<-c --clive=>I<path>
671 I<path> to L<clive(1)> command. If unspecified, clivescan will attempt to
672 locate it in the $PATH. Additionally, the B<CLIVE_PATH> environment variable
673 can be used. See also L</CONFIG>.
675 =item B<-o --opts=>I<opts>
677 I<opts> to append to clive call. See L<clive(1)> for more on the available
682 Grab all videos without prompting the GUI.
684 =item B<-S --nostrict>
686 This option provides a workaround for some search pattern issues that
687 are known to occur. For example:
689 % clivescan "http://video.google.com/videosearch?q=inurl%3Abreak"
691 Does not return any break.com videos even though the page lists them.
692 This happens because clivescan assumes that the domain break.com can
693 be found in the URL. To override this restrictive default behaviour:
695 % clivescan -S "http://video.google.com/videosearch?q=inurl%3Abreak"
703 =item B<-U --agent=>I<string>
705 Identify as I<string> to the HTTP server. Defaults to "Mozilla/5.0".
707 =item B<-y --proxy=>I<address>
709 Use I<address> for HTTP proxy, e.g. http://foo:1234. If http_proxy
710 environment variable is defined, it will be used.
712 =item B<-X --noproxy>
714 Do not use the defined HTTP proxy (B<--proxy>, config or http_proxy).
722 =item % clivescan youtube.com video.google.com
724 Scans both, Youtube and GoogleVideo front pages for video links.
726 =item % cat E<gt>E<gt> url.lst
728 http://video.google.com
729 http://youtube.com/communitychannel
733 =item % cat url.lst | clivescan
735 Reads input from UNIX pipe.
737 =item % clivescan --opts="-f mp4"
739 Appends the I<opts> to the L<clive(1)> call.
741 =item % clivescan --all http://youtube.com
743 Grabs all found videos from the Youtube front page.
749 By default, clivescan searches the ~/.config/clivescan directory for the
750 config file. The B<CLIVESCAN_CONFIGDIR> environment variable can be used
751 to override this behaviour.
755 =item ~/.config/clivescan/config
759 =item ~/.config/clivescan/prefs
761 GUI preferences (e.g. fonts, window position, sash coords, ...).
767 ## Example config file for clivescan.
770 path = /usr/local/bin/clive
775 proxy = http://foo:1234
779 L<clive(1)> L<clivefeed(1)>
783 Project: http://googlecode.com/p/clive-utils/
785 A clive-utils development repository can be obtained from:
787 % git clone git://repo.or.cz/clive-utils.git
793 Written by Toni Gundogdu <legatvs@gmail.com>