Tweaked youtube and gvideo embedded URL search patterns.
[clive-utils.git] / clivescan
blob0e30033127d66f67eefbc8c550198cd90dd20db8
1 #!/usr/bin/env perl
2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
5 # Copyright (C) 2008 Toni Gundogdu.
7 # This file is part of clive-utils.
9 # clivescan is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
14 # clivescan is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with clivescan. If not, see <http://www.gnu.org/licenses/>.
21 ###########################################################################
23 # Keep it simple.
25 use warnings;
26 use strict;
28 binmode(STDOUT, ":utf8");
30 use HTML::TokeParser;
31 use WWW::Curl::Easy;
32 use Tk::FontDialog;
33 use Tk::DialogBox;
34 use Config::Tiny;
35 use Tk::Tree;
36 use Tk;
38 # Core modules:
39 use Getopt::Long qw(:config bundling);
40 use Digest::SHA qw(sha1_hex);
41 use File::Spec;
42 use File::Find;
43 use File::Path;
44 use Pod::Usage;
45 use Encode;
46 use Cwd;
# Optional modules: assume each one is available, then clear its flag
# if loading it fails.
my %opted_mods = (Clipboard => 1);
eval "use Clipboard";
if ( $@ ) { $opted_mods{Clipboard} = 0; }

my $VERSION   = "2.0beta2";
my $CONFIGDIR = $ENV{CLIVESCAN_CONFIGDIR}
    || File::Spec->catfile($ENV{HOME}, ".config/clivescan");
my ($CONFIGFILE, $PREFSFILE) =
    map { File::Spec->catfile($CONFIGDIR, $_) } qw(config prefs);

# Program-wide state
my %opts;           # Options (config file, GUI prefs and command line)
my @queue;          # Current URL queue
my %found_queue;    # Results of the scanned video page links
my $curl;           # curl handle (reused throughout lifespan)

# GUI handles
my $mw;             # Main window
my $pwmain;         # Main paned window
my $pwtop;          # Top paned window
my $pwbottom;       # Bottom paned window
my $lbtlink;        # Listbox tree of found links
my $lbtqueue;       # Listbox tree of queued links

# Parse config and GUI preferences
my $conf  = Config::Tiny->read($CONFIGFILE);
my $prefs = Config::Tiny->read($PREFSFILE);

%opts = (
    # Read from the config file
    clive    => $conf->{clive}->{path},
    opts     => $conf->{clive}->{opts},
    agent    => $conf->{http}->{agent},
    proxy    => $conf->{http}->{proxy},

    # Read from the GUI preferences file
    geometry => $prefs->{gui}->{geometry},
    pwmain   => $prefs->{gui}->{pwmain},
    pwtop    => $prefs->{gui}->{pwtop},
    pwbottom => $prefs->{gui}->{pwbottom},
    mainfont => $prefs->{gui}->{mainfont} || "{helvetica} -12 bold",

    # Not read from config; command-line defaults
    quiet    => 0,
    paste    => 0,
    all      => 0,
    strict   => 1,
    debug    => 0,
    help     => 0,
    manual   => 0,
    version  => 0,
);

# Parse cmdline
GetOptions(\%opts,
    'debug|d',   'help|h',  'manual|m',  'version|v', 'all|a',
    'paste|x',   'quiet|q', 'clive|c=s', 'opts|o=s',  'agent|U=s',
    'proxy|y=s',
    # Workaround since '$longopt|shortopt' is a no-no.
    'noproxy|X'  => sub { $opts{proxy}  = "" },
    'nostrict|S' => sub { $opts{strict} = 0 },
) or pod2usage(1);

# Since 'version|v' => \&print_version and exit cannot tango with tk
if ( $opts{version} ) { print_version(0); }
if ( $opts{help} )    { pod2usage(-exitstatus => 0, -verbose => 1); }
if ( $opts{manual} )  { pod2usage(-exitstatus => 0, -verbose => 2); }

$opts{clive} ||= $ENV{CLIVE_PATH};
find_clive() unless $opts{clive};

get_queue();

# Unbuffer both streams; STDOUT is left as the selected handle.
for my $stream ( \*STDERR, \*STDOUT ) {
    select $stream;
    $| = 1;
}

process_queue();

if ( $opts{all} ) { grab_all(); }
else              { init_gui(); }
125 ## Subroutines: Connection
# Create the shared curl handle ($curl) and apply the session-wide
# options: user agent, optional proxy, optional verbose output, redirect
# following, automatic referer, and body-only output.
sub init_curl {
    $curl = WWW::Curl::Easy->new;

    # Each entry is [option, value, whether to apply it]; the order of
    # the setopt calls matches the entry order.
    my @settings = (
        [ CURLOPT_USERAGENT,      $opts{agent} || "Mozilla/5.0", 1 ],
        [ CURLOPT_PROXY,          $opts{proxy}, defined $opts{proxy} ],
        [ CURLOPT_VERBOSE,        1,            $opts{debug} ],
        [ CURLOPT_FOLLOWLOCATION, 1,            1 ],
        [ CURLOPT_AUTOREFERER,    1,            1 ],
        [ CURLOPT_HEADER,         0,            1 ],
        [ CURLOPT_NOBODY,         0,            1 ],
    );

    foreach my $setting ( @settings ) {
        my ($option, $value, $wanted) = @$setting;
        next unless $wanted;
        $curl->setopt($option, $value);
    }
}
# Fetch $url with the shared curl handle, collecting the body in an
# in-memory file.
# Returns a list: (curl return code, the in-memory filehandle, body).
# The caller is expected to close the returned filehandle.
sub fetch_page {
    my $url  = shift;
    my $resp = 0;

    open my $fh, ">", \$resp;

    $curl->setopt(CURLOPT_URL, $url);
    $curl->setopt(CURLOPT_ENCODING, "");
    $curl->setopt(CURLOPT_WRITEDATA, $fh);

    my $rc = $curl->perform;

    return ($rc, $fh, $resp);
}
151 ## Subroutines: Queue
# Build the URL queue (@queue) from, in this order: the clipboard (with
# --paste), the command-line arguments, and -- only if nothing has been
# queued by then -- standard input. Duplicate URLs are dropped while
# keeping the order in which the URLs were first given.
sub get_queue {
    if ( $opts{paste} ) {
        unless ( $opted_mods{Clipboard} ) {
            print STDERR "error: Clipboard module not found\n";
            exit 1;
        }
        my $data = Clipboard->paste();
        if ( $data ) {
            # Clipboard text may use CRLF line ends; split on either.
            parse_input($_) foreach split /\r?\n/, $data;
        }
    }

    parse_input($_) foreach @ARGV;

    unless ( @queue ) {
        while ( my $line = <STDIN> ) {
            parse_input($line);
        }
    }

    # Remove duplicates; a plain "keys %h" would scramble the order.
    my %seen;
    @queue = grep { !$seen{$_}++ } @queue;
}
# Fetch every URL in @queue and hand each successfully fetched page to
# scan_page(). Failures are reported on STDERR and do not stop the loop.
sub process_queue {
    init_curl();

    foreach my $url ( @queue ) {
        print "Fetching $url ..." unless $opts{quiet};

        my ($rc, $fh, $resp) = fetch_page($url);
        my $errmsg;

        if ( $rc == 0 ) {
            # Transfer succeeded; now check the response code.
            my $status = $curl->getinfo(CURLINFO_RESPONSE_CODE);
            if ( $status == 0 or $status == 200 ) {
                scan_page($url, \$resp);
            } else {
                # $status is an HTTP status, not a curl error code, so it
                # must not be passed to strerror().
                $errmsg = "unexpected HTTP response (http/$status)";
            }
        } else {
            # $rc is a curl error code here, not an HTTP status.
            $errmsg = $curl->strerror($rc)." (curl/$rc)";
        }

        close $fh;
        print STDERR "\n==> error: $errmsg\n" if $errmsg;
    }
}
# Record $link in @$linksref and print a progress marker: the running
# count on every fifth link, a dot otherwise. Prints nothing when --quiet
# is in effect.
sub _scan_progress {
    my ($linksref, $link) = @_;
    push @$linksref, $link;
    return if $opts{quiet};

    my $count = scalar @$linksref;
    print $count % 5 == 0 ? $count : ".";
}

# Scan an already fetched page for video links, fetch each found video
# page to grab its title, and store the result in %found_queue under the
# SHA-1 of the scanned URL.
#   $scanurl - URL of the scanned page
#   $pageref - reference to the page content (newlines are removed in place)
sub scan_page {
    my ($scanurl, $pageref) = @_;
    print "done.\n" unless $opts{quiet};
    $$pageref =~ tr{\n}//d;

    my $p = HTML::TokeParser->new($pageref);
    $p->get_tag("title");
    my $pagetitle = $p->get_trimmed_text;

    my %re = (
        # in_scanurl: regex used to bind this search pattern to specified
        #   domain. Undefined for embedded link searches. See clivescan(1).
        # search_for: regex used to grab the video ID
        # url_prefix: combined with video ID to construct video page URL
        Youtube => {
            in_scanurl => qr|\Qyoutube.com\E|i,
            search_for => qr|\Q/watch?v=\E(.*?)["< &]|i,
            url_prefix => "http://youtube.com/watch?v=",
        },
        YoutubeEmbed => {
            in_scanurl => undef,
            search_for => qr|\Qyoutube.com/v/\E(.*?)["< &]|i,
            url_prefix => "http://youtube.com/watch?v=",
        },
        GVideo => { # NOTE: Ignores original TLD, uses .com for extraction
            in_scanurl => qr|\Qvideo.google.\E|i,
            search_for => qr|\Q/videoplay?docid=\E(.*?)["< &]|i,
            url_prefix => "http://video.google.com/videoplay?docid=",
        },
        GVideoEmbed => { # NOTE: Ditto.
            in_scanurl => undef,
            search_for => qr|\Q/googleplayer.swf?docid=\E(.*?)["< &]|i,
            url_prefix => "http://video.google.com/videoplay?docid=",
        },
        Metacafe => { # NOTE: metacafe.com/watch/$id is enough for redirect
            in_scanurl => qr|\Qmetacafe.com\E|i,
            search_for => qr|\Q/watch/\E(.*?)/|i,
            url_prefix => "http://metacafe.com/watch/",
        },
        MetacafeEmbed => {
            in_scanurl => undef,
            search_for => qr|\Qmetacafe.com/fplayer/\E(.*?)/|i,
            url_prefix => "http://metacafe.com/watch/",
        },
        SevenLoad => { # NOTE: Ditto. Subdomain can be ignored.
            in_scanurl => qr|\Qsevenload.com\E|i,
            search_for => qr|\Q/videos/\E(.*?)\-|i,
            url_prefix => "http://sevenload.com/videos/",
        },
        SevenLoadEmbed => {
            in_scanurl => undef,
            search_for => qr|\Qsevenload.com/pl/\E(.*?)/|i,
            url_prefix => "http://sevenload.com/videos/",
        },
        Break => {
            in_scanurl => qr|\Qbreak.com\E|i,
            search_for => qr|\Q/index/\E(.*?)["< &]|i,
            url_prefix => "http://break.com/index/",
        },
        # TODO: add BreakEmbed, e.g.:
        # Page URL: http://break.com/index/if-all-movies-had-cell-phones.html
        # Embed URL: http://embed.break.com/600081
    );

    print "=> Scanning page for links " unless $opts{quiet};

    # Walk the patterns in a fixed (sorted) order so that runs are
    # reproducible.
    my @links;
    foreach my $host ( sort keys %re ) {
        my $pattern = $re{$host};

        # In strict mode a host-bound pattern applies only when the
        # scanned URL matches its domain.
        next if $opts{strict}
            and defined $pattern->{in_scanurl}
            and $scanurl !~ /$pattern->{in_scanurl}/;

        while ( $$pageref =~ /$pattern->{search_for}/g ) {
            _scan_progress(\@links, "$pattern->{url_prefix}$1");
        }
    }

    # Weed out duplicates, keeping the order of first appearance.
    my %seen;
    @links = grep { !$seen{$_}++ } @links;

    print "\n=> Found " .scalar @links. " links after removing duplicates.\n"
        unless $opts{quiet};

    my %verified_links;
    foreach my $link ( @links ) {
        print "==> Fetching $link ..." unless $opts{quiet};

        my ($rc, $fh, $resp) = fetch_page($link);
        my $errmsg;

        if ( $rc == 0 ) {
            my $status = $curl->getinfo(CURLINFO_RESPONSE_CODE);
            if ( $status == 0 or $status == 200 ) {
                print "done.\n" unless $opts{quiet};
                # Grab title
                $p = HTML::TokeParser->new(\$resp);
                $p->get_tag("title");
                my $title = $p->get_trimmed_text;
                # Store, prevent link duplicates
                my $sha1 = sha1_hex($link);
                $verified_links{$sha1} = {link => $link, title => $title}
                    unless defined $verified_links{$sha1};
            } else {
                # $status is an HTTP status, not a curl error code, so it
                # must not be passed to strerror().
                $errmsg = "unexpected HTTP response (http/$status)";
            }
        } else {
            # $rc is a curl error code here, not an HTTP status.
            $errmsg = $curl->strerror($rc)." (curl/$rc)";
        }

        close $fh;
        print STDERR "\n==> error: $errmsg\n" if $errmsg;
    }

    $found_queue{ sha1_hex($scanurl) } =
        { title => $pagetitle, url => $scanurl, videos => {%verified_links} };
}
# Collect the link of every verified video of every scanned page in
# %found_queue and pass them all to run_clive() (no GUI involved).
sub grab_all {
    my @q;

    foreach my $page_id ( keys %found_queue ) {
        my %videos = %{ $found_queue{$page_id}{videos} };
        push @q, map { $videos{$_}{link} } keys %videos;
    }

    run_clive(@q);
}
320 ## Subroutines: Helpers
# Normalise one line of user input and append it to @queue.
# Leading/trailing whitespace (including CR/LF) is stripped and empty
# input is ignored. "http://" is prepended unless the input already
# starts with http:// or https:// (checked case-insensitively).
sub parse_input {
    my $url = shift;
    return unless defined $url;

    $url =~ s/^\s+//;
    $url =~ s/\s+$//;
    return if $url eq '';

    # Without the "s?" an https URL would become "http://https://...".
    $url = "http://$url" unless $url =~ m!^https?://!i;
    push @queue, $url;
}
# Locate the 'clive' command by searching the directories listed in
# $PATH (or the current working directory when $PATH is empty) and store
# its path in $opts{clive}. Exits with an error if it cannot be found.
sub find_clive {
    print "Trying to locate 'clive' ...";

    # Skip empty or non-existent PATH entries instead of passing them
    # on to find().
    my @dirs = grep { -d $_ } split /:/, $ENV{PATH} || getcwd;

    my $found;
    if ( @dirs ) {
        find( sub {
                # The first match wins, as in a normal PATH lookup; once
                # found, stop descending any further.
                if ( defined $found ) { $File::Find::prune = 1; return; }
                # Only an executable regular file qualifies, not a
                # directory that happens to be named 'clive'.
                $found = $File::Find::name
                    if $_ eq 'clive' and -f $_ and -x _;
            }, @dirs );
    }
    $opts{clive} = $found if defined $found;

    if ( $opts{clive} ) {
        print "$opts{clive}\n";
    }
    else {
        print STDERR "error: not found, use --clive=path\n";
        exit 1;
    }
}
# Run clive(1) on the given video page URLs.
# $opts{clive} and $opts{opts} are split into words with shell-like
# quoting rules; the URLs are passed as separate arguments and never go
# through a shell, because they originate from scanned web pages and may
# contain shell metacharacters.
# Returns the value of system(), or -1 if no command is configured.
sub run_clive {
    my (@q) = @_;

    require Text::ParseWords;

    my @cmd = map  { Text::ParseWords::shellwords($_) }
              grep { defined $_ and length $_ }
              ( $opts{clive}, $opts{opts} );

    unless ( @cmd ) {
        print STDERR "error: clive command not specified, use --clive=path\n";
        return -1;
    }

    # The indirect-object form guarantees that no shell is involved,
    # even when the argument list has a single element.
    return system { $cmd[0] } @cmd, @q;
}
# Build the version/about text.
#   $noexit - when true, return the text instead of printing it
# When $noexit is false, the text is printed to STDOUT and the program
# exits.
sub print_version {
    my $noexit  = shift;
    my $perl_v  = sprintf "%vd", $^V;
    my $clipb_v = $opted_mods{Clipboard} ? $Clipboard::VERSION : "-";

    # Plain interpolation; the text must not be used as a sprintf format,
    # or any '%' in an interpolated value would be read as a directive.
    my $s = <<"EOT";
clivescan version $VERSION. Copyright (C) 2008 Toni Gundogdu.

Perl: $perl_v ($^O)
Modules:
* Config::Tiny/$Config::Tiny::VERSION\t\t* WWW::Curl/$WWW::Curl::VERSION
* Tk/$Tk::VERSION\t\t\t* Tk::Tree/$Tk::Tree::VERSION
* Tk::DialogBox/$Tk::DialogBox::VERSION\t\t* Clipboard/$clipb_v
* Tk::FontDialog/$Tk::FontDialog::VERSION
Core modules:
* Getopt::Long/$Getopt::Long::VERSION\t\t* Digest::SHA/$Digest::SHA::VERSION
* File::Spec/$File::Spec::VERSION\t\t* File::Find/$File::Find::VERSION
* File::Path/$File::Path::VERSION\t\t* Encode/$Encode::VERSION
* Pod::Usage/$Pod::Usage::VERSION\t\t* Cwd/$Cwd::VERSION

This program comes with ABSOLUTELY NO WARRANTY. You may redistribute copies of
clivescan under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version. You should have received a copy of the General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
EOT

    return $s if $noexit;
    print $s;
    exit;
}
377 # GUI:
# Build the main window and enter the Tk main loop. Does nothing if no
# page was scanned successfully.
# Layout: the upper pane holds the tree of found links with the "Grab"
# buttons; the lower pane holds the tree of queued links with the
# "Remove", "Clear" and "Extract videos..." buttons.
sub init_gui {
    return if keys %found_queue == 0;

    $mw = MainWindow->new;
    $mw->geometry($opts{geometry}) if defined $opts{geometry};
    $mw->title('clivescan');
    $mw->protocol('WM_DELETE_WINDOW', sub { save_prefs(); exit; });

    # Menubar
    my $mb = $mw->Menu;
    $mw->configure(-menu => $mb);

    # Menu: File
    my $file = $mb->cascade(-label => 'File', -underline => 0, -tearoff => 0);
    $file->command(-label => 'Extract videos in queue...',
        -underline => 0, -command => \&on_extract);
    $file->separator;
    $file->command(-label => 'Quit', -underline => 0,
        -command => sub { save_prefs(); exit; } );

    # Menu: Edit
    my $edit = $mb->cascade(-label => 'Edit', -underline => 0, -tearoff => 0);
    $edit->command(-label => 'Preferences...',
        -underline => 0, -command => \&on_prefs);

    # Menu: Help
    my $help = $mb->cascade(-label => 'Help', -underline => 0, -tearoff => 0);
    $help->command(-label => 'About...',
        -underline => 0, -command => \&on_about);

    # The GUI has an upper and a lower part
    $pwmain = $mw->Panedwindow(-orient => 'v', -opaqueresize => 0);

    # Upper part
    $pwtop = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);

    # Upper: Channels
    my $lbar = $pwtop->Frame;

    $lbtlink = $lbar->Scrolled('Tree',
        -scrollbars => 'osoe',
        -itemtype   => 'text',
        -selectmode => 'extended',
        -indicator  => 1,
        -drawbranch => 1,
    )->pack(-side => 'top', -expand => 1, -fill => 'both');

    for my $i ( keys %found_queue ) {
        # Fall back to the scanned URL when the page had no title, so
        # that the entry path is never empty.
        my $scantitle = $found_queue{$i}{title};
        $scantitle = $found_queue{$i}{url}
            unless defined $scantitle and length $scantitle;
        # '.' is the path separator of the tree; strip it.
        $scantitle =~ tr{.}//d;

        # Two scanned pages may share a title. Adding the same path twice
        # is an error, so reuse the existing node in that case.
        unless ( $lbtlink->infoExists($scantitle) ) {
            $lbtlink->add($scantitle);
            $lbtlink->itemCreate($scantitle, 0,
                -text => $scantitle, -itemtype => 'text');
        }

        for my $j ( keys %{$found_queue{$i}{videos}} ) {
            my %video = %{$found_queue{$i}{videos}{$j}};

            my $title = $video{title};
            $title = '' unless defined $title;
            $title =~ tr{.}//d;

            # Append a running number until the path is unique.
            my $path;
            for ( my $k=0;; ++$k ) {
                $path = "$scantitle.$title (#$k)";
                last unless $lbtlink->infoExists($path);
            }

            $lbtlink->add($path, -data => {%video});
            $lbtlink->itemCreate($path, 0,
                -text => $title, -itemtype => 'text');
        }
    }
    $lbtlink->autosetmode;
    $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );

    my $rbar = $pwtop->Frame; # Button toolbar
    $rbar->Button(-text => 'Grab', -command => \&on_grab
        )->pack(-fill => 'x');

    $rbar->Button(-text => 'Grab everything', -command => \&on_grab_all
        )->pack(-fill => 'x');

    $pwtop->add($lbar, $rbar, -width => $opts{pwtop} || 200);

    # Lower part
    $pwbottom = $pwmain->Panedwindow(-orient => 'h', -opaqueresize => 0);

    $lbtqueue = $pwbottom->Scrolled('Tree',
        -scrollbars => 'osoe',
        -itemtype   => 'text',
        -selectmode => 'extended',
        -indicator  => 1,
        -drawbranch => 1,
    );

    my $bar = $pwbottom->Frame; # Button toolbar

    $bar->Button(-text => 'Remove', -command => \&on_remove
        )->pack(-fill => 'x');

    $bar->Button(-text => 'Clear', -command => \&on_clear
        )->pack(-fill => 'x');

    $bar->Button(-text => 'Extract videos...', -command => \&on_extract
        )->pack(-fill => 'x', -side => 'bottom');

    $pwbottom->add($lbtqueue, $bar, -width => $opts{pwbottom} || 200);

    # Add upper and lower parts to main paned window
    $pwmain->add($pwtop, $pwbottom, -height => $opts{pwmain} || 200);

    $mw->RefontTree(-font => $opts{mainfont});
    $pwmain->pack(-expand => 1, -fill => 'both');

    MainLoop;
}
# Write the GUI preferences (window geometry, sash positions and main
# font) to $PREFSFILE, creating $CONFIGDIR first if necessary.
sub save_prefs {
    mkpath( [$CONFIGDIR], 1, 0700 );

    # Each stored sash position is the reported coordinate minus 7.
    my %gui = (
        geometry => $mw->geometry(),
        pwmain   => ($pwmain->sashCoord(0))[1] - 7,
        pwtop    => ($pwtop->sashCoord(0))[0] - 7,
        pwbottom => ($pwbottom->sashCoord(0))[0] - 7,
        mainfont => $opts{mainfont},
    );

    my $c = Config::Tiny->new;
    $c->{gui} = \%gui;

    $c->write($PREFSFILE);
}
# Accept the preferences dialog: remember the chosen main font, apply it
# to the whole widget tree and save the preferences.
sub on_prefs_ok {
    my ($font) = @_;

    $opts{mainfont} = $font;
    $mw->RefontTree(-font => $font);
    save_prefs();
}
# Copy one video entry from the found-links tree to the queue tree.
# $path is an entry path of the form "<page>.<video>"; paths without a
# dot (page-level entries) and entries already queued are ignored.
# The parent (page) node is created in the queue tree on demand.
sub queue_item {
    my $path = shift;

    return unless $path =~ /\./;
    return if $lbtqueue->infoExists($path);

    my %video    = %{ $lbtlink->infoData($path) };
    my ($parent) = split /\./, $path;

    if ( !$lbtqueue->infoExists($parent) ) {
        $lbtqueue->add($parent);
        $lbtqueue->itemCreate($parent, 0,
            -itemtype => 'text', -text => $parent);
    }

    $lbtqueue->add($path, -data => {%video});
    $lbtqueue->itemCreate($path, 0,
        -itemtype => 'text', -text => $video{title});
}
# "Grab" button: queue every entry selected in the found-links tree.
sub on_grab {
    for my $path ( $lbtlink->infoSelection ) {
        queue_item($path);
    }
    $lbtqueue->autosetmode;
}
# "Grab everything" button: queue every video under every page-level
# entry of the found-links tree.
sub on_grab_all {
    for my $top ( $lbtlink->infoChildren("") ) {
        my ($parent) = split /\./, $top;
        for my $child ( $lbtlink->infoChildren($parent) ) {
            queue_item($child);
        }
    }
    $lbtqueue->autosetmode;
}
# "Remove" button: delete the selected entries from the queue tree.
sub on_remove {
    foreach my $path ( $lbtqueue->infoSelection ) {
        # Deleting a page-level entry also removes its children; a child
        # that was selected too no longer exists by the time it is reached.
        next unless $lbtqueue->infoExists($path);
        $lbtqueue->deleteEntry($path);
    }
}
# "Clear" button: remove every entry from the queue tree.
sub on_clear {
    $lbtqueue->deleteAll();
}
# Help/About: show the version text in a dialog with an OK button.
sub on_about {
    my $about = $mw->DialogBox(-title => 'About', -buttons => ['OK']);

    my $text = $about->add('Text');
    $text->pack;
    # print_version(1) returns the text instead of printing and exiting.
    $text->insert('end', print_version(1));

    $about->Show;
}
# Let the user pick a font with a font dialog.
#   $top  - parent widget for the font dialog
#   $lblv - reference to the scalar holding the current font description
#   $lbl  - label widget to re-render in the chosen font
# If a font is chosen, its descriptive name is applied to $lbl and
# stored in $$lblv; otherwise nothing changes.
sub change_font {
    my ($top, $lblv, $lbl) = @_;

    my $font = $top->FontDialog(-initfont => $$lblv)->Show;
    return unless defined $font;

    my $descr = $top->FontDialog->GetDescriptiveFontName($font);
    $lbl->configure(-font => $descr);
    $$lblv = $descr;
}
# Edit/Preferences: show the preferences dialog (currently only the main
# font) and apply the choice when the dialog is closed with OK.
sub on_prefs {
    my $dlg = $mw->DialogBox(
        -title   => 'clivescan preferences',
        -buttons => ['OK','Cancel'],
    );

    my $heading = $dlg->add('Label', -text => 'Fonts: press to choose');
    $heading->grid(-sticky => 'w', -pady => 10);

    # Work on a copy, so that Cancel leaves $opts{mainfont} untouched.
    my $mainfont  = $opts{mainfont};
    my $mainfontl = $dlg->Label(-textvariable => \$mainfont);

    my $button = $dlg->add('Button',
        -text    => 'Main font',
        -command => sub { change_font($dlg, \$mainfont, $mainfontl) },
    );
    $button->grid($mainfontl, -sticky => 'w', -padx => '5');

    on_prefs_ok($mainfont) if $dlg->Show eq 'OK';
}
# "Extract videos...": collect the links of all queued videos, prompt for
# the clive(1) path and runtime options, and on OK close the main window
# and run clive on the links. Does nothing if the queue is empty.
sub on_extract {
    my @q;
    for my $page ( $lbtqueue->infoChildren('') ) {
        for my $entry ( $lbtqueue->infoChildren($page) ) {
            my %video = %{ $lbtqueue->infoData($entry) };
            push @q, $video{link};
        }
    }
    return unless @q;

    # Prompt for clive(1) options
    my $dlg = $mw->DialogBox(
        -title   => 'clive(1) options',
        -buttons => ['OK','Cancel'],
    );

    my $pathlabel = $dlg->add('Label', -text => 'Path to clive');
    my $clivepath = $dlg->Entry(-width => 60);
    $pathlabel->grid($clivepath, -sticky => 'w', -padx => '5');

    my $optslabel = $dlg->add('Label', -text => 'Runtime options');
    my $cliveopts = $dlg->Entry(-width => 60);
    $optslabel->grid($cliveopts, -sticky => 'w', -padx => '5');

    # Pre-fill both entries with the current settings.
    $clivepath->insert('end', $opts{clive});
    $cliveopts->insert('end', $opts{opts});

    return unless $dlg->Show() eq 'OK';

    $opts{clive} = $clivepath->get;
    $opts{opts}  = $cliveopts->get;
    $mw->destroy;
    run_clive(@q);
}
624 __END__
626 =head1 NAME
628 clivescan - the video link scanning utility for clive
630 =head1 SYNOPSIS
632 clivescan [option]... [URL]...
634 =head1 DESCRIPTION
636 clivescan is a utility that scans video pages for video links and
637 uses L<clive(1)> to extract them. The utility scans for video page
638 and embedded video links.
640 Historically, the video link scanning function was part of L<clive(1)>
641 and it was written in Python/Newt. The clivescan utility was written
642 in Perl/Tk to replace the feature that was removed in clive 2.0. This
643 utility is part of the B<clive-utils> project.
645 =head1 OPTIONS
647 You may freely specify options after the command-line arguments. For example:
649 clivescan -a URL --opts=--noextract
651 B<Basic Options>
653 =over 4
655 =item B<-h --help>
657 Show help and exit.
659 =item B<-v --version>
661 Show version and exit.
663 =item B<--clive=>I<path>
665 I<path> to L<clive(1)> command. If unspecified, clivescan will attempt to
666 locate it in the $PATH. Additionally, the B<CLIVE_PATH> environment variable
667 can be used. See also L</CONFIG>.
669 =item B<--opts=>I<opts>
671 I<opts> to append to clive call. See L<clive(1)> for more on the available
672 options.
674 =item B<-a --all>
676 Grab all videos without prompting the GUI.
678 =item B<-S --nostrict>
680 This may come as a shock but clivescan is B<not> a perfect utility. This
681 option was added as a workaround for some search pattern issues that are
682 known to occur in some cases.
684 When searching for break.com videos, for example, clivescan looks for the
685 "/index/" pattern. This is, unfortunately, a fairly common string to be
686 found anywhere on the web -- like in the Youtube front page. To demonstrate
687 this, try:
689 % clivescan --nostrict "http://youtube.com"
691 Let's delve deeper into this. Observe.
693 % clivescan "http://video.google.com/videosearch?q=inurl%3Abreak"
695 Returns no break.com videos even though it should. clivescan defaults to
696 "strict" host binding, meaning that it makes sure that the B<scan URL>
697 contains a hard-coded B<domain pattern>, or "break.com" in this case.
699 % clivescan "http://break.com"
701 Hence the above works as expected. To fix the I<original> issue with the
702 video.google.com URL, you need to use the B<--nostrict> option:
704 % clivescan -S "http://video.google.com/videosearch?q=inurl%3Abreak"
706 This causes clivescan to ignore the B<domain pattern>.
708 Using domains in the search patterns would have been a sound idea but this
709 has been made impossible by the video hosts that often refer to their
710 video pages using local paths (e.g. href="/watch?v=$id").
712 It is obviously not an elegant solution, and is even likely to cause other issues
713 when scanning multiple sources at once. Should someone come up with a better
714 solution, please see L</AUTHOR> for contact details.
716 =back
718 B<HTTP Options>
720 =over 4
722 =item B<-U --agent=>I<string>
724 Identify as I<string> to the HTTP server. Defaults to "Mozilla/5.0".
726 =item B<-y --proxy=>I<address>
728 Use I<address> for HTTP proxy, e.g. http://foo:1234. If http_proxy
729 environment variable is defined, it will be used.
731 =item B<-X --noproxy>
733 Do not use the defined HTTP proxy (B<--proxy>, config or http_proxy).
735 =back
737 =head1 EXAMPLES
739 =over 4
741 =item % clivescan youtube.com video.google.com
743 Scans both the Youtube and GoogleVideo front pages for video links.
745 =item % cat E<gt>E<gt> url.lst
747 http://video.google.com
748 http://youtube.com/communitychannel
749 http://sevenload.com
750 http://break.com
752 =item % cat url.lst | clivescan
754 Reads input from UNIX pipe.
756 =item % clivescan --opts="-f mp4"
758 Appends the I<opts> to the L<clive(1)> call.
760 =item % clivescan --all http://youtube.com
762 Grabs all found videos from the Youtube front page.
764 =back
766 =head1 FILES
768 By default, clivescan searches the ~/.config/clivescan directory for the
769 config file. The B<CLIVESCAN_CONFIGDIR> environment variable can be used
770 to override this behaviour.
772 =over 4
774 =item ~/.config/clivescan/config
776 Configuration file.
778 =item ~/.config/clivescan/prefs
780 GUI preferences (e.g. fonts, window position, sash coords, ...).
782 =back
784 =head1 CONFIG
786 ## Example config file for clivescan.
788 [clive]
789 path = /usr/local/bin/clive
790 opts = -f mp4
792 [http]
793 agent = Mozilla/5.0
794 proxy = http://foo:1234
796 =head1 SEE ALSO
798 L<clive(1)> L<clivefeed(1)>
800 =head1 OTHER
802 Project: http://googlecode.com/p/clive-utils/
804 A clive-utils development repository can be obtained from:
806 % git clone git://repo.or.cz/clive-utils.git
808 Patches welcome.
810 =head1 AUTHOR
812 Written by Toni Gundogdu <legatvs@gmail.com>
814 =cut