2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clivescan, the video link scanning utility for clive
6 # Copyright (c) 2008-2009 Toni Gundogdu <legatvs@gmail.com>
8 # Permission to use, copy, modify, and distribute this software for any
9 # purpose with or without fee is hereby granted, provided that the above
10 # copyright notice and this permission notice appear in all copies.
12 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 ###########################################################################
24 use constant VERSION
=> "2.1.7";
26 binmode( STDOUT
, ":utf8" );
27 use Getopt
::Long
qw(:config bundling);
28 use WWW
::Curl
::Easy
4.05;
29 use File
::Find
qw(find);
35 my %opted_mods = ( Clipboard
=> 1, FontDialog
=> 1 );
37 $opted_mods{Clipboard
} = 0 if $@
;
39 my $CONFIGDIR = $ENV{CLIVESCAN_HOME
}
40 || File
::Spec
->catfile( $ENV{HOME
}, ".config/clive-utils" );
41 my $CONFIGFILE = File
::Spec
->catfile( $CONFIGDIR, "config" );
42 my $PREFSFILE = File
::Spec
->catfile( $CONFIGDIR, "scan.prefs" );
43 my $RECALLFILE = File
::Spec
->catfile( $CONFIGDIR, "scan.recall" );
44 my $SELECTFILE = File
::Spec
->catfile( $CONFIGDIR, "scan.sel" );
47 my @queue; # current URL queue
48 my %found_queue; # results of the scanned video page links
49 my $curl; # curl handle (reused through lifespan)
50 my $mw; # main window handle (GUI)
51 my $pwmain; # handle to the main paned window
52 my $pwtop; # handle to the top paned window
53 my $pwbottom; # handle to the bottom paned window
54 my $lbtlink; # handle to the listbox tree of found links
55 my $lbtqueue; # handle to the listbox tree of queued links
58 my $conf = Config
::Tiny
->read($CONFIGFILE);
59 my $prefs = Config
::Tiny
->read($PREFSFILE);
61 clive
=> $conf->{clive
}->{path
},
62 opts
=> $conf->{clive
}->{opts
},
63 agent
=> $conf->{http
}->{agent
},
64 proxy
=> $conf->{http
}->{proxy
},
65 geometry
=> $prefs->{gui
}->{geometry
},
66 pwmain
=> $prefs->{gui
}->{pwmain
},
67 pwtop
=> $prefs->{gui
}->{pwtop
},
68 pwbottom
=> $prefs->{gui
}->{pwbottom
},
69 mainfont
=> $prefs->{gui
}->{mainfont
},
73 $opts{mainfont
} = $opts{mainfont
} || "{helvetica} -12 bold";
77 'debug|d', 'help|h', 'all|a', 'agent|U=s', 'proxy|y=s',
78 'paste|p', 'quiet|q', 'clive|c=s', 'opts|o=s',
79 'recall|r', 'selected|s',
80 'version|v' => \
&print_version
,
82 # Workaround since '$longopt|shortopt' is a no-no.
83 'no-proxy|X' => sub { $opts{proxy
} = "" },
84 'no-strict|n' => sub { $opts{strict
} = 0 },
89 Pod
::Usage
::pod2usage
( -exitstatus
=> 0, -verbose
=> 1 );
94 ## Subroutines: Connection
97 $curl = WWW
::Curl
::Easy
->new;
98 $curl->setopt( CURLOPT_USERAGENT
, $opts{agent
} || "Mozilla/5.0" );
99 $curl->setopt( CURLOPT_PROXY
, $opts{proxy
} ) if defined $opts{proxy
};
100 $curl->setopt( CURLOPT_VERBOSE
, 1 ) if $opts{debug
};
101 $curl->setopt( CURLOPT_FOLLOWLOCATION
, 1 );
102 $curl->setopt( CURLOPT_AUTOREFERER
, 1 );
103 $curl->setopt( CURLOPT_HEADER
, 0 );
104 $curl->setopt( CURLOPT_NOBODY
, 0 );
108 my ( $url, $resp, $rc ) = ( shift, 0, 0 );
109 open my $fh, ">", \
$resp;
111 $curl->setopt( CURLOPT_URL
, $url );
112 $curl->setopt( CURLOPT_ENCODING
, "" );
113 $curl->setopt( CURLOPT_WRITEDATA
, $fh );
114 $rc = $curl->perform;
116 return ( $rc, $fh, decode_utf8
($resp) );
119 ## Subroutines: Queue
122 if ( $opts{recall
} and -e
$RECALLFILE ) {
123 if ( open my $fh, "<$RECALLFILE" ) {
124 parse_input
($_) while (<$fh>);
128 print STDERR
"error: $RECALLFILE: $!\n";
132 if ( $opts{paste
} ) {
133 print STDERR
"error: Clipboard module not found\n" and exit
134 unless $opted_mods{Clipboard
};
135 my $data = Clipboard
->paste();
137 parse_input
($_) foreach split /\n/, $data;
141 parse_input
($_) foreach @ARGV;
143 if ( scalar(@queue) == 0 && scalar( @ARGV == 0 ) ) {
144 parse_input
($_) while <STDIN
>;
147 write_last_file
( $RECALLFILE, @queue );
153 require HTML
::TokeParser
;
157 print "fetch $_ ..." unless $opts{quiet
};
158 my ( $rc, $fh, $resp, $errmsg ) = fetch_page
($_);
160 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE
);
161 if ( $rc == 0 or $rc == 200 ) {
162 scan_page
( $_, \
$resp );
165 $errmsg = $curl->strerror($rc) . " (http/$rc)";
169 $errmsg = $curl->strerror($rc) . " (http/$rc)";
172 print STDERR
"\nerror: $errmsg\n" if $errmsg;
177 my ( $scanurl, $pageref ) = @_;
178 print "done.\n" unless $opts{quiet
};
179 $$pageref =~ tr
{\n}//d;
181 my $p = HTML
::TokeParser
->new($pageref);
182 $p->get_tag("title");
183 my $pagetitle = $p->get_trimmed_text;
189 # in_scanurl: regex used to bind this search pattern to specified
190 # domain. Undefined for embedded link searches. See clivescan(1).
191 # search_for: regex used to grab the video ID
192 # url_prefix: combined with video ID to construct video page URL
194 # NOTE: We're not using domains in the search patterns because
195 # most of the supported hosts refer to their videos using local
196 # paths, e.g. <a href="/watch?v=...">.
198 in_scanurl
=> qr
|youtube
(.*).com
|i
,
199 search_for
=> qr
|\Q
/watch?v
=\E
(.*?
)["< &#%]|i,
200 url_prefix => "http
://youtube
.com
/watch?v
=",
204 search_for => qr|.com/v/(.*?)["< &#%]|i,
205 url_prefix
=> "http://youtube.com/watch?v=",
207 GVideo
=> { # NOTE: Ignores original TLD, uses .com for extraction
208 in_scanurl
=> qr
|video
.google
.|i
,
209 search_for
=> qr
|\Q
/videoplay?docid
=\E
(.*?
)["< &#%]|i,
210 url_prefix => "http
://video
.google
.com
/videoplay?docid
=",
212 GVideoEmbed => { # NOTE: Ditto.
214 search_for => qr|\Q/googleplayer.swf?docid=\E(.*?)["< &#%]|i,
215 url_prefix
=> "http://video.google.com/videoplay?docid=",
218 # Metacafe => { # NOTE: metacafe.com/watch/$id is enough for redirect
219 # in_scanurl => qr|metacafe.com|i,
220 # search_for => qr|\Q/watch/\E(.*?)/|i,
221 # url_prefix => "http://metacafe.com/watch/",
224 # in_scanurl => undef,
225 # search_for => qr|\Qmetacafe.com/fplayer/\E(.*?)/|i,
226 # url_prefix => "http://metacafe.com/watch/",
228 SevenLoad
=> { # NOTE: Ditto. Subdomain can be ignored.
229 in_scanurl
=> qr
|sevenload
.com
|i
,
230 search_for
=> qr
|\Q
/videos/\E
(.*?
)\
-|i
,
231 url_prefix
=> "http://sevenload.com/videos/",
235 search_for
=> qr
|\Qsevenload
.com
/pl
/\E
(.*?
)/|i
,
236 url_prefix
=> "http://sevenload.com/videos/",
238 LastfmYoutube
=> { # Lastfm wraps some of the Youtube videos
239 in_scanurl
=> qr
|last.fm
|i
,
240 search_for
=> qr
|\Q
/+videos/\E\Q
+1-\E
(.*?
)["< &#%]|i,
241 url_prefix => "http
://youtube
.com
/watch?v
=",
244 in_scanurl => qr|break.com|i,
245 search_for => qr|\Q/index/\E(.*?)["< &#%]|i,
246 url_prefix
=> "http://break.com/index/",
249 # TODO: add BreakEmbed, e.g.:
250 # Page URL: http://break.com/index/if-all-movies-had-cell-phones.html
251 # Embed URL: http://embed.break.com/600081
253 in_scanurl
=> qr
|liveleak
.com
|i
,
254 search_for
=> qr
|\Q
/view?i
=\E
(.*?
)["< &#%]|i,
255 url_prefix => "http
://liveleak
.com
/view?i
=",
259 url_prefix => "http
://liveleak
.com
/view?i
=",
260 search_for => qr|\Qliveleak.com/e/\E(.*?)["< &#%]|i,
264 print "scan " unless $opts{quiet
};
267 my ( $linksref, $link ) = @_;
268 push @
$linksref, $link;
269 unless ( $opts{quiet
} ) {
270 if ( scalar(@
$linksref) % 5 == 0 ) {
271 print scalar(@
$linksref);
278 while ( my $host = each(%re) ) {
279 if ( defined $re{$host}{in_scanurl
} and $opts{strict
} ) {
280 next unless $scanurl =~ /$re{$host}{in_scanurl}/;
282 _scan_progress
( \
@links, "$re{$host}{url_prefix}$1" )
283 while ( $$pageref =~ /$re{$host}{search_for}/g );
286 print "\nremove duplicates ..." unless $opts{quiet
};
288 my %h = map { $_, 1 } @links; # Weed out duplicates
291 print " found " . scalar @links . " unique link(s).\n"
295 foreach my $link (@links) {
296 print "fetch $link ..." unless $opts{quiet
};
297 my ( $rc, $fh, $resp, $errmsg ) = fetch_page
($link);
299 $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE
);
300 if ( $rc == 0 or $rc == 200 ) {
301 print "done.\n" unless $opts{quiet
};
304 $p = HTML
::TokeParser
->new( \
$resp );
305 $p->get_tag("title");
306 my $title = $p->get_trimmed_text;
308 # Store, skip if link exists already
309 my $sha1 = Digest
::SHA
::sha1_hex
($link);
311 $verified_links{$sha1} = { link => $link, title
=> $title }
312 unless defined $verified_links{$sha1};
315 $errmsg = $curl->strerror($rc) . " (http/$rc)";
319 $errmsg = $curl->strerror($rc) . " (http/$rc)";
322 print STDERR
"\nerror: $errmsg\n" if $errmsg;
325 if ( $pagetitle and scalar keys %verified_links > 0 ) {
326 $found_queue{ Digest
::SHA
::sha1_hex
($scanurl) } = {
329 videos
=> {%verified_links}
336 for my $i ( keys %found_queue ) {
337 my %videos = %{ $found_queue{$i}{videos
} };
338 for my $j ( keys %videos ) {
339 push @q, $videos{$j}{link};
345 ## Subroutines: Helpers
348 $opts{clive
} = $opts{clive
} || $ENV{CLIVE_PATH
};
349 find_clive
() unless $opts{clive
};
351 if ( $opts{selected
} and -e
$SELECTFILE ) {
352 if ( open my $fh, "<$SELECTFILE" ) {
353 parse_input
($_) while (<$fh>);
358 print STDERR
"error: $SELECTFILE: $!\n";
365 $| = 1; # => unbuffered
371 unless ( $opts{all
} ) { init_gui
(); }
376 sub write_last_file
{
377 my ( $file, @queue ) = @_;
378 if ( open my $fh, ">$file" ) {
379 print( $fh "$_\n" ) foreach @queue;
383 print STDERR
"error: $file: $!\n";
390 return if $url =~ /^$/;
393 $url = "http://$url" if $url !~ m!^http://!i;
398 print "locate clive ..." unless $opts{quiet
};
403 $opts{clive
} = $File::Find
::name
404 if ( $_ eq 'clive' );
407 $ENV{PATH
} || Cwd
::getcwd
410 if ( $opts{clive
} ) { print "$opts{clive}\n" unless $opts{quiet
}; }
412 print STDERR
"\nerror: not found, use --clive=path\n";
420 write_last_file
( $SELECTFILE, @q );
424 print STDERR
"error: fork failed: $!\n";
427 elsif ( $pid != 0 ) {
428 exec "$opts{clive} $opts{opts} " . join( ' ', @q )
429 or print STDERR
"error: exec failed: $!\n" and exit(1);
435 my $perl_v = sprintf( "--with-perl=%vd", $^V
);
437 sprintf( "clivescan version %s with WWW::Curl version "
438 . "$WWW::Curl::VERSION [%s].\n"
439 . "Copyright (c) 2008-2009 Toni Gundogdu "
440 . "<legatvs\@gmail.com>.\n\n",
443 $str .= "\t$perl_v\n\t";
445 eval "require Tk::FontDialog";
446 $opted_mods{FontDialog
} = 0 if $@
;
449 while ( my ( $key, $value ) = each(%opted_mods) ) {
450 $str .= sprintf( "--with-$key=%s ", $value ?
"yes" : "no" );
451 $str .= "\n" if ( ++$i % 2 == 0 );
454 "\nclivescan is licensed under the ISC license which is "
455 . "functionally\nequivalent to the 2-clause BSD licence.\n"
456 . "\tReport bugs to <http://code.google.com/p/clive-utils/issues/>.\n";
457 return $str if $noexit;
465 return if scalar keys %found_queue == 0;
469 require Tk
::DialogBox
;
470 eval "require Tk::FontDialog";
471 $opted_mods{FontDialog
} = 0 if $@
;
473 $mw = MainWindow
->new;
474 $mw->geometry( $opts{geometry
} ) if defined $opts{geometry
};
475 $mw->title('clivescan');
476 $mw->protocol( 'WM_DELETE_WINDOW', sub { save_prefs
(); $mw->destroy } );
480 $mw->configure( -menu
=> $mb );
483 my $file = $mb->cascade( -label
=> '~File', -tearoff
=> 0 );
485 -label
=> '~Extract videos in queue...',
486 -command
=> \
&on_extract
491 -command
=> sub { save_prefs
(); $mw->destroy }
495 if ( $opted_mods{FontDialog
} ) {
496 my $edit = $mb->cascade( -label
=> '~Edit', -tearoff
=> 0 );
498 -label
=> 'Prefere~nces...',
499 -command
=> \
&on_prefs
504 my $help = $mb->cascade( -label
=> '~Help', -tearoff
=> 0 );
506 -label
=> '~About...',
507 -command
=> \
&on_about
510 # The GUI has an upper and a lower part
511 $pwmain = $mw->Panedwindow( -orient
=> 'v', -opaqueresize
=> 0 );
514 $pwtop = $pwmain->Panedwindow( -orient
=> 'h', -opaqueresize
=> 0 );
517 my $lbar = $pwtop->Frame;
519 $lbtlink = $lbar->Scrolled(
521 -scrollbars
=> 'osoe',
523 -selectmode
=> 'extended',
526 )->pack( -side
=> 'top', -expand
=> 1, -fill
=> 'both' );
528 for my $i ( keys %found_queue ) {
529 my $scantitle = $found_queue{$i}{title
};
530 $scantitle =~ tr
{.}//d;
532 $lbtlink->add($scantitle);
533 $lbtlink->itemCreate(
539 for my $j ( keys %{ $found_queue{$i}{videos
} } ) {
540 my %video = %{ $found_queue{$i}{videos
}{$j} };
542 my $title = $video{title
};
546 for ( my $k = 0 ; ; ++$k ) {
547 $path = "$scantitle.$title (#$k)";
548 last unless $lbtlink->infoExists($path);
551 $lbtlink->add( $path, -data
=> {%video} );
552 $lbtlink->itemCreate(
559 $lbtlink->autosetmode;
560 $lbtlink->close($_) foreach ( $lbtlink->infoChildren('') );
562 my $rbar = $pwtop->Frame; # Button toolbar
565 -command
=> \
&on_grab
566 )->pack( -fill
=> 'x' );
569 -text
=> 'Grab everything',
570 -command
=> \
&on_grab_all
571 )->pack( -fill
=> 'x' );
573 $pwtop->add( $lbar, $rbar, -width
=> $opts{pwtop
} || 200 );
576 $pwbottom = $pwmain->Panedwindow( -orient
=> 'h', -opaqueresize
=> 0 );
578 $lbtqueue = $pwbottom->Scrolled(
580 -scrollbars
=> 'osoe',
582 -selectmode
=> 'extended',
587 my $bar = $pwbottom->Frame; # Button toolbar
591 -command
=> \
&on_remove
592 )->pack( -fill
=> 'x' );
596 -command
=> \
&on_clear
597 )->pack( -fill
=> 'x' );
600 -text
=> 'Extract videos...',
601 -command
=> \
&on_extract
602 )->pack( -fill
=> 'x', -side
=> 'bottom' );
604 $pwbottom->add( $lbtqueue, $bar, -width
=> $opts{pwbottom
} || 200 );
606 # Add upper and lower parts to main paned window
607 $pwmain->add( $pwtop, $pwbottom, -height
=> $opts{pwmain
} || 200 );
609 $mw->RefontTree( -font
=> $opts{mainfont
} )
610 if $opted_mods{FontDialog
};
612 $pwmain->pack( -expand
=> 1, -fill
=> 'both' );
619 File
::Path
::mkpath
( [$CONFIGDIR], 0, 0700 );
621 my $c = Config
::Tiny
->new;
622 $c->{gui
}->{geometry
} = $mw->geometry();
623 $c->{gui
}->{pwmain
} = ( $pwmain->sashCoord(0) )[1] - 7;
624 $c->{gui
}->{pwtop
} = ( $pwtop->sashCoord(0) )[0] - 7;
625 $c->{gui
}->{pwbottom
} = ( $pwbottom->sashCoord(0) )[0] - 7;
626 $c->{gui
}->{mainfont
} = $opts{mainfont
};
628 $c->write($PREFSFILE);
632 ( $opts{mainfont
} ) = @_;
633 $mw->RefontTree( -font
=> $opts{mainfont
} );
639 return if $path !~ /\./;
640 return if $lbtqueue->infoExists($path);
642 my %video = %{ $lbtlink->infoData($path) };
643 my ($link) = split /\./, $path;
645 unless ( $lbtqueue->infoExists($link) ) {
646 $lbtqueue->add($link);
647 $lbtqueue->itemCreate(
654 $lbtqueue->add( $path, -data
=> {%video} );
655 $lbtqueue->itemCreate(
657 -text
=> $video{title
},
663 queue_item
($_) foreach ( $lbtlink->infoSelection );
664 $lbtqueue->autosetmode;
668 foreach ( $lbtlink->infoChildren("") ) {
669 my ($parent) = split /\./;
670 queue_item
($_) foreach ( $lbtlink->infoChildren($parent) );
672 $lbtqueue->autosetmode;
676 $lbtqueue->deleteEntry($_) foreach ( $lbtqueue->infoSelection );
680 $lbtqueue->deleteAll;
684 my $dlg = $mw->DialogBox( -title
=> 'About', -buttons
=> ['OK'] );
685 my $txt = $dlg->add( 'Text', -height
=> 9 )->pack;
686 $txt->insert( 'end', print_version
(1) );
691 my ( $top, $lblv, $lbl ) = @_;
692 my $font = $top->FontDialog( -initfont
=> $$lblv )->Show;
694 if ( defined $font ) {
695 my $descr = $top->FontDialog->GetDescriptiveFontName($font);
696 $lbl->configure( -font
=> $descr );
702 my $dlg = $mw->DialogBox(
703 -title
=> 'clivescan preferences',
704 -buttons
=> [ 'OK', 'Cancel' ]
707 $dlg->add( 'Label', -text
=> 'Fonts: press to choose' )
708 ->grid( -sticky
=> 'w', -pady
=> 10 );
710 my ($mainfont) = ( $opts{mainfont
} );
711 my $mainfontl = $dlg->Label( -textvariable
=> \
$mainfont );
715 -text
=> 'Main font',
716 -command
=> sub { change_font
( $dlg, \
$mainfont, $mainfontl ) }
717 )->grid( $mainfontl, -sticky
=> 'w', -padx
=> '5' );
719 on_prefs_ok
($mainfont) if $dlg->Show eq 'OK';
724 foreach ( $lbtqueue->infoChildren('') ) {
725 foreach ( $lbtqueue->infoChildren($_) ) {
726 my %video = %{ $lbtqueue->infoData($_) };
727 push @q, $video{link};
732 # Prompt for clive(1) options
733 my $dlg = $mw->DialogBox(
734 -title
=> 'clive(1) options',
735 -buttons
=> [ 'OK', 'Cancel' ]
738 $dlg->add( 'Label', -text
=> 'Path to clive' )->grid(
739 my $clivepath = $dlg->Entry( -width
=> 60 ),
744 $dlg->add( 'Label', -text
=> 'Runtime options' )->grid(
745 my $cliveopts = $dlg->Entry( -width
=> 60 ),
750 $clivepath->insert( 'end', $opts{clive
} );
751 $cliveopts->insert( 'end', $opts{opts
} );
753 if ( $dlg->Show() eq 'OK' ) {
754 $opts{clive
} = $clivepath->get;
755 $opts{opts
} = $cliveopts->get;
765 clivescan [option]... [URL]...
769 -h, --help print help and exit
770 -v, --version print version and exit
771 -c, --clive=PATH path to clive(1) command
772 -o, --opts=OPTIONS options passed to clive(1) command
773 -a, --all extract all videos without prompting
774 -s, --selected re-extract last video selection
775 -r, --recall recall last input
776 -n, --no-strict work around host specific search pattern issues
777 -p, --paste paste input data from clipboard
778 -U, --agent=STRING identify as STRING to http server
779 -y, --proxy=ADDR use address for http proxy
780 -X, --no-proxy do not use http proxy