Created the public git repo for 2.x+.
[clive.git] / clive
blobf0801729a281716e04a0f2180fcb6ac273b9fc9d
1 #!/usr/bin/env perl
2 # -*- coding: ascii -*-
3 ###########################################################################
4 # clive, the non-interactive video extraction utility
5 # Copyright (C) 2007,2008 Toni Gundogdu.
7 # clive is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # clive is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with clive. If not, see <http://www.gnu.org/licenses/>.
19 ###########################################################################
21 # Keep it simple.
23 use strict;
24 use warnings;
26 binmode(STDOUT, ":utf8");
28 use HTML::TokeParser;
29 use WWW::Curl::Easy;
30 use Config::Tiny;
31 use URI::Escape;
32 use BerkeleyDB;
33 use IO::Pager;
35 # Core modules:
36 use Digest::SHA qw(sha1_hex);
37 use POSIX qw(strftime);
38 use Getopt::Long;
39 use XML::Simple;
40 use File::Path;
41 use File::Spec;
42 use Pod::Usage;
43 use Encode;
44 use Cwd;
46 # Check for non-essential modules: set flags that indicate their availability
47 my %optional_mods = (Clipboard => 1);
48 eval "use Clipboard;"; $optional_mods{Clipboard}=0 if $@;
50 my $VERSION = "2.0beta1";
51 my $HOMEDIR = $ENV{HOME} or die "error: HOME environment variable not set";
52 my $CONFIGDIR = File::Spec->catfile( $HOMEDIR, ".config/clive");
53 my $CONFIGFILE = File::Spec->catfile($CONFIGDIR, "config");
54 my $CACHEFILE = File::Spec->catfile($CONFIGDIR, "cache");
55 my $RECALLFILE = File::Spec->catfile($CONFIGDIR, "recall");
57 my %opts; # Holds runtime options
58 my @queue; # Holds input URLs
59 my $workdir=getcwd; # Holds startup workdir
60 my $logfile; # Holds path to logfile (--output-file, --append-file)
61 my $curl; # Holds the curl handle: reused throughout lifespan
62 my $cache_db; # Holds the handle to cache BDB
63 my %cache; # Holds the handle to cache BDB (tied hash)
64 my $hash; # Hash (SHA1) of the current URL
65 my %entry; # Multi-purpose video (cache) record (hold/read/write)
66 my $youtube_on=0; # Flag: Whether logged into Youtube
67 my $last_bspaces; # Progress: Keeps count of the last printed backspaces
68 my $curr_fn; # Progress: Holds the name of the current video file
69 my $time_started; # Progress: Holds transfer started time
70 my $last_eta; # Progress: Holds last saved ETA for file transfer
72 my $default_showfmt # Default --show format
73 = qq/%D: "%t" | %mMB/;
75 my %re_hosts = ( # Precompiled regex used to identify the host
76 IsYoutube => qr|\Qyoutube.com\E|i, IsGoogle => qr|\Qvideo.google.\E|i,
77 IsSevenload => qr|\Qsevenload.com\E|i, IsBreak => qr|\Qbreak.com\E|i,
78 IsMetacafe => qr|\Qmetacafe.com\E|i
81 # Parse config
82 my $c = Config::Tiny->read($CONFIGFILE);
83 %opts = (
84 agent => $c->{http}->{agent},
85 proxy => $c->{http}->{proxy},
86 maxspeed=> $c->{http}->{maxspeed},
87 minspeed=> $c->{http}->{minspeed},
88 savedir => $c->{output}->{savedir},
89 cclass => $c->{output}->{cclass},
90 fnfmt => $c->{output}->{file},
91 showfmt => $c->{output}->{show},
92 ytuser => $c->{youtube}->{user},
93 ytpass => $c->{youtube}->{pass},
94 play => $c->{commands}->{play},
97 # Parse cmdline
98 # Define those not read from config and init with defaults
99 $opts{quiet} = 0;
100 $opts{paste} = 0;
101 $opts{format} = 'flv';
102 $opts{extract} = 1;
103 $opts{renew} = 0;
104 $opts{clear} = 0;
105 $opts{recall} = 0;
106 $opts{login} = 1;
107 $opts{show} = 0;
108 $opts{grep} = undef;
109 $opts{case} = 1;
110 $opts{delete} = 0;
111 $opts{background} = 0;
112 $opts{output} = undef;
113 $opts{append} = undef;
114 $opts{progress} = 1;
115 $opts{debug} = 0;
116 $opts{help} = 0;
117 $opts{manual} = 0;
118 $opts{version} = 0;
120 Getopt::Long::Configure("bundling");
121 GetOptions(\%opts,
122 'debug|d', 'help|h', 'manual|m', 'version|v',
123 'paste|x', 'show|s', 'delete|D', 'clear|C',
124 'continue|c', 'renew|R', 'recall|r', 'format|f=s',
125 'output|o=s', 'append|a=s', 'background|b', 'quiet|q',
126 'grep|g=s',
127 #'maxspeed!', 'minspeed!',
128 # Since '$longopt!|$shortopt' is a no-no.
129 'ignore-case|i' => sub { $opts{case} = 0; },
130 'noextract|n' => sub { $opts{extract} = 0; },
131 'noplay|P' => sub { $opts{play} = 0; },
132 'nologin|L' => sub { $opts{login} = 0; },
133 'noproxy|X' => sub { $opts{proxy} = ""; },
134 'noprogress|G' => sub { $opts{progress}= 0; },
135 ) or pod2usage(1);
# Informational options: print_version ends with an explicit exit, and the
# pod2usage calls are given -exitstatus => 0.
137 print_version() if $opts{version};
138 pod2usage(-exitstatus => 0, -verbose => 1) if $opts{help};
139 pod2usage(-exitstatus => 0, -verbose => 2) if $opts{manual};
# Tie %cache to the BerkeleyDB file before any cache option is handled.
141 init_cache();
# Cache maintenance modes; both clear_cache and show_cache call exit.
143 if ( $opts{clear} ) { clear_cache(); }
144 elsif ( $opts{show} ) { show_cache(); }
# Fill @queue from the recall file, clipboard, command line, cache grep
# and, when still empty, standard input.
146 get_queue();
148 select STDERR; $| = 1; # Make unbuffered
149 select STDOUT; $| = 1;
# Fork into the background (if requested) before any transfer starts.
151 daemonize() if $opts{background};
153 process_queue();
# Release the cache handle and untie %cache.
155 free_cache();
158 ## Subroutines: Connection
# Apply the curl options shared by every transfer to the global $curl handle.
# Reads $opts{agent}, $opts{debug}, $opts{proxy}, $opts{maxspeed} and
# $opts{minspeed}.
sub set_curl_opts {
    $curl->setopt(CURLOPT_USERAGENT,
        $opts{agent} ? $opts{agent} : "Mozilla/5.0");

    $curl->setopt(CURLOPT_VERBOSE, 1) if $opts{debug};
    # An empty string (--noproxy) is still "defined" and is passed through.
    $curl->setopt(CURLOPT_PROXY, $opts{proxy}) if defined $opts{proxy};
    $curl->setopt(CURLOPT_FOLLOWLOCATION, 1);
    $curl->setopt(CURLOPT_AUTOREFERER, 1);
    $curl->setopt(CURLOPT_HEADER, 1);
    $curl->setopt(CURLOPT_NOBODY, 0);

    # The guard used to test the misspelled key "maxpseed", so this option
    # was never set.
    # NOTE(review): the original comment suspected a WWW::Curl::Easy bug
    # ("No effect"); the typo is the likely cause -- confirm with a transfer.
    $curl->setopt(CURLOPT_MAX_RECV_SPEED_LARGE, $opts{maxspeed})
        if $opts{maxspeed};

    $curl->setopt(CURLOPT_LOW_SPEED_LIMIT, $opts{minspeed})
        if $opts{minspeed};
}
# Log into Youtube with $opts{ytuser} / $opts{ytpass} on the shared $curl
# handle, enable cookies on it, and set the "is_adult" cookie used to bypass
# the age check. Sets $youtube_on on success; prints the error and exits
# when the transfer fails or the login is rejected.
sub auth_youtube {
    print "=> Youtube: Attempting to login as $opts{ytuser} ..."
        unless $opts{quiet};

    my $response = "";
    open my $fh, ">", \$response;

    # Escape the credentials so that characters such as '&', '=' or spaces
    # in the user name or password cannot corrupt the query string.
    my $login_url = "http://youtube.com/login?current_form=loginform"
        . "&username=" . uri_escape( $opts{ytuser} )
        . "&password=" . uri_escape( $opts{ytpass} )
        . "&action_login=log+in";

    $curl->setopt(CURLOPT_URL, $login_url);
    $curl->setopt(CURLOPT_COOKIEFILE, ""); # Enable cookies from here on
    $curl->setopt(CURLOPT_ENCODING, "");   # Supported encodings
    $curl->setopt(CURLOPT_WRITEDATA, $fh);

    my $rc = $curl->perform;
    my $errmsg;

    if ( $rc == 0 ) {
        $errmsg = "error: incorrect login for $opts{ytuser}"
            if $response =~ /login was incorrect/i;
    }
    else {
        $errmsg = "error: ".$curl->strerror($rc)." (http/$rc)";
    }
    close $fh;

    print STDERR "\n$errmsg\n" and exit if $errmsg;

    print "done.\n=> Youtube: Bypassing age check ..." unless $opts{quiet};
    $curl->setopt(CURLOPT_COOKIE, "is_adult=" . uc( sha1_hex(rand()) ) );
    print "Done.\n" unless $opts{quiet};

    $youtube_on = 1;
}
217 # Subroutines: Queue
# Process every URL in @queue: fetch the video page (or load it from the
# cache), resolve the extraction URL, query the file length, download the
# video and, when $opts{play} is set, run the configured player command.
#
# The loop variable is deliberately the global $_ : code reached from this
# loop may match against $_ implicitly, so do not rename it in isolation.
sub process_queue {
    $curl = WWW::Curl::Easy->new;
    set_curl_opts();

    foreach ( @queue ) {
        $hash = sha1_hex($_);

        my ($rc, $rfh, $response) = fetch_page($_);
        my $errmsg;

        if ( $rc != 0 ) {
            $errmsg = $curl->strerror($rc)." (http/$rc)";
        }
        else {
            $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);

            if ( $rc != 0 and $rc != 200 ) {
                $errmsg = $curl->strerror($rc)." (http/$rc)";
            }
            else {
                # page_url is already set when the entry came from the cache.
                if ( ! defined( $entry{page_url} )
                    and process_page($_, \$response, $rfh) == -1 )
                {
                    close $rfh;    # was leaked by the bare "next"
                    next;
                }

                if ( $entry{xurl} ) {
                    my ($query_rc, $content_type) = query_video_length();

                    # Previously "my (...) = extract_video(...) if ...":
                    # a conditional "my" leaves the variables undefined (or
                    # stale) when the condition is false, and the play check
                    # below then compared undef with 0.
                    if ( $query_rc == 0 and $content_type ) {
                        my ($extract_rc, $path)
                            = extract_video($content_type);

                        if ( $opts{play} and $extract_rc == 0 ) {
                            print "=> Play: $path\n";
                            my $cmd = $opts{play};
                            $cmd =~ s/%i/"$path"/;
                            system($cmd.">/dev/null") == 0
                                or die "system exited with $?";
                        }
                    }
                }
            }
        }

        close $rfh;
        print STDERR "\n==> error: $errmsg\n" if $errmsg;
    }
}
# Fetch the video page for $url, or reuse the cached record for $hash.
#
# Logs into Youtube first when credentials are configured and the URL is a
# Youtube one. A cached record is used only when its file_format equals
# $opts{format} and --renew was not given; otherwise %entry is cleared and
# the page is downloaded into an in-memory buffer.
#
# Returns ($rc, $fh, $response): the curl return code (0 when nothing was
# fetched), the in-memory write handle (the caller closes it) and the page
# text.
sub fetch_page {
    my $url        = shift;
    my $from_cache = 0;
    my $response   = "";
    my $rc         = 0;

    open my $fh, ">", \$response;

    # Log into Youtube if username and password are defined
    if ( $opts{ytuser} and $opts{ytpass} and $opts{login} ) {
        auth_youtube() if ! $youtube_on and $url =~ /$re_hosts{IsYoutube}/;
    }

    if ( $cache{$hash} ) {
        fetch_entry($hash); # Make sure cached "format" matches with options
        $from_cache = 1 if $opts{format} eq $entry{file_format};
    }

    $from_cache = 0 if $opts{renew};

    # The URL is passed as an argument rather than interpolated into the
    # format: URLs routinely contain '%' escapes that printf would otherwise
    # interpret as conversion directives.
    printf "%s %s ...", ( $from_cache ? "Caching" : "Fetching" ), $url
        unless $opts{quiet};

    if ( ! $from_cache ) {
        %entry = ();
        $curl->setopt(CURLOPT_URL, $url);
        $curl->setopt(CURLOPT_ENCODING, ""); # Supported encodings
        $curl->setopt(CURLOPT_WRITEDATA, $fh);
        $rc = $curl->perform;
    }

    return ($rc, $fh, $response);
}
# Parse a fetched video page and fill %entry with page_url, xurl,
# page_title, video_id and file_format.
#
# $url is the page URL, $response_ref a reference to the page text (newlines
# are stripped in place) and $response_fh the handle the page was written
# to (only the sevenload handler needs it).
# Returns 0 on success, -1 when no extraction URL, id or title was found.
sub process_page {
    my ($url, $response_ref, $response_fh) = @_;

    print "done.\n=> Processing page ..." unless $opts{quiet};

    $$response_ref =~ tr{\n}//d;

    # Default title: the text of the page's <title> element. Handlers that
    # return a third value replace it.
    my $parser = HTML::TokeParser->new($response_ref);
    $parser->get_tag("title");
    my $title = $parser->get_trimmed_text;

    my ($xurl, $id);

    if ( $url =~ /$re_hosts{IsYoutube}/ ) {
        ($xurl, $id) = handle_youtube($response_ref);
    }
    elsif ( $url =~ /$re_hosts{IsGoogle}/ ) {
        ($xurl, $id) = handle_google($response_ref);
    }
    elsif ( $url =~ /$re_hosts{IsSevenload}/ ) {
        ($xurl, $id, $title)
            = handle_sevenload($response_ref, $response_fh);
    }
    elsif ( $url =~ /$re_hosts{IsBreak}/ ) {
        ($xurl, $id, $title) = handle_break($response_ref);
    }
    elsif ( $url =~ /$re_hosts{IsMetacafe}/ ) {
        ($xurl, $id, $title) = handle_metacafe($response_ref);
    }

    return -1 unless $xurl and $id and $title;

    $title = decode_utf8($title); # sevenload, break grab title from elsewhere
    $title =~ tr{;}//d;           # Cache values cannot contain ';'

    @entry{qw(page_url xurl page_title video_id file_format)}
        = ( $url, $xurl, $title, $id, $opts{format} );

    return 0;
}
# Determine the length and content type of the video at $entry{xurl}.
#
# When $entry{file_length} is not yet known a HEAD request is issued and
# $entry{file_length} / $entry{file_suffix} are filled from the response;
# otherwise the content type is rebuilt from the cached file_suffix.
# Returns ($status, $content_type) with $status 0 on success and -1 on error.
sub query_video_length {
    my ($content_type, $errmsg);

    if ( ! $entry{file_length} ) {
        print "done.\n=> Querying file length ..." unless $opts{quiet};

        $curl->setopt(CURLOPT_URL, $entry{xurl});
        # We're not interested in downloading the file. GET => HEAD request.
        $curl->setopt(CURLOPT_NOBODY, 1);
        my $curl_rc = $curl->perform;
        # Reset HEAD => GET
        $curl->setopt(CURLOPT_HTTPGET, 1);

        if ( $curl_rc != 0 ) {
            # The transport error used to be overwritten by the response
            # code and surfaced as a misleading "server returned HTTP/0".
            $errmsg = $curl->strerror($curl_rc)." (http/$curl_rc)";
        }
        else {
            $entry{file_length} =
                $curl->getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD);

            $content_type =
                $entry{file_suffix} =
                $curl->getinfo(CURLINFO_CONTENT_TYPE);

            # No Content-Type header: avoid matching against undef below.
            $content_type = "" unless defined $content_type;

            my $http_rc = $curl->getinfo(CURLINFO_RESPONSE_CODE);

            if ( $http_rc == 200 ) {
                my $content_ok = 0;

                if ( $content_type =~ m!video/(.*)! ) {
                    $entry{file_suffix} = $1;
                    # e.g. "video/x-flv": keep only the part after the dash
                    if ( $content_type =~ /(.*)-(.*)$/ ) {
                        $entry{file_suffix} = $2;
                    }
                    $content_ok = 1;
                }
                # Break and Metacafe return "text/plain" for Content-Type
                elsif ( $content_type =~ m!text/plain!
                    and $opts{format} eq "flv"
                    and (  $entry{page_url} =~ /$re_hosts{IsBreak}/
                        or $entry{page_url} =~ /$re_hosts{IsMetacafe}/ ) )
                {
                    $entry{file_suffix} = "flv";
                    $content_ok = 1;
                }

                $errmsg = "expected different content-type, "
                    . "received \"$content_type\"" unless $content_ok;
            }
            else {
                $errmsg = "server returned HTTP/$http_rc";
            }
        }
    }
    else { # Construct content-type from cache
        $content_type = "video/$entry{file_suffix}";
    }

    unless ( $opts{quiet} ) {
        if ( ! $errmsg ) { print "done.\n"; }
        else { print STDERR "\n==> error: $errmsg\n"; }
    }

    return ($errmsg ? -1:0, $content_type);
}
# Download the video described by %entry into $opts{savedir} (or the startup
# working directory) and save the cache record for $hash.
#
# $content_type is only used for the informational output.
# Returns ($status, $path): $status is 0 when the file is available for
# playing (downloaded, or already complete on disk) and -1 otherwise
# (--noextract, open failure, transfer error).
sub extract_video {
    my $content_type = shift;
    my $fn        = title_to_filename($entry{page_title});
    my $savedir   = $opts{savedir} || $workdir;
    my $path      = File::Spec->catfile( $savedir, $fn );
    my $filemode  = ">";
    my $cont_from = 0;
    my $remaining = $entry{file_length};
    my $size      = -s $path;
    my $rc        = 0;
    my $errmsg;

    # We have everything for cache. Add/update the bdb entry.
    save_entry($hash);

    $curl->setopt(CURLOPT_ENCODING, "identity"); # Disable

    if ( $size ) {
        if ( $size == $entry{file_length} and $opts{extract} ) {
            print "=> Refusing to extract. "
                . "localfile length matches remotefile length.\n";
            return (0, $path);
        }
        if ( $size < $entry{file_length} and $opts{continue} ) {
            $cont_from = $size;
            $filemode  = ">>";
            $remaining = ($entry{file_length} - $cont_from);
        }
        else {
            $path = newname_if_exists( $savedir, $fn );
        }
    }

    unless ( $opts{quiet} ) {
        print "=> File: $fn\n" if ( ! $opts{extract} );
        print "=> Length: $entry{file_length} ";
        printf "(%.2fMB) ", $entry{file_length}/1024/1024
            if $entry{file_length};
        printf "From: %u (Left: %u) ", $cont_from, $remaining if $cont_from;
        # print, not printf: the content type is data, not a format string.
        print "[$content_type]" if $content_type;
        print "\n";
    }

    # -1 = Disable subsequent play with --noextract
    return (-1, $path) unless $opts{extract};

    # Three-argument open keeps the mode separate from the file name.
    if ( open my $fh, $filemode, $path ) {
        $curl->setopt(CURLOPT_URL, $entry{xurl});
        $curl->setopt(CURLOPT_HEADER, 0); # Disable
        # Always set the offset: the handle is reused for the whole queue,
        # so an offset left from an earlier resumed download must be reset
        # to 0 here.
        $curl->setopt(CURLOPT_RESUME_FROM, $cont_from);
        $curl->setopt(CURLOPT_WRITEDATA, $fh);

        unless ( $opts{quiet} ) {
            $curl->setopt(CURLOPT_PROGRESSFUNCTION, \&progress_callback);
            $curl->setopt(CURLOPT_NOPROGRESS, 0);
            $curr_fn      = $fn;
            $last_bspaces = 0;
            $time_started = time;
            $last_eta     = '';
        }

        $rc = $curl->perform;
        close $fh;

        # Reset
        $curl->setopt(CURLOPT_HEADER, 1);

        if ( $rc == 0 ) { $rc = $curl->getinfo(CURLINFO_RESPONSE_CODE); }
        else { $errmsg = $curl->strerror($rc)." (http/$rc)"; }
    }
    else {
        $errmsg = "$path: $!";
        $rc = -1; # Disable subsequent play
    }

    if ( $errmsg ) { print STDERR "\n==> error: $errmsg\n"; }
    else { print "\n==> Closed with HTTP/$rc.\n" unless $opts{quiet}; }

    return ($errmsg ? -1:0, $path);
}
# Build @queue from every input source: the recall file (--recall), the
# clipboard (--paste), the command line, cache grep matches (--grep) and,
# when nothing was queued so far, standard input. Duplicates are dropped
# and the resulting batch is written to $RECALLFILE.
sub get_queue {
    if ( $opts{recall} and -e $RECALLFILE ) {
        open my $in, "<", $RECALLFILE or die "error: $RECALLFILE: $!";
        while ( my $line = <$in> ) { parse_input($line); }
        close $in;
    }

    if ( $opts{paste} ) {
        print STDERR "error: Clipboard module not found" and exit
            unless $optional_mods{Clipboard};

        # The clipboard is handed over as one string; the manual documents
        # the pasted URLs as newline separated, so split before parsing.
        my $pasted = Clipboard->paste();
        $pasted = "" unless defined $pasted;
        parse_input($_) foreach split /\r?\n/, $pasted;
    }

    parse_input($_) foreach @ARGV;
    grep_cache() if $opts{grep};
    read_stdin() unless @queue;

    # Remove duplicates while keeping the order the URLs were given in.
    my %seen;
    @queue = grep { ! $seen{$_}++ } @queue;

    open my $out, ">", $RECALLFILE or die "error: $RECALLFILE: $!";
    print {$out} "$_\n" foreach @queue;
    close $out;
}
# Feed every line read from standard input to parse_input.
sub read_stdin {
    while ( defined( my $line = <STDIN> ) ) {
        parse_input($line);
    }
}
# Normalise one input URL and append it to @queue when a supported host
# matches.
#
# Empty lines are ignored (returns nothing). Returns 0 when the URL was
# queued and -1, after printing an error, when the host is not supported.
sub parse_input {
    my $url = shift;

    return if $url =~ /^$/;
    chomp $url;

    if ( $url =~ /&srcurl=(.*?)&/ ) { # GVideo: one of many redirects
        # Copy the capture right away; later regex operations must not be
        # able to change what gets unescaped.
        my $redirect = $1;
        unless ( $opts{quiet} ) {
            print "Found redirect in ...".(split /&/,$url)[0]."\n";
            print "=> Using the redirect URL instead\n";
        }
        $url = uri_unescape($redirect);
    }

    # Insert http:// if no scheme was given. https:// is accepted as well;
    # it used to be turned into "http://https://...".
    if ( $url !~ m{^https?://}i ) { $url = "http://$url"; }

    # Translate embedded URL to video page URL
    $url =~ s{/v/}{/watch?v=}ig; # Youtube
    $url =~ s{\Q/googleplayer.swf?docid=}{/videoplay?docid=\E}ig; # GVideo

    # Remove params from the URL NOTE: May require tweaking
    $url = (split /&/, $url) [0];

    # Iterate the patterns only. Flattening %re_hosts also yielded its keys
    # ("IsYoutube", ...), which were then applied as patterns too.
    foreach my $re ( values %re_hosts ) {
        if ( $url =~ /$re/ ) {
            push @queue, $url;
            return 0;
        }
    }

    print STDERR "error: nosupport: $url\n";
    return -1;
}
527 # Subroutines: Video page handlers
# Youtube page handler.
#
# Takes a reference to the page text, extracts the "video_id" and "t"
# values and builds the get_video URL, appending &fmt= for the mp4, 3gpp
# and xflv formats. Returns ($xurl, $id); $xurl is undef (and an error is
# printed) when either value is missing.
sub handle_youtube {
    my ($response_ref, $xurl) = (shift);

    my %re = (
        GrabID => qr/"video_id": "(.*?)"/,
        GrabT  => qr/"t": "(.*?)"/
    );

    # List-context matches instead of "my $x = $1 if ...": a conditional
    # "my" has undefined behaviour when the condition is false.
    my ($id) = $$response_ref =~ /$re{GrabID}/;
    my ($t)  = $$response_ref =~ /$re{GrabT}/;

    if ( $id and $t ) {
        $xurl = "http://youtube.com/get_video?video_id=$id&t=$t";
        my $fmt;
        if    ( $opts{format} eq "mp4" )  { $fmt = 18; }
        elsif ( $opts{format} eq "3gpp" ) { $fmt = 17; }
        elsif ( $opts{format} eq "xflv" ) { $fmt = 6; }
        $xurl .= "&fmt=$fmt" if $fmt;
    }
    else {
        printf STDERR "\nerror: failed to extract &%s\n", $id ? "t":"video_id";
    }

    return ($xurl, $id);
}
# Google Video page handler.
#
# Takes a reference to the page text. When the page redirects to another
# host, the redirect target is pushed onto @queue and no extraction URL is
# returned; otherwise the player's videoUrl (or the mp4 link when
# --format=mp4) is used. Returns ($xurl, $id).
sub handle_google {
    my $response_ref = shift;

    my %re = (
        GrabRedirect => qr|lfRedirect\('(.*?)'|,
        GrabVideoURL => qr|\Qgoogleplayer.swf?videoUrl\x3d\E(.*?)\Q\x26|,
        GrabID       => qr|docid: '(.*?)'|,
        GrabMP4      => qr|\Qhref="http://vp.\E(.*?)"|,
    );

    # List-context matches instead of "my $x = $1 if ...": a conditional
    # "my" has undefined behaviour when the condition is false.
    my ($redir)    = $$response_ref =~ /$re{GrabRedirect}/;
    my ($raw_xurl) = $$response_ref =~ /$re{GrabVideoURL}/;
    my ($id)       = $$response_ref =~ /$re{GrabID}/;
    my ($mp4)      = $$response_ref =~ /$re{GrabMP4}/;

    my $xurl = defined $raw_xurl ? uri_unescape($raw_xurl) : undef;

    if ( $redir ) {
        $redir =~ s{\\x3d}{=};
        push @queue, $redir;
        print "Found a redirect to another host. Pushed into queue.\n"
            unless $opts{quiet};
    }
    else {
        $xurl = $mp4 if ( $mp4 and $opts{format} eq "mp4" );
        print STDERR "\nerror: extraction url not found\n" unless $xurl;
    }

    return ($xurl, $id);
}
# Sevenload page handler.
#
# Takes a reference to the page text and the handle the page was written
# to. Extracts the player's configPath and delegates to
# fetch_sevenload_configxml. Returns ($xurl, $id, $title), all undef when
# configPath is not found.
sub handle_sevenload {
    my ($response_ref, $response_fh) = @_;

    my %re = ( GrabConfigPath => qr|configPath=(.*?)"| );

    # List-context match instead of "my $x = ... if ...": a conditional
    # "my" has undefined behaviour when the condition is false.
    my ($raw_path) = $$response_ref =~ /$re{GrabConfigPath}/;
    my $confpath = defined $raw_path ? uri_unescape($raw_path) : undef;

    my ($id, $xurl, $title);
    if ( $confpath ) {
        ($xurl, $id, $title) =
            fetch_sevenload_configxml($confpath, $response_fh);
    }
    else {
        print STDERR "\nerror: configPath not found\n";
    }

    return ($xurl, $id, $title);
}
# Break.com page handler.
#
# Takes a reference to the page text and builds the media URL from
# ContentFilePath and FileName; the suffix is $opts{format} when that is
# "flv" or "wmv", otherwise "flv". Prints the first extraction failure.
# Returns ($xurl, $id, $title).
sub handle_break {
    my $response_ref = shift;

    my %re = (
        GrabTitle    => qr|id="vid_title" content="(.*?)"|,
        GrabID       => qr|ContentID='(.*?)'|,
        GrabFilePath => qr|ContentFilePath='(.*?)'|,
        GrabFileName => qr|FileName='(.*?)'|
    );

    # List-context matches instead of "my $x = $1 if ...": a conditional
    # "my" has undefined behaviour when the condition is false.
    my ($title) = $$response_ref =~ /$re{GrabTitle}/;
    my ($id)    = $$response_ref =~ /$re{GrabID}/;
    my ($fpath) = $$response_ref =~ /$re{GrabFilePath}/;
    my ($fname) = $$response_ref =~ /$re{GrabFileName}/;

    my ($xurl, $errmsg);
    if ( $fpath and $fname ) {
        $xurl = "http://media1.break.com/dnet/media/$fpath/$fname";
        # Exact comparison: the format used to be interpolated as a regex,
        # so partial strings such as "fl" or "." were accepted.
        my $fmt = $opts{format};
        $fmt = 'flv' if not grep { $_ eq $opts{format} } ('flv','wmv');
        $xurl .= ".$fmt";
    }
    else {
        $errmsg = "failed to extract ContentFilePath" if ! $fpath;
        $errmsg = "failed to extract FileName" if ! $fname and ! $errmsg;
    }

    $errmsg = "failed to extract title" if ! $title and ! $errmsg;
    $errmsg = "failed to extract id" if ! $id and ! $errmsg;
    print STDERR "\nerror: " . $errmsg . "\n" if $errmsg;

    return ($xurl, $id, $title);
}
# Metacafe page handler.
#
# Takes a reference to the page text and joins videoCDNURL and the
# ItemFiles value (backslashes removed from both) into the extraction URL.
# Prints the first extraction failure. Returns ($xurl, $id, $title).
sub handle_metacafe {
    my $response_ref = shift;

    my %re = (
        GrabTitle     => qr|"title":"(.*?)"|,
        GrabID        => qr|"itemID":"(.*?)"|,
        GrabItemFiles => qr|ItemFiles(.*?)"|,
        GrabVideoCDN  => qr|"videoCDNURL":"(.*?)"|
    );

    # List-context matches instead of "my $x = $1 if ...": a conditional
    # "my" has undefined behaviour when the condition is false.
    my ($title)     = $$response_ref =~ /$re{GrabTitle}/;
    my ($id)        = $$response_ref =~ /$re{GrabID}/;
    my ($itemfiles) = $$response_ref =~ /$re{GrabItemFiles}/;
    my ($videocdn)  = $$response_ref =~ /$re{GrabVideoCDN}/;

    my ($xurl, $errmsg);
    if ( $itemfiles and $videocdn ) {
        $itemfiles =~ tr{\\}//d;
        $videocdn  =~ tr{\\}//d;
        $xurl = $videocdn.$itemfiles;
    }
    else {
        $errmsg = "failed to extract ItemFiles" if ! $itemfiles;
        $errmsg = "failed to extract videoCDNURL" if ! $videocdn and ! $errmsg;
    }

    $errmsg = "failed to extract title" if ! $title and ! $errmsg;
    $errmsg = "failed to extract itemID" if ! $id and ! $errmsg;
    print STDERR "\nerror: " . $errmsg . "\n" if $errmsg;

    return ($xurl, $id, $title);
}
662 # Subroutines: LittleHelpers
# Continue in the background: fork, let the parent report the child's pid
# and exit, then redirect the child's STDOUT and STDERR to the log file
# (--append, --output or "clive-log" in the startup directory; /dev/null
# with --quiet).
sub daemonize {
    $logfile = $opts{append}
        || $opts{output}
        || File::Spec->catfile( $workdir, "clive-log" );

    my $pid = fork;

    # fork returns undef on failure, never a negative number; the old
    # "$pid < 0" test could not fire and a failed fork fell through to the
    # child code path inside the parent.
    if ( ! defined $pid ) {
        print STDERR "\nfork failed: $!";
        exit 1;
    }
    elsif ( $pid != 0 ) {
        print "Continuing in background, pid $pid.\n";
        print "Output will be written to $logfile.\n" unless $opts{quiet};
        exit 0;
    }

    chdir $workdir;

    my $mode = $opts{append} ? ">>" : ">";
    $logfile = "/dev/null" if $opts{quiet};

    open STDOUT, $mode, $logfile or die "cannot redirect STDOUT: $!";
    open STDERR, ">&STDOUT" or die "cannot dup STDOUT: $!";
}
# Fetch the sevenload player configuration XML from $conf_url and pull the
# item title, id and video URL out of it.
#
# $response_fh is the page handle; the curl write target is pointed back at
# it after the XML has been fetched. Returns ($xurl, $id, $title); values
# that could not be extracted are undef and an error is printed.
sub fetch_sevenload_configxml {
    my ($conf_url, $response_fh) = @_;
    print "done.\n=> Fetching config XML..." unless $opts{quiet};

    my $conf_xml = "";
    open my $conf_fh, ">", \$conf_xml;

    $curl->setopt(CURLOPT_URL, $conf_url);
    $curl->setopt(CURLOPT_HEADER, 0);
    $curl->setopt(CURLOPT_WRITEDATA, $conf_fh);

    my $rc = $curl->perform;

    # Reset
    $curl->setopt(CURLOPT_HEADER, 1);
    $curl->setopt(CURLOPT_WRITEDATA, $response_fh);

    close $conf_fh;

    my ($id, $xurl, $title, $errmsg);

    if ( $rc == 0 ) {
        # XMLin dies on malformed XML and, under strict refs, so does
        # walking a document of unexpected shape. Trap both so that one bad
        # page cannot abort the rest of the queue.
        my $parsed_ok = eval {
            my $item = XMLin($conf_xml)->{playlists}{playlist}{items}{item};
            $title = $item->{title};
            $id    = $item->{id};
            $xurl  = $item->{videos}{video}{url};
            1;
        };
        $errmsg = "failed to parse config XML" unless $parsed_ok;
    }
    else {
        print STDERR "\nerror: " . $curl->strerror($rc) . " (http/$rc)\n";
    }

    $errmsg = "failed to extract item title" if ! $title and ! $errmsg;
    $errmsg = "failed to extract item id" if ! $id and ! $errmsg;
    print STDERR "\nerror: " . $errmsg . "\n" if $errmsg;

    return ($xurl, $id, $title);
}
# Turn a page title into a file name using the $opts{fnfmt} template
# (default "%t-(%i)-[%d].%s").
#
# Only characters matching $opts{cclass} (default \w) are kept from the
# title. Template fields: %t title, %s file suffix, %d host domain without
# the TLD, %i video id, %D date, %T time, %S date and time.
sub title_to_filename {
    my $title = shift;

    $title =~ s/youtube - //i; # Remove host specific strings from title
    $title =~ s/ video//i;     # Breakcom

    my $r = $opts{cclass} || qr|\w|;
    $title = join '', $title =~ /$r/g;

    # Match the URL explicitly. The pattern used to be applied to the
    # implicit $_, which held the URL only because a caller further up the
    # stack happened to be looping over the queue with $_.
    my $page_url = defined $entry{page_url} ? $entry{page_url} : '';

    # Courtesy of:
    # http://search.cpan.org/~gaas/URI-1.37/URI.pm#PARSING_URIs_WITH_REGEXP
    my (undef, $authority) = $page_url =~
        m{(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?};

    # Extract the domain from the URL.
    my @labels = split /\./, ( defined $authority ? $authority : '' );

    my $fn = $opts{fnfmt} || "%t-(%i)-[%d].%s";
    my $timestamp = strftime("%F %T",localtime);
    my ($date, $time) = split / /, $timestamp;

    my %h = (
        "%t" => $title,
        "%s" => $entry{file_suffix},
        "%d" => $labels[ scalar(@labels) - 2 ], # Without the TLD.
        "%i" => $entry{video_id},
        "%D" => $date,
        "%T" => $time,
        "%S" => $timestamp,
    );

    my $m = join '|', keys %h;
    $fn =~ s/($m)/$h{$1}/ig;

    return $fn;
}
# Return the first path of the form "$path/$orig.N" (N = 1, 2, ...) that
# does not exist yet.
sub newname_if_exists {
    my ($path, $orig) = @_;

    my $suffix    = 1;
    my $candidate = File::Spec->catfile( $path, "$orig.$suffix" );

    while ( -e $candidate ) {
        $suffix++;
        $candidate = File::Spec->catfile( $path, "$orig.$suffix" );
    }

    return $candidate;
}
# curl progress callback: prints one status line (file name, percentage,
# transfer rate, ETA) over the previous one using backspaces.
#
# Arguments are the values curl passes to a progress function; only
# $dltotal and $dlnow are used. Always returns 0.
sub progress_callback {
    return 0 unless $opts{progress};

    my ($clientp, $dltotal, $dlnow, $ultotal, $ulnow) = @_;

    # $dltotal can be 0 (total size not known); never divide by it.
    my $percent = 0;
    $percent = int ( $dlnow / $dltotal * 100 ) if $dlnow and $dltotal > 0;

    my $elapsed = time - $time_started;
    return 0 if $elapsed < 1.0;

    my $rate = $dlnow / $elapsed;

    my $eta = "--:--:--";
    if ( $rate > 0 and $dltotal > 0 ) {
        my $left = ( $dltotal - $dlnow ) / $rate;
        my $ss = $left % 60;
        my $mm = int( ( $left % 3600 ) / 60 );
        my $hh = int( $left / 3600 );
        if ( $hh > 99 ) { $eta = sprintf "%2dh%02dm", $hh, $mm; }
        else { $eta = sprintf "%2dh%02dm%02ds", $hh, $mm, $ss; }

        # Redraw only when the displayed ETA changes. $last_eta was compared
        # but never assigned, so this check could not take effect.
        return 0 if $eta eq $last_eta;
        $last_eta = $eta;
    }

    my $s = sprintf "%.50s%4.4g%%%8.1fKB/s%12s",
        $curr_fn, $percent, $rate/1024, $eta;

    print "\b" x $last_bspaces . $s unless $opts{quiet};
    $last_bspaces = length(encode_utf8($s));

    return 0;
}
# Create the configuration directory if needed and tie %cache to the
# BerkeleyDB hash file $CACHEFILE (created on first use). Dies when the
# database cannot be opened.
sub init_cache {
    mkpath( [$CONFIGDIR], 1, 0700 );

    $cache_db = tie(
        %cache, "BerkeleyDB::Hash",
        -Filename => $CACHEFILE,
        -Flags    => DB_CREATE
    );

    die "error: cannot open $CACHEFILE: $! $BerkeleyDB::Error\n"
        unless $cache_db;
}
# Expand a --show template for one cache record.
#
# Arguments: the template string and the cache key of the record.
# Template fields: %t title, %i video id, %l length in bytes, %m length in
# MB, %u page URL, %x extraction URL, %D date, %T time, %S date and time.
# Returns the expanded string.
sub format_show {
    my $s = shift;
    my %e = map_entry(shift);

    # Split the timestamp into scalars first: a list slice of an empty list
    # yields no elements at all, which would shift every following
    # key/value pair in the hash constructor below.
    my $stamp = defined $e{time_stamp} ? $e{time_stamp} : '';
    my ($date, $time) = split / /, $stamp;

    # Treat a missing length as 0 for the MB computation.
    my $length = $e{file_length} || 0;

    my %h = (
        "%t" => $e{page_title},
        "%i" => $e{video_id},
        "%l" => $e{file_length},
        "%m" => sprintf("%.2f", $length/1048576),
        "%u" => $e{page_url},
        "%x" => $e{xurl},
        "%D" => $date,
        "%T" => $time,
        "%S" => $e{time_stamp},
    );

    my $m = join '|', keys %h;
    $s =~ s/($m)/$h{$1}/ig;

    return $s;
}
# Print the cache (or, with --grep, only the matching records) through the
# pager, formatted with $opts{showfmt}. With --grep and --delete the user is
# asked to confirm before the matched records are deleted. Always exits.
sub show_cache {
    IO::Pager->new(*STDOUT);

    my $fmt = $opts{showfmt} || $default_showfmt;
    my @entries = ();

    if ( $opts{grep} ) {
        grep_cache(); # Stores matches => @queue
        push @entries, format_show( $fmt, sha1_hex($_) )
            foreach ( @queue );
    }
    else {
        push @entries, format_show( $fmt, $_ )
            foreach ( sort keys %cache );
    }

    print STDOUT "$_\n" foreach sort @entries;
    close STDOUT;

    if ( $opts{grep} and $opts{delete} and scalar @queue > 0 ) {
        # STDOUT was closed above, so the prompt goes to STDERR; printed to
        # STDOUT it never reached the user.
        print STDERR "Confirm delete (y/N):";

        my $answer = <STDIN>;
        $answer = "" unless defined $answer;    # EOF counts as "no"
        chomp $answer;

        if ( lc $answer eq "y" ) {
            delete $cache{sha1_hex($_)} foreach ( @queue );
        }
    }

    exit;
}
# Delete the cache file, if there is one, and exit.
sub clear_cache {
    if ( -e $CACHEFILE ) {
        unlink $CACHEFILE;
    }
    exit;
}
# Release the cache: drop the database object first, then untie %cache.
sub free_cache {
    $cache_db = undef;
    untie %cache;
}
# Decode the cache record stored under $key into a key/value list.
#
# A record is a ';'-joined string of values. Returns a flat list suitable
# for assigning to a hash; fields missing from the record map to undef.
sub map_entry {
    my $key = shift;
    my @values = split /;/, $cache{$key};

    # Order matters: save_entry writes the values in sorted-key order, so
    # the names must be listed in that same order. The previous list had
    # file_suffix and file_format swapped, so each was read back as the
    # other. Index 4 (page_url) is also relied upon by grep_cache.
    my @keys = qw(
        file_format file_length file_suffix page_title
        page_url time_stamp video_id xurl
    );

    my $i = 0;
    return map { $_ => $values[$i++] } @keys;
}
# Load the cache record for the current $hash into %entry and decode the
# stored page title from UTF-8.
sub fetch_entry {
    %entry = map_entry($hash);

    my $raw_title = $entry{page_title};
    $entry{page_title} = decode_utf8($raw_title);
}
# Stamp %entry with the current local time and store it in the cache under
# the current $hash as a ';'-joined string of its values, ordered by sorted
# key name. The database is synced afterwards.
sub save_entry {
    $entry{time_stamp} = strftime("%F %T",localtime);

    my @fields = map { $entry{$_} } sort keys %entry;

    $cache{$hash} = join ';', @fields;
    $cache_db->db_sync();
}
# Match $opts{grep} (case-insensitively unless $opts{case}) against every
# field of every cache record.
#
# With --delete and without --show the matching records are deleted and the
# program exits; otherwise the page URL of each match is pushed onto @queue.
sub grep_cache {
    my $pattern = $opts{case} ? qr|$opts{grep}| : qr|$opts{grep}|i;
    my $purge   = ( $opts{delete} and not $opts{show} );

    foreach my $key ( sort keys %cache ) {
        my @fields = split /;/, $cache{$key};
        next unless grep { /$pattern/ } @fields;

        if ( $purge ) { delete $cache{$key}; }
        else { push @queue, $fields[4]; } # 4=URL
    }

    exit if $purge;
}
# Print the program version, the Perl version and OS name, the versions of
# the modules in use ("-" for Clipboard when that optional module is not
# available) and the license notice, then exit.
916 sub print_version {
917 my $perl_v = sprintf "%vd", $^V;
918 my $clipboard_v = $optional_mods{Clipboard} ? $Clipboard::VERSION : "-";
919 print
920 "clive version $VERSION. Copyright (C) 2007,2008 Toni Gundogdu.
922 Perl: $perl_v ($^O)
923 Modules:
924 * Config::Tiny/$Config::Tiny::VERSION\t\t* BerkeleyDB/$BerkeleyDB::VERSION
925 * WWW::Curl/$WWW::Curl::VERSION\t\t* URI::Escape/$URI::Escape::VERSION
926 * HTML::TokeParser/$HTML::TokeParser::VERSION\t* Clipboard/$clipboard_v
927 * IO::Pager/$IO::Pager::VERSION
928 Core modules:
929 * POSIX/$POSIX::VERSION\t\t\t* Cwd/$Cwd::VERSION
930 * Getopt::Long/$Getopt::Long::VERSION\t\t* Pod::Usage/$Pod::Usage::VERSION
931 * File::Path/$File::Path::VERSION\t\t* File::Spec/$File::Spec::VERSION
932 * Digest::SHA/$Digest::SHA::VERSION\t\t* Encode/$Encode::VERSION
934 See --manual for a list of the supported websites.
936 This program comes with ABSOLUTELY NO WARRANTY. You may redistribute copies of
937 clive under the terms of the GNU General Public License as published by the
938 Free Software Foundation, either version 3 of the License, or (at your option)
939 any later version. You should have received a copy of the General Public License
940 along with this program. If not, see http://www.gnu.org/licenses/.
941 "; exit;
945 __END__
947 =head1 NAME
949 clive - the non-interactive video extraction utility
951 =head1 SYNOPSIS
953 clive [option]... [URL]...
955 =head1 DESCRIPTION
957 clive is an open source command-line utility for extracting videos from Youtube
958 and other video sharing websites. It was originally written to bypass the
959 Adobe Flash requirement needed to view the hosted videos.
961 clive is non-interactive meaning it can work in the background while the user
962 is not logged on. This allows the user to start an extraction and disconnect
963 from the system letting clive finish the work. By contrast, most of the
964 extraction websites and UNIX scripts require the user's constant presence,
965 which can be a great hindrance when transferring a lot of data.
967 Users familiar with the GNU L<wget(1)> utility will notice that clive borrows
968 some of the feature concepts from wget, as well as, with some changes,
969 the above paragraph, option syntax description below and the output option
970 descriptions. Kudos to the wget team for their original work.
972 =head1 OPTIONS
974 =over 4
976 =item B<Option Syntax>
978 Every option has a long form along with the short one. Long options are more
979 convenient to remember but take time to type. You may freely mix different
980 option styles, or specify options after the command-line arguments. For example:
982 clive -c --format=mp4 URL -n
984 You may also put several options together that do not require arguments.
985 For example:
987 clive -xcn URL
989 Which is equivalent to:
991 clive -x -c -n URL
993 =back
995 B<Basic Options>
997 =over 4
999 =item B<-h --help>
1001 Print this help and exit.
1003 =item B<-m --manual>
1005 Display the manual page and exit.
1007 =item B<-v --version>
1009 Display version details and exit.
1011 =item B<-b --background>
1013 Go to background immediately after startup. If no output file is specified
1014 using the B<--output> or B<--append>, the output is redirected to I<clive-log>
1015 file.
1017 =back
1019 B<Cache Options>
1021 =over 4
1023 =item B<-R --renew>
1025 Renew the cache entries for the input URLs. See L</CACHE> in the manual page.
1027 =item B<-s --show>
1029 Print cache entries to standard output. See L</CONFIG> in the manual page
1030 for how to configure the output.
1032 =item B<-g --grep=>I<pattern>
1034 Grep cache entries for I<pattern>. All cache entry values are included in
1035 the search.
1037 % clive --grep=git
1038 % clive --grep=^git --ignore-case
1040 The matched entries are then extracted. To only show the matches, use the
1041 B<--show> option. For example:
1043 % clive -sg ^git
1045 =item B<-i --ignore-case>
1047 When used with B<--grep>, causes clive to ignore case when matching
1048 the pattern.
1050 =item B<-D --delete>
1052 When used with B<--grep>, deletes the matched entries from cache. If used
1053 together with the B<--show> option, causes clive to prompt to confirm delete.
1054 For example:
1056 % clive -siDg ^git
1058 =item B<-C --clear>
1060 Clear cache and exit.
1062 =back
1064 B<Logging and Input Options>
1066 =over 4
1068 =item B<-o --output=>I<logfile>
1070 Log all messages to I<logfile>. The messages are normally reported to
1071 stdout and stderr.
1073 =item B<-a --append=>I<logfile>
1075 Append to I<logfile>. This is the same as B<--output> but it appends to
1076 I<logfile> instead of overwriting it. If the I<logfile> does not exist,
1077 the file is created.
1079 =item B<-d --debug>
1081 Causes the program to print debug messages.
1083 =item B<-G --noprogress>
1085 Turn off progress meter.
1087 =item B<-q --quiet>
1089 Turn off all output.
1091 =item B<-r --recall>
1093 Recall the last URL batch. If this option is used, no URLs need to be present
1094 on the command-line.
1096 =item B<-x --paste>
1098 Paste input from clipboard. If this option is used, no URLs need to be present
1099 on the command-line. The pasted URLs are expected to be separated with newlines.
1101 =back
1103 B<Download Options>
1105 =over 4
1107 =item B<-c --continue>
1109 Continue extraction of a partially downloaded file. Note that this works only
1110 with HTTP servers that support the "Range" header. Ignored unless I<localfile>
1111 E<lt> I<remotefile>.
1113 The "requested range was not delivered" error typically implies that the
1114 host does not allow continuing partially extracted video files. You will
1115 see this error if you attempt to continue a partially downloaded flv video
1116 from Youtube, for example.
1118 =item B<-X --noproxy>
1120 Do not use the proxy defined in the config or the http_proxy environment
1121 variable.
1123 =item B<-L --nologin>
1125 Do not log in. Ignored unless [youtube]:user and [youtube]:pass are used
1126 in the config file.
1128 =item B<-n --noextract>
1130 Do not actually extract any videos.
1132 =item B<-f --format=>I<format>
1134 Extract I<format> of the video. See L</FORMATS> in the manual page.
1136 =item B<-P --noplay>
1138 Disable subsequent play. Ignored unless [commands]:play is used in the
1139 configuration file.
1141 =back
1143 =head1 EXAMPLES
1145 =over 4
1147 =item clive "http://youtube.com/watch?v=3HD220e0bx4"
1149 Extracts the video from the specified URL.
1151 =item cat url.lst | clive
1153 Reads input from UNIX pipe. Separate each URL with a newline.
1155 =item clive -x URL URL
1157 Combines input from the command-line and the clipboard (each URL separated
1158 with a newline).
1160 =item clive -rf mp4
1162 Recalls the last URL batch and extracts the mp4 format.
1164 =item clive -g 3HD220e0bx4
1166 Greps the pattern from the cache and extracts the matched videos.
1168 =item clive -iDg ^3hd2
1170 Same as above but I<deletes> the matched entries from the cache instead of
1171 extracting them.
1173 =item clive -s
1175 Dumps the contents of the cache to stdout.
1177 =item clive -sig ^3hd2
1179 Instead of displaying all of the cache entries, show only the matching ones.
1181 =item clive -big ^3hd2 -o my.log
1183 Goes to background immediately after startup, redirects output to I<my.log>
1184 file, greps for the pattern and extracts the video.
1186 =item clive -bqig ^3hd2
1188 Same as above but turns off all output. See also the B<--noprogress> option.
1190 =back
1192 =head1 FORMATS
1194 clive defaults to extract the flv format unless the B<--format> option is
1195 used. The requested format may not always be available and in such a case
1196 the server usually returns the HTTP/404 or the HTTP/403 error.
1198 The quality of the video depends on the uploaded video quality. Each
1199 website typically recompresses the uploaded videos to 320x240 resolution
1200 (sometimes higher). As this varies per video and website, you should not
1201 read too much into the video quality information listed below.
1203 =over 4
1205 =item B<www.youtube.com>
1207 Formats: flv | mp4 | 3gpp | xflv
1209 The flv format is usually available unless the video has been removed or
1210 set private. The mp4 and 3gpp formats are often, or will become, available.
1211 The xflv on the other hand appears to be very rarely available.
1213 Videos dating back to 2006 are usually available as flv only. The B<--continue>
1214 option should work with all formats except flv.
1216 =back
1218 =over 4
1220 =item B<video.google.com>
1222 Formats: flv | mp4
1224 The mp4 may not always be available.
1226 The B<--continue> option does not work with the flv format. Streaming seems
1227 impossible with the mp4. For comparison, this is possible with Youtube's
1228 mp4 videos which are compressed using a different mp4 codec.
1230 =back
1232 =over 4
1234 =item B<www.sevenload.com>
1236 Formats: flv
1238 The B<--continue> option works.
1240 =back
1242 =over 4
1244 =item B<www.break.com>
1246 Formats: flv | wmv
1248 The B<--continue> option works.
1250 =back
1252 =over 4
1254 =item B<www.metacafe.com>
1256 Formats: flv
1258 The B<--continue> option works.
1260 =back
1262 =head1 CACHE
1264 The cache has two purposes:
1266 =over 4
1268 =item 1.
1270 Gather reusable info for a fast re-extraction without having to fetch the
1271 same data again.
1273 =item 2.
1275 Keep a record of videos. The B<--grep> option can then later be used to
1276 extract the videos.
1278 =back
1280 Each cache entry contains information about a video, including, but not limited
1281 to, page title, file length and extraction URL.
1283 Some entries may need to be renewed from time to time as some websites have
1284 their extraction URLs expire after a while. Youtube is an example of this.
1285 Youtube servers usually return the HTTP/410 error if the extraction URL has
1286 expired. You can use the B<--renew> option to fix this.
1288 Note that if you use a different B<--format> than previously, clive will renew
1289 the cache entry automatically. This is done for two reasons:
1291 =over 4
1293 =item 1.
1295 The cached extraction URL would point to a wrong file
1297 =item 2.
1299 The file length would be incorrect
1301 =back
1303 =head1 UNICODE
1305 As long as the terminal can handle unicode, so should clive. Details of enabling
1306 unicode in your terminal fall outside the scope of this manual page.
1307 If you are running X, switching to a unicode capable terminal
1308 (e.g. L<uxterm(1)>) may also provide some remedy to this.
1310 If you are using a user-defined character class in the config file,
1311 make sure it is not blocking any unicode characters. See L</CONFIG>.
1313 =head1 FILES
1315 =over 4
1317 =item ~/.config/clive/config
1319 Configuration file for clive. See L</CONFIG>.
1321 =item ~/.config/clive/cache
1323 Contains the cache entries of the visited URLs. A Berkeley DB (Hash) file.
1325 =item ~/.config/clive/recall
1327 Contains the last URL batch. Can be recalled with the B<--recall> option.
1329 =back
1331 =head1 CONFIG
1333 ## Example config file for clive.
1334 ## Recommended: chmod 0600 ~/.config/clive/config
1336 [http]
1337 ## HTTP User-agent string (default: Mozilla/5.0).
1338 agent = Furball/0.2
1340 ## HTTP proxy.
1341 proxy = http://foo:1234
1343 [output]
1344 ## Save videos to directory (default: cwd).
1345 savedir = /home/user/videos
1347 ## Character class used to filter out garbage characters from
1348 ## video filenames (default: \w).
1349 cclass = [A-Za-z0-9]
1350 #cclass = .
1352 ## Extracted video filename format (default: %t-(%i)-[%d].%s).
1353 ## %t = video name after applying the character class regex
1354 ## %s = video file suffix (e.g. flv)
1355 ## %d = video domain
1356 ## %i = video id
1357 ## %D = current date
1358 ## %T = current time
1359 ## %S = timestamp (same as: %D %T)
1360 file = %t.%s
1362 ## Format for --show (default: %D: "%t" | %mMB)
1363 ## %t = video page title
1364 ## %i = video id
1365 ## %l = video file length (bytes)
1366 ## %u = video page url
1367 ## %x = video extraction url
1368 ## %D = video extraction date
1369 ## %T = video extraction time
1370 ## %S = video extraction timestamp (same as: %D %T)
1371 show = %t (id: %i | bytes: %l)
1373 [youtube]
1374 ## Username and password for Youtube. OPTIONAL unless you
1375 ## plan on extracting flagged content.
1376 user = myusername
1377 pass = mypassword
1379 [commands]
1380 ## Path to a player command. If used, clive will play the
1381 ## extracted videos subsequently. Be sure to use the %i
1382 ## specifier for input file.
1383 play = /usr/local/bin/xine -f %i
1385 =head1 SEE ALSO
1387 =over 4
1389 =item Website:
1391 http://clive.sf.net/
1393 =item Project:
1395 http://googlecode.com/p/clive/
1397 =item Issue Tracker:
1399 http://googlecode.com/p/clive/issues/
1401 =item Announcements:
1403 http://googlegroups.com/group/clive-announce/
1405 =back
1407 =head1 OTHER
1409 A clive development repository can be obtained from:
1411 git clone git://repo.or.cz/clive.git
1413 Patches welcome.
1415 =head1 AUTHOR
1417 Written by Toni Gundogdu <legatvs@gmail.com>.
1419 =cut