2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 binmode STDOUT
, ":utf8";
25 binmode STDERR
, ":utf8";
27 use version
0.77 (); our $VERSION = version
->declare("0.0.5");
29 use Getopt
::ArgvFile
( home
=> 1, startupFilename
=> [qw(.gcaprc)] );
30 use Getopt
::Long
qw(:config bundling);
46 'version' => \
&print_version
,
47 'license' => \
&print_license
,
48 'help' => \
&print_help
,
51 $config{regexp
} ||= "/(\\w|\\s)/g";
52 apply_regexp
( $config{regexp
} ); # Check syntax.
56 print "gcap version $VERSION\n";
62 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
63 see the source for copying conditions. There is NO warranty; not even
64 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
71 Pod
::Usage
::pod2usage
( -exitstatus
=> 0, -verbose
=> 1 );
80 print_help
() unless scalar @ARGV;
83 "http://video.google.com/timedtext?hl=en&type=list&v=";
85 my $q = qr{v[=/]((?>[-_\w]{11}))};
87 if ( $url =~ /^http:/i ) {
93 "error: does not look like a youtube video page URL.\n";
98 $url = "$req_body$url";
101 print STDERR
"Checking ..." unless $config{quiet
};
105 my $a = new LWP
::UserAgent
;
107 $a->env_proxy; # http://search.cpan.org/perldoc?LWP::UserAgent
109 $a->proxy( 'http', $config{proxy
} ) if $config{proxy
};
110 $a->no_proxy('') if $config{no_proxy
};
114 my $p = new XML
::DOM
::Parser
( LWP_UserAgent
=> $a );
115 my $d = $p->parsefile($url);
116 my $r = $d->getDocumentElement;
118 for my $e ( $r->getElementsByTagName("track") ) {
120 name
=> $e->getAttributeNode("name")->getValue || "",
121 lang_code
=> $e->getAttributeNode("lang_code")->getValue,
123 $e->getAttributeNode("lang_translated")->getValue,
126 push @captions, \
%tmp;
127 print STDERR
"." unless $config{quiet
};
130 print STDERR
"done.\n" unless $config{quiet
};
134 my $v = $1 if $url =~ /$q/ or die "error: no match: video id";
136 get_title
( $v, $a ) if $config{title
};
137 prompt
() if $config{interactive
};
141 foreach (@captions) {
142 ++$t if $_->{selected
};
147 foreach (@captions) {
149 next unless $_->{selected
};
152 "http://video.google.com/timedtext?"
153 . "hl=$_->{lang_code}"
154 . "&lang=$_->{lang_code}"
155 . "&name=$_->{name}" . "&v=$v";
157 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code
};
161 apply_regexp
( $config{regexp
}, $video_title );
162 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code
};
165 open my $fh, ">", $fname or die "$fname: $!\n";
166 binmode $fh, ":utf8";
168 unless ( $config{quiet
} ) {
169 printf STDERR
"(%02d of %02d) ", ++$n, $t if $t > 0;
170 print STDERR
"Saving $fname ...";
173 $d = $p->parsefile($url);
174 $r = $d->getDocumentElement;
179 for my $e ( $r->getElementsByTagName("text") ) {
181 my $tmp = $e->getFirstChild;
184 my $text = trim
( $tmp->getNodeValue );
187 my $start = $e->getAttributeNode("start")->getValue;
192 if ( $start =~ /(\d+)/ ) {
196 /\d+\.(\d+)/; # should only capture 3 first digits
199 my @start = gmtime($start_sec);
201 $tmp = $e->getAttributeNode("dur");
202 my $dur = $tmp ?
$tmp->getValue : $start - $last_start;
204 my $end_sec = $start + $dur;
206 $dur =~ /\d+\.(\d+)/; # should only capture 3 first digits
207 my $end_msec = $1 || 0;
209 my @end = gmtime($end_sec);
212 "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
213 $i++, @start[ 2, 1, 0 ], $start_msec, @end[ 2, 1, 0 ],
216 $last_start = $start;
223 print STDERR
"done.\n" unless $config{quiet
};
238 'n' => \
&select_none
,
239 'i' => \
&invert_selection
,
243 print STDERR
"Enter prompt. "
244 . qq/Type "help" to get a list of commands.\n/;
254 if ( $ln =~ /(\d+)/ ) {
258 next unless $ln =~ /(\w)/;
259 $cmds{$1}() if defined $cmds{$1};
267 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
268 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
270 my $r = $a->get($url);
272 unless ( $r->is_success ) {
273 printf STDERR
"\nerror: $url: %s\n", $r->status_line;
279 my $q = CGI
->new( $r->content );
281 if ( $q->param('reason') ) {
282 printf STDERR
"\nerror: %s: %s (errorcode: %d)\n",
283 $url, $q->param("reason"), $q->param("errorcode");
287 $video_title = Encode
::decode_utf8
( $q->param('title') );
290 unless ($video_title) {
291 print STDERR
"\nwarning: $url: use id instead\n"
292 unless $config{quiet
};
301 if ( $re =~ /^\/(.*)\
/(.*)$/ ) {
307 "error: invalid regexp syntax, expected `/pattern/flags'\n";
313 my $q = $flags =~ /i/ ?
qr/$pat/i : qr/$pat/;
315 return join '', $flags =~ /g/ ?
$s =~ /$q/g : $s =~ /$q/;
319 print STDERR
"Commands:
321 list .. display found captions (> indicates selected for download)
324 invert .. invert selection
325 (number) .. toggle caption
326 get .. download selected captions
327 quit .. quit without downloading captions\n"
328 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
332 foreach (@captions) {
333 if ( $_->{selected
} ) {
338 print STDERR
"error: you have not selected anything\n";
345 foreach (@captions) {
346 printf STDERR
"%2s%02d: $_->{lang_transl}\n",
347 $_->{selected
} ?
">" : "", ++$i;
352 $_->{selected
} = 1 foreach @captions;
357 $_->{selected
} = 0 foreach @captions;
361 sub invert_selection
{
362 $_->{selected
} = !$_->{selected
} foreach @captions;
368 if ( $i >= 0 && exists $captions[$i] ) {
369 $captions[$i]->{selected
} = !$captions[$i]->{selected
};
373 print STDERR
"error: out of rate\n";
388 gcap - Youtube closed caption retriever
392 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
393 [--proxy E<lt>addrE<gt> | --no-proxy]
397 gcap is a command line tool for retrieving Youtube closed captions.
398 The retrieved closed captions are saved in SubRip (srt) file format.
399 The srt files are saved as "$videoid_$langid.srt" by default.
403 --help print help and exit
404 --version print version and exit
405 --license print license and exit
407 -i, --interactive run in interactive mode
408 -t, --title parse video title and use it in filename
409 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
410 --proxy arg (=http_env) use proxy for http connections
411 --no-proxy disable use of http proxy
413 =head1 OPTION DESCRIPTIONS
423 Print version and exit.
427 Print license and exit.
433 =item B<-i, --interactive>
435 Enable interactive prompt which can be used to select the downloaded
436 closed captions. By default gcap downloads all available captions
441 Parse video title and use it in the output filename(s) instead of
442 video ID. The default is no.
444 =item B<-r, --regexp>=arg
446 Cleanup video title using the specified I<arg> regular expression.
447 The default is "/(\w|\s)/g".
449 =item B<--proxy> I<arg>
451 Use I<arg> for HTTP proxy, e.g. "http://foo:1234". Overrides the http_proxy
456 Disable use of HTTP proxy. Overrides both C<--proxy> and http_proxy environment
465 =item B<gcap 0QRO3gKj3qw>
467 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
469 Typical use. Both achieve the same.
475 Exits 0 on success, otherwise 1.
481 =item $HOME/.gcaprc, for example:
483 echo "--interactive" >> ~/.gcaprc
491 =item B<Availability>
493 Not all Youtube videos have closed captions. The following message
494 indicates that the video does not have any closed captions available.
495 URL omitted for brevity.
497 Couldn't parsefile [...] with LWP: no element found at line 1,
498 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
502 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
503 the data. Note that LWP::UserAgent reads http_proxy environment
506 env http_proxy=http://foo:1234 gcap video_id
510 <http://gcap.googlecode.com/>
512 =item B<Development repository>
514 <git://repo.or.cz/gcap.git>
516 e.g. git clone git://repo.or.cz/gcap.git
522 Toni Gundogdu <legatvs gmail com>