Bump version to 0.0.6
[gcap.git] / bin / gcap
blob12e6ead2cc4f2c14c9207a9888a991ad3270ad86
1 #!/usr/bin/perl
2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 use warnings;
22 use strict;
24 binmode STDOUT, ":utf8";
25 binmode STDERR, ":utf8";
27 use version 0.77 (); our $VERSION = version->declare("0.0.6");
29 use Getopt::ArgvFile( home => 1, startupFilename => [qw(.gcaprc)] );
30 use Getopt::Long qw(:config bundling);
32 my %config;
33 my $video_title;
35 exit main();
# Parse the command line into %config and validate the --regexp value.
#
# Exits with status 1 when option parsing fails. The --version, --license
# and --help options are dispatched to their print_* handlers.
sub init {
    my @option_spec = (
        'interactive|i',
        'title|t',
        'regexp|r=s',
        'proxy=s',
        'no_proxy|no-proxy',
        'quiet|q',
        'version' => \&print_version,
        'license' => \&print_license,
        'help'    => \&print_help,
    );

    unless ( GetOptions( \%config, @option_spec ) ) {
        exit 1;
    }

    # Fall back to the default title cleanup expression.
    $config{regexp} = "/(\\w|\\s)/g" unless $config{regexp};

    # Called without a subject string: only the expression itself is checked.
    apply_regexp( $config{regexp} );
}
# Handler for --version: print the program version to STDOUT and exit 0.
sub print_version {
    printf "gcap version %s\n", $VERSION;
    exit 0;
}
# Handler for --license: print the licence notice to STDOUT and exit 0.
sub print_license {
    print <<'END_LICENSE';
Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
see the source for copying conditions. There is NO warranty; not even
for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
END_LICENSE
    exit 0;
}
# Handler for --help (also used when no argument is given): render the
# usage text from this file's POD via Pod::Usage, requesting exit status 0.
sub print_help {
    require Pod::Usage;

    my %usage_options = (
        -exitstatus => 0,
        -verbose    => 1,
    );

    Pod::Usage::pod2usage(%usage_options);
}
# Caption tracks found for the video. Each element is a hash reference with
# the keys name, lang_code, lang_transl and selected.
my @captions;

# Program entry point.
#
# Takes the first command line argument (a video page URL or a bare video
# id), fetches the list of caption tracks, optionally looks up the video
# title (--title) and runs the interactive prompt (--interactive), then
# downloads every selected track and writes it as a SubRip (.srt) file in
# the current directory.
#
# Returns 0 on success. Exits 1 if a URL does not contain a video id; dies
# if the id cannot be determined or an output file cannot be written.
sub main {
    init();

    print_help() unless @ARGV;    # print_help() does not return

    my $list_url_base =
      "http://video.google.com/timedtext?hl=en&type=list&v=";
    my $id_re = qr{v[=/]((?>[-_\w]{11}))};

    my $url = $ARGV[0];

    # Accept both http and https page URLs; anything else is taken to be
    # a bare video id.
    if ( $url =~ /^https?:/i ) {
        if ( $url =~ /$id_re/ ) {
            $url = "$list_url_base$1";
        }
        else {
            print STDERR
              "error: does not look like a youtube video page URL.\n";
            exit 1;
        }
    }
    else {
        $url = "$list_url_base$url";
    }

    print STDERR "Checking ..." unless $config{quiet};

    require LWP;

    my $ua = LWP::UserAgent->new;

    # Pick up proxy settings from the environment first, then let the
    # command line options override them.
    $ua->env_proxy;    # http://search.cpan.org/perldoc?LWP::UserAgent
    $ua->proxy( 'http', $config{proxy} ) if $config{proxy};

    # NOTE(review): presumably the empty domain suffix matches every host
    # and so disables the proxy entirely -- confirm against LWP::UserAgent.
    $ua->no_proxy('') if $config{no_proxy};

    require XML::DOM;

    my $parser = XML::DOM::Parser->new( LWP_UserAgent => $ua );
    my $doc    = $parser->parsefile($url);
    my $root   = $doc->getDocumentElement;

    for my $track ( $root->getElementsByTagName("track") ) {
        my %caption = (
            name => $track->getAttributeNode("name")->getValue || "",
            lang_code => $track->getAttributeNode("lang_code")->getValue,
            lang_transl =>
              $track->getAttributeNode("lang_translated")->getValue,
            selected => 1,    # everything is selected by default
        );
        push @captions, \%caption;
        print STDERR "." unless $config{quiet};
    }

    print STDERR "done.\n" unless $config{quiet};

    $doc->dispose;

    # The list URL always carries the id as "v=...", whichever form the
    # argument had. A plain list assignment replaces the former
    # "my $v = $1 if ..." construct, whose behaviour is undefined.
    my ($video_id) = $url =~ /$id_re/;
    die "error: no match: video id" unless defined $video_id;

    get_title( $video_id, $ua ) if $config{title};
    prompt() if $config{interactive};

    my $total = grep { $_->{selected} } @captions;

    # Clean the title once, up front, so every output file gets the same
    # stem; fall back to the video id when nothing usable is left.
    my $file_stem = $video_id;
    if ($video_title) {
        my $cleaned = apply_regexp( $config{regexp}, $video_title );
        $file_stem = $cleaned if defined $cleaned && length $cleaned;
    }

    my $n = 0;

    for my $caption (@captions) {
        next unless $caption->{selected};

        my $caption_url =
            "http://video.google.com/timedtext?"
          . "hl=$caption->{lang_code}"
          . "&lang=$caption->{lang_code}"
          . "&name=$caption->{name}"
          . "&v=$video_id";

        my $fname = sprintf "%s_%s.srt", $file_stem, $caption->{lang_code};

        open my $fh, ">", $fname or die "$fname: $!\n";
        binmode $fh, ":utf8";

        unless ( $config{quiet} ) {
            printf STDERR "(%02d of %02d) ", ++$n, $total if $total > 0;
            print STDERR "Saving $fname ...";
        }

        my $caption_doc = $parser->parsefile($caption_url);
        _write_srt( $fh, $caption_doc->getDocumentElement );
        $caption_doc->dispose;

        # Buffered write errors only surface when the handle is closed.
        close $fh or die "$fname: $!\n";

        print STDERR "done.\n" unless $config{quiet};
    }

    return 0;
}

# Write every non-empty <text> element below $root to $fh as a numbered
# SubRip cue.
sub _write_srt {
    my ( $fh, $root ) = @_;

    require HTML::Entities;

    my $index      = 1;
    my $last_start = 0;

    for my $node ( $root->getElementsByTagName("text") ) {
        my $child = $node->getFirstChild;
        next unless $child;

        my $raw = $child->getNodeValue;
        next unless defined $raw;

        # Test the length rather than truth so that a caption consisting
        # of "0" is kept.
        my $text = trim($raw);
        next unless length $text;
        $text = HTML::Entities::decode_entities($text);

        my $start = _seconds( $node->getAttributeNode("start")->getValue );

        # A cue without a "dur" attribute gets the distance from the
        # previous cue's start as its duration.
        my $dur_attr = $node->getAttributeNode("dur");
        my $dur =
            $dur_attr
          ? _seconds( $dur_attr->getValue )
          : $start - $last_start;
        $dur = 0 if $dur < 0;

        printf {$fh} "%d\r\n%s --> %s\r\n%s\r\n\r\n",
          $index++, _srt_timestamp($start),
          _srt_timestamp( $start + $dur ), $text;

        $last_start = $start;
    }

    return;
}

# Convert an attribute value to a non-negative number of seconds; anything
# that is not a plain decimal number counts as 0.
sub _seconds {
    my ($value) = @_;

    return 0 unless defined $value;
    return 0 unless $value =~ /\A\s*(\d+(?:\.\d*)?|\.\d+)\s*\z/;
    return $1 + 0;
}

# Format a number of seconds as a SubRip timestamp, "HH:MM:SS,mmm". The
# value is rounded to whole milliseconds first, so the fraction is always
# exactly three digits and carries correctly into the seconds.
sub _srt_timestamp {
    my ($seconds) = @_;

    $seconds = 0 if $seconds < 0;

    my $total_ms = int( $seconds * 1000 + 0.5 );
    my $ms       = $total_ms % 1000;
    my $whole    = int( $total_ms / 1000 );

    return sprintf "%02d:%02d:%02d,%03d",
      int( $whole / 3600 ), int( $whole / 60 ) % 60, $whole % 60, $ms;
}
# Set to true by get() once at least one caption is selected; this ends
# the prompt loop.
my $done = 0;

# Interactive caption selection.
#
# Lists the captions that were found, then reads commands from STDIN until
# get() succeeds. A line containing a number toggles that caption;
# otherwise the first word character of the line selects a command, so
# "h" and "help" are equivalent. Unknown commands are ignored. End of
# input is treated like "quit": the program exits without downloading.
sub prompt {
    my %cmds = (
        'h' => \&help,
        'q' => \&quit,
        'l' => \&list,
        'a' => \&select_all,
        'n' => \&select_none,
        'i' => \&invert_selection,
        'g' => \&get,
    );

    # Reset explicitly: the script calls main() before execution reaches
    # the file-level initializer above, so that assignment cannot be
    # relied on.
    $done = 0;

    print STDERR "Enter prompt. "
      . qq/Type "help" to get a list of commands.\n/;
    list();

    my $prompt_text = "(gcap) ";

    while ( !$done ) {
        print STDERR $prompt_text;

        my $line = <STDIN>;

        # undef means end of input (Ctrl-D, closed pipe). Retrying would
        # reprint the prompt forever, so leave instead.
        unless ( defined $line ) {
            print STDERR "\n";
            quit();
        }

        chomp $line;

        if ( $line =~ /(\d+)/ ) {
            toggle_caption($1);
        }
        elsif ( $line =~ /(\w)/ ) {
            my $handler = $cmds{$1};
            $handler->() if defined $handler;
        }
    }

    return;
}
# Look up the title of video $v and store it in $video_title.
#
# Arguments:
#   $v  - video id
#   $ua - user agent object used for the HTTP GET
#
# Nothing useful is returned. Failures are reported on STDERR and leave
# $video_title unset; a warning then notes that the id is used instead
# (suppressed by --quiet).
sub get_title {
    my ( $v, $ua ) = @_;

    my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
      . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";

    my $response = $ua->get($url);

    unless ( $response->is_success ) {

        # The URL is passed as an argument so it is never read as part of
        # the format string.
        printf STDERR "\nerror: %s: %s\n", $url, $response->status_line;
        return;
    }

    require CGI;

    # The response body is parsed as a query string.
    my $query = CGI->new( $response->content );

    # Each param() call is assigned to a scalar so that it runs in scalar
    # context; in list context a repeated key would inject extra
    # arguments.
    my $reason = $query->param('reason');

    if ($reason) {
        my $errorcode = $query->param('errorcode');
        printf STDERR "\nerror: %s: %s (errorcode: %d)\n",
          $url, $reason, defined $errorcode ? $errorcode : 0;
    }
    else {
        require Encode;
        my $title = $query->param('title');
        $video_title = Encode::decode_utf8($title) if defined $title;
    }

    unless ($video_title) {
        print STDERR "\nwarning: $url: use id instead\n"
          unless $config{quiet};
    }

    return;
}
# Apply a "/pattern/flags" expression to a string.
#
# Arguments:
#   $re - expression in the form /pattern/flags; the flags "i" (ignore
#         case) and "g" (all matches) are recognised
#   $s  - string to match against (optional)
#
# Returns the matched captures joined into one string. With the "g" flag
# every match contributes, otherwise only the first. When $s is missing or
# empty nothing is returned; the expression is still validated, which is
# how the startup syntax check uses this function.
#
# Prints an error to STDERR and exits 1 if $re is not of the form
# /pattern/flags or if the pattern does not compile.
sub apply_regexp {
    my ( $re, $s ) = @_;

    # In list context a failed match yields the empty list, leaving $pat
    # undefined.
    my ( $pat, $flags ) = defined $re ? $re =~ /^\/(.*)\/(.*)$/ : ();

    unless ( defined $pat ) {
        print STDERR
          "error: invalid regexp syntax, expected `/pattern/flags'\n";
        exit 1;
    }

    # Compile before looking at $s, so a broken pattern is reported by the
    # startup check and not after the network requests have been made.
    my $q = eval { $flags =~ /i/ ? qr/$pat/i : qr/$pat/ };

    unless ( defined $q ) {
        my $reason = $@;
        $reason =~ s/\s+\z//;
        print STDERR "error: invalid regexp: $reason\n";
        exit 1;
    }

    return unless defined $s && length $s;

    return join '', $flags =~ /g/ ? $s =~ /$q/g : $s =~ /$q/;
}
# Prompt command "help": print the list of prompt commands to STDERR.
sub help {
    print STDERR <<'END_HELP';
Commands:
help .. this
list .. display found captions (> indicates selected for download)
all .. select all
none .. select none
invert .. invert selection
(number) .. toggle caption
get .. download selected captions
quit .. quit without downloading captions
Command name abbreviations are allowed, e.g. "h" instead of "help"
END_HELP
}
# Prompt command "get": leave the prompt so that downloading can start.
# Refuses, with an error on STDERR, while no caption is selected.
sub get {
    if ( grep { $_->{selected} } @captions ) {
        $done = 1;
        return;
    }

    print STDERR "error: you have not selected anything\n";
}
# Prompt command "quit": end the program with exit status 0.
sub quit {
    exit(0);
}
# Prompt command "list": print the numbered captions to STDERR, one per
# line. Selected captions are marked with ">".
sub list {
    my $i = 0;

    for my $caption (@captions) {

        # The language name comes from the downloaded track list, so it is
        # passed as an argument and never becomes part of the format.
        printf STDERR "%2s%02d: %s\n",
          ( $caption->{selected} ? ">" : "" ), ++$i,
          $caption->{lang_transl};
    }

    return;
}
# Prompt command "all": mark every caption as selected and show the list.
sub select_all {
    for my $caption (@captions) {
        $caption->{selected} = 1;
    }
    list();
}
# Prompt command "none": clear the selection of every caption and show the
# list.
sub select_none {
    for my $caption (@captions) {
        $caption->{selected} = 0;
    }
    list();
}
# Prompt command "invert": flip the selected flag of every caption and
# show the list.
sub invert_selection {
    for my $caption (@captions) {
        $caption->{selected} = !$caption->{selected};
    }
    list();
}
# Flip the selected flag of one caption and show the list.
#
# Takes the caption's 1-based number as shown by list(). A number outside
# the list is reported on STDERR and changes nothing.
sub toggle_caption {
    my $i = (shift) - 1;

    if ( $i >= 0 && exists $captions[$i] ) {
        $captions[$i]->{selected} = !$captions[$i]->{selected};
        list();
    }
    else {
        print STDERR "error: out of range\n";
    }
}
# Return a copy of the argument with leading and trailing whitespace
# removed.
sub trim {
    my ($text) = @_;

    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
387 __END__
389 =head1 NAME
391 gcap - Youtube closed caption retriever
393 =head1 SYNOPSIS
395 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
396 [--proxy E<lt>addrE<gt> | --no-proxy]
398 =head1 DESCRIPTION
400 gcap is a command line tool for retrieving Youtube closed captions.
401 The retrieved closed captions are saved in SubRip (srt) file format.
402 The srt files are saved as "$videoid_$langid.srt" by default.
404 =head1 OPTIONS
406 --help print help and exit
407 --version print version and exit
408 --license print license and exit
409 -q, --quiet be quiet
410 -i, --interactive run in interactive mode
411 -t, --title parse video title and use it in filename
412 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
413 --proxy arg (=http_env) use proxy for http connections
414 --no-proxy disable use of http proxy
416 =head1 OPTION DESCRIPTIONS
418 =over 4
420 =item B<--help>
422 Print help and exit.
424 =item B<--version>
426 Print version and exit.
428 =item B<--license>
430 Print license and exit.
432 =item B<-q, --quiet>
434 Be quiet.
436 =item B<-i, --interactive>
438 Enable interactive prompt which can be used to select the downloaded
439 closed captions. By default gcap downloads all available captions
440 without prompting.
442 =item B<-t, --title>
444 Parse video title and use it in the output filename(s) instead of
445 video ID. The default is no.
447 =item B<-r, --regexp>=arg
449 Cleanup video title using the specified I<arg> regular expression.
450 The default is "/(\w|\s)/g".
452 =item B<--proxy> I<arg>
454 Use I<arg> for HTTP proxy, e.g. "http://foo:1234". Overrides the http_proxy
455 environment setting.
457 =item B<--no-proxy>
459 Disable the use of an HTTP proxy. Overrides both the C<--proxy> option and the
460 http_proxy environment setting.
462 =back
464 =head1 EXAMPLES
466 =over 4
468 =item B<gcap 0QRO3gKj3qw>
470 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
472 Typical use. Both achieve the same.
474 =back
476 =head1 EXIT STATUS
478 Exits 0 on success, otherwise 1.
480 =head1 FILES
482 =over 4
484 =item $HOME/.gcaprc, for example:
486 echo "--interactive" >> ~/.gcaprc
488 =back
490 =head1 NOTES
492 =over 4
494 =item B<Availability>
496 Not all Youtube videos have closed captions. The following message
497 indicates that the video does not have any closed captions available.
498 URL omitted for brevity.
500 Couldn't parsefile [...] with LWP: no element found at line 1,
501 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
503 =item B<http_proxy>
505 gcap depends on XML::DOM, which uses LWP::UserAgent to retrieve
506 the data. Note that LWP::UserAgent reads the http_proxy environment
507 setting, e.g.:
509 env http_proxy=http://foo:1234 gcap video_id
511 =item B<Project>
513 <http://gcap.googlecode.com/>
515 =item B<Development repository>
517 <git://repo.or.cz/gcap.git>
519 e.g. git clone git://repo.or.cz/gcap.git
521 =back
523 =head1 AUTHOR
525 Toni Gundogdu <legatvs gmail com>
527 =cut