Apply perltidy -l=72 style
[gcap.git] / bin / gcap
blob15be35adbaf1229a521cb0fb5f1ffe61f626f6cf
1 #!/usr/bin/perl
2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 use warnings;
22 use strict;
24 binmode STDOUT, ":utf8";
25 binmode STDERR, ":utf8";
27 use version 0.77 (); our $VERSION = version->declare("0.0.5");
29 use Getopt::ArgvFile( home => 1, startupFilename => [qw(.gcaprc)] );
30 use Getopt::Long qw(:config bundling);
32 my %config;
33 my $video_title;
35 exit main();
37 sub init {
38 GetOptions(
39 \%config,
40 'interactive|i',
41 'title|t',
42 'regexp|r=s',
43 'proxy=s',
44 'no_proxy|no-proxy',
45 'quiet|q',
46 'version' => \&print_version,
47 'license' => \&print_license,
48 'help' => \&print_help,
49 ) or exit 1;
51 $config{regexp} ||= "/(\\w|\\s)/g";
52 apply_regexp( $config{regexp} ); # Check syntax.
55 sub print_version {
56 print "gcap version $VERSION\n";
57 exit 0;
60 sub print_license {
61 print
62 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
63 see the source for copying conditions. There is NO warranty; not even
64 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
66 exit 0;
69 sub print_help {
70 require Pod::Usage;
71 Pod::Usage::pod2usage( -exitstatus => 0, -verbose => 1 );
74 my @captions;
76 sub main {
78 init();
80 print_help() unless scalar @ARGV;
82 my $req_body =
83 "http://video.google.com/timedtext?hl=en&type=list&v=";
84 my $url = $ARGV[0];
85 my $q = qr{v[=/]((?>[-_\w]{11}))};
87 if ( $url =~ /^http:/i ) {
88 if ( $url =~ /$q/ ) {
89 $url = "$req_body$1";
91 else {
92 print STDERR
93 "error: does not look like a youtube video page URL.\n";
94 exit 1;
97 else {
98 $url = "$req_body$url";
101 print STDERR "Checking ..." unless $config{quiet};
103 require LWP;
105 my $a = new LWP::UserAgent;
107 $a->env_proxy; # http://search.cpan.org/perldoc?LWP::UserAgent
109 $a->proxy( 'http', $config{proxy} ) if $config{proxy};
110 $a->no_proxy('') if $config{no_proxy};
112 require XML::DOM;
114 my $p = new XML::DOM::Parser( LWP_UserAgent => $a );
115 my $d = $p->parsefile($url);
116 my $r = $d->getDocumentElement;
118 for my $e ( $r->getElementsByTagName("track") ) {
119 my %tmp = (
120 name => $e->getAttributeNode("name")->getValue || "",
121 lang_code => $e->getAttributeNode("lang_code")->getValue,
122 lang_transl =>
123 $e->getAttributeNode("lang_translated")->getValue,
124 selected => 1
126 push @captions, \%tmp;
127 print STDERR "." unless $config{quiet};
130 print STDERR "done.\n" unless $config{quiet};
132 $d->dispose;
134 my $v = $1 if $url =~ /$q/ or die "error: no match: video id";
136 get_title( $v, $a ) if $config{title};
137 prompt() if $config{interactive};
139 my $t = 0;
141 foreach (@captions) {
142 ++$t if $_->{selected};
145 my $n = 0;
147 foreach (@captions) {
149 next unless $_->{selected};
151 $url =
152 "http://video.google.com/timedtext?"
153 . "hl=$_->{lang_code}"
154 . "&lang=$_->{lang_code}"
155 . "&name=$_->{name}" . "&v=$v";
157 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code};
159 if ($video_title) {
160 $video_title =
161 apply_regexp( $config{regexp}, $video_title );
162 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code};
165 open my $fh, ">", $fname or die "$fname: $!\n";
166 binmode $fh, ":utf8";
168 unless ( $config{quiet} ) {
169 printf STDERR "(%02d of %02d) ", ++$n, $t if $t > 0;
170 print STDERR "Saving $fname ...";
173 $d = $p->parsefile($url);
174 $r = $d->getDocumentElement;
176 my $i = 1;
177 my $last_start = 0;
179 for my $e ( $r->getElementsByTagName("text") ) {
181 my $tmp = $e->getFirstChild;
182 next unless $tmp;
184 my $text = trim( $tmp->getNodeValue );
185 next unless $text;
187 my $start = $e->getAttributeNode("start")->getValue;
189 my $start_sec = 0;
190 my $start_msec = 0;
192 if ( $start =~ /(\d+)/ ) {
193 $start_sec = $1;
194 $start_msec = $1
195 if $start =~
196 /\d+\.(\d+)/; # should only capture 3 first digits
199 my @start = gmtime($start_sec);
201 $tmp = $e->getAttributeNode("dur");
202 my $dur = $tmp ? $tmp->getValue : $start - $last_start;
204 my $end_sec = $start + $dur;
206 $dur =~ /\d+\.(\d+)/; # should only capture 3 first digits
207 my $end_msec = $1 || 0;
209 my @end = gmtime($end_sec);
211 printf $fh
212 "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
213 $i++, @start[ 2, 1, 0 ], $start_msec, @end[ 2, 1, 0 ],
214 $end_msec, $text;
216 $last_start = $start;
219 $d->dispose;
221 close $fh;
223 print STDERR "done.\n" unless $config{quiet};
226 return 0;
229 my $done = 0;
231 sub prompt {
233 my %cmds = (
234 'h' => \&help,
235 'q' => \&quit,
236 'l' => \&list,
237 'a' => \&select_all,
238 'n' => \&select_none,
239 'i' => \&invert_selection,
240 'g' => \&get,
243 print STDERR "Enter prompt. "
244 . qq/Type "help" to get a list of commands.\n/;
245 list();
247 my $p = "(gcap) ";
249 while ( !$done ) {
250 print STDERR $p;
251 my $ln = <STDIN>;
252 next unless $ln;
253 chomp $ln;
254 if ( $ln =~ /(\d+)/ ) {
255 toggle_caption($1);
257 else {
258 next unless $ln =~ /(\w)/;
259 $cmds{$1}() if defined $cmds{$1};
264 sub get_title {
265 my ( $v, $a ) = @_;
267 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
268 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
270 my $r = $a->get($url);
272 unless ( $r->is_success ) {
273 printf STDERR "\nerror: $url: %s\n", $r->status_line;
274 return;
277 require CGI;
279 my $q = CGI->new( $r->content );
281 if ( $q->param('reason') ) {
282 printf STDERR "\nerror: %s: %s (errorcode: %d)\n",
283 $url, $q->param("reason"), $q->param("errorcode");
285 else {
286 require Encode;
287 $video_title = Encode::decode_utf8( $q->param('title') );
290 unless ($video_title) {
291 print STDERR "\nwarning: $url: use id instead\n"
292 unless $config{quiet};
296 sub apply_regexp {
298 my ( $re, $s ) = @_;
299 my ( $pat, $flags );
301 if ( $re =~ /^\/(.*)\/(.*)$/ ) {
302 $pat = $1;
303 $flags = $2;
305 else {
306 print STDERR
307 "error: invalid regexp syntax, expected `/pattern/flags'\n";
308 exit 1;
311 return unless $s;
313 my $q = $flags =~ /i/ ? qr/$pat/i : qr/$pat/;
315 return join '', $flags =~ /g/ ? $s =~ /$q/g : $s =~ /$q/;
318 sub help {
319 print STDERR "Commands:
320 help .. this
321 list .. display found captions (> indicates selected for download)
322 all .. select all
323 none .. select none
324 invert .. invert selection
325 (number) .. toggle caption
326 get .. download selected captions
327 quit .. quit without downloading captions\n"
328 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
331 sub get {
332 foreach (@captions) {
333 if ( $_->{selected} ) {
334 $done = 1;
335 return;
338 print STDERR "error: you have not selected anything\n";
341 sub quit { exit 0; }
343 sub list {
344 my $i = 0;
345 foreach (@captions) {
346 printf STDERR "%2s%02d: $_->{lang_transl}\n",
347 $_->{selected} ? ">" : "", ++$i;
351 sub select_all {
352 $_->{selected} = 1 foreach @captions;
353 list();
356 sub select_none {
357 $_->{selected} = 0 foreach @captions;
358 list();
361 sub invert_selection {
362 $_->{selected} = !$_->{selected} foreach @captions;
363 list();
366 sub toggle_caption {
367 my $i = (shift) - 1;
368 if ( $i >= 0 && exists $captions[$i] ) {
369 $captions[$i]->{selected} = !$captions[$i]->{selected};
370 list();
372 else {
373 print STDERR "error: out of rate\n";
377 sub trim {
378 my $s = shift;
379 $s =~ s/^\s+//;
380 $s =~ s/\s+$//;
381 return $s;
384 __END__
386 =head1 NAME
388 gcap - Youtube closed caption retriever
390 =head1 SYNOPSIS
392 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
393 [--proxy E<lt>addrE<gt> | --no-proxy]
395 =head1 DESCRIPTION
397 gcap is a command line tool for retrieving Youtube closed captions.
398 The retrieved closed captions are saved in SubRip (srt) file format.
399 The srt files are saved as "$videoid_$langid.srt" by default.
401 =head1 OPTIONS
403 --help print help and exit
404 --version print version and exit
405 --license print license and exit
406 -q, --quiet be quiet
407 -i, --interactive run in interactive mode
408 -t, --title parse video title and use it in filename
409 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
410 --proxy arg (=http_env) use proxy for http connections
411 --no-proxy disable use of http proxy
413 =head1 OPTION DESCRIPTIONS
415 =over 4
417 =item B<--help>
419 Print help and exit.
421 =item B<--version>
423 Print version and exit.
425 =item B<--license>
427 Print license and exit.
429 =item B<-q, --quiet>
431 Be quiet.
433 =item B<-i, --interactive>
435 Enable interactive prompt which can be used to select the downloaded
436 closed captions. By default gcap downloads all available captions
437 without prompting.
439 =item B<-t, --title>
441 Parse video title and use it in the output filename(s) instead of
442 video ID. The default is no.
444 =item B<-r, --regexp>=arg
446 Cleanup video title using the specified I<arg> regular expression.
447 The default is "/(\w|\s)/g".
449 =item B<--proxy> I<arg>
451 Use I<arg> for HTTP proxy, e.g. "http://foo:1234". Overrides the http_proxy
452 environment setting.
454 =item B<--no-proxy>
456 Disable use of HTTP proxy. Overrides both C<--proxy> and http_proxy environment
457 settings.
459 =back
461 =head1 EXAMPLES
463 =over 4
465 =item B<gcap 0QRO3gKj3qw>
467 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
469 Typical use. Both achieve the same.
471 =back
473 =head1 EXIT STATUS
475 Exits 0 on success, otherwise 1.
477 =head1 FILES
479 =over 4
481 =item $HOME/.gcaprc, for example:
483 echo "--interactive" >> ~/.gcaprc
485 =back
487 =head1 NOTES
489 =over 4
491 =item B<Availability>
493 Not all Youtube videos have closed captions. The following message
494 indicates that the video does not have any closed captions available.
495 URL omitted for brevity.
497 Couldn't parsefile [...] with LWP: no element found at line 1,
498 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
500 =item B<http_proxy>
502 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
503 the data. Note that LWP::UserAgent reads http_proxy environment
504 setting. e.g.:
506 env http_proxy=http://foo:1234 gcap video_id
508 =item B<Project>
510 <http://gcap.googlecode.com/>
512 =item B<Development repository>
514 <git://repo.or.cz/gcap.git>
516 e.g. git clone git://repo.or.cz/gcap.git
518 =back
520 =head1 AUTHOR
522 Toni Gundogdu <legatvs gmail com>
524 =cut