cleanup --help
[gcap.git] / bin / gcap
blob564cb19f62d3d63202d72284abb5f6ddcc726a19
1 #!/usr/bin/perl
2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 use warnings;
22 use strict;
24 binmode STDOUT, ":utf8";
25 binmode STDERR, ":utf8";
27 use version 0.77 (); our $VERSION = version->declare ("0.0.4");
29 use Getopt::ArgvFile( home => 1, startupFilename => [qw(.gcaprc)] );
30 use Getopt::Long qw(:config bundling);
32 my %config;
33 my $video_title;
35 exit main();
37 sub init {
38 GetOptions(
39 \%config,
40 'interactive|i',
41 'title|t',
42 'regexp|r=s',
43 'version' => \&print_version,
44 'license' => \&print_license,
45 'help' => \&print_help,
46 ) or exit 1;
48 $config{regexp} ||= "/(\\w|\\s)/g";
49 apply_regexp ($config{regexp}); # Check syntax.
52 sub print_version {
53 print "gcap version $VERSION\n";
54 exit 0;
57 sub print_license {
58 print
59 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
60 see the source for copying conditions. There is NO warranty; not even
61 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
63 exit 0;
66 sub print_help {
67 require Pod::Usage;
68 Pod::Usage::pod2usage( -exitstatus => 0, -verbose => 1 );
71 my @captions;
73 sub main {
75 init();
77 print_help() unless scalar @ARGV;
79 my $req_body = "http://video.google.com/timedtext?hl=en&type=list&v=";
80 my $url = $ARGV[0];
81 my $q = qr{v[=/]([-_\w]{11,11}+)};
83 if ($url =~ /^http:/i ) {
84 if ($url =~ /$q/) {
85 $url = "$req_body$1";
87 else {
88 print STDERR "error: does not look like a youtube video page URL.\n";
89 exit 1;
92 else {
93 $url = "$req_body$url";
96 print STDERR "Checking ...";
98 require XML::DOM;
100 my $p = new XML::DOM::Parser;
101 my $d = $p->parsefile ($url);
102 my $r = $d->getDocumentElement;
104 for my $e ( $r->getElementsByTagName ("track") ) {
105 my %tmp = (
106 name => $e->getAttributeNode ("name")->getValue || "",
107 lang_code => $e->getAttributeNode ("lang_code")->getValue,
108 lang_transl => $e->getAttributeNode ("lang_translated")->getValue,
109 selected => 1
111 push @captions, \%tmp;
112 print STDERR ".";
115 print STDERR "done.\n";
117 $d->dispose;
119 my $v = $1 if $url =~ /$q/ or die "error: no match: video id";
121 get_title ($v) if $config{title};
122 prompt() if $config{interactive};
124 my $t = 0;
126 foreach (@captions) {
127 ++$t if $_->{selected};
130 my $n = 0;
132 foreach (@captions) {
134 next unless $_->{selected};
136 $url = "http://video.google.com/timedtext?"
137 . "hl=$_->{lang_code}"
138 . "&lang=$_->{lang_code}"
139 . "&name=$_->{name}"
140 . "&v=$v";
142 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code};
144 if ($video_title) {
145 $video_title = apply_regexp ($config{regexp}, $video_title);
146 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code};
149 open my $fh, ">", $fname or die "$fname: $!\n";
150 binmode $fh, ":utf8";
152 printf STDERR "(%02d of %02d) ", ++$n, $t if $t > 0;
153 print STDERR "Saving $fname ...";
155 $d = $p->parsefile ($url);
156 $r = $d->getDocumentElement;
158 my $i = 1;
159 my $last_start = 0;
161 for my $e ($r->getElementsByTagName ("text") ) {
163 my $tmp = $e->getFirstChild;
164 next unless $tmp;
166 my $text = trim ($tmp->getNodeValue);
167 next unless $text;
169 my $start = $e->getAttributeNode ("start")->getValue;
171 my $start_sec = 0;
172 my $start_msec = 0;
174 if ($start =~ /(\d+)/) {
175 $start_sec = $1;
176 $start_msec = $1 if $start =~ /\d+\.(\d+)/; # should only capture 3 first digits
179 my @start = gmtime ($start_sec);
181 $tmp = $e->getAttributeNode ("dur");
182 my $dur = $tmp ? $tmp->getValue : $start - $last_start;
184 my $end_sec = $start + $dur;
186 $dur =~ /\d+\.(\d+)/; # should only capture 3 first digits
187 my $end_msec = $1 || 0;
189 my @end = gmtime ($end_sec);
191 printf $fh "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
192 $i++, @start[2,1,0], $start_msec, @end[2,1,0], $end_msec, $text;
194 $last_start = $start;
197 $d->dispose;
199 close $fh;
201 print STDERR "done.\n";
204 return 0;
207 my $done = 0;
209 sub prompt {
211 my %cmds = (
212 'h' => \&help,
213 'q' => \&quit,
214 'l' => \&list,
215 'a' => \&select_all,
216 'n' => \&select_none,
217 'i' => \&invert_selection,
218 'g' => \&get,
221 print STDERR "Enter prompt. " . qq/Type "help" to get a list of commands.\n/;
222 list();
224 my $p = "(gcap) ";
226 while (!$done) {
227 print STDERR $p;
228 my $ln = <STDIN>;
229 next unless $ln;
230 chomp $ln;
231 if ($ln =~ /(\d+)/) {
232 toggle_caption ($1);
234 else {
235 next unless $ln =~ /(\w)/;
236 $cmds{$1}() if defined $cmds{$1};
241 sub get_title {
243 my $v = shift;
245 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
246 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
248 require LWP;
250 my $a = new LWP::UserAgent;
252 $a->env_proxy;
254 my $r = $a->get ($url);
256 unless ($r->is_success) {
257 printf STDERR "\nerror: $url: %s\n", $r->status_line;
258 return;
261 require CGI;
263 my $q = CGI->new ($r->content);
265 if ($q->param ('reason')) {
266 printf STDERR "\nerror: %s: %s (errorcode: %d)\n",
267 $url, $q->param ("reason"), $q->param ("errorcode");
269 else {
270 require Encode;
271 $video_title = Encode::decode_utf8 ($q->param ('title'));
274 unless ($video_title) {
275 print STDERR "\nwarning: $url: use id instead\n";
279 sub apply_regexp {
281 my ($re,$s) = @_;
282 my ($pat, $flags);
284 if ($re =~ /^\/(.*)\/(.*)$/) {
285 $pat = $1;
286 $flags = $2;
288 else {
289 print STDERR "error: invalid regexp syntax, expected `/pattern/flags'\n";
290 exit 1;
293 return unless $s;
295 my $q = $flags =~ /i/ ? qr/$pat/i : qr/$pat/;
297 return join '', $flags =~ /g/ ? $s =~ /$q/g : $s =~ /$q/;
300 sub help {
301 print STDERR "Commands:
302 help .. this
303 list .. display found captions (> indicates selected for download)
304 all .. select all
305 none .. select none
306 invert .. invert selection
307 (number) .. toggle caption
308 get .. download selected captions
309 quit .. quit without downloading captions\n"
310 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
313 sub get {
314 foreach (@captions) {
315 if ($_->{selected}) {
316 $done = 1;
317 return;
320 print STDERR "error: you have not selected anything\n";
323 sub quit { exit 0; }
325 sub list {
326 my $i = 0;
327 foreach (@captions) {
328 printf STDERR "%2s%02d: $_->{lang_transl}\n", $_->{selected} ? ">":"", ++$i;
332 sub select_all {
333 $_->{selected} = 1 foreach @captions;
334 list();
337 sub select_none {
338 $_->{selected} = 0 foreach @captions;
339 list();
342 sub invert_selection {
343 $_->{selected} = !$_->{selected} foreach @captions;
344 list();
347 sub toggle_caption {
348 my $i = (shift) - 1;
349 if ($i >= 0 && exists $captions[$i]) {
350 $captions[$i]->{selected} = !$captions[$i]->{selected};
351 list();
353 else {
354 print STDERR "error: out of rate\n";
358 sub trim {
359 my $s = shift;
360 $s =~ s/^\s+//;
361 $s =~ s/\s+$//;
362 return $s;
365 __END__
367 =head1 NAME
369 gcap - Youtube closed caption retriever
371 =head1 SYNOPSIS
373 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
375 =head1 DESCRIPTION
377 gcap is a command line tool for retrieving Youtube closed captions.
378 The retrieved closed captions are saved in SubRip (srt) file format.
379 The srt files are saved as "$videoid_$langid.srt" by default.
381 =head1 OPTIONS
383 --help print help and exit
384 --version print version and exit
385 --license print license and exit
386 -i, --interactive run in interactive mode
387 -t, --title parse video title and use it in filename
388 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
390 =head1 OPTION DESCRIPTIONS
392 =over 4
394 =item B<--help>
396 Print help and exit.
398 =item B<--version>
400 Print version and exit.
402 =item B<--license>
404 Print license and exit.
406 =item B<-i, --interactive>
408 Enable interactive prompt which can be used to select the downloaded
409 closed captions. By default gcap downloads all available captions
410 without prompting.
412 =item B<-t, --title>
414 Parse video title and use it in the output filename(s) instead of
415 video ID. The default is no.
417 =item B<-r, --regexp>=arg
419 Cleanup video title using the specified I<arg> regular expression.
420 The default is "/(\w|\s)/g".
422 =back
424 =head1 EXAMPLES
426 =over 4
428 =item B<gcap 0QRO3gKj3qw>
430 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
432 Typical use. Both achieve the same.
434 =back
436 =head1 EXIT STATUS
438 Exits 0 on success, otherwise 1.
440 =head1 FILES
442 =over 4
444 =item $HOME/.gcaprc, for example:
446 echo "--interactive" >> ~/.gcaprc
448 =back
450 =head1 NOTES
452 =over 4
454 =item B<Availability>
456 Not all Youtube videos have closed captions. The following message
457 indicates that the video does not have any closed captions available.
458 URL omitted for brevity.
460 Couldn't parsefile [...] with LWP: no element found at line 1,
461 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
463 =item B<http_proxy>
465 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
466 the data. Note that LWP::UserAgent reads http_proxy environment
467 setting. e.g.:
469 env http_proxy=http://foo:1234 gcap video_id
471 =item B<Project>
473 <http://gcap.googlecode.com/>
475 =item B<Development repository>
477 <git://repo.or.cz/gcap.git>
479 e.g. git clone git://repo.or.cz/gcap.git
481 =back
483 =head1 AUTHOR
485 Toni Gundogdu <legatvs gmail com>
487 =cut