initial.
[gcap.git] / bin / gcap
blob725c8dd3517c2ee9e6d6215cba4a5da3cdf720e5
1 #!/usr/bin/perl
2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 use warnings;
22 use strict;
24 binmode STDOUT, ":utf8";
25 binmode STDERR, ":utf8";
27 use Getopt::ArgvFile( home => 1, startupFilename => [qw(.gcaprc)] );
28 use Getopt::Long qw(:config bundling);
30 my $VERSION = "0.0.1";
31 my %config;
32 my $video_title;
34 exit main();
36 sub init {
37 GetOptions(
38 \%config,
39 'interactive|i',
40 'title|t',
41 'regexp|r=s',
42 'version' => \&print_version,
43 'license' => \&print_license,
44 'help' => \&print_help,
45 ) or exit 1;
47 $config{regexp} ||= "/(\\w|\\s)/g";
48 apply_regexp ($config{regexp}); # Check syntax.
51 sub print_version {
52 print "gcap version $VERSION\n";
53 exit 0;
56 sub print_license {
57 print
58 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
59 see the source for copying conditions. There is NO warranty; not even
60 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
62 exit 0;
65 sub print_help {
66 require Pod::Usage;
67 Pod::Usage::pod2usage( -exitstatus => 0, -verbose => 1 );
70 my @captions;
72 sub main {
74 init();
76 print_help() unless scalar @ARGV;
78 my $req_body = "http://video.google.com/timedtext?hl=en&type=list&v=";
79 my $url = $ARGV[0];
81 if ($url =~ /^http:/i ) {
82 if ($url =~ /v=([-_\w]+)/) {
83 $url = "$req_body$1";
85 else {
86 print STDERR "error: does not look like a youtube video page URL.\n";
87 exit 1;
90 else {
91 $url = "$req_body$url";
94 print STDERR "Checking ...";
96 require XML::DOM;
98 my $p = new XML::DOM::Parser;
99 my $d = $p->parsefile ($url);
100 my $r = $d->getDocumentElement;
102 for my $e ( $r->getElementsByTagName ("track") ) {
103 my %tmp = (
104 name => $e->getAttributeNode ("name")->getValue || "",
105 lang_code => $e->getAttributeNode ("lang_code")->getValue,
106 lang_transl => $e->getAttributeNode ("lang_translated")->getValue,
107 selected => 1
109 push @captions, \%tmp;
110 print STDERR ".";
113 print STDERR "done.\n";
115 $d->dispose;
117 my $v = $1 if $url =~ /v=([-_\w]+)/;
119 get_title ($v) if $config{title};
120 prompt() if $config{interactive};
122 my $t = 0;
124 foreach (@captions) {
125 ++$t if $_->{selected};
128 my $n = 0;
130 foreach (@captions) {
132 next unless $_->{selected};
134 $url = "http://video.google.com/timedtext?"
135 . "hl=$_->{lang_code}"
136 . "&lang=$_->{lang_code}"
137 . "&name=$_->{name}"
138 . "&v=$v";
140 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code};
142 if ($video_title) {
143 $video_title = apply_regexp ($config{regexp}, $video_title);
144 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code};
147 open my $fh, ">", $fname or die "$fname: $!\n";
148 binmode $fh, ":utf8";
150 printf STDERR "(%02d of %02d) ", ++$n, $t if $t > 0;
151 print STDERR "Saving $fname ...";
153 $d = $p->parsefile ($url);
154 $r = $d->getDocumentElement;
156 my $i = 1;
157 my $last_start = 0;
159 for my $e ($r->getElementsByTagName ("text") ) {
161 my $tmp = $e->getFirstChild;
162 next unless $tmp;
164 my $text = trim ($tmp->getNodeValue);
165 next unless $text;
167 my $start = $e->getAttributeNode ("start")->getValue;
169 my $start_sec = 0;
170 my $start_msec = 0;
172 if ($start =~ /(\d+)/) {
173 $start_sec = $1;
174 $start_msec = $1 if $start =~ /\d+\.(\d+)/; # should only capture 3 first digits
177 my @start = gmtime ($start_sec);
179 $tmp = $e->getAttributeNode ("dur");
180 my $dur = $tmp ? $tmp->getValue : $start - $last_start;
182 my $end_sec = $start + $dur;
184 $dur =~ /\d+\.(\d+)/; # should only capture 3 first digits
185 my $end_msec = $1 || 0;
187 my @end = gmtime ($end_sec);
189 printf $fh "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
190 $i++, @start[2,1,0], $start_msec, @end[2,1,0], $end_msec, $text;
192 $last_start = $start;
195 $d->dispose;
197 close $fh;
199 print STDERR "done.\n";
202 return 0;
205 my $done = 0;
207 sub prompt {
209 my %cmds = (
210 'h' => \&help,
211 'q' => \&quit,
212 'l' => \&list,
213 'a' => \&select_all,
214 'n' => \&select_none,
215 'i' => \&invert_selection,
216 'g' => \&get,
219 print STDERR "Enter prompt. " . qq/Type "help" to get a list of commands.\n/;
220 list();
222 my $p = "(gcap) ";
224 while (!$done) {
225 print STDERR $p;
226 my $ln = <STDIN>;
227 next unless $ln;
228 chomp $ln;
229 if ($ln =~ /(\d+)/) {
230 toggle_caption ($1);
232 else {
233 next unless $ln =~ /(\w)/;
234 $cmds{$1}() if defined $cmds{$1};
239 sub get_title {
241 my $v = shift;
243 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
244 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
246 require LWP;
248 my $a = new LWP::UserAgent;
249 my $r = $a->get ($url);
251 unless ($r->is_success) {
252 print STDERR "error: " . $r->status_line
253 . "\nerror: while trying to fetch video title\n";
254 return;
257 require URI::Escape;
259 my $config = URI::Escape::uri_unescape ($r->content);
261 $config =~ s/\+/ /g;
263 if ($config =~ /&reason=(.*?)[?:&]?$/) {
264 my $e = $1;
265 print STDERR "error: $e\n";
267 else {
268 $video_title = $1 if $config =~ /&title=(.*?)&/;
271 unless ($video_title) {
272 print STDERR "warning: Could not match video title. "
273 . "Use video ID instead of title.\n";
277 sub apply_regexp {
279 my ($re,$s) = @_;
280 my ($pat, $flags);
282 if ($re =~ /^\/(.*)\/(.*)$/) {
283 $pat = $1;
284 $flags = $2;
286 else {
287 print STDERR "error: invalid regexp syntax, expected `/pattern/flags'\n";
288 exit 1;
291 return unless $s;
293 my $q = $flags =~ /i/ ? qr/$pat/i : qr/$pat/;
295 return join '', $flags =~ /g/ ? $s =~ /$q/g : $s =~ /$q/;
298 sub help {
299 print STDERR "Commands:
300 help .. this
301 list .. display found captions (> indicates selected for download)
302 all .. select all
303 none .. select none
304 invert .. invert selection
305 (number) .. toggle caption
306 get .. download selected captions
307 quit .. quit without downloading captions\n"
308 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
311 sub get {
312 foreach (@captions) {
313 if ($_->{selected}) {
314 $done = 1;
315 return;
318 print STDERR "error: you have not selected anything\n";
321 sub quit { exit 0; }
323 sub list {
324 my $i = 0;
325 foreach (@captions) {
326 printf STDERR "%2s%02d: $_->{lang_transl}\n", $_->{selected} ? ">":"", ++$i;
330 sub select_all {
331 $_->{selected} = 1 foreach @captions;
332 list();
335 sub select_none {
336 $_->{selected} = 0 foreach @captions;
337 list();
340 sub invert_selection {
341 $_->{selected} = !$_->{selected} foreach @captions;
342 list();
345 sub toggle_caption {
346 my $i = (shift) - 1;
347 if ($i >= 0 && exists $captions[$i]) {
348 $captions[$i]->{selected} = !$captions[$i]->{selected};
349 list();
351 else {
352 print STDERR "error: out of rate\n";
356 sub trim {
357 my $s = shift;
358 $s =~ s/^\s+//;
359 $s =~ s/\s+$//;
360 return $s;
363 __END__
365 =head1 NAME
367 gcap - Youtube closed caption retriever
369 =head1 SYNOPSIS
371 gcap [options] [URL|VIDEO_ID]
373 =head1 DESCRIPTION
375 gcap is a command line tool for retrieving Youtube closed captions.
376 The retrieved closed captions are saved in SubRip (srt) file format.
377 The srt files are saved as "$videoid_$langid.srt" by default.
379 =head1 OPTIONS
381 --help print help and exit
382 --version print version and exit
383 --license print license and exit
384 -i, --interactive run in interactive mode, default is no
385 -t, --title parse video title and use it in filename, default is no
386 -r, --regexp =arg cleanup title with regexp, default is /(\w|\s)/g
388 =head1 OPTION DESCRIPTIONS
390 =over 4
392 =item B<--help>
394 Print help and exit.
396 =item B<--version>
398 Print version and exit.
400 =item B<--license>
402 Print license and exit.
404 =item B<-i, --interactive>
406 Enable interactive prompt which can be used to select the downloaded
407 closed captions. By default gcap downloads all available captions
408 without prompting.
410 =item B<-t, --title>
412 Parse video title and use it in the output filename(s) instead of
413 video ID. The default is no.
415 =item B<-r, --regexp>=arg
417 Cleanup video title using the specified I<arg> regular expression.
418 The default is "/(\w|\s)/g".
420 =back
422 =head1 EXAMPLES
424 =over 4
426 =item B<gcap 0QRO3gKj3qw>
428 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
430 Typical use. Both achieve the same.
432 =back
434 =head1 EXIT STATUS
436 Exits 0 on success, otherwise 1.
438 =head1 FILES
440 =over 4
442 =item $HOME/.gcaprc, for example:
444 echo "--interactive" >> ~/.gcaprc
446 =back
448 =head1 NOTES
450 =over 4
452 =item B<Availability>
454 Not all Youtube videos have closed captions. The following message
455 indicates that the video does not have any closed captions available.
456 URL omitted for brevity.
458 Couldn't parsefile [...] with LWP: no element found at line 1,
459 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
461 =item B<http_proxy>
463 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
464 the data. Note that LWP::UserAgent reads http_proxy environment
465 setting. e.g.:
467 env http_proxy=http://foo:1234 gcap video_id
469 =item B<Project>
471 <http://gcap.googlecode.com/>
473 =item B<Development repository>
475 <git://repo.or.cz/gcap.git>
477 e.g. git clone git://repo.or.cz/gcap.git
479 =back
481 =head1 AUTHOR
483 Toni Gundogdu <legatvs gmail com>
485 =cut