2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 binmode STDOUT
, ":utf8";
25 binmode STDERR
, ":utf8";
27 use version
0.77 (); our $VERSION = version
->declare ("0.0.4");
29 use Getopt
::ArgvFile
( home
=> 1, startupFilename
=> [qw(.gcaprc)] );
30 use Getopt
::Long
qw(:config bundling);
43 'version' => \
&print_version
,
44 'license' => \
&print_license
,
45 'help' => \
&print_help
,
48 $config{regexp
} ||= "/(\\w|\\s)/g";
49 apply_regexp
($config{regexp
}); # Check syntax.
53 print "gcap version $VERSION\n";
59 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
60 see the source for copying conditions. There is NO warranty; not even
61 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
68 Pod
::Usage
::pod2usage
( -exitstatus
=> 0, -verbose
=> 1 );
77 print_help
() unless scalar @ARGV;
79 my $req_body = "http://video.google.com/timedtext?hl=en&type=list&v=";
81 my $q = qr{v[=/]([-_\w]{11,11}+)};
83 if ($url =~ /^http:/i ) {
88 print STDERR
"error: does not look like a youtube video page URL.\n";
93 $url = "$req_body$url";
96 print STDERR
"Checking ...";
100 my $p = new XML
::DOM
::Parser
;
101 my $d = $p->parsefile ($url);
102 my $r = $d->getDocumentElement;
104 for my $e ( $r->getElementsByTagName ("track") ) {
106 name
=> $e->getAttributeNode ("name")->getValue || "",
107 lang_code
=> $e->getAttributeNode ("lang_code")->getValue,
108 lang_transl
=> $e->getAttributeNode ("lang_translated")->getValue,
111 push @captions, \
%tmp;
115 print STDERR
"done.\n";
119 my $v = $1 if $url =~ /$q/ or die "error: no match: video id";
121 get_title
($v) if $config{title
};
122 prompt
() if $config{interactive
};
126 foreach (@captions) {
127 ++$t if $_->{selected
};
132 foreach (@captions) {
134 next unless $_->{selected
};
136 $url = "http://video.google.com/timedtext?"
137 . "hl=$_->{lang_code}"
138 . "&lang=$_->{lang_code}"
142 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code
};
145 $video_title = apply_regexp
($config{regexp
}, $video_title);
146 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code
};
149 open my $fh, ">", $fname or die "$fname: $!\n";
150 binmode $fh, ":utf8";
152 printf STDERR
"(%02d of %02d) ", ++$n, $t if $t > 0;
153 print STDERR
"Saving $fname ...";
155 $d = $p->parsefile ($url);
156 $r = $d->getDocumentElement;
161 for my $e ($r->getElementsByTagName ("text") ) {
163 my $tmp = $e->getFirstChild;
166 my $text = trim
($tmp->getNodeValue);
169 my $start = $e->getAttributeNode ("start")->getValue;
174 if ($start =~ /(\d+)/) {
176 $start_msec = $1 if $start =~ /\d+\.(\d+)/; # should only capture 3 first digits
179 my @start = gmtime ($start_sec);
181 $tmp = $e->getAttributeNode ("dur");
182 my $dur = $tmp ?
$tmp->getValue : $start - $last_start;
184 my $end_sec = $start + $dur;
186 $dur =~ /\d+\.(\d+)/; # should only capture 3 first digits
187 my $end_msec = $1 || 0;
189 my @end = gmtime ($end_sec);
191 printf $fh "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
192 $i++, @start[2,1,0], $start_msec, @end[2,1,0], $end_msec, $text;
194 $last_start = $start;
201 print STDERR
"done.\n";
216 'n' => \
&select_none
,
217 'i' => \
&invert_selection
,
221 print STDERR
"Enter prompt. " . qq/Type "help" to get a list of commands.\n/;
231 if ($ln =~ /(\d+)/) {
235 next unless $ln =~ /(\w)/;
236 $cmds{$1}() if defined $cmds{$1};
245 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
246 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
250 my $a = new LWP
::UserAgent
;
254 my $r = $a->get ($url);
256 unless ($r->is_success) {
257 printf STDERR
"\nerror: $url: %s\n", $r->status_line;
263 my $q = CGI
->new ($r->content);
265 if ($q->param ('reason')) {
266 printf STDERR
"\nerror: %s: %s (errorcode: %d)\n",
267 $url, $q->param ("reason"), $q->param ("errorcode");
271 $video_title = Encode
::decode_utf8
($q->param ('title'));
274 unless ($video_title) {
275 print STDERR
"\nwarning: $url: use id instead\n";
284 if ($re =~ /^\/(.*)\
/(.*)$/) {
289 print STDERR
"error: invalid regexp syntax, expected `/pattern/flags'\n";
295 my $q = $flags =~ /i/ ?
qr/$pat/i : qr/$pat/;
297 return join '', $flags =~ /g/ ?
$s =~ /$q/g : $s =~ /$q/;
301 print STDERR
"Commands:
303 list .. display found captions (> indicates selected for download)
306 invert .. invert selection
307 (number) .. toggle caption
308 get .. download selected captions
309 quit .. quit without downloading captions\n"
310 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
314 foreach (@captions) {
315 if ($_->{selected
}) {
320 print STDERR
"error: you have not selected anything\n";
327 foreach (@captions) {
328 printf STDERR
"%2s%02d: $_->{lang_transl}\n", $_->{selected
} ?
">":"", ++$i;
333 $_->{selected
} = 1 foreach @captions;
338 $_->{selected
} = 0 foreach @captions;
342 sub invert_selection
{
343 $_->{selected
} = !$_->{selected
} foreach @captions;
349 if ($i >= 0 && exists $captions[$i]) {
350 $captions[$i]->{selected
} = !$captions[$i]->{selected
};
354 print STDERR
"error: out of rate\n";
369 gcap - YouTube closed caption retriever
373 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
377 gcap is a command-line tool for retrieving YouTube closed captions.
378 The retrieved closed captions are saved in SubRip (srt) file format.
379 The srt files are saved as "$videoid_$langid.srt" by default.
383 --help print help and exit
384 --version print version and exit
385 --license print license and exit
386 -i, --interactive run in interactive mode
387 -t, --title parse video title and use it in filename
388 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
390 =head1 OPTION DESCRIPTIONS
400 Print version and exit.
404 Print license and exit.
406 =item B<-i, --interactive>
408 Enable the interactive prompt, which can be used to select the closed
409 captions to download. By default, gcap downloads all available captions
414 Parse the video title and use it in the output filename(s) instead of
415 the video ID. This option is off by default.
417 =item B<-r, --regexp>=arg
419 Clean up the video title using the specified I<arg> regular expression.
420 The default is "/(\w|\s)/g".
428 =item B<gcap 0QRO3gKj3qw>
430 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
432 Typical use. Both achieve the same.
438 Exits 0 on success, otherwise 1.
444 =item $HOME/.gcaprc, for example:
446 echo "--interactive" >> ~/.gcaprc
454 =item B<Availability>
456 Not all YouTube videos have closed captions. The following message
457 indicates that the video does not have any closed captions available.
458 URL omitted for brevity.
460 Couldn't parsefile [...] with LWP: no element found at line 1,
461 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
465 gcap depends on XML::DOM, which uses LWP::UserAgent to retrieve
466 the data. Note that LWP::UserAgent reads the http_proxy environment
469 env http_proxy=http://foo:1234 gcap video_id
473 <http://gcap.googlecode.com/>
475 =item B<Development repository>
477 <git://repo.or.cz/gcap.git>
479 e.g. git clone git://repo.or.cz/gcap.git
485 Toni Gundogdu <legatvs gmail com>