2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 binmode STDOUT
, ":utf8";
25 binmode STDERR
, ":utf8";
27 use Getopt
::ArgvFile
( home
=> 1, startupFilename
=> [qw(.gcaprc)] );
28 use Getopt
::Long
qw(:config bundling);
30 my $VERSION = "0.0.1";
42 'version' => \
&print_version
,
43 'license' => \
&print_license
,
44 'help' => \
&print_help
,
47 $config{regexp
} ||= "/(\\w|\\s)/g";
48 apply_regexp
($config{regexp
}); # Check syntax.
52 print "gcap version $VERSION\n";
58 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
59 see the source for copying conditions. There is NO warranty; not even
60 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
67 Pod
::Usage
::pod2usage
( -exitstatus
=> 0, -verbose
=> 1 );
76 print_help
() unless scalar @ARGV;
78 my $req_body = "http://video.google.com/timedtext?hl=en&type=list&v=";
81 if ($url =~ /^http:/i ) {
82 if ($url =~ /v=([-_\w]+)/) {
86 print STDERR
"error: does not look like a youtube video page URL.\n";
91 $url = "$req_body$url";
94 print STDERR
"Checking ...";
# Construct the XML parser with a direct method call; the indirect-object
# form (`new Class`) depends on heuristic parsing and is discouraged.
my $p = XML::DOM::Parser->new;
99 my $d = $p->parsefile ($url);
100 my $r = $d->getDocumentElement;
102 for my $e ( $r->getElementsByTagName ("track") ) {
104 name
=> $e->getAttributeNode ("name")->getValue || "",
105 lang_code
=> $e->getAttributeNode ("lang_code")->getValue,
106 lang_transl
=> $e->getAttributeNode ("lang_translated")->getValue,
109 push @captions, \
%tmp;
113 print STDERR
"done.\n";
# Extract the video ID from the URL; $v is undef when there is no "v="
# parameter. A list-context match is used instead of `my ... if ...`,
# whose behaviour perlsyn documents as undefined.
my ($v) = $url =~ /v=([-_\w]+)/;
119 get_title
($v) if $config{title
};
120 prompt
() if $config{interactive
};
124 foreach (@captions) {
125 ++$t if $_->{selected
};
130 foreach (@captions) {
132 next unless $_->{selected
};
134 $url = "http://video.google.com/timedtext?"
135 . "hl=$_->{lang_code}"
136 . "&lang=$_->{lang_code}"
140 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code
};
143 $video_title = apply_regexp
($config{regexp
}, $video_title);
144 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code
};
147 open my $fh, ">", $fname or die "$fname: $!\n";
148 binmode $fh, ":utf8";
150 printf STDERR
"(%02d of %02d) ", ++$n, $t if $t > 0;
151 print STDERR
"Saving $fname ...";
153 $d = $p->parsefile ($url);
154 $r = $d->getDocumentElement;
159 for my $e ($r->getElementsByTagName ("text") ) {
161 my $tmp = $e->getFirstChild;
164 my $text = trim
($tmp->getNodeValue);
167 my $start = $e->getAttributeNode ("start")->getValue;
172 if ($start =~ /(\d+)/) {
# Convert the fractional part of the start time to whole milliseconds:
# right-pad to three digits and drop anything beyond them, so that
# "1.5" yields 500 and "1.0625" yields 062 (not 5 and 625).
$start_msec = substr( "${1}00", 0, 3 ) if $start =~ /\d+\.(\d+)/;
177 my @start = gmtime ($start_sec);
179 $tmp = $e->getAttributeNode ("dur");
180 my $dur = $tmp ?
$tmp->getValue : $start - $last_start;
# Derive the end timestamp from start + duration. Working in whole
# milliseconds keeps the seconds and the ",mmm" field consistent: the
# milliseconds belong to the end time itself, not to the fractional digits
# of the duration, and no regexp capture variable is involved.
my $end_total_msec = int( ( $start + $dur ) * 1000 + 0.5 );
my $end_sec        = int( $end_total_msec / 1000 );
my $end_msec       = $end_total_msec % 1000;
my @end            = gmtime($end_sec);
189 printf $fh "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
190 $i++, @start[2,1,0], $start_msec, @end[2,1,0], $end_msec, $text;
192 $last_start = $start;
199 print STDERR
"done.\n";
214 'n' => \
&select_none
,
215 'i' => \
&invert_selection
,
219 print STDERR
"Enter prompt. " . qq/Type "help" to get a list of commands.\n/;
229 if ($ln =~ /(\d+)/) {
233 next unless $ln =~ /(\w)/;
234 $cmds{$1}() if defined $cmds{$1};
243 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
244 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
# Create the HTTP client with a direct method call rather than the
# indirect-object form (`new Class`).
# NOTE(review): a lexical $a shadows the $a used by sort blocks; consider
# renaming it together with its later uses.
my $a = LWP::UserAgent->new;
249 my $r = $a->get ($url);
251 unless ($r->is_success) {
252 print STDERR
"error: " . $r->status_line
253 . "\nerror: while trying to fetch video title\n";
259 my $config = URI
::Escape
::uri_unescape
($r->content);
263 if ($config =~ /&reason=(.*?)[?:&]?$/) {
265 print STDERR
"error: $e\n";
268 $video_title = $1 if $config =~ /&title=(.*?)&/;
271 unless ($video_title) {
272 print STDERR
"warning: Could not match video title. "
273 . "Use video ID instead of title.\n";
282 if ($re =~ /^\/(.*)\
/(.*)$/) {
287 print STDERR
"error: invalid regexp syntax, expected `/pattern/flags'\n";
293 my $q = $flags =~ /i/ ?
qr/$pat/i : qr/$pat/;
295 return join '', $flags =~ /g/ ?
$s =~ /$q/g : $s =~ /$q/;
299 print STDERR
"Commands:
301 list .. display found captions (> indicates selected for download)
304 invert .. invert selection
305 (number) .. toggle caption
306 get .. download selected captions
307 quit .. quit without downloading captions\n"
308 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
312 foreach (@captions) {
313 if ($_->{selected
}) {
318 print STDERR
"error: you have not selected anything\n";
325 foreach (@captions) {
326 printf STDERR
"%2s%02d: $_->{lang_transl}\n", $_->{selected
} ?
">":"", ++$i;
331 $_->{selected
} = 1 foreach @captions;
336 $_->{selected
} = 0 foreach @captions;
340 sub invert_selection
{
341 $_->{selected
} = !$_->{selected
} foreach @captions;
347 if ($i >= 0 && exists $captions[$i]) {
348 $captions[$i]->{selected
} = !$captions[$i]->{selected
};
# The requested index does not refer to an entry in @captions.
print STDERR "error: out of range\n";
367 gcap - Youtube closed caption retriever
371 gcap [options] [URL|VIDEO_ID]
375 gcap is a command line tool for retrieving Youtube closed captions.
376 The retrieved closed captions are saved in SubRip (srt) file format.
377 The srt files are saved as "$videoid_$langid.srt" by default.
381 --help print help and exit
382 --version print version and exit
383 --license print license and exit
384 -i, --interactive run in interactive mode, default is no
385 -t, --title parse video title and use it in filename, default is no
386 -r, --regexp =arg cleanup title with regexp, default is /(\w|\s)/g
388 =head1 OPTION DESCRIPTIONS
398 Print version and exit.
402 Print license and exit.
404 =item B<-i, --interactive>
406 Enable interactive prompt which can be used to select the downloaded
407 closed captions. By default gcap downloads all available captions
412 Parse video title and use it in the output filename(s) instead of
413 video ID. The default is no.
415 =item B<-r, --regexp>=arg
417 Cleanup video title using the specified I<arg> regular expression.
418 The default is "/(\w|\s)/g".
426 =item B<gcap 0QRO3gKj3qw>
428 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
430 Typical use. Both achieve the same.
436 Exits 0 on success, otherwise 1.
442 =item $HOME/.gcaprc, for example:
444 echo "--interactive" >> ~/.gcaprc
452 =item B<Availability>
454 Not all Youtube videos have closed captions. The following message
455 indicates that the video does not have any closed captions available.
456 URL omitted for brevity.
458 Couldn't parsefile [...] with LWP: no element found at line 1,
459 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
463 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
464 the data. Note that LWP::UserAgent reads http_proxy environment
467 env http_proxy=http://foo:1234 gcap video_id
471 <http://gcap.googlecode.com/>
473 =item B<Development repository>
475 <git://repo.or.cz/gcap.git>
477 e.g. git clone git://repo.or.cz/gcap.git
483 Toni Gundogdu <legatvs gmail com>