2 # -*- coding: ascii -*-
5 # Copyright (C) 2010 Toni Gundogdu <legatvs@gmail.com>.
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 binmode STDOUT
, ":utf8";
25 binmode STDERR
, ":utf8";
27 use version
0.77 (); our $VERSION = version
->declare("0.0.6");
29 use Getopt
::ArgvFile
( home
=> 1, startupFilename
=> [qw(.gcaprc)] );
30 use Getopt
::Long
qw(:config bundling);
46 'version' => \
&print_version
,
47 'license' => \
&print_license
,
48 'help' => \
&print_help
,
51 $config{regexp
} ||= "/(\\w|\\s)/g";
52 apply_regexp
( $config{regexp
} ); # Check syntax.
56 print "gcap version $VERSION\n";
62 "Copyright (C) 2010 Toni Gundogdu. GNU GPL v3+. This is free software;
63 see the source for copying conditions. There is NO warranty; not even
64 for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
71 Pod
::Usage
::pod2usage
( -exitstatus
=> 0, -verbose
=> 1 );
80 print_help
() unless scalar @ARGV;
83 "http://video.google.com/timedtext?hl=en&type=list&v=";
85 my $q = qr{v[=/]((?>[-_\w]{11}))};
87 if ( $url =~ /^http:/i ) {
93 "error: does not look like a youtube video page URL.\n";
98 $url = "$req_body$url";
101 print STDERR
"Checking ..." unless $config{quiet
};
# Build the HTTP client; the same object is later handed to the XML parser
# and to get_title.  The constructor is called as a class method rather than
# with the indirect-object form ("new LWP::UserAgent"), which perlobj
# discourages because its parsing is ambiguous.
my $a = LWP::UserAgent->new;

$a->env_proxy;    # http://search.cpan.org/perldoc?LWP::UserAgent

# $config{proxy} is applied after the environment settings, and
# $config{no_proxy} is applied last.
$a->proxy( 'http', $config{proxy} ) if $config{proxy};
$a->no_proxy('') if $config{no_proxy};
# Create the DOM parser, passing it the user agent built above, then fetch
# and parse the document at $url and keep its root element.
# Class-method call replaces the indirect-object form "new XML::DOM::Parser".
my $p = XML::DOM::Parser->new( LWP_UserAgent => $a );
my $d = $p->parsefile($url);
my $r = $d->getDocumentElement;
118 for my $e ( $r->getElementsByTagName("track") ) {
120 name
=> $e->getAttributeNode("name")->getValue || "",
121 lang_code
=> $e->getAttributeNode("lang_code")->getValue,
123 $e->getAttributeNode("lang_translated")->getValue,
126 push @captions, \
%tmp;
127 print STDERR
"." unless $config{quiet
};
130 print STDERR
"done.\n" unless $config{quiet
};
# Pull the video id (first capture group of $q) out of $url, or abort.
# The match is done first and $v declared unconditionally afterwards:
# "my $v = ... if COND" is documented in perlsyn as undefined behaviour.
$url =~ /$q/ or die "error: no match: video id";
my $v = $1;
136 get_title
( $v, $a ) if $config{title
};
137 prompt
() if $config{interactive
};
141 foreach (@captions) {
142 ++$t if $_->{selected
};
145 require HTML
::Entities
;
149 foreach (@captions) {
151 next unless $_->{selected
};
154 "http://video.google.com/timedtext?"
155 . "hl=$_->{lang_code}"
156 . "&lang=$_->{lang_code}"
157 . "&name=$_->{name}" . "&v=$v";
159 my $fname = sprintf "%s_%s.srt", $v, $_->{lang_code
};
163 apply_regexp
( $config{regexp
}, $video_title );
164 $fname = sprintf "%s_%s.srt", $video_title, $_->{lang_code
};
167 open my $fh, ">", $fname or die "$fname: $!\n";
168 binmode $fh, ":utf8";
170 unless ( $config{quiet
} ) {
171 printf STDERR
"(%02d of %02d) ", ++$n, $t if $t > 0;
172 print STDERR
"Saving $fname ...";
175 $d = $p->parsefile($url);
176 $r = $d->getDocumentElement;
181 for my $e ( $r->getElementsByTagName("text") ) {
183 my $tmp = $e->getFirstChild;
186 my $text = trim
( $tmp->getNodeValue );
188 $text = HTML
::Entities
::decode_entities
($text);
190 my $start = $e->getAttributeNode("start")->getValue;
195 if ( $start =~ /(\d+)/ ) {
199 /\d+\.(\d+)/; # should only capture 3 first digits
202 my @start = gmtime($start_sec);
# Duration of this cue: the "dur" attribute when the element has one,
# otherwise the difference between this start and the previous start.
$tmp = $e->getAttributeNode("dur");
my $dur = $tmp ? $tmp->getValue : $start - $last_start;

my $end_sec = $start + $dur;

# Derive the millisecond field from the END time itself, in whole
# milliseconds.  Taking the digits after the decimal point of $dur (as a
# regexp capture) was wrong on three counts: it ignored the fractional part
# of $start, it reused a stale $1 when $dur had no fraction, and it printed
# ".5" as 005 instead of 500 through the "%03d" format.
my $end_total_msec = int( $end_sec * 1000 + 0.5 );
my $end_msec       = $end_total_msec % 1000;

# Whole seconds after rounding, so a carry from the millisecond rounding
# is reflected in the hh:mm:ss part as well.
my @end = gmtime( int( $end_total_msec / 1000 ) );
215 "%d\r\n%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\r\n%s\r\n\r\n",
216 $i++, @start[ 2, 1, 0 ], $start_msec, @end[ 2, 1, 0 ],
219 $last_start = $start;
226 print STDERR
"done.\n" unless $config{quiet
};
241 'n' => \
&select_none
,
242 'i' => \
&invert_selection
,
246 print STDERR
"Enter prompt. "
247 . qq/Type "help" to get a list of commands.\n/;
257 if ( $ln =~ /(\d+)/ ) {
261 next unless $ln =~ /(\w)/;
262 $cmds{$1}() if defined $cmds{$1};
270 my $url = "http://www.youtube.com/get_video_info?&video_id=$v"
271 . "&el=detailpage&ps=default&eurl=&gl=US&hl=en";
273 my $r = $a->get($url);
275 unless ( $r->is_success ) {
# Report the failed request.  $url is passed as a printf argument instead of
# being interpolated into the format string, so a literal "%" in it can never
# be read as a conversion.
printf STDERR "\nerror: %s: %s\n", $url, $r->status_line;
282 my $q = CGI
->new( $r->content );
284 if ( $q->param('reason') ) {
# Report the error fields found in the response.  CGI's param() returns a
# list in list context, so each call is forced to scalar; otherwise a
# multi-valued parameter would shift the printf arguments.
printf STDERR "\nerror: %s: %s (errorcode: %d)\n",
    $url, scalar $q->param("reason"), scalar $q->param("errorcode");
# Decode the "title" parameter from UTF-8.  scalar() guarantees exactly one
# argument: in list context a repeated "title" parameter would pass its
# second value as decode_utf8's CHECK argument.
$video_title = Encode::decode_utf8( scalar $q->param('title') );
293 unless ($video_title) {
294 print STDERR
"\nwarning: $url: use id instead\n"
295 unless $config{quiet
};
304 if ( $re =~ /^\/(.*)\
/(.*)$/ ) {
310 "error: invalid regexp syntax, expected `/pattern/flags'\n";
316 my $q = $flags =~ /i/ ?
qr/$pat/i : qr/$pat/;
318 return join '', $flags =~ /g/ ?
$s =~ /$q/g : $s =~ /$q/;
322 print STDERR
"Commands:
324 list .. display found captions (> indicates selected for download)
327 invert .. invert selection
328 (number) .. toggle caption
329 get .. download selected captions
330 quit .. quit without downloading captions\n"
331 . qq/Command name abbreviations are allowed, e.g. "h" instead of "help"\n/;
335 foreach (@captions) {
336 if ( $_->{selected
} ) {
341 print STDERR
"error: you have not selected anything\n";
348 foreach (@captions) {
# One listing line per caption: a ">" marker when selected, the running
# number, then the language name.  The name is passed through "%s" rather
# than interpolated into the format string, so a "%" in it is printed as is.
printf STDERR "%2s%02d: %s\n",
    ( $_->{selected} ? ">" : "" ), ++$i, $_->{lang_transl};
355 $_->{selected
} = 1 foreach @captions;
360 $_->{selected
} = 0 foreach @captions;
# Flip the "selected" flag of every entry in @captions.
sub invert_selection {
    for my $caption (@captions) {
        $caption->{selected} = !$caption->{selected};
    }
}
371 if ( $i >= 0 && exists $captions[$i] ) {
372 $captions[$i]->{selected
} = !$captions[$i]->{selected
};
# The index failed the bounds check on @captions above; message typo fixed
# ("out of rate" -> "out of range").
print STDERR "error: out of range\n";
391 gcap - Youtube closed caption retriever
395 gcap [-i] [-t] [-r E<lt>regexpE<gt>] [E<lt>urlE<gt> | E<lt>video_idE<gt>]
396 [--proxy E<lt>addrE<gt> | --no-proxy]
400 gcap is a command line tool for retrieving Youtube closed captions.
401 The retrieved closed captions are saved in SubRip (srt) file format.
402 The srt files are saved as "$videoid_$langid.srt" by default.
406 --help print help and exit
407 --version print version and exit
408 --license print license and exit
410 -i, --interactive run in interactive mode
411 -t, --title parse video title and use it in filename
412 -r, --regexp arg (="/(\w|\s)/g") cleanup title with regexp
413 --proxy arg (=http_env) use proxy for http connections
414 --no-proxy disable use of http proxy
416 =head1 OPTION DESCRIPTIONS
426 Print version and exit.
430 Print license and exit.
436 =item B<-i, --interactive>
438 Enable interactive prompt which can be used to select the downloaded
439 closed captions. By default gcap downloads all available captions
444 Parse video title and use it in the output filename(s) instead of
445 video ID. The default is no.
447 =item B<-r, --regexp>=arg
449 Cleanup video title using the specified I<arg> regular expression.
450 The default is "/(\w|\s)/g".
452 =item B<--proxy> I<arg>
454 Use I<arg> for HTTP proxy, e.g. "http://foo:1234". Overrides the http_proxy
459 Disable use of HTTP proxy. Overrides both C<--proxy> and http_proxy environment
468 =item B<gcap 0QRO3gKj3qw>
470 =item B<gcap "http://www.youtube.com/watch?v=0QRO3gKj3qw">
472 Typical use. Both achieve the same.
478 Exits 0 on success, otherwise 1.
484 =item $HOME/.gcaprc, for example:
486 echo "--interactive" >> ~/.gcaprc
494 =item B<Availability>
496 Not all Youtube videos have closed captions. The following message
497 indicates that the video does not have any closed captions available.
498 URL omitted for brevity.
500 Couldn't parsefile [...] with LWP: no element found at line 1,
501 column 0, byte -1 at /usr/lib/perl5/vendor_perl/XML/Parser.pm ...
505 gcap depends on XML::DOM which uses LWP::UserAgent to retrieve
506 the data. Note that LWP::UserAgent reads http_proxy environment
509 env http_proxy=http://foo:1234 gcap video_id
513 <http://gcap.googlecode.com/>
515 =item B<Development repository>
517 <git://repo.or.cz/gcap.git>
519 e.g. git clone git://repo.or.cz/gcap.git
525 Toni Gundogdu <legatvs gmail com>