Arte*: handle attribute audio=bilingual
[nonametv.git] / lib / NonameTV / Importer / Arte_util.pm
blob837d3539fc8d4d76aa97e9e0191dbde1439f9013
1 package NonameTV::Importer::Arte_util;
3 use strict;
4 use warnings;
6 =pod
8 Import data from xml files (result of Word2Xml). The parsing of the
9 data relies only on the text-content of the document, not on the
10 formatting.
12 =cut
14 use utf8;
16 use DateTime;
17 use XML::LibXML;
19 use NonameTV qw/AddCategory File2Xml MyGet norm/;
20 use NonameTV::DataStore::Helper;
21 use NonameTV::DataStore::Updater;
22 use NonameTV::Log qw/d progress w error/;
24 BEGIN {
25 use Exporter ();
26 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
28 # set the version for version checking
29 $VERSION = 0.1;
31 @ISA = qw(Exporter);
32 @EXPORT = qw( );
33 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
34 @EXPORT_OK = qw/ImportFull/;
36 our @EXPORT_OK;
40 # States
41 use constant {
42 ST_START => 0,
43 ST_FDATE => 1, # Found date
44 ST_FHEAD => 2, # Found head with starttime and title
45 ST_FSUBINFO => 3, # Found sub info
46 ST_FDESCSHORT => 4, # Found short description
47 ST_FDESCLONG => 5, # After long description
48 ST_FADDINFO => 6, # After additional info
51 # Import files that contain full programming details,
52 # usually for an entire month.
53 # $doc is an XML::LibXML::Document object.
54 sub ImportFull
56 my( $filename, $doc, $chd, $dsh ) = @_;
57 my $have_batch;
59 # Find all div-entries.
60 my $ns = $doc->find( "//div" );
62 if( $ns->size() == 0 )
64 error( "Arte: $chd->{xmltvid}: No programme entries found in $filename" );
65 return;
68 if ($filename =~ m/\.doc$/i) {
69 progress( "Arte: $chd->{xmltvid}: Processing $filename" );
70 $have_batch = 0;
71 } else {
72 progress ("Arte: processing batch");
73 $have_batch = 1;
76 my $date;
77 my $currdate = "x";
78 my $time;
79 my $title;
80 my $subinfo;
81 my $shortdesc;
82 my $longdesc;
83 my $addinfo;
85 my $state = ST_START;
87 foreach my $div ($ns->get_nodelist)
90 # Ignore English titles in National Geographic.
91 next if $div->findvalue( '@name' ) =~ /title in english/i;
93 my( $text ) = norm( $div->findvalue( './/text()' ) );
95 # strip strange " * " in front of paragraph
96 $text =~ s|^\s*\*\s+||;
97 next if $text eq "";
99 my $type;
101 if( isDate( $text ) ){
103 $date = ParseDate( $text );
104 if( not defined $date ) {
105 error( "Arte: $chd->{xmltvid}: $filename Invalid date $text" );
108 if( $date ne $currdate ) {
110 if( $currdate ne "x" ) {
111 if ($have_batch == 0) {
112 $dsh->EndBatch( 1 );
116 if ($have_batch == 0) {
117 my $batch_id = $chd->{xmltvid} . "_" . $date;
118 $dsh->StartBatch( $batch_id , $chd->{id} );
120 $dsh->StartDate( $date , "03:00" );
121 $currdate = $date;
123 progress("Arte: $chd->{xmltvid}: Date is: $date");
125 $state = ST_FDATE;
128 } elsif( isTitle( $text ) ){
130 # start of a new programme, write out last one and go ahead
132 $state = ST_FHEAD;
134 } elsif( isSubTitle( $text ) ){
136 $state = ST_FSUBINFO;
138 } elsif( $text =~ /\[Kurz\]$/i ){
140 $state = ST_FDESCSHORT;
141 next;
143 } elsif( $text =~ /^\[Lang\]$/i ){
145 $state = ST_FDESCLONG;
146 next;
148 } elsif( $text =~ /^\[Zusatzinfo\]$/i ){
150 $state = ST_FADDINFO;
151 next;
153 } elsif( $text =~ /^ARTE stellt.*bereit\.$/ ){
154 $state = ST_FADDINFO;
155 } elsif( $text =~ /^Dieses Programm wurde in HD produziert\.$/ ){
156 $state = ST_FADDINFO;
159 # did we collect one full programme?
160 if( ( $state eq ST_FDATE or $state eq ST_FHEAD ) and $time and $title and ( $subinfo or $longdesc ) ){
161 my $ce = {
162 channel_id => $chd->{id},
163 start_time => $time,
167 # strip duration
168 $title =~ s/\s+\d+\s+min\.\s*$//i;
170 my $aspect = undef;
171 if( $title =~ /\s+16:9\s*$/ ){
172 $aspect = "16:9";
173 $title =~ s/\s+16:9\s*$//i;
176 my $stereo = undef;
177 if( $title =~ /\s+stereo\s*$/i ){
178 $stereo = "stereo";
179 $title =~ s/\s+stereo\s*$//i;
180 } elsif( $title =~ /\s+dolby\s*$/i ){
181 $stereo = "dolby digital"; # FIXME it means Dolby Digital 2.0 / AC3 2.0 here
182 $title =~ s/\s+dolby\s*$//i;
185 # parse episode number
186 my $episode;
187 if ($title =~ m|\s*\(\d+/\d+\)\s*$|) {
188 my ($episodenum, $episodecount) = ($title =~ m|\s*\((\d+)/(\d+)\)\s*$|);
189 $episode = '. ' . ($episodenum-1) . '/' . ($episodecount) . ' .';
190 $title =~ s|\s*\(\d+/\d+\)\s*$||;
191 } elsif ($title =~ m|\s*\(\d+\)\s*$|) {
192 my ($episodenum) = ($title =~ m|\s*\((\d+)\)\s*$|);
193 $episode = '. ' . ($episodenum-1) . ' .';
194 $title =~ s|\s*\(\d+\)\s*$||;
196 $ce->{episode} = $episode if $episode;
197 $ce->{title} = $title;
198 d( "Arte: $chd->{xmltvid}: $time - $title" );
201 if ( defined ($addinfo)) {
202 ParseExtraInfo( \$dsh->{ds}, \$ce, $addinfo );
205 if ( defined ($subinfo)) {
206 ParseExtraInfo( \$dsh->{ds}, \$ce, $subinfo );
209 $ce->{description} = $longdesc if $longdesc;
212 $ce->{aspect} = $aspect if $aspect;
213 $ce->{stereo} = $stereo if $stereo;
216 $dsh->AddProgramme( $ce );
218 $time = undef;
219 $title = undef;
220 $subinfo = undef;
221 $shortdesc = undef;
222 $longdesc = undef;
223 $addinfo = undef;
226 # after subinfo line there comes
227 # some text with information about the program
228 if ( $state eq ST_FHEAD ) {
229 ( $time, $title ) = ParseTitle( $text );
230 } elsif( $state eq ST_FSUBINFO ){
231 $subinfo .= $text . "\n";
232 } elsif( $state eq ST_FDESCSHORT ){
233 $shortdesc .= $text . "\n";
234 } elsif( $state eq ST_FDESCLONG ){
235 $longdesc .= $text . "\n";
236 } elsif( $state eq ST_FADDINFO ){
237 $addinfo .= $text . "\n";
241 if ($have_batch == 0) {
242 $dsh->EndBatch( 1 );
245 return;
248 sub isDate {
249 my ( $text ) = @_;
251 # format 'Samstag, 21.11.2009'
252 if( $text =~ /^(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s+\d+\.\d+\.\d+$/i ){
253 return 1;
256 return 0;
259 sub ParseDate
261 my( $text ) = @_;
263 my( $weekday, $day, $month, $year );
265 # try 'Sunday 1 June 2008'
266 if( $text =~ /^(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s+\d+\.\d+\.\d+$/i ){
267 ( $weekday, $day, $month, $year ) = ( $text =~ /^(\S+),\s+(\d+)\.(\d+)\.(\d+)$/ );
270 return sprintf( '%d-%02d-%02d', $year, $month, $day );
273 sub isTitle
275 my( $text ) = @_;
277 if( $text =~ /^\d\d:\d\d\s+\S+/ ){
278 return 1;
281 return 0;
284 sub ParseTitle
286 my( $text ) = @_;
288 my( $time, $rest ) = ( $text =~ /^(\d+:\d+)\s+(.*)\s*$/ );
290 return( $time, $rest );
293 sub isSubTitle
295 my( $text ) = @_;
297 if( $text =~ m/^\[\d{2}:\d{2}\]\s+\S+/ ){
298 return 1;
301 return 0;
304 sub ParseExtraInfo
306 my( $ds, $ce, $text ) = @_;
308 my $seengenre = undef;
309 my $genre = undef;
310 my $productiondate = undef;
312 # join back together lines that got split due to length
313 $text =~ s/,\n/, /g;
314 my @lines = split( /\n/, $text );
315 foreach my $line ( @lines ){
316 if( $line =~ m/^\[\d{2}:\d{2}\]/ ){
317 # strip the time
318 $line =~ s|^\[\d{2}:\d{2}\]\s+||;
320 # is it an episodetitle?
321 if( $line =~ m|^\(\d+\):| ) {
322 my ($episodenum, $episodetitle) = ($line =~ m|^\((\d+)\):\s*(.*?)\s*$|);
323 if (!defined ($$ce->{episode})) {
324 $$ce->{episode} = '. ' . ($episodenum-1) . ' .';
326 if (defined ($episodetitle)) {
327 $$ce->{subtitle} = $episodetitle;
329 next;
332 # first line is it a repeat?
333 if ($line =~ m/, Wiederholung vom \d+\.\d+\.$/) {
334 ($genre) = ($line =~ m|^(.*), Wiederholung vom \d+\.\d+\.$|);
335 $seengenre = 1;
336 next;
339 if ($line =~ m/^Wiederholung vom \d+\. \d+\. \d{4}$/) {
340 next;
343 # strip dub, premiere
344 $line =~ s|, Zweikanalton$||;
345 $line =~ s|, Synchronfassung$||;
346 $line =~ s|, Erstausstrahlung$||;
347 $line =~ s|, Schwarz-Wei\x{df}$||;
348 $line =~ s|, Schwerpunkt: [^,]+$||;
349 $line =~ s|, Synchronfassung$||;
350 $line =~ s|, Originalfassung mit Untertiteln||; # yes, it's not the last
351 # is it the genre?
352 # [genre, ]contries year[, producing stations]
353 if( ($genre, $productiondate) = ( $line =~ m|^([^,]+)\s*,[^,]+\s+(\d{4}),[^,]+$| ) ) {
354 # genre, country year, network
355 $seengenre = 1;
356 } elsif( ($genre, $productiondate) = ( $line =~ m|^([^,]+)\s*,[^,]+\s+(\d{4})$| ) ) {
357 # genre, country year
358 $seengenre = 1;
359 } elsif( ($productiondate) = ( $line =~ m|^[^,]+\s+(\d{4}),[^,]+$| ) ) {
360 # country year, network
361 $seengenre = 1;
362 } else {
363 # then it must be the subtitle
364 $$ce->{subtitle} = $line;
365 next;
369 if( $line =~ /^Dieses Programm wurde in HD produziert\.$/ ){
370 $$ce->{quality} = 'HDTV';
371 next;
374 if( $line =~ /^ARTE stellt diesen Beitrag auch/ ){
375 # strip reference to ARTE+7 video on demand
376 next;
379 if( $line =~ /^ARTE strahlt diesen Film auch in einer untertitelten Fassung f/ ){
380 # strip subtitle for hard of hearing
381 next;
384 if( $line =~ /^ARTE strahlt diesen Film auch in einer H/ ){
385 # strip audio for the blind
386 next;
389 if( $line =~ m|unter: www\.arte\.tv/| )
391 # strip generic links
392 next;
395 # not the first line, maybe still a repeat? (copy from above)
396 if ($line =~ m/, Wiederholung vom \d+\.\d+\.$/) {
397 ($genre) = ($line =~ m|^(.*), Wiederholung vom \d+\.\d+\.$|);
398 $seengenre = 1;
399 next;
401 if ($line =~ m/^Wiederholung vom \d+\.\d+\.$/) {
402 next;
404 if ($line =~ m/^Wiederholung vom \d+\. \d+\. \d{4}$/) {
405 next;
408 # parse actors
409 if( $line =~ /^Mit:\s+.*$/ ){
410 my ( $actor ) = ( $line =~ /^Mit:\s+(.*)$/ );
411 # remove name of role, not yet supported
412 $actor =~ s|\s+-\s+\(.*?\)||g;
413 my @actors = split( ', ', $actor );
414 $$ce->{actors} = join( ', ', @actors);
415 next;
418 if( $line =~ m|^Themenabend:| ) {
419 next;
422 # parse attention (which looks like credits)
423 if( $line =~ m|Achtung:| ) {
424 $line =~ s|\s*\(Achtung:.*\)$||;
425 $line =~ s|\s*Achtung:.*$||;
428 # parse credits (all but actors)
429 if( $line =~ /^\S+:\s+.*$/ ){
430 my @credits = split( '; ', $line );
431 foreach my $credit (@credits) {
432 my ($job, $people) = ($credit =~ m|^(\S+):\s*(.*)$|);
433 if (!defined $job) {
434 w("parse error, credits are no credits: $credit");
435 } elsif ($job eq 'Regie') {
436 $$ce->{directors} = $people;
437 } elsif ($job eq 'Buch') {
438 $$ce->{writers} = $people;
439 } elsif ($job eq 'Kamera') {
440 } elsif ($job eq 'Schnitt') {
441 } elsif ($job eq 'Ton') {
442 } elsif ($job eq 'Musik') {
443 } elsif ($job eq 'Produzent') {
444 $$ce->{producers} = $people;
445 } elsif ($job eq 'Produktion') {
446 } elsif ($job eq 'Redaktion') {
447 } elsif ($job eq 'Moderation') {
448 $$ce->{presenters} = $people;
449 } elsif ($job eq 'Gast') {
450 $$ce->{Guests} = $people;
451 } elsif ($job eq 'Dirigent') {
452 } elsif ($job eq 'Orchester') {
453 } elsif ($job eq 'Choreografie') {
454 } elsif ($job eq 'Komponist') {
455 } elsif ($job eq 'Maske') {
456 } elsif ($job eq 'Kostüme') {
457 } elsif ($job eq 'Ausstattung') {
458 } elsif ($job eq 'Regieassistenz') {
459 } elsif ($job eq 'Restaurierung') {
460 } elsif ($job eq 'Licht') {
461 } elsif ($job eq 'Fernsehregie') {
462 } elsif ($job eq 'Inszenierung') {
463 } elsif ($job eq 'Chor') {
464 } elsif ($job eq 'Herstellungsleitung') {
465 } elsif ($job eq 'Buch/Autor') {
466 $$ce->{writers} = $people;
467 } else {
468 d( "unhandled job $job" );
471 # FIXME split at ; and handle more roles the just directors
472 next;
475 # strip dub, premiere
476 $line =~ s|, Zweikanalton$||;
477 $line =~ s|, Synchronfassung$||;
478 $line =~ s|, Erstausstrahlung$||;
479 $line =~ s|, Schwerpunkt: [^,]+$||;
480 $line =~ s|, Synchronfassung$||;
481 $line =~ s|, Originalfassung mit Untertiteln||; # yes, it's not the last
482 # is it the genre?
483 # genre, contries year, producing stations
484 if( ($genre, $productiondate) = ($line =~ m|^([^,]+)\s*,[^,]+\s+(\d{4}),[^,]+$| ) ) {
485 $seengenre = 1;
486 next;
488 # genre, contries year
489 if( ($genre, $productiondate) = ($line =~ m|^([^,]+)\s*,[^,]+\s+(\d{4})$| ) ) {
490 $seengenre = 1;
491 next;
493 # contries year, producing stations
494 if( ($productiondate) = ($line =~ m|[^,]+\s+(\d{4}),[^,]+$| ) ) {
495 next;
498 # FIXME has title (incl. part number), origtitle, partnumber: empty episode title
499 if( $line =~ m|^\(\d+\):$| ) {
500 next;
504 d( "unhandled subinfo: $line" );
507 if( defined( $productiondate ) ) {
508 $$ce->{production_date} = $productiondate . '-01-01';
511 if( defined( $genre) ) {
512 my ( $program_type, $categ ) = $$ds->LookupCat( "Arte_genre", $genre );
513 AddCategory( $$ce, $program_type, $categ );
519 ### Setup coding system
520 ## Local Variables:
521 ## coding: utf-8
522 ## End: