1 package NonameTV
::Importer
::Arte_util
;
8 Import data from xml files (result of Word2Xml). The parsing of the
9 data relies only on the text-content of the document, not on the
19 use NonameTV qw
/AddCategory File2Xml MyGet norm/;
20 use NonameTV
::DataStore
::Helper
;
21 use NonameTV
::DataStore
::Updater
;
22 use NonameTV
::Log qw
/d progress w error/;
26 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
28 # set the version for version checking
33 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
34 @EXPORT_OK = qw
/ImportFull/;
43 ST_FDATE
=> 1, # Found date
44 ST_FHEAD
=> 2, # Found head with starttime and title
45 ST_FSUBINFO
=> 3, # Found sub info
46 ST_FDESCSHORT
=> 4, # Found short description
47 ST_FDESCLONG
=> 5, # After long description
48 ST_FADDINFO
=> 6, # After additional info
51 # Import files that contain full programming details,
52 # usually for an entire month.
53 # $doc is an XML::LibXML::Document object.
56 my( $filename, $doc, $chd, $dsh ) = @_;
59 # Find all div-entries.
60 my $ns = $doc->find( "//div" );
62 if( $ns->size() == 0 )
64 error
( "Arte: $chd->{xmltvid}: No programme entries found in $filename" );
68 if ($filename =~ m/\.doc$/i) {
69 progress
( "Arte: $chd->{xmltvid}: Processing $filename" );
72 progress
("Arte: processing batch");
87 foreach my $div ($ns->get_nodelist)
90 # Ignore English titles in National Geographic.
91 next if $div->findvalue( '@name' ) =~ /title in english/i;
93 my( $text ) = norm
( $div->findvalue( './/text()' ) );
95 # strip strange " * " in front of paragraph
96 $text =~ s
|^\s
*\
*\s
+||;
101 if( isDate
( $text ) ){
103 $date = ParseDate
( $text );
104 if( not defined $date ) {
105 error
( "Arte: $chd->{xmltvid}: $filename Invalid date $text" );
108 if( $date ne $currdate ) {
110 if( $currdate ne "x" ) {
111 if ($have_batch == 0) {
116 if ($have_batch == 0) {
117 my $batch_id = $chd->{xmltvid
} . "_" . $date;
118 $dsh->StartBatch( $batch_id , $chd->{id
} );
120 $dsh->StartDate( $date , "03:00" );
123 progress
("Arte: $chd->{xmltvid}: Date is: $date");
128 } elsif( isTitle
( $text ) ){
130 # start of a new programme, write out last one and go ahead
134 } elsif( isSubTitle
( $text ) ){
136 $state = ST_FSUBINFO
;
138 } elsif( $text =~ /\[Kurz\]$/i ){
140 $state = ST_FDESCSHORT
;
143 } elsif( $text =~ /^\[Lang\]$/i ){
145 $state = ST_FDESCLONG
;
148 } elsif( $text =~ /^\[Zusatzinfo\]$/i ){
150 $state = ST_FADDINFO
;
153 } elsif( $text =~ /^ARTE stellt.*bereit\.$/ ){
154 $state = ST_FADDINFO
;
155 } elsif( $text =~ /^Dieses Programm wurde in HD produziert\.$/ ){
156 $state = ST_FADDINFO
;
159 # did we collect one full programme?
160 if( ( $state eq ST_FDATE
or $state eq ST_FHEAD
) and $time and $title and ( $subinfo or $longdesc ) ){
162 channel_id
=> $chd->{id
},
168 $title =~ s/\s+\d+\s+min\.\s*$//i;
171 if( $title =~ /\s+16:9\s*$/ ){
173 $title =~ s/\s+16:9\s*$//i;
177 if( $title =~ /\s+stereo\s*$/i ){
179 $title =~ s/\s+stereo\s*$//i;
180 } elsif( $title =~ /\s+dolby\s*$/i ){
181 $stereo = "dolby digital"; # FIXME it means Dolby Digital 2.0 / AC3 2.0 here
182 $title =~ s/\s+dolby\s*$//i;
185 # parse episode number
187 if ($title =~ m
|\s
*\
(\d
+/\d
+\
)\s
*$|) {
188 my ($episodenum, $episodecount) = ($title =~ m
|\s
*\
((\d
+)/(\d
+)\
)\s
*$|);
189 $episode = '. ' . ($episodenum-1) . '/' . ($episodecount) . ' .';
190 $title =~ s
|\s
*\
(\d
+/\d
+\
)\s
*$||;
191 } elsif ($title =~ m
|\s
*\
(\d
+\
)\s
*$|) {
192 my ($episodenum) = ($title =~ m
|\s
*\
((\d
+)\
)\s
*$|);
193 $episode = '. ' . ($episodenum-1) . ' .';
194 $title =~ s
|\s
*\
(\d
+\
)\s
*$||;
196 $ce->{episode
} = $episode if $episode;
197 $ce->{title
} = $title;
198 d
( "Arte: $chd->{xmltvid}: $time - $title" );
201 if ( defined ($addinfo)) {
202 ParseExtraInfo
( \
$dsh->{ds
}, \
$ce, $addinfo );
205 if ( defined ($subinfo)) {
206 ParseExtraInfo
( \
$dsh->{ds
}, \
$ce, $subinfo );
209 $ce->{description
} = $longdesc if $longdesc;
212 $ce->{aspect
} = $aspect if $aspect;
213 $ce->{stereo
} = $stereo if $stereo;
216 $dsh->AddProgramme( $ce );
226 # after subinfo line there comes
227 # some text with information about the program
228 if ( $state eq ST_FHEAD
) {
229 ( $time, $title ) = ParseTitle
( $text );
230 } elsif( $state eq ST_FSUBINFO
){
231 $subinfo .= $text . "\n";
232 } elsif( $state eq ST_FDESCSHORT
){
233 $shortdesc .= $text . "\n";
234 } elsif( $state eq ST_FDESCLONG
){
235 $longdesc .= $text . "\n";
236 } elsif( $state eq ST_FADDINFO
){
237 $addinfo .= $text . "\n";
241 if ($have_batch == 0) {
251 # format 'Samstag, 21.11.2009'
252 if( $text =~ /^(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s+\d+\.\d+\.\d+$/i ){
263 my( $weekday, $day, $month, $year );
265 # try 'Sunday 1 June 2008'
266 if( $text =~ /^(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s+\d+\.\d+\.\d+$/i ){
267 ( $weekday, $day, $month, $year ) = ( $text =~ /^(\S+),\s+(\d+)\.(\d+)\.(\d+)$/ );
270 return sprintf( '%d-%02d-%02d', $year, $month, $day );
277 if( $text =~ /^\d\d:\d\d\s+\S+/ ){
288 my( $time, $rest ) = ( $text =~ /^(\d+:\d+)\s+(.*)\s*$/ );
290 return( $time, $rest );
297 if( $text =~ m/^\[\d{2}:\d{2}\]\s+\S+/ ){
306 my( $ds, $ce, $text ) = @_;
308 my $seengenre = undef;
310 my $productiondate = undef;
312 # join back together lines that got split due to length
314 my @lines = split( /\n/, $text );
315 foreach my $line ( @lines ){
316 if( $line =~ m/^\[\d{2}:\d{2}\]/ ){
318 $line =~ s
|^\
[\d
{2}:\d
{2}\
]\s
+||;
320 # is it an episodetitle?
321 if( $line =~ m
|^\
(\d
+\
):| ) {
322 my ($episodenum, $episodetitle) = ($line =~ m
|^\
((\d
+)\
):\s
*(.*?
)\s
*$|);
323 if (!defined ($$ce->{episode
})) {
324 $$ce->{episode
} = '. ' . ($episodenum-1) . ' .';
326 if (defined ($episodetitle)) {
327 $$ce->{subtitle
} = $episodetitle;
332 # first line is it a repeat?
333 if ($line =~ m/, Wiederholung vom \d+\.\d+\.$/) {
334 ($genre) = ($line =~ m
|^(.*), Wiederholung vom \d
+\
.\d
+\
.$|);
339 if ($line =~ m/^Wiederholung vom \d+\. \d+\. \d{4}$/) {
343 # strip dub, premiere
344 $line =~ s
|, Zweikanalton
$||;
345 $line =~ s
|, Synchronfassung
$||;
346 $line =~ s
|, Erstausstrahlung
$||;
347 $line =~ s
|, Schwarz
-Wei\x
{df
}$||;
348 $line =~ s
|, Schwerpunkt
: [^,]+$||;
349 $line =~ s
|, Synchronfassung
$||;
350 $line =~ s
|, Originalfassung mit Untertiteln
||; # yes, it's not the last
352 # [genre, ]contries year[, producing stations]
353 if( ($genre, $productiondate) = ( $line =~ m
|^([^,]+)\s
*,[^,]+\s
+(\d
{4}),[^,]+$| ) ) {
354 # genre, country year, network
356 } elsif( ($genre, $productiondate) = ( $line =~ m
|^([^,]+)\s
*,[^,]+\s
+(\d
{4})$| ) ) {
357 # genre, country year
359 } elsif( ($productiondate) = ( $line =~ m
|^[^,]+\s
+(\d
{4}),[^,]+$| ) ) {
360 # country year, network
363 # then it must be the subtitle
364 $$ce->{subtitle
} = $line;
369 if( $line =~ /^Dieses Programm wurde in HD produziert\.$/ ){
370 $$ce->{quality
} = 'HDTV';
374 if( $line =~ /^ARTE stellt diesen Beitrag auch/ ){
375 # strip reference to ARTE+7 video on demand
379 if( $line =~ /^ARTE strahlt diesen Film auch in einer untertitelten Fassung f/ ){
380 # strip subtitle for hard of hearing
384 if( $line =~ /^ARTE strahlt diesen Film auch in einer H/ ){
385 # strip audio for the blind
389 if( $line =~ m
|unter
: www\
.arte\
.tv
/| )
391 # strip generic links
395 # not the first line, maybe still a repeat? (copy from above)
396 if ($line =~ m/, Wiederholung vom \d+\.\d+\.$/) {
397 ($genre) = ($line =~ m
|^(.*), Wiederholung vom \d
+\
.\d
+\
.$|);
401 if ($line =~ m/^Wiederholung vom \d+\.\d+\.$/) {
404 if ($line =~ m/^Wiederholung vom \d+\. \d+\. \d{4}$/) {
409 if( $line =~ /^Mit:\s+.*$/ ){
410 my ( $actor ) = ( $line =~ /^Mit:\s+(.*)$/ );
411 # remove name of role, not yet supported
412 $actor =~ s
|\s
+-\s
+\
(.*?\
)||g
;
413 my @actors = split( ', ', $actor );
414 $$ce->{actors
} = join( ', ', @actors);
418 if( $line =~ m
|^Themenabend
:| ) {
422 # parse attention (which looks like credits)
423 if( $line =~ m
|Achtung
:| ) {
424 $line =~ s
|\s
*\
(Achtung
:.*\
)$||;
425 $line =~ s
|\s
*Achtung
:.*$||;
428 # parse credits (all but actors)
429 if( $line =~ /^\S+:\s+.*$/ ){
430 my @credits = split( '; ', $line );
431 foreach my $credit (@credits) {
432 my ($job, $people) = ($credit =~ m
|^(\S
+):\s
*(.*)$|);
434 w
("parse error, credits are no credits: $credit");
435 } elsif ($job eq 'Regie') {
436 $$ce->{directors
} = $people;
437 } elsif ($job eq 'Buch') {
438 $$ce->{writers
} = $people;
439 } elsif ($job eq 'Kamera') {
440 } elsif ($job eq 'Schnitt') {
441 } elsif ($job eq 'Ton') {
442 } elsif ($job eq 'Musik') {
443 } elsif ($job eq 'Produzent') {
444 $$ce->{producers
} = $people;
445 } elsif ($job eq 'Produktion') {
446 } elsif ($job eq 'Redaktion') {
447 } elsif ($job eq 'Moderation') {
448 $$ce->{presenters
} = $people;
449 } elsif ($job eq 'Gast') {
450 $$ce->{Guests
} = $people;
451 } elsif ($job eq 'Dirigent') {
452 } elsif ($job eq 'Orchester') {
453 } elsif ($job eq 'Choreografie') {
454 } elsif ($job eq 'Komponist') {
455 } elsif ($job eq 'Maske') {
456 } elsif ($job eq 'Kostüme') {
457 } elsif ($job eq 'Ausstattung') {
458 } elsif ($job eq 'Regieassistenz') {
459 } elsif ($job eq 'Restaurierung') {
460 } elsif ($job eq 'Licht') {
461 } elsif ($job eq 'Fernsehregie') {
462 } elsif ($job eq 'Inszenierung') {
463 } elsif ($job eq 'Chor') {
464 } elsif ($job eq 'Herstellungsleitung') {
465 } elsif ($job eq 'Buch/Autor') {
466 $$ce->{writers
} = $people;
468 d
( "unhandled job $job" );
471 # FIXME split at ; and handle more roles the just directors
475 # strip dub, premiere
476 $line =~ s
|, Zweikanalton
$||;
477 $line =~ s
|, Synchronfassung
$||;
478 $line =~ s
|, Erstausstrahlung
$||;
479 $line =~ s
|, Schwerpunkt
: [^,]+$||;
480 $line =~ s
|, Synchronfassung
$||;
481 $line =~ s
|, Originalfassung mit Untertiteln
||; # yes, it's not the last
483 # genre, contries year, producing stations
484 if( ($genre, $productiondate) = ($line =~ m
|^([^,]+)\s
*,[^,]+\s
+(\d
{4}),[^,]+$| ) ) {
488 # genre, contries year
489 if( ($genre, $productiondate) = ($line =~ m
|^([^,]+)\s
*,[^,]+\s
+(\d
{4})$| ) ) {
493 # contries year, producing stations
494 if( ($productiondate) = ($line =~ m
|[^,]+\s
+(\d
{4}),[^,]+$| ) ) {
498 # FIXME has title (incl. part number), origtitle, partnumber: empty episode title
499 if( $line =~ m
|^\
(\d
+\
):$| ) {
504 d
( "unhandled subinfo: $line" );
507 if( defined( $productiondate ) ) {
508 $$ce->{production_date
} = $productiondate . '-01-01';
511 if( defined( $genre) ) {
512 my ( $program_type, $categ ) = $$ds->LookupCat( "Arte_genre", $genre );
513 AddCategory
( $$ce, $program_type, $categ );
519 ### Setup coding system