6 # Mark this source-file as encoded in utf-8.
11 use File
::Temp qw
/tempfile tempdir/;
14 use NonameTV
::StringMatcher
;
15 use NonameTV
::Log qw
/w/;
20 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
22 # set the version for version checking
27 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
28 @EXPORT_OK = qw
/MyGet expand_entities
34 ParseDescCatSwe FixProgrammeData
42 my $wvhtml = '/usr/bin/wvHtml --charset=utf8';
43 # my $wvhtml = '/usr/bin/wvHtml';
45 my $ua = LWP
::UserAgent
->new( agent
=> "Grabber from http://tv.swedb.se",
48 # Fetch a url. Returns ($content, true) if data was fetched from server and
49 # different from the last time the same url was fetched, ($content, false) if
50 # it was fetched from the server and was the same as the last time it was
51 # fetched and (undef,$error_message) if there was an error fetching the data.
56 my $res = $ua->get( $url );
58 if( $res->is_success )
60 return ($res->content, not defined( $res->header( 'X-Content-Unchanged' ) ) );
64 return (undef, $res->status_line );
71 231 => 'c', # This should really be a c with a special mark on it.
72 # Unicode 000E7, UTF-8 195 167.
86 my( $num, $str ) = @_;
88 if( not defined( $ent{$num} ) )
91 print STDERR
"Unknown entity $num in $str\n";
101 $str =~ s/\&#(\d+);/_expand($1,$str)/eg;
106 =item Html2Xml( $content )
108 Convert the HTML in $content into an XML::LibXML::Document.
110 Prints an error-message to STDERR and returns undef if the conversion
118 my $xml = XML
::LibXML
->new;
121 # Remove character that makes the parser stop.
125 eval { $doc = $xml->parse_html_string($html, {
127 suppress_errors
=> 1,
128 suppress_warnings
=> 1,
132 my ($package, $filename, $line) = caller;
133 print "parse_html_string failed: $@ when called from $filename:$line\n";
140 =item Htmlfile2Xml( $filename )
142 Convert the HTML in a file into an XML::LibXML::Document.
144 Prints an error-message to STDERR and returns undef if the conversion
151 my( $filename ) = @_;
153 my $html = read_file
( $filename );
155 return Html2Xml
( $html );
159 =item Word2Xml( $content )
161 Convert the Microsoft Word document in $content into html and return
162 the html as an XML::LibXML::Document.
164 Prints an error-message to STDERR and returns undef if the conversion
173 my( $fh, $filename ) = tempfile
();
177 my $doc = Wordfile2Xml
( $filename );
184 my( $filename ) = @_;
186 my $html = qx/$wvhtml "$filename" -/;
189 w
"$wvhtml $filename - failed: $?";
193 # Remove character that makes LibXML choke.
194 $html =~ s/\…/.../g;
196 return Html2Xml
( $html );
200 my( $filename ) = @_;
202 my $data = read_file
( $filename );
204 if( $data =~ /^\<\!DOCTYPE HTML/ )
206 # This is an override that has already been run through wvHtml
207 $doc = Html2Xml
( $data );
211 $doc = Word2Xml
( $data );
221 if( $$cref =~ /^\<\!DOCTYPE HTML/ )
223 # This is an override that has already been run through wvHtml
224 $doc = Html2Xml
( $$cref );
228 $doc = Word2Xml
( $$cref );
236 FindParagraphs( $doc, $expr )
238 Finds all paragraphs in the part of an xml-tree that matches an
239 xpath-expression. Returns a reference to an array of strings.
240 All paragraphs are normalized and empty strings are removed from the
243 Both <div> and <br> are taken into account when splitting the document
246 Use the expression '//body//.' for html-documents when you want to see
247 all paragraphs in the page.
259 my( $doc, $elements ) = @_;
261 my $ns = $doc->find( $elements );
266 foreach my $node ($ns->get_nodelist()) {
267 if( $node->nodeName eq "#text" ) {
268 $p .= $node->textContent();
270 elsif( defined $paraelem{ $node->nodeName } ) {
273 push @paragraphs, $p;
283 # Remove any strange quotation marks and other syntactic marks
284 # that we don't want to have. Remove leading and trailing space as well
285 # multiple whitespace characters.
286 # Returns the empty string if called with an undef string.
291 return "" if not defined( $str );
293 # Uncomment the code below and change the regexp to learn which
294 # character code perl thinks a certain character has.
295 # These codes can be used in \x{YY} expressions as shown below.
296 # if( $str =~ /unique string/ ) {
297 # for( my $i=0; $i < length( $str ); $i++ ) {
298 # printf( "%2x: %s\n", ord( substr( $str, $i, 1 ) ),
299 # substr( $str, $i, 1 ) );
303 $str = expand_entities
( $str );
305 $str =~ tr/\x{96}\x{93}\x{94}/-""/; #
306 $str =~ tr/\x{201d}\x{201c}/""/;
307 $str =~ tr/\x{2022}/*/; # Bullet
308 $str =~ tr/\x{2013}\x{2018}\x{2019}/-''/;
309 $str =~ tr/\x{017c}\x{0144}\x{0105}/zna/;
310 $str =~ s/\x{85}/... /g;
311 $str =~ s/\x{2026}/.../sg;
312 $str =~ s/\x{2007}/ /sg;
316 $str =~ tr/\n\r\t / /s;
323 Add program_type and category to an entry if the entry does not already
324 have a program_type and category.
326 AddCategory( $ce, $program_type, $category );
332 my( $ce, $program_type, $category ) = @_;
334 if( not defined( $ce->{program_type
} ) and defined( $program_type )
335 and ( $program_type =~ /\S/ ) )
337 $ce->{program_type
} = $program_type;
340 if( not defined( $ce->{category
} ) and defined( $category )
341 and ( $category =~ /\S/ ) )
343 $ce->{category
} = $category;
347 =item ParseDescCatSwe
349 Parse a program description in Swedish and return program_type
350 and category that can be deduced from the description.
352 my( $pty, $cat ) = ParseDescCatSwe( "Amerikansk äventyrsserie" );
356 my $sm = NonameTV
::StringMatcher
->new();
357 $sm->AddRegexp( qr/kriminalserie/i, [ 'series', 'Crime/Mystery' ] );
358 $sm->AddRegexp( qr/deckarserie/i, [ 'series', 'Crime/Mystery' ] );
359 $sm->AddRegexp( qr/polisserie/i, [ 'series', 'Crime/Mystery' ] );
360 $sm->AddRegexp( qr/familjeserie/i, [ 'series', undef ] );
361 $sm->AddRegexp( qr/tecknad serie/i, [ 'series', undef ] );
362 $sm->AddRegexp( qr/animerad serie/i, [ 'series', undef ] );
363 $sm->AddRegexp( qr/dramakomediserie/i, [ 'series', 'Comedy' ] );
364 $sm->AddRegexp( qr/dramaserie/i, [ 'series', 'Drama' ] );
365 $sm->AddRegexp( qr/resedokumentärserie/i,[ 'series', 'Food/Travel' ] );
366 $sm->AddRegexp( qr/komediserie/i, [ 'series', 'Comedy' ] );
367 $sm->AddRegexp( qr/realityserie/i, [ 'series', 'Reality' ] );
368 $sm->AddRegexp( qr/realityshow/i, [ 'series', 'Reality' ] );
369 $sm->AddRegexp( qr/dokusåpa/i, [ 'series', 'Reality' ] );
370 $sm->AddRegexp( qr/actiondramaserie/i, [ 'series', 'Action' ] );
371 $sm->AddRegexp( qr/actionserie/i, [ 'series', 'Action' ] );
372 $sm->AddRegexp( qr/underhållningsserie/i,[ 'series', undef ] );
373 $sm->AddRegexp( qr/äventyrsserie/i, [ 'series', 'Action/Adv' ] );
374 $sm->AddRegexp( qr/äventyrskomediserie/i,[ 'series', 'Comedy' ] );
375 $sm->AddRegexp( qr/dokumentärserie/i, [ 'series', 'Documentary' ] );
376 $sm->AddRegexp( qr/dramadokumentär/i, [ undef, 'Documentary' ] );
378 $sm->AddRegexp( qr/barnserie/i, [ 'series', "Children's" ] );
379 $sm->AddRegexp( qr/matlagningsserie/i, [ 'series', 'Cooking' ] );
380 $sm->AddRegexp( qr/motorserie/i, [ 'series', undef ] );
381 $sm->AddRegexp( qr/fixarserie/i, [ 'series', "Home/How-to" ] );
382 $sm->AddRegexp( qr/science[-\s]*fiction[-\s]*serie/i,
383 [ 'series', 'SciFi' ] );
384 $sm->AddRegexp( qr/barnprogram/i, [ undef, "Children's" ] );
387 $sm->AddRegexp( qr/\b(familje|drama|action)*komedi\b/i, [ 'movie', "Comedy" ] );
389 $sm->AddRegexp( qr/\b(krigs|kriminal)*drama\b/i, [ 'movie', "Drama" ] );
391 $sm->AddRegexp( qr/\baction(drama|film)*\b/i, [ 'movie', "Action/Adv" ] );
393 $sm->AddRegexp( qr/\b.ventyrsdrama\b/i, [ 'movie', "Action/Adv" ] );
395 $sm->AddRegexp( qr/\bv.stern(film)*\b/i, [ 'movie', undef ] );
397 $sm->AddRegexp( qr/\b(drama)*thriller\b/i, [ 'movie', "Crime" ] );
399 $sm->AddRegexp( qr/\bscience\s*fiction(rysare)*\b/i, [ 'movie', "SciFi" ] );
401 $sm->AddRegexp( qr/\b(l.ng)*film\b/i, [ 'movie', undef ] );
408 return (undef, undef) if not defined $desc;
410 my $res = $sm->Match( $desc );
411 if( defined( $res ) )
417 return (undef,undef);
425 $d->{title
} =~ s/^s.songs+tart\s*:*\s*//gi;
426 $d->{title
} =~ s/^seriestart\s*:*\s*//gi;
427 $d->{title
} =~ s/^reprisstart\s*:*\s*//gi;
428 $d->{title
} =~ s/^programstart\s*:*\s*//gi;
430 $d->{title
} =~ s/^s.songs*avslutning\s*:*\s*//gi;
431 $d->{title
} =~ s/^sista\s+delen\s*:*\s*//gi;
432 $d->{title
} =~ s/^sista\s+avsnittet\s*:*\s*//gi;
434 if( $d->{title
} =~ s/^((matin.)|(fredagsbio))\s*:\s*//gi )
436 $d->{program_type
} = 'movie';
437 $d->{category
} = 'Movies';
440 # Set program_type to series if the entry has an episode-number
441 # with a defined episode (i.e. second part),
442 # but doesn't have a program_type.
443 if( exists( $d->{episode
} ) and defined( $d->{episode
} ) and
444 ($d->{episode
} !~ /^\s*\.\s*\./) and
445 ( (not defined($d->{program_type
})) or ($d->{program_type
} =~ /^\s*$/) ) )
447 $d->{program_type
} = "series";
453 my $doc = ParseXml( $strref );
455 Parse an xml-string into an XML::LibXML document. Takes a reference to a
456 string as the only reference.
465 if( not defined $xml ) {
466 $xml = XML
::LibXML
->new;
467 $xml->load_ext_dtd(0);
472 $doc = $xml->parse_string($$cref);
475 w
"Failed to parse xml: $@";
484 Parse a reference to an xml-string in xmltv-format into a reference to an
485 array of hashes with programme-info.
492 my $doc = ParseXml
( $cref );
493 return undef if not defined $doc;
497 # Find all "programme"-entries.
498 my $ns = $doc->find( "//programme" );
499 if( $ns->size() == 0 ) {
503 foreach my $pgm ($ns->get_nodelist) {
504 my $start = $pgm->findvalue( '@start' );
505 my $start_dt = create_dt
( $start );
507 my $stop = $pgm->findvalue( '@stop' );
508 my $stop_dt = create_dt
( $stop );
510 my $title = $pgm->findvalue( 'title' );
511 my $subtitle = $pgm->findvalue( 'sub-title' );
513 my $desc = $pgm->findvalue( 'desc' );
514 my $cat1 = $pgm->findvalue( 'category[1]' );
515 my $cat2 = $pgm->findvalue( 'category[2]' );
516 my $episode = $pgm->findvalue( 'episode-num[@system="xmltv_ns"]' );
517 my $production_date = $pgm->findvalue( 'date' );
518 my $url = $pgm->findvalue( 'url' );
520 my $aspect = $pgm->findvalue( 'video/aspect' );
523 my $ns = $pgm->find( ".//actor" );
525 foreach my $act ($ns->get_nodelist) {
526 push @actors, $act->findvalue(".");
530 $ns = $pgm->find( ".//director" );
532 foreach my $dir ($ns->get_nodelist) {
533 push @directors, $dir->findvalue(".");
537 start_dt
=> $start_dt,
540 description
=> $desc,
543 if( $subtitle =~ /\S/ ) {
544 $e{subtitle
} = $subtitle;
547 if( $episode =~ /\S/ ) {
548 $e{episode
} = $episode;
555 if( $cat1 =~ /^[a-z]/ ) {
556 $e{program_type
} = $cat1;
558 elsif( $cat1 =~ /^[A-Z]/ ) {
559 $e{category
} = $cat1;
562 if( $cat2 =~ /^[a-z]/ ) {
563 $e{program_type
} = $cat2;
565 elsif( $cat2 =~ /^[A-Z]/ ) {
566 $e{category
} = $cat2;
569 if( $production_date =~ /\S/ ) {
570 $e{production_date
} = "$production_date-01-01";
573 if( $aspect =~ /\S/ ) {
574 $e{aspect
} = $aspect;
577 if( scalar( @directors ) > 0 ) {
578 $e{directors
} = join ", ", @directors;
581 if( scalar( @actors ) > 0 ) {
582 $e{actors
} = join ", ", @actors;
594 my( $datetime ) = @_;
596 my( $year, $month, $day, $hour, $minute, $second, $tz ) =
597 ($datetime =~ /(\d
{4})(\d
{2})(\d
{2})
598 (\d
{2})(\d
{2})(\d
{2})\s
+
601 my $dt = DateTime
->new(
616 Convert month name to month number
621 my( $monthname , $lang ) = @_;
623 my( @months_1, @months_2, @months_3, @months_4, @months_5 );
625 if( $lang =~ /^en$/ ){
626 @months_1 = qw
/jan feb mar apr may jun jul aug sep oct nov dec/;
627 @months_2 = qw
/janu febr marc apr may june july augu sept octo nov dec/;
628 @months_3 = qw
/january february march april may june july august september october november december/;
629 @months_4 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
630 @months_5 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
631 } elsif( $lang =~ /^hr$/ ){
632 @months_1 = qw
/sij vel ozu tra svi lip srp kol ruj lis stu pro/;
633 @months_2 = qw
/sijecanj veljaca ozujak travanj svibanj lipanj srpanj kolovoz rujan listopad studeni prosinac/;
634 @months_3 = qw
/sijecnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenoga prosinca/;
635 @months_4 = qw
/sijeèa veljače ožujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenog prosinca/;
636 @months_5 = qw
/siječnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenog prosinca/;
637 } elsif( $lang =~ /^sr$/ ){
638 @months_1 = qw
/jan feb mar apr maj jun jul aug sep okt nov dec/;
639 @months_2 = qw
/januar februar mart april maj jun juli august septembar oktobar novembar decembar/;
640 @months_3 = qw
/januara februara marta aprila maja juna jula augusta septembra oktobra novembra decembra/;
641 @months_4 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
642 @months_5 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
643 } elsif( $lang =~ /^it$/ ){
644 @months_1 = qw
/gen feb mar apr mag giu lug ago set ott nov dic/;
645 @months_2 = qw
/gennaio febbraio marzo aprile maggio giugno luglio agosto settembre ottobre novembre dicembre/;
646 @months_3 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
647 @months_4 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
648 @months_5 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
649 } elsif( $lang =~ /^fr$/ ){
650 @months_1 = qw
/jan fav mar avr mai jui jul aou sep oct nov dec/;
651 @months_2 = qw
/JANVIER FÉVRIER mars avril mai juin juillet Août septembre octobre novembre DÉCEMBRE/;
652 @months_3 = qw
/janvier favrier mMARS AVRIL MAI JUIN juillet aout septembre octobre novembre DÉCEMBRE/;
653 @months_4 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
654 @months_5 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
655 } elsif( $lang =~ /^ru$/ ){
656 @months_1 = qw
/jan fav mar aprelja maja jui jul aou sep oct nov dec/;
657 @months_2 = qw
/JANVIER FÉVRIER mars avril mai juin juillet aout septembre octobre novembre DÉCEMBRE/;
658 @months_3 = qw
/janvier favrier mars avril mai juin juillet aout septembre octobre novembre DÉCEMBRE/;
659 @months_4 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
660 @months_5 = qw
/1 2 3 4 5 6 7 8 9 10 11 12/;
665 for( my $i = 0; $i < scalar(@months_1); $i++ ){
666 $monthnames{$months_1[$i]} = $i+1;
669 for( my $i = 0; $i < scalar(@months_2); $i++ ){
670 $monthnames{$months_2[$i]} = $i+1;
673 for( my $i = 0; $i < scalar(@months_3); $i++ ){
674 $monthnames{$months_3[$i]} = $i+1;
677 for( my $i = 0; $i < scalar(@months_4); $i++ ){
678 $monthnames{$months_4[$i]} = $i+1;
681 for( my $i = 0; $i < scalar(@months_5); $i++ ){
682 $monthnames{$months_5[$i]} = $i+1;
685 my $month = $monthnames{$monthname};
686 my $lcmonth = $monthnames{lc $monthname};
688 return $month||$lcmonth;
693 Function: CompareArrays
695 Compare two arrays (new and old) and call functions to reflect added,
696 deleted and unchanged entries.
700 $new - A reference to the new array
701 $old - A reference to the old array
702 $cb - A hashref with callback functions
704 CompareArrays calls the following callback functions:
706 $cb->{cmp}( $enew, $eold ) - Compare an entry from $new with an
707 entry from $old. Shall return -1 if $ea is
708 less than $eb, 0 if they are equal and 1
711 $cb->{added}($enew) - Called for all entries that are present in
712 $new but not in $old.
714 $cb->{deleted}($eold) - Called for all entries that are present in
715 $old but not in $new.
717 $cb->{equal}($enew, $eold) - Called for all entries that are present in
720 Additionally, $cb->{max} shall contain an entry that is always
721 regarded as greater than any possible entry in $new and $old.
729 sub CompareArrays
#( $new, $old, $cb )
731 my( $new, $old, $cb ) = @_;
733 my @a = sort { $cb->{cmp}( $a, $b ) } @
{$new};
734 my @b = sort { $cb->{cmp}( $a, $b ) } @
{$old};
746 # If both arrays have reached the end, we are done.
747 if( ($cb->{cmp}($da, $cb->{max
}) == 0) and
748 ($cb->{cmp}($db, $cb->{max
}) == 0 ) ) {
752 my $cmp = $cb->{cmp}($da, $db);
755 $cb->{equal
}($da, $db);
772 ### Setup coding system