Merge branch 'master' of git+ssh://tcrnek@repo.or.cz/srv/git/nonametv
[nonametv.git] / lib / NonameTV.pm
blob43b302b0742b2d64a6ab991572b1ea82ea6b82df
1 package NonameTV;
3 use strict;
4 use warnings;
6 # Mark this source-file as encoded in utf-8.
7 use utf8;
8 use Env;
10 use LWP::UserAgent;
11 use File::Temp qw/tempfile tempdir/;
12 use File::Slurp;
14 use NonameTV::StringMatcher;
15 use NonameTV::Log qw/w/;
16 use XML::LibXML;
18 BEGIN {
19 use Exporter ();
20 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
22 # set the version for version checking
23 $VERSION = 0.3;
25 @ISA = qw(Exporter);
26 @EXPORT = qw( );
27 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
28 @EXPORT_OK = qw/MyGet expand_entities
29 Html2Xml Htmlfile2Xml
30 Word2Xml Wordfile2Xml
31 File2Xml Content2Xml
32 FindParagraphs
33 norm AddCategory
34 ParseDescCatSwe FixProgrammeData
35 ParseXml ParseXmltv
36 MonthNumber
37 CompareArrays
40 our @EXPORT_OK;
42 my $wvhtml = '/usr/bin/wvHtml --charset=utf8';
43 # my $wvhtml = '/usr/bin/wvHtml';
45 my $ua = LWP::UserAgent->new( agent => "Grabber from http://tv.swedb.se",
46 cookie_jar => {} );
48 # Fetch a url. Returns ($content, true) if data was fetched from server and
49 # different from the last time the same url was fetched, ($content, false) if
50 # it was fetched from the server and was the same as the last time it was
51 # fetched and (undef,$error_message) if there was an error fetching the data.
53 sub MyGet
55 my( $url ) = @_;
56 my $res = $ua->get( $url );
58 if( $res->is_success )
60 return ($res->content, not defined( $res->header( 'X-Content-Unchanged' ) ) );
62 else
64 return (undef, $res->status_line );
68 # åäö ÅÄÖ
69 my %ent = (
70 257 => 'ä',
71 231 => 'c', # This should really be a c with a special mark on it.
72 # Unicode 000E7, UTF-8 195 167.
73 337 => 'ö',
74 8211 => '-',
75 8212 => '--',
76 8216 => "'",
77 8217 => "'",
78 8220 => '"',
79 8221 => '"',
80 8230 => '...',
81 8364 => "(euro)",
84 sub _expand
86 my( $num, $str ) = @_;
88 if( not defined( $ent{$num} ) )
90 $ent{$num} = "";
91 print STDERR "Unknown entity $num in $str\n";
94 return $ent{$num};
97 sub expand_entities
99 my( $str ) = @_;
101 $str =~ s/\&#(\d+);/_expand($1,$str)/eg;
103 return $str;
106 =item Html2Xml( $content )
108 Convert the HTML in $content into an XML::LibXML::Document.
110 Prints an error-message to STDERR and returns undef if the conversion
111 fails.
113 =cut
115 sub Html2Xml {
116 my( $html ) = @_;
118 my $xml = XML::LibXML->new;
119 $xml->recover(1);
121 # Remove character that makes the parser stop.
122 $html =~ s/\x00//g;
124 my $doc;
125 eval { $doc = $xml->parse_html_string($html, {
126 recover => 1,
127 suppress_errors => 1,
128 suppress_warnings => 1,
129 }); };
131 if( $@ ne "" ) {
132 my ($package, $filename, $line) = caller;
133 print "parse_html_string failed: $@ when called from $filename:$line\n";
134 return undef;
137 return $doc;
140 =item Htmlfile2Xml( $filename )
142 Convert the HTML in a file into an XML::LibXML::Document.
144 Prints an error-message to STDERR and returns undef if the conversion
145 fails.
147 =cut
149 sub Htmlfile2Xml
151 my( $filename ) = @_;
153 my $html = read_file( $filename );
155 return Html2Xml( $html );
159 =item Word2Xml( $content )
161 Convert the Microsoft Word document in $content into html and return
162 the html as an XML::LibXML::Document.
164 Prints an error-message to STDERR and returns undef if the conversion
165 fails.
167 =cut
169 sub Word2Xml
171 my( $content ) = @_;
173 my( $fh, $filename ) = tempfile();
174 print $fh $content;
175 close( $fh );
177 my $doc = Wordfile2Xml( $filename );
178 unlink( $filename );
179 return $doc;
182 sub Wordfile2Xml
184 my( $filename ) = @_;
186 my $html = qx/$wvhtml "$filename" -/;
187 if( $? )
189 w "$wvhtml $filename - failed: $?";
190 return undef;
193 # Remove character that makes LibXML choke.
194 $html =~ s/\…/.../g;
196 return Html2Xml( $html );
199 sub File2Xml {
200 my( $filename ) = @_;
202 my $data = read_file( $filename );
203 my $doc;
204 if( $data =~ /^\<\!DOCTYPE HTML/ )
206 # This is an override that has already been run through wvHtml
207 $doc = Html2Xml( $data );
209 else
211 $doc = Word2Xml( $data );
214 return $doc;
217 sub Content2Xml {
218 my( $cref ) = @_;
220 my $doc;
221 if( $$cref =~ /^\<\!DOCTYPE HTML/ )
223 # This is an override that has already been run through wvHtml
224 $doc = Html2Xml( $$cref );
226 else
228 $doc = Word2Xml( $$cref );
231 return $doc;
234 =pod
236 FindParagraphs( $doc, $expr )
238 Finds all paragraphs in the part of an xml-tree that matches an
239 xpath-expression. Returns a reference to an array of strings.
240 All paragraphs are normalized and empty strings are removed from the
241 array.
243 Both <div> and <br> are taken into account when splitting the document
244 into paragraphs.
246 Use the expression '//body//.' for html-documents when you want to see
247 all paragraphs in the page.
249 =cut
251 my %paraelem = (
252 p => 1,
253 br => 1,
254 div => 1,
255 td => 1,
258 sub FindParagraphs {
259 my( $doc, $elements ) = @_;
261 my $ns = $doc->find( $elements );
263 my @paragraphs;
264 my $p = "";
266 foreach my $node ($ns->get_nodelist()) {
267 if( $node->nodeName eq "#text" ) {
268 $p .= $node->textContent();
270 elsif( defined $paraelem{ $node->nodeName } ) {
271 $p = norm( $p );
272 if( $p ne "" ) {
273 push @paragraphs, $p;
274 $p = "";
279 return \@paragraphs;
283 # Remove any strange quotation marks and other syntactic marks
284 # that we don't want to have. Remove leading and trailing space as well
285 # multiple whitespace characters.
286 # Returns the empty string if called with an undef string.
287 sub norm
289 my( $str ) = @_;
291 return "" if not defined( $str );
293 # Uncomment the code below and change the regexp to learn which
294 # character code perl thinks a certain character has.
295 # These codes can be used in \x{YY} expressions as shown below.
296 # if( $str =~ /unique string/ ) {
297 # for( my $i=0; $i < length( $str ); $i++ ) {
298 # printf( "%2x: %s\n", ord( substr( $str, $i, 1 ) ),
299 # substr( $str, $i, 1 ) );
303 $str = expand_entities( $str );
305 $str =~ tr/\x{96}\x{93}\x{94}/-""/; #
306 $str =~ tr/\x{201d}\x{201c}/""/;
307 $str =~ tr/\x{2022}/*/; # Bullet
308 $str =~ tr/\x{2013}\x{2018}\x{2019}/-''/;
309 $str =~ tr/\x{017c}\x{0144}\x{0105}/zna/;
310 $str =~ s/\x{85}/... /g;
311 $str =~ s/\x{2026}/.../sg;
312 $str =~ s/\x{2007}/ /sg;
314 $str =~ s/^\s+//;
315 $str =~ s/\s+$//;
316 $str =~ tr/\n\r\t / /s;
318 return $str;
321 =item AddCategory
323 Add program_type and category to an entry if the entry does not already
324 have a program_type and category.
326 AddCategory( $ce, $program_type, $category );
328 =cut
330 sub AddCategory
332 my( $ce, $program_type, $category ) = @_;
334 if( not defined( $ce->{program_type} ) and defined( $program_type )
335 and ( $program_type =~ /\S/ ) )
337 $ce->{program_type} = $program_type;
340 if( not defined( $ce->{category} ) and defined( $category )
341 and ( $category =~ /\S/ ) )
343 $ce->{category} = $category;
347 =item ParseDescCatSwe
349 Parse a program description in Swedish and return program_type
350 and category that can be deduced from the description.
352 my( $pty, $cat ) = ParseDescCatSwe( "Amerikansk äventyrsserie" );
354 =cut
356 my $sm = NonameTV::StringMatcher->new();
357 $sm->AddRegexp( qr/kriminalserie/i, [ 'series', 'Crime/Mystery' ] );
358 $sm->AddRegexp( qr/deckarserie/i, [ 'series', 'Crime/Mystery' ] );
359 $sm->AddRegexp( qr/polisserie/i, [ 'series', 'Crime/Mystery' ] );
360 $sm->AddRegexp( qr/familjeserie/i, [ 'series', undef ] );
361 $sm->AddRegexp( qr/tecknad serie/i, [ 'series', undef ] );
362 $sm->AddRegexp( qr/animerad serie/i, [ 'series', undef ] );
363 $sm->AddRegexp( qr/dramakomediserie/i, [ 'series', 'Comedy' ] );
364 $sm->AddRegexp( qr/dramaserie/i, [ 'series', 'Drama' ] );
365 $sm->AddRegexp( qr/resedokumentärserie/i,[ 'series', 'Food/Travel' ] );
366 $sm->AddRegexp( qr/komediserie/i, [ 'series', 'Comedy' ] );
367 $sm->AddRegexp( qr/realityserie/i, [ 'series', 'Reality' ] );
368 $sm->AddRegexp( qr/realityshow/i, [ 'series', 'Reality' ] );
369 $sm->AddRegexp( qr/dokusåpa/i, [ 'series', 'Reality' ] );
370 $sm->AddRegexp( qr/actiondramaserie/i, [ 'series', 'Action' ] );
371 $sm->AddRegexp( qr/actionserie/i, [ 'series', 'Action' ] );
372 $sm->AddRegexp( qr/underhållningsserie/i,[ 'series', undef ] );
373 $sm->AddRegexp( qr/äventyrsserie/i, [ 'series', 'Action/Adv' ] );
374 $sm->AddRegexp( qr/äventyrskomediserie/i,[ 'series', 'Comedy' ] );
375 $sm->AddRegexp( qr/dokumentärserie/i, [ 'series', 'Documentary' ] );
376 $sm->AddRegexp( qr/dramadokumentär/i, [ undef, 'Documentary' ] );
378 $sm->AddRegexp( qr/barnserie/i, [ 'series', "Children's" ] );
379 $sm->AddRegexp( qr/matlagningsserie/i, [ 'series', 'Cooking' ] );
380 $sm->AddRegexp( qr/motorserie/i, [ 'series', undef ] );
381 $sm->AddRegexp( qr/fixarserie/i, [ 'series', "Home/How-to" ] );
382 $sm->AddRegexp( qr/science[-\s]*fiction[-\s]*serie/i,
383 [ 'series', 'SciFi' ] );
384 $sm->AddRegexp( qr/barnprogram/i, [ undef, "Children's" ] );
386 # Movies
387 $sm->AddRegexp( qr/\b(familje|drama|action)*komedi\b/i, [ 'movie', "Comedy" ] );
389 $sm->AddRegexp( qr/\b(krigs|kriminal)*drama\b/i, [ 'movie', "Drama" ] );
391 $sm->AddRegexp( qr/\baction(drama|film)*\b/i, [ 'movie', "Action/Adv" ] );
393 $sm->AddRegexp( qr/\b.ventyrsdrama\b/i, [ 'movie', "Action/Adv" ] );
395 $sm->AddRegexp( qr/\bv.stern(film)*\b/i, [ 'movie', undef ] );
397 $sm->AddRegexp( qr/\b(drama)*thriller\b/i, [ 'movie', "Crime" ] );
399 $sm->AddRegexp( qr/\bscience\s*fiction(rysare)*\b/i, [ 'movie', "SciFi" ] );
401 $sm->AddRegexp( qr/\b(l.ng)*film\b/i, [ 'movie', undef ] );
404 sub ParseDescCatSwe
406 my( $desc ) = @_;
408 return (undef, undef) if not defined $desc;
410 my $res = $sm->Match( $desc );
411 if( defined( $res ) )
413 return @{$res};
415 else
417 return (undef,undef);
421 sub FixProgrammeData
423 my( $d ) = @_;
425 $d->{title} =~ s/^s.songs+tart\s*:*\s*//gi;
426 $d->{title} =~ s/^seriestart\s*:*\s*//gi;
427 $d->{title} =~ s/^reprisstart\s*:*\s*//gi;
428 $d->{title} =~ s/^programstart\s*:*\s*//gi;
430 $d->{title} =~ s/^s.songs*avslutning\s*:*\s*//gi;
431 $d->{title} =~ s/^sista\s+delen\s*:*\s*//gi;
432 $d->{title} =~ s/^sista\s+avsnittet\s*:*\s*//gi;
434 if( $d->{title} =~ s/^((matin.)|(fredagsbio))\s*:\s*//gi )
436 $d->{program_type} = 'movie';
437 $d->{category} = 'Movies';
440 # Set program_type to series if the entry has an episode-number
441 # with a defined episode (i.e. second part),
442 # but doesn't have a program_type.
443 if( exists( $d->{episode} ) and defined( $d->{episode} ) and
444 ($d->{episode} !~ /^\s*\.\s*\./) and
445 ( (not defined($d->{program_type})) or ($d->{program_type} =~ /^\s*$/) ) )
447 $d->{program_type} = "series";
451 =pod
453 my $doc = ParseXml( $strref );
455 Parse an xml-string into an XML::LibXML document. Takes a reference to a
456 string as the only reference.
458 =cut
460 my $xml;
462 sub ParseXml {
463 my( $cref ) = @_;
465 if( not defined $xml ) {
466 $xml = XML::LibXML->new;
467 $xml->load_ext_dtd(0);
470 my $doc;
471 eval {
472 $doc = $xml->parse_string($$cref);
474 if( $@ ne "" ) {
475 w "Failed to parse xml: $@";
476 return undef;
479 return $doc;
482 =pod
484 Parse a reference to an xml-string in xmltv-format into a reference to an
485 array of hashes with programme-info.
487 =cut
489 sub ParseXmltv {
490 my( $cref ) = @_;
492 my $doc = ParseXml( $cref );
493 return undef if not defined $doc;
495 my @d;
497 # Find all "programme"-entries.
498 my $ns = $doc->find( "//programme" );
499 if( $ns->size() == 0 ) {
500 return;
503 foreach my $pgm ($ns->get_nodelist) {
504 my $start = $pgm->findvalue( '@start' );
505 my $start_dt = create_dt( $start );
507 my $stop = $pgm->findvalue( '@stop' );
508 my $stop_dt = create_dt( $stop );
510 my $title = $pgm->findvalue( 'title' );
511 my $subtitle = $pgm->findvalue( 'sub-title' );
513 my $desc = $pgm->findvalue( 'desc' );
514 my $cat1 = $pgm->findvalue( 'category[1]' );
515 my $cat2 = $pgm->findvalue( 'category[2]' );
516 my $episode = $pgm->findvalue( 'episode-num[@system="xmltv_ns"]' );
517 my $production_date = $pgm->findvalue( 'date' );
518 my $url = $pgm->findvalue( 'url' );
520 my $aspect = $pgm->findvalue( 'video/aspect' );
522 my @actors;
523 my $ns = $pgm->find( ".//actor" );
525 foreach my $act ($ns->get_nodelist) {
526 push @actors, $act->findvalue(".");
529 my @directors;
530 $ns = $pgm->find( ".//director" );
532 foreach my $dir ($ns->get_nodelist) {
533 push @directors, $dir->findvalue(".");
536 my %e = (
537 start_dt => $start_dt,
538 stop_dt => $stop_dt,
539 title => $title,
540 description => $desc,
543 if( $subtitle =~ /\S/ ) {
544 $e{subtitle} = $subtitle;
547 if( $episode =~ /\S/ ) {
548 $e{episode} = $episode;
551 if( $url =~ /\S/ ) {
552 $e{url} = $url;
555 if( $cat1 =~ /^[a-z]/ ) {
556 $e{program_type} = $cat1;
558 elsif( $cat1 =~ /^[A-Z]/ ) {
559 $e{category} = $cat1;
562 if( $cat2 =~ /^[a-z]/ ) {
563 $e{program_type} = $cat2;
565 elsif( $cat2 =~ /^[A-Z]/ ) {
566 $e{category} = $cat2;
569 if( $production_date =~ /\S/ ) {
570 $e{production_date} = "$production_date-01-01";
573 if( $aspect =~ /\S/ ) {
574 $e{aspect} = $aspect;
577 if( scalar( @directors ) > 0 ) {
578 $e{directors} = join ", ", @directors;
581 if( scalar( @actors ) > 0 ) {
582 $e{actors} = join ", ", @actors;
586 push @d, \%e;
589 return \@d;
592 sub create_dt
594 my( $datetime ) = @_;
596 my( $year, $month, $day, $hour, $minute, $second, $tz ) =
597 ($datetime =~ /(\d{4})(\d{2})(\d{2})
598 (\d{2})(\d{2})(\d{2})\s+
599 (\S+)$/x);
601 my $dt = DateTime->new(
602 year => $year,
603 month => $month,
604 day => $day,
605 hour => $hour,
606 minute => $minute,
607 second => $second,
608 time_zone => $tz
611 return $dt;
614 =pod
616 Convert month name to month number
618 =cut
620 sub MonthNumber {
621 my( $monthname , $lang ) = @_;
623 my( @months_1, @months_2, @months_3, @months_4, @months_5 );
625 if( $lang =~ /^en$/ ){
626 @months_1 = qw/jan feb mar apr may jun jul aug sep oct nov dec/;
627 @months_2 = qw/janu febr marc apr may june july augu sept octo nov dec/;
628 @months_3 = qw/january february march april may june july august september october november december/;
629 @months_4 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
630 @months_5 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
631 } elsif( $lang =~ /^hr$/ ){
632 @months_1 = qw/sij vel ozu tra svi lip srp kol ruj lis stu pro/;
633 @months_2 = qw/sijecanj veljaca ozujak travanj svibanj lipanj srpanj kolovoz rujan listopad studeni prosinac/;
634 @months_3 = qw/sijecnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenoga prosinca/;
635 @months_4 = qw/sijeèa veljače ožujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenog prosinca/;
636 @months_5 = qw/siječnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenog prosinca/;
637 } elsif( $lang =~ /^sr$/ ){
638 @months_1 = qw/jan feb mar apr maj jun jul aug sep okt nov dec/;
639 @months_2 = qw/januar februar mart april maj jun juli august septembar oktobar novembar decembar/;
640 @months_3 = qw/januara februara marta aprila maja juna jula augusta septembra oktobra novembra decembra/;
641 @months_4 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
642 @months_5 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
643 } elsif( $lang =~ /^it$/ ){
644 @months_1 = qw/gen feb mar apr mag giu lug ago set ott nov dic/;
645 @months_2 = qw/gennaio febbraio marzo aprile maggio giugno luglio agosto settembre ottobre novembre dicembre/;
646 @months_3 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
647 @months_4 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
648 @months_5 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
649 } elsif( $lang =~ /^fr$/ ){
650 @months_1 = qw/jan fav mar avr mai jui jul aou sep oct nov dec/;
651 @months_2 = qw/JANVIER FÉVRIER mars avril mai juin juillet Août septembre octobre novembre DÉCEMBRE/;
652 @months_3 = qw/janvier favrier mMARS AVRIL MAI JUIN juillet aout septembre octobre novembre DÉCEMBRE/;
653 @months_4 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
654 @months_5 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
655 } elsif( $lang =~ /^ru$/ ){
656 @months_1 = qw/jan fav mar aprelja maja jui jul aou sep oct nov dec/;
657 @months_2 = qw/JANVIER FÉVRIER mars avril mai juin juillet aout septembre octobre novembre DÉCEMBRE/;
658 @months_3 = qw/janvier favrier mars avril mai juin juillet aout septembre octobre novembre DÉCEMBRE/;
659 @months_4 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
660 @months_5 = qw/1 2 3 4 5 6 7 8 9 10 11 12/;
663 my %monthnames = ();
665 for( my $i = 0; $i < scalar(@months_1); $i++ ){
666 $monthnames{$months_1[$i]} = $i+1;
669 for( my $i = 0; $i < scalar(@months_2); $i++ ){
670 $monthnames{$months_2[$i]} = $i+1;
673 for( my $i = 0; $i < scalar(@months_3); $i++ ){
674 $monthnames{$months_3[$i]} = $i+1;
677 for( my $i = 0; $i < scalar(@months_4); $i++ ){
678 $monthnames{$months_4[$i]} = $i+1;
681 for( my $i = 0; $i < scalar(@months_5); $i++ ){
682 $monthnames{$months_5[$i]} = $i+1;
685 my $month = $monthnames{$monthname};
686 my $lcmonth = $monthnames{lc $monthname};
688 return $month||$lcmonth;
691 =begin nd
693 Function: CompareArrays
695 Compare two arrays (new and old) and call functions to reflect added,
696 deleted and unchanged entries.
698 Parameters:
700 $new - A reference to the new array
701 $old - A reference to the old array
702 $cb - A hashref with callback functions
704 CompareArrays calls the following callback functions:
706 $cb->{cmp}( $enew, $eold ) - Compare an entry from $new with an
707 entry from $old. Shall return -1 if $ea is
708 less than $eb, 0 if they are equal and 1
709 otherwise.
711 $cb->{added}($enew) - Called for all entries that are present in
712 $new but not in $old.
714 $cb->{deleted}($eold) - Called for all entries that are present in
715 $old but not in $new.
717 $cb->{equal}($enew, $eold) - Called for all entries that are present in
718 both $new and $old.
720 Additionally, $cb->{max} shall contain an entry that is always
721 regarded as greater than any possible entry in $new and $old.
723 Returns:
725 nothing
727 =cut
729 sub CompareArrays #( $new, $old, $cb )
731 my( $new, $old, $cb ) = @_;
733 my @a = sort { $cb->{cmp}( $a, $b ) } @{$new};
734 my @b = sort { $cb->{cmp}( $a, $b ) } @{$old};
736 push @a, $cb->{max};
737 push @b, $cb->{max};
739 my $ia = 0;
740 my $ib = 0;
742 while( 1 ) {
743 my $da = $a[$ia];
744 my $db = $b[$ib];
746 # If both arrays have reached the end, we are done.
747 if( ($cb->{cmp}($da, $cb->{max}) == 0) and
748 ($cb->{cmp}($db, $cb->{max}) == 0 ) ) {
749 last;
752 my $cmp = $cb->{cmp}($da, $db);
754 if( $cmp == 0 ) {
755 $cb->{equal}($da, $db);
756 $ia++;
757 $ib++;
759 elsif( $cmp < 0 ) {
760 $cb->{added}( $da );
761 $ia++;
763 else {
764 $cb->{deleted}($db);
765 $ib++;
772 ### Setup coding system
773 ## Local Variables:
774 ## coding: utf-8
775 ## End: