lib/NonameTV.pm

   1 package NonameTV;
   2
   3 use strict;
   4 use warnings;
   5
   6 # Mark this source-file as encoded in utf-8.
   7 use utf8;
   8 use Env;
   9
  10 use LWP::UserAgent;
  11 use File::Temp qw/tempfile tempdir/;
  12 use File::Slurp;
  13
  14 use NonameTV::StringMatcher;
  15 use NonameTV::Log qw/logdie error/;
  16 use XML::LibXML;
  17
  18 BEGIN {
  19     use Exporter   ();
  20     our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
  21
  22     # set the version for version checking
  23     $VERSION     = 0.3;
  24
  25     @ISA         = qw(Exporter);
  26     @EXPORT      = qw( );
  27     %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
  28     @EXPORT_OK   = qw/MyGet expand_entities
  29                       Html2Xml Htmlfile2Xml
  30                       Wordfile2HtmlTree Htmlfile2HtmlTree
  31                       Word2Xml Wordfile2Xml
  32                       File2Xml
  33                       FindParagraphs
  34                       norm AddCategory
  35                       ParseDescCatSwe FixProgrammeData
  36                       ParseXml ParseXmltv
  37                       MonthNumber/;
  38 }
  39 our @EXPORT_OK;
  40
  41 my $wvhtml = '/usr/bin/wvHtml --charset=utf8';
  42 # my $wvhtml = '/usr/bin/wvHtml';
  43
  44 my $ua = LWP::UserAgent->new( agent => "Grabber from http://tv.swedb.se",
  45                               cookie_jar => {} );
  46
  47 # Fetch a url. Returns ($content, true) if data was fetched from server and
  48 # different from the last time the same url was fetched, ($content, false) if
  49 # it was fetched from the server and was the same as the last time it was
  50 # fetched and (undef,$error_message) if there was an error fetching the data.
  51
  52 sub MyGet
  53 {
  54   my( $url ) = @_;
  55   my $res = $ua->get( $url );
  56
  57   if( $res->is_success )
  58   {
  59     return ($res->content, not defined( $res->header( 'X-Content-Unchanged' ) ) );
  60   }
  61   else
  62   {
  63     return (undef, $res->status_line );
  64   }
  65 }
  66
  67 # åäö ÅÄÖ
  68 my %ent = (
  69            257  => 'ä',
  70            231  => 'c', # This should really be a c with a special mark on it.
  71                         # Unicode 000E7, UTF-8 195 167.
  72            337  => 'ö',
  73            8211 => '-',
  74            8212 => '--',
  75            8216 => "'",
  76            8217 => "'",
  77            8220 => '"',
  78            8221 => '"',
  79            8230 => '...',
  80            8364 => "(euro)",
  81            );
  82
  83 sub _expand
  84 {
  85   my( $num, $str ) = @_;
  86
  87   if( not defined( $ent{$num} ) )
  88   {
  89     $ent{$num} = "";
  90     print STDERR "Unknown entity $num in $str\n";
  91   }
  92
  93   return $ent{$num};
  94 }
  95
  96 sub expand_entities
  97 {
  98   my( $str ) = @_;
  99
 100   $str =~ s/\&#(\d+);/_expand($1,$str)/eg;
 101
 102   return $str;
 103 }
 104
 105 =item Html2Xml( $content )
 106
 107 Convert the HTML in $content into an XML::LibXML::Document.
 108
 109 Prints an error-message to STDERR and returns undef if the conversion
 110 fails.
 111
 112 =cut
 113
 114 sub Html2Xml
 115 {
 116   my( $html ) = @_;
 117   my $xml = XML::LibXML->new;
 118   $xml->recover(1);
 119
 120   # Stupid XML::LibXML writes to STDERR. Redirect it temporarily.
 121   open(SAVERR, ">&STDERR"); # save the stderr fhandle
 122   print SAVERR "Nothing\n" if 0;
 123   open(STDERR,"> /dev/null");
 124
 125   # Remove character that makes the parser stop.
 126   $html =~ s/\x00//g;
 127
 128   my $doc;
 129   eval { $doc = $xml->parse_html_string($html); };
 130
 131   # Restore STDERR
 132   open( STDERR, ">&SAVERR" );
 133
 134   if( $@ ne "" )
 135   {
 136     my ($package, $filename, $line) = caller;
 137     print "parse_html_string failed: $@ when called from $filename:$line\n";
 138     return undef;
 139   }
 140
 141   return $doc;
 142 }
 143
 144 =item Htmlfile2Xml( $filename )
 145
 146 Convert the HTML in a file into an XML::LibXML::Document.
 147
 148 Prints an error-message to STDERR and returns undef if the conversion
 149 fails.
 150
 151 =cut
 152
 153 sub Htmlfile2Xml
 154 {
 155   my( $filename ) = @_;
 156
 157   my $html = read_file( $filename );
 158
 159   return Html2Xml( $html );
 160 }
 161
 162
 163 =item Word2Xml( $content )
 164
 165 Convert the Microsoft Word document in $content into html and return
 166 the html as an XML::LibXML::Document.
 167
 168 Prints an error-message to STDERR and returns undef if the conversion
 169 fails.
 170
 171 =cut
 172
 173 sub Word2Xml
 174 {
 175   my( $content ) = @_;
 176
 177   my( $fh, $filename ) = tempfile();
 178   print $fh $content;
 179   close( $fh );
 180
 181   my $doc = Wordfile2Xml( $filename );
 182   unlink( $filename );
 183   return $doc;
 184 }
 185
 186 sub Wordfile2Xml
 187 {
 188   my( $filename ) = @_;
 189
 190   my $html = qx/$wvhtml "$filename" -/;
 191   if( $? )
 192   {
 193     print "$wvhtml $filename - failed: $?\n";
 194     return undef;
 195   }
 196
 197   # Remove character that makes LibXML choke.
 198   $html =~ s/\&hellip;/.../g;
 199
 200   return Html2Xml( $html );
 201 }
 202
 203 sub File2Xml {
 204   my( $filename ) = @_;
 205
 206   my $data = read_file( $filename );
 207   my $doc;
 208   if( $data =~ /^\<\!DOCTYPE HTML/ )
 209   {
 210     # This is an override that has already been run through wvHtml
 211     $doc = Html2Xml( $data );
 212   }
 213   else
 214   {
 215     $doc = Word2Xml( $data );
 216   }
 217
 218   return $doc;
 219 }
 220
 221 =pod
 222
 223 FindParagraphs( $doc, $expr )
 224
 225 Finds all paragraphs in the part of an xml-tree that matches an
 226 xpath-expression. Returns a reference to an array of strings.
 227 All paragraphs are normalized and empty strings are removed from the
 228 array.
 229
 230 Both <div> and <br> are taken into account when splitting the document
 231 into paragraphs.
 232
 233 Use the expression '//body//.' for html-documents when you want to see
 234 all paragraphs in the page.
 235
 236 =cut
 237
 238 my %paraelem = (
 239                 p => 1,
 240                 br => 1,
 241                 div => 1,
 242                 td => 1,
 243                 );
 244
 245 sub FindParagraphs {
 246   my( $doc, $elements ) = @_;
 247
 248   my $ns = $doc->find( $elements );
 249
 250   my @paragraphs;
 251   my $p = "";
 252
 253   foreach my $node ($ns->get_nodelist()) {
 254     if( $node->nodeName eq "#text" ) {
 255       $p .= $node->textContent();
 256     }
 257     elsif( defined $paraelem{ $node->nodeName } ) {
 258       $p = norm( $p );
 259       if( $p ne "" ) {
 260         push @paragraphs, $p;
 261         $p = "";
 262       }
 263     }
 264   }
 265
 266   return \@paragraphs;
 267 }
 268
 269
 270 # Remove any strange quotation marks and other syntactic marks
 271 # that we don't want to have. Remove leading and trailing space as well
 272 # multiple whitespace characters.
 273 # Returns the empty string if called with an undef string.
 274 sub norm
 275 {
 276   my( $str ) = @_;
 277
 278   return "" if not defined( $str );
 279
 280 # Uncomment the code below and change the regexp to learn which
 281 # character code perl thinks a certain character has.
 282 # These codes can be used in \x{YY} expressions as shown below.
 283 #  if( $str =~ /unique string/ ) {
 284 #    for( my $i=0; $i < length( $str ); $i++ ) {
 285 #      printf( "%2x: %s\n", ord( substr( $str, $i, 1 ) ),
 286 #               substr( $str, $i, 1 ) );
 287 #    }
 288 #  }
 289
 290   $str = expand_entities( $str );
 291
 292   $str =~ tr/\x{96}\x{93}\x{94}/-""/; #
 293   $str =~ tr/\x{201d}\x{201c}/""/;
 294   $str =~ tr/\x{2013}\x{2019}/-'/;
 295   $str =~ s/\x{85}/... /g;
 296   $str =~ s/\x{2026}/.../sg;
 297   $str =~ s/\x{2007}/ /sg;
 298
 299   $str =~ s/^\s+//;
 300   $str =~ s/\s+$//;
 301   $str =~ tr/\n\r\t /    /s;
 302
 303   return $str;
 304 }
 305
 306 # Generate HTML file in tempdir and run Htmlfile2HtmlTree
 307 sub Wordfile2HtmlTree
 308 {
 309   my ($filename) = @_;
 310
 311   my $dir= tempdir( CLEANUP => 1 );
 312   (my $htmlfile= "$filename.html") =~ s|.*/([^/]+)$|$1|;
 313   if(system "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\"") {
 314       print "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\" failed: $?\n";
 315       return undef;
 316   }
 317   return &Htmlfile2HtmlTree("$dir/$htmlfile");
 318 }
 319
 320 # Generate HTML::Tree from html file
 321 sub Htmlfile2HtmlTree
 322 {
 323     my ($filename)= @_;
 324     my $tree = HTML::TreeBuilder->new();
 325     open(my $fh, "<:utf8", "$filename")
 326       or logdie( "Failed to read from $filename" );
 327
 328     $tree->parse_file($fh);
 329
 330     return $tree;
 331 }
 332
 333 =item AddCategory
 334
 335 Add program_type and category to an entry if the entry does not already
 336 have a program_type and category.
 337
 338 AddCategory( $ce, $program_type, $category );
 339
 340 =cut
 341
 342 sub AddCategory
 343 {
 344   my( $ce, $program_type, $category ) = @_;
 345
 346   if( not defined( $ce->{program_type} ) and defined( $program_type )
 347       and ( $program_type =~ /\S/ ) )
 348   {
 349     $ce->{program_type} = $program_type;
 350   }
 351
 352   if( not defined( $ce->{category} ) and defined( $category )
 353       and ( $category =~ /\S/ ) )
 354   {
 355     $ce->{category} = $category;
 356   }
 357 }
 358
 359 =item ParseDescCatSwe
 360
 361 Parse a program description in Swedish and return program_type
 362 and category that can be deduced from the description.
 363
 364   my( $pty, $cat ) = ParseDescCatSwe( "Amerikansk äventyrsserie" );
 365
 366 =cut
 367
 368 my $sm = NonameTV::StringMatcher->new();
 369 $sm->AddRegexp( qr/kriminalserie/i,      [ 'series', 'Crime/Mystery' ] );
 370 $sm->AddRegexp( qr/deckarserie/i,        [ 'series', 'Crime/Mystery' ] );
 371 $sm->AddRegexp( qr/polisserie/i,         [ 'series', 'Crime/Mystery' ] );
 372 $sm->AddRegexp( qr/familjeserie/i,       [ 'series', undef ] );
 373 $sm->AddRegexp( qr/tecknad serie/i,      [ 'series', undef ] );
 374 $sm->AddRegexp( qr/animerad serie/i,     [ 'series', undef ] );
 375 $sm->AddRegexp( qr/dramakomediserie/i,   [ 'series', 'Comedy' ] );
 376 $sm->AddRegexp( qr/dramaserie/i,         [ 'series', 'Drama' ] );
 377 $sm->AddRegexp( qr/resedokumentärserie/i,[ 'series', 'Food/Travel' ] );
 378 $sm->AddRegexp( qr/komediserie/i,        [ 'series', 'Comedy' ] );
 379 $sm->AddRegexp( qr/realityserie/i,       [ 'series', 'Reality' ] );
 380 $sm->AddRegexp( qr/realityshow/i,        [ 'series', 'Reality' ] );
 381 $sm->AddRegexp( qr/dokusåpa/i,           [ 'series', 'Reality' ] );
 382 $sm->AddRegexp( qr/actiondramaserie/i,   [ 'series', 'Action' ] );
 383 $sm->AddRegexp( qr/actionserie/i,        [ 'series', 'Action' ] );
 384 $sm->AddRegexp( qr/underhållningsserie/i,[ 'series', undef ] );
 385 $sm->AddRegexp( qr/äventyrsserie/i,      [ 'series', 'Action/Adv' ] );
 386 $sm->AddRegexp( qr/äventyrskomediserie/i,[ 'series', 'Comedy' ] );
 387 $sm->AddRegexp( qr/dokumentärserie/i,    [ 'series', 'Documentary' ] );
 388 $sm->AddRegexp( qr/dramadokumentär/i,    [ undef,    'Documentary' ] );
 389
 390 $sm->AddRegexp( qr/barnserie/i,          [ 'series', "Children's" ] );
 391 $sm->AddRegexp( qr/matlagningsserie/i,   [ 'series', 'Cooking' ] );
 392 $sm->AddRegexp( qr/motorserie/i,         [ 'series', undef ] );
 393 $sm->AddRegexp( qr/fixarserie/i,         [ 'series', "Home/How-to" ] );
 394 $sm->AddRegexp( qr/science[-\s]*fiction[-\s]*serie/i,
 395                 [ 'series', 'SciFi' ] );
 396 $sm->AddRegexp( qr/barnprogram/i,          [ undef, "Children's" ] );
 397
 398 # Movies
 399 $sm->AddRegexp( qr/\b(familje|drama|action)*komedi\b/i,  [ 'movie', "Comedy" ] );
 400
 401 $sm->AddRegexp( qr/\b(krigs|kriminal)*drama\b/i,  [ 'movie', "Drama" ] );
 402
 403 $sm->AddRegexp( qr/\baction(drama|film)*\b/i,     [ 'movie', "Action/Adv" ] );
 404
 405 $sm->AddRegexp( qr/\b.ventyrsdrama\b/i,           [ 'movie', "Action/Adv" ] );
 406
 407 $sm->AddRegexp( qr/\bv.stern(film)*\b/i,          [ 'movie', undef ] );
 408
 409 $sm->AddRegexp( qr/\b(drama)*thriller\b/i,        [ 'movie', "Crime" ] );
 410
 411 $sm->AddRegexp( qr/\bscience\s*fiction(rysare)*\b/i, [ 'movie', "SciFi" ] );
 412
 413 $sm->AddRegexp( qr/\b(l.ng)*film\b/i,             [ 'movie', undef ] );
 414
 415
 416 sub ParseDescCatSwe
 417 {
 418   my( $desc ) = @_;
 419
 420   return (undef, undef) if not defined $desc;
 421
 422   my $res = $sm->Match( $desc );
 423   if( defined( $res ) )
 424   {
 425     return @{$res};
 426   }
 427   else
 428   {
 429     return (undef,undef);
 430   }
 431 }
 432
 433 sub FixProgrammeData
 434 {
 435   my( $d ) = @_;
 436
 437   $d->{title} =~ s/^s.songs+tart\s*:*\s*//gi;
 438   $d->{title} =~ s/^seriestart\s*:*\s*//gi;
 439   $d->{title} =~ s/^reprisstart\s*:*\s*//gi;
 440   $d->{title} =~ s/^programstart\s*:*\s*//gi;
 441
 442   $d->{title} =~ s/^s.songs*avslutning\s*:*\s*//gi;
 443   $d->{title} =~ s/^sista\s+delen\s*:*\s*//gi;
 444   $d->{title} =~ s/^sista\s+avsnittet\s*:*\s*//gi;
 445
 446   if( $d->{title} =~ s/^((matin.)|(fredagsbio))\s*:\s*//gi )
 447   {
 448     $d->{program_type} = 'movie';
 449     $d->{category} = 'Movies';
 450   }
 451
 452   # Set program_type to series if the entry has an episode-number
 453   # with a defined episode (i.e. second part),
 454   # but doesn't have a program_type.
 455   if( exists( $d->{episode} ) and defined( $d->{episode} ) and
 456       ($d->{episode} !~ /^\s*\.\s*\./) and
 457       ( (not defined($d->{program_type})) or ($d->{program_type} =~ /^\s*$/) ) )
 458   {
 459     $d->{program_type} = "series";
 460   }
 461 }
 462
 463 =pod
 464
 465 my $doc = ParseXml( $strref );
 466
 467 Parse an xml-string into an XML::LibXML document. Takes a reference to a
 468 string as the only reference.
 469
 470 =cut
 471
 472 my $xml;
 473
 474 sub ParseXml {
 475   my( $cref ) = @_;
 476
 477   if( not defined $xml ) {
 478     $xml = XML::LibXML->new;
 479     $xml->load_ext_dtd(0);
 480   }
 481
 482   my $doc;
 483   eval {
 484     $doc = $xml->parse_string($$cref);
 485   };
 486   if( $@ ne "" )   {
 487     error( "???: Failed to parse: $@" );
 488     return undef;
 489   }
 490
 491   return $doc;
 492 }
 493
 494 =pod
 495
 496 Parse a reference to an xml-string in xmltv-format into a reference to an
 497 array of hashes with programme-info.
 498
 499 =cut
 500
 501 sub ParseXmltv {
 502   my( $cref ) = @_;
 503
 504   my $doc = ParseXml( $cref );
 505   return undef if not defined $doc;
 506
 507   my @d;
 508
 509   # Find all "programme"-entries.
 510   my $ns = $doc->find( "//programme" );
 511   if( $ns->size() == 0 ) {
 512     return;
 513   }
 514
 515   foreach my $pgm ($ns->get_nodelist) {
 516     my $start = $pgm->findvalue( '@start' );
 517     my $start_dt = create_dt( $start );
 518
 519     my $stop = $pgm->findvalue( '@stop' );
 520     my $stop_dt = create_dt( $stop );
 521
 522     my $title = $pgm->findvalue( 'title' );
 523     my $subtitle = $pgm->findvalue( 'sub-title' );
 524
 525     my $desc = $pgm->findvalue( 'desc' );
 526     my $cat1 = $pgm->findvalue( 'category[1]' );
 527     my $cat2 = $pgm->findvalue( 'category[2]' );
 528     my $episode = $pgm->findvalue( 'episode-num[@system="xmltv_ns"]' );
 529     my $production_date = $pgm->findvalue( 'date' );
 530
 531     my $aspect = $pgm->findvalue( 'video/aspect' );
 532
 533     my @actors;
 534     my $ns = $pgm->find( ".//actor" );
 535
 536     foreach my $act ($ns->get_nodelist) {
 537       push @actors, $act->findvalue(".");
 538     }
 539
 540     my @directors;
 541     $ns = $pgm->find( ".//director" );
 542
 543     foreach my $dir ($ns->get_nodelist) {
 544       push @directors, $dir->findvalue(".");
 545     }
 546
 547     my %e = (
 548       start_dt => $start_dt,
 549       stop_dt => $stop_dt,
 550       title => $title,
 551       description => $desc,
 552     );
 553
 554     if( $subtitle =~ /\S/ ) {
 555       $e{subtitle} = $subtitle;
 556     }
 557
 558     if( $episode =~ /\S/ ) {
 559       $e{episode} = $episode;
 560     }
 561
 562     if( $cat1 =~ /^[a-z]/ ) {
 563       $e{program_type} = $cat1;
 564     }
 565     elsif( $cat1 =~ /^[A-Z]/ ) {
 566       $e{category} = $cat1;
 567     }
 568
 569     if( $cat2 =~ /^[a-z]/ ) {
 570       $e{program_type} = $cat2;
 571     }
 572     elsif( $cat2 =~ /^[A-Z]/ ) {
 573       $e{category} = $cat2;
 574     }
 575
 576     if( $production_date =~ /\S/ ) {
 577       $e{production_date} = "$production_date-01-01";
 578     }
 579
 580     if( $aspect =~ /\S/ ) {
 581       $e{aspect} = $aspect;
 582     }
 583
 584     if( scalar( @directors ) > 0 ) {
 585       $e{directors} = join ", ", @directors;
 586     }
 587
 588     if( scalar( @actors ) > 0 ) {
 589       $e{actors} = join ", ", @actors;
 590     }
 591
 592
 593     push @d, \%e;
 594   }
 595
 596   return \@d;
 597 }
 598
 599 sub create_dt
 600 {
 601   my( $datetime ) = @_;
 602
 603   my( $year, $month, $day, $hour, $minute, $second, $tz ) =
 604     ($datetime =~ /(\d{4})(\d{2})(\d{2})
 605                    (\d{2})(\d{2})(\d{2})\s+
 606                    (\S+)$/x);
 607
 608   my $dt = DateTime->new(
 609                           year => $year,
 610                           month => $month,
 611                           day => $day,
 612                           hour => $hour,
 613                           minute => $minute,
 614                           second => $second,
 615                           time_zone => $tz
 616                           );
 617
 618   return $dt;
 619 }
 620
 621 =pod
 622
 623 Convert month name to month number
 624
 625 =cut
 626
 627 sub MonthNumber {
 628   my( $monthname , $lang ) = @_;
 629
 630   my( @months_1, @months_2 );
 631
 632   if( $lang =~ /^en$/ ){
 633     @months_1 = qw/jan feb mar apr may jun jul aug sep oct nov dec/;
 634     @months_2 = qw/january february march april may june july august september october november december/;
 635   } elsif( $lang =~ /^hr$/ ){
 636     @months_1 = qw/sijecanj veljaca ozujak travanj svibanj lipanj srpanj kolovoz rujan listopad studeni prosinac/;
 637     @months_2 = qw/sijecnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenoga prosinca/;
 638   }
 639
 640   my %monthnames = ();
 641
 642   for( my $i = 0; $i < scalar(@months_1); $i++ ){
 643     $monthnames{$months_1[$i]} = $i+1;
 644   }
 645
 646   for( my $i = 0; $i < scalar(@months_2); $i++ ){
 647     $monthnames{$months_2[$i]} = $i+1;
 648   }
 649
 650   my $month = $monthnames{lc $monthname};
 651
 652   return $month;
 653 }
 654
 655 1;
 656
 657 ### Setup coding system
 658 ## Local Variables:
 659 ## coding: utf-8
 660 ## End: