Few fixes
[nonametv.git] / lib / NonameTV.pm
blob1f8e5b471d2e9015d22c91a4cfca69bb1243b658
1 package NonameTV;
3 use strict;
4 use warnings;
6 # Mark this source-file as encoded in utf-8.
7 use utf8;
8 use Env;
10 use LWP::UserAgent;
11 use File::Temp qw/tempfile tempdir/;
12 use File::Slurp;
14 use NonameTV::StringMatcher;
15 use NonameTV::Log qw/logdie error/;
16 use XML::LibXML;
18 BEGIN {
19 use Exporter ();
20 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
22 # set the version for version checking
23 $VERSION = 0.3;
25 @ISA = qw(Exporter);
26 @EXPORT = qw( );
27 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
28 @EXPORT_OK = qw/MyGet expand_entities
29 Html2Xml Htmlfile2Xml
30 Wordfile2HtmlTree Htmlfile2HtmlTree
31 Word2Xml Wordfile2Xml
32 File2Xml
33 FindParagraphs
34 norm AddCategory
35 ParseDescCatSwe FixProgrammeData
36 ParseXml ParseXmltv
37 MonthNumber/;
39 our @EXPORT_OK;
41 my $wvhtml = '/usr/bin/wvHtml --charset=utf8';
42 # my $wvhtml = '/usr/bin/wvHtml';
44 my $ua = LWP::UserAgent->new( agent => "Grabber from http://tv.swedb.se",
45 cookie_jar => {} );
47 # Fetch a url. Returns ($content, true) if data was fetched from server and
48 # different from the last time the same url was fetched, ($content, false) if
49 # it was fetched from the server and was the same as the last time it was
50 # fetched and (undef,$error_message) if there was an error fetching the data.
52 sub MyGet
54 my( $url ) = @_;
55 my $res = $ua->get( $url );
57 if( $res->is_success )
59 return ($res->content, not defined( $res->header( 'X-Content-Unchanged' ) ) );
61 else
63 return (undef, $res->status_line );
67 # åäö ÅÄÖ
68 my %ent = (
69 257 => 'ä',
70 231 => 'c', # This should really be a c with a special mark on it.
71 # Unicode 000E7, UTF-8 195 167.
72 337 => 'ö',
73 8211 => '-',
74 8212 => '--',
75 8216 => "'",
76 8217 => "'",
77 8220 => '"',
78 8221 => '"',
79 8230 => '...',
80 8364 => "(euro)",
83 sub _expand
85 my( $num, $str ) = @_;
87 if( not defined( $ent{$num} ) )
89 $ent{$num} = "";
90 print STDERR "Unknown entity $num in $str\n";
93 return $ent{$num};
96 sub expand_entities
98 my( $str ) = @_;
100 $str =~ s/\&#(\d+);/_expand($1,$str)/eg;
102 return $str;
105 =item Html2Xml( $content )
107 Convert the HTML in $content into an XML::LibXML::Document.
109 Prints an error-message to STDERR and returns undef if the conversion
110 fails.
112 =cut
114 sub Html2Xml
116 my( $html ) = @_;
117 my $xml = XML::LibXML->new;
118 $xml->recover(1);
120 # Stupid XML::LibXML writes to STDERR. Redirect it temporarily.
121 open(SAVERR, ">&STDERR"); # save the stderr fhandle
122 print SAVERR "Nothing\n" if 0;
123 open(STDERR,"> /dev/null");
125 # Remove character that makes the parser stop.
126 $html =~ s/\x00//g;
128 my $doc;
129 eval { $doc = $xml->parse_html_string($html); };
131 # Restore STDERR
132 open( STDERR, ">&SAVERR" );
134 if( $@ ne "" )
136 my ($package, $filename, $line) = caller;
137 print "parse_html_string failed: $@ when called from $filename:$line\n";
138 return undef;
141 return $doc;
144 =item Htmlfile2Xml( $filename )
146 Convert the HTML in a file into an XML::LibXML::Document.
148 Prints an error-message to STDERR and returns undef if the conversion
149 fails.
151 =cut
153 sub Htmlfile2Xml
155 my( $filename ) = @_;
157 my $html = read_file( $filename );
159 return Html2Xml( $html );
163 =item Word2Xml( $content )
165 Convert the Microsoft Word document in $content into html and return
166 the html as an XML::LibXML::Document.
168 Prints an error-message to STDERR and returns undef if the conversion
169 fails.
171 =cut
173 sub Word2Xml
175 my( $content ) = @_;
177 my( $fh, $filename ) = tempfile();
178 print $fh $content;
179 close( $fh );
181 my $doc = Wordfile2Xml( $filename );
182 unlink( $filename );
183 return $doc;
186 sub Wordfile2Xml
188 my( $filename ) = @_;
190 my $html = qx/$wvhtml "$filename" -/;
191 if( $? )
193 print "$wvhtml $filename - failed: $?\n";
194 return undef;
197 # Remove character that makes LibXML choke.
198 $html =~ s/\…/.../g;
200 return Html2Xml( $html );
203 sub File2Xml {
204 my( $filename ) = @_;
206 my $data = read_file( $filename );
207 my $doc;
208 if( $data =~ /^\<\!DOCTYPE HTML/ )
210 # This is an override that has already been run through wvHtml
211 $doc = Html2Xml( $data );
213 else
215 $doc = Word2Xml( $data );
218 return $doc;
221 =pod
223 FindParagraphs( $doc, $expr )
225 Finds all paragraphs in the part of an xml-tree that matches an
226 xpath-expression. Returns a reference to an array of strings.
227 All paragraphs are normalized and empty strings are removed from the
228 array.
230 Both <div> and <br> are taken into account when splitting the document
231 into paragraphs.
233 Use the expression '//body//.' for html-documents when you want to see
234 all paragraphs in the page.
236 =cut
238 my %paraelem = (
239 p => 1,
240 br => 1,
241 div => 1,
242 td => 1,
245 sub FindParagraphs {
246 my( $doc, $elements ) = @_;
248 my $ns = $doc->find( $elements );
250 my @paragraphs;
251 my $p = "";
253 foreach my $node ($ns->get_nodelist()) {
254 if( $node->nodeName eq "#text" ) {
255 $p .= $node->textContent();
257 elsif( defined $paraelem{ $node->nodeName } ) {
258 $p = norm( $p );
259 if( $p ne "" ) {
260 push @paragraphs, $p;
261 $p = "";
266 return \@paragraphs;
270 # Remove any strange quotation marks and other syntactic marks
271 # that we don't want to have. Remove leading and trailing space as well
272 # multiple whitespace characters.
273 # Returns the empty string if called with an undef string.
274 sub norm
276 my( $str ) = @_;
278 return "" if not defined( $str );
280 # Uncomment the code below and change the regexp to learn which
281 # character code perl thinks a certain character has.
282 # These codes can be used in \x{YY} expressions as shown below.
283 # if( $str =~ /unique string/ ) {
284 # for( my $i=0; $i < length( $str ); $i++ ) {
285 # printf( "%2x: %s\n", ord( substr( $str, $i, 1 ) ),
286 # substr( $str, $i, 1 ) );
290 $str = expand_entities( $str );
292 $str =~ tr/\x{96}\x{93}\x{94}/-""/; #
293 $str =~ tr/\x{201d}\x{201c}/""/;
294 $str =~ tr/\x{2013}\x{2019}/-'/;
295 $str =~ s/\x{85}/... /g;
296 $str =~ s/\x{2026}/.../sg;
297 $str =~ s/\x{2007}/ /sg;
299 $str =~ s/^\s+//;
300 $str =~ s/\s+$//;
301 $str =~ tr/\n\r\t / /s;
303 return $str;
306 # Generate HTML file in tempdir and run Htmlfile2HtmlTree
307 sub Wordfile2HtmlTree
309 my ($filename) = @_;
311 my $dir= tempdir( CLEANUP => 1 );
312 (my $htmlfile= "$filename.html") =~ s|.*/([^/]+)$|$1|;
313 if(system "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\"") {
314 print "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\" failed: $?\n";
315 return undef;
317 return &Htmlfile2HtmlTree("$dir/$htmlfile");
320 # Generate HTML::Tree from html file
321 sub Htmlfile2HtmlTree
323 my ($filename)= @_;
324 my $tree = HTML::TreeBuilder->new();
325 open(my $fh, "<:utf8", "$filename")
326 or logdie( "Failed to read from $filename" );
328 $tree->parse_file($fh);
330 return $tree;
333 =item AddCategory
335 Add program_type and category to an entry if the entry does not already
336 have a program_type and category.
338 AddCategory( $ce, $program_type, $category );
340 =cut
342 sub AddCategory
344 my( $ce, $program_type, $category ) = @_;
346 if( not defined( $ce->{program_type} ) and defined( $program_type )
347 and ( $program_type =~ /\S/ ) )
349 $ce->{program_type} = $program_type;
352 if( not defined( $ce->{category} ) and defined( $category )
353 and ( $category =~ /\S/ ) )
355 $ce->{category} = $category;
359 =item ParseDescCatSwe
361 Parse a program description in Swedish and return program_type
362 and category that can be deduced from the description.
364 my( $pty, $cat ) = ParseDescCatSwe( "Amerikansk äventyrsserie" );
366 =cut
368 my $sm = NonameTV::StringMatcher->new();
369 $sm->AddRegexp( qr/kriminalserie/i, [ 'series', 'Crime/Mystery' ] );
370 $sm->AddRegexp( qr/deckarserie/i, [ 'series', 'Crime/Mystery' ] );
371 $sm->AddRegexp( qr/polisserie/i, [ 'series', 'Crime/Mystery' ] );
372 $sm->AddRegexp( qr/familjeserie/i, [ 'series', undef ] );
373 $sm->AddRegexp( qr/tecknad serie/i, [ 'series', undef ] );
374 $sm->AddRegexp( qr/animerad serie/i, [ 'series', undef ] );
375 $sm->AddRegexp( qr/dramakomediserie/i, [ 'series', 'Comedy' ] );
376 $sm->AddRegexp( qr/dramaserie/i, [ 'series', 'Drama' ] );
377 $sm->AddRegexp( qr/resedokumentärserie/i,[ 'series', 'Food/Travel' ] );
378 $sm->AddRegexp( qr/komediserie/i, [ 'series', 'Comedy' ] );
379 $sm->AddRegexp( qr/realityserie/i, [ 'series', 'Reality' ] );
380 $sm->AddRegexp( qr/realityshow/i, [ 'series', 'Reality' ] );
381 $sm->AddRegexp( qr/dokusåpa/i, [ 'series', 'Reality' ] );
382 $sm->AddRegexp( qr/actiondramaserie/i, [ 'series', 'Action' ] );
383 $sm->AddRegexp( qr/actionserie/i, [ 'series', 'Action' ] );
384 $sm->AddRegexp( qr/underhållningsserie/i,[ 'series', undef ] );
385 $sm->AddRegexp( qr/äventyrsserie/i, [ 'series', 'Action/Adv' ] );
386 $sm->AddRegexp( qr/äventyrskomediserie/i,[ 'series', 'Comedy' ] );
387 $sm->AddRegexp( qr/dokumentärserie/i, [ 'series', 'Documentary' ] );
388 $sm->AddRegexp( qr/dramadokumentär/i, [ undef, 'Documentary' ] );
390 $sm->AddRegexp( qr/barnserie/i, [ 'series', "Children's" ] );
391 $sm->AddRegexp( qr/matlagningsserie/i, [ 'series', 'Cooking' ] );
392 $sm->AddRegexp( qr/motorserie/i, [ 'series', undef ] );
393 $sm->AddRegexp( qr/fixarserie/i, [ 'series', "Home/How-to" ] );
394 $sm->AddRegexp( qr/science[-\s]*fiction[-\s]*serie/i,
395 [ 'series', 'SciFi' ] );
396 $sm->AddRegexp( qr/barnprogram/i, [ undef, "Children's" ] );
398 # Movies
399 $sm->AddRegexp( qr/\b(familje|drama|action)*komedi\b/i, [ 'movie', "Comedy" ] );
401 $sm->AddRegexp( qr/\b(krigs|kriminal)*drama\b/i, [ 'movie', "Drama" ] );
403 $sm->AddRegexp( qr/\baction(drama|film)*\b/i, [ 'movie', "Action/Adv" ] );
405 $sm->AddRegexp( qr/\b.ventyrsdrama\b/i, [ 'movie', "Action/Adv" ] );
407 $sm->AddRegexp( qr/\bv.stern(film)*\b/i, [ 'movie', undef ] );
409 $sm->AddRegexp( qr/\b(drama)*thriller\b/i, [ 'movie', "Crime" ] );
411 $sm->AddRegexp( qr/\bscience\s*fiction(rysare)*\b/i, [ 'movie', "SciFi" ] );
413 $sm->AddRegexp( qr/\b(l.ng)*film\b/i, [ 'movie', undef ] );
416 sub ParseDescCatSwe
418 my( $desc ) = @_;
420 return (undef, undef) if not defined $desc;
422 my $res = $sm->Match( $desc );
423 if( defined( $res ) )
425 return @{$res};
427 else
429 return (undef,undef);
433 sub FixProgrammeData
435 my( $d ) = @_;
437 $d->{title} =~ s/^s.songs+tart\s*:*\s*//gi;
438 $d->{title} =~ s/^seriestart\s*:*\s*//gi;
439 $d->{title} =~ s/^reprisstart\s*:*\s*//gi;
440 $d->{title} =~ s/^programstart\s*:*\s*//gi;
442 $d->{title} =~ s/^s.songs*avslutning\s*:*\s*//gi;
443 $d->{title} =~ s/^sista\s+delen\s*:*\s*//gi;
444 $d->{title} =~ s/^sista\s+avsnittet\s*:*\s*//gi;
446 if( $d->{title} =~ s/^((matin.)|(fredagsbio))\s*:\s*//gi )
448 $d->{program_type} = 'movie';
449 $d->{category} = 'Movies';
452 # Set program_type to series if the entry has an episode-number
453 # with a defined episode (i.e. second part),
454 # but doesn't have a program_type.
455 if( exists( $d->{episode} ) and defined( $d->{episode} ) and
456 ($d->{episode} !~ /^\s*\.\s*\./) and
457 ( (not defined($d->{program_type})) or ($d->{program_type} =~ /^\s*$/) ) )
459 $d->{program_type} = "series";
463 =pod
465 my $doc = ParseXml( $strref );
467 Parse an xml-string into an XML::LibXML document. Takes a reference to a
468 string as the only reference.
470 =cut
472 my $xml;
474 sub ParseXml {
475 my( $cref ) = @_;
477 if( not defined $xml ) {
478 $xml = XML::LibXML->new;
479 $xml->load_ext_dtd(0);
482 my $doc;
483 eval {
484 $doc = $xml->parse_string($$cref);
486 if( $@ ne "" ) {
487 error( "???: Failed to parse: $@" );
488 return undef;
491 return $doc;
494 =pod
496 Parse a reference to an xml-string in xmltv-format into a reference to an
497 array of hashes with programme-info.
499 =cut
501 sub ParseXmltv {
502 my( $cref ) = @_;
504 my $doc = ParseXml( $cref );
505 return undef if not defined $doc;
507 my @d;
509 # Find all "programme"-entries.
510 my $ns = $doc->find( "//programme" );
511 if( $ns->size() == 0 ) {
512 return;
515 foreach my $pgm ($ns->get_nodelist) {
516 my $start = $pgm->findvalue( '@start' );
517 my $start_dt = create_dt( $start );
519 my $stop = $pgm->findvalue( '@stop' );
520 my $stop_dt = create_dt( $stop );
522 my $title = $pgm->findvalue( 'title' );
523 my $subtitle = $pgm->findvalue( 'sub-title' );
525 my $desc = $pgm->findvalue( 'desc' );
526 my $cat1 = $pgm->findvalue( 'category[1]' );
527 my $cat2 = $pgm->findvalue( 'category[2]' );
528 my $episode = $pgm->findvalue( 'episode-num[@system="xmltv_ns"]' );
529 my $production_date = $pgm->findvalue( 'date' );
531 my $aspect = $pgm->findvalue( 'video/aspect' );
533 my @actors;
534 my $ns = $pgm->find( ".//actor" );
536 foreach my $act ($ns->get_nodelist) {
537 push @actors, $act->findvalue(".");
540 my @directors;
541 $ns = $pgm->find( ".//director" );
543 foreach my $dir ($ns->get_nodelist) {
544 push @directors, $dir->findvalue(".");
547 my %e = (
548 start_dt => $start_dt,
549 stop_dt => $stop_dt,
550 title => $title,
551 description => $desc,
554 if( $subtitle =~ /\S/ ) {
555 $e{subtitle} = $subtitle;
558 if( $episode =~ /\S/ ) {
559 $e{episode} = $episode;
562 if( $cat1 =~ /^[a-z]/ ) {
563 $e{program_type} = $cat1;
565 elsif( $cat1 =~ /^[A-Z]/ ) {
566 $e{category} = $cat1;
569 if( $cat2 =~ /^[a-z]/ ) {
570 $e{program_type} = $cat2;
572 elsif( $cat2 =~ /^[A-Z]/ ) {
573 $e{category} = $cat2;
576 if( $production_date =~ /\S/ ) {
577 $e{production_date} = "$production_date-01-01";
580 if( $aspect =~ /\S/ ) {
581 $e{aspect} = $aspect;
584 if( scalar( @directors ) > 0 ) {
585 $e{directors} = join ", ", @directors;
588 if( scalar( @actors ) > 0 ) {
589 $e{actors} = join ", ", @actors;
593 push @d, \%e;
596 return \@d;
599 sub create_dt
601 my( $datetime ) = @_;
603 my( $year, $month, $day, $hour, $minute, $second, $tz ) =
604 ($datetime =~ /(\d{4})(\d{2})(\d{2})
605 (\d{2})(\d{2})(\d{2})\s+
606 (\S+)$/x);
608 my $dt = DateTime->new(
609 year => $year,
610 month => $month,
611 day => $day,
612 hour => $hour,
613 minute => $minute,
614 second => $second,
615 time_zone => $tz
618 return $dt;
621 =pod
623 Convert month name to month number
625 =cut
627 sub MonthNumber {
628 my( $monthname , $lang ) = @_;
630 my( @months_1, @months_2 );
632 if( $lang =~ /^en$/ ){
633 @months_1 = qw/jan feb mar apr may jun jul aug sep oct nov dec/;
634 @months_2 = qw/january february march april may june july august september october november december/;
635 } elsif( $lang =~ /^hr$/ ){
636 @months_1 = qw/sijecanj veljaca ozujak travanj svibanj lipanj srpanj kolovoz rujan listopad studeni prosinac/;
637 @months_2 = qw/sijecnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenoga prosinca/;
640 my %monthnames = ();
642 for( my $i = 0; $i < scalar(@months_1); $i++ ){
643 $monthnames{$months_1[$i]} = $i+1;
646 for( my $i = 0; $i < scalar(@months_2); $i++ ){
647 $monthnames{$months_2[$i]} = $i+1;
650 my $month = $monthnames{lc $monthname};
652 return $month;
657 ### Setup coding system
658 ## Local Variables:
659 ## coding: utf-8
660 ## End: