6 # Mark this source-file as encoded in utf-8.
11 use File
::Temp qw
/tempfile tempdir/;
14 use NonameTV
::StringMatcher
;
15 use NonameTV
::Log qw
/logdie error/;
20 our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
22 # set the version for version checking
27 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
28 @EXPORT_OK = qw
/MyGet expand_entities
30 Wordfile2HtmlTree Htmlfile2HtmlTree
35 ParseDescCatSwe FixProgrammeData
41 my $wvhtml = '/usr/bin/wvHtml --charset=utf8';
42 # my $wvhtml = '/usr/bin/wvHtml';
44 my $ua = LWP
::UserAgent
->new( agent
=> "Grabber from http://tv.swedb.se",
47 # Fetch a url. Returns ($content, true) if data was fetched from server and
48 # different from the last time the same url was fetched, ($content, false) if
49 # it was fetched from the server and was the same as the last time it was
50 # fetched and (undef,$error_message) if there was an error fetching the data.
55 my $res = $ua->get( $url );
57 if( $res->is_success )
59 return ($res->content, not defined( $res->header( 'X-Content-Unchanged' ) ) );
63 return (undef, $res->status_line );
70 231 => 'c', # This should really be a c with a special mark on it.
71 # Unicode 000E7, UTF-8 195 167.
85 my( $num, $str ) = @_;
87 if( not defined( $ent{$num} ) )
90 print STDERR
"Unknown entity $num in $str\n";
100 $str =~ s/\&#(\d+);/_expand($1,$str)/eg;
105 =item Html2Xml( $content )
107 Convert the HTML in $content into an XML::LibXML::Document.
109 Prints an error-message to STDERR and returns undef if the conversion
117 my $xml = XML
::LibXML
->new;
120 # Stupid XML::LibXML writes to STDERR. Redirect it temporarily.
121 open(SAVERR
, ">&STDERR"); # save the stderr fhandle
122 print SAVERR
"Nothing\n" if 0;
123 open(STDERR
,"> /dev/null");
125 # Remove character that makes the parser stop.
129 eval { $doc = $xml->parse_html_string($html); };
132 open( STDERR
, ">&SAVERR" );
136 my ($package, $filename, $line) = caller;
137 print "parse_html_string failed: $@ when called from $filename:$line\n";
144 =item Htmlfile2Xml( $filename )
146 Convert the HTML in a file into an XML::LibXML::Document.
148 Prints an error-message to STDERR and returns undef if the conversion
155 my( $filename ) = @_;
157 my $html = read_file
( $filename );
159 return Html2Xml
( $html );
163 =item Word2Xml( $content )
165 Convert the Microsoft Word document in $content into html and return
166 the html as an XML::LibXML::Document.
168 Prints an error-message to STDERR and returns undef if the conversion
177 my( $fh, $filename ) = tempfile
();
181 my $doc = Wordfile2Xml
( $filename );
188 my( $filename ) = @_;
190 my $html = qx/$wvhtml "$filename" -/;
193 print "$wvhtml $filename - failed: $?\n";
197 # Remove character that makes LibXML choke.
198 $html =~ s/\…/.../g;
200 return Html2Xml
( $html );
204 my( $filename ) = @_;
206 my $data = read_file
( $filename );
208 if( $data =~ /^\<\!DOCTYPE HTML/ )
210 # This is an override that has already been run through wvHtml
211 $doc = Html2Xml
( $data );
215 $doc = Word2Xml
( $data );
223 FindParagraphs( $doc, $expr )
225 Finds all paragraphs in the part of an xml-tree that matches an
226 xpath-expression. Returns a reference to an array of strings.
227 All paragraphs are normalized and empty strings are removed from the
230 Both <div> and <br> are taken into account when splitting the document
233 Use the expression '//body//.' for html-documents when you want to see
234 all paragraphs in the page.
246 my( $doc, $elements ) = @_;
248 my $ns = $doc->find( $elements );
253 foreach my $node ($ns->get_nodelist()) {
254 if( $node->nodeName eq "#text" ) {
255 $p .= $node->textContent();
257 elsif( defined $paraelem{ $node->nodeName } ) {
260 push @paragraphs, $p;
270 # Remove any strange quotation marks and other syntactic marks
271 # that we don't want to have. Remove leading and trailing space as well
272 # multiple whitespace characters.
273 # Returns the empty string if called with an undef string.
278 return "" if not defined( $str );
280 # Uncomment the code below and change the regexp to learn which
281 # character code perl thinks a certain character has.
282 # These codes can be used in \x{YY} expressions as shown below.
283 # if( $str =~ /unique string/ ) {
284 # for( my $i=0; $i < length( $str ); $i++ ) {
285 # printf( "%2x: %s\n", ord( substr( $str, $i, 1 ) ),
286 # substr( $str, $i, 1 ) );
290 $str = expand_entities
( $str );
292 $str =~ tr/\x{96}\x{93}\x{94}/-""/; #
293 $str =~ tr/\x{201d}\x{201c}/""/;
294 $str =~ tr/\x{2013}\x{2019}/-'/;
295 $str =~ s/\x{85}/... /g;
296 $str =~ s/\x{2026}/.../sg;
297 $str =~ s/\x{2007}/ /sg;
301 $str =~ tr/\n\r\t / /s;
306 # Generate HTML file in tempdir and run Htmlfile2HtmlTree
307 sub Wordfile2HtmlTree
311 my $dir= tempdir
( CLEANUP
=> 1 );
312 (my $htmlfile= "$filename.html") =~ s
|.*/([^/]+)$|$1|;
313 if(system "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\"") {
314 print "$wvhtml --targetdir=\"$dir\" \"$filename\" \"$htmlfile\" failed: $?\n";
317 return &Htmlfile2HtmlTree
("$dir/$htmlfile");
320 # Generate HTML::Tree from html file
321 sub Htmlfile2HtmlTree
324 my $tree = HTML
::TreeBuilder
->new();
325 open(my $fh, "<:utf8", "$filename")
326 or logdie
( "Failed to read from $filename" );
328 $tree->parse_file($fh);
335 Add program_type and category to an entry if the entry does not already
336 have a program_type and category.
338 AddCategory( $ce, $program_type, $category );
344 my( $ce, $program_type, $category ) = @_;
346 if( not defined( $ce->{program_type
} ) and defined( $program_type )
347 and ( $program_type =~ /\S/ ) )
349 $ce->{program_type
} = $program_type;
352 if( not defined( $ce->{category
} ) and defined( $category )
353 and ( $category =~ /\S/ ) )
355 $ce->{category
} = $category;
359 =item ParseDescCatSwe
361 Parse a program description in Swedish and return program_type
362 and category that can be deduced from the description.
364 my( $pty, $cat ) = ParseDescCatSwe( "Amerikansk äventyrsserie" );
368 my $sm = NonameTV
::StringMatcher
->new();
369 $sm->AddRegexp( qr/kriminalserie/i, [ 'series', 'Crime/Mystery' ] );
370 $sm->AddRegexp( qr/deckarserie/i, [ 'series', 'Crime/Mystery' ] );
371 $sm->AddRegexp( qr/polisserie/i, [ 'series', 'Crime/Mystery' ] );
372 $sm->AddRegexp( qr/familjeserie/i, [ 'series', undef ] );
373 $sm->AddRegexp( qr/tecknad serie/i, [ 'series', undef ] );
374 $sm->AddRegexp( qr/animerad serie/i, [ 'series', undef ] );
375 $sm->AddRegexp( qr/dramakomediserie/i, [ 'series', 'Comedy' ] );
376 $sm->AddRegexp( qr/dramaserie/i, [ 'series', 'Drama' ] );
377 $sm->AddRegexp( qr/resedokumentärserie/i,[ 'series', 'Food/Travel' ] );
378 $sm->AddRegexp( qr/komediserie/i, [ 'series', 'Comedy' ] );
379 $sm->AddRegexp( qr/realityserie/i, [ 'series', 'Reality' ] );
380 $sm->AddRegexp( qr/realityshow/i, [ 'series', 'Reality' ] );
381 $sm->AddRegexp( qr/dokusåpa/i, [ 'series', 'Reality' ] );
382 $sm->AddRegexp( qr/actiondramaserie/i, [ 'series', 'Action' ] );
383 $sm->AddRegexp( qr/actionserie/i, [ 'series', 'Action' ] );
384 $sm->AddRegexp( qr/underhållningsserie/i,[ 'series', undef ] );
385 $sm->AddRegexp( qr/äventyrsserie/i, [ 'series', 'Action/Adv' ] );
386 $sm->AddRegexp( qr/äventyrskomediserie/i,[ 'series', 'Comedy' ] );
387 $sm->AddRegexp( qr/dokumentärserie/i, [ 'series', 'Documentary' ] );
388 $sm->AddRegexp( qr/dramadokumentär/i, [ undef, 'Documentary' ] );
390 $sm->AddRegexp( qr/barnserie/i, [ 'series', "Children's" ] );
391 $sm->AddRegexp( qr/matlagningsserie/i, [ 'series', 'Cooking' ] );
392 $sm->AddRegexp( qr/motorserie/i, [ 'series', undef ] );
393 $sm->AddRegexp( qr/fixarserie/i, [ 'series', "Home/How-to" ] );
394 $sm->AddRegexp( qr/science[-\s]*fiction[-\s]*serie/i,
395 [ 'series', 'SciFi' ] );
396 $sm->AddRegexp( qr/barnprogram/i, [ undef, "Children's" ] );
399 $sm->AddRegexp( qr/\b(familje|drama|action)*komedi\b/i, [ 'movie', "Comedy" ] );
401 $sm->AddRegexp( qr/\b(krigs|kriminal)*drama\b/i, [ 'movie', "Drama" ] );
403 $sm->AddRegexp( qr/\baction(drama|film)*\b/i, [ 'movie', "Action/Adv" ] );
405 $sm->AddRegexp( qr/\b.ventyrsdrama\b/i, [ 'movie', "Action/Adv" ] );
407 $sm->AddRegexp( qr/\bv.stern(film)*\b/i, [ 'movie', undef ] );
409 $sm->AddRegexp( qr/\b(drama)*thriller\b/i, [ 'movie', "Crime" ] );
411 $sm->AddRegexp( qr/\bscience\s*fiction(rysare)*\b/i, [ 'movie', "SciFi" ] );
413 $sm->AddRegexp( qr/\b(l.ng)*film\b/i, [ 'movie', undef ] );
420 return (undef, undef) if not defined $desc;
422 my $res = $sm->Match( $desc );
423 if( defined( $res ) )
429 return (undef,undef);
437 $d->{title
} =~ s/^s.songs+tart\s*:*\s*//gi;
438 $d->{title
} =~ s/^seriestart\s*:*\s*//gi;
439 $d->{title
} =~ s/^reprisstart\s*:*\s*//gi;
440 $d->{title
} =~ s/^programstart\s*:*\s*//gi;
442 $d->{title
} =~ s/^s.songs*avslutning\s*:*\s*//gi;
443 $d->{title
} =~ s/^sista\s+delen\s*:*\s*//gi;
444 $d->{title
} =~ s/^sista\s+avsnittet\s*:*\s*//gi;
446 if( $d->{title
} =~ s/^((matin.)|(fredagsbio))\s*:\s*//gi )
448 $d->{program_type
} = 'movie';
449 $d->{category
} = 'Movies';
452 # Set program_type to series if the entry has an episode-number
453 # with a defined episode (i.e. second part),
454 # but doesn't have a program_type.
455 if( exists( $d->{episode
} ) and defined( $d->{episode
} ) and
456 ($d->{episode
} !~ /^\s*\.\s*\./) and
457 ( (not defined($d->{program_type
})) or ($d->{program_type
} =~ /^\s*$/) ) )
459 $d->{program_type
} = "series";
465 my $doc = ParseXml( $strref );
467 Parse an xml-string into an XML::LibXML document. Takes a reference to a
468 string as the only reference.
477 if( not defined $xml ) {
478 $xml = XML
::LibXML
->new;
479 $xml->load_ext_dtd(0);
484 $doc = $xml->parse_string($$cref);
487 error
( "???: Failed to parse: $@" );
496 Parse a reference to an xml-string in xmltv-format into a reference to an
497 array of hashes with programme-info.
504 my $doc = ParseXml
( $cref );
505 return undef if not defined $doc;
509 # Find all "programme"-entries.
510 my $ns = $doc->find( "//programme" );
511 if( $ns->size() == 0 ) {
515 foreach my $pgm ($ns->get_nodelist) {
516 my $start = $pgm->findvalue( '@start' );
517 my $start_dt = create_dt
( $start );
519 my $stop = $pgm->findvalue( '@stop' );
520 my $stop_dt = create_dt
( $stop );
522 my $title = $pgm->findvalue( 'title' );
523 my $subtitle = $pgm->findvalue( 'sub-title' );
525 my $desc = $pgm->findvalue( 'desc' );
526 my $cat1 = $pgm->findvalue( 'category[1]' );
527 my $cat2 = $pgm->findvalue( 'category[2]' );
528 my $episode = $pgm->findvalue( 'episode-num[@system="xmltv_ns"]' );
529 my $production_date = $pgm->findvalue( 'date' );
531 my $aspect = $pgm->findvalue( 'video/aspect' );
534 my $ns = $pgm->find( ".//actor" );
536 foreach my $act ($ns->get_nodelist) {
537 push @actors, $act->findvalue(".");
541 $ns = $pgm->find( ".//director" );
543 foreach my $dir ($ns->get_nodelist) {
544 push @directors, $dir->findvalue(".");
548 start_dt
=> $start_dt,
551 description
=> $desc,
554 if( $subtitle =~ /\S/ ) {
555 $e{subtitle
} = $subtitle;
558 if( $episode =~ /\S/ ) {
559 $e{episode
} = $episode;
562 if( $cat1 =~ /^[a-z]/ ) {
563 $e{program_type
} = $cat1;
565 elsif( $cat1 =~ /^[A-Z]/ ) {
566 $e{category
} = $cat1;
569 if( $cat2 =~ /^[a-z]/ ) {
570 $e{program_type
} = $cat2;
572 elsif( $cat2 =~ /^[A-Z]/ ) {
573 $e{category
} = $cat2;
576 if( $production_date =~ /\S/ ) {
577 $e{production_date
} = "$production_date-01-01";
580 if( $aspect =~ /\S/ ) {
581 $e{aspect
} = $aspect;
584 if( scalar( @directors ) > 0 ) {
585 $e{directors
} = join ", ", @directors;
588 if( scalar( @actors ) > 0 ) {
589 $e{actors
} = join ", ", @actors;
601 my( $datetime ) = @_;
603 my( $year, $month, $day, $hour, $minute, $second, $tz ) =
604 ($datetime =~ /(\d
{4})(\d
{2})(\d
{2})
605 (\d
{2})(\d
{2})(\d
{2})\s
+
608 my $dt = DateTime
->new(
623 Convert month name to month number
628 my( $monthname , $lang ) = @_;
630 my( @months_1, @months_2 );
632 if( $lang =~ /^en$/ ){
633 @months_1 = qw
/jan feb mar apr may jun jul aug sep oct nov dec/;
634 @months_2 = qw
/january february march april may june july august september october november december/;
635 } elsif( $lang =~ /^hr$/ ){
636 @months_1 = qw
/sijecanj veljaca ozujak travanj svibanj lipanj srpanj kolovoz rujan listopad studeni prosinac/;
637 @months_2 = qw
/sijecnja veljace ozujka travnja svibnja lipnja srpnja kolovoza rujna listopada studenoga prosinca/;
642 for( my $i = 0; $i < scalar(@months_1); $i++ ){
643 $monthnames{$months_1[$i]} = $i+1;
646 for( my $i = 0; $i < scalar(@months_2); $i++ ){
647 $monthnames{$months_2[$i]} = $i+1;
650 my $month = $monthnames{lc $monthname};
657 ### Setup coding system