lib/NonameTV/Importer/NovaTV.pm

   1 package NonameTV::Importer::NovaTV;
   2
   3 use strict;
   4 use warnings;
   5
   6 =pod
   7
   8 Import data from Word-files delivered via e-mail.  Each day
   9 is handled as a separate batch.
  10
  11 Features:
  12
  13 =cut
  14
  15 use utf8;
  16
  17 use DateTime;
  18 use XML::LibXML;
  19 #use Text::Capitalize qw/capitalize_title/;
  20
  21 use NonameTV qw/MyGet Wordfile2Xml Htmlfile2Xml norm AddCategory/;
  22 use NonameTV::DataStore::Helper;
  23 use NonameTV::Log qw/info progress error logdie
  24                      log_to_string log_to_string_result/;
  25
  26 use NonameTV::Importer::BaseFile;
  27
  28 use base 'NonameTV::Importer::BaseFile';
  29
  30 sub new {
  31   my $proto = shift;
  32   my $class = ref($proto) || $proto;
  33   my $self  = $class->SUPER::new( @_ );
  34   bless ($self, $class);
  35
  36   $self->{grabber_name} = "NovaTV";
  37
  38   my $dsh = NonameTV::DataStore::Helper->new( $self->{datastore} );
  39   $self->{datastorehelper} = $dsh;
  40
  41   return $self;
  42 }
  43
  44 sub ImportContentFile
  45 {
  46   my $self = shift;
  47   my( $file, $chd ) = @_;
  48
  49   if( $file !~ /program/i and $file !~ /izmjena/i and $file !~ /\.doc/ ) {
  50     progress( "NovaTV: Skipping unknown file $file" );
  51     return;
  52   }
  53
  54   progress( "NovaTV: Processing $file" );
  55
  56   $self->{fileerror} = 0;
  57
  58   my $xmltvid=$chd->{xmltvid};
  59   my $channel_id = $chd->{id};
  60   my $dsh = $self->{datastorehelper};
  61   my $ds = $self->{datastore};
  62
  63
  64   my $doc;
  65   $doc = Wordfile2Xml( $file );
  66
  67   if( not defined( $doc ) ) {
  68     error( "NovaTV $file: Failed to parse" );
  69     return;
  70   }
  71
  72   my @nodes = $doc->findnodes( '//span[@style="text-transform:uppercase"]/text()' );
  73   foreach my $node (@nodes) {
  74     my $str = $node->getData();
  75     $node->setData( uc( $str ) );
  76   }
  77
  78   # Find all paragraphs.
  79   my $ns = $doc->find( "//div" );
  80
  81   if( $ns->size() == 0 ) {
  82     error( "NovaTV $file: No divs found." ) ;
  83     return;
  84   }
  85
  86   my $currdate = undef;
  87   my $nowyear = DateTime->today->year();
  88   my $date;
  89   my @ces;
  90   my $targetshow;
  91   my $description;
  92   my $subtitle;
  93   my $directors;
  94   my $actors;
  95
  96   foreach my $div ($ns->get_nodelist) {
  97
  98     my( $text ) = norm( $div->findvalue( '.' ) );
  99
 100     #print "> $text\n";
 101
 102     if( $text eq "" ) {
 103       # blank line
 104     }
 105     elsif( $text =~ /^PROGRAM NOVE TV za/i ) {
 106       progress("NovaTV: OK, this is the file with the schedules: $file");
 107     }
 108     #elsif( $text =~ /^([[:upper:]]+) (\d+)\.(\d+)/ ) { # the line with the date in format 'MONDAY 12.4.'
 109     elsif( $text =~ /^(\S+) (\d+)\.(\d+)/ ) { # the line with the date in format 'MONDAY 12.4.'
 110
 111       $date = ParseDate( $text , $nowyear );
 112
 113       if( defined $date ) {
 114         progress("NovaTV: Date $date");
 115
 116         $dsh->EndBatch( 1 )
 117           if defined $currdate;
 118
 119         my $batch_id = "${xmltvid}_" . $date->ymd();
 120         $dsh->StartBatch( $batch_id, $channel_id );
 121         $dsh->StartDate( $date->ymd("-") , "07:00" );
 122         $currdate = $date;
 123       }
 124
 125       # save last day if we have it in memory
 126       if( @ces ){
 127         foreach my $element (@ces) {
 128
 129           progress("NovaTV: $element->{start_time} : $element->{title}");
 130
 131           $dsh->AddProgramme( $element );
 132         }
 133       }
 134
 135       # empty last day array
 136       undef @ces;
 137       undef $targetshow;
 138       undef $description;
 139       undef $subtitle;
 140       undef $directors;
 141       undef $actors;
 142     }
 143     elsif( $text =~ /^(\d+)\.(\d+) (\S+)/ ) { # the line with the show in format '19.30 Show title, genre'
 144
 145       my( $starttime, $title, $genre ) = ParseShow( $text , $date );
 146
 147       my $ce = {
 148         channel_id   => $chd->{id},
 149         start_time => $starttime->hms(":"),
 150         title => norm($title),
 151       };
 152
 153       if( $genre ){
 154         my($program_type, $category ) = $ds->LookupCat( 'NovaTV', $genre );
 155         AddCategory( $ce, $program_type, $category );
 156       }
 157
 158       # add the programme to the array
 159       # as we have to add description later
 160       push( @ces , $ce );
 161
 162     }
 163     elsif( isCroUcase( $text ) ) { # the line with description title in format 'ALL IN CAPS'
 164
 165       # if we have something in the description buffer
 166       # then this is for the last targetshow
 167       if( $targetshow and $description ){
 168         $targetshow->{description} = $description if defined $description;
 169         $targetshow->{subtitle} = $subtitle if defined $subtitle;
 170         $targetshow->{directors} = $directors if defined $directors;
 171         $targetshow->{actors} = $actors if defined $actors;
 172         undef $description;
 173         undef $subtitle;
 174         undef $directors;
 175         undef $actors;
 176       }
 177
 178       my $utext = utf8ucase( $text );
 179
 180       # find if we have the show with that name
 181       foreach my $element (@ces) {
 182
 183         my $utitle = utf8ucase( $element->{title} );
 184
 185         if( $utext eq $utitle ){
 186           $targetshow = $element;
 187           last;
 188         }
 189       }
 190     }
 191     else {
 192
 193       # if we know the target show then this is the description
 194       if( $targetshow ){
 195
 196         $description .= $text;
 197
 198         # subtitle if present in the first description line
 199         if( $text =~ /^\(.*\)/ ){
 200           $subtitle = $text;
 201         }
 202
 203         # subtitle if present in one text line
 204         if( $text =~ s/^Redatelj: // ){
 205           $directors = $text;
 206         }
 207
 208         # actor if present in the one text line
 209         if( $text =~ s/^Glume: // ){
 210           $actors = $text;
 211         }
 212
 213       } else {
 214         #error( "Ignoring $text" );
 215       }
 216     }
 217   }
 218   $dsh->EndBatch( 1 );
 219
 220   return;
 221 }
 222
 223 sub ParseDate {
 224   my( $text, $year ) = @_;
 225
 226   #my( $dayname, $day, $month ) = ($text =~ /([[:upper:]]+) (\d+)\.(\d+)/);
 227   my( $dayname, $day, $month ) = ($text =~ /(\S+) (\d+)\.(\d+)/);
 228
 229   my $dt = DateTime->new( year   => $year,
 230                           month  => $month,
 231                           day    => $day,
 232                           hour   => 0,
 233                           minute => 0,
 234                           second => 0,
 235                           time_zone => 'Europe/Zagreb',
 236   );
 237
 238   return $dt;
 239 }
 240
 241 sub ParseShow {
 242   my( $text, $date ) = @_;
 243   my( $title, $genre );
 244
 245   my( $hour, $min, $string ) = ($text =~ /(\d+)\.(\d+) (.*)/);
 246
 247   if( $string =~ /,/ ){
 248     ( $title, $genre ) = $string =~ m/(.*, )(.*)$/;
 249     if( $title ){
 250       $title =~ s/, $//;
 251     }
 252   }
 253   else
 254   {
 255     $title = $string;
 256   }
 257
 258   my $sdt = $date->clone()->add( hours => $hour , minutes => $min );
 259
 260   return( $sdt , $title , $genre );
 261 }
 262
 263 sub utf8ucase {
 264   my( $str ) = @_;
 265   my $newstr = $str;
 266
 267   $newstr =~ s/\xC4\x8D/\xC4\x8C/;      # tvrdo c
 268   $newstr =~ s/\xC4\x87/\xC4\x86/;      # meko c
 269   $newstr =~ s/\xC4\x91/\xC4\x90/;      # d
 270   $newstr =~ s/\xC5\xA1/\xC5\xA0/;      # s
 271   $newstr =~ s/\xC5\xBE/\xC5\xBD/;      # z
 272
 273   $newstr = uc($newstr);
 274
 275   return( $newstr );
 276 }
 277
 278 sub isCroUcase {
 279   my( $str ) = @_;
 280
 281   if( $str =~ /[[:lower:]]/ ){
 282     return 0;
 283   }
 284
 285   return 1;
 286 }
 287
 288 1;
 289
 290 ### Setup coding system
 291 ## Local Variables:
 292 ## coding: utf-8
 293 ## End: