C4/Matcher.pm

   1 package C4::Matcher;
   2
   3 # Copyright (C) 2007 LibLime, 2012 C & P Bibliography Services
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 use strict;
  21 use warnings;
  22
  23 use C4::Context;
  24 use MARC::Record;
  25
  26 use vars qw($VERSION);
  27
  28 BEGIN {
  29         # set the version for version checking
  30     $VERSION = 3.07.00.049;
  31 }
  32
  33 =head1 NAME
  34
  35 C4::Matcher - find MARC records matching another one
  36
  37 =head1 SYNOPSIS
  38
  39   my @matchers = C4::Matcher::GetMatcherList();
  40
  41   my $matcher = C4::Matcher->new($record_type);
  42   $matcher->threshold($threshold);
  43   $matcher->code($code);
  44   $matcher->description($description);
  45
  46   $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, '');
  47   $matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, '');
  48   $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);
  49
  50   $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
  51   $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ],
  52                                [ { tag => '245', subfields => 'a', norms => [] } ]);
  53
  54   my @matches = $matcher->get_matches($marc_record, $max_matches);
  55
  56   foreach $match (@matches) {
  57
  58       # matches already sorted in order of
  59       # decreasing score
  60       print "record ID: $match->{'record_id'};
  61       print "score:     $match->{'score'};
  62
  63   }
  64
  65   my $matcher_description = $matcher->dump();
  66
  67 =head1 FUNCTIONS
  68
  69 =cut
  70
  71 =head2 GetMatcherList
  72
  73   my @matchers = C4::Matcher::GetMatcherList();
  74
  75 Returns an array of hashrefs list all matchers
  76 present in the database.  Each hashref includes:
  77
  78  * matcher_id
  79  * code
  80  * description
  81
  82 =cut
  83
  84 sub GetMatcherList {
  85     my $dbh = C4::Context->dbh;
  86
  87     my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id");
  88     $sth->execute();
  89     my @results = ();
  90     while (my $row = $sth->fetchrow_hashref) {
  91         push @results, $row;
  92     }
  93     return @results;
  94 }
  95
  96 =head2 GetMatcherId
  97
  98   my $matcher_id = C4::Matcher::GetMatcherId($code);
  99
 100 Returns the matcher_id of a code.
 101
 102 =cut
 103
 104 sub GetMatcherId {
 105     my ($code) = @_;
 106     my $dbh = C4::Context->dbh;
 107
 108     my $matcher_id = $dbh->selectrow_array("SELECT matcher_id FROM marc_matchers WHERE code = ?", undef, $code);
 109     return $matcher_id;
 110 }
 111
 112 =head1 METHODS
 113
 114 =head2 new
 115
 116   my $matcher = C4::Matcher->new($record_type, $threshold);
 117
 118 Creates a new Matcher.  C<$record_type> indicates which search
 119 database to use, e.g., 'biblio' or 'authority' and defaults to
 120 'biblio', while C<$threshold> is the minimum score required for a match
 121 and defaults to 1000.
 122
 123 =cut
 124
 125 sub new {
 126     my $class = shift;
 127     my $self = {};
 128
 129     $self->{'id'} = undef;
 130
 131     if ($#_ > -1) {
 132         $self->{'record_type'} = shift;
 133     } else {
 134         $self->{'record_type'} = 'biblio';
 135     }
 136
 137     if ($#_ > -1) {
 138         $self->{'threshold'} = shift;
 139     } else {
 140         $self->{'threshold'} = 1000;
 141     }
 142
 143     $self->{'code'} = '';
 144     $self->{'description'} = '';
 145
 146     $self->{'matchpoints'} = [];
 147     $self->{'required_checks'} = [];
 148
 149     bless $self, $class;
 150     return $self;
 151 }
 152
 153 =head2 fetch
 154
 155   my $matcher = C4::Matcher->fetch($id);
 156
 157 Creates a matcher object from the version stored
 158 in the database.  If a matcher with the given
 159 id does not exist, returns undef.
 160
 161 =cut
 162
 163 sub fetch {
 164     my $class = shift;
 165     my $id = shift;
 166     my $dbh = C4::Context->dbh();
 167
 168     my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?");
 169     $sth->execute($id);
 170     my $row = $sth->fetchrow_hashref;
 171     $sth->finish();
 172     return undef unless defined $row;
 173
 174     my $self = {};
 175     $self->{'id'} = $row->{'matcher_id'};
 176     $self->{'record_type'} = $row->{'record_type'};
 177     $self->{'code'} = $row->{'code'};
 178     $self->{'description'} = $row->{'description'};
 179     $self->{'threshold'} = int($row->{'threshold'});
 180     bless $self, $class;
 181
 182     # matchpoints
 183     $self->{'matchpoints'} = [];
 184     $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id");
 185     $sth->execute($self->{'id'});
 186     while (my $row = $sth->fetchrow_hashref) {
 187         my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'});
 188         push @{ $self->{'matchpoints'} }, $matchpoint;
 189     }
 190
 191     # required checks
 192     $self->{'required_checks'} = [];
 193     $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id");
 194     $sth->execute($self->{'id'});
 195     while (my $row = $sth->fetchrow_hashref) {
 196         my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'});
 197         my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'});
 198         my $matchcheck = {};
 199         $matchcheck->{'source_matchpoint'} = $source_matchpoint;
 200         $matchcheck->{'target_matchpoint'} = $target_matchpoint;
 201         push @{ $self->{'required_checks'} }, $matchcheck;
 202     }
 203
 204     return $self;
 205 }
 206
 207 sub _fetch_matchpoint {
 208     my $self = shift;
 209     my $matchpoint_id = shift;
 210
 211     my $dbh = C4::Context->dbh;
 212     my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?");
 213     $sth->execute($matchpoint_id);
 214     my $row = $sth->fetchrow_hashref;
 215     my $matchpoint = {};
 216     $matchpoint->{'index'} = $row->{'search_index'};
 217     $matchpoint->{'score'} = int($row->{'score'});
 218     $sth->finish();
 219
 220     $matchpoint->{'components'} = [];
 221     $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence");
 222     $sth->execute($matchpoint_id);
 223     while ($row = $sth->fetchrow_hashref) {
 224         my $component = {};
 225         $component->{'tag'} = $row->{'tag'};
 226         $component->{'subfields'} = { map { $_ => 1 } split(//,  $row->{'subfields'}) };
 227         $component->{'offset'} = int($row->{'offset'});
 228         $component->{'length'} = int($row->{'length'});
 229         $component->{'norms'} = [];
 230         my $sth2 = $dbh->prepare_cached("SELECT *
 231                                          FROM matchpoint_component_norms
 232                                          WHERE matchpoint_component_id = ? ORDER BY sequence");
 233         $sth2->execute($row->{'matchpoint_component_id'});
 234         while (my $row2 = $sth2->fetchrow_hashref) {
 235             push @{ $component->{'norms'} }, $row2->{'norm_routine'};
 236         }
 237         push @{ $matchpoint->{'components'} }, $component;
 238     }
 239     return $matchpoint;
 240 }
 241
 242 =head2 store
 243
 244   my $id = $matcher->store();
 245
 246 Stores matcher in database.  The return value is the ID
 247 of the marc_matchers row.  If the matcher was
 248 previously retrieved from the database via the fetch()
 249 method, the DB representation of the matcher
 250 is replaced.
 251
 252 =cut
 253
 254 sub store {
 255     my $self = shift;
 256
 257     if (defined $self->{'id'}) {
 258         # update
 259         $self->_del_matcher_components();
 260         $self->_update_marc_matchers();
 261     } else {
 262         # create new
 263         $self->_new_marc_matchers();
 264     }
 265     $self->_store_matcher_components();
 266     return $self->{'id'};
 267 }
 268
 269 sub _del_matcher_components {
 270     my $self = shift;
 271
 272     my $dbh = C4::Context->dbh();
 273     my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?");
 274     $sth->execute($self->{'id'});
 275     $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?");
 276     $sth->execute($self->{'id'});
 277     # foreign key delete cascades take care of deleting relevant rows
 278     # from matcher_matchpoints, matchpoint_components, and
 279     # matchpoint_component_norms
 280 }
 281
 282 sub _update_marc_matchers {
 283     my $self = shift;
 284
 285     my $dbh = C4::Context->dbh();
 286     my $sth = $dbh->prepare_cached("UPDATE marc_matchers
 287                                     SET code = ?,
 288                                         description = ?,
 289                                         record_type = ?,
 290                                         threshold = ?
 291                                     WHERE matcher_id = ?");
 292     $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'});
 293 }
 294
 295 sub _new_marc_matchers {
 296     my $self = shift;
 297
 298     my $dbh = C4::Context->dbh();
 299     my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers
 300                                     (code, description, record_type, threshold)
 301                                     VALUES (?, ?, ?, ?)");
 302     $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'});
 303     $self->{'id'} = $dbh->{'mysql_insertid'};
 304 }
 305
 306 sub _store_matcher_components {
 307     my $self = shift;
 308
 309     my $dbh = C4::Context->dbh();
 310     my $sth;
 311     my $matcher_id = $self->{'id'};
 312     foreach my $matchpoint (@{ $self->{'matchpoints'}}) {
 313         my $matchpoint_id = $self->_store_matchpoint($matchpoint);
 314         $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id)
 315                                      VALUES (?, ?)");
 316         $sth->execute($matcher_id, $matchpoint_id);
 317     }
 318     foreach my $matchcheck (@{ $self->{'required_checks'} }) {
 319         my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'});
 320         my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'});
 321         $sth = $dbh->prepare_cached("INSERT INTO matchchecks
 322                                      (matcher_id, source_matchpoint_id, target_matchpoint_id)
 323                                      VALUES (?, ?, ?)");
 324         $sth->execute($matcher_id, $source_matchpoint_id,  $target_matchpoint_id);
 325     }
 326
 327 }
 328
 329 sub _store_matchpoint {
 330     my $self = shift;
 331     my $matchpoint = shift;
 332
 333     my $dbh = C4::Context->dbh();
 334     my $sth;
 335     my $matcher_id = $self->{'id'};
 336     $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score)
 337                                  VALUES (?, ?, ?)");
 338     $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'});
 339     my $matchpoint_id = $dbh->{'mysql_insertid'};
 340     my $seqnum = 0;
 341     foreach my $component (@{ $matchpoint->{'components'} }) {
 342         $seqnum++;
 343         $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components
 344                                      (matchpoint_id, sequence, tag, subfields, offset, length)
 345                                      VALUES (?, ?, ?, ?, ?, ?)");
 346         $sth->bind_param(1, $matchpoint_id);
 347         $sth->bind_param(2, $seqnum);
 348         $sth->bind_param(3, $component->{'tag'});
 349         $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} });
 350         $sth->bind_param(5, $component->{'offset'});
 351         $sth->bind_param(6, $component->{'length'});
 352         $sth->execute();
 353         my $matchpoint_component_id = $dbh->{'mysql_insertid'};
 354         my $normseq = 0;
 355         foreach my $norm (@{ $component->{'norms'} }) {
 356             $normseq++;
 357             $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms
 358                                          (matchpoint_component_id, sequence, norm_routine)
 359                                          VALUES (?, ?, ?)");
 360             $sth->execute($matchpoint_component_id, $normseq, $norm);
 361         }
 362     }
 363     return $matchpoint_id;
 364 }
 365
 366
 367 =head2 delete
 368
 369   C4::Matcher->delete($id);
 370
 371 Deletes the matcher of the specified ID
 372 from the database.
 373
 374 =cut
 375
 376 sub delete {
 377     my $class = shift;
 378     my $matcher_id = shift;
 379
 380     my $dbh = C4::Context->dbh;
 381     my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?");
 382     $sth->execute($matcher_id); # relying on cascading deletes to clean up everything
 383 }
 384
 385 =head2 record_type
 386
 387   $matcher->record_type('biblio');
 388   my $record_type = $matcher->record_type();
 389
 390 Accessor method.
 391
 392 =cut
 393
 394 sub record_type {
 395     my $self = shift;
 396     @_ ? $self->{'record_type'} = shift : $self->{'record_type'};
 397 }
 398
 399 =head2 threshold
 400
 401   $matcher->threshold(1000);
 402   my $threshold = $matcher->threshold();
 403
 404 Accessor method.
 405
 406 =cut
 407
 408 sub threshold {
 409     my $self = shift;
 410     @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
 411 }
 412
 413 =head2 _id
 414
 415   $matcher->_id(123);
 416   my $id = $matcher->_id();
 417
 418 Accessor method.  Note that using this method
 419 to set the DB ID of the matcher should not be
 420 done outside of the editing CGI.
 421
 422 =cut
 423
 424 sub _id {
 425     my $self = shift;
 426     @_ ? $self->{'id'} = shift : $self->{'id'};
 427 }
 428
 429 =head2 code
 430
 431   $matcher->code('ISBN');
 432   my $code = $matcher->code();
 433
 434 Accessor method.
 435
 436 =cut
 437
 438 sub code {
 439     my $self = shift;
 440     @_ ? $self->{'code'} = shift : $self->{'code'};
 441 }
 442
 443 =head2 description
 444
 445   $matcher->description('match on ISBN');
 446   my $description = $matcher->description();
 447
 448 Accessor method.
 449
 450 =cut
 451
 452 sub description {
 453     my $self = shift;
 454     @_ ? $self->{'description'} = shift : $self->{'description'};
 455 }
 456
 457 =head2 add_matchpoint
 458
 459   $matcher->add_matchpoint($index, $score, $matchcomponents);
 460
 461 Adds a matchpoint that may include multiple components.  The $index
 462 parameter identifies the index that will be searched, while $score
 463 is the weight that will be added if a match is found.
 464
 465 $matchcomponents should be a reference to an array of matchpoint
 466 compoents, each of which should be a hash containing the following
 467 keys:
 468     tag
 469     subfields
 470     offset
 471     length
 472     norms
 473
 474 The normalization_rules value should in turn be a reference to an
 475 array, each element of which should be a reference to a
 476 normalization subroutine (under C4::Normalize) to be applied
 477 to the source string.
 478
 479 =cut
 480
 481 sub add_matchpoint {
 482     my $self = shift;
 483     my ($index, $score, $matchcomponents) = @_;
 484
 485     my $matchpoint = {};
 486     $matchpoint->{'index'} = $index;
 487     $matchpoint->{'score'} = $score;
 488     $matchpoint->{'components'} = [];
 489     foreach my $input_component (@{ $matchcomponents }) {
 490         push @{ $matchpoint->{'components'} }, _parse_match_component($input_component);
 491     }
 492     push @{ $self->{'matchpoints'} }, $matchpoint;
 493 }
 494
 495 =head2 add_simple_matchpoint
 496
 497   $matcher->add_simple_matchpoint($index, $score, $source_tag,
 498                             $source_subfields, $source_offset,
 499                             $source_length, $source_normalizer);
 500
 501
 502 Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields,
 503 normalized per the normalization fuction, search the index.  All records retrieved
 504 will receive the assigned score.
 505
 506 =cut
 507
 508 sub add_simple_matchpoint {
 509     my $self = shift;
 510     my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_;
 511
 512     $self->add_matchpoint($index, $score, [
 513                           { tag => $source_tag, subfields => $source_subfields,
 514                             offset => $source_offset, 'length' => $source_length,
 515                             norms => [ $source_normalizer ]
 516                           }
 517                          ]);
 518 }
 519
 520 =head2 add_required_check
 521
 522   $match->add_required_check($source_matchpoint, $target_matchpoint);
 523
 524 Adds a required check definition.  A required check means that in
 525 order for a match to be considered valid, the key derived from the
 526 source (incoming) record must match the key derived from the target
 527 (already in DB) record.
 528
 529 Unlike a regular matchpoint, only the first repeat of each tag
 530 in the source and target match criteria are considered.
 531
 532 A typical example of a required check would be verifying that the
 533 titles and publication dates match.
 534
 535 $source_matchpoint and $target_matchpoint are each a reference to
 536 an array of hashes, where each hash follows the same definition
 537 as the matchpoint component specification in add_matchpoint, i.e.,
 538
 539     tag
 540     subfields
 541     offset
 542     length
 543     norms
 544
 545 The normalization_rules value should in turn be a reference to an
 546 array, each element of which should be a reference to a
 547 normalization subroutine (under C4::Normalize) to be applied
 548 to the source string.
 549
 550 =cut
 551
 552 sub add_required_check {
 553     my $self = shift;
 554     my ($source_matchpoint, $target_matchpoint) = @_;
 555
 556     my $matchcheck = {};
 557     $matchcheck->{'source_matchpoint'}->{'index'} = '';
 558     $matchcheck->{'source_matchpoint'}->{'score'} = 0;
 559     $matchcheck->{'source_matchpoint'}->{'components'} = [];
 560     $matchcheck->{'target_matchpoint'}->{'index'} = '';
 561     $matchcheck->{'target_matchpoint'}->{'score'} = 0;
 562     $matchcheck->{'target_matchpoint'}->{'components'} = [];
 563     foreach my $input_component (@{ $source_matchpoint }) {
 564         push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component);
 565     }
 566     foreach my $input_component (@{ $target_matchpoint }) {
 567         push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component);
 568     }
 569     push @{ $self->{'required_checks'} }, $matchcheck;
 570 }
 571
 572 =head2 add_simple_required_check
 573
 574   $matcher->add_simple_required_check($source_tag, $source_subfields,
 575                 $source_offset, $source_length, $source_normalizer,
 576                 $target_tag, $target_subfields, $target_offset,
 577                 $target_length, $target_normalizer);
 578
 579 Adds a required check, which requires that the normalized keys made from the source and targets
 580 must match for a match to be considered valid.
 581
 582 =cut
 583
 584 sub add_simple_required_check {
 585     my $self = shift;
 586     my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer,
 587         $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_;
 588
 589     $self->add_required_check(
 590       [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length,
 591           norms => [ $source_normalizer ] } ],
 592       [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length,
 593           norms => [ $target_normalizer ] } ]
 594     );
 595 }
 596
 597 =head2 get_matches
 598
 599   my @matches = $matcher->get_matches($marc_record, $max_matches);
 600   foreach $match (@matches) {
 601       # matches already sorted in order of
 602       # decreasing score
 603       print "record ID: $match->{'record_id'};
 604       print "score:     $match->{'score'};
 605   }
 606
 607 Identifies all of the records matching the given MARC record.  For a record already
 608 in the database to be considered a match, it must meet the following criteria:
 609
 610 =over 2
 611
 612 =item 1. Total score from its matching field must exceed the supplied threshold.
 613
 614 =item 2. It must pass all required checks.
 615
 616 =back
 617
 618 Only the top $max_matches matches are returned.  The returned array is sorted
 619 in order of decreasing score, i.e., the best match is first.
 620
 621 =cut
 622
 623 sub get_matches {
 624     my $self = shift;
 625     my ($source_record, $max_matches) = @_;
 626
 627     my %matches = ();
 628
 629     my $QParser;
 630     $QParser = C4::Context->queryparser if (C4::Context->preference('UseQueryParser'));
 631     foreach my $matchpoint ( @{ $self->{'matchpoints'} } ) {
 632         my @source_keys = _get_match_keys( $source_record, $matchpoint );
 633
 634         next if scalar(@source_keys) == 0;
 635
 636         # FIXME - because of a bug in QueryParser, an expression ofthe
 637         # format 'isbn:"isbn1" || isbn:"isbn2" || isbn"isbn3"...'
 638         # does not get parsed correctly, so we will not
 639         # do AggressiveMatchOnISBN if UseQueryParser is on
 640         @source_keys = C4::Koha::GetVariationsOfISBNs(@source_keys)
 641           if ( $matchpoint->{index} =~ /^isbn$/i
 642             && C4::Context->preference('AggressiveMatchOnISBN') )
 643             && !C4::Context->preference('UseQueryParser');
 644
 645         # build query
 646         my $query;
 647         my $error;
 648         my $searchresults;
 649         my $total_hits;
 650         if ( $self->{'record_type'} eq 'biblio' ) {
 651
 652             if ($QParser) {
 653                 $query = join( " || ",
 654                     map { "$matchpoint->{'index'}:$_" } @source_keys );
 655             }
 656             else {
 657                 my $phr = C4::Context->preference('AggressiveMatchOnISBN') ? ',phr' : q{};
 658                 $query = join( " or ",
 659                     map { "$matchpoint->{'index'}$phr=$_" } @source_keys );
 660             }
 661
 662             require C4::Search;
 663
 664             ( $error, $searchresults, $total_hits ) =
 665               C4::Search::SimpleSearch( $query, 0, $max_matches );
 666         }
 667         elsif ( $self->{'record_type'} eq 'authority' ) {
 668             my $authresults;
 669             my @marclist;
 670             my @and_or;
 671             my @excluding = [];
 672             my @operator;
 673             my @value;
 674             foreach my $key (@source_keys) {
 675                 push @marclist, $matchpoint->{'index'};
 676                 push @and_or,   'or';
 677                 push @operator, 'exact';
 678                 push @value,    $key;
 679             }
 680             require C4::AuthoritiesMarc;
 681             ( $authresults, $total_hits ) =
 682               C4::AuthoritiesMarc::SearchAuthorities(
 683                 \@marclist,  \@and_or, \@excluding, \@operator,
 684                 \@value,     0,        20,          undef,
 685                 'AuthidAsc', 1
 686               );
 687             foreach my $result (@$authresults) {
 688                 push @$searchresults, $result->{'authid'};
 689             }
 690         }
 691
 692         if ( defined $error ) {
 693             warn "search failed ($query) $error";
 694         }
 695         else {
 696             foreach my $matched ( @{$searchresults} ) {
 697                 $matches{$matched} += $matchpoint->{'score'};
 698             }
 699         }
 700     }
 701
 702     # get rid of any that don't meet the threshold
 703     %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
 704
 705     # get rid of any that don't meet the required checks
 706     %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ?  ($_ => $matches{$_}) : () }
 707                 keys %matches unless ($self->{'record_type'} eq 'auth');
 708
 709     my @results = ();
 710     if ($self->{'record_type'} eq 'biblio') {
 711         require C4::Biblio;
 712         foreach my $marcblob (keys %matches) {
 713             my $target_record = C4::Search::new_record_from_zebra('biblioserver',$marcblob);
 714             my $record_number;
 715             my $result = C4::Biblio::TransformMarcToKoha(C4::Context->dbh, $target_record, '');
 716             $record_number = $result->{'biblionumber'};
 717             push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
 718         }
 719     } elsif ($self->{'record_type'} eq 'authority') {
 720         require C4::AuthoritiesMarc;
 721         foreach my $authid (keys %matches) {
 722             push @results, { 'record_id' => $authid, 'score' => $matches{$authid} };
 723         }
 724     }
 725     @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
 726     if (scalar(@results) > $max_matches) {
 727         @results = @results[0..$max_matches-1];
 728     }
 729     return @results;
 730
 731 }
 732
 733 =head2 dump
 734
 735   $description = $matcher->dump();
 736
 737 Returns a reference to a structure containing all of the information
 738 in the matcher object.  This is mainly a convenience method to
 739 aid setting up a HTML editing form.
 740
 741 =cut
 742
 743 sub dump {
 744     my $self = shift;
 745
 746     my $result = {};
 747
 748     $result->{'matcher_id'} = $self->{'id'};
 749     $result->{'code'} = $self->{'code'};
 750     $result->{'description'} = $self->{'description'};
 751     $result->{'record_type'} = $self->{'record_type'};
 752
 753     $result->{'matchpoints'} = [];
 754     foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
 755         push @{  $result->{'matchpoints'} }, $matchpoint;
 756     }
 757     $result->{'matchchecks'} = [];
 758     foreach my $matchcheck (@{ $self->{'required_checks'} }) {
 759         push @{  $result->{'matchchecks'} }, $matchcheck;
 760     }
 761
 762     return $result;
 763 }
 764
 765 sub _passes_required_checks {
 766     my ($source_record, $target_blob, $matchchecks) = @_;
 767     my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice
 768
 769     # no checks supplied == automatic pass
 770     return 1 if $#{ $matchchecks } == -1;
 771
 772     foreach my $matchcheck (@{ $matchchecks }) {
 773         my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'});
 774         my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'});
 775         return 0 unless $source_key eq $target_key;
 776     }
 777     return 1;
 778 }
 779
 780 sub _get_match_keys {
 781     my $source_record = shift;
 782     my $matchpoint = shift;
 783     my $check_only_first_repeat = @_ ? shift : 0;
 784
 785     # If there is more than one component to the matchpoint (e.g.,
 786     # matchpoint includes both 003 and 001), any repeats
 787     # of the first component's tag are identified; repeats
 788     # of the subsequent components' tags are appended to
 789     # each parallel key dervied from the first component,
 790     # up to the number of repeats of the first component's tag.
 791     #
 792     # For example, if the record has one 003 and two 001s, only
 793     # one key is retrieved because there is only one 003.  The key
 794     # will consist of the contents of the first 003 and first 001.
 795     #
 796     # If there are two 003s and two 001s, there will be two keys:
 797     #    first 003 + first 001
 798     #    second 003 + second 001
 799
 800     my @keys = ();
 801     for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) {
 802         my $component = $matchpoint->{'components'}->[$i];
 803         my $j = -1;
 804         FIELD: foreach my $field ($source_record->field($component->{'tag'})) {
 805             $j++;
 806             last FIELD if $j > 0 and $check_only_first_repeat;
 807             last FIELD if $i > 0 and $j > $#keys;
 808             my $key = "";
 809                         my $string;
 810             if ($field->is_control_field()) {
 811                                 $string=$field->data();
 812             } else {
 813                 foreach my $subfield ($field->subfields()) {
 814                     if (exists $component->{'subfields'}->{$subfield->[0]}) {
 815                         $string .= " " . $subfield->[1];
 816                     }
 817                 }
 818                         }
 819             if ($component->{'length'}>0) {
 820                     $string= substr($string, $component->{'offset'}, $component->{'length'});
 821                             # FIXME normalize, substr
 822             } elsif ($component->{'offset'}) {
 823                     $string= substr($string, $component->{'offset'});
 824             }
 825             $key = _normalize($string);
 826             if ($i == 0) {
 827                 push @keys, $key if $key;
 828             } else {
 829                 $keys[$j] .= " $key" if $key;
 830             }
 831         }
 832     }
 833     return @keys;
 834 }
 835
 836
 837 sub _parse_match_component {
 838     my $input_component = shift;
 839
 840     my $component = {};
 841     $component->{'tag'} = $input_component->{'tag'};
 842     $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) };
 843     $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1;
 844     $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0;
 845     $component->{'norms'} =  $input_component->{'norms'} ? $input_component->{'norms'} : [];
 846
 847     return $component;
 848 }
 849
 850 # FIXME - default normalizer
 851 sub _normalize {
 852     my $value = uc shift;
 853     $value =~ s/[.;:,\]\[\)\(\/'"]//g;
 854     $value =~ s/^\s+//;
 855     #$value =~ s/^\s+$//;
 856     $value =~ s/\s+$//;
 857     $value =~ s/\s+/ /g;
 858     #$value =~ s/[.;,\]\[\)\(\/"']//g;
 859     return $value;
 860 }
 861
 862 1;
 863 __END__
 864
 865 =head1 AUTHOR
 866
 867 Koha Development Team <http://koha-community.org/>
 868
 869 Galen Charlton <galen.charlton@liblime.com>
 870
 871 =cut