C4/Matcher.pm

   1 package C4::Matcher;
   2
   3 # Copyright (C) 2007 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along with
  17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  18 # Suite 330, Boston, MA  02111-1307 USA
  19
  20 use strict;
  21 use C4::Context;
  22 use MARC::Record;
  23 use C4::Search;
  24 use C4::Biblio;
  25
  26 use vars qw($VERSION);
  27
  28 BEGIN {
  29         # set the version for version checking
  30         $VERSION = 3.01;
  31 }
  32
  33 =head1 NAME
  34
  35 C4::Matcher - find MARC records matching another one
  36
  37 =head1 SYNOPSIS
  38
  39 =over 4
  40
  41 my @matchers = C4::Matcher::GetMatcherList();
  42
  43 my $matcher = C4::Matcher->new($record_type);
  44 $matcher->threshold($threshold);
  45 $matcher->code($code);
  46 $matcher->description($description);
  47
  48 $matcher->add_simple_matchpoint('isbn', 1000, '020', 'a', -1, 0, '');
  49 $matcher->add_simple_matchpoint('Date', 1000, '008', '', 7, 4, '');
  50 $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);
  51
  52 $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
  53 $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ],
  54                              [ { tag => '245', subfields => 'a', norms => [] } ]);
  55
  56 my @matches = $matcher->get_matches($marc_record, $max_matches);
  57
  58 foreach $match (@matches) {
  59
  60     # matches already sorted in order of
  61     # decreasing score
  62     print "record ID: $match->{'record_id'};
  63     print "score:     $match->{'score'};
  64
  65 }
  66
  67 my $matcher_description = $matcher->dump();
  68
  69 =back
  70
  71 =head1 FUNCTIONS
  72
  73 =cut
  74
  75 =head2 GetMatcherList
  76
  77 =over 4
  78
  79 my @matchers = C4::Matcher::GetMatcherList();
  80
  81 =back
  82
  83 Returns an array of hashrefs list all matchers
  84 present in the database.  Each hashref includes:
  85
  86 matcher_id
  87 code
  88 description
  89
  90 =cut
  91
  92 sub GetMatcherList {
  93     my $dbh = C4::Context->dbh;
  94
  95     my $sth = $dbh->prepare_cached("SELECT matcher_id, code, description FROM marc_matchers ORDER BY matcher_id");
  96     $sth->execute();
  97     my @results = ();
  98     while (my $row = $sth->fetchrow_hashref) {
  99         push @results, $row;
 100     }
 101     return @results;
 102 }
 103
 104 =head1 METHODS
 105
 106 =cut
 107
 108 =head2 new
 109
 110 =over 4
 111
 112 my $matcher = C4::Matcher->new($record_type, $threshold);
 113
 114 =back
 115
 116 Creates a new Matcher.  C<$record_type> indicates which search
 117 database to use, e.g., 'biblio' or 'authority' and defaults to
 118 'biblio', while C<$threshold> is the minimum score required for a match
 119 and defaults to 1000.
 120
 121 =cut
 122
 123 sub new {
 124     my $class = shift;
 125     my $self = {};
 126
 127     $self->{'id'} = undef;
 128
 129     if ($#_ > -1) {
 130         $self->{'record_type'} = shift;
 131     } else {
 132         $self->{'record_type'} = 'biblio';
 133     }
 134
 135     if ($#_ > -1) {
 136         $self->{'threshold'} = shift;
 137     } else {
 138         $self->{'threshold'} = 1000;
 139     }
 140
 141     $self->{'code'} = '';
 142     $self->{'description'} = '';
 143
 144     $self->{'matchpoints'} = [];
 145     $self->{'required_checks'} = [];
 146
 147     bless $self, $class;
 148     return $self;
 149 }
 150
 151 =head2 fetch
 152
 153 =over 4
 154
 155 my $matcher = C4::Matcher->fetch($id);
 156
 157 =back
 158
 159 Creates a matcher object from the version stored
 160 in the database.  If a matcher with the given
 161 id does not exist, returns undef.
 162
 163 =cut
 164
 165 sub fetch {
 166     my $class = shift;
 167     my $id = shift;
 168     my $dbh = C4::Context->dbh();
 169
 170     my $sth = $dbh->prepare_cached("SELECT * FROM marc_matchers WHERE matcher_id = ?");
 171     $sth->execute($id);
 172     my $row = $sth->fetchrow_hashref;
 173     $sth->finish();
 174     return undef unless defined $row;
 175
 176     my $self = {};
 177     $self->{'id'} = $row->{'matcher_id'};
 178     $self->{'record_type'} = $row->{'record_type'};
 179     $self->{'code'} = $row->{'code'};
 180     $self->{'description'} = $row->{'description'};
 181     $self->{'threshold'} = int($row->{'threshold'});
 182     bless $self, $class;
 183
 184     # matchpoints
 185     $self->{'matchpoints'} = [];
 186     $sth = $dbh->prepare_cached("SELECT * FROM matcher_matchpoints WHERE matcher_id = ? ORDER BY matchpoint_id");
 187     $sth->execute($self->{'id'});
 188     while (my $row = $sth->fetchrow_hashref) {
 189         my $matchpoint = $self->_fetch_matchpoint($row->{'matchpoint_id'});
 190         push @{ $self->{'matchpoints'} }, $matchpoint;
 191     }
 192
 193     # required checks
 194     $self->{'required_checks'} = [];
 195     $sth = $dbh->prepare_cached("SELECT * FROM matchchecks WHERE matcher_id = ? ORDER BY matchcheck_id");
 196     $sth->execute($self->{'id'});
 197     while (my $row = $sth->fetchrow_hashref) {
 198         my $source_matchpoint = $self->_fetch_matchpoint($row->{'source_matchpoint_id'});
 199         my $target_matchpoint = $self->_fetch_matchpoint($row->{'target_matchpoint_id'});
 200         my $matchcheck = {};
 201         $matchcheck->{'source_matchpoint'} = $source_matchpoint;
 202         $matchcheck->{'target_matchpoint'} = $target_matchpoint;
 203         push @{ $self->{'required_checks'} }, $matchcheck;
 204     }
 205
 206     return $self;
 207 }
 208
 209 sub _fetch_matchpoint {
 210     my $self = shift;
 211     my $matchpoint_id = shift;
 212
 213     my $dbh = C4::Context->dbh;
 214     my $sth = $dbh->prepare_cached("SELECT * FROM matchpoints WHERE matchpoint_id = ?");
 215     $sth->execute($matchpoint_id);
 216     my $row = $sth->fetchrow_hashref;
 217     my $matchpoint = {};
 218     $matchpoint->{'index'} = $row->{'search_index'};
 219     $matchpoint->{'score'} = int($row->{'score'});
 220     $sth->finish();
 221
 222     $matchpoint->{'components'} = [];
 223     $sth = $dbh->prepare_cached("SELECT * FROM matchpoint_components WHERE matchpoint_id = ? ORDER BY sequence");
 224     $sth->execute($matchpoint_id);
 225     while ($row = $sth->fetchrow_hashref) {
 226         my $component = {};
 227         $component->{'tag'} = $row->{'tag'};
 228         $component->{'subfields'} = { map { $_ => 1 } split(//,  $row->{'subfields'}) };
 229         $component->{'offset'} = int($row->{'offset'});
 230         $component->{'length'} = int($row->{'length'});
 231         $component->{'norms'} = [];
 232         my $sth2 = $dbh->prepare_cached("SELECT *
 233                                          FROM matchpoint_component_norms
 234                                          WHERE matchpoint_component_id = ? ORDER BY sequence");
 235         $sth2->execute($row->{'matchpoint_component_id'});
 236         while (my $row2 = $sth2->fetchrow_hashref) {
 237             push @{ $component->{'norms'} }, $row2->{'norm_routine'};
 238         }
 239         push @{ $matchpoint->{'components'} }, $component;
 240     }
 241     return $matchpoint;
 242 }
 243
 244 =head2 store
 245
 246 =over 4
 247
 248 my $id = $matcher->store();
 249
 250 =back
 251
 252 Stores matcher in database.  The return value is the ID
 253 of the marc_matchers row.  If the matcher was
 254 previously retrieved from the database via the fetch()
 255 method, the DB representation of the matcher
 256 is replaced.
 257
 258 =cut
 259
 260 sub store {
 261     my $self = shift;
 262
 263     if (defined $self->{'id'}) {
 264         # update
 265         $self->_del_matcher_components();
 266         $self->_update_marc_matchers();
 267     } else {
 268         # create new
 269         $self->_new_marc_matchers();
 270     }
 271     $self->_store_matcher_components();
 272     return $self->{'id'};
 273 }
 274
 275 sub _del_matcher_components {
 276     my $self = shift;
 277
 278     my $dbh = C4::Context->dbh();
 279     my $sth = $dbh->prepare_cached("DELETE FROM matchpoints WHERE matcher_id = ?");
 280     $sth->execute($self->{'id'});
 281     $sth = $dbh->prepare_cached("DELETE FROM matchchecks WHERE matcher_id = ?");
 282     $sth->execute($self->{'id'});
 283     # foreign key delete cascades take care of deleting relevant rows
 284     # from matcher_matchpoints, matchpoint_components, and
 285     # matchpoint_component_norms
 286 }
 287
 288 sub _update_marc_matchers {
 289     my $self = shift;
 290
 291     my $dbh = C4::Context->dbh();
 292     my $sth = $dbh->prepare_cached("UPDATE marc_matchers
 293                                     SET code = ?,
 294                                         description = ?,
 295                                         record_type = ?,
 296                                         threshold = ?
 297                                     WHERE matcher_id = ?");
 298     $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'}, $self->{'id'});
 299 }
 300
 301 sub _new_marc_matchers {
 302     my $self = shift;
 303
 304     my $dbh = C4::Context->dbh();
 305     my $sth = $dbh->prepare_cached("INSERT INTO marc_matchers
 306                                     (code, description, record_type, threshold)
 307                                     VALUES (?, ?, ?, ?)");
 308     $sth->execute($self->{'code'}, $self->{'description'}, $self->{'record_type'}, $self->{'threshold'});
 309     $self->{'id'} = $dbh->{'mysql_insertid'};
 310 }
 311
 312 sub _store_matcher_components {
 313     my $self = shift;
 314
 315     my $dbh = C4::Context->dbh();
 316     my $sth;
 317     my $matcher_id = $self->{'id'};
 318     foreach my $matchpoint (@{ $self->{'matchpoints'}}) {
 319         my $matchpoint_id = $self->_store_matchpoint($matchpoint);
 320         $sth = $dbh->prepare_cached("INSERT INTO matcher_matchpoints (matcher_id, matchpoint_id)
 321                                      VALUES (?, ?)");
 322         $sth->execute($matcher_id, $matchpoint_id);
 323     }
 324     foreach my $matchcheck (@{ $self->{'required_checks'} }) {
 325         my $source_matchpoint_id = $self->_store_matchpoint($matchcheck->{'source_matchpoint'});
 326         my $target_matchpoint_id = $self->_store_matchpoint($matchcheck->{'target_matchpoint'});
 327         $sth = $dbh->prepare_cached("INSERT INTO matchchecks
 328                                      (matcher_id, source_matchpoint_id, target_matchpoint_id)
 329                                      VALUES (?, ?, ?)");
 330         $sth->execute($matcher_id, $source_matchpoint_id,  $target_matchpoint_id);
 331     }
 332
 333 }
 334
 335 sub _store_matchpoint {
 336     my $self = shift;
 337     my $matchpoint = shift;
 338
 339     my $dbh = C4::Context->dbh();
 340     my $sth;
 341     my $matcher_id = $self->{'id'};
 342     $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score)
 343                                  VALUES (?, ?, ?)");
 344     $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'});
 345     my $matchpoint_id = $dbh->{'mysql_insertid'};
 346     my $seqnum = 0;
 347     foreach my $component (@{ $matchpoint->{'components'} }) {
 348         $seqnum++;
 349         $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components
 350                                      (matchpoint_id, sequence, tag, subfields, offset, length)
 351                                      VALUES (?, ?, ?, ?, ?, ?)");
 352         $sth->bind_param(1, $matchpoint_id);
 353         $sth->bind_param(2, $seqnum);
 354         $sth->bind_param(3, $component->{'tag'});
 355         $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} });
 356         $sth->bind_param(5, $component->{'offset'});
 357         $sth->bind_param(6, $component->{'length'});
 358         $sth->execute();
 359         my $matchpoint_component_id = $dbh->{'mysql_insertid'};
 360         my $normseq = 0;
 361         foreach my $norm (@{ $component->{'norms'} }) {
 362             $normseq++;
 363             $sth = $dbh->prepare_cached("INSERT INTO matchpoint_component_norms
 364                                          (matchpoint_component_id, sequence, norm_routine)
 365                                          VALUES (?, ?, ?)");
 366             $sth->execute($matchpoint_component_id, $normseq, $norm);
 367         }
 368     }
 369     return $matchpoint_id;
 370 }
 371
 372
 373 =head2 delete
 374
 375 =over 4
 376
 377 C4::Matcher->delete($id);
 378
 379 =back
 380
 381 Deletes the matcher of the specified ID
 382 from the database.
 383
 384 =cut
 385
 386 sub delete {
 387     my $class = shift;
 388     my $matcher_id = shift;
 389
 390     my $dbh = C4::Context->dbh;
 391     my $sth = $dbh->prepare("DELETE FROM marc_matchers WHERE matcher_id = ?");
 392     $sth->execute($matcher_id); # relying on cascading deletes to clean up everything
 393 }
 394
 395 =head2 threshold
 396
 397 =over 4
 398
 399 $matcher->threshold(1000);
 400 my $threshold = $matcher->threshold();
 401
 402 =back
 403
 404 Accessor method.
 405
 406 =cut
 407
 408 sub threshold {
 409     my $self = shift;
 410     @_ ? $self->{'threshold'} = shift : $self->{'threshold'};
 411 }
 412
 413 =head2 _id
 414
 415 =over 4
 416
 417 $matcher->_id(123);
 418 my $id = $matcher->_id();
 419
 420 =back
 421
 422 Accessor method.  Note that using this method
 423 to set the DB ID of the matcher should not be
 424 done outside of the editing CGI.
 425
 426 =cut
 427
 428 sub _id {
 429     my $self = shift;
 430     @_ ? $self->{'id'} = shift : $self->{'id'};
 431 }
 432
 433 =head2 code
 434
 435 =over 4
 436
 437 $matcher->code('ISBN');
 438 my $code = $matcher->code();
 439
 440 =back
 441
 442 Accessor method.
 443
 444 =cut
 445
 446 sub code {
 447     my $self = shift;
 448     @_ ? $self->{'code'} = shift : $self->{'code'};
 449 }
 450
 451 =head2 description
 452
 453 =over 4
 454
 455 $matcher->description('match on ISBN');
 456 my $description = $matcher->description();
 457
 458 =back
 459
 460 Accessor method.
 461
 462 =cut
 463
 464 sub description {
 465     my $self = shift;
 466     @_ ? $self->{'description'} = shift : $self->{'description'};
 467 }
 468
 469 =head2 add_matchpoint
 470
 471 =over 4
 472
 473 $matcher->add_matchpoint($index, $score, $matchcomponents);
 474
 475 =back
 476
 477 Adds a matchpoint that may include multiple components.  The $index
 478 parameter identifies the index that will be searched, while $score
 479 is the weight that will be added if a match is found.
 480
 481 $matchcomponents should be a reference to an array of matchpoint
 482 compoents, each of which should be a hash containing the following
 483 keys:
 484     tag
 485     subfields
 486     offset
 487     length
 488     norms
 489
 490 The normalization_rules value should in turn be a reference to an
 491 array, each element of which should be a reference to a
 492 normalization subroutine (under C4::Normalize) to be applied
 493 to the source string.
 494
 495 =cut
 496
 497 sub add_matchpoint {
 498     my $self = shift;
 499     my ($index, $score, $matchcomponents) = @_;
 500
 501     my $matchpoint = {};
 502     $matchpoint->{'index'} = $index;
 503     $matchpoint->{'score'} = $score;
 504     $matchpoint->{'components'} = [];
 505     foreach my $input_component (@{ $matchcomponents }) {
 506         push @{ $matchpoint->{'components'} }, _parse_match_component($input_component);
 507     }
 508     push @{ $self->{'matchpoints'} }, $matchpoint;
 509 }
 510
 511 =head2 add_simple_matchpoint
 512
 513 =over 4
 514
 515 $matcher->add_simple_matchpoint($index, $score, $source_tag, $source_subfields,
 516                                 $source_offset, $source_length,
 517                                 $source_normalizer);
 518
 519 =back
 520
 521 Adds a simple matchpoint rule -- after composing a key based on the source tag and subfields,
 522 normalized per the normalization fuction, search the index.  All records retrieved
 523 will receive the assigned score.
 524
 525 =cut
 526
 527 sub add_simple_matchpoint {
 528     my $self = shift;
 529     my ($index, $score, $source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer) = @_;
 530
 531     $self->add_matchpoint($index, $score, [
 532                           { tag => $source_tag, subfields => $source_subfields,
 533                             offset => $source_offset, 'length' => $source_length,
 534                             norms => [ $source_normalizer ]
 535                           }
 536                          ]);
 537 }
 538
 539 =head2 add_required_check
 540
 541 =over 4
 542
 543 $match->add_required_check($source_matchpoint, $target_matchpoint);
 544
 545 =back
 546
 547 Adds a required check definition.  A required check means that in
 548 order for a match to be considered valid, the key derived from the
 549 source (incoming) record must match the key derived from the target
 550 (already in DB) record.
 551
 552 Unlike a regular matchpoint, only the first repeat of each tag
 553 in the source and target match criteria are considered.
 554
 555 A typical example of a required check would be verifying that the
 556 titles and publication dates match.
 557
 558 $source_matchpoint and $target_matchpoint are each a reference to
 559 an array of hashes, where each hash follows the same definition
 560 as the matchpoint component specification in add_matchpoint, i.e.,
 561
 562     tag
 563     subfields
 564     offset
 565     length
 566     norms
 567
 568 The normalization_rules value should in turn be a reference to an
 569 array, each element of which should be a reference to a
 570 normalization subroutine (under C4::Normalize) to be applied
 571 to the source string.
 572
 573 =cut
 574
 575 sub add_required_check {
 576     my $self = shift;
 577     my ($source_matchpoint, $target_matchpoint) = @_;
 578
 579     my $matchcheck = {};
 580     $matchcheck->{'source_matchpoint'}->{'index'} = '';
 581     $matchcheck->{'source_matchpoint'}->{'score'} = 0;
 582     $matchcheck->{'source_matchpoint'}->{'components'} = [];
 583     $matchcheck->{'target_matchpoint'}->{'index'} = '';
 584     $matchcheck->{'target_matchpoint'}->{'score'} = 0;
 585     $matchcheck->{'target_matchpoint'}->{'components'} = [];
 586     foreach my $input_component (@{ $source_matchpoint }) {
 587         push @{ $matchcheck->{'source_matchpoint'}->{'components'} }, _parse_match_component($input_component);
 588     }
 589     foreach my $input_component (@{ $target_matchpoint }) {
 590         push @{ $matchcheck->{'target_matchpoint'}->{'components'} }, _parse_match_component($input_component);
 591     }
 592     push @{ $self->{'required_checks'} }, $matchcheck;
 593 }
 594
 595 =head2 add_simple_required_check
 596
 597 $matcher->add_simple_required_check($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer,
 598                                     $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer);
 599
 600 =over 4
 601
 602 Adds a required check, which requires that the normalized keys made from the source and targets
 603 must match for a match to be considered valid.
 604
 605 =back
 606
 607 =cut
 608
 609 sub add_simple_required_check {
 610     my $self = shift;
 611     my ($source_tag, $source_subfields, $source_offset, $source_length, $source_normalizer,
 612         $target_tag, $target_subfields, $target_offset, $target_length, $target_normalizer) = @_;
 613
 614     $self->add_required_check(
 615       [ { tag => $source_tag, subfields => $source_subfields, offset => $source_offset, 'length' => $source_length,
 616           norms => [ $source_normalizer ] } ],
 617       [ { tag => $target_tag, subfields => $target_subfields, offset => $target_offset, 'length' => $target_length,
 618           norms => [ $target_normalizer ] } ]
 619     );
 620 }
 621
 622 =head2 find_matches
 623
 624 =over 4
 625
 626 my @matches = $matcher->get_matches($marc_record, $max_matches);
 627 foreach $match (@matches) {
 628   # matches already sorted in order of
 629   # decreasing score
 630   print "record ID: $match->{'record_id'};
 631   print "score:     $match->{'score'};
 632 }
 633
 634 =back
 635
 636 Identifies all of the records matching the given MARC record.  For a record already
 637 in the database to be considered a match, it must meet the following criteria:
 638
 639 =over 2
 640
 641 =item 1. Total score from its matching field must exceed the supplied threshold.
 642
 643 =item 2. It must pass all required checks.
 644
 645 =back
 646
 647 Only the top $max_matches matches are returned.  The returned array is sorted
 648 in order of decreasing score, i.e., the best match is first.
 649
 650 =cut
 651
 652 sub get_matches {
 653     my $self = shift;
 654     my ($source_record, $max_matches) = @_;
 655
 656     my %matches = ();
 657
 658     foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
 659         my @source_keys = _get_match_keys($source_record, $matchpoint);
 660         next if scalar(@source_keys) == 0;
 661         # build query
 662         my $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
 663         # FIXME only searching biblio index at the moment
 664         my ($error, $searchresults, $total_hits) = SimpleSearch($query, 0, $max_matches);
 665
 666         warn "search failed ($query) $error" if $error;
 667         foreach my $matched (@$searchresults) {
 668             $matches{$matched} += $matchpoint->{'score'};
 669         }
 670     }
 671
 672     # get rid of any that don't meet the threshold
 673     %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
 674
 675     # get rid of any that don't meet the required checks
 676     %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ?  ($_ => $matches{$_}) : () }
 677                 keys %matches;
 678
 679     my @results = ();
 680     foreach my $marcblob (keys %matches) {
 681         my $target_record = MARC::Record->new_from_usmarc($marcblob);
 682         my $result = TransformMarcToKoha(C4::Context->dbh, $target_record, '');
 683         # FIXME - again, bibliospecific
 684         # also, can search engine be induced to give just the number in the first place?
 685         my $record_number = $result->{'biblionumber'};
 686         push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
 687     }
 688     @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
 689     if (scalar(@results) > $max_matches) {
 690         @results = @results[0..$max_matches-1];
 691     }
 692     return @results;
 693
 694 }
 695
 696 =head2 dump
 697
 698 =over 4
 699
 700 $description = $matcher->dump();
 701
 702 =back
 703
 704 Returns a reference to a structure containing all of the information
 705 in the matcher object.  This is mainly a convenience method to
 706 aid setting up a HTML editing form.
 707
 708 =cut
 709
 710 sub dump {
 711     my $self = shift;
 712
 713     my $result = {};
 714
 715     $result->{'matcher_id'} = $self->{'id'};
 716     $result->{'code'} = $self->{'code'};
 717     $result->{'description'} = $self->{'description'};
 718
 719     $result->{'matchpoints'} = [];
 720     foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
 721         push @{  $result->{'matchpoints'} }, $matchpoint;
 722     }
 723     $result->{'matchchecks'} = [];
 724     foreach my $matchcheck (@{ $self->{'required_checks'} }) {
 725         push @{  $result->{'matchchecks'} }, $matchcheck;
 726     }
 727
 728     return $result;
 729 }
 730
 731 sub _passes_required_checks {
 732     my ($source_record, $target_blob, $matchchecks) = @_;
 733     my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice
 734
 735     # no checks supplied == automatic pass
 736     return 1 if $#{ $matchchecks } == -1;
 737
 738     foreach my $matchcheck (@{ $matchchecks }) {
 739         my $source_key = join "", _get_match_keys($source_record, $matchcheck->{'source_matchpoint'});
 740         my $target_key = join "", _get_match_keys($target_record, $matchcheck->{'target_matchpoint'});
 741         return 0 unless $source_key eq $target_key;
 742     }
 743     return 1;
 744 }
 745
 746 sub _get_match_keys {
 747     my $source_record = shift;
 748     my $matchpoint = shift;
 749     my $check_only_first_repeat = @_ ? shift : 0;
 750
 751     # If there is more than one component to the matchpoint (e.g.,
 752     # matchpoint includes both 003 and 001), any repeats
 753     # of the first component's tag are identified; repeats
 754     # of the subsequent components' tags are appended to
 755     # each parallel key dervied from the first component,
 756     # up to the number of repeats of the first component's tag.
 757     #
 758     # For example, if the record has one 003 and two 001s, only
 759     # one key is retrieved because there is only one 003.  The key
 760     # will consist of the contents of the first 003 and first 001.
 761     #
 762     # If there are two 003s and two 001s, there will be two keys:
 763     #    first 003 + first 001
 764     #    second 003 + second 001
 765
 766     my @keys = ();
 767     for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) {
 768         my $component = $matchpoint->{'components'}->[$i];
 769         my $j = -1;
 770         FIELD: foreach my $field ($source_record->field($component->{'tag'})) {
 771             $j++;
 772             last FIELD if $j > 0 and $check_only_first_repeat;
 773             last FIELD if $i > 0 and $j > $#keys;
 774             my $key = "";
 775             if ($field->is_control_field()) {
 776                 if ($component->{'length'}) {
 777                     $key = _normalize(substr($field->data(), $component->{'offset'}, $component->{'length'}))
 778                             # FIXME normalize, substr
 779                 } else {
 780                     $key = _normalize($field->data());
 781                 }
 782             } else {
 783                 foreach my $subfield ($field->subfields()) {
 784                     if (exists $component->{'subfields'}->{$subfield->[0]}) {
 785                         $key .= " " . $subfield->[1];
 786                     }
 787                 }
 788                 $key = _normalize($key);
 789                 if ($component->{'length'}){
 790                    if (length($key) > $component->{'length'}){
 791                      $key = _normalize(substr($key,$component->{'offset'},$component->{'length'}));
 792                    }
 793                 }
 794             }
 795             if ($i == 0) {
 796                 push @keys, $key if $key;
 797             } else {
 798                 $keys[$j] .= " $key" if $key;
 799             }
 800         }
 801     }
 802     return @keys;
 803
 804 }
 805
 806
 807 sub _parse_match_component {
 808     my $input_component = shift;
 809
 810     my $component = {};
 811     $component->{'tag'} = $input_component->{'tag'};
 812     $component->{'subfields'} = { map { $_ => 1 } split(//, $input_component->{'subfields'}) };
 813     $component->{'offset'} = exists($input_component->{'offset'}) ? $input_component->{'offset'} : -1;
 814     $component->{'length'} = $input_component->{'length'} ? $input_component->{'length'} : 0;
 815     $component->{'norms'} =  $input_component->{'norms'} ? $input_component->{'norms'} : [];
 816
 817     return $component;
 818 }
 819
 820 # FIXME - default normalizer
 821 sub _normalize {
 822     my $value = uc shift;
 823     $value =~ s/.;:,\]\[\)\(\/'"//g;
 824     $value =~ s/^\s+//;
 825     #$value =~ s/^\s+$//;
 826     $value =~ s/\s+$//;
 827     $value =~ s/\s+/ /g;
 828     #$value =~ s/[.;,\]\[\)\(\/"']//g;
 829     return $value;
 830 }
 831
 832 1;
 833 __END__
 834
 835 =head1 AUTHOR
 836
 837 Koha Development Team <info@koha.org>
 838
 839 Galen Charlton <galen.charlton@liblime.com>
 840
 841 =cut