C4/Ris.pm

   1 package C4::Ris;
   2
   3 # Original script :
   4 ## marc2ris: converts MARC21 and UNIMARC datasets to RIS format
   5 ##           See comments below for compliance with other MARC dialects
   6 ##
   7 ## usage: perl marc2ris < infile.marc > outfile.ris
   8 ##
   9 ## Dependencies: perl 5.6.0 or later
  10 ##               MARC::Record
  11 ##               MARC::Charset
  12 ##
  13 ## markus@mhoenicka.de 2002-11-16
  14
  15 ##   This program is free software; you can redistribute it and/or modify
  16 ##   it under the terms of the GNU General Public License as published by
  17 ##   the Free Software Foundation; either version 2 of the License, or
  18 ##   (at your option) any later version.
  19 ##
  20 ##   This program is distributed in the hope that it will be useful,
  21 ##   but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 ##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23 ##   GNU General Public License for more details.
  24
  25 ##   You should have received a copy of the GNU General Public License
  26 ##   along with this program; if not, write to the Free Software
  27 ##   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  28
  29 ## Some background about MARC as understood by this script
  30 ## The default input format used in this script is MARC21, which
  31 ## superseded USMARC and CANMARC. The specification can be found at:
  32 ## http://lcweb.loc.gov/marc/
  33 ## UNIMARC follows the specification at:
  34 ## http://www.ifla.org/VI/3/p1996-1/sec-uni.htm
  35 ## UKMARC support is a bit shaky because there is no specification available
  36 ## for free. The wisdom used in this script was taken from a PDF document
  37 ## comparing UKMARC to MARC21 found at:
  38 ## www.bl.uk/services/bibliographic/marcchange.pdf
  39
  40
  41 # Modified 2008 by BibLibre for Koha
  42 # Modified 2011 by Catalyst
  43 # Modified 2011 by Equinox Software, Inc.
  44 #
  45 # This file is part of Koha.
  46 #
  47 # Koha is free software; you can redistribute it and/or modify it under the
  48 # terms of the GNU General Public License as published by the Free Software
  49 # Foundation; either version 2 of the License, or (at your option) any later
  50 # version.
  51 #
  52 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  53 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  54 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  55 #
  56 # You should have received a copy of the GNU General Public License along with
  57 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
  58 # Suite 330, Boston, MA  02111-1307 USA
  59 #
  60 #
  61
  62 #use strict;
  63 #use warnings; FIXME - Bug 2505
  64
  65 use vars qw($VERSION @ISA @EXPORT);
  66
  67 # set the version for version checking
  68 $VERSION = 3.00;
  69
  70 @ISA = qw(Exporter);
  71
  72 # only export API methods
  73
  74 @EXPORT = qw(
  75   &marc2ris
  76 );
  77
  78
=head1 marc2ris - Convert a MARC21, UKMARC or UNIMARC record to RIS
  80
  81   my ($ris) = marc2ris($record);
  82
  83 Returns a RIS scalar
  84
  85 C<$record> - a MARC::Record object
  86
  87 =cut
  88
  89 sub marc2ris {
  90     my ($record) = @_;
  91     my $output;
  92
  93     my $marcflavour = C4::Context->preference("marcflavour");
  94     my $intype = lc($marcflavour);
  95     my $marcprint = 0; # Debug flag;
  96
  97     # Let's redirect stdout
  98     open my $oldout, ">&STDOUT";
  99     my $outvar;
 100     close STDOUT;
 101     open STDOUT,'>', \$outvar;
 102
 103
 104         ## First we should check the character encoding. This may be
 105         ## MARC-8 or UTF-8. The former is indicated by a blank, the latter
 106         ## by 'a' at position 09 (zero-based) of the leader
 107         my $leader = $record->leader();
 108         if ($intype eq "marc21") {
 109             if ($leader =~ /^.{9}a/) {
 110                 print "<marc>---\r\n<marc>UTF-8 data\r\n" if $marcprint;
 111                 $utf = 1;
 112             }
 113             else {
 114                 print "<marc>---\r\n<marc>MARC-8 data\r\n" if $marcprint;
 115             }
 116         }
 117         ## else: other MARC formats do not specify the character encoding
 118         ## we assume it's *not* UTF-8
 119
 120         ## start RIS dataset
 121         &print_typetag($leader);
 122
 123         ## retrieve all author fields and collect them in a list
 124         my @author_fields;
 125
 126         if ($intype eq "unimarc") {
 127             ## Fields 700, 701, and 702 can contain author names
 128             @author_fields = ($record->field('700'), $record->field('701'), $record->field('702'));
 129         }
 130         else {  ## marc21, ukmarc
 131             ## Field 100 sometimes carries main author
 132             ## Field(s) 700 carry added entries - personal names
 133             @author_fields = ($record->field('100'), $record->field('700'));
 134         }
 135
 136         ## loop over all author fields
 137         foreach my $field (@author_fields) {
 138             if (length($field)) {
 139                 my $author = &get_author($field);
 140                 print "AU  - ",&charconv($author),"\r\n";
 141             }
 142         }
 143
 144         # ToDo: should we specify anonymous as author if we didn't find
 145         # one? or use one of the corporate/meeting names below?
 146
 147         ## add corporate names or meeting names as editors ??
 148         my @editor_fields;
 149
 150         if ($intype eq "unimarc") {
 151             ## Fields 710, 711, and 712 can carry corporate names
 152             ## Field(s) 720, 721, 722, 730 have additional candidates
 153             @editor_fields = ($record->field('710'), $record->field('711'), $record->field('712'), $record->field('720'), $record->field('721'), $record->field('722'), $record->field('730'));
 154         }
 155         else { ## marc21, ukmarc
 156             ## Fields 110 and 111 carry the main entries - corporate name and
 157             ## meeting name, respectively
 158             ## Field(s) 710, 711 carry added entries - personal names
 159             @editor_fields = ($record->field('110'), $record->field('111'), $record->field('710'), $record->field('711'));
 160         }
 161
 162         ## loop over all editor fields
 163         foreach my $field (@editor_fields) {
 164             if (length($field)) {
 165                 my $editor = &get_editor($field);
 166                 print "ED  - ",&charconv($editor),"\r\n";
 167             }
 168         }
 169
 170         ## get info from the title field
 171         if ($intype eq "unimarc") {
 172             &print_title($record->field('200'));
 173         }
 174         else { ## marc21, ukmarc
 175             &print_title($record->field('245'));
 176         }
 177
 178         ## series title
 179         if ($intype eq "unimarc") {
 180             &print_stitle($record->field('225'));
 181         }
 182         else { ## marc21, ukmarc
 183             &print_stitle($record->field('490'));
 184         }
 185
 186         ## ISBN/ISSN
 187         if ($intype eq "unimarc") {
 188             &print_isbn($record->field('010'));
 189             &print_issn($record->field('011'));
 190         }
 191         elsif ($intype eq "ukmarc") {
 192             &print_isbn($record->field('021'));
 193             ## this is just an assumption
 194             &print_issn($record->field('022'));
 195         }
 196         else { ## assume marc21
 197             &print_isbn($record->field('020'));
 198             &print_issn($record->field('022'));
 199         }
 200
 201         if ($intype eq "marc21") {
 202             &print_loc_callno($record->field('050'));
 203             &print_dewey($record->field('082'));
 204         }
 205         ## else: unimarc, ukmarc do not seem to store call numbers?
 206
 207         ## publication info
 208         if ($intype eq "unimarc") {
 209             &print_pubinfo($record->field('210'));
 210         }
 211         else { ## marc21, ukmarc
 212             &print_pubinfo($record->field('260'));
 213         }
 214
 215         ## 6XX fields contain KW candidates. We add all of them to a
 216         ## hash to eliminate duplicates
 217         my %kwpool;
 218
 219         if ($intype eq "unimarc") {
 220             foreach ('600', '601', '602', '604', '605', '606','607', '608', '610', '615', '620', '660'. '661', '670', '675', '676', '680', '686') {
 221                 &get_keywords(\%kwpool, "$_",$record->field($_));
 222             }
 223         }
 224         elsif ($intype eq "ukmarc") {
 225             foreach ('600', '610', '611', '630', '650', '651','653', '655', '660', '661', '668', '690', '691', '692', '695') {
 226                 &get_keywords(\%kwpool, "$_",$record->field($_));
 227             }
 228         }
 229         else { ## assume marc21
 230             foreach ('600', '610', '611', '630', '650', '651','653', '654', '655', '656', '657', '658') {
 231                 &get_keywords(\%kwpool, "$_",$record->field($_));
 232             }
 233         }
 234
 235         ## print all keywords found in the hash. The value of each hash
 236         ## entry is the number of occurrences, but we're not really interested
 237         ## in that and rather print the key
 238         while (my ($key, $value) = each %kwpool) {
 239             print "KW  - ", &charconv($key), "\r\n";
 240         }
 241
 242         ## 5XX have various candidates for notes and abstracts. We pool
 243         ## all notes-like stuff in one list.
 244         my @notepool;
 245
 246         ## these fields have notes candidates
 247         if ($intype eq "unimarc") {
 248             foreach ('300', '301', '302', '303', '304', '305', '306', '307', '308', '310', '311', '312', '313', '314', '315', '316', '317', '318', '320', '321', '322', '323', '324', '325', '326', '327', '328', '332', '333', '336', '337', '345') {
 249                 &pool_subx(\@notepool, $_, $record->field($_));
 250             }
 251         }
 252         elsif ($intype eq "ukmarc") {
 253             foreach ('500', '501', '502', '503', '504', '505', '506', '508', '514', '515', '516', '521', '524', '525', '528', '530', '531', '532', '533', '534', '535', '537', '538', '540', '541', '542', '544', '554', '555', '556', '557', '561', '563', '580', '583', '584', '586') {
 254                 &pool_subx(\@notepool, $_, $record->field($_));
 255             }
 256         }
 257         else { ## assume marc21
 258             foreach ('500', '501', '502', '504', '505', '506', '507', '508', '510', '511', '513', '514', '515', '516', '518', '521', '522', '524', '525', '526', '530', '533', '534', '535') {
 259                 &pool_subx(\@notepool, $_, $record->field($_));
 260             }
 261         }
 262
 263         my $allnotes = join "; ", @notepool;
 264
 265         if (length($allnotes) > 0) {
 266             print "N1  - ", &charconv($allnotes), "\r\n";
 267         }
 268
 269         ## 320/520 have the abstract
 270         if ($intype eq "unimarc") {
 271             &print_abstract($record->field('320'));
 272         }
 273         elsif ($intype eq "ukmarc") {
 274             &print_abstract($record->field('512'), $record->field('513'));
 275         }
 276         else { ## assume marc21
 277             &print_abstract($record->field('520'));
 278         }
 279
 280     # 856u has the URI
 281     if ($record->field('856')) {
 282         print_uri($record->field('856'));
 283     }
 284
 285         ## end RIS dataset
 286         print "ER  - \r\n";
 287
 288     # Let's re-redirect stdout
 289     close STDOUT;
 290     open STDOUT, ">&", $oldout;
 291
 292     return $outvar;
 293
 294 }
 295
 296
 297 ##********************************************************************
 298 ## print_typetag(): prints the first line of a RIS dataset including
 299 ## the preceeding newline
 300 ## Argument: the leader of a MARC dataset
 301 ## Returns: the value at leader position 06
 302 ##********************************************************************
 303 sub print_typetag {
 304   my ($leader)= @_;
 305     ## the keys of typehash are the allowed values at position 06
 306     ## of the leader of a MARC record, the values are the RIS types
 307     ## that might appropriately represent these types.
 308     my %ustypehash = (
 309                     "a" => "BOOK",
 310                     "c" => "MUSIC",
 311                     "d" => "MUSIC",
 312                     "e" => "MAP",
 313                     "f" => "MAP",
 314                     "g" => "ADVS",
 315                     "i" => "SOUND",
 316                     "j" => "SOUND",
 317                     "k" => "ART",
 318                     "m" => "DATA",
 319                     "o" => "GEN",
 320                     "p" => "GEN",
 321                     "r" => "ART",
 322                     "t" => "GEN",
 323                 );
 324
 325     my %unitypehash = (
 326                     "a" => "BOOK",
 327                     "b" => "BOOK",
 328                     "c" => "MUSIC",
 329                     "d" => "MUSIC",
 330                     "e" => "MAP",
 331                     "f" => "MAP",
 332                     "g" => "ADVS",
 333                     "i" => "SOUND",
 334                     "j" => "SOUND",
 335                     "k" => "ART",
 336                     "l" => "ELEC",
 337                     "m" => "ADVS",
 338                     "r" => "ART",
 339                 );
 340
 341     ## The type of a MARC record is found at position 06 of the leader
 342     my $typeofrecord = substr($leader, 6, 1);
 343
 344     ## ToDo: for books, field 008 positions 24-27 might have a few more
 345     ## hints
 346
 347     my %typehash;
 348
 349     ## the ukmarc here is just a guess
 350     if ($intype eq "marc21" || $intype eq "ukmarc") {
 351         %typehash = %ustypehash;
 352     }
 353     elsif ($intype eq "unimarc") {
 354         %typehash = %unitypehash;
 355     }
 356     else {
 357         ## assume MARC21 as default
 358         %typehash = %ustypehash;
 359     }
 360
 361     if (!exists $typehash{$typeofrecord}) {
 362         print "TY  - BOOK\r\n"; ## most reasonable default
 363         warn ("no type found - assume BOOK") if $marcprint;
 364     }
 365     else {
 366         print "TY  - $typehash{$typeofrecord}\r\n";
 367     }
 368
 369     ## use $typeofrecord as the return value, just in case
 370     $typeofrecord;
 371 }
 372
 373 ##********************************************************************
 374 ## normalize_author(): normalizes an authorname
 375 ## Arguments: authorname subfield a
 376 ##            authorname subfield b
 377 ##            authorname subfield c
 378 ##            name type if known: 0=direct order
 379 ##                               1=only surname or full name in
 380 ##                                 inverted order
 381 ##                               3=family, clan, dynasty name
 382 ## Returns: the normalized authorname
 383 ##********************************************************************
 384 sub normalize_author {
 385     my($rawauthora, $rawauthorb, $rawauthorc, $nametype) = @_;
 386
 387     if ($nametype == 0) {
 388         # ToDo: convert every input to Last[,(F.|First)[ (M.|Middle)[,Suffix]]]
 389         warn("name >>$rawauthora<< in direct order - leave as is") if $marcprint;
 390         return $rawauthora;
 391     }
 392     elsif ($nametype == 1) {
 393         ## start munging subfield a (the real name part)
 394         ## remove spaces after separators
 395         $rawauthora =~ s%([,.]+) *%$1%g;
 396
 397         ## remove trailing separators after spaces
 398         $rawauthora =~ s% *[,;:/]*$%%;
 399
 400         ## remove periods after a non-abbreviated name
 401         $rawauthora =~ s%(\w{2,})\.%$1%g;
 402
 403         ## start munging subfield b (something like the suffix)
 404         ## remove trailing separators after spaces
 405         $rawauthorb =~ s% *[,;:/]*$%%;
 406
 407         ## we currently ignore subfield c until someone complains
 408         if (length($rawauthorb) > 0) {
 409             return join ",", ($rawauthora, $rawauthorb);
 410         }
 411         else {
 412             return $rawauthora;
 413         }
 414     }
 415     elsif ($nametype == 3) {
 416         return $rawauthora;
 417     }
 418 }
 419
 420 ##********************************************************************
 421 ## get_author(): gets authorname info from MARC fields 100, 700
 422 ## Argument: field (100 or 700)
 423 ## Returns: an author string in the format found in the record
 424 ##********************************************************************
 425 sub get_author {
 426     my ($authorfield) = @_;
 427     my ($indicator);
 428
 429     ## the sequence of the name parts is encoded either in indicator
 430     ## 1 (marc21) or 2 (unimarc)
 431     if ($intype eq "unimarc") {
 432         $indicator = 2;
 433     }
 434     else { ## assume marc21
 435         $indicator = 1;
 436     }
 437
 438     print "<marc>:Author(Ind$indicator): ", $authorfield->indicator("$indicator"),"\r\n" if $marcprint;
 439     print "<marc>:Author(\$a): ", $authorfield->subfield('a'),"\r\n" if $marcprint;
 440     print "<marc>:Author(\$b): ", $authorfield->subfield('b'),"\r\n" if $marcprint;
 441     print "<marc>:Author(\$c): ", $authorfield->subfield('c'),"\r\n" if $marcprint;
 442     print "<marc>:Author(\$h): ", $authorfield->subfield('h'),"\r\n" if $marcprint;
 443     if ($intype eq "ukmarc") {
 444         my $authorname = $authorfield->subfield('a') . "," . $authorfield->subfield('h');
 445         normalize_author($authorname, $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 446     }
 447     else {
 448         normalize_author($authorfield->subfield('a'), $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
 449     }
 450 }
 451
 452 ##********************************************************************
 453 ## get_editor(): gets editor info from MARC fields 110, 111, 710, 711
 454 ## Argument: field (110, 111, 710, or 711)
 455 ## Returns: an author string in the format found in the record
 456 ##********************************************************************
 457 sub get_editor {
 458     my ($editorfield) = @_;
 459
 460     if (!$editorfield) {
 461         return;
 462     }
 463     else {
 464         print "<marc>Editor(\$a): ", $editorfield->subfield('a'),"\r\n" if $marcprint;
 465         print "<marc>Editor(\$b): ", $editorfield->subfield('b'),"\r\n" if $marcprint;
 466         print "<marc>editor(\$c): ", $editorfield->subfield('c'),"\r\n" if $marcprint;
 467         return $editorfield->subfield('a');
 468     }
 469 }
 470
 471 ##********************************************************************
 472 ## print_title(): gets info from MARC field 245
 473 ## Arguments: field (245)
 474 ## Returns:
 475 ##********************************************************************
 476 sub print_title {
 477     my ($titlefield) = @_;
 478     if (!$titlefield) {
 479         print "<marc>empty title field (245)\r\n" if $marcprint;
 480         warn("empty title field (245)") if $marcprint;
 481     }
 482     else {
 483         print "<marc>Title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 484         print "<marc>Title(\$b): ",$titlefield->subfield('b'),"\r\n" if $marcprint;
 485         print "<marc>Title(\$c): ",$titlefield->subfield('c'),"\r\n" if $marcprint;
 486
 487         ## The title is usually written in a very odd notation. The title
 488         ## proper ($a) often ends with a space followed by a separator like
 489         ## a slash or a colon. The subtitle ($b) doesn't start with a space
 490         ## so simple concatenation looks odd. We have to conditionally remove
 491         ## the separator and make sure there's a space between title and
 492         ## subtitle
 493
 494         my $clean_title = $titlefield->subfield('a');
 495
 496         my $clean_subtitle = $titlefield->subfield('b');
 497         $clean_title =~ s% *[/:;.]$%%;
 498         $clean_subtitle =~ s%^ *(.*) *[/:;.]$%$1%;
 499
 500         if (length($clean_title) > 0
 501             || (length($clean_subtitle) > 0 && $intype ne "unimarc")) {
 502             print "TI  - ", &charconv($clean_title);
 503
 504             ## subfield $b is relevant only for marc21/ukmarc
 505             if (length($clean_subtitle) > 0 && $intype ne "unimarc") {
 506                 print ": ",&charconv($clean_subtitle);
 507             }
 508             print "\r\n";
 509         }
 510
 511         ## The statement of responsibility is just this: horrors. There is
 512         ## no formal definition how authors, editors and the like should
 513         ## be written and designated. The field is free-form and resistant
 514         ## to all parsing efforts, so this information is lost on me
 515     }
 516 }
 517
 518 ##********************************************************************
 519 ## print_stitle(): prints info from series title field
 520 ## Arguments: field
 521 ## Returns:
 522 ##********************************************************************
 523 sub print_stitle {
 524     my ($titlefield) = @_;
 525
 526     if (!$titlefield) {
 527         print "<marc>empty series title field\r\n" if $marcprint;
 528     }
 529     else {
 530         print "<marc>Series title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
 531         my $clean_title = $titlefield->subfield('a');
 532
 533         $clean_title =~ s% *[/:;.]$%%;
 534
 535         if (length($clean_title) > 0) {
 536             print "T2  - ", &charconv($clean_title),"\r\n";
 537         }
 538
 539         if ($intype eq "unimarc") {
 540             print "<marc>Series vol(\$v): ",$titlefield->subfield('v'),"\r\n" if $marcprint;
 541             if (length($titlefield->subfield('v')) > 0) {
 542                 print "VL  - ", &charconv($titlefield->subfield('v')),"\r\n";
 543             }
 544         }
 545     }
 546 }
 547
 548 ##********************************************************************
 549 ## print_isbn(): gets info from MARC field 020
 550 ## Arguments: field (020)
 551 ##********************************************************************
 552 sub print_isbn {
 553     my($isbnfield) = @_;
 554
 555     if (!$isbnfield || length ($isbnfield->subfield('a')) == 0) {
 556         print "<marc>no isbn found (020\$a)\r\n" if $marcprint;
 557         warn("no isbn found") if $marcprint;
 558     }
 559     else {
 560         if (length ($isbnfield->subfield('a')) < 10) {
 561             print "<marc>truncated isbn (020\$a)\r\n" if $marcprint;
 562             warn("truncated isbn") if $marcprint;
 563         }
 564
 565         my $isbn = substr($isbnfield->subfield('a'), 0, 10);
 566         print "SN  - ", &charconv($isbn), "\r\n";
 567     }
 568 }
 569
 570 ##********************************************************************
 571 ## print_issn(): gets info from MARC field 022
 572 ## Arguments: field (022)
 573 ##********************************************************************
 574 sub print_issn {
 575     my($issnfield) = @_;
 576
 577     if (!$issnfield || length ($issnfield->subfield('a')) == 0) {
 578         print "<marc>no issn found (022\$a)\r\n" if $marcprint;
 579         warn("no issn found") if $marcprint;
 580     }
 581     else {
 582         if (length ($issnfield->subfield('a')) < 9) {
 583             print "<marc>truncated issn (022\$a)\r\n" if $marcprint;
 584             warn("truncated issn") if $marcprint;
 585         }
 586
 587         my $issn = substr($issnfield->subfield('a'), 0, 9);
 588         print "SN  - ", &charconv($issn), "\r\n";
 589     }
 590 }
 591
 592 ###
 593 # print_uri() prints info from 856 u
 594 ###
 595 sub print_uri {
 596     my @f856s = @_;
 597
 598     foreach my $f856 (@f856s) {
 599         if (my $uri = $f856->subfield('u')) {
 600                 print "UR  - ", charconv($uri), "\r\n";
 601         }
 602     }
 603 }
 604
 605 ##********************************************************************
 606 ## print_loc_callno(): gets info from MARC field 050
 607 ## Arguments: field (050)
 608 ##********************************************************************
 609 sub print_loc_callno {
 610     my($callnofield) = @_;
 611
 612     if (!$callnofield || length ($callnofield->subfield('a')) == 0) {
 613         print "<marc>no LOC call number found (050\$a)\r\n" if $marcprint;
 614         warn("no LOC call number found") if $marcprint;
 615     }
 616     else {
 617         print "AV  - ", &charconv($callnofield->subfield('a')), " ", &charconv($callnofield->subfield('b')), "\r\n";
 618     }
 619 }
 620
 621 ##********************************************************************
 622 ## print_dewey(): gets info from MARC field 082
 623 ## Arguments: field (082)
 624 ##********************************************************************
 625 sub print_dewey {
 626     my($deweyfield) = @_;
 627
 628     if (!$deweyfield || length ($deweyfield->subfield('a')) == 0) {
 629         print "<marc>no Dewey number found (082\$a)\r\n" if $marcprint;
 630         warn("no Dewey number found") if $marcprint;
 631     }
 632     else {
 633         print "U1  - ", &charconv($deweyfield->subfield('a')), " ", &charconv($deweyfield->subfield('2')), "\r\n";
 634     }
 635 }
 636
 637 ##********************************************************************
 638 ## print_pubinfo(): gets info from MARC field 260
 639 ## Arguments: field (260)
 640 ##********************************************************************
 641 sub print_pubinfo {
 642     my($pubinfofield) = @_;
 643
 644     if (!$pubinfofield) {
 645         print "<marc>no publication information found (260)\r\n" if $marcprint;
 646         warn("no publication information found") if $marcprint;
 647     }
 648     else {
 649         ## the following information is available in MARC21:
 650         ## $a place -> CY
 651         ## $b publisher -> PB
 652         ## $c date -> PY
 653         ## the corresponding subfields for UNIMARC:
 654         ## $a place -> CY
 655         ## $c publisher -> PB
 656         ## $d date -> PY
 657
 658         ## all of them are repeatable. We pool all places into a
 659         ## comma-separated list in CY. We also pool all publishers
 660         ## into a comma-separated list in PB.  We break the rule with
 661         ## the date field because this wouldn't make much sense. In
 662         ## this case, we use the first occurrence for PY, the second
 663         ## for Y2, and ignore the rest
 664
 665         my @pubsubfields = $pubinfofield->subfields();
 666         my @cities;
 667         my @publishers;
 668         my $pycounter = 0;
 669
 670         my $pubsub_place;
 671         my $pubsub_publisher;
 672         my $pubsub_date;
 673
 674         if ($intype eq "unimarc") {
 675             $pubsub_place = "a";
 676             $pubsub_publisher = "c";
 677             $pubsub_date = "d";
 678         }
 679         else { ## assume marc21
 680             $pubsub_place = "a";
 681             $pubsub_publisher = "b";
 682             $pubsub_date = "c";
 683         }
 684
 685         ## loop over all subfield list entries
 686         for my $tuple (@pubsubfields) {
 687             ## each tuple consists of the subfield code and the value
 688             if (@$tuple[0] eq $pubsub_place) {
 689                 ## strip any trailing crap
 690                 $_ = @$tuple[1];
 691                 s% *[,;:/]$%%;
 692                 ## pool all occurrences in a list
 693                 push (@cities, $_);
 694             }
 695             elsif (@$tuple[0] eq $pubsub_publisher) {
 696                 ## strip any trailing crap
 697                 $_ = @$tuple[1];
 698                 s% *[,;:/]$%%;
 699                 ## pool all occurrences in a list
 700                 push (@publishers, $_);
 701             }
 702             elsif (@$tuple[0] eq $pubsub_date) {
 703                 ## the dates are free-form, so we want to extract
 704                 ## a four-digit year and leave the rest as
 705                 ## "other info"
 706                 $protoyear = @$tuple[1];
 707                 print "<marc>Year (260\$c): $protoyear\r\n" if $marcprint;
 708
 709                 ## strip any separator chars at the end
 710                 $protoyear =~ s% *[\.;:/]*$%%;
 711
 712                 ## isolate a four-digit year. We discard anything
 713                 ## preceeding the year, but keep everything after
 714                 ## the year as other info.
 715                 $protoyear =~ s%\D*([0-9\-]{4})(.*)%$1///$2%;
 716
 717                 ## check what we've got. If there is no four-digit
 718                 ## year, make it up. If digits are replaced by '-',
 719                 ## replace those with 0s
 720
 721                 if (index($protoyear, "/") == 4) {
 722                     ## have year info
 723                     ## replace all '-' in the four-digit year
 724                     ## by '0'
 725                     substr($protoyear,0,4) =~ s!-!0!g;
 726                 }
 727                 else {
 728                     ## have no year info
 729                     print "<marc>no four-digit year found, use 0000\r\n" if $marcprint;
 730                     $protoyear = "0000///$protoyear";
 731                     warn("no four-digit year found, use 0000") if $marcprint;
 732                 }
 733
 734                 if ($pycounter == 0 && length($protoyear)) {
 735                     print "PY  - $protoyear\r\n";
 736                 }
 737                 elsif ($pycounter == 1 && length($_)) {
 738                     print "Y2  - $protoyear\r\n";
 739                 }
 740                 ## else: discard
 741             }
 742             ## else: discard
 743         }
 744
 745         ## now dump the collected CY and PB lists
 746         if (@cities > 0) {
 747             print "CY  - ", &charconv(join(", ", @cities)), "\r\n";
 748         }
 749         if (@publishers > 0) {
 750             print "PB  - ", &charconv(join(", ", @publishers)), "\r\n";
 751         }
 752     }
 753 }
 754
 755 ##********************************************************************
 756 ## get_keywords(): prints info from MARC fields 6XX
 757 ## Arguments: list of fields (6XX)
 758 ##********************************************************************
 759 sub get_keywords {
 760     my($href, $fieldname, @keywords) = @_;
 761
 762     ## a list of all possible subfields
 763     my @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '2', '3', '4');
 764
 765     ## loop over all 6XX fields
 766     foreach $kwfield (@keywords) {
 767         if ($kwfield != undef) {
 768             ## authornames get special treatment
 769             if ($fieldname eq "600") {
 770                 my $val = normalize_author($kwfield->subfield('a'), $kwfield->subfield('b'), $kwfield->subfield('c'), $kwfield->indicator('1'));
 771                 ${$href}{$val} += 1;
 772                 print "<marc>Field $kwfield subfield a:", $kwfield->subfield('a'), "\r\n<marc>Field $kwfield subfield b:", $kwfield->subfield('b'), "\r\n<marc>Field $kwfield subfield c:", $kwfield->subfield('c'), "\r\n" if $marcprint;
 773             }
 774             else {
 775                 ## retrieve all available subfields
 776                 @kwsubfields = $kwfield->subfields();
 777
 778                 ## loop over all available subfield tuples
 779                 foreach $kwtuple (@kwsubfields) {
 780                     ## loop over all subfields to check
 781                     foreach $subfield (@subfields) {
 782                         ## [0] contains subfield code
 783                         if (@$kwtuple[0] eq $subfield) {
 784                             ## [1] contains value, remove trailing separators
 785                             @$kwtuple[1] =~ s% *[,;.:/]*$%%;
 786                             if (length(@$kwtuple[1]) > 0) {
 787                                 ## add to hash
 788                                 ${$href}{@$kwtuple[1]} += 1;
 789                                 print "<marc>Field $fieldname subfield $subfield:", @$kwtuple[1], "\r\n" if $marcprint;
 790                             }
 791                             ## we can leave the subfields loop here
 792                             last;
 793                         }
 794                     }
 795                 }
 796             }
 797         }
 798     }
 799 }
 800
 801 ##********************************************************************
 802 ## pool_subx(): adds contents of several subfields to a list
 803 ## Arguments: reference to a list
 804 ##            field name
 805 ##            list of fields (5XX)
 806 ##********************************************************************
 807 sub pool_subx {
 808     my($aref, $fieldname, @notefields) = @_;
 809
 810     ## we use a list that contains the interesting subfields
 811     ## for each field
 812     # ToDo: this is apparently correct only for marc21
 813     my @subfields;
 814
 815     if ($fieldname eq "500") {
 816         @subfields = ('a');
 817     }
 818     elsif ($fieldname eq "501") {
 819         @subfields = ('a');
 820     }
 821     elsif ($fieldname eq "502") {
 822         @subfields = ('a');
 823             }
 824     elsif ($fieldname eq "504") {
 825         @subfields = ('a', 'b');
 826     }
 827     elsif ($fieldname eq "505") {
 828         @subfields = ('a', 'g', 'r', 't', 'u');
 829     }
 830     elsif ($fieldname eq "506") {
 831         @subfields = ('a', 'b', 'c', 'd', 'e');
 832     }
 833     elsif ($fieldname eq "507") {
 834         @subfields = ('a', 'b');
 835     }
 836     elsif ($fieldname eq "508") {
 837         @subfields = ('a');
 838     }
 839     elsif ($fieldname eq "510") {
 840         @subfields = ('a', 'b', 'c', 'x', '3');
 841     }
 842     elsif ($fieldname eq "511") {
 843         @subfields = ('a');
 844     }
 845     elsif ($fieldname eq "513") {
 846         @subfields = ('a', 'b');
 847     }
 848     elsif ($fieldname eq "514") {
 849         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'u', 'z');
 850     }
 851     elsif ($fieldname eq "515") {
 852         @subfields = ('a');
 853     }
 854     elsif ($fieldname eq "516") {
 855         @subfields = ('a');
 856     }
 857     elsif ($fieldname eq "518") {
 858         @subfields = ('a', '3');
 859     }
 860     elsif ($fieldname eq "521") {
 861         @subfields = ('a', 'b', '3');
 862     }
 863     elsif ($fieldname eq "522") {
 864         @subfields = ('a');
 865     }
 866     elsif ($fieldname eq "524") {
 867         @subfields = ('a', '2', '3');
 868     }
 869     elsif ($fieldname eq "525") {
 870         @subfields = ('a');
 871     }
 872     elsif ($fieldname eq "526") {
 873         @subfields = ('a', 'b', 'c', 'd', 'i', 'x', 'z', '5');
 874     }
 875     elsif ($fieldname eq "530") {
 876         @subfields = ('a', 'b', 'c', 'd', 'u', '3');
 877     }
 878     elsif ($fieldname eq "533") {
 879         @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'm', 'n', '3');
 880     }
 881     elsif ($fieldname eq "534") {
 882         @subfields = ('a', 'b', 'c', 'e', 'f', 'k', 'l', 'm', 'n', 'p', 't', 'x', 'z');
 883     }
 884     elsif ($fieldname eq "535") {
 885         @subfields = ('a', 'b', 'c', 'd', 'g', '3');
 886     }
 887
 888     ## loop over all notefields
 889     foreach $notefield (@notefields) {
 890         if ($notefield != undef) {
 891             ## retrieve all available subfield tuples
 892             @notesubfields = $notefield->subfields();
 893
 894             ## loop over all subfield tuples
 895             foreach $notetuple (@notesubfields) {
 896                 ## loop over all subfields to check
 897                 foreach $subfield (@subfields) {
 898                     ## [0] contains subfield code
 899                     if (@$notetuple[0] eq $subfield) {
 900                         ## [1] contains value, remove trailing separators
 901                         print "<marc>field $fieldname subfield $subfield: ", @$notetuple[1], "\r\n" if $marcprint;
 902                         @$notetuple[1] =~ s% *[,;.:/]*$%%;
 903                         if (length(@$notetuple[1]) > 0) {
 904                             ## add to list
 905                             push @{$aref}, @$notetuple[1];
 906                         }
 907                         last;
 908                     }
 909                 }
 910             }
 911         }
 912     }
 913 }
 914
 915 ##********************************************************************
 916 ## print_abstract(): prints abstract fields
 917 ## Arguments: list of fields (520)
 918 ##********************************************************************
 919 sub print_abstract {
 920     # ToDo: take care of repeatable subfields
 921     my(@abfields) = @_;
 922
 923     ## we check the following subfields
 924     my @subfields = ('a', 'b');
 925
 926     ## we generate a list for all useful strings
 927     my @abstrings;
 928
 929     ## loop over all abfields
 930     foreach $abfield (@abfields) {
 931         foreach $field (@subfields) {
 932             if (length ($abfield->subfield($field)) > 0) {
 933                 my $ab = $abfield->subfield($field);
 934
 935                 print "<marc>field 520 subfield $field: $ab\r\n" if $marcprint;
 936
 937                 ## strip trailing separators
 938                 $ab =~ s% *[;,:./]*$%%;
 939
 940                 ## add string to the list
 941                 push (@abstrings, $ab);
 942             }
 943         }
 944     }
 945
 946     my $allabs = join "; ", @abstrings;
 947
 948     if (length($allabs) > 0) {
 949         print "N2  - ", &charconv($allabs), "\r\n";
 950     }
 951
 952 }
 953
 954
 955
 956 ##********************************************************************
 957 ## charconv(): converts to a different charset based on a global var
 958 ## Arguments: string
 959 ## Returns: string
 960 ##********************************************************************
 961 sub charconv {
 962     if ($utf) {
 963         ## return unaltered if already utf-8
 964         return @_;
 965     }
 966     elsif ($uniout eq "t") {
 967         ## convert to utf-8
 968         return marc8_to_utf8("@_");
 969     }
 970     else {
 971         ## return unaltered if no utf-8 requested
 972         return @_;
 973     }
 974 }
 975 1;