Bug 14832: Fix encoding issues when exporting in RIS
[koha.git] / C4 / Ris.pm
blob4b7a869a813a02b426b797e1f6791c1a19c4b948
1 package C4::Ris;
3 # Original script :
4 ## marc2ris: converts MARC21 and UNIMARC datasets to RIS format
5 ## See comments below for compliance with other MARC dialects
6 ##
7 ## usage: perl marc2ris < infile.marc > outfile.ris
8 ##
9 ## Dependencies: perl 5.6.0 or later
10 ## MARC::Record
11 ## MARC::Charset
13 ## markus@mhoenicka.de 2002-11-16
15 ## This program is free software; you can redistribute it and/or modify
16 ## it under the terms of the GNU General Public License as published by
17 ## the Free Software Foundation; either version 2 of the License, or
18 ## (at your option) any later version.
19 ##
20 ## This program is distributed in the hope that it will be useful,
21 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
22 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 ## GNU General Public License for more details.
25 ## You should have received a copy of the GNU General Public License
26 ## along with this program; if not, write to the Free Software
27 ## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 ## Some background about MARC as understood by this script
30 ## The default input format used in this script is MARC21, which
31 ## superseded USMARC and CANMARC. The specification can be found at:
32 ## http://lcweb.loc.gov/marc/
33 ## UNIMARC follows the specification at:
34 ## http://www.ifla.org/VI/3/p1996-1/sec-uni.htm
35 ## UKMARC support is a bit shaky because there is no specification available
36 ## for free. The wisdom used in this script was taken from a PDF document
37 ## comparing UKMARC to MARC21 found at:
38 ## www.bl.uk/services/bibliographic/marcchange.pdf
41 # Modified 2008 by BibLibre for Koha
42 # Modified 2011 by Catalyst
43 # Modified 2011 by Equinox Software, Inc.
45 # This file is part of Koha.
47 # Koha is free software; you can redistribute it and/or modify it
48 # under the terms of the GNU General Public License as published by
49 # the Free Software Foundation; either version 3 of the License, or
50 # (at your option) any later version.
52 # Koha is distributed in the hope that it will be useful, but
53 # WITHOUT ANY WARRANTY; without even the implied warranty of
54 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
55 # GNU General Public License for more details.
57 # You should have received a copy of the GNU General Public License
58 # along with Koha; if not, see <http://www.gnu.org/licenses>.
62 #use strict;
63 #use warnings;
65 use List::MoreUtils qw/uniq/;
66 use vars qw($VERSION @ISA @EXPORT);
68 # set the version for version checking
69 $VERSION = 3.07.00.049;
71 @ISA = qw(Exporter);
73 # only export API methods
75 @EXPORT = qw(
76 &marc2ris
80 =head1 marc2bibtex - Convert from UNIMARC to RIS
82 my ($ris) = marc2ris($record);
84 Returns a RIS scalar
86 C<$record> - a MARC::Record object
88 =cut
90 sub marc2ris {
91 my ($record) = @_;
92 my $output;
94 my $marcflavour = C4::Context->preference("marcflavour");
95 $intype = lc($marcflavour);
96 my $marcprint = 0; # Debug flag;
98 # Let's redirect stdout
99 open my $oldout, ">&STDOUT";
100 my $outvar;
101 close STDOUT;
102 open STDOUT,'>:encoding(utf8)', \$outvar;
105 ## First we should check the character encoding. This may be
106 ## MARC-8 or UTF-8. The former is indicated by a blank, the latter
107 ## by 'a' at position 09 (zero-based) of the leader
108 my $leader = $record->leader();
109 if ($intype eq "marc21") {
110 if ($leader =~ /^.{9}a/) {
111 print "<marc>---\r\n<marc>UTF-8 data\r\n" if $marcprint;
112 $utf = 1;
114 else {
115 print "<marc>---\r\n<marc>MARC-8 data\r\n" if $marcprint;
118 ## else: other MARC formats do not specify the character encoding
119 ## we assume it's *not* UTF-8
121 ## start RIS dataset
122 &print_typetag($leader);
124 ## retrieve all author fields and collect them in a list
125 my @author_fields;
127 if ($intype eq "unimarc") {
128 ## Fields 700, 701, and 702 can contain author names
129 @author_fields = ($record->field('700'), $record->field('701'), $record->field('702'));
131 else { ## marc21, ukmarc
132 ## Field 100 sometimes carries main author
133 ## Field(s) 700 carry added entries - personal names
134 @author_fields = ($record->field('100'), $record->field('700'));
137 ## loop over all author fields
138 foreach my $field (@author_fields) {
139 if (length($field)) {
140 my $author = &get_author($field);
141 print "AU - ",&charconv($author),"\r\n";
145 # ToDo: should we specify anonymous as author if we didn't find
146 # one? or use one of the corporate/meeting names below?
148 ## add corporate names or meeting names as editors ??
149 my @editor_fields;
151 if ($intype eq "unimarc") {
152 ## Fields 710, 711, and 712 can carry corporate names
153 ## Field(s) 720, 721, 722, 730 have additional candidates
154 @editor_fields = ($record->field('710'), $record->field('711'), $record->field('712'), $record->field('720'), $record->field('721'), $record->field('722'), $record->field('730'));
156 else { ## marc21, ukmarc
157 ## Fields 110 and 111 carry the main entries - corporate name and
158 ## meeting name, respectively
159 ## Field(s) 710, 711 carry added entries - personal names
160 @editor_fields = ($record->field('110'), $record->field('111'), $record->field('710'), $record->field('711'));
163 ## loop over all editor fields
164 foreach my $field (@editor_fields) {
165 if (length($field)) {
166 my $editor = &get_editor($field);
167 print "ED - ",&charconv($editor),"\r\n";
171 ## get info from the title field
172 if ($intype eq "unimarc") {
173 &print_title($record->field('200'));
175 else { ## marc21, ukmarc
176 &print_title($record->field('245'));
179 ## series title
180 if ($intype eq "unimarc") {
181 &print_stitle($record->field('225'));
183 else { ## marc21, ukmarc
184 &print_stitle($record->field('490'));
187 ## ISBN/ISSN
188 if ($intype eq "unimarc") {
189 &print_isbn($record->field('010'));
190 &print_issn($record->field('011'));
192 elsif ($intype eq "ukmarc") {
193 &print_isbn($record->field('021'));
194 ## this is just an assumption
195 &print_issn($record->field('022'));
197 else { ## assume marc21
198 &print_isbn($record->field('020'));
199 &print_issn($record->field('022'));
202 if ($intype eq "marc21") {
203 &print_loc_callno($record->field('050'));
204 &print_dewey($record->field('082'));
206 ## else: unimarc, ukmarc do not seem to store call numbers?
208 ## publication info
209 if ($intype eq "unimarc") {
210 &print_pubinfo($record->field('210'));
212 else { ## marc21, ukmarc
213 if ($record->field('264')) {
214 &print_pubinfo($record->field('264'));
216 else {
217 &print_pubinfo($record->field('260'));
221 ## 6XX fields contain KW candidates. We add all of them to a
223 my @field_list;
224 if ($intype eq "unimarc") {
225 @field_list = ('600', '601', '602', '604', '605', '606','607', '608', '610', '615', '620', '660', '661', '670', '675', '676', '680', '686');
226 } elsif ($intype eq "ukmarc") {
227 @field_list = ('600', '610', '611', '630', '650', '651','653', '655', '660', '661', '668', '690', '691', '692', '695');
228 } else { ## assume marc21
229 @field_list = ('600', '610', '611', '630', '650', '651','653', '654', '655', '656', '657', '658');
232 my @kwpool;
233 for my $f ( @field_list ) {
234 my @fields = $record->field($f);
235 push @kwpool, ( get_keywords("$f",$record->field($f)) );
238 # Remove duplicate
239 @kwpool = uniq @kwpool;
241 for my $kw ( @kwpool ) {
242 print "KW - ", &charconv($kw), "\r\n";
245 ## 5XX have various candidates for notes and abstracts. We pool
246 ## all notes-like stuff in one list.
247 my @notepool;
249 ## these fields have notes candidates
250 if ($intype eq "unimarc") {
251 foreach ('300', '301', '302', '303', '304', '305', '306', '307', '308', '310', '311', '312', '313', '314', '315', '316', '317', '318', '320', '321', '322', '323', '324', '325', '326', '327', '328', '332', '333', '336', '337', '345') {
252 &pool_subx(\@notepool, $_, $record->field($_));
255 elsif ($intype eq "ukmarc") {
256 foreach ('500', '501', '502', '503', '504', '505', '506', '508', '514', '515', '516', '521', '524', '525', '528', '530', '531', '532', '533', '534', '535', '537', '538', '540', '541', '542', '544', '554', '555', '556', '557', '561', '563', '580', '583', '584', '586') {
257 &pool_subx(\@notepool, $_, $record->field($_));
260 else { ## assume marc21
261 foreach ('500', '501', '502', '504', '505', '506', '507', '508', '510', '511', '513', '514', '515', '516', '518', '521', '522', '524', '525', '526', '530', '533', '534', '535') {
262 &pool_subx(\@notepool, $_, $record->field($_));
266 my $allnotes = join "; ", @notepool;
268 if (length($allnotes) > 0) {
269 print "N1 - ", &charconv($allnotes), "\r\n";
272 ## 320/520 have the abstract
273 if ($intype eq "unimarc") {
274 &print_abstract($record->field('320'));
276 elsif ($intype eq "ukmarc") {
277 &print_abstract($record->field('512'), $record->field('513'));
279 else { ## assume marc21
280 &print_abstract($record->field('520'));
283 # 856u has the URI
284 if ($record->field('856')) {
285 print_uri($record->field('856'));
288 ## end RIS dataset
289 print "ER - \r\n";
291 # Let's re-redirect stdout
292 close STDOUT;
293 open STDOUT, ">&", $oldout;
295 return $outvar;
300 ##********************************************************************
301 ## print_typetag(): prints the first line of a RIS dataset including
302 ## the preceeding newline
303 ## Argument: the leader of a MARC dataset
304 ## Returns: the value at leader position 06
305 ##********************************************************************
306 sub print_typetag {
307 my ($leader)= @_;
308 ## the keys of typehash are the allowed values at position 06
309 ## of the leader of a MARC record, the values are the RIS types
310 ## that might appropriately represent these types.
311 my %ustypehash = (
312 "a" => "BOOK",
313 "c" => "MUSIC",
314 "d" => "MUSIC",
315 "e" => "MAP",
316 "f" => "MAP",
317 "g" => "ADVS",
318 "i" => "SOUND",
319 "j" => "SOUND",
320 "k" => "ART",
321 "m" => "DATA",
322 "o" => "GEN",
323 "p" => "GEN",
324 "r" => "ART",
325 "t" => "GEN",
328 my %unitypehash = (
329 "a" => "BOOK",
330 "b" => "BOOK",
331 "c" => "MUSIC",
332 "d" => "MUSIC",
333 "e" => "MAP",
334 "f" => "MAP",
335 "g" => "ADVS",
336 "i" => "SOUND",
337 "j" => "SOUND",
338 "k" => "ART",
339 "l" => "ELEC",
340 "m" => "ADVS",
341 "r" => "ART",
344 ## The type of a MARC record is found at position 06 of the leader
345 my $typeofrecord = substr($leader, 6, 1);
347 ## ToDo: for books, field 008 positions 24-27 might have a few more
348 ## hints
350 my %typehash;
352 ## the ukmarc here is just a guess
353 if ($intype eq "marc21" || $intype eq "ukmarc") {
354 %typehash = %ustypehash;
356 elsif ($intype eq "unimarc") {
357 %typehash = %unitypehash;
359 else {
360 ## assume MARC21 as default
361 %typehash = %ustypehash;
364 if (!exists $typehash{$typeofrecord}) {
365 print "TY - BOOK\r\n"; ## most reasonable default
366 warn ("no type found - assume BOOK") if $marcprint;
368 else {
369 print "TY - $typehash{$typeofrecord}\r\n";
372 ## use $typeofrecord as the return value, just in case
373 $typeofrecord;
376 ##********************************************************************
377 ## normalize_author(): normalizes an authorname
378 ## Arguments: authorname subfield a
379 ## authorname subfield b
380 ## authorname subfield c
381 ## name type if known: 0=direct order
382 ## 1=only surname or full name in
383 ## inverted order
384 ## 3=family, clan, dynasty name
385 ## Returns: the normalized authorname
386 ##********************************************************************
387 sub normalize_author {
388 my($rawauthora, $rawauthorb, $rawauthorc, $nametype) = @_;
390 if ($nametype == 0) {
391 # ToDo: convert every input to Last[,(F.|First)[ (M.|Middle)[,Suffix]]]
392 warn("name >>$rawauthora<< in direct order - leave as is") if $marcprint;
393 return $rawauthora;
395 elsif ($nametype == 1) {
396 ## start munging subfield a (the real name part)
397 ## remove spaces after separators
398 $rawauthora =~ s%([,.]+) *%$1%g;
400 ## remove trailing separators after spaces
401 $rawauthora =~ s% *[,;:/]*$%%;
403 ## remove periods after a non-abbreviated name
404 $rawauthora =~ s%(\w{2,})\.%$1%g;
406 ## start munging subfield b (something like the suffix)
407 ## remove trailing separators after spaces
408 $rawauthorb =~ s% *[,;:/]*$%%;
410 ## we currently ignore subfield c until someone complains
411 if (length($rawauthorb) > 0) {
412 return join ", ", ($rawauthora, $rawauthorb);
414 else {
415 return $rawauthora;
418 elsif ($nametype == 3) {
419 return $rawauthora;
423 ##********************************************************************
424 ## get_author(): gets authorname info from MARC fields 100, 700
425 ## Argument: field (100 or 700)
426 ## Returns: an author string in the format found in the record
427 ##********************************************************************
428 sub get_author {
429 my ($authorfield) = @_;
430 my ($indicator);
432 ## the sequence of the name parts is encoded either in indicator
433 ## 1 (marc21) or 2 (unimarc)
434 if ($intype eq "unimarc") {
435 $indicator = 2;
437 else { ## assume marc21
438 $indicator = 1;
441 print "<marc>:Author(Ind$indicator): ", $authorfield->indicator("$indicator"),"\r\n" if $marcprint;
442 print "<marc>:Author(\$a): ", $authorfield->subfield('a'),"\r\n" if $marcprint;
443 print "<marc>:Author(\$b): ", $authorfield->subfield('b'),"\r\n" if $marcprint;
444 print "<marc>:Author(\$c): ", $authorfield->subfield('c'),"\r\n" if $marcprint;
445 print "<marc>:Author(\$h): ", $authorfield->subfield('h'),"\r\n" if $marcprint;
446 if ($intype eq "ukmarc") {
447 my $authorname = $authorfield->subfield('a') . "," . $authorfield->subfield('h');
448 normalize_author($authorname, $authorfield->subfield('b'), $authorfield->subfield('c'), $authorfield->indicator("$indicator"));
450 else {
451 normalize_author($authorfield->subfield('a') // '', $authorfield->subfield('b') // '', $authorfield->subfield('c') // '', $authorfield->indicator("$indicator"));
455 ##********************************************************************
456 ## get_editor(): gets editor info from MARC fields 110, 111, 710, 711
457 ## Argument: field (110, 111, 710, or 711)
458 ## Returns: an author string in the format found in the record
459 ##********************************************************************
460 sub get_editor {
461 my ($editorfield) = @_;
463 if (!$editorfield) {
464 return;
466 else {
467 print "<marc>Editor(\$a): ", $editorfield->subfield('a'),"\r\n" if $marcprint;
468 print "<marc>Editor(\$b): ", $editorfield->subfield('b'),"\r\n" if $marcprint;
469 print "<marc>editor(\$c): ", $editorfield->subfield('c'),"\r\n" if $marcprint;
470 return $editorfield->subfield('a');
474 ##********************************************************************
475 ## print_title(): gets info from MARC field 245
476 ## Arguments: field (245)
477 ## Returns:
478 ##********************************************************************
479 sub print_title {
480 my ($titlefield) = @_;
481 if (!$titlefield) {
482 print "<marc>empty title field (245)\r\n" if $marcprint;
483 warn("empty title field (245)") if $marcprint;
485 else {
486 print "<marc>Title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
487 print "<marc>Title(\$b): ",$titlefield->subfield('b'),"\r\n" if $marcprint;
488 print "<marc>Title(\$c): ",$titlefield->subfield('c'),"\r\n" if $marcprint;
490 ## The title is usually written in a very odd notation. The title
491 ## proper ($a) often ends with a space followed by a separator like
492 ## a slash or a colon. The subtitle ($b) doesn't start with a space
493 ## so simple concatenation looks odd. We have to conditionally remove
494 ## the separator and make sure there's a space between title and
495 ## subtitle
497 my $clean_title = $titlefield->subfield('a');
499 my $clean_subtitle = $titlefield->subfield('b');
500 $clean_title =~ s% *[/:;.]$%%;
501 $clean_subtitle =~ s%^ *(.*) *[/:;.]$%$1%;
503 if (length($clean_title) > 0
504 || (length($clean_subtitle) > 0 && $intype ne "unimarc")) {
505 print "TI - ", &charconv($clean_title);
507 ## subfield $b is relevant only for marc21/ukmarc
508 if (length($clean_subtitle) > 0 && $intype ne "unimarc") {
509 print ": ",&charconv($clean_subtitle);
511 print "\r\n";
514 ## The statement of responsibility is just this: horrors. There is
515 ## no formal definition how authors, editors and the like should
516 ## be written and designated. The field is free-form and resistant
517 ## to all parsing efforts, so this information is lost on me
521 ##********************************************************************
522 ## print_stitle(): prints info from series title field
523 ## Arguments: field
524 ## Returns:
525 ##********************************************************************
526 sub print_stitle {
527 my ($titlefield) = @_;
529 if (!$titlefield) {
530 print "<marc>empty series title field\r\n" if $marcprint;
532 else {
533 print "<marc>Series title(\$a): ",$titlefield->subfield('a'),"\r\n" if $marcprint;
534 my $clean_title = $titlefield->subfield('a');
536 $clean_title =~ s% *[/:;.]$%%;
538 if (length($clean_title) > 0) {
539 print "T2 - ", &charconv($clean_title),"\r\n";
542 if ($intype eq "unimarc") {
543 print "<marc>Series vol(\$v): ",$titlefield->subfield('v'),"\r\n" if $marcprint;
544 if (length($titlefield->subfield('v')) > 0) {
545 print "VL - ", &charconv($titlefield->subfield('v')),"\r\n";
551 ##********************************************************************
552 ## print_isbn(): gets info from MARC field 020
553 ## Arguments: field (020)
554 ##********************************************************************
555 sub print_isbn {
556 my($isbnfield) = @_;
558 if (!$isbnfield || length ($isbnfield->subfield('a')) == 0) {
559 print "<marc>no isbn found (020\$a)\r\n" if $marcprint;
560 warn("no isbn found") if $marcprint;
562 else {
563 if (length ($isbnfield->subfield('a')) < 10) {
564 print "<marc>truncated isbn (020\$a)\r\n" if $marcprint;
565 warn("truncated isbn") if $marcprint;
568 my $isbn = substr($isbnfield->subfield('a'), 0, 10);
569 print "SN - ", &charconv($isbn), "\r\n";
573 ##********************************************************************
574 ## print_issn(): gets info from MARC field 022
575 ## Arguments: field (022)
576 ##********************************************************************
577 sub print_issn {
578 my($issnfield) = @_;
580 if (!$issnfield || length ($issnfield->subfield('a')) == 0) {
581 print "<marc>no issn found (022\$a)\r\n" if $marcprint;
582 warn("no issn found") if $marcprint;
584 else {
585 if (length ($issnfield->subfield('a')) < 9) {
586 print "<marc>truncated issn (022\$a)\r\n" if $marcprint;
587 warn("truncated issn") if $marcprint;
590 my $issn = substr($issnfield->subfield('a'), 0, 9);
591 print "SN - ", &charconv($issn), "\r\n";
596 # print_uri() prints info from 856 u
598 sub print_uri {
599 my @f856s = @_;
601 foreach my $f856 (@f856s) {
602 if (my $uri = $f856->subfield('u')) {
603 print "UR - ", charconv($uri), "\r\n";
608 ##********************************************************************
609 ## print_loc_callno(): gets info from MARC field 050
610 ## Arguments: field (050)
611 ##********************************************************************
612 sub print_loc_callno {
613 my($callnofield) = @_;
615 if (!$callnofield || length ($callnofield->subfield('a')) == 0) {
616 print "<marc>no LOC call number found (050\$a)\r\n" if $marcprint;
617 warn("no LOC call number found") if $marcprint;
619 else {
620 print "AV - ", &charconv($callnofield->subfield('a')), " ", &charconv($callnofield->subfield('b')), "\r\n";
624 ##********************************************************************
625 ## print_dewey(): gets info from MARC field 082
626 ## Arguments: field (082)
627 ##********************************************************************
628 sub print_dewey {
629 my($deweyfield) = @_;
631 if (!$deweyfield || length ($deweyfield->subfield('a')) == 0) {
632 print "<marc>no Dewey number found (082\$a)\r\n" if $marcprint;
633 warn("no Dewey number found") if $marcprint;
635 else {
636 print "U1 - ", &charconv($deweyfield->subfield('a')), " ", &charconv($deweyfield->subfield('2')), "\r\n";
640 ##********************************************************************
641 ## print_pubinfo(): gets info from MARC field 260
642 ## Arguments: field (260)
643 ##********************************************************************
644 sub print_pubinfo {
645 my($pubinfofield) = @_;
647 if (!$pubinfofield) {
648 print "<marc>no publication information found (260/264)\r\n" if $marcprint;
649 warn("no publication information found") if $marcprint;
651 else {
652 ## the following information is available in MARC21:
653 ## $a place -> CY
654 ## $b publisher -> PB
655 ## $c date -> PY
656 ## the corresponding subfields for UNIMARC:
657 ## $a place -> CY
658 ## $c publisher -> PB
659 ## $d date -> PY
661 ## all of them are repeatable. We pool all places into a
662 ## comma-separated list in CY. We also pool all publishers
663 ## into a comma-separated list in PB. We break the rule with
664 ## the date field because this wouldn't make much sense. In
665 ## this case, we use the first occurrence for PY, the second
666 ## for Y2, and ignore the rest
668 my @pubsubfields = $pubinfofield->subfields();
669 my @cities;
670 my @publishers;
671 my $pycounter = 0;
673 my $pubsub_place;
674 my $pubsub_publisher;
675 my $pubsub_date;
677 if ($intype eq "unimarc") {
678 $pubsub_place = "a";
679 $pubsub_publisher = "c";
680 $pubsub_date = "d";
682 else { ## assume marc21
683 $pubsub_place = "a";
684 $pubsub_publisher = "b";
685 $pubsub_date = "c";
688 ## loop over all subfield list entries
689 for my $tuple (@pubsubfields) {
690 ## each tuple consists of the subfield code and the value
691 if (@$tuple[0] eq $pubsub_place) {
692 ## strip any trailing crap
693 $_ = @$tuple[1];
694 s% *[,;:/]$%%;
695 ## pool all occurrences in a list
696 push (@cities, $_);
698 elsif (@$tuple[0] eq $pubsub_publisher) {
699 ## strip any trailing crap
700 $_ = @$tuple[1];
701 s% *[,;:/]$%%;
702 ## pool all occurrences in a list
703 push (@publishers, $_);
705 elsif (@$tuple[0] eq $pubsub_date) {
706 ## the dates are free-form, so we want to extract
707 ## a four-digit year and leave the rest as
708 ## "other info"
709 $protoyear = @$tuple[1];
710 print "<marc>Year (260\$c): $protoyear\r\n" if $marcprint;
712 ## strip any separator chars at the end
713 $protoyear =~ s% *[\.;:/]*$%%;
715 ## isolate a four-digit year. We discard anything
716 ## preceeding the year, but keep everything after
717 ## the year as other info.
718 $protoyear =~ s%\D*([0-9\-]{4})(.*)%$1///$2%;
720 ## check what we've got. If there is no four-digit
721 ## year, make it up. If digits are replaced by '-',
722 ## replace those with 0s
724 if (index($protoyear, "/") == 4) {
725 ## have year info
726 ## replace all '-' in the four-digit year
727 ## by '0'
728 substr($protoyear,0,4) =~ s!-!0!g;
730 else {
731 ## have no year info
732 print "<marc>no four-digit year found, use 0000\r\n" if $marcprint;
733 $protoyear = "0000///$protoyear";
734 warn("no four-digit year found, use 0000") if $marcprint;
737 if ($pycounter == 0 && length($protoyear)) {
738 print "PY - $protoyear\r\n";
740 elsif ($pycounter == 1 && length($_)) {
741 print "Y2 - $protoyear\r\n";
743 ## else: discard
745 ## else: discard
748 ## now dump the collected CY and PB lists
749 if (@cities > 0) {
750 print "CY - ", &charconv(join(", ", @cities)), "\r\n";
752 if (@publishers > 0) {
753 print "PB - ", &charconv(join(", ", @publishers)), "\r\n";
758 ##********************************************************************
759 ## get_keywords(): prints info from MARC fields 6XX
760 ## Arguments: list of fields (6XX)
761 ##********************************************************************
762 sub get_keywords {
763 my($fieldname, @keywords) = @_;
765 my @kw;
766 ## a list of all possible subfields
767 my @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'z', '2', '3', '4');
769 ## loop over all 6XX fields
770 foreach my $kwfield (@keywords) {
771 if ($kwfield != undef) {
772 ## authornames get special treatment
773 if ($fieldname eq "600") {
774 my $val = normalize_author($kwfield->subfield('a'), $kwfield->subfield('b'), $kwfield->subfield('c'), $kwfield->indicator('1'));
775 push @kw, $val;
776 print "<marc>Field $kwfield subfield a:", $kwfield->subfield('a'), "\r\n<marc>Field $kwfield subfield b:", $kwfield->subfield('b'), "\r\n<marc>Field $kwfield subfield c:", $kwfield->subfield('c'), "\r\n" if $marcprint;
778 else {
779 ## retrieve all available subfields
780 @kwsubfields = $kwfield->subfields();
782 ## loop over all available subfield tuples
783 foreach my $kwtuple (@kwsubfields) {
784 ## loop over all subfields to check
785 foreach my $subfield (@subfields) {
786 ## [0] contains subfield code
787 if (@$kwtuple[0] eq $subfield) {
788 ## [1] contains value, remove trailing separators
789 @$kwtuple[1] =~ s% *[,;.:/]*$%%;
790 if (length(@$kwtuple[1]) > 0) {
791 push @kw, @$kwtuple[1];
792 print "<marc>Field $fieldname subfield $subfield:", @$kwtuple[1], "\r\n" if $marcprint;
794 ## we can leave the subfields loop here
795 last;
802 return @kw;
805 ##********************************************************************
806 ## pool_subx(): adds contents of several subfields to a list
807 ## Arguments: reference to a list
808 ## field name
809 ## list of fields (5XX)
810 ##********************************************************************
811 sub pool_subx {
812 my($aref, $fieldname, @notefields) = @_;
814 ## we use a list that contains the interesting subfields
815 ## for each field
816 # ToDo: this is apparently correct only for marc21
817 my @subfields;
819 if ($fieldname eq "500") {
820 @subfields = ('a');
822 elsif ($fieldname eq "501") {
823 @subfields = ('a');
825 elsif ($fieldname eq "502") {
826 @subfields = ('a');
828 elsif ($fieldname eq "504") {
829 @subfields = ('a', 'b');
831 elsif ($fieldname eq "505") {
832 @subfields = ('a', 'g', 'r', 't', 'u');
834 elsif ($fieldname eq "506") {
835 @subfields = ('a', 'b', 'c', 'd', 'e');
837 elsif ($fieldname eq "507") {
838 @subfields = ('a', 'b');
840 elsif ($fieldname eq "508") {
841 @subfields = ('a');
843 elsif ($fieldname eq "510") {
844 @subfields = ('a', 'b', 'c', 'x', '3');
846 elsif ($fieldname eq "511") {
847 @subfields = ('a');
849 elsif ($fieldname eq "513") {
850 @subfields = ('a', 'b');
852 elsif ($fieldname eq "514") {
853 @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'm', 'u', 'z');
855 elsif ($fieldname eq "515") {
856 @subfields = ('a');
858 elsif ($fieldname eq "516") {
859 @subfields = ('a');
861 elsif ($fieldname eq "518") {
862 @subfields = ('a', '3');
864 elsif ($fieldname eq "521") {
865 @subfields = ('a', 'b', '3');
867 elsif ($fieldname eq "522") {
868 @subfields = ('a');
870 elsif ($fieldname eq "524") {
871 @subfields = ('a', '2', '3');
873 elsif ($fieldname eq "525") {
874 @subfields = ('a');
876 elsif ($fieldname eq "526") {
877 @subfields = ('a', 'b', 'c', 'd', 'i', 'x', 'z', '5');
879 elsif ($fieldname eq "530") {
880 @subfields = ('a', 'b', 'c', 'd', 'u', '3');
882 elsif ($fieldname eq "533") {
883 @subfields = ('a', 'b', 'c', 'd', 'e', 'f', 'm', 'n', '3');
885 elsif ($fieldname eq "534") {
886 @subfields = ('a', 'b', 'c', 'e', 'f', 'k', 'l', 'm', 'n', 'p', 't', 'x', 'z');
888 elsif ($fieldname eq "535") {
889 @subfields = ('a', 'b', 'c', 'd', 'g', '3');
892 ## loop over all notefields
893 foreach my $notefield (@notefields) {
894 if ($notefield != undef) {
895 ## retrieve all available subfield tuples
896 @notesubfields = $notefield->subfields();
898 ## loop over all subfield tuples
899 foreach my $notetuple (@notesubfields) {
900 ## loop over all subfields to check
901 foreach my $subfield (@subfields) {
902 ## [0] contains subfield code
903 if (@$notetuple[0] eq $subfield) {
904 ## [1] contains value, remove trailing separators
905 print "<marc>field $fieldname subfield $subfield: ", @$notetuple[1], "\r\n" if $marcprint;
906 @$notetuple[1] =~ s% *[,;.:/]*$%%;
907 if (length(@$notetuple[1]) > 0) {
908 ## add to list
909 push @{$aref}, @$notetuple[1];
911 last;
919 ##********************************************************************
920 ## print_abstract(): prints abstract fields
921 ## Arguments: list of fields (520)
922 ##********************************************************************
923 sub print_abstract {
924 # ToDo: take care of repeatable subfields
925 my(@abfields) = @_;
927 ## we check the following subfields
928 my @subfields = ('a', 'b');
930 ## we generate a list for all useful strings
931 my @abstrings;
933 ## loop over all abfields
934 foreach my $abfield (@abfields) {
935 foreach my $field (@subfields) {
936 if ( length( $abfield->subfield($field) ) > 0 ) {
937 my $ab = $abfield->subfield($field);
939 print "<marc>field 520 subfield $field: $ab\r\n" if $marcprint;
941 ## strip trailing separators
942 $ab =~ s% *[;,:./]*$%%;
944 ## add string to the list
945 push( @abstrings, $ab );
950 my $allabs = join "; ", @abstrings;
952 if (length($allabs) > 0) {
953 print "N2 - ", &charconv($allabs), "\r\n";
960 ##********************************************************************
961 ## charconv(): converts to a different charset based on a global var
962 ## Arguments: string
963 ## Returns: string
964 ##********************************************************************
965 sub charconv {
966 if ($utf) {
967 ## return unaltered if already utf-8
968 return @_;
970 elsif ($uniout eq "t") {
971 ## convert to utf-8
972 return marc8_to_utf8("@_");
974 else {
975 ## return unaltered if no utf-8 requested
976 return @_;