C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Context;
  26 use C4::Debug;
  27 use Unicode::Normalize;
  28
  29 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  30
  31 BEGIN {
  32     # set the version for version checking
  33     $VERSION = 3.07.00.049;
  34     require Exporter;
  35     @ISA    = qw(Exporter);
  36     @EXPORT = qw(
  37         NormalizeString
  38         IsStringUTF8ish
  39         MarcToUTF8Record
  40         SetUTF8Flag
  41         SetMarcUnicodeFlag
  42         StripNonXmlChars
  43         nsb_clean
  44     );
  45 }
  46
  47 =encoding UTF-8
  48
  49 =head1 NAME
  50
  51 C4::Charset - utilities for handling character set conversions.
  52
  53 =head1 SYNOPSIS
  54
  55   use C4::Charset;
  56
  57 =head1 DESCRIPTION
  58
  59 This module contains routines for dealing with character set
  60 conversions, particularly for MARC records.
  61
  62 A variety of character encodings are in use by various MARC
  63 standards, and even more character encodings are used by
  64 non-standard MARC records.  The various MARC formats generally
  65 do not do a good job of advertising a given record's character
  66 encoding, and even when a record does advertise its encoding,
  67 e.g., via the Leader/09, experience has shown that one cannot
  68 trust it.
  69
  70 Ultimately, all MARC records are stored in Koha in UTF-8 and
  71 must be converted from whatever the source character encoding is.
  72 The goal of this module is to ensure that these conversions
  73 take place accurately.  When a character conversion cannot take
  74 place, or at least not accurately, the module was provide
  75 enough information to allow user-facing code to inform the user
  76 on how to deal with the situation.
  77
  78 =cut
  79
  80 =head1 FUNCTIONS
  81
  82 =head2 IsStringUTF8ish
  83
  84   my $is_utf8 = IsStringUTF8ish($str);
  85
  86 Determines if C<$str> is valid UTF-8.  This can mean
  87 one of two things:
  88
  89 =over
  90
  91 =item *
  92
  93 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  94
  95 =item *
  96
  97 The Perl UTF-8 flag is B<not> set, but the octets contain
  98 valid UTF-8.
  99
 100 =back
 101
 102 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 103 because in one could be presented with a MARC blob that is
 104 not actually in UTF-8 but whose sequence of octets appears to be
 105 valid UTF-8.  The rest of the MARC character conversion functions
 106 will assume that this situation occur does not very often.
 107
 108 =cut
 109
 110 sub IsStringUTF8ish {
 111     my $str = shift;
 112
 113     return 1 if utf8::is_utf8($str);
 114     return utf8::decode($str);
 115 }
 116
 117 =head2 SetUTF8Flag
 118
 119   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 120
 121 This function sets the PERL UTF8 flag for data.
 122 It is required when using new_from_usmarc
 123 since MARC::File::USMARC does not handle PERL UTF8 setting.
 124 When editing unicode marc records fields and subfields, you
 125 would end up in double encoding without using this function.
 126
 127 If $nfd is set, string normalization will use NFD instead of NFC
 128
 129 FIXME
 130 In my opinion, this function belongs to MARC::Record and not
 131 to this package.
 132 But since it handles charset, and MARC::Record, it finds its way in that package
 133
 134 =cut
 135
 136 sub SetUTF8Flag{
 137     my ($record, $nfd)=@_;
 138     return unless ($record && $record->fields());
 139     foreach my $field ($record->fields()){
 140         if ($field->tag()>=10){
 141             my @subfields;
 142             foreach my $subfield ($field->subfields()){
 143                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 144             }
 145             eval {
 146                 my $newfield=MARC::Field->new(
 147                             $field->tag(),
 148                             $field->indicator(1),
 149                             $field->indicator(2),
 150                             @subfields
 151                         );
 152                 $field->replace_with($newfield);
 153             };
 154             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 155         }
 156     }
 157 }
 158
 159 =head2 NormalizeString
 160
 161     my $normalized_string=NormalizeString($string,$nfd,$transform);
 162
 163 Given a string
 164 nfd : If you want to set NFD and not NFC
 165 transform : If you expect all the signs to be removed
 166
 167 Sets the PERL UTF8 Flag on your initial data if need be
 168 and applies cleaning if required
 169
 170 Returns a utf8 NFC normalized string
 171
 172 Sample code :
 173    my $string=NormalizeString ("l'ornithoptère");
 174    #results into ornithoptère in NFC form and sets UTF8 Flag
 175
 176 =cut
 177
 178
 179 sub NormalizeString{
 180         my ($string,$nfd,$transform)=@_;
 181         utf8::decode($string) unless (utf8::is_utf8($string));
 182         if ($nfd){
 183                 $string= NFD($string);
 184         }
 185         else {
 186                 $string=NFC($string);
 187         }
 188         if ($transform){
 189     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 190         #removing one letter words "d'" "l'"  was changed into "d " "l "
 191     $string=~s/\b\S\b//g;
 192     $string=~s/\s+$//g;
 193         }
 194     return $string;
 195 }
 196
 197 =head2 MarcToUTF8Record
 198
 199   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 200                                         $marc_flavour, [, $source_encoding]);
 201
 202 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 203 optional source encoding, return a C<MARC::Record> that is
 204 converted to UTF-8.
 205
 206 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 207 is not guaranteed to have been converted correctly.  Specifically,
 208 if C<$converted_from> is 'failed', the MARC record returned failed
 209 character conversion and had each of its non-ASCII octets changed
 210 to the Unicode replacement character.
 211
 212 If the source encoding was not specified, this routine will
 213 try to guess it; the character encoding used for a successful
 214 conversion is returned in C<$converted_from>.
 215
 216 =cut
 217
 218 sub MarcToUTF8Record {
 219     my $marc = shift;
 220     my $marc_flavour = shift;
 221     my $source_encoding = shift;
 222     my $marc_record;
 223     my $marc_blob_is_utf8 = 0;
 224     if (ref($marc) eq 'MARC::Record') {
 225         my $marc_blob = $marc->as_usmarc();
 226         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 227         $marc_record = $marc;
 228     } else {
 229         # dealing with a MARC blob
 230
 231         # remove any ersatz whitespace from the beginning and
 232         # end of the MARC blob -- these can creep into MARC
 233         # files produced by several sources -- caller really
 234         # should be doing this, however
 235         $marc =~ s/^\s+//;
 236         $marc =~ s/\s+$//;
 237         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 238         eval {
 239             $marc_record = MARC::Record->new_from_usmarc($marc);
 240         };
 241         if ($@) {
 242             # if we fail the first time, one likely problem
 243             # is that we have a MARC21 record that says that it's
 244             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 245             # We'll try parsing it again.
 246             substr($marc, 9, 1) = ' ';
 247             eval {
 248                 $marc_record = MARC::Record->new_from_usmarc($marc);
 249             };
 250             if ($@) {
 251                 # it's hopeless; return an empty MARC::Record
 252                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 253             }
 254         }
 255     }
 256
 257     # If we do not know the source encoding, try some guesses
 258     # as follows:
 259     #   1. Record is UTF-8 already.
 260     #   2. If MARC flavor is MARC21 or NORMARC, then
 261     #      a. record is MARC-8
 262     #      b. record is ISO-8859-1
 263     #   3. If MARC flavor is UNIMARC, then
 264     if (not defined $source_encoding) {
 265         if ($marc_blob_is_utf8) {
 266             # note that for MARC21/NORMARC we are not bothering to check
 267             # if the Leader/09 is set to 'a' or not -- because
 268             # of problems with various ILSs (including Koha in the
 269             # past, alas), this just is not trustworthy.
 270             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 271             return $marc_record, 'UTF-8', [];
 272         } else {
 273             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 274                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 275             } elsif ($marc_flavour =~/UNIMARC/) {
 276                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 277             } else {
 278                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 279             }
 280         }
 281     } else {
 282         # caller knows the character encoding
 283         my $original_marc_record = $marc_record->clone();
 284         my @errors;
 285         if ($source_encoding =~ /utf-?8/i) {
 286             if ($marc_blob_is_utf8) {
 287                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 288                 return $marc_record, 'UTF-8', [];
 289             } else {
 290                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 291             }
 292         } elsif ($source_encoding =~ /marc-?8/i) {
 293             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 294         } elsif ($source_encoding =~ /5426/) {
 295             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 296         } else {
 297             # assume any other character encoding is for Text::Iconv
 298             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 299         }
 300
 301         if (@errors) {
 302             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 303             return $original_marc_record, 'failed', \@errors;
 304         } else {
 305             return $marc_record, $source_encoding, [];
 306         }
 307     }
 308
 309 }
 310
 311 =head2 SetMarcUnicodeFlag
 312
 313   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 314
 315 Set both the internal MARC::Record encoding flag
 316 and the appropriate Leader/09 (MARC21) or
 317 100/26-29 (UNIMARC) to indicate that the record
 318 is in UTF-8.  Note that this does B<not> do
 319 any actual character conversion.
 320
 321 =cut
 322
 323 sub SetMarcUnicodeFlag {
 324     my $marc_record = shift;
 325     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 326
 327     $marc_record->encoding('UTF-8');
 328     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 329         my $leader = $marc_record->leader();
 330         substr($leader, 9, 1) = 'a';
 331         $marc_record->leader($leader);
 332     } elsif ($marc_flavour =~/UNIMARC/) {
 333         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 334         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 335         my $string;
 336                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 337                 $string=$marc_record->subfield( 100, "a" );
 338         if (defined $string && length($string)==$subflength) {
 339                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 340         }
 341         else {
 342             $string = POSIX::strftime( "%Y%m%d", localtime );
 343             $string =~ s/\-//g;
 344             $string = sprintf( "%-*s", $subflength, $string );
 345             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 346         }
 347         substr( $string, $encodingposition, 3, "y50" );
 348         if ( $marc_record->subfield( 100, "a" ) ) {
 349                         $marc_record->field('100')->update(a=>$string);
 350                 }
 351                 else {
 352             $marc_record->insert_grouped_field(
 353                 MARC::Field->new( 100, '', '', "a" => $string ) );
 354         }
 355                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 356     } else {
 357         warn "Unrecognized marcflavour: $marc_flavour";
 358     }
 359 }
 360
 361 =head2 StripNonXmlChars
 362
 363   my $new_str = StripNonXmlChars($old_str);
 364
 365 Given a string, return a copy with the
 366 characters that are illegal in XML
 367 removed.
 368
 369 This function exists to work around a problem
 370 that can occur with badly-encoded MARC records.
 371 Specifically, if a UTF-8 MARC record also
 372 has excape (\x1b) characters, MARC::File::XML
 373 will let the escape characters pass through
 374 when as_xml() or as_xml_record() is called.  The
 375 problem is that the escape character is not
 376 legal in well-formed XML documents, so when
 377 MARC::File::XML attempts to parse such a record,
 378 the XML parser will fail.
 379
 380 Stripping such characters will allow a
 381 MARC::Record->new_from_xml()
 382 to work, at the possible risk of some data loss.
 383
 384 =cut
 385
 386 sub StripNonXmlChars {
 387     my $str = shift;
 388     if (!defined($str) || $str eq ""){
 389         return "";
 390     }
 391     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 392     return $str;
 393 }
 394
 395
 396
 397 =head2 nsb_clean
 398
 399 =over 4
 400
 401 nsb_clean($string);
 402
 403 =back
 404
 405 Removes Non Sorting Block characters
 406
 407 =cut
 408 sub nsb_clean {
 409     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 410     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 411     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 412     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 413     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 414
 415     # handles non sorting blocks
 416     my ($string) = @_ ;
 417     $_ = $string ;
 418     s/$NSB//g ;
 419     s/$NSE//g ;
 420     s/$NSB2//g ;
 421     s/$NSE2//g ;
 422     s/$C2//g ;
 423     $string = $_ ;
 424
 425     return($string) ;
 426 }
 427
 428
 429 =head1 INTERNAL FUNCTIONS
 430
 431 =head2 _default_marc21_charconv_to_utf8
 432
 433   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 434
 435 Converts a C<MARC::Record> of unknown character set to UTF-8,
 436 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 437 to UTF-8, then a default conversion that replaces each non-ASCII
 438 character with the replacement character.
 439
 440 The C<$guessed_charset> return value contains the character set
 441 that resulted in a conversion to valid UTF-8; note that
 442 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 443 this is 'failed'.
 444
 445 =cut
 446
 447 sub _default_marc21_charconv_to_utf8 {
 448     my $marc_record = shift;
 449     my $marc_flavour = shift;
 450
 451     my $trial_marc8 = $marc_record->clone();
 452     my @all_errors = ();
 453     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 454     unless (@errors) {
 455         return $trial_marc8, 'MARC-8', [];
 456     }
 457     push @all_errors, @errors;
 458
 459     my $trial_8859_1 = $marc_record->clone();
 460     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 461     unless (@errors) {
 462         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 463                                                 # instead if we wanted to report details
 464                                                 # of the failed attempt at MARC-8 => UTF-8
 465     }
 466     push @all_errors, @errors;
 467
 468     my $default_converted = $marc_record->clone();
 469     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 470     return $default_converted, 'failed', \@all_errors;
 471 }
 472
 473 =head2 _default_unimarc_charconv_to_utf8
 474
 475   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 476
 477 Converts a C<MARC::Record> of unknown character set to UTF-8,
 478 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 479 to UTF-8, then a default conversion that replaces each non-ASCII
 480 character with the replacement character.
 481
 482 The C<$guessed_charset> return value contains the character set
 483 that resulted in a conversion to valid UTF-8; note that
 484 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 485 this is 'failed'.
 486
 487 =cut
 488
 489 sub _default_unimarc_charconv_to_utf8 {
 490     my $marc_record = shift;
 491     my $marc_flavour = shift;
 492
 493     my $trial_marc8 = $marc_record->clone();
 494     my @all_errors = ();
 495     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 496     unless (@errors) {
 497         return $trial_marc8, 'iso-5426';
 498     }
 499     push @all_errors, @errors;
 500
 501     my $trial_8859_1 = $marc_record->clone();
 502     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 503     unless (@errors) {
 504         return $trial_8859_1, 'iso-8859-1';
 505     }
 506     push @all_errors, @errors;
 507
 508     my $default_converted = $marc_record->clone();
 509     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 510     return $default_converted, 'failed', \@all_errors;
 511 }
 512
 513 =head2 _marc_marc8_to_utf8
 514
 515   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 516
 517 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 518 If the conversion fails for some reason, an
 519 appropriate messages will be placed in the returned
 520 C<@errors> array.
 521
 522 =cut
 523
 524 sub _marc_marc8_to_utf8 {
 525     my $marc_record = shift;
 526     my $marc_flavour = shift;
 527
 528     my $prev_ignore = MARC::Charset->ignore_errors();
 529     MARC::Charset->ignore_errors(1);
 530
 531     # trap warnings raised by MARC::Charset
 532     my @errors = ();
 533     local $SIG{__WARN__} = sub {
 534         my $msg = $_[0];
 535         if ($msg =~ /MARC.Charset/) {
 536             # FIXME - purpose of this regexp is to strip out the
 537             # line reference to MARC/Charset.pm, but as it
 538             # exists probably won't work quite on Windows --
 539             # some sort of minimal-bunch back-tracking RE
 540             # would be helpful here
 541             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 542             push @errors, $msg;
 543         } else {
 544             # if warning doesn't come from MARC::Charset, just
 545             # pass it on
 546             warn $msg;
 547         }
 548     };
 549
 550     foreach my $field ($marc_record->fields()) {
 551         if ($field->is_control_field()) {
 552             ; # do nothing -- control fields should not contain non-ASCII characters
 553         } else {
 554             my @converted_subfields;
 555             foreach my $subfield ($field->subfields()) {
 556                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 557                 unless (IsStringUTF8ish($utf8sf)) {
 558                     # Because of a bug in MARC::Charset 0.98, if the string
 559                     # has (a) one or more diacritics that (b) are only in character positions
 560                     # 128 to 255 inclusive, the resulting converted string is not in
 561                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 562                     # occurs, upgrade the string in place.  Moral of the story seems to be
 563                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 564                     # that the resulting string is UTF-8.
 565                     utf8::upgrade($utf8sf);
 566                 }
 567                 push @converted_subfields, $subfield->[0], $utf8sf;
 568             }
 569
 570             $field->replace_with(MARC::Field->new(
 571                 $field->tag(), $field->indicator(1), $field->indicator(2),
 572                 @converted_subfields)
 573             );
 574         }
 575     }
 576
 577     MARC::Charset->ignore_errors($prev_ignore);
 578
 579     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 580
 581     return @errors;
 582 }
 583
 584 =head2 _marc_iso5426_to_utf8
 585
 586   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 587
 588 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 589 If the conversion fails for some reason, an
 590 appropriate messages will be placed in the returned
 591 C<@errors> array.
 592
 593 FIXME - is ISO-5426 equivalent enough to MARC-8
 594 that C<MARC::Charset> can be used instead?
 595
 596 =cut
 597
 598 sub _marc_iso5426_to_utf8 {
 599     my $marc_record = shift;
 600     my $marc_flavour = shift;
 601
 602     my @errors = ();
 603
 604     foreach my $field ($marc_record->fields()) {
 605         if ($field->is_control_field()) {
 606             ; # do nothing -- control fields should not contain non-ASCII characters
 607         } else {
 608             my @converted_subfields;
 609             foreach my $subfield ($field->subfields()) {
 610                 my $utf8sf = char_decode5426($subfield->[1]);
 611                 push @converted_subfields, $subfield->[0], $utf8sf;
 612             }
 613
 614             $field->replace_with(MARC::Field->new(
 615                 $field->tag(), $field->indicator(1), $field->indicator(2),
 616                 @converted_subfields)
 617             );
 618         }
 619     }
 620
 621     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 622
 623     return @errors;
 624 }
 625
 626 =head2 _marc_to_utf8_via_text_iconv
 627
 628   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 629
 630 Convert a C<MARC::Record> to UTF-8 in-place using the
 631 C<Text::Iconv> CPAN module.  Any source encoding accepted
 632 by the user's iconv installation should work.  If
 633 the source encoding is not recognized on the user's
 634 server or the conversion fails for some reason,
 635 appropriate messages will be placed in the returned
 636 C<@errors> array.
 637
 638 =cut
 639
 640 sub _marc_to_utf8_via_text_iconv {
 641     my $marc_record = shift;
 642     my $marc_flavour = shift;
 643     my $source_encoding = shift;
 644
 645     my @errors = ();
 646     my $decoder;
 647     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 648     if ($@) {
 649         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 650         return @errors;
 651     }
 652
 653     my $prev_raise_error = Text::Iconv->raise_error();
 654     Text::Iconv->raise_error(1);
 655
 656     foreach my $field ($marc_record->fields()) {
 657         if ($field->is_control_field()) {
 658             ; # do nothing -- control fields should not contain non-ASCII characters
 659         } else {
 660             my @converted_subfields;
 661             foreach my $subfield ($field->subfields()) {
 662                 my $converted_value;
 663                 my $conversion_ok = 1;
 664                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 665                 if ($@) {
 666                     $conversion_ok = 0;
 667                     push @errors, $@;
 668                 } elsif (not defined $converted_value) {
 669                     $conversion_ok = 0;
 670                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 671                 }
 672
 673                 if ($conversion_ok) {
 674                     push @converted_subfields, $subfield->[0], $converted_value;
 675                 } else {
 676                     $converted_value = $subfield->[1];
 677                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 678                     push @converted_subfields, $subfield->[0], $converted_value;
 679                 }
 680             }
 681
 682             $field->replace_with(MARC::Field->new(
 683                 $field->tag(), $field->indicator(1), $field->indicator(2),
 684                 @converted_subfields)
 685             );
 686         }
 687     }
 688
 689     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 690     Text::Iconv->raise_error($prev_raise_error);
 691
 692     return @errors;
 693 }
 694
 695 =head2 _marc_to_utf8_replacement_char
 696
 697   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 698
 699 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 700 unsatisfactory method of replacing all non-ASCII (e.g.,
 701 where the eight bit is set) octet with the Unicode
 702 replacement character.  This is meant as a last-ditch
 703 method, and would be best used as part of a UI that
 704 lets a cataloguer pick various character conversions
 705 until he or she finds the right one.
 706
 707 =cut
 708
 709 sub _marc_to_utf8_replacement_char {
 710     my $marc_record = shift;
 711     my $marc_flavour = shift;
 712
 713     foreach my $field ($marc_record->fields()) {
 714         if ($field->is_control_field()) {
 715             ; # do nothing -- control fields should not contain non-ASCII characters
 716         } else {
 717             my @converted_subfields;
 718             foreach my $subfield ($field->subfields()) {
 719                 my $value = $subfield->[1];
 720                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 721                 push @converted_subfields, $subfield->[0], $value;
 722             }
 723
 724             $field->replace_with(MARC::Field->new(
 725                 $field->tag(), $field->indicator(1), $field->indicator(2),
 726                 @converted_subfields)
 727             );
 728         }
 729     }
 730
 731     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 732 }
 733
 734 =head2 char_decode5426
 735
 736   my $utf8string = char_decode5426($iso_5426_string);
 737
 738 Converts a string from ISO-5426 to UTF-8.
 739
 740 =cut
 741
 742
 743 my %chars;
 744 $chars{0xb0}=0x0101;#3/0ayn[ain]
 745 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 746 #$chars{0xb2}=0x00e0;#'à';
 747 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 748 #$chars{0xb3}=0x00e7;#'ç';
 749 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 750 # $chars{0xb4}='è';
 751 $chars{0xb4}=0x00e8;
 752 $chars{0xbd}=0x02b9;
 753 $chars{0xbe}=0x02ba;
 754 # $chars{0xb5}='é';
 755 $chars{0xb5}=0x00e9;
 756 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 757 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 758 $chars{0xfa}=0x0153; #oe
 759 $chars{0xea}=0x0152; #oe
 760 $chars{0x81d1}=0x00b0;
 761
 762 ####
 763 ## combined characters iso5426
 764
 765 $chars{0xc041}=0x1ea2; # capital a with hook above
 766 $chars{0xc045}=0x1eba; # capital e with hook above
 767 $chars{0xc049}=0x1ec8; # capital i with hook above
 768 $chars{0xc04f}=0x1ece; # capital o with hook above
 769 $chars{0xc055}=0x1ee6; # capital u with hook above
 770 $chars{0xc059}=0x1ef6; # capital y with hook above
 771 $chars{0xc061}=0x1ea3; # small a with hook above
 772 $chars{0xc065}=0x1ebb; # small e with hook above
 773 $chars{0xc069}=0x1ec9; # small i with hook above
 774 $chars{0xc06f}=0x1ecf; # small o with hook above
 775 $chars{0xc075}=0x1ee7; # small u with hook above
 776 $chars{0xc079}=0x1ef7; # small y with hook above
 777
 778         # 4/1 grave accent
 779 $chars{0xc141}=0x00c0; # capital a with grave accent
 780 $chars{0xc145}=0x00c8; # capital e with grave accent
 781 $chars{0xc149}=0x00cc; # capital i with grave accent
 782 $chars{0xc14f}=0x00d2; # capital o with grave accent
 783 $chars{0xc155}=0x00d9; # capital u with grave accent
 784 $chars{0xc157}=0x1e80; # capital w with grave
 785 $chars{0xc159}=0x1ef2; # capital y with grave
 786 $chars{0xc161}=0x00e0; # small a with grave accent
 787 $chars{0xc165}=0x00e8; # small e with grave accent
 788 $chars{0xc169}=0x00ec; # small i with grave accent
 789 $chars{0xc16f}=0x00f2; # small o with grave accent
 790 $chars{0xc175}=0x00f9; # small u with grave accent
 791 $chars{0xc177}=0x1e81; # small w with grave
 792 $chars{0xc179}=0x1ef3; # small y with grave
 793         # 4/2 acute accent
 794 $chars{0xc241}=0x00c1; # capital a with acute accent
 795 $chars{0xc243}=0x0106; # capital c with acute accent
 796 $chars{0xc245}=0x00c9; # capital e with acute accent
 797 $chars{0xc247}=0x01f4; # capital g with acute
 798 $chars{0xc249}=0x00cd; # capital i with acute accent
 799 $chars{0xc24b}=0x1e30; # capital k with acute
 800 $chars{0xc24c}=0x0139; # capital l with acute accent
 801 $chars{0xc24d}=0x1e3e; # capital m with acute
 802 $chars{0xc24e}=0x0143; # capital n with acute accent
 803 $chars{0xc24f}=0x00d3; # capital o with acute accent
 804 $chars{0xc250}=0x1e54; # capital p with acute
 805 $chars{0xc252}=0x0154; # capital r with acute accent
 806 $chars{0xc253}=0x015a; # capital s with acute accent
 807 $chars{0xc255}=0x00da; # capital u with acute accent
 808 $chars{0xc257}=0x1e82; # capital w with acute
 809 $chars{0xc259}=0x00dd; # capital y with acute accent
 810 $chars{0xc25a}=0x0179; # capital z with acute accent
 811 $chars{0xc261}=0x00e1; # small a with acute accent
 812 $chars{0xc263}=0x0107; # small c with acute accent
 813 $chars{0xc265}=0x00e9; # small e with acute accent
 814 $chars{0xc267}=0x01f5; # small g with acute
 815 $chars{0xc269}=0x00ed; # small i with acute accent
 816 $chars{0xc26b}=0x1e31; # small k with acute
 817 $chars{0xc26c}=0x013a; # small l with acute accent
 818 $chars{0xc26d}=0x1e3f; # small m with acute
 819 $chars{0xc26e}=0x0144; # small n with acute accent
 820 $chars{0xc26f}=0x00f3; # small o with acute accent
 821 $chars{0xc270}=0x1e55; # small p with acute
 822 $chars{0xc272}=0x0155; # small r with acute accent
 823 $chars{0xc273}=0x015b; # small s with acute accent
 824 $chars{0xc275}=0x00fa; # small u with acute accent
 825 $chars{0xc277}=0x1e83; # small w with acute
 826 $chars{0xc279}=0x00fd; # small y with acute accent
 827 $chars{0xc27a}=0x017a; # small z with acute accent
 828 $chars{0xc2e1}=0x01fc; # capital ae with acute
 829 $chars{0xc2f1}=0x01fd; # small ae with acute
 830        # 4/3 circumflex accent
 831 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 832 $chars{0xc343}=0x0108; # capital c with circumflex
 833 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 834 $chars{0xc347}=0x011c; # capital g with circumflex
 835 $chars{0xc348}=0x0124; # capital h with circumflex
 836 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 837 $chars{0xc34a}=0x0134; # capital j with circumflex
 838 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 839 $chars{0xc353}=0x015c; # capital s with circumflex
 840 $chars{0xc355}=0x00db; # capital u with circumflex
 841 $chars{0xc357}=0x0174; # capital w with circumflex
 842 $chars{0xc359}=0x0176; # capital y with circumflex
 843 $chars{0xc35a}=0x1e90; # capital z with circumflex
 844 $chars{0xc361}=0x00e2; # small a with circumflex accent
 845 $chars{0xc363}=0x0109; # small c with circumflex
 846 $chars{0xc365}=0x00ea; # small e with circumflex accent
 847 $chars{0xc367}=0x011d; # small g with circumflex
 848 $chars{0xc368}=0x0125; # small h with circumflex
 849 $chars{0xc369}=0x00ee; # small i with circumflex accent
 850 $chars{0xc36a}=0x0135; # small j with circumflex
 851 $chars{0xc36e}=0x00f1; # small n with tilde
 852 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 853 $chars{0xc373}=0x015d; # small s with circumflex
 854 $chars{0xc375}=0x00fb; # small u with circumflex
 855 $chars{0xc377}=0x0175; # small w with circumflex
 856 $chars{0xc379}=0x0177; # small y with circumflex
 857 $chars{0xc37a}=0x1e91; # small z with circumflex
 858         # 4/4 tilde
 859 $chars{0xc441}=0x00c3; # capital a with tilde
 860 $chars{0xc445}=0x1ebc; # capital e with tilde
 861 $chars{0xc449}=0x0128; # capital i with tilde
 862 $chars{0xc44e}=0x00d1; # capital n with tilde
 863 $chars{0xc44f}=0x00d5; # capital o with tilde
 864 $chars{0xc455}=0x0168; # capital u with tilde
 865 $chars{0xc456}=0x1e7c; # capital v with tilde
 866 $chars{0xc459}=0x1ef8; # capital y with tilde
 867 $chars{0xc461}=0x00e3; # small a with tilde
 868 $chars{0xc465}=0x1ebd; # small e with tilde
 869 $chars{0xc469}=0x0129; # small i with tilde
 870 $chars{0xc46e}=0x00f1; # small n with tilde
 871 $chars{0xc46f}=0x00f5; # small o with tilde
 872 $chars{0xc475}=0x0169; # small u with tilde
 873 $chars{0xc476}=0x1e7d; # small v with tilde
 874 $chars{0xc479}=0x1ef9; # small y with tilde
 875     # 4/5 macron
 876 $chars{0xc541}=0x0100; # capital a with macron
 877 $chars{0xc545}=0x0112; # capital e with macron
 878 $chars{0xc547}=0x1e20; # capital g with macron
 879 $chars{0xc549}=0x012a; # capital i with macron
 880 $chars{0xc54f}=0x014c; # capital o with macron
 881 $chars{0xc555}=0x016a; # capital u with macron
 882 $chars{0xc561}=0x0101; # small a with macron
 883 $chars{0xc565}=0x0113; # small e with macron
 884 $chars{0xc567}=0x1e21; # small g with macron
 885 $chars{0xc569}=0x012b; # small i with macron
 886 $chars{0xc56f}=0x014d; # small o with macron
 887 $chars{0xc575}=0x016b; # small u with macron
 888 $chars{0xc572}=0x0159; # small r with macron
 889 $chars{0xc5e1}=0x01e2; # capital ae with macron
 890 $chars{0xc5f1}=0x01e3; # small ae with macron
 891         # 4/6 breve
 892 $chars{0xc641}=0x0102; # capital a with breve
 893 $chars{0xc645}=0x0114; # capital e with breve
 894 $chars{0xc647}=0x011e; # capital g with breve
 895 $chars{0xc649}=0x012c; # capital i with breve
 896 $chars{0xc64f}=0x014e; # capital o with breve
 897 $chars{0xc655}=0x016c; # capital u with breve
 898 $chars{0xc661}=0x0103; # small a with breve
 899 $chars{0xc665}=0x0115; # small e with breve
 900 $chars{0xc667}=0x011f; # small g with breve
 901 $chars{0xc669}=0x012d; # small i with breve
 902 $chars{0xc66f}=0x014f; # small o with breve
 903 $chars{0xc675}=0x016d; # small u with breve
 904         # 4/7 dot above
 905 $chars{0xc7b0}=0x01e1; # Ain with dot above
 906 $chars{0xc742}=0x1e02; # capital b with dot above
 907 $chars{0xc743}=0x010a; # capital c with dot above
 908 $chars{0xc744}=0x1e0a; # capital d with dot above
 909 $chars{0xc745}=0x0116; # capital e with dot above
 910 $chars{0xc746}=0x1e1e; # capital f with dot above
 911 $chars{0xc747}=0x0120; # capital g with dot above
 912 $chars{0xc748}=0x1e22; # capital h with dot above
 913 $chars{0xc749}=0x0130; # capital i with dot above
 914 $chars{0xc74d}=0x1e40; # capital m with dot above
 915 $chars{0xc74e}=0x1e44; # capital n with dot above
 916 $chars{0xc750}=0x1e56; # capital p with dot above
 917 $chars{0xc752}=0x1e58; # capital r with dot above
 918 $chars{0xc753}=0x1e60; # capital s with dot above
 919 $chars{0xc754}=0x1e6a; # capital t with dot above
 920 $chars{0xc757}=0x1e86; # capital w with dot above
 921 $chars{0xc758}=0x1e8a; # capital x with dot above
 922 $chars{0xc759}=0x1e8e; # capital y with dot above
 923 $chars{0xc75a}=0x017b; # capital z with dot above
 924 $chars{0xc761}=0x0227; # small b with dot above
 925 $chars{0xc762}=0x1e03; # small b with dot above
 926 $chars{0xc763}=0x010b; # small c with dot above
 927 $chars{0xc764}=0x1e0b; # small d with dot above
 928 $chars{0xc765}=0x0117; # small e with dot above
 929 $chars{0xc766}=0x1e1f; # small f with dot above
 930 $chars{0xc767}=0x0121; # small g with dot above
 931 $chars{0xc768}=0x1e23; # small h with dot above
 932 $chars{0xc76d}=0x1e41; # small m with dot above
 933 $chars{0xc76e}=0x1e45; # small n with dot above
 934 $chars{0xc770}=0x1e57; # small p with dot above
 935 $chars{0xc772}=0x1e59; # small r with dot above
 936 $chars{0xc773}=0x1e61; # small s with dot above
 937 $chars{0xc774}=0x1e6b; # small t with dot above
 938 $chars{0xc777}=0x1e87; # small w with dot above
 939 $chars{0xc778}=0x1e8b; # small x with dot above
 940 $chars{0xc779}=0x1e8f; # small y with dot above
 941 $chars{0xc77a}=0x017c; # small z with dot above
 942         # 4/8 trema, diaresis
 943 $chars{0xc820}=0x00a8; # diaeresis
 944 $chars{0xc841}=0x00c4; # capital a with diaeresis
 945 $chars{0xc845}=0x00cb; # capital e with diaeresis
 946 $chars{0xc848}=0x1e26; # capital h with diaeresis
 947 $chars{0xc849}=0x00cf; # capital i with diaeresis
 948 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 949 $chars{0xc855}=0x00dc; # capital u with diaeresis
 950 $chars{0xc857}=0x1e84; # capital w with diaeresis
 951 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 952 $chars{0xc859}=0x0178; # capital y with diaeresis
 953 $chars{0xc861}=0x00e4; # small a with diaeresis
 954 $chars{0xc865}=0x00eb; # small e with diaeresis
 955 $chars{0xc868}=0x1e27; # small h with diaeresis
 956 $chars{0xc869}=0x00ef; # small i with diaeresis
 957 $chars{0xc86f}=0x00f6; # small o with diaeresis
 958 $chars{0xc874}=0x1e97; # small t with diaeresis
 959 $chars{0xc875}=0x00fc; # small u with diaeresis
 960 $chars{0xc877}=0x1e85; # small w with diaeresis
 961 $chars{0xc878}=0x1e8d; # small x with diaeresis
 962 $chars{0xc879}=0x00ff; # small y with diaeresis
 963         # 4/9 umlaut
 964 $chars{0xc920}=0x00a8; # [diaeresis]
 965 $chars{0xc961}=0x00e4; # a with umlaut
 966 $chars{0xc965}=0x00eb; # e with umlaut
 967 $chars{0xc969}=0x00ef; # i with umlaut
 968 $chars{0xc96f}=0x00f6; # o with umlaut
 969 $chars{0xc975}=0x00fc; # u with umlaut
 970         # 4/10 circle above
 971 $chars{0xca41}=0x00c5; # capital a with ring above
 972 $chars{0xcaad}=0x016e; # capital u with ring above
 973 $chars{0xca61}=0x00e5; # small a with ring above
 974 $chars{0xca75}=0x016f; # small u with ring above
 975 $chars{0xca77}=0x1e98; # small w with ring above
 976 $chars{0xca79}=0x1e99; # small y with ring above
 977         # 4/11 high comma off centre
 978         # 4/12 inverted high comma centred
 979         # 4/13 double acute accent
 980 $chars{0xcd4f}=0x0150; # capital o with double acute
 981 $chars{0xcd55}=0x0170; # capital u with double acute
 982 $chars{0xcd6f}=0x0151; # small o with double acute
 983 $chars{0xcd75}=0x0171; # small u with double acute
 984         # 4/14 horn
 985 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 986 $chars{0xce55}=0x01af; # latin capital letter u with horn
 987 $chars{0xce74}=0x01a1; # latin small letter o with horn
 988 $chars{0xce75}=0x01b0; # latin small letter u with horn
 989         # 4/15 caron (hacek
 990 $chars{0xcf41}=0x01cd; # capital a with caron
 991 $chars{0xcf43}=0x010c; # capital c with caron
 992 $chars{0xcf44}=0x010e; # capital d with caron
 993 $chars{0xcf45}=0x011a; # capital e with caron
 994 $chars{0xcf47}=0x01e6; # capital g with caron
 995 $chars{0xcf49}=0x01cf; # capital i with caron
 996 $chars{0xcf4b}=0x01e8; # capital k with caron
 997 $chars{0xcf4c}=0x013d; # capital l with caron
 998 $chars{0xcf4e}=0x0147; # capital n with caron
 999 $chars{0xcf4f}=0x01d1; # capital o with caron
1000 $chars{0xcf52}=0x0158; # capital r with caron
1001 $chars{0xcf53}=0x0160; # capital s with caron
1002 $chars{0xcf54}=0x0164; # capital t with caron
1003 $chars{0xcf55}=0x01d3; # capital u with caron
1004 $chars{0xcf5a}=0x017d; # capital z with caron
1005 $chars{0xcf61}=0x01ce; # small a with caron
1006 $chars{0xcf63}=0x010d; # small c with caron
1007 $chars{0xcf64}=0x010f; # small d with caron
1008 $chars{0xcf65}=0x011b; # small e with caron
1009 $chars{0xcf67}=0x01e7; # small g with caron
1010 $chars{0xcf69}=0x01d0; # small i with caron
1011 $chars{0xcf6a}=0x01f0; # small j with caron
1012 $chars{0xcf6b}=0x01e9; # small k with caron
1013 $chars{0xcf6c}=0x013e; # small l with caron
1014 $chars{0xcf6e}=0x0148; # small n with caron
1015 $chars{0xcf6f}=0x01d2; # small o with caron
1016 $chars{0xcf72}=0x0159; # small r with caron
1017 $chars{0xcf73}=0x0161; # small s with caron
1018 $chars{0xcf74}=0x0165; # small t with caron
1019 $chars{0xcf75}=0x01d4; # small u with caron
1020 $chars{0xcf7a}=0x017e; # small z with caron
1021         # 5/0 cedilla
1022 $chars{0xd020}=0x00b8; # cedilla
1023 $chars{0xd043}=0x00c7; # capital c with cedilla
1024 $chars{0xd044}=0x1e10; # capital d with cedilla
1025 $chars{0xd047}=0x0122; # capital g with cedilla
1026 $chars{0xd048}=0x1e28; # capital h with cedilla
1027 $chars{0xd04b}=0x0136; # capital k with cedilla
1028 $chars{0xd04c}=0x013b; # capital l with cedilla
1029 $chars{0xd04e}=0x0145; # capital n with cedilla
1030 $chars{0xd052}=0x0156; # capital r with cedilla
1031 $chars{0xd053}=0x015e; # capital s with cedilla
1032 $chars{0xd054}=0x0162; # capital t with cedilla
1033 $chars{0xd063}=0x00e7; # small c with cedilla
1034 $chars{0xd064}=0x1e11; # small d with cedilla
1035 $chars{0xd065}=0x0119; # small e with cedilla
1036 $chars{0xd067}=0x0123; # small g with cedilla
1037 $chars{0xd068}=0x1e29; # small h with cedilla
1038 $chars{0xd06b}=0x0137; # small k with cedilla
1039 $chars{0xd06c}=0x013c; # small l with cedilla
1040 $chars{0xd06e}=0x0146; # small n with cedilla
1041 $chars{0xd072}=0x0157; # small r with cedilla
1042 $chars{0xd073}=0x015f; # small s with cedilla
1043 $chars{0xd074}=0x0163; # small t with cedilla
1044         # 5/1 rude
1045         # 5/2 hook to left
1046         # 5/3 ogonek (hook to right
1047 $chars{0xd320}=0x02db; # ogonek
1048 $chars{0xd341}=0x0104; # capital a with ogonek
1049 $chars{0xd345}=0x0118; # capital e with ogonek
1050 $chars{0xd349}=0x012e; # capital i with ogonek
1051 $chars{0xd34f}=0x01ea; # capital o with ogonek
1052 $chars{0xd355}=0x0172; # capital u with ogonek
1053 $chars{0xd361}=0x0105; # small a with ogonek
1054 $chars{0xd365}=0x0119; # small e with ogonek
1055 $chars{0xd369}=0x012f; # small i with ogonek
1056 $chars{0xd36f}=0x01eb; # small o with ogonek
1057 $chars{0xd375}=0x0173; # small u with ogonek
1058         # 5/4 circle below
1059 $chars{0xd441}=0x1e00; # capital a with ring below
1060 $chars{0xd461}=0x1e01; # small a with ring below
1061         # 5/5 half circle below
1062 $chars{0xf948}=0x1e2a; # capital h with breve below
1063 $chars{0xf968}=0x1e2b; # small h with breve below
1064         # 5/6 dot below
1065 $chars{0xd641}=0x1ea0; # capital a with dot below
1066 $chars{0xd642}=0x1e04; # capital b with dot below
1067 $chars{0xd644}=0x1e0c; # capital d with dot below
1068 $chars{0xd645}=0x1eb8; # capital e with dot below
1069 $chars{0xd648}=0x1e24; # capital h with dot below
1070 $chars{0xd649}=0x1eca; # capital i with dot below
1071 $chars{0xd64b}=0x1e32; # capital k with dot below
1072 $chars{0xd64c}=0x1e36; # capital l with dot below
1073 $chars{0xd64d}=0x1e42; # capital m with dot below
1074 $chars{0xd64e}=0x1e46; # capital n with dot below
1075 $chars{0xd64f}=0x1ecc; # capital o with dot below
1076 $chars{0xd652}=0x1e5a; # capital r with dot below
1077 $chars{0xd653}=0x1e62; # capital s with dot below
1078 $chars{0xd654}=0x1e6c; # capital t with dot below
1079 $chars{0xd655}=0x1ee4; # capital u with dot below
1080 $chars{0xd656}=0x1e7e; # capital v with dot below
1081 $chars{0xd657}=0x1e88; # capital w with dot below
1082 $chars{0xd659}=0x1ef4; # capital y with dot below
1083 $chars{0xd65a}=0x1e92; # capital z with dot below
1084 $chars{0xd661}=0x1ea1; # small a with dot below
1085 $chars{0xd662}=0x1e05; # small b with dot below
1086 $chars{0xd664}=0x1e0d; # small d with dot below
1087 $chars{0xd665}=0x1eb9; # small e with dot below
1088 $chars{0xd668}=0x1e25; # small h with dot below
1089 $chars{0xd669}=0x1ecb; # small i with dot below
1090 $chars{0xd66b}=0x1e33; # small k with dot below
1091 $chars{0xd66c}=0x1e37; # small l with dot below
1092 $chars{0xd66d}=0x1e43; # small m with dot below
1093 $chars{0xd66e}=0x1e47; # small n with dot below
1094 $chars{0xd66f}=0x1ecd; # small o with dot below
1095 $chars{0xd672}=0x1e5b; # small r with dot below
1096 $chars{0xd673}=0x1e63; # small s with dot below
1097 $chars{0xd674}=0x1e6d; # small t with dot below
1098 $chars{0xd675}=0x1ee5; # small u with dot below
1099 $chars{0xd676}=0x1e7f; # small v with dot below
1100 $chars{0xd677}=0x1e89; # small w with dot below
1101 $chars{0xd679}=0x1ef5; # small y with dot below
1102 $chars{0xd67a}=0x1e93; # small z with dot below
1103         # 5/7 double dot below
1104 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1105 $chars{0xd775}=0x1e73; # small u with diaeresis below
1106         # 5/8 underline
1107 $chars{0xd820}=0x005f; # underline
1108         # 5/9 double underline
1109 $chars{0xd920}=0x2017; # double underline
1110         # 5/10 small low vertical bar
1111 $chars{0xda20}=0x02cc; #
1112         # 5/11 circumflex below
1113         # 5/12 (this position shall not be used)
1114         # 5/13 left half of ligature sign and of double tilde
1115         # 5/14 right half of ligature sign
1116         # 5/15 right half of double tilde
1117 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1118
1119 sub char_decode5426 {
1120     my ( $string) = @_;
1121     my $result;
1122
1123     my @data = unpack("C*", $string);
1124     my @characters;
1125     my $length=scalar(@data);
1126     for (my $i = 0; $i < scalar(@data); $i++) {
1127       my $char= $data[$i];
1128       if ($char >= 0x00 && $char <= 0x7F){
1129         #IsAscii
1130
1131           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1132       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1133         #Combined Char
1134         my $convchar ;
1135         if ($chars{$char*256+$data[$i+1]}) {
1136           $convchar= $chars{$char * 256 + $data[$i+1]};
1137           $i++;
1138 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1139         } elsif ($chars{$char})  {
1140           $convchar= $chars{$char};
1141 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1142         }else {
1143           $convchar=$char;
1144         }
1145         push @characters,$convchar;
1146       } else {
1147         my $convchar;
1148         if ($chars{$char})  {
1149           $convchar= $chars{$char};
1150 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1151         }else {
1152 #            printf "char %x $char\n",$char;
1153           $convchar=$char;
1154         }
1155         push @characters,$convchar;
1156       }
1157     }
1158     $result=pack "U*",@characters;
1159 #     $result=~s/\x01//;
1160 #     $result=~s/\x00//;
1161      $result=~s/\x0f//;
1162      $result=~s/\x1b.//;
1163      $result=~s/\x0e//;
1164      $result=~s/\x1b\x5b//;
1165 #   map{printf "%x",$_} @characters;
1166 #   printf "\n";
1167   return $result;
1168 }
1169
1170 1;
1171
1172
1173 =head1 AUTHOR
1174
1175 Koha Development Team <http://koha-community.org/>
1176
1177 Galen Charlton <galen.charlton@liblime.com>
1178
1179 =cut