C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.01;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         NormalizeString
  37         IsStringUTF8ish
  38         MarcToUTF8Record
  39         SetUTF8Flag
  40         SetMarcUnicodeFlag
  41         StripNonXmlChars
  42     );
  43 }
  44
  45 =head1 NAME
  46
  47 C4::Charset - utilities for handling character set conversions.
  48
  49 =head1 SYNOPSIS
  50
  51   use C4::Charset;
  52
  53 =head1 DESCRIPTION
  54
  55 This module contains routines for dealing with character set
  56 conversions, particularly for MARC records.
  57
  58 A variety of character encodings are in use by various MARC
  59 standards, and even more character encodings are used by
  60 non-standard MARC records.  The various MARC formats generally
  61 do not do a good job of advertising a given record's character
  62 encoding, and even when a record does advertise its encoding,
  63 e.g., via the Leader/09, experience has shown that one cannot
  64 trust it.
  65
  66 Ultimately, all MARC records are stored in Koha in UTF-8 and
  67 must be converted from whatever the source character encoding is.
  68 The goal of this module is to ensure that these conversions
  69 take place accurately.  When a character conversion cannot take
  70 place, or at least not accurately, the module was provide
  71 enough information to allow user-facing code to inform the user
  72 on how to deal with the situation.
  73
  74 =cut
  75
  76 =head1 FUNCTIONS
  77
  78 =head2 IsStringUTF8ish
  79
  80   my $is_utf8 = IsStringUTF8ish($str);
  81
  82 Determines if C<$str> is valid UTF-8.  This can mean
  83 one of two things:
  84
  85 =over
  86
  87 =item *
  88
  89 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  90
  91 =item *
  92
  93 The Perl UTF-8 flag is B<not> set, but the octets contain
  94 valid UTF-8.
  95
  96 =back
  97
  98 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  99 because in one could be presented with a MARC blob that is
 100 not actually in UTF-8 but whose sequence of octets appears to be
 101 valid UTF-8.  The rest of the MARC character conversion functions
 102 will assume that this situation occur does not very often.
 103
 104 =cut
 105
 106 sub IsStringUTF8ish {
 107     my $str = shift;
 108
 109     return 1 if utf8::is_utf8($str);
 110     return utf8::decode($str);
 111 }
 112
 113 =head2 SetUTF8Flag
 114
 115   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 116
 117 This function sets the PERL UTF8 flag for data.
 118 It is required when using new_from_usmarc
 119 since MARC::File::USMARC does not handle PERL UTF8 setting.
 120 When editing unicode marc records fields and subfields, you
 121 would end up in double encoding without using this function.
 122
 123 If $nfd is set, string normalization will use NFD instead of NFC
 124
 125 FIXME
 126 In my opinion, this function belongs to MARC::Record and not
 127 to this package.
 128 But since it handles charset, and MARC::Record, it finds its way in that package
 129
 130 =cut
 131
 132 sub SetUTF8Flag{
 133         my ($record, $nfd)=@_;
 134         return unless ($record && $record->fields());
 135         foreach my $field ($record->fields()){
 136                 if ($field->tag()>=10){
 137                         my @subfields;
 138                         foreach my $subfield ($field->subfields()){
 139                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 140                         }
 141                         my $newfield=MARC::Field->new(
 142                                                         $field->tag(),
 143                                                         $field->indicator(1),
 144                                                         $field->indicator(2),
 145                                                         @subfields
 146                                                 );
 147                         $field->replace_with($newfield);
 148                 }
 149         }
 150 }
 151
 152 =head2 NormalizeString
 153
 154     my $normalized_string=NormalizeString($string,$nfd,$transform);
 155
 156 Given a string
 157 nfd : If you want to set NFD and not NFC
 158 transform : If you expect all the signs to be removed
 159
 160 Sets the PERL UTF8 Flag on your initial data if need be
 161 and applies cleaning if required
 162
 163 Returns a utf8 NFC normalized string
 164
 165 Sample code :
 166    my $string=NormalizeString ("l'ornithoptère");
 167    #results into ornithoptère in NFC form and sets UTF8 Flag
 168
 169 =cut
 170
 171
 172 sub NormalizeString{
 173         my ($string,$nfd,$transform)=@_;
 174         utf8::decode($string) unless (utf8::is_utf8($string));
 175         if ($nfd){
 176                 $string= NFD($string);
 177         }
 178         else {
 179                 $string=NFC($string);
 180         }
 181         if ($transform){
 182     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 183         #removing one letter words "d'" "l'"  was changed into "d " "l "
 184     $string=~s/\b\S\b//g;
 185     $string=~s/\s+$//g;
 186         }
 187     return $string;
 188 }
 189
 190 =head2 MarcToUTF8Record
 191
 192   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 193                                         $marc_flavour, [, $source_encoding]);
 194
 195 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 196 optional source encoding, return a C<MARC::Record> that is
 197 converted to UTF-8.
 198
 199 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 200 is not guaranteed to have been converted correctly.  Specifically,
 201 if C<$converted_from> is 'failed', the MARC record returned failed
 202 character conversion and had each of its non-ASCII octets changed
 203 to the Unicode replacement character.
 204
 205 If the source encoding was not specified, this routine will
 206 try to guess it; the character encoding used for a successful
 207 conversion is returned in C<$converted_from>.
 208
 209 =cut
 210
 211 sub MarcToUTF8Record {
 212     my $marc = shift;
 213     my $marc_flavour = shift;
 214     my $source_encoding = shift;
 215     my $marc_record;
 216     my $marc_blob_is_utf8 = 0;
 217     if (ref($marc) eq 'MARC::Record') {
 218         my $marc_blob = $marc->as_usmarc();
 219         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 220         $marc_record = $marc;
 221     } else {
 222         # dealing with a MARC blob
 223
 224         # remove any ersatz whitespace from the beginning and
 225         # end of the MARC blob -- these can creep into MARC
 226         # files produced by several sources -- caller really
 227         # should be doing this, however
 228         $marc =~ s/^\s+//;
 229         $marc =~ s/\s+$//;
 230         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 231         eval {
 232             $marc_record = MARC::Record->new_from_usmarc($marc);
 233         };
 234         if ($@) {
 235             # if we fail the first time, one likely problem
 236             # is that we have a MARC21 record that says that it's
 237             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 238             # We'll try parsing it again.
 239             substr($marc, 9, 1) = ' ';
 240             eval {
 241                 $marc_record = MARC::Record->new_from_usmarc($marc);
 242             };
 243             if ($@) {
 244                 # it's hopeless; return an empty MARC::Record
 245                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 246             }
 247         }
 248     }
 249
 250     # If we do not know the source encoding, try some guesses
 251     # as follows:
 252     #   1. Record is UTF-8 already.
 253     #   2. If MARC flavor is MARC21 or NORMARC, then
 254     #      a. record is MARC-8
 255     #      b. record is ISO-8859-1
 256     #   3. If MARC flavor is UNIMARC, then
 257     if (not defined $source_encoding) {
 258         if ($marc_blob_is_utf8) {
 259             # note that for MARC21/NORMARC we are not bothering to check
 260             # if the Leader/09 is set to 'a' or not -- because
 261             # of problems with various ILSs (including Koha in the
 262             # past, alas), this just is not trustworthy.
 263             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 264             return $marc_record, 'UTF-8', [];
 265         } else {
 266             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 267                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 268             } elsif ($marc_flavour =~/UNIMARC/) {
 269                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 270             } else {
 271                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 272             }
 273         }
 274     } else {
 275         # caller knows the character encoding
 276         my $original_marc_record = $marc_record->clone();
 277         my @errors;
 278         if ($source_encoding =~ /utf-?8/i) {
 279             if ($marc_blob_is_utf8) {
 280                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 281                 return $marc_record, 'UTF-8', [];
 282             } else {
 283                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 284             }
 285         } elsif ($source_encoding =~ /marc-?8/i) {
 286             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 287         } elsif ($source_encoding =~ /5426/) {
 288             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 289         } else {
 290             # assume any other character encoding is for Text::Iconv
 291             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 292         }
 293
 294         if (@errors) {
 295             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 296             return $original_marc_record, 'failed', \@errors;
 297         } else {
 298             return $marc_record, $source_encoding, [];
 299         }
 300     }
 301
 302 }
 303
 304 =head2 SetMarcUnicodeFlag
 305
 306   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 307
 308 Set both the internal MARC::Record encoding flag
 309 and the appropriate Leader/09 (MARC21) or
 310 100/26-29 (UNIMARC) to indicate that the record
 311 is in UTF-8.  Note that this does B<not> do
 312 any actual character conversion.
 313
 314 =cut
 315
 316 sub SetMarcUnicodeFlag {
 317     my $marc_record = shift;
 318     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 319
 320     $marc_record->encoding('UTF-8');
 321     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 322         my $leader = $marc_record->leader();
 323         substr($leader, 9, 1) = 'a';
 324         $marc_record->leader($leader);
 325     } elsif ($marc_flavour =~/UNIMARC/) {
 326         my $string;
 327                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 328                 $string=$marc_record->subfield( 100, "a" );
 329         if (defined $string && length($string)==$subflength) {
 330                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 331         }
 332         else {
 333             $string = POSIX::strftime( "%Y%m%d", localtime );
 334             $string =~ s/\-//g;
 335             $string = sprintf( "%-*s", $subflength, $string );
 336         }
 337         substr( $string, $encodingposition, 8, "frey50  " );
 338         if ( $marc_record->subfield( 100, "a" ) ) {
 339                         $marc_record->field('100')->update(a=>$string);
 340                 }
 341                 else {
 342             $marc_record->insert_grouped_field(
 343                 MARC::Field->new( 100, '', '', "a" => $string ) );
 344         }
 345                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 346     } else {
 347         warn "Unrecognized marcflavour: $marc_flavour";
 348     }
 349 }
 350
 351 =head2 StripNonXmlChars
 352
 353   my $new_str = StripNonXmlChars($old_str);
 354
 355 Given a string, return a copy with the
 356 characters that are illegal in XML
 357 removed.
 358
 359 This function exists to work around a problem
 360 that can occur with badly-encoded MARC records.
 361 Specifically, if a UTF-8 MARC record also
 362 has excape (\x1b) characters, MARC::File::XML
 363 will let the escape characters pass through
 364 when as_xml() or as_xml_record() is called.  The
 365 problem is that the escape character is not
 366 legal in well-formed XML documents, so when
 367 MARC::File::XML attempts to parse such a record,
 368 the XML parser will fail.
 369
 370 Stripping such characters will allow a
 371 MARC::Record->new_from_xml()
 372 to work, at the possible risk of some data loss.
 373
 374 =cut
 375
 376 sub StripNonXmlChars {
 377     my $str = shift;
 378     if (!defined($str) || $str eq ""){
 379         return "";
 380     }
 381     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 382     return $str;
 383 }
 384
 385 =head1 INTERNAL FUNCTIONS
 386
 387 =head2 _default_marc21_charconv_to_utf8
 388
 389   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 390
 391 Converts a C<MARC::Record> of unknown character set to UTF-8,
 392 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 393 to UTF-8, then a default conversion that replaces each non-ASCII
 394 character with the replacement character.
 395
 396 The C<$guessed_charset> return value contains the character set
 397 that resulted in a conversion to valid UTF-8; note that
 398 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 399 this is 'failed'.
 400
 401 =cut
 402
 403 sub _default_marc21_charconv_to_utf8 {
 404     my $marc_record = shift;
 405     my $marc_flavour = shift;
 406
 407     my $trial_marc8 = $marc_record->clone();
 408     my @all_errors = ();
 409     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 410     unless (@errors) {
 411         return $trial_marc8, 'MARC-8', [];
 412     }
 413     push @all_errors, @errors;
 414
 415     my $trial_8859_1 = $marc_record->clone();
 416     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 417     unless (@errors) {
 418         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 419                                                 # instead if we wanted to report details
 420                                                 # of the failed attempt at MARC-8 => UTF-8
 421     }
 422     push @all_errors, @errors;
 423
 424     my $default_converted = $marc_record->clone();
 425     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 426     return $default_converted, 'failed', \@all_errors;
 427 }
 428
 429 =head2 _default_unimarc_charconv_to_utf8
 430
 431   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 432
 433 Converts a C<MARC::Record> of unknown character set to UTF-8,
 434 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 435 to UTF-8, then a default conversion that replaces each non-ASCII
 436 character with the replacement character.
 437
 438 The C<$guessed_charset> return value contains the character set
 439 that resulted in a conversion to valid UTF-8; note that
 440 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 441 this is 'failed'.
 442
 443 =cut
 444
 445 sub _default_unimarc_charconv_to_utf8 {
 446     my $marc_record = shift;
 447     my $marc_flavour = shift;
 448
 449     my $trial_marc8 = $marc_record->clone();
 450     my @all_errors = ();
 451     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 452     unless (@errors) {
 453         return $trial_marc8, 'iso-5426';
 454     }
 455     push @all_errors, @errors;
 456
 457     my $trial_8859_1 = $marc_record->clone();
 458     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 459     unless (@errors) {
 460         return $trial_8859_1, 'iso-8859-1';
 461     }
 462     push @all_errors, @errors;
 463
 464     my $default_converted = $marc_record->clone();
 465     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 466     return $default_converted, 'failed', \@all_errors;
 467 }
 468
 469 =head2 _marc_marc8_to_utf8
 470
 471   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 472
 473 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 474 If the conversion fails for some reason, an
 475 appropriate messages will be placed in the returned
 476 C<@errors> array.
 477
 478 =cut
 479
 480 sub _marc_marc8_to_utf8 {
 481     my $marc_record = shift;
 482     my $marc_flavour = shift;
 483
 484     my $prev_ignore = MARC::Charset->ignore_errors();
 485     MARC::Charset->ignore_errors(1);
 486
 487     # trap warnings raised by MARC::Charset
 488     my @errors = ();
 489     local $SIG{__WARN__} = sub {
 490         my $msg = $_[0];
 491         if ($msg =~ /MARC.Charset/) {
 492             # FIXME - purpose of this regexp is to strip out the
 493             # line reference to MARC/Charset.pm, but as it
 494             # exists probably won't work quite on Windows --
 495             # some sort of minimal-bunch back-tracking RE
 496             # would be helpful here
 497             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 498             push @errors, $msg;
 499         } else {
 500             # if warning doesn't come from MARC::Charset, just
 501             # pass it on
 502             warn $msg;
 503         }
 504     };
 505
 506     foreach my $field ($marc_record->fields()) {
 507         if ($field->is_control_field()) {
 508             ; # do nothing -- control fields should not contain non-ASCII characters
 509         } else {
 510             my @converted_subfields;
 511             foreach my $subfield ($field->subfields()) {
 512                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 513                 unless (IsStringUTF8ish($utf8sf)) {
 514                     # Because of a bug in MARC::Charset 0.98, if the string
 515                     # has (a) one or more diacritics that (b) are only in character positions
 516                     # 128 to 255 inclusive, the resulting converted string is not in
 517                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 518                     # occurs, upgrade the string in place.  Moral of the story seems to be
 519                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 520                     # that the resulting string is UTF-8.
 521                     utf8::upgrade($utf8sf);
 522                 }
 523                 push @converted_subfields, $subfield->[0], $utf8sf;
 524             }
 525
 526             $field->replace_with(MARC::Field->new(
 527                 $field->tag(), $field->indicator(1), $field->indicator(2),
 528                 @converted_subfields)
 529             );
 530         }
 531     }
 532
 533     MARC::Charset->ignore_errors($prev_ignore);
 534
 535     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 536
 537     return @errors;
 538 }
 539
 540 =head2 _marc_iso5426_to_utf8
 541
 542   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 543
 544 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 545 If the conversion fails for some reason, an
 546 appropriate messages will be placed in the returned
 547 C<@errors> array.
 548
 549 FIXME - is ISO-5426 equivalent enough to MARC-8
 550 that C<MARC::Charset> can be used instead?
 551
 552 =cut
 553
 554 sub _marc_iso5426_to_utf8 {
 555     my $marc_record = shift;
 556     my $marc_flavour = shift;
 557
 558     my @errors = ();
 559
 560     foreach my $field ($marc_record->fields()) {
 561         if ($field->is_control_field()) {
 562             ; # do nothing -- control fields should not contain non-ASCII characters
 563         } else {
 564             my @converted_subfields;
 565             foreach my $subfield ($field->subfields()) {
 566                 my $utf8sf = char_decode5426($subfield->[1]);
 567                 push @converted_subfields, $subfield->[0], $utf8sf;
 568             }
 569
 570             $field->replace_with(MARC::Field->new(
 571                 $field->tag(), $field->indicator(1), $field->indicator(2),
 572                 @converted_subfields)
 573             );
 574         }
 575     }
 576
 577     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 578
 579     return @errors;
 580 }
 581
 582 =head2 _marc_to_utf8_via_text_iconv
 583
 584   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 585
 586 Convert a C<MARC::Record> to UTF-8 in-place using the
 587 C<Text::Iconv> CPAN module.  Any source encoding accepted
 588 by the user's iconv installation should work.  If
 589 the source encoding is not recognized on the user's
 590 server or the conversion fails for some reason,
 591 appropriate messages will be placed in the returned
 592 C<@errors> array.
 593
 594 =cut
 595
 596 sub _marc_to_utf8_via_text_iconv {
 597     my $marc_record = shift;
 598     my $marc_flavour = shift;
 599     my $source_encoding = shift;
 600
 601     my @errors = ();
 602     my $decoder;
 603     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 604     if ($@) {
 605         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 606         return @errors;
 607     }
 608
 609     my $prev_raise_error = Text::Iconv->raise_error();
 610     Text::Iconv->raise_error(1);
 611
 612     foreach my $field ($marc_record->fields()) {
 613         if ($field->is_control_field()) {
 614             ; # do nothing -- control fields should not contain non-ASCII characters
 615         } else {
 616             my @converted_subfields;
 617             foreach my $subfield ($field->subfields()) {
 618                 my $converted_value;
 619                 my $conversion_ok = 1;
 620                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 621                 if ($@) {
 622                     $conversion_ok = 0;
 623                     push @errors, $@;
 624                 } elsif (not defined $converted_value) {
 625                     $conversion_ok = 0;
 626                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 627                 }
 628
 629                 if ($conversion_ok) {
 630                     push @converted_subfields, $subfield->[0], $converted_value;
 631                 } else {
 632                     $converted_value = $subfield->[1];
 633                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 634                     push @converted_subfields, $subfield->[0], $converted_value;
 635                 }
 636             }
 637
 638             $field->replace_with(MARC::Field->new(
 639                 $field->tag(), $field->indicator(1), $field->indicator(2),
 640                 @converted_subfields)
 641             );
 642         }
 643     }
 644
 645     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 646     Text::Iconv->raise_error($prev_raise_error);
 647
 648     return @errors;
 649 }
 650
 651 =head2 _marc_to_utf8_replacement_char
 652
 653   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 654
 655 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 656 unsatisfactory method of replacing all non-ASCII (e.g.,
 657 where the eight bit is set) octet with the Unicode
 658 replacement character.  This is meant as a last-ditch
 659 method, and would be best used as part of a UI that
 660 lets a cataloguer pick various character conversions
 661 until he or she finds the right one.
 662
 663 =cut
 664
 665 sub _marc_to_utf8_replacement_char {
 666     my $marc_record = shift;
 667     my $marc_flavour = shift;
 668
 669     foreach my $field ($marc_record->fields()) {
 670         if ($field->is_control_field()) {
 671             ; # do nothing -- control fields should not contain non-ASCII characters
 672         } else {
 673             my @converted_subfields;
 674             foreach my $subfield ($field->subfields()) {
 675                 my $value = $subfield->[1];
 676                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 677                 push @converted_subfields, $subfield->[0], $value;
 678             }
 679
 680             $field->replace_with(MARC::Field->new(
 681                 $field->tag(), $field->indicator(1), $field->indicator(2),
 682                 @converted_subfields)
 683             );
 684         }
 685     }
 686
 687     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 688 }
 689
 690 =head2 char_decode5426
 691
 692   my $utf8string = char_decode5426($iso_5426_string);
 693
 694 Converts a string from ISO-5426 to UTF-8.
 695
 696 =cut
 697
 698
 699 my %chars;
 700 $chars{0xb0}=0x0101;#3/0ayn[ain]
 701 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 702 #$chars{0xb2}=0x00e0;#'à';
 703 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 704 #$chars{0xb3}=0x00e7;#'ç';
 705 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 706 # $chars{0xb4}='è';
 707 $chars{0xb4}=0x00e8;
 708 $chars{0xbd}=0x02b9;
 709 $chars{0xbe}=0x02ba;
 710 # $chars{0xb5}='é';
 711 $chars{0xb5}=0x00e9;
 712 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 713 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 714 $chars{0xfa}=0x0153; #oe
 715 $chars{0xea}=0x0152; #oe
 716 $chars{0x81d1}=0x00b0;
 717
 718 ####
 719 ## combined characters iso5426
 720
 721 $chars{0xc041}=0x1ea2; # capital a with hook above
 722 $chars{0xc045}=0x1eba; # capital e with hook above
 723 $chars{0xc049}=0x1ec8; # capital i with hook above
 724 $chars{0xc04f}=0x1ece; # capital o with hook above
 725 $chars{0xc055}=0x1ee6; # capital u with hook above
 726 $chars{0xc059}=0x1ef6; # capital y with hook above
 727 $chars{0xc061}=0x1ea3; # small a with hook above
 728 $chars{0xc065}=0x1ebb; # small e with hook above
 729 $chars{0xc069}=0x1ec9; # small i with hook above
 730 $chars{0xc06f}=0x1ecf; # small o with hook above
 731 $chars{0xc075}=0x1ee7; # small u with hook above
 732 $chars{0xc079}=0x1ef7; # small y with hook above
 733
 734         # 4/1 grave accent
 735 $chars{0xc141}=0x00c0; # capital a with grave accent
 736 $chars{0xc145}=0x00c8; # capital e with grave accent
 737 $chars{0xc149}=0x00cc; # capital i with grave accent
 738 $chars{0xc14f}=0x00d2; # capital o with grave accent
 739 $chars{0xc155}=0x00d9; # capital u with grave accent
 740 $chars{0xc157}=0x1e80; # capital w with grave
 741 $chars{0xc159}=0x1ef2; # capital y with grave
 742 $chars{0xc161}=0x00e0; # small a with grave accent
 743 $chars{0xc165}=0x00e8; # small e with grave accent
 744 $chars{0xc169}=0x00ec; # small i with grave accent
 745 $chars{0xc16f}=0x00f2; # small o with grave accent
 746 $chars{0xc175}=0x00f9; # small u with grave accent
 747 $chars{0xc177}=0x1e81; # small w with grave
 748 $chars{0xc179}=0x1ef3; # small y with grave
 749         # 4/2 acute accent
 750 $chars{0xc241}=0x00c1; # capital a with acute accent
 751 $chars{0xc243}=0x0106; # capital c with acute accent
 752 $chars{0xc245}=0x00c9; # capital e with acute accent
 753 $chars{0xc247}=0x01f4; # capital g with acute
 754 $chars{0xc249}=0x00cd; # capital i with acute accent
 755 $chars{0xc24b}=0x1e30; # capital k with acute
 756 $chars{0xc24c}=0x0139; # capital l with acute accent
 757 $chars{0xc24d}=0x1e3e; # capital m with acute
 758 $chars{0xc24e}=0x0143; # capital n with acute accent
 759 $chars{0xc24f}=0x00d3; # capital o with acute accent
 760 $chars{0xc250}=0x1e54; # capital p with acute
 761 $chars{0xc252}=0x0154; # capital r with acute accent
 762 $chars{0xc253}=0x015a; # capital s with acute accent
 763 $chars{0xc255}=0x00da; # capital u with acute accent
 764 $chars{0xc257}=0x1e82; # capital w with acute
 765 $chars{0xc259}=0x00dd; # capital y with acute accent
 766 $chars{0xc25a}=0x0179; # capital z with acute accent
 767 $chars{0xc261}=0x00e1; # small a with acute accent
 768 $chars{0xc263}=0x0107; # small c with acute accent
 769 $chars{0xc265}=0x00e9; # small e with acute accent
 770 $chars{0xc267}=0x01f5; # small g with acute
 771 $chars{0xc269}=0x00ed; # small i with acute accent
 772 $chars{0xc26b}=0x1e31; # small k with acute
 773 $chars{0xc26c}=0x013a; # small l with acute accent
 774 $chars{0xc26d}=0x1e3f; # small m with acute
 775 $chars{0xc26e}=0x0144; # small n with acute accent
 776 $chars{0xc26f}=0x00f3; # small o with acute accent
 777 $chars{0xc270}=0x1e55; # small p with acute
 778 $chars{0xc272}=0x0155; # small r with acute accent
 779 $chars{0xc273}=0x015b; # small s with acute accent
 780 $chars{0xc275}=0x00fa; # small u with acute accent
 781 $chars{0xc277}=0x1e83; # small w with acute
 782 $chars{0xc279}=0x00fd; # small y with acute accent
 783 $chars{0xc27a}=0x017a; # small z with acute accent
 784 $chars{0xc2e1}=0x01fc; # capital ae with acute
 785 $chars{0xc2f1}=0x01fd; # small ae with acute
 786        # 4/3 circumflex accent
 787 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 788 $chars{0xc343}=0x0108; # capital c with circumflex
 789 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 790 $chars{0xc347}=0x011c; # capital g with circumflex
 791 $chars{0xc348}=0x0124; # capital h with circumflex
 792 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 793 $chars{0xc34a}=0x0134; # capital j with circumflex
 794 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 795 $chars{0xc353}=0x015c; # capital s with circumflex
 796 $chars{0xc355}=0x00db; # capital u with circumflex
 797 $chars{0xc357}=0x0174; # capital w with circumflex
 798 $chars{0xc359}=0x0176; # capital y with circumflex
 799 $chars{0xc35a}=0x1e90; # capital z with circumflex
 800 $chars{0xc361}=0x00e2; # small a with circumflex accent
 801 $chars{0xc363}=0x0109; # small c with circumflex
 802 $chars{0xc365}=0x00ea; # small e with circumflex accent
 803 $chars{0xc367}=0x011d; # small g with circumflex
 804 $chars{0xc368}=0x0125; # small h with circumflex
 805 $chars{0xc369}=0x00ee; # small i with circumflex accent
 806 $chars{0xc36a}=0x0135; # small j with circumflex
 807 $chars{0xc36e}=0x00f1; # small n with tilde
 808 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 809 $chars{0xc373}=0x015d; # small s with circumflex
 810 $chars{0xc375}=0x00fb; # small u with circumflex
 811 $chars{0xc377}=0x0175; # small w with circumflex
 812 $chars{0xc379}=0x0177; # small y with circumflex
 813 $chars{0xc37a}=0x1e91; # small z with circumflex
 814         # 4/4 tilde
 815 $chars{0xc441}=0x00c3; # capital a with tilde
 816 $chars{0xc445}=0x1ebc; # capital e with tilde
 817 $chars{0xc449}=0x0128; # capital i with tilde
 818 $chars{0xc44e}=0x00d1; # capital n with tilde
 819 $chars{0xc44f}=0x00d5; # capital o with tilde
 820 $chars{0xc455}=0x0168; # capital u with tilde
 821 $chars{0xc456}=0x1e7c; # capital v with tilde
 822 $chars{0xc459}=0x1ef8; # capital y with tilde
 823 $chars{0xc461}=0x00e3; # small a with tilde
 824 $chars{0xc465}=0x1ebd; # small e with tilde
 825 $chars{0xc469}=0x0129; # small i with tilde
 826 $chars{0xc46e}=0x00f1; # small n with tilde
 827 $chars{0xc46f}=0x00f5; # small o with tilde
 828 $chars{0xc475}=0x0169; # small u with tilde
 829 $chars{0xc476}=0x1e7d; # small v with tilde
 830 $chars{0xc479}=0x1ef9; # small y with tilde
 831     # 4/5 macron
 832 $chars{0xc541}=0x0100; # capital a with macron
 833 $chars{0xc545}=0x0112; # capital e with macron
 834 $chars{0xc547}=0x1e20; # capital g with macron
 835 $chars{0xc549}=0x012a; # capital i with macron
 836 $chars{0xc54f}=0x014c; # capital o with macron
 837 $chars{0xc555}=0x016a; # capital u with macron
 838 $chars{0xc561}=0x0101; # small a with macron
 839 $chars{0xc565}=0x0113; # small e with macron
 840 $chars{0xc567}=0x1e21; # small g with macron
 841 $chars{0xc569}=0x012b; # small i with macron
 842 $chars{0xc56f}=0x014d; # small o with macron
 843 $chars{0xc575}=0x016b; # small u with macron
 844 $chars{0xc572}=0x0159; # small r with macron
 845 $chars{0xc5e1}=0x01e2; # capital ae with macron
 846 $chars{0xc5f1}=0x01e3; # small ae with macron
 847         # 4/6 breve
 848 $chars{0xc641}=0x0102; # capital a with breve
 849 $chars{0xc645}=0x0114; # capital e with breve
 850 $chars{0xc647}=0x011e; # capital g with breve
 851 $chars{0xc649}=0x012c; # capital i with breve
 852 $chars{0xc64f}=0x014e; # capital o with breve
 853 $chars{0xc655}=0x016c; # capital u with breve
 854 $chars{0xc661}=0x0103; # small a with breve
 855 $chars{0xc665}=0x0115; # small e with breve
 856 $chars{0xc667}=0x011f; # small g with breve
 857 $chars{0xc669}=0x012d; # small i with breve
 858 $chars{0xc66f}=0x014f; # small o with breve
 859 $chars{0xc675}=0x016d; # small u with breve
 860         # 4/7 dot above
 861 $chars{0xc7b0}=0x01e1; # Ain with dot above
 862 $chars{0xc742}=0x1e02; # capital b with dot above
 863 $chars{0xc743}=0x010a; # capital c with dot above
 864 $chars{0xc744}=0x1e0a; # capital d with dot above
 865 $chars{0xc745}=0x0116; # capital e with dot above
 866 $chars{0xc746}=0x1e1e; # capital f with dot above
 867 $chars{0xc747}=0x0120; # capital g with dot above
 868 $chars{0xc748}=0x1e22; # capital h with dot above
 869 $chars{0xc749}=0x0130; # capital i with dot above
 870 $chars{0xc74d}=0x1e40; # capital m with dot above
 871 $chars{0xc74e}=0x1e44; # capital n with dot above
 872 $chars{0xc750}=0x1e56; # capital p with dot above
 873 $chars{0xc752}=0x1e58; # capital r with dot above
 874 $chars{0xc753}=0x1e60; # capital s with dot above
 875 $chars{0xc754}=0x1e6a; # capital t with dot above
 876 $chars{0xc757}=0x1e86; # capital w with dot above
 877 $chars{0xc758}=0x1e8a; # capital x with dot above
 878 $chars{0xc759}=0x1e8e; # capital y with dot above
 879 $chars{0xc75a}=0x017b; # capital z with dot above
 880 $chars{0xc761}=0x0227; # small b with dot above
 881 $chars{0xc762}=0x1e03; # small b with dot above
 882 $chars{0xc763}=0x010b; # small c with dot above
 883 $chars{0xc764}=0x1e0b; # small d with dot above
 884 $chars{0xc765}=0x0117; # small e with dot above
 885 $chars{0xc766}=0x1e1f; # small f with dot above
 886 $chars{0xc767}=0x0121; # small g with dot above
 887 $chars{0xc768}=0x1e23; # small h with dot above
 888 $chars{0xc76d}=0x1e41; # small m with dot above
 889 $chars{0xc76e}=0x1e45; # small n with dot above
 890 $chars{0xc770}=0x1e57; # small p with dot above
 891 $chars{0xc772}=0x1e59; # small r with dot above
 892 $chars{0xc773}=0x1e61; # small s with dot above
 893 $chars{0xc774}=0x1e6b; # small t with dot above
 894 $chars{0xc777}=0x1e87; # small w with dot above
 895 $chars{0xc778}=0x1e8b; # small x with dot above
 896 $chars{0xc779}=0x1e8f; # small y with dot above
 897 $chars{0xc77a}=0x017c; # small z with dot above
 898         # 4/8 trema, diaresis
 899 $chars{0xc820}=0x00a8; # diaeresis
 900 $chars{0xc841}=0x00c4; # capital a with diaeresis
 901 $chars{0xc845}=0x00cb; # capital e with diaeresis
 902 $chars{0xc848}=0x1e26; # capital h with diaeresis
 903 $chars{0xc849}=0x00cf; # capital i with diaeresis
 904 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 905 $chars{0xc855}=0x00dc; # capital u with diaeresis
 906 $chars{0xc857}=0x1e84; # capital w with diaeresis
 907 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 908 $chars{0xc859}=0x0178; # capital y with diaeresis
 909 $chars{0xc861}=0x00e4; # small a with diaeresis
 910 $chars{0xc865}=0x00eb; # small e with diaeresis
 911 $chars{0xc868}=0x1e27; # small h with diaeresis
 912 $chars{0xc869}=0x00ef; # small i with diaeresis
 913 $chars{0xc86f}=0x00f6; # small o with diaeresis
 914 $chars{0xc874}=0x1e97; # small t with diaeresis
 915 $chars{0xc875}=0x00fc; # small u with diaeresis
 916 $chars{0xc877}=0x1e85; # small w with diaeresis
 917 $chars{0xc878}=0x1e8d; # small x with diaeresis
 918 $chars{0xc879}=0x00ff; # small y with diaeresis
 919         # 4/9 umlaut
 920 $chars{0xc920}=0x00a8; # [diaeresis]
 921 $chars{0xc961}=0x00e4; # a with umlaut
 922 $chars{0xc965}=0x00eb; # e with umlaut
 923 $chars{0xc969}=0x00ef; # i with umlaut
 924 $chars{0xc96f}=0x00f6; # o with umlaut
 925 $chars{0xc975}=0x00fc; # u with umlaut
 926         # 4/10 circle above
 927 $chars{0xca41}=0x00c5; # capital a with ring above
 928 $chars{0xcaad}=0x016e; # capital u with ring above
 929 $chars{0xca61}=0x00e5; # small a with ring above
 930 $chars{0xca75}=0x016f; # small u with ring above
 931 $chars{0xca77}=0x1e98; # small w with ring above
 932 $chars{0xca79}=0x1e99; # small y with ring above
 933         # 4/11 high comma off centre
 934         # 4/12 inverted high comma centred
 935         # 4/13 double acute accent
 936 $chars{0xcd4f}=0x0150; # capital o with double acute
 937 $chars{0xcd55}=0x0170; # capital u with double acute
 938 $chars{0xcd6f}=0x0151; # small o with double acute
 939 $chars{0xcd75}=0x0171; # small u with double acute
 940         # 4/14 horn
 941 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 942 $chars{0xce55}=0x01af; # latin capital letter u with horn
 943 $chars{0xce74}=0x01a1; # latin small letter o with horn
 944 $chars{0xce75}=0x01b0; # latin small letter u with horn
 945         # 4/15 caron (hacek
 946 $chars{0xcf41}=0x01cd; # capital a with caron
 947 $chars{0xcf43}=0x010c; # capital c with caron
 948 $chars{0xcf44}=0x010e; # capital d with caron
 949 $chars{0xcf45}=0x011a; # capital e with caron
 950 $chars{0xcf47}=0x01e6; # capital g with caron
 951 $chars{0xcf49}=0x01cf; # capital i with caron
 952 $chars{0xcf4b}=0x01e8; # capital k with caron
 953 $chars{0xcf4c}=0x013d; # capital l with caron
 954 $chars{0xcf4e}=0x0147; # capital n with caron
 955 $chars{0xcf4f}=0x01d1; # capital o with caron
 956 $chars{0xcf52}=0x0158; # capital r with caron
 957 $chars{0xcf53}=0x0160; # capital s with caron
 958 $chars{0xcf54}=0x0164; # capital t with caron
 959 $chars{0xcf55}=0x01d3; # capital u with caron
 960 $chars{0xcf5a}=0x017d; # capital z with caron
 961 $chars{0xcf61}=0x01ce; # small a with caron
 962 $chars{0xcf63}=0x010d; # small c with caron
 963 $chars{0xcf64}=0x010f; # small d with caron
 964 $chars{0xcf65}=0x011b; # small e with caron
 965 $chars{0xcf67}=0x01e7; # small g with caron
 966 $chars{0xcf69}=0x01d0; # small i with caron
 967 $chars{0xcf6a}=0x01f0; # small j with caron
 968 $chars{0xcf6b}=0x01e9; # small k with caron
 969 $chars{0xcf6c}=0x013e; # small l with caron
 970 $chars{0xcf6e}=0x0148; # small n with caron
 971 $chars{0xcf6f}=0x01d2; # small o with caron
 972 $chars{0xcf72}=0x0159; # small r with caron
 973 $chars{0xcf73}=0x0161; # small s with caron
 974 $chars{0xcf74}=0x0165; # small t with caron
 975 $chars{0xcf75}=0x01d4; # small u with caron
 976 $chars{0xcf7a}=0x017e; # small z with caron
 977         # 5/0 cedilla
 978 $chars{0xd020}=0x00b8; # cedilla
 979 $chars{0xd043}=0x00c7; # capital c with cedilla
 980 $chars{0xd044}=0x1e10; # capital d with cedilla
 981 $chars{0xd047}=0x0122; # capital g with cedilla
 982 $chars{0xd048}=0x1e28; # capital h with cedilla
 983 $chars{0xd04b}=0x0136; # capital k with cedilla
 984 $chars{0xd04c}=0x013b; # capital l with cedilla
 985 $chars{0xd04e}=0x0145; # capital n with cedilla
 986 $chars{0xd052}=0x0156; # capital r with cedilla
 987 $chars{0xd053}=0x015e; # capital s with cedilla
 988 $chars{0xd054}=0x0162; # capital t with cedilla
 989 $chars{0xd063}=0x00e7; # small c with cedilla
 990 $chars{0xd064}=0x1e11; # small d with cedilla
 991 $chars{0xd065}=0x0119; # small e with cedilla
 992 $chars{0xd067}=0x0123; # small g with cedilla
 993 $chars{0xd068}=0x1e29; # small h with cedilla
 994 $chars{0xd06b}=0x0137; # small k with cedilla
 995 $chars{0xd06c}=0x013c; # small l with cedilla
 996 $chars{0xd06e}=0x0146; # small n with cedilla
 997 $chars{0xd072}=0x0157; # small r with cedilla
 998 $chars{0xd073}=0x015f; # small s with cedilla
 999 $chars{0xd074}=0x0163; # small t with cedilla
1000         # 5/1 rude
1001         # 5/2 hook to left
1002         # 5/3 ogonek (hook to right
1003 $chars{0xd320}=0x02db; # ogonek
1004 $chars{0xd341}=0x0104; # capital a with ogonek
1005 $chars{0xd345}=0x0118; # capital e with ogonek
1006 $chars{0xd349}=0x012e; # capital i with ogonek
1007 $chars{0xd34f}=0x01ea; # capital o with ogonek
1008 $chars{0xd355}=0x0172; # capital u with ogonek
1009 $chars{0xd361}=0x0105; # small a with ogonek
1010 $chars{0xd365}=0x0119; # small e with ogonek
1011 $chars{0xd369}=0x012f; # small i with ogonek
1012 $chars{0xd36f}=0x01eb; # small o with ogonek
1013 $chars{0xd375}=0x0173; # small u with ogonek
1014         # 5/4 circle below
1015 $chars{0xd441}=0x1e00; # capital a with ring below
1016 $chars{0xd461}=0x1e01; # small a with ring below
1017         # 5/5 half circle below
1018 $chars{0xf948}=0x1e2a; # capital h with breve below
1019 $chars{0xf968}=0x1e2b; # small h with breve below
1020         # 5/6 dot below
1021 $chars{0xd641}=0x1ea0; # capital a with dot below
1022 $chars{0xd642}=0x1e04; # capital b with dot below
1023 $chars{0xd644}=0x1e0c; # capital d with dot below
1024 $chars{0xd645}=0x1eb8; # capital e with dot below
1025 $chars{0xd648}=0x1e24; # capital h with dot below
1026 $chars{0xd649}=0x1eca; # capital i with dot below
1027 $chars{0xd64b}=0x1e32; # capital k with dot below
1028 $chars{0xd64c}=0x1e36; # capital l with dot below
1029 $chars{0xd64d}=0x1e42; # capital m with dot below
1030 $chars{0xd64e}=0x1e46; # capital n with dot below
1031 $chars{0xd64f}=0x1ecc; # capital o with dot below
1032 $chars{0xd652}=0x1e5a; # capital r with dot below
1033 $chars{0xd653}=0x1e62; # capital s with dot below
1034 $chars{0xd654}=0x1e6c; # capital t with dot below
1035 $chars{0xd655}=0x1ee4; # capital u with dot below
1036 $chars{0xd656}=0x1e7e; # capital v with dot below
1037 $chars{0xd657}=0x1e88; # capital w with dot below
1038 $chars{0xd659}=0x1ef4; # capital y with dot below
1039 $chars{0xd65a}=0x1e92; # capital z with dot below
1040 $chars{0xd661}=0x1ea1; # small a with dot below
1041 $chars{0xd662}=0x1e05; # small b with dot below
1042 $chars{0xd664}=0x1e0d; # small d with dot below
1043 $chars{0xd665}=0x1eb9; # small e with dot below
1044 $chars{0xd668}=0x1e25; # small h with dot below
1045 $chars{0xd669}=0x1ecb; # small i with dot below
1046 $chars{0xd66b}=0x1e33; # small k with dot below
1047 $chars{0xd66c}=0x1e37; # small l with dot below
1048 $chars{0xd66d}=0x1e43; # small m with dot below
1049 $chars{0xd66e}=0x1e47; # small n with dot below
1050 $chars{0xd66f}=0x1ecd; # small o with dot below
1051 $chars{0xd672}=0x1e5b; # small r with dot below
1052 $chars{0xd673}=0x1e63; # small s with dot below
1053 $chars{0xd674}=0x1e6d; # small t with dot below
1054 $chars{0xd675}=0x1ee5; # small u with dot below
1055 $chars{0xd676}=0x1e7f; # small v with dot below
1056 $chars{0xd677}=0x1e89; # small w with dot below
1057 $chars{0xd679}=0x1ef5; # small y with dot below
1058 $chars{0xd67a}=0x1e93; # small z with dot below
1059         # 5/7 double dot below
1060 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1061 $chars{0xd775}=0x1e73; # small u with diaeresis below
1062         # 5/8 underline
1063 $chars{0xd820}=0x005f; # underline
1064         # 5/9 double underline
1065 $chars{0xd920}=0x2017; # double underline
1066         # 5/10 small low vertical bar
1067 $chars{0xda20}=0x02cc; #
1068         # 5/11 circumflex below
1069         # 5/12 (this position shall not be used)
1070         # 5/13 left half of ligature sign and of double tilde
1071         # 5/14 right half of ligature sign
1072         # 5/15 right half of double tilde
1073 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1074
1075 sub char_decode5426 {
1076     my ( $string) = @_;
1077     my $result;
1078
1079     my @data = unpack("C*", $string);
1080     my @characters;
1081     my $length=scalar(@data);
1082     for (my $i = 0; $i < scalar(@data); $i++) {
1083       my $char= $data[$i];
1084       if ($char >= 0x00 && $char <= 0x7F){
1085         #IsAscii
1086
1087           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1088       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1089         #Combined Char
1090         my $convchar ;
1091         if ($chars{$char*256+$data[$i+1]}) {
1092           $convchar= $chars{$char * 256 + $data[$i+1]};
1093           $i++;
1094 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1095         } elsif ($chars{$char})  {
1096           $convchar= $chars{$char};
1097 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1098         }else {
1099           $convchar=$char;
1100         }
1101         push @characters,$convchar;
1102       } else {
1103         my $convchar;
1104         if ($chars{$char})  {
1105           $convchar= $chars{$char};
1106 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1107         }else {
1108 #            printf "char %x $char\n",$char;
1109           $convchar=$char;
1110         }
1111         push @characters,$convchar;
1112       }
1113     }
1114     $result=pack "U*",@characters;
1115 #     $result=~s/\x01//;
1116 #     $result=~s/\x00//;
1117      $result=~s/\x0f//;
1118      $result=~s/\x1b.//;
1119      $result=~s/\x0e//;
1120      $result=~s/\x1b\x5b//;
1121 #   map{printf "%x",$_} @characters;
1122 #   printf "\n";
1123   return $result;
1124 }
1125
1126 1;
1127
1128
1129 =head1 AUTHOR
1130
1131 Koha Development Team <http://koha-community.org/>
1132
1133 Galen Charlton <galen.charlton@liblime.com>
1134
1135 =cut