C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it under the
   8 # terms of the GNU General Public License as published by the Free Software
   9 # Foundation; either version 2 of the License, or (at your option) any later
  10 # version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with Koha; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27
  28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  29
  30 BEGIN {
  31     # set the version for version checking
  32     $VERSION = 3.01;
  33     require Exporter;
  34     @ISA    = qw(Exporter);
  35     @EXPORT = qw(
  36         IsStringUTF8ish
  37         MarcToUTF8Record
  38         SetUTF8Flag
  39         SetMarcUnicodeFlag
  40         StripNonXmlChars
  41     );
  42 }
  43
  44 =head1 NAME
  45
  46 C4::Charset - utilities for handling character set conversions.
  47
  48 =head1 SYNOPSIS
  49
  50   use C4::Charset;
  51
  52 =head1 DESCRIPTION
  53
  54 This module contains routines for dealing with character set
  55 conversions, particularly for MARC records.
  56
  57 A variety of character encodings are in use by various MARC
  58 standards, and even more character encodings are used by
  59 non-standard MARC records.  The various MARC formats generally
  60 do not do a good job of advertising a given record's character
  61 encoding, and even when a record does advertise its encoding,
  62 e.g., via the Leader/09, experience has shown that one cannot
  63 trust it.
  64
  65 Ultimately, all MARC records are stored in Koha in UTF-8 and
  66 must be converted from whatever the source character encoding is.
  67 The goal of this module is to ensure that these conversions
  68 take place accurately.  When a character conversion cannot take
  69 place, or at least not accurately, the module was provide
  70 enough information to allow user-facing code to inform the user
  71 on how to deal with the situation.
  72
  73 =cut
  74
  75 =head1 FUNCTIONS
  76
  77 =head2 IsStringUTF8ish
  78
  79   my $is_utf8 = IsStringUTF8ish($str);
  80
  81 Determines if C<$str> is valid UTF-8.  This can mean
  82 one of two things:
  83
  84 =over
  85
  86 =item *
  87
  88 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  89
  90 =item *
  91
  92 The Perl UTF-8 flag is B<not> set, but the octets contain
  93 valid UTF-8.
  94
  95 =back
  96
  97 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
  98 because in one could be presented with a MARC blob that is
  99 not actually in UTF-8 but whose sequence of octets appears to be
 100 valid UTF-8.  The rest of the MARC character conversion functions
 101 will assume that this situation occur does not very often.
 102
 103 =cut
 104
 105 sub IsStringUTF8ish {
 106     my $str = shift;
 107
 108     return 1 if utf8::is_utf8($str);
 109     return utf8::decode($str);
 110 }
 111
 112 =head2 SetUTF8Flag
 113
 114   my $marc_record = SetUTF8Flag($marc_record);
 115
 116 This function sets the PERL UTF8 flag for data.
 117 It is required when using new_from_usmarc
 118 since MARC::File::USMARC does not handle PERL UTF8 setting.
 119 When editing unicode marc records fields and subfields, you
 120 would end up in double encoding without using this function.
 121
 122 FIXME
 123 In my opinion, this function belongs to MARC::Record and not
 124 to this package.
 125 But since it handles charset, and MARC::Record, it finds its way in that package
 126
 127 =cut
 128
 129 sub SetUTF8Flag{
 130         my ($record)=@_;
 131         return unless ($record && $record->fields());
 132         foreach my $field ($record->fields()){
 133                 if ($field->tag()>=10){
 134                         my @subfields;
 135                         foreach my $subfield ($field->subfields()){
 136                                 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
 137                         }
 138                         my $newfield=MARC::Field->new(
 139                                                         $field->tag(),
 140                                                         $field->indicator(1),
 141                                                         $field->indicator(2),
 142                                                         @subfields
 143                                                 );
 144                         $field->replace_with($newfield);
 145                 }
 146         }
 147 }
 148
 149 =head2 NormalizeString
 150
 151     my $normalized_string=NormalizeString($string);
 152
 153 Given a string
 154
 155 nfc : If you want to set NFC and not NFD
 156 transform : If you expect all the signs to be removed
 157 Sets the PERL UTF8 Flag on your initial data if need be
 158 and applies cleaning if required
 159
 160 Returns a utf8 NFD normalized string
 161
 162 Sample code :
 163     my $string=NormalizeString ("l'ornithoptère");
 164     #results into ornithoptère in NFD form and sets UTF8 Flag
 165
 166 =cut
 167
 168 sub NormalizeString{
 169         my ($string,$nfc,$transform)=@_;
 170         utf8::decode($string) unless (utf8::is_utf8($string));
 171         if ($nfc){
 172                 $string= NFD($string);
 173         }
 174         else {
 175                 $string=NFC($string);
 176         }
 177         if ($transform){
 178     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 179         #removing one letter words "d'" "l'"  was changed into "d " "l "
 180     $string=~s/\b\S\b//g;
 181     $string=~s/\s+$//g;
 182         }
 183     return $string;
 184 }
 185
 186 =head2 MarcToUTF8Record
 187
 188   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 189                                         $marc_flavour, [, $source_encoding]);
 190
 191 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 192 optional source encoding, return a C<MARC::Record> that is
 193 converted to UTF-8.
 194
 195 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 196 is not guaranteed to have been converted correctly.  Specifically,
 197 if C<$converted_from> is 'failed', the MARC record returned failed
 198 character conversion and had each of its non-ASCII octets changed
 199 to the Unicode replacement character.
 200
 201 If the source encoding was not specified, this routine will
 202 try to guess it; the character encoding used for a successful
 203 conversion is returned in C<$converted_from>.
 204
 205 =cut
 206
 207 sub MarcToUTF8Record {
 208     my $marc = shift;
 209     my $marc_flavour = shift;
 210     my $source_encoding = shift;
 211     my $marc_record;
 212     my $marc_blob_is_utf8 = 0;
 213     if (ref($marc) eq 'MARC::Record') {
 214         my $marc_blob = $marc->as_usmarc();
 215         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 216         $marc_record = $marc;
 217     } else {
 218         # dealing with a MARC blob
 219
 220         # remove any ersatz whitespace from the beginning and
 221         # end of the MARC blob -- these can creep into MARC
 222         # files produced by several sources -- caller really
 223         # should be doing this, however
 224         $marc =~ s/^\s+//;
 225         $marc =~ s/\s+$//;
 226         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 227         eval {
 228             $marc_record = MARC::Record->new_from_usmarc($marc);
 229         };
 230         if ($@) {
 231             # if we fail the first time, one likely problem
 232             # is that we have a MARC21 record that says that it's
 233             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 234             # We'll try parsing it again.
 235             substr($marc, 9, 1) = ' ';
 236             eval {
 237                 $marc_record = MARC::Record->new_from_usmarc($marc);
 238             };
 239             if ($@) {
 240                 # it's hopeless; return an empty MARC::Record
 241                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 242             }
 243         }
 244     }
 245
 246     # If we do not know the source encoding, try some guesses
 247     # as follows:
 248     #   1. Record is UTF-8 already.
 249     #   2. If MARC flavor is MARC21, then
 250     #      a. record is MARC-8
 251     #      b. record is ISO-8859-1
 252     #   3. If MARC flavor is UNIMARC, then
 253     if (not defined $source_encoding) {
 254         if ($marc_blob_is_utf8) {
 255             # note that for MARC21 we are not bothering to check
 256             # if the Leader/09 is set to 'a' or not -- because
 257             # of problems with various ILSs (including Koha in the
 258             # past, alas), this just is not trustworthy.
 259             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 260             return $marc_record, 'UTF-8', [];
 261         } else {
 262             if ($marc_flavour eq 'MARC21') {
 263                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 264             } elsif ($marc_flavour =~/UNIMARC/) {
 265                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 266             } else {
 267                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 268             }
 269         }
 270     } else {
 271         # caller knows the character encoding
 272         my $original_marc_record = $marc_record->clone();
 273         my @errors;
 274         if ($source_encoding =~ /utf-?8/i) {
 275             if ($marc_blob_is_utf8) {
 276                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 277                 return $marc_record, 'UTF-8', [];
 278             } else {
 279                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 280             }
 281         } elsif ($source_encoding =~ /marc-?8/i) {
 282             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 283         } elsif ($source_encoding =~ /5426/) {
 284             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 285         } else {
 286             # assume any other character encoding is for Text::Iconv
 287             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 288         }
 289
 290         if (@errors) {
 291             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 292             return $original_marc_record, 'failed', \@errors;
 293         } else {
 294             return $marc_record, $source_encoding, [];
 295         }
 296     }
 297
 298 }
 299
 300 =head2 SetMarcUnicodeFlag
 301
 302   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 303
 304 Set both the internal MARC::Record encoding flag
 305 and the appropriate Leader/09 (MARC21) or
 306 100/26-29 (UNIMARC) to indicate that the record
 307 is in UTF-8.  Note that this does B<not> do
 308 any actual character conversion.
 309
 310 =cut
 311
 312 sub SetMarcUnicodeFlag {
 313     my $marc_record = shift;
 314     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 315
 316     $marc_record->encoding('UTF-8');
 317     if ($marc_flavour eq 'MARC21') {
 318         my $leader = $marc_record->leader();
 319         substr($leader, 9, 1) = 'a';
 320         $marc_record->leader($leader);
 321     } elsif ($marc_flavour =~/UNIMARC/) {
 322         my $string;
 323                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
 324                 $string=$marc_record->subfield( 100, "a" );
 325         if (defined $string && length($string)==$subflength) {
 326                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 327         }
 328         else {
 329             $string = POSIX::strftime( "%Y%m%d", localtime );
 330             $string =~ s/\-//g;
 331             $string = sprintf( "%-*s", $subflength, $string );
 332         }
 333         substr( $string, $encodingposition, 8, "frey50  " );
 334         if ( $marc_record->subfield( 100, "a" ) ) {
 335                         $marc_record->field('100')->update(a=>$string);
 336                 }
 337                 else {
 338             $marc_record->insert_grouped_field(
 339                 MARC::Field->new( 100, '', '', "a" => $string ) );
 340         }
 341                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
 342     } else {
 343         warn "Unrecognized marcflavour: $marc_flavour";
 344     }
 345 }
 346
 347 =head2 StripNonXmlChars
 348
 349   my $new_str = StripNonXmlChars($old_str);
 350
 351 Given a string, return a copy with the
 352 characters that are illegal in XML
 353 removed.
 354
 355 This function exists to work around a problem
 356 that can occur with badly-encoded MARC records.
 357 Specifically, if a UTF-8 MARC record also
 358 has excape (\x1b) characters, MARC::File::XML
 359 will let the escape characters pass through
 360 when as_xml() or as_xml_record() is called.  The
 361 problem is that the escape character is not
 362 legal in well-formed XML documents, so when
 363 MARC::File::XML attempts to parse such a record,
 364 the XML parser will fail.
 365
 366 Stripping such characters will allow a
 367 MARC::Record->new_from_xml()
 368 to work, at the possible risk of some data loss.
 369
 370 =cut
 371
 372 sub StripNonXmlChars {
 373     my $str = shift;
 374     if (!defined($str) || $str eq ""){
 375         return "";
 376     }
 377     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 378     return $str;
 379 }
 380
 381 =head1 INTERNAL FUNCTIONS
 382
 383 =head2 _default_marc21_charconv_to_utf8
 384
 385   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 386
 387 Converts a C<MARC::Record> of unknown character set to UTF-8,
 388 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 389 to UTF-8, then a default conversion that replaces each non-ASCII
 390 character with the replacement character.
 391
 392 The C<$guessed_charset> return value contains the character set
 393 that resulted in a conversion to valid UTF-8; note that
 394 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 395 this is 'failed'.
 396
 397 =cut
 398
 399 sub _default_marc21_charconv_to_utf8 {
 400     my $marc_record = shift;
 401     my $marc_flavour = shift;
 402
 403     my $trial_marc8 = $marc_record->clone();
 404     my @all_errors = ();
 405     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 406     unless (@errors) {
 407         return $trial_marc8, 'MARC-8', [];
 408     }
 409     push @all_errors, @errors;
 410
 411     my $trial_8859_1 = $marc_record->clone();
 412     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 413     unless (@errors) {
 414         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 415                                                 # instead if we wanted to report details
 416                                                 # of the failed attempt at MARC-8 => UTF-8
 417     }
 418     push @all_errors, @errors;
 419
 420     my $default_converted = $marc_record->clone();
 421     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 422     return $default_converted, 'failed', \@all_errors;
 423 }
 424
 425 =head2 _default_unimarc_charconv_to_utf8
 426
 427   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 428
 429 Converts a C<MARC::Record> of unknown character set to UTF-8,
 430 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 431 to UTF-8, then a default conversion that replaces each non-ASCII
 432 character with the replacement character.
 433
 434 The C<$guessed_charset> return value contains the character set
 435 that resulted in a conversion to valid UTF-8; note that
 436 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 437 this is 'failed'.
 438
 439 =cut
 440
 441 sub _default_unimarc_charconv_to_utf8 {
 442     my $marc_record = shift;
 443     my $marc_flavour = shift;
 444
 445     my $trial_marc8 = $marc_record->clone();
 446     my @all_errors = ();
 447     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 448     unless (@errors) {
 449         return $trial_marc8, 'iso-5426';
 450     }
 451     push @all_errors, @errors;
 452
 453     my $trial_8859_1 = $marc_record->clone();
 454     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 455     unless (@errors) {
 456         return $trial_8859_1, 'iso-8859-1';
 457     }
 458     push @all_errors, @errors;
 459
 460     my $default_converted = $marc_record->clone();
 461     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 462     return $default_converted, 'failed', \@all_errors;
 463 }
 464
 465 =head2 _marc_marc8_to_utf8
 466
 467   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 468
 469 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 470 If the conversion fails for some reason, an
 471 appropriate messages will be placed in the returned
 472 C<@errors> array.
 473
 474 =cut
 475
 476 sub _marc_marc8_to_utf8 {
 477     my $marc_record = shift;
 478     my $marc_flavour = shift;
 479
 480     my $prev_ignore = MARC::Charset->ignore_errors();
 481     MARC::Charset->ignore_errors(1);
 482
 483     # trap warnings raised by MARC::Charset
 484     my @errors = ();
 485     local $SIG{__WARN__} = sub {
 486         my $msg = $_[0];
 487         if ($msg =~ /MARC.Charset/) {
 488             # FIXME - purpose of this regexp is to strip out the
 489             # line reference to MARC/Charset.pm, but as it
 490             # exists probably won't work quite on Windows --
 491             # some sort of minimal-bunch back-tracking RE
 492             # would be helpful here
 493             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 494             push @errors, $msg;
 495         } else {
 496             # if warning doesn't come from MARC::Charset, just
 497             # pass it on
 498             warn $msg;
 499         }
 500     };
 501
 502     foreach my $field ($marc_record->fields()) {
 503         if ($field->is_control_field()) {
 504             ; # do nothing -- control fields should not contain non-ASCII characters
 505         } else {
 506             my @converted_subfields;
 507             foreach my $subfield ($field->subfields()) {
 508                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 509                 unless (IsStringUTF8ish($utf8sf)) {
 510                     # Because of a bug in MARC::Charset 0.98, if the string
 511                     # has (a) one or more diacritics that (b) are only in character positions
 512                     # 128 to 255 inclusive, the resulting converted string is not in
 513                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 514                     # occurs, upgrade the string in place.  Moral of the story seems to be
 515                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 516                     # that the resulting string is UTF-8.
 517                     utf8::upgrade($utf8sf);
 518                 }
 519                 push @converted_subfields, $subfield->[0], $utf8sf;
 520             }
 521
 522             $field->replace_with(MARC::Field->new(
 523                 $field->tag(), $field->indicator(1), $field->indicator(2),
 524                 @converted_subfields)
 525             );
 526         }
 527     }
 528
 529     MARC::Charset->ignore_errors($prev_ignore);
 530
 531     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 532
 533     return @errors;
 534 }
 535
 536 =head2 _marc_iso5426_to_utf8
 537
 538   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 539
 540 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 541 If the conversion fails for some reason, an
 542 appropriate messages will be placed in the returned
 543 C<@errors> array.
 544
 545 FIXME - is ISO-5426 equivalent enough to MARC-8
 546 that C<MARC::Charset> can be used instead?
 547
 548 =cut
 549
 550 sub _marc_iso5426_to_utf8 {
 551     my $marc_record = shift;
 552     my $marc_flavour = shift;
 553
 554     my @errors = ();
 555
 556     foreach my $field ($marc_record->fields()) {
 557         if ($field->is_control_field()) {
 558             ; # do nothing -- control fields should not contain non-ASCII characters
 559         } else {
 560             my @converted_subfields;
 561             foreach my $subfield ($field->subfields()) {
 562                 my $utf8sf = char_decode5426($subfield->[1]);
 563                 push @converted_subfields, $subfield->[0], $utf8sf;
 564             }
 565
 566             $field->replace_with(MARC::Field->new(
 567                 $field->tag(), $field->indicator(1), $field->indicator(2),
 568                 @converted_subfields)
 569             );
 570         }
 571     }
 572
 573     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 574
 575     return @errors;
 576 }
 577
 578 =head2 _marc_to_utf8_via_text_iconv
 579
 580   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 581
 582 Convert a C<MARC::Record> to UTF-8 in-place using the
 583 C<Text::Iconv> CPAN module.  Any source encoding accepted
 584 by the user's iconv installation should work.  If
 585 the source encoding is not recognized on the user's
 586 server or the conversion fails for some reason,
 587 appropriate messages will be placed in the returned
 588 C<@errors> array.
 589
 590 =cut
 591
 592 sub _marc_to_utf8_via_text_iconv {
 593     my $marc_record = shift;
 594     my $marc_flavour = shift;
 595     my $source_encoding = shift;
 596
 597     my @errors = ();
 598     my $decoder;
 599     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 600     if ($@) {
 601         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 602         return @errors;
 603     }
 604
 605     my $prev_raise_error = Text::Iconv->raise_error();
 606     Text::Iconv->raise_error(1);
 607
 608     foreach my $field ($marc_record->fields()) {
 609         if ($field->is_control_field()) {
 610             ; # do nothing -- control fields should not contain non-ASCII characters
 611         } else {
 612             my @converted_subfields;
 613             foreach my $subfield ($field->subfields()) {
 614                 my $converted_value;
 615                 my $conversion_ok = 1;
 616                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 617                 if ($@) {
 618                     $conversion_ok = 0;
 619                     push @errors, $@;
 620                 } elsif (not defined $converted_value) {
 621                     $conversion_ok = 0;
 622                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 623                 }
 624
 625                 if ($conversion_ok) {
 626                     push @converted_subfields, $subfield->[0], $converted_value;
 627                 } else {
 628                     $converted_value = $subfield->[1];
 629                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 630                     push @converted_subfields, $subfield->[0], $converted_value;
 631                 }
 632             }
 633
 634             $field->replace_with(MARC::Field->new(
 635                 $field->tag(), $field->indicator(1), $field->indicator(2),
 636                 @converted_subfields)
 637             );
 638         }
 639     }
 640
 641     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 642     Text::Iconv->raise_error($prev_raise_error);
 643
 644     return @errors;
 645 }
 646
 647 =head2 _marc_to_utf8_replacement_char
 648
 649   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 650
 651 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 652 unsatisfactory method of replacing all non-ASCII (e.g.,
 653 where the eight bit is set) octet with the Unicode
 654 replacement character.  This is meant as a last-ditch
 655 method, and would be best used as part of a UI that
 656 lets a cataloguer pick various character conversions
 657 until he or she finds the right one.
 658
 659 =cut
 660
 661 sub _marc_to_utf8_replacement_char {
 662     my $marc_record = shift;
 663     my $marc_flavour = shift;
 664
 665     foreach my $field ($marc_record->fields()) {
 666         if ($field->is_control_field()) {
 667             ; # do nothing -- control fields should not contain non-ASCII characters
 668         } else {
 669             my @converted_subfields;
 670             foreach my $subfield ($field->subfields()) {
 671                 my $value = $subfield->[1];
 672                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 673                 push @converted_subfields, $subfield->[0], $value;
 674             }
 675
 676             $field->replace_with(MARC::Field->new(
 677                 $field->tag(), $field->indicator(1), $field->indicator(2),
 678                 @converted_subfields)
 679             );
 680         }
 681     }
 682
 683     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 684 }
 685
 686 =head2 char_decode5426
 687
 688   my $utf8string = char_decode5426($iso_5426_string);
 689
 690 Converts a string from ISO-5426 to UTF-8.
 691
 692 =cut
 693
 694
 695 my %chars;
 696 $chars{0xb0}=0x0101;#3/0ayn[ain]
 697 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 698 #$chars{0xb2}=0x00e0;#'à';
 699 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 700 #$chars{0xb3}=0x00e7;#'ç';
 701 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 702 # $chars{0xb4}='è';
 703 $chars{0xb4}=0x00e8;
 704 $chars{0xbd}=0x02b9;
 705 $chars{0xbe}=0x02ba;
 706 # $chars{0xb5}='é';
 707 $chars{0xb5}=0x00e9;
 708 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 709 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 710 $chars{0xfa}=0x0153; #oe
 711 $chars{0xea}=0x0152; #oe
 712 $chars{0x81d1}=0x00b0;
 713
 714 ####
 715 ## combined characters iso5426
 716
 717 $chars{0xc041}=0x1ea2; # capital a with hook above
 718 $chars{0xc045}=0x1eba; # capital e with hook above
 719 $chars{0xc049}=0x1ec8; # capital i with hook above
 720 $chars{0xc04f}=0x1ece; # capital o with hook above
 721 $chars{0xc055}=0x1ee6; # capital u with hook above
 722 $chars{0xc059}=0x1ef6; # capital y with hook above
 723 $chars{0xc061}=0x1ea3; # small a with hook above
 724 $chars{0xc065}=0x1ebb; # small e with hook above
 725 $chars{0xc069}=0x1ec9; # small i with hook above
 726 $chars{0xc06f}=0x1ecf; # small o with hook above
 727 $chars{0xc075}=0x1ee7; # small u with hook above
 728 $chars{0xc079}=0x1ef7; # small y with hook above
 729
 730         # 4/1 grave accent
 731 $chars{0xc141}=0x00c0; # capital a with grave accent
 732 $chars{0xc145}=0x00c8; # capital e with grave accent
 733 $chars{0xc149}=0x00cc; # capital i with grave accent
 734 $chars{0xc14f}=0x00d2; # capital o with grave accent
 735 $chars{0xc155}=0x00d9; # capital u with grave accent
 736 $chars{0xc157}=0x1e80; # capital w with grave
 737 $chars{0xc159}=0x1ef2; # capital y with grave
 738 $chars{0xc161}=0x00e0; # small a with grave accent
 739 $chars{0xc165}=0x00e8; # small e with grave accent
 740 $chars{0xc169}=0x00ec; # small i with grave accent
 741 $chars{0xc16f}=0x00f2; # small o with grave accent
 742 $chars{0xc175}=0x00f9; # small u with grave accent
 743 $chars{0xc177}=0x1e81; # small w with grave
 744 $chars{0xc179}=0x1ef3; # small y with grave
 745         # 4/2 acute accent
 746 $chars{0xc241}=0x00c1; # capital a with acute accent
 747 $chars{0xc243}=0x0106; # capital c with acute accent
 748 $chars{0xc245}=0x00c9; # capital e with acute accent
 749 $chars{0xc247}=0x01f4; # capital g with acute
 750 $chars{0xc249}=0x00cd; # capital i with acute accent
 751 $chars{0xc24b}=0x1e30; # capital k with acute
 752 $chars{0xc24c}=0x0139; # capital l with acute accent
 753 $chars{0xc24d}=0x1e3e; # capital m with acute
 754 $chars{0xc24e}=0x0143; # capital n with acute accent
 755 $chars{0xc24f}=0x00d3; # capital o with acute accent
 756 $chars{0xc250}=0x1e54; # capital p with acute
 757 $chars{0xc252}=0x0154; # capital r with acute accent
 758 $chars{0xc253}=0x015a; # capital s with acute accent
 759 $chars{0xc255}=0x00da; # capital u with acute accent
 760 $chars{0xc257}=0x1e82; # capital w with acute
 761 $chars{0xc259}=0x00dd; # capital y with acute accent
 762 $chars{0xc25a}=0x0179; # capital z with acute accent
 763 $chars{0xc261}=0x00e1; # small a with acute accent
 764 $chars{0xc263}=0x0107; # small c with acute accent
 765 $chars{0xc265}=0x00e9; # small e with acute accent
 766 $chars{0xc267}=0x01f5; # small g with acute
 767 $chars{0xc269}=0x00ed; # small i with acute accent
 768 $chars{0xc26b}=0x1e31; # small k with acute
 769 $chars{0xc26c}=0x013a; # small l with acute accent
 770 $chars{0xc26d}=0x1e3f; # small m with acute
 771 $chars{0xc26e}=0x0144; # small n with acute accent
 772 $chars{0xc26f}=0x00f3; # small o with acute accent
 773 $chars{0xc270}=0x1e55; # small p with acute
 774 $chars{0xc272}=0x0155; # small r with acute accent
 775 $chars{0xc273}=0x015b; # small s with acute accent
 776 $chars{0xc275}=0x00fa; # small u with acute accent
 777 $chars{0xc277}=0x1e83; # small w with acute
 778 $chars{0xc279}=0x00fd; # small y with acute accent
 779 $chars{0xc27a}=0x017a; # small z with acute accent
 780 $chars{0xc2e1}=0x01fc; # capital ae with acute
 781 $chars{0xc2f1}=0x01fd; # small ae with acute
 782        # 4/3 circumflex accent
 783 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 784 $chars{0xc343}=0x0108; # capital c with circumflex
 785 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 786 $chars{0xc347}=0x011c; # capital g with circumflex
 787 $chars{0xc348}=0x0124; # capital h with circumflex
 788 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 789 $chars{0xc34a}=0x0134; # capital j with circumflex
 790 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 791 $chars{0xc353}=0x015c; # capital s with circumflex
 792 $chars{0xc355}=0x00db; # capital u with circumflex
 793 $chars{0xc357}=0x0174; # capital w with circumflex
 794 $chars{0xc359}=0x0176; # capital y with circumflex
 795 $chars{0xc35a}=0x1e90; # capital z with circumflex
 796 $chars{0xc361}=0x00e2; # small a with circumflex accent
 797 $chars{0xc363}=0x0109; # small c with circumflex
 798 $chars{0xc365}=0x00ea; # small e with circumflex accent
 799 $chars{0xc367}=0x011d; # small g with circumflex
 800 $chars{0xc368}=0x0125; # small h with circumflex
 801 $chars{0xc369}=0x00ee; # small i with circumflex accent
 802 $chars{0xc36a}=0x0135; # small j with circumflex
 803 $chars{0xc36e}=0x00f1; # small n with tilde
 804 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 805 $chars{0xc373}=0x015d; # small s with circumflex
 806 $chars{0xc375}=0x00fb; # small u with circumflex
 807 $chars{0xc377}=0x0175; # small w with circumflex
 808 $chars{0xc379}=0x0177; # small y with circumflex
 809 $chars{0xc37a}=0x1e91; # small z with circumflex
 810         # 4/4 tilde
 811 $chars{0xc441}=0x00c3; # capital a with tilde
 812 $chars{0xc445}=0x1ebc; # capital e with tilde
 813 $chars{0xc449}=0x0128; # capital i with tilde
 814 $chars{0xc44e}=0x00d1; # capital n with tilde
 815 $chars{0xc44f}=0x00d5; # capital o with tilde
 816 $chars{0xc455}=0x0168; # capital u with tilde
 817 $chars{0xc456}=0x1e7c; # capital v with tilde
 818 $chars{0xc459}=0x1ef8; # capital y with tilde
 819 $chars{0xc461}=0x00e3; # small a with tilde
 820 $chars{0xc465}=0x1ebd; # small e with tilde
 821 $chars{0xc469}=0x0129; # small i with tilde
 822 $chars{0xc46e}=0x00f1; # small n with tilde
 823 $chars{0xc46f}=0x00f5; # small o with tilde
 824 $chars{0xc475}=0x0169; # small u with tilde
 825 $chars{0xc476}=0x1e7d; # small v with tilde
 826 $chars{0xc479}=0x1ef9; # small y with tilde
 827     # 4/5 macron
 828 $chars{0xc541}=0x0100; # capital a with macron
 829 $chars{0xc545}=0x0112; # capital e with macron
 830 $chars{0xc547}=0x1e20; # capital g with macron
 831 $chars{0xc549}=0x012a; # capital i with macron
 832 $chars{0xc54f}=0x014c; # capital o with macron
 833 $chars{0xc555}=0x016a; # capital u with macron
 834 $chars{0xc561}=0x0101; # small a with macron
 835 $chars{0xc565}=0x0113; # small e with macron
 836 $chars{0xc567}=0x1e21; # small g with macron
 837 $chars{0xc569}=0x012b; # small i with macron
 838 $chars{0xc56f}=0x014d; # small o with macron
 839 $chars{0xc575}=0x016b; # small u with macron
 840 $chars{0xc572}=0x0159; # small r with macron
 841 $chars{0xc5e1}=0x01e2; # capital ae with macron
 842 $chars{0xc5f1}=0x01e3; # small ae with macron
 843         # 4/6 breve
 844 $chars{0xc641}=0x0102; # capital a with breve
 845 $chars{0xc645}=0x0114; # capital e with breve
 846 $chars{0xc647}=0x011e; # capital g with breve
 847 $chars{0xc649}=0x012c; # capital i with breve
 848 $chars{0xc64f}=0x014e; # capital o with breve
 849 $chars{0xc655}=0x016c; # capital u with breve
 850 $chars{0xc661}=0x0103; # small a with breve
 851 $chars{0xc665}=0x0115; # small e with breve
 852 $chars{0xc667}=0x011f; # small g with breve
 853 $chars{0xc669}=0x012d; # small i with breve
 854 $chars{0xc66f}=0x014f; # small o with breve
 855 $chars{0xc675}=0x016d; # small u with breve
 856         # 4/7 dot above
 857 $chars{0xc7b0}=0x01e1; # Ain with dot above
 858 $chars{0xc742}=0x1e02; # capital b with dot above
 859 $chars{0xc743}=0x010a; # capital c with dot above
 860 $chars{0xc744}=0x1e0a; # capital d with dot above
 861 $chars{0xc745}=0x0116; # capital e with dot above
 862 $chars{0xc746}=0x1e1e; # capital f with dot above
 863 $chars{0xc747}=0x0120; # capital g with dot above
 864 $chars{0xc748}=0x1e22; # capital h with dot above
 865 $chars{0xc749}=0x0130; # capital i with dot above
 866 $chars{0xc74d}=0x1e40; # capital m with dot above
 867 $chars{0xc74e}=0x1e44; # capital n with dot above
 868 $chars{0xc750}=0x1e56; # capital p with dot above
 869 $chars{0xc752}=0x1e58; # capital r with dot above
 870 $chars{0xc753}=0x1e60; # capital s with dot above
 871 $chars{0xc754}=0x1e6a; # capital t with dot above
 872 $chars{0xc757}=0x1e86; # capital w with dot above
 873 $chars{0xc758}=0x1e8a; # capital x with dot above
 874 $chars{0xc759}=0x1e8e; # capital y with dot above
 875 $chars{0xc75a}=0x017b; # capital z with dot above
 876 $chars{0xc761}=0x0227; # small b with dot above
 877 $chars{0xc762}=0x1e03; # small b with dot above
 878 $chars{0xc763}=0x010b; # small c with dot above
 879 $chars{0xc764}=0x1e0b; # small d with dot above
 880 $chars{0xc765}=0x0117; # small e with dot above
 881 $chars{0xc766}=0x1e1f; # small f with dot above
 882 $chars{0xc767}=0x0121; # small g with dot above
 883 $chars{0xc768}=0x1e23; # small h with dot above
 884 $chars{0xc76d}=0x1e41; # small m with dot above
 885 $chars{0xc76e}=0x1e45; # small n with dot above
 886 $chars{0xc770}=0x1e57; # small p with dot above
 887 $chars{0xc772}=0x1e59; # small r with dot above
 888 $chars{0xc773}=0x1e61; # small s with dot above
 889 $chars{0xc774}=0x1e6b; # small t with dot above
 890 $chars{0xc777}=0x1e87; # small w with dot above
 891 $chars{0xc778}=0x1e8b; # small x with dot above
 892 $chars{0xc779}=0x1e8f; # small y with dot above
 893 $chars{0xc77a}=0x017c; # small z with dot above
 894         # 4/8 trema, diaresis
 895 $chars{0xc820}=0x00a8; # diaeresis
 896 $chars{0xc841}=0x00c4; # capital a with diaeresis
 897 $chars{0xc845}=0x00cb; # capital e with diaeresis
 898 $chars{0xc848}=0x1e26; # capital h with diaeresis
 899 $chars{0xc849}=0x00cf; # capital i with diaeresis
 900 $chars{0xc84f}=0x00d6; # capital o with diaeresis
 901 $chars{0xc855}=0x00dc; # capital u with diaeresis
 902 $chars{0xc857}=0x1e84; # capital w with diaeresis
 903 $chars{0xc858}=0x1e8c; # capital x with diaeresis
 904 $chars{0xc859}=0x0178; # capital y with diaeresis
 905 $chars{0xc861}=0x00e4; # small a with diaeresis
 906 $chars{0xc865}=0x00eb; # small e with diaeresis
 907 $chars{0xc868}=0x1e27; # small h with diaeresis
 908 $chars{0xc869}=0x00ef; # small i with diaeresis
 909 $chars{0xc86f}=0x00f6; # small o with diaeresis
 910 $chars{0xc874}=0x1e97; # small t with diaeresis
 911 $chars{0xc875}=0x00fc; # small u with diaeresis
 912 $chars{0xc877}=0x1e85; # small w with diaeresis
 913 $chars{0xc878}=0x1e8d; # small x with diaeresis
 914 $chars{0xc879}=0x00ff; # small y with diaeresis
 915         # 4/9 umlaut
 916 $chars{0xc920}=0x00a8; # [diaeresis]
 917 $chars{0xc961}=0x00e4; # a with umlaut
 918 $chars{0xc965}=0x00eb; # e with umlaut
 919 $chars{0xc969}=0x00ef; # i with umlaut
 920 $chars{0xc96f}=0x00f6; # o with umlaut
 921 $chars{0xc975}=0x00fc; # u with umlaut
 922         # 4/10 circle above
 923 $chars{0xca41}=0x00c5; # capital a with ring above
 924 $chars{0xcaad}=0x016e; # capital u with ring above
 925 $chars{0xca61}=0x00e5; # small a with ring above
 926 $chars{0xca75}=0x016f; # small u with ring above
 927 $chars{0xca77}=0x1e98; # small w with ring above
 928 $chars{0xca79}=0x1e99; # small y with ring above
 929         # 4/11 high comma off centre
 930         # 4/12 inverted high comma centred
 931         # 4/13 double acute accent
 932 $chars{0xcd4f}=0x0150; # capital o with double acute
 933 $chars{0xcd55}=0x0170; # capital u with double acute
 934 $chars{0xcd6f}=0x0151; # small o with double acute
 935 $chars{0xcd75}=0x0171; # small u with double acute
 936         # 4/14 horn
 937 $chars{0xce54}=0x01a0; # latin capital letter o with horn
 938 $chars{0xce55}=0x01af; # latin capital letter u with horn
 939 $chars{0xce74}=0x01a1; # latin small letter o with horn
 940 $chars{0xce75}=0x01b0; # latin small letter u with horn
 941         # 4/15 caron (hacek
 942 $chars{0xcf41}=0x01cd; # capital a with caron
 943 $chars{0xcf43}=0x010c; # capital c with caron
 944 $chars{0xcf44}=0x010e; # capital d with caron
 945 $chars{0xcf45}=0x011a; # capital e with caron
 946 $chars{0xcf47}=0x01e6; # capital g with caron
 947 $chars{0xcf49}=0x01cf; # capital i with caron
 948 $chars{0xcf4b}=0x01e8; # capital k with caron
 949 $chars{0xcf4c}=0x013d; # capital l with caron
 950 $chars{0xcf4e}=0x0147; # capital n with caron
 951 $chars{0xcf4f}=0x01d1; # capital o with caron
 952 $chars{0xcf52}=0x0158; # capital r with caron
 953 $chars{0xcf53}=0x0160; # capital s with caron
 954 $chars{0xcf54}=0x0164; # capital t with caron
 955 $chars{0xcf55}=0x01d3; # capital u with caron
 956 $chars{0xcf5a}=0x017d; # capital z with caron
 957 $chars{0xcf61}=0x01ce; # small a with caron
 958 $chars{0xcf63}=0x010d; # small c with caron
 959 $chars{0xcf64}=0x010f; # small d with caron
 960 $chars{0xcf65}=0x011b; # small e with caron
 961 $chars{0xcf67}=0x01e7; # small g with caron
 962 $chars{0xcf69}=0x01d0; # small i with caron
 963 $chars{0xcf6a}=0x01f0; # small j with caron
 964 $chars{0xcf6b}=0x01e9; # small k with caron
 965 $chars{0xcf6c}=0x013e; # small l with caron
 966 $chars{0xcf6e}=0x0148; # small n with caron
 967 $chars{0xcf6f}=0x01d2; # small o with caron
 968 $chars{0xcf72}=0x0159; # small r with caron
 969 $chars{0xcf73}=0x0161; # small s with caron
 970 $chars{0xcf74}=0x0165; # small t with caron
 971 $chars{0xcf75}=0x01d4; # small u with caron
 972 $chars{0xcf7a}=0x017e; # small z with caron
 973         # 5/0 cedilla
 974 $chars{0xd020}=0x00b8; # cedilla
 975 $chars{0xd043}=0x00c7; # capital c with cedilla
 976 $chars{0xd044}=0x1e10; # capital d with cedilla
 977 $chars{0xd047}=0x0122; # capital g with cedilla
 978 $chars{0xd048}=0x1e28; # capital h with cedilla
 979 $chars{0xd04b}=0x0136; # capital k with cedilla
 980 $chars{0xd04c}=0x013b; # capital l with cedilla
 981 $chars{0xd04e}=0x0145; # capital n with cedilla
 982 $chars{0xd052}=0x0156; # capital r with cedilla
 983 $chars{0xd053}=0x015e; # capital s with cedilla
 984 $chars{0xd054}=0x0162; # capital t with cedilla
 985 $chars{0xd063}=0x00e7; # small c with cedilla
 986 $chars{0xd064}=0x1e11; # small d with cedilla
 987 $chars{0xd065}=0x0119; # small e with cedilla
 988 $chars{0xd067}=0x0123; # small g with cedilla
 989 $chars{0xd068}=0x1e29; # small h with cedilla
 990 $chars{0xd06b}=0x0137; # small k with cedilla
 991 $chars{0xd06c}=0x013c; # small l with cedilla
 992 $chars{0xd06e}=0x0146; # small n with cedilla
 993 $chars{0xd072}=0x0157; # small r with cedilla
 994 $chars{0xd073}=0x015f; # small s with cedilla
 995 $chars{0xd074}=0x0163; # small t with cedilla
 996         # 5/1 rude
 997         # 5/2 hook to left
 998         # 5/3 ogonek (hook to right
 999 $chars{0xd320}=0x02db; # ogonek
1000 $chars{0xd341}=0x0104; # capital a with ogonek
1001 $chars{0xd345}=0x0118; # capital e with ogonek
1002 $chars{0xd349}=0x012e; # capital i with ogonek
1003 $chars{0xd34f}=0x01ea; # capital o with ogonek
1004 $chars{0xd355}=0x0172; # capital u with ogonek
1005 $chars{0xd361}=0x0105; # small a with ogonek
1006 $chars{0xd365}=0x0119; # small e with ogonek
1007 $chars{0xd369}=0x012f; # small i with ogonek
1008 $chars{0xd36f}=0x01eb; # small o with ogonek
1009 $chars{0xd375}=0x0173; # small u with ogonek
1010         # 5/4 circle below
1011 $chars{0xd441}=0x1e00; # capital a with ring below
1012 $chars{0xd461}=0x1e01; # small a with ring below
1013         # 5/5 half circle below
1014 $chars{0xf948}=0x1e2a; # capital h with breve below
1015 $chars{0xf968}=0x1e2b; # small h with breve below
1016         # 5/6 dot below
1017 $chars{0xd641}=0x1ea0; # capital a with dot below
1018 $chars{0xd642}=0x1e04; # capital b with dot below
1019 $chars{0xd644}=0x1e0c; # capital d with dot below
1020 $chars{0xd645}=0x1eb8; # capital e with dot below
1021 $chars{0xd648}=0x1e24; # capital h with dot below
1022 $chars{0xd649}=0x1eca; # capital i with dot below
1023 $chars{0xd64b}=0x1e32; # capital k with dot below
1024 $chars{0xd64c}=0x1e36; # capital l with dot below
1025 $chars{0xd64d}=0x1e42; # capital m with dot below
1026 $chars{0xd64e}=0x1e46; # capital n with dot below
1027 $chars{0xd64f}=0x1ecc; # capital o with dot below
1028 $chars{0xd652}=0x1e5a; # capital r with dot below
1029 $chars{0xd653}=0x1e62; # capital s with dot below
1030 $chars{0xd654}=0x1e6c; # capital t with dot below
1031 $chars{0xd655}=0x1ee4; # capital u with dot below
1032 $chars{0xd656}=0x1e7e; # capital v with dot below
1033 $chars{0xd657}=0x1e88; # capital w with dot below
1034 $chars{0xd659}=0x1ef4; # capital y with dot below
1035 $chars{0xd65a}=0x1e92; # capital z with dot below
1036 $chars{0xd661}=0x1ea1; # small a with dot below
1037 $chars{0xd662}=0x1e05; # small b with dot below
1038 $chars{0xd664}=0x1e0d; # small d with dot below
1039 $chars{0xd665}=0x1eb9; # small e with dot below
1040 $chars{0xd668}=0x1e25; # small h with dot below
1041 $chars{0xd669}=0x1ecb; # small i with dot below
1042 $chars{0xd66b}=0x1e33; # small k with dot below
1043 $chars{0xd66c}=0x1e37; # small l with dot below
1044 $chars{0xd66d}=0x1e43; # small m with dot below
1045 $chars{0xd66e}=0x1e47; # small n with dot below
1046 $chars{0xd66f}=0x1ecd; # small o with dot below
1047 $chars{0xd672}=0x1e5b; # small r with dot below
1048 $chars{0xd673}=0x1e63; # small s with dot below
1049 $chars{0xd674}=0x1e6d; # small t with dot below
1050 $chars{0xd675}=0x1ee5; # small u with dot below
1051 $chars{0xd676}=0x1e7f; # small v with dot below
1052 $chars{0xd677}=0x1e89; # small w with dot below
1053 $chars{0xd679}=0x1ef5; # small y with dot below
1054 $chars{0xd67a}=0x1e93; # small z with dot below
1055         # 5/7 double dot below
1056 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1057 $chars{0xd775}=0x1e73; # small u with diaeresis below
1058         # 5/8 underline
1059 $chars{0xd820}=0x005f; # underline
1060         # 5/9 double underline
1061 $chars{0xd920}=0x2017; # double underline
1062         # 5/10 small low vertical bar
1063 $chars{0xda20}=0x02cc; #
1064         # 5/11 circumflex below
1065         # 5/12 (this position shall not be used)
1066         # 5/13 left half of ligature sign and of double tilde
1067         # 5/14 right half of ligature sign
1068         # 5/15 right half of double tilde
1069 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1070
1071 sub char_decode5426 {
1072     my ( $string) = @_;
1073     my $result;
1074
1075     my @data = unpack("C*", $string);
1076     my @characters;
1077     my $length=scalar(@data);
1078     for (my $i = 0; $i < scalar(@data); $i++) {
1079       my $char= $data[$i];
1080       if ($char >= 0x00 && $char <= 0x7F){
1081         #IsAscii
1082
1083           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1084       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1085         #Combined Char
1086         my $convchar ;
1087         if ($chars{$char*256+$data[$i+1]}) {
1088           $convchar= $chars{$char * 256 + $data[$i+1]};
1089           $i++;
1090 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1091         } elsif ($chars{$char})  {
1092           $convchar= $chars{$char};
1093 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1094         }else {
1095           $convchar=$char;
1096         }
1097         push @characters,$convchar;
1098       } else {
1099         my $convchar;
1100         if ($chars{$char})  {
1101           $convchar= $chars{$char};
1102 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1103         }else {
1104 #            printf "char %x $char\n",$char;
1105           $convchar=$char;
1106         }
1107         push @characters,$convchar;
1108       }
1109     }
1110     $result=pack "U*",@characters;
1111 #     $result=~s/\x01//;
1112 #     $result=~s/\x00//;
1113      $result=~s/\x0f//;
1114      $result=~s/\x1b.//;
1115      $result=~s/\x0e//;
1116      $result=~s/\x1b\x5b//;
1117 #   map{printf "%x",$_} @characters;
1118 #   printf "\n";
1119   return $result;
1120 }
1121
1122 1;
1123
1124
1125 =head1 AUTHOR
1126
1127 Koha Development Team <http://koha-community.org/>
1128
1129 Galen Charlton <galen.charlton@liblime.com>
1130
1131 =cut