C4/Charset.pm

   1 package C4::Charset;
   2
   3 # Copyright (C) 2008 LibLime
   4 #
   5 # This file is part of Koha.
   6 #
   7 # Koha is free software; you can redistribute it and/or modify it
   8 # under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # Koha is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
  19
  20 use strict;
  21 use warnings;
  22
  23 use MARC::Charset qw/marc8_to_utf8/;
  24 use Text::Iconv;
  25 use C4::Debug;
  26 use Unicode::Normalize;
  27 use Encode qw( decode encode is_utf8 );
  28
  29 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  30
  31 BEGIN {
  32     require Exporter;
  33     @ISA    = qw(Exporter);
  34     @EXPORT = qw(
  35         NormalizeString
  36         IsStringUTF8ish
  37         MarcToUTF8Record
  38         SetUTF8Flag
  39         SetMarcUnicodeFlag
  40         StripNonXmlChars
  41         nsb_clean
  42         SanitizeRecord
  43     );
  44 }
  45
  46 =encoding UTF-8
  47
  48 =head1 NAME
  49
  50 C4::Charset - utilities for handling character set conversions.
  51
  52 =head1 SYNOPSIS
  53
  54   use C4::Charset;
  55
  56 =head1 DESCRIPTION
  57
  58 This module contains routines for dealing with character set
  59 conversions, particularly for MARC records.
  60
  61 A variety of character encodings are in use by various MARC
  62 standards, and even more character encodings are used by
  63 non-standard MARC records.  The various MARC formats generally
  64 do not do a good job of advertising a given record's character
  65 encoding, and even when a record does advertise its encoding,
  66 e.g., via the Leader/09, experience has shown that one cannot
  67 trust it.
  68
  69 Ultimately, all MARC records are stored in Koha in UTF-8 and
  70 must be converted from whatever the source character encoding is.
  71 The goal of this module is to ensure that these conversions
  72 take place accurately.  When a character conversion cannot take
  73 place, or at least not accurately, the module was provide
  74 enough information to allow user-facing code to inform the user
  75 on how to deal with the situation.
  76
  77 =cut
  78
  79 =head1 FUNCTIONS
  80
  81 =head2 IsStringUTF8ish
  82
  83   my $is_utf8 = IsStringUTF8ish($str);
  84
  85 Determines if C<$str> is valid UTF-8.  This can mean
  86 one of two things:
  87
  88 =over
  89
  90 =item *
  91
  92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
  93
  94 =item *
  95
  96 The Perl UTF-8 flag is B<not> set, but the octets contain
  97 valid UTF-8.
  98
  99 =back
 100
 101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
 102 because in one could be presented with a MARC blob that is
 103 not actually in UTF-8 but whose sequence of octets appears to be
 104 valid UTF-8.  The rest of the MARC character conversion functions
 105 will assume that this situation occur does not very often.
 106
 107 =cut
 108
 109 sub IsStringUTF8ish {
 110     my $str = shift;
 111
 112     return 1 if Encode::is_utf8($str);
 113     return utf8::decode( $str );
 114 }
 115
 116 =head2 SetUTF8Flag
 117
 118   my $marc_record = SetUTF8Flag($marc_record, $nfd);
 119
 120 This function sets the PERL UTF8 flag for data.
 121 It is required when using new_from_usmarc
 122 since MARC::File::USMARC does not handle PERL UTF8 setting.
 123 When editing unicode marc records fields and subfields, you
 124 would end up in double encoding without using this function.
 125
 126 If $nfd is set, string normalization will use NFD instead of NFC
 127
 128 FIXME
 129 In my opinion, this function belongs to MARC::Record and not
 130 to this package.
 131 But since it handles charset, and MARC::Record, it finds its way in that package
 132
 133 =cut
 134
 135 sub SetUTF8Flag{
 136     my ($record, $nfd)=@_;
 137     return unless ($record && $record->fields());
 138     foreach my $field ($record->fields()){
 139         if ($field->tag()>=10){
 140             my @subfields;
 141             foreach my $subfield ($field->subfields()){
 142                 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 143             }
 144             eval {
 145                 my $newfield=MARC::Field->new(
 146                             $field->tag(),
 147                             $field->indicator(1),
 148                             $field->indicator(2),
 149                             @subfields
 150                         );
 151                 $field->replace_with($newfield);
 152             };
 153             warn "ERROR occurred in SetUTF8Flag $@" if $@;
 154         }
 155     }
 156 }
 157
 158 =head2 NormalizeString
 159
 160     my $normalized_string=NormalizeString($string,$nfd,$transform);
 161
 162 Given a string
 163 nfd : If you want to set NFD and not NFC
 164 transform : If you expect all the signs to be removed
 165
 166 Sets the PERL UTF8 Flag on your initial data if need be
 167 and applies cleaning if required
 168
 169 Returns a utf8 NFC normalized string
 170
 171 Sample code :
 172    my $string=NormalizeString ("l'ornithoptère");
 173    #results into ornithoptère in NFC form and sets UTF8 Flag
 174
 175 =cut
 176
 177
 178 sub NormalizeString{
 179         my ($string,$nfd,$transform)=@_;
 180     return $string unless defined($string); # force scalar context return.
 181     $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
 182         if ($nfd){
 183                 $string= NFD($string);
 184         }
 185         else {
 186                 $string=NFC($string);
 187         }
 188         if ($transform){
 189     $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
 190         #removing one letter words "d'" "l'"  was changed into "d " "l "
 191     $string=~s/\b\S\b//g;
 192     $string=~s/\s+$//g;
 193         }
 194     return $string;
 195 }
 196
 197 =head2 MarcToUTF8Record
 198
 199   ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
 200                                         $marc_flavour, [, $source_encoding]);
 201
 202 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
 203 optional source encoding, return a C<MARC::Record> that is
 204 converted to UTF-8.
 205
 206 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
 207 is not guaranteed to have been converted correctly.  Specifically,
 208 if C<$converted_from> is 'failed', the MARC record returned failed
 209 character conversion and had each of its non-ASCII octets changed
 210 to the Unicode replacement character.
 211
 212 If the source encoding was not specified, this routine will
 213 try to guess it; the character encoding used for a successful
 214 conversion is returned in C<$converted_from>.
 215
 216 =cut
 217
 218 sub MarcToUTF8Record {
 219     my $marc = shift;
 220     my $marc_flavour = shift;
 221     my $source_encoding = shift;
 222     my $marc_record;
 223     my $marc_blob_is_utf8 = 0;
 224     if (ref($marc) eq 'MARC::Record') {
 225         my $marc_blob = $marc->as_usmarc();
 226         $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
 227         $marc_record = $marc;
 228     } else {
 229         # dealing with a MARC blob
 230
 231         # remove any ersatz whitespace from the beginning and
 232         # end of the MARC blob -- these can creep into MARC
 233         # files produced by several sources -- caller really
 234         # should be doing this, however
 235         $marc =~ s/^\s+//;
 236         $marc =~ s/\s+$//;
 237         $marc_blob_is_utf8 = IsStringUTF8ish($marc);
 238         eval {
 239             $marc_record = MARC::Record->new_from_usmarc($marc);
 240         };
 241         if ($@) {
 242             # if we fail the first time, one likely problem
 243             # is that we have a MARC21 record that says that it's
 244             # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
 245             # We'll try parsing it again.
 246             substr($marc, 9, 1) = ' ';
 247             eval {
 248                 $marc_record = MARC::Record->new_from_usmarc($marc);
 249             };
 250             if ($@) {
 251                 # it's hopeless; return an empty MARC::Record
 252                 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
 253             }
 254         }
 255     }
 256
 257     # If we do not know the source encoding, try some guesses
 258     # as follows:
 259     #   1. Record is UTF-8 already.
 260     #   2. If MARC flavor is MARC21 or NORMARC, then
 261     #      a. record is MARC-8
 262     #      b. record is ISO-8859-1
 263     #   3. If MARC flavor is UNIMARC, then
 264     if (not defined $source_encoding) {
 265         if ($marc_blob_is_utf8) {
 266             # note that for MARC21/NORMARC we are not bothering to check
 267             # if the Leader/09 is set to 'a' or not -- because
 268             # of problems with various ILSs (including Koha in the
 269             # past, alas), this just is not trustworthy.
 270             SetMarcUnicodeFlag($marc_record, $marc_flavour);
 271             return $marc_record, 'UTF-8', [];
 272         } else {
 273             if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 274                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 275             } elsif ($marc_flavour =~/UNIMARC/) {
 276                 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
 277             } else {
 278                 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
 279             }
 280         }
 281     } else {
 282         # caller knows the character encoding
 283         my $original_marc_record = $marc_record->clone();
 284         my @errors;
 285         if ($source_encoding =~ /utf-?8/i) {
 286             if ($marc_blob_is_utf8) {
 287                 SetMarcUnicodeFlag($marc_record, $marc_flavour);
 288                 return $marc_record, 'UTF-8', [];
 289             } else {
 290                 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
 291             }
 292         } elsif ($source_encoding =~ /marc-?8/i) {
 293             @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
 294         } elsif ($source_encoding =~ /5426/) {
 295             @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
 296         } else {
 297             # assume any other character encoding is for Text::Iconv
 298             @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 299         }
 300
 301         if (@errors) {
 302             _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
 303             return $original_marc_record, 'failed', \@errors;
 304         } else {
 305             return $marc_record, $source_encoding, [];
 306         }
 307     }
 308
 309 }
 310
 311 =head2 SetMarcUnicodeFlag
 312
 313   SetMarcUnicodeFlag($marc_record, $marc_flavour);
 314
 315 Set both the internal MARC::Record encoding flag
 316 and the appropriate Leader/09 (MARC21) or
 317 100/26-29 (UNIMARC) to indicate that the record
 318 is in UTF-8.  Note that this does B<not> do
 319 any actual character conversion.
 320
 321 =cut
 322
 323 sub SetMarcUnicodeFlag {
 324     my $marc_record = shift;
 325     my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
 326
 327     $marc_record->encoding('UTF-8');
 328     if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
 329         my $leader = $marc_record->leader();
 330         substr($leader, 9, 1) = 'a';
 331         $marc_record->leader($leader);
 332     } elsif ($marc_flavour =~/UNIMARC/) {
 333         require C4::Context;
 334         my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
 335         $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
 336         my $string;
 337                 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
 338                 $string=$marc_record->subfield( 100, "a" );
 339         if (defined $string && length($string)==$subflength) {
 340                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
 341         }
 342         else {
 343             $string = POSIX::strftime( "%Y%m%d", localtime );
 344             $string =~ s/\-//g;
 345             $string = sprintf( "%-*s", $subflength, $string );
 346             substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
 347         }
 348         substr( $string, $encodingposition, 3, "y50" );
 349         if ( $marc_record->subfield( 100, "a" ) ) {
 350                         $marc_record->field('100')->update(a=>$string);
 351                 }
 352                 else {
 353             $marc_record->insert_grouped_field(
 354                 MARC::Field->new( 100, '', '', "a" => $string ) );
 355         }
 356                 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
 357     } else {
 358         warn "Unrecognized marcflavour: $marc_flavour";
 359     }
 360 }
 361
 362 =head2 StripNonXmlChars
 363
 364   my $new_str = StripNonXmlChars($old_str);
 365
 366 Given a string, return a copy with the
 367 characters that are illegal in XML
 368 removed.
 369
 370 This function exists to work around a problem
 371 that can occur with badly-encoded MARC records.
 372 Specifically, if a UTF-8 MARC record also
 373 has excape (\x1b) characters, MARC::File::XML
 374 will let the escape characters pass through
 375 when as_xml() or as_xml_record() is called.  The
 376 problem is that the escape character is not
 377 legal in well-formed XML documents, so when
 378 MARC::File::XML attempts to parse such a record,
 379 the XML parser will fail.
 380
 381 Stripping such characters will allow a
 382 MARC::Record->new_from_xml()
 383 to work, at the possible risk of some data loss.
 384
 385 =cut
 386
 387 sub StripNonXmlChars {
 388     my $str = shift;
 389     if (!defined($str) || $str eq ""){
 390         return "";
 391     }
 392     $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
 393     return $str;
 394 }
 395
 396
 397
 398 =head2 nsb_clean
 399
 400 =over 4
 401
 402 nsb_clean($string);
 403
 404 =back
 405
 406 Removes Non Sorting Block characters
 407
 408 =cut
 409 sub nsb_clean {
 410     my $NSB  = '\x88' ;        # NSB : begin Non Sorting Block
 411     my $NSE  = '\x89' ;        # NSE : Non Sorting Block end
 412     my $NSB2 = '\x98' ;        # NSB : begin Non Sorting Block
 413     my $NSE2 = '\x9C' ;        # NSE : Non Sorting Block end
 414     my $C2   = '\xC2' ;        # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
 415
 416     # handles non sorting blocks
 417     my ($string) = @_ ;
 418     $_ = $string ;
 419     s/($C2){0,1}($NSB|$NSB2)//g ;
 420     s/($C2){0,1}($NSE|$NSE2)//g ;
 421     $string = $_ ;
 422
 423     return($string) ;
 424 }
 425
 426
 427 =head2 SanitizeRecord
 428
 429 SanitizeRecord($marcrecord);
 430
 431 Sanitize a record
 432 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
 433 It cleans any string with '&amp;amp;...', replacing it by '&'
 434
 435 =cut
 436
 437 sub SanitizeRecord {
 438     my ( $record, $biblionumber ) = @_;
 439     my $string;
 440     my $record_modified = 0;
 441     my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
 442     my ( $url_field, $url_subfield ) =
 443       C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
 444     foreach my $field ( $record->fields() ) {
 445         if ( $field->is_control_field() ) {
 446             my $value           = $field->data();
 447             my $sanitized_value = _clean_ampersand($value);
 448             $record_modified = 1 if $sanitized_value ne $value;
 449             $field->update($sanitized_value);
 450         }
 451         else {
 452             my @subfields = $field->subfields();
 453             my @new_subfields;
 454             foreach my $subfield (@subfields) {
 455                 next
 456                   if $url_field eq $field->tag()
 457                       and $url_subfield eq $subfield->[0];
 458                 my $value           = $subfield->[1];
 459                 my $sanitized_value = _clean_ampersand($value);
 460                 push @new_subfields, $subfield->[0] => $sanitized_value;
 461                 $record_modified = 1 if $sanitized_value ne $value;
 462             }
 463             if ( scalar(@new_subfields) > 0 ) {
 464                 my $new_field = eval {
 465                     MARC::Field->new(
 466                         $field->tag(),        $field->indicator(1),
 467                         $field->indicator(2), @new_subfields
 468                     );
 469                 };
 470                 if ($@) {
 471                     warn "error : $@";
 472                 }
 473                 else {
 474                     $field->replace_with($new_field);
 475                 }
 476
 477             }
 478         }
 479     }
 480
 481     return $record, $record_modified;
 482 }
 483
 484 sub _clean_ampersand {
 485     my ($string) = @_;
 486     $string =~ s/(&)(amp;)+/$1/g;
 487     return $string;
 488 }
 489
 490 =head1 INTERNAL FUNCTIONS
 491
 492 =head2 _default_marc21_charconv_to_utf8
 493
 494   my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
 495
 496 Converts a C<MARC::Record> of unknown character set to UTF-8,
 497 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
 498 to UTF-8, then a default conversion that replaces each non-ASCII
 499 character with the replacement character.
 500
 501 The C<$guessed_charset> return value contains the character set
 502 that resulted in a conversion to valid UTF-8; note that
 503 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 504 this is 'failed'.
 505
 506 =cut
 507
 508 sub _default_marc21_charconv_to_utf8 {
 509     my $marc_record = shift;
 510     my $marc_flavour = shift;
 511
 512     my $trial_marc8 = $marc_record->clone();
 513     my @all_errors = ();
 514     my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
 515     unless (@errors) {
 516         return $trial_marc8, 'MARC-8', [];
 517     }
 518     push @all_errors, @errors;
 519
 520     my $trial_8859_1 = $marc_record->clone();
 521     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 522     unless (@errors) {
 523         return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
 524                                                 # instead if we wanted to report details
 525                                                 # of the failed attempt at MARC-8 => UTF-8
 526     }
 527     push @all_errors, @errors;
 528
 529     my $default_converted = $marc_record->clone();
 530     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 531     return $default_converted, 'failed', \@all_errors;
 532 }
 533
 534 =head2 _default_unimarc_charconv_to_utf8
 535
 536   my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
 537
 538 Converts a C<MARC::Record> of unknown character set to UTF-8,
 539 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
 540 to UTF-8, then a default conversion that replaces each non-ASCII
 541 character with the replacement character.
 542
 543 The C<$guessed_charset> return value contains the character set
 544 that resulted in a conversion to valid UTF-8; note that
 545 if the MARC-8 and ISO-8859-1 conversions failed, the value of
 546 this is 'failed'.
 547
 548 =cut
 549
 550 sub _default_unimarc_charconv_to_utf8 {
 551     my $marc_record = shift;
 552     my $marc_flavour = shift;
 553
 554     my $trial_marc8 = $marc_record->clone();
 555     my @all_errors = ();
 556     my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
 557     unless (@errors) {
 558         return $trial_marc8, 'iso-5426';
 559     }
 560     push @all_errors, @errors;
 561
 562     my $trial_8859_1 = $marc_record->clone();
 563     @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
 564     unless (@errors) {
 565         return $trial_8859_1, 'iso-8859-1';
 566     }
 567     push @all_errors, @errors;
 568
 569     my $default_converted = $marc_record->clone();
 570     _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
 571     return $default_converted, 'failed', \@all_errors;
 572 }
 573
 574 =head2 _marc_marc8_to_utf8
 575
 576   my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
 577
 578 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
 579 If the conversion fails for some reason, an
 580 appropriate messages will be placed in the returned
 581 C<@errors> array.
 582
 583 =cut
 584
 585 sub _marc_marc8_to_utf8 {
 586     my $marc_record = shift;
 587     my $marc_flavour = shift;
 588
 589     my $prev_ignore = MARC::Charset->ignore_errors();
 590     MARC::Charset->ignore_errors(1);
 591
 592     # trap warnings raised by MARC::Charset
 593     my @errors = ();
 594     local $SIG{__WARN__} = sub {
 595         my $msg = $_[0];
 596         if ($msg =~ /MARC.Charset/) {
 597             # FIXME - purpose of this regexp is to strip out the
 598             # line reference to MARC/Charset.pm, but as it
 599             # exists probably won't work quite on Windows --
 600             # some sort of minimal-bunch back-tracking RE
 601             # would be helpful here
 602             $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
 603             push @errors, $msg;
 604         } else {
 605             # if warning doesn't come from MARC::Charset, just
 606             # pass it on
 607             warn $msg;
 608         }
 609     };
 610
 611     foreach my $field ($marc_record->fields()) {
 612         if ($field->is_control_field()) {
 613             ; # do nothing -- control fields should not contain non-ASCII characters
 614         } else {
 615             my @converted_subfields;
 616             foreach my $subfield ($field->subfields()) {
 617                 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
 618                 unless (IsStringUTF8ish($utf8sf)) {
 619                     # Because of a bug in MARC::Charset 0.98, if the string
 620                     # has (a) one or more diacritics that (b) are only in character positions
 621                     # 128 to 255 inclusive, the resulting converted string is not in
 622                     # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
 623                     # occurs, upgrade the string in place.  Moral of the story seems to be
 624                     # that pack("U", ...) is better than chr(...) if you need to guarantee
 625                     # that the resulting string is UTF-8.
 626                     $utf8sf = Encode::encode('UTF-8', $utf8sf);
 627                 }
 628                 push @converted_subfields, $subfield->[0], $utf8sf;
 629             }
 630
 631             $field->replace_with(MARC::Field->new(
 632                 $field->tag(), $field->indicator(1), $field->indicator(2),
 633                 @converted_subfields)
 634             );
 635         }
 636     }
 637
 638     MARC::Charset->ignore_errors($prev_ignore);
 639
 640     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 641
 642     return @errors;
 643 }
 644
 645 =head2 _marc_iso5426_to_utf8
 646
 647   my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
 648
 649 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
 650 If the conversion fails for some reason, an
 651 appropriate messages will be placed in the returned
 652 C<@errors> array.
 653
 654 FIXME - is ISO-5426 equivalent enough to MARC-8
 655 that C<MARC::Charset> can be used instead?
 656
 657 =cut
 658
 659 sub _marc_iso5426_to_utf8 {
 660     my $marc_record = shift;
 661     my $marc_flavour = shift;
 662
 663     my @errors = ();
 664
 665     foreach my $field ($marc_record->fields()) {
 666         if ($field->is_control_field()) {
 667             ; # do nothing -- control fields should not contain non-ASCII characters
 668         } else {
 669             my @converted_subfields;
 670             foreach my $subfield ($field->subfields()) {
 671                 my $utf8sf = char_decode5426($subfield->[1]);
 672                 push @converted_subfields, $subfield->[0], $utf8sf;
 673             }
 674
 675             $field->replace_with(MARC::Field->new(
 676                 $field->tag(), $field->indicator(1), $field->indicator(2),
 677                 @converted_subfields)
 678             );
 679         }
 680     }
 681
 682     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 683
 684     return @errors;
 685 }
 686
 687 =head2 _marc_to_utf8_via_text_iconv
 688
 689   my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
 690
 691 Convert a C<MARC::Record> to UTF-8 in-place using the
 692 C<Text::Iconv> CPAN module.  Any source encoding accepted
 693 by the user's iconv installation should work.  If
 694 the source encoding is not recognized on the user's
 695 server or the conversion fails for some reason,
 696 appropriate messages will be placed in the returned
 697 C<@errors> array.
 698
 699 =cut
 700
 701 sub _marc_to_utf8_via_text_iconv {
 702     my $marc_record = shift;
 703     my $marc_flavour = shift;
 704     my $source_encoding = shift;
 705
 706     my @errors = ();
 707     my $decoder;
 708     eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
 709     if ($@) {
 710         push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
 711         return @errors;
 712     }
 713
 714     my $prev_raise_error = Text::Iconv->raise_error();
 715     Text::Iconv->raise_error(1);
 716
 717     foreach my $field ($marc_record->fields()) {
 718         if ($field->is_control_field()) {
 719             ; # do nothing -- control fields should not contain non-ASCII characters
 720         } else {
 721             my @converted_subfields;
 722             foreach my $subfield ($field->subfields()) {
 723                 my $converted_value;
 724                 my $conversion_ok = 1;
 725                 eval { $converted_value = $decoder->convert($subfield->[1]); };
 726                 if ($@) {
 727                     $conversion_ok = 0;
 728                     push @errors, $@;
 729                 } elsif (not defined $converted_value) {
 730                     $conversion_ok = 0;
 731                     push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
 732                 }
 733
 734                 if ($conversion_ok) {
 735                     push @converted_subfields, $subfield->[0], $converted_value;
 736                 } else {
 737                     $converted_value = $subfield->[1];
 738                     $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 739                     push @converted_subfields, $subfield->[0], $converted_value;
 740                 }
 741             }
 742
 743             $field->replace_with(MARC::Field->new(
 744                 $field->tag(), $field->indicator(1), $field->indicator(2),
 745                 @converted_subfields)
 746             );
 747         }
 748     }
 749
 750     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 751     Text::Iconv->raise_error($prev_raise_error);
 752
 753     return @errors;
 754 }
 755
 756 =head2 _marc_to_utf8_replacement_char
 757
 758   _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
 759
 760 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
 761 unsatisfactory method of replacing all non-ASCII (e.g.,
 762 where the eight bit is set) octet with the Unicode
 763 replacement character.  This is meant as a last-ditch
 764 method, and would be best used as part of a UI that
 765 lets a cataloguer pick various character conversions
 766 until he or she finds the right one.
 767
 768 =cut
 769
 770 sub _marc_to_utf8_replacement_char {
 771     my $marc_record = shift;
 772     my $marc_flavour = shift;
 773
 774     foreach my $field ($marc_record->fields()) {
 775         if ($field->is_control_field()) {
 776             ; # do nothing -- control fields should not contain non-ASCII characters
 777         } else {
 778             my @converted_subfields;
 779             foreach my $subfield ($field->subfields()) {
 780                 my $value = $subfield->[1];
 781                 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
 782                 push @converted_subfields, $subfield->[0], $value;
 783             }
 784
 785             $field->replace_with(MARC::Field->new(
 786                 $field->tag(), $field->indicator(1), $field->indicator(2),
 787                 @converted_subfields)
 788             );
 789         }
 790     }
 791
 792     SetMarcUnicodeFlag($marc_record, $marc_flavour);
 793 }
 794
 795 =head2 char_decode5426
 796
 797   my $utf8string = char_decode5426($iso_5426_string);
 798
 799 Converts a string from ISO-5426 to UTF-8.
 800
 801 =cut
 802
 803
 804 my %chars;
 805
 806 ####
 807 ## 0xb
 808 $chars{0xb0}=0x0101;#3/0ayn[ain]
 809 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
 810 #$chars{0xb2}=0x00e0;#'à';
 811 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
 812 #$chars{0xb3}=0x00e7;#'ç';
 813 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
 814 # $chars{0xb4}='è';
 815 $chars{0xb4}=0x00e8;
 816 # $chars{0xb5}='é';
 817 $chars{0xb5}=0x00e9;
 818 $chars{0xb6}=0x2021; # double dagger
 819 $chars{0xb7}=0x00b7; # middle dot
 820 $chars{0xb8}=0x2033; # double prime
 821 $chars{0xb9}=0x2019; # right single quotation mark
 822 $chars{0xba}=0x201d; # right double quotation mark
 823 $chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
 824 $chars{0xbc}=0x266f; # music sharp sign
 825 $chars{0xbd}=0x02b9; # modifier letter prime
 826 $chars{0xbe}=0x02ba; # modifier letter double prime
 827 $chars{0xbf}=0x00bf; # inverted question mark
 828
 829 ####
 830 ## 0xe
 831 $chars{0xe1}=0x00c6; # latin capital letter ae
 832 $chars{0xe2}=0x0110; # latin capital letter d with stroke
 833 $chars{0xe6}=0x0132; # latin capital ligature ij
 834 $chars{0xe8}=0x0141; # latin capital letter l with stroke
 835 $chars{0xe9}=0x00d8; # latin capital letter o with stroke
 836 $chars{0xea}=0x0152; # latin capital ligature oe
 837 $chars{0xec}=0x00de; # latin capital letter thorn
 838
 839 ####
 840 ## 0xf
 841 $chars{0xf1}=0x00e6; # latin small letter ae
 842 $chars{0xf2}=0x0111; # latin small letter d with stroke
 843 $chars{0xf3}=0x00f0; # latin small letter eth
 844 $chars{0xf5}=0x0131; # latin small letter dotless i
 845 $chars{0xf6}=0x0133; # latin small ligature ij
 846 $chars{0xf8}=0x0142; # latin small letter l with stroke
 847 $chars{0xf9}=0x00f8; # latin small letter o with stroke
 848 $chars{0xfa}=0x0153; # latin small ligature oe
 849 $chars{0xfb}=0x00df; # latin small letter sharp s
 850 $chars{0xfc}=0x00fe; # latin small letter thorn
 851
 852 ####
 853 ## Others
 854 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
 855 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
 856 #$chars{0x81d1}=0x00b0; # FIXME useless
 857
 858 ####
 859 ## combined characters iso5426
 860
 861 $chars{0xc041}=0x1ea2; # capital a with hook above
 862 $chars{0xc045}=0x1eba; # capital e with hook above
 863 $chars{0xc049}=0x1ec8; # capital i with hook above
 864 $chars{0xc04f}=0x1ece; # capital o with hook above
 865 $chars{0xc055}=0x1ee6; # capital u with hook above
 866 $chars{0xc059}=0x1ef6; # capital y with hook above
 867 $chars{0xc061}=0x1ea3; # small a with hook above
 868 $chars{0xc065}=0x1ebb; # small e with hook above
 869 $chars{0xc069}=0x1ec9; # small i with hook above
 870 $chars{0xc06f}=0x1ecf; # small o with hook above
 871 $chars{0xc075}=0x1ee7; # small u with hook above
 872 $chars{0xc079}=0x1ef7; # small y with hook above
 873
 874         # 4/1 grave accent
 875 $chars{0xc141}=0x00c0; # capital a with grave accent
 876 $chars{0xc145}=0x00c8; # capital e with grave accent
 877 $chars{0xc149}=0x00cc; # capital i with grave accent
 878 $chars{0xc14f}=0x00d2; # capital o with grave accent
 879 $chars{0xc155}=0x00d9; # capital u with grave accent
 880 $chars{0xc157}=0x1e80; # capital w with grave
 881 $chars{0xc159}=0x1ef2; # capital y with grave
 882 $chars{0xc161}=0x00e0; # small a with grave accent
 883 $chars{0xc165}=0x00e8; # small e with grave accent
 884 $chars{0xc169}=0x00ec; # small i with grave accent
 885 $chars{0xc16f}=0x00f2; # small o with grave accent
 886 $chars{0xc175}=0x00f9; # small u with grave accent
 887 $chars{0xc177}=0x1e81; # small w with grave
 888 $chars{0xc179}=0x1ef3; # small y with grave
 889         # 4/2 acute accent
 890 $chars{0xc241}=0x00c1; # capital a with acute accent
 891 $chars{0xc243}=0x0106; # capital c with acute accent
 892 $chars{0xc245}=0x00c9; # capital e with acute accent
 893 $chars{0xc247}=0x01f4; # capital g with acute
 894 $chars{0xc249}=0x00cd; # capital i with acute accent
 895 $chars{0xc24b}=0x1e30; # capital k with acute
 896 $chars{0xc24c}=0x0139; # capital l with acute accent
 897 $chars{0xc24d}=0x1e3e; # capital m with acute
 898 $chars{0xc24e}=0x0143; # capital n with acute accent
 899 $chars{0xc24f}=0x00d3; # capital o with acute accent
 900 $chars{0xc250}=0x1e54; # capital p with acute
 901 $chars{0xc252}=0x0154; # capital r with acute accent
 902 $chars{0xc253}=0x015a; # capital s with acute accent
 903 $chars{0xc255}=0x00da; # capital u with acute accent
 904 $chars{0xc257}=0x1e82; # capital w with acute
 905 $chars{0xc259}=0x00dd; # capital y with acute accent
 906 $chars{0xc25a}=0x0179; # capital z with acute accent
 907 $chars{0xc261}=0x00e1; # small a with acute accent
 908 $chars{0xc263}=0x0107; # small c with acute accent
 909 $chars{0xc265}=0x00e9; # small e with acute accent
 910 $chars{0xc267}=0x01f5; # small g with acute
 911 $chars{0xc269}=0x00ed; # small i with acute accent
 912 $chars{0xc26b}=0x1e31; # small k with acute
 913 $chars{0xc26c}=0x013a; # small l with acute accent
 914 $chars{0xc26d}=0x1e3f; # small m with acute
 915 $chars{0xc26e}=0x0144; # small n with acute accent
 916 $chars{0xc26f}=0x00f3; # small o with acute accent
 917 $chars{0xc270}=0x1e55; # small p with acute
 918 $chars{0xc272}=0x0155; # small r with acute accent
 919 $chars{0xc273}=0x015b; # small s with acute accent
 920 $chars{0xc275}=0x00fa; # small u with acute accent
 921 $chars{0xc277}=0x1e83; # small w with acute
 922 $chars{0xc279}=0x00fd; # small y with acute accent
 923 $chars{0xc27a}=0x017a; # small z with acute accent
 924 $chars{0xc2e1}=0x01fc; # capital ae with acute
 925 $chars{0xc2f1}=0x01fd; # small ae with acute
 926        # 4/3 circumflex accent
 927 $chars{0xc341}=0x00c2; # capital a with circumflex accent
 928 $chars{0xc343}=0x0108; # capital c with circumflex
 929 $chars{0xc345}=0x00ca; # capital e with circumflex accent
 930 $chars{0xc347}=0x011c; # capital g with circumflex
 931 $chars{0xc348}=0x0124; # capital h with circumflex
 932 $chars{0xc349}=0x00ce; # capital i with circumflex accent
 933 $chars{0xc34a}=0x0134; # capital j with circumflex
 934 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
 935 $chars{0xc353}=0x015c; # capital s with circumflex
 936 $chars{0xc355}=0x00db; # capital u with circumflex
 937 $chars{0xc357}=0x0174; # capital w with circumflex
 938 $chars{0xc359}=0x0176; # capital y with circumflex
 939 $chars{0xc35a}=0x1e90; # capital z with circumflex
 940 $chars{0xc361}=0x00e2; # small a with circumflex accent
 941 $chars{0xc363}=0x0109; # small c with circumflex
 942 $chars{0xc365}=0x00ea; # small e with circumflex accent
 943 $chars{0xc367}=0x011d; # small g with circumflex
 944 $chars{0xc368}=0x0125; # small h with circumflex
 945 $chars{0xc369}=0x00ee; # small i with circumflex accent
 946 $chars{0xc36a}=0x0135; # small j with circumflex
 947 $chars{0xc36e}=0x00f1; # small n with tilde
 948 $chars{0xc36f}=0x00f4; # small o with circumflex accent
 949 $chars{0xc373}=0x015d; # small s with circumflex
 950 $chars{0xc375}=0x00fb; # small u with circumflex
 951 $chars{0xc377}=0x0175; # small w with circumflex
 952 $chars{0xc379}=0x0177; # small y with circumflex
 953 $chars{0xc37a}=0x1e91; # small z with circumflex
 954         # 4/4 tilde
 955 $chars{0xc441}=0x00c3; # capital a with tilde
 956 $chars{0xc445}=0x1ebc; # capital e with tilde
 957 $chars{0xc449}=0x0128; # capital i with tilde
 958 $chars{0xc44e}=0x00d1; # capital n with tilde
 959 $chars{0xc44f}=0x00d5; # capital o with tilde
 960 $chars{0xc455}=0x0168; # capital u with tilde
 961 $chars{0xc456}=0x1e7c; # capital v with tilde
 962 $chars{0xc459}=0x1ef8; # capital y with tilde
 963 $chars{0xc461}=0x00e3; # small a with tilde
 964 $chars{0xc465}=0x1ebd; # small e with tilde
 965 $chars{0xc469}=0x0129; # small i with tilde
 966 $chars{0xc46e}=0x00f1; # small n with tilde
 967 $chars{0xc46f}=0x00f5; # small o with tilde
 968 $chars{0xc475}=0x0169; # small u with tilde
 969 $chars{0xc476}=0x1e7d; # small v with tilde
 970 $chars{0xc479}=0x1ef9; # small y with tilde
 971     # 4/5 macron
 972 $chars{0xc541}=0x0100; # capital a with macron
 973 $chars{0xc545}=0x0112; # capital e with macron
 974 $chars{0xc547}=0x1e20; # capital g with macron
 975 $chars{0xc549}=0x012a; # capital i with macron
 976 $chars{0xc54f}=0x014c; # capital o with macron
 977 $chars{0xc555}=0x016a; # capital u with macron
 978 $chars{0xc561}=0x0101; # small a with macron
 979 $chars{0xc565}=0x0113; # small e with macron
 980 $chars{0xc567}=0x1e21; # small g with macron
 981 $chars{0xc569}=0x012b; # small i with macron
 982 $chars{0xc56f}=0x014d; # small o with macron
 983 $chars{0xc575}=0x016b; # small u with macron
 984 $chars{0xc572}=0x0159; # small r with macron
 985 $chars{0xc5e1}=0x01e2; # capital ae with macron
 986 $chars{0xc5f1}=0x01e3; # small ae with macron
 987         # 4/6 breve
 988 $chars{0xc641}=0x0102; # capital a with breve
 989 $chars{0xc645}=0x0114; # capital e with breve
 990 $chars{0xc647}=0x011e; # capital g with breve
 991 $chars{0xc649}=0x012c; # capital i with breve
 992 $chars{0xc64f}=0x014e; # capital o with breve
 993 $chars{0xc655}=0x016c; # capital u with breve
 994 $chars{0xc661}=0x0103; # small a with breve
 995 $chars{0xc665}=0x0115; # small e with breve
 996 $chars{0xc667}=0x011f; # small g with breve
 997 $chars{0xc669}=0x012d; # small i with breve
 998 $chars{0xc66f}=0x014f; # small o with breve
 999 $chars{0xc675}=0x016d; # small u with breve
1000         # 4/7 dot above
1001 $chars{0xc7b0}=0x01e1; # Ain with dot above
1002 $chars{0xc742}=0x1e02; # capital b with dot above
1003 $chars{0xc743}=0x010a; # capital c with dot above
1004 $chars{0xc744}=0x1e0a; # capital d with dot above
1005 $chars{0xc745}=0x0116; # capital e with dot above
1006 $chars{0xc746}=0x1e1e; # capital f with dot above
1007 $chars{0xc747}=0x0120; # capital g with dot above
1008 $chars{0xc748}=0x1e22; # capital h with dot above
1009 $chars{0xc749}=0x0130; # capital i with dot above
1010 $chars{0xc74d}=0x1e40; # capital m with dot above
1011 $chars{0xc74e}=0x1e44; # capital n with dot above
1012 $chars{0xc750}=0x1e56; # capital p with dot above
1013 $chars{0xc752}=0x1e58; # capital r with dot above
1014 $chars{0xc753}=0x1e60; # capital s with dot above
1015 $chars{0xc754}=0x1e6a; # capital t with dot above
1016 $chars{0xc757}=0x1e86; # capital w with dot above
1017 $chars{0xc758}=0x1e8a; # capital x with dot above
1018 $chars{0xc759}=0x1e8e; # capital y with dot above
1019 $chars{0xc75a}=0x017b; # capital z with dot above
1020 $chars{0xc761}=0x0227; # small b with dot above
1021 $chars{0xc762}=0x1e03; # small b with dot above
1022 $chars{0xc763}=0x010b; # small c with dot above
1023 $chars{0xc764}=0x1e0b; # small d with dot above
1024 $chars{0xc765}=0x0117; # small e with dot above
1025 $chars{0xc766}=0x1e1f; # small f with dot above
1026 $chars{0xc767}=0x0121; # small g with dot above
1027 $chars{0xc768}=0x1e23; # small h with dot above
1028 $chars{0xc76d}=0x1e41; # small m with dot above
1029 $chars{0xc76e}=0x1e45; # small n with dot above
1030 $chars{0xc770}=0x1e57; # small p with dot above
1031 $chars{0xc772}=0x1e59; # small r with dot above
1032 $chars{0xc773}=0x1e61; # small s with dot above
1033 $chars{0xc774}=0x1e6b; # small t with dot above
1034 $chars{0xc777}=0x1e87; # small w with dot above
1035 $chars{0xc778}=0x1e8b; # small x with dot above
1036 $chars{0xc779}=0x1e8f; # small y with dot above
1037 $chars{0xc77a}=0x017c; # small z with dot above
1038         # 4/8 trema, diaresis
1039 $chars{0xc820}=0x00a8; # diaeresis
1040 $chars{0xc841}=0x00c4; # capital a with diaeresis
1041 $chars{0xc845}=0x00cb; # capital e with diaeresis
1042 $chars{0xc848}=0x1e26; # capital h with diaeresis
1043 $chars{0xc849}=0x00cf; # capital i with diaeresis
1044 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1045 $chars{0xc855}=0x00dc; # capital u with diaeresis
1046 $chars{0xc857}=0x1e84; # capital w with diaeresis
1047 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1048 $chars{0xc859}=0x0178; # capital y with diaeresis
1049 $chars{0xc861}=0x00e4; # small a with diaeresis
1050 $chars{0xc865}=0x00eb; # small e with diaeresis
1051 $chars{0xc868}=0x1e27; # small h with diaeresis
1052 $chars{0xc869}=0x00ef; # small i with diaeresis
1053 $chars{0xc86f}=0x00f6; # small o with diaeresis
1054 $chars{0xc874}=0x1e97; # small t with diaeresis
1055 $chars{0xc875}=0x00fc; # small u with diaeresis
1056 $chars{0xc877}=0x1e85; # small w with diaeresis
1057 $chars{0xc878}=0x1e8d; # small x with diaeresis
1058 $chars{0xc879}=0x00ff; # small y with diaeresis
1059         # 4/9 umlaut
1060 $chars{0xc920}=0x00a8; # [diaeresis]
1061 $chars{0xc961}=0x00e4; # a with umlaut
1062 $chars{0xc965}=0x00eb; # e with umlaut
1063 $chars{0xc969}=0x00ef; # i with umlaut
1064 $chars{0xc96f}=0x00f6; # o with umlaut
1065 $chars{0xc975}=0x00fc; # u with umlaut
1066         # 4/10 circle above
1067 $chars{0xca41}=0x00c5; # capital a with ring above
1068 $chars{0xcaad}=0x016e; # capital u with ring above
1069 $chars{0xca61}=0x00e5; # small a with ring above
1070 $chars{0xca75}=0x016f; # small u with ring above
1071 $chars{0xca77}=0x1e98; # small w with ring above
1072 $chars{0xca79}=0x1e99; # small y with ring above
1073         # 4/11 high comma off centre
1074         # 4/12 inverted high comma centred
1075         # 4/13 double acute accent
1076 $chars{0xcd4f}=0x0150; # capital o with double acute
1077 $chars{0xcd55}=0x0170; # capital u with double acute
1078 $chars{0xcd6f}=0x0151; # small o with double acute
1079 $chars{0xcd75}=0x0171; # small u with double acute
1080         # 4/14 horn
1081 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1082 $chars{0xce55}=0x01af; # latin capital letter u with horn
1083 $chars{0xce74}=0x01a1; # latin small letter o with horn
1084 $chars{0xce75}=0x01b0; # latin small letter u with horn
1085         # 4/15 caron (hacek
1086 $chars{0xcf41}=0x01cd; # capital a with caron
1087 $chars{0xcf43}=0x010c; # capital c with caron
1088 $chars{0xcf44}=0x010e; # capital d with caron
1089 $chars{0xcf45}=0x011a; # capital e with caron
1090 $chars{0xcf47}=0x01e6; # capital g with caron
1091 $chars{0xcf49}=0x01cf; # capital i with caron
1092 $chars{0xcf4b}=0x01e8; # capital k with caron
1093 $chars{0xcf4c}=0x013d; # capital l with caron
1094 $chars{0xcf4e}=0x0147; # capital n with caron
1095 $chars{0xcf4f}=0x01d1; # capital o with caron
1096 $chars{0xcf52}=0x0158; # capital r with caron
1097 $chars{0xcf53}=0x0160; # capital s with caron
1098 $chars{0xcf54}=0x0164; # capital t with caron
1099 $chars{0xcf55}=0x01d3; # capital u with caron
1100 $chars{0xcf5a}=0x017d; # capital z with caron
1101 $chars{0xcf61}=0x01ce; # small a with caron
1102 $chars{0xcf63}=0x010d; # small c with caron
1103 $chars{0xcf64}=0x010f; # small d with caron
1104 $chars{0xcf65}=0x011b; # small e with caron
1105 $chars{0xcf67}=0x01e7; # small g with caron
1106 $chars{0xcf69}=0x01d0; # small i with caron
1107 $chars{0xcf6a}=0x01f0; # small j with caron
1108 $chars{0xcf6b}=0x01e9; # small k with caron
1109 $chars{0xcf6c}=0x013e; # small l with caron
1110 $chars{0xcf6e}=0x0148; # small n with caron
1111 $chars{0xcf6f}=0x01d2; # small o with caron
1112 $chars{0xcf72}=0x0159; # small r with caron
1113 $chars{0xcf73}=0x0161; # small s with caron
1114 $chars{0xcf74}=0x0165; # small t with caron
1115 $chars{0xcf75}=0x01d4; # small u with caron
1116 $chars{0xcf7a}=0x017e; # small z with caron
1117         # 5/0 cedilla
1118 $chars{0xd020}=0x00b8; # cedilla
1119 $chars{0xd043}=0x00c7; # capital c with cedilla
1120 $chars{0xd044}=0x1e10; # capital d with cedilla
1121 $chars{0xd047}=0x0122; # capital g with cedilla
1122 $chars{0xd048}=0x1e28; # capital h with cedilla
1123 $chars{0xd04b}=0x0136; # capital k with cedilla
1124 $chars{0xd04c}=0x013b; # capital l with cedilla
1125 $chars{0xd04e}=0x0145; # capital n with cedilla
1126 $chars{0xd052}=0x0156; # capital r with cedilla
1127 $chars{0xd053}=0x015e; # capital s with cedilla
1128 $chars{0xd054}=0x0162; # capital t with cedilla
1129 $chars{0xd063}=0x00e7; # small c with cedilla
1130 $chars{0xd064}=0x1e11; # small d with cedilla
1131 $chars{0xd065}=0x0119; # small e with cedilla
1132 $chars{0xd067}=0x0123; # small g with cedilla
1133 $chars{0xd068}=0x1e29; # small h with cedilla
1134 $chars{0xd06b}=0x0137; # small k with cedilla
1135 $chars{0xd06c}=0x013c; # small l with cedilla
1136 $chars{0xd06e}=0x0146; # small n with cedilla
1137 $chars{0xd072}=0x0157; # small r with cedilla
1138 $chars{0xd073}=0x015f; # small s with cedilla
1139 $chars{0xd074}=0x0163; # small t with cedilla
1140         # 5/1 rude
1141         # 5/2 hook to left
1142         # 5/3 ogonek (hook to right
1143 $chars{0xd320}=0x02db; # ogonek
1144 $chars{0xd341}=0x0104; # capital a with ogonek
1145 $chars{0xd345}=0x0118; # capital e with ogonek
1146 $chars{0xd349}=0x012e; # capital i with ogonek
1147 $chars{0xd34f}=0x01ea; # capital o with ogonek
1148 $chars{0xd355}=0x0172; # capital u with ogonek
1149 $chars{0xd361}=0x0105; # small a with ogonek
1150 $chars{0xd365}=0x0119; # small e with ogonek
1151 $chars{0xd369}=0x012f; # small i with ogonek
1152 $chars{0xd36f}=0x01eb; # small o with ogonek
1153 $chars{0xd375}=0x0173; # small u with ogonek
1154         # 5/4 circle below
1155 $chars{0xd441}=0x1e00; # capital a with ring below
1156 $chars{0xd461}=0x1e01; # small a with ring below
1157         # 5/5 half circle below
1158 $chars{0xd548}=0x1e2a; # capital h with breve below
1159 $chars{0xd568}=0x1e2b; # small h with breve below
1160         # 5/6 dot below
1161 $chars{0xd641}=0x1ea0; # capital a with dot below
1162 $chars{0xd642}=0x1e04; # capital b with dot below
1163 $chars{0xd644}=0x1e0c; # capital d with dot below
1164 $chars{0xd645}=0x1eb8; # capital e with dot below
1165 $chars{0xd648}=0x1e24; # capital h with dot below
1166 $chars{0xd649}=0x1eca; # capital i with dot below
1167 $chars{0xd64b}=0x1e32; # capital k with dot below
1168 $chars{0xd64c}=0x1e36; # capital l with dot below
1169 $chars{0xd64d}=0x1e42; # capital m with dot below
1170 $chars{0xd64e}=0x1e46; # capital n with dot below
1171 $chars{0xd64f}=0x1ecc; # capital o with dot below
1172 $chars{0xd652}=0x1e5a; # capital r with dot below
1173 $chars{0xd653}=0x1e62; # capital s with dot below
1174 $chars{0xd654}=0x1e6c; # capital t with dot below
1175 $chars{0xd655}=0x1ee4; # capital u with dot below
1176 $chars{0xd656}=0x1e7e; # capital v with dot below
1177 $chars{0xd657}=0x1e88; # capital w with dot below
1178 $chars{0xd659}=0x1ef4; # capital y with dot below
1179 $chars{0xd65a}=0x1e92; # capital z with dot below
1180 $chars{0xd661}=0x1ea1; # small a with dot below
1181 $chars{0xd662}=0x1e05; # small b with dot below
1182 $chars{0xd664}=0x1e0d; # small d with dot below
1183 $chars{0xd665}=0x1eb9; # small e with dot below
1184 $chars{0xd668}=0x1e25; # small h with dot below
1185 $chars{0xd669}=0x1ecb; # small i with dot below
1186 $chars{0xd66b}=0x1e33; # small k with dot below
1187 $chars{0xd66c}=0x1e37; # small l with dot below
1188 $chars{0xd66d}=0x1e43; # small m with dot below
1189 $chars{0xd66e}=0x1e47; # small n with dot below
1190 $chars{0xd66f}=0x1ecd; # small o with dot below
1191 $chars{0xd672}=0x1e5b; # small r with dot below
1192 $chars{0xd673}=0x1e63; # small s with dot below
1193 $chars{0xd674}=0x1e6d; # small t with dot below
1194 $chars{0xd675}=0x1ee5; # small u with dot below
1195 $chars{0xd676}=0x1e7f; # small v with dot below
1196 $chars{0xd677}=0x1e89; # small w with dot below
1197 $chars{0xd679}=0x1ef5; # small y with dot below
1198 $chars{0xd67a}=0x1e93; # small z with dot below
1199         # 5/7 double dot below
1200 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1201 $chars{0xd775}=0x1e73; # small u with diaeresis below
1202         # 5/8 underline
1203 $chars{0xd820}=0x005f; # underline
1204         # 5/9 double underline
1205 $chars{0xd920}=0x2017; # double underline
1206         # 5/10 small low vertical bar
1207 $chars{0xda20}=0x02cc; #
1208         # 5/11 circumflex below
1209         # 5/12 (this position shall not be used)
1210         # 5/13 left half of ligature sign and of double tilde
1211         # 5/14 right half of ligature sign
1212         # 5/15 right half of double tilde
1213 #     map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1214
1215 sub char_decode5426 {
1216     my ( $string) = @_;
1217     my $result;
1218
1219     my @data = unpack("C*", $string);
1220     my @characters;
1221     my $length=scalar(@data);
1222     for (my $i = 0; $i < scalar(@data); $i++) {
1223       my $char= $data[$i];
1224       if ($char >= 0x00 && $char <= 0x7F){
1225         #IsAscii
1226
1227           push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1228       }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1229         #Combined Char
1230         my $convchar ;
1231         if ($chars{$char*256+$data[$i+1]}) {
1232           $convchar= $chars{$char * 256 + $data[$i+1]};
1233           $i++;
1234 #           printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1235         } elsif ($chars{$char})  {
1236           $convchar= $chars{$char};
1237 #           printf "0xC char %x, converted %x\n",$char,$chars{$char};
1238         }else {
1239           $convchar=$char;
1240         }
1241         push @characters,$convchar;
1242       } else {
1243         my $convchar;
1244         if ($chars{$char})  {
1245           $convchar= $chars{$char};
1246 #            printf "char %x,  converted %x\n",$char,$chars{$char};
1247         }else {
1248 #            printf "char %x $char\n",$char;
1249           $convchar=$char;
1250         }
1251         push @characters,$convchar;
1252       }
1253     }
1254     $result=pack "U*",@characters;
1255 #     $result=~s/\x01//;
1256 #     $result=~s/\x00//;
1257      $result=~s/\x0f//;
1258      $result=~s/\x1b.//;
1259      $result=~s/\x0e//;
1260      $result=~s/\x1b\x5b//;
1261 #   map{printf "%x",$_} @characters;
1262 #   printf "\n";
1263   return $result;
1264 }
1265
1266 1;
1267
1268
1269 =head1 AUTHOR
1270
1271 Koha Development Team <http://koha-community.org/>
1272
1273 Galen Charlton <galen.charlton@liblime.com>
1274
1275 =cut