Bug 17554: (followup) Shibboleth check should use ->find too
[koha.git] / C4 / Charset.pm
blob906321f4763bf9ba9908369d3acea8c8022fa443
1 package C4::Charset;
3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
20 use strict;
21 use warnings;
23 use MARC::Charset qw/marc8_to_utf8/;
24 use Text::Iconv;
25 use C4::Debug;
26 use Unicode::Normalize;
27 use Encode qw( decode encode is_utf8 );
29 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
31 BEGIN {
32 require Exporter;
33 @ISA = qw(Exporter);
34 @EXPORT = qw(
35 NormalizeString
36 IsStringUTF8ish
37 MarcToUTF8Record
38 SetUTF8Flag
39 SetMarcUnicodeFlag
40 StripNonXmlChars
41 nsb_clean
42 SanitizeRecord
46 =encoding UTF-8
48 =head1 NAME
50 C4::Charset - utilities for handling character set conversions.
52 =head1 SYNOPSIS
54 use C4::Charset;
56 =head1 DESCRIPTION
58 This module contains routines for dealing with character set
59 conversions, particularly for MARC records.
61 A variety of character encodings are in use by various MARC
62 standards, and even more character encodings are used by
63 non-standard MARC records. The various MARC formats generally
64 do not do a good job of advertising a given record's character
65 encoding, and even when a record does advertise its encoding,
66 e.g., via the Leader/09, experience has shown that one cannot
67 trust it.
69 Ultimately, all MARC records are stored in Koha in UTF-8 and
70 must be converted from whatever the source character encoding is.
71 The goal of this module is to ensure that these conversions
72 take place accurately. When a character conversion cannot take
73 place, or at least not accurately, the module was provide
74 enough information to allow user-facing code to inform the user
75 on how to deal with the situation.
77 =cut
79 =head1 FUNCTIONS
81 =head2 IsStringUTF8ish
83 my $is_utf8 = IsStringUTF8ish($str);
85 Determines if C<$str> is valid UTF-8. This can mean
86 one of two things:
88 =over
90 =item *
92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
94 =item *
96 The Perl UTF-8 flag is B<not> set, but the octets contain
97 valid UTF-8.
99 =back
101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
102 because in one could be presented with a MARC blob that is
103 not actually in UTF-8 but whose sequence of octets appears to be
104 valid UTF-8. The rest of the MARC character conversion functions
105 will assume that this situation occur does not very often.
107 =cut
109 sub IsStringUTF8ish {
110 my $str = shift;
112 return 1 if Encode::is_utf8($str);
113 return utf8::decode( $str );
116 =head2 SetUTF8Flag
118 my $marc_record = SetUTF8Flag($marc_record, $nfd);
120 This function sets the PERL UTF8 flag for data.
121 It is required when using new_from_usmarc
122 since MARC::File::USMARC does not handle PERL UTF8 setting.
123 When editing unicode marc records fields and subfields, you
124 would end up in double encoding without using this function.
126 If $nfd is set, string normalization will use NFD instead of NFC
128 FIXME
129 In my opinion, this function belongs to MARC::Record and not
130 to this package.
131 But since it handles charset, and MARC::Record, it finds its way in that package
133 =cut
135 sub SetUTF8Flag{
136 my ($record, $nfd)=@_;
137 return unless ($record && $record->fields());
138 foreach my $field ($record->fields()){
139 if ($field->tag()>=10){
140 my @subfields;
141 foreach my $subfield ($field->subfields()){
142 push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
144 eval {
145 my $newfield=MARC::Field->new(
146 $field->tag(),
147 $field->indicator(1),
148 $field->indicator(2),
149 @subfields
151 $field->replace_with($newfield);
153 warn "ERROR occurred in SetUTF8Flag $@" if $@;
158 =head2 NormalizeString
160 my $normalized_string=NormalizeString($string,$nfd,$transform);
162 Given a string
163 nfd : If you want to set NFD and not NFC
164 transform : If you expect all the signs to be removed
166 Sets the PERL UTF8 Flag on your initial data if need be
167 and applies cleaning if required
169 Returns a utf8 NFC normalized string
171 Sample code :
172 my $string=NormalizeString ("l'ornithoptère");
173 #results into ornithoptère in NFC form and sets UTF8 Flag
175 =cut
178 sub NormalizeString{
179 my ($string,$nfd,$transform)=@_;
180 return $string unless defined($string); # force scalar context return.
181 $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
182 if ($nfd){
183 $string= NFD($string);
185 else {
186 $string=NFC($string);
188 if ($transform){
189 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
190 #removing one letter words "d'" "l'" was changed into "d " "l "
191 $string=~s/\b\S\b//g;
192 $string=~s/\s+$//g;
194 return $string;
197 =head2 MarcToUTF8Record
199 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
200 $marc_flavour, [, $source_encoding]);
202 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
203 optional source encoding, return a C<MARC::Record> that is
204 converted to UTF-8.
206 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
207 is not guaranteed to have been converted correctly. Specifically,
208 if C<$converted_from> is 'failed', the MARC record returned failed
209 character conversion and had each of its non-ASCII octets changed
210 to the Unicode replacement character.
212 If the source encoding was not specified, this routine will
213 try to guess it; the character encoding used for a successful
214 conversion is returned in C<$converted_from>.
216 =cut
218 sub MarcToUTF8Record {
219 my $marc = shift;
220 my $marc_flavour = shift;
221 my $source_encoding = shift;
222 my $marc_record;
223 my $marc_blob_is_utf8 = 0;
224 if (ref($marc) eq 'MARC::Record') {
225 my $marc_blob = $marc->as_usmarc();
226 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
227 $marc_record = $marc;
228 } else {
229 # dealing with a MARC blob
231 # remove any ersatz whitespace from the beginning and
232 # end of the MARC blob -- these can creep into MARC
233 # files produced by several sources -- caller really
234 # should be doing this, however
235 $marc =~ s/^\s+//;
236 $marc =~ s/\s+$//;
237 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
238 eval {
239 $marc_record = MARC::Record->new_from_usmarc($marc);
241 if ($@) {
242 # if we fail the first time, one likely problem
243 # is that we have a MARC21 record that says that it's
244 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
245 # We'll try parsing it again.
246 substr($marc, 9, 1) = ' ';
247 eval {
248 $marc_record = MARC::Record->new_from_usmarc($marc);
250 if ($@) {
251 # it's hopeless; return an empty MARC::Record
252 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
257 # If we do not know the source encoding, try some guesses
258 # as follows:
259 # 1. Record is UTF-8 already.
260 # 2. If MARC flavor is MARC21 or NORMARC, then
261 # a. record is MARC-8
262 # b. record is ISO-8859-1
263 # 3. If MARC flavor is UNIMARC, then
264 if (not defined $source_encoding) {
265 if ($marc_blob_is_utf8) {
266 # note that for MARC21/NORMARC we are not bothering to check
267 # if the Leader/09 is set to 'a' or not -- because
268 # of problems with various ILSs (including Koha in the
269 # past, alas), this just is not trustworthy.
270 SetMarcUnicodeFlag($marc_record, $marc_flavour);
271 return $marc_record, 'UTF-8', [];
272 } else {
273 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
274 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
275 } elsif ($marc_flavour =~/UNIMARC/) {
276 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
277 } else {
278 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
281 } else {
282 # caller knows the character encoding
283 my $original_marc_record = $marc_record->clone();
284 my @errors;
285 if ($source_encoding =~ /utf-?8/i) {
286 if ($marc_blob_is_utf8) {
287 SetMarcUnicodeFlag($marc_record, $marc_flavour);
288 return $marc_record, 'UTF-8', [];
289 } else {
290 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
292 } elsif ($source_encoding =~ /marc-?8/i) {
293 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
294 } elsif ($source_encoding =~ /5426/) {
295 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
296 } else {
297 # assume any other character encoding is for Text::Iconv
298 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
301 if (@errors) {
302 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
303 return $original_marc_record, 'failed', \@errors;
304 } else {
305 return $marc_record, $source_encoding, [];
311 =head2 SetMarcUnicodeFlag
313 SetMarcUnicodeFlag($marc_record, $marc_flavour);
315 Set both the internal MARC::Record encoding flag
316 and the appropriate Leader/09 (MARC21) or
317 100/26-29 (UNIMARC) to indicate that the record
318 is in UTF-8. Note that this does B<not> do
319 any actual character conversion.
321 =cut
323 sub SetMarcUnicodeFlag {
324 my $marc_record = shift;
325 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
327 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
328 my $leader = $marc_record->leader();
329 substr($leader, 9, 1) = 'a';
330 $marc_record->leader($leader);
331 } elsif ($marc_flavour =~/UNIMARC/) {
332 require C4::Context;
333 my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
334 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
335 my $string;
336 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
337 $string=$marc_record->subfield( 100, "a" );
338 if (defined $string && length($string)==$subflength) {
339 $string = substr $string, 0,$subflength if (length($string)>$subflength);
341 else {
342 $string = POSIX::strftime( "%Y%m%d", localtime );
343 $string =~ s/\-//g;
344 $string = sprintf( "%-*s", $subflength, $string );
345 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
347 substr( $string, $encodingposition, 3, "y50" );
348 if ( $marc_record->subfield( 100, "a" ) ) {
349 $marc_record->field('100')->update(a=>$string);
351 else {
352 $marc_record->insert_grouped_field(
353 MARC::Field->new( 100, '', '', "a" => $string ) );
355 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
356 } else {
357 warn "Unrecognized marcflavour: $marc_flavour";
361 =head2 StripNonXmlChars
363 my $new_str = StripNonXmlChars($old_str);
365 Given a string, return a copy with the
366 characters that are illegal in XML
367 removed.
369 This function exists to work around a problem
370 that can occur with badly-encoded MARC records.
371 Specifically, if a UTF-8 MARC record also
372 has excape (\x1b) characters, MARC::File::XML
373 will let the escape characters pass through
374 when as_xml() or as_xml_record() is called. The
375 problem is that the escape character is not
376 legal in well-formed XML documents, so when
377 MARC::File::XML attempts to parse such a record,
378 the XML parser will fail.
380 Stripping such characters will allow a
381 MARC::Record->new_from_xml()
382 to work, at the possible risk of some data loss.
384 =cut
386 sub StripNonXmlChars {
387 my $str = shift;
388 if (!defined($str) || $str eq ""){
389 return "";
391 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
392 return $str;
397 =head2 nsb_clean
399 =over 4
401 nsb_clean($string);
403 =back
405 Removes Non Sorting Block characters
407 =cut
408 sub nsb_clean {
409 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
410 my $NSE = '\x89' ; # NSE : Non Sorting Block end
411 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
412 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
413 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
415 # handles non sorting blocks
416 my ($string) = @_ ;
417 $_ = $string ;
418 s/($C2){0,1}($NSB|$NSB2)//g ;
419 s/($C2){0,1}($NSE|$NSE2)//g ;
420 $string = $_ ;
422 return($string) ;
426 =head2 SanitizeRecord
428 SanitizeRecord($marcrecord);
430 Sanitize a record
431 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
432 It cleans any string with '&amp;amp;...', replacing it by '&'
434 =cut
436 sub SanitizeRecord {
437 my ( $record, $biblionumber ) = @_;
438 my $string;
439 my $record_modified = 0;
440 my $frameworkcode = C4::Biblio::GetFrameworkCode($biblionumber);
441 my ( $url_field, $url_subfield ) =
442 C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
443 foreach my $field ( $record->fields() ) {
444 if ( $field->is_control_field() ) {
445 my $value = $field->data();
446 my $sanitized_value = _clean_ampersand($value);
447 $record_modified = 1 if $sanitized_value ne $value;
448 $field->update($sanitized_value);
450 else {
451 my @subfields = $field->subfields();
452 my @new_subfields;
453 foreach my $subfield (@subfields) {
454 next
455 if $url_field eq $field->tag()
456 and $url_subfield eq $subfield->[0];
457 my $value = $subfield->[1];
458 my $sanitized_value = _clean_ampersand($value);
459 push @new_subfields, $subfield->[0] => $sanitized_value;
460 $record_modified = 1 if $sanitized_value ne $value;
462 if ( scalar(@new_subfields) > 0 ) {
463 my $new_field = eval {
464 MARC::Field->new(
465 $field->tag(), $field->indicator(1),
466 $field->indicator(2), @new_subfields
469 if ($@) {
470 warn "error : $@";
472 else {
473 $field->replace_with($new_field);
480 return $record, $record_modified;
483 sub _clean_ampersand {
484 my ($string) = @_;
485 $string =~ s/(&)(amp;)+/$1/g;
486 return $string;
489 =head1 INTERNAL FUNCTIONS
491 =head2 _default_marc21_charconv_to_utf8
493 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
495 Converts a C<MARC::Record> of unknown character set to UTF-8,
496 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
497 to UTF-8, then a default conversion that replaces each non-ASCII
498 character with the replacement character.
500 The C<$guessed_charset> return value contains the character set
501 that resulted in a conversion to valid UTF-8; note that
502 if the MARC-8 and ISO-8859-1 conversions failed, the value of
503 this is 'failed'.
505 =cut
507 sub _default_marc21_charconv_to_utf8 {
508 my $marc_record = shift;
509 my $marc_flavour = shift;
511 my $trial_marc8 = $marc_record->clone();
512 my @all_errors = ();
513 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
514 unless (@errors) {
515 return $trial_marc8, 'MARC-8', [];
517 push @all_errors, @errors;
519 my $trial_8859_1 = $marc_record->clone();
520 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
521 unless (@errors) {
522 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
523 # instead if we wanted to report details
524 # of the failed attempt at MARC-8 => UTF-8
526 push @all_errors, @errors;
528 my $default_converted = $marc_record->clone();
529 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
530 return $default_converted, 'failed', \@all_errors;
533 =head2 _default_unimarc_charconv_to_utf8
535 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
537 Converts a C<MARC::Record> of unknown character set to UTF-8,
538 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
539 to UTF-8, then a default conversion that replaces each non-ASCII
540 character with the replacement character.
542 The C<$guessed_charset> return value contains the character set
543 that resulted in a conversion to valid UTF-8; note that
544 if the MARC-8 and ISO-8859-1 conversions failed, the value of
545 this is 'failed'.
547 =cut
549 sub _default_unimarc_charconv_to_utf8 {
550 my $marc_record = shift;
551 my $marc_flavour = shift;
553 my $trial_marc8 = $marc_record->clone();
554 my @all_errors = ();
555 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
556 unless (@errors) {
557 return $trial_marc8, 'iso-5426';
559 push @all_errors, @errors;
561 my $trial_8859_1 = $marc_record->clone();
562 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
563 unless (@errors) {
564 return $trial_8859_1, 'iso-8859-1';
566 push @all_errors, @errors;
568 my $default_converted = $marc_record->clone();
569 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
570 return $default_converted, 'failed', \@all_errors;
573 =head2 _marc_marc8_to_utf8
575 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
577 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
578 If the conversion fails for some reason, an
579 appropriate messages will be placed in the returned
580 C<@errors> array.
582 =cut
584 sub _marc_marc8_to_utf8 {
585 my $marc_record = shift;
586 my $marc_flavour = shift;
588 my $prev_ignore = MARC::Charset->ignore_errors();
589 MARC::Charset->ignore_errors(1);
591 # trap warnings raised by MARC::Charset
592 my @errors = ();
593 local $SIG{__WARN__} = sub {
594 my $msg = $_[0];
595 if ($msg =~ /MARC.Charset/) {
596 # FIXME - purpose of this regexp is to strip out the
597 # line reference to MARC/Charset.pm, but as it
598 # exists probably won't work quite on Windows --
599 # some sort of minimal-bunch back-tracking RE
600 # would be helpful here
601 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
602 push @errors, $msg;
603 } else {
604 # if warning doesn't come from MARC::Charset, just
605 # pass it on
606 warn $msg;
610 foreach my $field ($marc_record->fields()) {
611 if ($field->is_control_field()) {
612 ; # do nothing -- control fields should not contain non-ASCII characters
613 } else {
614 my @converted_subfields;
615 foreach my $subfield ($field->subfields()) {
616 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
617 unless (IsStringUTF8ish($utf8sf)) {
618 # Because of a bug in MARC::Charset 0.98, if the string
619 # has (a) one or more diacritics that (b) are only in character positions
620 # 128 to 255 inclusive, the resulting converted string is not in
621 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
622 # occurs, upgrade the string in place. Moral of the story seems to be
623 # that pack("U", ...) is better than chr(...) if you need to guarantee
624 # that the resulting string is UTF-8.
625 $utf8sf = Encode::encode('UTF-8', $utf8sf);
627 push @converted_subfields, $subfield->[0], $utf8sf;
630 $field->replace_with(MARC::Field->new(
631 $field->tag(), $field->indicator(1), $field->indicator(2),
632 @converted_subfields)
637 MARC::Charset->ignore_errors($prev_ignore);
639 SetMarcUnicodeFlag($marc_record, $marc_flavour);
641 return @errors;
644 =head2 _marc_iso5426_to_utf8
646 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
648 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
649 If the conversion fails for some reason, an
650 appropriate messages will be placed in the returned
651 C<@errors> array.
653 FIXME - is ISO-5426 equivalent enough to MARC-8
654 that C<MARC::Charset> can be used instead?
656 =cut
658 sub _marc_iso5426_to_utf8 {
659 my $marc_record = shift;
660 my $marc_flavour = shift;
662 my @errors = ();
664 foreach my $field ($marc_record->fields()) {
665 if ($field->is_control_field()) {
666 ; # do nothing -- control fields should not contain non-ASCII characters
667 } else {
668 my @converted_subfields;
669 foreach my $subfield ($field->subfields()) {
670 my $utf8sf = char_decode5426($subfield->[1]);
671 push @converted_subfields, $subfield->[0], $utf8sf;
674 $field->replace_with(MARC::Field->new(
675 $field->tag(), $field->indicator(1), $field->indicator(2),
676 @converted_subfields)
681 SetMarcUnicodeFlag($marc_record, $marc_flavour);
683 return @errors;
686 =head2 _marc_to_utf8_via_text_iconv
688 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
690 Convert a C<MARC::Record> to UTF-8 in-place using the
691 C<Text::Iconv> CPAN module. Any source encoding accepted
692 by the user's iconv installation should work. If
693 the source encoding is not recognized on the user's
694 server or the conversion fails for some reason,
695 appropriate messages will be placed in the returned
696 C<@errors> array.
698 =cut
700 sub _marc_to_utf8_via_text_iconv {
701 my $marc_record = shift;
702 my $marc_flavour = shift;
703 my $source_encoding = shift;
705 my @errors = ();
706 my $decoder;
707 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
708 if ($@) {
709 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
710 return @errors;
713 my $prev_raise_error = Text::Iconv->raise_error();
714 Text::Iconv->raise_error(1);
716 foreach my $field ($marc_record->fields()) {
717 if ($field->is_control_field()) {
718 ; # do nothing -- control fields should not contain non-ASCII characters
719 } else {
720 my @converted_subfields;
721 foreach my $subfield ($field->subfields()) {
722 my $converted_value;
723 my $conversion_ok = 1;
724 eval { $converted_value = $decoder->convert($subfield->[1]); };
725 if ($@) {
726 $conversion_ok = 0;
727 push @errors, $@;
728 } elsif (not defined $converted_value) {
729 $conversion_ok = 0;
730 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
733 if ($conversion_ok) {
734 push @converted_subfields, $subfield->[0], $converted_value;
735 } else {
736 $converted_value = $subfield->[1];
737 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
738 push @converted_subfields, $subfield->[0], $converted_value;
742 $field->replace_with(MARC::Field->new(
743 $field->tag(), $field->indicator(1), $field->indicator(2),
744 @converted_subfields)
749 SetMarcUnicodeFlag($marc_record, $marc_flavour);
750 Text::Iconv->raise_error($prev_raise_error);
752 return @errors;
755 =head2 _marc_to_utf8_replacement_char
757 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
759 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
760 unsatisfactory method of replacing all non-ASCII (e.g.,
761 where the eight bit is set) octet with the Unicode
762 replacement character. This is meant as a last-ditch
763 method, and would be best used as part of a UI that
764 lets a cataloguer pick various character conversions
765 until they find the right one.
767 =cut
769 sub _marc_to_utf8_replacement_char {
770 my $marc_record = shift;
771 my $marc_flavour = shift;
773 foreach my $field ($marc_record->fields()) {
774 if ($field->is_control_field()) {
775 ; # do nothing -- control fields should not contain non-ASCII characters
776 } else {
777 my @converted_subfields;
778 foreach my $subfield ($field->subfields()) {
779 my $value = $subfield->[1];
780 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
781 push @converted_subfields, $subfield->[0], $value;
784 $field->replace_with(MARC::Field->new(
785 $field->tag(), $field->indicator(1), $field->indicator(2),
786 @converted_subfields)
791 SetMarcUnicodeFlag($marc_record, $marc_flavour);
794 =head2 char_decode5426
796 my $utf8string = char_decode5426($iso_5426_string);
798 Converts a string from ISO-5426 to UTF-8.
800 =cut
803 my %chars;
805 ####
806 ## 0xb
807 $chars{0xb0}=0x0101;#3/0ayn[ain]
808 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
809 #$chars{0xb2}=0x00e0;#'à';
810 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
811 #$chars{0xb3}=0x00e7;#'ç';
812 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
813 # $chars{0xb4}='è';
814 $chars{0xb4}=0x00e8;
815 # $chars{0xb5}='é';
816 $chars{0xb5}=0x00e9;
817 $chars{0xb6}=0x2021; # double dagger
818 $chars{0xb7}=0x00b7; # middle dot
819 $chars{0xb8}=0x2033; # double prime
820 $chars{0xb9}=0x2019; # right single quotation mark
821 $chars{0xba}=0x201d; # right double quotation mark
822 $chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
823 $chars{0xbc}=0x266f; # music sharp sign
824 $chars{0xbd}=0x02b9; # modifier letter prime
825 $chars{0xbe}=0x02ba; # modifier letter double prime
826 $chars{0xbf}=0x00bf; # inverted question mark
828 ####
829 ## 0xe
830 $chars{0xe1}=0x00c6; # latin capital letter ae
831 $chars{0xe2}=0x0110; # latin capital letter d with stroke
832 $chars{0xe6}=0x0132; # latin capital ligature ij
833 $chars{0xe8}=0x0141; # latin capital letter l with stroke
834 $chars{0xe9}=0x00d8; # latin capital letter o with stroke
835 $chars{0xea}=0x0152; # latin capital ligature oe
836 $chars{0xec}=0x00de; # latin capital letter thorn
838 ####
839 ## 0xf
840 $chars{0xf1}=0x00e6; # latin small letter ae
841 $chars{0xf2}=0x0111; # latin small letter d with stroke
842 $chars{0xf3}=0x00f0; # latin small letter eth
843 $chars{0xf5}=0x0131; # latin small letter dotless i
844 $chars{0xf6}=0x0133; # latin small ligature ij
845 $chars{0xf8}=0x0142; # latin small letter l with stroke
846 $chars{0xf9}=0x00f8; # latin small letter o with stroke
847 $chars{0xfa}=0x0153; # latin small ligature oe
848 $chars{0xfb}=0x00df; # latin small letter sharp s
849 $chars{0xfc}=0x00fe; # latin small letter thorn
851 ####
852 ## Others
853 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
854 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
855 #$chars{0x81d1}=0x00b0; # FIXME useless
857 ####
858 ## combined characters iso5426
860 $chars{0xc041}=0x1ea2; # capital a with hook above
861 $chars{0xc045}=0x1eba; # capital e with hook above
862 $chars{0xc049}=0x1ec8; # capital i with hook above
863 $chars{0xc04f}=0x1ece; # capital o with hook above
864 $chars{0xc055}=0x1ee6; # capital u with hook above
865 $chars{0xc059}=0x1ef6; # capital y with hook above
866 $chars{0xc061}=0x1ea3; # small a with hook above
867 $chars{0xc065}=0x1ebb; # small e with hook above
868 $chars{0xc069}=0x1ec9; # small i with hook above
869 $chars{0xc06f}=0x1ecf; # small o with hook above
870 $chars{0xc075}=0x1ee7; # small u with hook above
871 $chars{0xc079}=0x1ef7; # small y with hook above
873 # 4/1 grave accent
874 $chars{0xc141}=0x00c0; # capital a with grave accent
875 $chars{0xc145}=0x00c8; # capital e with grave accent
876 $chars{0xc149}=0x00cc; # capital i with grave accent
877 $chars{0xc14f}=0x00d2; # capital o with grave accent
878 $chars{0xc155}=0x00d9; # capital u with grave accent
879 $chars{0xc157}=0x1e80; # capital w with grave
880 $chars{0xc159}=0x1ef2; # capital y with grave
881 $chars{0xc161}=0x00e0; # small a with grave accent
882 $chars{0xc165}=0x00e8; # small e with grave accent
883 $chars{0xc169}=0x00ec; # small i with grave accent
884 $chars{0xc16f}=0x00f2; # small o with grave accent
885 $chars{0xc175}=0x00f9; # small u with grave accent
886 $chars{0xc177}=0x1e81; # small w with grave
887 $chars{0xc179}=0x1ef3; # small y with grave
888 # 4/2 acute accent
889 $chars{0xc241}=0x00c1; # capital a with acute accent
890 $chars{0xc243}=0x0106; # capital c with acute accent
891 $chars{0xc245}=0x00c9; # capital e with acute accent
892 $chars{0xc247}=0x01f4; # capital g with acute
893 $chars{0xc249}=0x00cd; # capital i with acute accent
894 $chars{0xc24b}=0x1e30; # capital k with acute
895 $chars{0xc24c}=0x0139; # capital l with acute accent
896 $chars{0xc24d}=0x1e3e; # capital m with acute
897 $chars{0xc24e}=0x0143; # capital n with acute accent
898 $chars{0xc24f}=0x00d3; # capital o with acute accent
899 $chars{0xc250}=0x1e54; # capital p with acute
900 $chars{0xc252}=0x0154; # capital r with acute accent
901 $chars{0xc253}=0x015a; # capital s with acute accent
902 $chars{0xc255}=0x00da; # capital u with acute accent
903 $chars{0xc257}=0x1e82; # capital w with acute
904 $chars{0xc259}=0x00dd; # capital y with acute accent
905 $chars{0xc25a}=0x0179; # capital z with acute accent
906 $chars{0xc261}=0x00e1; # small a with acute accent
907 $chars{0xc263}=0x0107; # small c with acute accent
908 $chars{0xc265}=0x00e9; # small e with acute accent
909 $chars{0xc267}=0x01f5; # small g with acute
910 $chars{0xc269}=0x00ed; # small i with acute accent
911 $chars{0xc26b}=0x1e31; # small k with acute
912 $chars{0xc26c}=0x013a; # small l with acute accent
913 $chars{0xc26d}=0x1e3f; # small m with acute
914 $chars{0xc26e}=0x0144; # small n with acute accent
915 $chars{0xc26f}=0x00f3; # small o with acute accent
916 $chars{0xc270}=0x1e55; # small p with acute
917 $chars{0xc272}=0x0155; # small r with acute accent
918 $chars{0xc273}=0x015b; # small s with acute accent
919 $chars{0xc275}=0x00fa; # small u with acute accent
920 $chars{0xc277}=0x1e83; # small w with acute
921 $chars{0xc279}=0x00fd; # small y with acute accent
922 $chars{0xc27a}=0x017a; # small z with acute accent
923 $chars{0xc2e1}=0x01fc; # capital ae with acute
924 $chars{0xc2f1}=0x01fd; # small ae with acute
925 # 4/3 circumflex accent
926 $chars{0xc341}=0x00c2; # capital a with circumflex accent
927 $chars{0xc343}=0x0108; # capital c with circumflex
928 $chars{0xc345}=0x00ca; # capital e with circumflex accent
929 $chars{0xc347}=0x011c; # capital g with circumflex
930 $chars{0xc348}=0x0124; # capital h with circumflex
931 $chars{0xc349}=0x00ce; # capital i with circumflex accent
932 $chars{0xc34a}=0x0134; # capital j with circumflex
933 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
934 $chars{0xc353}=0x015c; # capital s with circumflex
935 $chars{0xc355}=0x00db; # capital u with circumflex
936 $chars{0xc357}=0x0174; # capital w with circumflex
937 $chars{0xc359}=0x0176; # capital y with circumflex
938 $chars{0xc35a}=0x1e90; # capital z with circumflex
939 $chars{0xc361}=0x00e2; # small a with circumflex accent
940 $chars{0xc363}=0x0109; # small c with circumflex
941 $chars{0xc365}=0x00ea; # small e with circumflex accent
942 $chars{0xc367}=0x011d; # small g with circumflex
943 $chars{0xc368}=0x0125; # small h with circumflex
944 $chars{0xc369}=0x00ee; # small i with circumflex accent
945 $chars{0xc36a}=0x0135; # small j with circumflex
946 $chars{0xc36e}=0x00f1; # small n with tilde
947 $chars{0xc36f}=0x00f4; # small o with circumflex accent
948 $chars{0xc373}=0x015d; # small s with circumflex
949 $chars{0xc375}=0x00fb; # small u with circumflex
950 $chars{0xc377}=0x0175; # small w with circumflex
951 $chars{0xc379}=0x0177; # small y with circumflex
952 $chars{0xc37a}=0x1e91; # small z with circumflex
953 # 4/4 tilde
954 $chars{0xc441}=0x00c3; # capital a with tilde
955 $chars{0xc445}=0x1ebc; # capital e with tilde
956 $chars{0xc449}=0x0128; # capital i with tilde
957 $chars{0xc44e}=0x00d1; # capital n with tilde
958 $chars{0xc44f}=0x00d5; # capital o with tilde
959 $chars{0xc455}=0x0168; # capital u with tilde
960 $chars{0xc456}=0x1e7c; # capital v with tilde
961 $chars{0xc459}=0x1ef8; # capital y with tilde
962 $chars{0xc461}=0x00e3; # small a with tilde
963 $chars{0xc465}=0x1ebd; # small e with tilde
964 $chars{0xc469}=0x0129; # small i with tilde
965 $chars{0xc46e}=0x00f1; # small n with tilde
966 $chars{0xc46f}=0x00f5; # small o with tilde
967 $chars{0xc475}=0x0169; # small u with tilde
968 $chars{0xc476}=0x1e7d; # small v with tilde
969 $chars{0xc479}=0x1ef9; # small y with tilde
970 # 4/5 macron
971 $chars{0xc541}=0x0100; # capital a with macron
972 $chars{0xc545}=0x0112; # capital e with macron
973 $chars{0xc547}=0x1e20; # capital g with macron
974 $chars{0xc549}=0x012a; # capital i with macron
975 $chars{0xc54f}=0x014c; # capital o with macron
976 $chars{0xc555}=0x016a; # capital u with macron
977 $chars{0xc561}=0x0101; # small a with macron
978 $chars{0xc565}=0x0113; # small e with macron
979 $chars{0xc567}=0x1e21; # small g with macron
980 $chars{0xc569}=0x012b; # small i with macron
981 $chars{0xc56f}=0x014d; # small o with macron
982 $chars{0xc575}=0x016b; # small u with macron
983 $chars{0xc572}=0x0159; # small r with macron
984 $chars{0xc5e1}=0x01e2; # capital ae with macron
985 $chars{0xc5f1}=0x01e3; # small ae with macron
986 # 4/6 breve
987 $chars{0xc641}=0x0102; # capital a with breve
988 $chars{0xc645}=0x0114; # capital e with breve
989 $chars{0xc647}=0x011e; # capital g with breve
990 $chars{0xc649}=0x012c; # capital i with breve
991 $chars{0xc64f}=0x014e; # capital o with breve
992 $chars{0xc655}=0x016c; # capital u with breve
993 $chars{0xc661}=0x0103; # small a with breve
994 $chars{0xc665}=0x0115; # small e with breve
995 $chars{0xc667}=0x011f; # small g with breve
996 $chars{0xc669}=0x012d; # small i with breve
997 $chars{0xc66f}=0x014f; # small o with breve
998 $chars{0xc675}=0x016d; # small u with breve
999 # 4/7 dot above
1000 $chars{0xc7b0}=0x01e1; # Ain with dot above
1001 $chars{0xc742}=0x1e02; # capital b with dot above
1002 $chars{0xc743}=0x010a; # capital c with dot above
1003 $chars{0xc744}=0x1e0a; # capital d with dot above
1004 $chars{0xc745}=0x0116; # capital e with dot above
1005 $chars{0xc746}=0x1e1e; # capital f with dot above
1006 $chars{0xc747}=0x0120; # capital g with dot above
1007 $chars{0xc748}=0x1e22; # capital h with dot above
1008 $chars{0xc749}=0x0130; # capital i with dot above
1009 $chars{0xc74d}=0x1e40; # capital m with dot above
1010 $chars{0xc74e}=0x1e44; # capital n with dot above
1011 $chars{0xc750}=0x1e56; # capital p with dot above
1012 $chars{0xc752}=0x1e58; # capital r with dot above
1013 $chars{0xc753}=0x1e60; # capital s with dot above
1014 $chars{0xc754}=0x1e6a; # capital t with dot above
1015 $chars{0xc757}=0x1e86; # capital w with dot above
1016 $chars{0xc758}=0x1e8a; # capital x with dot above
1017 $chars{0xc759}=0x1e8e; # capital y with dot above
1018 $chars{0xc75a}=0x017b; # capital z with dot above
1019 $chars{0xc761}=0x0227; # small b with dot above
1020 $chars{0xc762}=0x1e03; # small b with dot above
1021 $chars{0xc763}=0x010b; # small c with dot above
1022 $chars{0xc764}=0x1e0b; # small d with dot above
1023 $chars{0xc765}=0x0117; # small e with dot above
1024 $chars{0xc766}=0x1e1f; # small f with dot above
1025 $chars{0xc767}=0x0121; # small g with dot above
1026 $chars{0xc768}=0x1e23; # small h with dot above
1027 $chars{0xc76d}=0x1e41; # small m with dot above
1028 $chars{0xc76e}=0x1e45; # small n with dot above
1029 $chars{0xc770}=0x1e57; # small p with dot above
1030 $chars{0xc772}=0x1e59; # small r with dot above
1031 $chars{0xc773}=0x1e61; # small s with dot above
1032 $chars{0xc774}=0x1e6b; # small t with dot above
1033 $chars{0xc777}=0x1e87; # small w with dot above
1034 $chars{0xc778}=0x1e8b; # small x with dot above
1035 $chars{0xc779}=0x1e8f; # small y with dot above
1036 $chars{0xc77a}=0x017c; # small z with dot above
1037 # 4/8 trema, diaresis
1038 $chars{0xc820}=0x00a8; # diaeresis
1039 $chars{0xc841}=0x00c4; # capital a with diaeresis
1040 $chars{0xc845}=0x00cb; # capital e with diaeresis
1041 $chars{0xc848}=0x1e26; # capital h with diaeresis
1042 $chars{0xc849}=0x00cf; # capital i with diaeresis
1043 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1044 $chars{0xc855}=0x00dc; # capital u with diaeresis
1045 $chars{0xc857}=0x1e84; # capital w with diaeresis
1046 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1047 $chars{0xc859}=0x0178; # capital y with diaeresis
1048 $chars{0xc861}=0x00e4; # small a with diaeresis
1049 $chars{0xc865}=0x00eb; # small e with diaeresis
1050 $chars{0xc868}=0x1e27; # small h with diaeresis
1051 $chars{0xc869}=0x00ef; # small i with diaeresis
1052 $chars{0xc86f}=0x00f6; # small o with diaeresis
1053 $chars{0xc874}=0x1e97; # small t with diaeresis
1054 $chars{0xc875}=0x00fc; # small u with diaeresis
1055 $chars{0xc877}=0x1e85; # small w with diaeresis
1056 $chars{0xc878}=0x1e8d; # small x with diaeresis
1057 $chars{0xc879}=0x00ff; # small y with diaeresis
1058 # 4/9 umlaut
1059 $chars{0xc920}=0x00a8; # [diaeresis]
1060 $chars{0xc961}=0x00e4; # a with umlaut
1061 $chars{0xc965}=0x00eb; # e with umlaut
1062 $chars{0xc969}=0x00ef; # i with umlaut
1063 $chars{0xc96f}=0x00f6; # o with umlaut
1064 $chars{0xc975}=0x00fc; # u with umlaut
1065 # 4/10 circle above
1066 $chars{0xca41}=0x00c5; # capital a with ring above
1067 $chars{0xcaad}=0x016e; # capital u with ring above
1068 $chars{0xca61}=0x00e5; # small a with ring above
1069 $chars{0xca75}=0x016f; # small u with ring above
1070 $chars{0xca77}=0x1e98; # small w with ring above
1071 $chars{0xca79}=0x1e99; # small y with ring above
1072 # 4/11 high comma off centre
1073 # 4/12 inverted high comma centred
1074 # 4/13 double acute accent
1075 $chars{0xcd4f}=0x0150; # capital o with double acute
1076 $chars{0xcd55}=0x0170; # capital u with double acute
1077 $chars{0xcd6f}=0x0151; # small o with double acute
1078 $chars{0xcd75}=0x0171; # small u with double acute
1079 # 4/14 horn
1080 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1081 $chars{0xce55}=0x01af; # latin capital letter u with horn
1082 $chars{0xce74}=0x01a1; # latin small letter o with horn
1083 $chars{0xce75}=0x01b0; # latin small letter u with horn
1084 # 4/15 caron (hacek
1085 $chars{0xcf41}=0x01cd; # capital a with caron
1086 $chars{0xcf43}=0x010c; # capital c with caron
1087 $chars{0xcf44}=0x010e; # capital d with caron
1088 $chars{0xcf45}=0x011a; # capital e with caron
1089 $chars{0xcf47}=0x01e6; # capital g with caron
1090 $chars{0xcf49}=0x01cf; # capital i with caron
1091 $chars{0xcf4b}=0x01e8; # capital k with caron
1092 $chars{0xcf4c}=0x013d; # capital l with caron
1093 $chars{0xcf4e}=0x0147; # capital n with caron
1094 $chars{0xcf4f}=0x01d1; # capital o with caron
1095 $chars{0xcf52}=0x0158; # capital r with caron
1096 $chars{0xcf53}=0x0160; # capital s with caron
1097 $chars{0xcf54}=0x0164; # capital t with caron
1098 $chars{0xcf55}=0x01d3; # capital u with caron
1099 $chars{0xcf5a}=0x017d; # capital z with caron
1100 $chars{0xcf61}=0x01ce; # small a with caron
1101 $chars{0xcf63}=0x010d; # small c with caron
1102 $chars{0xcf64}=0x010f; # small d with caron
1103 $chars{0xcf65}=0x011b; # small e with caron
1104 $chars{0xcf67}=0x01e7; # small g with caron
1105 $chars{0xcf69}=0x01d0; # small i with caron
1106 $chars{0xcf6a}=0x01f0; # small j with caron
1107 $chars{0xcf6b}=0x01e9; # small k with caron
1108 $chars{0xcf6c}=0x013e; # small l with caron
1109 $chars{0xcf6e}=0x0148; # small n with caron
1110 $chars{0xcf6f}=0x01d2; # small o with caron
1111 $chars{0xcf72}=0x0159; # small r with caron
1112 $chars{0xcf73}=0x0161; # small s with caron
1113 $chars{0xcf74}=0x0165; # small t with caron
1114 $chars{0xcf75}=0x01d4; # small u with caron
1115 $chars{0xcf7a}=0x017e; # small z with caron
1116 # 5/0 cedilla
1117 $chars{0xd020}=0x00b8; # cedilla
1118 $chars{0xd043}=0x00c7; # capital c with cedilla
1119 $chars{0xd044}=0x1e10; # capital d with cedilla
1120 $chars{0xd047}=0x0122; # capital g with cedilla
1121 $chars{0xd048}=0x1e28; # capital h with cedilla
1122 $chars{0xd04b}=0x0136; # capital k with cedilla
1123 $chars{0xd04c}=0x013b; # capital l with cedilla
1124 $chars{0xd04e}=0x0145; # capital n with cedilla
1125 $chars{0xd052}=0x0156; # capital r with cedilla
1126 $chars{0xd053}=0x015e; # capital s with cedilla
1127 $chars{0xd054}=0x0162; # capital t with cedilla
1128 $chars{0xd063}=0x00e7; # small c with cedilla
1129 $chars{0xd064}=0x1e11; # small d with cedilla
1130 $chars{0xd065}=0x0119; # small e with cedilla
1131 $chars{0xd067}=0x0123; # small g with cedilla
1132 $chars{0xd068}=0x1e29; # small h with cedilla
1133 $chars{0xd06b}=0x0137; # small k with cedilla
1134 $chars{0xd06c}=0x013c; # small l with cedilla
1135 $chars{0xd06e}=0x0146; # small n with cedilla
1136 $chars{0xd072}=0x0157; # small r with cedilla
1137 $chars{0xd073}=0x015f; # small s with cedilla
1138 $chars{0xd074}=0x0163; # small t with cedilla
1139 # 5/1 rude
1140 # 5/2 hook to left
1141 # 5/3 ogonek (hook to right
1142 $chars{0xd320}=0x02db; # ogonek
1143 $chars{0xd341}=0x0104; # capital a with ogonek
1144 $chars{0xd345}=0x0118; # capital e with ogonek
1145 $chars{0xd349}=0x012e; # capital i with ogonek
1146 $chars{0xd34f}=0x01ea; # capital o with ogonek
1147 $chars{0xd355}=0x0172; # capital u with ogonek
1148 $chars{0xd361}=0x0105; # small a with ogonek
1149 $chars{0xd365}=0x0119; # small e with ogonek
1150 $chars{0xd369}=0x012f; # small i with ogonek
1151 $chars{0xd36f}=0x01eb; # small o with ogonek
1152 $chars{0xd375}=0x0173; # small u with ogonek
1153 # 5/4 circle below
1154 $chars{0xd441}=0x1e00; # capital a with ring below
1155 $chars{0xd461}=0x1e01; # small a with ring below
1156 # 5/5 half circle below
1157 $chars{0xd548}=0x1e2a; # capital h with breve below
1158 $chars{0xd568}=0x1e2b; # small h with breve below
1159 # 5/6 dot below
1160 $chars{0xd641}=0x1ea0; # capital a with dot below
1161 $chars{0xd642}=0x1e04; # capital b with dot below
1162 $chars{0xd644}=0x1e0c; # capital d with dot below
1163 $chars{0xd645}=0x1eb8; # capital e with dot below
1164 $chars{0xd648}=0x1e24; # capital h with dot below
1165 $chars{0xd649}=0x1eca; # capital i with dot below
1166 $chars{0xd64b}=0x1e32; # capital k with dot below
1167 $chars{0xd64c}=0x1e36; # capital l with dot below
1168 $chars{0xd64d}=0x1e42; # capital m with dot below
1169 $chars{0xd64e}=0x1e46; # capital n with dot below
1170 $chars{0xd64f}=0x1ecc; # capital o with dot below
1171 $chars{0xd652}=0x1e5a; # capital r with dot below
1172 $chars{0xd653}=0x1e62; # capital s with dot below
1173 $chars{0xd654}=0x1e6c; # capital t with dot below
1174 $chars{0xd655}=0x1ee4; # capital u with dot below
1175 $chars{0xd656}=0x1e7e; # capital v with dot below
1176 $chars{0xd657}=0x1e88; # capital w with dot below
1177 $chars{0xd659}=0x1ef4; # capital y with dot below
1178 $chars{0xd65a}=0x1e92; # capital z with dot below
1179 $chars{0xd661}=0x1ea1; # small a with dot below
1180 $chars{0xd662}=0x1e05; # small b with dot below
1181 $chars{0xd664}=0x1e0d; # small d with dot below
1182 $chars{0xd665}=0x1eb9; # small e with dot below
1183 $chars{0xd668}=0x1e25; # small h with dot below
1184 $chars{0xd669}=0x1ecb; # small i with dot below
1185 $chars{0xd66b}=0x1e33; # small k with dot below
1186 $chars{0xd66c}=0x1e37; # small l with dot below
1187 $chars{0xd66d}=0x1e43; # small m with dot below
1188 $chars{0xd66e}=0x1e47; # small n with dot below
1189 $chars{0xd66f}=0x1ecd; # small o with dot below
1190 $chars{0xd672}=0x1e5b; # small r with dot below
1191 $chars{0xd673}=0x1e63; # small s with dot below
1192 $chars{0xd674}=0x1e6d; # small t with dot below
1193 $chars{0xd675}=0x1ee5; # small u with dot below
1194 $chars{0xd676}=0x1e7f; # small v with dot below
1195 $chars{0xd677}=0x1e89; # small w with dot below
1196 $chars{0xd679}=0x1ef5; # small y with dot below
1197 $chars{0xd67a}=0x1e93; # small z with dot below
1198 # 5/7 double dot below
1199 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1200 $chars{0xd775}=0x1e73; # small u with diaeresis below
1201 # 5/8 underline
1202 $chars{0xd820}=0x005f; # underline
1203 # 5/9 double underline
1204 $chars{0xd920}=0x2017; # double underline
1205 # 5/10 small low vertical bar
1206 $chars{0xda20}=0x02cc; #
1207 # 5/11 circumflex below
1208 # 5/12 (this position shall not be used)
1209 # 5/13 left half of ligature sign and of double tilde
1210 # 5/14 right half of ligature sign
1211 # 5/15 right half of double tilde
1212 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1214 sub char_decode5426 {
1215 my ( $string) = @_;
1216 my $result;
1218 my @data = unpack("C*", $string);
1219 my @characters;
1220 my $length=scalar(@data);
1221 for (my $i = 0; $i < scalar(@data); $i++) {
1222 my $char= $data[$i];
1223 if ($char >= 0x00 && $char <= 0x7F){
1224 #IsAscii
1226 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1227 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1228 #Combined Char
1229 my $convchar ;
1230 if ($chars{$char*256+$data[$i+1]}) {
1231 $convchar= $chars{$char * 256 + $data[$i+1]};
1232 $i++;
1233 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1234 } elsif ($chars{$char}) {
1235 $convchar= $chars{$char};
1236 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1237 }else {
1238 $convchar=$char;
1240 push @characters,$convchar;
1241 } else {
1242 my $convchar;
1243 if ($chars{$char}) {
1244 $convchar= $chars{$char};
1245 # printf "char %x, converted %x\n",$char,$chars{$char};
1246 }else {
1247 # printf "char %x $char\n",$char;
1248 $convchar=$char;
1250 push @characters,$convchar;
1253 $result=pack "U*",@characters;
1254 # $result=~s/\x01//;
1255 # $result=~s/\x00//;
1256 $result=~s/\x0f//;
1257 $result=~s/\x1b.//;
1258 $result=~s/\x0e//;
1259 $result=~s/\x1b\x5b//;
1260 # map{printf "%x",$_} @characters;
1261 # printf "\n";
1262 return $result;
1268 =head1 AUTHOR
1270 Koha Development Team <http://koha-community.org/>
1272 Galen Charlton <galen.charlton@liblime.com>
1274 =cut