Update README.Debian for koha-common with post-install setup.
[koha.git] / C4 / Charset.pm
blob2f1d123336ae12f4c02a87edbf30a8dd252f922d
1 package C4::Charset;
3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 use strict;
21 use warnings;
23 use MARC::Charset qw/marc8_to_utf8/;
24 use Text::Iconv;
25 use C4::Debug;
26 use Unicode::Normalize;
28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
30 BEGIN {
31 # set the version for version checking
32 $VERSION = 3.01;
33 require Exporter;
34 @ISA = qw(Exporter);
35 @EXPORT = qw(
36 IsStringUTF8ish
37 MarcToUTF8Record
38 SetUTF8Flag
39 SetMarcUnicodeFlag
40 StripNonXmlChars
44 =head1 NAME
46 C4::Charset - utilities for handling character set conversions.
48 =head1 SYNOPSIS
50 use C4::Charset;
52 =head1 DESCRIPTION
54 This module contains routines for dealing with character set
55 conversions, particularly for MARC records.
57 A variety of character encodings are in use by various MARC
58 standards, and even more character encodings are used by
59 non-standard MARC records. The various MARC formats generally
60 do not do a good job of advertising a given record's character
61 encoding, and even when a record does advertise its encoding,
62 e.g., via the Leader/09, experience has shown that one cannot
63 trust it.
65 Ultimately, all MARC records are stored in Koha in UTF-8 and
66 must be converted from whatever the source character encoding is.
67 The goal of this module is to ensure that these conversions
68 take place accurately. When a character conversion cannot take
69 place, or at least not accurately, the module was provide
70 enough information to allow user-facing code to inform the user
71 on how to deal with the situation.
73 =cut
75 =head1 FUNCTIONS
77 =head2 IsStringUTF8ish
79 =over 4
81 my $is_utf8 = IsStringUTF8ish($str);
83 =back
85 Determines if C<$str> is valid UTF-8. This can mean
86 one of two things:
88 =over 2
90 =item *
92 The Perl UTF-8 flag is set and the string contains valid UTF-8.
94 =item *
96 The Perl UTF-8 flag is B<not> set, but the octets contain
97 valid UTF-8.
99 =back
101 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
102 because in one could be presented with a MARC blob that is
103 not actually in UTF-8 but whose sequence of octets appears to be
104 valid UTF-8. The rest of the MARC character conversion functions
105 will assume that this situation occur does not very often.
107 =cut
109 sub IsStringUTF8ish {
110 my $str = shift;
112 return 1 if utf8::is_utf8($str);
113 return utf8::decode($str);
116 =head2 SetUTF8Flag
118 =over 4
120 my $marc_record = SetUTF8Flag($marc_record);
122 =back
124 This function sets the PERL UTF8 flag for data.
125 It is required when using new_from_usmarc
126 since MARC::File::USMARC does not handle PERL UTF8 setting.
127 When editing unicode marc records fields and subfields, you
128 would end up in double encoding without using this function.
130 FIXME
131 In my opinion, this function belongs to MARC::Record and not
132 to this package.
133 But since it handles charset, and MARC::Record, it finds its way in that package
135 =cut
137 sub SetUTF8Flag{
138 my ($record)=@_;
139 return unless ($record && $record->fields());
140 foreach my $field ($record->fields()){
141 if ($field->tag()>=10){
142 my @subfields;
143 foreach my $subfield ($field->subfields()){
144 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
146 my $newfield=MARC::Field->new(
147 $field->tag(),
148 $field->indicator(1),
149 $field->indicator(2),
150 @subfields
152 $field->replace_with($newfield);
157 =head2 NormalizeString
159 =over 4
161 my $normalized_string=NormalizeString($string);
163 =back
165 Given
166 a string
167 nfc : If you want to set NFC and not NFD
168 transform : If you expect all the signs to be removed
169 Sets the PERL UTF8 Flag on your initial data if need be
170 and applies cleaning if required
172 Returns a utf8 NFD normalized string
174 Sample code :
175 my $string=NormalizeString ("l'ornithoptère");
176 #results into ornithoptère in NFD form and sets UTF8 Flag
177 =cut
179 sub NormalizeString{
180 my ($string,$nfc,$transform)=@_;
181 utf8::decode($string) unless (utf8::is_utf8($string));
182 if ($nfc){
183 $string= NFD($string);
185 else {
186 $string=NFC($string);
188 if ($transform){
189 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
190 #removing one letter words "d'" "l'" was changed into "d " "l "
191 $string=~s/\b\S\b//g;
192 $string=~s/\s+$//g;
194 return $string;
197 =head2 MarcToUTF8Record
199 =over 4
201 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
203 =back
205 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
206 optional source encoding, return a C<MARC::Record> that is
207 converted to UTF-8.
209 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
210 is not guaranteed to have been converted correctly. Specifically,
211 if C<$converted_from> is 'failed', the MARC record returned failed
212 character conversion and had each of its non-ASCII octets changed
213 to the Unicode replacement character.
215 If the source encoding was not specified, this routine will
216 try to guess it; the character encoding used for a successful
217 conversion is returned in C<$converted_from>.
219 =cut
221 sub MarcToUTF8Record {
222 my $marc = shift;
223 my $marc_flavour = shift;
224 my $source_encoding = shift;
225 my $marc_record;
226 my $marc_blob_is_utf8 = 0;
227 if (ref($marc) eq 'MARC::Record') {
228 my $marc_blob = $marc->as_usmarc();
229 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
230 $marc_record = $marc;
231 } else {
232 # dealing with a MARC blob
234 # remove any ersatz whitespace from the beginning and
235 # end of the MARC blob -- these can creep into MARC
236 # files produced by several sources -- caller really
237 # should be doing this, however
238 $marc =~ s/^\s+//;
239 $marc =~ s/\s+$//;
240 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
241 eval {
242 $marc_record = MARC::Record->new_from_usmarc($marc);
244 if ($@) {
245 # if we fail the first time, one likely problem
246 # is that we have a MARC21 record that says that it's
247 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
248 # We'll try parsing it again.
249 substr($marc, 9, 1) = ' ';
250 eval {
251 $marc_record = MARC::Record->new_from_usmarc($marc);
253 if ($@) {
254 # it's hopeless; return an empty MARC::Record
255 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
260 # If we do not know the source encoding, try some guesses
261 # as follows:
262 # 1. Record is UTF-8 already.
263 # 2. If MARC flavor is MARC21, then
264 # a. record is MARC-8
265 # b. record is ISO-8859-1
266 # 3. If MARC flavor is UNIMARC, then
267 if (not defined $source_encoding) {
268 if ($marc_blob_is_utf8) {
269 # note that for MARC21 we are not bothering to check
270 # if the Leader/09 is set to 'a' or not -- because
271 # of problems with various ILSs (including Koha in the
272 # past, alas), this just is not trustworthy.
273 SetMarcUnicodeFlag($marc_record, $marc_flavour);
274 return $marc_record, 'UTF-8', [];
275 } else {
276 if ($marc_flavour eq 'MARC21') {
277 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
278 } elsif ($marc_flavour =~/UNIMARC/) {
279 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
280 } else {
281 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
284 } else {
285 # caller knows the character encoding
286 my $original_marc_record = $marc_record->clone();
287 my @errors;
288 if ($source_encoding =~ /utf-?8/i) {
289 if ($marc_blob_is_utf8) {
290 SetMarcUnicodeFlag($marc_record, $marc_flavour);
291 return $marc_record, 'UTF-8', [];
292 } else {
293 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
295 } elsif ($source_encoding =~ /marc-?8/i) {
296 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
297 } elsif ($source_encoding =~ /5426/) {
298 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
299 } else {
300 # assume any other character encoding is for Text::Iconv
301 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
304 if (@errors) {
305 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
306 return $original_marc_record, 'failed', \@errors;
307 } else {
308 return $marc_record, $source_encoding, [];
314 =head2 SetMarcUnicodeFlag
316 =over 4
318 SetMarcUnicodeFlag($marc_record, $marc_flavour);
320 =back
322 Set both the internal MARC::Record encoding flag
323 and the appropriate Leader/09 (MARC21) or
324 100/26-29 (UNIMARC) to indicate that the record
325 is in UTF-8. Note that this does B<not> do
326 any actual character conversion.
328 =cut
330 sub SetMarcUnicodeFlag {
331 my $marc_record = shift;
332 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
334 $marc_record->encoding('UTF-8');
335 if ($marc_flavour eq 'MARC21') {
336 my $leader = $marc_record->leader();
337 substr($leader, 9, 1) = 'a';
338 $marc_record->leader($leader);
339 } elsif ($marc_flavour =~/UNIMARC/) {
340 my $string;
341 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
342 $string=$marc_record->subfield( 100, "a" );
343 if (defined $string && length($string)==$subflength) {
344 $string = substr $string, 0,$subflength if (length($string)>$subflength);
346 else {
347 $string = POSIX::strftime( "%Y%m%d", localtime );
348 $string =~ s/\-//g;
349 $string = sprintf( "%-*s", $subflength, $string );
351 substr( $string, $encodingposition, 8, "frey50 " );
352 if ( $marc_record->subfield( 100, "a" ) ) {
353 $marc_record->field('100')->update(a=>$string);
355 else {
356 $marc_record->insert_grouped_field(
357 MARC::Field->new( 100, '', '', "a" => $string ) );
359 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
360 } else {
361 warn "Unrecognized marcflavour: $marc_flavour";
365 =head2 StripNonXmlChars
367 =over 4
369 my $new_str = StripNonXmlChars($old_str);
371 =back
373 Given a string, return a copy with the
374 characters that are illegal in XML
375 removed.
377 This function exists to work around a problem
378 that can occur with badly-encoded MARC records.
379 Specifically, if a UTF-8 MARC record also
380 has excape (\x1b) characters, MARC::File::XML
381 will let the escape characters pass through
382 when as_xml() or as_xml_record() is called. The
383 problem is that the escape character is not
384 legal in well-formed XML documents, so when
385 MARC::File::XML attempts to parse such a record,
386 the XML parser will fail.
388 Stripping such characters will allow a
389 MARC::Record->new_from_xml()
390 to work, at the possible risk of some data loss.
392 =cut
394 sub StripNonXmlChars {
395 my $str = shift;
396 if (!defined($str) || $str eq ""){
397 return "";
399 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
400 return $str;
403 =head1 INTERNAL FUNCTIONS
405 =head2 _default_marc21_charconv_to_utf8
407 =over 4
409 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
411 =back
413 Converts a C<MARC::Record> of unknown character set to UTF-8,
414 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
415 to UTF-8, then a default conversion that replaces each non-ASCII
416 character with the replacement character.
418 The C<$guessed_charset> return value contains the character set
419 that resulted in a conversion to valid UTF-8; note that
420 if the MARC-8 and ISO-8859-1 conversions failed, the value of
421 this is 'failed'.
423 =cut
425 sub _default_marc21_charconv_to_utf8 {
426 my $marc_record = shift;
427 my $marc_flavour = shift;
429 my $trial_marc8 = $marc_record->clone();
430 my @all_errors = ();
431 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
432 unless (@errors) {
433 return $trial_marc8, 'MARC-8', [];
435 push @all_errors, @errors;
437 my $trial_8859_1 = $marc_record->clone();
438 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
439 unless (@errors) {
440 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
441 # instead if we wanted to report details
442 # of the failed attempt at MARC-8 => UTF-8
444 push @all_errors, @errors;
446 my $default_converted = $marc_record->clone();
447 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
448 return $default_converted, 'failed', \@all_errors;
451 =head2 _default_unimarc_charconv_to_utf8
453 =over 4
455 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
457 =back
459 Converts a C<MARC::Record> of unknown character set to UTF-8,
460 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
461 to UTF-8, then a default conversion that replaces each non-ASCII
462 character with the replacement character.
464 The C<$guessed_charset> return value contains the character set
465 that resulted in a conversion to valid UTF-8; note that
466 if the MARC-8 and ISO-8859-1 conversions failed, the value of
467 this is 'failed'.
469 =cut
471 sub _default_unimarc_charconv_to_utf8 {
472 my $marc_record = shift;
473 my $marc_flavour = shift;
475 my $trial_marc8 = $marc_record->clone();
476 my @all_errors = ();
477 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
478 unless (@errors) {
479 return $trial_marc8, 'iso-5426';
481 push @all_errors, @errors;
483 my $trial_8859_1 = $marc_record->clone();
484 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
485 unless (@errors) {
486 return $trial_8859_1, 'iso-8859-1';
488 push @all_errors, @errors;
490 my $default_converted = $marc_record->clone();
491 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
492 return $default_converted, 'failed', \@all_errors;
495 =head2 _marc_marc8_to_utf8
497 =over 4
499 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
501 =back
503 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
504 If the conversion fails for some reason, an
505 appropriate messages will be placed in the returned
506 C<@errors> array.
508 =cut
510 sub _marc_marc8_to_utf8 {
511 my $marc_record = shift;
512 my $marc_flavour = shift;
514 my $prev_ignore = MARC::Charset->ignore_errors();
515 MARC::Charset->ignore_errors(1);
517 # trap warnings raised by MARC::Charset
518 my @errors = ();
519 local $SIG{__WARN__} = sub {
520 my $msg = $_[0];
521 if ($msg =~ /MARC.Charset/) {
522 # FIXME - purpose of this regexp is to strip out the
523 # line reference to MARC/Charset.pm, but as it
524 # exists probably won't work quite on Windows --
525 # some sort of minimal-bunch back-tracking RE
526 # would be helpful here
527 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
528 push @errors, $msg;
529 } else {
530 # if warning doesn't come from MARC::Charset, just
531 # pass it on
532 warn $msg;
536 foreach my $field ($marc_record->fields()) {
537 if ($field->is_control_field()) {
538 ; # do nothing -- control fields should not contain non-ASCII characters
539 } else {
540 my @converted_subfields;
541 foreach my $subfield ($field->subfields()) {
542 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
543 unless (IsStringUTF8ish($utf8sf)) {
544 # Because of a bug in MARC::Charset 0.98, if the string
545 # has (a) one or more diacritics that (b) are only in character positions
546 # 128 to 255 inclusive, the resulting converted string is not in
547 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
548 # occurs, upgrade the string in place. Moral of the story seems to be
549 # that pack("U", ...) is better than chr(...) if you need to guarantee
550 # that the resulting string is UTF-8.
551 utf8::upgrade($utf8sf);
553 push @converted_subfields, $subfield->[0], $utf8sf;
556 $field->replace_with(MARC::Field->new(
557 $field->tag(), $field->indicator(1), $field->indicator(2),
558 @converted_subfields)
563 MARC::Charset->ignore_errors($prev_ignore);
565 SetMarcUnicodeFlag($marc_record, $marc_flavour);
567 return @errors;
570 =head2 _marc_iso5426_to_utf8
572 =over 4
574 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
576 =back
578 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
579 If the conversion fails for some reason, an
580 appropriate messages will be placed in the returned
581 C<@errors> array.
583 FIXME - is ISO-5426 equivalent enough to MARC-8
584 that C<MARC::Charset> can be used instead?
586 =cut
588 sub _marc_iso5426_to_utf8 {
589 my $marc_record = shift;
590 my $marc_flavour = shift;
592 my @errors = ();
594 foreach my $field ($marc_record->fields()) {
595 if ($field->is_control_field()) {
596 ; # do nothing -- control fields should not contain non-ASCII characters
597 } else {
598 my @converted_subfields;
599 foreach my $subfield ($field->subfields()) {
600 my $utf8sf = char_decode5426($subfield->[1]);
601 push @converted_subfields, $subfield->[0], $utf8sf;
604 $field->replace_with(MARC::Field->new(
605 $field->tag(), $field->indicator(1), $field->indicator(2),
606 @converted_subfields)
611 SetMarcUnicodeFlag($marc_record, $marc_flavour);
613 return @errors;
616 =head2 _marc_to_utf8_via_text_iconv
618 =over 4
620 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
622 =back
624 Convert a C<MARC::Record> to UTF-8 in-place using the
625 C<Text::Iconv> CPAN module. Any source encoding accepted
626 by the user's iconv installation should work. If
627 the source encoding is not recognized on the user's
628 server or the conversion fails for some reason,
629 appropriate messages will be placed in the returned
630 C<@errors> array.
632 =cut
634 sub _marc_to_utf8_via_text_iconv {
635 my $marc_record = shift;
636 my $marc_flavour = shift;
637 my $source_encoding = shift;
639 my @errors = ();
640 my $decoder;
641 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
642 if ($@) {
643 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
644 return @errors;
647 my $prev_raise_error = Text::Iconv->raise_error();
648 Text::Iconv->raise_error(1);
650 foreach my $field ($marc_record->fields()) {
651 if ($field->is_control_field()) {
652 ; # do nothing -- control fields should not contain non-ASCII characters
653 } else {
654 my @converted_subfields;
655 foreach my $subfield ($field->subfields()) {
656 my $converted_value;
657 my $conversion_ok = 1;
658 eval { $converted_value = $decoder->convert($subfield->[1]); };
659 if ($@) {
660 $conversion_ok = 0;
661 push @errors, $@;
662 } elsif (not defined $converted_value) {
663 $conversion_ok = 0;
664 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
667 if ($conversion_ok) {
668 push @converted_subfields, $subfield->[0], $converted_value;
669 } else {
670 $converted_value = $subfield->[1];
671 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
672 push @converted_subfields, $subfield->[0], $converted_value;
676 $field->replace_with(MARC::Field->new(
677 $field->tag(), $field->indicator(1), $field->indicator(2),
678 @converted_subfields)
683 SetMarcUnicodeFlag($marc_record, $marc_flavour);
684 Text::Iconv->raise_error($prev_raise_error);
686 return @errors;
689 =head2 _marc_to_utf8_replacement_char
691 =over 4
693 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
695 =back
697 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
698 unsatisfactory method of replacing all non-ASCII (e.g.,
699 where the eight bit is set) octet with the Unicode
700 replacement character. This is meant as a last-ditch
701 method, and would be best used as part of a UI that
702 lets a cataloguer pick various character conversions
703 until he or she finds the right one.
705 =cut
707 sub _marc_to_utf8_replacement_char {
708 my $marc_record = shift;
709 my $marc_flavour = shift;
711 foreach my $field ($marc_record->fields()) {
712 if ($field->is_control_field()) {
713 ; # do nothing -- control fields should not contain non-ASCII characters
714 } else {
715 my @converted_subfields;
716 foreach my $subfield ($field->subfields()) {
717 my $value = $subfield->[1];
718 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
719 push @converted_subfields, $subfield->[0], $value;
722 $field->replace_with(MARC::Field->new(
723 $field->tag(), $field->indicator(1), $field->indicator(2),
724 @converted_subfields)
729 SetMarcUnicodeFlag($marc_record, $marc_flavour);
732 =head2 char_decode5426
734 =over 4
736 my $utf8string = char_decode5426($iso_5426_string);
738 =back
740 Converts a string from ISO-5426 to UTF-8.
742 =cut
745 my %chars;
746 $chars{0xb0}=0x0101;#3/0ayn[ain]
747 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
748 #$chars{0xb2}=0x00e0;#'à';
749 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
750 #$chars{0xb3}=0x00e7;#'ç';
751 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
752 # $chars{0xb4}='è';
753 $chars{0xb4}=0x00e8;
754 $chars{0xbd}=0x02b9;
755 $chars{0xbe}=0x02ba;
756 # $chars{0xb5}='é';
757 $chars{0xb5}=0x00e9;
758 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
759 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
760 $chars{0xfa}=0x0153; #oe
761 $chars{0xea}=0x0152; #oe
762 $chars{0x81d1}=0x00b0;
764 ####
765 ## combined characters iso5426
767 $chars{0xc041}=0x1ea2; # capital a with hook above
768 $chars{0xc045}=0x1eba; # capital e with hook above
769 $chars{0xc049}=0x1ec8; # capital i with hook above
770 $chars{0xc04f}=0x1ece; # capital o with hook above
771 $chars{0xc055}=0x1ee6; # capital u with hook above
772 $chars{0xc059}=0x1ef6; # capital y with hook above
773 $chars{0xc061}=0x1ea3; # small a with hook above
774 $chars{0xc065}=0x1ebb; # small e with hook above
775 $chars{0xc069}=0x1ec9; # small i with hook above
776 $chars{0xc06f}=0x1ecf; # small o with hook above
777 $chars{0xc075}=0x1ee7; # small u with hook above
778 $chars{0xc079}=0x1ef7; # small y with hook above
780 # 4/1 grave accent
781 $chars{0xc141}=0x00c0; # capital a with grave accent
782 $chars{0xc145}=0x00c8; # capital e with grave accent
783 $chars{0xc149}=0x00cc; # capital i with grave accent
784 $chars{0xc14f}=0x00d2; # capital o with grave accent
785 $chars{0xc155}=0x00d9; # capital u with grave accent
786 $chars{0xc157}=0x1e80; # capital w with grave
787 $chars{0xc159}=0x1ef2; # capital y with grave
788 $chars{0xc161}=0x00e0; # small a with grave accent
789 $chars{0xc165}=0x00e8; # small e with grave accent
790 $chars{0xc169}=0x00ec; # small i with grave accent
791 $chars{0xc16f}=0x00f2; # small o with grave accent
792 $chars{0xc175}=0x00f9; # small u with grave accent
793 $chars{0xc177}=0x1e81; # small w with grave
794 $chars{0xc179}=0x1ef3; # small y with grave
795 # 4/2 acute accent
796 $chars{0xc241}=0x00c1; # capital a with acute accent
797 $chars{0xc243}=0x0106; # capital c with acute accent
798 $chars{0xc245}=0x00c9; # capital e with acute accent
799 $chars{0xc247}=0x01f4; # capital g with acute
800 $chars{0xc249}=0x00cd; # capital i with acute accent
801 $chars{0xc24b}=0x1e30; # capital k with acute
802 $chars{0xc24c}=0x0139; # capital l with acute accent
803 $chars{0xc24d}=0x1e3e; # capital m with acute
804 $chars{0xc24e}=0x0143; # capital n with acute accent
805 $chars{0xc24f}=0x00d3; # capital o with acute accent
806 $chars{0xc250}=0x1e54; # capital p with acute
807 $chars{0xc252}=0x0154; # capital r with acute accent
808 $chars{0xc253}=0x015a; # capital s with acute accent
809 $chars{0xc255}=0x00da; # capital u with acute accent
810 $chars{0xc257}=0x1e82; # capital w with acute
811 $chars{0xc259}=0x00dd; # capital y with acute accent
812 $chars{0xc25a}=0x0179; # capital z with acute accent
813 $chars{0xc261}=0x00e1; # small a with acute accent
814 $chars{0xc263}=0x0107; # small c with acute accent
815 $chars{0xc265}=0x00e9; # small e with acute accent
816 $chars{0xc267}=0x01f5; # small g with acute
817 $chars{0xc269}=0x00ed; # small i with acute accent
818 $chars{0xc26b}=0x1e31; # small k with acute
819 $chars{0xc26c}=0x013a; # small l with acute accent
820 $chars{0xc26d}=0x1e3f; # small m with acute
821 $chars{0xc26e}=0x0144; # small n with acute accent
822 $chars{0xc26f}=0x00f3; # small o with acute accent
823 $chars{0xc270}=0x1e55; # small p with acute
824 $chars{0xc272}=0x0155; # small r with acute accent
825 $chars{0xc273}=0x015b; # small s with acute accent
826 $chars{0xc275}=0x00fa; # small u with acute accent
827 $chars{0xc277}=0x1e83; # small w with acute
828 $chars{0xc279}=0x00fd; # small y with acute accent
829 $chars{0xc27a}=0x017a; # small z with acute accent
830 $chars{0xc2e1}=0x01fc; # capital ae with acute
831 $chars{0xc2f1}=0x01fd; # small ae with acute
832 # 4/3 circumflex accent
833 $chars{0xc341}=0x00c2; # capital a with circumflex accent
834 $chars{0xc343}=0x0108; # capital c with circumflex
835 $chars{0xc345}=0x00ca; # capital e with circumflex accent
836 $chars{0xc347}=0x011c; # capital g with circumflex
837 $chars{0xc348}=0x0124; # capital h with circumflex
838 $chars{0xc349}=0x00ce; # capital i with circumflex accent
839 $chars{0xc34a}=0x0134; # capital j with circumflex
840 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
841 $chars{0xc353}=0x015c; # capital s with circumflex
842 $chars{0xc355}=0x00db; # capital u with circumflex
843 $chars{0xc357}=0x0174; # capital w with circumflex
844 $chars{0xc359}=0x0176; # capital y with circumflex
845 $chars{0xc35a}=0x1e90; # capital z with circumflex
846 $chars{0xc361}=0x00e2; # small a with circumflex accent
847 $chars{0xc363}=0x0109; # small c with circumflex
848 $chars{0xc365}=0x00ea; # small e with circumflex accent
849 $chars{0xc367}=0x011d; # small g with circumflex
850 $chars{0xc368}=0x0125; # small h with circumflex
851 $chars{0xc369}=0x00ee; # small i with circumflex accent
852 $chars{0xc36a}=0x0135; # small j with circumflex
853 $chars{0xc36e}=0x00f1; # small n with tilde
854 $chars{0xc36f}=0x00f4; # small o with circumflex accent
855 $chars{0xc373}=0x015d; # small s with circumflex
856 $chars{0xc375}=0x00fb; # small u with circumflex
857 $chars{0xc377}=0x0175; # small w with circumflex
858 $chars{0xc379}=0x0177; # small y with circumflex
859 $chars{0xc37a}=0x1e91; # small z with circumflex
860 # 4/4 tilde
861 $chars{0xc441}=0x00c3; # capital a with tilde
862 $chars{0xc445}=0x1ebc; # capital e with tilde
863 $chars{0xc449}=0x0128; # capital i with tilde
864 $chars{0xc44e}=0x00d1; # capital n with tilde
865 $chars{0xc44f}=0x00d5; # capital o with tilde
866 $chars{0xc455}=0x0168; # capital u with tilde
867 $chars{0xc456}=0x1e7c; # capital v with tilde
868 $chars{0xc459}=0x1ef8; # capital y with tilde
869 $chars{0xc461}=0x00e3; # small a with tilde
870 $chars{0xc465}=0x1ebd; # small e with tilde
871 $chars{0xc469}=0x0129; # small i with tilde
872 $chars{0xc46e}=0x00f1; # small n with tilde
873 $chars{0xc46f}=0x00f5; # small o with tilde
874 $chars{0xc475}=0x0169; # small u with tilde
875 $chars{0xc476}=0x1e7d; # small v with tilde
876 $chars{0xc479}=0x1ef9; # small y with tilde
877 # 4/5 macron
878 $chars{0xc541}=0x0100; # capital a with macron
879 $chars{0xc545}=0x0112; # capital e with macron
880 $chars{0xc547}=0x1e20; # capital g with macron
881 $chars{0xc549}=0x012a; # capital i with macron
882 $chars{0xc54f}=0x014c; # capital o with macron
883 $chars{0xc555}=0x016a; # capital u with macron
884 $chars{0xc561}=0x0101; # small a with macron
885 $chars{0xc565}=0x0113; # small e with macron
886 $chars{0xc567}=0x1e21; # small g with macron
887 $chars{0xc569}=0x012b; # small i with macron
888 $chars{0xc56f}=0x014d; # small o with macron
889 $chars{0xc575}=0x016b; # small u with macron
890 $chars{0xc572}=0x0159; # small r with macron
891 $chars{0xc5e1}=0x01e2; # capital ae with macron
892 $chars{0xc5f1}=0x01e3; # small ae with macron
893 # 4/6 breve
894 $chars{0xc641}=0x0102; # capital a with breve
895 $chars{0xc645}=0x0114; # capital e with breve
896 $chars{0xc647}=0x011e; # capital g with breve
897 $chars{0xc649}=0x012c; # capital i with breve
898 $chars{0xc64f}=0x014e; # capital o with breve
899 $chars{0xc655}=0x016c; # capital u with breve
900 $chars{0xc661}=0x0103; # small a with breve
901 $chars{0xc665}=0x0115; # small e with breve
902 $chars{0xc667}=0x011f; # small g with breve
903 $chars{0xc669}=0x012d; # small i with breve
904 $chars{0xc66f}=0x014f; # small o with breve
905 $chars{0xc675}=0x016d; # small u with breve
906 # 4/7 dot above
907 $chars{0xc7b0}=0x01e1; # Ain with dot above
908 $chars{0xc742}=0x1e02; # capital b with dot above
909 $chars{0xc743}=0x010a; # capital c with dot above
910 $chars{0xc744}=0x1e0a; # capital d with dot above
911 $chars{0xc745}=0x0116; # capital e with dot above
912 $chars{0xc746}=0x1e1e; # capital f with dot above
913 $chars{0xc747}=0x0120; # capital g with dot above
914 $chars{0xc748}=0x1e22; # capital h with dot above
915 $chars{0xc749}=0x0130; # capital i with dot above
916 $chars{0xc74d}=0x1e40; # capital m with dot above
917 $chars{0xc74e}=0x1e44; # capital n with dot above
918 $chars{0xc750}=0x1e56; # capital p with dot above
919 $chars{0xc752}=0x1e58; # capital r with dot above
920 $chars{0xc753}=0x1e60; # capital s with dot above
921 $chars{0xc754}=0x1e6a; # capital t with dot above
922 $chars{0xc757}=0x1e86; # capital w with dot above
923 $chars{0xc758}=0x1e8a; # capital x with dot above
924 $chars{0xc759}=0x1e8e; # capital y with dot above
925 $chars{0xc75a}=0x017b; # capital z with dot above
926 $chars{0xc761}=0x0227; # small b with dot above
927 $chars{0xc762}=0x1e03; # small b with dot above
928 $chars{0xc763}=0x010b; # small c with dot above
929 $chars{0xc764}=0x1e0b; # small d with dot above
930 $chars{0xc765}=0x0117; # small e with dot above
931 $chars{0xc766}=0x1e1f; # small f with dot above
932 $chars{0xc767}=0x0121; # small g with dot above
933 $chars{0xc768}=0x1e23; # small h with dot above
934 $chars{0xc76d}=0x1e41; # small m with dot above
935 $chars{0xc76e}=0x1e45; # small n with dot above
936 $chars{0xc770}=0x1e57; # small p with dot above
937 $chars{0xc772}=0x1e59; # small r with dot above
938 $chars{0xc773}=0x1e61; # small s with dot above
939 $chars{0xc774}=0x1e6b; # small t with dot above
940 $chars{0xc777}=0x1e87; # small w with dot above
941 $chars{0xc778}=0x1e8b; # small x with dot above
942 $chars{0xc779}=0x1e8f; # small y with dot above
943 $chars{0xc77a}=0x017c; # small z with dot above
944 # 4/8 trema, diaresis
945 $chars{0xc820}=0x00a8; # diaeresis
946 $chars{0xc841}=0x00c4; # capital a with diaeresis
947 $chars{0xc845}=0x00cb; # capital e with diaeresis
948 $chars{0xc848}=0x1e26; # capital h with diaeresis
949 $chars{0xc849}=0x00cf; # capital i with diaeresis
950 $chars{0xc84f}=0x00d6; # capital o with diaeresis
951 $chars{0xc855}=0x00dc; # capital u with diaeresis
952 $chars{0xc857}=0x1e84; # capital w with diaeresis
953 $chars{0xc858}=0x1e8c; # capital x with diaeresis
954 $chars{0xc859}=0x0178; # capital y with diaeresis
955 $chars{0xc861}=0x00e4; # small a with diaeresis
956 $chars{0xc865}=0x00eb; # small e with diaeresis
957 $chars{0xc868}=0x1e27; # small h with diaeresis
958 $chars{0xc869}=0x00ef; # small i with diaeresis
959 $chars{0xc86f}=0x00f6; # small o with diaeresis
960 $chars{0xc874}=0x1e97; # small t with diaeresis
961 $chars{0xc875}=0x00fc; # small u with diaeresis
962 $chars{0xc877}=0x1e85; # small w with diaeresis
963 $chars{0xc878}=0x1e8d; # small x with diaeresis
964 $chars{0xc879}=0x00ff; # small y with diaeresis
965 # 4/9 umlaut
966 $chars{0xc920}=0x00a8; # [diaeresis]
967 $chars{0xc961}=0x00e4; # a with umlaut
968 $chars{0xc965}=0x00eb; # e with umlaut
969 $chars{0xc969}=0x00ef; # i with umlaut
970 $chars{0xc96f}=0x00f6; # o with umlaut
971 $chars{0xc975}=0x00fc; # u with umlaut
972 # 4/10 circle above
973 $chars{0xca41}=0x00c5; # capital a with ring above
974 $chars{0xcaad}=0x016e; # capital u with ring above
975 $chars{0xca61}=0x00e5; # small a with ring above
976 $chars{0xca75}=0x016f; # small u with ring above
977 $chars{0xca77}=0x1e98; # small w with ring above
978 $chars{0xca79}=0x1e99; # small y with ring above
979 # 4/11 high comma off centre
980 # 4/12 inverted high comma centred
981 # 4/13 double acute accent
982 $chars{0xcd4f}=0x0150; # capital o with double acute
983 $chars{0xcd55}=0x0170; # capital u with double acute
984 $chars{0xcd6f}=0x0151; # small o with double acute
985 $chars{0xcd75}=0x0171; # small u with double acute
986 # 4/14 horn
987 $chars{0xce54}=0x01a0; # latin capital letter o with horn
988 $chars{0xce55}=0x01af; # latin capital letter u with horn
989 $chars{0xce74}=0x01a1; # latin small letter o with horn
990 $chars{0xce75}=0x01b0; # latin small letter u with horn
991 # 4/15 caron (hacek
992 $chars{0xcf41}=0x01cd; # capital a with caron
993 $chars{0xcf43}=0x010c; # capital c with caron
994 $chars{0xcf44}=0x010e; # capital d with caron
995 $chars{0xcf45}=0x011a; # capital e with caron
996 $chars{0xcf47}=0x01e6; # capital g with caron
997 $chars{0xcf49}=0x01cf; # capital i with caron
998 $chars{0xcf4b}=0x01e8; # capital k with caron
999 $chars{0xcf4c}=0x013d; # capital l with caron
1000 $chars{0xcf4e}=0x0147; # capital n with caron
1001 $chars{0xcf4f}=0x01d1; # capital o with caron
1002 $chars{0xcf52}=0x0158; # capital r with caron
1003 $chars{0xcf53}=0x0160; # capital s with caron
1004 $chars{0xcf54}=0x0164; # capital t with caron
1005 $chars{0xcf55}=0x01d3; # capital u with caron
1006 $chars{0xcf5a}=0x017d; # capital z with caron
1007 $chars{0xcf61}=0x01ce; # small a with caron
1008 $chars{0xcf63}=0x010d; # small c with caron
1009 $chars{0xcf64}=0x010f; # small d with caron
1010 $chars{0xcf65}=0x011b; # small e with caron
1011 $chars{0xcf67}=0x01e7; # small g with caron
1012 $chars{0xcf69}=0x01d0; # small i with caron
1013 $chars{0xcf6a}=0x01f0; # small j with caron
1014 $chars{0xcf6b}=0x01e9; # small k with caron
1015 $chars{0xcf6c}=0x013e; # small l with caron
1016 $chars{0xcf6e}=0x0148; # small n with caron
1017 $chars{0xcf6f}=0x01d2; # small o with caron
1018 $chars{0xcf72}=0x0159; # small r with caron
1019 $chars{0xcf73}=0x0161; # small s with caron
1020 $chars{0xcf74}=0x0165; # small t with caron
1021 $chars{0xcf75}=0x01d4; # small u with caron
1022 $chars{0xcf7a}=0x017e; # small z with caron
1023 # 5/0 cedilla
1024 $chars{0xd020}=0x00b8; # cedilla
1025 $chars{0xd043}=0x00c7; # capital c with cedilla
1026 $chars{0xd044}=0x1e10; # capital d with cedilla
1027 $chars{0xd047}=0x0122; # capital g with cedilla
1028 $chars{0xd048}=0x1e28; # capital h with cedilla
1029 $chars{0xd04b}=0x0136; # capital k with cedilla
1030 $chars{0xd04c}=0x013b; # capital l with cedilla
1031 $chars{0xd04e}=0x0145; # capital n with cedilla
1032 $chars{0xd052}=0x0156; # capital r with cedilla
1033 $chars{0xd053}=0x015e; # capital s with cedilla
1034 $chars{0xd054}=0x0162; # capital t with cedilla
1035 $chars{0xd063}=0x00e7; # small c with cedilla
1036 $chars{0xd064}=0x1e11; # small d with cedilla
1037 $chars{0xd065}=0x0119; # small e with cedilla
1038 $chars{0xd067}=0x0123; # small g with cedilla
1039 $chars{0xd068}=0x1e29; # small h with cedilla
1040 $chars{0xd06b}=0x0137; # small k with cedilla
1041 $chars{0xd06c}=0x013c; # small l with cedilla
1042 $chars{0xd06e}=0x0146; # small n with cedilla
1043 $chars{0xd072}=0x0157; # small r with cedilla
1044 $chars{0xd073}=0x015f; # small s with cedilla
1045 $chars{0xd074}=0x0163; # small t with cedilla
1046 # 5/1 rude
1047 # 5/2 hook to left
1048 # 5/3 ogonek (hook to right
1049 $chars{0xd320}=0x02db; # ogonek
1050 $chars{0xd341}=0x0104; # capital a with ogonek
1051 $chars{0xd345}=0x0118; # capital e with ogonek
1052 $chars{0xd349}=0x012e; # capital i with ogonek
1053 $chars{0xd34f}=0x01ea; # capital o with ogonek
1054 $chars{0xd355}=0x0172; # capital u with ogonek
1055 $chars{0xd361}=0x0105; # small a with ogonek
1056 $chars{0xd365}=0x0119; # small e with ogonek
1057 $chars{0xd369}=0x012f; # small i with ogonek
1058 $chars{0xd36f}=0x01eb; # small o with ogonek
1059 $chars{0xd375}=0x0173; # small u with ogonek
1060 # 5/4 circle below
1061 $chars{0xd441}=0x1e00; # capital a with ring below
1062 $chars{0xd461}=0x1e01; # small a with ring below
1063 # 5/5 half circle below
1064 $chars{0xf948}=0x1e2a; # capital h with breve below
1065 $chars{0xf968}=0x1e2b; # small h with breve below
1066 # 5/6 dot below
1067 $chars{0xd641}=0x1ea0; # capital a with dot below
1068 $chars{0xd642}=0x1e04; # capital b with dot below
1069 $chars{0xd644}=0x1e0c; # capital d with dot below
1070 $chars{0xd645}=0x1eb8; # capital e with dot below
1071 $chars{0xd648}=0x1e24; # capital h with dot below
1072 $chars{0xd649}=0x1eca; # capital i with dot below
1073 $chars{0xd64b}=0x1e32; # capital k with dot below
1074 $chars{0xd64c}=0x1e36; # capital l with dot below
1075 $chars{0xd64d}=0x1e42; # capital m with dot below
1076 $chars{0xd64e}=0x1e46; # capital n with dot below
1077 $chars{0xd64f}=0x1ecc; # capital o with dot below
1078 $chars{0xd652}=0x1e5a; # capital r with dot below
1079 $chars{0xd653}=0x1e62; # capital s with dot below
1080 $chars{0xd654}=0x1e6c; # capital t with dot below
1081 $chars{0xd655}=0x1ee4; # capital u with dot below
1082 $chars{0xd656}=0x1e7e; # capital v with dot below
1083 $chars{0xd657}=0x1e88; # capital w with dot below
1084 $chars{0xd659}=0x1ef4; # capital y with dot below
1085 $chars{0xd65a}=0x1e92; # capital z with dot below
1086 $chars{0xd661}=0x1ea1; # small a with dot below
1087 $chars{0xd662}=0x1e05; # small b with dot below
1088 $chars{0xd664}=0x1e0d; # small d with dot below
1089 $chars{0xd665}=0x1eb9; # small e with dot below
1090 $chars{0xd668}=0x1e25; # small h with dot below
1091 $chars{0xd669}=0x1ecb; # small i with dot below
1092 $chars{0xd66b}=0x1e33; # small k with dot below
1093 $chars{0xd66c}=0x1e37; # small l with dot below
1094 $chars{0xd66d}=0x1e43; # small m with dot below
1095 $chars{0xd66e}=0x1e47; # small n with dot below
1096 $chars{0xd66f}=0x1ecd; # small o with dot below
1097 $chars{0xd672}=0x1e5b; # small r with dot below
1098 $chars{0xd673}=0x1e63; # small s with dot below
1099 $chars{0xd674}=0x1e6d; # small t with dot below
1100 $chars{0xd675}=0x1ee5; # small u with dot below
1101 $chars{0xd676}=0x1e7f; # small v with dot below
1102 $chars{0xd677}=0x1e89; # small w with dot below
1103 $chars{0xd679}=0x1ef5; # small y with dot below
1104 $chars{0xd67a}=0x1e93; # small z with dot below
1105 # 5/7 double dot below
1106 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1107 $chars{0xd775}=0x1e73; # small u with diaeresis below
1108 # 5/8 underline
1109 $chars{0xd820}=0x005f; # underline
1110 # 5/9 double underline
1111 $chars{0xd920}=0x2017; # double underline
1112 # 5/10 small low vertical bar
1113 $chars{0xda20}=0x02cc; #
1114 # 5/11 circumflex below
1115 # 5/12 (this position shall not be used)
1116 # 5/13 left half of ligature sign and of double tilde
1117 # 5/14 right half of ligature sign
1118 # 5/15 right half of double tilde
1119 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1121 sub char_decode5426 {
1122 my ( $string) = @_;
1123 my $result;
1125 my @data = unpack("C*", $string);
1126 my @characters;
1127 my $length=scalar(@data);
1128 for (my $i = 0; $i < scalar(@data); $i++) {
1129 my $char= $data[$i];
1130 if ($char >= 0x00 && $char <= 0x7F){
1131 #IsAscii
1133 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1134 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1135 #Combined Char
1136 my $convchar ;
1137 if ($chars{$char*256+$data[$i+1]}) {
1138 $convchar= $chars{$char * 256 + $data[$i+1]};
1139 $i++;
1140 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1141 } elsif ($chars{$char}) {
1142 $convchar= $chars{$char};
1143 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1144 }else {
1145 $convchar=$char;
1147 push @characters,$convchar;
1148 } else {
1149 my $convchar;
1150 if ($chars{$char}) {
1151 $convchar= $chars{$char};
1152 # printf "char %x, converted %x\n",$char,$chars{$char};
1153 }else {
1154 # printf "char %x $char\n",$char;
1155 $convchar=$char;
1157 push @characters,$convchar;
1160 $result=pack "U*",@characters;
1161 # $result=~s/\x01//;
1162 # $result=~s/\x00//;
1163 $result=~s/\x0f//;
1164 $result=~s/\x1b.//;
1165 $result=~s/\x0e//;
1166 $result=~s/\x1b\x5b//;
1167 # map{printf "%x",$_} @characters;
1168 # printf "\n";
1169 return $result;
1175 =head1 AUTHOR
1177 Koha Development Team <info@koha.org>
1179 Galen Charlton <galen.charlton@liblime.com>
1181 =cut