wr72054 fixing the z3950 search for acquisitions
[koha.git] / C4 / Charset.pm
blobb3b4b722c311d27f448af171091990974084e827
1 package C4::Charset;
3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
10 # version.
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 use strict;
21 use warnings;
23 use MARC::Charset qw/marc8_to_utf8/;
24 use Text::Iconv;
25 use C4::Debug;
26 use Unicode::Normalize;
28 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
30 BEGIN {
31 # set the version for version checking
32 $VERSION = 3.01;
33 require Exporter;
34 @ISA = qw(Exporter);
35 @EXPORT = qw(
36 IsStringUTF8ish
37 MarcToUTF8Record
38 SetUTF8Flag
39 SetMarcUnicodeFlag
40 StripNonXmlChars
44 =head1 NAME
46 C4::Charset - utilities for handling character set conversions.
48 =head1 SYNOPSIS
50 use C4::Charset;
52 =head1 DESCRIPTION
54 This module contains routines for dealing with character set
55 conversions, particularly for MARC records.
57 A variety of character encodings are in use by various MARC
58 standards, and even more character encodings are used by
59 non-standard MARC records. The various MARC formats generally
60 do not do a good job of advertising a given record's character
61 encoding, and even when a record does advertise its encoding,
62 e.g., via the Leader/09, experience has shown that one cannot
63 trust it.
65 Ultimately, all MARC records are stored in Koha in UTF-8 and
66 must be converted from whatever the source character encoding is.
67 The goal of this module is to ensure that these conversions
68 take place accurately. When a character conversion cannot take
69 place, or at least not accurately, the module was provide
70 enough information to allow user-facing code to inform the user
71 on how to deal with the situation.
73 =cut
75 =head1 FUNCTIONS
77 =head2 IsStringUTF8ish
79 my $is_utf8 = IsStringUTF8ish($str);
81 Determines if C<$str> is valid UTF-8. This can mean
82 one of two things:
84 =over
86 =item *
88 The Perl UTF-8 flag is set and the string contains valid UTF-8.
90 =item *
92 The Perl UTF-8 flag is B<not> set, but the octets contain
93 valid UTF-8.
95 =back
97 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
98 because in one could be presented with a MARC blob that is
99 not actually in UTF-8 but whose sequence of octets appears to be
100 valid UTF-8. The rest of the MARC character conversion functions
101 will assume that this situation occur does not very often.
103 =cut
105 sub IsStringUTF8ish {
106 my $str = shift;
108 return 1 if utf8::is_utf8($str);
109 return utf8::decode($str);
112 =head2 SetUTF8Flag
114 my $marc_record = SetUTF8Flag($marc_record);
116 This function sets the PERL UTF8 flag for data.
117 It is required when using new_from_usmarc
118 since MARC::File::USMARC does not handle PERL UTF8 setting.
119 When editing unicode marc records fields and subfields, you
120 would end up in double encoding without using this function.
122 FIXME
123 In my opinion, this function belongs to MARC::Record and not
124 to this package.
125 But since it handles charset, and MARC::Record, it finds its way in that package
127 =cut
129 sub SetUTF8Flag{
130 my ($record)=@_;
131 return unless ($record && $record->fields());
132 foreach my $field ($record->fields()){
133 if ($field->tag()>=10){
134 my @subfields;
135 foreach my $subfield ($field->subfields()){
136 push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
138 my $newfield=MARC::Field->new(
139 $field->tag(),
140 $field->indicator(1),
141 $field->indicator(2),
142 @subfields
144 $field->replace_with($newfield);
149 =head2 NormalizeString
151 my $normalized_string=NormalizeString($string);
153 Given a string
155 nfc : If you want to set NFC and not NFD
156 transform : If you expect all the signs to be removed
157 Sets the PERL UTF8 Flag on your initial data if need be
158 and applies cleaning if required
160 Returns a utf8 NFD normalized string
162 Sample code :
163 my $string=NormalizeString ("l'ornithoptère");
164 #results into ornithoptère in NFD form and sets UTF8 Flag
166 =cut
168 sub NormalizeString{
169 my ($string,$nfc,$transform)=@_;
170 utf8::decode($string) unless (utf8::is_utf8($string));
171 if ($nfc){
172 $string= NFD($string);
174 else {
175 $string=NFC($string);
177 if ($transform){
178 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
179 #removing one letter words "d'" "l'" was changed into "d " "l "
180 $string=~s/\b\S\b//g;
181 $string=~s/\s+$//g;
183 return $string;
186 =head2 MarcToUTF8Record
188 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
189 $marc_flavour, [, $source_encoding]);
191 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
192 optional source encoding, return a C<MARC::Record> that is
193 converted to UTF-8.
195 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
196 is not guaranteed to have been converted correctly. Specifically,
197 if C<$converted_from> is 'failed', the MARC record returned failed
198 character conversion and had each of its non-ASCII octets changed
199 to the Unicode replacement character.
201 If the source encoding was not specified, this routine will
202 try to guess it; the character encoding used for a successful
203 conversion is returned in C<$converted_from>.
205 =cut
207 sub MarcToUTF8Record {
208 my $marc = shift;
209 my $marc_flavour = shift;
210 my $source_encoding = shift;
211 my $marc_record;
212 my $marc_blob_is_utf8 = 0;
213 if (ref($marc) eq 'MARC::Record') {
214 my $marc_blob = $marc->as_usmarc();
215 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
216 $marc_record = $marc;
217 } else {
218 # dealing with a MARC blob
220 # remove any ersatz whitespace from the beginning and
221 # end of the MARC blob -- these can creep into MARC
222 # files produced by several sources -- caller really
223 # should be doing this, however
224 $marc =~ s/^\s+//;
225 $marc =~ s/\s+$//;
226 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
227 eval {
228 $marc_record = MARC::Record->new_from_usmarc($marc);
230 if ($@) {
231 # if we fail the first time, one likely problem
232 # is that we have a MARC21 record that says that it's
233 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
234 # We'll try parsing it again.
235 substr($marc, 9, 1) = ' ';
236 eval {
237 $marc_record = MARC::Record->new_from_usmarc($marc);
239 if ($@) {
240 # it's hopeless; return an empty MARC::Record
241 return MARC::Record->new(), 'failed', ['could not parse MARC blob'];
246 # If we do not know the source encoding, try some guesses
247 # as follows:
248 # 1. Record is UTF-8 already.
249 # 2. If MARC flavor is MARC21, then
250 # a. record is MARC-8
251 # b. record is ISO-8859-1
252 # 3. If MARC flavor is UNIMARC, then
253 if (not defined $source_encoding) {
254 if ($marc_blob_is_utf8) {
255 # note that for MARC21 we are not bothering to check
256 # if the Leader/09 is set to 'a' or not -- because
257 # of problems with various ILSs (including Koha in the
258 # past, alas), this just is not trustworthy.
259 SetMarcUnicodeFlag($marc_record, $marc_flavour);
260 return $marc_record, 'UTF-8', [];
261 } else {
262 if ($marc_flavour eq 'MARC21') {
263 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
264 } elsif ($marc_flavour =~/UNIMARC/) {
265 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
266 } else {
267 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
270 } else {
271 # caller knows the character encoding
272 my $original_marc_record = $marc_record->clone();
273 my @errors;
274 if ($source_encoding =~ /utf-?8/i) {
275 if ($marc_blob_is_utf8) {
276 SetMarcUnicodeFlag($marc_record, $marc_flavour);
277 return $marc_record, 'UTF-8', [];
278 } else {
279 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
281 } elsif ($source_encoding =~ /marc-?8/i) {
282 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
283 } elsif ($source_encoding =~ /5426/) {
284 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
285 } else {
286 # assume any other character encoding is for Text::Iconv
287 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
290 if (@errors) {
291 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
292 return $original_marc_record, 'failed', \@errors;
293 } else {
294 return $marc_record, $source_encoding, [];
300 =head2 SetMarcUnicodeFlag
302 SetMarcUnicodeFlag($marc_record, $marc_flavour);
304 Set both the internal MARC::Record encoding flag
305 and the appropriate Leader/09 (MARC21) or
306 100/26-29 (UNIMARC) to indicate that the record
307 is in UTF-8. Note that this does B<not> do
308 any actual character conversion.
310 =cut
312 sub SetMarcUnicodeFlag {
313 my $marc_record = shift;
314 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
316 $marc_record->encoding('UTF-8');
317 if ($marc_flavour eq 'MARC21') {
318 my $leader = $marc_record->leader();
319 substr($leader, 9, 1) = 'a';
320 $marc_record->leader($leader);
321 } elsif ($marc_flavour =~/UNIMARC/) {
322 my $string;
323 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
324 $string=$marc_record->subfield( 100, "a" );
325 if (defined $string && length($string)==$subflength) {
326 $string = substr $string, 0,$subflength if (length($string)>$subflength);
328 else {
329 $string = POSIX::strftime( "%Y%m%d", localtime );
330 $string =~ s/\-//g;
331 $string = sprintf( "%-*s", $subflength, $string );
333 substr( $string, $encodingposition, 8, "frey50 " );
334 if ( $marc_record->subfield( 100, "a" ) ) {
335 $marc_record->field('100')->update(a=>$string);
337 else {
338 $marc_record->insert_grouped_field(
339 MARC::Field->new( 100, '', '', "a" => $string ) );
341 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
342 } else {
343 warn "Unrecognized marcflavour: $marc_flavour";
347 =head2 StripNonXmlChars
349 my $new_str = StripNonXmlChars($old_str);
351 Given a string, return a copy with the
352 characters that are illegal in XML
353 removed.
355 This function exists to work around a problem
356 that can occur with badly-encoded MARC records.
357 Specifically, if a UTF-8 MARC record also
358 has excape (\x1b) characters, MARC::File::XML
359 will let the escape characters pass through
360 when as_xml() or as_xml_record() is called. The
361 problem is that the escape character is not
362 legal in well-formed XML documents, so when
363 MARC::File::XML attempts to parse such a record,
364 the XML parser will fail.
366 Stripping such characters will allow a
367 MARC::Record->new_from_xml()
368 to work, at the possible risk of some data loss.
370 =cut
372 sub StripNonXmlChars {
373 my $str = shift;
374 if (!defined($str) || $str eq ""){
375 return "";
377 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
378 return $str;
381 =head1 INTERNAL FUNCTIONS
383 =head2 _default_marc21_charconv_to_utf8
385 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
387 Converts a C<MARC::Record> of unknown character set to UTF-8,
388 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
389 to UTF-8, then a default conversion that replaces each non-ASCII
390 character with the replacement character.
392 The C<$guessed_charset> return value contains the character set
393 that resulted in a conversion to valid UTF-8; note that
394 if the MARC-8 and ISO-8859-1 conversions failed, the value of
395 this is 'failed'.
397 =cut
399 sub _default_marc21_charconv_to_utf8 {
400 my $marc_record = shift;
401 my $marc_flavour = shift;
403 my $trial_marc8 = $marc_record->clone();
404 my @all_errors = ();
405 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
406 unless (@errors) {
407 return $trial_marc8, 'MARC-8', [];
409 push @all_errors, @errors;
411 my $trial_8859_1 = $marc_record->clone();
412 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
413 unless (@errors) {
414 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
415 # instead if we wanted to report details
416 # of the failed attempt at MARC-8 => UTF-8
418 push @all_errors, @errors;
420 my $default_converted = $marc_record->clone();
421 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
422 return $default_converted, 'failed', \@all_errors;
425 =head2 _default_unimarc_charconv_to_utf8
427 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
429 Converts a C<MARC::Record> of unknown character set to UTF-8,
430 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
431 to UTF-8, then a default conversion that replaces each non-ASCII
432 character with the replacement character.
434 The C<$guessed_charset> return value contains the character set
435 that resulted in a conversion to valid UTF-8; note that
436 if the MARC-8 and ISO-8859-1 conversions failed, the value of
437 this is 'failed'.
439 =cut
441 sub _default_unimarc_charconv_to_utf8 {
442 my $marc_record = shift;
443 my $marc_flavour = shift;
445 my $trial_marc8 = $marc_record->clone();
446 my @all_errors = ();
447 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
448 unless (@errors) {
449 return $trial_marc8, 'iso-5426';
451 push @all_errors, @errors;
453 my $trial_8859_1 = $marc_record->clone();
454 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
455 unless (@errors) {
456 return $trial_8859_1, 'iso-8859-1';
458 push @all_errors, @errors;
460 my $default_converted = $marc_record->clone();
461 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
462 return $default_converted, 'failed', \@all_errors;
465 =head2 _marc_marc8_to_utf8
467 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
469 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
470 If the conversion fails for some reason, an
471 appropriate messages will be placed in the returned
472 C<@errors> array.
474 =cut
476 sub _marc_marc8_to_utf8 {
477 my $marc_record = shift;
478 my $marc_flavour = shift;
480 my $prev_ignore = MARC::Charset->ignore_errors();
481 MARC::Charset->ignore_errors(1);
483 # trap warnings raised by MARC::Charset
484 my @errors = ();
485 local $SIG{__WARN__} = sub {
486 my $msg = $_[0];
487 if ($msg =~ /MARC.Charset/) {
488 # FIXME - purpose of this regexp is to strip out the
489 # line reference to MARC/Charset.pm, but as it
490 # exists probably won't work quite on Windows --
491 # some sort of minimal-bunch back-tracking RE
492 # would be helpful here
493 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
494 push @errors, $msg;
495 } else {
496 # if warning doesn't come from MARC::Charset, just
497 # pass it on
498 warn $msg;
502 foreach my $field ($marc_record->fields()) {
503 if ($field->is_control_field()) {
504 ; # do nothing -- control fields should not contain non-ASCII characters
505 } else {
506 my @converted_subfields;
507 foreach my $subfield ($field->subfields()) {
508 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
509 unless (IsStringUTF8ish($utf8sf)) {
510 # Because of a bug in MARC::Charset 0.98, if the string
511 # has (a) one or more diacritics that (b) are only in character positions
512 # 128 to 255 inclusive, the resulting converted string is not in
513 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
514 # occurs, upgrade the string in place. Moral of the story seems to be
515 # that pack("U", ...) is better than chr(...) if you need to guarantee
516 # that the resulting string is UTF-8.
517 utf8::upgrade($utf8sf);
519 push @converted_subfields, $subfield->[0], $utf8sf;
522 $field->replace_with(MARC::Field->new(
523 $field->tag(), $field->indicator(1), $field->indicator(2),
524 @converted_subfields)
529 MARC::Charset->ignore_errors($prev_ignore);
531 SetMarcUnicodeFlag($marc_record, $marc_flavour);
533 return @errors;
536 =head2 _marc_iso5426_to_utf8
538 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
540 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
541 If the conversion fails for some reason, an
542 appropriate messages will be placed in the returned
543 C<@errors> array.
545 FIXME - is ISO-5426 equivalent enough to MARC-8
546 that C<MARC::Charset> can be used instead?
548 =cut
550 sub _marc_iso5426_to_utf8 {
551 my $marc_record = shift;
552 my $marc_flavour = shift;
554 my @errors = ();
556 foreach my $field ($marc_record->fields()) {
557 if ($field->is_control_field()) {
558 ; # do nothing -- control fields should not contain non-ASCII characters
559 } else {
560 my @converted_subfields;
561 foreach my $subfield ($field->subfields()) {
562 my $utf8sf = char_decode5426($subfield->[1]);
563 push @converted_subfields, $subfield->[0], $utf8sf;
566 $field->replace_with(MARC::Field->new(
567 $field->tag(), $field->indicator(1), $field->indicator(2),
568 @converted_subfields)
573 SetMarcUnicodeFlag($marc_record, $marc_flavour);
575 return @errors;
578 =head2 _marc_to_utf8_via_text_iconv
580 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
582 Convert a C<MARC::Record> to UTF-8 in-place using the
583 C<Text::Iconv> CPAN module. Any source encoding accepted
584 by the user's iconv installation should work. If
585 the source encoding is not recognized on the user's
586 server or the conversion fails for some reason,
587 appropriate messages will be placed in the returned
588 C<@errors> array.
590 =cut
592 sub _marc_to_utf8_via_text_iconv {
593 my $marc_record = shift;
594 my $marc_flavour = shift;
595 my $source_encoding = shift;
597 my @errors = ();
598 my $decoder;
599 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
600 if ($@) {
601 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
602 return @errors;
605 my $prev_raise_error = Text::Iconv->raise_error();
606 Text::Iconv->raise_error(1);
608 foreach my $field ($marc_record->fields()) {
609 if ($field->is_control_field()) {
610 ; # do nothing -- control fields should not contain non-ASCII characters
611 } else {
612 my @converted_subfields;
613 foreach my $subfield ($field->subfields()) {
614 my $converted_value;
615 my $conversion_ok = 1;
616 eval { $converted_value = $decoder->convert($subfield->[1]); };
617 if ($@) {
618 $conversion_ok = 0;
619 push @errors, $@;
620 } elsif (not defined $converted_value) {
621 $conversion_ok = 0;
622 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
625 if ($conversion_ok) {
626 push @converted_subfields, $subfield->[0], $converted_value;
627 } else {
628 $converted_value = $subfield->[1];
629 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
630 push @converted_subfields, $subfield->[0], $converted_value;
634 $field->replace_with(MARC::Field->new(
635 $field->tag(), $field->indicator(1), $field->indicator(2),
636 @converted_subfields)
641 SetMarcUnicodeFlag($marc_record, $marc_flavour);
642 Text::Iconv->raise_error($prev_raise_error);
644 return @errors;
647 =head2 _marc_to_utf8_replacement_char
649 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
651 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
652 unsatisfactory method of replacing all non-ASCII (e.g.,
653 where the eight bit is set) octet with the Unicode
654 replacement character. This is meant as a last-ditch
655 method, and would be best used as part of a UI that
656 lets a cataloguer pick various character conversions
657 until he or she finds the right one.
659 =cut
661 sub _marc_to_utf8_replacement_char {
662 my $marc_record = shift;
663 my $marc_flavour = shift;
665 foreach my $field ($marc_record->fields()) {
666 if ($field->is_control_field()) {
667 ; # do nothing -- control fields should not contain non-ASCII characters
668 } else {
669 my @converted_subfields;
670 foreach my $subfield ($field->subfields()) {
671 my $value = $subfield->[1];
672 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
673 push @converted_subfields, $subfield->[0], $value;
676 $field->replace_with(MARC::Field->new(
677 $field->tag(), $field->indicator(1), $field->indicator(2),
678 @converted_subfields)
683 SetMarcUnicodeFlag($marc_record, $marc_flavour);
686 =head2 char_decode5426
688 my $utf8string = char_decode5426($iso_5426_string);
690 Converts a string from ISO-5426 to UTF-8.
692 =cut
695 my %chars;
696 $chars{0xb0}=0x0101;#3/0ayn[ain]
697 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
698 #$chars{0xb2}=0x00e0;#'à';
699 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
700 #$chars{0xb3}=0x00e7;#'ç';
701 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
702 # $chars{0xb4}='è';
703 $chars{0xb4}=0x00e8;
704 $chars{0xbd}=0x02b9;
705 $chars{0xbe}=0x02ba;
706 # $chars{0xb5}='é';
707 $chars{0xb5}=0x00e9;
708 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
709 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
710 $chars{0xfa}=0x0153; #oe
711 $chars{0xea}=0x0152; #oe
712 $chars{0x81d1}=0x00b0;
714 ####
715 ## combined characters iso5426
717 $chars{0xc041}=0x1ea2; # capital a with hook above
718 $chars{0xc045}=0x1eba; # capital e with hook above
719 $chars{0xc049}=0x1ec8; # capital i with hook above
720 $chars{0xc04f}=0x1ece; # capital o with hook above
721 $chars{0xc055}=0x1ee6; # capital u with hook above
722 $chars{0xc059}=0x1ef6; # capital y with hook above
723 $chars{0xc061}=0x1ea3; # small a with hook above
724 $chars{0xc065}=0x1ebb; # small e with hook above
725 $chars{0xc069}=0x1ec9; # small i with hook above
726 $chars{0xc06f}=0x1ecf; # small o with hook above
727 $chars{0xc075}=0x1ee7; # small u with hook above
728 $chars{0xc079}=0x1ef7; # small y with hook above
730 # 4/1 grave accent
731 $chars{0xc141}=0x00c0; # capital a with grave accent
732 $chars{0xc145}=0x00c8; # capital e with grave accent
733 $chars{0xc149}=0x00cc; # capital i with grave accent
734 $chars{0xc14f}=0x00d2; # capital o with grave accent
735 $chars{0xc155}=0x00d9; # capital u with grave accent
736 $chars{0xc157}=0x1e80; # capital w with grave
737 $chars{0xc159}=0x1ef2; # capital y with grave
738 $chars{0xc161}=0x00e0; # small a with grave accent
739 $chars{0xc165}=0x00e8; # small e with grave accent
740 $chars{0xc169}=0x00ec; # small i with grave accent
741 $chars{0xc16f}=0x00f2; # small o with grave accent
742 $chars{0xc175}=0x00f9; # small u with grave accent
743 $chars{0xc177}=0x1e81; # small w with grave
744 $chars{0xc179}=0x1ef3; # small y with grave
745 # 4/2 acute accent
746 $chars{0xc241}=0x00c1; # capital a with acute accent
747 $chars{0xc243}=0x0106; # capital c with acute accent
748 $chars{0xc245}=0x00c9; # capital e with acute accent
749 $chars{0xc247}=0x01f4; # capital g with acute
750 $chars{0xc249}=0x00cd; # capital i with acute accent
751 $chars{0xc24b}=0x1e30; # capital k with acute
752 $chars{0xc24c}=0x0139; # capital l with acute accent
753 $chars{0xc24d}=0x1e3e; # capital m with acute
754 $chars{0xc24e}=0x0143; # capital n with acute accent
755 $chars{0xc24f}=0x00d3; # capital o with acute accent
756 $chars{0xc250}=0x1e54; # capital p with acute
757 $chars{0xc252}=0x0154; # capital r with acute accent
758 $chars{0xc253}=0x015a; # capital s with acute accent
759 $chars{0xc255}=0x00da; # capital u with acute accent
760 $chars{0xc257}=0x1e82; # capital w with acute
761 $chars{0xc259}=0x00dd; # capital y with acute accent
762 $chars{0xc25a}=0x0179; # capital z with acute accent
763 $chars{0xc261}=0x00e1; # small a with acute accent
764 $chars{0xc263}=0x0107; # small c with acute accent
765 $chars{0xc265}=0x00e9; # small e with acute accent
766 $chars{0xc267}=0x01f5; # small g with acute
767 $chars{0xc269}=0x00ed; # small i with acute accent
768 $chars{0xc26b}=0x1e31; # small k with acute
769 $chars{0xc26c}=0x013a; # small l with acute accent
770 $chars{0xc26d}=0x1e3f; # small m with acute
771 $chars{0xc26e}=0x0144; # small n with acute accent
772 $chars{0xc26f}=0x00f3; # small o with acute accent
773 $chars{0xc270}=0x1e55; # small p with acute
774 $chars{0xc272}=0x0155; # small r with acute accent
775 $chars{0xc273}=0x015b; # small s with acute accent
776 $chars{0xc275}=0x00fa; # small u with acute accent
777 $chars{0xc277}=0x1e83; # small w with acute
778 $chars{0xc279}=0x00fd; # small y with acute accent
779 $chars{0xc27a}=0x017a; # small z with acute accent
780 $chars{0xc2e1}=0x01fc; # capital ae with acute
781 $chars{0xc2f1}=0x01fd; # small ae with acute
782 # 4/3 circumflex accent
783 $chars{0xc341}=0x00c2; # capital a with circumflex accent
784 $chars{0xc343}=0x0108; # capital c with circumflex
785 $chars{0xc345}=0x00ca; # capital e with circumflex accent
786 $chars{0xc347}=0x011c; # capital g with circumflex
787 $chars{0xc348}=0x0124; # capital h with circumflex
788 $chars{0xc349}=0x00ce; # capital i with circumflex accent
789 $chars{0xc34a}=0x0134; # capital j with circumflex
790 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
791 $chars{0xc353}=0x015c; # capital s with circumflex
792 $chars{0xc355}=0x00db; # capital u with circumflex
793 $chars{0xc357}=0x0174; # capital w with circumflex
794 $chars{0xc359}=0x0176; # capital y with circumflex
795 $chars{0xc35a}=0x1e90; # capital z with circumflex
796 $chars{0xc361}=0x00e2; # small a with circumflex accent
797 $chars{0xc363}=0x0109; # small c with circumflex
798 $chars{0xc365}=0x00ea; # small e with circumflex accent
799 $chars{0xc367}=0x011d; # small g with circumflex
800 $chars{0xc368}=0x0125; # small h with circumflex
801 $chars{0xc369}=0x00ee; # small i with circumflex accent
802 $chars{0xc36a}=0x0135; # small j with circumflex
803 $chars{0xc36e}=0x00f1; # small n with tilde
804 $chars{0xc36f}=0x00f4; # small o with circumflex accent
805 $chars{0xc373}=0x015d; # small s with circumflex
806 $chars{0xc375}=0x00fb; # small u with circumflex
807 $chars{0xc377}=0x0175; # small w with circumflex
808 $chars{0xc379}=0x0177; # small y with circumflex
809 $chars{0xc37a}=0x1e91; # small z with circumflex
810 # 4/4 tilde
811 $chars{0xc441}=0x00c3; # capital a with tilde
812 $chars{0xc445}=0x1ebc; # capital e with tilde
813 $chars{0xc449}=0x0128; # capital i with tilde
814 $chars{0xc44e}=0x00d1; # capital n with tilde
815 $chars{0xc44f}=0x00d5; # capital o with tilde
816 $chars{0xc455}=0x0168; # capital u with tilde
817 $chars{0xc456}=0x1e7c; # capital v with tilde
818 $chars{0xc459}=0x1ef8; # capital y with tilde
819 $chars{0xc461}=0x00e3; # small a with tilde
820 $chars{0xc465}=0x1ebd; # small e with tilde
821 $chars{0xc469}=0x0129; # small i with tilde
822 $chars{0xc46e}=0x00f1; # small n with tilde
823 $chars{0xc46f}=0x00f5; # small o with tilde
824 $chars{0xc475}=0x0169; # small u with tilde
825 $chars{0xc476}=0x1e7d; # small v with tilde
826 $chars{0xc479}=0x1ef9; # small y with tilde
827 # 4/5 macron
828 $chars{0xc541}=0x0100; # capital a with macron
829 $chars{0xc545}=0x0112; # capital e with macron
830 $chars{0xc547}=0x1e20; # capital g with macron
831 $chars{0xc549}=0x012a; # capital i with macron
832 $chars{0xc54f}=0x014c; # capital o with macron
833 $chars{0xc555}=0x016a; # capital u with macron
834 $chars{0xc561}=0x0101; # small a with macron
835 $chars{0xc565}=0x0113; # small e with macron
836 $chars{0xc567}=0x1e21; # small g with macron
837 $chars{0xc569}=0x012b; # small i with macron
838 $chars{0xc56f}=0x014d; # small o with macron
839 $chars{0xc575}=0x016b; # small u with macron
840 $chars{0xc572}=0x0159; # small r with macron
841 $chars{0xc5e1}=0x01e2; # capital ae with macron
842 $chars{0xc5f1}=0x01e3; # small ae with macron
843 # 4/6 breve
844 $chars{0xc641}=0x0102; # capital a with breve
845 $chars{0xc645}=0x0114; # capital e with breve
846 $chars{0xc647}=0x011e; # capital g with breve
847 $chars{0xc649}=0x012c; # capital i with breve
848 $chars{0xc64f}=0x014e; # capital o with breve
849 $chars{0xc655}=0x016c; # capital u with breve
850 $chars{0xc661}=0x0103; # small a with breve
851 $chars{0xc665}=0x0115; # small e with breve
852 $chars{0xc667}=0x011f; # small g with breve
853 $chars{0xc669}=0x012d; # small i with breve
854 $chars{0xc66f}=0x014f; # small o with breve
855 $chars{0xc675}=0x016d; # small u with breve
856 # 4/7 dot above
857 $chars{0xc7b0}=0x01e1; # Ain with dot above
858 $chars{0xc742}=0x1e02; # capital b with dot above
859 $chars{0xc743}=0x010a; # capital c with dot above
860 $chars{0xc744}=0x1e0a; # capital d with dot above
861 $chars{0xc745}=0x0116; # capital e with dot above
862 $chars{0xc746}=0x1e1e; # capital f with dot above
863 $chars{0xc747}=0x0120; # capital g with dot above
864 $chars{0xc748}=0x1e22; # capital h with dot above
865 $chars{0xc749}=0x0130; # capital i with dot above
866 $chars{0xc74d}=0x1e40; # capital m with dot above
867 $chars{0xc74e}=0x1e44; # capital n with dot above
868 $chars{0xc750}=0x1e56; # capital p with dot above
869 $chars{0xc752}=0x1e58; # capital r with dot above
870 $chars{0xc753}=0x1e60; # capital s with dot above
871 $chars{0xc754}=0x1e6a; # capital t with dot above
872 $chars{0xc757}=0x1e86; # capital w with dot above
873 $chars{0xc758}=0x1e8a; # capital x with dot above
874 $chars{0xc759}=0x1e8e; # capital y with dot above
875 $chars{0xc75a}=0x017b; # capital z with dot above
876 $chars{0xc761}=0x0227; # small b with dot above
877 $chars{0xc762}=0x1e03; # small b with dot above
878 $chars{0xc763}=0x010b; # small c with dot above
879 $chars{0xc764}=0x1e0b; # small d with dot above
880 $chars{0xc765}=0x0117; # small e with dot above
881 $chars{0xc766}=0x1e1f; # small f with dot above
882 $chars{0xc767}=0x0121; # small g with dot above
883 $chars{0xc768}=0x1e23; # small h with dot above
884 $chars{0xc76d}=0x1e41; # small m with dot above
885 $chars{0xc76e}=0x1e45; # small n with dot above
886 $chars{0xc770}=0x1e57; # small p with dot above
887 $chars{0xc772}=0x1e59; # small r with dot above
888 $chars{0xc773}=0x1e61; # small s with dot above
889 $chars{0xc774}=0x1e6b; # small t with dot above
890 $chars{0xc777}=0x1e87; # small w with dot above
891 $chars{0xc778}=0x1e8b; # small x with dot above
892 $chars{0xc779}=0x1e8f; # small y with dot above
893 $chars{0xc77a}=0x017c; # small z with dot above
894 # 4/8 trema, diaresis
895 $chars{0xc820}=0x00a8; # diaeresis
896 $chars{0xc841}=0x00c4; # capital a with diaeresis
897 $chars{0xc845}=0x00cb; # capital e with diaeresis
898 $chars{0xc848}=0x1e26; # capital h with diaeresis
899 $chars{0xc849}=0x00cf; # capital i with diaeresis
900 $chars{0xc84f}=0x00d6; # capital o with diaeresis
901 $chars{0xc855}=0x00dc; # capital u with diaeresis
902 $chars{0xc857}=0x1e84; # capital w with diaeresis
903 $chars{0xc858}=0x1e8c; # capital x with diaeresis
904 $chars{0xc859}=0x0178; # capital y with diaeresis
905 $chars{0xc861}=0x00e4; # small a with diaeresis
906 $chars{0xc865}=0x00eb; # small e with diaeresis
907 $chars{0xc868}=0x1e27; # small h with diaeresis
908 $chars{0xc869}=0x00ef; # small i with diaeresis
909 $chars{0xc86f}=0x00f6; # small o with diaeresis
910 $chars{0xc874}=0x1e97; # small t with diaeresis
911 $chars{0xc875}=0x00fc; # small u with diaeresis
912 $chars{0xc877}=0x1e85; # small w with diaeresis
913 $chars{0xc878}=0x1e8d; # small x with diaeresis
914 $chars{0xc879}=0x00ff; # small y with diaeresis
915 # 4/9 umlaut
916 $chars{0xc920}=0x00a8; # [diaeresis]
917 $chars{0xc961}=0x00e4; # a with umlaut
918 $chars{0xc965}=0x00eb; # e with umlaut
919 $chars{0xc969}=0x00ef; # i with umlaut
920 $chars{0xc96f}=0x00f6; # o with umlaut
921 $chars{0xc975}=0x00fc; # u with umlaut
922 # 4/10 circle above
923 $chars{0xca41}=0x00c5; # capital a with ring above
924 $chars{0xcaad}=0x016e; # capital u with ring above
925 $chars{0xca61}=0x00e5; # small a with ring above
926 $chars{0xca75}=0x016f; # small u with ring above
927 $chars{0xca77}=0x1e98; # small w with ring above
928 $chars{0xca79}=0x1e99; # small y with ring above
929 # 4/11 high comma off centre
930 # 4/12 inverted high comma centred
931 # 4/13 double acute accent
932 $chars{0xcd4f}=0x0150; # capital o with double acute
933 $chars{0xcd55}=0x0170; # capital u with double acute
934 $chars{0xcd6f}=0x0151; # small o with double acute
935 $chars{0xcd75}=0x0171; # small u with double acute
936 # 4/14 horn
937 $chars{0xce54}=0x01a0; # latin capital letter o with horn
938 $chars{0xce55}=0x01af; # latin capital letter u with horn
939 $chars{0xce74}=0x01a1; # latin small letter o with horn
940 $chars{0xce75}=0x01b0; # latin small letter u with horn
941 # 4/15 caron (hacek
942 $chars{0xcf41}=0x01cd; # capital a with caron
943 $chars{0xcf43}=0x010c; # capital c with caron
944 $chars{0xcf44}=0x010e; # capital d with caron
945 $chars{0xcf45}=0x011a; # capital e with caron
946 $chars{0xcf47}=0x01e6; # capital g with caron
947 $chars{0xcf49}=0x01cf; # capital i with caron
948 $chars{0xcf4b}=0x01e8; # capital k with caron
949 $chars{0xcf4c}=0x013d; # capital l with caron
950 $chars{0xcf4e}=0x0147; # capital n with caron
951 $chars{0xcf4f}=0x01d1; # capital o with caron
952 $chars{0xcf52}=0x0158; # capital r with caron
953 $chars{0xcf53}=0x0160; # capital s with caron
954 $chars{0xcf54}=0x0164; # capital t with caron
955 $chars{0xcf55}=0x01d3; # capital u with caron
956 $chars{0xcf5a}=0x017d; # capital z with caron
957 $chars{0xcf61}=0x01ce; # small a with caron
958 $chars{0xcf63}=0x010d; # small c with caron
959 $chars{0xcf64}=0x010f; # small d with caron
960 $chars{0xcf65}=0x011b; # small e with caron
961 $chars{0xcf67}=0x01e7; # small g with caron
962 $chars{0xcf69}=0x01d0; # small i with caron
963 $chars{0xcf6a}=0x01f0; # small j with caron
964 $chars{0xcf6b}=0x01e9; # small k with caron
965 $chars{0xcf6c}=0x013e; # small l with caron
966 $chars{0xcf6e}=0x0148; # small n with caron
967 $chars{0xcf6f}=0x01d2; # small o with caron
968 $chars{0xcf72}=0x0159; # small r with caron
969 $chars{0xcf73}=0x0161; # small s with caron
970 $chars{0xcf74}=0x0165; # small t with caron
971 $chars{0xcf75}=0x01d4; # small u with caron
972 $chars{0xcf7a}=0x017e; # small z with caron
973 # 5/0 cedilla
974 $chars{0xd020}=0x00b8; # cedilla
975 $chars{0xd043}=0x00c7; # capital c with cedilla
976 $chars{0xd044}=0x1e10; # capital d with cedilla
977 $chars{0xd047}=0x0122; # capital g with cedilla
978 $chars{0xd048}=0x1e28; # capital h with cedilla
979 $chars{0xd04b}=0x0136; # capital k with cedilla
980 $chars{0xd04c}=0x013b; # capital l with cedilla
981 $chars{0xd04e}=0x0145; # capital n with cedilla
982 $chars{0xd052}=0x0156; # capital r with cedilla
983 $chars{0xd053}=0x015e; # capital s with cedilla
984 $chars{0xd054}=0x0162; # capital t with cedilla
985 $chars{0xd063}=0x00e7; # small c with cedilla
986 $chars{0xd064}=0x1e11; # small d with cedilla
987 $chars{0xd065}=0x0119; # small e with cedilla
988 $chars{0xd067}=0x0123; # small g with cedilla
989 $chars{0xd068}=0x1e29; # small h with cedilla
990 $chars{0xd06b}=0x0137; # small k with cedilla
991 $chars{0xd06c}=0x013c; # small l with cedilla
992 $chars{0xd06e}=0x0146; # small n with cedilla
993 $chars{0xd072}=0x0157; # small r with cedilla
994 $chars{0xd073}=0x015f; # small s with cedilla
995 $chars{0xd074}=0x0163; # small t with cedilla
996 # 5/1 rude
997 # 5/2 hook to left
998 # 5/3 ogonek (hook to right
999 $chars{0xd320}=0x02db; # ogonek
1000 $chars{0xd341}=0x0104; # capital a with ogonek
1001 $chars{0xd345}=0x0118; # capital e with ogonek
1002 $chars{0xd349}=0x012e; # capital i with ogonek
1003 $chars{0xd34f}=0x01ea; # capital o with ogonek
1004 $chars{0xd355}=0x0172; # capital u with ogonek
1005 $chars{0xd361}=0x0105; # small a with ogonek
1006 $chars{0xd365}=0x0119; # small e with ogonek
1007 $chars{0xd369}=0x012f; # small i with ogonek
1008 $chars{0xd36f}=0x01eb; # small o with ogonek
1009 $chars{0xd375}=0x0173; # small u with ogonek
1010 # 5/4 circle below
1011 $chars{0xd441}=0x1e00; # capital a with ring below
1012 $chars{0xd461}=0x1e01; # small a with ring below
1013 # 5/5 half circle below
1014 $chars{0xf948}=0x1e2a; # capital h with breve below
1015 $chars{0xf968}=0x1e2b; # small h with breve below
1016 # 5/6 dot below
1017 $chars{0xd641}=0x1ea0; # capital a with dot below
1018 $chars{0xd642}=0x1e04; # capital b with dot below
1019 $chars{0xd644}=0x1e0c; # capital d with dot below
1020 $chars{0xd645}=0x1eb8; # capital e with dot below
1021 $chars{0xd648}=0x1e24; # capital h with dot below
1022 $chars{0xd649}=0x1eca; # capital i with dot below
1023 $chars{0xd64b}=0x1e32; # capital k with dot below
1024 $chars{0xd64c}=0x1e36; # capital l with dot below
1025 $chars{0xd64d}=0x1e42; # capital m with dot below
1026 $chars{0xd64e}=0x1e46; # capital n with dot below
1027 $chars{0xd64f}=0x1ecc; # capital o with dot below
1028 $chars{0xd652}=0x1e5a; # capital r with dot below
1029 $chars{0xd653}=0x1e62; # capital s with dot below
1030 $chars{0xd654}=0x1e6c; # capital t with dot below
1031 $chars{0xd655}=0x1ee4; # capital u with dot below
1032 $chars{0xd656}=0x1e7e; # capital v with dot below
1033 $chars{0xd657}=0x1e88; # capital w with dot below
1034 $chars{0xd659}=0x1ef4; # capital y with dot below
1035 $chars{0xd65a}=0x1e92; # capital z with dot below
1036 $chars{0xd661}=0x1ea1; # small a with dot below
1037 $chars{0xd662}=0x1e05; # small b with dot below
1038 $chars{0xd664}=0x1e0d; # small d with dot below
1039 $chars{0xd665}=0x1eb9; # small e with dot below
1040 $chars{0xd668}=0x1e25; # small h with dot below
1041 $chars{0xd669}=0x1ecb; # small i with dot below
1042 $chars{0xd66b}=0x1e33; # small k with dot below
1043 $chars{0xd66c}=0x1e37; # small l with dot below
1044 $chars{0xd66d}=0x1e43; # small m with dot below
1045 $chars{0xd66e}=0x1e47; # small n with dot below
1046 $chars{0xd66f}=0x1ecd; # small o with dot below
1047 $chars{0xd672}=0x1e5b; # small r with dot below
1048 $chars{0xd673}=0x1e63; # small s with dot below
1049 $chars{0xd674}=0x1e6d; # small t with dot below
1050 $chars{0xd675}=0x1ee5; # small u with dot below
1051 $chars{0xd676}=0x1e7f; # small v with dot below
1052 $chars{0xd677}=0x1e89; # small w with dot below
1053 $chars{0xd679}=0x1ef5; # small y with dot below
1054 $chars{0xd67a}=0x1e93; # small z with dot below
1055 # 5/7 double dot below
1056 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1057 $chars{0xd775}=0x1e73; # small u with diaeresis below
1058 # 5/8 underline
1059 $chars{0xd820}=0x005f; # underline
1060 # 5/9 double underline
1061 $chars{0xd920}=0x2017; # double underline
1062 # 5/10 small low vertical bar
1063 $chars{0xda20}=0x02cc; #
1064 # 5/11 circumflex below
1065 # 5/12 (this position shall not be used)
1066 # 5/13 left half of ligature sign and of double tilde
1067 # 5/14 right half of ligature sign
1068 # 5/15 right half of double tilde
1069 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1071 sub char_decode5426 {
1072 my ( $string) = @_;
1073 my $result;
1075 my @data = unpack("C*", $string);
1076 my @characters;
1077 my $length=scalar(@data);
1078 for (my $i = 0; $i < scalar(@data); $i++) {
1079 my $char= $data[$i];
1080 if ($char >= 0x00 && $char <= 0x7F){
1081 #IsAscii
1083 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1084 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1085 #Combined Char
1086 my $convchar ;
1087 if ($chars{$char*256+$data[$i+1]}) {
1088 $convchar= $chars{$char * 256 + $data[$i+1]};
1089 $i++;
1090 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1091 } elsif ($chars{$char}) {
1092 $convchar= $chars{$char};
1093 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1094 }else {
1095 $convchar=$char;
1097 push @characters,$convchar;
1098 } else {
1099 my $convchar;
1100 if ($chars{$char}) {
1101 $convchar= $chars{$char};
1102 # printf "char %x, converted %x\n",$char,$chars{$char};
1103 }else {
1104 # printf "char %x $char\n",$char;
1105 $convchar=$char;
1107 push @characters,$convchar;
1110 $result=pack "U*",@characters;
1111 # $result=~s/\x01//;
1112 # $result=~s/\x00//;
1113 $result=~s/\x0f//;
1114 $result=~s/\x1b.//;
1115 $result=~s/\x0e//;
1116 $result=~s/\x1b\x5b//;
1117 # map{printf "%x",$_} @characters;
1118 # printf "\n";
1119 return $result;
1125 =head1 AUTHOR
1127 Koha Development Team <http://koha-community.org/>
1129 Galen Charlton <galen.charlton@liblime.com>
1131 =cut