2 # Copyright (C) 2001-2007, The Perl Foundation.
7 use lib qw( . lib ../lib ../../lib );
9 use Parrot::Test tests => 50;
14 t/op/string_cs.t - String Charset Tests
18 % prove t/op/string_cs.t
22 Tests charset support.
26 pasm_output_is( <<'CODE', <<OUTPUT, "basic syntax" );
27 set S0, ascii:"ok 1\n"
29 set S0, binary:"ok 2\n"
31 set S0, iso-8859-1:"ok 3\n"
40 pasm_output_is( <<'CODE', <<OUTPUT, "charset name" );
51 pasm_output_is( <<'CODE', <<OUTPUT, "find_charset" );
52 find_charset I0, "iso-8859-1"
54 find_charset I0, "ascii"
56 find_charset I0, "binary"
65 pasm_error_output_like( <<'CODE', <<OUTPUT, "find_charset - not existing" );
66 find_charset I0, "no_such"
69 /charset 'no_such' not found/
72 pasm_output_is( <<'CODE', <<OUTPUT, "downcase" );
73 set S0, iso-8859-1:"AEIOU_ÄÖÜ\n"
81 pasm_output_is( <<'CODE', <<OUTPUT, "upcase" );
82 set S0, iso-8859-1:"aeiou_äöüß\n"
90 pasm_output_is( <<'CODE', <<OUTPUT, "titlecase" );
91 set S0, iso-8859-1:"zAEIOU_ÄÖÜ\n"
99 pasm_output_is( <<'CODE', <<OUTPUT, "is_whitespace" );
100 set S0, iso-8859-1:"a\t\n \xa0" # is 0xa0 a whitespace in iso-8859-1??
101 .include "cclass.pasm"
102 is_cclass I0, .CCLASS_WHITESPACE, S0, 0
103 is_cclass I1, .CCLASS_WHITESPACE, S0, 1
104 is_cclass I2, .CCLASS_WHITESPACE, S0, 2
105 is_cclass I3, .CCLASS_WHITESPACE, S0, 3
107 is_cclass I4, .CCLASS_WHITESPACE, S0, I4
114 set S0, ascii:"a\t\n "
115 is_cclass I0, .CCLASS_WHITESPACE, S0, 0
116 is_cclass I1, .CCLASS_WHITESPACE, S0, 1
117 is_cclass I2, .CCLASS_WHITESPACE, S0, 2
118 is_cclass I3, .CCLASS_WHITESPACE, S0, 3
119 is_cclass I4, .CCLASS_WHITESPACE, S0, 4 # access past string boundary: not a whitespace
132 pasm_output_is( <<'CODE', <<OUTPUT, "is_wordchar" );
133 .include "cclass.pasm"
138 is_cclass I0, .CCLASS_WORD, S0, I2
148 pasm_output_is( <<'CODE', <<OUTPUT, "is_digit" );
149 .include "cclass.pasm"
154 is_cclass I0, .CCLASS_NUMERIC, S0, I2
164 pasm_output_is( <<'CODE', <<OUTPUT, "is_punctuation" );
165 .include "cclass.pasm"
170 is_cclass I0, .CCLASS_PUNCTUATION, S0, I2
180 pasm_output_is( <<'CODE', <<OUTPUT, "is_newline" );
181 .include "cclass.pasm"
183 is_cclass I0, .CCLASS_NEWLINE, S0, 0
185 is_cclass I0, .CCLASS_NEWLINE, S0, 1
193 pasm_output_is( <<'CODE', <<OUTPUT, "find_wordchar" );
194 .include "cclass.pasm"
199 find_cclass I0, .CCLASS_WORD, S0, I0, I1
212 pasm_output_is( <<'CODE', <<OUTPUT, "find_digit" );
213 .include "cclass.pasm"
218 find_cclass I0, .CCLASS_NUMERIC, S0, I0, I1
231 pasm_output_is( <<'CODE', <<OUTPUT, "find_punctuation" );
232 .include "cclass.pasm"
237 find_cclass I0, .CCLASS_PUNCTUATION, S0, I0, I1
250 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i" );
252 find_charset I0, "iso-8859-1"
253 trans_charset S1, S0, I0
266 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i" );
268 find_charset I0, "iso-8859-1"
282 pasm_error_output_like( <<'CODE', <<OUTPUT, "trans_charset_s_i - lossy" );
283 set S1, iso-8859-1:"abcä"
284 find_charset I0, "ascii"
289 /lossy conversion to ascii/
292 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i - same" );
294 find_charset I0, "ascii"
308 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to binary" );
309 set S0, iso-8859-1:"abc"
310 find_charset I0, "binary"
311 trans_charset S1, S0, I0
324 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i iso-8859-1 to binary" );
325 set S1, iso-8859-1:"abc"
326 find_charset I0, "binary"
340 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i ascii to binary" );
342 find_charset I0, "binary"
343 trans_charset S1, S0, I0
356 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i ascii to binary" );
358 find_charset I0, "binary"
372 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i ascii to iso-8859-1" );
374 find_charset I0, "iso-8859-1"
375 trans_charset S1, S0, I0
388 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i ascii to iso-8859-1" );
390 find_charset I0, "iso-8859-1"
404 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to unicode" );
405 set S0, iso-8859-1:"abc_ä_"
406 find_charset I0, "unicode"
407 trans_charset S1, S0, I0
424 pasm_output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i unicode to iso-8859-1" );
425 set S0, unicode:"abc_\xe4_"
426 bytelength I2, S0 # XXX it is 7 for utf8 only
429 find_charset I0, "iso-8859-1"
430 trans_charset S1, S0, I0
448 pir_output_is( <<'CODE', <<'OUTPUT', "bug #34661 literal" );
450 $S0 = unicode:"\"]\nif I3 == "
457 pir_output_is( <<'CODE', <<'OUTPUT', "todo #34660 hash" );
461 set_global ['Foo'], unicode:"Bar", $P0
463 $P1 = get_global ['Foo'], "Bar"
474 pir_output_is( <<'CODE', <<'OUTPUT', "concat ascii, utf8" );
476 .local string s, t, u
# ICU guard: when the build configuration reports no ICU support
# ($PConfig{has_icu} is false), call skip() with the reason 'no ICU lib'
# and a count of 16.
# NOTE(review): presumably this sits inside a SKIP block wrapping the
# ICU-dependent unicode/utf16 tests below -- confirm that the count of 16
# still matches the number of tests in that block.
492 skip( 'no ICU lib', 16 ) unless $PConfig{has_icu};
493 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase" );
494 set S0, iso-8859-1:"TÖTSCH"
495 find_charset I0, "unicode"
496 trans_charset S1, S0, I0
498 getstdout P0 # need to convert back to utf8
499 push P0, "utf8" # push utf8 output layer
507 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase, trans_charset_s_i" );
508 set S0, iso-8859-1:"TÖTSCH"
509 find_charset I0, "unicode"
510 trans_charset S1, S0, I0
512 find_charset I0, "iso-8859-1"
520 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode downcase - transcharset" );
521 set S0, iso-8859-1:"TÖTSCH"
522 find_charset I0, "unicode"
523 trans_charset S1, S0, I0
525 find_encoding I0, "utf8"
526 trans_encoding S2, S1, I0
534 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 ord, length" );
535 set S0, iso-8859-1:"TÖTSCH"
536 find_charset I0, "unicode"
537 trans_charset S1, S0, I0
538 find_encoding I0, "utf16"
539 trans_encoding S1, S1, I0
557 pasm_output_is( <<'CODE', <<"OUTPUT", "chopn utf8" );
558 set S0, iso-8859-1:"TTÖÖ"
559 find_charset I0, "unicode"
560 trans_charset S1, S0, I0
567 .include "stringinfo.pasm"
568 stringinfo I0, S1, .STRINGINFO_BUFUSED
576 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 append" );
577 set S0, iso-8859-1:"Tötsch"
578 find_charset I0, "unicode"
579 trans_charset S1, S0, I0
580 find_encoding I0, "utf16"
581 trans_encoding S1, S1, I0
586 .include "stringinfo.pasm"
587 stringinfo I0, S1, .STRINGINFO_BUFUSED
590 find_encoding I0, "utf8"
591 trans_encoding S2, S1, I0
600 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 concat" );
601 set S0, iso-8859-1:"Tötsch"
602 find_charset I0, "unicode"
603 trans_charset S1, S0, I0
604 find_encoding I0, "utf16"
605 trans_encoding S1, S1, I0
606 concat S2, S1, " Leo"
610 .include "stringinfo.pasm"
611 stringinfo I0, S2, .STRINGINFO_BUFUSED
614 find_encoding I0, "utf8"
615 trans_encoding S2, S2, I0
624 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 substr" );
625 set S0, iso-8859-1:"Tötsch"
626 find_charset I0, "unicode"
627 trans_charset S1, S0, I0
628 find_encoding I0, "utf16"
629 trans_encoding S1, S1, I0
631 find_encoding I0, "utf8"
632 trans_encoding S2, S2, I0
640 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 replace" );
641 set S0, iso-8859-1:"Tötsch"
642 find_charset I0, "unicode"
643 trans_charset S1, S0, I0
644 find_encoding I0, "utf16"
645 trans_encoding S1, S1, I0
646 substr S2, S1, 1, 1, "oe"
647 find_encoding I0, "utf8"
648 trans_encoding S2, S2, I0
649 trans_encoding S1, S1, I0
660 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search" );
661 set S0, iso-8859-1:"TÖTSCH"
662 find_charset I0, "unicode"
663 trans_charset S1, S0, I0
665 set S2, iso-8859-1:"öt"
# The preceding test uses the description "utf16 index, latin1 search" as
# well; identical descriptions make the two indistinguishable in the
# harness output, so this one carries a distinguishing suffix.
674 pasm_output_is( <<'CODE', <<"OUTPUT", "utf16 index, latin1 search, 2" );
675 set S0, iso-8859-1:"TÖTSCH"
676 find_charset I0, "unicode"
677 trans_charset S1, S0, I0
679 set S2, iso-8859-1:"öt"
693 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase" );
694 set S0, iso-8859-1:"tötsch"
695 find_charset I0, "unicode"
696 trans_charset S1, S0, I0
698 getstdout P0 # need to convert back to utf8
699 push P0, "utf8" # push utf8 output layer
707 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char" );
708 set S1, unicode:"hacek j \u01f0"
710 getstdout P0 # need to convert back to utf8
711 push P0, "utf8" # push utf8 output layer
721 # 106 dest_len = u_strToUpper(src->strstart, dest_len,
726 # (gdb) x /8h src->strstart
727 # 0x844fb60: 0x005f 0x005f 0x005f 0x01f0 0x0031 0x0032 0x0033 0x0000
729 # 110 src->bufused = dest_len * sizeof(UChar);
732 # (gdb) x /8h src->strstart
733 # 0x844fb60: 0x005f 0x005f 0x005f 0x004a 0x030c 0x0031 0x0032 0x0000
735 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode upcase to combined char 3.2 bug?" );
736 set S1, unicode:"___\u01f0123"
738 getstdout P0 # need to convert back to utf8
739 push P0, "utf8" # push utf8 output layer
747 pasm_output_is( <<'CODE', <<"OUTPUT", "unicode titlecase" );
748 set S0, iso-8859-1:"tötsch leo"
749 find_charset I0, "unicode"
750 trans_charset S1, S0, I0
752 getstdout P0 # need to convert back to utf8
753 push P0, "utf8" # push utf8 output layer
758 T\x{c3}\x{b6}tsch Leo
761 pasm_output_is( <<'CODE', <<OUTPUT, "combose combined char" );
762 set S1, unicode:"___\u01f0___"
764 upcase S1 # decompose J+hacek
765 length I1, S1 # 1 longer
766 downcase S1 # j+hacek
769 length I3, S1 # back at original string
770 getstdout P0 # need to convert back to utf8
771 push P0, "utf8" # push utf8 output layer
790 pasm_output_is( <<'CODE', <<'OUTPUT', "escape ascii" );
791 set S0, "abcdefghi\n"
800 pasm_output_is( <<'CODE', <<'OUTPUT', "escape ctrl" );
801 set S0, "\x00\x01\x1f\x7f"
807 \x{0}\x{1}\x{1f}\x{7f}
810 pasm_output_is( <<'CODE', <<'OUTPUT', "escape latin1" );
811 set S0, iso-8859-1:"tötsch leo"
820 pasm_output_is( <<'CODE', <<'OUTPUT', "escape unicode" );
821 set S0, unicode:"\u2001\u2002\u2003\u2004\x{e01ef}\u0114"
827 \u2001\u2002\u2003\u2004\x{e01ef}\u0114
832 # cperl-indent-level: 4
835 # vim: expandtab shiftwidth=4: