2 # Copyright (C) 2001-2009, Parrot Foundation.
7 use lib qw( . lib ../lib ../../lib );
9 use Parrot::Test tests => 32;
14 t/op/stringu.t - Unicode String Test
18 % prove t/op/stringu.t
22 Tests Parrot's Unicode string system.
26 pir_output_is( <<'CODE', <<OUTPUT, "angstrom" );
29 $P0.'encoding'("utf8")
39 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );
42 $P0.'encoding'("utf8")
43 set $S0, unicode:"\x{212b}"
52 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );
55 $P0.'encoding'("utf8")
56 set $S0, unicode:"aaaaaa\x{212b}"
65 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );
68 $P0.'encoding'("utf8")
69 set $S0, unicode:"aaaaaa\x{212b}-aaaaaa"
75 aaaaaa\xe2\x84\xab-aaaaaa
78 pir_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );
81 $P0.'encoding'("utf8")
82 set $S0, unicode:"aaaaaa\u212b-aaaaaa"
88 aaaaaa\xe2\x84\xab-aaaaaa
91 pir_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );
94 $P0.'encoding'("utf8")
95 set $S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
101 aaaaaa\xf0\x9d\x90\x80-aaaaaa
104 pir_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );
107 $P0.'encoding'("utf8")
108 set $S0, unicode:"aaaaaa\U0001d400-aaaaaa"
114 aaaaaa\xf0\x9d\x90\x80-aaaaaa
117 pir_output_is( <<'CODE', <<OUTPUT, "two upscales" );
120 $P0.'encoding'("utf8")
121 set $S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
130 aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
134 pir_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );
137 $P0.'encoding'("utf8")
138 set $S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
147 aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
151 pir_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );
154 $P0.'encoding'("utf8")
175 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );
180 /Illegal escape sequence in/
183 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );
188 /Illegal escape sequence in/
191 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );
196 /Illegal escape sequence in/
199 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );
204 /Illegal escape sequence in/
207 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
208 set S0, utf8:unicode:"«"
220 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
221 set S0, utf8:unicode:"\xc2\xab"
233 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
234 set S0, utf8:unicode:"\xf2\xab"
242 /Malformed UTF-8 string/
245 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
255 pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );
256 set S0, "AAAAAAAAAA\\u666"
268 skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
269 pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );
291 pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );
304 # escape does not produce utf8, just a raw sequence of chars
305 pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );
309 s = iso-8859-1:"T\xf6tsch"
310 i = find_charset "unicode"
311 s = trans_charset s, i
322 # Tests for .CCLASS_WHITESPACE
323 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );
325 .include 'cclass.pasm'
327 s = unicode:" \t\u207babc\n\u2000\u2009"
329 $I0 = is_cclass .CCLASS_WHITESPACE, s, 0
331 $I0 = is_cclass .CCLASS_WHITESPACE, s, 1
333 $I0 = is_cclass .CCLASS_WHITESPACE, s, 2
335 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
337 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
339 $I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
341 $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
349 # Tests for .CCLASS_ANY
350 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_ANY in unicode" );
352 .include 'cclass.pasm'
354 s = unicode:" \t\u207babc\n\u2000\u2009"
356 $I0 = is_cclass .CCLASS_ANY, s, 0
358 $I0 = is_cclass .CCLASS_ANY, s, 1
360 $I0 = is_cclass .CCLASS_ANY, s, 2
362 $I0 = is_cclass .CCLASS_ANY, s, $I9
364 $I0 = find_not_cclass .CCLASS_ANY, s, 0, $I9
366 $I0 = find_not_cclass .CCLASS_ANY, s, $I0, $I9
368 $I0 = find_cclass .CCLASS_ANY, s, $I0, $I9
370 $I0 = find_cclass .CCLASS_ANY, s, 2, $I9
379 skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
381 # Tests for .CCLASS_NUMERIC
382 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );
384 .include 'cclass.pasm'
386 s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
388 $I0 = is_cclass .CCLASS_NUMERIC, s, 0
390 $I0 = is_cclass .CCLASS_NUMERIC, s, 1
392 $I0 = is_cclass .CCLASS_NUMERIC, s, 2
394 $I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
396 $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
398 $I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
400 $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
408 # Concatenate unicode: with iso-8859-1
410 <<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1" );
414 $S2 = concat $S0, $S1
420 $S2 = concat $S0, $S1
426 $S2 = concat $S0, $S1
437 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys");
439 .local string str0, str1
440 str0 = unicode:"\u00ab"
441 str1 = iso-8859-1:"\xab"
447 $I0 = iseq str0, str1
464 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys, full bucket" );
466 .local string str0, str1
467 str0 = unicode:"infix:\u00b1"
468 str1 = iso-8859-1:"infix:\xb1"
476 unless $I0 < 200 goto fill_done
479 $S0 = concat 'infix:', $S0
484 $I0 = iseq str0, str1
485 #print "iseq str0, str1 => "
491 #print "iseq hash[str0], hash[str1] => "
505 skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
506 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );
511 $I0 = find_encoding 'ucs2'
512 $S0 = trans_encoding $S0, $I0
521 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to float' );
526 $I0 = find_encoding 'ucs2'
527 $S0 = trans_encoding $S0, $I0
536 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings float mixed' );
538 $S0 = unicode:"140 r\x{e9}sum\x{e9}s"
541 $I0 = find_encoding 'ucs2'
542 $S0 = trans_encoding $S0, $I0
552 pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );
558 $S0 = unicode:"\u00e5\u263b"
559 $S3 = concat $S1, $S2
560 if $S0 == $S3 goto equal_1
565 $S0 = unicode:"\u263b\u00e5"
566 $S3 = concat $S2, $S1
567 if $S0 == $S3 goto equal_2
580 # cperl-indent-level: 4
583 # vim: expandtab shiftwidth=4: