t/op/stringu.t

   1 #!perl
   2 # Copyright (C) 2001-2009, Parrot Foundation.
   3 # $Id$
   4
   5 use strict;
   6 use warnings;
   7 use lib qw( . lib ../lib ../../lib );
   8 use Test::More;
   9 use Parrot::Test tests => 32;
  10 use Parrot::Config;
  11
  12 =head1 NAME
  13
  14 t/op/stringu.t - Unicode String Test
  15
  16 =head1 SYNOPSIS
  17
  18         % prove t/op/stringu.t
  19
  20 =head1 DESCRIPTION
  21
  22 Tests Parrot unicode string system.
  23
  24 =cut
  25
  26 pir_output_is( <<'CODE', <<OUTPUT, "angstrom" );
  27 .sub main :main
  28     getstdout $P0
  29     $P0.'encoding'("utf8")
  30     chr $S0, 0x212B
  31     print $P0, $S0
  32     print $P0, "\n"
  33     end
  34 .end
  35 CODE
  36 \xe2\x84\xab
  37 OUTPUT
  38
  39 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );
  40 .sub main :main
  41     getstdout $P0
  42     $P0.'encoding'("utf8")
  43     set $S0, unicode:"\x{212b}"
  44     print $S0
  45     print "\n"
  46     end
  47 .end
  48 CODE
  49 \xe2\x84\xab
  50 OUTPUT
  51
  52 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );
  53 .sub main :main
  54     getstdout $P0
  55     $P0.'encoding'("utf8")
  56     set $S0, unicode:"aaaaaa\x{212b}"
  57     print $S0
  58     print "\n"
  59     end
  60 .end
  61 CODE
  62 aaaaaa\xe2\x84\xab
  63 OUTPUT
  64
  65 pir_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );
  66 .sub main :main
  67     getstdout $P0
  68     $P0.'encoding'("utf8")
  69     set $S0, unicode:"aaaaaa\x{212b}-aaaaaa"
  70     print $S0
  71     print "\n"
  72     end
  73 .end
  74 CODE
  75 aaaaaa\xe2\x84\xab-aaaaaa
  76 OUTPUT
  77
  78 pir_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );
  79 .sub main :main
  80     getstdout $P0
  81     $P0.'encoding'("utf8")
  82     set $S0, unicode:"aaaaaa\u212b-aaaaaa"
  83     print $S0
  84     print "\n"
  85     end
  86 .end
  87 CODE
  88 aaaaaa\xe2\x84\xab-aaaaaa
  89 OUTPUT
  90
  91 pir_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );
  92 .sub main :main
  93     getstdout $P0
  94     $P0.'encoding'("utf8")
  95     set $S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
  96     print $S0
  97     print "\n"
  98     end
  99 .end
 100 CODE
 101 aaaaaa\xf0\x9d\x90\x80-aaaaaa
 102 OUTPUT
 103
 104 pir_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );
 105 .sub main :main
 106     getstdout $P0
 107     $P0.'encoding'("utf8")
 108     set $S0, unicode:"aaaaaa\U0001d400-aaaaaa"
 109     print $S0
 110     print "\n"
 111     end
 112 .end
 113 CODE
 114 aaaaaa\xf0\x9d\x90\x80-aaaaaa
 115 OUTPUT
 116
 117 pir_output_is( <<'CODE', <<OUTPUT, "two upscales" );
 118 .sub main :main
 119     getstdout $P0
 120     $P0.'encoding'("utf8")
 121     set $S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
 122     print $S0
 123     print "\n"
 124     length $I0, $S0
 125     print $I0
 126     print "\n"
 127     end
 128 .end
 129 CODE
 130 aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
 131 22
 132 OUTPUT
 133
 134 pir_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );
 135 .sub main :main
 136     getstdout $P0
 137     $P0.'encoding'("utf8")
 138     set $S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
 139     print $S0
 140     print "\n"
 141     length $I0, $S0
 142     print $I0
 143     print "\n"
 144     end
 145 .end
 146 CODE
 147 aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
 148 22
 149 OUTPUT
 150
 151 pir_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );
 152 .sub main :main
 153     getstdout $P0
 154     $P0.'encoding'("utf8")
 155     set $S0, "ok 1\cJ"
 156     print $S0
 157     set $S0, "ok 2\012"
 158     print $S0
 159     set $S0, "ok 3\12"
 160     print $S0
 161     set $S0, "ok 4\x0a"
 162     print $S0
 163     set $S0, "ok 5\xa"
 164     print $S0
 165     end
 166 .end
 167 CODE
 168 ok 1
 169 ok 2
 170 ok 3
 171 ok 4
 172 ok 5
 173 OUTPUT
 174
 175 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );
 176     set S0, "x\uy"
 177     print "never\n"
 178     end
 179 CODE
 180 /Illegal escape sequence in/
 181 OUTPUT
 182
 183 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );
 184     set S0, "x\u123y"
 185     print "never\n"
 186     end
 187 CODE
 188 /Illegal escape sequence in/
 189 OUTPUT
 190
 191 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );
 192     set S0, "x\U123y"
 193     print "never\n"
 194     end
 195 CODE
 196 /Illegal escape sequence in/
 197 OUTPUT
 198
 199 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );
 200     set S0, "x\xy"
 201     print "never\n"
 202     end
 203 CODE
 204 /Illegal escape sequence in/
 205 OUTPUT
 206
 207 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
 208     set S0, utf8:unicode:"«"
 209     length I0, S0
 210     print I0
 211     print "\n"
 212     print S0
 213     print "\n"
 214     end
 215 CODE
 216 1
 217 \xc2\xab
 218 OUTPUT
 219
 220 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
 221     set S0, utf8:unicode:"\xc2\xab"
 222     length I0, S0
 223     print I0
 224     print "\n"
 225     print S0
 226     print "\n"
 227     end
 228 CODE
 229 1
 230 \xc2\xab
 231 OUTPUT
 232
 233 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
 234     set S0, utf8:unicode:"\xf2\xab"
 235     length I0, S0
 236     print I0
 237     print "\n"
 238     print S0
 239     print "\n"
 240     end
 241 CODE
 242 /Malformed UTF-8 string/
 243 OUTPUT
 244
 245 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
 246     set S0, ascii:"«"
 247     length I0, S0
 248     print I0
 249     print "\n"
 250     end
 251 CODE
 252 /Malformed string/
 253 OUTPUT
 254
 255 pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );
 256     set S0, "AAAAAAAAAA\\u666"
 257     set I0, 0x666
 258     chr S1, I0
 259     substr S0, 10, 5, S1
 260     print S0
 261     print "\n"
 262     end
 263 CODE
 264 AAAAAAAAAA\xd9\xa6
 265 OUTPUT
 266
 267 SKIP: {
 268     skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
 269     pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );
 270 .sub main
 271     .local string str
 272     .local string rest
 273
 274     str = unicode:".xyz"
 275     rest = substr str, 1
 276     print rest
 277     print "\n"
 278
 279     str = unicode:".xyz"
 280     $S99 = downcase str
 281     rest = substr str, 1
 282     print rest
 283     print "\n"
 284
 285 .end
 286 CODE
 287 xyz
 288 xyz
 289 OUTPUT
 290
 291     pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );
 292 .sub main
 293     .local string str
 294     .local string rest
 295     str = unicode:".XYZ"
 296     $S0 = downcase str
 297     print $S0
 298     print "\n"
 299 .end
 300 CODE
 301 .xyz
 302 OUTPUT
 303
 304     # escape does not produce utf8, just a raw sequence of chars
 305     pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );
 306 .sub main
 307     .local string s, t
 308     .local int i
 309     s = iso-8859-1:"T\xf6tsch"
 310     i = find_charset "unicode"
 311     s = trans_charset s, i
 312     t = upcase s
 313     escape t, t
 314     print t
 315     print "\\n"
 316 .end
 317 CODE
 318 T\x{d6}TSCH
 319 OUTPUT
 320 }
 321
 322 # Tests for .CCLASS_WHITESPACE
 323 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );
 324 .sub main
 325     .include 'cclass.pasm'
 326     .local string s
 327     s = unicode:" \t\u207babc\n\u2000\u2009"
 328     $I9 = length s
 329     $I0 = is_cclass .CCLASS_WHITESPACE, s, 0
 330     print $I0
 331     $I0 = is_cclass .CCLASS_WHITESPACE, s, 1
 332     print $I0
 333     $I0 = is_cclass .CCLASS_WHITESPACE, s, 2
 334     print $I0
 335     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
 336     print $I0
 337     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 338     print $I0
 339     $I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 340     print $I0
 341     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 342     print $I0
 343     print "\n"
 344 .end
 345 CODE
 346 1102269
 347 OUTPUT
 348
 349 # Tests for .CCLASS_ANY
 350 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_ANY in unicode" );
 351 .sub main
 352     .include 'cclass.pasm'
 353     .local string s
 354     s = unicode:" \t\u207babc\n\u2000\u2009"
 355     $I9 = length s
 356     $I0 = is_cclass .CCLASS_ANY, s, 0
 357     print $I0
 358     $I0 = is_cclass .CCLASS_ANY, s, 1
 359     print $I0
 360     $I0 = is_cclass .CCLASS_ANY, s, 2
 361     print $I0
 362     $I0 = is_cclass .CCLASS_ANY, s, $I9
 363     print $I0
 364     $I0 = find_not_cclass .CCLASS_ANY, s, 0, $I9
 365     print $I0
 366     $I0 = find_not_cclass .CCLASS_ANY, s, $I0, $I9
 367     print $I0
 368     $I0 = find_cclass .CCLASS_ANY, s, $I0, $I9
 369     print $I0
 370     $I0 = find_cclass .CCLASS_ANY, s, 2, $I9
 371     print $I0
 372     print "\n"
 373 .end
 374 CODE
 375 11109992
 376 OUTPUT
 377
 378 SKIP: {
 379     skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
 380
 381     # Tests for .CCLASS_NUMERIC
 382     pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );
 383 .sub main
 384     .include 'cclass.pasm'
 385     .local string s
 386     s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
 387     $I9 = length s
 388     $I0 = is_cclass .CCLASS_NUMERIC, s, 0
 389     print $I0
 390     $I0 = is_cclass .CCLASS_NUMERIC, s, 1
 391     print $I0
 392     $I0 = is_cclass .CCLASS_NUMERIC, s, 2
 393     print $I0
 394     $I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
 395     print $I0
 396     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 397     print $I0
 398     $I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
 399     print $I0
 400     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 401     print $I0
 402     print "\n"
 403 .end
 404 CODE
 405 1102269
 406 OUTPUT
 407
 408     # Concatenate unicode: with iso-8859-1
 409     pir_output_is(
 410         <<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1" );
 411 .sub main
 412     $S0 = unicode:"A"
 413     $S1 = ascii:"B"
 414     $S2 = concat $S0, $S1
 415     print $S2
 416     print "\n"
 417
 418     $S0 = unicode:"A"
 419     $S1 = unicode:"B"
 420     $S2 = concat $S0, $S1
 421     print $S2
 422     print "\n"
 423
 424     $S0 = unicode:"A"
 425     $S1 = iso-8859-1:"B"
 426     $S2 = concat $S0, $S1
 427     print $S2
 428     print "\n"
 429 .end
 430 CODE
 431 AB
 432 AB
 433 AB
 434 OUTPUT
 435 }
 436
 437 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys");
 438 .sub 'main'
 439     .local string str0, str1
 440     str0 = unicode:"\u00ab"
 441     str1 = iso-8859-1:"\xab"
 442
 443     .local pmc hash
 444     hash = new 'Hash'
 445     hash[str0] = 'hello'
 446
 447     $I0 = iseq str0, str1
 448     say $I0
 449
 450     $S0 = hash[str0]
 451     $S1 = hash[str1]
 452     $I0 = iseq $S0, $S1
 453     say $I0
 454     say $S0
 455     say $S1
 456 .end
 457 CODE
 458 1
 459 1
 460 hello
 461 hello
 462 OUTPUT
 463
 464 pir_output_is( <<'CODE', <<OUTPUT, "UTF-8 and Unicode hash keys, full bucket" );
 465 .sub 'main'
 466     .local string str0, str1
 467     str0 = unicode:"infix:\u00b1"
 468     str1 = iso-8859-1:"infix:\xb1"
 469
 470     .local pmc hash
 471     hash = new 'Hash'
 472     hash[str0] = 'hello'
 473
 474     $I0 = 0
 475   fill_loop:
 476     unless $I0 < 200 goto fill_done
 477     inc $I0
 478     $S0 = $I0
 479     $S0 = concat 'infix:', $S0
 480     hash[$S0] = 'foo'
 481     goto fill_loop
 482   fill_done:
 483
 484     $I0 = iseq str0, str1
 485     #print "iseq str0, str1               => "
 486     say $I0
 487
 488     $S0 = hash[str0]
 489     $S1 = hash[str1]
 490     $I0 = iseq $S0, $S1
 491     #print "iseq hash[str0], hash[str1]   => "
 492     say $I0
 493     say $S0
 494     say $S1
 495 .end
 496 CODE
 497 1
 498 1
 499 hello
 500 hello
 501 OUTPUT
 502
 503
 504 SKIP: {
 505     skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
 506 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to int' );
 507 .sub main :main
 508      $S0 = "140"
 509      $I0 = $S0
 510      say $I0
 511      $I0 = find_encoding 'ucs2'
 512      $S0 = trans_encoding $S0, $I0
 513      $I0 = $S0
 514      say $I0
 515 .end
 516 CODE
 517 140
 518 140
 519 OUT
 520
 521 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings to float' );
 522 .sub main :main
 523      $S0 = "140"
 524      $N0 = $S0
 525      say $N0
 526      $I0 = find_encoding 'ucs2'
 527      $S0 = trans_encoding $S0, $I0
 528      $N0 = $S0
 529      say $N0
 530 .end
 531 CODE
 532 140
 533 140
 534 OUT
 535
 536 pir_output_is( <<'CODE', <<'OUT', 'numification of unicode strings float mixed' );
 537 .sub main :main
 538     $S0 = unicode:"140 r\x{e9}sum\x{e9}s"
 539     $N0 = $S0
 540     say $N0
 541     $I0 = find_encoding 'ucs2'
 542     $S0 = trans_encoding $S0, $I0
 543     $N0 = $S0
 544     say $N0
 545 .end
 546 CODE
 547 140
 548 140
 549 OUT
 550 }
 551
 552 pir_output_is( <<'CODE', <<'OUT', 'concatenation of utf8 and iso-8859-1 (TT #752)' );
 553 .sub 'main'
 554
 555     $S1 = chr 0xe5
 556     $S2 = chr 0x263b
 557
 558     $S0 = unicode:"\u00e5\u263b"
 559     $S3 = concat $S1, $S2
 560     if $S0 == $S3 goto equal_1
 561     print "not "
 562   equal_1:
 563     say "equal"
 564
 565     $S0 = unicode:"\u263b\u00e5"
 566     $S3 = concat $S2, $S1
 567     if $S0 == $S3 goto equal_2
 568     print "not "
 569   equal_2:
 570     say "equal"
 571 .end
 572 CODE
 573 equal
 574 equal
 575 OUT
 576
 577
 578 # Local Variables:
 579 #   mode: cperl
 580 #   cperl-indent-level: 4
 581 #   fill-column: 100
 582 # End:
 583 # vim: expandtab shiftwidth=4: