t/op/stringu.t

   1 #!perl
   2 # Copyright (C) 2001-2007, The Perl Foundation.
   3 # $Id$
   4
   5 use strict;
   6 use warnings;
   7 use lib qw( . lib ../lib ../../lib );
   8 use Test::More;
   9 use Parrot::Test tests => 26;
  10 use Parrot::Config;
  11
  12 =head1 NAME
  13
  14 t/op/stringu.t - Unicode String Test
  15
  16 =head1 SYNOPSIS
  17
  18         % prove t/op/stringu.t
  19
  20 =head1 DESCRIPTION
  21
  22 Tests Parrot's Unicode string system.
  23
  24 =cut
  25 # chr 0x212B printed via the stdout handle; <<OUTPUT interpolates, so the \x escapes are raw bytes.
  26 pasm_output_is( <<'CODE', <<OUTPUT, "angstrom" );
  27     getstdout P0
  28     push P0, "utf8"
  29     chr S0, 0x212B
  30     print P0, S0
  31     print P0, "\n"
  32     end
  33 CODE
  34 \xe2\x84\xab
  35 OUTPUT
  36 # Same character written as a \x{212b} escape inside a unicode: string literal.
  37 pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom" );
  38     getstdout P0
  39     push P0, "utf8"
  40     set S0, unicode:"\x{212b}"
  41     print S0
  42     print "\n"
  43     end
  44 CODE
  45 \xe2\x84\xab
  46 OUTPUT
  47 # The escape at the end of a string that starts with plain ASCII characters.
  48 pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 2" );
  49     getstdout P0
  50     push P0, "utf8"
  51     set S0, unicode:"aaaaaa\x{212b}"
  52     print S0
  53     print "\n"
  54     end
  55 CODE
  56 aaaaaa\xe2\x84\xab
  57 OUTPUT
  58 # The escape in the middle of otherwise-ASCII text.
  59 pasm_output_is( <<'CODE', <<OUTPUT, "escaped angstrom 3" );
  60     getstdout P0
  61     push P0, "utf8"
  62     set S0, unicode:"aaaaaa\x{212b}-aaaaaa"
  63     print S0
  64     print "\n"
  65     end
  66 CODE
  67 aaaaaa\xe2\x84\xab-aaaaaa
  68 OUTPUT
  69 # Same string as the previous test, using the four-digit \u212b escape form.
  70 pasm_output_is( <<'CODE', <<OUTPUT, 'escaped angstrom 3 \uhhhh' );
  71     getstdout P0
  72     push P0, "utf8"
  73     set S0, unicode:"aaaaaa\u212b-aaaaaa"
  74     print S0
  75     print "\n"
  76     end
  77 CODE
  78 aaaaaa\xe2\x84\xab-aaaaaa
  79 OUTPUT
  80 # A code point above U+FFFF (U+1D400); the expected output is its four-byte UTF-8 encoding.
  81 pasm_output_is( <<'CODE', <<OUTPUT, "MATHEMATICAL BOLD CAPITAL A" );
  82     getstdout P0
  83     push P0, "utf8"
  84     set S0, unicode:"aaaaaa\x{1d400}-aaaaaa"
  85     print S0
  86     print "\n"
  87     end
  88 CODE
  89 aaaaaa\xf0\x9d\x90\x80-aaaaaa
  90 OUTPUT
  91 # Same code point using the eight-digit \U0001d400 escape form.
  92 pasm_output_is( <<'CODE', <<OUTPUT, 'MATHEMATICAL BOLD CAPITAL A \U' );
  93     getstdout P0
  94     push P0, "utf8"
  95     set S0, unicode:"aaaaaa\U0001d400-aaaaaa"
  96     print S0
  97     print "\n"
  98     end
  99 CODE
 100 aaaaaa\xf0\x9d\x90\x80-aaaaaa
 101 OUTPUT
 102 # Both characters in one string, narrower one first; length must be 22 characters, not the byte count.
 103 pasm_output_is( <<'CODE', <<OUTPUT, "two upscales" );
 104     getstdout P0
 105     push P0, "utf8"
 106     set S0, unicode:"aaaaaa\x{212b}-bbbbbb\x{1d400}-cccccc"
 107     print S0
 108     print "\n"
 109     length I0, S0
 110     print I0
 111     print "\n"
 112     end
 113 CODE
 114 aaaaaa\xe2\x84\xab-bbbbbb\xf0\x9d\x90\x80-cccccc
 115 22
 116 OUTPUT
 117 # Same as above with the wider code point first; output and length (22) must be unaffected by order.
 118 pasm_output_is( <<'CODE', <<OUTPUT, "two upscales - don't downscale" );
 119     getstdout P0
 120     push P0, "utf8"
 121     set S0, unicode:"aaaaaa\x{1d400}-bbbbbb\x{212b}-cccccc"
 122     print S0
 123     print "\n"
 124     length I0, S0
 125     print I0
 126     print "\n"
 127     end
 128 CODE
 129 aaaaaa\xf0\x9d\x90\x80-bbbbbb\xe2\x84\xab-cccccc
 130 22
 131 OUTPUT
 132 # \cJ, octal \012 and \12, and hex \x0a and \xa must each supply the newline ending an "ok N" line.
 133 pasm_output_is( <<'CODE', <<OUTPUT, '\cX, \ooo' );
 134     getstdout P0
 135     push P0, "utf8"
 136     set S0, "ok 1\cJ"
 137     print S0
 138     set S0, "ok 2\012"
 139     print S0
 140     set S0, "ok 3\12"
 141     print S0
 142     set S0, "ok 4\x0a"
 143     print S0
 144     set S0, "ok 5\xa"
 145     print S0
 146     end
 147 CODE
 148 ok 1
 149 ok 2
 150 ok 3
 151 ok 4
 152 ok 5
 153 OUTPUT
 154 # \u with no hex digits after it: expect the "Illegal escape sequence in" error.
 155 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u' );
 156     set S0, "x\uy"
 157     print "never\n"
 158     end
 159 CODE
 160 /Illegal escape sequence in/
 161 OUTPUT
 162 # \u followed by only three hex digits.
 163 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \u123' );
 164     set S0, "x\u123y"
 165     print "never\n"
 166     end
 167 CODE
 168 /Illegal escape sequence in/
 169 OUTPUT
 170 # \U followed by only three hex digits.
 171 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \U123' );
 172     set S0, "x\U123y"
 173     print "never\n"
 174     end
 175 CODE
 176 /Illegal escape sequence in/
 177 OUTPUT
 178 # \x followed by a non-hex character.
 179 pasm_error_output_like( <<'CODE', <<OUTPUT, 'illegal \x' );
 180     set S0, "x\xy"
 181     print "never\n"
 182     end
 183 CODE
 184 /Illegal escape sequence in/
 185 OUTPUT
 186 # A literal « in a utf8:unicode: string must count as one character and print as bytes C2 AB.
 187 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals" );
 188     set S0, utf8:unicode:"«"
 189     length I0, S0
 190     print I0
 191     print "\n"
 192     print S0
 193     print "\n"
 194     end
 195 CODE
 196 1
 197 \xc2\xab
 198 OUTPUT
 199 # Same character given as \xc2\xab byte escapes; <<'CODE' hands the escapes to Parrot unexpanded.
 200 pasm_output_is( <<'CODE', <<OUTPUT, "UTF8 literals - hex-escaped bytes" );
 201     set S0, utf8:unicode:"\xc2\xab"
 202     length I0, S0
 203     print I0
 204     print "\n"
 205     print S0
 206     print "\n"
 207     end
 208 CODE
 209 1
 210 \xc2\xab
 211 OUTPUT
 212 # Lead byte 0xF2 with a single continuation byte is not valid UTF-8: expect "Malformed UTF-8 string".
 213 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 literals - illegal" );
 214     set S0, utf8:unicode:"\xf2\xab"
 215     length I0, S0
 216     print I0
 217     print "\n"
 218     print S0
 219     print "\n"
 220     end
 221 CODE
 222 /Malformed UTF-8 string/
 223 OUTPUT
 224 # A non-ASCII « inside an ascii: literal: expect "Malformed string".
 225 pasm_error_output_like( <<'CODE', <<OUTPUT, "UTF8 as malformed ascii" );
 226     set S0, ascii:"«"
 227     length I0, S0
 228     print I0
 229     print "\n"
 230     end
 231 CODE
 232 /Malformed string/
 233 OUTPUT
 234 # Ticket #36794: substr swaps the 5 characters at offset 10 for chr(0x666), printed as bytes D9 A6.
 235 pasm_output_is( <<'CODE', <<OUTPUT, "substr with a UTF8 replacement #36794" );
 236     set S0, "AAAAAAAAAA\\u666"
 237     set I0, 0x666
 238     chr S1, I0
 239     substr S0, 10, 5, S1
 240     print S0
 241     print "\n"
 242     end
 243 CODE
 244 AAAAAAAAAA\xd9\xa6
 245 OUTPUT
 246 # Three tests skipped unless $PConfig{has_icu}; the first checks downcase leaves its source intact.
 247 SKIP: {
 248     skip( 'no ICU lib', 3 ) unless $PConfig{has_icu};
 249     pir_output_is( <<'CODE', <<OUTPUT, "downcase changes string behind scenes" );
 250 .sub main
 251     .local string str
 252     .local string rest
 253
 254     str = unicode:".xyz"
 255     rest = substr str, 1
 256     print rest
 257     print "\n"
 258
 259     str = unicode:".xyz"
 260     $S99 = downcase str
 261     rest = substr str, 1
 262     print rest
 263     print "\n"
 264
 265 .end
 266 CODE
 267 xyz
 268 xyz
 269 OUTPUT
 270     # downcase of an upper-case unicode: string must return the lower-cased text.
 271     pir_output_is( <<'CODE', <<OUTPUT, "downcase asciish" );
 272 .sub main
 273     .local string str
 274     .local string rest
 275     str = unicode:".XYZ"
 276     $S0 = downcase str
 277     print $S0
 278     print "\n"
 279 .end
 280 CODE
 281 .xyz
 282 OUTPUT
 283     # This CODE heredoc is double-quoted: Perl expands \xf6 and \\n before Parrot sees the program.
 284     # escape does not produce utf8, just a raw sequence of chars
 285     pir_output_is( <<"CODE", <<'OUTPUT', "escape utf16" );
 286 .sub main
 287     .local string s, t
 288     .local int i
 289     s = iso-8859-1:"T\xf6tsch"
 290     i = find_charset "unicode"
 291     s = trans_charset s, i
 292     t = upcase s
 293     escape t, t
 294     print t
 295     print "\\n"
 296 .end
 297 CODE
 298 T\x{d6}TSCH
 299 OUTPUT
 300 }
 301
 302 # Tests for .CCLASS_WHITESPACE: is_cclass at offsets 0-2 gives 1,1,0; the finds give 2,2,6,9.
 303 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_WHITESPACE in unicode" );
 304 .sub main
 305     .include 'cclass.pasm'
 306     .local string s
 307     s = unicode:" \t\u207babc\n\u2000\u2009"
 308     $I9 = length s
 309     $I0 = is_cclass .CCLASS_WHITESPACE, s, 0
 310     print $I0
 311     $I0 = is_cclass .CCLASS_WHITESPACE, s, 1
 312     print $I0
 313     $I0 = is_cclass .CCLASS_WHITESPACE, s, 2
 314     print $I0
 315     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, 0, $I9
 316     print $I0
 317     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 318     print $I0
 319     $I0 = find_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 320     print $I0
 321     $I0 = find_not_cclass .CCLASS_WHITESPACE, s, $I0, $I9
 322     print $I0
 323     print "\n"
 324 .end
 325 CODE
 326 1102269
 327 OUTPUT
 328
 329 # Tests for .CCLASS_ANY: offsets 0-2 match, offset 9 (the length) does not; the finds give 9,9,9,2.
 330 pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_ANY in unicode" );
 331 .sub main
 332     .include 'cclass.pasm'
 333     .local string s
 334     s = unicode:" \t\u207babc\n\u2000\u2009"
 335     $I9 = length s
 336     $I0 = is_cclass .CCLASS_ANY, s, 0
 337     print $I0
 338     $I0 = is_cclass .CCLASS_ANY, s, 1
 339     print $I0
 340     $I0 = is_cclass .CCLASS_ANY, s, 2
 341     print $I0
 342     $I0 = is_cclass .CCLASS_ANY, s, $I9
 343     print $I0
 344     $I0 = find_not_cclass .CCLASS_ANY, s, 0, $I9
 345     print $I0
 346     $I0 = find_not_cclass .CCLASS_ANY, s, $I0, $I9
 347     print $I0
 348     $I0 = find_cclass .CCLASS_ANY, s, $I0, $I9
 349     print $I0
 350     $I0 = find_cclass .CCLASS_ANY, s, 2, $I9
 351     print $I0
 352     print "\n"
 353 .end
 354 CODE
 355 11109992
 356 OUTPUT
 357 # NOTE(review): the skip message mentions icu, but the guard only tests byteorder - confirm intent.
 358 SKIP: {
 359     skip "Tests seem to fail on big endian machines with icu", 2 if $PConfig{byteorder} eq '4321';
 360
 361     # Tests for .CCLASS_NUMERIC; \u0660, \u17e1 and \u19d9 must count as numeric (final find gives 9)
 362     pir_output_is( <<'CODE', <<'OUTPUT', "CCLASS_NUMERIC in unicode" );
 363 .sub main
 364     .include 'cclass.pasm'
 365     .local string s
 366     s = unicode:"01\u207bxyz\u0660\u17e1\u19d9"
 367     $I9 = length s
 368     $I0 = is_cclass .CCLASS_NUMERIC, s, 0
 369     print $I0
 370     $I0 = is_cclass .CCLASS_NUMERIC, s, 1
 371     print $I0
 372     $I0 = is_cclass .CCLASS_NUMERIC, s, 2
 373     print $I0
 374     $I0 = find_not_cclass .CCLASS_NUMERIC, s, 0, $I9
 375     print $I0
 376     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 377     print $I0
 378     $I0 = find_cclass .CCLASS_NUMERIC, s, $I0, $I9
 379     print $I0
 380     $I0 = find_not_cclass .CCLASS_NUMERIC, s, $I0, $I9
 381     print $I0
 382     print "\n"
 383 .end
 384 CODE
 385 1102269
 386 OUTPUT
 387
 388     # Concatenate unicode: with ascii:, unicode: and iso-8859-1: strings; each must print "AB"
 389     # See RT #39930 for discussion
 390     pir_output_is(
 391         <<'CODE', <<"OUTPUT", "Concat unicode with iso-8859-1" );
 392 .sub main
 393     $S0 = unicode:"A"
 394     $S1 = ascii:"B"
 395     $S2 = concat $S0, $S1
 396     print $S2
 397     print "\n"
 398
 399     $S0 = unicode:"A"
 400     $S1 = unicode:"B"
 401     $S2 = concat $S0, $S1
 402     print $S2
 403     print "\n"
 404
 405     $S0 = unicode:"A"
 406     $S1 = iso-8859-1:"B"
 407     $S2 = concat $S0, $S1
 408     print $S2
 409     print "\n"
 410 .end
 411 CODE
 412 AB
 413 AB
 414 AB
 415 OUTPUT
 416 }
 417
 418 # Local Variables:
 419 #   mode: cperl
 420 #   cperl-indent-level: 4
 421 #   fill-column: 100
 422 # End:
 423 # vim: expandtab shiftwidth=4: