tests/utftcl.test

   1 # This file contains a collection of tests for tclUtf.c
   2 # Sourcing this file into Tcl runs the tests and generates output for
   3 # errors.  No output means no errors were found.
   4 #
   5 # Copyright (c) 1997 Sun Microsystems, Inc.
   6 # Copyright (c) 1998-1999 by Scriptics Corporation.
   7 #
   8 # See the file "license.terms" for information on usage and redistribution
   9 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  10 #
  11 # RCS: @(#) $Id: utf.test,v 1.14 2007/05/02 01:37:28 kennykb Exp $
  12
     # Pull in the shared test harness that lives next to this script.
  13 source [file dirname [info script]]/testing.tcl
  14
     # Declare the "utf8" constraint as a prerequisite for this file.
     # NOTE(review): "needs" is not defined here; presumably testing.tcl skips
     # the whole file when the constraint is unmet - confirm.
  15 needs constraint utf8
  16
     # Make sure no stale $x is around; catch swallows the error if x is unset.
  17 catch {unset x}
  18
     # utf-1.1 .. utf-1.4: the value of a \x or \u escape is compared with the
     # byte sequence spelled out via [bytestring]; utf-1.5 only checks that
     # [format %c -1] produces a string of length 1.
     # NOTE(review): bytestring is not defined in this file; presumably it is
     # provided by testing.tcl - confirm.
  19 test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} {
  20     set x \x01
  21 } [bytestring "\x01"]
  22 test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} {
  23     set x "\u80"
  24 } [bytestring "\xc2\x80"]
  25 test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} {
  26     set x "\ue0"
  27 } [bytestring "\xc3\xa0"]
  28 test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} {
  29     set x "\u4e4e"
  30 } [bytestring "\xe4\xb9\x8e"]
  31 test utf-1.5 {Tcl_UniCharToUtf: negative Tcl_UniChar} {
  32     string length [format %c -1]
  33 } 1
  34
     # utf-2.x: [string length] applied to hand-built byte sequences - plain
     # ASCII, stray trail bytes, lead bytes with too few trail bytes, and
     # complete 2-, 3- and 4-byte sequences.
     # The numbering jumps from utf-2.7 to utf-2.9; there is no utf-2.8 here.
  35 test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
  36     string length "abc"
  37 } {3}
  38 test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} {
  39     string length [bytestring "\x82\x83\x84"]
  40 } {3}
  41 test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} {
  42     string length [bytestring "\xC2"]
  43 } {1}
  44 test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} {
  45     string length [bytestring "\xC2\xa2"]
  46 } {1}
  47 test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} {
  48     string length [bytestring "\xE2"]
  49 } {1}
  50 test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} {
  51     string length [bytestring "\xE2\xA2"]
  52 } {2}
  53 test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
  54     string length [bytestring "\xE4\xb9\x8e"]
  55 } {1}
  56 # Note that Tcl may or may not support utf-8 sequences >= 4 bytes
  57 test utf-2.9 {Tcl_UtfToUniChar: 4-byte UTF sequence} {
  58     string length [bytestring "\xF4\xA2\xA2\xA2"]
  59 } {1}
  60
     # Placeholder: the body and the expected result are both empty, so
     # nothing about Tcl_UtfCharComplete is actually exercised.
  61 test utf-3.1 {Tcl_UtfCharComplete} {
  62 } {}
  63
  64 proc testnumutfchars {a {n ""}} {
         # Report how many characters $a holds.
         # n is optional and never read; it only lets callers pass a trailing
         # argument, as the "calc len" tests in this file do.
  65     return [string length $a]
  66 }
  67
     # utf-4.1 .. utf-4.3: character counts obtained through the
     # testnumutfchars helper proc, which wraps [string length].
     # utf-4.3 expects 7: three ASCII letters plus four multi-byte characters.
  68 test utf-4.1 {Tcl_NumUtfChars: zero length} {
  69     testnumutfchars ""
  70 } {0}
  71 test utf-4.2 {Tcl_NumUtfChars: length 1} {
  72     testnumutfchars [bytestring "\xC2\xA2"]
  73 } {1}
  74 test utf-4.3 {Tcl_NumUtfChars: long string} {
  75     testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
  76 } {7}
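  77 # \xC0\x80 is a non-minimal (overlong) encoding of U+0000 and therefore
     # invalid UTF-8; a strict decoder should count it as 2 characters. The test
     # below is tagged with the 'tcl' constraint and expects 1 instead.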
  78 test utf-4.4 {Tcl_NumUtfChars: #u0000} tcl {
  79     testnumutfchars [bytestring "\xC0\x80"]
  80 } {1}
     # utf-4.5 .. utf-4.8 repeat utf-4.1 .. utf-4.4 with an extra argument of 1
     # ("calc len"). The testnumutfchars proc in this file accepts that argument
     # but does not read it, so the same [string length] path is exercised.
  81 test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} {
  82     testnumutfchars "" 1
  83 } {0}
  84 test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {
  85     testnumutfchars [bytestring "\xC2\xA2"] 1
  86 } {1}
  87 test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {
  88     testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1
  89 } {7}
  90 test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} tcl {
  91     testnumutfchars [bytestring "\xC0\x80"] 1
  92 } {1}
  93
     # utf-5.1, utf-6.1 and utf-7.1 are placeholders: each has an empty body
     # and an empty expected result, so no behaviour is checked.
     # NOTE(review): "Tcl_UtfFindFirsts" looks like a typo for
     # Tcl_UtfFindFirst - confirm before renaming.
  94 test utf-5.1 {Tcl_UtfFindFirsts} {
  95 } {}
  96
  97 test utf-6.1 {Tcl_UtfNext} {
  98 } {}
  99
 100 test utf-7.1 {Tcl_UtfPrev} {
 101 } {}
 102
     # utf-8.x: [string index] at position 0 and position 2, each on an ASCII
     # word and on a word built from \u escapes; the expected results show the
     # index counting characters, not bytes.
 103 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
 104     string index abcd 0
 105 } {a}
 106 test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
 107     string index \u4e4e\u25a 0
 108 } "\u4e4e"
 109 test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
 110     string index abcd 2
 111 } {c}
 112 test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
 113     string index \u4e4e\u25a\uff\u543 2
 114 } "\uff"
 115
     # utf-9.x: [string range] - a leading ASCII slice, then characters 1..5 of
     # a string that mixes \u escapes, a \x escape and ASCII letters.
 116 test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
 117     string range abcd 0 2
 118 } {abc}
 119 test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
 120     string range \u4e4e\u25a\xff\u543klmnop 1 5
 121 } "\u25a\xff\u543kl"
 122
 123
     # utf-10.1 .. utf-10.5: backslash substitution in bare words - \n, then \u
     # followed by two to four hex digits, including the cases where digit
     # collection stops at a non-hex character or after the fourth digit.
     # utf-10.1 expects a newline, written as a brace pair spanning two lines.
 124 test utf-10.1 {Tcl_UtfBackslash: dst == NULL} {
 125     set x \n
 126 } {
 127 }
 128 test utf-10.2 {Tcl_UtfBackslash: \u subst} {
 129     set x \ua2
 130 } [bytestring "\xc2\xa2"]
 131 test utf-10.3 {Tcl_UtfBackslash: longer \u subst} {
 132     set x \u4e21
 133 } [bytestring "\xe4\xb8\xa1"]
 134 test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} {
 135     set x \u4e2k
 136 } "[bytestring \xd3\xa2]k"
 137 test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} {
 138     set x \u4e216
 139 } "[bytestring \xe4\xb8\xa1]6"
 140 proc bsCheck {char num} {
         # Register one generated test, utf-10.<errNum>, asserting that the first
         # character of $char has character code $num, then advance the
         # file-level errNum counter so the next call gets the next number.
         # The test body reads $char and a scratch variable, so it has to be
         # evaluated in this proc's scope.
         # NOTE(review): that relies on how the harness's "test" command
         # evaluates its body - confirm in testing.tcl.
 141     upvar #0 errNum counter
 142     test utf-10.$counter {backslash substitution} {
 143         scan $char %c code
 144         set code
 145     } $num
 146     incr counter
 147 }
 148 set errNum 6
     # Numbering starts at 6 because utf-10.1 .. utf-10.5 are written out by
     # hand earlier in this file.
     # Each call passes a backslash sequence as a bare word together with the
     # character code expected for the first character of the resulting string.
 149 bsCheck \b      8
 150 bsCheck \e      101
 151 bsCheck \f      12
 152 bsCheck \n      10
 153 bsCheck \r      13
 154 bsCheck \t      9
 155 bsCheck \v      11
 156 bsCheck \{      123
 157 bsCheck \}      125
 158 bsCheck \[      91
 159 bsCheck \]      93
 160 bsCheck \$      36
 161 bsCheck \       32
 162 bsCheck \;      59
 163 bsCheck \\      92
 164 bsCheck \Ca     67
 165 bsCheck \Ma     77
 166 bsCheck \CMa    67
 167 # Prior to 8.3 this returned 8, because \8 was accepted as an
 168 # octal value - but it isn't! [Bug: 3975]
 169 bsCheck \8a     56
 170 bsCheck \14     12
 171 bsCheck \141    97
 172 bsCheck b\0     98
 173 bsCheck \x      120
 174 bsCheck \ua     10
 175 bsCheck \uA     10
 176 bsCheck \u41    65
 177 bsCheck \u      117
 178 bsCheck \uk     117
     # NOTE(review): the next three calls repeat \u41, \ua and \uA from just
     # above; they are harmless but still consume three test numbers.
 179 bsCheck \u41    65
 180 bsCheck \ua     10
 181 bsCheck \uA     10
 182 bsCheck \340    224
 183 bsCheck \ua1    161
 184 bsCheck \u4e21  20001
 185
     # utf-11.x: [string toupper] on an empty string, on ASCII, and on words
     # whose first character is non-ASCII (\u00e3, \u01e3).
 186 test utf-11.1 {Tcl_UtfToUpper} {
 187     string toupper {}
 188 } {}
 189 test utf-11.2 {Tcl_UtfToUpper} {
 190     string toupper abc
 191 } ABC
 192 test utf-11.3 {Tcl_UtfToUpper} {
 193     string toupper \u00e3ab
 194 } \u00c3AB
 195 test utf-11.4 {Tcl_UtfToUpper} {
 196     string toupper \u01e3ab
 197 } \u01e2AB
 198
     # utf-12.x: [string tolower], using the utf-11.x inputs and results with
     # the roles swapped.
 199 test utf-12.1 {Tcl_UtfToLower} {
 200     string tolower {}
 201 } {}
 202 test utf-12.2 {Tcl_UtfToLower} {
 203     string tolower ABC
 204 } abc
 205 test utf-12.3 {Tcl_UtfToLower} {
 206     string tolower \u00c3AB
 207 } \u00e3ab
 208 test utf-12.4 {Tcl_UtfToLower} {
 209     string tolower \u01e2AB
 210 } \u01e3ab
 211
 212
     # utf-14.x: [string compare -nocase] on ASCII operands, covering the
     # "less than" and "greater than" outcomes.
     # There is no utf-13.x group in the visible part of this file.
 213 test utf-14.1 {Tcl_UtfNcasecmp} {
 214     string compare -nocase a b
 215 } -1
 216 test utf-14.2 {Tcl_UtfNcasecmp} {
 217     string compare -nocase b a
 218 } 1
 219 test utf-14.3 {Tcl_UtfNcasecmp} {
 220     string compare -nocase B a
 221 } 1
 222 test utf-14.4 {Tcl_UtfNcasecmp} {
 223     string compare -nocase aBcB abca
 224 } 1
 225
     # utf-15.x: [string toupper] on a mixed-case ASCII pair, on \u0178\u00ff
     # (both characters expected to come out as \u0178), and on "!", which is
     # expected back unchanged.
 226 test utf-15.1 {Tcl_UniCharToUpper, negative delta} {
 227     string toupper aA
 228 } AA
 229 test utf-15.2 {Tcl_UniCharToUpper, positive delta} {
 230     string toupper \u0178\u00ff
 231 } \u0178\u0178
 232 test utf-15.3 {Tcl_UniCharToUpper, no delta} {
 233     string toupper !
 234 } !
 235
     # utf-16.x: [string tolower] on the same three inputs as utf-15.x.
 236 test utf-16.1 {Tcl_UniCharToLower, negative delta} {
 237     string tolower aA
 238 } aa
 239 test utf-16.2 {Tcl_UniCharToLower, positive delta} {
 240     string tolower \u0178\u00ff
 241 } \u00ff\u00ff
     # NOTE(review): numbered utf-17.1 although it continues the
     # Tcl_UniCharToLower group (compare utf-15.3) - confirm whether the id is
     # intentional before renumbering.
 242 test utf-17.1 {Tcl_UniCharToLower, no delta} {
 243     string tolower !
 244 } !
 245
 246
 247 #test utf-21.1 {TclUniCharIsAlnum} {
 248 #    # this returns 1 with Unicode 3 compliance
 249 #    string is alnum \u1040\u021f
 250 #} {1}
 251 #test utf-21.2 {unicode alnum char in regc_locale.c} {
 252 #    # this returns 1 with Unicode 3 compliance
 253 #    list [regexp {^[[:alnum:]]+$} \u1040\u021f] [regexp {^\w+$} \u1040\u021f]
 254 #} {1 1}
 255
 256 #test utf-22.1 {TclUniCharIsWordChar} {
 257 #    string wordend "xyz123_bar fg" 0
 258 #} 10
 259 #test utf-22.2 {TclUniCharIsWordChar} {
 260 #    string wordend "x\u5080z123_bar\u203c fg" 0
 261 #} 10
 262
 263 #test utf-23.1 {TclUniCharIsAlpha} {
 264 #    # this returns 1 with Unicode 3 compliance
 265 #    string is alpha \u021f
 266 #} {1}
 267 #test utf-23.2 {unicode alpha char in regc_locale.c} {
 268 #    # this returns 1 with Unicode 3 compliance
 269 #    regexp {^[[:alpha:]]+$} \u021f
 270 #} {1}
 271 #
 272 #test utf-24.1 {TclUniCharIsDigit} {
 273 #    # this returns 1 with Unicode 3 compliance
 274 #    string is digit \u1040
 275 #} {1}
 276 #test utf-24.2 {unicode digit char in regc_locale.c} {
 277 #    # this returns 1 with Unicode 3 compliance
 278 #    list [regexp {^[[:digit:]]+$} \u1040] [regexp {^\d+$} \u1040]
 279 #} {1 1}
 280 #
 281 #test utf-24.3 {TclUniCharIsSpace} {
 282 #    # this returns 1 with Unicode 3 compliance
 283 #    string is space \u1680
 284 #} {1}
 285 #test utf-24.4 {unicode space char in regc_locale.c} {
 286 #    # this returns 1 with Unicode 3 compliance
 287 #    list [regexp {^[[:space:]]+$} \u1680] [regexp {^\s+$} \u1680]
 288 #} {1 1}
 289
     # Final statement of the script, run after every test above is registered.
     # NOTE(review): testreport is not defined in this file; presumably the
     # harness prints its pass/fail summary here - confirm in testing.tcl.
 290 testreport