tests/utftcl.test

   1 # This file contains a collection of tests for tclUtf.c
   2 # Sourcing this file into Tcl runs the tests and generates output for
   3 # errors.  No output means no errors were found.
   4 #
   5 # Copyright (c) 1997 Sun Microsystems, Inc.
   6 # Copyright (c) 1998-1999 by Scriptics Corporation.
   7 #
   8 # See the file "license.terms" for information on usage and redistribution
   9 # of this file, and for a DISCLAIMER OF ALL WARRANTIES.
  10 #
  11 # RCS: @(#) $Id: utf.test,v 1.14 2007/05/02 01:37:28 kennykb Exp $
   12
      # Pull in the shared test harness that lives beside this script.
   13 source [file dirname [info script]]/testing.tcl
   14
      # NOTE(review): needs is defined outside this file; presumably it skips
      # the whole file when the utf8 constraint is not met - confirm in testing.tcl.
   15 needs constraint utf8
   16
      # Drop any existing x before the tests run; catch swallows the error
      # raised when x does not exist.
   17 catch {unset x}
   18
      # utf-1.x: store one character in x through a backslash escape and compare
      # the value with the same character spelled out as UTF-8 bytes
      # (one byte for x01, two bytes for u80 and ue0, three bytes for u4e4e).
      # bytestring is not defined in this file.
   19 test utf-1.1 {Tcl_UniCharToUtf: 1 byte sequences} {
   20     set x \x01
   21 } [bytestring "\x01"]
   22 test utf-1.2 {Tcl_UniCharToUtf: 2 byte sequences} {
   23     set x "\u80"
   24 } [bytestring "\xc2\x80"]
   25 test utf-1.3 {Tcl_UniCharToUtf: 2 byte sequences} {
   26     set x "\ue0"
   27 } [bytestring "\xc3\xa0"]
   28 test utf-1.4 {Tcl_UniCharToUtf: 3 byte sequences} {
   29     set x "\u4e4e"
   30 } [bytestring "\xe4\xb9\x8e"]
      # A negative value handed to format %c must still produce exactly one
      # character.
   31 test utf-1.5 {Tcl_UniCharToUtf: negative Tcl_UniChar} {
   32     string length [format %c -1]
   33 } 1
   34
      # utf-2.x: string length over well-formed and malformed byte sequences.
      # The expected counts show that a complete 2- or 3-byte sequence is one
      # character, while naked trail bytes, a lead byte missing some of its
      # trail bytes, and a 4-byte lead are each counted byte by byte.
   35 test utf-2.1 {Tcl_UtfToUniChar: low ascii} {
   36     string length "abc"
   37 } {3}
   38 test utf-2.2 {Tcl_UtfToUniChar: naked trail bytes} {
   39     string length [bytestring "\x82\x83\x84"]
   40 } {3}
   41 test utf-2.3 {Tcl_UtfToUniChar: lead (2-byte) followed by non-trail} {
   42     string length [bytestring "\xC2"]
   43 } {1}
   44 test utf-2.4 {Tcl_UtfToUniChar: lead (2-byte) followed by trail} {
   45     string length [bytestring "\xC2\xa2"]
   46 } {1}
   47 test utf-2.5 {Tcl_UtfToUniChar: lead (3-byte) followed by non-trail} {
   48     string length [bytestring "\xE2"]
   49 } {1}
   50 test utf-2.6 {Tcl_UtfToUniChar: lead (3-byte) followed by 1 trail} {
   51     string length [bytestring "\xE2\xA2"]
   52 } {2}
   53 test utf-2.7 {Tcl_UtfToUniChar: lead (3-byte) followed by 2 trail} {
   54     string length [bytestring "\xE4\xb9\x8e"]
   55 } {1}
   56 test utf-2.8 {Tcl_UtfToUniChar: longer UTF sequences not supported} {
   57     string length [bytestring "\xF4\xA2\xA2\xA2"]
   58 } {4}
   59
      # Placeholder: empty script, empty expected result.
   60 test utf-3.1 {Tcl_UtfCharComplete} {
   61 } {}
   62
      # testnumutfchars a ?n?
      #   Returns the character count of a as reported by string length.
      #   n is accepted but never read; the "calc len" tests further down
      #   pass 1 for it and expect the same counts.
   63 proc testnumutfchars {a {n ""}} {
   64     string length $a
   65 }
   66
      # utf-4.x: character counts through the testnumutfchars wrapper.
      # utf-4.5 to utf-4.8 repeat utf-4.1 to utf-4.4 with a second argument
      # of 1, which the wrapper ignores.
   67 test utf-4.1 {Tcl_NumUtfChars: zero length} {
   68     testnumutfchars ""
   69 } {0}
   70 test utf-4.2 {Tcl_NumUtfChars: length 1} {
   71     testnumutfchars [bytestring "\xC2\xA2"]
   72 } {1}
      # 7 = a, b, c plus four non-ASCII characters: two written as raw UTF-8
      # bytes and two written as u escapes.
   73 test utf-4.3 {Tcl_NumUtfChars: long string} {
   74     testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"]
   75 } {7}
      # The two bytes C0 80 are expected to count as a single character.
   76 test utf-4.4 {Tcl_NumUtfChars: #u0000} {
   77     testnumutfchars [bytestring "\xC0\x80"]
   78 } {1}
   79 test utf-4.5 {Tcl_NumUtfChars: zero length, calc len} {
   80     testnumutfchars "" 1
   81 } {0}
   82 test utf-4.6 {Tcl_NumUtfChars: length 1, calc len} {
   83     testnumutfchars [bytestring "\xC2\xA2"] 1
   84 } {1}
   85 test utf-4.7 {Tcl_NumUtfChars: long string, calc len} {
   86     testnumutfchars [bytestring "abc\xC2\xA2\xe4\xb9\x8e\uA2\u4e4e"] 1
   87 } {7}
   88 test utf-4.8 {Tcl_NumUtfChars: #u0000, calc len} {
   89     testnumutfchars [bytestring "\xC0\x80"] 1
   90 } {1}
  91
  92 test utf-5.1 {Tcl_UtfFindFirsts} {
  93 } {}
  94
  95 test utf-6.1 {Tcl_UtfNext} {
  96 } {}
  97
  98 test utf-7.1 {Tcl_UtfPrev} {
  99 } {}
  100
      # utf-8.x: string index on an ASCII string and on strings built from
      # u escapes. Indices count characters, so index 2 of the four-character
      # string in utf-8.4 is the uff character.
  101 test utf-8.1 {Tcl_UniCharAtIndex: index = 0} {
  102     string index abcd 0
  103 } {a}
  104 test utf-8.2 {Tcl_UniCharAtIndex: index = 0} {
  105     string index \u4e4e\u25a 0
  106 } "\u4e4e"
  107 test utf-8.3 {Tcl_UniCharAtIndex: index > 0} {
  108     string index abcd 2
  109 } {c}
  110 test utf-8.4 {Tcl_UniCharAtIndex: index > 0} {
  111     string index \u4e4e\u25a\uff\u543 2
  112 } "\uff"
  113
      # utf-9.x: string range by character index. The utf-9.2 input is four
      # escaped characters followed by klmnop, so characters 1 through 5 are
      # the last three escaped characters plus k and l.
  114 test utf-9.1 {Tcl_UtfAtIndex: index = 0} {
  115     string range abcd 0 2
  116 } {abc}
  117 test utf-9.2 {Tcl_UtfAtIndex: index > 0} {
  118     string range \u4e4e\u25a\xff\u543klmnop 1 5
  119 } "\u25a\xff\u543kl"
 120
  121
      # utf-10.1 to utf-10.5: backslash substitution while a word is parsed.
      # utf-10.1 expects a single newline, written as a braced literal.
  122 test utf-10.1 {Tcl_UtfBackslash: dst == NULL} {
  123     set x \n
  124 } {
  125 }
  126 test utf-10.2 {Tcl_UtfBackslash: \u subst} {
  127     set x \ua2
  128 } [bytestring "\xc2\xa2"]
  129 test utf-10.3 {Tcl_UtfBackslash: longer \u subst} {
  130     set x \u4e21
  131 } [bytestring "\xe4\xb8\xa1"]
      # Only 4e2 is hex here, giving U+04E2 (UTF-8 D3 A2) followed by a
      # literal k.
  132 test utf-10.4 {Tcl_UtfBackslash: stops at first non-hex} {
  133     set x \u4e2k
  134 } "[bytestring \xd3\xa2]k"
      # The escape consumes four hex digits (4e21); the trailing 6 stays
      # literal.
  135 test utf-10.5 {Tcl_UtfBackslash: stops after 4 hex chars} {
  136     set x \u4e216
  137 } "[bytestring \xe4\xb8\xa1]6"
  138 proc bsCheck {char num} {
          # bsCheck char num
          #   Registers test utf-10.N, where N is the current value of the
          #   global errNum, then increments errNum. The test reads the first
          #   character of char with scan %c and expects that value to be num.
          #   The test script is braced, so $char is resolved only when test
          #   evaluates it. NOTE(review): this presumably relies on test
          #   running the script in this proc's scope - confirm in testing.tcl.
  139     global errNum
  140     test utf-10.$errNum {backslash substitution} {
  141         scan $char %c value
  142         set value
  143     } $num
  144     incr errNum
  145 }
  146 set errNum 6
      # utf-10.1 to utf-10.5 are written out by hand above, so the generated
      # tests start at utf-10.6. Each call passes an already-substituted
      # word and the character value expected for its first character.
  147 bsCheck \b      8
  148 bsCheck \e      101
  149 bsCheck \f      12
  150 bsCheck \n      10
  151 bsCheck \r      13
  152 bsCheck \t      9
  153 bsCheck \v      11
  154 bsCheck \{      123
  155 bsCheck \}      125
  156 bsCheck \[      91
  157 bsCheck \]      93
  158 bsCheck \$      36
  159 bsCheck \       32
  160 bsCheck \;      59
  161 bsCheck \\      92
      # The expected values 67 (C) and 77 (M) show that a backslash before C
      # or M just yields that letter.
  162 bsCheck \Ca     67
  163 bsCheck \Ma     77
  164 bsCheck \CMa    67
  165 # Prior to Tcl 8.3 this returned 8, because \8 was accepted as an
  166 # octal value - but it isn't! [Bug: 3975]
  167 bsCheck \8a     56
      # Octal escapes: 14 is 12 and 141 is 97. In the next call the word
      # starts with a plain b, so 98 is expected whatever follows it.
  168 bsCheck \14     12
  169 bsCheck \141    97
  170 bsCheck b\0     98
      # With no hex digit after it, a backslash-x or backslash-u is expected
      # to yield the letter itself (120 is x, 117 is u).
  171 bsCheck \x      120
  172 bsCheck \ua     10
  173 bsCheck \uA     10
  174 bsCheck \u41    65
  175 bsCheck \u      117
  176 bsCheck \uk     117
      # NOTE(review): the next three calls repeat the u41, ua and uA checks
      # made a few lines up, so they only add duplicate coverage under new
      # test numbers - confirm whether different escapes were intended.
  177 bsCheck \u41    65
  178 bsCheck \ua     10
  179 bsCheck \uA     10
  180 bsCheck \340    224
  181 bsCheck \ua1    161
  182 bsCheck \u4e21  20001
  183
      # utf-11.x: string toupper on the empty string, plain ASCII, a string
      # starting with U+00E3 and a string starting with U+01E3.
  184 test utf-11.1 {Tcl_UtfToUpper} {
  185     string toupper {}
  186 } {}
  187 test utf-11.2 {Tcl_UtfToUpper} {
  188     string toupper abc
  189 } ABC
  190 test utf-11.3 {Tcl_UtfToUpper} {
  191     string toupper \u00e3ab
  192 } \u00c3AB
  193 test utf-11.4 {Tcl_UtfToUpper} {
  194     string toupper \u01e3ab
  195 } \u01e2AB
  196
      # utf-12.x: the same four cases in the other direction with
      # string tolower.
  197 test utf-12.1 {Tcl_UtfToLower} {
  198     string tolower {}
  199 } {}
  200 test utf-12.2 {Tcl_UtfToLower} {
  201     string tolower ABC
  202 } abc
  203 test utf-12.3 {Tcl_UtfToLower} {
  204     string tolower \u00c3AB
  205 } \u00e3ab
  206 test utf-12.4 {Tcl_UtfToLower} {
  207     string tolower \u01e2AB
  208 } \u01e3ab
 209
  210
      # utf-14.x: string compare -nocase. utf-14.3 and utf-14.4 mix cases:
      # B against a, and aBcB against abca, are both expected to give 1,
      # i.e. the comparison is decided by b sorting after a once case is
      # ignored.
  211 test utf-14.1 {Tcl_UtfNcasecmp} {
  212     string compare -nocase a b
  213 } -1
  214 test utf-14.2 {Tcl_UtfNcasecmp} {
  215     string compare -nocase b a
  216 } 1
  217 test utf-14.3 {Tcl_UtfNcasecmp} {
  218     string compare -nocase B a
  219 } 1
  220 test utf-14.4 {Tcl_UtfNcasecmp} {
  221     string compare -nocase aBcB abca
  222 } 1
  223
      # utf-15.x: string toupper per character. In utf-15.2 the first
      # character (U+0178) is expected unchanged and the second (U+00FF) is
      # expected to become U+0178; utf-15.3 checks that ! passes through.
  224 test utf-15.1 {Tcl_UniCharToUpper, negative delta} {
  225     string toupper aA
  226 } AA
  227 test utf-15.2 {Tcl_UniCharToUpper, positive delta} {
  228     string toupper \u0178\u00ff
  229 } \u0178\u0178
  230 test utf-15.3 {Tcl_UniCharToUpper, no delta} {
  231     string toupper !
  232 } !
  233
      # utf-16.x: the mirror cases for string tolower; in utf-16.2 U+0178 is
      # expected to become U+00FF and U+00FF is expected unchanged.
  234 test utf-16.1 {Tcl_UniCharToLower, negative delta} {
  235     string tolower aA
  236 } aa
  237 test utf-16.2 {Tcl_UniCharToLower, positive delta} {
  238     string tolower \u0178\u00ff
  239 } \u00ff\u00ff
      # NOTE(review): this test is numbered utf-17.1 although its title and
      # position make it the third Tcl_UniCharToLower case (compare utf-15.3).
      # Confirm whether the ID is intentional before renumbering, since test
      # IDs may be referenced elsewhere.
  240 test utf-17.1 {Tcl_UniCharToLower, no delta} {
  241     string tolower !
  242 } !
 243
 244
 245 #test utf-21.1 {TclUniCharIsAlnum} {
 246 #    # this returns 1 with Unicode 3 compliance
 247 #    string is alnum \u1040\u021f
 248 #} {1}
 249 #test utf-21.2 {unicode alnum char in regc_locale.c} {
 250 #    # this returns 1 with Unicode 3 compliance
 251 #    list [regexp {^[[:alnum:]]+$} \u1040\u021f] [regexp {^\w+$} \u1040\u021f]
 252 #} {1 1}
 253
 254 #test utf-22.1 {TclUniCharIsWordChar} {
 255 #    string wordend "xyz123_bar fg" 0
 256 #} 10
 257 #test utf-22.2 {TclUniCharIsWordChar} {
 258 #    string wordend "x\u5080z123_bar\u203c fg" 0
 259 #} 10
 260
 261 #test utf-23.1 {TclUniCharIsAlpha} {
 262 #    # this returns 1 with Unicode 3 compliance
 263 #    string is alpha \u021f
 264 #} {1}
 265 #test utf-23.2 {unicode alpha char in regc_locale.c} {
 266 #    # this returns 1 with Unicode 3 compliance
 267 #    regexp {^[[:alpha:]]+$} \u021f
 268 #} {1}
 269 #
 270 #test utf-24.1 {TclUniCharIsDigit} {
 271 #    # this returns 1 with Unicode 3 compliance
 272 #    string is digit \u1040
 273 #} {1}
 274 #test utf-24.2 {unicode digit char in regc_locale.c} {
 275 #    # this returns 1 with Unicode 3 compliance
 276 #    list [regexp {^[[:digit:]]+$} \u1040] [regexp {^\d+$} \u1040]
 277 #} {1 1}
 278 #
 279 #test utf-24.3 {TclUniCharIsSpace} {
 280 #    # this returns 1 with Unicode 3 compliance
 281 #    string is space \u1680
 282 #} {1}
 283 #test utf-24.4 {unicode space char in regc_locale.c} {
 284 #    # this returns 1 with Unicode 3 compliance
 285 #    list [regexp {^[[:space:]]+$} \u1680] [regexp {^\s+$} \u1680]
 286 #} {1 1}
  287
      # Final step of the script. testreport is defined outside this file;
      # NOTE(review): presumably it prints the pass/fail summary for the tests
      # above - confirm in testing.tcl.
  288 testreport