source/texk/web2c/luatexdir/slnunicode/unitest

   1 #!/opt/lua-5.0.2/bin/lua
   2 --      there are four string-like ctype closures:
   3 --      unicode.ascii, latin1, utf8 and grapheme
   4 --
   5 --      ascii and latin1 are single-byte like string,
   6 --      but use the unicode table for upper/lower and character classes
   7 --      ascii does not touch bytes > 127 on upper/lower
   8 --
   9 --      ascii or latin1 can be used as locale-independent string replacement.
  10 --      (There is a compile switch to do this automatically for ascii).
  11 --
  12 --      UTF-8 operates on UTF-8 sequences as of RFC 3629:
  13 --      1 byte 0-7F, 2 byte 80-7FF, 3 byte 800-FFFF, 4 byte 10000-10FFFF
  14 --      (not excluding UTF-16 surrogate characters)
  15 --      Any byte not part of such a sequence is treated as its (Latin-1) value.
  16 --
  17 --      Grapheme takes care of grapheme clusters, which are characters followed by
  18 --      "grapheme extension" characters (Mn+Me) like combining diacritical marks.
  19 --
  20 --      calls are:
  21 --      len(str)
  22 --      sub(str, start [,end=-1])
  23 --      byte(str, start [,end=-1])
  24 --      lower(str)
  25 --      upper(str)
  26 --      char(i [,j...])
  27 --      reverse(str)
  28 --
  29 --      same as in string: rep, format, dump
  30 --      TODO: use char count with %s in format? (sub does the job)
  31 --      TODO: grapheme.byte: only first code of any cluster?
  32 --
  33 --      find, gfind, gsub: done, but need thorough testing ...:
  34 --      ascii does not match them on any %class (but on ., literals and ranges)
  35 --      behaviour of %class with class not ASCII is undefined
  36 --      frontier %f currently disabled -- should we?
  37 --
  38 --      character classes are:
  39 --      %a L* (Lu+Ll+Lt+Lm+Lo)
  40 --      %c Cc
  41 --      %d 0-9
  42 --      %l Ll
  43 --      %n N* (Nd+Nl+No, new)
  44 --      %p P* (Pc+Pd+Ps+Pe+Pi+Pf+Po)
  45 --      %s Z* (Zs+Zl+Zp) plus the controls 9-13 (HT,LF,VT,FF,CR)
  46 --      %u Lu (also Lt ?)
  47 --      %w %a+%n+Pc (e.g. '_')
  48 --      %x 0-9A-Za-z
  49 --      %z the 0 byte
  50 --      c.f. http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
  51 --      http://unicode.org/Public/UNIDATA/UnicodeData.txt
  52 --
  53 --      NOTE: find positions are in bytes for all ctypes!
  54 --      use ascii.sub to cut found ranges!
  55 --      this is a) faster b) more reliable
  56 --
  57 --      UTF-8 behaviour: match is by codes, code ranges are supported
  58 --
  59 --      grapheme behaviour: any %class, '.' and range match includes
  60 --      any following grapheme extensions.
  61 --      Ranges apply to single code points only.
  62 --      If a [] enumeration contains a grapheme cluster,
  63 --      this matches only the exact same cluster.
  64 --      However, a literal single 'o' standalone or in an [] enumeration
  65 --      will match just that 'o', even if it has an extension in the string.
  66 --      Consequently, grapheme match positions are not always cluster positions.
  67 --
  68
  69 local unicode = require("unicode")
  70 local utf8 = unicode.utf8
  -- Make the stock string library reachable as unicode.string, so the test
  -- helpers can select it through unicode[ctype] like any other ctype.
  unicode.string = string
  local sprintf = string.format

  --- Format the arguments with string.format and print the resulting line.
  -- @param fmt  format string
  -- @param ...  values substituted into fmt
  local function printf (fmt, ...)
      local line = sprintf(fmt, ...)
      return print(line)
  end
  74
  --- Report the outcome of one test on stdout.
  -- Prints "ok  <test> = <expected>" when the two values are equal (==),
  -- otherwise a "NOK" line that also shows the value actually obtained.
  -- @param test  label describing the call under test
  -- @param ok    expected value
  -- @param got   value actually produced; nil is shown as "<nil>"
  local function check (test, ok, got)
      -- Values go through tostring() so that the report itself cannot raise:
      -- in Lua 5.1 string.format's %s rejects nil, boolean and table arguments.
      if ok == got then
          return printf("ok  %s = %s", test, tostring(ok))
      end
      -- Only a missing result is shown as <nil>; a false result stays visible.
      if got == nil then got = "<nil>" end
      return printf("NOK %s = %s GOT '%s'", test, tostring(ok), tostring(got))
  end
  --- Variant of check for calls that return several values.
  -- The extra arguments are joined with "," and the joined text is compared
  -- against the expected string; a missing first result becomes "".
  -- @param test  label describing the call under test
  -- @param ok    expected comma-separated result
  -- @param ...   values returned by the call under test
  local function checka (test, ok, ...)
      local results = {...}
      if not results[1] then results[1] = "" end
      local joined = table.concat(results, ",")
      return check(test, ok, joined)
  end
  84
  85
  --- Check the length of a string as seen by string, utf8 and grapheme.
  -- The three lengths are compared together as one "bytes/codes/chars" text.
  -- @param str    string under test
  -- @param bytes  expected string.len result
  -- @param codes  expected utf8.len result (defaults to bytes)
  -- @param chars  expected unicode.grapheme.len result (defaults to codes)
  local function testlen (str,bytes,codes,chars)
      if not codes then codes = bytes end
      if not chars then chars = codes end
      local label = sprintf("len '%s'", str)
      local expected = sprintf("%d/%d/%d", bytes, codes, chars)
      local measured = sprintf("%d/%d/%d",
          string.len(str), utf8.len(str), unicode.grapheme.len(str))
      return check(label, expected, measured)
  end

  -- 176 = 00B0;DEGREE SIGN -- UTF-8: C2,B0 = \194\176
  -- 196 = 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
  -- 214 = 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
  -- 776 = 0308;COMBINING DIAERESIS -- UTF-8: CC,88 = \204\136
  -- Each case is { str, bytes [, codes [, chars]] }.
  for _, case in ipairs({
      { "A\tB", 3 }, -- plain Latin-1
      { "\176\196\214", 3 }, -- plain Latin-1
      { "\196\176\214", 3, 2 }, -- C4,B0 is valid seq 0130 I WITH DOT ABOVE
      { "\192\178", 2 }, -- C0,B2 is bad seq for 2
      { "°ÄÖ", 6, 3 }, -- simple Latin-1 chars in UTF-8
      { "\204\136A\204\136O\204\136", 8, 5, 3 }, -- decomposed (with broken lead)
  }) do
      testlen(case[1], case[2], case[3], case[4])
  end
 104
 105
 --- Check unicode[ctype].sub(str, start, e) against the expected substring.
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected substring
 -- @param str    string under test
 -- @param start  start position passed to sub
 -- @param e      end position passed to sub
 local function testsub (ctype,ok,str,start,e)
     local label = sprintf("%s.sub('%s',%d,%d)", ctype, str, start, e)
     local got = unicode[ctype].sub(str, start, e)
     return check(label, ok, got)
 end

 -- Each case is { ctype, expected, str, start, end }.
 for _, case in ipairs({
     { "ascii", "BCD", "ABCDE", 2, 4 },
     { "utf8", "BCD", "ABCDE", 2, 4 },
     { "latin1", "Ä", "°ÄÖ", 3, 4 },
     { "utf8", "Ä", "°ÄÖ", 2, 2 },
     { "utf8", "ÄÖ", "°ÄÖ", 2, -1 },
     { "utf8", "\204\136", "A\204\136O\204\136", 2, 2 }, -- decomposed
     { "grapheme", "O\204\136", "A\204\136O\204\136", 2, 2 }, -- decomposed
 }) do
     testsub(case[1], case[2], case[3], case[4], case[5])
 end
 117
 118
 --- Check the values returned by unicode[ctype].byte(str, ...).
 -- All returned values are joined with "," before being compared.
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected comma-separated result
 -- @param str    string under test
 -- @param ...    position arguments forwarded to byte
 local function testbyte (ctype, ok, str, ...)
     local positions = table.concat({...}, ",")
     local label = sprintf("%s.byte('%s',%s)", ctype, str, positions)
     -- The call stays last in the argument list so every result is forwarded.
     return checka(label, ok, unicode[ctype].byte(str, ...))
 end

 do
     local precomposed = "Ä°Ö"
     local decomposed = "\204\136A\204\136O\204\136"
     testbyte("string","194,176",precomposed,3,4) -- the UTF-8 seq for °
     testbyte("ascii","194,176",precomposed,3,4)
     testbyte("utf8","176,214",precomposed,2,3) -- code points for °,Ö
     testbyte("utf8","65,776",decomposed,2,3) -- decomposed
     testbyte("grapheme","65,776",decomposed,2) -- decomposed
 end
 128
 129
 --- Check the string built by unicode[ctype].char(...).
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected string
 -- @param ...    numeric codes forwarded to char
 local function testchar (ctype, ok, ...)
     local codes = table.concat({...}, ",")
     local label = sprintf("%s.char(%s)", ctype, codes)
     return check(label, ok, unicode[ctype].char(...))
 end

 testchar("ascii", "AB", 65, 66)
 testchar("ascii", "\176", 176)
 testchar("utf8", "\194\176", 176)
 137
 138
 --- Check both case conversions of unicode[ctype] on one string.
 -- lower is checked first, then upper.
 -- @param ctype  name of the library table inside unicode
 -- @param str    string under test
 -- @param up     expected result of upper(str)
 -- @param lo     expected result of lower(str)
 local function testcase (ctype,str,up,lo)
     local lower_label = sprintf("%s.lower('%s')", ctype, str)
     check(lower_label, lo, unicode[ctype].lower(str))
     local upper_label = sprintf("%s.upper('%s')", ctype, str)
     check(upper_label, up, unicode[ctype].upper(str))
 end

 -- upper/lower also fixes plain Latin
 -- Each case is { ctype, str, upper, lower }.
 for _, case in ipairs({
     { "utf8", "Ab\196üo\204\136", "ABÄÜO\204\136", "abäüo\204\136" },
     { "ascii", "Ab\196üo\204\136", "AB\196üO\204\136", "ab\196üo\204\136" },
     { "latin1", "Ab\196", "AB\196", "ab\228" },
 }) do
     testcase(case[1], case[2], case[3], case[4])
 end
 147
 148
 --- Check unicode[ctype].reverse(str) against the expected string.
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected reversed string
 -- @param str    string under test
 local function testrev (ctype,ok,str)
     local label = sprintf("%s.reverse('%s')", ctype, str)
     local got = unicode[ctype].reverse(str)
     return check(label, ok, got)
 end

 do
     -- The same input is reversed by bytes, by codes and by grapheme clusters.
     local input = "ab°ao\204\136b"
     for _, case in ipairs({
         { "ascii", "b\136\204oa\176\194ba" },
         { "utf8", "b\204\136oa°ba" },
         { "grapheme", "bo\204\136a°ba" },
     }) do
         testrev(case[1], case[2], input)
     end
 end
 156
 157
 158
 --- Check the values returned by unicode[ctype].find(str, pat).
 -- All returned values (positions and any captures) are joined with ","
 -- before being compared; no match is expected as "".
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected comma-separated result
 -- @param str    string under test
 -- @param pat    pattern passed to find
 local function testfind (ctype,ok,str,pat)
     local label = sprintf("%s.find('%s','%s')", ctype, str, pat)
     -- The call stays last in the argument list so every result is forwarded.
     return checka(label, ok, unicode[ctype].find(str, pat))
 end

 -- Each case is { ctype, expected, str, pattern }.
 for _, case in ipairs({
     { "ascii", "1,1", "e=mc2", "%a" },
     { "ascii", "3,4", "e=mc2", "%a%a" },
     { "ascii", "5,5", "e=mc2", "%d" },
     { "ascii", "", "Ä", "%a" },
     { "ascii", "1,2", "Ä", "%A*" },
     { "latin1", "1,1", "Ä", "%a" },
     { "utf8", "1,2", "Ä", "%a" },
     { "utf8", "1,1", "o\204\136", "%a*" },
     { "utf8", "2,3", "o\204\136", "%A" },
     { "utf8", "1,1", "o\204\136", "." },
     { "grapheme", "1,3", "o\204\136", "%a*" },
     { "grapheme", "2,3", "o\204\136", "%A" }, -- didn't expect this?
     { "grapheme", "1,3", "o\204\136", "." },
     { "utf8", "4,5", "ÜHÄPPY", "[À-Ö]" },
     { "utf8", "4,5", "ÜHÄPPY", "[Ä-]" },
     { "utf8", "7,7", "ÜHÄP-PY", "[ä-]" },
     { "ascii", "1,4", "abcdef", "%a*d" },
     { "utf8", "1,10", "äöüßü", "%a*ü" },
     { "utf8", "1,6", "äöüß", "%a*ü" },
     { "utf8", "4,5,Ä", "ÜHÄPPY", "([À-Ö])" },
     { "utf8", "1,5,ÜHÄ", "ÜHÄ_PPY", "([%w]+)" },
     { "utf8", "1,9,ÜHÄ_PPY", "ÜHÄ_PPY", "([%w_]+)" },
 }) do
     testfind(case[1], case[2], case[3], case[4])
 end
 185
 186
 --- Check the string returned by unicode[ctype].gsub(str, pat, repl).
 -- Only the first value returned by gsub is compared.
 -- @param ctype  name of the library table inside unicode
 -- @param ok     expected resulting string
 -- @param str    string under test
 -- @param pat    pattern passed to gsub
 -- @param repl   replacement passed to gsub
 local function testgsub (ctype,ok,str,pat,repl)
     local label = sprintf("%s.gsub('%s','%s','%s')", ctype, str, pat, repl)
     local got = unicode[ctype].gsub(str, pat, repl)
     return check(label, ok, got)
 end

 -- Each case is { ctype, expected, str, pattern, replacement }.
 for _, case in ipairs({
     { "ascii", "hello hello world world", "hello world", "(%w+)", "%1 %1" },
     { "ascii", "world hello Lua from",
         "hello world from Lua", "(%w+)%s*(%w+)", "%2 %1" },
     { "ascii", "l helö wöfr rldöL müä",
         "hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1" },
     { "utf8", "wörld hellö Lüä fröm",
         "hellö wörld fröm Lüä", "(%w+)%s*(%w+)", "%2 %1" },
     { "utf8", "HÜppÄ", "HÄppÜ", "([À-Ö])(%l*)(%u)", "%3%2%1" },
 }) do
     testgsub(case[1], case[2], case[3], case[4], case[5])
 end
 199
 200
 -- Round-trip every code 0..65535 through utf8.char and utf8.byte and count
 -- the codes that do not come back unchanged.
 -- The counter is local: a bare assignment would create a global named "fail".
 local fail = 0
 for code = 0, 65535 do
     if code ~= utf8.byte(utf8.char(code)) then
         fail = fail + 1
     end
 end
 check("code-decode failures", 0, fail)
 204
 205 --[[ print the table
 206 for i=192,65535,64 do
 207         local k = i/64
 208         io.write(sprintf("%04x\\%3d\\%3d ",i, 224+k/64, 128+math.mod(k,64)))
 209         for j=i,i+63 do
 210                 io.write(utf8.char(j))
 211         end
 212         io.write("\n")
 213 end
 214 ]]
 215