libgo/go/exp/norm/normalize_test.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"bytes"
	"io"
	"strings"
	"testing"
)
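
// A PositionTest records an input string, the position (or other integer
// result) that the function under test should return for it, and, where
// applicable, the expected contents of the reorderBuffer afterwards.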
type PositionTest struct {
	input  string
	pos    int
	buffer string // expected contents of reorderBuffer, if applicable
}

type positionFunc func(rb *reorderBuffer, s string) int
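
// runPosTests runs fn on the input of each test and checks the returned
// position against test.pos and the resulting contents of the reorderBuffer
// against test.buffer.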
func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
	rb := reorderBuffer{}
	rb.init(f, nil)
	for i, test := range tests {
		rb.reset()
		rb.src = inputString(test.input)
		rb.nsrc = len(test.input)
		pos := fn(&rb, test.input)
		if pos != test.pos {
			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
		}
		runes := []rune(test.buffer)
		if rb.nrune != len(runes) {
			t.Errorf("%s:%d: reorder buffer length is %d; want %d", name, i, rb.nrune, len(runes))
			continue
		}
		for j, want := range runes {
			found := rune(rb.runeAt(j))
			if found != want {
				t.Errorf("%s:%d: rune at %d is %U; want %U", name, i, j, found, want)
			}
		}
	}
}
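
// For the decomposeSegment tests, pos is the number of bytes of input
// consumed and buffer is the decomposition of that segment.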
var decomposeSegmentTests = []PositionTest{
	// illegal runes
	{"\xC0", 0, ""},
	{"\u00E0\x80", 2, "\u0061\u0300"},
	// starter
	{"a", 1, "a"},
	{"ab", 1, "a"},
	// starter + composing
	{"a\u0300", 3, "a\u0300"},
	{"a\u0300b", 3, "a\u0300"},
	// with decomposition
	{"\u00C0", 2, "A\u0300"},
	{"\u00C0b", 2, "A\u0300"},
	// long
	{strings.Repeat("\u0300", 31), 62, strings.Repeat("\u0300", 31)},
	// ends with incomplete UTF-8 encoding
	{"\xCC", 0, ""},
	{"\u0300\xCC", 2, "\u0300"},
}

func decomposeSegmentF(rb *reorderBuffer, s string) int {
	rb.src = inputString(s)
	rb.nsrc = len(s)
	return decomposeSegment(rb, 0)
}

func TestDecomposeSegment(t *testing.T) {
	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
}
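
// For the FirstBoundary tests, pos is the offset of the first boundary in
// input, or -1 if input contains no boundary.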
var firstBoundaryTests = []PositionTest{
	// no boundary
	{"", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	// illegal runes
	{"\xff", 0, ""},
	{"\u0300\xff", 2, ""},
	{"\u0300\xc0\x80\x80", 2, ""},
	// boundaries
	{"a", 0, ""},
	{"\u0300a", 2, ""},
	// Hangul
	{"\u1103\u1161", 0, ""},
	{"\u110B\u1173\u11B7", 0, ""},
	{"\u1161\u110B\u1173\u11B7", 3, ""},
	{"\u1173\u11B7\u1103\u1161", 6, ""},
	// too many combining characters.
	{strings.Repeat("\u0300", maxCombiningChars-1), -1, ""},
	{strings.Repeat("\u0300", maxCombiningChars), 60, ""},
	{strings.Repeat("\u0300", maxCombiningChars+1), 60, ""},
}

func firstBoundaryF(rb *reorderBuffer, s string) int {
	return rb.f.form.FirstBoundary([]byte(s))
}

func firstBoundaryStringF(rb *reorderBuffer, s string) int {
	return rb.f.form.FirstBoundaryInString(s)
}

func TestFirstBoundary(t *testing.T) {
	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
}

var decomposeToLastTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	{"a\u0301\u0635", 5, ""},
	// ends with non-inert starter
	{"a", 0, "a"},
	{"a\u0301a", 3, "a"},
	{"a\u0301\u03B9", 3, "\u03B9"},
	{"a\u0327", 0, "a\u0327"},
	// illegal runes
	{"\xFF", 1, ""},
	{"aa\xFF", 3, ""},
	{"\xC0\x80\x80", 3, ""},
	{"\xCC\x80\x80", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"a\xCC", 2, ""},
	// ends with combining characters
	{"\u0300\u0301", 0, "\u0300\u0301"},
	{"a\u0300\u0301", 0, "a\u0300\u0301"},
	{"a\u0301\u0308", 0, "a\u0301\u0308"},
	{"a\u0308\u0301", 0, "a\u0308\u0301"},
	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
	{"\u00C0", 0, "A\u0300"},
	{"a\u00C0", 1, "A\u0300"},
	// decomposing
	{"a\u0300\uFDC0", 3, "\u0645\u062C\u064A"},
	{"\uFDC0" + strings.Repeat("\u0300", 26), 0, "\u0645\u062C\u064A" + strings.Repeat("\u0300", 26)},
	// Hangul
	{"a\u1103", 1, "\u1103"},
	{"a\u110B", 1, "\u110B"},
	{"a\u110B\u1173", 1, "\u110B\u1173"},
	// See comment in composition.go:compBoundaryAfter.
	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
	{"a\uC73C", 1, "\u110B\u1173"},
	{"다음", 3, "\u110B\u1173\u11B7"},
	{"다", 0, "\u1103\u1161"},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
	{"다음음", 6, "\u110B\u1173\u11B7"},
	{"음다다", 6, "\u1103\u1161"},
	// buffer overflow
	{"a" + strings.Repeat("\u0300", 30), 3, strings.Repeat("\u0300", 29)},
	{"\uFDFA" + strings.Repeat("\u0300", 14), 3, strings.Repeat("\u0300", 14)},
	// weird UTF-8
	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
}
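
// decomposeToLast decomposes the trailing segment of s into rb and returns
// the number of bytes of s that precede that segment.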
func decomposeToLast(rb *reorderBuffer, s string) int {
	buf := decomposeToLastBoundary(rb, []byte(s))
	return len(buf)
}

func TestDecomposeToLastBoundary(t *testing.T) {
	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
}
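
// For the LastBoundary tests, pos is the offset of the last boundary in
// input, or -1 if there is none.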
var lastBoundaryTests = []PositionTest{
	// ends with inert character
	{"Hello!", 6, ""},
	{"\u0632", 2, ""},
	// ends with non-inert starter
	{"a", 0, ""},
	// illegal runes
	{"\xff", 1, ""},
	{"aa\xff", 3, ""},
	{"a\xff\u0300", 1, ""},
	{"\xc0\x80\x80", 3, ""},
	{"\xc0\x80\x80\u0300", 3, ""},
	// ends with incomplete UTF-8 encoding
	{"\xCC", -1, ""},
	{"\xE0\x80", -1, ""},
	{"\xF0\x80\x80", -1, ""},
	{"a\xCC", 0, ""},
	{"\x80\xCC", 1, ""},
	{"\xCC\xCC", 1, ""},
	// ends with combining characters
	{"a\u0300\u0301", 0, ""},
	{"aaaa\u0300\u0301", 3, ""},
	{"\u0300a\u0300\u0301", 2, ""},
	{"\u00C0", 0, ""},
	{"a\u00C0", 1, ""},
	// decomposition may recombine
	{"\u0226", 0, ""},
	// no boundary
	{"", -1, ""},
	{"\u0300\u0301", -1, ""},
	{"\u0300", -1, ""},
	{"\x80\x80", -1, ""},
	{"\x80\x80\u0301", -1, ""},
	// Hangul
	{"다음", 3, ""},
	{"다", 0, ""},
	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
	// too many combining characters.
	{strings.Repeat("\u0300", maxCombiningChars-1), -1, ""},
	{strings.Repeat("\u0300", maxCombiningChars), 60, ""},
	{strings.Repeat("\u0300", maxCombiningChars+1), 62, ""},
}

func lastBoundaryF(rb *reorderBuffer, s string) int {
	return rb.f.form.LastBoundary([]byte(s))
}

func TestLastBoundary(t *testing.T) {
	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
}
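
// For the QuickSpan tests, pos is the result of QuickSpan: a boundary such
// that the first pos bytes of input are already in the normal form under test.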
var quickSpanTests = []PositionTest{
	{"", 0, ""},
	// starters
	{"a", 1, ""},
	{"abc", 3, ""},
	{"\u043Eb", 3, ""},
	// incomplete last rune.
	{"\xCC", 1, ""},
	{"a\xCC", 2, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"\u0300\u0316cd", 0, ""},
	// have a maximum number of combining characters.
	{strings.Repeat("\u035D", 30) + "\u035B", 62, ""},
	{"a" + strings.Repeat("\u035D", 30) + "\u035B", 63, ""},
	{"Ɵ" + strings.Repeat("\u035D", 30) + "\u035B", 64, ""},
	{"aa" + strings.Repeat("\u035D", 30) + "\u035B", 64, ""},
}

var quickSpanNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 3, ""},
	// correctly ordered combining characters
	{"\u0300", 2, ""},
	{"ab\u0300", 4, ""},
	{"ab\u0300cd", 6, ""},
	{"\u0300cd", 4, ""},
	{"\u0316\u0300", 4, ""},
	{"ab\u0316\u0300", 6, ""},
	{"ab\u0316\u0300cd", 8, ""},
	{"ab\u0316\u0300\u00C0", 6, ""},
	{"\u0316\u0300cd", 6, ""},
	{"\u043E\u0308b", 5, ""},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", 1, ""}, // TODO: we could skip 'b' as well.
	{"ab\u0300\u0316cd", 1, ""},
	// Hangul
	{"같은", 0, ""},
}

var quickSpanNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 2, ""},
	{"abc\u00C0", 5, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 4, ""},
	// we do not special case leading combining characters
	{"\u0300cd", 0, ""},
	{"\u0300", 0, ""},
	{"\u0316\u0300", 0, ""},
	{"\u0316\u0300cd", 0, ""},
	// incorrectly ordered combining characters
	{"ab\u0300\u0316", 1, ""},
	{"ab\u0300\u0316cd", 1, ""},
	// Hangul
	{"같은", 6, ""},
}

func doQuickSpan(rb *reorderBuffer, s string) int {
	return rb.f.form.QuickSpan([]byte(s))
}

func doQuickSpanString(rb *reorderBuffer, s string) int {
	return rb.f.form.QuickSpanString(s)
}

func TestQuickSpan(t *testing.T) {
	runPosTests(t, "TestQuickSpanNFD1", NFD, doQuickSpan, quickSpanTests)
	runPosTests(t, "TestQuickSpanNFD2", NFD, doQuickSpan, quickSpanNFDTests)
	runPosTests(t, "TestQuickSpanNFC1", NFC, doQuickSpan, quickSpanTests)
	runPosTests(t, "TestQuickSpanNFC2", NFC, doQuickSpan, quickSpanNFCTests)

	runPosTests(t, "TestQuickSpanStringNFD1", NFD, doQuickSpanString, quickSpanTests)
	runPosTests(t, "TestQuickSpanStringNFD2", NFD, doQuickSpanString, quickSpanNFDTests)
	runPosTests(t, "TestQuickSpanStringNFC1", NFC, doQuickSpanString, quickSpanTests)
	runPosTests(t, "TestQuickSpanStringNFC2", NFC, doQuickSpanString, quickSpanNFCTests)
}
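
// For the IsNormal tests, pos is 1 if the input is in the normal form under
// test and 0 otherwise.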
var isNormalTests = []PositionTest{
	{"", 1, ""},
	// illegal runes
	{"\xff", 1, ""},
	// starters
	{"a", 1, ""},
	{"abc", 1, ""},
	{"\u043Eb", 1, ""},
	// incorrectly ordered combining characters
	{"\u0300\u0316", 0, ""},
	{"ab\u0300\u0316", 0, ""},
	{"ab\u0300\u0316cd", 0, ""},
	{"\u0300\u0316cd", 0, ""},
}
var isNormalNFDTests = []PositionTest{
	// needs decomposing
	{"\u00C0", 0, ""},
	{"abc\u00C0", 0, ""},
	// correctly ordered combining characters
	{"\u0300", 1, ""},
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"\u0300cd", 1, ""},
	{"\u0316\u0300", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	{"\u043E\u0308b", 1, ""},
	// Hangul
	{"같은", 0, ""},
}
var isNormalNFCTests = []PositionTest{
	// okay composed
	{"\u00C0", 1, ""},
	{"abc\u00C0", 1, ""},
	// need reordering
	{"a\u0300", 0, ""},
	{"a\u0300cd", 0, ""},
	{"a\u0316\u0300", 0, ""},
	{"a\u0316\u0300cd", 0, ""},
	// correctly ordered combining characters
	{"ab\u0300", 1, ""},
	{"ab\u0300cd", 1, ""},
	{"ab\u0316\u0300", 1, ""},
	{"ab\u0316\u0300cd", 1, ""},
	{"\u00C0\u035D", 1, ""},
	{"\u0300", 1, ""},
	{"\u0316\u0300cd", 1, ""},
	// Hangul
	{"같은", 1, ""},
}

func isNormalF(rb *reorderBuffer, s string) int {
	if rb.f.form.IsNormal([]byte(s)) {
		return 1
	}
	return 0
}

func TestIsNormal(t *testing.T) {
	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
}
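
// An AppendTest gives the initial contents of a destination buffer (left),
// the text to append and normalize (right), and the expected contents of the
// buffer afterwards (out).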
type AppendTest struct {
	left  string
	right string
	out   string
}

type appendFunc func(f Form, out []byte, s string) []byte
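
// runAppendTests runs fn for each test and compares the result against
// test.out, reporting the first differing rune with some context on a mismatch.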
func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
	for i, test := range tests {
		out := []byte(test.left)
		out = fn(f, out, test.right)
		outs := string(out)
		if len(outs) != len(test.out) {
			t.Errorf("%s:%d: length is %d; want %d", name, i, len(outs), len(test.out))
		}
		if outs != test.out {
			// Find first rune that differs and show context.
			ir := []rune(outs)
			ig := []rune(test.out)
			for j := 0; j < len(ir) && j < len(ig); j++ {
				if ir[j] == ig[j] {
					continue
				}
				if j -= 3; j < 0 {
					j = 0
				}
				for e := j + 7; j < e && j < len(ir) && j < len(ig); j++ {
					t.Errorf("%s:%d: runeAt(%d) = %U; want %U", name, i, j, ir[j], ig[j])
				}
				break
			}
		}
	}
}

var appendTests = []AppendTest{
	// empty buffers
	{"", "", ""},
	{"a", "", "a"},
	{"", "a", "a"},
	{"", "\u0041\u0307\u0304", "\u01E0"},
	// segment split across buffers
	{"", "a\u0300b", "\u00E0b"},
	{"a", "\u0300b", "\u00E0b"},
	{"a", "\u0300\u0316", "\u00E0\u0316"},
	{"a", "\u0316\u0300", "\u00E0\u0316"},
	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
	{"a\u0300", "\u0327", "\u00E0\u0327"},
	{"a\u0327", "\u0300", "\u00E0\u0327"},
	{"a\u0316", "\u0300", "\u00E0\u0316"},
	{"\u0041\u0307", "\u0304", "\u01E0"},
	// Hangul
	{"", "\u110B\u1173", "\uC73C"},
	{"", "\u1103\u1161", "\uB2E4"},
	{"", "\u110B\u1173\u11B7", "\uC74C"},
	{"", "\u320E", "\x28\uAC00\x29"},
	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
	{"\u1103", "\u1161", "\uB2E4"},
	{"\u110B", "\u1173\u11B7", "\uC74C"},
	{"\u110B\u1173", "\u11B7", "\uC74C"},
	{"\uC73C", "\u11B7", "\uC74C"},
	// UTF-8 encoding split across buffers
	{"a\xCC", "\x80", "\u00E0"},
	{"a\xCC", "\x80b", "\u00E0b"},
	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
	{"a\xCC", "\x80\x80", "\u00E0\x80"},
	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
	// ending in incomplete UTF-8 encoding
	{"", "\xCC", "\xCC"},
	{"a", "\xCC", "a\xCC"},
	{"a", "b\xCC", "ab\xCC"},
	{"\u0226", "\xCC", "\u0226\xCC"},
	// illegal runes
	{"", "\x80", "\x80"},
	{"", "\x80\x80\x80", "\x80\x80\x80"},
	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
	{"", "a\x80", "a\x80"},
	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
	// overflow
	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
	// overflow of combining characters
	{strings.Repeat("\u0300", 33), "", strings.Repeat("\u0300", 33)},
	// weird UTF-8
	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
}

func appendF(f Form, out []byte, s string) []byte {
	return f.Append(out, []byte(s)...)
}

func appendStringF(f Form, out []byte, s string) []byte {
	return f.AppendString(out, s)
}

func bytesF(f Form, out []byte, s string) []byte {
	buf := []byte{}
	buf = append(buf, out...)
	buf = append(buf, s...)
	return f.Bytes(buf)
}

func stringF(f Form, out []byte, s string) []byte {
	outs := string(out) + s
	return []byte(f.String(outs))
}

func TestAppend(t *testing.T) {
	runAppendTests(t, "TestAppend", NFKC, appendF, appendTests)
	runAppendTests(t, "TestAppendString", NFKC, appendStringF, appendTests)
	runAppendTests(t, "TestBytes", NFKC, bytesF, appendTests)
	runAppendTests(t, "TestString", NFKC, stringF, appendTests)
}
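
// TestStringBasic is a small extra spot check of the exported String method.
// It only assumes the standard canonical mapping between "a" + U+0300 and
// U+00E0, which the append tests above also rely on.
func TestStringBasic(t *testing.T) {
	if got := NFC.String("a\u0300"); got != "\u00E0" {
		t.Errorf(`NFC.String("a" + U+0300) = %+q; want U+00E0`, got)
	}
	if got := NFD.String("\u00E0"); got != "a\u0300" {
		t.Errorf(`NFD.String(U+00E0) = %+q; want "a" + U+0300`, got)
	}
}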

func appendBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		f.Append(buf, in...)
	}
}

func iterBench(f Form, in []byte) func() {
	iter := Iter{}
	return func() {
		iter.Init(f, in)
		for !iter.Done() {
			iter.Next()
		}
	}
}

func readerBench(f Form, in []byte) func() {
	buf := make([]byte, 4*len(in))
	return func() {
		r := f.Reader(bytes.NewReader(in))
		var err error
		for err == nil {
			_, err = r.Read(buf)
		}
		if err != io.EOF {
			panic("")
		}
	}
}

func writerBench(f Form, in []byte) func() {
	buf := make([]byte, 0, 4*len(in))
	return func() {
		r := f.Writer(bytes.NewBuffer(buf))
		if _, err := r.Write(in); err != nil {
			panic("")
		}
	}
}

func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
	//bm = append(bm, appendBench(f, in))
	bm = append(bm, iterBench(f, in))
	//bm = append(bm, readerBench(f, in))
	//bm = append(bm, writerBench(f, in))
	return bm
}
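
// doFormBenchmark first converts s to form inf and then measures normalizing
// that input to form f using the closures selected in appendBenchmarks.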
func doFormBenchmark(b *testing.B, inf, f Form, s string) {
	b.StopTimer()
	in := inf.Bytes([]byte(s))
	bm := appendBenchmarks(nil, f, in)
	b.SetBytes(int64(len(in) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, fn := range bm {
			fn()
		}
	}
}

var ascii = strings.Repeat("There is nothing to change here! ", 500)

func BenchmarkNormalizeAsciiNFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, ascii)
}
func BenchmarkNormalizeAsciiNFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, ascii)
}
func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
	doFormBenchmark(b, NFC, NFKC, ascii)
}
func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
	doFormBenchmark(b, NFC, NFKD, ascii)
}

func BenchmarkNormalizeNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_all)
}
func BenchmarkNormalizeNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_all)
}
func BenchmarkNormalizeNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_all)
}
func BenchmarkNormalizeNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_all)
}

// Hangul is often special-cased, so we test it separately.
func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
	doFormBenchmark(b, NFC, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
	doFormBenchmark(b, NFC, NFD, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
	doFormBenchmark(b, NFD, NFC, txt_kr)
}
func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
	doFormBenchmark(b, NFD, NFD, txt_kr)
}

var forms = []Form{NFC, NFD, NFKC, NFKD}
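
// doTextBenchmark times one pass of every form in forms over s, using the
// closures selected in appendBenchmarks.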
func doTextBenchmark(b *testing.B, s string) {
	b.StopTimer()
	in := []byte(s)
	bm := []func(){}
	for _, f := range forms {
		bm = appendBenchmarks(bm, f, in)
	}
	b.SetBytes(int64(len(s) * len(bm)))
	b.StartTimer()
	for i := 0; i < b.N; i++ {
		for _, f := range bm {
			f()
		}
	}
}

func BenchmarkCanonicalOrdering(b *testing.B) {
	doTextBenchmark(b, txt_canon)
}
func BenchmarkExtendedLatin(b *testing.B) {
	doTextBenchmark(b, txt_vn)
}
func BenchmarkMiscTwoByteUtf8(b *testing.B) {
	doTextBenchmark(b, twoByteUtf8)
}
func BenchmarkMiscThreeByteUtf8(b *testing.B) {
	doTextBenchmark(b, threeByteUtf8)
}
func BenchmarkHangul(b *testing.B) {
	doTextBenchmark(b, txt_kr)
}
func BenchmarkJapanese(b *testing.B) {
	doTextBenchmark(b, txt_jp)
}
func BenchmarkChinese(b *testing.B) {
	doTextBenchmark(b, txt_cn)
}
func BenchmarkOverflow(b *testing.B) {
	doTextBenchmark(b, overflow)
}
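
// overflow is a string with far more combining characters than fit in a
// single reorder buffer segment.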
var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"

// Tests sampled from the Canonical ordering tests (Part 2) of
// http://unicode.org/Public/UNIDATA/NormalizationTest.txt
const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
\u0061\u059A\u0316\u302A\u0339 \u0061\u0341\u0315\u0300\u05AE\u0062
\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
một giấy phép khác có các điều khoản tương tự như giấy phép này
cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
chúng theo quy định của pháp luật thì tình trạng của nó không
bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
const txt_ru = `При обязательном соблюдении следующих условий:
Attribution — Вы должны атрибутировать произведение (указывать
автора и источник) в порядке, предусмотренном автором или
лицензиаром (но только так, чтобы никоим образом не подразумевалось,
что они поддерживают вас или использование вами данного произведения).
Υπό τις ακόλουθες προϋποθέσεις:`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
παρόμοια άδεια.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
لهذا الترخيص.`

// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`

const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il

// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
라이선스와 동일한 라이선스를 적용해야 합니다.`

// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`

const threeByteUtf8 = txt_th

// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
const txt_jp = `あなたの従うべき条件は以下の通りです。
表示 — あなたは原著作者のクレジットを表示しなければなりません。
継承 — もしあなたがこの作品を改変、変形または加工した場合、
あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
頒布することができます。`

// http://creativecommons.org/licenses/by-sa/2.5/cn/
const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
广播或通过信息网络传播本作品 创作演绎作品
对本作品进行商业性使用 惟须遵守下列条件:
署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`

const txt_cjk = txt_cn + txt_jp + txt_kr
const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk