maint: improve ERE in sc_tests_list_consistency
[coreutils.git] / tests / uniq / uniq-collate.sh
blob 34020b07a513f33dd0799844c1e687492b497194
#!/bin/sh
# before coreutils-8.32, uniq would not distinguish
# items which compared equal with strcoll()
# So ensure we avoid strcoll() for the following cases.

# Copyright (C) 2020-2024 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Source the shared test framework (srcdir defaults to "." when unset),
# then call its path_prepend_ helper with ./src.
# NOTE(review): path_prepend_ presumably puts the freshly built programs
# first in PATH -- confirm against tests/init.sh.
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
# NOTE(review): print_ver_ presumably logs the versions of the tools
# exercised below -- confirm against tests/init.sh.
print_ver_ uniq printf

# gen_input FORMAT [ARG]...
# Write printf-formatted data to the file "in" in the current directory,
# replacing any previous contents.  printf is run through env with LC_ALL
# set to $LOCALE_FR_UTF8; going through env bypasses the shell's builtin
# printf, so the printf found on PATH is the one executed.
# Calls framework_failure_ if printf or the redirection fails.
gen_input()
{
  # Quoted: as an argument to env (not a shell assignment prefix) the
  # expansion would otherwise undergo word splitting.
  env LC_ALL="$LOCALE_FR_UTF8" printf "$@" > in || framework_failure_
}

# Every case below writes two distinct lines to "in" and requires uniq to
# emit both of them.  The $(... | wc -l) substitution is left unquoted so
# that any blank padding around the count is removed by word splitting
# before the comparison with 2.

# strcoll() used to return 0 comparing the following strings
# which was fixed somewhere between glibc-2.22 and glibc-2.30
gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1

# normalization in strcoll is inconsistent across platforms.
# glibc based systems at least do _not_ normalize in strcoll,
# while cygwin systems for example may do so.
# á composed and decomposed, are generally not compared equal
gen_input '\u00E1\na\u0301\n'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
# Similarly with the following equivalent hangul characters
gen_input '\uAC01\n\u1100\u1161\u11A8\n'
test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1

# Note if running in the wrong locale,
# strcoll may indicate the strings match when they don't.
# I.e., cjk and hangul will now work even if
# uniq is running in the wrong locale
# hangul (ko_KR.utf8)
gen_input '\uAC00\n\uAC01\n'
test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
# CJK (zh_CN.utf8)
gen_input '\u3400\n\u3401\n'
test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1

# Note strcoll() ignores certain characters,
# but not if the strings are otherwise equal.
# I.e., the following on glibc-2.30 at least,
# as expected, does not print a single item,
# but testing here for illustration
gen_input ',a\n.a\n'
test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1

Exit $fail