unicode: update the width tables to Unicode 15
[git.git] / t / lib-unicode-nfc-nfd.sh
blob22232247efc34d28804311f44ead4661b9ad2afd
# Help detect how Unicode NFC and NFD are handled on the filesystem.
#
# A simple character that has an NFD form.
#
# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
# UTF8(NFC): \xc3 \xa9
#
# NFD:       U+0065 LATIN SMALL LETTER E
#            U+0301 COMBINING ACUTE ACCENT
# UTF8(NFD): \x65 + \xcc \x81
# Spell the bytes with octal escapes: POSIX printf only specifies
# "\ddd" octal sequences in the format string, and some shells'
# builtin printf (e.g. dash) does not understand "\xHH".
#
#   \303\251     = c3 a9     (NFC: U+00e9)
#   \145\314\201 = 65 cc 81  (NFD: U+0065 U+0301)
utf8_nfc=$(printf "\303\251")
utf8_nfd=$(printf "\145\314\201")
# Is the OS or the filesystem "Unicode composition sensitive"?
#
# That is, does the OS or the filesystem allow files to exist with
# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
# tell us that the NFC and NFD forms are equivalent?
#
# This is or may be independent of what type of filesystem we have,
# since it might be handled by the OS at a layer above the FS.
# Testing shows that macOS using APFS, HFS+, and FAT32 reports a
# collision, for example.
#
# This does not tell us how the Unicode pathname will be spelled
# on disk, but rather only that the two spellings "collide".  We
# will examine the actual on-disk spelling in a later prereq.
# The prereq script succeeds only when both mkdir calls succeed, i.e.
# when creating the NFD-spelled directory does not fail because the
# NFC-spelled one already exists.
test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir "trial_${utf8_nfc}" &&
	mkdir "trial_${utf8_nfd}"
'
# Is the spelling of an NFC pathname preserved on disk?
#
# On macOS with HFS+ and FAT32, NFC paths are converted into NFD
# and on APFS, NFC paths are preserved.  As we have established
# above, this is independent of "composition sensitivity".
#
# The script creates "c_<NFC e-acute>", hex-dumps the directory
# listing, and looks for the bytes "c _ c3 a9" (the NFC spelling).
test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir "c_${utf8_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f c3 a9" dump
'
# Is the spelling of an NFD pathname preserved on disk?
#
# The script creates "d_<NFD e-acute>", hex-dumps the directory
# listing, and looks for the bytes "d _ 65 cc 81" (the NFD spelling).
test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir "d_${utf8_nfd}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" dump
'
# The following _DOUBLE_ forms are more for my curiosity,
# but there may be quirks lurking when there are multiple
# combining characters in non-canonical order.
#
# Unicode also allows multiple combining characters
# that can be decomposed in pieces.
#
# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
# UTF8(NFC):  \xe1 \xbd \xa7
#
# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
#             U+0342 COMBINING GREEK PERISPOMENI
# UTF8(NFD1): \xe1 \xbd \xa1 + \xcd \x82
#
# But U+1f61 decomposes into
# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
#             U+0314 COMBINING REVERSED COMMA ABOVE
# UTF8(NFD2): \xcf \x89 + \xcc \x94
#
# Yielding:   \xcf \x89 + \xcc \x94 + \xcd \x82
#
# Note that I've used the canonical ordering of the
# combining characters.  It is also possible to
# swap them.  My testing shows that the non-standard
# ordering also causes a collision in mkdir.  However,
# the resulting names don't draw correctly on the
# terminal (implying that the on-disk format also has
# them out of order).
# Octal rather than "\xHH" escapes, because POSIX printf only
# specifies "\ddd" and some shells' builtin printf (e.g. dash)
# does not understand hexadecimal escapes.
#
#   \341\275\247             = e1 bd a7           (NFC:  U+1f67)
#   \341\275\241\315\202     = e1 bd a1 cd 82     (NFD1: U+1f61 U+0342)
#   \317\211\314\224\315\202 = cf 89 cc 94 cd 82  (NFD2: U+03c9 U+0314 U+0342)
greek_nfc=$(printf "\341\275\247")
greek_nfd1=$(printf "\341\275\241\315\202")
greek_nfd2=$(printf "\317\211\314\224\315\202")
# See if a double decomposition also collides.
#
# The script succeeds only when both mkdir calls succeed, i.e. when
# the fully decomposed (NFD2) spelling can be created alongside the
# composed (NFC) spelling.
test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir "trial_${greek_nfc}" &&
	mkdir "trial_${greek_nfd2}"
'
# See if the NFC spelling appears on the disk.
#
# Looks for "c _ e1 bd a7" in a hex dump of the directory listing.
test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir "c_${greek_nfc}" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" dump
'
# See if the NFD spelling appears on the disk.
#
# Looks for "d _ cf 89 cc 94 cd 82" (the fully decomposed spelling)
# in a hex dump of the directory listing.
test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir "d_${greek_nfd2}" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" dump
'
# The following is for debugging.  I found it useful when
# trying to understand the various (OS, FS) quirks WRT
# Unicode and how composition/decomposition is handled.
# For example, when trying to understand how (macOS, APFS)
# and (macOS, HFS) and (macOS, FAT32) compare.
#
# It is rather noisy, so it is disabled by default.  It only runs
# when $unicode_debug is exactly "true"; each inner test then prints
# one line describing the outcome of the corresponding prereq.

if test "$unicode_debug" = "true"
then
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
		echo NFC and NFD are distinct on this OS/filesystem.
	else
		echo NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
		echo NFC maintains original spelling.
	else
		echo NFC is modified.
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
		echo NFD maintains original spelling.
	else
		echo NFD is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
		echo DOUBLE NFC and NFD are distinct on this OS/filesystem.
	else
		echo DOUBLE NFC and NFD are aliases on this OS/filesystem.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
		echo Double NFC maintains original spelling.
	else
		echo Double NFC is modified.
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
		echo Double NFD maintains original spelling.
	else
		echo Double NFD is modified.
	fi
fi