1 unicode: implement higher level API for string handling
3 From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
5 This patch integrates the utf8n patches with a higher-level API to
6 perform UTF-8 string comparison, normalization and casefolding
7 operations. The normalization implemented is a variation of NFD, and
8 casefolding is performed by doing a full casefold on top of NFD. These
9 algorithms are based on the core implemented by Olaf Weber from SGI.
11 Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
12 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
15 - Fix sparse warnings (0-day)
18 - integrate in fs/unicode
21 - Change error return code from EIO to EINVAL. (Olaf Weber)
22 - Fix issues with strncmp/strcmp. (Olaf Weber)
23 - Remove stack buffer in normalization/casefold. (Olaf Weber)
24 - Include length parameter for second string on comparison functions.
25 - Change length type to size_t.
27 fs/unicode/Makefile | 4 +-
28 fs/unicode/utf8-core.c | 187 ++++++++++++++++++++++++++++++++++++++++
29 fs/unicode/utf8-norm.c | 6 ++
30 fs/unicode/utf8n.h | 1 +
31 include/linux/unicode.h | 30 +++++++
32 5 files changed, 227 insertions(+), 1 deletion(-)
33 create mode 100644 fs/unicode/utf8-core.c
34 create mode 100644 include/linux/unicode.h
36 diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
37 index 16d43d180416..bfb0360687df 100644
38 --- a/fs/unicode/Makefile
39 +++ b/fs/unicode/Makefile
41 # SPDX-License-Identifier: GPL-2.0
43 -obj-$(CONFIG_UNICODE) += utf8-norm.o
44 +obj-$(CONFIG_UNICODE) += unicode.o
46 +unicode-y := utf8-norm.o utf8-core.o
48 # This rule is not invoked during the kernel compilation. It is used to
49 # regenerate the utf8data.h header file.
50 diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
52 index 000000000000..6afab4fdce90
54 +++ b/fs/unicode/utf8-core.c
56 +/* SPDX-License-Identifier: GPL-2.0 */
57 +#include <linux/module.h>
58 +#include <linux/kernel.h>
59 +#include <linux/string.h>
60 +#include <linux/slab.h>
61 +#include <linux/parser.h>
62 +#include <linux/errno.h>
63 +#include <linux/unicode.h>
67 +int utf8_validate(const struct unicode_map *um, const struct qstr *str)
69 + const struct utf8data *data = utf8nfdi(um->version);
71 + if (utf8nlen(data, str->name, str->len) < 0)
75 +EXPORT_SYMBOL(utf8_validate);
77 +int utf8_strncmp(const struct unicode_map *um,
78 + const struct qstr *s1, const struct qstr *s2)
80 + const struct utf8data *data = utf8nfdi(um->version);
81 + struct utf8cursor cur1, cur2;
84 + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
87 + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
91 + c1 = utf8byte(&cur1);
92 + c2 = utf8byte(&cur2);
94 + if (c1 < 0 || c2 < 0)
102 +EXPORT_SYMBOL(utf8_strncmp);
104 +int utf8_strncasecmp(const struct unicode_map *um,
105 + const struct qstr *s1, const struct qstr *s2)
107 + const struct utf8data *data = utf8nfdicf(um->version);
108 + struct utf8cursor cur1, cur2;
111 + if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
114 + if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
118 + c1 = utf8byte(&cur1);
119 + c2 = utf8byte(&cur2);
121 + if (c1 < 0 || c2 < 0)
129 +EXPORT_SYMBOL(utf8_strncasecmp);
131 +int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
132 + unsigned char *dest, size_t dlen)
134 + const struct utf8data *data = utf8nfdicf(um->version);
135 + struct utf8cursor cur;
138 + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
141 + for (nlen = 0; nlen < dlen; nlen++) {
142 + int c = utf8byte(&cur);
153 +EXPORT_SYMBOL(utf8_casefold);
155 +int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
156 + unsigned char *dest, size_t dlen)
158 + const struct utf8data *data = utf8nfdi(um->version);
159 + struct utf8cursor cur;
162 + if (utf8ncursor(&cur, data, str->name, str->len) < 0)
165 + for (nlen = 0; nlen < dlen; nlen++) {
166 + int c = utf8byte(&cur);
177 +EXPORT_SYMBOL(utf8_normalize);
179 +static int utf8_parse_version(const char *version, unsigned int *maj,
180 + unsigned int *min, unsigned int *rev)
182 + substring_t args[3];
183 + char version_string[12];
184 + const struct match_token token[] = {
189 + strncpy(version_string, version, sizeof(version_string));
191 + if (match_token(version_string, token, args) != 1)
194 + if (match_int(&args[0], maj) || match_int(&args[1], min) ||
195 + match_int(&args[2], rev))
201 +struct unicode_map *utf8_load(const char *version)
203 + struct unicode_map *um = NULL;
204 + int unicode_version;
207 + unsigned int maj, min, rev;
209 + if (utf8_parse_version(version, &maj, &min, &rev) < 0)
210 + return ERR_PTR(-EINVAL);
212 + if (!utf8version_is_supported(maj, min, rev))
213 + return ERR_PTR(-EINVAL);
215 + unicode_version = UNICODE_AGE(maj, min, rev);
217 + unicode_version = utf8version_latest();
218 + printk(KERN_WARNING"UTF-8 version not specified. "
219 + "Assuming latest supported version (%d.%d.%d).",
220 + (unicode_version >> 16) & 0xff,
221 + (unicode_version >> 8) & 0xff,
222 + (unicode_version & 0xff));
225 + um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
227 + return ERR_PTR(-ENOMEM);
229 + um->charset = "UTF-8";
230 + um->version = unicode_version;
234 +EXPORT_SYMBOL(utf8_load);
236 +void utf8_unload(struct unicode_map *um)
240 +EXPORT_SYMBOL(utf8_unload);
242 +MODULE_LICENSE("GPL v2");
243 diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
244 index 848b93e97f50..20d440c3f2db 100644
245 --- a/fs/unicode/utf8-norm.c
246 +++ b/fs/unicode/utf8-norm.c
247 @@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
249 EXPORT_SYMBOL(utf8version_is_supported);
251 +int utf8version_latest(void)
255 +EXPORT_SYMBOL(utf8version_latest);
258 * UTF-8 valid ranges.
260 diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
261 index b63a9091dc39..a120638014c1 100644
262 --- a/fs/unicode/utf8n.h
263 +++ b/fs/unicode/utf8n.h
266 /* Highest unicode version supported by the data tables. */
267 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
268 +extern int utf8version_latest(void);
271 * Look for the correct const struct utf8data for a unicode version.
272 diff --git a/include/linux/unicode.h b/include/linux/unicode.h
274 index 000000000000..aec2c6d800aa
276 +++ b/include/linux/unicode.h
278 +/* SPDX-License-Identifier: GPL-2.0 */
279 +#ifndef _LINUX_UNICODE_H
280 +#define _LINUX_UNICODE_H
282 +#include <linux/init.h>
283 +#include <linux/dcache.h>
285 +struct unicode_map {
286 + const char *charset;
290 +int utf8_validate(const struct unicode_map *um, const struct qstr *str);
292 +int utf8_strncmp(const struct unicode_map *um,
293 + const struct qstr *s1, const struct qstr *s2);
295 +int utf8_strncasecmp(const struct unicode_map *um,
296 + const struct qstr *s1, const struct qstr *s2);
298 +int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
299 + unsigned char *dest, size_t dlen);
301 +int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
302 + unsigned char *dest, size_t dlen);
304 +struct unicode_map *utf8_load(const char *version);
305 +void utf8_unload(struct unicode_map *um);
307 +#endif /* _LINUX_UNICODE_H */