implement-higher-level-API-for-unicode-string-handling

   1 unicode: implement higher level API for string handling
   2
   3 From: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
   4
   5 This patch integrates the utf8n patches with some higher level API to
   6 perform UTF-8 string comparison, normalization and casefolding
   7 operations.  Implemented is a variation of NFD, and casefold is
   8 performed by doing full casefold on top of NFD.  These algorithms are
   9 based on the core implemented by Olaf Weber from SGI.
  10
  11 Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
  12 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
  13 ---
  14 Changes since v6:
  15   - Fix sparse warnings (0-day)
  16
  17 Changes since v4:
  18   - integrate in fs/unicode
  19
  20 Changes since RFC v1:
  21   - Change error return code from EIO to EINVAL. (Olaf Weber)
  22   - Fix issues with strncmp/strcmp.  (Olaf Weber)
  23   - Remove stack buffer in normalization/casefold. (Olaf Weber)
  24   - Include length parameter for second string on comparison functions.
  25   - Change length type to size_t.
  26
  27  fs/unicode/Makefile     |   4 +-
  28  fs/unicode/utf8-core.c  | 187 ++++++++++++++++++++++++++++++++++++++++
  29  fs/unicode/utf8-norm.c  |   6 ++
  30  fs/unicode/utf8n.h      |   1 +
  31  include/linux/unicode.h |  30 +++++++
  32  5 files changed, 227 insertions(+), 1 deletion(-)
  33  create mode 100644 fs/unicode/utf8-core.c
  34  create mode 100644 include/linux/unicode.h
  35
  36 diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
  37 index 16d43d180416..bfb0360687df 100644
  38 --- a/fs/unicode/Makefile
  39 +++ b/fs/unicode/Makefile
  40 @@ -1,6 +1,8 @@
  41  # SPDX-License-Identifier: GPL-2.0
  42
  43 -obj-$(CONFIG_UNICODE) += utf8-norm.o
  44 +obj-$(CONFIG_UNICODE) += unicode.o
  45 +
  46 +unicode-y := utf8-norm.o utf8-core.o
  47
  48  # This rule is not invoked during the kernel compilation.  It is used to
  49  # regenerate the utf8data.h header file.
  50 diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
  51 new file mode 100644
  52 index 000000000000..6afab4fdce90
  53 --- /dev/null
  54 +++ b/fs/unicode/utf8-core.c
  55 @@ -0,0 +1,187 @@
  56 +/* SPDX-License-Identifier: GPL-2.0 */
  57 +#include <linux/module.h>
  58 +#include <linux/kernel.h>
  59 +#include <linux/string.h>
  60 +#include <linux/slab.h>
  61 +#include <linux/parser.h>
  62 +#include <linux/errno.h>
  63 +#include <linux/unicode.h>
  64 +
  65 +#include "utf8n.h"
  66 +
  67 +/* Run utf8nlen() on @str with the utf8nfdi() table for @um->version: */
  68 +/* returns 0 if it is accepted, -EINVAL if utf8nlen() reports an error. */
  69 +int utf8_validate(const struct unicode_map *um, const struct qstr *str)
  70 +{
  71 +       const struct utf8data *data = utf8nfdi(um->version);
  72 +
  73 +       return utf8nlen(data, str->name, str->len) < 0 ? -EINVAL : 0;
  74 +}
  75 +EXPORT_SYMBOL(utf8_validate);
  76 +
  77 +/*
  78 + * Compare @s1 and @s2 byte by byte as produced by utf8byte() over the
  79 + * utf8nfdi() table.  Returns 0 if equal, 1 if not, -EINVAL on error.
  80 + */
  81 +int utf8_strncmp(const struct unicode_map *um,
  82 +                const struct qstr *s1, const struct qstr *s2)
  83 +{
  84 +       const struct utf8data *table = utf8nfdi(um->version);
  85 +       struct utf8cursor lhs, rhs;
  86 +       int a, b;
  87 +
  88 +       if (utf8ncursor(&lhs, table, s1->name, s1->len) < 0 ||
  89 +           utf8ncursor(&rhs, table, s2->name, s2->len) < 0)
  90 +               return -EINVAL;
  91 +
  92 +       for (;;) {
  93 +               a = utf8byte(&lhs);
  94 +               b = utf8byte(&rhs);
  95 +
  96 +               if (a < 0 || b < 0)
  97 +                       return -EINVAL;
  98 +               if (a != b || !a)
  99 +                       return a != b;
 100 +       }
 101 +}
 102 +EXPORT_SYMBOL(utf8_strncmp);
 103 +
 104 +/*
 105 + * Compare @s1 and @s2 byte by byte as produced by utf8byte() over the
 106 + * utf8nfdicf() table.  Returns 0 if equal, 1 if not, -EINVAL on error.
 107 + */
 108 +int utf8_strncasecmp(const struct unicode_map *um,
 109 +                    const struct qstr *s1, const struct qstr *s2)
 110 +{
 111 +       const struct utf8data *table = utf8nfdicf(um->version);
 112 +       struct utf8cursor lhs, rhs;
 113 +       int a, b;
 114 +
 115 +       if (utf8ncursor(&lhs, table, s1->name, s1->len) < 0 ||
 116 +           utf8ncursor(&rhs, table, s2->name, s2->len) < 0)
 117 +               return -EINVAL;
 118 +
 119 +       for (;;) {
 120 +               a = utf8byte(&lhs);
 121 +               b = utf8byte(&rhs);
 122 +
 123 +               if (a < 0 || b < 0)
 124 +                       return -EINVAL;
 125 +               if (a != b || !a)
 126 +                       return a != b;
 127 +       }
 128 +}
 129 +EXPORT_SYMBOL(utf8_strncasecmp);
 130 +
 131 +/* Casefold @str into @dest; returns its length, or -EINVAL on failure. */
 132 +int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
 133 +                 unsigned char *dest, size_t dlen)
 134 +{
 135 +       const struct utf8data *data = utf8nfdicf(um->version);
 136 +       struct utf8cursor cur;
 137 +       size_t nlen;
 138 +
 139 +       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
 140 +               return -EINVAL;
 141 +
 142 +       for (nlen = 0; nlen < dlen; nlen++) {
 143 +               int c = utf8byte(&cur);
 144 +
 145 +               if (c < 0)      /* check before storing the error value */
 146 +                       break;
 147 +               dest[nlen] = c;
 148 +               if (!c)         /* terminator is stored but not counted */
 149 +                       return nlen;
 150 +       }
 151 +       return -EINVAL;         /* utf8byte() error or @dest too small */
 152 +}
 153 +EXPORT_SYMBOL(utf8_casefold);
 154 +
 155 +/* Normalize @str into @dest; returns its length, or -EINVAL on failure. */
 156 +int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
 157 +                  unsigned char *dest, size_t dlen)
 158 +{
 159 +       const struct utf8data *data = utf8nfdi(um->version);
 160 +       struct utf8cursor cur;
 161 +       size_t nlen;    /* unsigned like @dlen, as in utf8_casefold() */
 162 +
 163 +       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
 164 +               return -EINVAL;
 165 +
 166 +       for (nlen = 0; nlen < dlen; nlen++) {
 167 +               int c = utf8byte(&cur);
 168 +
 169 +               if (c < 0)      /* check before storing the error value */
 170 +                       break;
 171 +               dest[nlen] = c;
 172 +               if (!c)         /* terminator is stored but not counted */
 173 +                       return nlen;
 174 +       }
 175 +       return -EINVAL;         /* utf8byte() error or @dest too small */
 176 +}
 177 +EXPORT_SYMBOL(utf8_normalize);
 178 +
 179 +/* Split "maj.min.rev" out of @version.  0 on success, else -EINVAL. */
 180 +static int utf8_parse_version(const char *version, unsigned int *maj,
 181 +                             unsigned int *min, unsigned int *rev)
 182 +{
 183 +       substring_t args[3];
 184 +       char version_string[12];
 185 +       const struct match_token token[] = {
 186 +               {1, "%d.%d.%d"},
 187 +               {0, NULL}
 188 +       };
 189 +
 190 +       /* strscpy() always terminates; a truncated copy (< 0) is rejected. */
 191 +       if (strscpy(version_string, version, sizeof(version_string)) < 0 ||
 192 +           match_token(version_string, token, args) != 1)
 193 +               return -EINVAL;
 194 +       if (match_int(&args[0], maj) || match_int(&args[1], min) ||
 195 +           match_int(&args[2], rev))
 196 +               return -EINVAL;
 197 +
 198 +       return 0;
 199 +}
 200 +
 201 +/* Map for @version, or for the latest one if NULL; ERR_PTR() on failure. */
 202 +struct unicode_map *utf8_load(const char *version)
 203 +{
 204 +       struct unicode_map *um;
 205 +       int unicode_version;
 206 +
 207 +       if (version) {
 208 +               unsigned int maj, min, rev;
 209 +
 210 +               /* utf8version_is_supported() takes u8: reject wider values. */
 211 +               if (utf8_parse_version(version, &maj, &min, &rev) < 0 ||
 212 +                   maj > 0xff || min > 0xff || rev > 0xff ||
 213 +                   !utf8version_is_supported(maj, min, rev))
 214 +                       return ERR_PTR(-EINVAL);
 215 +
 216 +               unicode_version = UNICODE_AGE(maj, min, rev);
 217 +       } else {
 218 +               unicode_version = utf8version_latest();
 219 +               printk(KERN_WARNING "UTF-8 version not specified. "
 220 +                      "Assuming latest supported version (%d.%d.%d).\n",
 221 +                      (unicode_version >> 16) & 0xff,
 222 +                      (unicode_version >> 8) & 0xff,
 223 +                      (unicode_version & 0xff));
 224 +       }
 225 +
 226 +       um = kzalloc(sizeof(*um), GFP_KERNEL);
 227 +       if (!um)
 228 +               return ERR_PTR(-ENOMEM);
 229 +       um->charset = "UTF-8";
 230 +       um->version = unicode_version;
 231 +
 232 +       return um;
 233 +}
 234 +EXPORT_SYMBOL(utf8_load);
 235 +
 236 +void utf8_unload(struct unicode_map *um)
 237 +{
 238 +       kfree(um);      /* frees the map kzalloc()ed by utf8_load() */
 239 +}
 240 +EXPORT_SYMBOL(utf8_unload);
 241 +
 242 +MODULE_LICENSE("GPL v2");
 243 diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
 244 index 848b93e97f50..20d440c3f2db 100644
 245 --- a/fs/unicode/utf8-norm.c
 246 +++ b/fs/unicode/utf8-norm.c
 247 @@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 248  }
 249  EXPORT_SYMBOL(utf8version_is_supported);
 250
 251 +int utf8version_latest(void)
 252 +{
 253 +       return utf8vers; /* utf8_load() unpacks it as maj<<16 | min<<8 | rev */
 254 +}
 255 +EXPORT_SYMBOL(utf8version_latest);
 256 +
 257  /*
 258   * UTF-8 valid ranges.
 259   *
 260 diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
 261 index b63a9091dc39..a120638014c1 100644
 262 --- a/fs/unicode/utf8n.h
 263 +++ b/fs/unicode/utf8n.h
 264 @@ -32,6 +32,7 @@
 265
 266  /* Highest unicode version supported by the data tables. */
 267  extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
 268 +extern int utf8version_latest(void);
 269
 270  /*
 271   * Look for the correct const struct utf8data for a unicode version.
 272 diff --git a/include/linux/unicode.h b/include/linux/unicode.h
 273 new file mode 100644
 274 index 000000000000..aec2c6d800aa
 275 --- /dev/null
 276 +++ b/include/linux/unicode.h
 277 @@ -0,0 +1,30 @@
 278 +/* SPDX-License-Identifier: GPL-2.0 */
 279 +#ifndef _LINUX_UNICODE_H
 280 +#define _LINUX_UNICODE_H
 281 +
 282 +#include <linux/init.h>
 283 +#include <linux/dcache.h>
 284 +
 285 +struct unicode_map {
 286 +       const char *charset;    /* utf8_load() sets this to "UTF-8" */
 287 +       int version;            /* packed version passed to utf8nfdi()/utf8nfdicf() */
 288 +};
 289 +
 290 +int utf8_validate(const struct unicode_map *um, const struct qstr *str);
 291 +
 292 +int utf8_strncmp(const struct unicode_map *um,
 293 +                const struct qstr *s1, const struct qstr *s2);
 294 +
 295 +int utf8_strncasecmp(const struct unicode_map *um,
 296 +                const struct qstr *s1, const struct qstr *s2);
 297 +
 298 +int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
 299 +                  unsigned char *dest, size_t dlen);
 300 +
 301 +int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
 302 +                 unsigned char *dest, size_t dlen);
 303 +
 304 +struct unicode_map *utf8_load(const char *version);
 305 +void utf8_unload(struct unicode_map *um);
 306 +
 307 +#endif /* _LINUX_UNICODE_H */
 308 --
 309 2.20.1
 310
 311