lib/unilbrk/ulc-width-linebreaks.c

   1 /* Line breaking of strings.
   2    Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation, either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 #include <config.h>
  27
  28 /* Specification.  */
  29 #include "unilbrk.h"
  30
  31 #include <stdlib.h>
  32 #include <string.h>
  33
  34 #include "c-ctype.h"
  35 #include "uniconv.h"
  36 #include "unilbrk/internal.h"
  37 #include "unilbrk/lbrktables.h"
  38 #include "unilbrk/ulc-common.h"
  39
  40 /* Line breaking of a string in an arbitrary encoding.
  41
  42    We convert the input string to Unicode.
  43
  44    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
  45    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
  46    \U0000FFFF.  UTF-16 and variants support only characters up to
  47    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
  48    UCS-4 specification leaves doubts about endianness and byte order mark.
  49    glibc currently interprets it as big endian without byte order mark,
  50    but this is not backed by an RFC.  So we use UTF-8. It supports
  51    characters up to \U7FFFFFFF and is unambiguously defined.  */
  52
  53 static int
  54 ulc_width_linebreaks_internal (const char *s, size_t n,
  55                                int width, int start_column, int at_end_columns,
  56                                const char *o, const char *encoding, int cr,
  57                                char *p)
  58 {
  59   if (n > 0)
  60     {
  61       if (is_utf8_encoding (encoding))
  62         return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
  63       else
  64         {
  65           /* Convert the string to UTF-8 and build a translation table
  66              from offsets into s to offsets into the translated string.  */
  67           size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
  68
  69           if (offsets != NULL)
  70             {
  71               uint8_t *t;
  72               size_t m;
  73
  74               t = u8_conv_from_encoding (encoding, iconveh_question_mark,
  75                                          s, n, offsets, NULL, &m);
  76               if (t != NULL)
  77                 {
  78                   char *memory =
  79                     (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
  80
  81                   if (m == 0 || memory != NULL)
  82                     {
  83                       char *q = (char *) memory;
  84                       char *o8 = (o != NULL ? (char *) (q + m) : NULL);
  85                       int res_column;
  86                       size_t i;
  87
  88                       /* Translate the overrides to the UTF-8 string.  */
  89                       if (o != NULL)
  90                         {
  91                           memset (o8, UC_BREAK_UNDEFINED, m);
  92                           for (i = 0; i < n; i++)
  93                             if (offsets[i] != (size_t)(-1))
  94                               o8[offsets[i]] = o[i];
  95                         }
  96
  97                       /* Determine the line breaks of the UTF-8 string.  */
  98                       res_column =
  99                         u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);
 100
 101                       /* Translate the result back to the original string.  */
 102                       memset (p, UC_BREAK_PROHIBITED, n);
 103                       for (i = 0; i < n; i++)
 104                         if (offsets[i] != (size_t)(-1))
 105                           p[i] = q[offsets[i]];
 106
 107                       free (memory);
 108                       free (t);
 109                       free (offsets);
 110                       return res_column;
 111                     }
 112                   free (t);
 113                 }
 114               free (offsets);
 115             }
 116           /* Impossible to convert.  */
 117 #if C_CTYPE_ASCII
 118           if (is_all_ascii (s, n))
 119             {
 120               /* ASCII is a subset of UTF-8.  */
 121               return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
 122             }
 123 #endif
 124           /* We have a non-ASCII string and cannot convert it.
 125              Don't produce line breaks except those already present in the
 126              input string.  All we assume here is that the encoding is
 127              minimally ASCII compatible.  */
 128           {
 129             const char *s_end = s + n;
 130             while (s < s_end)
 131               {
 132                 *p = ((o != NULL && *o == UC_BREAK_MANDATORY)
 133                       || *s == '\n'
 134                       ? UC_BREAK_MANDATORY
 135                       : ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
 136                          || (cr >= 0
 137                              && *s == '\r'
 138                              && s + 1 < s_end
 139                              && *(s + 1) == '\n')
 140                          ? UC_BREAK_CR_BEFORE_LF
 141                          : UC_BREAK_PROHIBITED));
 142                 s++;
 143                 p++;
 144                 if (o != NULL)
 145                   o++;
 146               }
 147             /* We cannot compute widths in this case.  */
 148           }
 149         }
 150     }
 151   return start_column;
 152 }
 153
 154 #if defined IN_LIBUNISTRING
 155 /* For backward compatibility with older versions of libunistring.  */
 156
 157 # undef ulc_width_linebreaks
 158
 159 int
 160 ulc_width_linebreaks (const char *s, size_t n,
 161                       int width, int start_column, int at_end_columns,
 162                       const char *o, const char *encoding,
 163                       char *p)
 164 {
 165   return ulc_width_linebreaks_internal (s, n,
 166                                         width, start_column, at_end_columns,
 167                                         o, encoding, -1, p);
 168 }
 169
 170 #endif
 171
 172 int
 173 ulc_width_linebreaks_v2 (const char *s, size_t n,
 174                          int width, int start_column, int at_end_columns,
 175                          const char *o, const char *encoding,
 176                          char *p)
 177 {
 178   return ulc_width_linebreaks_internal (s, n,
 179                                         width, start_column, at_end_columns,
 180                                         o, encoding, LBP_CR, p);
 181 }
 182
 183
 184 #ifdef TEST
 185
 186 #include <stdio.h>
 187 #include <locale.h>
 188
 189 /* Read the contents of an input stream, and return it, terminated with a NUL
 190    byte. */
 191 char *
 192 read_file (FILE *stream)
 193 {
 194 #define BUFSIZE 4096
 195   char *buf = NULL;
 196   int alloc = 0;
 197   int size = 0;
 198   int count;
 199
 200   while (! feof (stream))
 201     {
 202       if (size + BUFSIZE > alloc)
 203         {
 204           alloc = alloc + alloc / 2;
 205           if (alloc < size + BUFSIZE)
 206             alloc = size + BUFSIZE;
 207           buf = realloc (buf, alloc);
 208           if (buf == NULL)
 209             {
 210               fprintf (stderr, "out of memory\n");
 211               exit (1);
 212             }
 213         }
 214       count = fread (buf + size, 1, BUFSIZE, stream);
 215       if (count == 0)
 216         {
 217           if (ferror (stream))
 218             {
 219               perror ("fread");
 220               exit (1);
 221             }
 222         }
 223       else
 224         size += count;
 225     }
 226   buf = realloc (buf, size + 1);
 227   if (buf == NULL)
 228     {
 229       fprintf (stderr, "out of memory\n");
 230       exit (1);
 231     }
 232   buf[size] = '\0';
 233   return buf;
 234 #undef BUFSIZE
 235 }
 236
 237 int
 238 main (int argc, char * argv[])
 239 {
 240   setlocale (LC_CTYPE, "");
 241   if (argc == 2)
 242     {
 243       /* Insert line breaks for a given width.  */
 244       int width = atoi (argv[1]);
 245       char *input = read_file (stdin);
 246       int length = strlen (input);
 247       char *breaks = malloc (length);
 248       int i;
 249
 250       ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);
 251
 252       for (i = 0; i < length; i++)
 253         {
 254           switch (breaks[i])
 255             {
 256             case UC_BREAK_POSSIBLE:
 257               putc ('\n', stdout);
 258               break;
 259             case UC_BREAK_MANDATORY:
 260               break;
 261             case UC_BREAK_CR_BEFORE_LF:
 262               break;
 263             case UC_BREAK_PROHIBITED:
 264               break;
 265             default:
 266               abort ();
 267             }
 268           putc (input[i], stdout);
 269         }
 270
 271       free (breaks);
 272
 273       return 0;
 274     }
 275   else
 276     return 1;
 277 }
 278
 279 #endif /* TEST */