lib/unilbrk/u8-possible-linebreaks.c

   1 /* Line breaking of UTF-8 strings.
   2    Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc.
   3    Written by Bruno Haible <bruno@clisp.org>, 2001.
   4
   5    This file is free software.
   6    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   7    You can redistribute it and/or modify it under either
   8      - the terms of the GNU Lesser General Public License as published
   9        by the Free Software Foundation, either version 3, or (at your
  10        option) any later version, or
  11      - the terms of the GNU General Public License as published by the
  12        Free Software Foundation; either version 2, or (at your option)
  13        any later version, or
  14      - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
  15
  16    This file is distributed in the hope that it will be useful,
  17    but WITHOUT ANY WARRANTY; without even the implied warranty of
  18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19    Lesser General Public License and the GNU General Public License
  20    for more details.
  21
  22    You should have received a copy of the GNU Lesser General Public
  23    License and of the GNU General Public License along with this
  24    program.  If not, see <https://www.gnu.org/licenses/>.  */
  25
  26 #include <config.h>
  27
  28 /* Specification.  */
  29 #include "unilbrk.h"
  30 #include "unilbrk/internal.h"
  31
  32 #include <stdlib.h>
  33 #include <string.h>
  34
  35 #include "unilbrk/lbrktables.h"
  36 #include "uniwidth/cjk.h"
  37 #include "unistr.h"
  38
  39 /* This file implements
  40    Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
  41
/* Determine the line break opportunities in the UTF-8 string S of N bytes
   and store them in the array P, which must have room for N entries.

   ENCODING is only used to resolve characters with the ambiguous (AI) line
   break property: for a CJK encoding they are treated as LBP_ID1, otherwise
   as LBP_AL.
   CR is either LBP_CR or -1.  With LBP_CR, a CR immediately followed by LF
   is recognized and the entry just before the LF is set to
   UC_BREAK_CR_BEFORE_LF.

   The whole array P is first filled with UC_BREAK_PROHIBITED.  Afterwards
   only the entry at the first byte of each character is assigned
   (UC_BREAK_MANDATORY, UC_BREAK_POSSIBLE or UC_BREAK_PROHIBITED), plus
   p[-1] for a recognized CR-LF sequence.  If N is 0, nothing is written.  */
void
u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
                             int cr, char *p)
{
  if (n > 0)
    {
      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
      const uint8_t *s_end = s + n;
      int prev_prop = LBP_BK; /* line break property of last character */
      int last_prop = LBP_BK; /* line break property of last non-space character */
      char *seen_space = NULL; /* Was a space seen after the last non-space character? */

      /* Don't break inside multibyte characters.  */
      memset (p, UC_BREAK_PROHIBITED, n);

      /* Number of consecutive regional indicator (RI) characters seen
         immediately before the current point.  */
      size_t ri_count = 0;

      do
        {
          ucs4_t uc;
          int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
          /* s is advanced past the current character right away, so that
             it can be used for look-ahead below.  p still designates the
             first byte of the current character until the end of this
             iteration.  */
          s += count;
          int prop = unilbrkprop_lookup (uc);

          if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
            {
              /* (LB4,LB5,LB6) Mandatory break.  */
              *p = UC_BREAK_MANDATORY;
              /* cr is either LBP_CR or -1.  In the first case, recognize
                 a CR-LF sequence.  */
              if (prev_prop == cr && prop == LBP_LF)
                p[-1] = UC_BREAK_CR_BEFORE_LF;
              prev_prop = prop;
              /* Start over as at the beginning of a line.  */
              last_prop = LBP_BK;
              seen_space = NULL;
            }
          else
            {
              /* Resolve property values whose behaviour is not fixed.  */
              switch (prop)
                {
                case LBP_AI:
                  /* Resolve ambiguous.  */
                  prop = LBP_AI_REPLACEMENT;
                  break;
                case LBP_CB:
                  /* This is arbitrary.  */
                  prop = LBP_ID1;
                  break;
                case LBP_SA:
                  /* We don't handle complex scripts yet.
                     Treat LBP_SA like LBP_XX.  */
                case LBP_XX:
                  /* This is arbitrary.  */
                  prop = LBP_AL;
                  break;
                case LBP_QU2:
                  /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
                     character's line break property was not one of
                     BK, CR, LF, OP, QU, GL, SP, ZW.  */
                  switch (prev_prop)
                    {
                    case LBP_BK:
                    case LBP_CR:
                    case LBP_LF:
                    case LBP_OP1: case LBP_OP2:
                    case LBP_QU1: case LBP_QU2: case LBP_QU3:
                    case LBP_GL:
                    case LBP_SP:
                    case LBP_ZW:
                      break;
                    default:
                      prop = LBP_QU1;
                      break;
                    }
                  break;
                case LBP_QU3:
                  /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
                     character's line break property is not one of
                     BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */
                  {
                    int next_prop;
                    /* Peek at the next character without consuming it.
                       The end of the string is treated like BK.  */
                    if (s < s_end)
                      {
                        ucs4_t next_uc;
                        (void) u8_mbtouc_unsafe (&next_uc, s, s_end - s);
                        next_prop = unilbrkprop_lookup (next_uc);
                      }
                    else
                      next_prop = LBP_BK;
                    switch (next_prop)
                      {
                      case LBP_BK:
                      case LBP_CR:
                      case LBP_LF:
                      case LBP_SP:
                      case LBP_GL:
                      case LBP_WJ:
                      case LBP_CL:
                      case LBP_QU1: case LBP_QU2: case LBP_QU3:
                      case LBP_CP1: case LBP_CP2:
                      case LBP_EX:
                      case LBP_IS:
                      case LBP_SY:
                      case LBP_ZW:
                        break;
                      default:
                        prop = LBP_QU1;
                        break;
                      }
                  }
                  break;
                }

              /* Deal with spaces and combining characters.  */
              if (prop == LBP_SP)
                {
                  /* (LB7) Don't break just before a space.  */
                  *p = UC_BREAK_PROHIBITED;
                  /* last_prop is deliberately left unchanged here: it keeps
                     describing the last non-space character.  */
                  seen_space = p;
                }
              else if (prop == LBP_ZW)
                {
                  /* (LB7) Don't break just before a zero-width space.  */
                  *p = UC_BREAK_PROHIBITED;
                  last_prop = LBP_ZW;
                  seen_space = NULL;
                }
              else if (prop == LBP_CM || prop == LBP_ZWJ)
                {
                  /* (LB9) Don't break just before a combining character or
                     zero-width joiner, except immediately after a mandatory
                     break character, space, or zero-width space.  */
                  if (last_prop == LBP_BK)
                    {
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                      *p = UC_BREAK_PROHIBITED;
                      /* (LB10) Treat CM or ZWJ as AL.  */
                      last_prop = LBP_AL;
                      seen_space = NULL;
                    }
                  else if (last_prop == LBP_ZW || seen_space != NULL)
                    {
                      /* (LB8) Break after zero-width space.  */
                      /* (LB18) Break after spaces.
                         We do *not* implement the "legacy support for space
                         character as base for combining marks" because now the
                         NBSP CM sequence is recommended instead of SP CM.  */
                      *p = UC_BREAK_POSSIBLE;
                      /* (LB10) Treat CM or ZWJ as AL.  */
                      last_prop = LBP_AL;
                      seen_space = NULL;
                    }
                  else
                    {
                      /* Treat X CM as if it were X.  */
                      /* last_prop and seen_space are left unchanged, so that
                         the following character is matched against X.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                }
              else
                {
                  /* prop must be usable as an index for table 7.3 of UTR #14.  */
                  if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
                    abort ();

                  /* The special cases below are tested in this order; the
                     pair table is consulted only if none of them applies.  */
                  if (last_prop == LBP_BK)
                    {
                      /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (last_prop == LBP_ZW)
                    {
                      /* (LB8) Break after zero-width space.  */
                      *p = UC_BREAK_POSSIBLE;
                    }
                  else if (prev_prop == LBP_ZWJ)
                    {
                      /* (LB8a) Don't break right after a zero-width joiner.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else if (last_prop == LBP_RI && prop == LBP_RI)
                    {
                      /* (LB30a) Break between two regional indicator symbols
                         if and only if there are an even number of regional
                         indicators preceding the position of the break.  */
                      *p = (seen_space != NULL || (ri_count % 2) == 0
                            ? UC_BREAK_POSSIBLE
                            : UC_BREAK_PROHIBITED);
                    }
                  else if (prev_prop == LBP_HL_BA)
                    {
                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After.  */
                      *p = UC_BREAK_PROHIBITED;
                    }
                  else
                    {
                      /* D, I, P are the entry values of unilbrk_table.  An
                         I entry allows the break only if spaces intervene.  */
                      switch (unilbrk_table [last_prop] [prop])
                        {
                        case D:
                          *p = UC_BREAK_POSSIBLE;
                          break;
                        case I:
                          *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
                          break;
                        case P:
                          *p = UC_BREAK_PROHIBITED;
                          break;
                        default:
                          abort ();
                        }
                    }
                  last_prop = prop;
                  seen_space = NULL;
                }

              /* Record HL followed by HY or BA as the combined value
                 LBP_HL_BA, so that the (LB21a) test can fire for the next
                 character.  */
              prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
                           ? LBP_HL_BA
                           : prop);
            }

          /* Any character that is not RI (after the resolution above)
             interrupts the run of regional indicators.  */
          if (prop == LBP_RI)
            ri_count++;
          else
            ri_count = 0;

          p += count;
        }
      while (s < s_end);
    }
}
 274
 275 #if defined IN_LIBUNISTRING
 276 /* For backward compatibility with older versions of libunistring.  */
 277
 278 # undef u8_possible_linebreaks
 279
 280 void
 281 u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
 282                         char *p)
 283 {
 284   u8_possible_linebreaks_loop (s, n, encoding, -1, p);
 285 }
 286
 287 #endif
 288
 289 void
 290 u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
 291                            char *p)
 292 {
 293   u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
 294 }
 295
 296
 297 #ifdef TEST
 298
 299 #include <stdio.h>
 300 #include <string.h>
 301
 302 /* Read the contents of an input stream, and return it, terminated with a NUL
 303    byte. */
 304 char *
 305 read_file (FILE *stream)
 306 {
 307 #define BUFSIZE 4096
 308   char *buf = NULL;
 309   int alloc = 0;
 310   int size = 0;
 311   int count;
 312
 313   while (! feof (stream))
 314     {
 315       if (size + BUFSIZE > alloc)
 316         {
 317           alloc = alloc + alloc / 2;
 318           if (alloc < size + BUFSIZE)
 319             alloc = size + BUFSIZE;
 320           buf = realloc (buf, alloc);
 321           if (buf == NULL)
 322             {
 323               fprintf (stderr, "out of memory\n");
 324               exit (1);
 325             }
 326         }
 327       count = fread (buf + size, 1, BUFSIZE, stream);
 328       if (count == 0)
 329         {
 330           if (ferror (stream))
 331             {
 332               perror ("fread");
 333               exit (1);
 334             }
 335         }
 336       else
 337         size += count;
 338     }
 339   buf = realloc (buf, size + 1);
 340   if (buf == NULL)
 341     {
 342       fprintf (stderr, "out of memory\n");
 343       exit (1);
 344     }
 345   buf[size] = '\0';
 346   return buf;
 347 #undef BUFSIZE
 348 }
 349
 350 int
 351 main (int argc, char * argv[])
 352 {
 353   if (argc == 1)
 354     {
 355       /* Display all the break opportunities in the input string.  */
 356       char *input = read_file (stdin);
 357       int length = strlen (input);
 358       char *breaks = malloc (length);
 359       int i;
 360
 361       u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
 362
 363       for (i = 0; i < length; i++)
 364         {
 365           switch (breaks[i])
 366             {
 367             case UC_BREAK_POSSIBLE:
 368               /* U+2027 in UTF-8 encoding */
 369               putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
 370               break;
 371             case UC_BREAK_MANDATORY:
 372               /* U+21B2 (or U+21B5) in UTF-8 encoding */
 373               putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
 374               break;
 375             case UC_BREAK_CR_BEFORE_LF:
 376               /* U+21E4 in UTF-8 encoding */
 377               putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
 378               break;
 379             case UC_BREAK_PROHIBITED:
 380               break;
 381             default:
 382               abort ();
 383             }
 384           putc (input[i], stdout);
 385         }
 386
 387       free (breaks);
 388
 389       return 0;
 390     }
 391   else
 392     return 1;
 393 }
 394
 395 #endif /* TEST */