intl/lwbrk/LineBreaker.cpp

   1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5
   6 #include "mozilla/intl/LineBreaker.h"
   7
   8 #include "jisx4051class.h"
   9 #include "nsComplexBreaker.h"
  10 #include "nsTArray.h"
  11 #include "nsUnicodeProperties.h"
  12 #include "mozilla/ArrayUtils.h"
  13
  14 using namespace mozilla::unicode;
  15 using namespace mozilla::intl;
  16
  17 /*static*/
  18 already_AddRefed<LineBreaker> LineBreaker::Create() {
  19   return RefPtr<LineBreaker>(new LineBreaker()).forget();
  20 }
  21
  22 /*
  23
  24    Simplification of Pair Table in JIS X 4051
  25
   1. The Original Table - in 4.1.3

   In JIS X 4051, the pair table is defined as below
  29
  30    Class of
  31    Leading    Class of Trailing Char Class
  32    Char
  33
  34               1  2  3  4  5  6  7  8  9 10 11 12 13 13 14 14 15 16 17 18 19 20
  35                                                  *  #  *  #
  36         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  X  E
  37         2        X  X  X  X  X                                               X
  38         3        X  X  X  X  X                                               X
  39         4        X  X  X  X  X                                               X
  40         5        X  X  X  X  X                                               X
  41         6        X  X  X  X  X                                               X
  42         7        X  X  X  X  X  X                                            X
  43         8        X  X  X  X  X                                X              E
  44         9        X  X  X  X  X                                               X
  45        10        X  X  X  X  X                                               X
  46        11        X  X  X  X  X                                               X
  47        12        X  X  X  X  X                                               X
  48        13        X  X  X  X  X                    X                          X
  49        14        X  X  X  X  X                          X                    X
  50        15        X  X  X  X  X        X                       X        X     X
  51        16        X  X  X  X  X                                   X     X     X
  52        17        X  X  X  X  X                                               E
  53        18        X  X  X  X  X                                X  X     X     X
  54        19     X  E  E  E  E  E  X  X  X  X  X  X  X  X  X  X  X  X  E  X  E  E
  55        20        X  X  X  X  X                                               E
  56
  57    * Same Char
  58    # Other Char
  59
  60    X Cannot Break
  61
  62    The classes mean:
  63       1: Open parenthesis
  64       2: Close parenthesis
  65       3: Prohibit a line break before
  66       4: Punctuation for sentence end (except Full stop, e.g., "!" and "?")
  67       5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT)
  68       6: Full stop
  69       7: Non-breakable between same characters
  70       8: Prefix (e.g., "$", "NO.")
  71       9: Postfix (e.g., "%")
  72      10: Ideographic space
  73      11: Hiragana
  74      12: Japanese characters (except class 11)
  75      13: Subscript
  76      14: Ruby
  77      15: Numeric
  78      16: Alphabet
  79      17: Space for Western language
  80      18: Western characters (except class 17)
  81      19: Split line note (Warichu) begin quote
  82      20: Split line note (Warichu) end quote
  83
   2. Simplified by removing the classes which we do not care about

   However, since we do not care about class 13 (Subscript), 14 (Ruby),
   16 (Alphabet), 19 (split line note begin quote), and 20 (split line note
   end quote), we can simplify this pair table into the following
  89
  90    Class of
  91    Leading    Class of Trailing Char Class
  92    Char
  93
  94               1  2  3  4  5  6  7  8  9 10 11 12 15 17 18
  95
  96         1     X  X  X  X  X  X  X  X  X  X  X  X  X  X  X
  97         2        X  X  X  X  X
  98         3        X  X  X  X  X
  99         4        X  X  X  X  X
 100         5        X  X  X  X  X
 101         6        X  X  X  X  X
 102         7        X  X  X  X  X  X
 103         8        X  X  X  X  X                    X
 104         9        X  X  X  X  X
 105        10        X  X  X  X  X
 106        11        X  X  X  X  X
 107        12        X  X  X  X  X
 108        15        X  X  X  X  X        X           X     X
 109        17        X  X  X  X  X
 110        18        X  X  X  X  X                    X     X
 111
   3. Simplified by merging classes

   After the second simplification, the pair table has some duplication:
   a. classes 2, 3, 4, 5, 6 are the same - we can merge them
   b. classes 10, 11, 12, 17 are the same - we can merge them
 117
 118    We introduce an extra non-breaking pair at [b]/7 to better match
 119    the expectations of CSS line-breaking as tested by WPT tests.
 120    This added entry is marked as * in the tables below.
 121
 122    Class of
 123    Leading    Class of Trailing Char Class
 124    Char
 125
 126               1 [a] 7  8  9 [b]15 18
 127
 128         1     X  X  X  X  X  X  X  X
 129       [a]        X
 130         7        X  X
 131         8        X              X
 132         9        X
 133       [b]        X  *
 134        15        X        X     X  X
 135        18        X              X  X
 136
 137
   4. We add COMPLEX characters and make them breakable with all other classes
      except after class 1 and before class [a]
 140
 141    Class of
 142    Leading    Class of Trailing Char Class
 143    Char
 144
 145               1 [a] 7  8  9 [b]15 18 COMPLEX
 146
 147         1     X  X  X  X  X  X  X  X  X
 148       [a]        X
 149         7        X  X
 150         8        X              X
 151         9        X
 152       [b]        X  *
 153        15        X        X     X  X
 154        18        X              X  X
 155   COMPLEX        X                    T
 156
 157      T : need special handling
 158
 159
   5. However, we need two special classes for some punctuation/parentheses
      whose breaking rules are like those of character class (18); see bug
      389056. We also need them for punctuation-like characters that behave
      the same as class 18 but are not letters of any language (e.g., '_').
 164       [c]. Based on open parenthesis class (1), but it is not breakable after
 165            character class (18) or numeric class (15).
 166       [d]. Based on close parenthesis (or punctuation) class (2), but it is not
 167            breakable before character class (18) or numeric class (15).
 168
 169    Class of
 170    Leading    Class of Trailing Char Class
 171    Char
 172
 173               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d]
 174
 175         1     X  X  X  X  X  X  X  X  X       X    X
 176       [a]        X                            X    X
 177         7        X  X
 178         8        X              X
 179         9        X
 180       [b]        X  *                              X
 181        15        X        X     X  X          X    X
 182        18        X              X  X          X    X
 183   COMPLEX        X                    T
 184       [c]     X  X  X  X  X  X  X  X  X       X    X
 185       [d]        X              X  X               X
 186
 187
   6. And Unicode has "NON-BREAK" characters. Lines should not be broken around
      them. But JIS X 4051 has no such class; therefore, we create [e].
 190
 191    Class of
 192    Leading    Class of Trailing Char Class
 193    Char
 194
 195               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
 196
 197         1     X  X  X  X  X  X  X  X  X       X    X   X
 198       [a]        X                                 X   X
 199         7        X  X                                  X
 200         8        X              X                      X
 201         9        X                                     X
 202       [b]        X  *                              X   X
 203        15        X        X     X  X          X    X   X
 204        18        X              X  X          X    X   X
 205   COMPLEX        X                    T                X
 206       [c]     X  X  X  X  X  X  X  X  X       X    X   X
 207       [d]        X              X  X               X   X
 208       [e]     X  X  X  X  X  X  X  X  X       X    X   X
 209
 210
 211    7. Now we use one bit to encode whether it is breakable, and use 2 bytes
 212       for one row, then the bit table will look like:
 213
 214                  18    <-   1
 215
 216        1  0000 1111 1111 1111  = 0x0FFF
 217       [a] 0000 1100 0000 0010  = 0x0C02
 218        7  0000 1000 0000 0110  = 0x0806
 219        8  0000 1000 0100 0010  = 0x0842
 220        9  0000 1000 0000 0010  = 0x0802
 221       [b] 0000 1100 0000 0110  = 0x0C06
 222       15  0000 1110 1101 0010  = 0x0ED2
 223       18  0000 1110 1100 0010  = 0x0EC2
 224  COMPLEX  0000 1001 0000 0010  = 0x0902
 225       [c] 0000 1111 1111 1111  = 0x0FFF
 226       [d] 0000 1100 1100 0010  = 0x0CC2
 227       [e] 0000 1111 1111 1111  = 0x0FFF
 228 */
 229
 230 #define MAX_CLASSES 12
 231
 232 static const uint16_t gPair[MAX_CLASSES] = {0x0FFF, 0x0C02, 0x0806, 0x0842,
 233                                             0x0802, 0x0C06, 0x0ED2, 0x0EC2,
 234                                             0x0902, 0x0FFF, 0x0CC2, 0x0FFF};
 235
 236 /*
 237
   8. And if the character is not far enough from word start, word end and
      another break point, we should not break in non-CJK languages.
      I.e., don't break around 15, 18, [c] and [d], but don't change
      that if they are related to [b].
 242
 243    Class of
 244    Leading    Class of Trailing Char Class
 245    Char
 246
 247               1 [a] 7  8  9 [b]15 18 COMPLEX [c] [d] [e]
 248
 249         1     X  X  X  X  X  X  X  X  X       X    X   X
 250       [a]        X              X  X          X    X   X
 251         7        X  X           X  X          X    X   X
 252         8        X              X  X          X    X   X
 253         9        X              X  X          X    X   X
 254       [b]        X  *                              X   X
 255        15     X  X  X  X  X     X  X  X       X    X   X
 256        18     X  X  X  X  X     X  X  X       X    X   X
 257   COMPLEX        X              X  X  T       X    X   X
 258       [c]     X  X  X  X  X  X  X  X  X       X    X   X
 259       [d]     X  X  X  X  X     X  X  X       X    X   X
 260       [e]     X  X  X  X  X  X  X  X  X       X    X   X
 261
 262                  18    <-   1
 263
 264        1  0000 1111 1111 1111  = 0x0FFF
 265       [a] 0000 1110 1100 0010  = 0x0EC2
 266        7  0000 1110 1100 0110  = 0x0EC6
 267        8  0000 1110 1100 0010  = 0x0EC2
 268        9  0000 1110 1100 0010  = 0x0EC2
 269       [b] 0000 1100 0000 0110  = 0x0C06
 270       15  0000 1111 1101 1111  = 0x0FDF
 271       18  0000 1111 1101 1111  = 0x0FDF
 272  COMPLEX  0000 1111 1100 0010  = 0x0FC2
 273       [c] 0000 1111 1111 1111  = 0x0FFF
 274       [d] 0000 1111 1101 1111  = 0x0FDF
 275       [e] 0000 1111 1111 1111  = 0x0FFF
 276 */
 277
 278 static const uint16_t gPairConservative[MAX_CLASSES] = {
 279     0x0FFF, 0x0EC2, 0x0EC6, 0x0EC2, 0x0EC2, 0x0C06,
 280     0x0FDF, 0x0FDF, 0x0FC2, 0x0FFF, 0x0FDF, 0x0FFF};
 281
 282 /*
 283
 284    9. Now we map the class to number
 285
 286       0: 1
 287       1: [a]- 2, 3, 4, 5, 6
 288       2: 7
 289       3: 8
 290       4: 9
 291       5: [b]- 10, 11, 12, 17
 292       6: 15
 293       7: 18
 294       8: COMPLEX
 295       9: [c]
 296       A: [d]
 297       B: [e]
 298
 299     and they mean:
 300       0: Open parenthesis
 301       1: Punctuation that prohibits break before
 302       2: Non-breakable between same classes
 303       3: Prefix
 304       4: Postfix
 305       5: Breakable character (Spaces and Most Japanese characters)
 306       6: Numeric
 307       7: Characters
 308       8: Need special handling characters (E.g., Thai)
 309       9: Open parentheses like Character (See bug 389056)
 310       A: Close parenthese (or punctuations) like Character (See bug 389056)
 311       B: Non breakable (See bug 390920)
 312
 313 */
 314
// Value that lies outside the range of valid class indices
// (0 .. MAX_CLASSES - 1).
#define CLASS_NONE INT8_MAX

// Simplified character classes. Each value is both the row index into
// gPair / gPairConservative and the bit position of the class within a row
// (see the numbered mapping in the comment above).
#define CLASS_OPEN 0x00
#define CLASS_CLOSE 0x01
#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02
#define CLASS_PREFIX 0x03
// NOTE(review): "POSTFFIX" is a misspelling of "POSTFIX"; the name is kept
// as-is because renaming the macro would require updating every user.
#define CLASS_POSTFFIX 0x04
#define CLASS_BREAKABLE 0x05
#define CLASS_NUMERIC 0x06
#define CLASS_CHARACTER 0x07
#define CLASS_COMPLEX 0x08
#define CLASS_OPEN_LIKE_CHARACTER 0x09
#define CLASS_CLOSE_LIKE_CHARACTER 0x0A
#define CLASS_NON_BREAKABLE 0x0B
 329
// Named char16_t constants for individual characters referenced by the line
// breaker.
#define U_NULL char16_t(0x0000)
#define U_SLASH char16_t('/')
#define U_SPACE char16_t(' ')
#define U_HYPHEN char16_t('-')
#define U_EQUAL char16_t('=')
#define U_PERCENT char16_t('%')
#define U_AMPERSAND char16_t('&')
#define U_SEMICOLON char16_t(';')
#define U_BACKSLASH char16_t('\\')
#define U_OPEN_SINGLE_QUOTE char16_t(0x2018)
#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C)
#define U_OPEN_GUILLEMET char16_t(0x00AB)

// True if c is a hyphen (as decided by IS_HYPHEN) or one of: slash, percent,
// ampersand, semicolon, backslash, U+2018, U+201C, U+00AB.
// The argument is evaluated several times, so it must have no side effects.
// IS_HYPHEN is a function defined further down; it has to be declared before
// the first place this macro is expanded.
#define NEED_CONTEXTUAL_ANALYSIS(c)                                            \
  (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \
   (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE ||   \
   (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET)

// True if u is in U+0030..U+0039 ('0'..'9'). The argument is evaluated twice.
#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039)
 349
 350 static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) {
 351   return ((((t)[(l >> 3)]) >> ((l & 0x0007) << 2)) & 0x000f);
 352 }
 353
 354 static inline int IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) {
 355   return ((0xff66 <= (u)) && ((u) <= 0xff70));
 356 }
 357
 358 static inline int IS_CJK_CHAR(char32_t u) {
 359   return (
 360       (0x1100 <= (u) && (u) <= 0x11ff) || (0x2e80 <= (u) && (u) <= 0xd7ff) ||
 361       (0xf900 <= (u) && (u) <= 0xfaff) || (0xff00 <= (u) && (u) <= 0xffef) ||
 362       (0x20000 <= (u) && (u) <= 0x2fffd));
 363 }
 364
 365 static inline bool IS_NONBREAKABLE_SPACE(char16_t u) {
 366   return u == 0x00A0 || u == 0x2007;  // NO-BREAK SPACE, FIGURE SPACE
 367 }
 368
 369 static inline bool IS_HYPHEN(char16_t u) {
 370   return (u == U_HYPHEN || u == 0x058A ||  // ARMENIAN HYPHEN
 371           u == 0x2010 ||                   // HYPHEN
 372           u == 0x2012 ||                   // FIGURE DASH
 373           u == 0x2013);                    // EN DASH
 374 }
 375
 376 static int8_t GetClass(uint32_t u, LineBreaker::Strictness aLevel,
 377                        bool aIsChineseOrJapanese) {
 378   // Mapping for Unicode LineBreak.txt classes to the (simplified) set of
 379   // character classes used here.
 380   // XXX The mappings here were derived by comparing the Unicode LineBreak
 381   //     values of BMP characters to the classes our existing GetClass returns
 382   //     for the same codepoints; in cases where characters with the same
 383   //     LineBreak class mapped to various classes here, I picked what seemed
 384   //     the most prevalent equivalence.
 385   //     Some of these are unclear to me, but currently they are ONLY used
 386   //     for characters not handled by the old code below, so all the JISx405
 387   //     special cases should already be accounted for.
 388   static const int8_t sUnicodeLineBreakToClass[] = {
 389       /* UNKNOWN = 0,                       [XX] */ CLASS_CHARACTER,
 390       /* AMBIGUOUS = 1,                     [AI] */ CLASS_CHARACTER,
 391       /* ALPHABETIC = 2,                    [AL] */ CLASS_CHARACTER,
 392       /* BREAK_BOTH = 3,                    [B2] */ CLASS_CHARACTER,
 393       /* BREAK_AFTER = 4,                   [BA] */ CLASS_CHARACTER,
 394       /* BREAK_BEFORE = 5,                  [BB] */ CLASS_OPEN_LIKE_CHARACTER,
 395       /* MANDATORY_BREAK = 6,               [BK] */ CLASS_CHARACTER,
 396       /* CONTINGENT_BREAK = 7,              [CB] */ CLASS_CHARACTER,
 397       /* CLOSE_PUNCTUATION = 8,             [CL] */ CLASS_CHARACTER,
 398       /* COMBINING_MARK = 9,                [CM] */ CLASS_CHARACTER,
 399       /* CARRIAGE_RETURN = 10,              [CR] */ CLASS_BREAKABLE,
 400       /* EXCLAMATION = 11,                  [EX] */ CLASS_CHARACTER,
 401       /* GLUE = 12,                         [GL] */ CLASS_NON_BREAKABLE,
 402       /* HYPHEN = 13,                       [HY] */ CLASS_CHARACTER,
 403       /* IDEOGRAPHIC = 14,                  [ID] */ CLASS_BREAKABLE,
 404       /* INSEPARABLE = 15,                  [IN] */ CLASS_CLOSE_LIKE_CHARACTER,
 405       /* INFIX_NUMERIC = 16,                [IS] */ CLASS_CHARACTER,
 406       /* LINE_FEED = 17,                    [LF] */ CLASS_BREAKABLE,
 407       /* NONSTARTER = 18,                   [NS] */ CLASS_CLOSE_LIKE_CHARACTER,
 408       /* NUMERIC = 19,                      [NU] */ CLASS_NUMERIC,
 409       /* OPEN_PUNCTUATION = 20,             [OP] */ CLASS_CHARACTER,
 410       /* POSTFIX_NUMERIC = 21,              [PO] */ CLASS_CHARACTER,
 411       /* PREFIX_NUMERIC = 22,               [PR] */ CLASS_CHARACTER,
 412       /* QUOTATION = 23,                    [QU] */ CLASS_CHARACTER,
 413       /* COMPLEX_CONTEXT = 24,              [SA] */ CLASS_CHARACTER,
 414       /* SURROGATE = 25,                    [SG] */ CLASS_CHARACTER,
 415       /* SPACE = 26,                        [SP] */ CLASS_BREAKABLE,
 416       /* BREAK_SYMBOLS = 27,                [SY] */ CLASS_CHARACTER,
 417       /* ZWSPACE = 28,                      [ZW] */ CLASS_BREAKABLE,
 418       /* NEXT_LINE = 29,                    [NL] */ CLASS_CHARACTER,
 419       /* WORD_JOINER = 30,                  [WJ] */ CLASS_NON_BREAKABLE,
 420       /* H2 = 31,                           [H2] */ CLASS_BREAKABLE,
 421       /* H3 = 32,                           [H3] */ CLASS_BREAKABLE,
 422       /* JL = 33,                           [JL] */ CLASS_CHARACTER,
 423       /* JT = 34,                           [JT] */ CLASS_CHARACTER,
 424       /* JV = 35,                           [JV] */ CLASS_CHARACTER,
 425       /* CLOSE_PARENTHESIS = 36,            [CP] */ CLASS_CLOSE_LIKE_CHARACTER,
 426       /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE,
 427       /* HEBREW_LETTER = 38,                [HL] */ CLASS_CHARACTER,
 428       /* REGIONAL_INDICATOR = 39,           [RI] */ CLASS_CHARACTER,
 429       /* E_BASE = 40,                       [EB] */ CLASS_BREAKABLE,
 430       /* E_MODIFIER = 41,                   [EM] */ CLASS_CHARACTER,
 431       /* ZWJ = 42,                          [ZWJ]*/ CLASS_CHARACTER};
 432
 433   static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass),
 434                 "Gecko vs ICU LineBreak class mismatch");
 435
 436   auto cls = GetLineBreakClass(u);
 437   MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass));
 438
 439   // Overrides based on rules for the different line-break values given in
 440   // https://drafts.csswg.org/css-text-3/#line-break-property
 441   switch (aLevel) {
 442     case LineBreaker::Strictness::Auto:
 443       // For now, just use legacy Gecko behavior.
 444       // XXX Possible enhancement - vary strictness according to line width
 445       // or other criteria.
 446       break;
 447     case LineBreaker::Strictness::Strict:
 448       if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
 449           (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) {
 450         return CLASS_CLOSE;
 451       }
 452       if (cls == U_LB_INSEPARABLE) {
 453         return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
 454       }
 455       if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
 456           u == 0x30FD || u == 0x30FE) {
 457         return CLASS_CLOSE_LIKE_CHARACTER;
 458       }
 459       if (aIsChineseOrJapanese) {
 460         if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 461           return CLASS_CLOSE_LIKE_CHARACTER;
 462         }
 463         if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 464           return CLASS_OPEN_LIKE_CHARACTER;
 465         }
 466         if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
 467           return CLASS_CLOSE_LIKE_CHARACTER;
 468         }
 469       }
 470       break;
 471     case LineBreaker::Strictness::Normal:
 472       if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
 473         return CLASS_BREAKABLE;
 474       }
 475       if (cls == U_LB_INSEPARABLE) {
 476         return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS;
 477       }
 478       if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
 479           u == 0x30FD || u == 0x30FE) {
 480         return CLASS_CLOSE_LIKE_CHARACTER;
 481       }
 482       if (aIsChineseOrJapanese) {
 483         if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 484           return CLASS_CLOSE_LIKE_CHARACTER;
 485         }
 486         if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 487           return CLASS_OPEN_LIKE_CHARACTER;
 488         }
 489         if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
 490           return CLASS_BREAKABLE;
 491         }
 492       }
 493       break;
 494     case LineBreaker::Strictness::Loose:
 495       if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) {
 496         return CLASS_BREAKABLE;
 497       }
 498       if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E ||
 499           u == 0x30FD || u == 0x30FE) {
 500         return CLASS_BREAKABLE;
 501       }
 502       if (cls == U_LB_INSEPARABLE) {
 503         return CLASS_BREAKABLE;
 504       }
 505       if (aIsChineseOrJapanese) {
 506         if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 ||
 507             u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 ||
 508             u == 0xFF01 || u == 0xFF1F) {
 509           return CLASS_BREAKABLE;
 510         }
 511         if (cls == U_LB_POSTFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 512           return CLASS_BREAKABLE;
 513         }
 514         if (cls == U_LB_PREFIX_NUMERIC && IsEastAsianWidthAFW(u)) {
 515           return CLASS_BREAKABLE;
 516         }
 517         if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) {
 518           return CLASS_BREAKABLE;
 519         }
 520       }
 521       break;
 522     case LineBreaker::Strictness::Anywhere:
 523       MOZ_ASSERT_UNREACHABLE("should have been handled already");
 524       break;
 525   }
 526
 527   if (u < 0x10000) {
 528     uint16_t h = u & 0xFF00;
 529     uint16_t l = u & 0x00ff;
 530
 531     // Handle 3 range table first
 532     if (0x0000 == h) {
 533       return GETCLASSFROMTABLE(gLBClass00, l);
 534     }
 535     if (0x1700 == h) {
 536       return GETCLASSFROMTABLE(gLBClass17, l);
 537     }
 538     if (NS_NeedsPlatformNativeHandling(u)) {
 539       return CLASS_COMPLEX;
 540     }
 541     if (0x0E00 == h) {
 542       return GETCLASSFROMTABLE(gLBClass0E, l);
 543     }
 544     if (0x2000 == h) {
 545       return GETCLASSFROMTABLE(gLBClass20, l);
 546     }
 547     if (0x2100 == h) {
 548       return GETCLASSFROMTABLE(gLBClass21, l);
 549     }
 550     if (0x3000 == h) {
 551       return GETCLASSFROMTABLE(gLBClass30, l);
 552     }
 553     if (0xff00 == h) {
 554       if (l < 0x0060) {  // Fullwidth ASCII variant
 555         return GETCLASSFROMTABLE(gLBClass00, (l + 0x20));
 556       }
 557       if (l < 0x00a0) {  // Halfwidth Katakana variants
 558         switch (l) {
 559           case 0x61:
 560             return GetClass(0x3002, aLevel, aIsChineseOrJapanese);
 561           case 0x62:
 562             return GetClass(0x300c, aLevel, aIsChineseOrJapanese);
 563           case 0x63:
 564             return GetClass(0x300d, aLevel, aIsChineseOrJapanese);
 565           case 0x64:
 566             return GetClass(0x3001, aLevel, aIsChineseOrJapanese);
 567           case 0x65:
 568             return GetClass(0x30fb, aLevel, aIsChineseOrJapanese);
 569           case 0x9e:
 570             return GetClass(0x309b, aLevel, aIsChineseOrJapanese);
 571           case 0x9f:
 572             return GetClass(0x309c, aLevel, aIsChineseOrJapanese);
 573           default:
 574             if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) {
 575               return CLASS_CLOSE;  // jis x4051 class 3
 576             }
 577             return CLASS_BREAKABLE;  // jis x4051 class 11
 578         }
 579       }
 580       if (l < 0x00e0) {
 581         return CLASS_CHARACTER;  // Halfwidth Hangul variants
 582       }
 583       if (l < 0x00f0) {
 584         static char16_t NarrowFFEx[16] = {
 585             0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000,
 586             0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000};
 587         return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese);
 588       }
 589     } else if (0x3100 == h) {
 590       if (l <= 0xbf) {  // Hangul Compatibility Jamo, Bopomofo, Kanbun
 591                         // XXX: This is per UAX #14, but UAX #14 may change
 592                         // the line breaking rules about Kanbun and Bopomofo.
 593         return CLASS_BREAKABLE;
 594       }
 595       if (l >= 0xf0) {  // Katakana small letters for Ainu
 596         return CLASS_CLOSE;
 597       }
 598     } else if (0x0300 == h) {
 599       if (0x4F == l || (0x5C <= l && l <= 0x62)) {
 600         return CLASS_NON_BREAKABLE;
 601       }
 602     } else if (0x0500 == h) {
 603       // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14)
 604       if (l == 0x8A) {
 605         return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN));
 606       }
 607     } else if (0x0F00 == h) {
 608       if (0x08 == l || 0x0C == l || 0x12 == l) {
 609         return CLASS_NON_BREAKABLE;
 610       }
 611     } else if (0x1800 == h) {
 612       if (0x0E == l) {
 613         return CLASS_NON_BREAKABLE;
 614       }
 615     } else if (0x1600 == h) {
 616       if (0x80 == l) {  // U+1680 OGHAM SPACE MARK
 617         return CLASS_BREAKABLE;
 618       }
 619     } else if (u == 0xfeff) {
 620       return CLASS_NON_BREAKABLE;
 621     }
 622   }
 623
 624   return sUnicodeLineBreakToClass[cls];
 625 }
 626
 627 static bool GetPair(int8_t c1, int8_t c2) {
 628   NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
 629   NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
 630
 631   return (0 == ((gPair[c1] >> c2) & 0x0001));
 632 }
 633
 634 static bool GetPairConservative(int8_t c1, int8_t c2) {
 635   NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1");
 636   NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2");
 637
 638   return (0 == ((gPairConservative[c1] >> c2) & 0x0001));
 639 }
 640
 641 class ContextState {
 642  public:
 643   ContextState(const char16_t* aText, uint32_t aLength)
 644       : mUniText(aText), mText(nullptr), mLength(aLength) {
 645     Init();
 646   }
 647
 648   ContextState(const uint8_t* aText, uint32_t aLength)
 649       : mUniText(nullptr), mText(aText), mLength(aLength) {
 650     Init();
 651   }
 652
 653   uint32_t Length() const { return mLength; }
 654   uint32_t Index() const { return mIndex; }
 655
 656   // This gets a single code unit of the text, without checking for surrogates
 657   // (in the case of a 16-bit text buffer). That's OK if we're only checking for
 658   // specific characters that are known to be BMP values.
 659   char16_t GetCodeUnitAt(uint32_t aIndex) const {
 660     MOZ_ASSERT(aIndex < mLength, "Out of range!");
 661     return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]);
 662   }
 663
 664   // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs
 665   // as necessary. It must ONLY be called for 16-bit text, not 8-bit.
 666   char32_t GetUnicodeCharAt(uint32_t aIndex) const {
 667     MOZ_ASSERT(mUniText, "Only for 16-bit text!");
 668     MOZ_ASSERT(aIndex < mLength, "Out of range!");
 669     char32_t c = mUniText[aIndex];
 670     if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) {
 671       c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]);
 672     }
 673     return c;
 674   }
 675
 676   void AdvanceIndex() { ++mIndex; }
 677
 678   void NotifyBreakBefore() { mLastBreakIndex = mIndex; }
 679
 680   // A word of western language should not be broken. But even if the word has
 681   // only ASCII characters, non-natural context words should be broken, e.g.,
 682   // URL and file path. For protecting the natural words, we should use
 683   // conservative breaking rules at following conditions:
 684   //   1. at near the start of word
 685   //   2. at near the end of word
 686   //   3. at near the latest broken point
 687   // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters,
 688   // which varies depending whether we are looking at a letter or a non-letter
 689   // character: for non-letters, we use an extended "conservative" range.
 690
 691 #define CONSERVATIVE_RANGE_LETTER 2
 692 #define CONSERVATIVE_RANGE_OTHER 6
 693
 694   bool UseConservativeBreaking(uint32_t aOffset = 0) const {
 695     if (mHasCJKChar) return false;
 696     uint32_t index = mIndex + aOffset;
 697
 698     // If the character at index is a letter (rather than various punctuation
 699     // characters, etc) then we want a shorter "conservative" range
 700     uint32_t conservativeRangeStart, conservativeRangeEnd;
 701     if (index < mLength &&
 702         nsUGenCategory::kLetter ==
 703             (mText ? GetGenCategory(mText[index])
 704                    : GetGenCategory(GetUnicodeCharAt(index)))) {
 705       // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start
 706       // to get more balanced behavior (if we break off a 2-letter prefix,
 707       // that means the break will actually be three letters from start of
 708       // word, to include the hyphen; whereas a 2-letter suffix will be
 709       // broken only two letters from end of word).
 710       conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER;
 711       conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1;
 712     } else {
 713       conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER;
 714     }
 715
 716     bool result = (index < conservativeRangeStart ||
 717                    mLength - index < conservativeRangeEnd ||
 718                    index - mLastBreakIndex < conservativeRangeStart);
 719     if (result || !mHasNonbreakableSpace) return result;
 720
 721     // This text has no-breakable space, we need to check whether the index
 722     // is near it.
 723
 724     // Note that index is always larger than conservativeRange here.
 725     for (uint32_t i = index; index - conservativeRangeStart < i; --i) {
 726       if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true;
 727     }
 728     // Note that index is always less than mLength - conservativeRange.
 729     for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) {
 730       if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true;
 731     }
 732     return false;
 733   }
 734
 735   bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; }
 736   void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; }
 737
 738   bool HasPreviousSlash() const { return mHasPreviousSlash; }
 739   void NotifySeenSlash() { mHasPreviousSlash = true; }
 740
  // Reports whether NotifySeenBackslash() has been called since Init() last
  // cleared the flag.
  bool HasPreviousBackslash() const { return mHasPreviousBackslash; }
  // Records that a U_BACKSLASH has been encountered; the flag stays set until
  // Init() clears it.
  void NotifySeenBackslash() { mHasPreviousBackslash = true; }
 743
  // Returns the character most recently passed to
  // NotifyNonHyphenCharacter(), or U_NULL if none has been recorded since
  // Init(). Note the stored member is a char32_t while the return type is
  // uint32_t.
  uint32_t GetPreviousNonHyphenCharacter() const {
    return mPreviousNonHyphenCharacter;
  }
  // Remembers |ch| as the latest non-hyphen character. Callers in this file
  // invoke it for every character that is not a hyphen.
  void NotifyNonHyphenCharacter(uint32_t ch) {
    mPreviousNonHyphenCharacter = ch;
  }
 750
 751  private:
 752   void Init() {
 753     mIndex = 0;
 754     mLastBreakIndex = 0;
 755     mPreviousNonHyphenCharacter = U_NULL;
 756     mHasCJKChar = false;
 757     mHasNonbreakableSpace = false;
 758     mHasPreviousEqualsSign = false;
 759     mHasPreviousSlash = false;
 760     mHasPreviousBackslash = false;
 761
 762     if (mText) {
 763       // 8-bit text: we only need to check for &nbsp;
 764       for (uint32_t i = 0; i < mLength; ++i) {
 765         if (IS_NONBREAKABLE_SPACE(mText[i])) {
 766           mHasNonbreakableSpace = true;
 767           break;
 768         }
 769       }
 770     } else {
 771       // 16-bit text: handle surrogates and check for CJK as well as &nbsp;
 772       for (uint32_t i = 0; i < mLength; ++i) {
 773         char32_t u = GetUnicodeCharAt(i);
 774         if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) {
 775           mHasNonbreakableSpace = true;
 776           if (mHasCJKChar) {
 777             break;
 778           }
 779         } else if (!mHasCJKChar && IS_CJK_CHAR(u)) {
 780           mHasCJKChar = 1;
 781           if (mHasNonbreakableSpace) {
 782             break;
 783           }
 784         }
 785         if (u > 0xFFFFu) {
 786           ++i;  // step over trailing low surrogate
 787         }
 788       }
 789     }
 790   }
 791
  // NOTE(review): presumably the 16-bit text buffer used when mText is null;
  // its reads are outside this part of the file -- confirm.
  const char16_t* const mUniText;
  // 8-bit text buffer. Init() treats a null mText as "this is 16-bit text".
  const uint8_t* const mText;

  // Current scan position; reset to 0 by Init().
  uint32_t mIndex;
  const uint32_t mLength;  // length of text
  // Position of the most recent break; reset to 0 by Init() and used by the
  // conservative-breaking distance check.
  uint32_t mLastBreakIndex;
  char32_t mPreviousNonHyphenCharacter;  // The last character we have seen
                                         // which is not U_HYPHEN
  bool mHasCJKChar;             // True if the text has a CJK character.
  bool mHasNonbreakableSpace;   // True if the text has a non-breakable
                                // space.
  bool mHasPreviousEqualsSign;  // True if we have seen a U_EQUAL
  bool mHasPreviousSlash;       // True if we have seen a U_SLASH
  bool mHasPreviousBackslash;   // True if we have seen a U_BACKSLASH
 806 };
 807
// Determines the line-break class of |cur| for characters whose class depends
// on their surroundings: hyphens, slashes and backslashes, '%', '&' and ';',
// and the opening quotation marks.
//
//   prev, next - the characters on either side of |cur|; the callers in this
//                file pass 0 / U_NULL when there is no such character.
//   aState     - per-text scanning state. It is queried (conservative
//                breaking, previously seen characters) and updated (non-hyphen
//                character, slash/backslash seen).
//   aLevel, aIsChineseOrJapanese - forwarded unchanged to GetClass().
//
// Returns a CLASS_* value. When none of the special cases applies the result
// is GetClass(cur, aLevel, aIsChineseOrJapanese).
static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next,
                                 ContextState& aState,
                                 LineBreaker::Strictness aLevel,
                                 bool aIsChineseOrJapanese) {
  // CLASS_OPEN/CLASS_CLOSE are only ever returned from paths guarded by
  // !aState.UseConservativeBreaking(...).

  if (IS_HYPHEN(cur)) {
    // If next character is hyphen, we don't need to break between them.
    if (IS_HYPHEN(next)) return CLASS_CHARACTER;
    // If prev and next characters are numeric, it may be in Math context.
    // So, we should not break here.
    bool prevIsNum = IS_ASCII_DIGIT(prev);
    bool nextIsNum = IS_ASCII_DIGIT(next);
    if (prevIsNum && nextIsNum) return CLASS_NUMERIC;
    // If one side is numeric and the other is a character, or if both sides are
    // characters, the hyphen should be breakable.
    // NOTE(review): the argument 1 presumably shifts the queried position to
    // just after the hyphen, where the break would occur -- confirm against
    // UseConservativeBreaking().
    if (!aState.UseConservativeBreaking(1)) {
      // Use the last non-hyphen character rather than |prev| so that a run of
      // hyphens is judged by the character that precedes the whole run.
      char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter();
      if (prevOfHyphen && next) {
        int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese);
        int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese);
        bool prevIsNumOrCharOrClose =
            prevIsNum ||
            (prevClass == CLASS_CHARACTER &&
             !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) ||
            prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER;
        bool nextIsNumOrCharOrOpen =
            nextIsNum ||
            (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) ||
            nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER ||
            next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE ||
            next == U_OPEN_GUILLEMET;
        if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) {
          return CLASS_CLOSE;
        }
      }
    }
  } else {
    // Every non-hyphen character handled here is remembered so that a later
    // hyphen can look back at it.
    aState.NotifyNonHyphenCharacter(cur);
    if (cur == U_SLASH || cur == U_BACKSLASH) {
      // If this is immediately after same char, we should not break here.
      // (Note that this early return also skips the NotifySeen* call below.)
      if (prev == cur) return CLASS_CHARACTER;
      // If this text has two or more (BACK)SLASHs, this may be file path or
      // URL. Make sure to compute shouldReturn before we notify on this slash.
      bool shouldReturn = !aState.UseConservativeBreaking() &&
                          (cur == U_SLASH ? aState.HasPreviousSlash()
                                          : aState.HasPreviousBackslash());

      if (cur == U_SLASH) {
        aState.NotifySeenSlash();
      } else {
        aState.NotifySeenBackslash();
      }

      if (shouldReturn) return CLASS_OPEN;
    } else if (cur == U_PERCENT) {
      // If this is a part of the param of URL, we should break before.
      // A '%' three code units before or after the current index is taken as
      // the sign of a run of %XX escapes.
      if (!aState.UseConservativeBreaking()) {
        if (aState.Index() >= 3 &&
            aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT)
          return CLASS_OPEN;
        if (aState.Index() + 3 < aState.Length() &&
            aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT)
          return CLASS_OPEN;
      }
    } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) {
      // If this may be a separator of params of URL, we should break after.
      if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign())
        return CLASS_CLOSE;
    } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE ||
               cur == U_OPEN_GUILLEMET) {
      // for CJK usage, we treat these as openers to allow a break before them,
      // but otherwise treat them as normal characters because quote mark usage
      // in various Western languages varies too much; see bug #450088
      // discussion.
      if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next))
        return CLASS_OPEN;
    } else {
      // Reaching this point means the caller asked for contextual analysis of
      // a character that none of the branches above knows about.
      NS_ERROR("Forgot to handle the current character!");
    }
  }
  // No special case applied: fall back to the context-free class.
  return GetClass(cur, aLevel, aIsChineseOrJapanese);
}
 891
 892 int32_t LineBreaker::WordMove(const char16_t* aText, uint32_t aLen,
 893                               uint32_t aPos, int8_t aDirection) {
 894   bool textNeedsJISx4051 = false;
 895   int32_t begin, end;
 896
 897   for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) {
 898     if (IS_CJK_CHAR(aText[begin]) ||
 899         NS_NeedsPlatformNativeHandling(aText[begin])) {
 900       textNeedsJISx4051 = true;
 901     }
 902   }
 903   for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) {
 904     if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) {
 905       textNeedsJISx4051 = true;
 906     }
 907   }
 908
 909   int32_t ret;
 910   AutoTArray<uint8_t, 2000> breakState;
 911   if (!textNeedsJISx4051 || !breakState.AppendElements(end - begin)) {
 912     // No complex text character, do not try to do complex line break.
 913     // (This is required for serializers. See Bug #344816.)
 914     // Also fall back to this when out of memory.
 915     if (aDirection < 0) {
 916       ret = (begin == int32_t(aPos)) ? begin - 1 : begin;
 917     } else {
 918       ret = end;
 919     }
 920   } else {
 921     GetJISx4051Breaks(aText + begin, end - begin, WordBreak::Normal,
 922                       Strictness::Auto, false, breakState.Elements());
 923
 924     ret = aPos;
 925     do {
 926       ret += aDirection;
 927     } while (begin < ret && ret < end && !breakState[ret - begin]);
 928   }
 929
 930   return ret;
 931 }
 932
 933 int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
 934   NS_ASSERTION(aText, "aText shouldn't be null");
 935   NS_ASSERTION(aLen > aPos,
 936                "Bad position passed to nsJISx4051LineBreaker::Next");
 937
 938   int32_t nextPos = WordMove(aText, aLen, aPos, 1);
 939   return nextPos < int32_t(aLen) ? nextPos : NS_LINEBREAKER_NEED_MORE_TEXT;
 940 }
 941
 942 int32_t LineBreaker::Prev(const char16_t* aText, uint32_t aLen, uint32_t aPos) {
 943   NS_ASSERTION(aText, "aText shouldn't be null");
 944   NS_ASSERTION(aLen >= aPos && aPos > 0,
 945                "Bad position passed to nsJISx4051LineBreaker::Prev");
 946
 947   int32_t prevPos = WordMove(aText, aLen, aPos, -1);
 948   return prevPos > 0 ? prevPos : NS_LINEBREAKER_NEED_MORE_TEXT;
 949 }
 950
 951 void LineBreaker::GetJISx4051Breaks(const char16_t* aChars, uint32_t aLength,
 952                                     WordBreak aWordBreak, Strictness aLevel,
 953                                     bool aIsChineseOrJapanese,
 954                                     uint8_t* aBreakBefore) {
 955   uint32_t cur;
 956   int8_t lastClass = CLASS_NONE;
 957   ContextState state(aChars, aLength);
 958
 959   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
 960     char32_t ch = state.GetUnicodeCharAt(cur);
 961     uint32_t chLen = ch > 0xFFFFu ? 2 : 1;
 962     int8_t cl;
 963
 964     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
 965       char32_t prev, next;
 966       if (cur > 0) {
 967         // not using state.GetUnicodeCharAt() here because we're looking back
 968         // rather than forward for possible surrogates
 969         prev = aChars[cur - 1];
 970         if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], prev)) {
 971           prev = SURROGATE_TO_UCS4(aChars[cur - 2], prev);
 972         }
 973       } else {
 974         prev = 0;
 975       }
 976       if (cur + chLen < aLength) {
 977         next = state.GetUnicodeCharAt(cur + chLen);
 978       } else {
 979         next = 0;
 980       }
 981       cl = ContextualAnalysis(prev, ch, next, state, aLevel,
 982                               aIsChineseOrJapanese);
 983     } else {
 984       if (ch == U_EQUAL) state.NotifySeenEqualsSign();
 985       state.NotifyNonHyphenCharacter(ch);
 986       cl = GetClass(ch, aLevel, aIsChineseOrJapanese);
 987     }
 988
 989     // To implement word-break:break-all, we overwrite the line-break class of
 990     // alphanumeric characters so they are treated the same as ideographic.
 991     // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE,
 992     // or _NUMERIC by GetClass(), but those classes also include others that
 993     // we don't want to touch here, so we re-check the Unicode line-break class
 994     // to determine which ones to modify.
 995     if (aWordBreak == WordBreak::BreakAll &&
 996         (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || cl == CLASS_NUMERIC)) {
 997       auto cls = GetLineBreakClass(ch);
 998       if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC ||
 999           cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT ||
1000           /* Additional Japanese and Korean LB classes; CSS Text spec doesn't
1001              explicitly mention these, but this appears to give expected
1002              behavior (spec issue?) */
1003           cls == U_LB_CONDITIONAL_JAPANESE_STARTER ||
1004           (cls >= U_LB_H2 && cls <= U_LB_JV)) {
1005         cl = CLASS_BREAKABLE;
1006       }
1007     }
1008
1009     bool allowBreak = false;
1010     if (cur > 0) {
1011       NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl,
1012                    "Loop should have prevented adjacent complex chars here");
1013       if (aWordBreak == WordBreak::Normal ||
1014           aWordBreak == WordBreak::BreakAll) {
1015         allowBreak = (state.UseConservativeBreaking())
1016                          ? GetPairConservative(lastClass, cl)
1017                          : GetPair(lastClass, cl);
1018       }
1019     }
1020     aBreakBefore[cur] = allowBreak;
1021     if (allowBreak) state.NotifyBreakBefore();
1022     lastClass = cl;
1023     if (CLASS_COMPLEX == cl) {
1024       uint32_t end = cur + chLen;
1025
1026       while (end < aLength) {
1027         char32_t c = state.GetUnicodeCharAt(end);
1028         if (CLASS_COMPLEX != GetClass(c, aLevel, false)) {
1029           break;
1030         }
1031         ++end;
1032         if (c > 0xFFFFU) {  // it was a surrogate pair
1033           ++end;
1034         }
1035       }
1036
1037       if (aWordBreak == WordBreak::BreakAll) {
1038         // For break-all, we don't need to run a dictionary-based breaking
1039         // algorithm, we just allow breaks between all grapheme clusters.
1040         ClusterIterator ci(aChars + cur, end - cur);
1041         while (!ci.AtEnd()) {
1042           ci.Next();
1043           aBreakBefore[ci - aChars] = true;
1044         }
1045       } else {
1046         NS_GetComplexLineBreaks(aChars + cur, end - cur, aBreakBefore + cur);
1047         // restore breakability at chunk begin, which was always set to false
1048         // by the complex line breaker
1049         aBreakBefore[cur] = allowBreak;
1050       }
1051
1052       cur = end - 1;
1053     }
1054
1055     if (chLen == 2) {
1056       // Supplementary-plane character: mark that we cannot break before the
1057       // trailing low surrogate, and advance past it.
1058       ++cur;
1059       aBreakBefore[cur] = false;
1060       state.AdvanceIndex();
1061     }
1062   }
1063 }
1064
1065 void LineBreaker::GetJISx4051Breaks(const uint8_t* aChars, uint32_t aLength,
1066                                     WordBreak aWordBreak, Strictness aLevel,
1067                                     bool aIsChineseOrJapanese,
1068                                     uint8_t* aBreakBefore) {
1069   uint32_t cur;
1070   int8_t lastClass = CLASS_NONE;
1071   ContextState state(aChars, aLength);
1072
1073   for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) {
1074     char32_t ch = aChars[cur];
1075     int8_t cl;
1076
1077     if (NEED_CONTEXTUAL_ANALYSIS(ch)) {
1078       cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch,
1079                               cur + 1 < aLength ? aChars[cur + 1] : U_NULL,
1080                               state, aLevel, aIsChineseOrJapanese);
1081     } else {
1082       if (ch == U_EQUAL) state.NotifySeenEqualsSign();
1083       state.NotifyNonHyphenCharacter(ch);
1084       cl = GetClass(ch, aLevel, aIsChineseOrJapanese);
1085     }
1086     if (aWordBreak == WordBreak::BreakAll &&
1087         (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || cl == CLASS_NUMERIC)) {
1088       auto cls = GetLineBreakClass(ch);
1089       // Don't need to check additional Japanese/Korean classes in 8-bit
1090       if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC ||
1091           cls == U_LB_COMPLEX_CONTEXT) {
1092         cl = CLASS_BREAKABLE;
1093       }
1094     }
1095
1096     bool allowBreak = false;
1097     if (cur > 0) {
1098       if (aWordBreak == WordBreak::Normal ||
1099           aWordBreak == WordBreak::BreakAll) {
1100         allowBreak = (state.UseConservativeBreaking())
1101                          ? GetPairConservative(lastClass, cl)
1102                          : GetPair(lastClass, cl);
1103       }
1104     }
1105     aBreakBefore[cur] = allowBreak;
1106     if (allowBreak) state.NotifyBreakBefore();
1107     lastClass = cl;
1108   }
1109 }