src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
   3    Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the Reference Implementation and most other implementations,
  26    this one is designed to be called once for every character in the
  27    buffer or string.  That way, we can leave intact the design of the
  28    Emacs display engine, whereby an iterator object is used to
  29    traverse buffer or string text character by character, and generate
  30    the necessary data for displaying each character in 'struct glyph'
  31    objects.  (See xdisp.c for the details of that iteration.)  The
  32    functions on this file replace the original linear iteration in the
  33    logical order of the text with a non-linear iteration in the visual
  34    order, i.e. in the order characters should be shown on display.
  35
  36    The main entry point is bidi_move_to_visually_next.  Each time it
  37    is called, it finds the next character in the visual order, and
  38    returns its information in a special structure.  The caller is then
  39    expected to process this character for display or any other
  40    purposes, and call bidi_move_to_visually_next for the next
  41    character.  See the comments in bidi_move_to_visually_next for more
  42    details about its algorithm that finds the next visual-order
  43    character by resolving their levels on the fly.
  44
  45    Two other entry points are bidi_paragraph_init and
  46    bidi_mirror_char.  The first determines the base direction of a
  47    paragraph, while the second returns the mirrored version of its
  48    argument character.
  49
  50    A few auxiliary entry points are used to initialize the bidi
  51    iterator for iterating an object (buffer or string), push and pop
  52    the bidi iterator state, and save and restore the state of the bidi
  53    cache.
  54
  55    If you want to understand the code, you will have to read it
  56    together with the relevant portions of UAX#9.  The comments include
  57    references to UAX#9 rules, for that very reason.
  58
  59    A note about references to UAX#9 rules: if the reference says
  60    something like "X9/Retaining", it means that you need to refer to
  61    rule X9 and to its modifications described in the "Implementation
  62    Notes" section of UAX#9, under "Retaining Format Codes".
  63
  64    Here's the overview of the design of the reordering engine
  65    implemented by this file.
  66
  67    Basic implementation structure
  68    ------------------------------
  69
  70    The sequential processing steps described by UAX#9 are implemented
  71    as recursive levels of processing, all of which examine the next
  72    character in the logical order.  This hierarchy of processing looks
  73    as follows, from the innermost (deepest) to the outermost level,
  74    omitting some subroutines used by each level:
  75
  76      bidi_fetch_char         -- fetch next character
  77      bidi_resolve_explicit   -- resolve explicit levels and directions
  78      bidi_resolve_weak       -- resolve weak types
  79      bidi_resolve_brackets   -- resolve "paired brackets" neutral types
  80      bidi_resolve_neutral    -- resolve neutral types
  81      bidi_level_of_next_char -- resolve implicit levels
  82
  83    Each level calls the level below it, and works on the result
  84    returned by the lower level, including all of its sub-levels.
  85
  86    Unlike all the levels below it, bidi_level_of_next_char can return
  87    the information about either the next or previous character in the
  88    logical order, depending on the current direction of scanning the
  89    buffer or string.  For the next character, it calls all the levels
  90    below it; for the previous character, it uses the cache, described
  91    below.
  92
  93    Thus, the result of calling bidi_level_of_next_char is the resolved
  94    level of the next or the previous character in the logical order.
  95    Based on this information, the function bidi_move_to_visually_next
  96    finds the next character in the visual order and updates the
  97    direction in which the buffer is scanned, either forward or
  98    backward, to find the next character to be displayed.  (Text is
  99    scanned backwards when it needs to be reversed for display, i.e. if
 100    the visual order is the inverse of the logical order.)  This
 101    implements the last, reordering steps of the UBA, by successively
 102    calling bidi_level_of_next_char until the character of the required
 103    embedding level is found; the scan direction is dynamically updated
 104    as a side effect.  See the commentary before the 'while' loop in
 105    bidi_move_to_visually_next, for the details.
 106
 107    Fetching characters
 108    -------------------
 109
 110    In a nutshell, fetching the next character boils down to calling
 111    STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
 112    string position.  See bidi_fetch_char.  However, if the next
 113    character is "covered" by a display property of some kind,
 114    bidi_fetch_char returns the u+FFFC "object replacement character"
 115    that represents the entire run of text covered by the display
 116    property.  (The ch_len and nchars members of 'struct bidi_it'
 117    reflect the length in bytes and characters of that text.)  This is
 118    so we reorder text on both sides of the display property as
 119    appropriate for an image or embedded string.  Similarly, text
 120    covered by a display spec of the form '(space ...)', is replaced
 121    with the u+2029 paragraph separator character, so such display
 122    specs produce the same effect as a TAB under UBA.  Both these
 123    special characters are not actually displayed -- the display
 124    property is displayed instead -- but just used to compute the
 125    embedding level of the surrounding text so as to produce the
 126    required effect.
 127
 128    Bidi iterator states
 129    --------------------
 130
 131    The UBA is highly context dependent in some of its parts,
 132    i.e. results of processing a character can generally depend on
 133    characters very far away.  The UAX#9 description of the UBA
 134    prescribes a stateful processing of each character, whereby the
 135    results of this processing depend on various state variables, such
 136    as the current embedding level, level stack, and directional
 137    override status.  In addition, the UAX#9 description includes many
 138    passages like this (from rule W2 in this case):
 139
 140      Search backward from each instance of a European number until the
 141      first strong type (R, L, AL, or sos) is found. If an AL is found,
 142      change the type of the European number to Arabic number.
 143
 144    To support this, we use a bidi iterator object, 'struct bidi_it',
 145    which is a sub-structure of 'struct it' used by xdisp.c (see
 146    dispextern.h for the definition of both of these structures).  The
 147    bidi iterator holds the entire state of the iteration required by
 148    the UBA, and is updated as the text is traversed.  In particular,
 149    the embedding level of the current character being resolved is
 150    recorded in the iterator state.  To avoid costly searches backward
 151    in support of rules like W2 above, the necessary character types
 152    are also recorded in the iterator state as they are found during
 153    the forward scan, and then used when such rules need to be applied.
 154    (Forward scans cannot be avoided in this way; they need to be
 155    performed at least once, and the results recorded in the iterator
 156    state, to be reused until the forward scan oversteps the recorded
 157    position.)
 158
 159    In this manner, the iterator state acts as a mini-cache of
 160    contextual information required for resolving the level of the
 161    current character by various UBA rules.
 162
 163    Caching of bidi iterator states
 164    -------------------------------
 165
 166    As described above, the reordering engine uses the information
 167    recorded in the bidi iterator state in order to resolve the
 168    embedding level of the current character.  When the reordering
 169    engine needs to process the next character in the logical order, it
 170    fetches it and applies to it all the UBA levels, updating the
 171    iterator state as it goes.  But when the buffer or string is
 172    scanned backwards, i.e. in the reverse order of buffer/string
 173    positions, the scanned characters were already processed during the
 174    preceding forward scan (see bidi_find_other_level_edge).  To avoid
 175    costly re-processing of characters that were already processed
 176    during the forward scan, the iterator states computed while
 177    scanning forward are cached.
 178
 179    The cache is just a linear array of 'struct bidi_it' objects, which
 180    is dynamically allocated and reallocated as needed, since the size
 181    of the cache depends on the text being processed.  We only need the
 182    cache while processing embedded levels higher than the base
 183    paragraph embedding level, because these higher levels require
 184    changes in scan direction.  Therefore, as soon as we are back to
 185    the base embedding level, we can free the cache; see the calls to
 186    bidi_cache_reset and bidi_cache_shrink, for the conditions to do
 187    this.
 188
 189    The cache maintains the index of the next unused cache slot -- this
 190    is where the next iterator state will be cached.  The function
 191    bidi_cache_iterator_state saves an instance of the state in the
 192    cache and increments the unused slot index.  The companion function
 193    bidi_cache_find looks up a cached state that corresponds to a given
 194    buffer/string position.  All of the cached states must correspond
 195    1:1 to the buffer or string region whose processing they reflect;
 196    bidi.c will abort if it finds cache slots that violate this 1:1
 197    correspondence.
 198
 199    When the parent iterator 'struct it' is pushed (see push_it in
 200    xdisp.c) to pause the current iteration and start iterating over a
 201    different object (e.g., a 'display' string that covers some buffer
 202    text), the bidi iterator cache needs to be "pushed" as well, so
 203    that a new empty cache could be used while iterating over the new
 204    object.  Later, when the new object is exhausted, and xdisp.c calls
 205    pop_it, we need to "pop" the bidi cache as well and return to the
 206    original cache.  See bidi_push_it and bidi_pop_it for how this is
 207    done.
 208
 209    Some functions of the display engine save copies of 'struct it' in
 210    local variables, and restore them later.  For examples, see
 211    pos_visible_p and move_it_in_display_line_to in xdisp.c, and
 212    window_scroll_pixel_based in window.c.  When this happens, we need
 213    to save and restore the bidi cache as well, because conceptually
 214    the cache is part of the 'struct it' state, and needs to be in
 215    perfect sync with the portion of the buffer/string that is being
 216    processed.  This saving and restoring of the cache state is handled
 217    by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
 218    SAVE_IT and RESTORE_IT defined on xdisp.c.
 219
 220    Note that, because reordering is implemented below the level in
 221    xdisp.c that breaks glyphs into screen lines, we are violating
 222    paragraph 3.4 of UAX#9. which mandates that line breaking shall be
 223    done before reordering each screen line separately.  However,
 224    following UAX#9 to the letter in this matter goes against the basic
 225    design of the Emacs display engine, and so we choose here this
 226    minor deviation from the UBA letter in preference to redesign of
 227    the display engine.  The effect of this is only seen in continued
 228    lines that are broken into screen lines in the middle of a run
 229    whose direction is opposite to the paragraph's base direction.
 230
 231    Important design and implementation note: when the code needs to
 232    scan far ahead, be sure to avoid such scans as much as possible
 233    when the buffer/string doesn't contain any RTL characters.  Users
 234    of left-to-right scripts will never forgive you if you introduce
 235    some slow-down due to bidi in situations that don't involve any
 236    bidirectional text.  See the large comment near the beginning of
 237    bidi_resolve_neutral, for one situation where such shortcut was
 238    necessary.  */
 239
 240 #include <config.h>
 241 #include <stdio.h>
 242
 243 #include "lisp.h"
 244 #include "character.h"
 245 #include "buffer.h"
 246 #include "dispextern.h"
 247 #include "region-cache.h"
 248
 249 static bool bidi_initialized = 0;
 250
 251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
 252
 253 #define BIDI_EOB   (-1)
 254
 255 /* Data type for describing the bidirectional character categories.  */
 256 typedef enum {
 257   UNKNOWN_BC,
 258   NEUTRAL,
 259   WEAK,
 260   STRONG,
 261   EXPLICIT_FORMATTING
 262 } bidi_category_t;
 263
 264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
 265 static Lisp_Object Qparagraph_start, Qparagraph_separate;
 266
 267 \f
 268 /***********************************************************************
 269                         Utilities
 270  ***********************************************************************/
 271
 272 /* Return the bidi type of a character CH, subject to the current
 273    directional OVERRIDE.  */
 274 static bidi_type_t
 275 bidi_get_type (int ch, bidi_dir_t override)
 276 {
 277   bidi_type_t default_type;
 278
 279   if (ch == BIDI_EOB)
 280     return NEUTRAL_B;
 281   if (ch < 0 || ch > MAX_CHAR)
 282     emacs_abort ();
 283
 284   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 285   /* Every valid character code, even those that are unassigned by the
 286      UCD, have some bidi-class property, according to
 287      DerivedBidiClass.txt file.  Therefore, if we ever get UNKNOWN_BT
 288      (= zero) code from CHAR_TABLE_REF, that's a bug.  */
 289   if (default_type == UNKNOWN_BT)
 290     emacs_abort ();
 291
 292   switch (default_type)
 293     {
 294       case WEAK_BN:
 295       case NEUTRAL_B:
 296       case LRE:
 297       case LRO:
 298       case RLE:
 299       case RLO:
 300       case PDF:
 301       case LRI:
 302       case RLI:
 303       case FSI:
 304       case PDI:
 305         return default_type;
 306       default:
 307         if (override == L2R)
 308           return STRONG_L;
 309         else if (override == R2L)
 310           return STRONG_R;
 311         else
 312           return default_type;
 313     }
 314 }
 315
 316 static void
 317 bidi_check_type (bidi_type_t type)
 318 {
 319   eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
 320 }
 321
 322 /* Given a bidi TYPE of a character, return its category.  */
 323 static bidi_category_t
 324 bidi_get_category (bidi_type_t type)
 325 {
 326   switch (type)
 327     {
 328       case UNKNOWN_BT:
 329         return UNKNOWN_BC;
 330       case STRONG_L:
 331       case STRONG_R:
 332       case STRONG_AL:
 333         return STRONG;
 334       case WEAK_EN:
 335       case WEAK_ES:
 336       case WEAK_ET:
 337       case WEAK_AN:
 338       case WEAK_CS:
 339       case WEAK_NSM:
 340       case WEAK_BN:
 341         return WEAK;
 342       case NEUTRAL_B:
 343       case NEUTRAL_S:
 344       case NEUTRAL_WS:
 345       case NEUTRAL_ON:
 346         return NEUTRAL;
 347       case LRE:
 348       case LRO:
 349       case RLE:
 350       case RLO:
 351       case PDF:
 352       case LRI:
 353       case RLI:
 354       case FSI:
 355       case PDI:
 356         return EXPLICIT_FORMATTING;
 357       default:
 358         emacs_abort ();
 359     }
 360 }
 361
 362 static bool
 363 bidi_isolate_fmt_char (bidi_type_t ch_type)
 364 {
 365   return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
 366 }
 367
 368 /* Return the mirrored character of C, if it has one.  If C has no
 369    mirrored counterpart, return C.
 370    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 371    context must be tested by the caller.  */
 372 int
 373 bidi_mirror_char (int c)
 374 {
 375   Lisp_Object val;
 376
 377   if (c == BIDI_EOB)
 378     return c;
 379   if (c < 0 || c > MAX_CHAR)
 380     emacs_abort ();
 381
 382   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 383   if (INTEGERP (val))
 384     {
 385       int v;
 386
 387       /* When debugging, check before assigning to V, so that the check
 388          isn't broken by undefined behavior due to int overflow.  */
 389       eassert (CHAR_VALID_P (XINT (val)));
 390
 391       v = XINT (val);
 392
 393       /* Minimal test we must do in optimized builds, to prevent weird
 394          crashes further down the road.  */
 395       if (v < 0 || v > MAX_CHAR)
 396         emacs_abort ();
 397
 398       return v;
 399     }
 400
 401   return c;
 402 }
 403
 404 /* Return the Bidi_Paired_Bracket_Type property of the character C.  */
 405 static bidi_bracket_type_t
 406 bidi_paired_bracket_type (int c)
 407 {
 408   if (c == BIDI_EOB)
 409     return BIDI_BRACKET_NONE;
 410   if (c < 0 || c > MAX_CHAR)
 411     emacs_abort ();
 412
 413   return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
 414 }
 415
 416 /* Determine the start-of-sequence (sos) directional type given the two
 417    embedding levels on either side of the run boundary.  Also, update
 418    the saved info about previously seen characters, since that info is
 419    generally valid for a single level run.  */
 420 static void
 421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
 422 {
 423   int higher_level = (level_before > level_after ? level_before : level_after);
 424
 425   /* FIXME: should the default sos direction be user selectable?  */
 426   bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
 427
 428   bidi_it->prev.type = UNKNOWN_BT;
 429   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
 430   bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
 431   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 432   bidi_it->next_for_neutral.type
 433     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 434 }
 435
 436 /* Push the current embedding level and override status; reset the
 437    current level to LEVEL and the current override status to OVERRIDE.  */
 438 static void
 439 bidi_push_embedding_level (struct bidi_it *bidi_it,
 440                            int level, bidi_dir_t override, bool isolate_status)
 441 {
 442   struct bidi_stack *st;
 443   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 444
 445   bidi_it->stack_idx++;
 446   eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
 447   st = &bidi_it->level_stack[bidi_it->stack_idx];
 448   eassert (level <= (1 << 7));
 449   st->level = level;
 450   st->override = override;
 451   st->isolate_status = isolate_status;
 452   if (isolate_status)
 453     {
 454       st->last_strong = bidi_it->last_strong;
 455       st->prev_for_neutral = bidi_it->prev_for_neutral;
 456       st->next_for_neutral = bidi_it->next_for_neutral;
 457       st->sos = bidi_it->sos;
 458     }
 459   /* We've got a new isolating sequence, compute the directional type
 460      of sos and initialize per-sequence variables (UAX#9, clause X10).  */
 461   bidi_set_sos_type (bidi_it, prev_level, level);
 462 }
 463
 464 /* Pop from the stack the embedding level, the directional override
 465    status, and optionally saved information for the isolating run
 466    sequence.  Return the new level.  */
 467 static int
 468 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 469 {
 470   int level;
 471
 472   /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
 473      and PDIs (X6a, 2nd bullet).  */
 474   if (bidi_it->stack_idx > 0)
 475     {
 476       bool isolate_status
 477         = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
 478       int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 479
 480       struct bidi_stack st;
 481
 482       st = bidi_it->level_stack[bidi_it->stack_idx];
 483       if (isolate_status)
 484         {
 485           /* PREV is used in W1 for resolving WEAK_NSM.  By the time
 486              we get to an NSM, we must have gotten past at least one
 487              character: the PDI that ends the isolate from which we
 488              are popping here.  So PREV will have been filled up by
 489              the time we first use it.  We initialize it here to
 490              UNKNOWN_BT to be able to catch any blunders in this
 491              logic.  */
 492           bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
 493           bidi_it->last_strong = st.last_strong;
 494           bidi_it->prev_for_neutral = st.prev_for_neutral;
 495           bidi_it->next_for_neutral = st.next_for_neutral;
 496           bidi_it->sos = st.sos;
 497         }
 498       else
 499         bidi_set_sos_type (bidi_it, old_level,
 500                            bidi_it->level_stack[bidi_it->stack_idx - 1].level);
 501
 502       bidi_it->stack_idx--;
 503     }
 504   level = bidi_it->level_stack[bidi_it->stack_idx].level;
 505   eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
 506   return level;
 507 }
 508
 509 /* Record in SAVED_INFO the information about the current character.  */
 510 static void
 511 bidi_remember_char (struct bidi_saved_info *saved_info,
 512                     struct bidi_it *bidi_it, bool from_type)
 513 {
 514   saved_info->charpos = bidi_it->charpos;
 515   if (from_type)
 516     saved_info->type = bidi_it->type;
 517   else
 518     saved_info->type = bidi_it->type_after_wn;
 519   bidi_check_type (saved_info->type);
 520   saved_info->orig_type = bidi_it->orig_type;
 521   bidi_check_type (saved_info->orig_type);
 522 }
 523
 524 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 525    copies the part of the level stack that is actually in use.  */
 526 static void
 527 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 528 {
 529   /* Copy everything from the start through the active part of
 530      the level stack.  */
 531   memcpy (to, from,
 532           (offsetof (struct bidi_it, level_stack[1])
 533            + from->stack_idx * sizeof from->level_stack[0]));
 534 }
 535
 536 \f
 537 /***********************************************************************
 538                         Caching the bidi iterator states
 539  ***********************************************************************/
 540
 541 /* We allocate and de-allocate the cache in chunks of this size (in
 542    characters).  200 was chosen as an upper limit for reasonably-long
 543    lines in a text file/buffer.  */
 544 #define BIDI_CACHE_CHUNK 200
 545 static struct bidi_it *bidi_cache;
 546 static ptrdiff_t bidi_cache_size = 0;
 547 enum { elsz = sizeof (struct bidi_it) };
 548 static ptrdiff_t bidi_cache_idx;        /* next unused cache slot */
 549 static ptrdiff_t bidi_cache_last_idx;   /* slot of last cache hit */
 550 static ptrdiff_t bidi_cache_start = 0;  /* start of cache for this
 551                                            "stack" level */
 552
 553 /* 5-slot stack for saving the start of the previous level of the
 554    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 555    and we need the same size of our stack.  */
 556 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
 557 static int bidi_cache_sp;
 558
 559 /* Size of header used by bidi_shelve_cache.  */
 560 enum
 561   {
 562     bidi_shelve_header_size
 563       = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
 564          + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
 565          + sizeof (bidi_cache_last_idx))
 566   };
 567
 568 /* Effectively remove the cached states beyond the Nth state from the
 569    part of the cache relevant to iteration of the current object
 570    (buffer or string).  */
 571 static void
 572 bidi_cache_reset_to (int n)
 573 {
 574   bidi_cache_idx = bidi_cache_start + n;
 575   bidi_cache_last_idx = -1;
 576 }
 577
 578 /* Reset the cache state to the empty state.  We only reset the part
 579    of the cache relevant to iteration of the current object.  Previous
 580    objects, which are pushed on the display iterator's stack, are left
 581    intact.  This is called when the cached information is no more
 582    useful for the current iteration, e.g. when we were reseated to a
 583    new position on the same object.  */
 584 static void
 585 bidi_cache_reset (void)
 586 {
 587   bidi_cache_reset_to (0);
 588 }
 589
 590 /* Shrink the cache to its minimal size.  Called when we init the bidi
 591    iterator for reordering a buffer or a string that does not come
 592    from display properties, because that means all the previously
 593    cached info is of no further use.  */
 594 static void
 595 bidi_cache_shrink (void)
 596 {
 597   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 598     {
 599       bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
 600       bidi_cache_size = BIDI_CACHE_CHUNK;
 601     }
 602   bidi_cache_reset ();
 603 }
 604
 605 static void
 606 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
 607 {
 608   int current_scan_dir = bidi_it->scan_dir;
 609
 610   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 611     emacs_abort ();
 612
 613   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 614   bidi_it->scan_dir = current_scan_dir;
 615   bidi_cache_last_idx = idx;
 616 }
 617
 618 /* Find a cached state with a given CHARPOS and resolved embedding
 619    level less or equal to LEVEL.  If LEVEL is -1, disregard the
 620    resolved levels in cached states.  DIR, if non-zero, means search
 621    in that direction from the last cache hit.  */
 622 static ptrdiff_t
 623 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
 624 {
 625   ptrdiff_t i, i_start;
 626
 627   if (bidi_cache_idx > bidi_cache_start)
 628     {
 629       if (bidi_cache_last_idx == -1)
 630         bidi_cache_last_idx = bidi_cache_idx - 1;
 631       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 632         {
 633           dir = -1;
 634           i_start = bidi_cache_last_idx - 1;
 635         }
 636       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 637                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 638         {
 639           dir = 1;
 640           i_start = bidi_cache_last_idx + 1;
 641         }
 642       else if (dir)
 643         i_start = bidi_cache_last_idx;
 644       else
 645         {
 646           dir = -1;
 647           i_start = bidi_cache_idx - 1;
 648         }
 649
 650       if (dir < 0)
 651         {
 652           /* Linear search for now; FIXME!  */
 653           for (i = i_start; i >= bidi_cache_start; i--)
 654             if (bidi_cache[i].charpos <= charpos
 655                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 656                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 657               return i;
 658         }
 659       else
 660         {
 661           for (i = i_start; i < bidi_cache_idx; i++)
 662             if (bidi_cache[i].charpos <= charpos
 663                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 664                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 665               return i;
 666         }
 667     }
 668
 669   return -1;
 670 }
 671
 672 /* Find a cached state where the resolved level changes to a value
 673    that is lower than LEVEL, and return its cache slot index.  DIR is
 674    the direction to search, starting with the last used cache slot.
 675    If DIR is zero, we search backwards from the last occupied cache
 676    slot.  BEFORE means return the index of the slot that
 677    is ``before'' the level change in the search direction.  That is,
 678    given the cached levels like this:
 679
 680          1122333442211
 681           AB        C
 682
 683    and assuming we are at the position cached at the slot marked with
 684    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 685    index of slot B or A, depending whether BEFORE is, respectively,
 686    true or false.  */
 687 static ptrdiff_t
 688 bidi_cache_find_level_change (int level, int dir, bool before)
 689 {
 690   if (bidi_cache_idx)
 691     {
 692       ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 693       int incr = before ? 1 : 0;
 694
 695       eassert (!dir || bidi_cache_last_idx >= 0);
 696
 697       if (!dir)
 698         dir = -1;
 699       else if (!incr)
 700         i += dir;
 701
 702       if (dir < 0)
 703         {
 704           while (i >= bidi_cache_start + incr)
 705             {
 706               if (bidi_cache[i - incr].resolved_level >= 0
 707                   && bidi_cache[i - incr].resolved_level < level)
 708                 return i;
 709               i--;
 710             }
 711         }
 712       else
 713         {
 714           while (i < bidi_cache_idx - incr)
 715             {
 716               if (bidi_cache[i + incr].resolved_level >= 0
 717                   && bidi_cache[i + incr].resolved_level < level)
 718                 return i;
 719               i++;
 720             }
 721         }
 722     }
 723
 724   return -1;
 725 }
 726
 727 static void
 728 bidi_cache_ensure_space (ptrdiff_t idx)
 729 {
 730   /* Enlarge the cache as needed.  */
 731   if (idx >= bidi_cache_size)
 732     {
 733       /* The bidi cache cannot be larger than the largest Lisp string
 734          or buffer.  */
 735       ptrdiff_t string_or_buffer_bound
 736         = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
 737
 738       /* Also, it cannot be larger than what C can represent.  */
 739       ptrdiff_t c_bound
 740         = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
 741
 742       bidi_cache
 743         = xpalloc (bidi_cache, &bidi_cache_size,
 744                    max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
 745                    min (string_or_buffer_bound, c_bound), elsz);
 746     }
 747 }
 748
 749 static void
 750 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
 751                            bool update_only)
 752 {
 753   ptrdiff_t idx;
 754
 755   /* We should never cache on backward scans.  */
 756   if (bidi_it->scan_dir == -1)
 757     emacs_abort ();
 758   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 759
 760   if (idx < 0 && update_only)
 761     return;
 762
 763   if (idx < 0)
 764     {
 765       idx = bidi_cache_idx;
 766       bidi_cache_ensure_space (idx);
 767       /* Character positions should correspond to cache positions 1:1.
 768          If we are outside the range of cached positions, the cache is
 769          useless and must be reset.  */
 770       if (idx > bidi_cache_start &&
 771           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 772                                + bidi_cache[idx - 1].nchars)
 773            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 774         {
 775           bidi_cache_reset ();
 776           idx = bidi_cache_start;
 777         }
 778       if (bidi_it->nchars <= 0)
 779         emacs_abort ();
 780       bidi_copy_it (&bidi_cache[idx], bidi_it);
 781       if (!resolved)
 782         bidi_cache[idx].resolved_level = -1;
 783     }
 784   else
 785     {
 786       /* Copy only the members which could have changed, to avoid
 787          costly copying of the entire struct.  */
 788       bidi_cache[idx].type = bidi_it->type;
 789       bidi_check_type (bidi_it->type);
 790       bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
 791       bidi_check_type (bidi_it->type_after_wn);
 792       if (resolved)
 793         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 794       else
 795         bidi_cache[idx].resolved_level = -1;
 796       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 797       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 798       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 799       bidi_cache[idx].disp_pos = bidi_it->disp_pos;
 800       bidi_cache[idx].disp_prop = bidi_it->disp_prop;
 801       bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
 802       bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
 803     }
 804
 805   bidi_cache_last_idx = idx;
 806   if (idx >= bidi_cache_idx)
 807     bidi_cache_idx = idx + 1;
 808 }
 809
 810 /* Look for a cached iterator state that corresponds to CHARPOS.  If
 811    found, copy the cached state into BIDI_IT and return the type of
 812    the cached entry.  If not found, return UNKNOWN_BT.  RESOLVED_ONLY
 813    zero means it is OK to return cached states that were not fully
 814    resolved yet.  This can happen if the state was cached before it
 815    was resolved in bidi_resolve_neutral.  */
 816 static bidi_type_t
 817 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
 818 {
 819   ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
 820
 821   if (i >= bidi_cache_start
 822       && (!resolved_only
 823           /* Callers that want only fully resolved states (and set
 824              resolved_only = true) need to be sure that there's enough
 825              info in the cached state to return the state as final,
 826              and if not, they don't want the cached state.  */
 827           || bidi_cache[i].resolved_level >= 0))
 828     {
 829       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 830
 831       bidi_copy_it (bidi_it, &bidi_cache[i]);
 832       bidi_cache_last_idx = i;
 833       /* Don't let scan direction from the cached state override
 834          the current scan direction.  */
 835       bidi_it->scan_dir = current_scan_dir;
 836       return bidi_it->type;
 837     }
 838
 839   return UNKNOWN_BT;
 840 }
 841
 842 static int
 843 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 844 {
 845   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 846     emacs_abort ();
 847   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 848 }
 849
 850 \f
 851 /***********************************************************************
 852              Pushing and popping the bidi iterator state
 853  ***********************************************************************/
 854
 855 /* Push the bidi iterator state in preparation for reordering a
 856    different object, e.g. display string found at certain buffer
 857    position.  Pushing the bidi iterator boils down to saving its
 858    entire state on the cache and starting a new cache "stacked" on top
 859    of the current cache.  */
 860 void
 861 bidi_push_it (struct bidi_it *bidi_it)
 862 {
 863   /* Save the current iterator state in its entirety after the last
 864      used cache slot.  */
 865   bidi_cache_ensure_space (bidi_cache_idx);
 866   bidi_cache[bidi_cache_idx++] = *bidi_it;
 867
 868   /* Push the current cache start onto the stack.  */
 869   eassert (bidi_cache_sp < IT_STACK_SIZE);
 870   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 871
 872   /* Start a new level of cache, and make it empty.  */
 873   bidi_cache_start = bidi_cache_idx;
 874   bidi_cache_last_idx = -1;
 875 }
 876
 877 /* Restore the iterator state saved by bidi_push_it and return the
 878    cache to the corresponding state.  */
 879 void
 880 bidi_pop_it (struct bidi_it *bidi_it)
 881 {
 882   if (bidi_cache_start <= 0)
 883     emacs_abort ();
 884
 885   /* Reset the next free cache slot index to what it was before the
 886      call to bidi_push_it.  */
 887   bidi_cache_idx = bidi_cache_start - 1;
 888
 889   /* Restore the bidi iterator state saved in the cache.  */
 890   *bidi_it = bidi_cache[bidi_cache_idx];
 891
 892   /* Pop the previous cache start from the stack.  */
 893   if (bidi_cache_sp <= 0)
 894     emacs_abort ();
 895   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 896
 897   /* Invalidate the last-used cache slot data.  */
 898   bidi_cache_last_idx = -1;
 899 }
 900
 901 static ptrdiff_t bidi_cache_total_alloc;
 902
 903 /* Stash away a copy of the cache and its control variables.  */
 904 void *
 905 bidi_shelve_cache (void)
 906 {
 907   unsigned char *databuf;
 908   ptrdiff_t alloc;
 909
 910   /* Empty cache.  */
 911   if (bidi_cache_idx == 0)
 912     return NULL;
 913
 914   alloc = (bidi_shelve_header_size
 915            + bidi_cache_idx * sizeof (struct bidi_it));
 916   databuf = xmalloc (alloc);
 917   bidi_cache_total_alloc += alloc;
 918
 919   memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
 920   memcpy (databuf + sizeof (bidi_cache_idx),
 921           bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
 922   memcpy (databuf + sizeof (bidi_cache_idx)
 923           + bidi_cache_idx * sizeof (struct bidi_it),
 924           bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
 925   memcpy (databuf + sizeof (bidi_cache_idx)
 926           + bidi_cache_idx * sizeof (struct bidi_it)
 927           + sizeof (bidi_cache_start_stack),
 928           &bidi_cache_sp, sizeof (bidi_cache_sp));
 929   memcpy (databuf + sizeof (bidi_cache_idx)
 930           + bidi_cache_idx * sizeof (struct bidi_it)
 931           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 932           &bidi_cache_start, sizeof (bidi_cache_start));
 933   memcpy (databuf + sizeof (bidi_cache_idx)
 934           + bidi_cache_idx * sizeof (struct bidi_it)
 935           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 936           + sizeof (bidi_cache_start),
 937           &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
 938
 939   return databuf;
 940 }
 941
 942 /* Restore the cache state from a copy stashed away by
 943    bidi_shelve_cache, and free the buffer used to stash that copy.
 944    JUST_FREE means free the buffer, but don't restore the
 945    cache; used when the corresponding iterator is discarded instead of
 946    being restored.  */
 947 void
 948 bidi_unshelve_cache (void *databuf, bool just_free)
 949 {
 950   unsigned char *p = databuf;
 951
 952   if (!p)
 953     {
 954       if (!just_free)
 955         {
 956           /* A NULL pointer means an empty cache.  */
 957           bidi_cache_start = 0;
 958           bidi_cache_sp = 0;
 959           bidi_cache_reset ();
 960         }
 961     }
 962   else
 963     {
 964       if (just_free)
 965         {
 966           ptrdiff_t idx;
 967
 968           memcpy (&idx, p, sizeof (bidi_cache_idx));
 969           bidi_cache_total_alloc
 970             -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
 971         }
 972       else
 973         {
 974           memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
 975           bidi_cache_ensure_space (bidi_cache_idx);
 976           memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
 977                   bidi_cache_idx * sizeof (struct bidi_it));
 978           memcpy (bidi_cache_start_stack,
 979                   p + sizeof (bidi_cache_idx)
 980                   + bidi_cache_idx * sizeof (struct bidi_it),
 981                   sizeof (bidi_cache_start_stack));
 982           memcpy (&bidi_cache_sp,
 983                   p + sizeof (bidi_cache_idx)
 984                   + bidi_cache_idx * sizeof (struct bidi_it)
 985                   + sizeof (bidi_cache_start_stack),
 986                   sizeof (bidi_cache_sp));
 987           memcpy (&bidi_cache_start,
 988                   p + sizeof (bidi_cache_idx)
 989                   + bidi_cache_idx * sizeof (struct bidi_it)
 990                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 991                   sizeof (bidi_cache_start));
 992           memcpy (&bidi_cache_last_idx,
 993                   p + sizeof (bidi_cache_idx)
 994                   + bidi_cache_idx * sizeof (struct bidi_it)
 995                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 996                   + sizeof (bidi_cache_start),
 997                   sizeof (bidi_cache_last_idx));
 998           bidi_cache_total_alloc
 999             -= (bidi_shelve_header_size
1000                 + bidi_cache_idx * sizeof (struct bidi_it));
1001         }
1002
1003       xfree (p);
1004     }
1005 }
1006
1007 \f
1008 /***********************************************************************
1009                         Initialization
1010  ***********************************************************************/
1011 static void
1012 bidi_initialize (void)
1013 {
1014   bidi_type_table = uniprop_table (intern ("bidi-class"));
1015   if (NILP (bidi_type_table))
1016     emacs_abort ();
1017   staticpro (&bidi_type_table);
1018
1019   bidi_mirror_table = uniprop_table (intern ("mirroring"));
1020   if (NILP (bidi_mirror_table))
1021     emacs_abort ();
1022   staticpro (&bidi_mirror_table);
1023
1024   bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1025   if (NILP (bidi_brackets_table))
1026     emacs_abort ();
1027   staticpro (&bidi_brackets_table);
1028
1029   Qparagraph_start = intern ("paragraph-start");
1030   staticpro (&Qparagraph_start);
1031   paragraph_start_re = Fsymbol_value (Qparagraph_start);
1032   if (!STRINGP (paragraph_start_re))
1033     paragraph_start_re = build_string ("\f\\|[ \t]*$");
1034   staticpro (&paragraph_start_re);
1035   Qparagraph_separate = intern ("paragraph-separate");
1036   staticpro (&Qparagraph_separate);
1037   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
1038   if (!STRINGP (paragraph_separate_re))
1039     paragraph_separate_re = build_string ("[ \t\f]*$");
1040   staticpro (&paragraph_separate_re);
1041
1042   bidi_cache_sp = 0;
1043   bidi_cache_total_alloc = 0;
1044
1045   bidi_initialized = 1;
1046 }
1047
1048 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1049    end.  */
1050 static void
1051 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1052 {
1053   bidi_it->invalid_levels = 0;
1054   bidi_it->invalid_isolates = 0;
1055   bidi_it->stack_idx = 0;
1056   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1057 }
1058
1059 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
1060 void
1061 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1062               struct bidi_it *bidi_it)
1063 {
1064   if (! bidi_initialized)
1065     bidi_initialize ();
1066   if (charpos >= 0)
1067     bidi_it->charpos = charpos;
1068   if (bytepos >= 0)
1069     bidi_it->bytepos = bytepos;
1070   bidi_it->frame_window_p = frame_window_p;
1071   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1072   bidi_it->first_elt = 1;
1073   bidi_set_paragraph_end (bidi_it);
1074   bidi_it->new_paragraph = 1;
1075   bidi_it->separator_limit = -1;
1076   bidi_it->type = NEUTRAL_B;
1077   bidi_it->type_after_wn = NEUTRAL_B;
1078   bidi_it->orig_type = NEUTRAL_B;
1079   /* FIXME: Review this!!! */
1080   bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1081   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1082   bidi_it->next_for_neutral.charpos = -1;
1083   bidi_it->next_for_neutral.type
1084     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1085   bidi_it->prev_for_neutral.charpos = -1;
1086   bidi_it->prev_for_neutral.type
1087     = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1088   bidi_it->bracket_pairing_pos = -1;
1089   bidi_it->sos = L2R;    /* FIXME: should it be user-selectable? */
1090   bidi_it->disp_pos = -1;       /* invalid/unknown */
1091   bidi_it->disp_prop = 0;
1092   /* We can only shrink the cache if we are at the bottom level of its
1093      "stack".  */
1094   if (bidi_cache_start == 0)
1095     bidi_cache_shrink ();
1096   else
1097     bidi_cache_reset ();
1098 }
1099
1100 /* Perform initializations for reordering a new line of bidi text.  */
1101 static void
1102 bidi_line_init (struct bidi_it *bidi_it)
1103 {
1104   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1105   bidi_it->stack_idx = 0;
1106   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1107   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1108   bidi_it->level_stack[0].isolate_status = false; /* X1 */
1109   bidi_it->invalid_levels = 0;
1110   bidi_it->isolate_level = 0;    /* X1 */
1111   bidi_it->invalid_isolates = 0; /* X1 */
1112   /* Setting this to zero will force its recomputation the first time
1113      we need it for W5.  */
1114   bidi_it->next_en_pos = 0;
1115   bidi_it->next_en_type = UNKNOWN_BT;
1116   bidi_it->next_for_ws.charpos = -1;
1117   bidi_it->next_for_ws.type = UNKNOWN_BT;
1118   bidi_it->bracket_pairing_pos = -1;
1119   bidi_set_sos_type (bidi_it,
1120                      (bidi_it->paragraph_dir == R2L ? 1 : 0),
1121                      bidi_it->level_stack[0].level); /* X10 */
1122
1123   bidi_cache_reset ();
1124 }
1125
1126 \f
1127 /***********************************************************************
1128                         Fetching characters
1129  ***********************************************************************/
1130
1131 /* Count bytes in string S between BEG/BEGBYTE and END.  BEG and END
1132    are zero-based character positions in S, BEGBYTE is byte position
1133    corresponding to BEG.  UNIBYTE means S is a unibyte string.  */
1134 static ptrdiff_t
1135 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1136                   ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1137 {
1138   ptrdiff_t pos = beg;
1139   const unsigned char *p = s + begbyte, *start = p;
1140
1141   if (unibyte)
1142     p = s + end;
1143   else
1144     {
1145       if (!CHAR_HEAD_P (*p))
1146         emacs_abort ();
1147
1148       while (pos < end)
1149         {
1150           p += BYTES_BY_CHAR_HEAD (*p);
1151           pos++;
1152         }
1153     }
1154
1155   return p - start;
1156 }
1157
1158 /* Fetch and return the character at byte position BYTEPOS.  If S is
1159    non-NULL, fetch the character from string S; otherwise fetch the
1160    character from the current buffer.  UNIBYTE means S is a
1161    unibyte string.  */
1162 static int
1163 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1164 {
1165   if (s)
1166     {
1167       s += bytepos;
1168       if (unibyte)
1169         return *s;
1170     }
1171   else
1172     s = BYTE_POS_ADDR (bytepos);
1173   return STRING_CHAR (s);
1174 }
1175
1176 /* Fetch and return the character at CHARPOS/BYTEPOS.  If that
1177    character is covered by a display string, treat the entire run of
1178    covered characters as a single character, either u+2029 or u+FFFC,
1179    and return their combined length in CH_LEN and NCHARS.  DISP_POS
1180    specifies the character position of the next display string, or -1
1181    if not yet computed.  When the next character is at or beyond that
1182    position, the function updates DISP_POS with the position of the
1183    next display string.  *DISP_PROP non-zero means that there's really
1184    a display string at DISP_POS, as opposed to when we searched till
1185    DISP_POS without finding one.  If *DISP_PROP is 2, it means the
1186    display spec is of the form `(space ...)', which is replaced with
1187    u+2029 to handle it as a paragraph separator.  STRING->s is the C
1188    string to iterate, or NULL if iterating over a buffer or a Lisp
1189    string; in the latter case, STRING->lstring is the Lisp string.  */
1190 static int
1191 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1192                  int *disp_prop, struct bidi_string_data *string,
1193                  struct window *w,
1194                  bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1195 {
1196   int ch;
1197   ptrdiff_t endpos
1198     = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1199   struct text_pos pos;
1200   int len;
1201
1202   /* If we got past the last known position of display string, compute
1203      the position of the next one.  That position could be at CHARPOS.  */
1204   if (charpos < endpos && charpos > *disp_pos)
1205     {
1206       SET_TEXT_POS (pos, charpos, bytepos);
1207       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1208                                               disp_prop);
1209     }
1210
1211   /* Fetch the character at BYTEPOS.  */
1212   if (charpos >= endpos)
1213     {
1214       ch = BIDI_EOB;
1215       *ch_len = 1;
1216       *nchars = 1;
1217       *disp_pos = endpos;
1218       *disp_prop = 0;
1219     }
1220   else if (charpos >= *disp_pos && *disp_prop)
1221     {
1222       ptrdiff_t disp_end_pos;
1223
1224       /* We don't expect to find ourselves in the middle of a display
1225          property.  Hopefully, it will never be needed.  */
1226       if (charpos > *disp_pos)
1227         emacs_abort ();
1228       /* Text covered by `display' properties and overlays with
1229          display properties or display strings is handled as a single
1230          character that represents the entire run of characters
1231          covered by the display property.  */
1232       if (*disp_prop == 2)
1233         {
1234           /* `(space ...)' display specs are handled as paragraph
1235              separators for the purposes of the reordering; see UAX#9
1236              section 3 and clause HL1 in section 4.3 there.  */
1237           ch = 0x2029;
1238         }
1239       else
1240         {
1241           /* All other display specs are handled as the Unicode Object
1242              Replacement Character.  */
1243           ch = 0xFFFC;
1244         }
1245       disp_end_pos = compute_display_string_end (*disp_pos, string);
1246       if (disp_end_pos < 0)
1247         {
1248           /* Somebody removed the display string from the buffer
1249              behind our back.  Recover by processing this buffer
1250              position as if no display property were present there to
1251              begin with.  */
1252           *disp_prop = 0;
1253           goto normal_char;
1254         }
1255       *nchars = disp_end_pos - *disp_pos;
1256       if (*nchars <= 0)
1257         emacs_abort ();
1258       if (string->s)
1259         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1260                                     disp_end_pos, string->unibyte);
1261       else if (STRINGP (string->lstring))
1262         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1263                                     bytepos, disp_end_pos, string->unibyte);
1264       else
1265         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1266     }
1267   else
1268     {
1269     normal_char:
1270       if (string->s)
1271         {
1272
1273           if (!string->unibyte)
1274             {
1275               ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1276               *ch_len = len;
1277             }
1278           else
1279             {
1280               ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1281               *ch_len = 1;
1282             }
1283         }
1284       else if (STRINGP (string->lstring))
1285         {
1286           if (!string->unibyte)
1287             {
1288               ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1289                                            len);
1290               *ch_len = len;
1291             }
1292           else
1293             {
1294               ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1295               *ch_len = 1;
1296             }
1297         }
1298       else
1299         {
1300           ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1301           *ch_len = len;
1302         }
1303       *nchars = 1;
1304     }
1305
1306   /* If we just entered a run of characters covered by a display
1307      string, compute the position of the next display string.  */
1308   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1309       && *disp_prop)
1310     {
1311       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1312       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1313                                               disp_prop);
1314     }
1315
1316   return ch;
1317 }
1318
1319 /* Like bidi_fetch_char, but ignore any text between an isolate
1320    initiator and its matching PDI or, if it has no matching PDI, the
1321    end of the paragraph.  If isolates were skipped, CH_LEN and NCHARS
1322    are set to the number of bytes and characters between BYTEPOS/CHARPOS
1323    and the character that was fetched after skipping the isolates.  */
1324 static int
1325 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1326                                ptrdiff_t *disp_pos, int *disp_prop,
1327                                struct bidi_string_data *string,
1328                                struct window *w, bool frame_window_p,
1329                                ptrdiff_t *ch_len, ptrdiff_t *nchars)
1330 {
1331   ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1332   int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1333                             frame_window_p, ch_len, nchars);
1334   bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1335   ptrdiff_t level = 0;
1336
1337   if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1338     {
1339       level++;
1340       while (level > 0 && ch_type != NEUTRAL_B)
1341         {
1342           charpos += *nchars;
1343           bytepos += *ch_len;
1344           ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1345                                 w, frame_window_p, ch_len, nchars);
1346           ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1347           /* A Note to P2 says to ignore max_depth limit.  */
1348           if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1349             level++;
1350           else if (ch_type == PDI)
1351             level--;
1352         }
1353     }
1354
1355   /* Communicate to the caller how much did we skip, so it could get
1356      past the last character position we examined.  */
1357   *nchars += charpos - orig_charpos;
1358   *ch_len += bytepos - orig_bytepos;
1359   return ch;
1360 }
1361
1362
1363 \f
1364 /***********************************************************************
1365                         Determining paragraph direction
1366  ***********************************************************************/
1367
1368 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1369    Value is the non-negative length of the paragraph separator
1370    following the buffer position, -1 if position is at the beginning
1371    of a new paragraph, or -2 if position is neither at beginning nor
1372    at end of a paragraph.  */
1373 static ptrdiff_t
1374 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1375 {
1376   Lisp_Object sep_re;
1377   Lisp_Object start_re;
1378   ptrdiff_t val;
1379
1380   sep_re = paragraph_separate_re;
1381   start_re = paragraph_start_re;
1382
1383   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1384   if (val < 0)
1385     {
1386       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1387         val = -1;
1388       else
1389         val = -2;
1390     }
1391
1392   return val;
1393 }
1394
1395 /* If the user has requested the long scans caching, make sure that
1396    BIDI cache is enabled.  Otherwise, make sure it's disabled.  */
1397
1398 static struct region_cache *
1399 bidi_paragraph_cache_on_off (void)
1400 {
1401   struct buffer *cache_buffer = current_buffer;
1402   bool indirect_p = false;
1403
1404   /* For indirect buffers, make sure to use the cache of their base
1405      buffer.  */
1406   if (cache_buffer->base_buffer)
1407     {
1408       cache_buffer = cache_buffer->base_buffer;
1409       indirect_p = true;
1410     }
1411
1412   /* Don't turn on or off the cache in the base buffer, if the value
1413      of cache-long-scans of the base buffer is inconsistent with that.
1414      This is because doing so will just make the cache pure overhead,
1415      since if we turn it on via indirect buffer, it will be
1416      immediately turned off by its base buffer.  */
1417   if (NILP (BVAR (current_buffer, cache_long_scans)))
1418     {
1419       if (!indirect_p
1420           || NILP (BVAR (cache_buffer, cache_long_scans)))
1421         {
1422           if (cache_buffer->bidi_paragraph_cache)
1423             {
1424               free_region_cache (cache_buffer->bidi_paragraph_cache);
1425               cache_buffer->bidi_paragraph_cache = 0;
1426             }
1427         }
1428       return NULL;
1429     }
1430   else
1431     {
1432       if (!indirect_p
1433           || !NILP (BVAR (cache_buffer, cache_long_scans)))
1434         {
1435           if (!cache_buffer->bidi_paragraph_cache)
1436             cache_buffer->bidi_paragraph_cache = new_region_cache ();
1437         }
1438       return cache_buffer->bidi_paragraph_cache;
1439     }
1440 }
1441
1442 /* On my 2005-vintage machine, searching back for paragraph start
1443    takes ~1 ms per line.  And bidi_paragraph_init is called 4 times
1444    when user types C-p.  The number below limits each call to
1445    bidi_paragraph_init to about 10 ms.  */
1446 #define MAX_PARAGRAPH_SEARCH 7500
1447
1448 /* Find the beginning of this paragraph by looking back in the buffer.
1449    Value is the byte position of the paragraph's beginning, or
1450    BEGV_BYTE if paragraph_start_re is still not found after looking
1451    back MAX_PARAGRAPH_SEARCH lines in the buffer.  */
1452 static ptrdiff_t
1453 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1454 {
1455   Lisp_Object re = paragraph_start_re;
1456   ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1457   struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1458   ptrdiff_t n = 0, oldpos = pos, next;
1459   struct buffer *cache_buffer = current_buffer;
1460
1461   if (cache_buffer->base_buffer)
1462     cache_buffer = cache_buffer->base_buffer;
1463
1464   while (pos_byte > BEGV_BYTE
1465          && n++ < MAX_PARAGRAPH_SEARCH
1466          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1467     {
1468       /* FIXME: What if the paragraph beginning is covered by a
1469          display string?  And what if a display string covering some
1470          of the text over which we scan back includes
1471          paragraph_start_re?  */
1472       DEC_BOTH (pos, pos_byte);
1473       if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1474         {
1475           pos = next, pos_byte = CHAR_TO_BYTE (pos);
1476           break;
1477         }
1478       else
1479         pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1480     }
1481   if (n >= MAX_PARAGRAPH_SEARCH)
1482     pos = BEGV, pos_byte = BEGV_BYTE;
1483   if (bpc)
1484     know_region_cache (cache_buffer, bpc, pos, oldpos);
1485   /* Positions returned by the region cache are not limited to
1486      BEGV..ZV range, so we limit them here.  */
1487   pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1488   return pos_byte;
1489 }
1490
1491 /* On a 3.4 GHz machine, searching forward for a strong directional
1492    character in a long paragraph full of weaks or neutrals takes about
1493    1 ms for each 20K characters.  The number below limits each call to
1494    bidi_paragraph_init to less than 10 ms even on slow machines.  */
1495 #define MAX_STRONG_CHAR_SEARCH 100000
1496
1497 /* Starting from POS, find the first strong (L, R, or AL) character,
1498    while skipping over any characters between an isolate initiator and
1499    its matching PDI.  STOP_AT_PDI non-zero means stop at the PDI that
1500    matches the isolate initiator at POS.  Return the bidi type of the
1501    character where the search stopped.  Give up if after examining
1502    MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1503    character was found.  */
1504 static bidi_type_t
1505 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1506                         ptrdiff_t *disp_pos, int *disp_prop,
1507                         struct bidi_string_data *string, struct window *w,
1508                         bool string_p, bool frame_window_p,
1509                         ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1510 {
1511   ptrdiff_t pos1;
1512   bidi_type_t type;
1513   int ch;
1514
1515   if (stop_at_pdi)
1516     {
1517       /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1518          at POS.  Get past it.  */
1519 #ifdef ENABLE_CHECKING
1520       ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1521                             frame_window_p, ch_len, nchars);
1522       type = bidi_get_type (ch, NEUTRAL_DIR);
1523       eassert (type == FSI /* || type == LRI || type == RLI */);
1524 #endif
1525       pos += *nchars;
1526       bytepos += *ch_len;
1527     }
1528   ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1529                                       w, frame_window_p, ch_len, nchars);
1530   type = bidi_get_type (ch, NEUTRAL_DIR);
1531
1532   pos1 = pos;
1533   for (pos += *nchars, bytepos += *ch_len;
1534        bidi_get_category (type) != STRONG
1535          /* If requested to stop at first PDI, stop there.  */
1536          && !(stop_at_pdi && type == PDI)
1537          /* Stop when searched too far into an abnormally large
1538             paragraph full of weak or neutral characters.  */
1539          && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1540        type = bidi_get_type (ch, NEUTRAL_DIR))
1541     {
1542       if (pos >= end)
1543         {
1544           /* Pretend there's a paragraph separator at end of
1545              buffer/string.  */
1546           type = NEUTRAL_B;
1547           break;
1548         }
1549       if (!string_p
1550           && type == NEUTRAL_B
1551           && bidi_at_paragraph_end (pos, bytepos) >= -1)
1552         break;
1553       /* Fetch next character and advance to get past it.  */
1554       ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1555                                           string, w, frame_window_p,
1556                                           ch_len, nchars);
1557       pos += *nchars;
1558       bytepos += *ch_len;
1559     }
1560   return type;
1561 }
1562
1563 /* Determine the base direction, a.k.a. base embedding level, of the
1564    paragraph we are about to iterate through.  If DIR is either L2R or
1565    R2L, just use that.  Otherwise, determine the paragraph direction
1566    from the first strong directional character of the paragraph.
1567
1568    NO_DEFAULT_P means don't default to L2R if the paragraph
1569    has no strong directional characters and both DIR and
1570    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
1571    in the buffer until a paragraph is found with a strong character,
1572    or until hitting BEGV.  In the latter case, fall back to L2R.  This
1573    flag is used in current-bidi-paragraph-direction.
1574
1575    Note that this function gives the paragraph separator the same
1576    direction as the preceding paragraph, even though Emacs generally
1577    views the separator as not belonging to any paragraph.  */
1578 void
1579 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1580 {
1581   ptrdiff_t bytepos = bidi_it->bytepos;
1582   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1583   ptrdiff_t pstartbyte;
1584   /* Note that begbyte is a byte position, while end is a character
1585      position.  Yes, this is ugly, but we are trying to avoid costly
1586      calls to BYTE_TO_CHAR and its ilk.  */
1587   ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1588   ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1589
1590   /* Special case for an empty buffer. */
1591   if (bytepos == begbyte && bidi_it->charpos == end)
1592     dir = L2R;
1593   /* We should never be called at EOB or before BEGV.  */
1594   else if (bidi_it->charpos >= end || bytepos < begbyte)
1595     emacs_abort ();
1596
1597   if (dir == L2R)
1598     {
1599       bidi_it->paragraph_dir = L2R;
1600       bidi_it->new_paragraph = 0;
1601     }
1602   else if (dir == R2L)
1603     {
1604       bidi_it->paragraph_dir = R2L;
1605       bidi_it->new_paragraph = 0;
1606     }
1607   else if (dir == NEUTRAL_DIR)  /* P2 */
1608     {
1609       ptrdiff_t ch_len, nchars;
1610       ptrdiff_t pos, disp_pos = -1;
1611       int disp_prop = 0;
1612       bidi_type_t type;
1613       const unsigned char *s;
1614
1615       if (!bidi_initialized)
1616         bidi_initialize ();
1617
1618       /* If we are inside a paragraph separator, we are just waiting
1619          for the separator to be exhausted; use the previous paragraph
1620          direction.  But don't do that if we have been just reseated,
1621          because we need to reinitialize below in that case.  */
1622       if (!bidi_it->first_elt
1623           && bidi_it->charpos < bidi_it->separator_limit)
1624         return;
1625
1626       /* If we are on a newline, get past it to where the next
1627          paragraph might start.  But don't do that at BEGV since then
1628          we are potentially in a new paragraph that doesn't yet
1629          exist.  */
1630       pos = bidi_it->charpos;
1631       s = (STRINGP (bidi_it->string.lstring)
1632            ? SDATA (bidi_it->string.lstring)
1633            : bidi_it->string.s);
1634       if (bytepos > begbyte
1635           && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1636         {
1637           bytepos++;
1638           pos++;
1639         }
1640
1641       /* We are either at the beginning of a paragraph or in the
1642          middle of it.  Find where this paragraph starts.  */
1643       if (string_p)
1644         {
1645           /* We don't support changes of paragraph direction inside a
1646              string.  It is treated as a single paragraph.  */
1647           pstartbyte = 0;
1648         }
1649       else
1650         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1651       bidi_it->separator_limit = -1;
1652       bidi_it->new_paragraph = 0;
1653
1654       /* The following loop is run more than once only if NO_DEFAULT_P,
1655          and only if we are iterating on a buffer.  */
1656       do {
1657         bytepos = pstartbyte;
1658         if (!string_p)
1659           pos = BYTE_TO_CHAR (bytepos);
1660         type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1661                                        &bidi_it->string, bidi_it->w,
1662                                        string_p, bidi_it->frame_window_p,
1663                                        &ch_len, &nchars, false);
1664         if (type == STRONG_R || type == STRONG_AL) /* P3 */
1665           bidi_it->paragraph_dir = R2L;
1666         else if (type == STRONG_L)
1667           bidi_it->paragraph_dir = L2R;
1668         if (!string_p
1669             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1670           {
1671             /* If this paragraph is at BEGV, default to L2R.  */
1672             if (pstartbyte == BEGV_BYTE)
1673               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1674             else
1675               {
1676                 ptrdiff_t prevpbyte = pstartbyte;
1677                 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1678
1679                 /* Find the beginning of the previous paragraph, if any.  */
1680                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1681                   {
1682                     /* FXIME: What if p is covered by a display
1683                        string?  See also a FIXME inside
1684                        bidi_find_paragraph_start.  */
1685                     DEC_BOTH (p, pbyte);
1686                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1687                   }
1688                 pstartbyte = prevpbyte;
1689               }
1690           }
1691       } while (!string_p
1692                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1693     }
1694   else
1695     emacs_abort ();
1696
1697   /* Contrary to UAX#9 clause P3, we only default the paragraph
1698      direction to L2R if we have no previous usable paragraph
1699      direction.  This is allowed by the HL1 clause.  */
1700   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1701     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1702   if (bidi_it->paragraph_dir == R2L)
1703     bidi_it->level_stack[0].level = 1;
1704   else
1705     bidi_it->level_stack[0].level = 0;
1706
1707   bidi_line_init (bidi_it);
1708 }
1709
1710 \f
1711 /***********************************************************************
1712                  Resolving explicit and implicit levels.
1713   The rest of this file constitutes the core of the UBA implementation.
1714  ***********************************************************************/
1715
1716 static bool
1717 bidi_explicit_dir_char (int ch)
1718 {
1719   bidi_type_t ch_type;
1720
1721   if (!bidi_initialized)
1722     emacs_abort ();
1723   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1724   return (ch_type == LRE || ch_type == LRO
1725           || ch_type == RLE || ch_type == RLO
1726           || ch_type == PDF);
1727 }
1728
1729 /* Given an iterator state in BIDI_IT, advance one character position
1730    in the buffer/string to the next character (in the logical order),
1731    resolve any explicit embeddings, directional overrides, and isolate
1732    initiators and terminators, and return the embedding level of the
1733    character after resolving these explicit directives.  */
1734 static int
1735 bidi_resolve_explicit (struct bidi_it *bidi_it)
1736 {
1737   int curchar;
1738   bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1739   int current_level;
1740   int new_level;
1741   bidi_dir_t override;
1742   bool isolate_status;
1743   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1744   ptrdiff_t ch_len, nchars, disp_pos, end;
1745   int disp_prop;
1746   ptrdiff_t eob
1747     = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1748        ? bidi_it->string.schars : ZV);
1749
1750   /* Record the info about the previous character.  */
1751   if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1752       && bidi_it->type != WEAK_BN)
1753     {
1754       /* This special case is needed in support of Unicode 8.0
1755          correction to N0, as implemented in bidi_resolve_weak/W1
1756          below.  */
1757       if (bidi_it->type_after_wn == NEUTRAL_ON
1758           && bidi_get_category (bidi_it->type) == STRONG
1759           && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1760         bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1761       else
1762         bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1763     }
1764   if (bidi_it->type_after_wn == STRONG_R
1765       || bidi_it->type_after_wn == STRONG_L
1766       || bidi_it->type_after_wn == STRONG_AL)
1767     bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1768   if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1769       || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1770     bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1771
1772   /* If we overstepped the characters used for resolving neutrals
1773      and whitespace, invalidate their info in the iterator.  */
1774   if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1775     {
1776       bidi_it->next_for_neutral.type = UNKNOWN_BT;
1777       /* If needed, reset the "magical" value of pairing bracket
1778          position, so that bidi_resolve_brackets will resume
1779          resolution of brackets according to BPA.  */
1780       if (bidi_it->bracket_pairing_pos == eob)
1781         bidi_it->bracket_pairing_pos = -1;
1782     }
1783   if (bidi_it->next_en_pos >= 0
1784       && bidi_it->charpos >= bidi_it->next_en_pos)
1785     {
1786       bidi_it->next_en_pos = 0;
1787       bidi_it->next_en_type = UNKNOWN_BT;
1788     }
1789
1790   /* Reset the bracket resolution info, unless we previously decided
1791      (in bidi_find_bracket_pairs) that brackets in this level run
1792      should be resolved as neutrals.  */
1793   if (bidi_it->bracket_pairing_pos != eob)
1794     {
1795       bidi_it->bracket_pairing_pos = -1;
1796       bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1797     }
1798
1799   /* If reseat()'ed, don't advance, so as to start iteration from the
1800      position where we were reseated.  bidi_it->bytepos can be less
1801      than BEGV_BYTE after reseat to BEGV.  */
1802   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1803       || bidi_it->first_elt)
1804     {
1805       bidi_it->first_elt = 0;
1806       if (string_p)
1807         {
1808           const unsigned char *p
1809             = (STRINGP (bidi_it->string.lstring)
1810                ? SDATA (bidi_it->string.lstring)
1811                : bidi_it->string.s);
1812
1813           if (bidi_it->charpos < 0)
1814             bidi_it->charpos = bidi_it->bytepos = 0;
1815           eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1816                                                          bidi_it->charpos,
1817                                                          bidi_it->string.unibyte));
1818         }
1819       else
1820         {
1821           if (bidi_it->charpos < BEGV)
1822             {
1823               bidi_it->charpos = BEGV;
1824               bidi_it->bytepos = BEGV_BYTE;
1825             }
1826           eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1827         }
1828       /* Determine the original bidi type of the previous character,
1829          which is needed for handling isolate initiators and PDF.  The
1830          type of the previous character will be non-trivial only if
1831          our caller moved through some previous text in
1832          get_visually_first_element, in which case bidi_it->prev holds
1833          the information we want.  */
1834       if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1835         {
1836           eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1837           prev_type = bidi_it->prev.orig_type;
1838           if (prev_type == FSI)
1839             prev_type = bidi_it->type_after_wn;
1840         }
1841     }
1842   /* Don't move at end of buffer/string.  */
1843   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1844     {
1845       /* Advance to the next character, skipping characters covered by
1846          display strings (nchars > 1).  */
1847       if (bidi_it->nchars <= 0)
1848         emacs_abort ();
1849       bidi_it->charpos += bidi_it->nchars;
1850       if (bidi_it->ch_len == 0)
1851         emacs_abort ();
1852       bidi_it->bytepos += bidi_it->ch_len;
1853       prev_type = bidi_it->orig_type;
1854       if (prev_type == FSI)
1855         prev_type = bidi_it->type_after_wn;
1856     }
1857   else  /* EOB or end of string */
1858     prev_type = NEUTRAL_B;
1859
1860   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1861   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1862   isolate_status = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
1863   new_level = current_level;
1864
1865   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1866     {
1867       curchar = BIDI_EOB;
1868       bidi_it->ch_len = 1;
1869       bidi_it->nchars = 1;
1870       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1871       bidi_it->disp_prop = 0;
1872     }
1873   else
1874     {
1875       /* LRI, RLI, and FSI increment, and PDF decrements, the
1876          embedding level of the _following_ characters, so we must
1877          first look at the type of the previous character to support
1878          that.  */
1879       switch (prev_type)
1880         {
1881         case RLI:       /* X5a */
1882           if (current_level < BIDI_MAXDEPTH
1883               && bidi_it->invalid_levels == 0
1884               && bidi_it->invalid_isolates == 0)
1885             {
1886               new_level = ((current_level + 1) & ~1) + 1;
1887               bidi_it->isolate_level++;
1888               bidi_push_embedding_level (bidi_it, new_level,
1889                                          NEUTRAL_DIR, true);
1890             }
1891           else
1892             bidi_it->invalid_isolates++;
1893           break;
1894         case LRI:       /* X5b */
1895           if (current_level < BIDI_MAXDEPTH - 1
1896               && bidi_it->invalid_levels == 0
1897               && bidi_it->invalid_isolates == 0)
1898             {
1899               new_level = ((current_level + 2) & ~1);
1900               bidi_it->isolate_level++;
1901               bidi_push_embedding_level (bidi_it, new_level,
1902                                          NEUTRAL_DIR, true);
1903             }
1904           else
1905             bidi_it->invalid_isolates++;
1906           break;
1907         case PDF:       /* X7 */
1908           if (!bidi_it->invalid_isolates)
1909             {
1910               if (bidi_it->invalid_levels)
1911                 bidi_it->invalid_levels--;
1912               else if (!isolate_status && bidi_it->stack_idx >= 1)
1913                 new_level = bidi_pop_embedding_level (bidi_it);
1914             }
1915           break;
1916         default:
1917           eassert (prev_type != FSI);
1918           /* Nothing.  */
1919           break;
1920         }
1921       /* Fetch the character at BYTEPOS.  If it is covered by a
1922          display string, treat the entire run of covered characters as
1923          a single character u+FFFC.  */
1924       curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1925                                  &bidi_it->disp_pos, &bidi_it->disp_prop,
1926                                  &bidi_it->string, bidi_it->w,
1927                                  bidi_it->frame_window_p,
1928                                  &bidi_it->ch_len, &bidi_it->nchars);
1929     }
1930   bidi_it->ch = curchar;
1931   bidi_it->resolved_level = new_level;
1932
1933   /* Don't apply directional override here, as all the types we handle
1934      below will not be affected by the override anyway, and we need
1935      the original type unaltered.  The override will be applied in
1936      bidi_resolve_weak.  */
1937   type = bidi_get_type (curchar, NEUTRAL_DIR);
1938   bidi_it->orig_type = type;
1939   bidi_check_type (bidi_it->orig_type);
1940
1941   bidi_it->type_after_wn = UNKNOWN_BT;
1942
1943   switch (type)
1944     {
1945     case RLE:   /* X2 */
1946     case RLO:   /* X4 */
1947       bidi_it->type_after_wn = type;
1948       bidi_check_type (bidi_it->type_after_wn);
1949       type = WEAK_BN; /* X9/Retaining */
1950       if (new_level < BIDI_MAXDEPTH
1951           && bidi_it->invalid_levels == 0
1952           && bidi_it->invalid_isolates == 0)
1953         {
1954           /* Compute the least odd embedding level greater than
1955              the current level.  */
1956           new_level = ((new_level + 1) & ~1) + 1;
1957           if (bidi_it->type_after_wn == RLE)
1958             override = NEUTRAL_DIR;
1959           else
1960             override = R2L;
1961           bidi_push_embedding_level (bidi_it, new_level, override, false);
1962           bidi_it->resolved_level = new_level;
1963         }
1964       else
1965         {
1966           if (bidi_it->invalid_isolates == 0)
1967             bidi_it->invalid_levels++;
1968         }
1969       break;
1970     case LRE:   /* X3 */
1971     case LRO:   /* X5 */
1972       bidi_it->type_after_wn = type;
1973       bidi_check_type (bidi_it->type_after_wn);
1974       type = WEAK_BN; /* X9/Retaining */
1975       if (new_level < BIDI_MAXDEPTH - 1
1976           && bidi_it->invalid_levels == 0
1977           && bidi_it->invalid_isolates == 0)
1978         {
1979           /* Compute the least even embedding level greater than
1980              the current level.  */
1981           new_level = ((new_level + 2) & ~1);
1982           if (bidi_it->type_after_wn == LRE)
1983             override = NEUTRAL_DIR;
1984           else
1985             override = L2R;
1986           bidi_push_embedding_level (bidi_it, new_level, override, false);
1987           bidi_it->resolved_level = new_level;
1988         }
1989       else
1990         {
1991           if (bidi_it->invalid_isolates == 0)
1992             bidi_it->invalid_levels++;
1993         }
1994       break;
1995     case FSI:   /* X5c */
1996       end = string_p ? bidi_it->string.schars : ZV;
1997       disp_pos = bidi_it->disp_pos;
1998       disp_prop = bidi_it->disp_prop;
1999       nchars = bidi_it->nchars;
2000       ch_len = bidi_it->ch_len;
2001       typ1 = find_first_strong_char (bidi_it->charpos,
2002                                      bidi_it->bytepos, end,
2003                                      &disp_pos, &disp_prop,
2004                                      &bidi_it->string, bidi_it->w,
2005                                      string_p, bidi_it->frame_window_p,
2006                                      &ch_len, &nchars, true);
2007       if (typ1 != STRONG_R && typ1 != STRONG_AL)
2008         {
2009           type = LRI;
2010           goto fsi_as_lri;
2011         }
2012       else
2013         type = RLI;
2014       /* FALLTHROUGH */
2015     case RLI:   /* X5a */
2016       if (override == NEUTRAL_DIR)
2017         bidi_it->type_after_wn = type;
2018       else      /* Unicode 8.0 correction.  */
2019         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2020       bidi_check_type (bidi_it->type_after_wn);
2021       break;
2022     case LRI:   /* X5b */
2023     fsi_as_lri:
2024       if (override == NEUTRAL_DIR)
2025         bidi_it->type_after_wn = type;
2026       else      /* Unicode 8.0 correction.  */
2027         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2028       bidi_check_type (bidi_it->type_after_wn);
2029       break;
2030     case PDI:   /* X6a */
2031       if (bidi_it->invalid_isolates)
2032         bidi_it->invalid_isolates--;
2033       else if (bidi_it->isolate_level > 0)
2034         {
2035           bidi_it->invalid_levels = 0;
2036           while (!bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2037             bidi_pop_embedding_level (bidi_it);
2038           eassert (bidi_it->stack_idx > 0);
2039           new_level = bidi_pop_embedding_level (bidi_it);
2040           bidi_it->isolate_level--;
2041         }
2042       bidi_it->resolved_level = new_level;
2043       /* Unicode 8.0 correction.  */
2044       if (bidi_it->level_stack[bidi_it->stack_idx].override == L2R)
2045         bidi_it->type_after_wn = STRONG_L;
2046       else if (bidi_it->level_stack[bidi_it->stack_idx].override == R2L)
2047         bidi_it->type_after_wn = STRONG_R;
2048       else
2049         bidi_it->type_after_wn = type;
2050       break;
2051     case PDF:   /* X7 */
2052       bidi_it->type_after_wn = type;
2053       bidi_check_type (bidi_it->type_after_wn);
2054       type = WEAK_BN; /* X9/Retaining */
2055       break;
2056     default:
2057       /* Nothing.  */
2058       break;
2059     }
2060
2061   bidi_it->type = type;
2062   bidi_check_type (bidi_it->type);
2063
2064   if (bidi_it->type == NEUTRAL_B)       /* X8 */
2065     {
2066       bidi_set_paragraph_end (bidi_it);
2067       /* This is needed by bidi_resolve_weak below, and in L1.  */
2068       bidi_it->type_after_wn = bidi_it->type;
2069     }
2070
2071   eassert (bidi_it->resolved_level >= 0);
2072   return bidi_it->resolved_level;
2073 }
2074
2075 /* Advance in the buffer/string, resolve weak types and return the
2076    type of the next character after weak type resolution.  */
2077 static bidi_type_t
2078 bidi_resolve_weak (struct bidi_it *bidi_it)
2079 {
2080   bidi_type_t type;
2081   bidi_dir_t override;
2082   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2083   int new_level  = bidi_resolve_explicit (bidi_it);
2084   int next_char;
2085   bidi_type_t type_of_next;
2086   struct bidi_it saved_it;
2087   ptrdiff_t eob
2088     = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2089        ? bidi_it->string.schars : ZV);
2090
2091   type = bidi_it->type;
2092   override = bidi_it->level_stack[bidi_it->stack_idx].override;
2093
2094   eassert (!(type == UNKNOWN_BT
2095              || type == LRE
2096              || type == LRO
2097              || type == RLE
2098              || type == RLO
2099              || type == PDF));
2100
2101   eassert (prev_level >= 0);
2102   if (bidi_it->type == NEUTRAL_B)
2103     {
2104       /* We've got a new isolating sequence, compute the directional
2105          type of sos and initialize per-run variables (UAX#9, clause
2106          X10).  */
2107       bidi_set_sos_type (bidi_it, prev_level, new_level);
2108     }
2109   if (type == NEUTRAL_S || type == NEUTRAL_WS
2110       || type == WEAK_BN || type == STRONG_AL)
2111     bidi_it->type_after_wn = type;      /* needed in L1 */
2112   bidi_check_type (bidi_it->type_after_wn);
2113
2114   /* Level and directional override status are already recorded in
2115      bidi_it, and do not need any change; see X6.  */
2116   if (override == R2L)          /* X6 */
2117     type = STRONG_R;
2118   else if (override == L2R)
2119     type = STRONG_L;
2120   else
2121     {
2122       if (type == WEAK_NSM)     /* W1 */
2123         {
2124           /* Note that we don't need to consider the case where the
2125              prev character has its type overridden by an RLO or LRO,
2126              because then either the type of this NSM would have been
2127              also overridden, or the previous character is outside the
2128              current level run, and thus not relevant to this NSM.
2129              This is why NSM gets the type_after_wn of the previous
2130              character.  */
2131           /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT.  */
2132           if (bidi_it->prev.type != UNKNOWN_BT
2133               /* If type_after_wn is NEUTRAL_B, this NSM is at sos.  */
2134               && bidi_it->prev.type != NEUTRAL_B)
2135             {
2136               if (bidi_isolate_fmt_char (bidi_it->prev.type))
2137                 {
2138                   /* From W1: "Note that in an isolating run sequence,
2139                      an isolate initiator followed by an NSM or any
2140                      type other than PDI must be an overflow isolate
2141                      initiator."  */
2142                   eassert (bidi_it->invalid_isolates > 0);
2143                   type = NEUTRAL_ON;
2144                 }
2145               else
2146                 {
2147                   /* This includes the Unicode 8.0 correction for N0,
2148                      due to how we set prev.type in bidi_resolve_explicit,
2149                      which see.  */
2150                   type = bidi_it->prev.type;
2151                 }
2152             }
2153           else if (bidi_it->sos == R2L)
2154             type = STRONG_R;
2155           else if (bidi_it->sos == L2R)
2156             type = STRONG_L;
2157           else /* shouldn't happen! */
2158             emacs_abort ();
2159         }
2160       if (type == WEAK_EN       /* W2 */
2161           && bidi_it->last_strong.type == STRONG_AL)
2162         type = WEAK_AN;
2163       else if (type == STRONG_AL) /* W3 */
2164         type = STRONG_R;
2165       else if ((type == WEAK_ES /* W4 */
2166                 && bidi_it->prev.type == WEAK_EN
2167                 && bidi_it->prev.orig_type == WEAK_EN)
2168                || (type == WEAK_CS
2169                    && ((bidi_it->prev.type == WEAK_EN
2170                         && bidi_it->prev.orig_type == WEAK_EN)
2171                        || bidi_it->prev.type == WEAK_AN)))
2172         {
2173           const unsigned char *s
2174             = (STRINGP (bidi_it->string.lstring)
2175                ? SDATA (bidi_it->string.lstring)
2176                : bidi_it->string.s);
2177
2178           next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2179                        ? BIDI_EOB
2180                        : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2181                                            s, bidi_it->string.unibyte));
2182           type_of_next = bidi_get_type (next_char, override);
2183
2184           if (type_of_next == WEAK_BN
2185               || bidi_explicit_dir_char (next_char))
2186             {
2187               bidi_copy_it (&saved_it, bidi_it);
2188               while (bidi_resolve_explicit (bidi_it) == new_level
2189                      && bidi_it->type == WEAK_BN)
2190                 type_of_next = bidi_it->type;
2191               bidi_copy_it (bidi_it, &saved_it);
2192             }
2193
2194           /* If the next character is EN, but the last strong-type
2195              character is AL, that next EN will be changed to AN when
2196              we process it in W2 above.  So in that case, this ES
2197              should not be changed into EN.  */
2198           if (type == WEAK_ES
2199               && type_of_next == WEAK_EN
2200               && bidi_it->last_strong.type != STRONG_AL)
2201             type = WEAK_EN;
2202           else if (type == WEAK_CS)
2203             {
2204               if (bidi_it->prev.type == WEAK_AN
2205                   && (type_of_next == WEAK_AN
2206                       /* If the next character is EN, but the last
2207                          strong-type character is AL, EN will be later
2208                          changed to AN when we process it in W2 above.
2209                          So in that case, this ES should not be
2210                          changed into EN.  */
2211                       || (type_of_next == WEAK_EN
2212                           && bidi_it->last_strong.type == STRONG_AL)))
2213                 type = WEAK_AN;
2214               else if (bidi_it->prev.type == WEAK_EN
2215                        && type_of_next == WEAK_EN
2216                        && bidi_it->last_strong.type != STRONG_AL)
2217                 type = WEAK_EN;
2218             }
2219         }
2220       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
2221                || type == WEAK_BN)      /* W5/Retaining */
2222         {
2223           if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2224             type = WEAK_EN;
2225           else if (bidi_it->next_en_pos > bidi_it->charpos
2226                    && bidi_it->next_en_type != WEAK_BN)
2227             {
2228               if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2229                 type = WEAK_EN;
2230             }
2231           else if (bidi_it->next_en_pos >=0)
2232             {
2233               /* We overstepped the last known position for ET
2234                  resolution but there could be other such characters
2235                  in this paragraph (when we are sure there are no more
2236                  such positions, we set next_en_pos to a negative
2237                  value).  Try to find the next position for ET
2238                  resolution.  */
2239               ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2240               const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2241                                         ? SDATA (bidi_it->string.lstring)
2242                                         : bidi_it->string.s);
2243
2244               if (bidi_it->nchars <= 0)
2245                 emacs_abort ();
2246               next_char
2247                 = (bidi_it->charpos + bidi_it->nchars >= eob
2248                    ? BIDI_EOB
2249                    : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2250                                        bidi_it->string.unibyte));
2251               type_of_next = bidi_get_type (next_char, override);
2252
2253               if (type_of_next == WEAK_ET
2254                   || type_of_next == WEAK_BN
2255                   || bidi_explicit_dir_char (next_char))
2256                 {
2257                   bidi_copy_it (&saved_it, bidi_it);
2258                   while (bidi_resolve_explicit (bidi_it) == new_level
2259                          && (bidi_it->type == WEAK_BN
2260                              || bidi_it->type == WEAK_ET))
2261                     type_of_next = bidi_it->type;
2262                   if (type == WEAK_BN
2263                       && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2264                     {
2265                       /* If we entered the above loop with a BN that
2266                          changes the level, the type of next
2267                          character, which is in a different level, is
2268                          not relevant to resolving this series of ET
2269                          and BN.  */
2270                       en_pos = saved_it.charpos;
2271                       type_of_next = type;
2272                     }
2273                   else
2274                     en_pos = bidi_it->charpos;
2275                   bidi_copy_it (bidi_it, &saved_it);
2276                 }
2277               /* Remember this position, to speed up processing of the
2278                  next ETs.  */
2279               bidi_it->next_en_pos = en_pos;
2280               if (type_of_next == WEAK_EN)
2281                 {
2282                   /* If the last strong character is AL, the EN we've
2283                      found will become AN when we get to it (W2). */
2284                   if (bidi_it->last_strong.type == STRONG_AL)
2285                     type_of_next = WEAK_AN;
2286                   else if (type == WEAK_BN)
2287                     type = NEUTRAL_ON; /* W6/Retaining */
2288                   else
2289                     type = WEAK_EN;
2290                 }
2291               else if (type_of_next == NEUTRAL_B)
2292                 /* Record the fact that there are no more ENs from
2293                    here to the end of paragraph, to avoid entering the
2294                    loop above ever again in this paragraph.  */
2295                 bidi_it->next_en_pos = -1;
2296               /* Record the type of the character where we ended our search.  */
2297               bidi_it->next_en_type = type_of_next;
2298             }
2299         }
2300     }
2301
2302   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2303       || (type == WEAK_BN
2304           && (bidi_it->prev.type == WEAK_CS         /* W6/Retaining */
2305               || bidi_it->prev.type == WEAK_ES
2306               || bidi_it->prev.type == WEAK_ET)))
2307     type = NEUTRAL_ON;
2308
2309   /* Store the type we've got so far, before we clobber it with strong
2310      types in W7 and while resolving neutral types.  But leave alone
2311      the original types that were recorded above, because we will need
2312      them for the L1 clause.  */
2313   if (bidi_it->type_after_wn == UNKNOWN_BT)
2314     bidi_it->type_after_wn = type;
2315   bidi_check_type (bidi_it->type_after_wn);
2316
2317   if (type == WEAK_EN)  /* W7 */
2318     {
2319       if ((bidi_it->last_strong.type == STRONG_L)
2320           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2321         type = STRONG_L;
2322     }
2323
2324   bidi_it->type = type;
2325   bidi_check_type (bidi_it->type);
2326   return type;
2327 }
2328
2329 /* Resolve the type of a neutral character according to the type of
2330    surrounding strong text and the current embedding level.  */
2331 static bidi_type_t
2332 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2333 {
2334   /* N1: "European and Arabic numbers act as if they were R in terms
2335      of their influence on NIs."  */
2336   if (next_type == WEAK_EN || next_type == WEAK_AN)
2337     next_type = STRONG_R;
2338   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2339     prev_type = STRONG_R;
2340
2341   if (next_type == prev_type)   /* N1 */
2342     return next_type;
2343   else if ((lev & 1) == 0)      /* N2 */
2344     return STRONG_L;
2345   else
2346     return STRONG_R;
2347 }
2348
2349 #define FLAG_EMBEDDING_INSIDE  1
2350 #define FLAG_OPPOSITE_INSIDE   2
2351
2352 /* A data type used in the stack maintained by
2353    bidi_find_bracket_pairs below.  */
2354 typedef struct bpa_stack_entry {
2355   int close_bracket_char;
2356   int open_bracket_idx;
2357 #ifdef ENABLE_CHECKING
2358   ptrdiff_t open_bracket_pos;
2359 #endif
2360   unsigned flags : 2;
2361 } bpa_stack_entry;
2362
2363 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2364    BPA stack, which should be more than enough for actual bidi text.  */
2365 #define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2366
2367 /* UAX#9 says to match opening brackets with the matching closing
2368    brackets or their canonical equivalents.  As of Unicode 7.0, there
2369    are only 2 bracket characters that have canonical equivalence
2370    decompositions: u+2329 and u+232A.  So instead of accessing the
2371    table in uni-decomposition.el, we just handle these 2 characters
2372    with this simple macro.  Note that ASCII characters don't have
2373    canonical equivalents by definition.  */
2374
2375 /* To find all the characters that need to be processed by
2376    CANONICAL_EQU, first find all the characters which have
2377    decompositions in UnicodeData.txt, with this Awk script:
2378
2379     awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2380
2381    Then produce a list of all the bracket characters in BidiBrackets.txt:
2382
2383     awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2384
2385    And finally, cross-reference these two:
2386
2387     fgrep -w -f brackets.txt decompositions.txt
2388
2389    where "decompositions.txt" was produced by the 1st script, and
2390    "brackets.txt" by the 2nd script.  In the output of fgrep, look
2391    only for decompositions that don't begin with some compatibility
2392    formatting tag, such as "<compat>".  Only decompositions that
2393    consist solely of character codepoints are relevant to bidi
2394    brackets processing.  */
2395
2396 #define CANONICAL_EQU(c)                                        \
2397   ( ASCII_CHAR_P (c) ? c                                        \
2398     : (c) == 0x2329 ? 0x3008                                    \
2399     : (c) == 0x232a ? 0x3009                                    \
2400     : c )
2401
2402 #ifdef ENABLE_CHECKING
2403 # define STORE_BRACKET_CHARPOS \
2404    bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2405 #else
2406 # define STORE_BRACKET_CHARPOS  /* nothing */
2407 #endif
2408
2409 #define PUSH_BPA_STACK                                                  \
2410   do {                                                                  \
2411     int ch;                                                             \
2412     if (bpa_sp < MAX_BPA_STACK - 1)                                     \
2413       {                                                                 \
2414         bpa_sp++;                                                       \
2415         ch = CANONICAL_EQU (bidi_it->ch);                               \
2416         bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch);   \
2417         bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx;       \
2418         bpa_stack[bpa_sp].flags = 0;                                    \
2419         STORE_BRACKET_CHARPOS;                                          \
2420       }                                                                 \
2421   } while (0)
2422
2423
2424 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2425    described in BD16 and N0 of UAX#9.  It finds all the bracket pairs
2426    in the current isolating sequence, and records the enclosed type
2427    and the position of the matching bracket in the cache.  It returns
2428    non-zero if called with the iterator on the opening bracket which
2429    has a matching closing bracket in the current isolating sequence,
2430    zero otherwise.  */
2431 static bool
2432 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2433 {
2434   bidi_bracket_type_t btype;
2435   bidi_type_t type = bidi_it->type;
2436   bool retval = false;
2437
2438   /* When scanning backwards, we don't expect any unresolved bidi
2439      bracket characters.  */
2440   if (bidi_it->scan_dir != 1)
2441     emacs_abort ();
2442
2443   btype = bidi_paired_bracket_type (bidi_it->ch);
2444   if (btype == BIDI_BRACKET_OPEN)
2445     {
2446       bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2447       int bpa_sp = -1;
2448       struct bidi_it saved_it;
2449       int base_level = bidi_it->level_stack[0].level;
2450       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2451       int maxlevel = embedding_level;
2452       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2453       struct bidi_it tem_it;
2454       bool l2r_seen = false, r2l_seen = false;
2455       ptrdiff_t pairing_pos;
2456
2457       eassert (MAX_BPA_STACK >= 100);
2458       bidi_copy_it (&saved_it, bidi_it);
2459       /* bidi_cache_iterator_state refuses to cache on backward scans,
2460          and bidi_cache_fetch_state doesn't bring scan_dir from the
2461          cache, so we must initialize this explicitly.  */
2462       tem_it.scan_dir = 1;
2463
2464       while (1)
2465         {
2466           int old_sidx, new_sidx;
2467           int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2468
2469           if (maxlevel < current_level)
2470             maxlevel = current_level;
2471           /* Mark every opening bracket character we've traversed by
2472              putting its own position into bracket_pairing_pos.  This
2473              is examined in bidi_resolve_brackets to distinguish
2474              brackets that were already resolved to stay NEUTRAL_ON,
2475              and those that were not yet processed by this function
2476              (because they were skipped when we skip higher embedding
2477              levels below).  */
2478           if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2479             bidi_it->bracket_pairing_pos = bidi_it->charpos;
2480           bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2481           if (btype == BIDI_BRACKET_OPEN)
2482             PUSH_BPA_STACK;
2483           else if (btype == BIDI_BRACKET_CLOSE)
2484             {
2485               int sp = bpa_sp;
2486               int curchar = CANONICAL_EQU (bidi_it->ch);
2487
2488               eassert (sp >= 0);
2489               while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2490                 sp--;
2491               if (sp >= 0)
2492                 {
2493                   /* Update and cache the corresponding opening bracket.  */
2494                   bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2495                                           &tem_it);
2496 #ifdef ENABLE_CHECKING
2497                   eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2498 #endif
2499                   /* Determine the enclosed type for this bracket
2500                      pair's type resolution according to N0.  */
2501                   if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2502                     tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2503                   else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2504                     tem_it.bracket_enclosed_type                   /* N0c */
2505                       = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2506                   else                                             /* N0d */
2507                     tem_it.bracket_enclosed_type = UNKNOWN_BT;
2508
2509                   /* Record the position of the matching closing
2510                      bracket, and update the cache.  */
2511                   tem_it.bracket_pairing_pos = bidi_it->charpos;
2512                   bidi_cache_iterator_state (&tem_it, 0, 1);
2513
2514                   /* Pop the BPA stack.  */
2515                   bpa_sp = sp - 1;
2516                 }
2517               if (bpa_sp < 0)
2518                 {
2519                   retval = true;
2520                   break;
2521                 }
2522             }
2523           else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2524             {
2525               unsigned flag = 0;
2526               int sp;
2527
2528               /* Whenever we see a strong type, update the flags of
2529                  all the slots on the stack.  */
2530               switch (bidi_it->type)
2531                 {
2532                 case STRONG_L:
2533                   flag = ((embedding_level & 1) == 0
2534                           ? FLAG_EMBEDDING_INSIDE
2535                           : FLAG_OPPOSITE_INSIDE);
2536                   l2r_seen = true;
2537                   break;
2538                 case STRONG_R:
2539                 case WEAK_EN:
2540                 case WEAK_AN:
2541                   flag = ((embedding_level & 1) == 1
2542                           ? FLAG_EMBEDDING_INSIDE
2543                           : FLAG_OPPOSITE_INSIDE);
2544                   r2l_seen = true;
2545                   break;
2546                 default:
2547                   break;
2548                 }
2549               if (flag)
2550                 {
2551                   for (sp = bpa_sp; sp >= 0; sp--)
2552                     bpa_stack[sp].flags |= flag;
2553                 }
2554             }
2555           old_sidx = bidi_it->stack_idx;
2556           type = bidi_resolve_weak (bidi_it);
2557           /* Skip level runs excluded from this isolating run sequence.  */
2558           new_sidx = bidi_it->stack_idx;
2559           if (bidi_it->level_stack[new_sidx].level > current_level
2560               && (bidi_it->level_stack[new_sidx].isolate_status
2561                   || (new_sidx > old_sidx + 1
2562                       && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2563             {
2564               while (bidi_it->level_stack[bidi_it->stack_idx].level
2565                      > current_level)
2566                 {
2567                   if (maxlevel < bidi_it->level_stack[bidi_it->stack_idx].level)
2568                     maxlevel = bidi_it->level_stack[bidi_it->stack_idx].level;
2569                   bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2570                   type = bidi_resolve_weak (bidi_it);
2571                 }
2572             }
2573           if (type == NEUTRAL_B
2574               || (bidi_it->level_stack[bidi_it->stack_idx].level
2575                   != current_level))
2576             {
2577               /* We've marched all the way to the end of this
2578                  isolating run sequence, and didn't find matching
2579                  closing brackets for some opening brackets.  Leave
2580                  their type unchanged.  */
2581               pairing_pos = bidi_it->charpos;
2582               break;
2583             }
2584           if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2585             btype = bidi_paired_bracket_type (bidi_it->ch);
2586           else
2587             btype = BIDI_BRACKET_NONE;
2588         }
2589
2590       /* Restore bidi_it from the cache, which should have the bracket
2591          resolution members set as determined by the above loop.  */
2592       type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2593       eassert (type == NEUTRAL_ON);
2594
2595       /* The following is an optimization for bracketed text that has
2596          only one level which is equal to the paragraph's base
2597          embedding level.  That is, only L2R and weak/neutral
2598          characters in a L2R paragraph, or only R2L and weak/neutral
2599          characters in a R2L paragraph.  Such brackets can be resolved
2600          by bidi_resolve_neutral, which has a further shortcut for
2601          this case.  So we pretend we did not resolve the brackets in
2602          this case, set up next_for_neutral for the entire bracketed
2603          text, and reset the cache to the character before the opening
2604          bracket.  The upshot is to allow bidi_move_to_visually_next
2605          reset the cache when it returns this opening bracket, thus
2606          cutting significantly on the size of the cache, which is
2607          important with long lines, especially if word-wrap is non-nil
2608          (which requires the display engine to copy the cache back and
2609          forth many times).  */
2610       if (maxlevel == base_level
2611           && ((base_level == 0 && !r2l_seen)
2612               || (base_level == 1 && !l2r_seen)))
2613         {
2614           ptrdiff_t eob
2615             = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2616                ? bidi_it->string.schars : ZV);
2617
2618           if (retval)
2619             pairing_pos = bidi_it->bracket_pairing_pos;
2620
2621           /* This special value (which cannot possibly happen when
2622              brackets are resolved, since there's no character at ZV)
2623              will be noticed by bidi_resolve_explicit, and will be
2624              copied to the following iterator states, instead of being
2625              reset to -1.  */
2626           bidi_it->bracket_pairing_pos = eob;
2627           /* This type value will be used for resolving the outermost
2628              closing bracket in bidi_resolve_brackets.  */
2629           bidi_it->bracket_enclosed_type = embedding_type;
2630           /* bidi_cache_last_idx is set to the index of the current
2631              state, because we just called bidi_cache_find above.
2632              That state describes the outermost opening bracket, the
2633              one with which we entered this function.  Force the cache
2634              to "forget" all the cached states starting from that state.  */
2635           bidi_cache_reset_to (bidi_cache_last_idx - bidi_cache_start);
2636           /* Set up the next_for_neutral member, to help
2637              bidi_resolve_neutral.  */
2638           bidi_it->next_for_neutral.type = embedding_type;
2639           bidi_it->next_for_neutral.charpos = pairing_pos;
2640           /* Pretend we didn't resolve this bracket.  */
2641           retval = false;
2642         }
2643     }
2644
2645   return retval;
2646 }
2647
2648 static void
2649 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2650                               bool nextp)
2651 {
2652   int idx;
2653
2654   for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2655     {
2656       int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2657
2658       if (lev <= level)
2659         {
2660           eassert (lev == level);
2661           if (nextp)
2662             bidi_cache[idx].next_for_neutral = *info;
2663           else
2664             bidi_cache[idx].prev_for_neutral = *info;
2665           break;
2666         }
2667     }
2668 }
2669
2670 static bidi_type_t
2671 bidi_resolve_brackets (struct bidi_it *bidi_it)
2672 {
2673   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2674   bool resolve_bracket = false;
2675   bidi_type_t type = UNKNOWN_BT;
2676   int ch;
2677   struct bidi_saved_info prev_for_neutral, next_for_neutral;
2678   ptrdiff_t eob
2679     = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2680        ? bidi_it->string.schars : ZV);
2681
2682   /* Record the prev_for_neutral type either from the previous
2683      character, if it was a strong or AN/EN, or from the
2684      prev_for_neutral information recorded previously.  */
2685   if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2686       || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2687     bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2688   else
2689     prev_for_neutral = bidi_it->prev_for_neutral;
2690   /* Record the next_for_neutral type information.  */
2691   if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2692     next_for_neutral = bidi_it->next_for_neutral;
2693   else
2694     next_for_neutral.charpos = -1;
2695   if (!bidi_it->first_elt)
2696     {
2697       type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2698       ch = bidi_it->ch;
2699     }
2700   if (type == UNKNOWN_BT)
2701     {
2702       type = bidi_resolve_weak (bidi_it);
2703       if (type == NEUTRAL_ON)
2704         {
2705           /* bracket_pairing_pos == eob means this bracket does not
2706              need to be resolved as a bracket, but as a neutral, see
2707              the optimization trick we play near the end of
2708              bidi_find_bracket_pairs.  */
2709           if (bidi_it->bracket_pairing_pos == eob)
2710             {
2711               /* If this is the outermost closing bracket of a run of
2712                  characters in which we decided to resolve brackets as
2713                  neutrals, use the embedding level's type, recorded in
2714                  bracket_enclosed_type, to resolve the bracket.  */
2715               if (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2716                   && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
2717                 type = bidi_it->bracket_enclosed_type;
2718             }
2719           else if (bidi_find_bracket_pairs (bidi_it))
2720             resolve_bracket = true;
2721         }
2722     }
2723   else if (bidi_it->bracket_pairing_pos != eob)
2724     {
2725       eassert (bidi_it->resolved_level == -1);
2726       /* If the cached state shows an increase of embedding level due
2727          to an isolate initiator, we need to update the 1st cached
2728          state of the next run of the current isolating sequence with
2729          the prev_for_neutral and next_for_neutral information, so
2730          that it will be picked up when we advance to that next run.  */
2731       if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2732           && bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2733         {
2734           bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2735           bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2736         }
2737       if (type == NEUTRAL_ON
2738           && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2739         {
2740           if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2741             {
2742               /* A cached opening bracket that wasn't completely
2743                  resolved yet.  */
2744               resolve_bracket = true;
2745             }
2746           else if (bidi_it->bracket_pairing_pos == -1)
2747             {
2748               /* Higher levels were not BPA-resolved yet, even if
2749                  cached by bidi_find_bracket_pairs.  Force application
2750                  of BPA to the new level now.  */
2751               if (bidi_find_bracket_pairs (bidi_it))
2752                 resolve_bracket = true;
2753             }
2754         }
2755       /* Keep track of the prev_for_neutral and next_for_neutral
2756          types, needed for resolving brackets below and for resolving
2757          neutrals in bidi_resolve_neutral.  */
2758       if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2759         {
2760           bidi_it->prev_for_neutral = prev_for_neutral;
2761           if (next_for_neutral.charpos > 0)
2762             bidi_it->next_for_neutral = next_for_neutral;
2763         }
2764     }
2765
2766   /* If needed, resolve the bracket type according to N0.  */
2767   if (resolve_bracket)
2768     {
2769       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2770       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2771
2772       eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2773       eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2774       if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2775         type = embedding_type;
2776       else
2777         {
2778           switch (bidi_it->prev_for_neutral.type)
2779             {
2780             case STRONG_R:
2781             case WEAK_EN:
2782             case WEAK_AN:
2783               type =
2784                 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2785                 ? STRONG_R                                   /* N0c1 */
2786                 : embedding_type;                            /* N0c2 */
2787               break;
2788             case STRONG_L:
2789               type =
2790                 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2791                 ? STRONG_L                                   /* N0c1 */
2792                 : embedding_type;                            /* N0c2 */
2793               break;
2794             default:
2795               /* N0d: Do not set the type for that bracket pair.  */
2796               break;
2797             }
2798         }
2799       eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2800
2801       /* Update the type of the paired closing bracket to the same
2802          type as for the resolved opening bracket.  */
2803       if (type != NEUTRAL_ON)
2804         {
2805           ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2806                                              -1, 1);
2807
2808           if (idx < bidi_cache_start)
2809             emacs_abort ();
2810           bidi_cache[idx].type = type;
2811         }
2812     }
2813
2814   return type;
2815 }
2816
2817 static bidi_type_t
2818 bidi_resolve_neutral (struct bidi_it *bidi_it)
2819 {
2820   bidi_type_t type = bidi_resolve_brackets (bidi_it);
2821   int current_level;
2822   bool is_neutral;
2823
2824   eassert (type == STRONG_R
2825            || type == STRONG_L
2826            || type == WEAK_BN
2827            || type == WEAK_EN
2828            || type == WEAK_AN
2829            || type == NEUTRAL_B
2830            || type == NEUTRAL_S
2831            || type == NEUTRAL_WS
2832            || type == NEUTRAL_ON
2833            || type == LRI
2834            || type == RLI
2835            || type == PDI);
2836
2837   current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2838   eassert (current_level >= 0);
2839   is_neutral = bidi_get_category (type) == NEUTRAL;
2840
2841   if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2842                             we are already at paragraph end.  */
2843        && (is_neutral || bidi_isolate_fmt_char (type)))
2844       /* N1-N2/Retaining */
2845       || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
2846     {
2847       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2848         {
2849           /* Make sure the data for resolving neutrals we are
2850              about to use is valid.  */
2851           eassert (bidi_it->next_for_neutral.charpos > bidi_it->charpos
2852                    /* PDI defines an eos, so it's OK for it to
2853                       serve as its own next_for_neutral.  */
2854                    || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2855                        && bidi_it->type == PDI));
2856           type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2857                                          bidi_it->next_for_neutral.type,
2858                                          current_level);
2859         }
2860       /* The next two "else if" clauses are shortcuts for the
2861          important special case when we have a long sequence of
2862          neutral or WEAK_BN characters, such as whitespace or nulls or
2863          other control characters, on the base embedding level of the
2864          paragraph, and that sequence goes all the way to the end of
2865          the paragraph and follows a character whose resolved
2866          directionality is identical to the base embedding level.
2867          (This is what happens in a buffer with plain L2R text that
2868          happens to include long sequences of control characters.)  By
2869          virtue of N1, the result of examining this long sequence will
2870          always be either STRONG_L or STRONG_R, depending on the base
2871          embedding level.  So we use this fact directly instead of
2872          entering the expensive loop in the "else" clause.  */
2873       else if (current_level == 0
2874                && bidi_it->prev_for_neutral.type == STRONG_L
2875                && !bidi_explicit_dir_char (bidi_it->ch)
2876                && !bidi_isolate_fmt_char (type))
2877         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2878                                        STRONG_L, current_level);
2879       else if (/* current level is 1 */
2880                current_level == 1
2881                /* base embedding level is also 1 */
2882                && bidi_it->level_stack[0].level == 1
2883                /* previous character is one of those considered R for
2884                   the purposes of W5 */
2885                && (bidi_it->prev_for_neutral.type == STRONG_R
2886                    || bidi_it->prev_for_neutral.type == WEAK_EN
2887                    || bidi_it->prev_for_neutral.type == WEAK_AN)
2888                && !bidi_explicit_dir_char (bidi_it->ch)
2889                && !bidi_isolate_fmt_char (type))
2890         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2891                                        STRONG_R, current_level);
2892       else
2893         {
2894           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
2895              the assumption of batch-style processing; see clauses W4,
2896              W5, and especially N1, which require to look far forward
2897              (as well as back) in the buffer/string.  May the fleas of
2898              a thousand camels infest the armpits of those who design
2899              supposedly general-purpose algorithms by looking at their
2900              own implementations, and fail to consider other possible
2901              implementations!  */
2902           struct bidi_it saved_it;
2903           bidi_type_t next_type;
2904           bool adjacent_to_neutrals = is_neutral;
2905
2906           bidi_copy_it (&saved_it, bidi_it);
2907           /* Scan the text forward until we find the first non-neutral
2908              character, and then use that to resolve the neutral we
2909              are dealing with now.  We also cache the scanned iterator
2910              states, to salvage some of the effort later.  */
2911           do {
2912             int old_sidx, new_sidx;
2913
2914             /* Paragraph separators have their levels fully resolved
2915                at this point, so cache them as resolved.  */
2916             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2917             old_sidx = bidi_it->stack_idx;
2918             type = bidi_resolve_brackets (bidi_it);
2919             /* Skip level runs excluded from this isolating run sequence.  */
2920             new_sidx = bidi_it->stack_idx;
2921             if (bidi_it->level_stack[new_sidx].level > current_level
2922                 && (bidi_it->level_stack[new_sidx].isolate_status
2923                     /* This is for when we have an isolate initiator
2924                        immediately followed by an embedding or
2925                        override initiator, in which case we get the
2926                        level stack pushed twice by the single call to
2927                        bidi_resolve_weak above.  */
2928                     || (new_sidx > old_sidx + 1
2929                         && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2930               {
2931                 while (bidi_it->level_stack[bidi_it->stack_idx].level
2932                        > current_level)
2933                   {
2934                     bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2935                     type = bidi_resolve_brackets (bidi_it);
2936                   }
2937               }
2938             if (!adjacent_to_neutrals
2939                 && (bidi_get_category (type) == NEUTRAL
2940                     || bidi_isolate_fmt_char (type)))
2941               adjacent_to_neutrals = true;
2942           } while (!(type == NEUTRAL_B
2943                      || (type != WEAK_BN
2944                          && bidi_get_category (type) != NEUTRAL
2945                          && !bidi_isolate_fmt_char (type))
2946                      /* This is all per level run, so stop when we
2947                         reach the end of this level run.  */
2948                      || (bidi_it->level_stack[bidi_it->stack_idx].level
2949                          != current_level)));
2950
2951           /* Record the character we stopped at.  */
2952           bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
2953
2954           if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
2955               || type == NEUTRAL_B)
2956             {
2957               /* Marched all the way to the end of this level run.  We
2958                  need to use the eos type, whose information is stored
2959                  by bidi_set_sos_type in the prev_for_neutral
2960                  member.  */
2961               if (adjacent_to_neutrals)
2962                 next_type = bidi_it->prev_for_neutral.type;
2963               else
2964                 {
2965                   /* This is a BN which does not adjoin neutrals.
2966                      Leave its type alone.  */
2967                   bidi_copy_it (bidi_it, &saved_it);
2968                   return bidi_it->type;
2969                 }
2970             }
2971           else
2972             {
2973               switch (type)
2974                 {
2975                 case STRONG_L:
2976                 case STRONG_R:
2977                 case STRONG_AL:
2978                   /* Actually, STRONG_AL cannot happen here, because
2979                      bidi_resolve_weak converts it to STRONG_R, per W3.  */
2980                   eassert (type != STRONG_AL);
2981                   next_type = type;
2982                   break;
2983                 case WEAK_EN:
2984                 case WEAK_AN:
2985                   /* N1: "European and Arabic numbers act as if they
2986                      were R in terms of their influence on NIs."  */
2987                   next_type = STRONG_R;
2988                   break;
2989                 default:
2990                   emacs_abort ();
2991                   break;
2992                 }
2993             }
2994           /* Resolve the type of all the NIs found during the above loop.  */
2995           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2996                                          next_type, current_level);
2997           /* Update next_for_neutral with the resolved type, so we
2998              could use it for all the other NIs up to the place where
2999              we exited the loop.  */
3000           saved_it.next_for_neutral.type = next_type;
3001           bidi_check_type (type);
3002           /* Update the character which caused us to enter the above loop.  */
3003           saved_it.type = type;
3004           bidi_check_type (next_type);
3005           bidi_copy_it (bidi_it, &saved_it);
3006         }
3007     }
3008   return type;
3009 }
3010
3011 /* Given an iterator state in BIDI_IT, advance one character position
3012    in the buffer/string to the next character (in the logical order),
3013    resolve the bidi type of that next character, and return that
3014    type.  */
3015 static bidi_type_t
3016 bidi_type_of_next_char (struct bidi_it *bidi_it)
3017 {
3018   bidi_type_t type;
3019
3020   /* This should always be called during a forward scan.  */
3021   if (bidi_it->scan_dir != 1)
3022     emacs_abort ();
3023
3024   type = bidi_resolve_neutral (bidi_it);
3025
3026   return type;
3027 }
3028
3029 /* Given an iterator state BIDI_IT, advance one character position in
3030    the buffer/string to the next character (in the current scan
3031    direction), resolve the embedding and implicit levels of that next
3032    character, and return the resulting level.  */
3033 static int
3034 bidi_level_of_next_char (struct bidi_it *bidi_it)
3035 {
3036   bidi_type_t type = UNKNOWN_BT;
3037   int level;
3038   ptrdiff_t next_char_pos = -2;
3039
3040   if (bidi_it->scan_dir == 1)
3041     {
3042       ptrdiff_t eob
3043         = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3044            ? bidi_it->string.schars : ZV);
3045
3046       /* There's no sense in trying to advance if we've already hit
3047          the end of text.  */
3048       if (bidi_it->charpos >= eob)
3049         {
3050           eassert (bidi_it->resolved_level >= 0);
3051           return bidi_it->resolved_level;
3052         }
3053     }
3054
3055   /* Perhaps the character we want is already cached s fully resolved.
3056      If it is, the call to bidi_cache_find below will return a type
3057      other than UNKNOWN_BT.  */
3058   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
3059     {
3060       int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3061                  ? 0 : 1);
3062
3063       if (bidi_it->scan_dir > 0)
3064         {
3065           if (bidi_it->nchars <= 0)
3066             emacs_abort ();
3067           next_char_pos = bidi_it->charpos + bidi_it->nchars;
3068         }
3069       else if (bidi_it->charpos >= bob)
3070         /* Implementation note: we allow next_char_pos to be as low as
3071            0 for buffers or -1 for strings, and that is okay because
3072            that's the "position" of the sentinel iterator state we
3073            cached at the beginning of the iteration.  */
3074         next_char_pos = bidi_it->charpos - 1;
3075       if (next_char_pos >= bob - 1)
3076         type = bidi_cache_find (next_char_pos, 1, bidi_it);
3077       if (type != UNKNOWN_BT)
3078         {
3079           /* We asked the cache for fully resolved states.  */
3080           eassert (bidi_it->resolved_level >= 0);
3081           return bidi_it->resolved_level;
3082         }
3083     }
3084
3085   if (bidi_it->scan_dir == -1)
3086     /* If we are going backwards, the iterator state is already cached
3087        from previous scans, and should be fully resolved.  */
3088     emacs_abort ();
3089
3090   if (type == UNKNOWN_BT)
3091     type = bidi_type_of_next_char (bidi_it);
3092
3093   if (type == NEUTRAL_B)
3094     {
3095       eassert (bidi_it->resolved_level >= 0);
3096       return bidi_it->resolved_level;
3097     }
3098
3099   level = bidi_it->level_stack[bidi_it->stack_idx].level;
3100
3101   eassert ((type == STRONG_R
3102             || type == STRONG_L
3103             || type == WEAK_BN
3104             || type == WEAK_EN
3105             || type == WEAK_AN));
3106   bidi_it->type = type;
3107   bidi_check_type (bidi_it->type);
3108
3109   /* For L1 below, we need to know, for each WS character, whether
3110      it belongs to a sequence of WS characters preceding a newline
3111      or a TAB or a paragraph separator.  */
3112   if ((bidi_it->orig_type == NEUTRAL_WS
3113        || bidi_isolate_fmt_char (bidi_it->orig_type))
3114       && bidi_it->next_for_ws.charpos < bidi_it->charpos)
3115     {
3116       int ch;
3117       ptrdiff_t clen = bidi_it->ch_len;
3118       ptrdiff_t bpos = bidi_it->bytepos;
3119       ptrdiff_t cpos = bidi_it->charpos;
3120       ptrdiff_t disp_pos = bidi_it->disp_pos;
3121       ptrdiff_t nc = bidi_it->nchars;
3122       struct bidi_string_data bs = bidi_it->string;
3123       bidi_type_t chtype;
3124       bool fwp = bidi_it->frame_window_p;
3125       int dpp = bidi_it->disp_prop;
3126
3127       if (bidi_it->nchars <= 0)
3128         emacs_abort ();
3129       do {
3130         ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3131                               bidi_it->w, fwp, &clen, &nc);
3132         chtype = bidi_get_type (ch, NEUTRAL_DIR);
3133       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3134                || bidi_isolate_fmt_char (chtype)
3135                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3136       bidi_it->next_for_ws.type = chtype;
3137       bidi_check_type (bidi_it->next_for_ws.type);
3138       bidi_it->next_for_ws.charpos = cpos;
3139     }
3140
3141   /* Update the cache, but only if this state was already cached.  */
3142   bidi_cache_iterator_state (bidi_it, 1, 1);
3143
3144   /* Resolve implicit levels.  */
3145   if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3146            || bidi_it->orig_type == NEUTRAL_S
3147            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3148            || (bidi_it->orig_type == NEUTRAL_WS
3149                && (bidi_it->next_for_ws.type == NEUTRAL_B
3150                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
3151     level = bidi_it->level_stack[0].level;
3152   else if ((level & 1) == 0) /* I1 */
3153     {
3154       if (type == STRONG_R)
3155         level++;
3156       else if (type == WEAK_EN || type == WEAK_AN)
3157         level += 2;
3158     }
3159   else                  /* I2 */
3160     {
3161       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3162         level++;
3163     }
3164
3165   bidi_it->resolved_level = level;
3166   return level;
3167 }
3168
3169 /* Move to the other edge of a level given by LEVEL.  If END_FLAG,
3170    we are at the end of a level, and we need to prepare to
3171    resume the scan of the lower level.
3172
3173    If this level's other edge is cached, we simply jump to it, filling
3174    the iterator structure with the iterator state on the other edge.
3175    Otherwise, we walk the buffer or string until we come back to the
3176    same level as LEVEL.
3177
3178    Note: we are not talking here about a ``level run'' in the UAX#9
3179    sense of the term, but rather about a ``level'' which includes
3180    all the levels higher than it.  In other words, given the levels
3181    like this:
3182
3183          11111112222222333333334443343222222111111112223322111
3184                 A      B                    C
3185
3186    and assuming we are at point A scanning left to right, this
3187    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3188    at point B.  */
3189 static void
3190 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3191 {
3192   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3193   ptrdiff_t idx;
3194
3195   /* Try the cache first.  */
3196   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3197       >= bidi_cache_start)
3198     bidi_cache_fetch_state (idx, bidi_it);
3199   else
3200     {
3201       int new_level;
3202
3203       /* If we are at end of level, its edges must be cached.  */
3204       if (end_flag)
3205         emacs_abort ();
3206
3207       bidi_cache_iterator_state (bidi_it, 1, 0);
3208       do {
3209         new_level = bidi_level_of_next_char (bidi_it);
3210         bidi_cache_iterator_state (bidi_it, 1, 0);
3211       } while (new_level >= level);
3212     }
3213 }
3214
3215 void
3216 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3217 {
3218   int old_level, new_level, next_level;
3219   struct bidi_it sentinel;
3220   struct gcpro gcpro1;
3221
3222   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3223     emacs_abort ();
3224
3225   if (bidi_it->scan_dir == 0)
3226     {
3227       bidi_it->scan_dir = 1;    /* default to logical order */
3228     }
3229
3230   /* The code below can call eval, and thus cause GC.  If we are
3231      iterating a Lisp string, make sure it won't be GCed.  */
3232   if (STRINGP (bidi_it->string.lstring))
3233     GCPRO1 (bidi_it->string.lstring);
3234
3235   /* If we just passed a newline, initialize for the next line.  */
3236   if (!bidi_it->first_elt
3237       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3238     bidi_line_init (bidi_it);
3239
3240   /* Prepare the sentinel iterator state, and cache it.  When we bump
3241      into it, scanning backwards, we'll know that the last non-base
3242      level is exhausted.  */
3243   if (bidi_cache_idx == bidi_cache_start)
3244     {
3245       bidi_copy_it (&sentinel, bidi_it);
3246       if (bidi_it->first_elt)
3247         {
3248           sentinel.charpos--;   /* cached charpos needs to be monotonic */
3249           sentinel.bytepos--;
3250           sentinel.ch = '\n';   /* doesn't matter, but why not? */
3251           sentinel.ch_len = 1;
3252           sentinel.nchars = 1;
3253         }
3254       bidi_cache_iterator_state (&sentinel, 1, 0);
3255     }
3256
3257   old_level = bidi_it->resolved_level;
3258   new_level = bidi_level_of_next_char (bidi_it);
3259
3260   /* Reordering of resolved levels (clause L2) is implemented by
3261      jumping to the other edge of the level and flipping direction of
3262      scanning the text whenever we find a level change.  */
3263   if (new_level != old_level)
3264     {
3265       bool ascending = new_level > old_level;
3266       int level_to_search = ascending ? old_level + 1 : old_level;
3267       int incr = ascending ? 1 : -1;
3268       int expected_next_level = old_level + incr;
3269
3270       /* Jump (or walk) to the other edge of this level.  */
3271       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3272       /* Switch scan direction and peek at the next character in the
3273          new direction.  */
3274       bidi_it->scan_dir = -bidi_it->scan_dir;
3275
3276       /* The following loop handles the case where the resolved level
3277          jumps by more than one.  This is typical for numbers inside a
3278          run of text with left-to-right embedding direction, but can
3279          also happen in other situations.  In those cases the decision
3280          where to continue after a level change, and in what direction,
3281          is tricky.  For example, given a text like below:
3282
3283                   abcdefgh
3284                   11336622
3285
3286          (where the numbers below the text show the resolved levels),
3287          the result of reordering according to UAX#9 should be this:
3288
3289                   efdcghba
3290
3291          This is implemented by the loop below which flips direction
3292          and jumps to the other edge of the level each time it finds
3293          the new level not to be the expected one.  The expected level
3294          is always one more or one less than the previous one.  */
3295       next_level = bidi_peek_at_next_level (bidi_it);
3296       while (next_level != expected_next_level)
3297         {
3298           /* If next_level is -1, it means we have an unresolved level
3299              in the cache, which at this point should not happen.  If
3300              it does, we will infloop.  */
3301           eassert (next_level >= 0);
3302           /* If next_level is not consistent with incr, we might
3303              infloop.  */
3304           eassert (incr > 0
3305                    ? next_level > expected_next_level
3306                    : next_level < expected_next_level);
3307           expected_next_level += incr;
3308           level_to_search += incr;
3309           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3310           bidi_it->scan_dir = -bidi_it->scan_dir;
3311           next_level = bidi_peek_at_next_level (bidi_it);
3312         }
3313
3314       /* Finally, deliver the next character in the new direction.  */
3315       next_level = bidi_level_of_next_char (bidi_it);
3316     }
3317
3318   /* Take note when we have just processed the newline that precedes
3319      the end of the paragraph.  The next time we are about to be
3320      called, set_iterator_to_next will automatically reinit the
3321      paragraph direction, if needed.  We do this at the newline before
3322      the paragraph separator, because the next character might not be
3323      the first character of the next paragraph, due to the bidi
3324      reordering, whereas we _must_ know the paragraph base direction
3325      _before_ we process the paragraph's text, since the base
3326      direction affects the reordering.  */
3327   if (bidi_it->scan_dir == 1
3328       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3329     {
3330       /* The paragraph direction of the entire string, once
3331          determined, is in effect for the entire string.  Setting the
3332          separator limit to the end of the string prevents
3333          bidi_paragraph_init from being called automatically on this
3334          string.  */
3335       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3336         bidi_it->separator_limit = bidi_it->string.schars;
3337       else if (bidi_it->bytepos < ZV_BYTE)
3338         {
3339           ptrdiff_t sep_len
3340             = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3341                                      bidi_it->bytepos + bidi_it->ch_len);
3342           if (bidi_it->nchars <= 0)
3343             emacs_abort ();
3344           if (sep_len >= 0)
3345             {
3346               bidi_it->new_paragraph = 1;
3347               /* Record the buffer position of the last character of the
3348                  paragraph separator.  */
3349               bidi_it->separator_limit
3350                 = bidi_it->charpos + bidi_it->nchars + sep_len;
3351             }
3352         }
3353     }
3354
3355   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3356     {
3357       /* If we are at paragraph's base embedding level and beyond the
3358          last cached position, the cache's job is done and we can
3359          discard it.  */
3360       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3361           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3362                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3363         bidi_cache_reset ();
3364         /* But as long as we are caching during forward scan, we must
3365            cache each state, or else the cache integrity will be
3366            compromised: it assumes cached states correspond to buffer
3367            positions 1:1.  */
3368       else
3369         bidi_cache_iterator_state (bidi_it, 1, 0);
3370     }
3371
3372   eassert (bidi_it->resolved_level >= 0
3373            && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3374
3375   if (STRINGP (bidi_it->string.lstring))
3376     UNGCPRO;
3377 }
3378
3379 /* This is meant to be called from within the debugger, whenever you
3380    wish to examine the cache contents.  */
3381 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3382 void
3383 bidi_dump_cached_states (void)
3384 {
3385   ptrdiff_t i;
3386   int ndigits = 1;
3387
3388   if (bidi_cache_idx == 0)
3389     {
3390       fprintf (stderr, "The cache is empty.\n");
3391       return;
3392     }
3393   fprintf (stderr, "Total of  %"pD"d state%s in cache:\n",
3394            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3395
3396   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3397     ndigits++;
3398   fputs ("ch  ", stderr);
3399   for (i = 0; i < bidi_cache_idx; i++)
3400     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3401   fputs ("\n", stderr);
3402   fputs ("lvl ", stderr);
3403   for (i = 0; i < bidi_cache_idx; i++)
3404     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3405   fputs ("\n", stderr);
3406   fputs ("pos ", stderr);
3407   for (i = 0; i < bidi_cache_idx; i++)
3408     fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3409   fputs ("\n", stderr);
3410 }