src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
   3    Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the Reference Implementation and most other implementations,
  26    this one is designed to be called once for every character in the
  27    buffer or string.  That way, we can leave intact the design of the
  28    Emacs display engine, whereby an iterator object is used to
  29    traverse buffer or string text character by character, and generate
  30    the necessary data for displaying each character in 'struct glyph'
  31    objects.  (See xdisp.c for the details of that iteration.)  The
  32    functions on this file replace the original linear iteration in the
  33    logical order of the text with a non-linear iteration in the visual
  34    order, i.e. in the order characters should be shown on display.
  35
  36    The main entry point is bidi_move_to_visually_next.  Each time it
  37    is called, it finds the next character in the visual order, and
  38    returns its information in a special structure.  The caller is then
  39    expected to process this character for display or any other
  40    purposes, and call bidi_move_to_visually_next for the next
  41    character.  See the comments in bidi_move_to_visually_next for more
  42    details about its algorithm that finds the next visual-order
  43    character by resolving their levels on the fly.
  44
  45    Two other entry points are bidi_paragraph_init and
  46    bidi_mirror_char.  The first determines the base direction of a
  47    paragraph, while the second returns the mirrored version of its
  48    argument character.
  49
  50    A few auxiliary entry points are used to initialize the bidi
  51    iterator for iterating an object (buffer or string), push and pop
  52    the bidi iterator state, and save and restore the state of the bidi
  53    cache.
  54
  55    If you want to understand the code, you will have to read it
  56    together with the relevant portions of UAX#9.  The comments include
  57    references to UAX#9 rules, for that very reason.
  58
  59    A note about references to UAX#9 rules: if the reference says
  60    something like "X9/Retaining", it means that you need to refer to
  61    rule X9 and to its modifications described in the "Implementation
  62    Notes" section of UAX#9, under "Retaining Format Codes".
  63
  64    Here's the overview of the design of the reordering engine
  65    implemented by this file.
  66
  67    Basic implementation structure
  68    ------------------------------
  69
  70    The sequential processing steps described by UAX#9 are implemented
  71    as recursive levels of processing, all of which examine the next
  72    character in the logical order.  This hierarchy of processing looks
  73    as follows, from the innermost (deepest) to the outermost level,
  74    omitting some subroutines used by each level:
  75
  76      bidi_fetch_char         -- fetch next character
  77      bidi_resolve_explicit   -- resolve explicit levels and directions
  78      bidi_resolve_weak       -- resolve weak types
  79      bidi_resolve_brackets   -- resolve "paired brackets" neutral types
  80      bidi_resolve_neutral    -- resolve neutral types
  81      bidi_level_of_next_char -- resolve implicit levels
  82
  83    Each level calls the level below it, and works on the result
  84    returned by the lower level, including all of its sub-levels.
  85
  86    Unlike all the levels below it, bidi_level_of_next_char can return
  87    the information about either the next or previous character in the
  88    logical order, depending on the current direction of scanning the
  89    buffer or string.  For the next character, it calls all the levels
  90    below it; for the previous character, it uses the cache, described
  91    below.
  92
  93    Thus, the result of calling bidi_level_of_next_char is the resolved
  94    level of the next or the previous character in the logical order.
  95    Based on this information, the function bidi_move_to_visually_next
  96    finds the next character in the visual order and updates the
  97    direction in which the buffer is scanned, either forward or
  98    backward, to find the next character to be displayed.  (Text is
  99    scanned backwards when it needs to be reversed for display, i.e. if
 100    the visual order is the inverse of the logical order.)  This
 101    implements the last, reordering steps of the UBA, by successively
 102    calling bidi_level_of_next_char until the character of the required
 103    embedding level is found; the scan direction is dynamically updated
 104    as a side effect.  See the commentary before the 'while' loop in
 105    bidi_move_to_visually_next, for the details.
 106
 107    Fetching characters
 108    -------------------
 109
 110    In a nutshell, fetching the next character boils down to calling
 111    STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
 112    string position.  See bidi_fetch_char.  However, if the next
 113    character is "covered" by a display property of some kind,
 114    bidi_fetch_char returns the u+FFFC "object replacement character"
 115    that represents the entire run of text covered by the display
 116    property.  (The ch_len and nchars members of 'struct bidi_it'
 117    reflect the length in bytes and characters of that text.)  This is
 118    so we reorder text on both sides of the display property as
 119    appropriate for an image or embedded string.  Similarly, text
 120    covered by a display spec of the form '(space ...)', is replaced
 121    with the u+2029 paragraph separator character, so such display
 122    specs produce the same effect as a TAB under UBA.  Both these
 123    special characters are not actually displayed -- the display
 124    property is displayed instead -- but just used to compute the
 125    embedding level of the surrounding text so as to produce the
 126    required effect.
 127
 128    Bidi iterator states
 129    --------------------
 130
 131    The UBA is highly context dependent in some of its parts,
 132    i.e. results of processing a character can generally depend on
 133    characters very far away.  The UAX#9 description of the UBA
 134    prescribes a stateful processing of each character, whereby the
 135    results of this processing depend on various state variables, such
 136    as the current embedding level, level stack, and directional
 137    override status.  In addition, the UAX#9 description includes many
 138    passages like this (from rule W2 in this case):
 139
 140      Search backward from each instance of a European number until the
 141      first strong type (R, L, AL, or sos) is found. If an AL is found,
 142      change the type of the European number to Arabic number.
 143
 144    To support this, we use a bidi iterator object, 'struct bidi_it',
 145    which is a sub-structure of 'struct it' used by xdisp.c (see
 146    dispextern.h for the definition of both of these structures).  The
 147    bidi iterator holds the entire state of the iteration required by
 148    the UBA, and is updated as the text is traversed.  In particular,
 149    the embedding level of the current character being resolved is
 150    recorded in the iterator state.  To avoid costly searches backward
 151    in support of rules like W2 above, the necessary character types
 152    are also recorded in the iterator state as they are found during
 153    the forward scan, and then used when such rules need to be applied.
 154    (Forward scans cannot be avoided in this way; they need to be
 155    performed at least once, and the results recorded in the iterator
 156    state, to be reused until the forward scan oversteps the recorded
 157    position.)
 158
 159    In this manner, the iterator state acts as a mini-cache of
 160    contextual information required for resolving the level of the
 161    current character by various UBA rules.
 162
 163    Caching of bidi iterator states
 164    -------------------------------
 165
 166    As described above, the reordering engine uses the information
 167    recorded in the bidi iterator state in order to resolve the
 168    embedding level of the current character.  When the reordering
 169    engine needs to process the next character in the logical order, it
 170    fetches it and applies to it all the UBA levels, updating the
 171    iterator state as it goes.  But when the buffer or string is
 172    scanned backwards, i.e. in the reverse order of buffer/string
 173    positions, the scanned characters were already processed during the
 174    preceding forward scan (see bidi_find_other_level_edge).  To avoid
 175    costly re-processing of characters that were already processed
 176    during the forward scan, the iterator states computed while
 177    scanning forward are cached.
 178
 179    The cache is just a linear array of 'struct bidi_it' objects, which
 180    is dynamically allocated and reallocated as needed, since the size
 181    of the cache depends on the text being processed.  We only need the
 182    cache while processing embedded levels higher than the base
 183    paragraph embedding level, because these higher levels require
 184    changes in scan direction.  Therefore, as soon as we are back to
 185    the base embedding level, we can free the cache; see the calls to
 186    bidi_cache_reset and bidi_cache_shrink, for the conditions to do
 187    this.
 188
 189    The cache maintains the index of the next unused cache slot -- this
 190    is where the next iterator state will be cached.  The function
 191    bidi_cache_iterator_state saves an instance of the state in the
 192    cache and increments the unused slot index.  The companion function
 193    bidi_cache_find looks up a cached state that corresponds to a given
 194    buffer/string position.  All of the cached states must correspond
 195    1:1 to the buffer or string region whose processing they reflect;
 196    bidi.c will abort if it finds cache slots that violate this 1:1
 197    correspondence.
 198
 199    When the parent iterator 'struct it' is pushed (see push_it in
 200    xdisp.c) to pause the current iteration and start iterating over a
 201    different object (e.g., a 'display' string that covers some buffer
 202    text), the bidi iterator cache needs to be "pushed" as well, so
 203    that a new empty cache could be used while iterating over the new
 204    object.  Later, when the new object is exhausted, and xdisp.c calls
 205    pop_it, we need to "pop" the bidi cache as well and return to the
 206    original cache.  See bidi_push_it and bidi_pop_it for how this is
 207    done.
 208
 209    Some functions of the display engine save copies of 'struct it' in
 210    local variables, and restore them later.  For examples, see
 211    pos_visible_p and move_it_in_display_line_to in xdisp.c, and
 212    window_scroll_pixel_based in window.c.  When this happens, we need
 213    to save and restore the bidi cache as well, because conceptually
 214    the cache is part of the 'struct it' state, and needs to be in
 215    perfect sync with the portion of the buffer/string that is being
 216    processed.  This saving and restoring of the cache state is handled
 217    by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
 218    SAVE_IT and RESTORE_IT defined on xdisp.c.
 219
 220    Note that, because reordering is implemented below the level in
 221    xdisp.c that breaks glyphs into screen lines, we are violating
 222    paragraph 3.4 of UAX#9. which mandates that line breaking shall be
 223    done before reordering each screen line separately.  However,
 224    following UAX#9 to the letter in this matter goes against the basic
 225    design of the Emacs display engine, and so we choose here this
 226    minor deviation from the UBA letter in preference to redesign of
 227    the display engine.  The effect of this is only seen in continued
 228    lines that are broken into screen lines in the middle of a run
 229    whose direction is opposite to the paragraph's base direction.
 230
 231    Important design and implementation note: when the code needs to
 232    scan far ahead, be sure to avoid such scans as much as possible
 233    when the buffer/string doesn't contain any RTL characters.  Users
 234    of left-to-right scripts will never forgive you if you introduce
 235    some slow-down due to bidi in situations that don't involve any
 236    bidirectional text.  See the large comment near the beginning of
 237    bidi_resolve_neutral, for one situation where such shortcut was
 238    necessary.  */
 239
 240 #include <config.h>
 241 #include <stdio.h>
 242
 243 #include "lisp.h"
 244 #include "character.h"
 245 #include "buffer.h"
 246 #include "dispextern.h"
 247 #include "region-cache.h"
 248
 249 static bool bidi_initialized = 0;
 250
 251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
 252
 253 #define BIDI_EOB   (-1)
 254
 255 /* Data type for describing the bidirectional character categories.  */
 256 typedef enum {
 257   UNKNOWN_BC,
 258   NEUTRAL,
 259   WEAK,
 260   STRONG,
 261   EXPLICIT_FORMATTING
 262 } bidi_category_t;
 263
 264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
 265 static Lisp_Object Qparagraph_start, Qparagraph_separate;
 266
 267 \f
 268 /***********************************************************************
 269                         Utilities
 270  ***********************************************************************/
 271
 272 /* Return the bidi type of a character CH, subject to the current
 273    directional OVERRIDE.  */
 274 static bidi_type_t
 275 bidi_get_type (int ch, bidi_dir_t override)
 276 {
 277   bidi_type_t default_type;
 278
 279   if (ch == BIDI_EOB)
 280     return NEUTRAL_B;
 281   if (ch < 0 || ch > MAX_CHAR)
 282     emacs_abort ();
 283
 284   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 285   /* Every valid character code, even those that are unassigned by the
 286      UCD, have some bidi-class property, according to
 287      DerivedBidiClass.txt file.  Therefore, if we ever get UNKNOWN_BT
 288      (= zero) code from CHAR_TABLE_REF, that's a bug.  */
 289   if (default_type == UNKNOWN_BT)
 290     emacs_abort ();
 291
 292   switch (default_type)
 293     {
 294       case WEAK_BN:
 295       case NEUTRAL_B:
 296       case LRE:
 297       case LRO:
 298       case RLE:
 299       case RLO:
 300       case PDF:
 301       case LRI:
 302       case RLI:
 303       case FSI:
 304       case PDI:
 305         return default_type;
 306       default:
 307         if (override == L2R)
 308           return STRONG_L;
 309         else if (override == R2L)
 310           return STRONG_R;
 311         else
 312           return default_type;
 313     }
 314 }
 315
 316 static void
 317 bidi_check_type (bidi_type_t type)
 318 {
 319   eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
 320 }
 321
 322 /* Given a bidi TYPE of a character, return its category.  */
 323 static bidi_category_t
 324 bidi_get_category (bidi_type_t type)
 325 {
 326   switch (type)
 327     {
 328       case UNKNOWN_BT:
 329         return UNKNOWN_BC;
 330       case STRONG_L:
 331       case STRONG_R:
 332       case STRONG_AL:
 333         return STRONG;
 334       case WEAK_EN:
 335       case WEAK_ES:
 336       case WEAK_ET:
 337       case WEAK_AN:
 338       case WEAK_CS:
 339       case WEAK_NSM:
 340       case WEAK_BN:
 341         return WEAK;
 342       case NEUTRAL_B:
 343       case NEUTRAL_S:
 344       case NEUTRAL_WS:
 345       case NEUTRAL_ON:
 346         return NEUTRAL;
 347       case LRE:
 348       case LRO:
 349       case RLE:
 350       case RLO:
 351       case PDF:
 352       case LRI:
 353       case RLI:
 354       case FSI:
 355       case PDI:
 356         return EXPLICIT_FORMATTING;
 357       default:
 358         emacs_abort ();
 359     }
 360 }
 361
 362 static bool
 363 bidi_isolate_fmt_char (bidi_type_t ch_type)
 364 {
 365   return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
 366 }
 367
 368 /* Return the mirrored character of C, if it has one.  If C has no
 369    mirrored counterpart, return C.
 370    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 371    context must be tested by the caller.  */
 372 int
 373 bidi_mirror_char (int c)
 374 {
 375   Lisp_Object val;
 376
 377   if (c == BIDI_EOB)
 378     return c;
 379   if (c < 0 || c > MAX_CHAR)
 380     emacs_abort ();
 381
 382   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 383   if (INTEGERP (val))
 384     {
 385       int v;
 386
 387       /* When debugging, check before assigning to V, so that the check
 388          isn't broken by undefined behavior due to int overflow.  */
 389       eassert (CHAR_VALID_P (XINT (val)));
 390
 391       v = XINT (val);
 392
 393       /* Minimal test we must do in optimized builds, to prevent weird
 394          crashes further down the road.  */
 395       if (v < 0 || v > MAX_CHAR)
 396         emacs_abort ();
 397
 398       return v;
 399     }
 400
 401   return c;
 402 }
 403
 404 /* Return the Bidi_Paired_Bracket_Type property of the character C.  */
 405 static bidi_bracket_type_t
 406 bidi_paired_bracket_type (int c)
 407 {
 408   if (c == BIDI_EOB)
 409     return BIDI_BRACKET_NONE;
 410   if (c < 0 || c > MAX_CHAR)
 411     emacs_abort ();
 412
 413   return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
 414 }
 415
 416 /* Determine the start-of-sequence (sos) directional type given the two
 417    embedding levels on either side of the run boundary.  Also, update
 418    the saved info about previously seen characters, since that info is
 419    generally valid for a single level run.  */
 420 static void
 421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
 422 {
 423   int higher_level = (level_before > level_after ? level_before : level_after);
 424
 425   /* FIXME: should the default sos direction be user selectable?  */
 426   bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
 427
 428   bidi_it->prev.type = UNKNOWN_BT;
 429   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
 430   bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
 431   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 432   bidi_it->next_for_neutral.type
 433     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 434 }
 435
 436 /* Push the current embedding level and override status; reset the
 437    current level to LEVEL and the current override status to OVERRIDE.  */
 438 static void
 439 bidi_push_embedding_level (struct bidi_it *bidi_it,
 440                            int level, bidi_dir_t override, bool isolate_status)
 441 {
 442   struct bidi_stack *st;
 443   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 444
 445   bidi_it->stack_idx++;
 446   eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
 447   st = &bidi_it->level_stack[bidi_it->stack_idx];
 448   eassert (level <= (1 << 7));
 449   st->level = level;
 450   st->override = override;
 451   st->isolate_status = isolate_status;
 452   if (isolate_status)
 453     {
 454       st->last_strong = bidi_it->last_strong;
 455       st->prev_for_neutral = bidi_it->prev_for_neutral;
 456       st->next_for_neutral = bidi_it->next_for_neutral;
 457       st->sos = bidi_it->sos;
 458     }
 459   /* We've got a new isolating sequence, compute the directional type
 460      of sos and initialize per-sequence variables (UAX#9, clause X10).  */
 461   bidi_set_sos_type (bidi_it, prev_level, level);
 462 }
 463
 464 /* Pop from the stack the embedding level, the directional override
 465    status, and optionally saved information for the isolating run
 466    sequence.  Return the new level.  */
 467 static int
 468 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 469 {
 470   int level;
 471
 472   /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
 473      and PDIs (X6a, 2nd bullet).  */
 474   if (bidi_it->stack_idx > 0)
 475     {
 476       bool isolate_status
 477         = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
 478       int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 479
 480       struct bidi_stack st;
 481
 482       st = bidi_it->level_stack[bidi_it->stack_idx];
 483       if (isolate_status)
 484         {
 485           /* PREV is used in W1 for resolving WEAK_NSM.  By the time
 486              we get to an NSM, we must have gotten past at least one
 487              character: the PDI that ends the isolate from which we
 488              are popping here.  So PREV will have been filled up by
 489              the time we first use it.  We initialize it here to
 490              UNKNOWN_BT to be able to catch any blunders in this
 491              logic.  */
 492           bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
 493           bidi_it->last_strong = st.last_strong;
 494           bidi_it->prev_for_neutral = st.prev_for_neutral;
 495           bidi_it->next_for_neutral = st.next_for_neutral;
 496           bidi_it->sos = st.sos;
 497         }
 498       else
 499         bidi_set_sos_type (bidi_it, old_level,
 500                            bidi_it->level_stack[bidi_it->stack_idx - 1].level);
 501
 502       bidi_it->stack_idx--;
 503     }
 504   level = bidi_it->level_stack[bidi_it->stack_idx].level;
 505   eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
 506   return level;
 507 }
 508
 509 /* Record in SAVED_INFO the information about the current character.  */
 510 static void
 511 bidi_remember_char (struct bidi_saved_info *saved_info,
 512                     struct bidi_it *bidi_it, bool from_type)
 513 {
 514   saved_info->charpos = bidi_it->charpos;
 515   if (from_type)
 516     saved_info->type = bidi_it->type;
 517   else
 518     saved_info->type = bidi_it->type_after_wn;
 519   bidi_check_type (saved_info->type);
 520   saved_info->orig_type = bidi_it->orig_type;
 521   bidi_check_type (saved_info->orig_type);
 522 }
 523
 524 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 525    copies the part of the level stack that is actually in use.  */
 526 static void
 527 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 528 {
 529   /* Copy everything from the start through the active part of
 530      the level stack.  */
 531   memcpy (to, from,
 532           (offsetof (struct bidi_it, level_stack[1])
 533            + from->stack_idx * sizeof from->level_stack[0]));
 534 }
 535
 536 \f
 537 /***********************************************************************
 538                         Caching the bidi iterator states
 539  ***********************************************************************/
 540
 541 /* We allocate and de-allocate the cache in chunks of this size (in
 542    characters).  200 was chosen as an upper limit for reasonably-long
 543    lines in a text file/buffer.  */
 544 #define BIDI_CACHE_CHUNK 200
 545 static struct bidi_it *bidi_cache;
 546 static ptrdiff_t bidi_cache_size = 0;
 547 enum { elsz = sizeof (struct bidi_it) };
 548 static ptrdiff_t bidi_cache_idx;        /* next unused cache slot */
 549 static ptrdiff_t bidi_cache_last_idx;   /* slot of last cache hit */
 550 static ptrdiff_t bidi_cache_start = 0;  /* start of cache for this
 551                                            "stack" level */
 552
 553 /* 5-slot stack for saving the start of the previous level of the
 554    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 555    and we need the same size of our stack.  */
 556 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
 557 static int bidi_cache_sp;
 558
 559 /* Size of header used by bidi_shelve_cache.  */
 560 enum
 561   {
 562     bidi_shelve_header_size
 563       = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
 564          + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
 565          + sizeof (bidi_cache_last_idx))
 566   };
 567
 568 /* Reset the cache state to the empty state.  We only reset the part
 569    of the cache relevant to iteration of the current object.  Previous
 570    objects, which are pushed on the display iterator's stack, are left
 571    intact.  This is called when the cached information is no more
 572    useful for the current iteration, e.g. when we were reseated to a
 573    new position on the same object.  */
 574 static void
 575 bidi_cache_reset (void)
 576 {
 577   bidi_cache_idx = bidi_cache_start;
 578   bidi_cache_last_idx = -1;
 579 }
 580
 581 /* Shrink the cache to its minimal size.  Called when we init the bidi
 582    iterator for reordering a buffer or a string that does not come
 583    from display properties, because that means all the previously
 584    cached info is of no further use.  */
 585 static void
 586 bidi_cache_shrink (void)
 587 {
 588   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 589     {
 590       bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
 591       bidi_cache_size = BIDI_CACHE_CHUNK;
 592     }
 593   bidi_cache_reset ();
 594 }
 595
 596 static void
 597 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
 598 {
 599   int current_scan_dir = bidi_it->scan_dir;
 600
 601   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 602     emacs_abort ();
 603
 604   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 605   bidi_it->scan_dir = current_scan_dir;
 606   bidi_cache_last_idx = idx;
 607 }
 608
 609 /* Find a cached state with a given CHARPOS and resolved embedding
 610    level less or equal to LEVEL.  If LEVEL is -1, disregard the
 611    resolved levels in cached states.  DIR, if non-zero, means search
 612    in that direction from the last cache hit.  */
 613 static ptrdiff_t
 614 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
 615 {
 616   ptrdiff_t i, i_start;
 617
 618   if (bidi_cache_idx > bidi_cache_start)
 619     {
 620       if (bidi_cache_last_idx == -1)
 621         bidi_cache_last_idx = bidi_cache_idx - 1;
 622       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 623         {
 624           dir = -1;
 625           i_start = bidi_cache_last_idx - 1;
 626         }
 627       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 628                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 629         {
 630           dir = 1;
 631           i_start = bidi_cache_last_idx + 1;
 632         }
 633       else if (dir)
 634         i_start = bidi_cache_last_idx;
 635       else
 636         {
 637           dir = -1;
 638           i_start = bidi_cache_idx - 1;
 639         }
 640
 641       if (dir < 0)
 642         {
 643           /* Linear search for now; FIXME!  */
 644           for (i = i_start; i >= bidi_cache_start; i--)
 645             if (bidi_cache[i].charpos <= charpos
 646                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 647                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 648               return i;
 649         }
 650       else
 651         {
 652           for (i = i_start; i < bidi_cache_idx; i++)
 653             if (bidi_cache[i].charpos <= charpos
 654                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 655                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 656               return i;
 657         }
 658     }
 659
 660   return -1;
 661 }
 662
 663 /* Find a cached state where the resolved level changes to a value
 664    that is lower than LEVEL, and return its cache slot index.  DIR is
 665    the direction to search, starting with the last used cache slot.
 666    If DIR is zero, we search backwards from the last occupied cache
 667    slot.  BEFORE means return the index of the slot that
 668    is ``before'' the level change in the search direction.  That is,
 669    given the cached levels like this:
 670
 671          1122333442211
 672           AB        C
 673
 674    and assuming we are at the position cached at the slot marked with
 675    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 676    index of slot B or A, depending whether BEFORE is, respectively,
 677    true or false.  */
 678 static ptrdiff_t
 679 bidi_cache_find_level_change (int level, int dir, bool before)
 680 {
 681   if (bidi_cache_idx)
 682     {
 683       ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 684       int incr = before ? 1 : 0;
 685
 686       eassert (!dir || bidi_cache_last_idx >= 0);
 687
 688       if (!dir)
 689         dir = -1;
 690       else if (!incr)
 691         i += dir;
 692
 693       if (dir < 0)
 694         {
 695           while (i >= bidi_cache_start + incr)
 696             {
 697               if (bidi_cache[i - incr].resolved_level >= 0
 698                   && bidi_cache[i - incr].resolved_level < level)
 699                 return i;
 700               i--;
 701             }
 702         }
 703       else
 704         {
 705           while (i < bidi_cache_idx - incr)
 706             {
 707               if (bidi_cache[i + incr].resolved_level >= 0
 708                   && bidi_cache[i + incr].resolved_level < level)
 709                 return i;
 710               i++;
 711             }
 712         }
 713     }
 714
 715   return -1;
 716 }
 717
 718 static void
 719 bidi_cache_ensure_space (ptrdiff_t idx)
 720 {
 721   /* Enlarge the cache as needed.  */
 722   if (idx >= bidi_cache_size)
 723     {
 724       /* The bidi cache cannot be larger than the largest Lisp string
 725          or buffer.  */
 726       ptrdiff_t string_or_buffer_bound
 727         = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
 728
 729       /* Also, it cannot be larger than what C can represent.  */
 730       ptrdiff_t c_bound
 731         = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
 732
 733       bidi_cache
 734         = xpalloc (bidi_cache, &bidi_cache_size,
 735                    max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
 736                    min (string_or_buffer_bound, c_bound), elsz);
 737     }
 738 }
 739
 740 static void
 741 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
 742                            bool update_only)
 743 {
 744   ptrdiff_t idx;
 745
 746   /* We should never cache on backward scans.  */
 747   if (bidi_it->scan_dir == -1)
 748     emacs_abort ();
 749   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 750
 751   if (idx < 0 && update_only)
 752     return;
 753
 754   if (idx < 0)
 755     {
 756       idx = bidi_cache_idx;
 757       bidi_cache_ensure_space (idx);
 758       /* Character positions should correspond to cache positions 1:1.
 759          If we are outside the range of cached positions, the cache is
 760          useless and must be reset.  */
 761       if (idx > bidi_cache_start &&
 762           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 763                                + bidi_cache[idx - 1].nchars)
 764            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 765         {
 766           bidi_cache_reset ();
 767           idx = bidi_cache_start;
 768         }
 769       if (bidi_it->nchars <= 0)
 770         emacs_abort ();
 771       bidi_copy_it (&bidi_cache[idx], bidi_it);
 772       if (!resolved)
 773         bidi_cache[idx].resolved_level = -1;
 774     }
 775   else
 776     {
 777       /* Copy only the members which could have changed, to avoid
 778          costly copying of the entire struct.  */
 779       bidi_cache[idx].type = bidi_it->type;
 780       bidi_check_type (bidi_it->type);
 781       bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
 782       bidi_check_type (bidi_it->type_after_wn);
 783       if (resolved)
 784         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 785       else
 786         bidi_cache[idx].resolved_level = -1;
 787       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 788       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 789       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 790       bidi_cache[idx].disp_pos = bidi_it->disp_pos;
 791       bidi_cache[idx].disp_prop = bidi_it->disp_prop;
 792       bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
 793       bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
 794     }
 795
 796   bidi_cache_last_idx = idx;
 797   if (idx >= bidi_cache_idx)
 798     bidi_cache_idx = idx + 1;
 799 }
 800
 801 /* Look for a cached iterator state that corresponds to CHARPOS.  If
 802    found, copy the cached state into BIDI_IT and return the type of
 803    the cached entry.  If not found, return UNKNOWN_BT.  NEUTRALS_OK
 804    non-zero means it is OK to return cached state for neutral
 805    characters that have no valid next_for_neutral member, and
 806    therefore cannot be resolved.  This can happen if the state was
 807    cached before it was resolved in bidi_resolve_neutral.  */
 808 static bidi_type_t
 809 bidi_cache_find (ptrdiff_t charpos, bool neutrals_ok, struct bidi_it *bidi_it)
 810 {
 811   ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
 812
 813   if (i >= bidi_cache_start
 814       && (neutrals_ok
 815           /* Callers that don't want to resolve neutrals (and set
 816              neutrals_ok = false) need to be sure that there's enough
 817              info in the cached state to resolve the neutrals and
 818              isolates, and if not, they don't want the cached state.  */
 819           || !(bidi_cache[i].resolved_level == -1
 820                && (bidi_get_category (bidi_cache[i].type) == NEUTRAL
 821                    || bidi_isolate_fmt_char (bidi_cache[i].type))
 822                && bidi_cache[i].next_for_neutral.type == UNKNOWN_BT)))
 823     {
 824       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 825
 826       bidi_copy_it (bidi_it, &bidi_cache[i]);
 827       bidi_cache_last_idx = i;
 828       /* Don't let scan direction from the cached state override
 829          the current scan direction.  */
 830       bidi_it->scan_dir = current_scan_dir;
 831       return bidi_it->type;
 832     }
 833
 834   return UNKNOWN_BT;
 835 }
 836
 837 static int
 838 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 839 {
 840   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 841     emacs_abort ();
 842   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 843 }
 844
 845 \f
 846 /***********************************************************************
 847              Pushing and popping the bidi iterator state
 848  ***********************************************************************/
 849
 850 /* Push the bidi iterator state in preparation for reordering a
 851    different object, e.g. display string found at certain buffer
 852    position.  Pushing the bidi iterator boils down to saving its
 853    entire state on the cache and starting a new cache "stacked" on top
 854    of the current cache.  */
 855 void
 856 bidi_push_it (struct bidi_it *bidi_it)
 857 {
 858   /* Save the current iterator state in its entirety after the last
 859      used cache slot.  */
 860   bidi_cache_ensure_space (bidi_cache_idx);
 861   bidi_cache[bidi_cache_idx++] = *bidi_it;
 862
 863   /* Push the current cache start onto the stack.  */
 864   eassert (bidi_cache_sp < IT_STACK_SIZE);
 865   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 866
 867   /* Start a new level of cache, and make it empty.  */
 868   bidi_cache_start = bidi_cache_idx;
 869   bidi_cache_last_idx = -1;
 870 }
 871
 872 /* Restore the iterator state saved by bidi_push_it and return the
 873    cache to the corresponding state.  */
 874 void
 875 bidi_pop_it (struct bidi_it *bidi_it)
 876 {
 877   if (bidi_cache_start <= 0)
 878     emacs_abort ();
 879
 880   /* Reset the next free cache slot index to what it was before the
 881      call to bidi_push_it.  */
 882   bidi_cache_idx = bidi_cache_start - 1;
 883
 884   /* Restore the bidi iterator state saved in the cache.  */
 885   *bidi_it = bidi_cache[bidi_cache_idx];
 886
 887   /* Pop the previous cache start from the stack.  */
 888   if (bidi_cache_sp <= 0)
 889     emacs_abort ();
 890   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 891
 892   /* Invalidate the last-used cache slot data.  */
 893   bidi_cache_last_idx = -1;
 894 }
 895
 896 static ptrdiff_t bidi_cache_total_alloc;
 897
 898 /* Stash away a copy of the cache and its control variables.  */
 899 void *
 900 bidi_shelve_cache (void)
 901 {
 902   unsigned char *databuf;
 903   ptrdiff_t alloc;
 904
 905   /* Empty cache.  */
 906   if (bidi_cache_idx == 0)
 907     return NULL;
 908
 909   alloc = (bidi_shelve_header_size
 910            + bidi_cache_idx * sizeof (struct bidi_it));
 911   databuf = xmalloc (alloc);
 912   bidi_cache_total_alloc += alloc;
 913
 914   memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
 915   memcpy (databuf + sizeof (bidi_cache_idx),
 916           bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
 917   memcpy (databuf + sizeof (bidi_cache_idx)
 918           + bidi_cache_idx * sizeof (struct bidi_it),
 919           bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
 920   memcpy (databuf + sizeof (bidi_cache_idx)
 921           + bidi_cache_idx * sizeof (struct bidi_it)
 922           + sizeof (bidi_cache_start_stack),
 923           &bidi_cache_sp, sizeof (bidi_cache_sp));
 924   memcpy (databuf + sizeof (bidi_cache_idx)
 925           + bidi_cache_idx * sizeof (struct bidi_it)
 926           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 927           &bidi_cache_start, sizeof (bidi_cache_start));
 928   memcpy (databuf + sizeof (bidi_cache_idx)
 929           + bidi_cache_idx * sizeof (struct bidi_it)
 930           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 931           + sizeof (bidi_cache_start),
 932           &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
 933
 934   return databuf;
 935 }
 936
 937 /* Restore the cache state from a copy stashed away by
 938    bidi_shelve_cache, and free the buffer used to stash that copy.
 939    JUST_FREE means free the buffer, but don't restore the
 940    cache; used when the corresponding iterator is discarded instead of
 941    being restored.  */
 942 void
 943 bidi_unshelve_cache (void *databuf, bool just_free)
 944 {
 945   unsigned char *p = databuf;
 946
 947   if (!p)
 948     {
 949       if (!just_free)
 950         {
 951           /* A NULL pointer means an empty cache.  */
 952           bidi_cache_start = 0;
 953           bidi_cache_sp = 0;
 954           bidi_cache_reset ();
 955         }
 956     }
 957   else
 958     {
 959       if (just_free)
 960         {
 961           ptrdiff_t idx;
 962
 963           memcpy (&idx, p, sizeof (bidi_cache_idx));
 964           bidi_cache_total_alloc
 965             -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
 966         }
 967       else
 968         {
 969           memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
 970           bidi_cache_ensure_space (bidi_cache_idx);
 971           memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
 972                   bidi_cache_idx * sizeof (struct bidi_it));
 973           memcpy (bidi_cache_start_stack,
 974                   p + sizeof (bidi_cache_idx)
 975                   + bidi_cache_idx * sizeof (struct bidi_it),
 976                   sizeof (bidi_cache_start_stack));
 977           memcpy (&bidi_cache_sp,
 978                   p + sizeof (bidi_cache_idx)
 979                   + bidi_cache_idx * sizeof (struct bidi_it)
 980                   + sizeof (bidi_cache_start_stack),
 981                   sizeof (bidi_cache_sp));
 982           memcpy (&bidi_cache_start,
 983                   p + sizeof (bidi_cache_idx)
 984                   + bidi_cache_idx * sizeof (struct bidi_it)
 985                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 986                   sizeof (bidi_cache_start));
 987           memcpy (&bidi_cache_last_idx,
 988                   p + sizeof (bidi_cache_idx)
 989                   + bidi_cache_idx * sizeof (struct bidi_it)
 990                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 991                   + sizeof (bidi_cache_start),
 992                   sizeof (bidi_cache_last_idx));
 993           bidi_cache_total_alloc
 994             -= (bidi_shelve_header_size
 995                 + bidi_cache_idx * sizeof (struct bidi_it));
 996         }
 997
 998       xfree (p);
 999     }
1000 }
1001
1002 \f
1003 /***********************************************************************
1004                         Initialization
1005  ***********************************************************************/
1006 static void
1007 bidi_initialize (void)
1008 {
1009   bidi_type_table = uniprop_table (intern ("bidi-class"));
1010   if (NILP (bidi_type_table))
1011     emacs_abort ();
1012   staticpro (&bidi_type_table);
1013
1014   bidi_mirror_table = uniprop_table (intern ("mirroring"));
1015   if (NILP (bidi_mirror_table))
1016     emacs_abort ();
1017   staticpro (&bidi_mirror_table);
1018
1019   bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1020   if (NILP (bidi_brackets_table))
1021     emacs_abort ();
1022   staticpro (&bidi_brackets_table);
1023
1024   Qparagraph_start = intern ("paragraph-start");
1025   staticpro (&Qparagraph_start);
1026   paragraph_start_re = Fsymbol_value (Qparagraph_start);
1027   if (!STRINGP (paragraph_start_re))
1028     paragraph_start_re = build_string ("\f\\|[ \t]*$");
1029   staticpro (&paragraph_start_re);
1030   Qparagraph_separate = intern ("paragraph-separate");
1031   staticpro (&Qparagraph_separate);
1032   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
1033   if (!STRINGP (paragraph_separate_re))
1034     paragraph_separate_re = build_string ("[ \t\f]*$");
1035   staticpro (&paragraph_separate_re);
1036
1037   bidi_cache_sp = 0;
1038   bidi_cache_total_alloc = 0;
1039
1040   bidi_initialized = 1;
1041 }
1042
1043 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1044    end.  */
1045 static void
1046 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1047 {
1048   bidi_it->invalid_levels = 0;
1049   bidi_it->invalid_isolates = 0;
1050   bidi_it->stack_idx = 0;
1051   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1052 }
1053
1054 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
1055 void
1056 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1057               struct bidi_it *bidi_it)
1058 {
1059   if (! bidi_initialized)
1060     bidi_initialize ();
1061   if (charpos >= 0)
1062     bidi_it->charpos = charpos;
1063   if (bytepos >= 0)
1064     bidi_it->bytepos = bytepos;
1065   bidi_it->frame_window_p = frame_window_p;
1066   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1067   bidi_it->first_elt = 1;
1068   bidi_set_paragraph_end (bidi_it);
1069   bidi_it->new_paragraph = 1;
1070   bidi_it->separator_limit = -1;
1071   bidi_it->type = NEUTRAL_B;
1072   bidi_it->type_after_wn = NEUTRAL_B;
1073   bidi_it->orig_type = NEUTRAL_B;
1074   /* FIXME: Review this!!! */
1075   bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1076   bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1077   bidi_it->next_for_neutral.charpos = -1;
1078   bidi_it->next_for_neutral.type
1079     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1080   bidi_it->prev_for_neutral.charpos = -1;
1081   bidi_it->prev_for_neutral.type
1082     = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1083   bidi_it->sos = L2R;    /* FIXME: should it be user-selectable? */
1084   bidi_it->disp_pos = -1;       /* invalid/unknown */
1085   bidi_it->disp_prop = 0;
1086   /* We can only shrink the cache if we are at the bottom level of its
1087      "stack".  */
1088   if (bidi_cache_start == 0)
1089     bidi_cache_shrink ();
1090   else
1091     bidi_cache_reset ();
1092 }
1093
1094 /* Perform initializations for reordering a new line of bidi text.  */
1095 static void
1096 bidi_line_init (struct bidi_it *bidi_it)
1097 {
1098   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1099   bidi_it->stack_idx = 0;
1100   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1101   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1102   bidi_it->level_stack[0].isolate_status = false; /* X1 */
1103   bidi_it->invalid_levels = 0;
1104   bidi_it->isolate_level = 0;    /* X1 */
1105   bidi_it->invalid_isolates = 0; /* X1 */
1106   /* Setting this to zero will force its recomputation the first time
1107      we need it for W5.  */
1108   bidi_it->next_en_pos = 0;
1109   bidi_it->next_en_type = UNKNOWN_BT;
1110   bidi_it->next_for_ws.charpos = -1;
1111   bidi_it->next_for_ws.type = UNKNOWN_BT;
1112   bidi_set_sos_type (bidi_it,
1113                      (bidi_it->paragraph_dir == R2L ? 1 : 0),
1114                      bidi_it->level_stack[0].level); /* X10 */
1115
1116   bidi_cache_reset ();
1117 }
1118
1119 \f
1120 /***********************************************************************
1121                         Fetching characters
1122  ***********************************************************************/
1123
1124 /* Count bytes in string S between BEG/BEGBYTE and END.  BEG and END
1125    are zero-based character positions in S, BEGBYTE is byte position
1126    corresponding to BEG.  UNIBYTE means S is a unibyte string.  */
1127 static ptrdiff_t
1128 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1129                   ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1130 {
1131   ptrdiff_t pos = beg;
1132   const unsigned char *p = s + begbyte, *start = p;
1133
1134   if (unibyte)
1135     p = s + end;
1136   else
1137     {
1138       if (!CHAR_HEAD_P (*p))
1139         emacs_abort ();
1140
1141       while (pos < end)
1142         {
1143           p += BYTES_BY_CHAR_HEAD (*p);
1144           pos++;
1145         }
1146     }
1147
1148   return p - start;
1149 }
1150
1151 /* Fetch and return the character at byte position BYTEPOS.  If S is
1152    non-NULL, fetch the character from string S; otherwise fetch the
1153    character from the current buffer.  UNIBYTE means S is a
1154    unibyte string.  */
1155 static int
1156 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1157 {
1158   if (s)
1159     {
1160       s += bytepos;
1161       if (unibyte)
1162         return *s;
1163     }
1164   else
1165     s = BYTE_POS_ADDR (bytepos);
1166   return STRING_CHAR (s);
1167 }
1168
1169 /* Fetch and return the character at CHARPOS/BYTEPOS.  If that
1170    character is covered by a display string, treat the entire run of
1171    covered characters as a single character, either u+2029 or u+FFFC,
1172    and return their combined length in CH_LEN and NCHARS.  DISP_POS
1173    specifies the character position of the next display string, or -1
1174    if not yet computed.  When the next character is at or beyond that
1175    position, the function updates DISP_POS with the position of the
1176    next display string.  *DISP_PROP non-zero means that there's really
1177    a display string at DISP_POS, as opposed to when we searched till
1178    DISP_POS without finding one.  If *DISP_PROP is 2, it means the
1179    display spec is of the form `(space ...)', which is replaced with
1180    u+2029 to handle it as a paragraph separator.  STRING->s is the C
1181    string to iterate, or NULL if iterating over a buffer or a Lisp
1182    string; in the latter case, STRING->lstring is the Lisp string.  */
1183 static int
1184 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1185                  int *disp_prop, struct bidi_string_data *string,
1186                  struct window *w,
1187                  bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1188 {
1189   int ch;
1190   ptrdiff_t endpos
1191     = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1192   struct text_pos pos;
1193   int len;
1194
1195   /* If we got past the last known position of display string, compute
1196      the position of the next one.  That position could be at CHARPOS.  */
1197   if (charpos < endpos && charpos > *disp_pos)
1198     {
1199       SET_TEXT_POS (pos, charpos, bytepos);
1200       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1201                                               disp_prop);
1202     }
1203
1204   /* Fetch the character at BYTEPOS.  */
1205   if (charpos >= endpos)
1206     {
1207       ch = BIDI_EOB;
1208       *ch_len = 1;
1209       *nchars = 1;
1210       *disp_pos = endpos;
1211       *disp_prop = 0;
1212     }
1213   else if (charpos >= *disp_pos && *disp_prop)
1214     {
1215       ptrdiff_t disp_end_pos;
1216
1217       /* We don't expect to find ourselves in the middle of a display
1218          property.  Hopefully, it will never be needed.  */
1219       if (charpos > *disp_pos)
1220         emacs_abort ();
1221       /* Text covered by `display' properties and overlays with
1222          display properties or display strings is handled as a single
1223          character that represents the entire run of characters
1224          covered by the display property.  */
1225       if (*disp_prop == 2)
1226         {
1227           /* `(space ...)' display specs are handled as paragraph
1228              separators for the purposes of the reordering; see UAX#9
1229              section 3 and clause HL1 in section 4.3 there.  */
1230           ch = 0x2029;
1231         }
1232       else
1233         {
1234           /* All other display specs are handled as the Unicode Object
1235              Replacement Character.  */
1236           ch = 0xFFFC;
1237         }
1238       disp_end_pos = compute_display_string_end (*disp_pos, string);
1239       if (disp_end_pos < 0)
1240         {
1241           /* Somebody removed the display string from the buffer
1242              behind our back.  Recover by processing this buffer
1243              position as if no display property were present there to
1244              begin with.  */
1245           *disp_prop = 0;
1246           goto normal_char;
1247         }
1248       *nchars = disp_end_pos - *disp_pos;
1249       if (*nchars <= 0)
1250         emacs_abort ();
1251       if (string->s)
1252         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1253                                     disp_end_pos, string->unibyte);
1254       else if (STRINGP (string->lstring))
1255         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1256                                     bytepos, disp_end_pos, string->unibyte);
1257       else
1258         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1259     }
1260   else
1261     {
1262     normal_char:
1263       if (string->s)
1264         {
1265
1266           if (!string->unibyte)
1267             {
1268               ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1269               *ch_len = len;
1270             }
1271           else
1272             {
1273               ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1274               *ch_len = 1;
1275             }
1276         }
1277       else if (STRINGP (string->lstring))
1278         {
1279           if (!string->unibyte)
1280             {
1281               ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1282                                            len);
1283               *ch_len = len;
1284             }
1285           else
1286             {
1287               ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1288               *ch_len = 1;
1289             }
1290         }
1291       else
1292         {
1293           ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1294           *ch_len = len;
1295         }
1296       *nchars = 1;
1297     }
1298
1299   /* If we just entered a run of characters covered by a display
1300      string, compute the position of the next display string.  */
1301   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1302       && *disp_prop)
1303     {
1304       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1305       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1306                                               disp_prop);
1307     }
1308
1309   return ch;
1310 }
1311
1312 /* Like bidi_fetch_char, but ignore any text between an isolate
1313    initiator and its matching PDI or, if it has no matching PDI, the
1314    end of the paragraph.  If isolates were skipped, CH_LEN and NCHARS
1315    are set to the number of bytes and characters between BYTEPOS/CHARPOS
1316    and the character that was fetched after skipping the isolates.  */
1317 static int
1318 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1319                                ptrdiff_t *disp_pos, int *disp_prop,
1320                                struct bidi_string_data *string,
1321                                struct window *w, bool frame_window_p,
1322                                ptrdiff_t *ch_len, ptrdiff_t *nchars)
1323 {
1324   ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1325   int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1326                             frame_window_p, ch_len, nchars);
1327   bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1328   ptrdiff_t level = 0;
1329
1330   if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1331     {
1332       level++;
1333       while (level > 0 && ch_type != NEUTRAL_B)
1334         {
1335           charpos += *nchars;
1336           bytepos += *ch_len;
1337           ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1338                                 w, frame_window_p, ch_len, nchars);
1339           ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1340           /* A Note to P2 says to ignore max_depth limit.  */
1341           if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1342             level++;
1343           else if (ch_type == PDI)
1344             level--;
1345         }
1346     }
1347
1348   /* Communicate to the caller how much did we skip, so it could get
1349      past the last character position we examined.  */
1350   *nchars += charpos - orig_charpos;
1351   *ch_len += bytepos - orig_bytepos;
1352   return ch;
1353 }
1354
1355
1356 \f
1357 /***********************************************************************
1358                         Determining paragraph direction
1359  ***********************************************************************/
1360
1361 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1362    Value is the non-negative length of the paragraph separator
1363    following the buffer position, -1 if position is at the beginning
1364    of a new paragraph, or -2 if position is neither at beginning nor
1365    at end of a paragraph.  */
1366 static ptrdiff_t
1367 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1368 {
1369   Lisp_Object sep_re;
1370   Lisp_Object start_re;
1371   ptrdiff_t val;
1372
1373   sep_re = paragraph_separate_re;
1374   start_re = paragraph_start_re;
1375
1376   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1377   if (val < 0)
1378     {
1379       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1380         val = -1;
1381       else
1382         val = -2;
1383     }
1384
1385   return val;
1386 }
1387
1388 /* If the user has requested the long scans caching, make sure that
1389    BIDI cache is enabled.  Otherwise, make sure it's disabled.  */
1390
1391 static struct region_cache *
1392 bidi_paragraph_cache_on_off (void)
1393 {
1394   struct buffer *cache_buffer = current_buffer;
1395   bool indirect_p = false;
1396
1397   /* For indirect buffers, make sure to use the cache of their base
1398      buffer.  */
1399   if (cache_buffer->base_buffer)
1400     {
1401       cache_buffer = cache_buffer->base_buffer;
1402       indirect_p = true;
1403     }
1404
1405   /* Don't turn on or off the cache in the base buffer, if the value
1406      of cache-long-scans of the base buffer is inconsistent with that.
1407      This is because doing so will just make the cache pure overhead,
1408      since if we turn it on via indirect buffer, it will be
1409      immediately turned off by its base buffer.  */
1410   if (NILP (BVAR (current_buffer, cache_long_scans)))
1411     {
1412       if (!indirect_p
1413           || NILP (BVAR (cache_buffer, cache_long_scans)))
1414         {
1415           if (cache_buffer->bidi_paragraph_cache)
1416             {
1417               free_region_cache (cache_buffer->bidi_paragraph_cache);
1418               cache_buffer->bidi_paragraph_cache = 0;
1419             }
1420         }
1421       return NULL;
1422     }
1423   else
1424     {
1425       if (!indirect_p
1426           || !NILP (BVAR (cache_buffer, cache_long_scans)))
1427         {
1428           if (!cache_buffer->bidi_paragraph_cache)
1429             cache_buffer->bidi_paragraph_cache = new_region_cache ();
1430         }
1431       return cache_buffer->bidi_paragraph_cache;
1432     }
1433 }
1434
1435 /* On my 2005-vintage machine, searching back for paragraph start
1436    takes ~1 ms per line.  And bidi_paragraph_init is called 4 times
1437    when user types C-p.  The number below limits each call to
1438    bidi_paragraph_init to about 10 ms.  */
1439 #define MAX_PARAGRAPH_SEARCH 7500
1440
1441 /* Find the beginning of this paragraph by looking back in the buffer.
1442    Value is the byte position of the paragraph's beginning, or
1443    BEGV_BYTE if paragraph_start_re is still not found after looking
1444    back MAX_PARAGRAPH_SEARCH lines in the buffer.  */
1445 static ptrdiff_t
1446 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1447 {
1448   Lisp_Object re = paragraph_start_re;
1449   ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1450   struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1451   ptrdiff_t n = 0, oldpos = pos, next;
1452   struct buffer *cache_buffer = current_buffer;
1453
1454   if (cache_buffer->base_buffer)
1455     cache_buffer = cache_buffer->base_buffer;
1456
1457   while (pos_byte > BEGV_BYTE
1458          && n++ < MAX_PARAGRAPH_SEARCH
1459          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1460     {
1461       /* FIXME: What if the paragraph beginning is covered by a
1462          display string?  And what if a display string covering some
1463          of the text over which we scan back includes
1464          paragraph_start_re?  */
1465       DEC_BOTH (pos, pos_byte);
1466       if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1467         {
1468           pos = next, pos_byte = CHAR_TO_BYTE (pos);
1469           break;
1470         }
1471       else
1472         pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1473     }
1474   if (n >= MAX_PARAGRAPH_SEARCH)
1475     pos = BEGV, pos_byte = BEGV_BYTE;
1476   if (bpc)
1477     know_region_cache (cache_buffer, bpc, pos, oldpos);
1478   /* Positions returned by the region cache are not limited to
1479      BEGV..ZV range, so we limit them here.  */
1480   pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1481   return pos_byte;
1482 }
1483
1484 /* On a 3.4 GHz machine, searching forward for a strong directional
1485    character in a long paragraph full of weaks or neutrals takes about
1486    1 ms for each 20K characters.  The number below limits each call to
1487    bidi_paragraph_init to less than 10 ms even on slow machines.  */
1488 #define MAX_STRONG_CHAR_SEARCH 100000
1489
1490 /* Starting from POS, find the first strong (L, R, or AL) character,
1491    while skipping over any characters between an isolate initiator and
1492    its matching PDI.  STOP_AT_PDI non-zero means stop at the PDI that
1493    matches the isolate initiator at POS.  Return the bidi type of the
1494    character where the search stopped.  Give up if after examining
1495    MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1496    character was found.  */
1497 static bidi_type_t
1498 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1499                         ptrdiff_t *disp_pos, int *disp_prop,
1500                         struct bidi_string_data *string, struct window *w,
1501                         bool string_p, bool frame_window_p,
1502                         ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1503 {
1504   ptrdiff_t pos1;
1505   bidi_type_t type;
1506   int ch;
1507
1508   if (stop_at_pdi)
1509     {
1510       /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1511          at POS.  Get past it.  */
1512 #ifdef ENABLE_CHECKING
1513       ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1514                             frame_window_p, ch_len, nchars);
1515       type = bidi_get_type (ch, NEUTRAL_DIR);
1516       eassert (type == FSI /* || type == LRI || type == RLI */);
1517 #endif
1518       pos += *nchars;
1519       bytepos += *ch_len;
1520     }
1521   ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1522                                       w, frame_window_p, ch_len, nchars);
1523   type = bidi_get_type (ch, NEUTRAL_DIR);
1524
1525   pos1 = pos;
1526   for (pos += *nchars, bytepos += *ch_len;
1527        bidi_get_category (type) != STRONG
1528          /* If requested to stop at first PDI, stop there.  */
1529          && !(stop_at_pdi && type == PDI)
1530          /* Stop when searched too far into an abnormally large
1531             paragraph full of weak or neutral characters.  */
1532          && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1533        type = bidi_get_type (ch, NEUTRAL_DIR))
1534     {
1535       if (pos >= end)
1536         {
1537           /* Pretend there's a paragraph separator at end of
1538              buffer/string.  */
1539           type = NEUTRAL_B;
1540           break;
1541         }
1542       if (!string_p
1543           && type == NEUTRAL_B
1544           && bidi_at_paragraph_end (pos, bytepos) >= -1)
1545         break;
1546       /* Fetch next character and advance to get past it.  */
1547       ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1548                                           string, w, frame_window_p,
1549                                           ch_len, nchars);
1550       pos += *nchars;
1551       bytepos += *ch_len;
1552     }
1553   return type;
1554 }
1555
1556 /* Determine the base direction, a.k.a. base embedding level, of the
1557    paragraph we are about to iterate through.  If DIR is either L2R or
1558    R2L, just use that.  Otherwise, determine the paragraph direction
1559    from the first strong directional character of the paragraph.
1560
1561    NO_DEFAULT_P means don't default to L2R if the paragraph
1562    has no strong directional characters and both DIR and
1563    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
1564    in the buffer until a paragraph is found with a strong character,
1565    or until hitting BEGV.  In the latter case, fall back to L2R.  This
1566    flag is used in current-bidi-paragraph-direction.
1567
1568    Note that this function gives the paragraph separator the same
1569    direction as the preceding paragraph, even though Emacs generally
1570    views the separator as not belonging to any paragraph.  */
1571 void
1572 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1573 {
1574   ptrdiff_t bytepos = bidi_it->bytepos;
1575   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1576   ptrdiff_t pstartbyte;
1577   /* Note that begbyte is a byte position, while end is a character
1578      position.  Yes, this is ugly, but we are trying to avoid costly
1579      calls to BYTE_TO_CHAR and its ilk.  */
1580   ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1581   ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1582
1583   /* Special case for an empty buffer. */
1584   if (bytepos == begbyte && bidi_it->charpos == end)
1585     dir = L2R;
1586   /* We should never be called at EOB or before BEGV.  */
1587   else if (bidi_it->charpos >= end || bytepos < begbyte)
1588     emacs_abort ();
1589
1590   if (dir == L2R)
1591     {
1592       bidi_it->paragraph_dir = L2R;
1593       bidi_it->new_paragraph = 0;
1594     }
1595   else if (dir == R2L)
1596     {
1597       bidi_it->paragraph_dir = R2L;
1598       bidi_it->new_paragraph = 0;
1599     }
1600   else if (dir == NEUTRAL_DIR)  /* P2 */
1601     {
1602       ptrdiff_t ch_len, nchars;
1603       ptrdiff_t pos, disp_pos = -1;
1604       int disp_prop = 0;
1605       bidi_type_t type;
1606       const unsigned char *s;
1607
1608       if (!bidi_initialized)
1609         bidi_initialize ();
1610
1611       /* If we are inside a paragraph separator, we are just waiting
1612          for the separator to be exhausted; use the previous paragraph
1613          direction.  But don't do that if we have been just reseated,
1614          because we need to reinitialize below in that case.  */
1615       if (!bidi_it->first_elt
1616           && bidi_it->charpos < bidi_it->separator_limit)
1617         return;
1618
1619       /* If we are on a newline, get past it to where the next
1620          paragraph might start.  But don't do that at BEGV since then
1621          we are potentially in a new paragraph that doesn't yet
1622          exist.  */
1623       pos = bidi_it->charpos;
1624       s = (STRINGP (bidi_it->string.lstring)
1625            ? SDATA (bidi_it->string.lstring)
1626            : bidi_it->string.s);
1627       if (bytepos > begbyte
1628           && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1629         {
1630           bytepos++;
1631           pos++;
1632         }
1633
1634       /* We are either at the beginning of a paragraph or in the
1635          middle of it.  Find where this paragraph starts.  */
1636       if (string_p)
1637         {
1638           /* We don't support changes of paragraph direction inside a
1639              string.  It is treated as a single paragraph.  */
1640           pstartbyte = 0;
1641         }
1642       else
1643         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1644       bidi_it->separator_limit = -1;
1645       bidi_it->new_paragraph = 0;
1646
1647       /* The following loop is run more than once only if NO_DEFAULT_P,
1648          and only if we are iterating on a buffer.  */
1649       do {
1650         bytepos = pstartbyte;
1651         if (!string_p)
1652           pos = BYTE_TO_CHAR (bytepos);
1653         type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1654                                        &bidi_it->string, bidi_it->w,
1655                                        string_p, bidi_it->frame_window_p,
1656                                        &ch_len, &nchars, false);
1657         if (type == STRONG_R || type == STRONG_AL) /* P3 */
1658           bidi_it->paragraph_dir = R2L;
1659         else if (type == STRONG_L)
1660           bidi_it->paragraph_dir = L2R;
1661         if (!string_p
1662             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1663           {
1664             /* If this paragraph is at BEGV, default to L2R.  */
1665             if (pstartbyte == BEGV_BYTE)
1666               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1667             else
1668               {
1669                 ptrdiff_t prevpbyte = pstartbyte;
1670                 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1671
1672                 /* Find the beginning of the previous paragraph, if any.  */
1673                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1674                   {
1675                     /* FXIME: What if p is covered by a display
1676                        string?  See also a FIXME inside
1677                        bidi_find_paragraph_start.  */
1678                     DEC_BOTH (p, pbyte);
1679                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1680                   }
1681                 pstartbyte = prevpbyte;
1682               }
1683           }
1684       } while (!string_p
1685                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1686     }
1687   else
1688     emacs_abort ();
1689
1690   /* Contrary to UAX#9 clause P3, we only default the paragraph
1691      direction to L2R if we have no previous usable paragraph
1692      direction.  This is allowed by the HL1 clause.  */
1693   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1694     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1695   if (bidi_it->paragraph_dir == R2L)
1696     bidi_it->level_stack[0].level = 1;
1697   else
1698     bidi_it->level_stack[0].level = 0;
1699
1700   bidi_line_init (bidi_it);
1701 }
1702
1703 \f
1704 /***********************************************************************
1705                  Resolving explicit and implicit levels.
1706   The rest of this file constitutes the core of the UBA implementation.
1707  ***********************************************************************/
1708
1709 static bool
1710 bidi_explicit_dir_char (int ch)
1711 {
1712   bidi_type_t ch_type;
1713
1714   if (!bidi_initialized)
1715     emacs_abort ();
1716   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1717   return (ch_type == LRE || ch_type == LRO
1718           || ch_type == RLE || ch_type == RLO
1719           || ch_type == PDF);
1720 }
1721
1722 /* Given an iterator state in BIDI_IT, advance one character position
1723    in the buffer/string to the next character (in the logical order),
1724    resolve any explicit embeddings, directional overrides, and isolate
1725    initiators and terminators, and return the embedding level of the
1726    character after resolving these explicit directives.  */
1727 static int
1728 bidi_resolve_explicit (struct bidi_it *bidi_it)
1729 {
1730   int curchar;
1731   bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1732   int current_level;
1733   int new_level;
1734   bidi_dir_t override;
1735   bool isolate_status;
1736   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1737   ptrdiff_t ch_len, nchars, disp_pos, end;
1738   int disp_prop;
1739
1740   /* Record the info about the previous character.  */
1741   if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1742       && bidi_it->type != WEAK_BN)
1743     {
1744       /* This special case is needed in support of Unicode 8.0
1745          correction to N0, as implemented in bidi_resolve_weak/W1
1746          below.  */
1747       if (bidi_it->type_after_wn == NEUTRAL_ON
1748           && bidi_get_category (bidi_it->type) == STRONG
1749           && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1750         bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1751       else
1752         bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1753     }
1754   if (bidi_it->type_after_wn == STRONG_R
1755       || bidi_it->type_after_wn == STRONG_L
1756       || bidi_it->type_after_wn == STRONG_AL)
1757     bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1758   if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1759       || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1760     bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1761
1762   /* If we overstepped the characters used for resolving neutrals
1763      and whitespace, invalidate their info in the iterator.  */
1764   if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1765     bidi_it->next_for_neutral.type = UNKNOWN_BT;
1766   if (bidi_it->next_en_pos >= 0
1767       && bidi_it->charpos >= bidi_it->next_en_pos)
1768     {
1769       bidi_it->next_en_pos = 0;
1770       bidi_it->next_en_type = UNKNOWN_BT;
1771     }
1772
1773   /* Reset the bracket resolution info.  */
1774   bidi_it->bracket_pairing_pos = -1;
1775   bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1776
1777   /* If reseat()'ed, don't advance, so as to start iteration from the
1778      position where we were reseated.  bidi_it->bytepos can be less
1779      than BEGV_BYTE after reseat to BEGV.  */
1780   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1781       || bidi_it->first_elt)
1782     {
1783       bidi_it->first_elt = 0;
1784       if (string_p)
1785         {
1786           const unsigned char *p
1787             = (STRINGP (bidi_it->string.lstring)
1788                ? SDATA (bidi_it->string.lstring)
1789                : bidi_it->string.s);
1790
1791           if (bidi_it->charpos < 0)
1792             bidi_it->charpos = bidi_it->bytepos = 0;
1793           eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1794                                                          bidi_it->charpos,
1795                                                          bidi_it->string.unibyte));
1796         }
1797       else
1798         {
1799           if (bidi_it->charpos < BEGV)
1800             {
1801               bidi_it->charpos = BEGV;
1802               bidi_it->bytepos = BEGV_BYTE;
1803             }
1804           eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1805         }
1806       /* Determine the orginal bidi type of the previous character,
1807          which is needed for handling isolate initiators and PDF.  The
1808          type of the previous character will only be non-trivial if
1809          our caller moved through some previous text in
1810          get_visually_first_element, in which case bidi_it->prev holds
1811          the information we want.  */
1812       if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1813         {
1814           eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1815           prev_type = bidi_it->prev.orig_type;
1816           if (prev_type == FSI)
1817             prev_type = bidi_it->type_after_wn;
1818         }
1819     }
1820   /* Don't move at end of buffer/string.  */
1821   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1822     {
1823       /* Advance to the next character, skipping characters covered by
1824          display strings (nchars > 1).  */
1825       if (bidi_it->nchars <= 0)
1826         emacs_abort ();
1827       bidi_it->charpos += bidi_it->nchars;
1828       if (bidi_it->ch_len == 0)
1829         emacs_abort ();
1830       bidi_it->bytepos += bidi_it->ch_len;
1831       prev_type = bidi_it->orig_type;
1832       if (prev_type == FSI)
1833         prev_type = bidi_it->type_after_wn;
1834     }
1835   else  /* EOB or end of string */
1836     prev_type = NEUTRAL_B;
1837
1838   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1839   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1840   isolate_status = bidi_it->level_stack[bidi_it->stack_idx].isolate_status;
1841   new_level = current_level;
1842
1843   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1844     {
1845       curchar = BIDI_EOB;
1846       bidi_it->ch_len = 1;
1847       bidi_it->nchars = 1;
1848       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1849       bidi_it->disp_prop = 0;
1850     }
1851   else
1852     {
1853       /* LRI, RLI, and FSI increment, and PDF decrements, the
1854          embedding level of the _following_ characters, so we must
1855          first look at the type of the previous character to support
1856          that.  */
1857       switch (prev_type)
1858         {
1859         case RLI:       /* X5a */
1860           if (current_level < BIDI_MAXDEPTH
1861               && bidi_it->invalid_levels == 0
1862               && bidi_it->invalid_isolates == 0)
1863             {
1864               new_level = ((current_level + 1) & ~1) + 1;
1865               bidi_it->isolate_level++;
1866               bidi_push_embedding_level (bidi_it, new_level,
1867                                          NEUTRAL_DIR, true);
1868             }
1869           else
1870             bidi_it->invalid_isolates++;
1871           break;
1872         case LRI:       /* X5b */
1873           if (current_level < BIDI_MAXDEPTH - 1
1874               && bidi_it->invalid_levels == 0
1875               && bidi_it->invalid_isolates == 0)
1876             {
1877               new_level = ((current_level + 2) & ~1);
1878               bidi_it->isolate_level++;
1879               bidi_push_embedding_level (bidi_it, new_level,
1880                                          NEUTRAL_DIR, true);
1881             }
1882           else
1883             bidi_it->invalid_isolates++;
1884           break;
1885         case PDF:       /* X7 */
1886           if (!bidi_it->invalid_isolates)
1887             {
1888               if (bidi_it->invalid_levels)
1889                 bidi_it->invalid_levels--;
1890               else if (!isolate_status && bidi_it->stack_idx >= 1)
1891                 new_level = bidi_pop_embedding_level (bidi_it);
1892             }
1893           break;
1894         default:
1895           eassert (prev_type != FSI);
1896           /* Nothing.  */
1897           break;
1898         }
1899       /* Fetch the character at BYTEPOS.  If it is covered by a
1900          display string, treat the entire run of covered characters as
1901          a single character u+FFFC.  */
1902       curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1903                                  &bidi_it->disp_pos, &bidi_it->disp_prop,
1904                                  &bidi_it->string, bidi_it->w,
1905                                  bidi_it->frame_window_p,
1906                                  &bidi_it->ch_len, &bidi_it->nchars);
1907     }
1908   bidi_it->ch = curchar;
1909   bidi_it->resolved_level = new_level;
1910
1911   /* Don't apply directional override here, as all the types we handle
1912      below will not be affected by the override anyway, and we need
1913      the original type unaltered.  The override will be applied in
1914      bidi_resolve_weak.  */
1915   type = bidi_get_type (curchar, NEUTRAL_DIR);
1916   bidi_it->orig_type = type;
1917   bidi_check_type (bidi_it->orig_type);
1918
1919   bidi_it->type_after_wn = UNKNOWN_BT;
1920
1921   switch (type)
1922     {
1923     case RLE:   /* X2 */
1924     case RLO:   /* X4 */
1925       bidi_it->type_after_wn = type;
1926       bidi_check_type (bidi_it->type_after_wn);
1927       type = WEAK_BN; /* X9/Retaining */
1928       if (new_level < BIDI_MAXDEPTH
1929           && bidi_it->invalid_levels == 0
1930           && bidi_it->invalid_isolates == 0)
1931         {
1932           /* Compute the least odd embedding level greater than
1933              the current level.  */
1934           new_level = ((new_level + 1) & ~1) + 1;
1935           if (bidi_it->type_after_wn == RLE)
1936             override = NEUTRAL_DIR;
1937           else
1938             override = R2L;
1939           bidi_push_embedding_level (bidi_it, new_level, override, false);
1940           bidi_it->resolved_level = new_level;
1941         }
1942       else
1943         {
1944           if (bidi_it->invalid_isolates == 0)
1945             bidi_it->invalid_levels++;
1946         }
1947       break;
1948     case LRE:   /* X3 */
1949     case LRO:   /* X5 */
1950       bidi_it->type_after_wn = type;
1951       bidi_check_type (bidi_it->type_after_wn);
1952       type = WEAK_BN; /* X9/Retaining */
1953       if (new_level < BIDI_MAXDEPTH - 1
1954           && bidi_it->invalid_levels == 0
1955           && bidi_it->invalid_isolates == 0)
1956         {
1957           /* Compute the least even embedding level greater than
1958              the current level.  */
1959           new_level = ((new_level + 2) & ~1);
1960           if (bidi_it->type_after_wn == LRE)
1961             override = NEUTRAL_DIR;
1962           else
1963             override = L2R;
1964           bidi_push_embedding_level (bidi_it, new_level, override, false);
1965           bidi_it->resolved_level = new_level;
1966         }
1967       else
1968         {
1969           if (bidi_it->invalid_isolates == 0)
1970             bidi_it->invalid_levels++;
1971         }
1972       break;
1973     case FSI:   /* X5c */
1974       end = string_p ? bidi_it->string.schars : ZV;
1975       disp_pos = bidi_it->disp_pos;
1976       disp_prop = bidi_it->disp_prop;
1977       nchars = bidi_it->nchars;
1978       ch_len = bidi_it->ch_len;
1979       typ1 = find_first_strong_char (bidi_it->charpos,
1980                                      bidi_it->bytepos, end,
1981                                      &disp_pos, &disp_prop,
1982                                      &bidi_it->string, bidi_it->w,
1983                                      string_p, bidi_it->frame_window_p,
1984                                      &ch_len, &nchars, true);
1985       if (typ1 != STRONG_R && typ1 != STRONG_AL)
1986         {
1987           type = LRI;
1988           goto fsi_as_lri;
1989         }
1990       else
1991         type = RLI;
1992       /* FALLTHROUGH */
1993     case RLI:   /* X5a */
1994       if (override == NEUTRAL_DIR)
1995         bidi_it->type_after_wn = type;
1996       else      /* Unicode 8.0 correction.  */
1997         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
1998       bidi_check_type (bidi_it->type_after_wn);
1999       break;
2000     case LRI:   /* X5b */
2001     fsi_as_lri:
2002       if (override == NEUTRAL_DIR)
2003         bidi_it->type_after_wn = type;
2004       else      /* Unicode 8.0 correction.  */
2005         bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2006       bidi_check_type (bidi_it->type_after_wn);
2007       break;
2008     case PDI:   /* X6a */
2009       if (bidi_it->invalid_isolates)
2010         bidi_it->invalid_isolates--;
2011       else if (bidi_it->isolate_level > 0)
2012         {
2013           bidi_it->invalid_levels = 0;
2014           while (!bidi_it->level_stack[bidi_it->stack_idx].isolate_status)
2015             bidi_pop_embedding_level (bidi_it);
2016           eassert (bidi_it->stack_idx > 0);
2017           new_level = bidi_pop_embedding_level (bidi_it);
2018           bidi_it->isolate_level--;
2019         }
2020       bidi_it->resolved_level = new_level;
2021       /* Unicode 8.0 correction.  */
2022       if (bidi_it->level_stack[bidi_it->stack_idx].override == L2R)
2023         bidi_it->type_after_wn = STRONG_L;
2024       else if (bidi_it->level_stack[bidi_it->stack_idx].override == R2L)
2025         bidi_it->type_after_wn = STRONG_R;
2026       else
2027         bidi_it->type_after_wn = type;
2028       break;
2029     case PDF:   /* X7 */
2030       bidi_it->type_after_wn = type;
2031       bidi_check_type (bidi_it->type_after_wn);
2032       type = WEAK_BN; /* X9/Retaining */
2033       break;
2034     default:
2035       /* Nothing.  */
2036       break;
2037     }
2038
2039   bidi_it->type = type;
2040   bidi_check_type (bidi_it->type);
2041
2042   if (bidi_it->type == NEUTRAL_B)       /* X8 */
2043     {
2044       bidi_set_paragraph_end (bidi_it);
2045       /* This is needed by bidi_resolve_weak below, and in L1.  */
2046       bidi_it->type_after_wn = bidi_it->type;
2047     }
2048
2049   eassert (bidi_it->resolved_level >= 0);
2050   return bidi_it->resolved_level;
2051 }
2052
2053 /* Advance in the buffer/string, resolve weak types and return the
2054    type of the next character after weak type resolution.  */
2055 static bidi_type_t
2056 bidi_resolve_weak (struct bidi_it *bidi_it)
2057 {
2058   bidi_type_t type;
2059   bidi_dir_t override;
2060   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2061   int new_level  = bidi_resolve_explicit (bidi_it);
2062   int next_char;
2063   bidi_type_t type_of_next;
2064   struct bidi_it saved_it;
2065   ptrdiff_t eob
2066     = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2067        ? bidi_it->string.schars : ZV);
2068
2069   type = bidi_it->type;
2070   override = bidi_it->level_stack[bidi_it->stack_idx].override;
2071
2072   eassert (!(type == UNKNOWN_BT
2073              || type == LRE
2074              || type == LRO
2075              || type == RLE
2076              || type == RLO
2077              || type == PDF));
2078
2079   eassert (prev_level >= 0);
2080   if (bidi_it->type == NEUTRAL_B)
2081     {
2082       /* We've got a new isolating sequence, compute the directional
2083          type of sos and initialize per-run variables (UAX#9, clause
2084          X10).  */
2085       bidi_set_sos_type (bidi_it, prev_level, new_level);
2086     }
2087   if (type == NEUTRAL_S || type == NEUTRAL_WS
2088       || type == WEAK_BN || type == STRONG_AL)
2089     bidi_it->type_after_wn = type;      /* needed in L1 */
2090   bidi_check_type (bidi_it->type_after_wn);
2091
2092   /* Level and directional override status are already recorded in
2093      bidi_it, and do not need any change; see X6.  */
2094   if (override == R2L)          /* X6 */
2095     type = STRONG_R;
2096   else if (override == L2R)
2097     type = STRONG_L;
2098   else
2099     {
2100       if (type == WEAK_NSM)     /* W1 */
2101         {
2102           /* Note that we don't need to consider the case where the
2103              prev character has its type overridden by an RLO or LRO,
2104              because then either the type of this NSM would have been
2105              also overridden, or the previous character is outside the
2106              current level run, and thus not relevant to this NSM.
2107              This is why NSM gets the type_after_wn of the previous
2108              character.  */
2109           /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT.  */
2110           if (bidi_it->prev.type != UNKNOWN_BT
2111               /* If type_after_wn is NEUTRAL_B, this NSM is at sos.  */
2112               && bidi_it->prev.type != NEUTRAL_B)
2113             {
2114               if (bidi_isolate_fmt_char (bidi_it->prev.type))
2115                 {
2116                   /* From W1: "Note that in an isolating run sequence,
2117                      an isolate initiator followed by an NSM or any
2118                      type other than PDI must be an overflow isolate
2119                      initiator."  */
2120                   eassert (bidi_it->invalid_isolates > 0);
2121                   type = NEUTRAL_ON;
2122                 }
2123               else
2124                 {
2125                   /* This includes the Unicode 8.0 correction for N0,
2126                      due to how we set prev.type in bidi_resolve_explicit,
2127                      which see.  */
2128                   type = bidi_it->prev.type;
2129                 }
2130             }
2131           else if (bidi_it->sos == R2L)
2132             type = STRONG_R;
2133           else if (bidi_it->sos == L2R)
2134             type = STRONG_L;
2135           else /* shouldn't happen! */
2136             emacs_abort ();
2137         }
2138       if (type == WEAK_EN       /* W2 */
2139           && bidi_it->last_strong.type == STRONG_AL)
2140         type = WEAK_AN;
2141       else if (type == STRONG_AL) /* W3 */
2142         type = STRONG_R;
2143       else if ((type == WEAK_ES /* W4 */
2144                 && bidi_it->prev.type == WEAK_EN
2145                 && bidi_it->prev.orig_type == WEAK_EN)
2146                || (type == WEAK_CS
2147                    && ((bidi_it->prev.type == WEAK_EN
2148                         && bidi_it->prev.orig_type == WEAK_EN)
2149                        || bidi_it->prev.type == WEAK_AN)))
2150         {
2151           const unsigned char *s
2152             = (STRINGP (bidi_it->string.lstring)
2153                ? SDATA (bidi_it->string.lstring)
2154                : bidi_it->string.s);
2155
2156           next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2157                        ? BIDI_EOB
2158                        : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2159                                            s, bidi_it->string.unibyte));
2160           type_of_next = bidi_get_type (next_char, override);
2161
2162           if (type_of_next == WEAK_BN
2163               || bidi_explicit_dir_char (next_char))
2164             {
2165               bidi_copy_it (&saved_it, bidi_it);
2166               while (bidi_resolve_explicit (bidi_it) == new_level
2167                      && bidi_it->type == WEAK_BN)
2168                 type_of_next = bidi_it->type;
2169               bidi_copy_it (bidi_it, &saved_it);
2170             }
2171
2172           /* If the next character is EN, but the last strong-type
2173              character is AL, that next EN will be changed to AN when
2174              we process it in W2 above.  So in that case, this ES
2175              should not be changed into EN.  */
2176           if (type == WEAK_ES
2177               && type_of_next == WEAK_EN
2178               && bidi_it->last_strong.type != STRONG_AL)
2179             type = WEAK_EN;
2180           else if (type == WEAK_CS)
2181             {
2182               if (bidi_it->prev.type == WEAK_AN
2183                   && (type_of_next == WEAK_AN
2184                       /* If the next character is EN, but the last
2185                          strong-type character is AL, EN will be later
2186                          changed to AN when we process it in W2 above.
2187                          So in that case, this ES should not be
2188                          changed into EN.  */
2189                       || (type_of_next == WEAK_EN
2190                           && bidi_it->last_strong.type == STRONG_AL)))
2191                 type = WEAK_AN;
2192               else if (bidi_it->prev.type == WEAK_EN
2193                        && type_of_next == WEAK_EN
2194                        && bidi_it->last_strong.type != STRONG_AL)
2195                 type = WEAK_EN;
2196             }
2197         }
2198       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
2199                || type == WEAK_BN)      /* W5/Retaining */
2200         {
2201           if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2202             type = WEAK_EN;
2203           else if (bidi_it->next_en_pos > bidi_it->charpos
2204                    && bidi_it->next_en_type != WEAK_BN)
2205             {
2206               if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2207                 type = WEAK_EN;
2208             }
2209           else if (bidi_it->next_en_pos >=0)
2210             {
2211               /* We overstepped the last known position for ET
2212                  resolution but there could be other such characters
2213                  in this paragraph (when we are sure there are no more
2214                  such positions, we set next_en_pos to a negative
2215                  value).  Try to find the next position for ET
2216                  resolution.  */
2217               ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2218               const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2219                                         ? SDATA (bidi_it->string.lstring)
2220                                         : bidi_it->string.s);
2221
2222               if (bidi_it->nchars <= 0)
2223                 emacs_abort ();
2224               next_char
2225                 = (bidi_it->charpos + bidi_it->nchars >= eob
2226                    ? BIDI_EOB
2227                    : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2228                                        bidi_it->string.unibyte));
2229               type_of_next = bidi_get_type (next_char, override);
2230
2231               if (type_of_next == WEAK_ET
2232                   || type_of_next == WEAK_BN
2233                   || bidi_explicit_dir_char (next_char))
2234                 {
2235                   bidi_copy_it (&saved_it, bidi_it);
2236                   while (bidi_resolve_explicit (bidi_it) == new_level
2237                          && (bidi_it->type == WEAK_BN
2238                              || bidi_it->type == WEAK_ET))
2239                     type_of_next = bidi_it->type;
2240                   if (type == WEAK_BN
2241                       && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2242                     {
2243                       /* If we entered the above loop with a BN that
2244                          changes the level, the type of next
2245                          character, which is in a different level, is
2246                          not relevant to resolving this series of ET
2247                          and BN.  */
2248                       en_pos = saved_it.charpos;
2249                       type_of_next = type;
2250                     }
2251                   else
2252                     en_pos = bidi_it->charpos;
2253                   bidi_copy_it (bidi_it, &saved_it);
2254                 }
2255               /* Remember this position, to speed up processing of the
2256                  next ETs.  */
2257               bidi_it->next_en_pos = en_pos;
2258               if (type_of_next == WEAK_EN)
2259                 {
2260                   /* If the last strong character is AL, the EN we've
2261                      found will become AN when we get to it (W2). */
2262                   if (bidi_it->last_strong.type == STRONG_AL)
2263                     type_of_next = WEAK_AN;
2264                   else if (type == WEAK_BN)
2265                     type = NEUTRAL_ON; /* W6/Retaining */
2266                   else
2267                     type = WEAK_EN;
2268                 }
2269               else if (type_of_next == NEUTRAL_B)
2270                 /* Record the fact that there are no more ENs from
2271                    here to the end of paragraph, to avoid entering the
2272                    loop above ever again in this paragraph.  */
2273                 bidi_it->next_en_pos = -1;
2274               /* Record the type of the character where we ended our search.  */
2275               bidi_it->next_en_type = type_of_next;
2276             }
2277         }
2278     }
2279
2280   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2281       || (type == WEAK_BN
2282           && (bidi_it->prev.type == WEAK_CS         /* W6/Retaining */
2283               || bidi_it->prev.type == WEAK_ES
2284               || bidi_it->prev.type == WEAK_ET)))
2285     type = NEUTRAL_ON;
2286
2287   /* Store the type we've got so far, before we clobber it with strong
2288      types in W7 and while resolving neutral types.  But leave alone
2289      the original types that were recorded above, because we will need
2290      them for the L1 clause.  */
2291   if (bidi_it->type_after_wn == UNKNOWN_BT)
2292     bidi_it->type_after_wn = type;
2293   bidi_check_type (bidi_it->type_after_wn);
2294
2295   if (type == WEAK_EN)  /* W7 */
2296     {
2297       if ((bidi_it->last_strong.type == STRONG_L)
2298           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2299         type = STRONG_L;
2300     }
2301
2302   bidi_it->type = type;
2303   bidi_check_type (bidi_it->type);
2304   return type;
2305 }
2306
2307 /* Resolve the type of a neutral character according to the type of
2308    surrounding strong text and the current embedding level.  */
2309 static bidi_type_t
2310 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2311 {
2312   /* N1: "European and Arabic numbers act as if they were R in terms
2313      of their influence on NIs."  */
2314   if (next_type == WEAK_EN || next_type == WEAK_AN)
2315     next_type = STRONG_R;
2316   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2317     prev_type = STRONG_R;
2318
2319   if (next_type == prev_type)   /* N1 */
2320     return next_type;
2321   else if ((lev & 1) == 0)      /* N2 */
2322     return STRONG_L;
2323   else
2324     return STRONG_R;
2325 }
2326
2327 #define FLAG_EMBEDDING_INSIDE  1
2328 #define FLAG_OPPOSITE_INSIDE   2
2329
2330 /* A data type used in the stack maintained by
2331    bidi_find_bracket_pairs below.  */
2332 typedef struct bpa_stack_entry {
2333   int close_bracket_char;
2334   int open_bracket_idx;
2335 #ifdef ENABLE_CHECKING
2336   ptrdiff_t open_bracket_pos;
2337 #endif
2338   unsigned flags : 2;
2339 } bpa_stack_entry;
2340
2341 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2342    BPA stack, which should be more than enough for actual bidi text.  */
2343 #define MAX_BPA_STACK (max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2344
2345 #ifdef ENABLE_CHECKING
2346 # define STORE_BRACKET_CHARPOS \
2347    bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2348 #else
2349 # define STORE_BRACKET_CHARPOS  /* nothing */
2350 #endif
2351
2352 #define PUSH_BPA_STACK                                                  \
2353   do {                                                                  \
2354    bpa_sp++;                                                            \
2355    if (bpa_sp >= MAX_BPA_STACK)                                         \
2356      {                                                                  \
2357        bpa_sp = MAX_BPA_STACK - 1;                                      \
2358        goto bpa_give_up;                                                \
2359      }                                                                  \
2360    bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (bidi_it->ch); \
2361    bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx;            \
2362    bpa_stack[bpa_sp].flags = 0;                                         \
2363    STORE_BRACKET_CHARPOS;                                               \
2364   } while (0)
2365
2366
2367 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2368    described in BD16 and N0 of UAX#9.  It finds all the bracket pairs
2369    in the current isolating sequence, and records the enclosed type
2370    and the position of the matching bracket in the cache.  It returns
2371    non-zero if called with the iterator on the opening bracket which
2372    has a matching closing bracket in the current isolating sequence,
2373    zero otherwise.  */
2374 static bool
2375 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2376 {
2377   bidi_bracket_type_t btype;
2378   bidi_type_t type = bidi_it->type;
2379   bool retval = false;
2380
2381   /* When scanning backwards, we don't expect any unresolved bidi
2382      bracket characters.  */
2383   if (bidi_it->scan_dir != 1)
2384     emacs_abort ();
2385
2386   btype = bidi_paired_bracket_type (bidi_it->ch);
2387   if (btype == BIDI_BRACKET_OPEN)
2388     {
2389       bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2390       int bpa_sp = -1;
2391       struct bidi_it saved_it;
2392       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2393       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2394       struct bidi_it tem_it;
2395
2396       eassert (MAX_BPA_STACK >= 100);
2397       bidi_copy_it (&saved_it, bidi_it);
2398       /* bidi_cache_iterator_state refuses to cache on backward scans,
2399          and bidi_cache_fetch_state doesn't bring scan_dir from the
2400          cache, so we must initialize this explicitly.  */
2401       tem_it.scan_dir = 1;
2402
2403       while (1)
2404         {
2405           int old_sidx, new_sidx;
2406           int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2407
2408           bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2409           if (btype == BIDI_BRACKET_OPEN)
2410             PUSH_BPA_STACK;
2411           else if (btype == BIDI_BRACKET_CLOSE)
2412             {
2413               int sp = bpa_sp;
2414               int curchar = bidi_it->ch;
2415
2416               eassert (sp >= 0);
2417               while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2418                 sp--;
2419               if (sp >= 0)
2420                 {
2421                   /* Update and cache the corresponding opening bracket.  */
2422                   bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2423                                           &tem_it);
2424 #ifdef ENABLE_CHECKING
2425                   eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2426 #endif
2427                   /* Determine the enclosed type for this bracket
2428                      pair's type resolution according to N0.  */
2429                   if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2430                     tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2431                   else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2432                     tem_it.bracket_enclosed_type                   /* N0c */
2433                       = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2434                   else                                             /* N0d */
2435                     tem_it.bracket_enclosed_type = UNKNOWN_BT;
2436
2437                   /* Record the position of the matching closing
2438                      bracket, and update the cache.  */
2439                   tem_it.bracket_pairing_pos = bidi_it->charpos;
2440                   bidi_cache_iterator_state (&tem_it, 0, 1);
2441
2442                   /* Pop the BPA stack.  */
2443                   bpa_sp = sp - 1;
2444                 }
2445               if (bpa_sp < 0)
2446                 {
2447                   retval = true;
2448                   break;
2449                 }
2450             }
2451           else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2452             {
2453               unsigned flag;
2454               int sp;
2455
2456               /* Whenever we see a strong type, update the flags of
2457                  all the slots on the stack.  */
2458               switch (bidi_it->type)
2459                 {
2460                 case STRONG_L:
2461                   flag = ((embedding_level & 1) == 0
2462                           ? FLAG_EMBEDDING_INSIDE
2463                           : FLAG_OPPOSITE_INSIDE);
2464                   break;
2465                 case STRONG_R:
2466                 case WEAK_EN:
2467                 case WEAK_AN:
2468                   flag = ((embedding_level & 1) == 1
2469                           ? FLAG_EMBEDDING_INSIDE
2470                           : FLAG_OPPOSITE_INSIDE);
2471                   break;
2472                 default:
2473                   break;
2474                 }
2475               for (sp = bpa_sp; sp >= 0; sp--)
2476                 bpa_stack[sp].flags |= flag;
2477             }
2478           old_sidx = bidi_it->stack_idx;
2479           type = bidi_resolve_weak (bidi_it);
2480           /* Skip level runs excluded from this isolating run sequence.  */
2481           new_sidx = bidi_it->stack_idx;
2482           if (bidi_it->level_stack[new_sidx].level > current_level
2483               && (bidi_it->level_stack[new_sidx].isolate_status
2484                   || (new_sidx > old_sidx + 1
2485                       && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2486             {
2487               while (bidi_it->level_stack[bidi_it->stack_idx].level
2488                      > current_level)
2489                 {
2490                   bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2491                   type = bidi_resolve_weak (bidi_it);
2492                 }
2493             }
2494           if (type == NEUTRAL_B
2495               || (bidi_it->level_stack[bidi_it->stack_idx].level
2496                   != current_level))
2497             {
2498             bpa_give_up:
2499               /* We've marched all the way to the end of this
2500                  isolating run sequence, and didn't find matching
2501                  closing brackets for some opening brackets.  Leave
2502                  their type unchanged.  */
2503               break;
2504             }
2505           if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2506             btype = bidi_paired_bracket_type (bidi_it->ch);
2507           else
2508             btype = BIDI_BRACKET_NONE;
2509         }
2510
2511       /* Restore bidi_it from the cache, which should have the bracket
2512          resolution members set as determined by the above loop.  */
2513       type = bidi_cache_find (saved_it.charpos, 1, bidi_it);
2514       eassert (type == NEUTRAL_ON);
2515     }
2516
2517   return retval;
2518 }
2519
2520 static bidi_type_t
2521 bidi_resolve_brackets (struct bidi_it *bidi_it)
2522 {
2523   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2524   bool resolve_bracket = false;
2525   bidi_type_t type = UNKNOWN_BT;
2526   int ch;
2527   struct bidi_saved_info tem_info;
2528
2529   bidi_remember_char (&tem_info, bidi_it, 1);
2530   if (!bidi_it->first_elt)
2531     {
2532       type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 1, bidi_it);
2533       ch = bidi_it->ch;
2534     }
2535   if (type == UNKNOWN_BT)
2536     {
2537       type = bidi_resolve_weak (bidi_it);
2538       if (type == NEUTRAL_ON && bidi_find_bracket_pairs (bidi_it))
2539         resolve_bracket = true;
2540     }
2541   else
2542     {
2543       if (type == NEUTRAL_ON
2544           && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2545         {
2546           if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2547             {
2548               if (bidi_it->bracket_pairing_pos > 0)
2549                 {
2550                   /* A cached opening bracket that wasn't completely
2551                      resolved yet.  */
2552                   resolve_bracket = true;
2553                 }
2554             }
2555           else
2556             {
2557               /* Higher levels were not BPA-resolved yet, even if
2558                  cached by bidi_find_bracket_pairs.  Lower levels were
2559                  probably processed by bidi_find_bracket_pairs, but we
2560                  have no easy way of retaining the prev_for_neutral
2561                  from the previous level run of the isolating
2562                  sequence.  Force application of BPA now.  */
2563               if (bidi_find_bracket_pairs (bidi_it))
2564                 resolve_bracket = true;
2565             }
2566         }
2567       /* Keep track of the prev_for_neutral type, needed for resolving
2568          brackets below and for resolving neutrals in bidi_resolve_neutral.  */
2569       if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level
2570           && (tem_info.type == STRONG_L || tem_info.type == STRONG_R
2571               || tem_info.type == WEAK_AN || tem_info.type == WEAK_EN))
2572         bidi_it->prev_for_neutral = tem_info;
2573     }
2574
2575   /* If needed, resolve the bracket type according to N0.  */
2576   if (resolve_bracket)
2577     {
2578       int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2579       bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2580
2581       eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2582       eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2583       if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2584         type = embedding_type;
2585       else
2586         {
2587           switch (bidi_it->prev_for_neutral.type)
2588             {
2589             case STRONG_R:
2590             case WEAK_EN:
2591             case WEAK_AN:
2592               type =
2593                 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2594                 ? STRONG_R                                   /* N0c1 */
2595                 : embedding_type;                            /* N0c2 */
2596               break;
2597             case STRONG_L:
2598               type =
2599                 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2600                 ? STRONG_L                                   /* N0c1 */
2601                 : embedding_type;                            /* N0c2 */
2602               break;
2603             default:
2604               /* N0d: Do not set the type for that bracket pair.  */
2605               break;
2606             }
2607         }
2608       eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2609
2610       /* Update the type of the paired closing bracket to the same
2611          type as for the resolved opening bracket.  */
2612       if (type != NEUTRAL_ON)
2613         {
2614           ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2615                                              -1, 1);
2616
2617           if (idx < bidi_cache_start)
2618             emacs_abort ();
2619           bidi_cache[idx].type = type;
2620         }
2621     }
2622
2623   return type;
2624 }
2625
2626 static bidi_type_t
2627 bidi_resolve_neutral (struct bidi_it *bidi_it)
2628 {
2629   bidi_type_t type = bidi_resolve_brackets (bidi_it);
2630   int current_level;
2631   bool is_neutral;
2632
2633   eassert (type == STRONG_R
2634            || type == STRONG_L
2635            || type == WEAK_BN
2636            || type == WEAK_EN
2637            || type == WEAK_AN
2638            || type == NEUTRAL_B
2639            || type == NEUTRAL_S
2640            || type == NEUTRAL_WS
2641            || type == NEUTRAL_ON
2642            || type == LRI
2643            || type == RLI
2644            || type == PDI);
2645
2646   current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2647   eassert (current_level >= 0);
2648   is_neutral = bidi_get_category (type) == NEUTRAL;
2649
2650   if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2651                             we are already at paragraph end.  */
2652        && (is_neutral || bidi_isolate_fmt_char (type)))
2653       /* N1-N2/Retaining */
2654       || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
2655     {
2656       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2657         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2658                                        bidi_it->next_for_neutral.type,
2659                                        current_level);
2660       /* The next two "else if" clauses are shortcuts for the
2661          important special case when we have a long sequence of
2662          neutral or WEAK_BN characters, such as whitespace or nulls or
2663          other control characters, on the base embedding level of the
2664          paragraph, and that sequence goes all the way to the end of
2665          the paragraph and follows a character whose resolved
2666          directionality is identical to the base embedding level.
2667          (This is what happens in a buffer with plain L2R text that
2668          happens to include long sequences of control characters.)  By
2669          virtue of N1, the result of examining this long sequence will
2670          always be either STRONG_L or STRONG_R, depending on the base
2671          embedding level.  So we use this fact directly instead of
2672          entering the expensive loop in the "else" clause.  */
2673       else if (current_level == 0
2674                && bidi_it->prev_for_neutral.type == STRONG_L
2675                && !bidi_explicit_dir_char (bidi_it->ch)
2676                && !bidi_isolate_fmt_char (type))
2677         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2678                                        STRONG_L, current_level);
2679       else if (/* current level is 1 */
2680                current_level == 1
2681                /* base embedding level is also 1 */
2682                && bidi_it->level_stack[0].level == 1
2683                /* previous character is one of those considered R for
2684                   the purposes of W5 */
2685                && (bidi_it->prev_for_neutral.type == STRONG_R
2686                    || bidi_it->prev_for_neutral.type == WEAK_EN
2687                    || bidi_it->prev_for_neutral.type == WEAK_AN)
2688                && !bidi_explicit_dir_char (bidi_it->ch)
2689                && !bidi_isolate_fmt_char (type))
2690         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2691                                        STRONG_R, current_level);
2692       else
2693         {
2694           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
2695              the assumption of batch-style processing; see clauses W4,
2696              W5, and especially N1, which require to look far forward
2697              (as well as back) in the buffer/string.  May the fleas of
2698              a thousand camels infest the armpits of those who design
2699              supposedly general-purpose algorithms by looking at their
2700              own implementations, and fail to consider other possible
2701              implementations!  */
2702           struct bidi_it saved_it;
2703           bidi_type_t next_type;
2704           bool adjacent_to_neutrals = is_neutral;
2705
2706           bidi_copy_it (&saved_it, bidi_it);
2707           /* Scan the text forward until we find the first non-neutral
2708              character, and then use that to resolve the neutral we
2709              are dealing with now.  We also cache the scanned iterator
2710              states, to salvage some of the effort later.  */
2711           do {
2712             int old_sidx, new_sidx;
2713
2714             /* Paragraph separators have their levels fully resolved
2715                at this point, so cache them as resolved.  */
2716             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2717             old_sidx = bidi_it->stack_idx;
2718             type = bidi_resolve_brackets (bidi_it);
2719             /* Skip level runs excluded from this isolating run sequence.  */
2720             new_sidx = bidi_it->stack_idx;
2721             if (bidi_it->level_stack[new_sidx].level > current_level
2722                 && (bidi_it->level_stack[new_sidx].isolate_status
2723                     /* This is for when we have an isolate initiator
2724                        immediately followed by an embedding or
2725                        override initiator, in which case we get the
2726                        level stack pushed twice by the single call to
2727                        bidi_resolve_weak above.  */
2728                     || (new_sidx > old_sidx + 1
2729                         && bidi_it->level_stack[new_sidx - 1].isolate_status)))
2730               {
2731                 while (bidi_it->level_stack[bidi_it->stack_idx].level
2732                        > current_level)
2733                   {
2734                     bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2735                     type = bidi_resolve_brackets (bidi_it);
2736                   }
2737               }
2738             if (!adjacent_to_neutrals
2739                 && (bidi_get_category (type) == NEUTRAL
2740                     || bidi_isolate_fmt_char (type)))
2741               adjacent_to_neutrals = true;
2742           } while (!(type == NEUTRAL_B
2743                      || (type != WEAK_BN
2744                          && bidi_get_category (type) != NEUTRAL
2745                          && !bidi_isolate_fmt_char (type))
2746                      /* This is all per level run, so stop when we
2747                         reach the end of this level run.  */
2748                      || (bidi_it->level_stack[bidi_it->stack_idx].level
2749                          != current_level)));
2750
2751           /* Record the character we stopped at.  */
2752           bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
2753
2754           if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
2755               || type == NEUTRAL_B)
2756             {
2757               /* Marched all the way to the end of this level run.  We
2758                  need to use the eos type, whose information is stored
2759                  by bidi_set_sos_type in the prev_for_neutral
2760                  member.  */
2761               if (adjacent_to_neutrals)
2762                 next_type = bidi_it->prev_for_neutral.type;
2763               else
2764                 {
2765                   /* This is a BN which does not adjoin neutrals.
2766                      Leave its type alone.  */
2767                   bidi_copy_it (bidi_it, &saved_it);
2768                   return bidi_it->type;
2769                 }
2770             }
2771           else
2772             {
2773               switch (type)
2774                 {
2775                 case STRONG_L:
2776                 case STRONG_R:
2777                 case STRONG_AL:
2778                   /* Actually, STRONG_AL cannot happen here, because
2779                      bidi_resolve_weak converts it to STRONG_R, per W3.  */
2780                   eassert (type != STRONG_AL);
2781                   next_type = type;
2782                   break;
2783                 case WEAK_EN:
2784                 case WEAK_AN:
2785                   /* N1: "European and Arabic numbers act as if they
2786                      were R in terms of their influence on NIs."  */
2787                   next_type = STRONG_R;
2788                   break;
2789                 default:
2790                   emacs_abort ();
2791                   break;
2792                 }
2793             }
2794           /* Resolve the type of all the NIs found during the above loop.  */
2795           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2796                                          next_type, current_level);
2797           /* Update next_for_neutral with the resolved type, so we
2798              could use it for all the other NIs up to the place where
2799              we exited the loop.  */
2800           saved_it.next_for_neutral.type = next_type;
2801           bidi_check_type (type);
2802           /* Update the character which caused us to enter the above loop.  */
2803           saved_it.type = type;
2804           bidi_check_type (next_type);
2805           bidi_copy_it (bidi_it, &saved_it);
2806         }
2807     }
2808   return type;
2809 }
2810
2811 /* Given an iterator state in BIDI_IT, advance one character position
2812    in the buffer/string to the next character (in the logical order),
2813    resolve the bidi type of that next character, and return that
2814    type.  */
2815 static bidi_type_t
2816 bidi_type_of_next_char (struct bidi_it *bidi_it)
2817 {
2818   bidi_type_t type;
2819
2820   /* This should always be called during a forward scan.  */
2821   if (bidi_it->scan_dir != 1)
2822     emacs_abort ();
2823
2824   type = bidi_resolve_neutral (bidi_it);
2825
2826   return type;
2827 }
2828
2829 /* Given an iterator state BIDI_IT, advance one character position in
2830    the buffer/string to the next character (in the current scan
2831    direction), resolve the embedding and implicit levels of that next
2832    character, and return the resulting level.  */
2833 static int
2834 bidi_level_of_next_char (struct bidi_it *bidi_it)
2835 {
2836   bidi_type_t type = UNKNOWN_BT;
2837   int level;
2838   ptrdiff_t next_char_pos = -2;
2839
2840   if (bidi_it->scan_dir == 1)
2841     {
2842       ptrdiff_t eob
2843         = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2844            ? bidi_it->string.schars : ZV);
2845
2846       /* There's no sense in trying to advance if we've already hit
2847          the end of text.  */
2848       if (bidi_it->charpos >= eob)
2849         {
2850           eassert (bidi_it->resolved_level >= 0);
2851           return bidi_it->resolved_level;
2852         }
2853     }
2854
2855   /* Perhaps the character we want is already cached.  If it is, the
2856      call to bidi_cache_find below will return a type other than
2857      UNKNOWN_BT.  */
2858   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
2859     {
2860       int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2861                  ? 0 : 1);
2862       bidi_type_t prev_type = bidi_it->type;
2863       bidi_type_t type_for_neutral = bidi_it->next_for_neutral.type;
2864       ptrdiff_t pos_for_neutral = bidi_it->next_for_neutral.charpos;
2865
2866       if (bidi_it->scan_dir > 0)
2867         {
2868           if (bidi_it->nchars <= 0)
2869             emacs_abort ();
2870           next_char_pos = bidi_it->charpos + bidi_it->nchars;
2871         }
2872       else if (bidi_it->charpos >= bob)
2873         /* Implementation note: we allow next_char_pos to be as low as
2874            0 for buffers or -1 for strings, and that is okay because
2875            that's the "position" of the sentinel iterator state we
2876            cached at the beginning of the iteration.  */
2877         next_char_pos = bidi_it->charpos - 1;
2878       if (next_char_pos >= bob - 1)
2879         type = bidi_cache_find (next_char_pos, 0, bidi_it);
2880
2881       /* For a sequence of BN and NI, copy the type from the previous
2882          character.  This is because the loop in bidi_resolve_neutral
2883          that handles such sequences caches the characters it
2884          traverses, but does not (and cannot) store the
2885          next_for_neutral member for them, because it is only known
2886          when the loop ends.  So when we find them in the cache, their
2887          type needs to be updated, but we don't have next_for_neutral
2888          to do that.  However, whatever type is resolved as result of
2889          that loop, it will be the same for all the traversed
2890          characters, by virtue of N1 and N2.  */
2891       if (type == WEAK_BN && bidi_it->scan_dir > 0
2892           && bidi_explicit_dir_char (bidi_it->ch)
2893           && type_for_neutral != UNKNOWN_BT
2894           && bidi_it->charpos < pos_for_neutral)
2895         {
2896           type = prev_type;
2897           eassert (type != UNKNOWN_BT);
2898         }
2899       if (type != UNKNOWN_BT)
2900         {
2901           /* If resolved_level is -1, it means this state was cached
2902              before it was completely resolved, so we cannot return
2903              it.  */
2904           if (bidi_it->resolved_level != -1)
2905             {
2906               eassert (bidi_it->resolved_level >= 0);
2907               return bidi_it->resolved_level;
2908             }
2909           else
2910             {
2911               level = bidi_it->level_stack[bidi_it->stack_idx].level;
2912               if (bidi_get_category (type) == NEUTRAL
2913                   || bidi_isolate_fmt_char (type))
2914                 {
2915                   /* Make sure the data for resolving neutrals we are
2916                      about to use is valid.  */
2917                   if (bidi_it->next_for_neutral.charpos < bidi_it->charpos
2918                       /* PDI defines an eos, so it's OK for it to
2919                          serve as its own next_for_neutral.  */
2920                       || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2921                           && bidi_it->type != PDI)
2922                       || bidi_it->next_for_neutral.type == UNKNOWN_BT)
2923                     emacs_abort ();
2924
2925                   type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2926                                                  bidi_it->next_for_neutral.type,
2927                                                  level);
2928                 }
2929             }
2930         }
2931     }
2932
2933   if (bidi_it->scan_dir == -1)
2934     /* If we are going backwards, the iterator state is already cached
2935        from previous scans, and should be fully resolved.  */
2936     emacs_abort ();
2937
2938   if (type == UNKNOWN_BT)
2939     type = bidi_type_of_next_char (bidi_it);
2940
2941   if (type == NEUTRAL_B)
2942     {
2943       eassert (bidi_it->resolved_level >= 0);
2944       return bidi_it->resolved_level;
2945     }
2946
2947   level = bidi_it->level_stack[bidi_it->stack_idx].level;
2948
2949   eassert ((type == STRONG_R
2950             || type == STRONG_L
2951             || type == WEAK_BN
2952             || type == WEAK_EN
2953             || type == WEAK_AN));
2954   bidi_it->type = type;
2955   bidi_check_type (bidi_it->type);
2956
2957   /* For L1 below, we need to know, for each WS character, whether
2958      it belongs to a sequence of WS characters preceding a newline
2959      or a TAB or a paragraph separator.  */
2960   if ((bidi_it->orig_type == NEUTRAL_WS
2961        || bidi_isolate_fmt_char (bidi_it->orig_type))
2962       && bidi_it->next_for_ws.charpos < bidi_it->charpos)
2963     {
2964       int ch;
2965       ptrdiff_t clen = bidi_it->ch_len;
2966       ptrdiff_t bpos = bidi_it->bytepos;
2967       ptrdiff_t cpos = bidi_it->charpos;
2968       ptrdiff_t disp_pos = bidi_it->disp_pos;
2969       ptrdiff_t nc = bidi_it->nchars;
2970       struct bidi_string_data bs = bidi_it->string;
2971       bidi_type_t chtype;
2972       bool fwp = bidi_it->frame_window_p;
2973       int dpp = bidi_it->disp_prop;
2974
2975       if (bidi_it->nchars <= 0)
2976         emacs_abort ();
2977       do {
2978         ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
2979                               bidi_it->w, fwp, &clen, &nc);
2980         chtype = bidi_get_type (ch, NEUTRAL_DIR);
2981       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2982                || bidi_isolate_fmt_char (chtype)
2983                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2984       bidi_it->next_for_ws.type = chtype;
2985       bidi_check_type (bidi_it->next_for_ws.type);
2986       bidi_it->next_for_ws.charpos = cpos;
2987     }
2988
2989   /* Update the cache, but only if this state was already cached.  */
2990   bidi_cache_iterator_state (bidi_it, 1, 1);
2991
2992   /* Resolve implicit levels.  */
2993   if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2994            || bidi_it->orig_type == NEUTRAL_S
2995            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
2996            || (bidi_it->orig_type == NEUTRAL_WS
2997                && (bidi_it->next_for_ws.type == NEUTRAL_B
2998                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
2999     level = bidi_it->level_stack[0].level;
3000   else if ((level & 1) == 0) /* I1 */
3001     {
3002       if (type == STRONG_R)
3003         level++;
3004       else if (type == WEAK_EN || type == WEAK_AN)
3005         level += 2;
3006     }
3007   else                  /* I2 */
3008     {
3009       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3010         level++;
3011     }
3012
3013   bidi_it->resolved_level = level;
3014   return level;
3015 }
3016
3017 /* Move to the other edge of a level given by LEVEL.  If END_FLAG,
3018    we are at the end of a level, and we need to prepare to
3019    resume the scan of the lower level.
3020
3021    If this level's other edge is cached, we simply jump to it, filling
3022    the iterator structure with the iterator state on the other edge.
3023    Otherwise, we walk the buffer or string until we come back to the
3024    same level as LEVEL.
3025
3026    Note: we are not talking here about a ``level run'' in the UAX#9
3027    sense of the term, but rather about a ``level'' which includes
3028    all the levels higher than it.  In other words, given the levels
3029    like this:
3030
3031          11111112222222333333334443343222222111111112223322111
3032                 A      B                    C
3033
3034    and assuming we are at point A scanning left to right, this
3035    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3036    at point B.  */
3037 static void
3038 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3039 {
3040   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3041   ptrdiff_t idx;
3042
3043   /* Try the cache first.  */
3044   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3045       >= bidi_cache_start)
3046     bidi_cache_fetch_state (idx, bidi_it);
3047   else
3048     {
3049       int new_level;
3050
3051       /* If we are at end of level, its edges must be cached.  */
3052       if (end_flag)
3053         emacs_abort ();
3054
3055       bidi_cache_iterator_state (bidi_it, 1, 0);
3056       do {
3057         new_level = bidi_level_of_next_char (bidi_it);
3058         bidi_cache_iterator_state (bidi_it, 1, 0);
3059       } while (new_level >= level);
3060     }
3061 }
3062
3063 void
3064 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3065 {
3066   int old_level, new_level, next_level;
3067   struct bidi_it sentinel;
3068   struct gcpro gcpro1;
3069
3070   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3071     emacs_abort ();
3072
3073   if (bidi_it->scan_dir == 0)
3074     {
3075       bidi_it->scan_dir = 1;    /* default to logical order */
3076     }
3077
3078   /* The code below can call eval, and thus cause GC.  If we are
3079      iterating a Lisp string, make sure it won't be GCed.  */
3080   if (STRINGP (bidi_it->string.lstring))
3081     GCPRO1 (bidi_it->string.lstring);
3082
3083   /* If we just passed a newline, initialize for the next line.  */
3084   if (!bidi_it->first_elt
3085       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3086     bidi_line_init (bidi_it);
3087
3088   /* Prepare the sentinel iterator state, and cache it.  When we bump
3089      into it, scanning backwards, we'll know that the last non-base
3090      level is exhausted.  */
3091   if (bidi_cache_idx == bidi_cache_start)
3092     {
3093       bidi_copy_it (&sentinel, bidi_it);
3094       if (bidi_it->first_elt)
3095         {
3096           sentinel.charpos--;   /* cached charpos needs to be monotonic */
3097           sentinel.bytepos--;
3098           sentinel.ch = '\n';   /* doesn't matter, but why not? */
3099           sentinel.ch_len = 1;
3100           sentinel.nchars = 1;
3101         }
3102       bidi_cache_iterator_state (&sentinel, 1, 0);
3103     }
3104
3105   old_level = bidi_it->resolved_level;
3106   new_level = bidi_level_of_next_char (bidi_it);
3107
3108   /* Reordering of resolved levels (clause L2) is implemented by
3109      jumping to the other edge of the level and flipping direction of
3110      scanning the text whenever we find a level change.  */
3111   if (new_level != old_level)
3112     {
3113       bool ascending = new_level > old_level;
3114       int level_to_search = ascending ? old_level + 1 : old_level;
3115       int incr = ascending ? 1 : -1;
3116       int expected_next_level = old_level + incr;
3117
3118       /* Jump (or walk) to the other edge of this level.  */
3119       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3120       /* Switch scan direction and peek at the next character in the
3121          new direction.  */
3122       bidi_it->scan_dir = -bidi_it->scan_dir;
3123
3124       /* The following loop handles the case where the resolved level
3125          jumps by more than one.  This is typical for numbers inside a
3126          run of text with left-to-right embedding direction, but can
3127          also happen in other situations.  In those cases the decision
3128          where to continue after a level change, and in what direction,
3129          is tricky.  For example, given a text like below:
3130
3131                   abcdefgh
3132                   11336622
3133
3134          (where the numbers below the text show the resolved levels),
3135          the result of reordering according to UAX#9 should be this:
3136
3137                   efdcghba
3138
3139          This is implemented by the loop below which flips direction
3140          and jumps to the other edge of the level each time it finds
3141          the new level not to be the expected one.  The expected level
3142          is always one more or one less than the previous one.  */
3143       next_level = bidi_peek_at_next_level (bidi_it);
3144       while (next_level != expected_next_level)
3145         {
3146           /* If next_level is -1, it means we have an unresolved level
3147              in the cache, which at this point should not happen.  If
3148              it does, we will infloop.  */
3149           eassert (next_level >= 0);
3150           /* If next_level is not consistent with incr, we might
3151              infloop.  */
3152           eassert (incr > 0
3153                    ? next_level > expected_next_level
3154                    : next_level < expected_next_level);
3155           expected_next_level += incr;
3156           level_to_search += incr;
3157           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3158           bidi_it->scan_dir = -bidi_it->scan_dir;
3159           next_level = bidi_peek_at_next_level (bidi_it);
3160         }
3161
3162       /* Finally, deliver the next character in the new direction.  */
3163       next_level = bidi_level_of_next_char (bidi_it);
3164     }
3165
3166   /* Take note when we have just processed the newline that precedes
3167      the end of the paragraph.  The next time we are about to be
3168      called, set_iterator_to_next will automatically reinit the
3169      paragraph direction, if needed.  We do this at the newline before
3170      the paragraph separator, because the next character might not be
3171      the first character of the next paragraph, due to the bidi
3172      reordering, whereas we _must_ know the paragraph base direction
3173      _before_ we process the paragraph's text, since the base
3174      direction affects the reordering.  */
3175   if (bidi_it->scan_dir == 1
3176       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3177     {
3178       /* The paragraph direction of the entire string, once
3179          determined, is in effect for the entire string.  Setting the
3180          separator limit to the end of the string prevents
3181          bidi_paragraph_init from being called automatically on this
3182          string.  */
3183       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3184         bidi_it->separator_limit = bidi_it->string.schars;
3185       else if (bidi_it->bytepos < ZV_BYTE)
3186         {
3187           ptrdiff_t sep_len
3188             = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3189                                      bidi_it->bytepos + bidi_it->ch_len);
3190           if (bidi_it->nchars <= 0)
3191             emacs_abort ();
3192           if (sep_len >= 0)
3193             {
3194               bidi_it->new_paragraph = 1;
3195               /* Record the buffer position of the last character of the
3196                  paragraph separator.  */
3197               bidi_it->separator_limit
3198                 = bidi_it->charpos + bidi_it->nchars + sep_len;
3199             }
3200         }
3201     }
3202
3203   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3204     {
3205       /* If we are at paragraph's base embedding level and beyond the
3206          last cached position, the cache's job is done and we can
3207          discard it.  */
3208       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3209           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3210                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3211         bidi_cache_reset ();
3212         /* But as long as we are caching during forward scan, we must
3213            cache each state, or else the cache integrity will be
3214            compromised: it assumes cached states correspond to buffer
3215            positions 1:1.  */
3216       else
3217         bidi_cache_iterator_state (bidi_it, 1, 0);
3218     }
3219
3220   eassert (bidi_it->resolved_level >= 0
3221            && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3222
3223   if (STRINGP (bidi_it->string.lstring))
3224     UNGCPRO;
3225 }
3226
3227 /* This is meant to be called from within the debugger, whenever you
3228    wish to examine the cache contents.  */
3229 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3230 void
3231 bidi_dump_cached_states (void)
3232 {
3233   ptrdiff_t i;
3234   int ndigits = 1;
3235
3236   if (bidi_cache_idx == 0)
3237     {
3238       fprintf (stderr, "The cache is empty.\n");
3239       return;
3240     }
3241   fprintf (stderr, "Total of  %"pD"d state%s in cache:\n",
3242            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3243
3244   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3245     ndigits++;
3246   fputs ("ch  ", stderr);
3247   for (i = 0; i < bidi_cache_idx; i++)
3248     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3249   fputs ("\n", stderr);
3250   fputs ("lvl ", stderr);
3251   for (i = 0; i < bidi_cache_idx; i++)
3252     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3253   fputs ("\n", stderr);
3254   fputs ("pos ", stderr);
3255   for (i = 0; i < bidi_cache_idx; i++)
3256     fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3257   fputs ("\n", stderr);
3258 }