src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2011
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the reference and most other implementations, this one is
  26    designed to be called once for every character in the buffer or
  27    string.
  28
  29    The main entry point is bidi_move_to_visually_next.  Each time it
  30    is called, it finds the next character in the visual order, and
  31    returns its information in a special structure.  The caller is then
  32    expected to process this character for display or any other
  33    purposes, and call bidi_move_to_visually_next for the next
  34    character.  See the comments in bidi_move_to_visually_next for more
  35    details about its algorithm that finds the next visual-order
  36    character by resolving their levels on the fly.
  37
  38    The two other entry points are bidi_paragraph_init and
  39    bidi_mirror_char.  The first determines the base direction of a
  40    paragraph, while the second returns the mirrored version of its
  41    argument character.
  42
  43    If you want to understand the code, you will have to read it
  44    together with the relevant portions of UAX#9.  The comments include
  45    references to UAX#9 rules, for that very reason.
  46
  47    A note about references to UAX#9 rules: if the reference says
  48    something like "X9/Retaining", it means that you need to refer to
  49    rule X9 and to its modifications described in the "Implementation
  50    Notes" section of UAX#9, under "Retaining Format Codes".  */
  51
  52 #include <config.h>
  53 #include <stdio.h>
  54 #include <setjmp.h>
  55
  56 #include "lisp.h"
  57 #include "buffer.h"
  58 #include "character.h"
  59 #include "dispextern.h"
  60
  61 static int bidi_initialized = 0;
  62
  63 static Lisp_Object bidi_type_table, bidi_mirror_table;
  64
  65 #define LRM_CHAR   0x200E
  66 #define RLM_CHAR   0x200F
  67 #define BIDI_EOB   -1
  68
  69 /* Data type for describing the bidirectional character categories.  */
  70 typedef enum {
  71   UNKNOWN_BC,
  72   NEUTRAL,
  73   WEAK,
  74   STRONG
  75 } bidi_category_t;
  76
  77 extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
  78 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
  79
  80 static Lisp_Object paragraph_start_re, paragraph_separate_re;
  81 static Lisp_Object Qparagraph_start, Qparagraph_separate;
  82
  83 \f
  84 /***********************************************************************
  85                         Utilities
  86  ***********************************************************************/
  87
  88 /* Return the bidi type of a character CH, subject to the current
  89    directional OVERRIDE.  */
  90 static inline bidi_type_t
  91 bidi_get_type (int ch, bidi_dir_t override)
  92 {
  93   bidi_type_t default_type;
  94
  95   if (ch == BIDI_EOB)
  96     return NEUTRAL_B;
  97   if (ch < 0 || ch > MAX_CHAR)
  98     abort ();
  99
 100   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 101
 102   if (override == NEUTRAL_DIR)
 103     return default_type;
 104
 105   switch (default_type)
 106     {
 107       /* Although UAX#9 does not tell, it doesn't make sense to
 108          override NEUTRAL_B and LRM/RLM characters.  */
 109       case NEUTRAL_B:
 110       case LRE:
 111       case LRO:
 112       case RLE:
 113       case RLO:
 114       case PDF:
 115         return default_type;
 116       default:
 117         switch (ch)
 118           {
 119             case LRM_CHAR:
 120             case RLM_CHAR:
 121               return default_type;
 122             default:
 123               if (override == L2R) /* X6 */
 124                 return STRONG_L;
 125               else if (override == R2L)
 126                 return STRONG_R;
 127               else
 128                 abort ();       /* can't happen: handled above */
 129           }
 130     }
 131 }
 132
 133 static void
 134 bidi_check_type (bidi_type_t type)
 135 {
 136   if (type < UNKNOWN_BT || type > NEUTRAL_ON)
 137     abort ();
 138 }
 139
 140 /* Given a bidi TYPE of a character, return its category.  */
 141 static inline bidi_category_t
 142 bidi_get_category (bidi_type_t type)
 143 {
 144   switch (type)
 145     {
 146       case UNKNOWN_BT:
 147         return UNKNOWN_BC;
 148       case STRONG_L:
 149       case STRONG_R:
 150       case STRONG_AL:
 151       case LRE:
 152       case LRO:
 153       case RLE:
 154       case RLO:
 155         return STRONG;
 156       case PDF:         /* ??? really?? */
 157       case WEAK_EN:
 158       case WEAK_ES:
 159       case WEAK_ET:
 160       case WEAK_AN:
 161       case WEAK_CS:
 162       case WEAK_NSM:
 163       case WEAK_BN:
 164         return WEAK;
 165       case NEUTRAL_B:
 166       case NEUTRAL_S:
 167       case NEUTRAL_WS:
 168       case NEUTRAL_ON:
 169         return NEUTRAL;
 170       default:
 171         abort ();
 172     }
 173 }
 174
 175 /* Return the mirrored character of C, if it has one.  If C has no
 176    mirrored counterpart, return C.
 177    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 178    context must be tested by the caller.  */
 179 int
 180 bidi_mirror_char (int c)
 181 {
 182   Lisp_Object val;
 183
 184   if (c == BIDI_EOB)
 185     return c;
 186   if (c < 0 || c > MAX_CHAR)
 187     abort ();
 188
 189   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 190   if (INTEGERP (val))
 191     {
 192       int v = XINT (val);
 193
 194       if (v < 0 || v > MAX_CHAR)
 195         abort ();
 196
 197       return v;
 198     }
 199
 200   return c;
 201 }
 202
 203 /* Determine the start-of-run (sor) directional type given the two
 204    embedding levels on either side of the run boundary.  Also, update
 205    the saved info about previously seen characters, since that info is
 206    generally valid for a single level run.  */
 207 static inline void
 208 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
 209 {
 210   int higher_level = level_before > level_after ? level_before : level_after;
 211
 212   /* The prev_was_pdf gork is required for when we have several PDFs
 213      in a row.  In that case, we want to compute the sor type for the
 214      next level run only once: when we see the first PDF.  That's
 215      because the sor type depends only on the higher of the two levels
 216      that we find on the two sides of the level boundary (see UAX#9,
 217      clause X10), and so we don't need to know the final embedding
 218      level to which we descend after processing all the PDFs.  */
 219   if (!bidi_it->prev_was_pdf || level_before < level_after)
 220     /* FIXME: should the default sor direction be user selectable?  */
 221     bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
 222   if (level_before > level_after)
 223     bidi_it->prev_was_pdf = 1;
 224
 225   bidi_it->prev.type = UNKNOWN_BT;
 226   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 227     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 228   bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
 229   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 230   bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
 231   bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
 232     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 233   bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
 234 }
 235
 236 /* Push the current embedding level and override status; reset the
 237    current level to LEVEL and the current override status to OVERRIDE.  */
 238 static inline void
 239 bidi_push_embedding_level (struct bidi_it *bidi_it,
 240                            int level, bidi_dir_t override)
 241 {
 242   bidi_it->stack_idx++;
 243   xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
 244   bidi_it->level_stack[bidi_it->stack_idx].level = level;
 245   bidi_it->level_stack[bidi_it->stack_idx].override = override;
 246 }
 247
 248 /* Pop the embedding level and directional override status from the
 249    stack, and return the new level.  */
 250 static inline int
 251 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 252 {
 253   /* UAX#9 says to ignore invalid PDFs.  */
 254   if (bidi_it->stack_idx > 0)
 255     bidi_it->stack_idx--;
 256   return bidi_it->level_stack[bidi_it->stack_idx].level;
 257 }
 258
 259 /* Record in SAVED_INFO the information about the current character.  */
 260 static inline void
 261 bidi_remember_char (struct bidi_saved_info *saved_info,
 262                     struct bidi_it *bidi_it)
 263 {
 264   saved_info->charpos = bidi_it->charpos;
 265   saved_info->bytepos = bidi_it->bytepos;
 266   saved_info->type = bidi_it->type;
 267   bidi_check_type (bidi_it->type);
 268   saved_info->type_after_w1 = bidi_it->type_after_w1;
 269   bidi_check_type (bidi_it->type_after_w1);
 270   saved_info->orig_type = bidi_it->orig_type;
 271   bidi_check_type (bidi_it->orig_type);
 272 }
 273
 274 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 275    copies the part of the level stack that is actually in use.  */
 276 static inline void
 277 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 278 {
 279   int i;
 280
 281   /* Copy everything except the level stack and beyond.  */
 282   memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
 283
 284   /* Copy the active part of the level stack.  */
 285   to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
 286   for (i = 1; i <= from->stack_idx; i++)
 287     to->level_stack[i] = from->level_stack[i];
 288 }
 289
 290 \f
 291 /***********************************************************************
 292                         Caching the bidi iterator states
 293  ***********************************************************************/
 294
 295 #define BIDI_CACHE_CHUNK 200
 296 static struct bidi_it *bidi_cache;
 297 static size_t bidi_cache_size = 0;
 298 static size_t elsz = sizeof (struct bidi_it);
 299 static EMACS_INT bidi_cache_idx;        /* next unused cache slot */
 300 static EMACS_INT bidi_cache_last_idx;   /* slot of last cache hit */
 301 static EMACS_INT bidi_cache_start = 0;  /* start of cache for this
 302                                            "stack" level */
 303
 304 /* Reset the cache state to the empty state.  We only reset the part
 305    of the cache relevant to iteration of the current object.  Previous
 306    objects, which are pushed on the display iterator's stack, are left
 307    intact.  This is called when the cached information is no more
 308    useful for the current iteration, e.g. when we were reseated to a
 309    new position on the same object.  */
 310 static inline void
 311 bidi_cache_reset (void)
 312 {
 313   bidi_cache_idx = bidi_cache_start;
 314   bidi_cache_last_idx = -1;
 315 }
 316
 317 /* Shrink the cache to its minimal size.  Called when we init the bidi
 318    iterator for reordering a buffer or a string that does not come
 319    from display properties, because that means all the previously
 320    cached info is of no further use.  */
 321 static inline void
 322 bidi_cache_shrink (void)
 323 {
 324   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 325     {
 326       bidi_cache_size = BIDI_CACHE_CHUNK;
 327       bidi_cache =
 328         (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 329     }
 330   bidi_cache_reset ();
 331 }
 332
 333 static inline void
 334 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
 335 {
 336   int current_scan_dir = bidi_it->scan_dir;
 337
 338   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 339     abort ();
 340
 341   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 342   bidi_it->scan_dir = current_scan_dir;
 343   bidi_cache_last_idx = idx;
 344 }
 345
 346 /* Find a cached state with a given CHARPOS and resolved embedding
 347    level less or equal to LEVEL.  if LEVEL is -1, disregard the
 348    resolved levels in cached states.  DIR, if non-zero, means search
 349    in that direction from the last cache hit.  */
 350 static inline int
 351 bidi_cache_search (EMACS_INT charpos, int level, int dir)
 352 {
 353   int i, i_start;
 354
 355   if (bidi_cache_idx)
 356     {
 357       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 358         {
 359           dir = -1;
 360           i_start = bidi_cache_last_idx - 1;
 361         }
 362       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 363                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 364         {
 365           dir = 1;
 366           i_start = bidi_cache_last_idx + 1;
 367         }
 368       else if (dir)
 369         i_start = bidi_cache_last_idx;
 370       else
 371         {
 372           dir = -1;
 373           i_start = bidi_cache_idx - 1;
 374         }
 375
 376       if (dir < 0)
 377         {
 378           /* Linear search for now; FIXME!  */
 379           for (i = i_start; i >= bidi_cache_start; i--)
 380             if (bidi_cache[i].charpos <= charpos
 381                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 382                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 383               return i;
 384         }
 385       else
 386         {
 387           for (i = i_start; i < bidi_cache_idx; i++)
 388             if (bidi_cache[i].charpos <= charpos
 389                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 390                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 391               return i;
 392         }
 393     }
 394
 395   return -1;
 396 }
 397
 398 /* Find a cached state where the resolved level changes to a value
 399    that is lower than LEVEL, and return its cache slot index.  DIR is
 400    the direction to search, starting with the last used cache slot.
 401    If DIR is zero, we search backwards from the last occupied cache
 402    slot.  BEFORE, if non-zero, means return the index of the slot that
 403    is ``before'' the level change in the search direction.  That is,
 404    given the cached levels like this:
 405
 406          1122333442211
 407           AB        C
 408
 409    and assuming we are at the position cached at the slot marked with
 410    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 411    index of slot B or A, depending whether BEFORE is, respectively,
 412    non-zero or zero.  */
 413 static int
 414 bidi_cache_find_level_change (int level, int dir, int before)
 415 {
 416   if (bidi_cache_idx)
 417     {
 418       int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 419       int incr = before ? 1 : 0;
 420
 421       if (!dir)
 422         dir = -1;
 423       else if (!incr)
 424         i += dir;
 425
 426       if (dir < 0)
 427         {
 428           while (i >= bidi_cache_start + incr)
 429             {
 430               if (bidi_cache[i - incr].resolved_level >= 0
 431                   && bidi_cache[i - incr].resolved_level < level)
 432                 return i;
 433               i--;
 434             }
 435         }
 436       else
 437         {
 438           while (i < bidi_cache_idx - incr)
 439             {
 440               if (bidi_cache[i + incr].resolved_level >= 0
 441                   && bidi_cache[i + incr].resolved_level < level)
 442                 return i;
 443               i++;
 444             }
 445         }
 446     }
 447
 448   return -1;
 449 }
 450
 451 static inline void
 452 bidi_cache_ensure_space (int idx)
 453 {
 454   /* Enlarge the cache as needed.  */
 455   if (idx >= bidi_cache_size)
 456     {
 457       bidi_cache_size += BIDI_CACHE_CHUNK;
 458       bidi_cache =
 459         (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 460     }
 461 }
 462
 463 static inline void
 464 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
 465 {
 466   int idx;
 467
 468   /* We should never cache on backward scans.  */
 469   if (bidi_it->scan_dir == -1)
 470     abort ();
 471   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 472
 473   if (idx < 0)
 474     {
 475       idx = bidi_cache_idx;
 476       bidi_cache_ensure_space (idx);
 477       /* Character positions should correspond to cache positions 1:1.
 478          If we are outside the range of cached positions, the cache is
 479          useless and must be reset.  */
 480       if (idx > bidi_cache_start &&
 481           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 482                                + bidi_cache[idx - 1].nchars)
 483            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 484         {
 485           bidi_cache_reset ();
 486           idx = bidi_cache_start;
 487         }
 488       if (bidi_it->nchars <= 0)
 489         abort ();
 490       bidi_copy_it (&bidi_cache[idx], bidi_it);
 491       if (!resolved)
 492         bidi_cache[idx].resolved_level = -1;
 493     }
 494   else
 495     {
 496       /* Copy only the members which could have changed, to avoid
 497          costly copying of the entire struct.  */
 498       bidi_cache[idx].type = bidi_it->type;
 499       bidi_check_type (bidi_it->type);
 500       bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
 501       bidi_check_type (bidi_it->type_after_w1);
 502       if (resolved)
 503         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 504       else
 505         bidi_cache[idx].resolved_level = -1;
 506       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 507       bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
 508       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 509       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 510       bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
 511     }
 512
 513   bidi_cache_last_idx = idx;
 514   if (idx >= bidi_cache_idx)
 515     bidi_cache_idx = idx + 1;
 516 }
 517
 518 static inline bidi_type_t
 519 bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
 520 {
 521   int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
 522
 523   if (i >= bidi_cache_start)
 524     {
 525       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 526
 527       bidi_copy_it (bidi_it, &bidi_cache[i]);
 528       bidi_cache_last_idx = i;
 529       /* Don't let scan direction from from the cached state override
 530          the current scan direction.  */
 531       bidi_it->scan_dir = current_scan_dir;
 532       return bidi_it->type;
 533     }
 534
 535   return UNKNOWN_BT;
 536 }
 537
 538 static inline int
 539 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 540 {
 541   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 542     abort ();
 543   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 544 }
 545
 546 \f
 547 /***********************************************************************
 548              Pushing and popping the bidi iterator state
 549  ***********************************************************************/
 550 /* 5-slot stack for saving the start of the previous level of the
 551    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 552    and we need the same size of our stack.  */
 553 static int bidi_cache_start_stack[IT_STACK_SIZE];
 554 static int bidi_cache_sp;
 555
 556 /* Push the bidi iterator state in preparation for reordering a
 557    different object, e.g. display string found at certain buffer
 558    position.  Pushing the bidi iterator boils down to saving its
 559    entire state on the cache and starting a new cache "stacked" on top
 560    of the current cache.  */
 561 void
 562 bidi_push_it (struct bidi_it *bidi_it)
 563 {
 564   /* Save the current iterator state in its entirety after the last
 565      used cache slot.  */
 566   bidi_cache_ensure_space (bidi_cache_idx);
 567   memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
 568
 569   /* Push the current cache start onto the stack.  */
 570   xassert (bidi_cache_sp < IT_STACK_SIZE);
 571   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 572
 573   /* Start a new level of cache, and make it empty.  */
 574   bidi_cache_start = bidi_cache_idx;
 575   bidi_cache_last_idx = -1;
 576 }
 577
 578 /* Restore the iterator state saved by bidi_push_it and return the
 579    cache to the corresponding state.  */
 580 void
 581 bidi_pop_it (struct bidi_it *bidi_it)
 582 {
 583   if (bidi_cache_start <= 0)
 584     abort ();
 585
 586   /* Reset the next free cache slot index to what it was before the
 587      call to bidi_push_it.  */
 588   bidi_cache_idx = bidi_cache_start - 1;
 589
 590   /* Restore the bidi iterator state saved in the cache.  */
 591   memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
 592
 593   /* Pop the previous cache start from the stack.  */
 594   if (bidi_cache_sp <= 0)
 595     abort ();
 596   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 597
 598   /* Invalidate the last-used cache slot data.  */
 599   bidi_cache_last_idx = -1;
 600 }
 601
 602 \f
 603 /***********************************************************************
 604                         Initialization
 605  ***********************************************************************/
 606 static void
 607 bidi_initialize (void)
 608 {
 609
 610 #include "biditype.h"
 611 #include "bidimirror.h"
 612
 613   int i;
 614
 615   bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
 616   staticpro (&bidi_type_table);
 617
 618   for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
 619     char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
 620                           make_number (bidi_type[i].type));
 621
 622   bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
 623   staticpro (&bidi_mirror_table);
 624
 625   for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
 626     char_table_set (bidi_mirror_table, bidi_mirror[i].from,
 627                     make_number (bidi_mirror[i].to));
 628
 629   Qparagraph_start = intern ("paragraph-start");
 630   staticpro (&Qparagraph_start);
 631   paragraph_start_re = Fsymbol_value (Qparagraph_start);
 632   if (!STRINGP (paragraph_start_re))
 633     paragraph_start_re = build_string ("\f\\|[ \t]*$");
 634   staticpro (&paragraph_start_re);
 635   Qparagraph_separate = intern ("paragraph-separate");
 636   staticpro (&Qparagraph_separate);
 637   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
 638   if (!STRINGP (paragraph_separate_re))
 639     paragraph_separate_re = build_string ("[ \t\f]*$");
 640   staticpro (&paragraph_separate_re);
 641
 642   bidi_cache_sp = 0;
 643
 644   bidi_initialized = 1;
 645 }
 646
 647 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
 648    end.  */
 649 static inline void
 650 bidi_set_paragraph_end (struct bidi_it *bidi_it)
 651 {
 652   bidi_it->invalid_levels = 0;
 653   bidi_it->invalid_rl_levels = -1;
 654   bidi_it->stack_idx = 0;
 655   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 656 }
 657
 658 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
 659 void
 660 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
 661               struct bidi_it *bidi_it)
 662 {
 663   if (! bidi_initialized)
 664     bidi_initialize ();
 665   if (charpos >= 0)
 666     bidi_it->charpos = charpos;
 667   if (bytepos >= 0)
 668     bidi_it->bytepos = bytepos;
 669   bidi_it->frame_window_p = frame_window_p;
 670   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
 671   bidi_it->first_elt = 1;
 672   bidi_set_paragraph_end (bidi_it);
 673   bidi_it->new_paragraph = 1;
 674   bidi_it->separator_limit = -1;
 675   bidi_it->type = NEUTRAL_B;
 676   bidi_it->type_after_w1 = NEUTRAL_B;
 677   bidi_it->orig_type = NEUTRAL_B;
 678   bidi_it->prev_was_pdf = 0;
 679   bidi_it->prev.type = bidi_it->prev.type_after_w1 =
 680     bidi_it->prev.orig_type = UNKNOWN_BT;
 681   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 682     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 683   bidi_it->next_for_neutral.charpos = -1;
 684   bidi_it->next_for_neutral.type =
 685     bidi_it->next_for_neutral.type_after_w1 =
 686     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 687   bidi_it->prev_for_neutral.charpos = -1;
 688   bidi_it->prev_for_neutral.type =
 689     bidi_it->prev_for_neutral.type_after_w1 =
 690     bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
 691   bidi_it->sor = L2R;    /* FIXME: should it be user-selectable? */
 692   bidi_it->disp_pos = -1;       /* invalid/unknown */
 693   /* We can only shrink the cache if we are at the bottom level of its
 694      "stack".  */
 695   if (bidi_cache_start == 0)
 696     bidi_cache_shrink ();
 697   else
 698     bidi_cache_reset ();
 699 }
 700
 701 /* Perform initializations for reordering a new line of bidi text.  */
 702 static void
 703 bidi_line_init (struct bidi_it *bidi_it)
 704 {
 705   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
 706   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 707   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
 708   bidi_it->invalid_levels = 0;
 709   bidi_it->invalid_rl_levels = -1;
 710   bidi_it->next_en_pos = -1;
 711   bidi_it->next_for_ws.type = UNKNOWN_BT;
 712   bidi_set_sor_type (bidi_it,
 713                      bidi_it->paragraph_dir == R2L ? 1 : 0,
 714                      bidi_it->level_stack[0].level); /* X10 */
 715
 716   bidi_cache_reset ();
 717 }
 718
 719 \f
 720 /***********************************************************************
 721                         Fetching characters
 722  ***********************************************************************/
 723
 724 /* Count bytes in multibyte string S between BEG/BEGBYTE and END.  BEG
 725    and END are zero-based character positions in S, BEGBYTE is byte
 726    position corresponding to BEG.  */
 727 static inline EMACS_INT
 728 bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
 729                   const EMACS_INT begbyte, const EMACS_INT end)
 730 {
 731   EMACS_INT pos = beg;
 732   const unsigned char *p = s + begbyte, *start = p;
 733
 734   if (!CHAR_HEAD_P (*p))
 735     abort ();
 736
 737   while (pos < end)
 738     {
 739       p += BYTES_BY_CHAR_HEAD (*p);
 740       pos++;
 741     }
 742
 743   return p - start;
 744 }
 745
 746 /* Fetch and returns the character at byte position BYTEPOS.  If S is
 747    non-NULL, fetch the character from string S; otherwise fetch the
 748    character from the current buffer.  */
 749 static inline int
 750 bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s)
 751 {
 752   if (s)
 753     return STRING_CHAR (s + bytepos);
 754   else
 755     return FETCH_MULTIBYTE_CHAR (bytepos);
 756 }
 757
 758 /* Fetch and return the character at BYTEPOS/CHARPOS.  If that
 759    character is covered by a display string, treat the entire run of
 760    covered characters as a single character u+FFFC, and return their
 761    combined length in CH_LEN and NCHARS.  DISP_POS specifies the
 762    character position of the next display string, or -1 if not yet
 763    computed.  When the next character is at or beyond that position,
 764    the function updates DISP_POS with the position of the next display
 765    string.  STRING->s is the C string to iterate, or NULL if iterating
 766    over a buffer or a Lisp string; in the latter case, STRING->lstring
 767    is the Lisp string.  */
 768 static inline int
 769 bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
 770                  struct bidi_string_data *string,
 771                  int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
 772 {
 773   int ch;
 774   EMACS_INT endpos =
 775     (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
 776   struct text_pos pos;
 777
 778   /* If we got past the last known position of display string, compute
 779      the position of the next one.  That position could be at CHARPOS.  */
 780   if (charpos < endpos && charpos > *disp_pos)
 781     {
 782       SET_TEXT_POS (pos, charpos, bytepos);
 783       *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
 784     }
 785
 786   /* Fetch the character at BYTEPOS.  */
 787   if (charpos >= endpos)
 788     {
 789       ch = BIDI_EOB;
 790       *ch_len = 1;
 791       *nchars = 1;
 792       *disp_pos = endpos;
 793     }
 794   else if (charpos >= *disp_pos)
 795     {
 796       EMACS_INT disp_end_pos;
 797
 798       /* We don't expect to find ourselves in the middle of a display
 799          property.  Hopefully, it will never be needed.  */
 800       if (charpos > *disp_pos)
 801         abort ();
 802       /* Return the Unicode Object Replacement Character to represent
 803          the entire run of characters covered by the display string.  */
 804       ch = 0xFFFC;
 805       disp_end_pos = compute_display_string_end (*disp_pos, string);
 806       *nchars = disp_end_pos - *disp_pos;
 807       if (string->s)
 808         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
 809                                     disp_end_pos);
 810       else if (STRINGP (string->lstring))
 811         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
 812                                     bytepos, disp_end_pos);
 813       else
 814         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
 815     }
 816   else
 817     {
 818       if (string->s)
 819         {
 820           EMACS_INT len;
 821
 822           ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
 823           *ch_len = len;
 824         }
 825       else if (STRINGP (string->lstring))
 826         {
 827           EMACS_INT len;
 828
 829           ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos, len);
 830           *ch_len = len;
 831         }
 832       else
 833         {
 834           ch = FETCH_MULTIBYTE_CHAR (bytepos);
 835           *ch_len = CHAR_BYTES (ch);
 836         }
 837       *nchars = 1;
 838     }
 839
 840   /* If we just entered a run of characters covered by a display
 841      string, compute the position of the next display string.  */
 842   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
 843     {
 844       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
 845       *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
 846     }
 847
 848   return ch;
 849 }
 850
 851 \f
 852 /***********************************************************************
 853                         Determining paragraph direction
 854  ***********************************************************************/
 855
 856 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
 857    Value is the non-negative length of the paragraph separator
 858    following the buffer position, -1 if position is at the beginning
 859    of a new paragraph, or -2 if position is neither at beginning nor
 860    at end of a paragraph.  */
 861 static EMACS_INT
 862 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
 863 {
 864   Lisp_Object sep_re;
 865   Lisp_Object start_re;
 866   EMACS_INT val;
 867
 868   sep_re = paragraph_separate_re;
 869   start_re = paragraph_start_re;
 870
 871   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
 872   if (val < 0)
 873     {
 874       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
 875         val = -1;
 876       else
 877         val = -2;
 878     }
 879
 880   return val;
 881 }
 882
 883 /* Find the beginning of this paragraph by looking back in the buffer.
 884    Value is the byte position of the paragraph's beginning.  */
 885 static EMACS_INT
 886 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
 887 {
 888   Lisp_Object re = paragraph_start_re;
 889   EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
 890
 891   while (pos_byte > BEGV_BYTE
 892          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
 893     {
 894       /* FIXME: What if the paragraph beginning is covered by a
 895          display string?  And what if a display string covering some
 896          of the text over which we scan back includes
 897          paragraph_start_re?  */
 898       pos = find_next_newline_no_quit (pos - 1, -1);
 899       pos_byte = CHAR_TO_BYTE (pos);
 900     }
 901   return pos_byte;
 902 }
 903
 904 /* Determine the base direction, a.k.a. base embedding level, of the
 905    paragraph we are about to iterate through.  If DIR is either L2R or
 906    R2L, just use that.  Otherwise, determine the paragraph direction
 907    from the first strong directional character of the paragraph.
 908
 909    NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
 910    has no strong directional characters and both DIR and
 911    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
 912    in the buffer until a paragraph is found with a strong character,
 913    or until hitting BEGV.  In the latter case, fall back to L2R.  This
 914    flag is used in current-bidi-paragraph-direction.
 915
 916    Note that this function gives the paragraph separator the same
 917    direction as the preceding paragraph, even though Emacs generally
 918    views the separator as not belonging to any paragraph.

        On normal return, BIDI_IT->paragraph_dir holds the resolved
        direction, BIDI_IT->level_stack[0].level is 1 for R2L and 0
        otherwise, and bidi_line_init has been called on BIDI_IT.  The
        one exception is the early return taken while still inside a
        paragraph separator, which leaves BIDI_IT untouched.

        Aborts if DIR is not one of L2R, R2L or NEUTRAL_DIR, or if
        BIDI_IT is positioned at or beyond the end, or before the
        beginning, of the buffer/string (the empty buffer/string case
        is handled specially and does not abort).  */
 919 void
 920 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
 921 {
 922   EMACS_INT bytepos = bidi_it->bytepos;
 923   int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
 924   EMACS_INT pstartbyte;
 925   /* Note that begbyte is a byte position, while end is a character
 926      position.  Yes, this is ugly, but we are trying to avoid costly
 927      calls to BYTE_TO_CHAR and its ilk.  */
 928   EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
 929   EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
 930
 931   /* Special case for an empty buffer. */
 932   if (bytepos == begbyte && bidi_it->charpos == end)
 933     dir = L2R;
 934   /* We should never be called at EOB or before BEGV.  */
 935   else if (bidi_it->charpos >= end || bytepos < begbyte)
 936     abort ();
 937
 938   if (dir == L2R)
 939     {
 940       bidi_it->paragraph_dir = L2R;
 941       bidi_it->new_paragraph = 0;
 942     }
 943   else if (dir == R2L)
 944     {
 945       bidi_it->paragraph_dir = R2L;
 946       bidi_it->new_paragraph = 0;
 947     }
 948   else if (dir == NEUTRAL_DIR)  /* P2 */
 949     {
 950       int ch;
 951       EMACS_INT ch_len, nchars;
 952       EMACS_INT pos, disp_pos = -1;
 953       bidi_type_t type;
 954       const unsigned char *s;
 955
 956       if (!bidi_initialized)
 957         bidi_initialize ();
 958
 959       /* If we are inside a paragraph separator, we are just waiting
 960          for the separator to be exhausted; use the previous paragraph
 961          direction.  But don't do that if we have been just reseated,
 962          because we need to reinitialize below in that case.  */
 963       if (!bidi_it->first_elt
 964           && bidi_it->charpos < bidi_it->separator_limit)
 965         return;
 966
 967       /* If we are on a newline, get past it to where the next
 968          paragraph might start.  But don't do that at BEGV since then
 969          we are potentially in a new paragraph that doesn't yet
 970          exist.  */
 971       pos = bidi_it->charpos;
 972       s = STRINGP (bidi_it->string.lstring) ?
 973         SDATA (bidi_it->string.lstring) : bidi_it->string.s;
 974       if (bytepos > begbyte && bidi_char_at_pos (bytepos, s) == '\n')
 975         {
 976           bytepos++;
 977           pos++;
 978         }
 979
 980       /* We are either at the beginning of a paragraph or in the
 981          middle of it.  Find where this paragraph starts.  */
 982       if (string_p)
 983         {
 984           /* We don't support changes of paragraph direction inside a
 985              string.  It is treated as a single paragraph.  */
 986           pstartbyte = 0;
 987         }
 988       else
 989         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
 990       bidi_it->separator_limit = -1;
 991       bidi_it->new_paragraph = 0;
 992
 993       /* The following loop is run more than once only if NO_DEFAULT_P
 994          is non-zero, and only if we are iterating on a buffer.  */
 995       do {
 996         bytepos = pstartbyte;
 997         if (!string_p)
 998           pos = BYTE_TO_CHAR (bytepos);
 999         ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
1000                               bidi_it->frame_window_p, &ch_len, &nchars);
1001         type = bidi_get_type (ch, NEUTRAL_DIR);
1002
             /* Scan forward from the paragraph start until a strong
                character is found, a paragraph boundary is reached (buffers
                only), or the end of the buffer/string is hit.  */
1003         for (pos += nchars, bytepos += ch_len;
1004              /* NOTE: UAX#9 says to search only for L, AL, or R types
1005                 of characters, and ignore RLE, RLO, LRE, and LRO.
1006                 However, I'm not sure it makes sense to omit those 4;
1007                 should try with and without that to see the effect.  */
1008              (bidi_get_category (type) != STRONG)
1009                || (bidi_ignore_explicit_marks_for_paragraph_level
1010                    && (type == RLE || type == RLO
1011                        || type == LRE || type == LRO));
1012              type = bidi_get_type (ch, NEUTRAL_DIR))
1013           {
1014             if (!string_p
1015                 && type == NEUTRAL_B
1016                 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1017               break;
1018             if (pos >= end)
1019               {
1020                 /* Pretend there's a paragraph separator at end of
1021                    buffer/string.  */
1022                 type = NEUTRAL_B;
1023                 break;
1024               }
1025             /* Fetch next character and advance to get past it.  */
1026             ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
1027                                   bidi_it->frame_window_p, &ch_len, &nchars);
1028             pos += nchars;
1029             bytepos += ch_len;
1030           }
1031         if (type == STRONG_R || type == STRONG_AL) /* P3 */
1032           bidi_it->paragraph_dir = R2L;
1033         else if (type == STRONG_L)
1034           bidi_it->paragraph_dir = L2R;
             /* Still no direction and the caller forbade defaulting: move
                PSTARTBYTE to the start of an earlier paragraph, so that
                the enclosing loop rescans from there.  */
1035         if (!string_p
1036             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1037           {
1038             /* If this paragraph is at BEGV, default to L2R.  */
1039             if (pstartbyte == BEGV_BYTE)
1040               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1041             else
1042               {
1043                 EMACS_INT prevpbyte = pstartbyte;
1044                 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1045
1046                 /* Find the beginning of the previous paragraph, if any.  */
1047                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1048                   {
1049                     /* FIXME: What if p is covered by a display
1050                        string?  See also a FIXME inside
1051                        bidi_find_paragraph_start.  */
1052                     p--;
1053                     pbyte = CHAR_TO_BYTE (p);
1054                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1055                   }
1056                 pstartbyte = prevpbyte;
1057               }
1058           }
1059       } while (!string_p
1060                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1061     }
1062   else
1063     abort ();
1064
1065   /* Contrary to UAX#9 clause P3, we only default the paragraph
1066      direction to L2R if we have no previous usable paragraph
1067      direction.  This is allowed by the HL1 clause.  */
1068   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1069     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1070   if (bidi_it->paragraph_dir == R2L)
1071     bidi_it->level_stack[0].level = 1;
1072   else
1073     bidi_it->level_stack[0].level = 0;
1074
1075   bidi_line_init (bidi_it);
1076 }
1077
1078 \f
1079 /***********************************************************************
1080                  Resolving explicit and implicit levels.
1081   The rest of this file constitutes the core of the UBA implementation.
1082  ***********************************************************************/
1083
1084 static inline int
1085 bidi_explicit_dir_char (int ch)
1086 {
1087   bidi_type_t ch_type;
1088
1089   if (!bidi_initialized)
1090     abort ();
1091   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1092   return (ch_type == LRE || ch_type == LRO
1093           || ch_type == RLE || ch_type == RLO
1094           || ch_type == PDF);
1095 }
1096
1097 /* A helper function for bidi_resolve_explicit.  It advances to the
1098    next character in logical order and determines the new embedding
1099    level and directional override, but does not take into account
1100    empty embeddings.

        Value is the embedding level after processing the character: the
        level at the top of the level stack on entry, unless an
        embedding/override code pushed a new level or a PDF popped one.

        As side effects, BIDI_IT's charpos, bytepos, ch, ch_len, nchars,
        orig_type, type_after_w1 and type are updated for the character
        just fetched, and the level stack and the invalid_levels /
        invalid_rl_levels counters are updated when the character is an
        explicit directional code and BIDI_IT->ignore_bn_limit is -1 or
        less.  At end of buffer/string the character is reported as
        BIDI_EOB and the position is not advanced.  */
1101 static int
1102 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1103 {
1104   int curchar;
1105   bidi_type_t type;
1106   int current_level;
1107   int new_level;
1108   bidi_dir_t override;
1109   int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
1110
1111   /* If reseat()'ed, don't advance, so as to start iteration from the
1112      position where we were reseated.  bidi_it->bytepos can be less
1113      than BEGV_BYTE after reseat to BEGV.  */
1114   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1115       || bidi_it->first_elt)
1116     {
1117       bidi_it->first_elt = 0;
1118       if (string_p)
1119         {
1120           const unsigned char *p =
1121             STRINGP (bidi_it->string.lstring)
1122             ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1123
1124           if (bidi_it->charpos < 0)
1125             bidi_it->charpos = 0;
1126           bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos);
1127         }
1128       else
1129         {
1130           if (bidi_it->charpos < BEGV)
1131             bidi_it->charpos = BEGV;
1132           bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1133         }
1134     }
1135   /* Don't move at end of buffer/string.  */
1136   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1137     {
1138       /* Advance to the next character, skipping characters covered by
1139          display strings (nchars > 1).  */
1140       if (bidi_it->nchars <= 0)
1141         abort ();
1142       bidi_it->charpos += bidi_it->nchars;
1143       if (bidi_it->ch_len == 0)
1144         abort ();
1145       bidi_it->bytepos += bidi_it->ch_len;
1146     }
1147
1148   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1149   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1150   new_level = current_level;
1151
1152   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1153     {
1154       curchar = BIDI_EOB;
1155       bidi_it->ch_len = 1;
1156       bidi_it->nchars = 1;
1157       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1158     }
1159   else
1160     {
1161       /* Fetch the character at BYTEPOS.  If it is covered by a
1162          display string, treat the entire run of covered characters as
1163          a single character u+FFFC.  */
1164       curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
1165                                  &bidi_it->disp_pos, &bidi_it->string,
1166                                  bidi_it->frame_window_p,
1167                                  &bidi_it->ch_len, &bidi_it->nchars);
1168     }
1169   bidi_it->ch = curchar;
1170
1171   /* Don't apply directional override here, as all the types we handle
1172      below will not be affected by the override anyway, and we need
1173      the original type unaltered.  The override will be applied in
1174      bidi_resolve_weak.  */
1175   type = bidi_get_type (curchar, NEUTRAL_DIR);
1176   bidi_it->orig_type = type;
1177   bidi_check_type (bidi_it->orig_type);
1178
1179   if (type != PDF)
1180     bidi_it->prev_was_pdf = 0;
1181
1182   bidi_it->type_after_w1 = UNKNOWN_BT;
1183
       /* In each case below the explicit code itself is turned into
          WEAK_BN; the level stack is touched only when ignore_bn_limit
          is -1 or less, otherwise the code may be treated as WEAK_EN.  */
1184   switch (type)
1185     {
1186       case RLE: /* X2 */
1187       case RLO: /* X4 */
1188         bidi_it->type_after_w1 = type;
1189         bidi_check_type (bidi_it->type_after_w1);
1190         type = WEAK_BN; /* X9/Retaining */
1191         if (bidi_it->ignore_bn_limit <= -1)
1192           {
1193             if (current_level <= BIDI_MAXLEVEL - 4)
1194               {
1195                 /* Compute the least odd embedding level greater than
1196                    the current level.  */
1197                 new_level = ((current_level + 1) & ~1) + 1;
1198                 if (bidi_it->type_after_w1 == RLE)
1199                   override = NEUTRAL_DIR;
1200                 else
1201                   override = R2L;
1202                 if (current_level == BIDI_MAXLEVEL - 4)
1203                   bidi_it->invalid_rl_levels = 0;
1204                 bidi_push_embedding_level (bidi_it, new_level, override);
1205               }
1206             else
1207               {
1208                 bidi_it->invalid_levels++;
1209                 /* See the commentary about invalid_rl_levels below.  */
1210                 if (bidi_it->invalid_rl_levels < 0)
1211                   bidi_it->invalid_rl_levels = 0;
1212                 bidi_it->invalid_rl_levels++;
1213               }
1214           }
1215         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1216                  || bidi_it->next_en_pos > bidi_it->charpos)
1217           type = WEAK_EN;
1218         break;
1219       case LRE: /* X3 */
1220       case LRO: /* X5 */
1221         bidi_it->type_after_w1 = type;
1222         bidi_check_type (bidi_it->type_after_w1);
1223         type = WEAK_BN; /* X9/Retaining */
1224         if (bidi_it->ignore_bn_limit <= -1)
1225           {
1226             if (current_level <= BIDI_MAXLEVEL - 5)
1227               {
1228                 /* Compute the least even embedding level greater than
1229                    the current level.  */
1230                 new_level = ((current_level + 2) & ~1);
1231                 if (bidi_it->type_after_w1 == LRE)
1232                   override = NEUTRAL_DIR;
1233                 else
1234                   override = L2R;
1235                 bidi_push_embedding_level (bidi_it, new_level, override);
1236               }
1237             else
1238               {
1239                 bidi_it->invalid_levels++;
1240                 /* invalid_rl_levels counts invalid levels encountered
1241                    while the embedding level was already too high for
1242                    LRE/LRO, but not for RLE/RLO.  That is because
1243                    there may be exactly one PDF which we should not
1244                    ignore even though invalid_levels is non-zero.
1245                    invalid_rl_levels helps to know what PDF is
1246                    that.  */
1247                 if (bidi_it->invalid_rl_levels >= 0)
1248                   bidi_it->invalid_rl_levels++;
1249               }
1250           }
1251         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1252                  || bidi_it->next_en_pos > bidi_it->charpos)
1253           type = WEAK_EN;
1254         break;
1255       case PDF: /* X7 */
1256         bidi_it->type_after_w1 = type;
1257         bidi_check_type (bidi_it->type_after_w1);
1258         type = WEAK_BN; /* X9/Retaining */
1259         if (bidi_it->ignore_bn_limit <= -1)
1260           {
                 /* NOTE(review): when invalid_rl_levels is zero, the branch
                    below pops a level, and the `if (!bidi_it->invalid_levels)'
                    test that follows it can pop a second one; confirm that
                    this is intended.  */
1261             if (!bidi_it->invalid_rl_levels)
1262               {
1263                 new_level = bidi_pop_embedding_level (bidi_it);
1264                 bidi_it->invalid_rl_levels = -1;
1265                 if (bidi_it->invalid_levels)
1266                   bidi_it->invalid_levels--;
1267                 /* else nothing: UAX#9 says to ignore invalid PDFs */
1268               }
1269             if (!bidi_it->invalid_levels)
1270               new_level = bidi_pop_embedding_level (bidi_it);
1271             else
1272               {
1273                 bidi_it->invalid_levels--;
1274                 bidi_it->invalid_rl_levels--;
1275               }
1276           }
1277         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1278                  || bidi_it->next_en_pos > bidi_it->charpos)
1279           type = WEAK_EN;
1280         break;
1281       default:
1282         /* Nothing.  */
1283         break;
1284     }
1285
1286   bidi_it->type = type;
1287   bidi_check_type (bidi_it->type);
1288
1289   return new_level;
1290 }
1291
1292 /* Given an iterator state in BIDI_IT, advance one character position
1293    in the buffer/string to the next character (in the logical order),
1294    resolve any explicit embeddings and directional overrides, and
1295    return the embedding level of the character after resolving
1296    explicit directives and ignoring empty embeddings.  */
1297 static int
1298 bidi_resolve_explicit (struct bidi_it *bidi_it)
1299 {
1300   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1301   int new_level  = bidi_resolve_explicit_1 (bidi_it);
1302   EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
1303   const unsigned char *s = STRINGP (bidi_it->string.lstring)
1304     ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1305
1306   if (prev_level < new_level
1307       && bidi_it->type == WEAK_BN
1308       && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1309       && bidi_it->charpos < eob         /* not already at EOB */
1310       && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1311                                                    + bidi_it->ch_len, s)))
1312     {
1313       /* Avoid pushing and popping embedding levels if the level run
1314          is empty, as this breaks level runs where it shouldn't.
1315          UAX#9 removes all the explicit embedding and override codes,
1316          so empty embeddings disappear without a trace.  We need to
1317          behave as if we did the same.  */
1318       struct bidi_it saved_it;
1319       int level = prev_level;
1320
1321       bidi_copy_it (&saved_it, bidi_it);
1322
1323       while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1324                                                        + bidi_it->ch_len, s)))
1325         {
1326           /* This advances to the next character, skipping any
1327              characters covered by display strings.  */
1328           level = bidi_resolve_explicit_1 (bidi_it);
1329           /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1330              a pointer to its data is no longer valid.  */
1331           if (STRINGP (bidi_it->string.lstring))
1332             s = SDATA (bidi_it->string.lstring);
1333         }
1334
1335       if (bidi_it->nchars <= 0)
1336         abort ();
1337       if (level == prev_level)  /* empty embedding */
1338         saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
1339       else                      /* this embedding is non-empty */
1340         saved_it.ignore_bn_limit = -2;
1341
1342       bidi_copy_it (bidi_it, &saved_it);
1343       if (bidi_it->ignore_bn_limit > -1)
1344         {
1345           /* We pushed a level, but we shouldn't have.  Undo that. */
1346           if (!bidi_it->invalid_rl_levels)
1347             {
1348               new_level = bidi_pop_embedding_level (bidi_it);
1349               bidi_it->invalid_rl_levels = -1;
1350               if (bidi_it->invalid_levels)
1351                 bidi_it->invalid_levels--;
1352             }
1353           if (!bidi_it->invalid_levels)
1354             new_level = bidi_pop_embedding_level (bidi_it);
1355           else
1356             {
1357               bidi_it->invalid_levels--;
1358               bidi_it->invalid_rl_levels--;
1359             }
1360         }
1361     }
1362
1363   if (bidi_it->type == NEUTRAL_B)       /* X8 */
1364     {
1365       bidi_set_paragraph_end (bidi_it);
1366       /* This is needed by bidi_resolve_weak below, and in L1.  */
1367       bidi_it->type_after_w1 = bidi_it->type;
1368       bidi_check_type (bidi_it->type_after_w1);
1369     }
1370
1371   return new_level;
1372 }
1373
1374 /* Advance in the buffer/string, resolve weak types and return the
1375    type of the next character after weak type resolution.

        The returned type is also stored in BIDI_IT->type.
        BIDI_IT->type_after_w1 keeps the type as it was before rule W7 is
        applied, unless a type was already recorded there earlier, in
        which case that earlier value is left alone.  The look-ahead
        scans needed by W4 and W5 are run on BIDI_IT itself and undone
        afterwards by restoring a saved copy of the iterator.

        Aborts if bidi_resolve_explicit leaves an unresolved type
        (UNKNOWN_BT or one of the explicit codes) in BIDI_IT->type.  */
1376 static bidi_type_t
1377 bidi_resolve_weak (struct bidi_it *bidi_it)
1378 {
1379   bidi_type_t type;
1380   bidi_dir_t override;
1381   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1382   int new_level  = bidi_resolve_explicit (bidi_it);
1383   int next_char;
1384   bidi_type_t type_of_next;
1385   struct bidi_it saved_it;
1386   EMACS_INT eob =
1387     (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1388     ? bidi_it->string.schars : ZV;
1389
1390   type = bidi_it->type;
1391   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1392
1393   if (type == UNKNOWN_BT
1394       || type == LRE
1395       || type == LRO
1396       || type == RLE
1397       || type == RLO
1398       || type == PDF)
1399     abort ();
1400
1401   if (new_level != prev_level
1402       || bidi_it->type == NEUTRAL_B)
1403     {
1404       /* We've got a new embedding level run, compute the directional
1405          type of sor and initialize per-run variables (UAX#9, clause
1406          X10).  */
1407       bidi_set_sor_type (bidi_it, prev_level, new_level);
1408     }
1409   else if (type == NEUTRAL_S || type == NEUTRAL_WS
1410            || type == WEAK_BN || type == STRONG_AL)
1411     bidi_it->type_after_w1 = type;      /* needed in L1 */
1412   bidi_check_type (bidi_it->type_after_w1);
1413
1414   /* Level and directional override status are already recorded in
1415      bidi_it, and do not need any change; see X6.  */
1416   if (override == R2L)          /* X6 */
1417     type = STRONG_R;
1418   else if (override == L2R)
1419     type = STRONG_L;
1420   else
1421     {
1422       if (type == WEAK_NSM)     /* W1 */
1423         {
1424           /* Note that we don't need to consider the case where the
1425              prev character has its type overridden by an RLO or LRO,
1426              because then either the type of this NSM would have been
1427              also overridden, or the previous character is outside the
1428              current level run, and thus not relevant to this NSM.
1429              This is why NSM gets the type_after_w1 of the previous
1430              character.  */
1431           if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1432               /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1433               && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1434             type = bidi_it->prev.type_after_w1;
1435           else if (bidi_it->sor == R2L)
1436             type = STRONG_R;
1437           else if (bidi_it->sor == L2R)
1438             type = STRONG_L;
1439           else /* shouldn't happen! */
1440             abort ();
1441         }
1442       if (type == WEAK_EN       /* W2 */
1443           && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1444         type = WEAK_AN;
1445       else if (type == STRONG_AL) /* W3 */
1446         type = STRONG_R;
1447       else if ((type == WEAK_ES /* W4 */
1448                 && bidi_it->prev.type_after_w1 == WEAK_EN
1449                 && bidi_it->prev.orig_type == WEAK_EN)
1450                || (type == WEAK_CS
1451                    && ((bidi_it->prev.type_after_w1 == WEAK_EN
1452                         && bidi_it->prev.orig_type == WEAK_EN)
1453                        || bidi_it->prev.type_after_w1 == WEAK_AN)))
1454         {
1455           const unsigned char *s =
1456             STRINGP (bidi_it->string.lstring)
1457             ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1458
1459           next_char =
1460             bidi_it->charpos + bidi_it->nchars >= eob
1461             ? BIDI_EOB
1462             : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
1463           type_of_next = bidi_get_type (next_char, override);
1464
               /* The separator is followed by codes that resolve to BN:
                  scan past them, within this level, to find the type that
                  really follows, then restore the iterator.  */
1465           if (type_of_next == WEAK_BN
1466               || bidi_explicit_dir_char (next_char))
1467             {
1468               bidi_copy_it (&saved_it, bidi_it);
1469               while (bidi_resolve_explicit (bidi_it) == new_level
1470                      && bidi_it->type == WEAK_BN)
1471                 ;
1472               type_of_next = bidi_it->type;
1473               bidi_copy_it (bidi_it, &saved_it);
1474             }
1475
1476           /* If the next character is EN, but the last strong-type
1477              character is AL, that next EN will be changed to AN when
1478              we process it in W2 above.  So in that case, this ES
1479              should not be changed into EN.  */
1480           if (type == WEAK_ES
1481               && type_of_next == WEAK_EN
1482               && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1483             type = WEAK_EN;
1484           else if (type == WEAK_CS)
1485             {
1486               if (bidi_it->prev.type_after_w1 == WEAK_AN
1487                   && (type_of_next == WEAK_AN
1488                       /* If the next character is EN, but the last
1489                          strong-type character is AL, EN will be later
1490                          changed to AN when we process it in W2 above.
1491                          So in that case, this CS is effectively between
1492                          two ANs and is changed into AN.  */
1493                       || (type_of_next == WEAK_EN
1494                           && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1495                 type = WEAK_AN;
1496               else if (bidi_it->prev.type_after_w1 == WEAK_EN
1497                        && type_of_next == WEAK_EN
1498                        && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1499                 type = WEAK_EN;
1500             }
1501         }
1502       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
1503                || type == WEAK_BN)      /* W5/Retaining */
1504         {
1505           if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1506               || bidi_it->next_en_pos > bidi_it->charpos)
1507             type = WEAK_EN;
1508           else                  /* W5: ET/BN with EN after it.  */
1509             {
1510               EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
1511               const unsigned char *s =
1512                 STRINGP (bidi_it->string.lstring)
1513                 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1514
1515               if (bidi_it->nchars <= 0)
1516                 abort ();
1517               next_char =
1518                 bidi_it->charpos + bidi_it->nchars >= eob
1519                 ? BIDI_EOB
1520                 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
1521               type_of_next = bidi_get_type (next_char, override);
1522
1523               if (type_of_next == WEAK_ET
1524                   || type_of_next == WEAK_BN
1525                   || bidi_explicit_dir_char (next_char))
1526                 {
1527                   bidi_copy_it (&saved_it, bidi_it);
1528                   while (bidi_resolve_explicit (bidi_it) == new_level
1529                          && (bidi_it->type == WEAK_BN
1530                              || bidi_it->type == WEAK_ET))
1531                     ;
1532                   type_of_next = bidi_it->type;
1533                   en_pos = bidi_it->charpos;
1534                   bidi_copy_it (bidi_it, &saved_it);
1535                 }
1536               if (type_of_next == WEAK_EN)
1537                 {
1538                   /* If the last strong character is AL, the EN we've
1539                      found will become AN when we get to it (W2). */
1540                   if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1541                     {
1542                       type = WEAK_EN;
1543                       /* Remember this EN position, to speed up processing
1544                          of the next ETs.  */
1545                       bidi_it->next_en_pos = en_pos;
1546                     }
1547                   else if (type == WEAK_BN)
1548                     type = NEUTRAL_ON; /* W6/Retaining */
1549                 }
1550             }
1551         }
1552     }
1553
1554   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1555       || (type == WEAK_BN
1556           && (bidi_it->prev.type_after_w1 == WEAK_CS        /* W6/Retaining */
1557               || bidi_it->prev.type_after_w1 == WEAK_ES
1558               || bidi_it->prev.type_after_w1 == WEAK_ET)))
1559     type = NEUTRAL_ON;
1560
1561   /* Store the type we've got so far, before we clobber it with strong
1562      types in W7 and while resolving neutral types.  But leave alone
1563      the original types that were recorded above, because we will need
1564      them for the L1 clause.  */
1565   if (bidi_it->type_after_w1 == UNKNOWN_BT)
1566     bidi_it->type_after_w1 = type;
1567   bidi_check_type (bidi_it->type_after_w1);
1568
1569   if (type == WEAK_EN)  /* W7 */
1570     {
1571       if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1572           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1573         type = STRONG_L;
1574     }
1575
1576   bidi_it->type = type;
1577   bidi_check_type (bidi_it->type);
1578   return type;
1579 }
1580
1581 /* Resolve the type of a neutral character according to the type of
1582    surrounding strong text and the current embedding level.  */
1583 static inline bidi_type_t
1584 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1585 {
1586   /* N1: European and Arabic numbers are treated as though they were R.  */
1587   if (next_type == WEAK_EN || next_type == WEAK_AN)
1588     next_type = STRONG_R;
1589   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1590     prev_type = STRONG_R;
1591
1592   if (next_type == prev_type)   /* N1 */
1593     return next_type;
1594   else if ((lev & 1) == 0)      /* N2 */
1595     return STRONG_L;
1596   else
1597     return STRONG_R;
1598 }
1599
1600 static bidi_type_t
1601 bidi_resolve_neutral (struct bidi_it *bidi_it)
1602 {
1603   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1604   bidi_type_t type = bidi_resolve_weak (bidi_it);
1605   int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1606
1607   if (!(type == STRONG_R
1608         || type == STRONG_L
1609         || type == WEAK_BN
1610         || type == WEAK_EN
1611         || type == WEAK_AN
1612         || type == NEUTRAL_B
1613         || type == NEUTRAL_S
1614         || type == NEUTRAL_WS
1615         || type == NEUTRAL_ON))
1616     abort ();
1617
1618   if (bidi_get_category (type) == NEUTRAL
1619       || (type == WEAK_BN && prev_level == current_level))
1620     {
1621       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1622         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1623                                        bidi_it->next_for_neutral.type,
1624                                        current_level);
1625       else
1626         {
1627           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
1628              the assumption of batch-style processing; see clauses W4,
1629              W5, and especially N1, which require to look far forward
1630              (as well as back) in the buffer/string.  May the fleas of
1631              a thousand camels infest the armpits of those who design
1632              supposedly general-purpose algorithms by looking at their
1633              own implementations, and fail to consider other possible
1634              implementations!  */
1635           struct bidi_it saved_it;
1636           bidi_type_t next_type;
1637
1638           if (bidi_it->scan_dir == -1)
1639             abort ();
1640
1641           bidi_copy_it (&saved_it, bidi_it);
1642           /* Scan the text forward until we find the first non-neutral
1643              character, and then use that to resolve the neutral we
1644              are dealing with now.  We also cache the scanned iterator
1645              states, to salvage some of the effort later.  */
1646           bidi_cache_iterator_state (bidi_it, 0);
1647           do {
1648             /* Record the info about the previous character, so that
1649                it will be cached below with this state.  */
1650             if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1651                 && bidi_it->type != WEAK_BN)
1652               bidi_remember_char (&bidi_it->prev, bidi_it);
1653             type = bidi_resolve_weak (bidi_it);
1654             /* Paragraph separators have their levels fully resolved
1655                at this point, so cache them as resolved.  */
1656             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1657             /* FIXME: implement L1 here, by testing for a newline and
1658                resetting the level for any sequence of whitespace
1659                characters adjacent to it.  */
1660           } while (!(type == NEUTRAL_B
1661                      || (type != WEAK_BN
1662                          && bidi_get_category (type) != NEUTRAL)
1663                      /* This is all per level run, so stop when we
1664                         reach the end of this level run.  */
1665                      || bidi_it->level_stack[bidi_it->stack_idx].level !=
1666                      current_level));
1667
1668           bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1669
1670           switch (type)
1671             {
1672               case STRONG_L:
1673               case STRONG_R:
1674               case STRONG_AL:
1675                 next_type = type;
1676                 break;
1677               case WEAK_EN:
1678               case WEAK_AN:
1679                 /* N1: ``European and Arabic numbers are treated as
1680                    though they were R.''  */
1681                 next_type = STRONG_R;
1682                 saved_it.next_for_neutral.type = STRONG_R;
1683                 break;
1684               case WEAK_BN:
1685                 if (!bidi_explicit_dir_char (bidi_it->ch))
1686                   abort ();             /* can't happen: BNs are skipped */
1687                 /* FALLTHROUGH */
1688               case NEUTRAL_B:
1689                 /* Marched all the way to the end of this level run.
1690                    We need to use the eor type, whose information is
1691                    stored by bidi_set_sor_type in the prev_for_neutral
1692                    member.  */
1693                 if (saved_it.type != WEAK_BN
1694                     || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1695                   {
1696                     next_type = bidi_it->prev_for_neutral.type;
1697                     saved_it.next_for_neutral.type = next_type;
1698                     bidi_check_type (next_type);
1699                   }
1700                 else
1701                   {
1702                     /* This is a BN which does not adjoin neutrals.
1703                        Leave its type alone.  */
1704                     bidi_copy_it (bidi_it, &saved_it);
1705                     return bidi_it->type;
1706                   }
1707                 break;
1708               default:
1709                 abort ();
1710             }
1711           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1712                                          next_type, current_level);
1713           saved_it.type = type;
1714           bidi_check_type (type);
1715           bidi_copy_it (bidi_it, &saved_it);
1716         }
1717     }
1718   return type;
1719 }
1720
1721 /* Given an iterator state in BIDI_IT, advance one character position
1722    in the buffer/string to the next character (in the logical order),
1723    resolve the bidi type of that next character, and return that
1724    type.  */
1725 static bidi_type_t
1726 bidi_type_of_next_char (struct bidi_it *bidi_it)
1727 {
1728   bidi_type_t type;
1729
1730   /* This should always be called during a forward scan.  */
1731   if (bidi_it->scan_dir != 1)
1732     abort ();
1733
1734   /* Reset the limit until which to ignore BNs if we step out of the
1735      area where we found only empty levels.  */
1736   if ((bidi_it->ignore_bn_limit > -1
1737        && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1738       || (bidi_it->ignore_bn_limit == -2
1739           && !bidi_explicit_dir_char (bidi_it->ch)))
1740     bidi_it->ignore_bn_limit = -1;
1741
1742   type = bidi_resolve_neutral (bidi_it);
1743
1744   return type;
1745 }
1746
1747 /* Given an iterator state BIDI_IT, advance one character position in
1748    the buffer/string to the next character (in the current scan
1749    direction), resolve the embedding and implicit levels of that next
1750    character, and return the resulting level.  */
1751 static int
1752 bidi_level_of_next_char (struct bidi_it *bidi_it)
1753 {
1754   bidi_type_t type;
1755   int level, prev_level = -1;
1756   struct bidi_saved_info next_for_neutral;
1757   EMACS_INT next_char_pos = -2;
1758
1759   if (bidi_it->scan_dir == 1)
1760     {
1761       EMACS_INT eob =
1762         (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1763         ? bidi_it->string.schars : ZV;
1764
1765       /* There's no sense in trying to advance if we hit end of text.  */
1766       if (bidi_it->charpos >= eob)
1767         return bidi_it->resolved_level;
1768
1769       /* Record the info about the previous character.  */
1770       if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1771           && bidi_it->type != WEAK_BN)
1772         bidi_remember_char (&bidi_it->prev, bidi_it);
1773       if (bidi_it->type_after_w1 == STRONG_R
1774           || bidi_it->type_after_w1 == STRONG_L
1775           || bidi_it->type_after_w1 == STRONG_AL)
1776         bidi_remember_char (&bidi_it->last_strong, bidi_it);
1777       /* FIXME: it sounds like we don't need both prev and
1778          prev_for_neutral members, but I'm leaving them both for now.  */
1779       if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1780           || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1781         bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1782
1783       /* If we overstepped the characters used for resolving neutrals
1784          and whitespace, invalidate their info in the iterator.  */
1785       if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1786         bidi_it->next_for_neutral.type = UNKNOWN_BT;
1787       if (bidi_it->next_en_pos >= 0
1788           && bidi_it->charpos >= bidi_it->next_en_pos)
1789         bidi_it->next_en_pos = -1;
1790       if (bidi_it->next_for_ws.type != UNKNOWN_BT
1791           && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1792         bidi_it->next_for_ws.type = UNKNOWN_BT;
1793
1794       /* This must be taken before we fill the iterator with the info
1795          about the next char.  If we scan backwards, the iterator
1796          state must be already cached, so there's no need to know the
1797          embedding level of the previous character, since we will be
1798          returning to our caller shortly.  */
1799       prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1800     }
1801   next_for_neutral = bidi_it->next_for_neutral;
1802
1803   /* Perhaps the character we want is already cached.  If it is, the
1804      call to bidi_cache_find below will return a type other than
1805      UNKNOWN_BT.  */
1806   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
1807     {
1808       int bob =
1809         (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1810
1811       if (bidi_it->scan_dir > 0)
1812         {
1813           if (bidi_it->nchars <= 0)
1814             abort ();
1815           next_char_pos = bidi_it->charpos + bidi_it->nchars;
1816         }
1817       else if (bidi_it->charpos >= bob)
1818         /* Implementation note: we allow next_char_pos to be as low as
1819            0 for buffers or -1 for strings, and that is okay because
1820            that's the "position" of the sentinel iterator state we
1821            cached at the beginning of the iteration.  */
1822         next_char_pos = bidi_it->charpos - 1;
1823       if (next_char_pos >= bob - 1)
1824         type = bidi_cache_find (next_char_pos, -1, bidi_it);
1825       else
1826         type = UNKNOWN_BT;
1827     }
1828   else
1829     type = UNKNOWN_BT;
1830   if (type != UNKNOWN_BT)
1831     {
1832       /* Don't lose the information for resolving neutrals!  The
1833          cached states could have been cached before their
1834          next_for_neutral member was computed.  If we are on our way
1835          forward, we can simply take the info from the previous
1836          state.  */
1837       if (bidi_it->scan_dir == 1
1838           && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1839         bidi_it->next_for_neutral = next_for_neutral;
1840
1841       /* If resolved_level is -1, it means this state was cached
1842          before it was completely resolved, so we cannot return
1843          it.  */
1844       if (bidi_it->resolved_level != -1)
1845         return bidi_it->resolved_level;
1846     }
1847   if (bidi_it->scan_dir == -1)
1848     /* If we are going backwards, the iterator state is already cached
1849        from previous scans, and should be fully resolved.  */
1850     abort ();
1851
1852   if (type == UNKNOWN_BT)
1853     type = bidi_type_of_next_char (bidi_it);
1854
1855   if (type == NEUTRAL_B)
1856     return bidi_it->resolved_level;
1857
1858   level = bidi_it->level_stack[bidi_it->stack_idx].level;
1859   if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1860       || (type == WEAK_BN && prev_level == level))
1861     {
1862       if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1863         abort ();
1864
1865       /* If the cached state shows a neutral character, it was not
1866          resolved by bidi_resolve_neutral, so do it now.  */
1867       type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1868                                      bidi_it->next_for_neutral.type,
1869                                      level);
1870     }
1871
1872   if (!(type == STRONG_R
1873         || type == STRONG_L
1874         || type == WEAK_BN
1875         || type == WEAK_EN
1876         || type == WEAK_AN))
1877     abort ();
1878   bidi_it->type = type;
1879   bidi_check_type (bidi_it->type);
1880
1881   /* For L1 below, we need to know, for each WS character, whether
1882      it belongs to a sequence of WS characters preceding a newline
1883      or a TAB or a paragraph separator.  */
1884   if (bidi_it->orig_type == NEUTRAL_WS
1885       && bidi_it->next_for_ws.type == UNKNOWN_BT)
1886     {
1887       int ch;
1888       EMACS_INT clen = bidi_it->ch_len;
1889       EMACS_INT bpos = bidi_it->bytepos;
1890       EMACS_INT cpos = bidi_it->charpos;
1891       EMACS_INT disp_pos = bidi_it->disp_pos;
1892       EMACS_INT nc = bidi_it->nchars;
1893       struct bidi_string_data bs = bidi_it->string;
1894       bidi_type_t chtype;
1895       int fwp = bidi_it->frame_window_p;
1896
1897       if (bidi_it->nchars <= 0)
1898         abort ();
1899       do {
1900         ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
1901                               &clen, &nc);
1902         if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1903           chtype = NEUTRAL_B;
1904         else
1905           chtype = bidi_get_type (ch, NEUTRAL_DIR);
1906       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1907                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1908       bidi_it->next_for_ws.type = chtype;
1909       bidi_check_type (bidi_it->next_for_ws.type);
1910       bidi_it->next_for_ws.charpos = cpos;
1911       bidi_it->next_for_ws.bytepos = bpos;
1912     }
1913
1914   /* Resolve implicit levels, with a twist: PDFs get the embedding
1915      level of the enbedding they terminate.  See below for the
1916      reason.  */
1917   if (bidi_it->orig_type == PDF
1918       /* Don't do this if this formatting code didn't change the
1919          embedding level due to invalid or empty embeddings.  */
1920       && prev_level != level)
1921     {
1922       /* Don't look in UAX#9 for the reason for this: it's our own
1923          private quirk.  The reason is that we want the formatting
1924          codes to be delivered so that they bracket the text of their
1925          embedding.  For example, given the text
1926
1927              {RLO}teST{PDF}
1928
1929          we want it to be displayed as
1930
1931              {PDF}STet{RLO}
1932
1933          not as
1934
1935              STet{RLO}{PDF}
1936
1937          which will result because we bump up the embedding level as
1938          soon as we see the RLO and pop it as soon as we see the PDF,
1939          so RLO itself has the same embedding level as "teST", and
1940          thus would be normally delivered last, just before the PDF.
1941          The switch below fiddles with the level of PDF so that this
1942          ugly side effect does not happen.
1943
1944          (This is, of course, only important if the formatting codes
1945          are actually displayed, but Emacs does need to display them
1946          if the user wants to.)  */
1947       level = prev_level;
1948     }
1949   else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1950            || bidi_it->orig_type == NEUTRAL_S
1951            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1952            /* || bidi_it->ch == LINESEP_CHAR */
1953            || (bidi_it->orig_type == NEUTRAL_WS
1954                && (bidi_it->next_for_ws.type == NEUTRAL_B
1955                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
1956     level = bidi_it->level_stack[0].level;
1957   else if ((level & 1) == 0) /* I1 */
1958     {
1959       if (type == STRONG_R)
1960         level++;
1961       else if (type == WEAK_EN || type == WEAK_AN)
1962         level += 2;
1963     }
1964   else                  /* I2 */
1965     {
1966       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1967         level++;
1968     }
1969
1970   bidi_it->resolved_level = level;
1971   return level;
1972 }
1973
1974 /* Move to the other edge of a level given by LEVEL.  If END_FLAG is
1975    non-zero, we are at the end of a level, and we need to prepare to
1976    resume the scan of the lower level.
1977
1978    If this level's other edge is cached, we simply jump to it, filling
1979    the iterator structure with the iterator state on the other edge.
1980    Otherwise, we walk the buffer or string until we come back to the
1981    same level as LEVEL.
1982
1983    Note: we are not talking here about a ``level run'' in the UAX#9
1984    sense of the term, but rather about a ``level'' which includes
1985    all the levels higher than it.  In other words, given the levels
1986    like this:
1987
1988          11111112222222333333334443343222222111111112223322111
1989                 A      B                    C
1990
1991    and assuming we are at point A scanning left to right, this
1992    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1993    at point B.  */
1994 static void
1995 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1996 {
1997   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1998   int idx;
1999
2000   /* Try the cache first.  */
2001   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2002       >= bidi_cache_start)
2003     bidi_cache_fetch_state (idx, bidi_it);
2004   else
2005     {
2006       int new_level;
2007
2008       if (end_flag)
2009         abort (); /* if we are at end of level, its edges must be cached */
2010
2011       bidi_cache_iterator_state (bidi_it, 1);
2012       do {
2013         new_level = bidi_level_of_next_char (bidi_it);
2014         bidi_cache_iterator_state (bidi_it, 1);
2015       } while (new_level >= level);
2016     }
2017 }
2018
2019 void
2020 bidi_move_to_visually_next (struct bidi_it *bidi_it)
2021 {
2022   int old_level, new_level, next_level;
2023   struct bidi_it sentinel;
2024   struct gcpro gcpro1;
2025
2026   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2027     abort ();
2028
2029   if (bidi_it->scan_dir == 0)
2030     {
2031       bidi_it->scan_dir = 1;    /* default to logical order */
2032     }
2033
2034   /* The code below can call eval, and thus cause GC.  If we are
2035      iterating a Lisp string, make sure it won't be GCed.  */
2036   if (STRINGP (bidi_it->string.lstring))
2037     GCPRO1 (bidi_it->string.lstring);
2038
2039   /* If we just passed a newline, initialize for the next line.  */
2040   if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
2041     bidi_line_init (bidi_it);
2042
2043   /* Prepare the sentinel iterator state, and cache it.  When we bump
2044      into it, scanning backwards, we'll know that the last non-base
2045      level is exhausted.  */
2046   if (bidi_cache_idx == bidi_cache_start)
2047     {
2048       bidi_copy_it (&sentinel, bidi_it);
2049       if (bidi_it->first_elt)
2050         {
2051           sentinel.charpos--;   /* cached charpos needs to be monotonic */
2052           sentinel.bytepos--;
2053           sentinel.ch = '\n';   /* doesn't matter, but why not? */
2054           sentinel.ch_len = 1;
2055           sentinel.nchars = 1;
2056         }
2057       bidi_cache_iterator_state (&sentinel, 1);
2058     }
2059
2060   old_level = bidi_it->resolved_level;
2061   new_level = bidi_level_of_next_char (bidi_it);
2062
2063   /* Reordering of resolved levels (clause L2) is implemented by
2064      jumping to the other edge of the level and flipping direction of
2065      scanning the text whenever we find a level change.  */
2066   if (new_level != old_level)
2067     {
2068       int ascending = new_level > old_level;
2069       int level_to_search = ascending ? old_level + 1 : old_level;
2070       int incr = ascending ? 1 : -1;
2071       int expected_next_level = old_level + incr;
2072
2073       /* Jump (or walk) to the other edge of this level.  */
2074       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2075       /* Switch scan direction and peek at the next character in the
2076          new direction.  */
2077       bidi_it->scan_dir = -bidi_it->scan_dir;
2078
2079       /* The following loop handles the case where the resolved level
2080          jumps by more than one.  This is typical for numbers inside a
2081          run of text with left-to-right embedding direction, but can
2082          also happen in other situations.  In those cases the decision
2083          where to continue after a level change, and in what direction,
2084          is tricky.  For example, given a text like below:
2085
2086                   abcdefgh
2087                   11336622
2088
2089          (where the numbers below the text show the resolved levels),
2090          the result of reordering according to UAX#9 should be this:
2091
2092                   efdcghba
2093
2094          This is implemented by the loop below which flips direction
2095          and jumps to the other edge of the level each time it finds
2096          the new level not to be the expected one.  The expected level
2097          is always one more or one less than the previous one.  */
2098       next_level = bidi_peek_at_next_level (bidi_it);
2099       while (next_level != expected_next_level)
2100         {
2101           expected_next_level += incr;
2102           level_to_search += incr;
2103           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2104           bidi_it->scan_dir = -bidi_it->scan_dir;
2105           next_level = bidi_peek_at_next_level (bidi_it);
2106         }
2107
2108       /* Finally, deliver the next character in the new direction.  */
2109       next_level = bidi_level_of_next_char (bidi_it);
2110     }
2111
2112   /* Take note when we have just processed the newline that precedes
2113      the end of the paragraph.  The next time we are about to be
2114      called, set_iterator_to_next will automatically reinit the
2115      paragraph direction, if needed.  We do this at the newline before
2116      the paragraph separator, because the next character might not be
2117      the first character of the next paragraph, due to the bidi
2118      reordering, whereas we _must_ know the paragraph base direction
2119      _before_ we process the paragraph's text, since the base
2120      direction affects the reordering.  */
2121   if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
2122     {
2123       /* The paragraph direction of the entire string, once
2124          determined, is in effect for the entire string.  Setting the
2125          separator limit to the end of the string prevents
2126          bidi_paragraph_init from being called automatically on this
2127          string.  */
2128       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2129         bidi_it->separator_limit = bidi_it->string.schars;
2130       else if (bidi_it->bytepos < ZV_BYTE)
2131         {
2132           EMACS_INT sep_len =
2133             bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2134                                    bidi_it->bytepos + bidi_it->ch_len);
2135           if (bidi_it->nchars <= 0)
2136             abort ();
2137           if (sep_len >= 0)
2138             {
2139               bidi_it->new_paragraph = 1;
2140               /* Record the buffer position of the last character of the
2141                  paragraph separator.  */
2142               bidi_it->separator_limit =
2143                 bidi_it->charpos + bidi_it->nchars + sep_len;
2144             }
2145         }
2146     }
2147
2148   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
2149     {
2150       /* If we are at paragraph's base embedding level and beyond the
2151          last cached position, the cache's job is done and we can
2152          discard it.  */
2153       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2154           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2155                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
2156         bidi_cache_reset ();
2157         /* But as long as we are caching during forward scan, we must
2158            cache each state, or else the cache integrity will be
2159            compromised: it assumes cached states correspond to buffer
2160            positions 1:1.  */
2161       else
2162         bidi_cache_iterator_state (bidi_it, 1);
2163     }
2164
2165   if (STRINGP (bidi_it->string.lstring))
2166     UNGCPRO;
2167 }
2168
2169 /* This is meant to be called from within the debugger, whenever you
2170    wish to examine the cache contents.  */
2171 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
2172 void
2173 bidi_dump_cached_states (void)
2174 {
2175   int i;
2176   int ndigits = 1;
2177
2178   if (bidi_cache_idx == 0)
2179     {
2180       fprintf (stderr, "The cache is empty.\n");
2181       return;
2182     }
2183   fprintf (stderr, "Total of %d state%s in cache:\n",
2184            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2185
2186   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2187     ndigits++;
2188   fputs ("ch  ", stderr);
2189   for (i = 0; i < bidi_cache_idx; i++)
2190     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2191   fputs ("\n", stderr);
2192   fputs ("lvl ", stderr);
2193   for (i = 0; i < bidi_cache_idx; i++)
2194     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2195   fputs ("\n", stderr);
2196   fputs ("pos ", stderr);
2197   for (i = 0; i < bidi_cache_idx; i++)
2198     fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
2199   fputs ("\n", stderr);
2200 }