vc/vc-src.el (vc-src-do-comand): Prepend -- to file argument list
[emacs.git] / src / bidi.c
blobcc70d08f01e93f61b4de320cd7f66f89790b1be2
1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
3 Foundation, Inc.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
25 Unlike the Reference Implementation and most other implementations,
26 this one is designed to be called once for every character in the
27 buffer or string. That way, we can leave intact the design of the
28 Emacs display engine, whereby an iterator object is used to
29 traverse buffer or string text character by character, and generate
30 the necessary data for displaying each character in 'struct glyph'
31 objects. (See xdisp.c for the details of that iteration.) The
32 functions on this file replace the original linear iteration in the
33 logical order of the text with a non-linear iteration in the visual
34 order, i.e. in the order characters should be shown on display.
36 The main entry point is bidi_move_to_visually_next. Each time it
37 is called, it finds the next character in the visual order, and
38 returns its information in a special structure. The caller is then
39 expected to process this character for display or any other
40 purposes, and call bidi_move_to_visually_next for the next
41 character. See the comments in bidi_move_to_visually_next for more
42 details about its algorithm that finds the next visual-order
43 character by resolving their levels on the fly.
45 Two other entry points are bidi_paragraph_init and
46 bidi_mirror_char. The first determines the base direction of a
47 paragraph, while the second returns the mirrored version of its
48 argument character.
50 A few auxiliary entry points are used to initialize the bidi
51 iterator for iterating an object (buffer or string), push and pop
52 the bidi iterator state, and save and restore the state of the bidi
53 cache.
55 If you want to understand the code, you will have to read it
56 together with the relevant portions of UAX#9. The comments include
57 references to UAX#9 rules, for that very reason.
59 A note about references to UAX#9 rules: if the reference says
60 something like "X9/Retaining", it means that you need to refer to
61 rule X9 and to its modifications described in the "Implementation
62 Notes" section of UAX#9, under "Retaining Format Codes".
64 Here's the overview of the design of the reordering engine
65 implemented by this file.
67 Basic implementation structure
68 ------------------------------
70 The sequential processing steps described by UAX#9 are implemented
71 as recursive levels of processing, all of which examine the next
72 character in the logical order. This hierarchy of processing looks
73 as follows, from the innermost (deepest) to the outermost level,
74 omitting some subroutines used by each level:
76 bidi_fetch_char -- fetch next character
77 bidi_resolve_explicit -- resolve explicit levels and directions
78 bidi_resolve_weak -- resolve weak types
79 bidi_resolve_brackets -- resolve "paired brackets" neutral types
80 bidi_resolve_neutral -- resolve neutral types
81 bidi_level_of_next_char -- resolve implicit levels
83 Each level calls the level below it, and works on the result
84 returned by the lower level, including all of its sub-levels.
86 Unlike all the levels below it, bidi_level_of_next_char can return
87 the information about either the next or previous character in the
88 logical order, depending on the current direction of scanning the
89 buffer or string. For the next character, it calls all the levels
90 below it; for the previous character, it uses the cache, described
91 below.
93 Thus, the result of calling bidi_level_of_next_char is the resolved
94 level of the next or the previous character in the logical order.
95 Based on this information, the function bidi_move_to_visually_next
96 finds the next character in the visual order and updates the
97 direction in which the buffer is scanned, either forward or
98 backward, to find the next character to be displayed. (Text is
99 scanned backwards when it needs to be reversed for display, i.e. if
100 the visual order is the inverse of the logical order.) This
101 implements the last, reordering steps of the UBA, by successively
102 calling bidi_level_of_next_char until the character of the required
103 embedding level is found; the scan direction is dynamically updated
104 as a side effect. See the commentary before the 'while' loop in
105 bidi_move_to_visually_next, for the details.
107 Fetching characters
108 -------------------
110 In a nutshell, fetching the next character boils down to calling
111 STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
112 string position. See bidi_fetch_char. However, if the next
113 character is "covered" by a display property of some kind,
114 bidi_fetch_char returns the u+FFFC "object replacement character"
115 that represents the entire run of text covered by the display
116 property. (The ch_len and nchars members of 'struct bidi_it'
117 reflect the length in bytes and characters of that text.) This is
118 so we reorder text on both sides of the display property as
119 appropriate for an image or embedded string. Similarly, text
120 covered by a display spec of the form '(space ...)', is replaced
121 with the u+2029 paragraph separator character, so such display
122 specs produce the same effect as a TAB under UBA. Both these
123 special characters are not actually displayed -- the display
124 property is displayed instead -- but just used to compute the
125 embedding level of the surrounding text so as to produce the
126 required effect.
128 Bidi iterator states
129 --------------------
131 The UBA is highly context dependent in some of its parts,
132 i.e. results of processing a character can generally depend on
133 characters very far away. The UAX#9 description of the UBA
134 prescribes a stateful processing of each character, whereby the
135 results of this processing depend on various state variables, such
136 as the current embedding level, level stack, and directional
137 override status. In addition, the UAX#9 description includes many
138 passages like this (from rule W2 in this case):
140 Search backward from each instance of a European number until the
141 first strong type (R, L, AL, or sos) is found. If an AL is found,
142 change the type of the European number to Arabic number.
144 To support this, we use a bidi iterator object, 'struct bidi_it',
145 which is a sub-structure of 'struct it' used by xdisp.c (see
146 dispextern.h for the definition of both of these structures). The
147 bidi iterator holds the entire state of the iteration required by
148 the UBA, and is updated as the text is traversed. In particular,
149 the embedding level of the current character being resolved is
150 recorded in the iterator state. To avoid costly searches backward
151 in support of rules like W2 above, the necessary character types
152 are also recorded in the iterator state as they are found during
153 the forward scan, and then used when such rules need to be applied.
154 (Forward scans cannot be avoided in this way; they need to be
155 performed at least once, and the results recorded in the iterator
156 state, to be reused until the forward scan oversteps the recorded
157 position.)
159 In this manner, the iterator state acts as a mini-cache of
160 contextual information required for resolving the level of the
161 current character by various UBA rules.
163 Caching of bidi iterator states
164 -------------------------------
166 As described above, the reordering engine uses the information
167 recorded in the bidi iterator state in order to resolve the
168 embedding level of the current character. When the reordering
169 engine needs to process the next character in the logical order, it
170 fetches it and applies to it all the UBA levels, updating the
171 iterator state as it goes. But when the buffer or string is
172 scanned backwards, i.e. in the reverse order of buffer/string
173 positions, the scanned characters were already processed during the
174 preceding forward scan (see bidi_find_other_level_edge). To avoid
175 costly re-processing of characters that were already processed
176 during the forward scan, the iterator states computed while
177 scanning forward are cached.
179 The cache is just a linear array of 'struct bidi_it' objects, which
180 is dynamically allocated and reallocated as needed, since the size
181 of the cache depends on the text being processed. We only need the
182 cache while processing embedded levels higher than the base
183 paragraph embedding level, because these higher levels require
184 changes in scan direction. Therefore, as soon as we are back to
185 the base embedding level, we can free the cache; see the calls to
186 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
187 this.
189 The cache maintains the index of the next unused cache slot -- this
190 is where the next iterator state will be cached. The function
191 bidi_cache_iterator_state saves an instance of the state in the
192 cache and increments the unused slot index. The companion function
193 bidi_cache_find looks up a cached state that corresponds to a given
194 buffer/string position. All of the cached states must correspond
195 1:1 to the buffer or string region whose processing they reflect;
196 bidi.c will abort if it finds cache slots that violate this 1:1
197 correspondence.
199 When the parent iterator 'struct it' is pushed (see push_it in
200 xdisp.c) to pause the current iteration and start iterating over a
201 different object (e.g., a 'display' string that covers some buffer
202 text), the bidi iterator cache needs to be "pushed" as well, so
203 that a new empty cache could be used while iterating over the new
204 object. Later, when the new object is exhausted, and xdisp.c calls
205 pop_it, we need to "pop" the bidi cache as well and return to the
206 original cache. See bidi_push_it and bidi_pop_it for how this is
207 done.
209 Some functions of the display engine save copies of 'struct it' in
210 local variables, and restore them later. For examples, see
211 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
212 window_scroll_pixel_based in window.c. When this happens, we need
213 to save and restore the bidi cache as well, because conceptually
214 the cache is part of the 'struct it' state, and needs to be in
215 perfect sync with the portion of the buffer/string that is being
216 processed. This saving and restoring of the cache state is handled
217 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
218 SAVE_IT and RESTORE_IT defined on xdisp.c.
220 Note that, because reordering is implemented below the level in
221 xdisp.c that breaks glyphs into screen lines, we are violating
222 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
223 done before reordering each screen line separately. However,
224 following UAX#9 to the letter in this matter goes against the basic
225 design of the Emacs display engine, and so we choose here this
226 minor deviation from the UBA letter in preference to redesign of
227 the display engine. The effect of this is only seen in continued
228 lines that are broken into screen lines in the middle of a run
229 whose direction is opposite to the paragraph's base direction.
231 Important design and implementation note: when the code needs to
232 scan far ahead, be sure to avoid such scans as much as possible
233 when the buffer/string doesn't contain any RTL characters. Users
234 of left-to-right scripts will never forgive you if you introduce
235 some slow-down due to bidi in situations that don't involve any
236 bidirectional text. See the large comment near the beginning of
237 bidi_resolve_neutral, for one situation where such shortcut was
238 necessary. */
240 #include <config.h>
241 #include <stdio.h>
243 #include "lisp.h"
244 #include "character.h"
245 #include "buffer.h"
246 #include "dispextern.h"
247 #include "region-cache.h"
249 static bool bidi_initialized = 0;
251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
253 #define BIDI_EOB (-1)
255 /* Data type for describing the bidirectional character categories. */
256 typedef enum {
257 UNKNOWN_BC,
258 NEUTRAL,
259 WEAK,
260 STRONG,
261 EXPLICIT_FORMATTING
262 } bidi_category_t;
264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
265 static Lisp_Object Qparagraph_start, Qparagraph_separate;
268 /***********************************************************************
269 Utilities
270 ***********************************************************************/
272 /* Return the bidi type of a character CH, subject to the current
273 directional OVERRIDE. */
274 static bidi_type_t
275 bidi_get_type (int ch, bidi_dir_t override)
277 bidi_type_t default_type;
279 if (ch == BIDI_EOB)
280 return NEUTRAL_B;
281 if (ch < 0 || ch > MAX_CHAR)
282 emacs_abort ();
284 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
285 /* Every valid character code, even those that are unassigned by the
286 UCD, have some bidi-class property, according to
287 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
288 (= zero) code from CHAR_TABLE_REF, that's a bug. */
289 if (default_type == UNKNOWN_BT)
290 emacs_abort ();
292 switch (default_type)
294 case WEAK_BN:
295 case NEUTRAL_B:
296 case LRE:
297 case LRO:
298 case RLE:
299 case RLO:
300 case PDF:
301 case LRI:
302 case RLI:
303 case FSI:
304 case PDI:
305 return default_type;
306 default:
307 if (override == L2R)
308 return STRONG_L;
309 else if (override == R2L)
310 return STRONG_R;
311 else
312 return default_type;
316 static void
317 bidi_check_type (bidi_type_t type)
319 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
322 /* Given a bidi TYPE of a character, return its category. */
323 static bidi_category_t
324 bidi_get_category (bidi_type_t type)
326 switch (type)
328 case UNKNOWN_BT:
329 return UNKNOWN_BC;
330 case STRONG_L:
331 case STRONG_R:
332 case STRONG_AL:
333 return STRONG;
334 case WEAK_EN:
335 case WEAK_ES:
336 case WEAK_ET:
337 case WEAK_AN:
338 case WEAK_CS:
339 case WEAK_NSM:
340 case WEAK_BN:
341 return WEAK;
342 case NEUTRAL_B:
343 case NEUTRAL_S:
344 case NEUTRAL_WS:
345 case NEUTRAL_ON:
346 return NEUTRAL;
347 case LRE:
348 case LRO:
349 case RLE:
350 case RLO:
351 case PDF:
352 case LRI:
353 case RLI:
354 case FSI:
355 case PDI:
356 return EXPLICIT_FORMATTING;
357 default:
358 emacs_abort ();
362 static bool
363 bidi_isolate_fmt_char (bidi_type_t ch_type)
365 return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
368 /* Return the mirrored character of C, if it has one. If C has no
369 mirrored counterpart, return C.
370 Note: The conditions in UAX#9 clause L4 regarding the surrounding
371 context must be tested by the caller. */
373 bidi_mirror_char (int c)
375 Lisp_Object val;
377 if (c == BIDI_EOB)
378 return c;
379 if (c < 0 || c > MAX_CHAR)
380 emacs_abort ();
382 val = CHAR_TABLE_REF (bidi_mirror_table, c);
383 if (INTEGERP (val))
385 int v;
387 /* When debugging, check before assigning to V, so that the check
388 isn't broken by undefined behavior due to int overflow. */
389 eassert (CHAR_VALID_P (XINT (val)));
391 v = XINT (val);
393 /* Minimal test we must do in optimized builds, to prevent weird
394 crashes further down the road. */
395 if (v < 0 || v > MAX_CHAR)
396 emacs_abort ();
398 return v;
401 return c;
404 /* Return the Bidi_Paired_Bracket_Type property of the character C. */
405 static bidi_bracket_type_t
406 bidi_paired_bracket_type (int c)
408 if (c == BIDI_EOB)
409 return BIDI_BRACKET_NONE;
410 if (c < 0 || c > MAX_CHAR)
411 emacs_abort ();
413 return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
416 /* Determine the start-of-sequence (sos) directional type given the two
417 embedding levels on either side of the run boundary. Also, update
418 the saved info about previously seen characters, since that info is
419 generally valid for a single level run. */
420 static void
421 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
423 int higher_level = (level_before > level_after ? level_before : level_after);
425 /* FIXME: should the default sos direction be user selectable? */
426 bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
428 bidi_it->prev.type = UNKNOWN_BT;
429 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
430 bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
431 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
432 bidi_it->next_for_neutral.type
433 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
436 #define ISOLATE_STATUS(BIDI_IT, IDX) ((BIDI_IT)->level_stack[IDX].flags & 1)
437 #define OVERRIDE(BIDI_IT, IDX) (((BIDI_IT)->level_stack[IDX].flags >> 1) & 3)
439 /* Push the current embedding level and override status; reset the
440 current level to LEVEL and the current override status to OVERRIDE. */
441 static void
442 bidi_push_embedding_level (struct bidi_it *bidi_it,
443 int level, bidi_dir_t override, bool isolate_status)
445 struct bidi_stack *st;
446 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
448 bidi_it->stack_idx++;
449 eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
450 st = &bidi_it->level_stack[bidi_it->stack_idx];
451 eassert (level <= (1 << 7));
452 st->level = level;
453 st->flags = (((override & 3) << 1) | (isolate_status != 0));
454 if (isolate_status)
456 st->last_strong_type = bidi_it->last_strong.type;
457 st->prev_for_neutral_type = bidi_it->prev_for_neutral.type;
458 st->next_for_neutral_type = bidi_it->next_for_neutral.type;
459 st->next_for_neutral_pos = bidi_it->next_for_neutral.charpos;
460 st->flags |= ((bidi_it->sos == L2R ? 0 : 1) << 3);
462 /* We've got a new isolating sequence, compute the directional type
463 of sos and initialize per-sequence variables (UAX#9, clause X10). */
464 bidi_set_sos_type (bidi_it, prev_level, level);
467 /* Pop from the stack the embedding level, the directional override
468 status, and optionally saved information for the isolating run
469 sequence. Return the new level. */
470 static int
471 bidi_pop_embedding_level (struct bidi_it *bidi_it)
473 int level;
475 /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
476 and PDIs (X6a, 2nd bullet). */
477 if (bidi_it->stack_idx > 0)
479 bool isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
480 int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
482 struct bidi_stack st;
484 st = bidi_it->level_stack[bidi_it->stack_idx];
485 if (isolate_status)
487 bidi_dir_t sos = ((st.flags >> 3) & 1);
488 /* PREV is used in W1 for resolving WEAK_NSM. By the time
489 we get to an NSM, we must have gotten past at least one
490 character: the PDI that ends the isolate from which we
491 are popping here. So PREV will have been filled up by
492 the time we first use it. We initialize it here to
493 UNKNOWN_BT to be able to catch any blunders in this
494 logic. */
495 bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
496 bidi_it->last_strong.type = st.last_strong_type;
497 bidi_it->prev_for_neutral.type = st.prev_for_neutral_type;
498 bidi_it->next_for_neutral.type = st.next_for_neutral_type;
499 bidi_it->next_for_neutral.charpos = st.next_for_neutral_pos;
500 bidi_it->sos = (sos == 0 ? L2R : R2L);
502 else
503 bidi_set_sos_type (bidi_it, old_level,
504 bidi_it->level_stack[bidi_it->stack_idx - 1].level);
506 bidi_it->stack_idx--;
508 level = bidi_it->level_stack[bidi_it->stack_idx].level;
509 eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
510 return level;
513 /* Record in SAVED_INFO the information about the current character. */
514 static void
515 bidi_remember_char (struct bidi_saved_info *saved_info,
516 struct bidi_it *bidi_it, bool from_type)
518 saved_info->charpos = bidi_it->charpos;
519 if (from_type)
520 saved_info->type = bidi_it->type;
521 else
522 saved_info->type = bidi_it->type_after_wn;
523 bidi_check_type (saved_info->type);
524 saved_info->orig_type = bidi_it->orig_type;
525 bidi_check_type (saved_info->orig_type);
528 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
529 copies the part of the level stack that is actually in use. */
530 static void
531 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
533 /* Copy everything from the start through the active part of
534 the level stack. */
535 memcpy (to, from,
536 (offsetof (struct bidi_it, level_stack[1])
537 + from->stack_idx * sizeof from->level_stack[0]));
541 /***********************************************************************
542 Caching the bidi iterator states
543 ***********************************************************************/
545 /* We allocate and de-allocate the cache in chunks of this size (in
546 characters). 200 was chosen as an upper limit for reasonably-long
547 lines in a text file/buffer. */
548 #define BIDI_CACHE_CHUNK 200
549 static struct bidi_it *bidi_cache;
550 static ptrdiff_t bidi_cache_size = 0;
551 enum { elsz = sizeof (struct bidi_it) };
552 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
553 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
554 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
555 "stack" level */
557 /* 5-slot stack for saving the start of the previous level of the
558 cache. xdisp.c maintains a 5-slot stack for its iterator state,
559 and we need the same size of our stack. */
560 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
561 static int bidi_cache_sp;
563 /* Size of header used by bidi_shelve_cache. */
564 enum
566 bidi_shelve_header_size
567 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
568 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
569 + sizeof (bidi_cache_last_idx))
572 /* Effectively remove the cached states beyond the Nth state from the
573 part of the cache relevant to iteration of the current object
574 (buffer or string). */
575 static void
576 bidi_cache_reset_to (int n)
578 bidi_cache_idx = bidi_cache_start + n;
579 bidi_cache_last_idx = -1;
582 /* Reset the cache state to the empty state. We only reset the part
583 of the cache relevant to iteration of the current object. Previous
584 objects, which are pushed on the display iterator's stack, are left
585 intact. This is called when the cached information is no more
586 useful for the current iteration, e.g. when we were reseated to a
587 new position on the same object. */
588 static void
589 bidi_cache_reset (void)
591 bidi_cache_reset_to (0);
594 /* Shrink the cache to its minimal size. Called when we init the bidi
595 iterator for reordering a buffer or a string that does not come
596 from display properties, because that means all the previously
597 cached info is of no further use. */
598 static void
599 bidi_cache_shrink (void)
601 if (bidi_cache_size > BIDI_CACHE_CHUNK)
603 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
604 bidi_cache_size = BIDI_CACHE_CHUNK;
606 bidi_cache_reset ();
609 static void
610 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
612 int current_scan_dir = bidi_it->scan_dir;
614 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
615 emacs_abort ();
617 bidi_copy_it (bidi_it, &bidi_cache[idx]);
618 bidi_it->scan_dir = current_scan_dir;
619 bidi_cache_last_idx = idx;
622 /* Find a cached state with a given CHARPOS and resolved embedding
623 level less or equal to LEVEL. If LEVEL is -1, disregard the
624 resolved levels in cached states. DIR, if non-zero, means search
625 in that direction from the last cache hit. */
626 static ptrdiff_t
627 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
629 ptrdiff_t i, i_start;
631 if (bidi_cache_idx > bidi_cache_start)
633 if (bidi_cache_last_idx == -1)
634 bidi_cache_last_idx = bidi_cache_idx - 1;
635 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
637 dir = -1;
638 i_start = bidi_cache_last_idx - 1;
640 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
641 + bidi_cache[bidi_cache_last_idx].nchars - 1))
643 dir = 1;
644 i_start = bidi_cache_last_idx + 1;
646 else if (dir)
647 i_start = bidi_cache_last_idx;
648 else
650 dir = -1;
651 i_start = bidi_cache_idx - 1;
654 if (dir < 0)
656 /* Linear search for now; FIXME! */
657 for (i = i_start; i >= bidi_cache_start; i--)
658 if (bidi_cache[i].charpos <= charpos
659 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
660 && (level == -1 || bidi_cache[i].resolved_level <= level))
661 return i;
663 else
665 for (i = i_start; i < bidi_cache_idx; i++)
666 if (bidi_cache[i].charpos <= charpos
667 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
668 && (level == -1 || bidi_cache[i].resolved_level <= level))
669 return i;
673 return -1;
676 /* Find a cached state where the resolved level changes to a value
677 that is lower than LEVEL, and return its cache slot index. DIR is
678 the direction to search, starting with the last used cache slot.
679 If DIR is zero, we search backwards from the last occupied cache
680 slot. BEFORE means return the index of the slot that
681 is ``before'' the level change in the search direction. That is,
682 given the cached levels like this:
684 1122333442211
685 AB C
687 and assuming we are at the position cached at the slot marked with
688 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
689 index of slot B or A, depending whether BEFORE is, respectively,
690 true or false. */
691 static ptrdiff_t
692 bidi_cache_find_level_change (int level, int dir, bool before)
694 if (bidi_cache_idx)
696 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
697 int incr = before ? 1 : 0;
699 eassert (!dir || bidi_cache_last_idx >= 0);
701 if (!dir)
702 dir = -1;
703 else if (!incr)
704 i += dir;
706 if (dir < 0)
708 while (i >= bidi_cache_start + incr)
710 if (bidi_cache[i - incr].resolved_level >= 0
711 && bidi_cache[i - incr].resolved_level < level)
712 return i;
713 i--;
716 else
718 while (i < bidi_cache_idx - incr)
720 if (bidi_cache[i + incr].resolved_level >= 0
721 && bidi_cache[i + incr].resolved_level < level)
722 return i;
723 i++;
728 return -1;
731 static void
732 bidi_cache_ensure_space (ptrdiff_t idx)
734 /* Enlarge the cache as needed. */
735 if (idx >= bidi_cache_size)
737 /* The bidi cache cannot be larger than the largest Lisp string
738 or buffer. */
739 ptrdiff_t string_or_buffer_bound
740 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
742 /* Also, it cannot be larger than what C can represent. */
743 ptrdiff_t c_bound
744 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
746 bidi_cache
747 = xpalloc (bidi_cache, &bidi_cache_size,
748 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
749 min (string_or_buffer_bound, c_bound), elsz);
753 static void
754 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
755 bool update_only)
757 ptrdiff_t idx;
759 /* We should never cache on backward scans. */
760 if (bidi_it->scan_dir == -1)
761 emacs_abort ();
762 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
764 if (idx < 0 && update_only)
765 return;
767 if (idx < 0)
769 idx = bidi_cache_idx;
770 bidi_cache_ensure_space (idx);
771 /* Character positions should correspond to cache positions 1:1.
772 If we are outside the range of cached positions, the cache is
773 useless and must be reset. */
774 if (idx > bidi_cache_start &&
775 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
776 + bidi_cache[idx - 1].nchars)
777 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
779 bidi_cache_reset ();
780 idx = bidi_cache_start;
782 if (bidi_it->nchars <= 0)
783 emacs_abort ();
784 bidi_copy_it (&bidi_cache[idx], bidi_it);
785 if (!resolved)
786 bidi_cache[idx].resolved_level = -1;
788 else
790 /* Copy only the members which could have changed, to avoid
791 costly copying of the entire struct. */
792 bidi_cache[idx].type = bidi_it->type;
793 bidi_check_type (bidi_it->type);
794 bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
795 bidi_check_type (bidi_it->type_after_wn);
796 if (resolved)
797 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
798 else
799 bidi_cache[idx].resolved_level = -1;
800 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
801 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
802 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
803 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
804 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
805 bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
806 bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
809 bidi_cache_last_idx = idx;
810 if (idx >= bidi_cache_idx)
811 bidi_cache_idx = idx + 1;
814 /* Look for a cached iterator state that corresponds to CHARPOS. If
815 found, copy the cached state into BIDI_IT and return the type of
816 the cached entry. If not found, return UNKNOWN_BT. RESOLVED_ONLY
817 zero means it is OK to return cached states that were not fully
818 resolved yet. This can happen if the state was cached before it
819 was resolved in bidi_resolve_neutral. */
820 static bidi_type_t
821 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
823 ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
825 if (i >= bidi_cache_start
826 && (!resolved_only
827 /* Callers that want only fully resolved states (and set
828 resolved_only = true) need to be sure that there's enough
829 info in the cached state to return the state as final,
830 and if not, they don't want the cached state. */
831 || bidi_cache[i].resolved_level >= 0))
833 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
835 bidi_copy_it (bidi_it, &bidi_cache[i]);
836 bidi_cache_last_idx = i;
837 /* Don't let scan direction from the cached state override
838 the current scan direction. */
839 bidi_it->scan_dir = current_scan_dir;
840 return bidi_it->type;
843 return UNKNOWN_BT;
846 static int
847 bidi_peek_at_next_level (struct bidi_it *bidi_it)
849 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
850 emacs_abort ();
851 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
855 /***********************************************************************
856 Pushing and popping the bidi iterator state
857 ***********************************************************************/
859 /* Push the bidi iterator state in preparation for reordering a
860 different object, e.g. display string found at certain buffer
861 position. Pushing the bidi iterator boils down to saving its
862 entire state on the cache and starting a new cache "stacked" on top
863 of the current cache. */
864 void
865 bidi_push_it (struct bidi_it *bidi_it)
867 /* Save the current iterator state in its entirety after the last
868 used cache slot. */
869 bidi_cache_ensure_space (bidi_cache_idx);
870 bidi_cache[bidi_cache_idx++] = *bidi_it;
872 /* Push the current cache start onto the stack. */
873 eassert (bidi_cache_sp < IT_STACK_SIZE);
874 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
876 /* Start a new level of cache, and make it empty. */
877 bidi_cache_start = bidi_cache_idx;
878 bidi_cache_last_idx = -1;
881 /* Restore the iterator state saved by bidi_push_it and return the
882 cache to the corresponding state. */
883 void
884 bidi_pop_it (struct bidi_it *bidi_it)
886 if (bidi_cache_start <= 0)
887 emacs_abort ();
889 /* Reset the next free cache slot index to what it was before the
890 call to bidi_push_it. */
891 bidi_cache_idx = bidi_cache_start - 1;
893 /* Restore the bidi iterator state saved in the cache. */
894 *bidi_it = bidi_cache[bidi_cache_idx];
896 /* Pop the previous cache start from the stack. */
897 if (bidi_cache_sp <= 0)
898 emacs_abort ();
899 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
901 /* Invalidate the last-used cache slot data. */
902 bidi_cache_last_idx = -1;
905 static ptrdiff_t bidi_cache_total_alloc;
907 /* Stash away a copy of the cache and its control variables. */
908 void *
909 bidi_shelve_cache (void)
911 unsigned char *databuf;
912 ptrdiff_t alloc;
914 /* Empty cache. */
915 if (bidi_cache_idx == 0)
916 return NULL;
918 alloc = (bidi_shelve_header_size
919 + bidi_cache_idx * sizeof (struct bidi_it));
920 databuf = xmalloc (alloc);
921 bidi_cache_total_alloc += alloc;
923 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
924 memcpy (databuf + sizeof (bidi_cache_idx),
925 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
926 memcpy (databuf + sizeof (bidi_cache_idx)
927 + bidi_cache_idx * sizeof (struct bidi_it),
928 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
929 memcpy (databuf + sizeof (bidi_cache_idx)
930 + bidi_cache_idx * sizeof (struct bidi_it)
931 + sizeof (bidi_cache_start_stack),
932 &bidi_cache_sp, sizeof (bidi_cache_sp));
933 memcpy (databuf + sizeof (bidi_cache_idx)
934 + bidi_cache_idx * sizeof (struct bidi_it)
935 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
936 &bidi_cache_start, sizeof (bidi_cache_start));
937 memcpy (databuf + sizeof (bidi_cache_idx)
938 + bidi_cache_idx * sizeof (struct bidi_it)
939 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
940 + sizeof (bidi_cache_start),
941 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
943 return databuf;
946 /* Restore the cache state from a copy stashed away by
947 bidi_shelve_cache, and free the buffer used to stash that copy.
948 JUST_FREE means free the buffer, but don't restore the
949 cache; used when the corresponding iterator is discarded instead of
950 being restored. */
951 void
952 bidi_unshelve_cache (void *databuf, bool just_free)
954 unsigned char *p = databuf;
956 if (!p)
958 if (!just_free)
960 /* A NULL pointer means an empty cache. */
961 bidi_cache_start = 0;
962 bidi_cache_sp = 0;
963 bidi_cache_reset ();
966 else
968 if (just_free)
970 ptrdiff_t idx;
972 memcpy (&idx, p, sizeof (bidi_cache_idx));
973 bidi_cache_total_alloc
974 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
976 else
978 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
979 bidi_cache_ensure_space (bidi_cache_idx);
980 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
981 bidi_cache_idx * sizeof (struct bidi_it));
982 memcpy (bidi_cache_start_stack,
983 p + sizeof (bidi_cache_idx)
984 + bidi_cache_idx * sizeof (struct bidi_it),
985 sizeof (bidi_cache_start_stack));
986 memcpy (&bidi_cache_sp,
987 p + sizeof (bidi_cache_idx)
988 + bidi_cache_idx * sizeof (struct bidi_it)
989 + sizeof (bidi_cache_start_stack),
990 sizeof (bidi_cache_sp));
991 memcpy (&bidi_cache_start,
992 p + sizeof (bidi_cache_idx)
993 + bidi_cache_idx * sizeof (struct bidi_it)
994 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
995 sizeof (bidi_cache_start));
996 memcpy (&bidi_cache_last_idx,
997 p + sizeof (bidi_cache_idx)
998 + bidi_cache_idx * sizeof (struct bidi_it)
999 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1000 + sizeof (bidi_cache_start),
1001 sizeof (bidi_cache_last_idx));
1002 bidi_cache_total_alloc
1003 -= (bidi_shelve_header_size
1004 + bidi_cache_idx * sizeof (struct bidi_it));
1007 xfree (p);
1012 /***********************************************************************
1013 Initialization
1014 ***********************************************************************/
1015 static void
1016 bidi_initialize (void)
1018 bidi_type_table = uniprop_table (intern ("bidi-class"));
1019 if (NILP (bidi_type_table))
1020 emacs_abort ();
1021 staticpro (&bidi_type_table);
1023 bidi_mirror_table = uniprop_table (intern ("mirroring"));
1024 if (NILP (bidi_mirror_table))
1025 emacs_abort ();
1026 staticpro (&bidi_mirror_table);
1028 bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1029 if (NILP (bidi_brackets_table))
1030 emacs_abort ();
1031 staticpro (&bidi_brackets_table);
1033 Qparagraph_start = intern ("paragraph-start");
1034 staticpro (&Qparagraph_start);
1035 paragraph_start_re = Fsymbol_value (Qparagraph_start);
1036 if (!STRINGP (paragraph_start_re))
1037 paragraph_start_re = build_string ("\f\\|[ \t]*$");
1038 staticpro (&paragraph_start_re);
1039 Qparagraph_separate = intern ("paragraph-separate");
1040 staticpro (&Qparagraph_separate);
1041 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
1042 if (!STRINGP (paragraph_separate_re))
1043 paragraph_separate_re = build_string ("[ \t\f]*$");
1044 staticpro (&paragraph_separate_re);
1046 bidi_cache_sp = 0;
1047 bidi_cache_total_alloc = 0;
1049 bidi_initialized = 1;
1052 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1053 end. */
1054 static void
1055 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1057 bidi_it->invalid_levels = 0;
1058 bidi_it->invalid_isolates = 0;
1059 bidi_it->stack_idx = 0;
1060 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1063 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
1064 void
1065 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1066 struct bidi_it *bidi_it)
1068 if (! bidi_initialized)
1069 bidi_initialize ();
1070 if (charpos >= 0)
1071 bidi_it->charpos = charpos;
1072 if (bytepos >= 0)
1073 bidi_it->bytepos = bytepos;
1074 bidi_it->frame_window_p = frame_window_p;
1075 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1076 bidi_it->first_elt = 1;
1077 bidi_set_paragraph_end (bidi_it);
1078 bidi_it->new_paragraph = 1;
1079 bidi_it->separator_limit = -1;
1080 bidi_it->type = NEUTRAL_B;
1081 bidi_it->type_after_wn = NEUTRAL_B;
1082 bidi_it->orig_type = NEUTRAL_B;
1083 /* FIXME: Review this!!! */
1084 bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1085 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1086 bidi_it->next_for_neutral.charpos = -1;
1087 bidi_it->next_for_neutral.type
1088 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1089 bidi_it->prev_for_neutral.charpos = -1;
1090 bidi_it->prev_for_neutral.type
1091 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1092 bidi_it->bracket_pairing_pos = -1;
1093 bidi_it->sos = L2R; /* FIXME: should it be user-selectable? */
1094 bidi_it->disp_pos = -1; /* invalid/unknown */
1095 bidi_it->disp_prop = 0;
1096 /* We can only shrink the cache if we are at the bottom level of its
1097 "stack". */
1098 if (bidi_cache_start == 0)
1099 bidi_cache_shrink ();
1100 else
1101 bidi_cache_reset ();
1104 /* Perform initializations for reordering a new line of bidi text. */
1105 static void
1106 bidi_line_init (struct bidi_it *bidi_it)
1108 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1109 bidi_it->stack_idx = 0;
1110 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1111 bidi_it->level_stack[0].flags = 0; /* NEUTRAL_DIR, false per X1 */
1112 bidi_it->invalid_levels = 0;
1113 bidi_it->isolate_level = 0; /* X1 */
1114 bidi_it->invalid_isolates = 0; /* X1 */
1115 /* Setting this to zero will force its recomputation the first time
1116 we need it for W5. */
1117 bidi_it->next_en_pos = 0;
1118 bidi_it->next_en_type = UNKNOWN_BT;
1119 bidi_it->next_for_ws.charpos = -1;
1120 bidi_it->next_for_ws.type = UNKNOWN_BT;
1121 bidi_it->bracket_pairing_pos = -1;
1122 bidi_set_sos_type (bidi_it,
1123 (bidi_it->paragraph_dir == R2L ? 1 : 0),
1124 bidi_it->level_stack[0].level); /* X10 */
1126 bidi_cache_reset ();
1130 /***********************************************************************
1131 Fetching characters
1132 ***********************************************************************/
1134 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1135 are zero-based character positions in S, BEGBYTE is byte position
1136 corresponding to BEG. UNIBYTE means S is a unibyte string. */
1137 static ptrdiff_t
1138 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1139 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1141 ptrdiff_t pos = beg;
1142 const unsigned char *p = s + begbyte, *start = p;
1144 if (unibyte)
1145 p = s + end;
1146 else
1148 if (!CHAR_HEAD_P (*p))
1149 emacs_abort ();
1151 while (pos < end)
1153 p += BYTES_BY_CHAR_HEAD (*p);
1154 pos++;
1158 return p - start;
1161 /* Fetch and return the character at byte position BYTEPOS. If S is
1162 non-NULL, fetch the character from string S; otherwise fetch the
1163 character from the current buffer. UNIBYTE means S is a
1164 unibyte string. */
1165 static int
1166 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1168 if (s)
1170 s += bytepos;
1171 if (unibyte)
1172 return *s;
1174 else
1175 s = BYTE_POS_ADDR (bytepos);
1176 return STRING_CHAR (s);
1179 /* Fetch and return the character at CHARPOS/BYTEPOS. If that
1180 character is covered by a display string, treat the entire run of
1181 covered characters as a single character, either u+2029 or u+FFFC,
1182 and return their combined length in CH_LEN and NCHARS. DISP_POS
1183 specifies the character position of the next display string, or -1
1184 if not yet computed. When the next character is at or beyond that
1185 position, the function updates DISP_POS with the position of the
1186 next display string. *DISP_PROP non-zero means that there's really
1187 a display string at DISP_POS, as opposed to when we searched till
1188 DISP_POS without finding one. If *DISP_PROP is 2, it means the
1189 display spec is of the form `(space ...)', which is replaced with
1190 u+2029 to handle it as a paragraph separator. STRING->s is the C
1191 string to iterate, or NULL if iterating over a buffer or a Lisp
1192 string; in the latter case, STRING->lstring is the Lisp string. */
1193 static int
1194 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1195 int *disp_prop, struct bidi_string_data *string,
1196 struct window *w,
1197 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1199 int ch;
1200 ptrdiff_t endpos
1201 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1202 struct text_pos pos;
1203 int len;
1205 /* If we got past the last known position of display string, compute
1206 the position of the next one. That position could be at CHARPOS. */
1207 if (charpos < endpos && charpos > *disp_pos)
1209 SET_TEXT_POS (pos, charpos, bytepos);
1210 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1211 disp_prop);
1214 /* Fetch the character at BYTEPOS. */
1215 if (charpos >= endpos)
1217 ch = BIDI_EOB;
1218 *ch_len = 1;
1219 *nchars = 1;
1220 *disp_pos = endpos;
1221 *disp_prop = 0;
1223 else if (charpos >= *disp_pos && *disp_prop)
1225 ptrdiff_t disp_end_pos;
1227 /* We don't expect to find ourselves in the middle of a display
1228 property. Hopefully, it will never be needed. */
1229 if (charpos > *disp_pos)
1230 emacs_abort ();
1231 /* Text covered by `display' properties and overlays with
1232 display properties or display strings is handled as a single
1233 character that represents the entire run of characters
1234 covered by the display property. */
1235 if (*disp_prop == 2)
1237 /* `(space ...)' display specs are handled as paragraph
1238 separators for the purposes of the reordering; see UAX#9
1239 section 3 and clause HL1 in section 4.3 there. */
1240 ch = 0x2029;
1242 else
1244 /* All other display specs are handled as the Unicode Object
1245 Replacement Character. */
1246 ch = 0xFFFC;
1248 disp_end_pos = compute_display_string_end (*disp_pos, string);
1249 if (disp_end_pos < 0)
1251 /* Somebody removed the display string from the buffer
1252 behind our back. Recover by processing this buffer
1253 position as if no display property were present there to
1254 begin with. */
1255 *disp_prop = 0;
1256 goto normal_char;
1258 *nchars = disp_end_pos - *disp_pos;
1259 if (*nchars <= 0)
1260 emacs_abort ();
1261 if (string->s)
1262 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1263 disp_end_pos, string->unibyte);
1264 else if (STRINGP (string->lstring))
1265 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1266 bytepos, disp_end_pos, string->unibyte);
1267 else
1268 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1270 else
1272 normal_char:
1273 if (string->s)
1276 if (!string->unibyte)
1278 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1279 *ch_len = len;
1281 else
1283 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1284 *ch_len = 1;
1287 else if (STRINGP (string->lstring))
1289 if (!string->unibyte)
1291 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1292 len);
1293 *ch_len = len;
1295 else
1297 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1298 *ch_len = 1;
1301 else
1303 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1304 *ch_len = len;
1306 *nchars = 1;
1309 /* If we just entered a run of characters covered by a display
1310 string, compute the position of the next display string. */
1311 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1312 && *disp_prop)
1314 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1315 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1316 disp_prop);
1319 return ch;
1322 /* Like bidi_fetch_char, but ignore any text between an isolate
1323 initiator and its matching PDI or, if it has no matching PDI, the
1324 end of the paragraph. If isolates were skipped, CH_LEN and NCHARS
1325 are set to the number of bytes and characters between BYTEPOS/CHARPOS
1326 and the character that was fetched after skipping the isolates. */
1327 static int
1328 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1329 ptrdiff_t *disp_pos, int *disp_prop,
1330 struct bidi_string_data *string,
1331 struct window *w, bool frame_window_p,
1332 ptrdiff_t *ch_len, ptrdiff_t *nchars)
1334 ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1335 int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1336 frame_window_p, ch_len, nchars);
1337 bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1338 ptrdiff_t level = 0;
1340 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1342 level++;
1343 while (level > 0 && ch_type != NEUTRAL_B)
1345 charpos += *nchars;
1346 bytepos += *ch_len;
1347 ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1348 w, frame_window_p, ch_len, nchars);
1349 ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1350 /* A Note to P2 says to ignore max_depth limit. */
1351 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1352 level++;
1353 else if (ch_type == PDI)
1354 level--;
1358 /* Communicate to the caller how much did we skip, so it could get
1359 past the last character position we examined. */
1360 *nchars += charpos - orig_charpos;
1361 *ch_len += bytepos - orig_bytepos;
1362 return ch;
1367 /***********************************************************************
1368 Determining paragraph direction
1369 ***********************************************************************/
1371 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1372 Value is the non-negative length of the paragraph separator
1373 following the buffer position, -1 if position is at the beginning
1374 of a new paragraph, or -2 if position is neither at beginning nor
1375 at end of a paragraph. */
1376 static ptrdiff_t
1377 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1379 Lisp_Object sep_re;
1380 Lisp_Object start_re;
1381 ptrdiff_t val;
1383 sep_re = paragraph_separate_re;
1384 start_re = paragraph_start_re;
1386 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1387 if (val < 0)
1389 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1390 val = -1;
1391 else
1392 val = -2;
1395 return val;
1398 /* If the user has requested the long scans caching, make sure that
1399 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1401 static struct region_cache *
1402 bidi_paragraph_cache_on_off (void)
1404 struct buffer *cache_buffer = current_buffer;
1405 bool indirect_p = false;
1407 /* For indirect buffers, make sure to use the cache of their base
1408 buffer. */
1409 if (cache_buffer->base_buffer)
1411 cache_buffer = cache_buffer->base_buffer;
1412 indirect_p = true;
1415 /* Don't turn on or off the cache in the base buffer, if the value
1416 of cache-long-scans of the base buffer is inconsistent with that.
1417 This is because doing so will just make the cache pure overhead,
1418 since if we turn it on via indirect buffer, it will be
1419 immediately turned off by its base buffer. */
1420 if (NILP (BVAR (current_buffer, cache_long_scans)))
1422 if (!indirect_p
1423 || NILP (BVAR (cache_buffer, cache_long_scans)))
1425 if (cache_buffer->bidi_paragraph_cache)
1427 free_region_cache (cache_buffer->bidi_paragraph_cache);
1428 cache_buffer->bidi_paragraph_cache = 0;
1431 return NULL;
1433 else
1435 if (!indirect_p
1436 || !NILP (BVAR (cache_buffer, cache_long_scans)))
1438 if (!cache_buffer->bidi_paragraph_cache)
1439 cache_buffer->bidi_paragraph_cache = new_region_cache ();
1441 return cache_buffer->bidi_paragraph_cache;
1445 /* On my 2005-vintage machine, searching back for paragraph start
1446 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1447 when user types C-p. The number below limits each call to
1448 bidi_paragraph_init to about 10 ms. */
1449 #define MAX_PARAGRAPH_SEARCH 7500
1451 /* Find the beginning of this paragraph by looking back in the buffer.
1452 Value is the byte position of the paragraph's beginning, or
1453 BEGV_BYTE if paragraph_start_re is still not found after looking
1454 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
1455 static ptrdiff_t
1456 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1458 Lisp_Object re = paragraph_start_re;
1459 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1460 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1461 ptrdiff_t n = 0, oldpos = pos, next;
1462 struct buffer *cache_buffer = current_buffer;
1464 if (cache_buffer->base_buffer)
1465 cache_buffer = cache_buffer->base_buffer;
1467 while (pos_byte > BEGV_BYTE
1468 && n++ < MAX_PARAGRAPH_SEARCH
1469 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1471 /* FIXME: What if the paragraph beginning is covered by a
1472 display string? And what if a display string covering some
1473 of the text over which we scan back includes
1474 paragraph_start_re? */
1475 DEC_BOTH (pos, pos_byte);
1476 if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1478 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1479 break;
1481 else
1482 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1484 if (n >= MAX_PARAGRAPH_SEARCH)
1485 pos = BEGV, pos_byte = BEGV_BYTE;
1486 if (bpc)
1487 know_region_cache (cache_buffer, bpc, pos, oldpos);
1488 /* Positions returned by the region cache are not limited to
1489 BEGV..ZV range, so we limit them here. */
1490 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1491 return pos_byte;
1494 /* On a 3.4 GHz machine, searching forward for a strong directional
1495 character in a long paragraph full of weaks or neutrals takes about
1496 1 ms for each 20K characters. The number below limits each call to
1497 bidi_paragraph_init to less than 10 ms even on slow machines. */
1498 #define MAX_STRONG_CHAR_SEARCH 100000
1500 /* Starting from POS, find the first strong (L, R, or AL) character,
1501 while skipping over any characters between an isolate initiator and
1502 its matching PDI. STOP_AT_PDI non-zero means stop at the PDI that
1503 matches the isolate initiator at POS. Return the bidi type of the
1504 character where the search stopped. Give up if after examining
1505 MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1506 character was found. */
1507 static bidi_type_t
1508 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1509 ptrdiff_t *disp_pos, int *disp_prop,
1510 struct bidi_string_data *string, struct window *w,
1511 bool string_p, bool frame_window_p,
1512 ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1514 ptrdiff_t pos1;
1515 bidi_type_t type;
1516 int ch;
1518 if (stop_at_pdi)
1520 /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1521 at POS. Get past it. */
1522 #ifdef ENABLE_CHECKING
1523 ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1524 frame_window_p, ch_len, nchars);
1525 type = bidi_get_type (ch, NEUTRAL_DIR);
1526 eassert (type == FSI /* || type == LRI || type == RLI */);
1527 #endif
1528 pos += *nchars;
1529 bytepos += *ch_len;
1531 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1532 w, frame_window_p, ch_len, nchars);
1533 type = bidi_get_type (ch, NEUTRAL_DIR);
1535 pos1 = pos;
1536 for (pos += *nchars, bytepos += *ch_len;
1537 bidi_get_category (type) != STRONG
1538 /* If requested to stop at first PDI, stop there. */
1539 && !(stop_at_pdi && type == PDI)
1540 /* Stop when searched too far into an abnormally large
1541 paragraph full of weak or neutral characters. */
1542 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1543 type = bidi_get_type (ch, NEUTRAL_DIR))
1545 if (pos >= end)
1547 /* Pretend there's a paragraph separator at end of
1548 buffer/string. */
1549 type = NEUTRAL_B;
1550 break;
1552 if (!string_p
1553 && type == NEUTRAL_B
1554 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1555 break;
1556 /* Fetch next character and advance to get past it. */
1557 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1558 string, w, frame_window_p,
1559 ch_len, nchars);
1560 pos += *nchars;
1561 bytepos += *ch_len;
1563 return type;
1566 /* Determine the base direction, a.k.a. base embedding level, of the
1567 paragraph we are about to iterate through. If DIR is either L2R or
1568 R2L, just use that. Otherwise, determine the paragraph direction
1569 from the first strong directional character of the paragraph.
1571 NO_DEFAULT_P means don't default to L2R if the paragraph
1572 has no strong directional characters and both DIR and
1573 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1574 in the buffer until a paragraph is found with a strong character,
1575 or until hitting BEGV. In the latter case, fall back to L2R. This
1576 flag is used in current-bidi-paragraph-direction.
1578 Note that this function gives the paragraph separator the same
1579 direction as the preceding paragraph, even though Emacs generally
1580 views the separator as not belonging to any paragraph. */
1581 void
1582 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1584 ptrdiff_t bytepos = bidi_it->bytepos;
1585 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1586 ptrdiff_t pstartbyte;
1587 /* Note that begbyte is a byte position, while end is a character
1588 position. Yes, this is ugly, but we are trying to avoid costly
1589 calls to BYTE_TO_CHAR and its ilk. */
1590 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1591 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1593 /* Special case for an empty buffer. */
1594 if (bytepos == begbyte && bidi_it->charpos == end)
1595 dir = L2R;
1596 /* We should never be called at EOB or before BEGV. */
1597 else if (bidi_it->charpos >= end || bytepos < begbyte)
1598 emacs_abort ();
1600 if (dir == L2R)
1602 bidi_it->paragraph_dir = L2R;
1603 bidi_it->new_paragraph = 0;
1605 else if (dir == R2L)
1607 bidi_it->paragraph_dir = R2L;
1608 bidi_it->new_paragraph = 0;
1610 else if (dir == NEUTRAL_DIR) /* P2 */
1612 ptrdiff_t ch_len, nchars;
1613 ptrdiff_t pos, disp_pos = -1;
1614 int disp_prop = 0;
1615 bidi_type_t type;
1616 const unsigned char *s;
1618 if (!bidi_initialized)
1619 bidi_initialize ();
1621 /* If we are inside a paragraph separator, we are just waiting
1622 for the separator to be exhausted; use the previous paragraph
1623 direction. But don't do that if we have been just reseated,
1624 because we need to reinitialize below in that case. */
1625 if (!bidi_it->first_elt
1626 && bidi_it->charpos < bidi_it->separator_limit)
1627 return;
1629 /* If we are on a newline, get past it to where the next
1630 paragraph might start. But don't do that at BEGV since then
1631 we are potentially in a new paragraph that doesn't yet
1632 exist. */
1633 pos = bidi_it->charpos;
1634 s = (STRINGP (bidi_it->string.lstring)
1635 ? SDATA (bidi_it->string.lstring)
1636 : bidi_it->string.s);
1637 if (bytepos > begbyte
1638 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1640 bytepos++;
1641 pos++;
1644 /* We are either at the beginning of a paragraph or in the
1645 middle of it. Find where this paragraph starts. */
1646 if (string_p)
1648 /* We don't support changes of paragraph direction inside a
1649 string. It is treated as a single paragraph. */
1650 pstartbyte = 0;
1652 else
1653 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1654 bidi_it->separator_limit = -1;
1655 bidi_it->new_paragraph = 0;
1657 /* The following loop is run more than once only if NO_DEFAULT_P,
1658 and only if we are iterating on a buffer. */
1659 do {
1660 bytepos = pstartbyte;
1661 if (!string_p)
1662 pos = BYTE_TO_CHAR (bytepos);
1663 type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1664 &bidi_it->string, bidi_it->w,
1665 string_p, bidi_it->frame_window_p,
1666 &ch_len, &nchars, false);
1667 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1668 bidi_it->paragraph_dir = R2L;
1669 else if (type == STRONG_L)
1670 bidi_it->paragraph_dir = L2R;
1671 if (!string_p
1672 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1674 /* If this paragraph is at BEGV, default to L2R. */
1675 if (pstartbyte == BEGV_BYTE)
1676 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1677 else
1679 ptrdiff_t prevpbyte = pstartbyte;
1680 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1682 /* Find the beginning of the previous paragraph, if any. */
1683 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1685 /* FXIME: What if p is covered by a display
1686 string? See also a FIXME inside
1687 bidi_find_paragraph_start. */
1688 DEC_BOTH (p, pbyte);
1689 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1691 pstartbyte = prevpbyte;
1694 } while (!string_p
1695 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1697 else
1698 emacs_abort ();
1700 /* Contrary to UAX#9 clause P3, we only default the paragraph
1701 direction to L2R if we have no previous usable paragraph
1702 direction. This is allowed by the HL1 clause. */
1703 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1704 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1705 if (bidi_it->paragraph_dir == R2L)
1706 bidi_it->level_stack[0].level = 1;
1707 else
1708 bidi_it->level_stack[0].level = 0;
1710 bidi_line_init (bidi_it);
1714 /***********************************************************************
1715 Resolving explicit and implicit levels.
1716 The rest of this file constitutes the core of the UBA implementation.
1717 ***********************************************************************/
1719 static bool
1720 bidi_explicit_dir_char (int ch)
1722 bidi_type_t ch_type;
1724 if (!bidi_initialized)
1725 emacs_abort ();
1726 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1727 return (ch_type == LRE || ch_type == LRO
1728 || ch_type == RLE || ch_type == RLO
1729 || ch_type == PDF);
1732 /* Given an iterator state in BIDI_IT, advance one character position
1733 in the buffer/string to the next character (in the logical order),
1734 resolve any explicit embeddings, directional overrides, and isolate
1735 initiators and terminators, and return the embedding level of the
1736 character after resolving these explicit directives. */
1737 static int
1738 bidi_resolve_explicit (struct bidi_it *bidi_it)
1740 int curchar;
1741 bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1742 int current_level;
1743 int new_level;
1744 bidi_dir_t override;
1745 bool isolate_status;
1746 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1747 ptrdiff_t ch_len, nchars, disp_pos, end;
1748 int disp_prop;
1749 ptrdiff_t eob
1750 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1751 ? bidi_it->string.schars : ZV);
1753 /* Record the info about the previous character. */
1754 if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1755 && bidi_it->type != WEAK_BN)
1757 /* This special case is needed in support of Unicode 8.0
1758 correction to N0, as implemented in bidi_resolve_weak/W1
1759 below. */
1760 if (bidi_it->type_after_wn == NEUTRAL_ON
1761 && bidi_get_category (bidi_it->type) == STRONG
1762 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1763 bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1764 else
1765 bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1767 if (bidi_it->type_after_wn == STRONG_R
1768 || bidi_it->type_after_wn == STRONG_L
1769 || bidi_it->type_after_wn == STRONG_AL)
1770 bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1771 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1772 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1773 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1775 /* If we overstepped the characters used for resolving neutrals
1776 and whitespace, invalidate their info in the iterator. */
1777 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1779 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1780 /* If needed, reset the "magical" value of pairing bracket
1781 position, so that bidi_resolve_brackets will resume
1782 resolution of brackets according to BPA. */
1783 if (bidi_it->bracket_pairing_pos == eob)
1784 bidi_it->bracket_pairing_pos = -1;
1786 if (bidi_it->next_en_pos >= 0
1787 && bidi_it->charpos >= bidi_it->next_en_pos)
1789 bidi_it->next_en_pos = 0;
1790 bidi_it->next_en_type = UNKNOWN_BT;
1793 /* Reset the bracket resolution info, unless we previously decided
1794 (in bidi_find_bracket_pairs) that brackets in this level run
1795 should be resolved as neutrals. */
1796 if (bidi_it->bracket_pairing_pos != eob)
1798 bidi_it->bracket_pairing_pos = -1;
1799 bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1802 /* If reseat()'ed, don't advance, so as to start iteration from the
1803 position where we were reseated. bidi_it->bytepos can be less
1804 than BEGV_BYTE after reseat to BEGV. */
1805 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1806 || bidi_it->first_elt)
1808 bidi_it->first_elt = 0;
1809 if (string_p)
1811 const unsigned char *p
1812 = (STRINGP (bidi_it->string.lstring)
1813 ? SDATA (bidi_it->string.lstring)
1814 : bidi_it->string.s);
1816 if (bidi_it->charpos < 0)
1817 bidi_it->charpos = bidi_it->bytepos = 0;
1818 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1819 bidi_it->charpos,
1820 bidi_it->string.unibyte));
1822 else
1824 if (bidi_it->charpos < BEGV)
1826 bidi_it->charpos = BEGV;
1827 bidi_it->bytepos = BEGV_BYTE;
1829 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1831 /* Determine the original bidi type of the previous character,
1832 which is needed for handling isolate initiators and PDF. The
1833 type of the previous character will be non-trivial only if
1834 our caller moved through some previous text in
1835 get_visually_first_element, in which case bidi_it->prev holds
1836 the information we want. */
1837 if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1839 eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1840 prev_type = bidi_it->prev.orig_type;
1841 if (prev_type == FSI)
1842 prev_type = bidi_it->type_after_wn;
1845 /* Don't move at end of buffer/string. */
1846 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1848 /* Advance to the next character, skipping characters covered by
1849 display strings (nchars > 1). */
1850 if (bidi_it->nchars <= 0)
1851 emacs_abort ();
1852 bidi_it->charpos += bidi_it->nchars;
1853 if (bidi_it->ch_len == 0)
1854 emacs_abort ();
1855 bidi_it->bytepos += bidi_it->ch_len;
1856 prev_type = bidi_it->orig_type;
1857 if (prev_type == FSI)
1858 prev_type = bidi_it->type_after_wn;
1860 else /* EOB or end of string */
1861 prev_type = NEUTRAL_B;
1863 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1864 isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
1865 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
1866 new_level = current_level;
1868 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1870 curchar = BIDI_EOB;
1871 bidi_it->ch_len = 1;
1872 bidi_it->nchars = 1;
1873 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1874 bidi_it->disp_prop = 0;
1876 else
1878 /* LRI, RLI, and FSI increment, and PDF decrements, the
1879 embedding level of the _following_ characters, so we must
1880 first look at the type of the previous character to support
1881 that. */
1882 switch (prev_type)
1884 case RLI: /* X5a */
1885 if (current_level < BIDI_MAXDEPTH
1886 && bidi_it->invalid_levels == 0
1887 && bidi_it->invalid_isolates == 0)
1889 new_level = ((current_level + 1) & ~1) + 1;
1890 bidi_it->isolate_level++;
1891 bidi_push_embedding_level (bidi_it, new_level,
1892 NEUTRAL_DIR, true);
1894 else
1895 bidi_it->invalid_isolates++;
1896 break;
1897 case LRI: /* X5b */
1898 if (current_level < BIDI_MAXDEPTH - 1
1899 && bidi_it->invalid_levels == 0
1900 && bidi_it->invalid_isolates == 0)
1902 new_level = ((current_level + 2) & ~1);
1903 bidi_it->isolate_level++;
1904 bidi_push_embedding_level (bidi_it, new_level,
1905 NEUTRAL_DIR, true);
1907 else
1908 bidi_it->invalid_isolates++;
1909 break;
1910 case PDF: /* X7 */
1911 if (!bidi_it->invalid_isolates)
1913 if (bidi_it->invalid_levels)
1914 bidi_it->invalid_levels--;
1915 else if (!isolate_status && bidi_it->stack_idx >= 1)
1916 new_level = bidi_pop_embedding_level (bidi_it);
1918 break;
1919 default:
1920 eassert (prev_type != FSI);
1921 /* Nothing. */
1922 break;
1924 /* Fetch the character at BYTEPOS. If it is covered by a
1925 display string, treat the entire run of covered characters as
1926 a single character u+FFFC. */
1927 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1928 &bidi_it->disp_pos, &bidi_it->disp_prop,
1929 &bidi_it->string, bidi_it->w,
1930 bidi_it->frame_window_p,
1931 &bidi_it->ch_len, &bidi_it->nchars);
1933 bidi_it->ch = curchar;
1934 bidi_it->resolved_level = new_level;
1936 /* Don't apply directional override here, as all the types we handle
1937 below will not be affected by the override anyway, and we need
1938 the original type unaltered. The override will be applied in
1939 bidi_resolve_weak. */
1940 type = bidi_get_type (curchar, NEUTRAL_DIR);
1941 bidi_it->orig_type = type;
1942 bidi_check_type (bidi_it->orig_type);
1944 bidi_it->type_after_wn = UNKNOWN_BT;
1946 switch (type)
1948 case RLE: /* X2 */
1949 case RLO: /* X4 */
1950 bidi_it->type_after_wn = type;
1951 bidi_check_type (bidi_it->type_after_wn);
1952 type = WEAK_BN; /* X9/Retaining */
1953 if (new_level < BIDI_MAXDEPTH
1954 && bidi_it->invalid_levels == 0
1955 && bidi_it->invalid_isolates == 0)
1957 /* Compute the least odd embedding level greater than
1958 the current level. */
1959 new_level = ((new_level + 1) & ~1) + 1;
1960 if (bidi_it->type_after_wn == RLE)
1961 override = NEUTRAL_DIR;
1962 else
1963 override = R2L;
1964 bidi_push_embedding_level (bidi_it, new_level, override, false);
1965 bidi_it->resolved_level = new_level;
1967 else
1969 if (bidi_it->invalid_isolates == 0)
1970 bidi_it->invalid_levels++;
1972 break;
1973 case LRE: /* X3 */
1974 case LRO: /* X5 */
1975 bidi_it->type_after_wn = type;
1976 bidi_check_type (bidi_it->type_after_wn);
1977 type = WEAK_BN; /* X9/Retaining */
1978 if (new_level < BIDI_MAXDEPTH - 1
1979 && bidi_it->invalid_levels == 0
1980 && bidi_it->invalid_isolates == 0)
1982 /* Compute the least even embedding level greater than
1983 the current level. */
1984 new_level = ((new_level + 2) & ~1);
1985 if (bidi_it->type_after_wn == LRE)
1986 override = NEUTRAL_DIR;
1987 else
1988 override = L2R;
1989 bidi_push_embedding_level (bidi_it, new_level, override, false);
1990 bidi_it->resolved_level = new_level;
1992 else
1994 if (bidi_it->invalid_isolates == 0)
1995 bidi_it->invalid_levels++;
1997 break;
1998 case FSI: /* X5c */
1999 end = string_p ? bidi_it->string.schars : ZV;
2000 disp_pos = bidi_it->disp_pos;
2001 disp_prop = bidi_it->disp_prop;
2002 nchars = bidi_it->nchars;
2003 ch_len = bidi_it->ch_len;
2004 typ1 = find_first_strong_char (bidi_it->charpos,
2005 bidi_it->bytepos, end,
2006 &disp_pos, &disp_prop,
2007 &bidi_it->string, bidi_it->w,
2008 string_p, bidi_it->frame_window_p,
2009 &ch_len, &nchars, true);
2010 if (typ1 != STRONG_R && typ1 != STRONG_AL)
2012 type = LRI;
2013 goto fsi_as_lri;
2015 else
2016 type = RLI;
2017 /* FALLTHROUGH */
2018 case RLI: /* X5a */
2019 if (override == NEUTRAL_DIR)
2020 bidi_it->type_after_wn = type;
2021 else /* Unicode 8.0 correction. */
2022 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2023 bidi_check_type (bidi_it->type_after_wn);
2024 break;
2025 case LRI: /* X5b */
2026 fsi_as_lri:
2027 if (override == NEUTRAL_DIR)
2028 bidi_it->type_after_wn = type;
2029 else /* Unicode 8.0 correction. */
2030 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2031 bidi_check_type (bidi_it->type_after_wn);
2032 break;
2033 case PDI: /* X6a */
2034 if (bidi_it->invalid_isolates)
2035 bidi_it->invalid_isolates--;
2036 else if (bidi_it->isolate_level > 0)
2038 bidi_it->invalid_levels = 0;
2039 while (!ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2040 bidi_pop_embedding_level (bidi_it);
2041 eassert (bidi_it->stack_idx > 0);
2042 new_level = bidi_pop_embedding_level (bidi_it);
2043 bidi_it->isolate_level--;
2045 bidi_it->resolved_level = new_level;
2046 /* Unicode 8.0 correction. */
2048 bidi_dir_t stack_override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2049 if (stack_override == L2R)
2050 bidi_it->type_after_wn = STRONG_L;
2051 else if (stack_override == R2L)
2052 bidi_it->type_after_wn = STRONG_R;
2053 else
2054 bidi_it->type_after_wn = type;
2056 break;
2057 case PDF: /* X7 */
2058 bidi_it->type_after_wn = type;
2059 bidi_check_type (bidi_it->type_after_wn);
2060 type = WEAK_BN; /* X9/Retaining */
2061 break;
2062 default:
2063 /* Nothing. */
2064 break;
2067 bidi_it->type = type;
2068 bidi_check_type (bidi_it->type);
2070 if (bidi_it->type == NEUTRAL_B) /* X8 */
2072 bidi_set_paragraph_end (bidi_it);
2073 /* This is needed by bidi_resolve_weak below, and in L1. */
2074 bidi_it->type_after_wn = bidi_it->type;
2077 eassert (bidi_it->resolved_level >= 0);
2078 return bidi_it->resolved_level;
2081 /* Advance in the buffer/string, resolve weak types and return the
2082 type of the next character after weak type resolution. */
2083 static bidi_type_t
2084 bidi_resolve_weak (struct bidi_it *bidi_it)
2086 bidi_type_t type;
2087 bidi_dir_t override;
2088 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2089 int new_level = bidi_resolve_explicit (bidi_it);
2090 int next_char;
2091 bidi_type_t type_of_next;
2092 struct bidi_it saved_it;
2093 ptrdiff_t eob
2094 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2095 ? bidi_it->string.schars : ZV);
2097 type = bidi_it->type;
2098 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2100 eassert (!(type == UNKNOWN_BT
2101 || type == LRE
2102 || type == LRO
2103 || type == RLE
2104 || type == RLO
2105 || type == PDF));
2107 eassert (prev_level >= 0);
2108 if (bidi_it->type == NEUTRAL_B)
2110 /* We've got a new isolating sequence, compute the directional
2111 type of sos and initialize per-run variables (UAX#9, clause
2112 X10). */
2113 bidi_set_sos_type (bidi_it, prev_level, new_level);
2115 if (type == NEUTRAL_S || type == NEUTRAL_WS
2116 || type == WEAK_BN || type == STRONG_AL)
2117 bidi_it->type_after_wn = type; /* needed in L1 */
2118 bidi_check_type (bidi_it->type_after_wn);
2120 /* Level and directional override status are already recorded in
2121 bidi_it, and do not need any change; see X6. */
2122 if (override == R2L) /* X6 */
2123 type = STRONG_R;
2124 else if (override == L2R)
2125 type = STRONG_L;
2126 else
2128 if (type == WEAK_NSM) /* W1 */
2130 /* Note that we don't need to consider the case where the
2131 prev character has its type overridden by an RLO or LRO,
2132 because then either the type of this NSM would have been
2133 also overridden, or the previous character is outside the
2134 current level run, and thus not relevant to this NSM.
2135 This is why NSM gets the type_after_wn of the previous
2136 character. */
2137 /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT. */
2138 if (bidi_it->prev.type != UNKNOWN_BT
2139 /* If type_after_wn is NEUTRAL_B, this NSM is at sos. */
2140 && bidi_it->prev.type != NEUTRAL_B)
2142 if (bidi_isolate_fmt_char (bidi_it->prev.type))
2144 /* From W1: "Note that in an isolating run sequence,
2145 an isolate initiator followed by an NSM or any
2146 type other than PDI must be an overflow isolate
2147 initiator." */
2148 eassert (bidi_it->invalid_isolates > 0);
2149 type = NEUTRAL_ON;
2151 else
2153 /* This includes the Unicode 8.0 correction for N0,
2154 due to how we set prev.type in bidi_resolve_explicit,
2155 which see. */
2156 type = bidi_it->prev.type;
2159 else if (bidi_it->sos == R2L)
2160 type = STRONG_R;
2161 else if (bidi_it->sos == L2R)
2162 type = STRONG_L;
2163 else /* shouldn't happen! */
2164 emacs_abort ();
2166 if (type == WEAK_EN /* W2 */
2167 && bidi_it->last_strong.type == STRONG_AL)
2168 type = WEAK_AN;
2169 else if (type == STRONG_AL) /* W3 */
2170 type = STRONG_R;
2171 else if ((type == WEAK_ES /* W4 */
2172 && bidi_it->prev.type == WEAK_EN
2173 && bidi_it->prev.orig_type == WEAK_EN)
2174 || (type == WEAK_CS
2175 && ((bidi_it->prev.type == WEAK_EN
2176 && bidi_it->prev.orig_type == WEAK_EN)
2177 || bidi_it->prev.type == WEAK_AN)))
2179 const unsigned char *s
2180 = (STRINGP (bidi_it->string.lstring)
2181 ? SDATA (bidi_it->string.lstring)
2182 : bidi_it->string.s);
2184 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2185 ? BIDI_EOB
2186 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2187 s, bidi_it->string.unibyte));
2188 type_of_next = bidi_get_type (next_char, override);
2190 if (type_of_next == WEAK_BN
2191 || bidi_explicit_dir_char (next_char))
2193 bidi_copy_it (&saved_it, bidi_it);
2194 while (bidi_resolve_explicit (bidi_it) == new_level
2195 && bidi_it->type == WEAK_BN)
2196 type_of_next = bidi_it->type;
2197 bidi_copy_it (bidi_it, &saved_it);
2200 /* If the next character is EN, but the last strong-type
2201 character is AL, that next EN will be changed to AN when
2202 we process it in W2 above. So in that case, this ES
2203 should not be changed into EN. */
2204 if (type == WEAK_ES
2205 && type_of_next == WEAK_EN
2206 && bidi_it->last_strong.type != STRONG_AL)
2207 type = WEAK_EN;
2208 else if (type == WEAK_CS)
2210 if (bidi_it->prev.type == WEAK_AN
2211 && (type_of_next == WEAK_AN
2212 /* If the next character is EN, but the last
2213 strong-type character is AL, EN will be later
2214 changed to AN when we process it in W2 above.
2215 So in that case, this ES should not be
2216 changed into EN. */
2217 || (type_of_next == WEAK_EN
2218 && bidi_it->last_strong.type == STRONG_AL)))
2219 type = WEAK_AN;
2220 else if (bidi_it->prev.type == WEAK_EN
2221 && type_of_next == WEAK_EN
2222 && bidi_it->last_strong.type != STRONG_AL)
2223 type = WEAK_EN;
2226 else if (type == WEAK_ET /* W5: ET with EN before or after it */
2227 || type == WEAK_BN) /* W5/Retaining */
2229 if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2230 type = WEAK_EN;
2231 else if (bidi_it->next_en_pos > bidi_it->charpos
2232 && bidi_it->next_en_type != WEAK_BN)
2234 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2235 type = WEAK_EN;
2237 else if (bidi_it->next_en_pos >=0)
2239 /* We overstepped the last known position for ET
2240 resolution but there could be other such characters
2241 in this paragraph (when we are sure there are no more
2242 such positions, we set next_en_pos to a negative
2243 value). Try to find the next position for ET
2244 resolution. */
2245 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2246 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2247 ? SDATA (bidi_it->string.lstring)
2248 : bidi_it->string.s);
2250 if (bidi_it->nchars <= 0)
2251 emacs_abort ();
2252 next_char
2253 = (bidi_it->charpos + bidi_it->nchars >= eob
2254 ? BIDI_EOB
2255 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2256 bidi_it->string.unibyte));
2257 type_of_next = bidi_get_type (next_char, override);
2259 if (type_of_next == WEAK_ET
2260 || type_of_next == WEAK_BN
2261 || bidi_explicit_dir_char (next_char))
2263 bidi_copy_it (&saved_it, bidi_it);
2264 while (bidi_resolve_explicit (bidi_it) == new_level
2265 && (bidi_it->type == WEAK_BN
2266 || bidi_it->type == WEAK_ET))
2267 type_of_next = bidi_it->type;
2268 if (type == WEAK_BN
2269 && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2271 /* If we entered the above loop with a BN that
2272 changes the level, the type of next
2273 character, which is in a different level, is
2274 not relevant to resolving this series of ET
2275 and BN. */
2276 en_pos = saved_it.charpos;
2277 type_of_next = type;
2279 else
2280 en_pos = bidi_it->charpos;
2281 bidi_copy_it (bidi_it, &saved_it);
2283 /* Remember this position, to speed up processing of the
2284 next ETs. */
2285 bidi_it->next_en_pos = en_pos;
2286 if (type_of_next == WEAK_EN)
2288 /* If the last strong character is AL, the EN we've
2289 found will become AN when we get to it (W2). */
2290 if (bidi_it->last_strong.type == STRONG_AL)
2291 type_of_next = WEAK_AN;
2292 else if (type == WEAK_BN)
2293 type = NEUTRAL_ON; /* W6/Retaining */
2294 else
2295 type = WEAK_EN;
2297 else if (type_of_next == NEUTRAL_B)
2298 /* Record the fact that there are no more ENs from
2299 here to the end of paragraph, to avoid entering the
2300 loop above ever again in this paragraph. */
2301 bidi_it->next_en_pos = -1;
2302 /* Record the type of the character where we ended our search. */
2303 bidi_it->next_en_type = type_of_next;
2308 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2309 || (type == WEAK_BN
2310 && (bidi_it->prev.type == WEAK_CS /* W6/Retaining */
2311 || bidi_it->prev.type == WEAK_ES
2312 || bidi_it->prev.type == WEAK_ET)))
2313 type = NEUTRAL_ON;
2315 /* Store the type we've got so far, before we clobber it with strong
2316 types in W7 and while resolving neutral types. But leave alone
2317 the original types that were recorded above, because we will need
2318 them for the L1 clause. */
2319 if (bidi_it->type_after_wn == UNKNOWN_BT)
2320 bidi_it->type_after_wn = type;
2321 bidi_check_type (bidi_it->type_after_wn);
2323 if (type == WEAK_EN) /* W7 */
2325 if ((bidi_it->last_strong.type == STRONG_L)
2326 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2327 type = STRONG_L;
2330 bidi_it->type = type;
2331 bidi_check_type (bidi_it->type);
2332 return type;
2335 /* Resolve the type of a neutral character according to the type of
2336 surrounding strong text and the current embedding level. */
2337 static bidi_type_t
2338 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2340 /* N1: "European and Arabic numbers act as if they were R in terms
2341 of their influence on NIs." */
2342 if (next_type == WEAK_EN || next_type == WEAK_AN)
2343 next_type = STRONG_R;
2344 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2345 prev_type = STRONG_R;
2347 if (next_type == prev_type) /* N1 */
2348 return next_type;
2349 else if ((lev & 1) == 0) /* N2 */
2350 return STRONG_L;
2351 else
2352 return STRONG_R;
2355 #define FLAG_EMBEDDING_INSIDE 1
2356 #define FLAG_OPPOSITE_INSIDE 2
2358 /* A data type used in the stack maintained by
2359 bidi_find_bracket_pairs below. */
2360 typedef struct bpa_stack_entry {
2361 int close_bracket_char;
2362 int open_bracket_idx;
2363 #ifdef ENABLE_CHECKING
2364 ptrdiff_t open_bracket_pos;
2365 #endif
2366 unsigned flags : 2;
2367 } bpa_stack_entry;
2369 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2370 BPA stack, which should be more than enough for actual bidi text. */
2371 #define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2373 /* UAX#9 says to match opening brackets with the matching closing
2374 brackets or their canonical equivalents. As of Unicode 7.0, there
2375 are only 2 bracket characters that have canonical equivalence
2376 decompositions: u+2329 and u+232A. So instead of accessing the
2377 table in uni-decomposition.el, we just handle these 2 characters
2378 with this simple macro. Note that ASCII characters don't have
2379 canonical equivalents by definition. */
2381 /* To find all the characters that need to be processed by
2382 CANONICAL_EQU, first find all the characters which have
2383 decompositions in UnicodeData.txt, with this Awk script:
2385 awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2387 Then produce a list of all the bracket characters in BidiBrackets.txt:
2389 awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2391 And finally, cross-reference these two:
2393 fgrep -w -f brackets.txt decompositions.txt
2395 where "decompositions.txt" was produced by the 1st script, and
2396 "brackets.txt" by the 2nd script. In the output of fgrep, look
2397 only for decompositions that don't begin with some compatibility
2398 formatting tag, such as "<compat>". Only decompositions that
2399 consist solely of character codepoints are relevant to bidi
2400 brackets processing. */
2402 #define CANONICAL_EQU(c) \
2403 ( ASCII_CHAR_P (c) ? c \
2404 : (c) == 0x2329 ? 0x3008 \
2405 : (c) == 0x232a ? 0x3009 \
2406 : c )
2408 #ifdef ENABLE_CHECKING
2409 # define STORE_BRACKET_CHARPOS \
2410 bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2411 #else
2412 # define STORE_BRACKET_CHARPOS /* nothing */
2413 #endif
2415 #define PUSH_BPA_STACK \
2416 do { \
2417 int ch; \
2418 if (bpa_sp < MAX_BPA_STACK - 1) \
2420 bpa_sp++; \
2421 ch = CANONICAL_EQU (bidi_it->ch); \
2422 bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch); \
2423 bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx; \
2424 bpa_stack[bpa_sp].flags = 0; \
2425 STORE_BRACKET_CHARPOS; \
2427 } while (0)
2430 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2431 described in BD16 and N0 of UAX#9. It finds all the bracket pairs
2432 in the current isolating sequence, and records the enclosed type
2433 and the position of the matching bracket in the cache. It returns
2434 non-zero if called with the iterator on the opening bracket which
2435 has a matching closing bracket in the current isolating sequence,
2436 zero otherwise. */
2437 static bool
2438 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2440 bidi_bracket_type_t btype;
2441 bidi_type_t type = bidi_it->type;
2442 bool retval = false;
2444 /* When scanning backwards, we don't expect any unresolved bidi
2445 bracket characters. */
2446 if (bidi_it->scan_dir != 1)
2447 emacs_abort ();
2449 btype = bidi_paired_bracket_type (bidi_it->ch);
2450 if (btype == BIDI_BRACKET_OPEN)
2452 bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2453 int bpa_sp = -1;
2454 struct bidi_it saved_it;
2455 int base_level = bidi_it->level_stack[0].level;
2456 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2457 int maxlevel = embedding_level;
2458 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2459 struct bidi_it tem_it;
2460 bool l2r_seen = false, r2l_seen = false;
2461 ptrdiff_t pairing_pos;
2463 eassert (MAX_BPA_STACK >= 100);
2464 bidi_copy_it (&saved_it, bidi_it);
2465 /* bidi_cache_iterator_state refuses to cache on backward scans,
2466 and bidi_cache_fetch_state doesn't bring scan_dir from the
2467 cache, so we must initialize this explicitly. */
2468 tem_it.scan_dir = 1;
2470 while (1)
2472 int old_sidx, new_sidx;
2473 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2475 if (maxlevel < current_level)
2476 maxlevel = current_level;
2477 /* Mark every opening bracket character we've traversed by
2478 putting its own position into bracket_pairing_pos. This
2479 is examined in bidi_resolve_brackets to distinguish
2480 brackets that were already resolved to stay NEUTRAL_ON,
2481 and those that were not yet processed by this function
2482 (because they were skipped when we skip higher embedding
2483 levels below). */
2484 if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2485 bidi_it->bracket_pairing_pos = bidi_it->charpos;
2486 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2487 if (btype == BIDI_BRACKET_OPEN)
2488 PUSH_BPA_STACK;
2489 else if (btype == BIDI_BRACKET_CLOSE)
2491 int sp = bpa_sp;
2492 int curchar = CANONICAL_EQU (bidi_it->ch);
2494 eassert (sp >= 0);
2495 while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2496 sp--;
2497 if (sp >= 0)
2499 /* Update and cache the corresponding opening bracket. */
2500 bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2501 &tem_it);
2502 #ifdef ENABLE_CHECKING
2503 eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2504 #endif
2505 /* Determine the enclosed type for this bracket
2506 pair's type resolution according to N0. */
2507 if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2508 tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2509 else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2510 tem_it.bracket_enclosed_type /* N0c */
2511 = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2512 else /* N0d */
2513 tem_it.bracket_enclosed_type = UNKNOWN_BT;
2515 /* Record the position of the matching closing
2516 bracket, and update the cache. */
2517 tem_it.bracket_pairing_pos = bidi_it->charpos;
2518 bidi_cache_iterator_state (&tem_it, 0, 1);
2520 /* Pop the BPA stack. */
2521 bpa_sp = sp - 1;
2523 if (bpa_sp < 0)
2525 retval = true;
2526 break;
2529 else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2531 unsigned flag = 0;
2532 int sp;
2534 /* Whenever we see a strong type, update the flags of
2535 all the slots on the stack. */
2536 switch (bidi_it->type)
2538 case STRONG_L:
2539 flag = ((embedding_level & 1) == 0
2540 ? FLAG_EMBEDDING_INSIDE
2541 : FLAG_OPPOSITE_INSIDE);
2542 l2r_seen = true;
2543 break;
2544 case STRONG_R:
2545 case WEAK_EN:
2546 case WEAK_AN:
2547 flag = ((embedding_level & 1) == 1
2548 ? FLAG_EMBEDDING_INSIDE
2549 : FLAG_OPPOSITE_INSIDE);
2550 r2l_seen = true;
2551 break;
2552 default:
2553 break;
2555 if (flag)
2557 for (sp = bpa_sp; sp >= 0; sp--)
2558 bpa_stack[sp].flags |= flag;
2561 old_sidx = bidi_it->stack_idx;
2562 type = bidi_resolve_weak (bidi_it);
2563 /* Skip level runs excluded from this isolating run sequence. */
2564 new_sidx = bidi_it->stack_idx;
2565 if (bidi_it->level_stack[new_sidx].level > current_level
2566 && (ISOLATE_STATUS (bidi_it, new_sidx)
2567 || (new_sidx > old_sidx + 1
2568 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
2570 while (bidi_it->level_stack[bidi_it->stack_idx].level
2571 > current_level)
2573 if (maxlevel < bidi_it->level_stack[bidi_it->stack_idx].level)
2574 maxlevel = bidi_it->level_stack[bidi_it->stack_idx].level;
2575 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2576 type = bidi_resolve_weak (bidi_it);
2579 if (type == NEUTRAL_B
2580 || (bidi_it->level_stack[bidi_it->stack_idx].level
2581 != current_level))
2583 /* We've marched all the way to the end of this
2584 isolating run sequence, and didn't find matching
2585 closing brackets for some opening brackets. Leave
2586 their type unchanged. */
2587 pairing_pos = bidi_it->charpos;
2588 break;
2590 if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2591 btype = bidi_paired_bracket_type (bidi_it->ch);
2592 else
2593 btype = BIDI_BRACKET_NONE;
2596 /* Restore bidi_it from the cache, which should have the bracket
2597 resolution members set as determined by the above loop. */
2598 type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2599 eassert (type == NEUTRAL_ON);
2601 /* The following is an optimization for bracketed text that has
2602 only one level which is equal to the paragraph's base
2603 embedding level. That is, only L2R and weak/neutral
2604 characters in a L2R paragraph, or only R2L and weak/neutral
2605 characters in a R2L paragraph. Such brackets can be resolved
2606 by bidi_resolve_neutral, which has a further shortcut for
2607 this case. So we pretend we did not resolve the brackets in
2608 this case, set up next_for_neutral for the entire bracketed
2609 text, and reset the cache to the character before the opening
2610 bracket. The upshot is to allow bidi_move_to_visually_next
2611 reset the cache when it returns this opening bracket, thus
2612 cutting significantly on the size of the cache, which is
2613 important with long lines, especially if word-wrap is non-nil
2614 (which requires the display engine to copy the cache back and
2615 forth many times). */
2616 if (maxlevel == base_level
2617 && ((base_level == 0 && !r2l_seen)
2618 || (base_level == 1 && !l2r_seen)))
2620 ptrdiff_t eob
2621 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2622 ? bidi_it->string.schars : ZV);
2624 if (retval)
2625 pairing_pos = bidi_it->bracket_pairing_pos;
2627 /* This special value (which cannot possibly happen when
2628 brackets are resolved, since there's no character at ZV)
2629 will be noticed by bidi_resolve_explicit, and will be
2630 copied to the following iterator states, instead of being
2631 reset to -1. */
2632 bidi_it->bracket_pairing_pos = eob;
2633 /* This type value will be used for resolving the outermost
2634 closing bracket in bidi_resolve_brackets. */
2635 bidi_it->bracket_enclosed_type = embedding_type;
2636 /* bidi_cache_last_idx is set to the index of the current
2637 state, because we just called bidi_cache_find above.
2638 That state describes the outermost opening bracket, the
2639 one with which we entered this function. Force the cache
2640 to "forget" all the cached states starting from that state. */
2641 bidi_cache_reset_to (bidi_cache_last_idx - bidi_cache_start);
2642 /* Set up the next_for_neutral member, to help
2643 bidi_resolve_neutral. */
2644 bidi_it->next_for_neutral.type = embedding_type;
2645 bidi_it->next_for_neutral.charpos = pairing_pos;
2646 /* Pretend we didn't resolve this bracket. */
2647 retval = false;
2651 return retval;
2654 static void
2655 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2656 bool nextp)
2658 int idx;
2660 for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2662 int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2664 if (lev <= level)
2666 eassert (lev == level);
2667 if (nextp)
2668 bidi_cache[idx].next_for_neutral = *info;
2669 else
2670 bidi_cache[idx].prev_for_neutral = *info;
2671 break;
2676 static bidi_type_t
2677 bidi_resolve_brackets (struct bidi_it *bidi_it)
2679 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2680 bool resolve_bracket = false;
2681 bidi_type_t type = UNKNOWN_BT;
2682 int ch;
2683 struct bidi_saved_info prev_for_neutral, next_for_neutral;
2684 ptrdiff_t eob
2685 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2686 ? bidi_it->string.schars : ZV);
2688 /* Record the prev_for_neutral type either from the previous
2689 character, if it was a strong or AN/EN, or from the
2690 prev_for_neutral information recorded previously. */
2691 if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2692 || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2693 bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2694 else
2695 prev_for_neutral = bidi_it->prev_for_neutral;
2696 /* Record the next_for_neutral type information. */
2697 if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2698 next_for_neutral = bidi_it->next_for_neutral;
2699 else
2700 next_for_neutral.charpos = -1;
2701 if (!bidi_it->first_elt)
2703 type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2704 ch = bidi_it->ch;
2706 if (type == UNKNOWN_BT)
2708 type = bidi_resolve_weak (bidi_it);
2709 if (type == NEUTRAL_ON)
2711 /* bracket_pairing_pos == eob means this bracket does not
2712 need to be resolved as a bracket, but as a neutral, see
2713 the optimization trick we play near the end of
2714 bidi_find_bracket_pairs. */
2715 if (bidi_it->bracket_pairing_pos == eob)
2717 /* If this is the outermost closing bracket of a run of
2718 characters in which we decided to resolve brackets as
2719 neutrals, use the embedding level's type, recorded in
2720 bracket_enclosed_type, to resolve the bracket. */
2721 if (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2722 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
2723 type = bidi_it->bracket_enclosed_type;
2725 else if (bidi_find_bracket_pairs (bidi_it))
2726 resolve_bracket = true;
2729 else if (bidi_it->bracket_pairing_pos != eob)
2731 eassert (bidi_it->resolved_level == -1);
2732 /* If the cached state shows an increase of embedding level due
2733 to an isolate initiator, we need to update the 1st cached
2734 state of the next run of the current isolating sequence with
2735 the prev_for_neutral and next_for_neutral information, so
2736 that it will be picked up when we advance to that next run. */
2737 if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2738 && ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2740 bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2741 bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2743 if (type == NEUTRAL_ON
2744 && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2746 if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2748 /* A cached opening bracket that wasn't completely
2749 resolved yet. */
2750 resolve_bracket = true;
2752 else if (bidi_it->bracket_pairing_pos == -1)
2754 /* Higher levels were not BPA-resolved yet, even if
2755 cached by bidi_find_bracket_pairs. Force application
2756 of BPA to the new level now. */
2757 if (bidi_find_bracket_pairs (bidi_it))
2758 resolve_bracket = true;
2761 /* Keep track of the prev_for_neutral and next_for_neutral
2762 types, needed for resolving brackets below and for resolving
2763 neutrals in bidi_resolve_neutral. */
2764 if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2766 bidi_it->prev_for_neutral = prev_for_neutral;
2767 if (next_for_neutral.charpos > 0)
2768 bidi_it->next_for_neutral = next_for_neutral;
2772 /* If needed, resolve the bracket type according to N0. */
2773 if (resolve_bracket)
2775 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2776 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2778 eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2779 eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2780 if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2781 type = embedding_type;
2782 else
2784 switch (bidi_it->prev_for_neutral.type)
2786 case STRONG_R:
2787 case WEAK_EN:
2788 case WEAK_AN:
2789 type =
2790 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2791 ? STRONG_R /* N0c1 */
2792 : embedding_type; /* N0c2 */
2793 break;
2794 case STRONG_L:
2795 type =
2796 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2797 ? STRONG_L /* N0c1 */
2798 : embedding_type; /* N0c2 */
2799 break;
2800 default:
2801 /* N0d: Do not set the type for that bracket pair. */
2802 break;
2805 eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2807 /* Update the type of the paired closing bracket to the same
2808 type as for the resolved opening bracket. */
2809 if (type != NEUTRAL_ON)
2811 ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2812 -1, 1);
2814 if (idx < bidi_cache_start)
2815 emacs_abort ();
2816 bidi_cache[idx].type = type;
2820 return type;
2823 static bidi_type_t
2824 bidi_resolve_neutral (struct bidi_it *bidi_it)
2826 bidi_type_t type = bidi_resolve_brackets (bidi_it);
2827 int current_level;
2828 bool is_neutral;
2830 eassert (type == STRONG_R
2831 || type == STRONG_L
2832 || type == WEAK_BN
2833 || type == WEAK_EN
2834 || type == WEAK_AN
2835 || type == NEUTRAL_B
2836 || type == NEUTRAL_S
2837 || type == NEUTRAL_WS
2838 || type == NEUTRAL_ON
2839 || type == LRI
2840 || type == RLI
2841 || type == PDI);
2843 current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2844 eassert (current_level >= 0);
2845 is_neutral = bidi_get_category (type) == NEUTRAL;
2847 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2848 we are already at paragraph end. */
2849 && (is_neutral || bidi_isolate_fmt_char (type)))
2850 /* N1-N2/Retaining */
2851 || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
2853 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2855 /* Make sure the data for resolving neutrals we are
2856 about to use is valid. */
2857 eassert (bidi_it->next_for_neutral.charpos > bidi_it->charpos
2858 /* PDI defines an eos, so it's OK for it to
2859 serve as its own next_for_neutral. */
2860 || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2861 && bidi_it->type == PDI));
2862 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2863 bidi_it->next_for_neutral.type,
2864 current_level);
2866 /* The next two "else if" clauses are shortcuts for the
2867 important special case when we have a long sequence of
2868 neutral or WEAK_BN characters, such as whitespace or nulls or
2869 other control characters, on the base embedding level of the
2870 paragraph, and that sequence goes all the way to the end of
2871 the paragraph and follows a character whose resolved
2872 directionality is identical to the base embedding level.
2873 (This is what happens in a buffer with plain L2R text that
2874 happens to include long sequences of control characters.) By
2875 virtue of N1, the result of examining this long sequence will
2876 always be either STRONG_L or STRONG_R, depending on the base
2877 embedding level. So we use this fact directly instead of
2878 entering the expensive loop in the "else" clause. */
2879 else if (current_level == 0
2880 && bidi_it->prev_for_neutral.type == STRONG_L
2881 && !bidi_explicit_dir_char (bidi_it->ch)
2882 && !bidi_isolate_fmt_char (type))
2883 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2884 STRONG_L, current_level);
2885 else if (/* current level is 1 */
2886 current_level == 1
2887 /* base embedding level is also 1 */
2888 && bidi_it->level_stack[0].level == 1
2889 /* previous character is one of those considered R for
2890 the purposes of W5 */
2891 && (bidi_it->prev_for_neutral.type == STRONG_R
2892 || bidi_it->prev_for_neutral.type == WEAK_EN
2893 || bidi_it->prev_for_neutral.type == WEAK_AN)
2894 && !bidi_explicit_dir_char (bidi_it->ch)
2895 && !bidi_isolate_fmt_char (type))
2896 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2897 STRONG_R, current_level);
2898 else
2900 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
2901 the assumption of batch-style processing; see clauses W4,
2902 W5, and especially N1, which require to look far forward
2903 (as well as back) in the buffer/string. May the fleas of
2904 a thousand camels infest the armpits of those who design
2905 supposedly general-purpose algorithms by looking at their
2906 own implementations, and fail to consider other possible
2907 implementations! */
2908 struct bidi_it saved_it;
2909 bidi_type_t next_type;
2910 bool adjacent_to_neutrals = is_neutral;
2912 bidi_copy_it (&saved_it, bidi_it);
2913 /* Scan the text forward until we find the first non-neutral
2914 character, and then use that to resolve the neutral we
2915 are dealing with now. We also cache the scanned iterator
2916 states, to salvage some of the effort later. */
2917 do {
2918 int old_sidx, new_sidx;
2920 /* Paragraph separators have their levels fully resolved
2921 at this point, so cache them as resolved. */
2922 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2923 old_sidx = bidi_it->stack_idx;
2924 type = bidi_resolve_brackets (bidi_it);
2925 /* Skip level runs excluded from this isolating run sequence. */
2926 new_sidx = bidi_it->stack_idx;
2927 if (bidi_it->level_stack[new_sidx].level > current_level
2928 && (ISOLATE_STATUS (bidi_it, new_sidx)
2929 /* This is for when we have an isolate initiator
2930 immediately followed by an embedding or
2931 override initiator, in which case we get the
2932 level stack pushed twice by the single call to
2933 bidi_resolve_weak above. */
2934 || (new_sidx > old_sidx + 1
2935 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
2937 while (bidi_it->level_stack[bidi_it->stack_idx].level
2938 > current_level)
2940 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
2941 type = bidi_resolve_brackets (bidi_it);
2944 if (!adjacent_to_neutrals
2945 && (bidi_get_category (type) == NEUTRAL
2946 || bidi_isolate_fmt_char (type)))
2947 adjacent_to_neutrals = true;
2948 } while (!(type == NEUTRAL_B
2949 || (type != WEAK_BN
2950 && bidi_get_category (type) != NEUTRAL
2951 && !bidi_isolate_fmt_char (type))
2952 /* This is all per level run, so stop when we
2953 reach the end of this level run. */
2954 || (bidi_it->level_stack[bidi_it->stack_idx].level
2955 != current_level)));
2957 /* Record the character we stopped at. */
2958 bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
2960 if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
2961 || type == NEUTRAL_B)
2963 /* Marched all the way to the end of this level run. We
2964 need to use the eos type, whose information is stored
2965 by bidi_set_sos_type in the prev_for_neutral
2966 member. */
2967 if (adjacent_to_neutrals)
2968 next_type = bidi_it->prev_for_neutral.type;
2969 else
2971 /* This is a BN which does not adjoin neutrals.
2972 Leave its type alone. */
2973 bidi_copy_it (bidi_it, &saved_it);
2974 return bidi_it->type;
2977 else
2979 switch (type)
2981 case STRONG_L:
2982 case STRONG_R:
2983 case STRONG_AL:
2984 /* Actually, STRONG_AL cannot happen here, because
2985 bidi_resolve_weak converts it to STRONG_R, per W3. */
2986 eassert (type != STRONG_AL);
2987 next_type = type;
2988 break;
2989 case WEAK_EN:
2990 case WEAK_AN:
2991 /* N1: "European and Arabic numbers act as if they
2992 were R in terms of their influence on NIs." */
2993 next_type = STRONG_R;
2994 break;
2995 default:
2996 emacs_abort ();
2997 break;
3000 /* Resolve the type of all the NIs found during the above loop. */
3001 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
3002 next_type, current_level);
3003 /* Update next_for_neutral with the resolved type, so we
3004 could use it for all the other NIs up to the place where
3005 we exited the loop. */
3006 saved_it.next_for_neutral.type = next_type;
3007 bidi_check_type (type);
3008 /* Update the character which caused us to enter the above loop. */
3009 saved_it.type = type;
3010 bidi_check_type (next_type);
3011 bidi_copy_it (bidi_it, &saved_it);
3014 return type;
3017 /* Given an iterator state in BIDI_IT, advance one character position
3018 in the buffer/string to the next character (in the logical order),
3019 resolve the bidi type of that next character, and return that
3020 type. */
3021 static bidi_type_t
3022 bidi_type_of_next_char (struct bidi_it *bidi_it)
3024 bidi_type_t type;
3026 /* This should always be called during a forward scan. */
3027 if (bidi_it->scan_dir != 1)
3028 emacs_abort ();
3030 type = bidi_resolve_neutral (bidi_it);
3032 return type;
3035 /* Given an iterator state BIDI_IT, advance one character position in
3036 the buffer/string to the next character (in the current scan
3037 direction), resolve the embedding and implicit levels of that next
3038 character, and return the resulting level. */
3039 static int
3040 bidi_level_of_next_char (struct bidi_it *bidi_it)
3042 bidi_type_t type = UNKNOWN_BT;
3043 int level;
3044 ptrdiff_t next_char_pos = -2;
3046 if (bidi_it->scan_dir == 1)
3048 ptrdiff_t eob
3049 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3050 ? bidi_it->string.schars : ZV);
3052 /* There's no sense in trying to advance if we've already hit
3053 the end of text. */
3054 if (bidi_it->charpos >= eob)
3056 eassert (bidi_it->resolved_level >= 0);
3057 return bidi_it->resolved_level;
3061 /* Perhaps the character we want is already cached s fully resolved.
3062 If it is, the call to bidi_cache_find below will return a type
3063 other than UNKNOWN_BT. */
3064 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
3066 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3067 ? 0 : 1);
3069 if (bidi_it->scan_dir > 0)
3071 if (bidi_it->nchars <= 0)
3072 emacs_abort ();
3073 next_char_pos = bidi_it->charpos + bidi_it->nchars;
3075 else if (bidi_it->charpos >= bob)
3076 /* Implementation note: we allow next_char_pos to be as low as
3077 0 for buffers or -1 for strings, and that is okay because
3078 that's the "position" of the sentinel iterator state we
3079 cached at the beginning of the iteration. */
3080 next_char_pos = bidi_it->charpos - 1;
3081 if (next_char_pos >= bob - 1)
3082 type = bidi_cache_find (next_char_pos, 1, bidi_it);
3083 if (type != UNKNOWN_BT)
3085 /* We asked the cache for fully resolved states. */
3086 eassert (bidi_it->resolved_level >= 0);
3087 return bidi_it->resolved_level;
3091 if (bidi_it->scan_dir == -1)
3092 /* If we are going backwards, the iterator state is already cached
3093 from previous scans, and should be fully resolved. */
3094 emacs_abort ();
3096 if (type == UNKNOWN_BT)
3097 type = bidi_type_of_next_char (bidi_it);
3099 if (type == NEUTRAL_B)
3101 eassert (bidi_it->resolved_level >= 0);
3102 return bidi_it->resolved_level;
3105 level = bidi_it->level_stack[bidi_it->stack_idx].level;
3107 eassert ((type == STRONG_R
3108 || type == STRONG_L
3109 || type == WEAK_BN
3110 || type == WEAK_EN
3111 || type == WEAK_AN));
3112 bidi_it->type = type;
3113 bidi_check_type (bidi_it->type);
3115 /* For L1 below, we need to know, for each WS character, whether
3116 it belongs to a sequence of WS characters preceding a newline
3117 or a TAB or a paragraph separator. */
3118 if ((bidi_it->orig_type == NEUTRAL_WS
3119 || bidi_isolate_fmt_char (bidi_it->orig_type))
3120 && bidi_it->next_for_ws.charpos < bidi_it->charpos)
3122 int ch;
3123 ptrdiff_t clen = bidi_it->ch_len;
3124 ptrdiff_t bpos = bidi_it->bytepos;
3125 ptrdiff_t cpos = bidi_it->charpos;
3126 ptrdiff_t disp_pos = bidi_it->disp_pos;
3127 ptrdiff_t nc = bidi_it->nchars;
3128 struct bidi_string_data bs = bidi_it->string;
3129 bidi_type_t chtype;
3130 bool fwp = bidi_it->frame_window_p;
3131 int dpp = bidi_it->disp_prop;
3133 if (bidi_it->nchars <= 0)
3134 emacs_abort ();
3135 do {
3136 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3137 bidi_it->w, fwp, &clen, &nc);
3138 chtype = bidi_get_type (ch, NEUTRAL_DIR);
3139 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3140 || bidi_isolate_fmt_char (chtype)
3141 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3142 bidi_it->next_for_ws.type = chtype;
3143 bidi_check_type (bidi_it->next_for_ws.type);
3144 bidi_it->next_for_ws.charpos = cpos;
3147 /* Update the cache, but only if this state was already cached. */
3148 bidi_cache_iterator_state (bidi_it, 1, 1);
3150 /* Resolve implicit levels. */
3151 if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3152 || bidi_it->orig_type == NEUTRAL_S
3153 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3154 || (bidi_it->orig_type == NEUTRAL_WS
3155 && (bidi_it->next_for_ws.type == NEUTRAL_B
3156 || bidi_it->next_for_ws.type == NEUTRAL_S)))
3157 level = bidi_it->level_stack[0].level;
3158 else if ((level & 1) == 0) /* I1 */
3160 if (type == STRONG_R)
3161 level++;
3162 else if (type == WEAK_EN || type == WEAK_AN)
3163 level += 2;
3165 else /* I2 */
3167 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3168 level++;
3171 bidi_it->resolved_level = level;
3172 return level;
3175 /* Move to the other edge of a level given by LEVEL. If END_FLAG,
3176 we are at the end of a level, and we need to prepare to
3177 resume the scan of the lower level.
3179 If this level's other edge is cached, we simply jump to it, filling
3180 the iterator structure with the iterator state on the other edge.
3181 Otherwise, we walk the buffer or string until we come back to the
3182 same level as LEVEL.
3184 Note: we are not talking here about a ``level run'' in the UAX#9
3185 sense of the term, but rather about a ``level'' which includes
3186 all the levels higher than it. In other words, given the levels
3187 like this:
3189 11111112222222333333334443343222222111111112223322111
3190 A B C
3192 and assuming we are at point A scanning left to right, this
3193 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3194 at point B. */
3195 static void
3196 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3198 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3199 ptrdiff_t idx;
3201 /* Try the cache first. */
3202 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3203 >= bidi_cache_start)
3204 bidi_cache_fetch_state (idx, bidi_it);
3205 else
3207 int new_level;
3209 /* If we are at end of level, its edges must be cached. */
3210 if (end_flag)
3211 emacs_abort ();
3213 bidi_cache_iterator_state (bidi_it, 1, 0);
3214 do {
3215 new_level = bidi_level_of_next_char (bidi_it);
3216 bidi_cache_iterator_state (bidi_it, 1, 0);
3217 } while (new_level >= level);
3221 void
3222 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3224 int old_level, new_level, next_level;
3225 struct bidi_it sentinel;
3226 struct gcpro gcpro1;
3228 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3229 emacs_abort ();
3231 if (bidi_it->scan_dir == 0)
3233 bidi_it->scan_dir = 1; /* default to logical order */
3236 /* The code below can call eval, and thus cause GC. If we are
3237 iterating a Lisp string, make sure it won't be GCed. */
3238 if (STRINGP (bidi_it->string.lstring))
3239 GCPRO1 (bidi_it->string.lstring);
3241 /* If we just passed a newline, initialize for the next line. */
3242 if (!bidi_it->first_elt
3243 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3244 bidi_line_init (bidi_it);
3246 /* Prepare the sentinel iterator state, and cache it. When we bump
3247 into it, scanning backwards, we'll know that the last non-base
3248 level is exhausted. */
3249 if (bidi_cache_idx == bidi_cache_start)
3251 bidi_copy_it (&sentinel, bidi_it);
3252 if (bidi_it->first_elt)
3254 sentinel.charpos--; /* cached charpos needs to be monotonic */
3255 sentinel.bytepos--;
3256 sentinel.ch = '\n'; /* doesn't matter, but why not? */
3257 sentinel.ch_len = 1;
3258 sentinel.nchars = 1;
3260 bidi_cache_iterator_state (&sentinel, 1, 0);
3263 old_level = bidi_it->resolved_level;
3264 new_level = bidi_level_of_next_char (bidi_it);
3266 /* Reordering of resolved levels (clause L2) is implemented by
3267 jumping to the other edge of the level and flipping direction of
3268 scanning the text whenever we find a level change. */
3269 if (new_level != old_level)
3271 bool ascending = new_level > old_level;
3272 int level_to_search = ascending ? old_level + 1 : old_level;
3273 int incr = ascending ? 1 : -1;
3274 int expected_next_level = old_level + incr;
3276 /* Jump (or walk) to the other edge of this level. */
3277 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3278 /* Switch scan direction and peek at the next character in the
3279 new direction. */
3280 bidi_it->scan_dir = -bidi_it->scan_dir;
3282 /* The following loop handles the case where the resolved level
3283 jumps by more than one. This is typical for numbers inside a
3284 run of text with left-to-right embedding direction, but can
3285 also happen in other situations. In those cases the decision
3286 where to continue after a level change, and in what direction,
3287 is tricky. For example, given a text like below:
3289 abcdefgh
3290 11336622
3292 (where the numbers below the text show the resolved levels),
3293 the result of reordering according to UAX#9 should be this:
3295 efdcghba
3297 This is implemented by the loop below which flips direction
3298 and jumps to the other edge of the level each time it finds
3299 the new level not to be the expected one. The expected level
3300 is always one more or one less than the previous one. */
3301 next_level = bidi_peek_at_next_level (bidi_it);
3302 while (next_level != expected_next_level)
3304 /* If next_level is -1, it means we have an unresolved level
3305 in the cache, which at this point should not happen. If
3306 it does, we will infloop. */
3307 eassert (next_level >= 0);
3308 /* If next_level is not consistent with incr, we might
3309 infloop. */
3310 eassert (incr > 0
3311 ? next_level > expected_next_level
3312 : next_level < expected_next_level);
3313 expected_next_level += incr;
3314 level_to_search += incr;
3315 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3316 bidi_it->scan_dir = -bidi_it->scan_dir;
3317 next_level = bidi_peek_at_next_level (bidi_it);
3320 /* Finally, deliver the next character in the new direction. */
3321 next_level = bidi_level_of_next_char (bidi_it);
3324 /* Take note when we have just processed the newline that precedes
3325 the end of the paragraph. The next time we are about to be
3326 called, set_iterator_to_next will automatically reinit the
3327 paragraph direction, if needed. We do this at the newline before
3328 the paragraph separator, because the next character might not be
3329 the first character of the next paragraph, due to the bidi
3330 reordering, whereas we _must_ know the paragraph base direction
3331 _before_ we process the paragraph's text, since the base
3332 direction affects the reordering. */
3333 if (bidi_it->scan_dir == 1
3334 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3336 /* The paragraph direction of the entire string, once
3337 determined, is in effect for the entire string. Setting the
3338 separator limit to the end of the string prevents
3339 bidi_paragraph_init from being called automatically on this
3340 string. */
3341 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3342 bidi_it->separator_limit = bidi_it->string.schars;
3343 else if (bidi_it->bytepos < ZV_BYTE)
3345 ptrdiff_t sep_len
3346 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3347 bidi_it->bytepos + bidi_it->ch_len);
3348 if (bidi_it->nchars <= 0)
3349 emacs_abort ();
3350 if (sep_len >= 0)
3352 bidi_it->new_paragraph = 1;
3353 /* Record the buffer position of the last character of the
3354 paragraph separator. */
3355 bidi_it->separator_limit
3356 = bidi_it->charpos + bidi_it->nchars + sep_len;
3361 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3363 /* If we are at paragraph's base embedding level and beyond the
3364 last cached position, the cache's job is done and we can
3365 discard it. */
3366 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3367 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3368 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3369 bidi_cache_reset ();
3370 /* But as long as we are caching during forward scan, we must
3371 cache each state, or else the cache integrity will be
3372 compromised: it assumes cached states correspond to buffer
3373 positions 1:1. */
3374 else
3375 bidi_cache_iterator_state (bidi_it, 1, 0);
3378 eassert (bidi_it->resolved_level >= 0
3379 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3381 if (STRINGP (bidi_it->string.lstring))
3382 UNGCPRO;
3385 /* Utility function for looking for strong directional characters
3386 whose bidi type was overridden by a directional override. */
3387 ptrdiff_t
3388 bidi_find_first_overridden (struct bidi_it *bidi_it)
3390 ptrdiff_t found_pos = ZV;
3394 /* Need to call bidi_resolve_weak, not bidi_resolve_explicit,
3395 because the directional overrides are applied by the
3396 former. */
3397 bidi_type_t type = bidi_resolve_weak (bidi_it);
3399 if ((type == STRONG_R && bidi_it->orig_type == STRONG_L)
3400 || (type == STRONG_L
3401 && (bidi_it->orig_type == STRONG_R
3402 || bidi_it->orig_type == STRONG_AL)))
3403 found_pos = bidi_it->charpos;
3404 } while (found_pos == ZV
3405 && bidi_it->charpos < ZV
3406 && bidi_it->ch != BIDI_EOB
3407 && bidi_it->ch != '\n');
3409 return found_pos;
3412 /* This is meant to be called from within the debugger, whenever you
3413 wish to examine the cache contents. */
3414 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3415 void
3416 bidi_dump_cached_states (void)
3418 ptrdiff_t i;
3419 int ndigits = 1;
3421 if (bidi_cache_idx == 0)
3423 fprintf (stderr, "The cache is empty.\n");
3424 return;
3426 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
3427 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3429 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3430 ndigits++;
3431 fputs ("ch ", stderr);
3432 for (i = 0; i < bidi_cache_idx; i++)
3433 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3434 fputs ("\n", stderr);
3435 fputs ("lvl ", stderr);
3436 for (i = 0; i < bidi_cache_idx; i++)
3437 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3438 fputs ("\n", stderr);
3439 fputs ("pos ", stderr);
3440 for (i = 0; i < bidi_cache_idx; i++)
3441 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3442 fputs ("\n", stderr);