Merge branch 'master' into comment-cache
[emacs.git] / src / bidi.c
blobb75ad933626793ccbe27bfa235aeb76ae7f2c5e7
1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2017 Free Software
3 Foundation, Inc.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or (at
10 your option) any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
25 Unlike the Reference Implementation and most other implementations,
26 this one is designed to be called once for every character in the
27 buffer or string. That way, we can leave intact the design of the
28 Emacs display engine, whereby an iterator object is used to
29 traverse buffer or string text character by character, and generate
30 the necessary data for displaying each character in 'struct glyph'
31 objects. (See xdisp.c for the details of that iteration.) The
32 functions on this file replace the original linear iteration in the
33 logical order of the text with a non-linear iteration in the visual
34 order, i.e. in the order characters should be shown on display.
36 The main entry point is bidi_move_to_visually_next. Each time it
37 is called, it finds the next character in the visual order, and
38 returns its information in a special structure. The caller is then
39 expected to process this character for display or any other
40 purposes, and call bidi_move_to_visually_next for the next
41 character. See the comments in bidi_move_to_visually_next for more
42 details about its algorithm that finds the next visual-order
43 character by resolving their levels on the fly.
45 Two other entry points are bidi_paragraph_init and
46 bidi_mirror_char. The first determines the base direction of a
47 paragraph, while the second returns the mirrored version of its
48 argument character.
50 A few auxiliary entry points are used to initialize the bidi
51 iterator for iterating an object (buffer or string), push and pop
52 the bidi iterator state, and save and restore the state of the bidi
53 cache.
55 If you want to understand the code, you will have to read it
56 together with the relevant portions of UAX#9. The comments include
57 references to UAX#9 rules, for that very reason.
59 A note about references to UAX#9 rules: if the reference says
60 something like "X9/Retaining", it means that you need to refer to
61 rule X9 and to its modifications described in the "Implementation
62 Notes" section of UAX#9, under "Retaining Format Codes".
64 Here's the overview of the design of the reordering engine
65 implemented by this file.
67 Basic implementation structure
68 ------------------------------
70 The sequential processing steps described by UAX#9 are implemented
71 as recursive levels of processing, all of which examine the next
72 character in the logical order. This hierarchy of processing looks
73 as follows, from the innermost (deepest) to the outermost level,
74 omitting some subroutines used by each level:
76 bidi_fetch_char -- fetch next character
77 bidi_resolve_explicit -- resolve explicit levels and directions
78 bidi_resolve_weak -- resolve weak types
79 bidi_resolve_brackets -- resolve "paired brackets" neutral types
80 bidi_resolve_neutral -- resolve neutral types
81 bidi_level_of_next_char -- resolve implicit levels
83 Each level calls the level below it, and works on the result
84 returned by the lower level, including all of its sub-levels.
86 Unlike all the levels below it, bidi_level_of_next_char can return
87 the information about either the next or previous character in the
88 logical order, depending on the current direction of scanning the
89 buffer or string. For the next character, it calls all the levels
90 below it; for the previous character, it uses the cache, described
91 below.
93 Thus, the result of calling bidi_level_of_next_char is the resolved
94 level of the next or the previous character in the logical order.
95 Based on this information, the function bidi_move_to_visually_next
96 finds the next character in the visual order and updates the
97 direction in which the buffer is scanned, either forward or
98 backward, to find the next character to be displayed. (Text is
99 scanned backwards when it needs to be reversed for display, i.e. if
100 the visual order is the inverse of the logical order.) This
101 implements the last, reordering steps of the UBA, by successively
102 calling bidi_level_of_next_char until the character of the required
103 embedding level is found; the scan direction is dynamically updated
104 as a side effect. See the commentary before the 'while' loop in
105 bidi_move_to_visually_next, for the details.
107 Fetching characters
108 -------------------
110 In a nutshell, fetching the next character boils down to calling
111 STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
112 string position. See bidi_fetch_char. However, if the next
113 character is "covered" by a display property of some kind,
114 bidi_fetch_char returns the u+FFFC "object replacement character"
115 that represents the entire run of text covered by the display
116 property. (The ch_len and nchars members of 'struct bidi_it'
117 reflect the length in bytes and characters of that text.) This is
118 so we reorder text on both sides of the display property as
119 appropriate for an image or embedded string. Similarly, text
120 covered by a display spec of the form '(space ...)', is replaced
121 with the u+2029 paragraph separator character, so such display
122 specs produce the same effect as a TAB under UBA. Both these
123 special characters are not actually displayed -- the display
124 property is displayed instead -- but just used to compute the
125 embedding level of the surrounding text so as to produce the
126 required effect.
128 Bidi iterator states
129 --------------------
131 The UBA is highly context dependent in some of its parts,
132 i.e. results of processing a character can generally depend on
133 characters very far away. The UAX#9 description of the UBA
134 prescribes a stateful processing of each character, whereby the
135 results of this processing depend on various state variables, such
136 as the current embedding level, level stack, and directional
137 override status. In addition, the UAX#9 description includes many
138 passages like this (from rule W2 in this case):
140 Search backward from each instance of a European number until the
141 first strong type (R, L, AL, or sos) is found. If an AL is found,
142 change the type of the European number to Arabic number.
144 To support this, we use a bidi iterator object, 'struct bidi_it',
145 which is a sub-structure of 'struct it' used by xdisp.c (see
146 dispextern.h for the definition of both of these structures). The
147 bidi iterator holds the entire state of the iteration required by
148 the UBA, and is updated as the text is traversed. In particular,
149 the embedding level of the current character being resolved is
150 recorded in the iterator state. To avoid costly searches backward
151 in support of rules like W2 above, the necessary character types
152 are also recorded in the iterator state as they are found during
153 the forward scan, and then used when such rules need to be applied.
154 (Forward scans cannot be avoided in this way; they need to be
155 performed at least once, and the results recorded in the iterator
156 state, to be reused until the forward scan oversteps the recorded
157 position.)
159 In this manner, the iterator state acts as a mini-cache of
160 contextual information required for resolving the level of the
161 current character by various UBA rules.
163 Caching of bidi iterator states
164 -------------------------------
166 As described above, the reordering engine uses the information
167 recorded in the bidi iterator state in order to resolve the
168 embedding level of the current character. When the reordering
169 engine needs to process the next character in the logical order, it
170 fetches it and applies to it all the UBA levels, updating the
171 iterator state as it goes. But when the buffer or string is
172 scanned backwards, i.e. in the reverse order of buffer/string
173 positions, the scanned characters were already processed during the
174 preceding forward scan (see bidi_find_other_level_edge). To avoid
175 costly re-processing of characters that were already processed
176 during the forward scan, the iterator states computed while
177 scanning forward are cached.
179 The cache is just a linear array of 'struct bidi_it' objects, which
180 is dynamically allocated and reallocated as needed, since the size
181 of the cache depends on the text being processed. We only need the
182 cache while processing embedded levels higher than the base
183 paragraph embedding level, because these higher levels require
184 changes in scan direction. Therefore, as soon as we are back to
185 the base embedding level, we can free the cache; see the calls to
186 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
187 this.
189 The cache maintains the index of the next unused cache slot -- this
190 is where the next iterator state will be cached. The function
191 bidi_cache_iterator_state saves an instance of the state in the
192 cache and increments the unused slot index. The companion function
193 bidi_cache_find looks up a cached state that corresponds to a given
194 buffer/string position. All of the cached states must correspond
195 1:1 to the buffer or string region whose processing they reflect;
196 bidi.c will abort if it finds cache slots that violate this 1:1
197 correspondence.
199 When the parent iterator 'struct it' is pushed (see push_it in
200 xdisp.c) to pause the current iteration and start iterating over a
201 different object (e.g., a 'display' string that covers some buffer
202 text), the bidi iterator cache needs to be "pushed" as well, so
203 that a new empty cache could be used while iterating over the new
204 object. Later, when the new object is exhausted, and xdisp.c calls
205 pop_it, we need to "pop" the bidi cache as well and return to the
206 original cache. See bidi_push_it and bidi_pop_it for how this is
207 done.
209 Some functions of the display engine save copies of 'struct it' in
210 local variables, and restore them later. For examples, see
211 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
212 window_scroll_pixel_based in window.c. When this happens, we need
213 to save and restore the bidi cache as well, because conceptually
214 the cache is part of the 'struct it' state, and needs to be in
215 perfect sync with the portion of the buffer/string that is being
216 processed. This saving and restoring of the cache state is handled
217 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
218 SAVE_IT and RESTORE_IT defined on xdisp.c.
220 Note that, because reordering is implemented below the level in
221 xdisp.c that breaks glyphs into screen lines, we are violating
222 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
223 done before reordering each screen line separately. However,
224 following UAX#9 to the letter in this matter goes against the basic
225 design of the Emacs display engine, and so we choose here this
226 minor deviation from the UBA letter in preference to redesign of
227 the display engine. The effect of this is only seen in continued
228 lines that are broken into screen lines in the middle of a run
229 whose direction is opposite to the paragraph's base direction.
231 Important design and implementation note: when the code needs to
232 scan far ahead, be sure to avoid such scans as much as possible
233 when the buffer/string doesn't contain any RTL characters. Users
234 of left-to-right scripts will never forgive you if you introduce
235 some slow-down due to bidi in situations that don't involve any
236 bidirectional text. See the large comment near the beginning of
237 bidi_resolve_neutral, for one situation where such shortcut was
238 necessary. */
240 #include <config.h>
241 #include <stdio.h>
243 #include "lisp.h"
244 #include "character.h"
245 #include "buffer.h"
246 #include "dispextern.h"
247 #include "region-cache.h"
249 static bool bidi_initialized = 0;
251 static Lisp_Object bidi_type_table, bidi_mirror_table, bidi_brackets_table;
253 #define BIDI_EOB (-1)
255 /* Data type for describing the bidirectional character categories. */
256 typedef enum {
257 UNKNOWN_BC,
258 NEUTRAL,
259 WEAK,
260 STRONG,
261 EXPLICIT_FORMATTING
262 } bidi_category_t;
264 static Lisp_Object paragraph_start_re, paragraph_separate_re;
267 /***********************************************************************
268 Utilities
269 ***********************************************************************/
271 /* Return the bidi type of a character CH, subject to the current
272 directional OVERRIDE. */
273 static bidi_type_t
274 bidi_get_type (int ch, bidi_dir_t override)
276 bidi_type_t default_type;
278 if (ch == BIDI_EOB)
279 return NEUTRAL_B;
280 if (ch < 0 || ch > MAX_CHAR)
281 emacs_abort ();
283 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
284 /* Every valid character code, even those that are unassigned by the
285 UCD, have some bidi-class property, according to
286 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
287 (= zero) code from CHAR_TABLE_REF, that's a bug. */
288 if (default_type == UNKNOWN_BT)
289 emacs_abort ();
291 switch (default_type)
293 case WEAK_BN:
294 case NEUTRAL_B:
295 case LRE:
296 case LRO:
297 case RLE:
298 case RLO:
299 case PDF:
300 case LRI:
301 case RLI:
302 case FSI:
303 case PDI:
304 return default_type;
305 default:
306 if (override == L2R)
307 return STRONG_L;
308 else if (override == R2L)
309 return STRONG_R;
310 else
311 return default_type;
315 static void
316 bidi_check_type (bidi_type_t type)
318 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
321 /* Given a bidi TYPE of a character, return its category. */
322 static bidi_category_t
323 bidi_get_category (bidi_type_t type)
325 switch (type)
327 case UNKNOWN_BT:
328 return UNKNOWN_BC;
329 case STRONG_L:
330 case STRONG_R:
331 case STRONG_AL:
332 return STRONG;
333 case WEAK_EN:
334 case WEAK_ES:
335 case WEAK_ET:
336 case WEAK_AN:
337 case WEAK_CS:
338 case WEAK_NSM:
339 case WEAK_BN:
340 return WEAK;
341 case NEUTRAL_B:
342 case NEUTRAL_S:
343 case NEUTRAL_WS:
344 case NEUTRAL_ON:
345 return NEUTRAL;
346 case LRE:
347 case LRO:
348 case RLE:
349 case RLO:
350 case PDF:
351 case LRI:
352 case RLI:
353 case FSI:
354 case PDI:
355 return EXPLICIT_FORMATTING;
356 default:
357 emacs_abort ();
361 static bool
362 bidi_isolate_fmt_char (bidi_type_t ch_type)
364 return (ch_type == LRI || ch_type == RLI || ch_type == PDI || ch_type == FSI);
367 /* Return the mirrored character of C, if it has one. If C has no
368 mirrored counterpart, return C.
369 Note: The conditions in UAX#9 clause L4 regarding the surrounding
370 context must be tested by the caller. */
372 bidi_mirror_char (int c)
374 Lisp_Object val;
376 if (c == BIDI_EOB)
377 return c;
378 if (c < 0 || c > MAX_CHAR)
379 emacs_abort ();
381 val = CHAR_TABLE_REF (bidi_mirror_table, c);
382 if (INTEGERP (val))
384 int v;
386 /* When debugging, check before assigning to V, so that the check
387 isn't broken by undefined behavior due to int overflow. */
388 eassert (CHAR_VALID_P (XINT (val)));
390 v = XINT (val);
392 /* Minimal test we must do in optimized builds, to prevent weird
393 crashes further down the road. */
394 if (v < 0 || v > MAX_CHAR)
395 emacs_abort ();
397 return v;
400 return c;
403 /* Return the Bidi_Paired_Bracket_Type property of the character C. */
404 static bidi_bracket_type_t
405 bidi_paired_bracket_type (int c)
407 if (c == BIDI_EOB)
408 return BIDI_BRACKET_NONE;
409 if (c < 0 || c > MAX_CHAR)
410 emacs_abort ();
412 return (bidi_bracket_type_t) XINT (CHAR_TABLE_REF (bidi_brackets_table, c));
415 /* Determine the start-of-sequence (sos) directional type given the two
416 embedding levels on either side of the run boundary. Also, update
417 the saved info about previously seen characters, since that info is
418 generally valid for a single level run. */
419 static void
420 bidi_set_sos_type (struct bidi_it *bidi_it, int level_before, int level_after)
422 int higher_level = (level_before > level_after ? level_before : level_after);
424 /* FIXME: should the default sos direction be user selectable? */
425 bidi_it->sos = ((higher_level & 1) != 0 ? R2L : L2R); /* X10 */
427 bidi_it->prev.type = UNKNOWN_BT;
428 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
429 bidi_it->prev_for_neutral.type = (bidi_it->sos == R2L ? STRONG_R : STRONG_L);
430 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
431 bidi_it->next_for_neutral.type
432 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
435 #define ISOLATE_STATUS(BIDI_IT, IDX) ((BIDI_IT)->level_stack[IDX].flags & 1)
436 #define OVERRIDE(BIDI_IT, IDX) (((BIDI_IT)->level_stack[IDX].flags >> 1) & 3)
438 /* Push the current embedding level and override status; reset the
439 current level to LEVEL and the current override status to OVERRIDE. */
440 static void
441 bidi_push_embedding_level (struct bidi_it *bidi_it,
442 int level, bidi_dir_t override, bool isolate_status)
444 struct bidi_stack *st;
445 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
447 bidi_it->stack_idx++;
448 eassert (bidi_it->stack_idx < BIDI_MAXDEPTH+2+1);
449 st = &bidi_it->level_stack[bidi_it->stack_idx];
450 eassert (level <= (1 << 7));
451 st->level = level;
452 st->flags = (((override & 3) << 1) | (isolate_status != 0));
453 if (isolate_status)
455 st->last_strong_type = bidi_it->last_strong.type;
456 st->prev_for_neutral_type = bidi_it->prev_for_neutral.type;
457 st->next_for_neutral_type = bidi_it->next_for_neutral.type;
458 st->next_for_neutral_pos = bidi_it->next_for_neutral.charpos;
459 st->flags |= ((bidi_it->sos == L2R ? 0 : 1) << 3);
461 /* We've got a new isolating sequence, compute the directional type
462 of sos and initialize per-sequence variables (UAX#9, clause X10). */
463 bidi_set_sos_type (bidi_it, prev_level, level);
466 /* Pop from the stack the embedding level, the directional override
467 status, and optionally saved information for the isolating run
468 sequence. Return the new level. */
469 static int
470 bidi_pop_embedding_level (struct bidi_it *bidi_it)
472 int level;
474 /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
475 and PDIs (X6a, 2nd bullet). */
476 if (bidi_it->stack_idx > 0)
478 bool isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
479 int old_level = bidi_it->level_stack[bidi_it->stack_idx].level;
481 struct bidi_stack st;
483 st = bidi_it->level_stack[bidi_it->stack_idx];
484 if (isolate_status)
486 bidi_dir_t sos = ((st.flags >> 3) & 1);
487 /* PREV is used in W1 for resolving WEAK_NSM. By the time
488 we get to an NSM, we must have gotten past at least one
489 character: the PDI that ends the isolate from which we
490 are popping here. So PREV will have been filled up by
491 the time we first use it. We initialize it here to
492 UNKNOWN_BT to be able to catch any blunders in this
493 logic. */
494 bidi_it->prev.orig_type = bidi_it->prev.type = UNKNOWN_BT;
495 bidi_it->last_strong.type = st.last_strong_type;
496 bidi_it->prev_for_neutral.type = st.prev_for_neutral_type;
497 bidi_it->next_for_neutral.type = st.next_for_neutral_type;
498 bidi_it->next_for_neutral.charpos = st.next_for_neutral_pos;
499 bidi_it->sos = (sos == 0 ? L2R : R2L);
501 else
502 bidi_set_sos_type (bidi_it, old_level,
503 bidi_it->level_stack[bidi_it->stack_idx - 1].level);
505 bidi_it->stack_idx--;
507 level = bidi_it->level_stack[bidi_it->stack_idx].level;
508 eassert (0 <= level && level <= BIDI_MAXDEPTH + 1);
509 return level;
512 /* Record in SAVED_INFO the information about the current character. */
513 static void
514 bidi_remember_char (struct bidi_saved_info *saved_info,
515 struct bidi_it *bidi_it, bool from_type)
517 saved_info->charpos = bidi_it->charpos;
518 if (from_type)
519 saved_info->type = bidi_it->type;
520 else
521 saved_info->type = bidi_it->type_after_wn;
522 bidi_check_type (saved_info->type);
523 saved_info->orig_type = bidi_it->orig_type;
524 bidi_check_type (saved_info->orig_type);
527 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
528 copies the part of the level stack that is actually in use. */
529 static void
530 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
532 /* Copy everything from the start through the active part of
533 the level stack. */
534 memcpy (to, from,
535 (offsetof (struct bidi_it, level_stack) + sizeof from->level_stack[0]
536 + from->stack_idx * sizeof from->level_stack[0]));
540 /***********************************************************************
541 Caching the bidi iterator states
542 ***********************************************************************/
544 /* We allocate and de-allocate the cache in chunks of this size (in
545 characters). 200 was chosen as an upper limit for reasonably-long
546 lines in a text file/buffer. */
547 #define BIDI_CACHE_CHUNK 200
548 /* Maximum size we allow the cache to become, per iterator stack slot,
549 in units of struct bidi_it size. If we allow unlimited growth, we
550 could run out of memory for pathologically long bracketed text or
551 very long text lines that need to be reordered. This is aggravated
552 when word-wrap is in effect, since then functions display_line and
553 move_it_in_display_line_to need to keep up to 4 copies of the
554 cache.
556 This limitation means there can be no more than that amount of
557 contiguous RTL text on any single physical line in a LTR paragraph,
558 and similarly with contiguous LTR + numeric text in a RTL
559 paragraph. (LTR text in a LTR paragraph and RTL text in a RTL
560 paragraph are not reordered, and so don't need the cache, and
561 cannot hit this limit.) More importantly, no single line can have
562 text longer than this inside paired brackets (because bracket pairs
563 resolution uses the cache). If the limit is exceeded, the fallback
564 code will produce visual order that will be incorrect if there are
565 RTL characters in the offending line of text. */
566 /* Do we need to allow customization of this limit? */
567 #define BIDI_CACHE_MAX_ELTS_PER_SLOT 50000
568 #if BIDI_CACHE_CHUNK >= BIDI_CACHE_MAX_ELTS_PER_SLOT
569 # error BIDI_CACHE_CHUNK must be less than BIDI_CACHE_MAX_ELTS_PER_SLOT
570 #endif
571 static ptrdiff_t bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
572 static struct bidi_it *bidi_cache;
573 static ptrdiff_t bidi_cache_size = 0;
574 enum { elsz = sizeof (struct bidi_it) };
575 static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
576 static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
577 static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
578 "stack" level */
580 /* 5-slot stack for saving the start of the previous level of the
581 cache. xdisp.c maintains a 5-slot stack for its iterator state,
582 and we need the same size of our stack. */
583 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
584 static int bidi_cache_sp;
586 /* Size of header used by bidi_shelve_cache. */
587 enum
589 bidi_shelve_header_size
590 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
591 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
592 + sizeof (bidi_cache_last_idx) + sizeof (bidi_cache_max_elts))
595 /* Effectively remove the cached states beyond the Nth state from the
596 part of the cache relevant to iteration of the current object
597 (buffer or string). */
598 static void
599 bidi_cache_reset_to (int n)
601 bidi_cache_idx = bidi_cache_start + n;
602 bidi_cache_last_idx = -1;
605 /* Reset the cache state to the empty state. We only reset the part
606 of the cache relevant to iteration of the current object. Previous
607 objects, which are pushed on the display iterator's stack, are left
608 intact. This is called when the cached information is no more
609 useful for the current iteration, e.g. when we were reseated to a
610 new position on the same object. */
611 static void
612 bidi_cache_reset (void)
614 bidi_cache_reset_to (0);
617 /* Shrink the cache to its minimal size. Called when we init the bidi
618 iterator for reordering a buffer or a string that does not come
619 from display properties, because that means all the previously
620 cached info is of no further use. */
621 static void
622 bidi_cache_shrink (void)
624 if (bidi_cache_size > BIDI_CACHE_CHUNK)
626 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
627 bidi_cache_size = BIDI_CACHE_CHUNK;
629 bidi_cache_reset ();
630 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
633 static void
634 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
636 int current_scan_dir = bidi_it->scan_dir;
638 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
639 emacs_abort ();
641 bidi_copy_it (bidi_it, &bidi_cache[idx]);
642 bidi_it->scan_dir = current_scan_dir;
643 bidi_cache_last_idx = idx;
646 /* Find a cached state with a given CHARPOS and resolved embedding
647 level less or equal to LEVEL. If LEVEL is -1, disregard the
648 resolved levels in cached states. DIR, if non-zero, means search
649 in that direction from the last cache hit.
651 Value is the index of the cached state, or -1 if not found. */
652 static ptrdiff_t
653 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
655 ptrdiff_t i, i_start;
657 if (bidi_cache_idx > bidi_cache_start)
659 if (bidi_cache_last_idx == -1)
660 bidi_cache_last_idx = bidi_cache_idx - 1;
661 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
663 dir = -1;
664 i_start = bidi_cache_last_idx - 1;
666 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
667 + bidi_cache[bidi_cache_last_idx].nchars - 1))
669 dir = 1;
670 i_start = bidi_cache_last_idx + 1;
672 else if (dir)
673 i_start = bidi_cache_last_idx;
674 else
676 dir = -1;
677 i_start = bidi_cache_idx - 1;
680 if (dir < 0)
682 /* Linear search for now; FIXME! */
683 for (i = i_start; i >= bidi_cache_start; i--)
684 if (bidi_cache[i].charpos <= charpos
685 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
686 && (level == -1 || bidi_cache[i].resolved_level <= level))
687 return i;
689 else
691 for (i = i_start; i < bidi_cache_idx; i++)
692 if (bidi_cache[i].charpos <= charpos
693 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
694 && (level == -1 || bidi_cache[i].resolved_level <= level))
695 return i;
699 return -1;
702 /* Find a cached state where the resolved level changes to a value
703 that is lower than LEVEL, and return its cache slot index. DIR is
704 the direction to search, starting with the last used cache slot.
705 If DIR is zero, we search backwards from the last occupied cache
706 slot. BEFORE means return the index of the slot that
707 is ``before'' the level change in the search direction. That is,
708 given the cached levels like this:
710 1122333442211
711 AB C
713 and assuming we are at the position cached at the slot marked with
714 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
715 index of slot B or A, depending whether BEFORE is, respectively,
716 true or false. */
717 static ptrdiff_t
718 bidi_cache_find_level_change (int level, int dir, bool before)
720 if (bidi_cache_idx)
722 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
723 int incr = before ? 1 : 0;
725 if (i < 0) /* cache overflowed? */
726 i = 0;
728 if (!dir)
729 dir = -1;
730 else if (!incr)
731 i += dir;
733 if (dir < 0)
735 while (i >= bidi_cache_start + incr)
737 if (bidi_cache[i - incr].resolved_level >= 0
738 && bidi_cache[i - incr].resolved_level < level)
739 return i;
740 i--;
743 else
745 while (i < bidi_cache_idx - incr)
747 if (bidi_cache[i + incr].resolved_level >= 0
748 && bidi_cache[i + incr].resolved_level < level)
749 return i;
750 i++;
755 return -1;
758 static void
759 bidi_cache_ensure_space (ptrdiff_t idx)
761 /* Enlarge the cache as needed. */
762 if (idx >= bidi_cache_size)
764 ptrdiff_t chunk_size = BIDI_CACHE_CHUNK;
766 if (bidi_cache_size > bidi_cache_max_elts - chunk_size)
767 chunk_size = bidi_cache_max_elts - bidi_cache_size;
769 if (max (idx + 1,
770 bidi_cache_size + chunk_size) <= bidi_cache_max_elts)
772 /* The bidi cache cannot be larger than the largest Lisp
773 string or buffer. */
774 ptrdiff_t string_or_buffer_bound
775 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
777 /* Also, it cannot be larger than what C can represent. */
778 ptrdiff_t c_bound
779 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
780 ptrdiff_t max_elts = bidi_cache_max_elts;
782 max_elts = min (max_elts, min (string_or_buffer_bound, c_bound));
784 /* Force xpalloc not to over-allocate by passing it MAX_ELTS
785 as its 4th argument. */
786 bidi_cache = xpalloc (bidi_cache, &bidi_cache_size,
787 max (chunk_size, idx - bidi_cache_size + 1),
788 max_elts, elsz);
789 eassert (bidi_cache_size > idx);
794 static int
795 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved,
796 bool update_only)
798 ptrdiff_t idx;
800 /* We should never cache on backward scans. */
801 if (bidi_it->scan_dir == -1)
802 emacs_abort ();
803 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
805 if (idx < 0 && update_only)
806 return 0;
808 if (idx < 0)
810 idx = bidi_cache_idx;
811 bidi_cache_ensure_space (idx);
812 /* Character positions should correspond to cache positions 1:1.
813 If we are outside the range of cached positions, the cache is
814 useless and must be reset. */
815 if (bidi_cache_start < idx && idx < bidi_cache_size
816 && (bidi_it->charpos > (bidi_cache[idx - 1].charpos
817 + bidi_cache[idx - 1].nchars)
818 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
820 bidi_cache_reset ();
821 idx = bidi_cache_start;
823 if (bidi_it->nchars <= 0)
824 emacs_abort ();
825 /* Don't cache if no available space in the cache. */
826 if (bidi_cache_size > idx)
828 bidi_copy_it (&bidi_cache[idx], bidi_it);
829 if (!resolved)
830 bidi_cache[idx].resolved_level = -1;
833 else
835 /* Copy only the members which could have changed, to avoid
836 costly copying of the entire struct. */
837 bidi_cache[idx].type = bidi_it->type;
838 bidi_check_type (bidi_it->type);
839 bidi_cache[idx].type_after_wn = bidi_it->type_after_wn;
840 bidi_check_type (bidi_it->type_after_wn);
841 if (resolved)
842 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
843 else
844 bidi_cache[idx].resolved_level = -1;
845 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
846 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
847 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
848 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
849 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
850 bidi_cache[idx].bracket_pairing_pos = bidi_it->bracket_pairing_pos;
851 bidi_cache[idx].bracket_enclosed_type = bidi_it->bracket_enclosed_type;
854 if (bidi_cache_size > idx)
856 bidi_cache_last_idx = idx;
857 if (idx >= bidi_cache_idx)
858 bidi_cache_idx = idx + 1;
859 return 1;
861 else
863 /* The cache overflowed. */
864 bidi_cache_last_idx = -1;
865 return 0;
869 /* Look for a cached iterator state that corresponds to CHARPOS. If
870 found, copy the cached state into BIDI_IT and return the type of
871 the cached entry. If not found, return UNKNOWN_BT. RESOLVED_ONLY
872 zero means it is OK to return cached states that were not fully
873 resolved yet. This can happen if the state was cached before it
874 was resolved in bidi_resolve_neutral. */
875 static bidi_type_t
876 bidi_cache_find (ptrdiff_t charpos, bool resolved_only, struct bidi_it *bidi_it)
878 ptrdiff_t i = bidi_cache_search (charpos, -1, bidi_it->scan_dir);
880 if (i >= bidi_cache_start
881 && (!resolved_only
882 /* Callers that want only fully resolved states (and set
883 resolved_only = true) need to be sure that there's enough
884 info in the cached state to return the state as final,
885 and if not, they don't want the cached state. */
886 || bidi_cache[i].resolved_level >= 0))
888 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
890 bidi_copy_it (bidi_it, &bidi_cache[i]);
891 bidi_cache_last_idx = i;
892 /* Don't let scan direction from the cached state override
893 the current scan direction. */
894 bidi_it->scan_dir = current_scan_dir;
895 return bidi_it->type;
898 return UNKNOWN_BT;
901 static int
902 bidi_peek_at_next_level (struct bidi_it *bidi_it)
904 if (bidi_cache_idx == bidi_cache_start)
905 emacs_abort ();
906 /* If the cache overflowed, return the level of the last cached
907 character. */
908 if (bidi_cache_last_idx == -1
909 || (bidi_cache_last_idx >= bidi_cache_idx - 1 && bidi_it->scan_dir > 0))
910 return bidi_cache[bidi_cache_idx - 1].resolved_level;
911 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
915 /***********************************************************************
916 Pushing and popping the bidi iterator state
917 ***********************************************************************/
919 /* Push the bidi iterator state in preparation for reordering a
920 different object, e.g. display string found at certain buffer
921 position. Pushing the bidi iterator boils down to saving its
922 entire state on the cache and starting a new cache "stacked" on top
923 of the current cache. */
924 void
925 bidi_push_it (struct bidi_it *bidi_it)
927 /* Give this stack slot its cache room. */
928 bidi_cache_max_elts += BIDI_CACHE_MAX_ELTS_PER_SLOT;
929 /* Save the current iterator state in its entirety after the last
930 used cache slot. */
931 bidi_cache_ensure_space (bidi_cache_idx);
932 bidi_cache[bidi_cache_idx++] = *bidi_it;
934 /* Push the current cache start onto the stack. */
935 eassert (bidi_cache_sp < IT_STACK_SIZE);
936 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
938 /* Start a new level of cache, and make it empty. */
939 bidi_cache_start = bidi_cache_idx;
940 bidi_cache_last_idx = -1;
943 /* Restore the iterator state saved by bidi_push_it and return the
944 cache to the corresponding state. */
945 void
946 bidi_pop_it (struct bidi_it *bidi_it)
948 if (bidi_cache_start <= 0)
949 emacs_abort ();
951 /* Reset the next free cache slot index to what it was before the
952 call to bidi_push_it. */
953 bidi_cache_idx = bidi_cache_start - 1;
955 /* Restore the bidi iterator state saved in the cache. */
956 *bidi_it = bidi_cache[bidi_cache_idx];
958 /* Pop the previous cache start from the stack. */
959 if (bidi_cache_sp <= 0)
960 emacs_abort ();
961 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
963 /* Invalidate the last-used cache slot data. */
964 bidi_cache_last_idx = -1;
966 bidi_cache_max_elts -= BIDI_CACHE_MAX_ELTS_PER_SLOT;
967 eassert (bidi_cache_max_elts > 0);
970 static ptrdiff_t bidi_cache_total_alloc;
972 /* Stash away a copy of the cache and its control variables. */
973 void *
974 bidi_shelve_cache (void)
976 unsigned char *databuf;
977 ptrdiff_t alloc;
979 /* Empty cache. */
980 if (bidi_cache_idx == 0)
981 return NULL;
983 alloc = (bidi_shelve_header_size
984 + bidi_cache_idx * sizeof (struct bidi_it));
985 databuf = xmalloc (alloc);
986 bidi_cache_total_alloc += alloc;
988 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
989 memcpy (databuf + sizeof (bidi_cache_idx),
990 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
991 memcpy (databuf + sizeof (bidi_cache_idx)
992 + bidi_cache_idx * sizeof (struct bidi_it),
993 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
994 memcpy (databuf + sizeof (bidi_cache_idx)
995 + bidi_cache_idx * sizeof (struct bidi_it)
996 + sizeof (bidi_cache_start_stack),
997 &bidi_cache_sp, sizeof (bidi_cache_sp));
998 memcpy (databuf + sizeof (bidi_cache_idx)
999 + bidi_cache_idx * sizeof (struct bidi_it)
1000 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
1001 &bidi_cache_start, sizeof (bidi_cache_start));
1002 memcpy (databuf + sizeof (bidi_cache_idx)
1003 + bidi_cache_idx * sizeof (struct bidi_it)
1004 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1005 + sizeof (bidi_cache_start),
1006 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
1007 memcpy (databuf + sizeof (bidi_cache_idx)
1008 + bidi_cache_idx * sizeof (struct bidi_it)
1009 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1010 + sizeof (bidi_cache_start) + sizeof (bidi_cache_last_idx),
1011 &bidi_cache_max_elts, sizeof (bidi_cache_max_elts));
1013 return databuf;
1016 /* Restore the cache state from a copy stashed away by
1017 bidi_shelve_cache, and free the buffer used to stash that copy.
1018 JUST_FREE means free the buffer, but don't restore the
1019 cache; used when the corresponding iterator is discarded instead of
1020 being restored. */
1021 void
1022 bidi_unshelve_cache (void *databuf, bool just_free)
1024 unsigned char *p = databuf;
1026 if (!p)
1028 if (!just_free)
1030 /* A NULL pointer means an empty cache. */
1031 bidi_cache_start = 0;
1032 bidi_cache_sp = 0;
1033 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
1034 bidi_cache_reset ();
1037 else
1039 if (just_free)
1041 ptrdiff_t idx;
1043 memcpy (&idx, p, sizeof (bidi_cache_idx));
1044 bidi_cache_total_alloc
1045 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
1047 else
1049 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
1050 bidi_cache_ensure_space (bidi_cache_idx);
1051 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
1052 bidi_cache_idx * sizeof (struct bidi_it));
1053 memcpy (bidi_cache_start_stack,
1054 p + sizeof (bidi_cache_idx)
1055 + bidi_cache_idx * sizeof (struct bidi_it),
1056 sizeof (bidi_cache_start_stack));
1057 memcpy (&bidi_cache_sp,
1058 p + sizeof (bidi_cache_idx)
1059 + bidi_cache_idx * sizeof (struct bidi_it)
1060 + sizeof (bidi_cache_start_stack),
1061 sizeof (bidi_cache_sp));
1062 memcpy (&bidi_cache_start,
1063 p + sizeof (bidi_cache_idx)
1064 + bidi_cache_idx * sizeof (struct bidi_it)
1065 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
1066 sizeof (bidi_cache_start));
1067 memcpy (&bidi_cache_last_idx,
1068 p + sizeof (bidi_cache_idx)
1069 + bidi_cache_idx * sizeof (struct bidi_it)
1070 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1071 + sizeof (bidi_cache_start),
1072 sizeof (bidi_cache_last_idx));
1073 memcpy (&bidi_cache_max_elts,
1074 p + sizeof (bidi_cache_idx)
1075 + bidi_cache_idx * sizeof (struct bidi_it)
1076 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
1077 + sizeof (bidi_cache_start) + sizeof (bidi_cache_last_idx),
1078 sizeof (bidi_cache_max_elts));
1079 bidi_cache_total_alloc
1080 -= (bidi_shelve_header_size
1081 + bidi_cache_idx * sizeof (struct bidi_it));
1084 xfree (p);
1089 /***********************************************************************
1090 Initialization
1091 ***********************************************************************/
1092 static void
1093 bidi_initialize (void)
1095 bidi_type_table = uniprop_table (intern ("bidi-class"));
1096 if (NILP (bidi_type_table))
1097 emacs_abort ();
1098 staticpro (&bidi_type_table);
1100 bidi_mirror_table = uniprop_table (intern ("mirroring"));
1101 if (NILP (bidi_mirror_table))
1102 emacs_abort ();
1103 staticpro (&bidi_mirror_table);
1105 bidi_brackets_table = uniprop_table (intern ("bracket-type"));
1106 if (NILP (bidi_brackets_table))
1107 emacs_abort ();
1108 staticpro (&bidi_brackets_table);
1110 paragraph_start_re = build_string ("^\\(\f\\|[ \t]*\\)$");
1111 staticpro (&paragraph_start_re);
1112 paragraph_separate_re = build_string ("^[ \t\f]*$");
1113 staticpro (&paragraph_separate_re);
1115 bidi_cache_sp = 0;
1116 bidi_cache_total_alloc = 0;
1117 bidi_cache_max_elts = BIDI_CACHE_MAX_ELTS_PER_SLOT;
1119 bidi_initialized = 1;
1122 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1123 end. */
1124 static void
1125 bidi_set_paragraph_end (struct bidi_it *bidi_it)
1127 bidi_it->invalid_levels = 0;
1128 bidi_it->invalid_isolates = 0;
1129 bidi_it->stack_idx = 0;
1130 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1133 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
1134 void
1135 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
1136 struct bidi_it *bidi_it)
1138 if (! bidi_initialized)
1139 bidi_initialize ();
1140 if (charpos >= 0)
1141 bidi_it->charpos = charpos;
1142 if (bytepos >= 0)
1143 bidi_it->bytepos = bytepos;
1144 bidi_it->frame_window_p = frame_window_p;
1145 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit */
1146 bidi_it->first_elt = 1;
1147 bidi_set_paragraph_end (bidi_it);
1148 bidi_it->new_paragraph = 1;
1149 bidi_it->separator_limit = -1;
1150 bidi_it->type = NEUTRAL_B;
1151 bidi_it->type_after_wn = NEUTRAL_B;
1152 bidi_it->orig_type = NEUTRAL_B;
1153 /* FIXME: Review this!!! */
1154 bidi_it->prev.type = bidi_it->prev.orig_type = UNKNOWN_BT;
1155 bidi_it->last_strong.type = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1156 bidi_it->next_for_neutral.charpos = -1;
1157 bidi_it->next_for_neutral.type
1158 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1159 bidi_it->prev_for_neutral.charpos = -1;
1160 bidi_it->prev_for_neutral.type
1161 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1162 bidi_it->bracket_pairing_pos = -1;
1163 bidi_it->sos = L2R; /* FIXME: should it be user-selectable? */
1164 bidi_it->disp_pos = -1; /* invalid/unknown */
1165 bidi_it->disp_prop = 0;
1166 /* We can only shrink the cache if we are at the bottom level of its
1167 "stack". */
1168 if (bidi_cache_start == 0)
1169 bidi_cache_shrink ();
1170 else
1171 bidi_cache_reset ();
1174 /* Perform initializations for reordering a new line of bidi text. */
1175 static void
1176 bidi_line_init (struct bidi_it *bidi_it)
1178 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1179 bidi_it->stack_idx = 0;
1180 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1181 bidi_it->level_stack[0].flags = 0; /* NEUTRAL_DIR, false per X1 */
1182 bidi_it->invalid_levels = 0;
1183 bidi_it->isolate_level = 0; /* X1 */
1184 bidi_it->invalid_isolates = 0; /* X1 */
1185 /* Setting this to zero will force its recomputation the first time
1186 we need it for W5. */
1187 bidi_it->next_en_pos = 0;
1188 bidi_it->next_en_type = UNKNOWN_BT;
1189 bidi_it->next_for_ws.charpos = -1;
1190 bidi_it->next_for_ws.type = UNKNOWN_BT;
1191 bidi_it->bracket_pairing_pos = -1;
1192 bidi_set_sos_type (bidi_it,
1193 (bidi_it->paragraph_dir == R2L ? 1 : 0),
1194 bidi_it->level_stack[0].level); /* X10 */
1196 bidi_cache_reset ();
1200 /***********************************************************************
1201 Fetching characters
1202 ***********************************************************************/
1204 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1205 are zero-based character positions in S, BEGBYTE is byte position
1206 corresponding to BEG. UNIBYTE means S is a unibyte string. */
1207 static ptrdiff_t
1208 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1209 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1211 ptrdiff_t pos = beg;
1212 const unsigned char *p = s + begbyte, *start = p;
1214 if (unibyte)
1215 p = s + end;
1216 else
1218 if (!CHAR_HEAD_P (*p))
1219 emacs_abort ();
1221 while (pos < end)
1223 p += BYTES_BY_CHAR_HEAD (*p);
1224 pos++;
1228 return p - start;
1231 /* Fetch and return the character at byte position BYTEPOS. If S is
1232 non-NULL, fetch the character from string S; otherwise fetch the
1233 character from the current buffer. UNIBYTE means S is a
1234 unibyte string. */
1235 static int
1236 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1238 if (s)
1240 s += bytepos;
1241 if (unibyte)
1242 return *s;
1244 else
1245 s = BYTE_POS_ADDR (bytepos);
1246 return STRING_CHAR (s);
1249 /* Fetch and return the character at CHARPOS/BYTEPOS. If that
1250 character is covered by a display string, treat the entire run of
1251 covered characters as a single character, either u+2029 or u+FFFC,
1252 and return their combined length in CH_LEN and NCHARS. DISP_POS
1253 specifies the character position of the next display string, or -1
1254 if not yet computed. When the next character is at or beyond that
1255 position, the function updates DISP_POS with the position of the
1256 next display string. *DISP_PROP non-zero means that there's really
1257 a display string at DISP_POS, as opposed to when we searched till
1258 DISP_POS without finding one. If *DISP_PROP is 2, it means the
1259 display spec is of the form `(space ...)', which is replaced with
1260 u+2029 to handle it as a paragraph separator. STRING->s is the C
1261 string to iterate, or NULL if iterating over a buffer or a Lisp
1262 string; in the latter case, STRING->lstring is the Lisp string. */
1263 static int
1264 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1265 int *disp_prop, struct bidi_string_data *string,
1266 struct window *w,
1267 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1269 int ch;
1270 ptrdiff_t endpos
1271 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1272 struct text_pos pos;
1273 int len;
1275 /* If we got past the last known position of display string, compute
1276 the position of the next one. That position could be at CHARPOS. */
1277 if (charpos < endpos && charpos > *disp_pos)
1279 SET_TEXT_POS (pos, charpos, bytepos);
1280 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1281 disp_prop);
1284 /* Fetch the character at BYTEPOS. */
1285 if (charpos >= endpos)
1287 ch = BIDI_EOB;
1288 *ch_len = 1;
1289 *nchars = 1;
1290 *disp_pos = endpos;
1291 *disp_prop = 0;
1293 else if (charpos >= *disp_pos && *disp_prop)
1295 ptrdiff_t disp_end_pos;
1297 /* We don't expect to find ourselves in the middle of a display
1298 property. Hopefully, it will never be needed. */
1299 if (charpos > *disp_pos)
1300 emacs_abort ();
1301 /* Text covered by `display' properties and overlays with
1302 display properties or display strings is handled as a single
1303 character that represents the entire run of characters
1304 covered by the display property. */
1305 if (*disp_prop == 2)
1307 /* `(space ...)' display specs are handled as paragraph
1308 separators for the purposes of the reordering; see UAX#9
1309 section 3 and clause HL1 in section 4.3 there. */
1310 ch = PARAGRAPH_SEPARATOR;
1312 else
1314 /* All other display specs are handled as the Unicode Object
1315 Replacement Character. */
1316 ch = OBJECT_REPLACEMENT_CHARACTER;
1318 disp_end_pos = compute_display_string_end (*disp_pos, string);
1319 if (disp_end_pos < 0)
1321 /* Somebody removed the display string from the buffer
1322 behind our back. Recover by processing this buffer
1323 position as if no display property were present there to
1324 begin with. */
1325 *disp_prop = 0;
1326 goto normal_char;
1328 *nchars = disp_end_pos - *disp_pos;
1329 if (*nchars <= 0)
1330 emacs_abort ();
1331 if (string->s)
1332 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1333 disp_end_pos, string->unibyte);
1334 else if (STRINGP (string->lstring))
1335 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1336 bytepos, disp_end_pos, string->unibyte);
1337 else
1338 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1340 else
1342 normal_char:
1343 if (string->s)
1346 if (!string->unibyte)
1348 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1349 *ch_len = len;
1351 else
1353 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1354 *ch_len = 1;
1357 else if (STRINGP (string->lstring))
1359 if (!string->unibyte)
1361 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1362 len);
1363 *ch_len = len;
1365 else
1367 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1368 *ch_len = 1;
1371 else
1373 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1374 *ch_len = len;
1376 *nchars = 1;
1379 /* If we just entered a run of characters covered by a display
1380 string, compute the position of the next display string. */
1381 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1382 && *disp_prop)
1384 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1385 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1386 disp_prop);
1389 return ch;
1392 /* Like bidi_fetch_char, but ignore any text between an isolate
1393 initiator and its matching PDI or, if it has no matching PDI, the
1394 end of the paragraph. If isolates were skipped, CH_LEN and NCHARS
1395 are set to the number of bytes and characters between BYTEPOS/CHARPOS
1396 and the character that was fetched after skipping the isolates. */
1397 static int
1398 bidi_fetch_char_skip_isolates (ptrdiff_t charpos, ptrdiff_t bytepos,
1399 ptrdiff_t *disp_pos, int *disp_prop,
1400 struct bidi_string_data *string,
1401 struct window *w, bool frame_window_p,
1402 ptrdiff_t *ch_len, ptrdiff_t *nchars)
1404 ptrdiff_t orig_charpos = charpos, orig_bytepos = bytepos;
1405 int ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string, w,
1406 frame_window_p, ch_len, nchars);
1407 bidi_type_t ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1408 ptrdiff_t level = 0;
1410 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1412 level++;
1413 while (level > 0 && ch_type != NEUTRAL_B)
1415 charpos += *nchars;
1416 bytepos += *ch_len;
1417 ch = bidi_fetch_char (charpos, bytepos, disp_pos, disp_prop, string,
1418 w, frame_window_p, ch_len, nchars);
1419 ch_type = bidi_get_type (ch, NEUTRAL_DIR);
1420 /* A Note to P2 says to ignore max_depth limit. */
1421 if (ch_type == LRI || ch_type == RLI || ch_type == FSI)
1422 level++;
1423 else if (ch_type == PDI)
1424 level--;
1428 /* Communicate to the caller how much did we skip, so it could get
1429 past the last character position we examined. */
1430 *nchars += charpos - orig_charpos;
1431 *ch_len += bytepos - orig_bytepos;
1432 return ch;
1437 /***********************************************************************
1438 Determining paragraph direction
1439 ***********************************************************************/
1441 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1442 Value is the non-negative length of the paragraph separator
1443 following the buffer position, -1 if position is at the beginning
1444 of a new paragraph, or -2 if position is neither at beginning nor
1445 at end of a paragraph. */
1446 static ptrdiff_t
1447 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1449 Lisp_Object sep_re;
1450 Lisp_Object start_re;
1451 ptrdiff_t val;
1453 sep_re = paragraph_separate_re;
1454 start_re = paragraph_start_re;
1456 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1457 if (val < 0)
1459 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1460 val = -1;
1461 else
1462 val = -2;
1465 return val;
1468 /* If the user has requested the long scans caching, make sure that
1469 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1471 static struct region_cache *
1472 bidi_paragraph_cache_on_off (void)
1474 struct buffer *cache_buffer = current_buffer;
1475 bool indirect_p = false;
1477 /* For indirect buffers, make sure to use the cache of their base
1478 buffer. */
1479 if (cache_buffer->base_buffer)
1481 cache_buffer = cache_buffer->base_buffer;
1482 indirect_p = true;
1485 /* Don't turn on or off the cache in the base buffer, if the value
1486 of cache-long-scans of the base buffer is inconsistent with that.
1487 This is because doing so will just make the cache pure overhead,
1488 since if we turn it on via indirect buffer, it will be
1489 immediately turned off by its base buffer. */
1490 if (NILP (BVAR (current_buffer, cache_long_scans)))
1492 if (!indirect_p
1493 || NILP (BVAR (cache_buffer, cache_long_scans)))
1495 if (cache_buffer->bidi_paragraph_cache)
1497 free_region_cache (cache_buffer->bidi_paragraph_cache);
1498 cache_buffer->bidi_paragraph_cache = 0;
1501 return NULL;
1503 else
1505 if (!indirect_p
1506 || !NILP (BVAR (cache_buffer, cache_long_scans)))
1508 if (!cache_buffer->bidi_paragraph_cache)
1509 cache_buffer->bidi_paragraph_cache = new_region_cache ();
1511 return cache_buffer->bidi_paragraph_cache;
1515 /* On my 2005-vintage machine, searching back for paragraph start
1516 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1517 when user types C-p. The number below limits each call to
1518 bidi_paragraph_init to about 10 ms. */
1519 #define MAX_PARAGRAPH_SEARCH 7500
1521 /* Find the beginning of this paragraph by looking back in the buffer.
1522 Value is the byte position of the paragraph's beginning, or
1523 BEGV_BYTE if paragraph_start_re is still not found after looking
1524 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
1525 static ptrdiff_t
1526 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1528 Lisp_Object re = paragraph_start_re;
1529 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1530 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1531 ptrdiff_t n = 0, oldpos = pos, next;
1532 struct buffer *cache_buffer = current_buffer;
1534 if (cache_buffer->base_buffer)
1535 cache_buffer = cache_buffer->base_buffer;
1537 while (pos_byte > BEGV_BYTE
1538 && n++ < MAX_PARAGRAPH_SEARCH
1539 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1541 /* FIXME: What if the paragraph beginning is covered by a
1542 display string? And what if a display string covering some
1543 of the text over which we scan back includes
1544 paragraph_start_re? */
1545 DEC_BOTH (pos, pos_byte);
1546 if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1548 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1549 break;
1551 else
1552 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1554 if (n >= MAX_PARAGRAPH_SEARCH)
1555 pos = BEGV, pos_byte = BEGV_BYTE;
1556 if (bpc)
1557 know_region_cache (cache_buffer, bpc, pos, oldpos);
1558 /* Positions returned by the region cache are not limited to
1559 BEGV..ZV range, so we limit them here. */
1560 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1561 return pos_byte;
1564 /* On a 3.4 GHz machine, searching forward for a strong directional
1565 character in a long paragraph full of weaks or neutrals takes about
1566 1 ms for each 20K characters. The number below limits each call to
1567 bidi_paragraph_init to less than 10 ms even on slow machines. */
1568 #define MAX_STRONG_CHAR_SEARCH 100000
1570 /* Starting from POS, find the first strong (L, R, or AL) character,
1571 while skipping over any characters between an isolate initiator and
1572 its matching PDI. STOP_AT_PDI non-zero means stop at the PDI that
1573 matches the isolate initiator at POS. Return the bidi type of the
1574 character where the search stopped. Give up if after examining
1575 MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1576 character was found. */
1577 static bidi_type_t
1578 find_first_strong_char (ptrdiff_t pos, ptrdiff_t bytepos, ptrdiff_t end,
1579 ptrdiff_t *disp_pos, int *disp_prop,
1580 struct bidi_string_data *string, struct window *w,
1581 bool string_p, bool frame_window_p,
1582 ptrdiff_t *ch_len, ptrdiff_t *nchars, bool stop_at_pdi)
1584 ptrdiff_t pos1;
1585 bidi_type_t type;
1586 int ch;
1588 if (stop_at_pdi)
1590 /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1591 at POS. Get past it. */
1592 #ifdef ENABLE_CHECKING
1593 ch = bidi_fetch_char (pos, bytepos, disp_pos, disp_prop, string, w,
1594 frame_window_p, ch_len, nchars);
1595 type = bidi_get_type (ch, NEUTRAL_DIR);
1596 eassert (type == FSI /* || type == LRI || type == RLI */);
1597 #endif
1598 pos += *nchars;
1599 bytepos += *ch_len;
1601 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop, string,
1602 w, frame_window_p, ch_len, nchars);
1603 type = bidi_get_type (ch, NEUTRAL_DIR);
1605 pos1 = pos;
1606 for (pos += *nchars, bytepos += *ch_len;
1607 bidi_get_category (type) != STRONG
1608 /* If requested to stop at first PDI, stop there. */
1609 && !(stop_at_pdi && type == PDI)
1610 /* Stop when searched too far into an abnormally large
1611 paragraph full of weak or neutral characters. */
1612 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1613 type = bidi_get_type (ch, NEUTRAL_DIR))
1615 if (pos >= end)
1617 /* Pretend there's a paragraph separator at end of
1618 buffer/string. */
1619 type = NEUTRAL_B;
1620 break;
1622 if (!string_p
1623 && type == NEUTRAL_B
1624 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1625 break;
1626 /* Fetch next character and advance to get past it. */
1627 ch = bidi_fetch_char_skip_isolates (pos, bytepos, disp_pos, disp_prop,
1628 string, w, frame_window_p,
1629 ch_len, nchars);
1630 pos += *nchars;
1631 bytepos += *ch_len;
1633 return type;
1636 /* Determine the base direction, a.k.a. base embedding level, of the
1637 paragraph we are about to iterate through. If DIR is either L2R or
1638 R2L, just use that. Otherwise, determine the paragraph direction
1639 from the first strong directional character of the paragraph.
1641 NO_DEFAULT_P means don't default to L2R if the paragraph
1642 has no strong directional characters and both DIR and
1643 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1644 in the buffer until a paragraph is found with a strong character,
1645 or until hitting BEGV. In the latter case, fall back to L2R. This
1646 flag is used in current-bidi-paragraph-direction.
1648 Note that this function gives the paragraph separator the same
1649 direction as the preceding paragraph, even though Emacs generally
1650 views the separator as not belonging to any paragraph. */
1651 void
1652 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1654 ptrdiff_t bytepos = bidi_it->bytepos;
1655 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1656 ptrdiff_t pstartbyte;
1657 /* Note that begbyte is a byte position, while end is a character
1658 position. Yes, this is ugly, but we are trying to avoid costly
1659 calls to BYTE_TO_CHAR and its ilk. */
1660 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1661 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1663 /* Special case for an empty buffer. */
1664 if (bytepos == begbyte && bidi_it->charpos == end)
1665 dir = L2R;
1666 /* We should never be called at EOB or before BEGV. */
1667 else if (bidi_it->charpos >= end || bytepos < begbyte)
1668 emacs_abort ();
1670 if (dir == L2R)
1672 bidi_it->paragraph_dir = L2R;
1673 bidi_it->new_paragraph = 0;
1675 else if (dir == R2L)
1677 bidi_it->paragraph_dir = R2L;
1678 bidi_it->new_paragraph = 0;
1680 else if (dir == NEUTRAL_DIR) /* P2 */
1682 ptrdiff_t ch_len, nchars;
1683 ptrdiff_t pos, disp_pos = -1;
1684 int disp_prop = 0;
1685 bidi_type_t type;
1686 const unsigned char *s;
1688 if (!bidi_initialized)
1689 bidi_initialize ();
1691 /* If we are inside a paragraph separator, we are just waiting
1692 for the separator to be exhausted; use the previous paragraph
1693 direction. But don't do that if we have been just reseated,
1694 because we need to reinitialize below in that case. */
1695 if (!bidi_it->first_elt
1696 && bidi_it->charpos < bidi_it->separator_limit)
1697 return;
1699 /* If we are on a newline, get past it to where the next
1700 paragraph might start. But don't do that at BEGV since then
1701 we are potentially in a new paragraph that doesn't yet
1702 exist. */
1703 pos = bidi_it->charpos;
1704 s = (STRINGP (bidi_it->string.lstring)
1705 ? SDATA (bidi_it->string.lstring)
1706 : bidi_it->string.s);
1707 if (bytepos > begbyte
1708 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1710 bytepos++;
1711 pos++;
1714 /* We are either at the beginning of a paragraph or in the
1715 middle of it. Find where this paragraph starts. */
1716 if (string_p)
1718 /* We don't support changes of paragraph direction inside a
1719 string. It is treated as a single paragraph. */
1720 pstartbyte = 0;
1722 else
1723 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1724 bidi_it->separator_limit = -1;
1725 bidi_it->new_paragraph = 0;
1727 /* The following loop is run more than once only if NO_DEFAULT_P,
1728 and only if we are iterating on a buffer. */
1729 do {
1730 bytepos = pstartbyte;
1731 if (!string_p)
1732 pos = BYTE_TO_CHAR (bytepos);
1733 type = find_first_strong_char (pos, bytepos, end, &disp_pos, &disp_prop,
1734 &bidi_it->string, bidi_it->w,
1735 string_p, bidi_it->frame_window_p,
1736 &ch_len, &nchars, false);
1737 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1738 bidi_it->paragraph_dir = R2L;
1739 else if (type == STRONG_L)
1740 bidi_it->paragraph_dir = L2R;
1741 if (!string_p
1742 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1744 /* If this paragraph is at BEGV, default to L2R. */
1745 if (pstartbyte == BEGV_BYTE)
1746 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1747 else
1749 ptrdiff_t prevpbyte = pstartbyte;
1750 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1752 /* Find the beginning of the previous paragraph, if any. */
1753 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1755 /* FXIME: What if p is covered by a display
1756 string? See also a FIXME inside
1757 bidi_find_paragraph_start. */
1758 DEC_BOTH (p, pbyte);
1759 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1761 pstartbyte = prevpbyte;
1764 } while (!string_p
1765 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1767 else
1768 emacs_abort ();
1770 /* Contrary to UAX#9 clause P3, we only default the paragraph
1771 direction to L2R if we have no previous usable paragraph
1772 direction. This is allowed by the HL1 clause. */
1773 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1774 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1775 if (bidi_it->paragraph_dir == R2L)
1776 bidi_it->level_stack[0].level = 1;
1777 else
1778 bidi_it->level_stack[0].level = 0;
1780 bidi_line_init (bidi_it);
1784 /***********************************************************************
1785 Resolving explicit and implicit levels.
1786 The rest of this file constitutes the core of the UBA implementation.
1787 ***********************************************************************/
1789 static bool
1790 bidi_explicit_dir_char (int ch)
1792 bidi_type_t ch_type;
1794 if (!bidi_initialized)
1795 emacs_abort ();
1796 if (ch < 0)
1798 eassert (ch == BIDI_EOB);
1799 return false;
1801 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1802 return (ch_type == LRE || ch_type == LRO
1803 || ch_type == RLE || ch_type == RLO
1804 || ch_type == PDF);
1807 /* Given an iterator state in BIDI_IT, advance one character position
1808 in the buffer/string to the next character (in the logical order),
1809 resolve any explicit embeddings, directional overrides, and isolate
1810 initiators and terminators, and return the embedding level of the
1811 character after resolving these explicit directives. */
1812 static int
1813 bidi_resolve_explicit (struct bidi_it *bidi_it)
1815 int curchar;
1816 bidi_type_t type, typ1, prev_type = UNKNOWN_BT;
1817 int current_level;
1818 int new_level;
1819 bidi_dir_t override;
1820 bool isolate_status;
1821 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1822 ptrdiff_t ch_len, nchars, disp_pos, end;
1823 int disp_prop;
1824 ptrdiff_t eob
1825 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1826 ? bidi_it->string.schars : ZV);
1828 /* Record the info about the previous character. */
1829 if (bidi_it->type_after_wn != WEAK_BN /* W1/Retaining */
1830 && bidi_it->type != WEAK_BN)
1832 /* This special case is needed in support of Unicode 8.0
1833 correction to N0, as implemented in bidi_resolve_weak/W1
1834 below. */
1835 if (bidi_it->type_after_wn == NEUTRAL_ON
1836 && bidi_get_category (bidi_it->type) == STRONG
1837 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
1838 bidi_remember_char (&bidi_it->prev, bidi_it, 1);
1839 else
1840 bidi_remember_char (&bidi_it->prev, bidi_it, 0);
1842 if (bidi_it->type_after_wn == STRONG_R
1843 || bidi_it->type_after_wn == STRONG_L
1844 || bidi_it->type_after_wn == STRONG_AL)
1845 bidi_remember_char (&bidi_it->last_strong, bidi_it, 0);
1846 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1847 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1848 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it, 1);
1850 /* If we overstepped the characters used for resolving neutrals
1851 and whitespace, invalidate their info in the iterator. */
1852 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1854 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1855 /* If needed, reset the "magical" value of pairing bracket
1856 position, so that bidi_resolve_brackets will resume
1857 resolution of brackets according to BPA. */
1858 if (bidi_it->bracket_pairing_pos == eob)
1859 bidi_it->bracket_pairing_pos = -1;
1861 if (bidi_it->next_en_pos >= 0
1862 && bidi_it->charpos >= bidi_it->next_en_pos)
1864 bidi_it->next_en_pos = 0;
1865 bidi_it->next_en_type = UNKNOWN_BT;
1868 /* Reset the bracket resolution info, unless we previously decided
1869 (in bidi_find_bracket_pairs) that brackets in this level run
1870 should be resolved as neutrals. */
1871 if (bidi_it->bracket_pairing_pos != eob)
1873 bidi_it->bracket_pairing_pos = -1;
1874 bidi_it->bracket_enclosed_type = UNKNOWN_BT;
1877 /* If reseat()'ed, don't advance, so as to start iteration from the
1878 position where we were reseated. bidi_it->bytepos can be less
1879 than BEGV_BYTE after reseat to BEGV. */
1880 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1881 || bidi_it->first_elt)
1883 bidi_it->first_elt = 0;
1884 if (string_p)
1886 const unsigned char *p
1887 = (STRINGP (bidi_it->string.lstring)
1888 ? SDATA (bidi_it->string.lstring)
1889 : bidi_it->string.s);
1891 if (bidi_it->charpos < 0)
1892 bidi_it->charpos = bidi_it->bytepos = 0;
1893 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1894 bidi_it->charpos,
1895 bidi_it->string.unibyte));
1897 else
1899 if (bidi_it->charpos < BEGV)
1901 bidi_it->charpos = BEGV;
1902 bidi_it->bytepos = BEGV_BYTE;
1904 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1906 /* Determine the original bidi type of the previous character,
1907 which is needed for handling isolate initiators and PDF. The
1908 type of the previous character will be non-trivial only if
1909 our caller moved through some previous text in
1910 get_visually_first_element, in which case bidi_it->prev holds
1911 the information we want. */
1912 if (bidi_it->first_elt && bidi_it->prev.type != UNKNOWN_BT)
1914 eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
1915 prev_type = bidi_it->prev.orig_type;
1918 /* Don't move at end of buffer/string. */
1919 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1921 /* Advance to the next character, skipping characters covered by
1922 display strings (nchars > 1). */
1923 if (bidi_it->nchars <= 0)
1924 emacs_abort ();
1925 bidi_it->charpos += bidi_it->nchars;
1926 if (bidi_it->ch_len == 0)
1927 emacs_abort ();
1928 bidi_it->bytepos += bidi_it->ch_len;
1929 prev_type = bidi_it->orig_type;
1931 else /* EOB or end of string */
1932 prev_type = NEUTRAL_B;
1934 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1935 isolate_status = ISOLATE_STATUS (bidi_it, bidi_it->stack_idx);
1936 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
1937 new_level = current_level;
1939 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1941 curchar = BIDI_EOB;
1942 bidi_it->ch_len = 1;
1943 bidi_it->nchars = 1;
1944 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1945 bidi_it->disp_prop = 0;
1947 else
1949 /* LRI, RLI, and FSI increment, and PDF decrements, the
1950 embedding level of the _following_ characters, so we must
1951 first look at the type of the previous character to support
1952 that. */
1953 switch (prev_type)
1955 case RLI: /* X5a */
1956 if (current_level < BIDI_MAXDEPTH
1957 && bidi_it->invalid_levels == 0
1958 && bidi_it->invalid_isolates == 0)
1960 new_level = ((current_level + 1) & ~1) + 1;
1961 bidi_it->isolate_level++;
1962 bidi_push_embedding_level (bidi_it, new_level,
1963 NEUTRAL_DIR, true);
1965 else
1966 bidi_it->invalid_isolates++;
1967 break;
1968 case LRI: /* X5b */
1969 if (current_level < BIDI_MAXDEPTH - 1
1970 && bidi_it->invalid_levels == 0
1971 && bidi_it->invalid_isolates == 0)
1973 new_level = ((current_level + 2) & ~1);
1974 bidi_it->isolate_level++;
1975 bidi_push_embedding_level (bidi_it, new_level,
1976 NEUTRAL_DIR, true);
1978 else
1979 bidi_it->invalid_isolates++;
1980 break;
1981 case PDF: /* X7 */
1982 if (!bidi_it->invalid_isolates)
1984 if (bidi_it->invalid_levels)
1985 bidi_it->invalid_levels--;
1986 else if (!isolate_status && bidi_it->stack_idx >= 1)
1987 new_level = bidi_pop_embedding_level (bidi_it);
1989 break;
1990 default:
1991 eassert (prev_type != FSI);
1992 /* Nothing. */
1993 break;
1995 /* Fetch the character at BYTEPOS. If it is covered by a
1996 display string, treat the entire run of covered characters as
1997 a single character u+FFFC. */
1998 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1999 &bidi_it->disp_pos, &bidi_it->disp_prop,
2000 &bidi_it->string, bidi_it->w,
2001 bidi_it->frame_window_p,
2002 &bidi_it->ch_len, &bidi_it->nchars);
2004 bidi_it->ch = curchar;
2005 bidi_it->resolved_level = new_level;
2007 /* Don't apply directional override here, as all the types we handle
2008 below will not be affected by the override anyway, and we need
2009 the original type unaltered. The override will be applied in
2010 bidi_resolve_weak. */
2011 type = bidi_get_type (curchar, NEUTRAL_DIR);
2012 bidi_it->orig_type = type;
2013 bidi_check_type (bidi_it->orig_type);
2015 bidi_it->type_after_wn = UNKNOWN_BT;
2017 switch (type)
2019 case RLE: /* X2 */
2020 case RLO: /* X4 */
2021 bidi_it->type_after_wn = type;
2022 bidi_check_type (bidi_it->type_after_wn);
2023 type = WEAK_BN; /* X9/Retaining */
2024 if (new_level < BIDI_MAXDEPTH
2025 && bidi_it->invalid_levels == 0
2026 && bidi_it->invalid_isolates == 0)
2028 /* Compute the least odd embedding level greater than
2029 the current level. */
2030 new_level = ((new_level + 1) & ~1) + 1;
2031 if (bidi_it->type_after_wn == RLE)
2032 override = NEUTRAL_DIR;
2033 else
2034 override = R2L;
2035 bidi_push_embedding_level (bidi_it, new_level, override, false);
2036 bidi_it->resolved_level = new_level;
2038 else
2040 if (bidi_it->invalid_isolates == 0)
2041 bidi_it->invalid_levels++;
2043 break;
2044 case LRE: /* X3 */
2045 case LRO: /* X5 */
2046 bidi_it->type_after_wn = type;
2047 bidi_check_type (bidi_it->type_after_wn);
2048 type = WEAK_BN; /* X9/Retaining */
2049 if (new_level < BIDI_MAXDEPTH - 1
2050 && bidi_it->invalid_levels == 0
2051 && bidi_it->invalid_isolates == 0)
2053 /* Compute the least even embedding level greater than
2054 the current level. */
2055 new_level = ((new_level + 2) & ~1);
2056 if (bidi_it->type_after_wn == LRE)
2057 override = NEUTRAL_DIR;
2058 else
2059 override = L2R;
2060 bidi_push_embedding_level (bidi_it, new_level, override, false);
2061 bidi_it->resolved_level = new_level;
2063 else
2065 if (bidi_it->invalid_isolates == 0)
2066 bidi_it->invalid_levels++;
2068 break;
2069 case FSI: /* X5c */
2070 end = string_p ? bidi_it->string.schars : ZV;
2071 disp_pos = bidi_it->disp_pos;
2072 disp_prop = bidi_it->disp_prop;
2073 nchars = bidi_it->nchars;
2074 ch_len = bidi_it->ch_len;
2075 typ1 = find_first_strong_char (bidi_it->charpos,
2076 bidi_it->bytepos, end,
2077 &disp_pos, &disp_prop,
2078 &bidi_it->string, bidi_it->w,
2079 string_p, bidi_it->frame_window_p,
2080 &ch_len, &nchars, true);
2081 if (typ1 != STRONG_R && typ1 != STRONG_AL)
2083 type = LRI;
2084 /* Override orig_type, which will be needed when we come to
2085 examine the next character, which is the first character
2086 inside the isolate. */
2087 bidi_it->orig_type = type;
2088 goto fsi_as_lri;
2090 else
2092 type = RLI;
2093 bidi_it->orig_type = type;
2095 /* FALLTHROUGH */
2096 case RLI: /* X5a */
2097 if (override == NEUTRAL_DIR)
2098 bidi_it->type_after_wn = type;
2099 else /* Unicode 8.0 correction. */
2100 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2101 bidi_check_type (bidi_it->type_after_wn);
2102 break;
2103 case LRI: /* X5b */
2104 fsi_as_lri:
2105 if (override == NEUTRAL_DIR)
2106 bidi_it->type_after_wn = type;
2107 else /* Unicode 8.0 correction. */
2108 bidi_it->type_after_wn = (override == L2R ? STRONG_L : STRONG_R);
2109 bidi_check_type (bidi_it->type_after_wn);
2110 break;
2111 case PDI: /* X6a */
2112 if (bidi_it->invalid_isolates)
2113 bidi_it->invalid_isolates--;
2114 else if (bidi_it->isolate_level > 0)
2116 bidi_it->invalid_levels = 0;
2117 while (!ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2118 bidi_pop_embedding_level (bidi_it);
2119 eassert (bidi_it->stack_idx > 0);
2120 new_level = bidi_pop_embedding_level (bidi_it);
2121 bidi_it->isolate_level--;
2123 bidi_it->resolved_level = new_level;
2124 /* Unicode 8.0 correction. */
2126 bidi_dir_t stack_override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2127 if (stack_override == L2R)
2128 bidi_it->type_after_wn = STRONG_L;
2129 else if (stack_override == R2L)
2130 bidi_it->type_after_wn = STRONG_R;
2131 else
2132 bidi_it->type_after_wn = type;
2134 break;
2135 case PDF: /* X7 */
2136 bidi_it->type_after_wn = type;
2137 bidi_check_type (bidi_it->type_after_wn);
2138 type = WEAK_BN; /* X9/Retaining */
2139 break;
2140 default:
2141 /* Nothing. */
2142 break;
2145 bidi_it->type = type;
2146 bidi_check_type (bidi_it->type);
2148 if (bidi_it->type == NEUTRAL_B) /* X8 */
2150 bidi_set_paragraph_end (bidi_it);
2151 /* This is needed by bidi_resolve_weak below, and in L1. */
2152 bidi_it->type_after_wn = bidi_it->type;
2155 eassert (bidi_it->resolved_level >= 0);
2156 return bidi_it->resolved_level;
2159 /* Advance in the buffer/string, resolve weak types and return the
2160 type of the next character after weak type resolution. */
2161 static bidi_type_t
2162 bidi_resolve_weak (struct bidi_it *bidi_it)
2164 bidi_type_t type;
2165 bidi_dir_t override;
2166 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2167 int new_level = bidi_resolve_explicit (bidi_it);
2168 int next_char;
2169 bidi_type_t type_of_next;
2170 struct bidi_it saved_it;
2171 ptrdiff_t eob
2172 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
2173 ? bidi_it->string.schars : ZV);
2175 type = bidi_it->type;
2176 override = OVERRIDE (bidi_it, bidi_it->stack_idx);
2178 eassert (!(type == UNKNOWN_BT
2179 || type == LRE
2180 || type == LRO
2181 || type == RLE
2182 || type == RLO
2183 || type == PDF));
2185 eassert (prev_level >= 0);
2186 if (bidi_it->type == NEUTRAL_B)
2188 /* We've got a new isolating sequence, compute the directional
2189 type of sos and initialize per-run variables (UAX#9, clause
2190 X10). */
2191 bidi_set_sos_type (bidi_it, prev_level, new_level);
2193 if (type == NEUTRAL_S || type == NEUTRAL_WS
2194 || type == WEAK_BN || type == STRONG_AL)
2195 bidi_it->type_after_wn = type; /* needed in L1 */
2196 bidi_check_type (bidi_it->type_after_wn);
2198 /* Level and directional override status are already recorded in
2199 bidi_it, and do not need any change; see X6. */
2200 if (override == R2L) /* X6 */
2201 type = STRONG_R;
2202 else if (override == L2R)
2203 type = STRONG_L;
2204 else
2206 if (type == WEAK_NSM) /* W1 */
2208 /* Note that we don't need to consider the case where the
2209 prev character has its type overridden by an RLO or LRO,
2210 because then either the type of this NSM would have been
2211 also overridden, or the previous character is outside the
2212 current level run, and thus not relevant to this NSM.
2213 This is why NSM gets the type_after_wn of the previous
2214 character. */
2215 /* bidi_set_sos_type sets type_after_wn to UNKNOWN_BT. */
2216 if (bidi_it->prev.type != UNKNOWN_BT
2217 /* If type_after_wn is NEUTRAL_B, this NSM is at sos. */
2218 && bidi_it->prev.type != NEUTRAL_B)
2220 if (bidi_isolate_fmt_char (bidi_it->prev.type))
2222 /* From W1: "Note that in an isolating run sequence,
2223 an isolate initiator followed by an NSM or any
2224 type other than PDI must be an overflow isolate
2225 initiator." */
2226 eassert (bidi_it->invalid_isolates > 0);
2227 type = NEUTRAL_ON;
2229 else
2231 /* This includes the Unicode 8.0 correction for N0,
2232 due to how we set prev.type in bidi_resolve_explicit,
2233 which see. */
2234 type = bidi_it->prev.type;
2237 else if (bidi_it->sos == R2L)
2238 type = STRONG_R;
2239 else if (bidi_it->sos == L2R)
2240 type = STRONG_L;
2241 else /* shouldn't happen! */
2242 emacs_abort ();
2244 if (type == WEAK_EN /* W2 */
2245 && bidi_it->last_strong.type == STRONG_AL)
2246 type = WEAK_AN;
2247 else if (type == STRONG_AL) /* W3 */
2248 type = STRONG_R;
2249 else if ((type == WEAK_ES /* W4 */
2250 && bidi_it->prev.type == WEAK_EN
2251 && bidi_it->prev.orig_type == WEAK_EN)
2252 || (type == WEAK_CS
2253 && ((bidi_it->prev.type == WEAK_EN
2254 && bidi_it->prev.orig_type == WEAK_EN)
2255 || bidi_it->prev.type == WEAK_AN)))
2257 const unsigned char *s
2258 = (STRINGP (bidi_it->string.lstring)
2259 ? SDATA (bidi_it->string.lstring)
2260 : bidi_it->string.s);
2262 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
2263 ? BIDI_EOB
2264 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
2265 s, bidi_it->string.unibyte));
2266 type_of_next = bidi_get_type (next_char, override);
2268 if (type_of_next == WEAK_BN
2269 || bidi_explicit_dir_char (next_char))
2271 bidi_copy_it (&saved_it, bidi_it);
2272 while (bidi_resolve_explicit (bidi_it) == new_level
2273 && bidi_it->type == WEAK_BN)
2274 type_of_next = bidi_it->type;
2275 bidi_copy_it (bidi_it, &saved_it);
2278 /* If the next character is EN, but the last strong-type
2279 character is AL, that next EN will be changed to AN when
2280 we process it in W2 above. So in that case, this ES
2281 should not be changed into EN. */
2282 if (type == WEAK_ES
2283 && type_of_next == WEAK_EN
2284 && bidi_it->last_strong.type != STRONG_AL)
2285 type = WEAK_EN;
2286 else if (type == WEAK_CS)
2288 if (bidi_it->prev.type == WEAK_AN
2289 && (type_of_next == WEAK_AN
2290 /* If the next character is EN, but the last
2291 strong-type character is AL, EN will be later
2292 changed to AN when we process it in W2 above.
2293 So in that case, this ES should not be
2294 changed into EN. */
2295 || (type_of_next == WEAK_EN
2296 && bidi_it->last_strong.type == STRONG_AL)))
2297 type = WEAK_AN;
2298 else if (bidi_it->prev.type == WEAK_EN
2299 && type_of_next == WEAK_EN
2300 && bidi_it->last_strong.type != STRONG_AL)
2301 type = WEAK_EN;
2304 else if (type == WEAK_ET /* W5: ET with EN before or after it */
2305 || type == WEAK_BN) /* W5/Retaining */
2307 if (bidi_it->prev.type == WEAK_EN) /* ET/BN w/EN before it */
2308 type = WEAK_EN;
2309 else if (bidi_it->next_en_pos > bidi_it->charpos
2310 && bidi_it->next_en_type != WEAK_BN)
2312 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2313 type = WEAK_EN;
2315 else if (type == WEAK_BN
2316 /* This condition is for the following important case:
2318 . we are at level zero
2319 . either previous strong character was L,
2320 or we've seen no strong characters since sos
2321 and the base paragraph direction is L2R
2322 . this BN is NOT a bidi directional control
2324 For such a situation, either this BN will be
2325 converted to EN per W5, and then to L by virtue
2326 of W7; or it will become ON per W6, and then L
2327 because of N1/N2. So we take a shortcut here
2328 and make it L right away, to avoid the
2329 potentially costly loop below. This is
2330 important when the buffer has a long series of
2331 control characters, like binary nulls, and no
2332 R2L characters at all. */
2333 && new_level == 0
2334 && !bidi_explicit_dir_char (bidi_it->ch)
2335 && ((bidi_it->last_strong.type == STRONG_L)
2336 || (bidi_it->last_strong.type == UNKNOWN_BT
2337 && bidi_it->sos == L2R)))
2338 type = STRONG_L;
2339 else if (bidi_it->next_en_pos >= 0)
2341 /* We overstepped the last known position for ET
2342 resolution but there could be other such characters
2343 in this paragraph (when we are sure there are no more
2344 such positions, we set next_en_pos to a negative
2345 value). Try to find the next position for ET
2346 resolution. */
2347 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2348 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2349 ? SDATA (bidi_it->string.lstring)
2350 : bidi_it->string.s);
2352 if (bidi_it->nchars <= 0)
2353 emacs_abort ();
2354 next_char
2355 = (bidi_it->charpos + bidi_it->nchars >= eob
2356 ? BIDI_EOB
2357 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2358 bidi_it->string.unibyte));
2359 type_of_next = bidi_get_type (next_char, override);
2361 if (type_of_next == WEAK_ET
2362 || type_of_next == WEAK_BN
2363 || bidi_explicit_dir_char (next_char))
2365 bidi_copy_it (&saved_it, bidi_it);
2366 while (bidi_resolve_explicit (bidi_it) == new_level
2367 && (bidi_it->type == WEAK_BN
2368 || bidi_it->type == WEAK_ET))
2369 type_of_next = bidi_it->type;
2370 if (type == WEAK_BN
2371 && bidi_it->charpos == saved_it.charpos + saved_it.nchars)
2373 /* If we entered the above loop with a BN that
2374 changes the level, the type of next
2375 character, which is in a different level, is
2376 not relevant to resolving this series of ET
2377 and BN. */
2378 en_pos = saved_it.charpos;
2379 type_of_next = type;
2381 else
2382 en_pos = bidi_it->charpos;
2383 bidi_copy_it (bidi_it, &saved_it);
2385 /* Remember this position, to speed up processing of the
2386 next ETs. */
2387 bidi_it->next_en_pos = en_pos;
2388 if (type_of_next == WEAK_EN)
2390 /* If the last strong character is AL, the EN we've
2391 found will become AN when we get to it (W2). */
2392 if (bidi_it->last_strong.type == STRONG_AL)
2393 type_of_next = WEAK_AN;
2394 else if (type == WEAK_BN)
2395 type = NEUTRAL_ON; /* W6/Retaining */
2396 else
2397 type = WEAK_EN;
2399 else if (type_of_next == NEUTRAL_B)
2400 /* Record the fact that there are no more ENs from
2401 here to the end of paragraph, to avoid entering the
2402 loop above ever again in this paragraph. */
2403 bidi_it->next_en_pos = -1;
2404 /* Record the type of the character where we ended our search. */
2405 bidi_it->next_en_type = type_of_next;
2410 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2411 || (type == WEAK_BN
2412 && (bidi_it->prev.type == WEAK_CS /* W6/Retaining */
2413 || bidi_it->prev.type == WEAK_ES
2414 || bidi_it->prev.type == WEAK_ET)))
2415 type = NEUTRAL_ON;
2417 /* Store the type we've got so far, before we clobber it with strong
2418 types in W7 and while resolving neutral types. But leave alone
2419 the original types that were recorded above, because we will need
2420 them for the L1 clause. */
2421 if (bidi_it->type_after_wn == UNKNOWN_BT)
2422 bidi_it->type_after_wn = type;
2423 bidi_check_type (bidi_it->type_after_wn);
2425 if (type == WEAK_EN) /* W7 */
2427 if ((bidi_it->last_strong.type == STRONG_L)
2428 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sos == L2R))
2429 type = STRONG_L;
2432 bidi_it->type = type;
2433 bidi_check_type (bidi_it->type);
2434 return type;
2437 /* Resolve the type of a neutral character according to the type of
2438 surrounding strong text and the current embedding level. */
2439 static bidi_type_t
2440 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2442 /* N1: "European and Arabic numbers act as if they were R in terms
2443 of their influence on NIs." */
2444 if (next_type == WEAK_EN || next_type == WEAK_AN)
2445 next_type = STRONG_R;
2446 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2447 prev_type = STRONG_R;
2449 if (next_type == prev_type) /* N1 */
2450 return next_type;
2451 else if ((lev & 1) == 0) /* N2 */
2452 return STRONG_L;
2453 else
2454 return STRONG_R;
2457 #define FLAG_EMBEDDING_INSIDE 1
2458 #define FLAG_OPPOSITE_INSIDE 2
2460 /* A data type used in the stack maintained by
2461 bidi_find_bracket_pairs below. */
2462 typedef struct bpa_stack_entry {
2463 int close_bracket_char;
2464 int open_bracket_idx;
2465 #ifdef ENABLE_CHECKING
2466 ptrdiff_t open_bracket_pos;
2467 #endif
2468 unsigned flags : 2;
2469 } bpa_stack_entry;
2471 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2472 BPA stack, which should be more than enough for actual bidi text. */
2473 #define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2475 /* UAX#9 says to match opening brackets with the matching closing
2476 brackets or their canonical equivalents. As of Unicode 8.0, there
2477 are only 2 bracket characters that have canonical equivalence
2478 decompositions: u+2329 and u+232A. So instead of accessing the
2479 table in uni-decomposition.el, we just handle these 2 characters
2480 with this simple macro. Note that ASCII characters don't have
2481 canonical equivalents by definition. */
2483 /* To find all the characters that need to be processed by
2484 CANONICAL_EQU, first find all the characters which have
2485 decompositions in UnicodeData.txt, with this Awk script:
2487 awk -F ";" " {if ($6 != \"\") print $1, $6}" UnicodeData.txt
2489 Then produce a list of all the bracket characters in BidiBrackets.txt:
2491 awk -F "[ ;]" " {if ($1 != \"#\" && $1 != \"\") print $1}" BidiBrackets.txt
2493 And finally, cross-reference these two:
2495 grep -Fw -f brackets.txt decompositions.txt
2497 where "decompositions.txt" was produced by the 1st script, and
2498 "brackets.txt" by the 2nd script. In the output of grep, look
2499 only for decompositions that don't begin with some compatibility
2500 formatting tag, such as "<compat>". Only decompositions that
2501 consist solely of character codepoints are relevant to bidi
2502 brackets processing. */
2504 #define CANONICAL_EQU(c) \
2505 ( ASCII_CHAR_P (c) ? c \
2506 : (c) == LEFT_POINTING_ANGLE_BRACKET ? LEFT_ANGLE_BRACKET \
2507 : (c) == RIGHT_POINTING_ANGLE_BRACKET ? RIGHT_ANGLE_BRACKET \
2508 : c )
2510 #ifdef ENABLE_CHECKING
2511 # define STORE_BRACKET_CHARPOS \
2512 bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2513 #else
2514 # define STORE_BRACKET_CHARPOS /* nothing */
2515 #endif
2517 #define PUSH_BPA_STACK \
2518 do { \
2519 int ch; \
2520 if (bpa_sp < MAX_BPA_STACK - 1) \
2522 bpa_sp++; \
2523 ch = CANONICAL_EQU (bidi_it->ch); \
2524 bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (ch); \
2525 bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx; \
2526 bpa_stack[bpa_sp].flags = 0; \
2527 STORE_BRACKET_CHARPOS; \
2529 } while (0)
2532 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2533 described in BD16 and N0 of UAX#9. It finds all the bracket pairs
2534 in the current isolating sequence, and records the enclosed type
2535 and the position of the matching bracket in the cache. It returns
2536 non-zero if called with the iterator on the opening bracket which
2537 has a matching closing bracket in the current isolating sequence,
2538 zero otherwise. */
2539 static bool
2540 bidi_find_bracket_pairs (struct bidi_it *bidi_it)
2542 bidi_bracket_type_t btype;
2543 bidi_type_t type = bidi_it->type;
2544 bool retval = false;
2546 /* When scanning backwards, we don't expect any unresolved bidi
2547 bracket characters. */
2548 if (bidi_it->scan_dir != 1)
2549 emacs_abort ();
2551 btype = bidi_paired_bracket_type (bidi_it->ch);
2552 if (btype == BIDI_BRACKET_OPEN)
2554 bpa_stack_entry bpa_stack[MAX_BPA_STACK];
2555 int bpa_sp = -1;
2556 struct bidi_it saved_it;
2557 int base_level = bidi_it->level_stack[0].level;
2558 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2559 int maxlevel = embedding_level;
2560 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2561 struct bidi_it tem_it;
2562 bool l2r_seen = false, r2l_seen = false;
2563 ptrdiff_t pairing_pos;
2564 int idx_at_entry = bidi_cache_idx;
2566 eassert (MAX_BPA_STACK >= 100);
2567 bidi_copy_it (&saved_it, bidi_it);
2568 /* bidi_cache_iterator_state refuses to cache on backward scans,
2569 and bidi_cache_fetch_state doesn't bring scan_dir from the
2570 cache, so we must initialize this explicitly. */
2571 tem_it.scan_dir = 1;
2573 while (1)
2575 int old_sidx, new_sidx;
2576 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2578 if (maxlevel < current_level)
2579 maxlevel = current_level;
2580 /* Mark every opening bracket character we've traversed by
2581 putting its own position into bracket_pairing_pos. This
2582 is examined in bidi_resolve_brackets to distinguish
2583 brackets that were already resolved to stay NEUTRAL_ON,
2584 and those that were not yet processed by this function
2585 (because they were skipped when we skip higher embedding
2586 levels below). */
2587 if (btype == BIDI_BRACKET_OPEN && bidi_it->bracket_pairing_pos == -1)
2588 bidi_it->bracket_pairing_pos = bidi_it->charpos;
2589 if (!bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0))
2591 /* No more space in cache -- give up and let the opening
2592 bracket that started this be processed as a
2593 NEUTRAL_ON. */
2594 bidi_cache_reset_to (idx_at_entry - bidi_cache_start);
2595 bidi_copy_it (bidi_it, &saved_it);
2596 goto give_up;
2598 if (btype == BIDI_BRACKET_OPEN)
2599 PUSH_BPA_STACK;
2600 else if (btype == BIDI_BRACKET_CLOSE)
2602 int sp = bpa_sp;
2603 int curchar = CANONICAL_EQU (bidi_it->ch);
2605 eassert (sp >= 0);
2606 while (sp >= 0 && bpa_stack[sp].close_bracket_char != curchar)
2607 sp--;
2608 if (sp >= 0)
2610 /* Update and cache the corresponding opening bracket. */
2611 bidi_cache_fetch_state (bpa_stack[sp].open_bracket_idx,
2612 &tem_it);
2613 #ifdef ENABLE_CHECKING
2614 eassert (bpa_stack[sp].open_bracket_pos == tem_it.charpos);
2615 #endif
2616 /* Determine the enclosed type for this bracket
2617 pair's type resolution according to N0. */
2618 if (bpa_stack[sp].flags & FLAG_EMBEDDING_INSIDE)
2619 tem_it.bracket_enclosed_type = embedding_type; /* N0b */
2620 else if (bpa_stack[sp].flags & FLAG_OPPOSITE_INSIDE)
2621 tem_it.bracket_enclosed_type /* N0c */
2622 = (embedding_type == STRONG_L ? STRONG_R : STRONG_L);
2623 else /* N0d */
2624 tem_it.bracket_enclosed_type = UNKNOWN_BT;
2626 /* Record the position of the matching closing
2627 bracket, and update the cache. */
2628 tem_it.bracket_pairing_pos = bidi_it->charpos;
2629 bidi_cache_iterator_state (&tem_it, 0, 1);
2631 /* Pop the BPA stack. */
2632 bpa_sp = sp - 1;
2634 if (bpa_sp < 0)
2636 retval = true;
2637 break;
2640 else if (bidi_get_category (bidi_it->type_after_wn) != NEUTRAL)
2642 unsigned flag = 0;
2643 int sp;
2645 /* Whenever we see a strong type, update the flags of
2646 all the slots on the stack. */
2647 switch (bidi_it->type)
2649 case STRONG_L:
2650 flag = ((embedding_level & 1) == 0
2651 ? FLAG_EMBEDDING_INSIDE
2652 : FLAG_OPPOSITE_INSIDE);
2653 l2r_seen = true;
2654 break;
2655 case STRONG_R:
2656 case WEAK_EN:
2657 case WEAK_AN:
2658 flag = ((embedding_level & 1) == 1
2659 ? FLAG_EMBEDDING_INSIDE
2660 : FLAG_OPPOSITE_INSIDE);
2661 r2l_seen = true;
2662 break;
2663 default:
2664 break;
2666 if (flag)
2668 for (sp = bpa_sp; sp >= 0; sp--)
2669 bpa_stack[sp].flags |= flag;
2672 old_sidx = bidi_it->stack_idx;
2673 type = bidi_resolve_weak (bidi_it);
2674 /* Skip level runs excluded from this isolating run sequence. */
2675 new_sidx = bidi_it->stack_idx;
2676 if (bidi_it->level_stack[new_sidx].level > current_level
2677 && (ISOLATE_STATUS (bidi_it, new_sidx)
2678 || (new_sidx > old_sidx + 1
2679 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
2681 while (bidi_it->level_stack[bidi_it->stack_idx].level
2682 > current_level)
2684 if (maxlevel < bidi_it->level_stack[bidi_it->stack_idx].level)
2685 maxlevel = bidi_it->level_stack[bidi_it->stack_idx].level;
2686 if (!bidi_cache_iterator_state (bidi_it,
2687 type == NEUTRAL_B, 0))
2689 /* No more space in cache -- give up and let the
2690 opening bracket that started this be
2691 processed as any other NEUTRAL_ON. */
2692 bidi_cache_reset_to (idx_at_entry - bidi_cache_start);
2693 bidi_copy_it (bidi_it, &saved_it);
2694 goto give_up;
2696 type = bidi_resolve_weak (bidi_it);
2699 if (type == NEUTRAL_B
2700 || (bidi_it->level_stack[bidi_it->stack_idx].level
2701 != current_level))
2703 /* We've marched all the way to the end of this
2704 isolating run sequence, and didn't find matching
2705 closing brackets for some opening brackets. Leave
2706 their type unchanged. */
2707 pairing_pos = bidi_it->charpos;
2708 break;
2710 if (bidi_it->type_after_wn == NEUTRAL_ON) /* Unicode 8.0 correction */
2711 btype = bidi_paired_bracket_type (bidi_it->ch);
2712 else
2713 btype = BIDI_BRACKET_NONE;
2716 /* Restore bidi_it from the cache, which should have the bracket
2717 resolution members set as determined by the above loop. */
2718 type = bidi_cache_find (saved_it.charpos, 0, bidi_it);
2719 eassert (type == NEUTRAL_ON);
2721 /* The following is an optimization for bracketed text that has
2722 only one level which is equal to the paragraph's base
2723 embedding level. That is, only L2R and weak/neutral
2724 characters in a L2R paragraph, or only R2L and weak/neutral
2725 characters in a R2L paragraph. Such brackets can be resolved
2726 by bidi_resolve_neutral, which has a further shortcut for
2727 this case. So we pretend we did not resolve the brackets in
2728 this case, set up next_for_neutral for the entire bracketed
2729 text, and reset the cache to the character before the opening
2730 bracket. The upshot is to allow bidi_move_to_visually_next
2731 reset the cache when it returns this opening bracket, thus
2732 cutting significantly on the size of the cache, which is
2733 important with long lines, especially if word-wrap is non-nil
2734 (which requires the display engine to copy the cache back and
2735 forth many times). */
2736 if (maxlevel == base_level
2737 && ((base_level == 0 && !r2l_seen)
2738 || (base_level == 1 && !l2r_seen)))
2740 ptrdiff_t eob
2741 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2742 ? bidi_it->string.schars : ZV);
2744 if (retval)
2745 pairing_pos = bidi_it->bracket_pairing_pos;
2747 /* This special value (which cannot possibly happen when
2748 brackets are resolved, since there's no character at ZV)
2749 will be noticed by bidi_resolve_explicit, and will be
2750 copied to the following iterator states, instead of being
2751 reset to -1. */
2752 bidi_it->bracket_pairing_pos = eob;
2753 /* This type value will be used for resolving the outermost
2754 closing bracket in bidi_resolve_brackets. */
2755 bidi_it->bracket_enclosed_type = embedding_type;
2756 /* bidi_cache_last_idx is set to the index of the current
2757 state, because we just called bidi_cache_find above.
2758 That state describes the outermost opening bracket, the
2759 one with which we entered this function. Force the cache
2760 to "forget" all the cached states starting from that state. */
2761 bidi_cache_reset_to (bidi_cache_last_idx - bidi_cache_start);
2762 /* Set up the next_for_neutral member, to help
2763 bidi_resolve_neutral. */
2764 bidi_it->next_for_neutral.type = embedding_type;
2765 bidi_it->next_for_neutral.charpos = pairing_pos;
2766 /* Pretend we didn't resolve this bracket. */
2767 retval = false;
2771 give_up:
2772 return retval;
2775 static void
2776 bidi_record_type_for_neutral (struct bidi_saved_info *info, int level,
2777 bool nextp)
2779 int idx;
2781 for (idx = bidi_cache_last_idx + 1; idx < bidi_cache_idx; idx++)
2783 int lev = bidi_cache[idx].level_stack[bidi_cache[idx].stack_idx].level;
2785 if (lev <= level)
2787 eassert (lev == level);
2788 if (nextp)
2789 bidi_cache[idx].next_for_neutral = *info;
2790 else
2791 bidi_cache[idx].prev_for_neutral = *info;
2792 break;
2797 static bidi_type_t
2798 bidi_resolve_brackets (struct bidi_it *bidi_it)
2800 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2801 bool resolve_bracket = false;
2802 bidi_type_t type = UNKNOWN_BT;
2803 int ch;
2804 struct bidi_saved_info prev_for_neutral, next_for_neutral;
2805 ptrdiff_t eob
2806 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2807 ? bidi_it->string.schars : ZV);
2809 /* Record the prev_for_neutral type either from the previous
2810 character, if it was a strong or AN/EN, or from the
2811 prev_for_neutral information recorded previously. */
2812 if (bidi_it->type == STRONG_L || bidi_it->type == STRONG_R
2813 || bidi_it->type == WEAK_AN || bidi_it->type == WEAK_EN)
2814 bidi_remember_char (&prev_for_neutral, bidi_it, 1);
2815 else
2816 prev_for_neutral = bidi_it->prev_for_neutral;
2817 /* Record the next_for_neutral type information. */
2818 if (bidi_it->next_for_neutral.charpos > bidi_it->charpos)
2819 next_for_neutral = bidi_it->next_for_neutral;
2820 else
2821 next_for_neutral.charpos = -1;
2822 if (!bidi_it->first_elt)
2824 type = bidi_cache_find (bidi_it->charpos + bidi_it->nchars, 0, bidi_it);
2825 ch = bidi_it->ch;
2827 if (type == UNKNOWN_BT)
2829 type = bidi_resolve_weak (bidi_it);
2830 if (type == NEUTRAL_ON)
2832 /* bracket_pairing_pos == eob means this bracket does not
2833 need to be resolved as a bracket, but as a neutral, see
2834 the optimization trick we play near the end of
2835 bidi_find_bracket_pairs. */
2836 if (bidi_it->bracket_pairing_pos == eob)
2838 /* If this is the outermost closing bracket of a run of
2839 characters in which we decided to resolve brackets as
2840 neutrals, use the embedding level's type, recorded in
2841 bracket_enclosed_type, to resolve the bracket. */
2842 if (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2843 && bidi_paired_bracket_type (bidi_it->ch) == BIDI_BRACKET_CLOSE)
2844 type = bidi_it->bracket_enclosed_type;
2846 else if (bidi_find_bracket_pairs (bidi_it))
2847 resolve_bracket = true;
2850 else if (bidi_it->bracket_pairing_pos != eob)
2852 eassert (bidi_it->resolved_level == -1);
2853 /* If the cached state shows an increase of embedding level due
2854 to an isolate initiator, we need to update the 1st cached
2855 state of the next run of the current isolating sequence with
2856 the prev_for_neutral and next_for_neutral information, so
2857 that it will be picked up when we advance to that next run. */
2858 if (bidi_it->level_stack[bidi_it->stack_idx].level > prev_level
2859 && ISOLATE_STATUS (bidi_it, bidi_it->stack_idx))
2861 bidi_record_type_for_neutral (&prev_for_neutral, prev_level, 0);
2862 bidi_record_type_for_neutral (&next_for_neutral, prev_level, 1);
2864 if (type == NEUTRAL_ON
2865 && bidi_paired_bracket_type (ch) == BIDI_BRACKET_OPEN)
2867 if (bidi_it->bracket_pairing_pos > bidi_it->charpos)
2869 /* A cached opening bracket that wasn't completely
2870 resolved yet. */
2871 resolve_bracket = true;
2873 else if (bidi_it->bracket_pairing_pos == -1)
2875 /* Higher levels were not BPA-resolved yet, even if
2876 cached by bidi_find_bracket_pairs. Force application
2877 of BPA to the new level now. */
2878 if (bidi_find_bracket_pairs (bidi_it))
2879 resolve_bracket = true;
2882 /* Keep track of the prev_for_neutral and next_for_neutral
2883 types, needed for resolving brackets below and for resolving
2884 neutrals in bidi_resolve_neutral. */
2885 if (bidi_it->level_stack[bidi_it->stack_idx].level == prev_level)
2887 bidi_it->prev_for_neutral = prev_for_neutral;
2888 if (next_for_neutral.charpos > 0)
2889 bidi_it->next_for_neutral = next_for_neutral;
2893 /* If needed, resolve the bracket type according to N0. */
2894 if (resolve_bracket)
2896 int embedding_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2897 bidi_type_t embedding_type = (embedding_level & 1) ? STRONG_R : STRONG_L;
2899 eassert (bidi_it->prev_for_neutral.type != UNKNOWN_BT);
2900 eassert (bidi_it->bracket_pairing_pos > bidi_it->charpos);
2901 if (bidi_it->bracket_enclosed_type == embedding_type) /* N0b */
2902 type = embedding_type;
2903 else
2905 switch (bidi_it->prev_for_neutral.type)
2907 case STRONG_R:
2908 case WEAK_EN:
2909 case WEAK_AN:
2910 type =
2911 (bidi_it->bracket_enclosed_type == STRONG_R) /* N0c */
2912 ? STRONG_R /* N0c1 */
2913 : embedding_type; /* N0c2 */
2914 break;
2915 case STRONG_L:
2916 type =
2917 (bidi_it->bracket_enclosed_type == STRONG_L) /* N0c */
2918 ? STRONG_L /* N0c1 */
2919 : embedding_type; /* N0c2 */
2920 break;
2921 default:
2922 /* N0d: Do not set the type for that bracket pair. */
2923 break;
2926 eassert (type == STRONG_L || type == STRONG_R || type == NEUTRAL_ON);
2928 /* Update the type of the paired closing bracket to the same
2929 type as for the resolved opening bracket. */
2930 if (type != NEUTRAL_ON)
2932 ptrdiff_t idx = bidi_cache_search (bidi_it->bracket_pairing_pos,
2933 -1, 1);
2935 if (idx < bidi_cache_start)
2936 emacs_abort ();
2937 bidi_cache[idx].type = type;
2941 return type;
2944 static bidi_type_t
2945 bidi_resolve_neutral (struct bidi_it *bidi_it)
2947 bidi_type_t type = bidi_resolve_brackets (bidi_it);
2948 int current_level;
2949 bool is_neutral;
2951 eassert (type == STRONG_R
2952 || type == STRONG_L
2953 || type == WEAK_BN
2954 || type == WEAK_EN
2955 || type == WEAK_AN
2956 || type == NEUTRAL_B
2957 || type == NEUTRAL_S
2958 || type == NEUTRAL_WS
2959 || type == NEUTRAL_ON
2960 || type == LRI
2961 || type == RLI
2962 || type == PDI);
2964 current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2965 eassert (current_level >= 0);
2966 is_neutral = bidi_get_category (type) == NEUTRAL;
2968 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2969 we are already at paragraph end. */
2970 && (is_neutral || bidi_isolate_fmt_char (type)))
2971 /* N1-N2/Retaining */
2972 || type == WEAK_BN)
2974 if (bidi_it->next_for_neutral.type != UNKNOWN_BT
2975 && (bidi_it->next_for_neutral.charpos > bidi_it->charpos
2976 /* PDI defines an eos, so it's OK for it to serve as its
2977 own next_for_neutral. */
2978 || (bidi_it->next_for_neutral.charpos == bidi_it->charpos
2979 && bidi_it->type == PDI)))
2981 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2982 bidi_it->next_for_neutral.type,
2983 current_level);
2985 /* The next two "else if" clauses are shortcuts for the
2986 important special case when we have a long sequence of
2987 neutral or WEAK_BN characters, such as whitespace or nulls or
2988 other control characters, on the base embedding level of the
2989 paragraph, and that sequence goes all the way to the end of
2990 the paragraph and follows a character whose resolved
2991 directionality is identical to the base embedding level.
2992 (This is what happens in a buffer with plain L2R text that
2993 happens to include long sequences of control characters.) By
2994 virtue of N1, the result of examining this long sequence will
2995 always be either STRONG_L or STRONG_R, depending on the base
2996 embedding level. So we use this fact directly instead of
2997 entering the expensive loop in the "else" clause. */
2998 else if (current_level == 0
2999 && bidi_it->prev_for_neutral.type == STRONG_L
3000 && (ASCII_CHAR_P (bidi_it->ch)
3001 || (type != WEAK_BN
3002 && !bidi_explicit_dir_char (bidi_it->ch)
3003 && !bidi_isolate_fmt_char (type))))
3004 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
3005 STRONG_L, current_level);
3006 else if (/* current level is 1 */
3007 current_level == 1
3008 /* base embedding level is also 1 */
3009 && bidi_it->level_stack[0].level == 1
3010 /* previous character is one of those considered R for
3011 the purposes of W5 */
3012 && (bidi_it->prev_for_neutral.type == STRONG_R
3013 || bidi_it->prev_for_neutral.type == WEAK_EN
3014 || bidi_it->prev_for_neutral.type == WEAK_AN)
3015 && type != WEAK_BN
3016 && !bidi_explicit_dir_char (bidi_it->ch)
3017 && !bidi_isolate_fmt_char (type))
3018 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
3019 STRONG_R, current_level);
3020 else
3022 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
3023 the assumption of batch-style processing; see clauses W4,
3024 W5, and especially N1, which require looking far forward
3025 (as well as back) in the buffer/string. May the fleas of
3026 a thousand camels infest the armpits of those who design
3027 supposedly general-purpose algorithms by looking at their
3028 own implementations, and fail to consider other possible
3029 implementations! */
3030 struct bidi_it saved_it;
3031 bidi_type_t next_type;
3032 bool adjacent_to_neutrals = is_neutral;
3034 bidi_copy_it (&saved_it, bidi_it);
3035 /* Scan the text forward until we find the first non-neutral
3036 character, and then use that to resolve the neutral we
3037 are dealing with now. We also cache the scanned iterator
3038 states, to salvage some of the effort later. */
3039 do {
3040 int old_sidx, new_sidx;
3042 /* Paragraph separators have their levels fully resolved
3043 at this point, so cache them as resolved. */
3044 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
3045 old_sidx = bidi_it->stack_idx;
3046 type = bidi_resolve_brackets (bidi_it);
3047 /* Skip level runs excluded from this isolating run sequence. */
3048 new_sidx = bidi_it->stack_idx;
3049 if (bidi_it->level_stack[new_sidx].level > current_level
3050 && (ISOLATE_STATUS (bidi_it, new_sidx)
3051 /* This is for when we have an isolate initiator
3052 immediately followed by an embedding or
3053 override initiator, in which case we get the
3054 level stack pushed twice by the single call to
3055 bidi_resolve_weak above. */
3056 || (new_sidx > old_sidx + 1
3057 && ISOLATE_STATUS (bidi_it, new_sidx - 1))))
3059 while (bidi_it->level_stack[bidi_it->stack_idx].level
3060 > current_level)
3062 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B, 0);
3063 type = bidi_resolve_brackets (bidi_it);
3066 if (!adjacent_to_neutrals
3067 && (bidi_get_category (type) == NEUTRAL
3068 || bidi_isolate_fmt_char (type)))
3069 adjacent_to_neutrals = true;
3070 } while (!(type == NEUTRAL_B
3071 || (type != WEAK_BN
3072 && bidi_get_category (type) != NEUTRAL
3073 && !bidi_isolate_fmt_char (type))
3074 /* This is all per level run, so stop when we
3075 reach the end of this level run. */
3076 || (bidi_it->level_stack[bidi_it->stack_idx].level
3077 != current_level)));
3079 /* Record the character we stopped at. */
3080 bidi_remember_char (&saved_it.next_for_neutral, bidi_it, 1);
3082 if ((bidi_it->level_stack[bidi_it->stack_idx].level != current_level)
3083 || type == NEUTRAL_B)
3085 /* Marched all the way to the end of this level run. We
3086 need to use the eos type, whose information is stored
3087 by bidi_set_sos_type in the prev_for_neutral
3088 member. */
3089 if (adjacent_to_neutrals)
3090 next_type = bidi_it->prev_for_neutral.type;
3091 else
3093 /* This is a BN which does not adjoin neutrals.
3094 Leave its type alone. */
3095 bidi_copy_it (bidi_it, &saved_it);
3096 return bidi_it->type;
3099 else
3101 switch (type)
3103 case STRONG_L:
3104 case STRONG_R:
3105 case STRONG_AL:
3106 /* Actually, STRONG_AL cannot happen here, because
3107 bidi_resolve_weak converts it to STRONG_R, per W3. */
3108 eassert (type != STRONG_AL);
3109 next_type = type;
3110 break;
3111 case WEAK_EN:
3112 case WEAK_AN:
3113 /* N1: "European and Arabic numbers act as if they
3114 were R in terms of their influence on NIs." */
3115 next_type = STRONG_R;
3116 break;
3117 default:
3118 emacs_abort ();
3119 break;
3122 /* Resolve the type of all the NIs found during the above loop. */
3123 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
3124 next_type, current_level);
3125 /* Update next_for_neutral with the resolved type, so we
3126 could use it for all the other NIs up to the place where
3127 we exited the loop. */
3128 saved_it.next_for_neutral.type = next_type;
3129 bidi_check_type (type);
3130 /* Update the character which caused us to enter the above loop. */
3131 saved_it.type = type;
3132 bidi_check_type (next_type);
3133 bidi_copy_it (bidi_it, &saved_it);
3136 return type;
3139 /* Given an iterator state in BIDI_IT, advance one character position
3140 in the buffer/string to the next character (in the logical order),
3141 resolve the bidi type of that next character, and return that
3142 type. */
3143 static bidi_type_t
3144 bidi_type_of_next_char (struct bidi_it *bidi_it)
3146 bidi_type_t type;
3148 /* This should always be called during a forward scan. */
3149 if (bidi_it->scan_dir != 1)
3150 emacs_abort ();
3152 type = bidi_resolve_neutral (bidi_it);
3154 return type;
3157 /* Given an iterator state BIDI_IT, advance one character position in
3158 the buffer/string to the next character (in the current scan
3159 direction), resolve the embedding and implicit levels of that next
3160 character, and return the resulting level. */
3161 static int
3162 bidi_level_of_next_char (struct bidi_it *bidi_it)
3164 bidi_type_t type = UNKNOWN_BT;
3165 int level;
3166 ptrdiff_t next_char_pos = -2;
3168 if (bidi_it->scan_dir == 1)
3170 ptrdiff_t eob
3171 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3172 ? bidi_it->string.schars : ZV);
3174 /* There's no sense in trying to advance if we've already hit
3175 the end of text. */
3176 if (bidi_it->charpos >= eob)
3178 eassert (bidi_it->resolved_level >= 0);
3179 return bidi_it->resolved_level;
3183 /* Perhaps the character we want is already cached as fully resolved.
3184 If it is, the call to bidi_cache_find below will return a type
3185 other than UNKNOWN_BT. */
3186 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
3188 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3189 ? 0 : 1);
3191 if (bidi_it->scan_dir > 0)
3193 if (bidi_it->nchars <= 0)
3194 emacs_abort ();
3195 next_char_pos = bidi_it->charpos + bidi_it->nchars;
3197 else if (bidi_it->charpos >= bob)
3198 /* Implementation note: we allow next_char_pos to be as low as
3199 0 for buffers or -1 for strings, and that is okay because
3200 that's the "position" of the sentinel iterator state we
3201 cached at the beginning of the iteration. */
3202 next_char_pos = bidi_it->charpos - 1;
3203 if (next_char_pos >= bob - 1)
3204 type = bidi_cache_find (next_char_pos, 1, bidi_it);
3205 if (type != UNKNOWN_BT)
3207 /* We asked the cache for fully resolved states. */
3208 eassert (bidi_it->resolved_level >= 0);
3209 return bidi_it->resolved_level;
3213 if (bidi_it->scan_dir == -1)
3214 /* If we are going backwards, the iterator state is already cached
3215 from previous scans, and should be fully resolved. */
3216 emacs_abort ();
3218 if (type == UNKNOWN_BT)
3219 type = bidi_type_of_next_char (bidi_it);
3221 if (type == NEUTRAL_B)
3223 eassert (bidi_it->resolved_level >= 0);
3224 return bidi_it->resolved_level;
3227 level = bidi_it->level_stack[bidi_it->stack_idx].level;
3229 eassert ((type == STRONG_R
3230 || type == STRONG_L
3231 || type == WEAK_BN
3232 || type == WEAK_EN
3233 || type == WEAK_AN));
3234 bidi_it->type = type;
3235 bidi_check_type (bidi_it->type);
3237 /* For L1 below, we need to know, for each WS character, whether
3238 it belongs to a sequence of WS characters preceding a newline
3239 or a TAB or a paragraph separator. */
3240 if ((bidi_it->orig_type == NEUTRAL_WS
3241 || bidi_it->orig_type == WEAK_BN
3242 || bidi_isolate_fmt_char (bidi_it->orig_type))
3243 && bidi_it->next_for_ws.charpos < bidi_it->charpos
3244 /* If this character is already at base level, we don't need to
3245 reset it, so avoid the potentially costly loop below. */
3246 && level != bidi_it->level_stack[0].level)
3248 int ch;
3249 ptrdiff_t clen = bidi_it->ch_len;
3250 ptrdiff_t bpos = bidi_it->bytepos;
3251 ptrdiff_t cpos = bidi_it->charpos;
3252 ptrdiff_t disp_pos = bidi_it->disp_pos;
3253 ptrdiff_t nc = bidi_it->nchars;
3254 struct bidi_string_data bs = bidi_it->string;
3255 bidi_type_t chtype;
3256 bool fwp = bidi_it->frame_window_p;
3257 int dpp = bidi_it->disp_prop;
3259 if (bidi_it->nchars <= 0)
3260 emacs_abort ();
3261 do {
3262 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
3263 bidi_it->w, fwp, &clen, &nc);
3264 chtype = bidi_get_type (ch, NEUTRAL_DIR);
3265 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
3266 || bidi_isolate_fmt_char (chtype)
3267 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
3268 bidi_it->next_for_ws.type = chtype;
3269 bidi_check_type (bidi_it->next_for_ws.type);
3270 bidi_it->next_for_ws.charpos = cpos;
3273 /* Update the cache, but only if this state was already cached. */
3274 bidi_cache_iterator_state (bidi_it, 1, 1);
3276 /* Resolve implicit levels. */
3277 if (bidi_it->orig_type == NEUTRAL_B /* L1 */
3278 || bidi_it->orig_type == NEUTRAL_S
3279 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
3280 || ((bidi_it->orig_type == NEUTRAL_WS
3281 || bidi_it->orig_type == WEAK_BN
3282 || bidi_isolate_fmt_char (bidi_it->orig_type)
3283 || bidi_explicit_dir_char (bidi_it->ch))
3284 && (bidi_it->next_for_ws.type == NEUTRAL_B
3285 || bidi_it->next_for_ws.type == NEUTRAL_S)))
3286 level = bidi_it->level_stack[0].level;
3287 else if ((level & 1) == 0) /* I1 */
3289 if (type == STRONG_R)
3290 level++;
3291 else if (type == WEAK_EN || type == WEAK_AN)
3292 level += 2;
3294 else /* I2 */
3296 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
3297 level++;
3300 bidi_it->resolved_level = level;
3301 return level;
3304 /* Move to the other edge of a level given by LEVEL. If END_FLAG,
3305 we are at the end of a level, and we need to prepare to
3306 resume the scan of the lower level.
3308 If this level's other edge is cached, we simply jump to it, filling
3309 the iterator structure with the iterator state on the other edge.
3310 Otherwise, we walk the buffer or string until we come back to the
3311 same level as LEVEL.
3313 Note: we are not talking here about a ``level run'' in the UAX#9
3314 sense of the term, but rather about a ``level'' which includes
3315 all the levels higher than it. In other words, given the levels
3316 like this:
3318 11111112222222333333334443343222222111111112223322111
3319 A B C
3321 and assuming we are at point A scanning left to right, this
3322 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
3323 at point B. */
3324 static void
3325 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
3327 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
3328 ptrdiff_t idx;
3330 /* Try the cache first. */
3331 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
3332 >= bidi_cache_start)
3333 bidi_cache_fetch_state (idx, bidi_it);
3334 else
3336 int new_level;
3338 /* If we are at end of level, its edges must be cached. */
3339 if (end_flag)
3340 emacs_abort ();
3342 if (!bidi_cache_iterator_state (bidi_it, 1, 0))
3344 /* Can't happen: if the cache needs to grow, it means we
3345 were at base embedding level, so the cache should have
3346 been either empty or already large enough to cover this
3347 character position. */
3348 emacs_abort ();
3350 do {
3351 new_level = bidi_level_of_next_char (bidi_it);
3352 /* If the cache is full, perform an emergency return by
3353 pretending that the level ended. */
3354 if (!bidi_cache_iterator_state (bidi_it, 1, 0))
3356 new_level = level - 1;
3357 /* Since the cache should only grow when we are scanning
3358 forward looking for the edge of the level that is one
3359 above the base embedding level, we can only have this
3360 contingency when LEVEL - 1 is the base embedding
3361 level. */
3362 eassert (new_level == bidi_it->level_stack[0].level);
3363 /* Plan B, for when the cache overflows: Back up to the
3364 previous character by fetching the last cached state,
3365 and force the resolved level of that character be the
3366 base embedding level. */
3367 bidi_cache_fetch_state (bidi_cache_idx - 1, bidi_it);
3368 bidi_it->resolved_level = new_level;
3369 bidi_cache_iterator_state (bidi_it, 1, 1);
3371 } while (new_level >= level);
3375 void
3376 bidi_move_to_visually_next (struct bidi_it *bidi_it)
3378 int old_level, new_level, next_level;
3379 struct bidi_it sentinel;
3381 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
3382 emacs_abort ();
3384 if (bidi_it->scan_dir == 0)
3386 bidi_it->scan_dir = 1; /* default to logical order */
3389 /* If we just passed a newline, initialize for the next line. */
3390 if (!bidi_it->first_elt
3391 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3392 bidi_line_init (bidi_it);
3394 /* Prepare the sentinel iterator state, and cache it. When we bump
3395 into it, scanning backwards, we'll know that the last non-base
3396 level is exhausted. */
3397 if (bidi_cache_idx == bidi_cache_start)
3399 bidi_copy_it (&sentinel, bidi_it);
3400 if (bidi_it->first_elt)
3402 sentinel.charpos--; /* cached charpos needs to be monotonic */
3403 sentinel.bytepos--;
3404 sentinel.ch = '\n'; /* doesn't matter, but why not? */
3405 sentinel.ch_len = 1;
3406 sentinel.nchars = 1;
3408 bidi_cache_iterator_state (&sentinel, 1, 0);
3411 old_level = bidi_it->resolved_level;
3412 new_level = bidi_level_of_next_char (bidi_it);
3414 /* Reordering of resolved levels (clause L2) is implemented by
3415 jumping to the other edge of the level and flipping direction of
3416 scanning the text whenever we find a level change. */
3417 if (new_level != old_level)
3419 bool ascending = new_level > old_level;
3420 int level_to_search = ascending ? old_level + 1 : old_level;
3421 int incr = ascending ? 1 : -1;
3422 int expected_next_level = old_level + incr;
3424 /* Jump (or walk) to the other edge of this level. */
3425 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3426 /* Switch scan direction and peek at the next character in the
3427 new direction. */
3428 bidi_it->scan_dir = -bidi_it->scan_dir;
3430 /* The following loop handles the case where the resolved level
3431 jumps by more than one. This is typical for numbers inside a
3432 run of text with left-to-right embedding direction, but can
3433 also happen in other situations. In those cases the decision
3434 where to continue after a level change, and in what direction,
3435 is tricky. For example, given a text like below:
3437 abcdefgh
3438 11336622
3440 (where the numbers below the text show the resolved levels),
3441 the result of reordering according to UAX#9 should be this:
3443 efdcghba
3445 This is implemented by the loop below which flips direction
3446 and jumps to the other edge of the level each time it finds
3447 the new level not to be the expected one. The expected level
3448 is always one more or one less than the previous one. */
3449 next_level = bidi_peek_at_next_level (bidi_it);
3450 while (next_level != expected_next_level)
3452 /* If next_level is -1, it means we have an unresolved level
3453 in the cache, which at this point should not happen. If
3454 it does, we will infloop. */
3455 eassert (next_level >= 0);
3456 /* If next_level is not consistent with incr, we might
3457 infloop. */
3458 eassert (incr > 0
3459 ? next_level > expected_next_level
3460 : next_level < expected_next_level);
3461 expected_next_level += incr;
3462 level_to_search += incr;
3463 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
3464 bidi_it->scan_dir = -bidi_it->scan_dir;
3465 next_level = bidi_peek_at_next_level (bidi_it);
3468 /* Finally, deliver the next character in the new direction. */
3469 next_level = bidi_level_of_next_char (bidi_it);
3472 /* Take note when we have just processed the newline that precedes
3473 the end of the paragraph. The next time we are about to be
3474 called, set_iterator_to_next will automatically reinit the
3475 paragraph direction, if needed. We do this at the newline before
3476 the paragraph separator, because the next character might not be
3477 the first character of the next paragraph, due to the bidi
3478 reordering, whereas we _must_ know the paragraph base direction
3479 _before_ we process the paragraph's text, since the base
3480 direction affects the reordering. */
3481 if (bidi_it->scan_dir == 1
3482 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
3484 /* The paragraph direction of the entire string, once
3485 determined, is in effect for the entire string. Setting the
3486 separator limit to the end of the string prevents
3487 bidi_paragraph_init from being called automatically on this
3488 string. */
3489 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
3490 bidi_it->separator_limit = bidi_it->string.schars;
3491 else if (bidi_it->bytepos < ZV_BYTE)
3493 ptrdiff_t sep_len
3494 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
3495 bidi_it->bytepos + bidi_it->ch_len);
3496 if (bidi_it->nchars <= 0)
3497 emacs_abort ();
3498 if (sep_len >= 0)
3500 bidi_it->new_paragraph = 1;
3501 /* Record the buffer position of the last character of the
3502 paragraph separator. */
3503 bidi_it->separator_limit
3504 = bidi_it->charpos + bidi_it->nchars + sep_len;
3509 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
3511 /* If we are at paragraph's base embedding level and beyond the
3512 last cached position, the cache's job is done and we can
3513 discard it. */
3514 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3515 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
3516 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
3517 bidi_cache_reset ();
3518 /* Also reset the cache if it overflowed and we have just
3519 emergency-exited using Plan B. */
3520 else if (bidi_it->resolved_level == bidi_it->level_stack[0].level
3521 && bidi_cache_idx >= bidi_cache_size
3522 && bidi_it->charpos == bidi_cache[bidi_cache_idx - 1].charpos)
3523 bidi_cache_reset ();
3524 /* But as long as we are caching during forward scan, we must
3525 cache each state, or else the cache integrity will be
3526 compromised: it assumes cached states correspond to buffer
3527 positions 1:1. */
3528 else
3529 bidi_cache_iterator_state (bidi_it, 1, 0);
3532 eassert (bidi_it->resolved_level >= 0
3533 && bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
3536 /* Utility function for looking for strong directional characters
3537 whose bidi type was overridden by a directional override. */
3538 ptrdiff_t
3539 bidi_find_first_overridden (struct bidi_it *bidi_it)
3541 ptrdiff_t found_pos = ZV;
3545 /* Need to call bidi_resolve_weak, not bidi_resolve_explicit,
3546 because the directional overrides are applied by the
3547 former. */
3548 bidi_type_t type = bidi_resolve_weak (bidi_it);
3550 if ((type == STRONG_R && bidi_it->orig_type == STRONG_L)
3551 || (type == STRONG_L
3552 && (bidi_it->orig_type == STRONG_R
3553 || bidi_it->orig_type == STRONG_AL)))
3554 found_pos = bidi_it->charpos;
3555 } while (found_pos == ZV
3556 && bidi_it->charpos < ZV
3557 && bidi_it->ch != BIDI_EOB
3558 && bidi_it->ch != '\n');
3560 return found_pos;
3563 /* This is meant to be called from within the debugger, whenever you
3564 wish to examine the cache contents. */
3565 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
3566 void
3567 bidi_dump_cached_states (void)
3569 ptrdiff_t i;
3570 int ndigits = 1;
3572 if (bidi_cache_idx == 0)
3574 fprintf (stderr, "The cache is empty.\n");
3575 return;
3577 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
3578 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
3580 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
3581 ndigits++;
3582 fputs ("ch ", stderr);
3583 for (i = 0; i < bidi_cache_idx; i++)
3584 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
3585 fputs ("\n", stderr);
3586 fputs ("lvl ", stderr);
3587 for (i = 0; i < bidi_cache_idx; i++)
3588 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
3589 fputs ("\n", stderr);
3590 fputs ("pos ", stderr);
3591 for (i = 0; i < bidi_cache_idx; i++)
3592 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
3593 fputs ("\n", stderr);