1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
5 This file is part of GNU Emacs.
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 (UBA) as per UAX#9, a part of the Unicode Standard.
25 Unlike the Reference Implementation and most other implementations,
26 this one is designed to be called once for every character in the
27 buffer or string. That way, we can leave intact the design of the
28 Emacs display engine, whereby an iterator object is used to
29 traverse buffer or string text character by character, and generate
30 the necessary data for displaying each character in 'struct glyph'
31 objects. (See xdisp.c for the details of that iteration.) The
32 functions on this file replace the original linear iteration in the
33 logical order of the text with a non-linear iteration in the visual
34 order, i.e. in the order characters should be shown on display.
36 The main entry point is bidi_move_to_visually_next. Each time it
37 is called, it finds the next character in the visual order, and
38 returns its information in a special structure. The caller is then
39 expected to process this character for display or any other
40 purposes, and call bidi_move_to_visually_next for the next
41 character. See the comments in bidi_move_to_visually_next for more
42 details about its algorithm that finds the next visual-order
43 character by resolving their levels on the fly.
45 Two other entry points are bidi_paragraph_init and
46 bidi_mirror_char. The first determines the base direction of a
47 paragraph, while the second returns the mirrored version of its
50 A few auxiliary entry points are used to initialize the bidi
51 iterator for iterating an object (buffer or string), push and pop
52 the bidi iterator state, and save and restore the state of the bidi
55 If you want to understand the code, you will have to read it
56 together with the relevant portions of UAX#9. The comments include
57 references to UAX#9 rules, for that very reason.
59 A note about references to UAX#9 rules: if the reference says
60 something like "X9/Retaining", it means that you need to refer to
61 rule X9 and to its modifications described in the "Implementation
62 Notes" section of UAX#9, under "Retaining Format Codes".
64 Here's the overview of the design of the reordering engine
65 implemented by this file.
67 Basic implementation structure
68 ------------------------------
70 The sequential processing steps described by UAX#9 are implemented
71 as recursive levels of processing, all of which examine the next
72 character in the logical order. This hierarchy of processing looks
73 as follows, from the innermost (deepest) to the outermost level,
74 omitting some subroutines used by each level:
76 bidi_fetch_char -- fetch next character
77 bidi_resolve_explicit -- resolve explicit levels and directions
78 bidi_resolve_weak -- resolve weak types
79 bidi_resolve_neutral -- resolve neutral types
80 bidi_level_of_next_char -- resolve implicit levels
82 Each level calls the level below it, and works on the result
83 returned by the lower level, including all of its sub-levels.
85 Unlike all the levels below it, bidi_level_of_next_char can return
86 the information about either the next or previous character in the
87 logical order, depending on the current direction of scanning the
88 buffer or string. For the next character, it calls all the levels
89 below it; for the previous character, it uses the cache, described
92 Thus, the result of calling bidi_level_of_next_char is the resolved
93 level of the next or the previous character in the logical order.
94 Based on this information, the function bidi_move_to_visually_next
95 finds the next character in the visual order and updates the
96 direction in which the buffer is scanned, either forward or
97 backward, to find the next character to be displayed. (Text is
98 scanned backwards when it needs to be reversed for display, i.e. if
99 the visual order is the inverse of the logical order.) This
100 implements the last, reordering steps of the UBA, by successively
101 calling bidi_level_of_next_char until the character of the required
102 embedding level is found; the scan direction is dynamically updated
103 as a side effect. See the commentary before the 'while' loop in
104 bidi_move_to_visually_next, for the details.
109 In a nutshell, fetching the next character boils down to calling
110 STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
111 string position. See bidi_fetch_char. However, if the next
112 character is "covered" by a display property of some kind,
113 bidi_fetch_char returns the u+FFFC "object replacement character"
114 that represents the entire run of text covered by the display
115 property. (The ch_len and nchars members of 'struct bidi_it'
116 reflect the length in bytes and characters of that text.) This is
117 so we reorder text on both sides of the display property as
118 appropriate for an image or embedded string. Similarly, text
119 covered by a display spec of the form '(space ...)', is replaced
120 with the u+2029 paragraph separator character, so such display
121 specs produce the same effect as a TAB under UBA. Both these
122 special characters are not actually displayed -- the display
123 property is displayed instead -- but just used to compute the
124 embedding level of the surrounding text so as to produce the
130 The UBA is highly context dependent in some of its parts,
131 i.e. results of processing a character can generally depend on
132 characters very far away. The UAX#9 description of the UBA
133 prescribes a stateful processing of each character, whereby the
134 results of this processing depend on various state variables, such
135 as the current embedding level, level stack, and directional
136 override status. In addition, the UAX#9 description includes many
137 passages like this (from rule W2 in this case):
139 Search backward from each instance of a European number until the
140 first strong type (R, L, AL, or sos) is found. If an AL is found,
141 change the type of the European number to Arabic number.
143 To support this, we use a bidi iterator object, 'struct bidi_it',
144 which is a sub-structure of 'struct it' used by xdisp.c (see
145 dispextern.h for the definition of both of these structures). The
146 bidi iterator holds the entire state of the iteration required by
147 the UBA, and is updated as the text is traversed. In particular,
148 the embedding level of the current character being resolved is
149 recorded in the iterator state. To avoid costly searches backward
150 in support of rules like W2 above, the necessary character types
151 are also recorded in the iterator state as they are found during
152 the forward scan, and then used when such rules need to be applied.
153 (Forward scans cannot be avoided in this way; they need to be
154 performed at least once, and the results recorded in the iterator
155 state, to be reused until the forward scan oversteps the recorded
158 In this manner, the iterator state acts as a mini-cache of
159 contextual information required for resolving the level of the
160 current character by various UBA rules.
162 Caching of bidi iterator states
163 -------------------------------
165 As described above, the reordering engine uses the information
166 recorded in the bidi iterator state in order to resolve the
167 embedding level of the current character. When the reordering
168 engine needs to process the next character in the logical order, it
169 fetches it and applies to it all the UBA levels, updating the
170 iterator state as it goes. But when the buffer or string is
171 scanned backwards, i.e. in the reverse order of buffer/string
172 positions, the scanned characters were already processed during the
173 preceding forward scan (see bidi_find_other_level_edge). To avoid
174 costly re-processing of characters that were already processed
175 during the forward scan, the iterator states computed while
176 scanning forward are cached.
178 The cache is just a linear array of 'struct bidi_it' objects, which
179 is dynamically allocated and reallocated as needed, since the size
180 of the cache depends on the text being processed. We only need the
181 cache while processing embedded levels higher than the base
182 paragraph embedding level, because these higher levels require
183 changes in scan direction. Therefore, as soon as we are back to
184 the base embedding level, we can free the cache; see the calls to
185 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
188 The cache maintains the index of the next unused cache slot -- this
189 is where the next iterator state will be cached. The function
190 bidi_cache_iterator_state saves an instance of the state in the
191 cache and increments the unused slot index. The companion function
192 bidi_cache_find looks up a cached state that corresponds to a given
193 buffer/string position. All of the cached states must correspond
194 1:1 to the buffer or string region whose processing they reflect;
195 bidi.c will abort if it finds cache slots that violate this 1:1
198 When the parent iterator 'struct it' is pushed (see push_it in
199 xdisp.c) to pause the current iteration and start iterating over a
200 different object (e.g., a 'display' string that covers some buffer
201 text), the bidi iterator cache needs to be "pushed" as well, so
202 that a new empty cache could be used while iterating over the new
203 object. Later, when the new object is exhausted, and xdisp.c calls
204 pop_it, we need to "pop" the bidi cache as well and return to the
205 original cache. See bidi_push_it and bidi_pop_it for how this is
208 Some functions of the display engine save copies of 'struct it' in
209 local variables, and restore them later. For examples, see
210 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
211 window_scroll_pixel_based in window.c. When this happens, we need
212 to save and restore the bidi cache as well, because conceptually
213 the cache is part of the 'struct it' state, and needs to be in
214 perfect sync with the portion of the buffer/string that is being
215 processed. This saving and restoring of the cache state is handled
216 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
217 SAVE_IT and RESTORE_IT defined on xdisp.c.
219 Note that, because reordering is implemented below the level in
220 xdisp.c that breaks glyphs into screen lines, we are violating
221 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
222 done before reordering each screen line separately. However,
223 following UAX#9 to the letter in this matter goes against the basic
224 design of the Emacs display engine, and so we choose here this
225 minor deviation from the UBA letter in preference to redesign of
226 the display engine. The effect of this is only seen in continued
227 lines that are broken into screen lines in the middle of a run
228 whose direction is opposite to the paragraph's base direction.
230 Important design and implementation note: when the code needs to
231 scan far ahead, be sure to avoid such scans as much as possible
232 when the buffer/string doesn't contain any RTL characters. Users
233 of left-to-right scripts will never forgive you if you introduce
234 some slow-down due to bidi in situations that don't involve any
235 bidirectional text. See the large comment near the beginning of
236 bidi_resolve_neutral, for one situation where such shortcut was
243 #include "character.h"
245 #include "dispextern.h"
246 #include "region-cache.h"
248 static bool bidi_initialized
= 0;
250 static Lisp_Object bidi_type_table
, bidi_mirror_table
, bidi_brackets_table
;
252 #define BIDI_EOB (-1)
254 /* Data type for describing the bidirectional character categories. */
263 static Lisp_Object paragraph_start_re
, paragraph_separate_re
;
264 static Lisp_Object Qparagraph_start
, Qparagraph_separate
;
267 /***********************************************************************
269 ***********************************************************************/
271 /* Return the bidi type of a character CH, subject to the current
272 directional OVERRIDE. */
274 bidi_get_type (int ch
, bidi_dir_t override
)
276 bidi_type_t default_type
;
280 if (ch
< 0 || ch
> MAX_CHAR
)
283 default_type
= (bidi_type_t
) XINT (CHAR_TABLE_REF (bidi_type_table
, ch
));
284 /* Every valid character code, even those that are unassigned by the
285 UCD, have some bidi-class property, according to
286 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
287 (= zero) code from CHAR_TABLE_REF, that's a bug. */
288 if (default_type
== UNKNOWN_BT
)
291 switch (default_type
)
308 else if (override
== R2L
)
316 bidi_check_type (bidi_type_t type
)
318 eassert (UNKNOWN_BT
<= type
&& type
<= NEUTRAL_ON
);
321 /* Given a bidi TYPE of a character, return its category. */
322 static bidi_category_t
323 bidi_get_category (bidi_type_t type
)
355 return EXPLICIT_FORMATTING
;
361 /* Return the mirrored character of C, if it has one. If C has no
362 mirrored counterpart, return C.
363 Note: The conditions in UAX#9 clause L4 regarding the surrounding
364 context must be tested by the caller. */
366 bidi_mirror_char (int c
)
372 if (c
< 0 || c
> MAX_CHAR
)
375 val
= CHAR_TABLE_REF (bidi_mirror_table
, c
);
380 /* When debugging, check before assigning to V, so that the check
381 isn't broken by undefined behavior due to int overflow. */
382 eassert (CHAR_VALID_P (XINT (val
)));
386 /* Minimal test we must do in optimized builds, to prevent weird
387 crashes further down the road. */
388 if (v
< 0 || v
> MAX_CHAR
)
397 /* Determine the start-of-sequence (sos) directional type given the two
398 embedding levels on either side of the run boundary. Also, update
399 the saved info about previously seen characters, since that info is
400 generally valid for a single level run. */
402 bidi_set_sos_type (struct bidi_it
*bidi_it
, int level_before
, int level_after
)
404 int higher_level
= (level_before
> level_after
? level_before
: level_after
);
406 /* FIXME: should the default sos direction be user selectable? */
407 bidi_it
->sos
= ((higher_level
& 1) != 0 ? R2L
: L2R
); /* X10 */
409 bidi_it
->prev
.type
= bidi_it
->prev
.type_after_w1
= UNKNOWN_BT
;
410 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
411 = bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
412 bidi_it
->prev_for_neutral
.type
= (bidi_it
->sos
== R2L
? STRONG_R
: STRONG_L
);
413 bidi_it
->prev_for_neutral
.charpos
= bidi_it
->charpos
;
414 bidi_it
->prev_for_neutral
.bytepos
= bidi_it
->bytepos
;
415 bidi_it
->next_for_neutral
.type
= bidi_it
->next_for_neutral
.type_after_w1
416 = bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
419 /* Push the current embedding level and override status; reset the
420 current level to LEVEL and the current override status to OVERRIDE. */
422 bidi_push_embedding_level (struct bidi_it
*bidi_it
,
423 int level
, bidi_dir_t override
, bool isolate_status
)
425 struct bidi_stack
*st
;
426 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
428 bidi_it
->stack_idx
++;
429 eassert (bidi_it
->stack_idx
< BIDI_MAXDEPTH
+2+1);
430 st
= &bidi_it
->level_stack
[bidi_it
->stack_idx
];
431 eassert (level
<= (1 << 7));
433 st
->override
= override
;
434 st
->isolate_status
= isolate_status
;
437 st
->last_strong
= bidi_it
->last_strong
;
438 st
->prev_for_neutral
= bidi_it
->prev_for_neutral
;
439 st
->next_for_neutral
= bidi_it
->next_for_neutral
;
440 st
->sos
= bidi_it
->sos
;
442 /* We've got a new isolating sequence, compute the directional type
443 of sos and initialize per-sequence variables (UAX#9, clause X10). */
444 bidi_set_sos_type (bidi_it
, prev_level
, level
);
447 /* Pop from the stack the embedding level, the directional override
448 status, and optionally saved information for the isolating run
449 sequence. Return the new level. */
451 bidi_pop_embedding_level (struct bidi_it
*bidi_it
)
455 /* UAX#9 says to ignore invalid PDFs (X7, last bullet)
456 and PDIs (X6a, 2nd bullet). */
457 if (bidi_it
->stack_idx
> 0)
460 = bidi_it
->level_stack
[bidi_it
->stack_idx
].isolate_status
;
461 int old_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
463 struct bidi_stack st
;
465 st
= bidi_it
->level_stack
[bidi_it
->stack_idx
];
468 /* PREV is used in W1 for resolving WEAK_NSM. By the time
469 we get to an NSM, we must have gotten past at least one
470 character: the PDI that ends the isolate from which we
471 are popping here. So PREV will have been filled up by
472 the time we first use it. We initialize it here to
473 UNKNOWN_BT to be able to catch any blunders in this
475 bidi_it
->prev
.orig_type
= bidi_it
->prev
.type_after_w1
476 = bidi_it
->prev
.type
= UNKNOWN_BT
;
477 bidi_it
->last_strong
= st
.last_strong
;
478 bidi_it
->prev_for_neutral
= st
.prev_for_neutral
;
479 bidi_it
->next_for_neutral
= st
.next_for_neutral
;
480 bidi_it
->sos
= st
.sos
;
483 bidi_set_sos_type (bidi_it
, old_level
,
484 bidi_it
->level_stack
[bidi_it
->stack_idx
- 1].level
);
486 bidi_it
->stack_idx
--;
488 level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
489 eassert (0 <= level
&& level
<= BIDI_MAXDEPTH
+ 1);
493 /* Record in SAVED_INFO the information about the current character. */
495 bidi_remember_char (struct bidi_saved_info
*saved_info
,
496 struct bidi_it
*bidi_it
)
498 saved_info
->charpos
= bidi_it
->charpos
;
499 saved_info
->bytepos
= bidi_it
->bytepos
;
500 saved_info
->type
= bidi_it
->type
;
501 bidi_check_type (bidi_it
->type
);
502 saved_info
->type_after_w1
= bidi_it
->type_after_w1
;
503 bidi_check_type (bidi_it
->type_after_w1
);
504 saved_info
->orig_type
= bidi_it
->orig_type
;
505 bidi_check_type (bidi_it
->orig_type
);
506 saved_info
->bracket_resolved
= bidi_it
->bracket_resolved
;
509 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
510 copies the part of the level stack that is actually in use. */
512 bidi_copy_it (struct bidi_it
*to
, struct bidi_it
*from
)
514 /* Copy everything from the start through the active part of
517 (offsetof (struct bidi_it
, level_stack
[1])
518 + from
->stack_idx
* sizeof from
->level_stack
[0]));
522 /***********************************************************************
523 Caching the bidi iterator states
524 ***********************************************************************/
526 /* We allocate and de-allocate the cache in chunks of this size (in
527 characters). 200 was chosen as an upper limit for reasonably-long
528 lines in a text file/buffer. */
529 #define BIDI_CACHE_CHUNK 200
530 static struct bidi_it
*bidi_cache
;
531 static ptrdiff_t bidi_cache_size
= 0;
532 enum { elsz
= sizeof (struct bidi_it
) };
533 static ptrdiff_t bidi_cache_idx
; /* next unused cache slot */
534 static ptrdiff_t bidi_cache_last_idx
; /* slot of last cache hit */
535 static ptrdiff_t bidi_cache_start
= 0; /* start of cache for this
538 /* 5-slot stack for saving the start of the previous level of the
539 cache. xdisp.c maintains a 5-slot stack for its iterator state,
540 and we need the same size of our stack. */
541 static ptrdiff_t bidi_cache_start_stack
[IT_STACK_SIZE
];
542 static int bidi_cache_sp
;
544 /* Size of header used by bidi_shelve_cache. */
547 bidi_shelve_header_size
548 = (sizeof (bidi_cache_idx
) + sizeof (bidi_cache_start_stack
)
549 + sizeof (bidi_cache_sp
) + sizeof (bidi_cache_start
)
550 + sizeof (bidi_cache_last_idx
))
553 /* Reset the cache state to the empty state. We only reset the part
554 of the cache relevant to iteration of the current object. Previous
555 objects, which are pushed on the display iterator's stack, are left
556 intact. This is called when the cached information is no more
557 useful for the current iteration, e.g. when we were reseated to a
558 new position on the same object. */
560 bidi_cache_reset (void)
562 bidi_cache_idx
= bidi_cache_start
;
563 bidi_cache_last_idx
= -1;
566 /* Shrink the cache to its minimal size. Called when we init the bidi
567 iterator for reordering a buffer or a string that does not come
568 from display properties, because that means all the previously
569 cached info is of no further use. */
571 bidi_cache_shrink (void)
573 if (bidi_cache_size
> BIDI_CACHE_CHUNK
)
575 bidi_cache
= xrealloc (bidi_cache
, BIDI_CACHE_CHUNK
* elsz
);
576 bidi_cache_size
= BIDI_CACHE_CHUNK
;
582 bidi_cache_fetch_state (ptrdiff_t idx
, struct bidi_it
*bidi_it
)
584 int current_scan_dir
= bidi_it
->scan_dir
;
586 if (idx
< bidi_cache_start
|| idx
>= bidi_cache_idx
)
589 bidi_copy_it (bidi_it
, &bidi_cache
[idx
]);
590 bidi_it
->scan_dir
= current_scan_dir
;
591 bidi_cache_last_idx
= idx
;
594 /* Find a cached state with a given CHARPOS and resolved embedding
595 level less or equal to LEVEL. if LEVEL is -1, disregard the
596 resolved levels in cached states. DIR, if non-zero, means search
597 in that direction from the last cache hit. */
599 bidi_cache_search (ptrdiff_t charpos
, int level
, int dir
)
601 ptrdiff_t i
, i_start
;
603 if (bidi_cache_idx
> bidi_cache_start
)
605 if (bidi_cache_last_idx
== -1)
606 bidi_cache_last_idx
= bidi_cache_idx
- 1;
607 if (charpos
< bidi_cache
[bidi_cache_last_idx
].charpos
)
610 i_start
= bidi_cache_last_idx
- 1;
612 else if (charpos
> (bidi_cache
[bidi_cache_last_idx
].charpos
613 + bidi_cache
[bidi_cache_last_idx
].nchars
- 1))
616 i_start
= bidi_cache_last_idx
+ 1;
619 i_start
= bidi_cache_last_idx
;
623 i_start
= bidi_cache_idx
- 1;
628 /* Linear search for now; FIXME! */
629 for (i
= i_start
; i
>= bidi_cache_start
; i
--)
630 if (bidi_cache
[i
].charpos
<= charpos
631 && charpos
< bidi_cache
[i
].charpos
+ bidi_cache
[i
].nchars
632 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
637 for (i
= i_start
; i
< bidi_cache_idx
; i
++)
638 if (bidi_cache
[i
].charpos
<= charpos
639 && charpos
< bidi_cache
[i
].charpos
+ bidi_cache
[i
].nchars
640 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
648 /* Find a cached state where the resolved level changes to a value
649 that is lower than LEVEL, and return its cache slot index. DIR is
650 the direction to search, starting with the last used cache slot.
651 If DIR is zero, we search backwards from the last occupied cache
652 slot. BEFORE means return the index of the slot that
653 is ``before'' the level change in the search direction. That is,
654 given the cached levels like this:
659 and assuming we are at the position cached at the slot marked with
660 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
661 index of slot B or A, depending whether BEFORE is, respectively,
664 bidi_cache_find_level_change (int level
, int dir
, bool before
)
668 ptrdiff_t i
= dir
? bidi_cache_last_idx
: bidi_cache_idx
- 1;
669 int incr
= before
? 1 : 0;
671 eassert (!dir
|| bidi_cache_last_idx
>= 0);
680 while (i
>= bidi_cache_start
+ incr
)
682 if (bidi_cache
[i
- incr
].resolved_level
>= 0
683 && bidi_cache
[i
- incr
].resolved_level
< level
)
690 while (i
< bidi_cache_idx
- incr
)
692 if (bidi_cache
[i
+ incr
].resolved_level
>= 0
693 && bidi_cache
[i
+ incr
].resolved_level
< level
)
704 bidi_cache_ensure_space (ptrdiff_t idx
)
706 /* Enlarge the cache as needed. */
707 if (idx
>= bidi_cache_size
)
709 /* The bidi cache cannot be larger than the largest Lisp string
711 ptrdiff_t string_or_buffer_bound
712 = max (BUF_BYTES_MAX
, STRING_BYTES_BOUND
);
714 /* Also, it cannot be larger than what C can represent. */
716 = (min (PTRDIFF_MAX
, SIZE_MAX
) - bidi_shelve_header_size
) / elsz
;
719 = xpalloc (bidi_cache
, &bidi_cache_size
,
720 max (BIDI_CACHE_CHUNK
, idx
- bidi_cache_size
+ 1),
721 min (string_or_buffer_bound
, c_bound
), elsz
);
726 bidi_cache_iterator_state (struct bidi_it
*bidi_it
, bool resolved
)
730 /* We should never cache on backward scans. */
731 if (bidi_it
->scan_dir
== -1)
733 idx
= bidi_cache_search (bidi_it
->charpos
, -1, 1);
737 idx
= bidi_cache_idx
;
738 bidi_cache_ensure_space (idx
);
739 /* Character positions should correspond to cache positions 1:1.
740 If we are outside the range of cached positions, the cache is
741 useless and must be reset. */
742 if (idx
> bidi_cache_start
&&
743 (bidi_it
->charpos
> (bidi_cache
[idx
- 1].charpos
744 + bidi_cache
[idx
- 1].nchars
)
745 || bidi_it
->charpos
< bidi_cache
[bidi_cache_start
].charpos
))
748 idx
= bidi_cache_start
;
750 if (bidi_it
->nchars
<= 0)
752 bidi_copy_it (&bidi_cache
[idx
], bidi_it
);
754 bidi_cache
[idx
].resolved_level
= -1;
758 /* Copy only the members which could have changed, to avoid
759 costly copying of the entire struct. */
760 bidi_cache
[idx
].type
= bidi_it
->type
;
761 bidi_check_type (bidi_it
->type
);
762 bidi_cache
[idx
].type_after_w1
= bidi_it
->type_after_w1
;
763 bidi_check_type (bidi_it
->type_after_w1
);
765 bidi_cache
[idx
].resolved_level
= bidi_it
->resolved_level
;
767 bidi_cache
[idx
].resolved_level
= -1;
768 bidi_cache
[idx
].invalid_levels
= bidi_it
->invalid_levels
;
769 bidi_cache
[idx
].next_for_neutral
= bidi_it
->next_for_neutral
;
770 bidi_cache
[idx
].next_for_ws
= bidi_it
->next_for_ws
;
771 bidi_cache
[idx
].disp_pos
= bidi_it
->disp_pos
;
772 bidi_cache
[idx
].disp_prop
= bidi_it
->disp_prop
;
773 bidi_cache
[idx
].bracket_resolved
= bidi_it
->bracket_resolved
;
776 bidi_cache_last_idx
= idx
;
777 if (idx
>= bidi_cache_idx
)
778 bidi_cache_idx
= idx
+ 1;
782 bidi_cache_find (ptrdiff_t charpos
, int level
, struct bidi_it
*bidi_it
)
784 ptrdiff_t i
= bidi_cache_search (charpos
, level
, bidi_it
->scan_dir
);
786 if (i
>= bidi_cache_start
)
788 bidi_dir_t current_scan_dir
= bidi_it
->scan_dir
;
790 bidi_copy_it (bidi_it
, &bidi_cache
[i
]);
791 bidi_cache_last_idx
= i
;
792 /* Don't let scan direction from the cached state override
793 the current scan direction. */
794 bidi_it
->scan_dir
= current_scan_dir
;
795 return bidi_it
->type
;
802 bidi_peek_at_next_level (struct bidi_it
*bidi_it
)
804 if (bidi_cache_idx
== bidi_cache_start
|| bidi_cache_last_idx
== -1)
806 return bidi_cache
[bidi_cache_last_idx
+ bidi_it
->scan_dir
].resolved_level
;
810 /***********************************************************************
811 Pushing and popping the bidi iterator state
812 ***********************************************************************/
814 /* Push the bidi iterator state in preparation for reordering a
815 different object, e.g. display string found at certain buffer
816 position. Pushing the bidi iterator boils down to saving its
817 entire state on the cache and starting a new cache "stacked" on top
818 of the current cache. */
820 bidi_push_it (struct bidi_it
*bidi_it
)
822 /* Save the current iterator state in its entirety after the last
824 bidi_cache_ensure_space (bidi_cache_idx
);
825 bidi_cache
[bidi_cache_idx
++] = *bidi_it
;
827 /* Push the current cache start onto the stack. */
828 eassert (bidi_cache_sp
< IT_STACK_SIZE
);
829 bidi_cache_start_stack
[bidi_cache_sp
++] = bidi_cache_start
;
831 /* Start a new level of cache, and make it empty. */
832 bidi_cache_start
= bidi_cache_idx
;
833 bidi_cache_last_idx
= -1;
836 /* Restore the iterator state saved by bidi_push_it and return the
837 cache to the corresponding state. */
839 bidi_pop_it (struct bidi_it
*bidi_it
)
841 if (bidi_cache_start
<= 0)
844 /* Reset the next free cache slot index to what it was before the
845 call to bidi_push_it. */
846 bidi_cache_idx
= bidi_cache_start
- 1;
848 /* Restore the bidi iterator state saved in the cache. */
849 *bidi_it
= bidi_cache
[bidi_cache_idx
];
851 /* Pop the previous cache start from the stack. */
852 if (bidi_cache_sp
<= 0)
854 bidi_cache_start
= bidi_cache_start_stack
[--bidi_cache_sp
];
856 /* Invalidate the last-used cache slot data. */
857 bidi_cache_last_idx
= -1;
860 static ptrdiff_t bidi_cache_total_alloc
;
862 /* Stash away a copy of the cache and its control variables. */
864 bidi_shelve_cache (void)
866 unsigned char *databuf
;
870 if (bidi_cache_idx
== 0)
873 alloc
= (bidi_shelve_header_size
874 + bidi_cache_idx
* sizeof (struct bidi_it
));
875 databuf
= xmalloc (alloc
);
876 bidi_cache_total_alloc
+= alloc
;
878 memcpy (databuf
, &bidi_cache_idx
, sizeof (bidi_cache_idx
));
879 memcpy (databuf
+ sizeof (bidi_cache_idx
),
880 bidi_cache
, bidi_cache_idx
* sizeof (struct bidi_it
));
881 memcpy (databuf
+ sizeof (bidi_cache_idx
)
882 + bidi_cache_idx
* sizeof (struct bidi_it
),
883 bidi_cache_start_stack
, sizeof (bidi_cache_start_stack
));
884 memcpy (databuf
+ sizeof (bidi_cache_idx
)
885 + bidi_cache_idx
* sizeof (struct bidi_it
)
886 + sizeof (bidi_cache_start_stack
),
887 &bidi_cache_sp
, sizeof (bidi_cache_sp
));
888 memcpy (databuf
+ sizeof (bidi_cache_idx
)
889 + bidi_cache_idx
* sizeof (struct bidi_it
)
890 + sizeof (bidi_cache_start_stack
) + sizeof (bidi_cache_sp
),
891 &bidi_cache_start
, sizeof (bidi_cache_start
));
892 memcpy (databuf
+ sizeof (bidi_cache_idx
)
893 + bidi_cache_idx
* sizeof (struct bidi_it
)
894 + sizeof (bidi_cache_start_stack
) + sizeof (bidi_cache_sp
)
895 + sizeof (bidi_cache_start
),
896 &bidi_cache_last_idx
, sizeof (bidi_cache_last_idx
));
901 /* Restore the cache state from a copy stashed away by
902 bidi_shelve_cache, and free the buffer used to stash that copy.
903 JUST_FREE means free the buffer, but don't restore the
904 cache; used when the corresponding iterator is discarded instead of
907 bidi_unshelve_cache (void *databuf
, bool just_free
)
909 unsigned char *p
= databuf
;
915 /* A NULL pointer means an empty cache. */
916 bidi_cache_start
= 0;
927 memcpy (&idx
, p
, sizeof (bidi_cache_idx
));
928 bidi_cache_total_alloc
929 -= bidi_shelve_header_size
+ idx
* sizeof (struct bidi_it
);
933 memcpy (&bidi_cache_idx
, p
, sizeof (bidi_cache_idx
));
934 bidi_cache_ensure_space (bidi_cache_idx
);
935 memcpy (bidi_cache
, p
+ sizeof (bidi_cache_idx
),
936 bidi_cache_idx
* sizeof (struct bidi_it
));
937 memcpy (bidi_cache_start_stack
,
938 p
+ sizeof (bidi_cache_idx
)
939 + bidi_cache_idx
* sizeof (struct bidi_it
),
940 sizeof (bidi_cache_start_stack
));
941 memcpy (&bidi_cache_sp
,
942 p
+ sizeof (bidi_cache_idx
)
943 + bidi_cache_idx
* sizeof (struct bidi_it
)
944 + sizeof (bidi_cache_start_stack
),
945 sizeof (bidi_cache_sp
));
946 memcpy (&bidi_cache_start
,
947 p
+ sizeof (bidi_cache_idx
)
948 + bidi_cache_idx
* sizeof (struct bidi_it
)
949 + sizeof (bidi_cache_start_stack
) + sizeof (bidi_cache_sp
),
950 sizeof (bidi_cache_start
));
951 memcpy (&bidi_cache_last_idx
,
952 p
+ sizeof (bidi_cache_idx
)
953 + bidi_cache_idx
* sizeof (struct bidi_it
)
954 + sizeof (bidi_cache_start_stack
) + sizeof (bidi_cache_sp
)
955 + sizeof (bidi_cache_start
),
956 sizeof (bidi_cache_last_idx
));
957 bidi_cache_total_alloc
958 -= (bidi_shelve_header_size
959 + bidi_cache_idx
* sizeof (struct bidi_it
));
967 /***********************************************************************
969 ***********************************************************************/
971 bidi_initialize (void)
973 bidi_type_table
= uniprop_table (intern ("bidi-class"));
974 if (NILP (bidi_type_table
))
976 staticpro (&bidi_type_table
);
978 bidi_mirror_table
= uniprop_table (intern ("mirroring"));
979 if (NILP (bidi_mirror_table
))
981 staticpro (&bidi_mirror_table
);
983 bidi_brackets_table
= uniprop_table (intern ("bracket-type"));
984 if (NILP (bidi_brackets_table
))
986 staticpro (&bidi_brackets_table
);
988 Qparagraph_start
= intern ("paragraph-start");
989 staticpro (&Qparagraph_start
);
990 paragraph_start_re
= Fsymbol_value (Qparagraph_start
);
991 if (!STRINGP (paragraph_start_re
))
992 paragraph_start_re
= build_string ("\f\\|[ \t]*$");
993 staticpro (¶graph_start_re
);
994 Qparagraph_separate
= intern ("paragraph-separate");
995 staticpro (&Qparagraph_separate
);
996 paragraph_separate_re
= Fsymbol_value (Qparagraph_separate
);
997 if (!STRINGP (paragraph_separate_re
))
998 paragraph_separate_re
= build_string ("[ \t\f]*$");
999 staticpro (¶graph_separate_re
);
1002 bidi_cache_total_alloc
= 0;
1004 bidi_initialized
= 1;
1007 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
1010 bidi_set_paragraph_end (struct bidi_it
*bidi_it
)
1012 bidi_it
->invalid_levels
= 0;
1013 bidi_it
->invalid_isolates
= 0;
1014 bidi_it
->stack_idx
= 0;
1015 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
1018 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
1020 bidi_init_it (ptrdiff_t charpos
, ptrdiff_t bytepos
, bool frame_window_p
,
1021 struct bidi_it
*bidi_it
)
1023 if (! bidi_initialized
)
1026 bidi_it
->charpos
= charpos
;
1028 bidi_it
->bytepos
= bytepos
;
1029 bidi_it
->frame_window_p
= frame_window_p
;
1030 bidi_it
->nchars
= -1; /* to be computed in bidi_resolve_explicit */
1031 bidi_it
->first_elt
= 1;
1032 bidi_set_paragraph_end (bidi_it
);
1033 bidi_it
->new_paragraph
= 1;
1034 bidi_it
->separator_limit
= -1;
1035 bidi_it
->type
= NEUTRAL_B
;
1036 bidi_it
->type_after_w1
= NEUTRAL_B
;
1037 bidi_it
->orig_type
= NEUTRAL_B
;
1038 /* FIXME: Review this!!! */
1039 bidi_it
->prev
.type
= bidi_it
->prev
.type_after_w1
1040 = bidi_it
->prev
.orig_type
= UNKNOWN_BT
;
1041 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
1042 = bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
1043 bidi_it
->next_for_neutral
.charpos
= -1;
1044 bidi_it
->next_for_neutral
.type
1045 = bidi_it
->next_for_neutral
.type_after_w1
1046 = bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
1047 bidi_it
->prev_for_neutral
.charpos
= -1;
1048 bidi_it
->prev_for_neutral
.type
1049 = bidi_it
->prev_for_neutral
.type_after_w1
1050 = bidi_it
->prev_for_neutral
.orig_type
= UNKNOWN_BT
;
1051 bidi_it
->sos
= L2R
; /* FIXME: should it be user-selectable? */
1052 bidi_it
->disp_pos
= -1; /* invalid/unknown */
1053 bidi_it
->disp_prop
= 0;
1054 /* We can only shrink the cache if we are at the bottom level of its
1056 if (bidi_cache_start
== 0)
1057 bidi_cache_shrink ();
1059 bidi_cache_reset ();
1062 /* Perform initializations for reordering a new line of bidi text. */
1064 bidi_line_init (struct bidi_it
*bidi_it
)
1066 bidi_it
->scan_dir
= 1; /* FIXME: do we need to have control on this? */
1067 bidi_it
->stack_idx
= 0;
1068 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
1069 bidi_it
->level_stack
[0].override
= NEUTRAL_DIR
; /* X1 */
1070 bidi_it
->level_stack
[0].isolate_status
= false; /* X1 */
1071 bidi_it
->invalid_levels
= 0;
1072 bidi_it
->isolate_level
= 0; /* X1 */
1073 bidi_it
->invalid_isolates
= 0; /* X1 */
1074 /* Setting this to zero will force its recomputation the first time
1075 we need it for W5. */
1076 bidi_it
->next_en_pos
= 0;
1077 bidi_it
->next_en_type
= UNKNOWN_BT
;
1078 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
1079 bidi_set_sos_type (bidi_it
,
1080 (bidi_it
->paragraph_dir
== R2L
? 1 : 0),
1081 bidi_it
->level_stack
[0].level
); /* X10 */
1083 bidi_cache_reset ();
1087 /***********************************************************************
1089 ***********************************************************************/
1091 /* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1092 are zero-based character positions in S, BEGBYTE is byte position
1093 corresponding to BEG. UNIBYTE means S is a unibyte string. */
1095 bidi_count_bytes (const unsigned char *s
, ptrdiff_t beg
,
1096 ptrdiff_t begbyte
, ptrdiff_t end
, bool unibyte
)
1098 ptrdiff_t pos
= beg
;
1099 const unsigned char *p
= s
+ begbyte
, *start
= p
;
1105 if (!CHAR_HEAD_P (*p
))
1110 p
+= BYTES_BY_CHAR_HEAD (*p
);
1118 /* Fetch and return the character at byte position BYTEPOS. If S is
1119 non-NULL, fetch the character from string S; otherwise fetch the
1120 character from the current buffer. UNIBYTE means S is a
1123 bidi_char_at_pos (ptrdiff_t bytepos
, const unsigned char *s
, bool unibyte
)
1132 s
= BYTE_POS_ADDR (bytepos
);
1133 return STRING_CHAR (s
);
1136 /* Fetch and return the character at CHARPOS/BYTEPOS. If that
1137 character is covered by a display string, treat the entire run of
1138 covered characters as a single character, either u+2029 or u+FFFC,
1139 and return their combined length in CH_LEN and NCHARS. DISP_POS
1140 specifies the character position of the next display string, or -1
1141 if not yet computed. When the next character is at or beyond that
1142 position, the function updates DISP_POS with the position of the
1143 next display string. *DISP_PROP non-zero means that there's really
1144 a display string at DISP_POS, as opposed to when we searched till
1145 DISP_POS without finding one. If *DISP_PROP is 2, it means the
1146 display spec is of the form `(space ...)', which is replaced with
1147 u+2029 to handle it as a paragraph separator. STRING->s is the C
1148 string to iterate, or NULL if iterating over a buffer or a Lisp
1149 string; in the latter case, STRING->lstring is the Lisp string. */
1151 bidi_fetch_char (ptrdiff_t charpos
, ptrdiff_t bytepos
, ptrdiff_t *disp_pos
,
1152 int *disp_prop
, struct bidi_string_data
*string
,
1154 bool frame_window_p
, ptrdiff_t *ch_len
, ptrdiff_t *nchars
)
1158 = (string
->s
|| STRINGP (string
->lstring
)) ? string
->schars
: ZV
;
1159 struct text_pos pos
;
1162 /* If we got past the last known position of display string, compute
1163 the position of the next one. That position could be at CHARPOS. */
1164 if (charpos
< endpos
&& charpos
> *disp_pos
)
1166 SET_TEXT_POS (pos
, charpos
, bytepos
);
1167 *disp_pos
= compute_display_string_pos (&pos
, string
, w
, frame_window_p
,
1171 /* Fetch the character at BYTEPOS. */
1172 if (charpos
>= endpos
)
1180 else if (charpos
>= *disp_pos
&& *disp_prop
)
1182 ptrdiff_t disp_end_pos
;
1184 /* We don't expect to find ourselves in the middle of a display
1185 property. Hopefully, it will never be needed. */
1186 if (charpos
> *disp_pos
)
1188 /* Text covered by `display' properties and overlays with
1189 display properties or display strings is handled as a single
1190 character that represents the entire run of characters
1191 covered by the display property. */
1192 if (*disp_prop
== 2)
1194 /* `(space ...)' display specs are handled as paragraph
1195 separators for the purposes of the reordering; see UAX#9
1196 section 3 and clause HL1 in section 4.3 there. */
1201 /* All other display specs are handled as the Unicode Object
1202 Replacement Character. */
1205 disp_end_pos
= compute_display_string_end (*disp_pos
, string
);
1206 if (disp_end_pos
< 0)
1208 /* Somebody removed the display string from the buffer
1209 behind our back. Recover by processing this buffer
1210 position as if no display property were present there to
1215 *nchars
= disp_end_pos
- *disp_pos
;
1219 *ch_len
= bidi_count_bytes (string
->s
, *disp_pos
, bytepos
,
1220 disp_end_pos
, string
->unibyte
);
1221 else if (STRINGP (string
->lstring
))
1222 *ch_len
= bidi_count_bytes (SDATA (string
->lstring
), *disp_pos
,
1223 bytepos
, disp_end_pos
, string
->unibyte
);
1225 *ch_len
= CHAR_TO_BYTE (disp_end_pos
) - bytepos
;
1233 if (!string
->unibyte
)
1235 ch
= STRING_CHAR_AND_LENGTH (string
->s
+ bytepos
, len
);
1240 ch
= UNIBYTE_TO_CHAR (string
->s
[bytepos
]);
1244 else if (STRINGP (string
->lstring
))
1246 if (!string
->unibyte
)
1248 ch
= STRING_CHAR_AND_LENGTH (SDATA (string
->lstring
) + bytepos
,
1254 ch
= UNIBYTE_TO_CHAR (SREF (string
->lstring
, bytepos
));
1260 ch
= STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos
), len
);
1266 /* If we just entered a run of characters covered by a display
1267 string, compute the position of the next display string. */
1268 if (charpos
+ *nchars
<= endpos
&& charpos
+ *nchars
> *disp_pos
1271 SET_TEXT_POS (pos
, charpos
+ *nchars
, bytepos
+ *ch_len
);
1272 *disp_pos
= compute_display_string_pos (&pos
, string
, w
, frame_window_p
,
1279 /* Like bidi_fetch_char, but ignore any text between an isolate
1280 initiator and its matching PDI or, if it has no matching PDI, the
1281 end of the paragraph. If isolates were skipped, CH_LEN and NCHARS
1282 are set to the number of bytes and characters between BYTEPOS/CHARPOS
1283 and the character that was fetched after skipping the isolates. */
1285 bidi_fetch_char_skip_isolates (ptrdiff_t charpos
, ptrdiff_t bytepos
,
1286 ptrdiff_t *disp_pos
, int *disp_prop
,
1287 struct bidi_string_data
*string
,
1288 struct window
*w
, bool frame_window_p
,
1289 ptrdiff_t *ch_len
, ptrdiff_t *nchars
)
1291 ptrdiff_t orig_charpos
= charpos
, orig_bytepos
= bytepos
;
1292 int ch
= bidi_fetch_char (charpos
, bytepos
, disp_pos
, disp_prop
, string
, w
,
1293 frame_window_p
, ch_len
, nchars
);
1294 bidi_type_t ch_type
= bidi_get_type (ch
, NEUTRAL_DIR
);
1295 ptrdiff_t level
= 0;
1297 if (ch_type
== LRI
|| ch_type
== RLI
|| ch_type
== FSI
)
1300 while (level
> 0 && ch_type
!= NEUTRAL_B
)
1304 ch
= bidi_fetch_char (charpos
, bytepos
, disp_pos
, disp_prop
, string
,
1305 w
, frame_window_p
, ch_len
, nchars
);
1306 ch_type
= bidi_get_type (ch
, NEUTRAL_DIR
);
1307 /* A Note to P2 says to ignore max_depth limit. */
1308 if (ch_type
== LRI
|| ch_type
== RLI
|| ch_type
== FSI
)
1310 else if (ch_type
== PDI
)
1315 /* Communicate to the caller how much did we skip, so it could get
1316 past the last character position we examined. */
1317 *nchars
+= charpos
- orig_charpos
;
1318 *ch_len
+= bytepos
- orig_bytepos
;
1324 /***********************************************************************
1325 Determining paragraph direction
1326 ***********************************************************************/
1328 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1329 Value is the non-negative length of the paragraph separator
1330 following the buffer position, -1 if position is at the beginning
1331 of a new paragraph, or -2 if position is neither at beginning nor
1332 at end of a paragraph. */
1334 bidi_at_paragraph_end (ptrdiff_t charpos
, ptrdiff_t bytepos
)
1337 Lisp_Object start_re
;
1340 sep_re
= paragraph_separate_re
;
1341 start_re
= paragraph_start_re
;
1343 val
= fast_looking_at (sep_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
);
1346 if (fast_looking_at (start_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
) >= 0)
1355 /* If the user has requested the long scans caching, make sure that
1356 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1358 static struct region_cache
*
1359 bidi_paragraph_cache_on_off (void)
1361 struct buffer
*cache_buffer
= current_buffer
;
1362 bool indirect_p
= false;
1364 /* For indirect buffers, make sure to use the cache of their base
1366 if (cache_buffer
->base_buffer
)
1368 cache_buffer
= cache_buffer
->base_buffer
;
1372 /* Don't turn on or off the cache in the base buffer, if the value
1373 of cache-long-scans of the base buffer is inconsistent with that.
1374 This is because doing so will just make the cache pure overhead,
1375 since if we turn it on via indirect buffer, it will be
1376 immediately turned off by its base buffer. */
1377 if (NILP (BVAR (current_buffer
, cache_long_scans
)))
1380 || NILP (BVAR (cache_buffer
, cache_long_scans
)))
1382 if (cache_buffer
->bidi_paragraph_cache
)
1384 free_region_cache (cache_buffer
->bidi_paragraph_cache
);
1385 cache_buffer
->bidi_paragraph_cache
= 0;
1393 || !NILP (BVAR (cache_buffer
, cache_long_scans
)))
1395 if (!cache_buffer
->bidi_paragraph_cache
)
1396 cache_buffer
->bidi_paragraph_cache
= new_region_cache ();
1398 return cache_buffer
->bidi_paragraph_cache
;
1402 /* On my 2005-vintage machine, searching back for paragraph start
1403 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1404 when user types C-p. The number below limits each call to
1405 bidi_paragraph_init to about 10 ms. */
1406 #define MAX_PARAGRAPH_SEARCH 7500
1408 /* Find the beginning of this paragraph by looking back in the buffer.
1409 Value is the byte position of the paragraph's beginning, or
1410 BEGV_BYTE if paragraph_start_re is still not found after looking
1411 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
1413 bidi_find_paragraph_start (ptrdiff_t pos
, ptrdiff_t pos_byte
)
1415 Lisp_Object re
= paragraph_start_re
;
1416 ptrdiff_t limit
= ZV
, limit_byte
= ZV_BYTE
;
1417 struct region_cache
*bpc
= bidi_paragraph_cache_on_off ();
1418 ptrdiff_t n
= 0, oldpos
= pos
, next
;
1419 struct buffer
*cache_buffer
= current_buffer
;
1421 if (cache_buffer
->base_buffer
)
1422 cache_buffer
= cache_buffer
->base_buffer
;
1424 while (pos_byte
> BEGV_BYTE
1425 && n
++ < MAX_PARAGRAPH_SEARCH
1426 && fast_looking_at (re
, pos
, pos_byte
, limit
, limit_byte
, Qnil
) < 0)
1428 /* FIXME: What if the paragraph beginning is covered by a
1429 display string? And what if a display string covering some
1430 of the text over which we scan back includes
1431 paragraph_start_re? */
1432 DEC_BOTH (pos
, pos_byte
);
1433 if (bpc
&& region_cache_backward (cache_buffer
, bpc
, pos
, &next
))
1435 pos
= next
, pos_byte
= CHAR_TO_BYTE (pos
);
1439 pos
= find_newline_no_quit (pos
, pos_byte
, -1, &pos_byte
);
1441 if (n
>= MAX_PARAGRAPH_SEARCH
)
1442 pos
= BEGV
, pos_byte
= BEGV_BYTE
;
1444 know_region_cache (cache_buffer
, bpc
, pos
, oldpos
);
1445 /* Positions returned by the region cache are not limited to
1446 BEGV..ZV range, so we limit them here. */
1447 pos_byte
= clip_to_bounds (BEGV_BYTE
, pos_byte
, ZV_BYTE
);
1451 /* On a 3.4 GHz machine, searching forward for a strong directional
1452 character in a long paragraph full of weaks or neutrals takes about
1453 1 ms for each 20K characters. The number below limits each call to
1454 bidi_paragraph_init to less than 10 ms even on slow machines. */
1455 #define MAX_STRONG_CHAR_SEARCH 100000
1457 /* Starting from POS, find the first strong (L, R, or AL) character,
1458 while skipping over any characters between an isolate initiator and
1459 its matching PDI. STOP_AT_PDI non-zero means stop at the PDI that
1460 matches the isolate initiator at POS. Return the bidi type of the
1461 character where the search stopped. Give up if after examining
1462 MAX_STRONG_CHAR_SEARCH buffer or string positions no strong
1463 character was found. */
1465 find_first_strong_char (ptrdiff_t pos
, ptrdiff_t bytepos
, ptrdiff_t end
,
1466 ptrdiff_t *disp_pos
, int *disp_prop
,
1467 struct bidi_string_data
*string
, struct window
*w
,
1468 bool string_p
, bool frame_window_p
,
1469 ptrdiff_t *ch_len
, ptrdiff_t *nchars
, bool stop_at_pdi
)
1477 /* If STOP_AT_PDI is non-zero, we must have been called with FSI
1478 at POS. Get past it. */
1479 #ifdef ENABLE_CHECKING
1480 ch
= bidi_fetch_char (pos
, bytepos
, disp_pos
, disp_prop
, string
, w
,
1481 frame_window_p
, ch_len
, nchars
);
1482 type
= bidi_get_type (ch
, NEUTRAL_DIR
);
1483 eassert (type
== FSI
/* || type == LRI || type == RLI */);
1488 ch
= bidi_fetch_char_skip_isolates (pos
, bytepos
, disp_pos
, disp_prop
, string
,
1489 w
, frame_window_p
, ch_len
, nchars
);
1490 type
= bidi_get_type (ch
, NEUTRAL_DIR
);
1493 for (pos
+= *nchars
, bytepos
+= *ch_len
;
1494 bidi_get_category (type
) != STRONG
1495 /* If requested to stop at first PDI, stop there. */
1496 && !(stop_at_pdi
&& type
== PDI
)
1497 /* Stop when searched too far into an abnormally large
1498 paragraph full of weak or neutral characters. */
1499 && pos
- pos1
< MAX_STRONG_CHAR_SEARCH
;
1500 type
= bidi_get_type (ch
, NEUTRAL_DIR
))
1504 /* Pretend there's a paragraph separator at end of
1510 && type
== NEUTRAL_B
1511 && bidi_at_paragraph_end (pos
, bytepos
) >= -1)
1513 /* Fetch next character and advance to get past it. */
1514 ch
= bidi_fetch_char_skip_isolates (pos
, bytepos
, disp_pos
, disp_prop
,
1515 string
, w
, frame_window_p
,
1523 /* Determine the base direction, a.k.a. base embedding level, of the
1524 paragraph we are about to iterate through. If DIR is either L2R or
1525 R2L, just use that. Otherwise, determine the paragraph direction
1526 from the first strong directional character of the paragraph.
1528 NO_DEFAULT_P means don't default to L2R if the paragraph
1529 has no strong directional characters and both DIR and
1530 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1531 in the buffer until a paragraph is found with a strong character,
1532 or until hitting BEGV. In the latter case, fall back to L2R. This
1533 flag is used in current-bidi-paragraph-direction.
1535 Note that this function gives the paragraph separator the same
1536 direction as the preceding paragraph, even though Emacs generally
1537 views the separator as not belonging to any paragraph. */
1539 bidi_paragraph_init (bidi_dir_t dir
, struct bidi_it
*bidi_it
, bool no_default_p
)
1541 ptrdiff_t bytepos
= bidi_it
->bytepos
;
1542 bool string_p
= bidi_it
->string
.s
|| STRINGP (bidi_it
->string
.lstring
);
1543 ptrdiff_t pstartbyte
;
1544 /* Note that begbyte is a byte position, while end is a character
1545 position. Yes, this is ugly, but we are trying to avoid costly
1546 calls to BYTE_TO_CHAR and its ilk. */
1547 ptrdiff_t begbyte
= string_p
? 0 : BEGV_BYTE
;
1548 ptrdiff_t end
= string_p
? bidi_it
->string
.schars
: ZV
;
1550 /* Special case for an empty buffer. */
1551 if (bytepos
== begbyte
&& bidi_it
->charpos
== end
)
1553 /* We should never be called at EOB or before BEGV. */
1554 else if (bidi_it
->charpos
>= end
|| bytepos
< begbyte
)
1559 bidi_it
->paragraph_dir
= L2R
;
1560 bidi_it
->new_paragraph
= 0;
1562 else if (dir
== R2L
)
1564 bidi_it
->paragraph_dir
= R2L
;
1565 bidi_it
->new_paragraph
= 0;
1567 else if (dir
== NEUTRAL_DIR
) /* P2 */
1569 ptrdiff_t ch_len
, nchars
;
1570 ptrdiff_t pos
, disp_pos
= -1;
1573 const unsigned char *s
;
1575 if (!bidi_initialized
)
1578 /* If we are inside a paragraph separator, we are just waiting
1579 for the separator to be exhausted; use the previous paragraph
1580 direction. But don't do that if we have been just reseated,
1581 because we need to reinitialize below in that case. */
1582 if (!bidi_it
->first_elt
1583 && bidi_it
->charpos
< bidi_it
->separator_limit
)
1586 /* If we are on a newline, get past it to where the next
1587 paragraph might start. But don't do that at BEGV since then
1588 we are potentially in a new paragraph that doesn't yet
1590 pos
= bidi_it
->charpos
;
1591 s
= (STRINGP (bidi_it
->string
.lstring
)
1592 ? SDATA (bidi_it
->string
.lstring
)
1593 : bidi_it
->string
.s
);
1594 if (bytepos
> begbyte
1595 && bidi_char_at_pos (bytepos
, s
, bidi_it
->string
.unibyte
) == '\n')
1601 /* We are either at the beginning of a paragraph or in the
1602 middle of it. Find where this paragraph starts. */
1605 /* We don't support changes of paragraph direction inside a
1606 string. It is treated as a single paragraph. */
1610 pstartbyte
= bidi_find_paragraph_start (pos
, bytepos
);
1611 bidi_it
->separator_limit
= -1;
1612 bidi_it
->new_paragraph
= 0;
1614 /* The following loop is run more than once only if NO_DEFAULT_P,
1615 and only if we are iterating on a buffer. */
1617 bytepos
= pstartbyte
;
1619 pos
= BYTE_TO_CHAR (bytepos
);
1620 type
= find_first_strong_char (pos
, bytepos
, end
, &disp_pos
, &disp_prop
,
1621 &bidi_it
->string
, bidi_it
->w
,
1622 string_p
, bidi_it
->frame_window_p
,
1623 &ch_len
, &nchars
, false);
1624 if (type
== STRONG_R
|| type
== STRONG_AL
) /* P3 */
1625 bidi_it
->paragraph_dir
= R2L
;
1626 else if (type
== STRONG_L
)
1627 bidi_it
->paragraph_dir
= L2R
;
1629 && no_default_p
&& bidi_it
->paragraph_dir
== NEUTRAL_DIR
)
1631 /* If this paragraph is at BEGV, default to L2R. */
1632 if (pstartbyte
== BEGV_BYTE
)
1633 bidi_it
->paragraph_dir
= L2R
; /* P3 and HL1 */
1636 ptrdiff_t prevpbyte
= pstartbyte
;
1637 ptrdiff_t p
= BYTE_TO_CHAR (pstartbyte
), pbyte
= pstartbyte
;
1639 /* Find the beginning of the previous paragraph, if any. */
1640 while (pbyte
> BEGV_BYTE
&& prevpbyte
>= pstartbyte
)
1642 /* FXIME: What if p is covered by a display
1643 string? See also a FIXME inside
1644 bidi_find_paragraph_start. */
1645 DEC_BOTH (p
, pbyte
);
1646 prevpbyte
= bidi_find_paragraph_start (p
, pbyte
);
1648 pstartbyte
= prevpbyte
;
1652 && no_default_p
&& bidi_it
->paragraph_dir
== NEUTRAL_DIR
);
1657 /* Contrary to UAX#9 clause P3, we only default the paragraph
1658 direction to L2R if we have no previous usable paragraph
1659 direction. This is allowed by the HL1 clause. */
1660 if (bidi_it
->paragraph_dir
!= L2R
&& bidi_it
->paragraph_dir
!= R2L
)
1661 bidi_it
->paragraph_dir
= L2R
; /* P3 and HL1 ``higher-level protocols'' */
1662 if (bidi_it
->paragraph_dir
== R2L
)
1663 bidi_it
->level_stack
[0].level
= 1;
1665 bidi_it
->level_stack
[0].level
= 0;
1667 bidi_line_init (bidi_it
);
1671 /***********************************************************************
1672 Resolving explicit and implicit levels.
1673 The rest of this file constitutes the core of the UBA implementation.
1674 ***********************************************************************/
1677 bidi_explicit_dir_char (int ch
)
1679 bidi_type_t ch_type
;
1681 if (!bidi_initialized
)
1683 ch_type
= (bidi_type_t
) XINT (CHAR_TABLE_REF (bidi_type_table
, ch
));
1684 return (ch_type
== LRE
|| ch_type
== LRO
1685 || ch_type
== RLE
|| ch_type
== RLO
1689 /* Given an iterator state in BIDI_IT, advance one character position
1690 in the buffer/string to the next character (in the logical order),
1691 resolve any explicit embeddings, directional overrides, and isolate
1692 initiators and terminators, and return the embedding level of the
1693 character after resolving these explicit directives. */
1695 bidi_resolve_explicit (struct bidi_it
*bidi_it
)
1698 bidi_type_t type
, typ1
, prev_type
= UNKNOWN_BT
;
1701 bidi_dir_t override
;
1702 bool isolate_status
;
1703 bool string_p
= bidi_it
->string
.s
|| STRINGP (bidi_it
->string
.lstring
);
1704 ptrdiff_t ch_len
, nchars
, disp_pos
, end
;
1707 /* If reseat()'ed, don't advance, so as to start iteration from the
1708 position where we were reseated. bidi_it->bytepos can be less
1709 than BEGV_BYTE after reseat to BEGV. */
1710 if (bidi_it
->bytepos
< (string_p
? 0 : BEGV_BYTE
)
1711 || bidi_it
->first_elt
)
1713 bidi_it
->first_elt
= 0;
1716 const unsigned char *p
1717 = (STRINGP (bidi_it
->string
.lstring
)
1718 ? SDATA (bidi_it
->string
.lstring
)
1719 : bidi_it
->string
.s
);
1721 if (bidi_it
->charpos
< 0)
1722 bidi_it
->charpos
= bidi_it
->bytepos
= 0;
1723 eassert (bidi_it
->bytepos
== bidi_count_bytes (p
, 0, 0,
1725 bidi_it
->string
.unibyte
));
1729 if (bidi_it
->charpos
< BEGV
)
1731 bidi_it
->charpos
= BEGV
;
1732 bidi_it
->bytepos
= BEGV_BYTE
;
1734 eassert (bidi_it
->bytepos
== CHAR_TO_BYTE (bidi_it
->charpos
));
1736 /* Determine the orginal bidi type of the previous character,
1737 which is needed for handling isolate initiators and PDF. The
1738 type of the previous character will only be non-trivial if
1739 our caller moved through some previous text in
1740 get_visually_first_element, in which case bidi_it->prev holds
1741 the information we want. */
1742 if (bidi_it
->first_elt
&& bidi_it
->prev
.type
!= UNKNOWN_BT
)
1744 eassert (bidi_it
->prev
.charpos
== bidi_it
->charpos
- 1);
1745 prev_type
= bidi_it
->prev
.orig_type
;
1746 if (prev_type
== FSI
)
1747 prev_type
= bidi_it
->type_after_w1
;
1750 /* Don't move at end of buffer/string. */
1751 else if (bidi_it
->charpos
< (string_p
? bidi_it
->string
.schars
: ZV
))
1753 /* Advance to the next character, skipping characters covered by
1754 display strings (nchars > 1). */
1755 if (bidi_it
->nchars
<= 0)
1757 bidi_it
->charpos
+= bidi_it
->nchars
;
1758 if (bidi_it
->ch_len
== 0)
1760 bidi_it
->bytepos
+= bidi_it
->ch_len
;
1761 prev_type
= bidi_it
->orig_type
;
1762 if (prev_type
== FSI
)
1763 prev_type
= bidi_it
->type_after_w1
;
1765 else /* EOB or end of string */
1766 prev_type
= NEUTRAL_B
;
1768 current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
; /* X1 */
1769 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
1770 isolate_status
= bidi_it
->level_stack
[bidi_it
->stack_idx
].isolate_status
;
1771 new_level
= current_level
;
1773 if (bidi_it
->charpos
>= (string_p
? bidi_it
->string
.schars
: ZV
))
1776 bidi_it
->ch_len
= 1;
1777 bidi_it
->nchars
= 1;
1778 bidi_it
->disp_pos
= (string_p
? bidi_it
->string
.schars
: ZV
);
1779 bidi_it
->disp_prop
= 0;
1783 /* LRI, RLI, and FSI increment, and PDF decrements, the
1784 embedding level of the _following_ characters, so we must
1785 first look at the type of the previous character to support
1790 if (current_level
< BIDI_MAXDEPTH
1791 && bidi_it
->invalid_levels
== 0
1792 && bidi_it
->invalid_isolates
== 0)
1794 new_level
= ((current_level
+ 1) & ~1) + 1;
1795 bidi_it
->isolate_level
++;
1796 bidi_push_embedding_level (bidi_it
, new_level
,
1800 bidi_it
->invalid_isolates
++;
1803 if (current_level
< BIDI_MAXDEPTH
- 1
1804 && bidi_it
->invalid_levels
== 0
1805 && bidi_it
->invalid_isolates
== 0)
1807 new_level
= ((current_level
+ 2) & ~1);
1808 bidi_it
->isolate_level
++;
1809 bidi_push_embedding_level (bidi_it
, new_level
,
1813 bidi_it
->invalid_isolates
++;
1816 if (!bidi_it
->invalid_isolates
)
1818 if (bidi_it
->invalid_levels
)
1819 bidi_it
->invalid_levels
--;
1820 else if (!isolate_status
&& bidi_it
->stack_idx
>= 1)
1821 new_level
= bidi_pop_embedding_level (bidi_it
);
1825 eassert (prev_type
!= FSI
);
1829 /* Fetch the character at BYTEPOS. If it is covered by a
1830 display string, treat the entire run of covered characters as
1831 a single character u+FFFC. */
1832 curchar
= bidi_fetch_char (bidi_it
->charpos
, bidi_it
->bytepos
,
1833 &bidi_it
->disp_pos
, &bidi_it
->disp_prop
,
1834 &bidi_it
->string
, bidi_it
->w
,
1835 bidi_it
->frame_window_p
,
1836 &bidi_it
->ch_len
, &bidi_it
->nchars
);
1838 bidi_it
->ch
= curchar
;
1839 bidi_it
->resolved_level
= new_level
;
1841 /* Don't apply directional override here, as all the types we handle
1842 below will not be affected by the override anyway, and we need
1843 the original type unaltered. The override will be applied in
1844 bidi_resolve_weak. */
1845 type
= bidi_get_type (curchar
, NEUTRAL_DIR
);
1846 bidi_it
->orig_type
= type
;
1847 bidi_check_type (bidi_it
->orig_type
);
1849 bidi_it
->type_after_w1
= UNKNOWN_BT
;
1855 bidi_it
->type_after_w1
= type
;
1856 bidi_check_type (bidi_it
->type_after_w1
);
1857 type
= WEAK_BN
; /* X9/Retaining */
1858 if (new_level
< BIDI_MAXDEPTH
1859 && bidi_it
->invalid_levels
== 0
1860 && bidi_it
->invalid_isolates
== 0)
1862 /* Compute the least odd embedding level greater than
1863 the current level. */
1864 new_level
= ((new_level
+ 1) & ~1) + 1;
1865 if (bidi_it
->type_after_w1
== RLE
)
1866 override
= NEUTRAL_DIR
;
1869 bidi_push_embedding_level (bidi_it
, new_level
, override
, false);
1870 bidi_it
->resolved_level
= new_level
;
1874 if (bidi_it
->invalid_isolates
== 0)
1875 bidi_it
->invalid_levels
++;
1880 bidi_it
->type_after_w1
= type
;
1881 bidi_check_type (bidi_it
->type_after_w1
);
1882 type
= WEAK_BN
; /* X9/Retaining */
1883 if (new_level
< BIDI_MAXDEPTH
- 1
1884 && bidi_it
->invalid_levels
== 0
1885 && bidi_it
->invalid_isolates
== 0)
1887 /* Compute the least even embedding level greater than
1888 the current level. */
1889 new_level
= ((new_level
+ 2) & ~1);
1890 if (bidi_it
->type_after_w1
== LRE
)
1891 override
= NEUTRAL_DIR
;
1894 bidi_push_embedding_level (bidi_it
, new_level
, override
, false);
1895 bidi_it
->resolved_level
= new_level
;
1899 if (bidi_it
->invalid_isolates
== 0)
1900 bidi_it
->invalid_levels
++;
1904 end
= string_p
? bidi_it
->string
.schars
: ZV
;
1905 disp_pos
= bidi_it
->disp_pos
;
1906 disp_prop
= bidi_it
->disp_prop
;
1907 nchars
= bidi_it
->nchars
;
1908 ch_len
= bidi_it
->ch_len
;
1909 typ1
= find_first_strong_char (bidi_it
->charpos
,
1910 bidi_it
->bytepos
, end
,
1911 &disp_pos
, &disp_prop
,
1912 &bidi_it
->string
, bidi_it
->w
,
1913 string_p
, bidi_it
->frame_window_p
,
1914 &ch_len
, &nchars
, true);
1915 if (typ1
!= STRONG_R
&& typ1
!= STRONG_AL
)
1924 if (override
== NEUTRAL_DIR
)
1925 bidi_it
->type_after_w1
= type
;
1926 else /* Unicode 8.0 correction. */
1927 bidi_it
->type_after_w1
= (override
== L2R
? STRONG_L
: STRONG_R
);
1928 bidi_check_type (bidi_it
->type_after_w1
);
1932 if (override
== NEUTRAL_DIR
)
1933 bidi_it
->type_after_w1
= type
;
1934 else /* Unicode 8.0 correction. */
1935 bidi_it
->type_after_w1
= (override
== L2R
? STRONG_L
: STRONG_R
);
1936 bidi_check_type (bidi_it
->type_after_w1
);
1939 if (bidi_it
->invalid_isolates
)
1940 bidi_it
->invalid_isolates
--;
1941 else if (bidi_it
->isolate_level
> 0)
1943 bidi_it
->invalid_levels
= 0;
1944 while (!bidi_it
->level_stack
[bidi_it
->stack_idx
].isolate_status
)
1945 bidi_pop_embedding_level (bidi_it
);
1946 eassert (bidi_it
->stack_idx
> 0);
1947 new_level
= bidi_pop_embedding_level (bidi_it
);
1948 bidi_it
->isolate_level
--;
1950 bidi_it
->resolved_level
= new_level
;
1951 /* Unicode 8.0 correction. */
1952 if (bidi_it
->level_stack
[bidi_it
->stack_idx
].override
== L2R
)
1953 bidi_it
->type_after_w1
= STRONG_L
;
1954 else if (bidi_it
->level_stack
[bidi_it
->stack_idx
].override
== R2L
)
1955 bidi_it
->type_after_w1
= STRONG_R
;
1957 bidi_it
->type_after_w1
= type
;
1960 bidi_it
->type_after_w1
= type
;
1961 bidi_check_type (bidi_it
->type_after_w1
);
1962 type
= WEAK_BN
; /* X9/Retaining */
1969 bidi_it
->type
= type
;
1970 bidi_check_type (bidi_it
->type
);
1972 if (bidi_it
->type
== NEUTRAL_B
) /* X8 */
1974 bidi_set_paragraph_end (bidi_it
);
1975 /* This is needed by bidi_resolve_weak below, and in L1. */
1976 bidi_it
->type_after_w1
= bidi_it
->type
;
1979 eassert (bidi_it
->resolved_level
>= 0);
1980 return bidi_it
->resolved_level
;
1984 bidi_isolate_fmt_char (bidi_type_t ch_type
)
1986 return (ch_type
== LRI
|| ch_type
== RLI
|| ch_type
== PDI
|| ch_type
== FSI
);
1989 /* Advance in the buffer/string, resolve weak types and return the
1990 type of the next character after weak type resolution. */
1992 bidi_resolve_weak (struct bidi_it
*bidi_it
)
1995 bidi_dir_t override
;
1996 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1997 int new_level
= bidi_resolve_explicit (bidi_it
);
1999 bidi_type_t type_of_next
;
2000 struct bidi_it saved_it
;
2002 = ((STRINGP (bidi_it
->string
.lstring
) || bidi_it
->string
.s
)
2003 ? bidi_it
->string
.schars
: ZV
);
2005 type
= bidi_it
->type
;
2006 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
2008 eassert (!(type
== UNKNOWN_BT
2015 eassert (prev_level
>= 0);
2016 if (bidi_it
->type
== NEUTRAL_B
)
2018 /* We've got a new isolating sequence, compute the directional
2019 type of sos and initialize per-run variables (UAX#9, clause
2021 bidi_set_sos_type (bidi_it
, prev_level
, new_level
);
2023 if (type
== NEUTRAL_S
|| type
== NEUTRAL_WS
2024 || type
== WEAK_BN
|| type
== STRONG_AL
)
2025 bidi_it
->type_after_w1
= type
; /* needed in L1 */
2026 bidi_check_type (bidi_it
->type_after_w1
);
2028 /* Level and directional override status are already recorded in
2029 bidi_it, and do not need any change; see X6. */
2030 if (override
== R2L
) /* X6 */
2032 else if (override
== L2R
)
2036 if (type
== WEAK_NSM
) /* W1 */
2038 /* Note that we don't need to consider the case where the
2039 prev character has its type overridden by an RLO or LRO,
2040 because then either the type of this NSM would have been
2041 also overridden, or the previous character is outside the
2042 current level run, and thus not relevant to this NSM.
2043 This is why NSM gets the type_after_w1 of the previous
2045 /* bidi_set_sos_type sets type_after_w1 to UNKNOWN_BT. */
2046 if (bidi_it
->prev
.type_after_w1
!= UNKNOWN_BT
2047 /* If type_after_w1 is NEUTRAL_B, this NSM is at sos. */
2048 && bidi_it
->prev
.type_after_w1
!= NEUTRAL_B
)
2050 if (bidi_isolate_fmt_char (bidi_it
->prev
.type_after_w1
))
2052 /* From W1: "Note that in an isolating run sequence,
2053 an isolate initiator followed by an NSM or any
2054 type other than PDI must be an overflow isolate
2056 eassert (bidi_it
->invalid_isolates
> 0);
2061 type
= bidi_it
->prev
.type_after_w1
;
2062 /* Unicode 8.0 correction for N0. */
2063 if (type
== NEUTRAL_ON
2064 && bidi_it
->prev
.bracket_resolved
2065 && (bidi_it
->prev
.type
== STRONG_L
2066 || bidi_it
->prev
.type
== STRONG_R
))
2067 type
= bidi_it
->prev
.type
;
2070 else if (bidi_it
->sos
== R2L
)
2072 else if (bidi_it
->sos
== L2R
)
2074 else /* shouldn't happen! */
2077 if (type
== WEAK_EN
/* W2 */
2078 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)
2080 else if (type
== STRONG_AL
) /* W3 */
2082 else if ((type
== WEAK_ES
/* W4 */
2083 && bidi_it
->prev
.type_after_w1
== WEAK_EN
2084 && bidi_it
->prev
.orig_type
== WEAK_EN
)
2086 && ((bidi_it
->prev
.type_after_w1
== WEAK_EN
2087 && bidi_it
->prev
.orig_type
== WEAK_EN
)
2088 || bidi_it
->prev
.type_after_w1
== WEAK_AN
)))
2090 const unsigned char *s
2091 = (STRINGP (bidi_it
->string
.lstring
)
2092 ? SDATA (bidi_it
->string
.lstring
)
2093 : bidi_it
->string
.s
);
2095 next_char
= (bidi_it
->charpos
+ bidi_it
->nchars
>= eob
2097 : bidi_char_at_pos (bidi_it
->bytepos
+ bidi_it
->ch_len
,
2098 s
, bidi_it
->string
.unibyte
));
2099 type_of_next
= bidi_get_type (next_char
, override
);
2101 if (type_of_next
== WEAK_BN
2102 || bidi_explicit_dir_char (next_char
))
2104 bidi_copy_it (&saved_it
, bidi_it
);
2105 while (bidi_resolve_explicit (bidi_it
) == new_level
2106 && bidi_it
->type
== WEAK_BN
)
2107 type_of_next
= bidi_it
->type
;
2108 bidi_copy_it (bidi_it
, &saved_it
);
2111 /* If the next character is EN, but the last strong-type
2112 character is AL, that next EN will be changed to AN when
2113 we process it in W2 above. So in that case, this ES
2114 should not be changed into EN. */
2116 && type_of_next
== WEAK_EN
2117 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
2119 else if (type
== WEAK_CS
)
2121 if (bidi_it
->prev
.type_after_w1
== WEAK_AN
2122 && (type_of_next
== WEAK_AN
2123 /* If the next character is EN, but the last
2124 strong-type character is AL, EN will be later
2125 changed to AN when we process it in W2 above.
2126 So in that case, this ES should not be
2128 || (type_of_next
== WEAK_EN
2129 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)))
2131 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
2132 && type_of_next
== WEAK_EN
2133 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
2137 else if (type
== WEAK_ET
/* W5: ET with EN before or after it */
2138 || type
== WEAK_BN
) /* W5/Retaining */
2140 if (bidi_it
->prev
.type_after_w1
== WEAK_EN
) /* ET/BN w/EN before it */
2142 else if (bidi_it
->next_en_pos
> bidi_it
->charpos
2143 && bidi_it
->next_en_type
!= WEAK_BN
)
2145 if (bidi_it
->next_en_type
== WEAK_EN
) /* ET/BN with EN after it */
2148 else if (bidi_it
->next_en_pos
>=0)
2150 /* We overstepped the last known position for ET
2151 resolution but there could be other such characters
2152 in this paragraph (when we are sure there are no more
2153 such positions, we set next_en_pos to a negative
2154 value). Try to find the next position for ET
2156 ptrdiff_t en_pos
= bidi_it
->charpos
+ bidi_it
->nchars
;
2157 const unsigned char *s
= (STRINGP (bidi_it
->string
.lstring
)
2158 ? SDATA (bidi_it
->string
.lstring
)
2159 : bidi_it
->string
.s
);
2161 if (bidi_it
->nchars
<= 0)
2164 = (bidi_it
->charpos
+ bidi_it
->nchars
>= eob
2166 : bidi_char_at_pos (bidi_it
->bytepos
+ bidi_it
->ch_len
, s
,
2167 bidi_it
->string
.unibyte
));
2168 type_of_next
= bidi_get_type (next_char
, override
);
2170 if (type_of_next
== WEAK_ET
2171 || type_of_next
== WEAK_BN
2172 || bidi_explicit_dir_char (next_char
))
2174 bidi_copy_it (&saved_it
, bidi_it
);
2175 while (bidi_resolve_explicit (bidi_it
) == new_level
2176 && (bidi_it
->type
== WEAK_BN
2177 || bidi_it
->type
== WEAK_ET
))
2178 type_of_next
= bidi_it
->type
;
2180 && bidi_it
->charpos
== saved_it
.charpos
+ saved_it
.nchars
)
2182 /* If we entered the above loop with a BN that
2183 changes the level, the type of next
2184 character, which is in a different level, is
2185 not relevant to resolving this series of ET
2187 en_pos
= saved_it
.charpos
;
2188 type_of_next
= type
;
2191 en_pos
= bidi_it
->charpos
;
2192 bidi_copy_it (bidi_it
, &saved_it
);
2194 /* Remember this position, to speed up processing of the
2196 bidi_it
->next_en_pos
= en_pos
;
2197 if (type_of_next
== WEAK_EN
)
2199 /* If the last strong character is AL, the EN we've
2200 found will become AN when we get to it (W2). */
2201 if (bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)
2202 type_of_next
= WEAK_AN
;
2203 else if (type
== WEAK_BN
)
2204 type
= NEUTRAL_ON
; /* W6/Retaining */
2208 else if (type_of_next
== NEUTRAL_B
)
2209 /* Record the fact that there are no more ENs from
2210 here to the end of paragraph, to avoid entering the
2211 loop above ever again in this paragraph. */
2212 bidi_it
->next_en_pos
= -1;
2213 /* Record the type of the character where we ended our search. */
2214 bidi_it
->next_en_type
= type_of_next
;
2219 if (type
== WEAK_ES
|| type
== WEAK_ET
|| type
== WEAK_CS
/* W6 */
2221 && (bidi_it
->prev
.type_after_w1
== WEAK_CS
/* W6/Retaining */
2222 || bidi_it
->prev
.type_after_w1
== WEAK_ES
2223 || bidi_it
->prev
.type_after_w1
== WEAK_ET
)))
2226 /* Store the type we've got so far, before we clobber it with strong
2227 types in W7 and while resolving neutral types. But leave alone
2228 the original types that were recorded above, because we will need
2229 them for the L1 clause. */
2230 if (bidi_it
->type_after_w1
== UNKNOWN_BT
)
2231 bidi_it
->type_after_w1
= type
;
2232 bidi_check_type (bidi_it
->type_after_w1
);
2234 if (type
== WEAK_EN
) /* W7 */
2236 if ((bidi_it
->last_strong
.type_after_w1
== STRONG_L
)
2237 || (bidi_it
->last_strong
.type
== UNKNOWN_BT
&& bidi_it
->sos
== L2R
))
2241 bidi_it
->type
= type
;
2242 bidi_check_type (bidi_it
->type
);
2246 /* Resolve the type of a neutral character according to the type of
2247 surrounding strong text and the current embedding level. */
2249 bidi_resolve_neutral_1 (bidi_type_t prev_type
, bidi_type_t next_type
, int lev
)
2251 /* N1: "European and Arabic numbers act as if they were R in terms
2252 of their influence on NIs." */
2253 if (next_type
== WEAK_EN
|| next_type
== WEAK_AN
)
2254 next_type
= STRONG_R
;
2255 if (prev_type
== WEAK_EN
|| prev_type
== WEAK_AN
)
2256 prev_type
= STRONG_R
;
2258 if (next_type
== prev_type
) /* N1 */
2260 else if ((lev
& 1) == 0) /* N2 */
2266 static bidi_bracket_type_t
2267 bidi_paired_bracket_type (int c
)
2270 return BIDI_BRACKET_NONE
;
2271 if (c
< 0 || c
> MAX_CHAR
)
2274 return (bidi_bracket_type_t
) XINT (CHAR_TABLE_REF (bidi_brackets_table
, c
));
2277 #define FLAG_EMBEDDING_INSIDE 1
2278 #define FLAG_OPPOSITE_INSIDE 2
2279 #define FLAG_EMBEDDING_OUTSIDE 4
2280 #define FLAG_OPPOSITE_OUTSIDE 8
2282 /* A data type used in the stack maintained by
2283 bidi_resolve_bracket_pairs below. */
2284 typedef struct bpa_stack_entry
{
2285 int close_bracket_char
;
2286 int open_bracket_idx
;
2287 #ifdef ENABLE_CHECKING
2288 ptrdiff_t open_bracket_pos
;
2293 /* With MAX_ALLOCA of 16KB, this should allow at least 1K slots in the
2294 BPA stack, which should be more than enough for actual bidi text. */
2295 #define MAX_BPA_STACK (max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
2297 #ifdef ENABLE_CHECKING
2298 # define STORE_BRACKET_CHARPOS \
2299 bpa_stack[bpa_sp].open_bracket_pos = bidi_it->charpos
2301 # define STORE_BRACKET_CHARPOS /* nothing */
2304 #define PUSH_BPA_STACK(EMBEDDING_LEVEL, LAST_STRONG) \
2307 if (bpa_sp >= MAX_BPA_STACK) \
2309 bpa_stack[bpa_sp].close_bracket_char = bidi_mirror_char (bidi_it->ch); \
2310 bpa_stack[bpa_sp].open_bracket_idx = bidi_cache_last_idx; \
2311 STORE_BRACKET_CHARPOS; \
2312 if (((EMBEDDING_LEVEL) & 1) == 0) \
2313 bpa_stack[bpa_sp].flags = ((LAST_STRONG) == STRONG_L \
2314 ? FLAG_EMBEDDING_OUTSIDE \
2315 : FLAG_OPPOSITE_OUTSIDE); \
2317 bpa_stack[bpa_sp].flags = ((LAST_STRONG) == STRONG_L \
2318 ? FLAG_OPPOSITE_OUTSIDE \
2319 : FLAG_EMBEDDING_OUTSIDE); \
2323 /* This function implements BPA, the Bidi Parenthesis Algorithm,
2324 described in BD16 and N0 of UAX#9. */
2326 bidi_resolve_bracket_pairs (struct bidi_it
*bidi_it
)
2328 bidi_bracket_type_t btype
;
2329 bidi_type_t type
= bidi_it
->type
;
2331 /* When scanning backwards, we don't expect any unresolved bidi
2332 bracket characters. */
2333 if (bidi_it
->scan_dir
!= 1)
2336 btype
= bidi_paired_bracket_type (bidi_it
->ch
);
2337 if (btype
== BIDI_BRACKET_OPEN
)
2339 bpa_stack_entry bpa_stack
[MAX_BPA_STACK
];
2341 struct bidi_it saved_it
;
2342 bidi_type_t last_strong
;
2343 int embedding_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2345 eassert (MAX_BPA_STACK
>= 100);
2346 bidi_copy_it (&saved_it
, bidi_it
);
2347 last_strong
= bidi_it
->prev_for_neutral
.type
;
2351 int old_sidx
, new_sidx
;
2352 int current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2354 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
2355 if (btype
== BIDI_BRACKET_OPEN
)
2356 PUSH_BPA_STACK (embedding_level
, last_strong
);
2357 else if (btype
== BIDI_BRACKET_CLOSE
)
2360 int curchar
= bidi_it
->ch
;
2363 while (sp
>= 0 && bpa_stack
[sp
].close_bracket_char
!= curchar
)
2367 /* Resolve the bracket type according to N0. */
2368 if (bpa_stack
[sp
].flags
& FLAG_EMBEDDING_INSIDE
) /* N0b */
2369 type
= ((embedding_level
& 1) ? STRONG_R
: STRONG_L
);
2370 else if ((bpa_stack
[sp
].flags
/* N0c1 */
2371 & (FLAG_OPPOSITE_INSIDE
| FLAG_OPPOSITE_OUTSIDE
))
2372 == (FLAG_OPPOSITE_INSIDE
| FLAG_OPPOSITE_OUTSIDE
))
2373 type
= ((embedding_level
& 1) ? STRONG_L
: STRONG_R
);
2374 else if (bpa_stack
[sp
].flags
& FLAG_OPPOSITE_INSIDE
) /* N0c2 */
2375 type
= ((embedding_level
& 1) ? STRONG_R
: STRONG_L
);
2377 /* Update and cache the closing bracket. */
2378 bidi_it
->type
= type
;
2379 bidi_it
->bracket_resolved
= 1;
2380 bidi_cache_iterator_state (bidi_it
, 0);
2381 /* Update and cache the corresponding opening bracket. */
2382 bidi_cache_fetch_state (bpa_stack
[sp
].open_bracket_idx
,
2384 #ifdef ENABLE_CHECKING
2385 eassert (bpa_stack
[sp
].open_bracket_pos
== bidi_it
->charpos
);
2387 bidi_it
->type
= type
;
2388 bidi_it
->bracket_resolved
= 1;
2389 bidi_cache_iterator_state (bidi_it
, 0);
2395 bidi_it
->bracket_resolved
= 1;
2397 else if (bidi_get_category (bidi_it
->type_after_w1
) != NEUTRAL
)
2402 /* Update the "inside" flags of all the slots on the stack. */
2403 switch (bidi_it
->type
)
2406 flag
= ((embedding_level
& 1) == 0
2407 ? FLAG_EMBEDDING_INSIDE
2408 : FLAG_OPPOSITE_INSIDE
);
2409 last_strong
= STRONG_L
;
2414 flag
= ((embedding_level
& 1) == 1
2415 ? FLAG_EMBEDDING_INSIDE
2416 : FLAG_OPPOSITE_INSIDE
);
2417 last_strong
= STRONG_R
;
2422 for (sp
= bpa_sp
; sp
>= 0; sp
--)
2423 bpa_stack
[sp
].flags
|= flag
;
2425 /* Record the info about the previous character, so that it
2426 will be cached with this state. */
2427 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
2428 && bidi_it
->type
!= WEAK_BN
)
2429 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
2430 old_sidx
= bidi_it
->stack_idx
;
2431 type
= bidi_resolve_weak (bidi_it
);
2432 /* Skip level runs excluded from this isolating run sequence. */
2433 new_sidx
= bidi_it
->stack_idx
;
2434 if (bidi_it
->level_stack
[new_sidx
].level
> current_level
2435 && (bidi_it
->level_stack
[new_sidx
].isolate_status
2436 || (new_sidx
> old_sidx
+ 1
2437 && bidi_it
->level_stack
[new_sidx
- 1].isolate_status
)))
2439 while (bidi_it
->level_stack
[bidi_it
->stack_idx
].level
2442 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
2443 type
= bidi_resolve_weak (bidi_it
);
2446 if (type
== NEUTRAL_B
2447 || (bidi_it
->level_stack
[bidi_it
->stack_idx
].level
2451 /* We've marched all the way to the end of this
2452 isolating run sequence, and didn't find matching
2453 closing brackets for some opening brackets. Unwind
2454 whatever is left on the BPA stack, and mark each
2455 bracket there as BPA-resolved. */
2458 bidi_cache_fetch_state (bpa_stack
[bpa_sp
].open_bracket_idx
,
2460 #ifdef ENABLE_CHECKING
2461 eassert (bpa_stack
[bpa_sp
].open_bracket_pos
2462 == bidi_it
->charpos
);
2464 bidi_it
->bracket_resolved
= 1;
2465 bidi_cache_iterator_state (bidi_it
, 0);
2468 type
= saved_it
.type
;
2471 if (bidi_it
->type_after_w1
== NEUTRAL_ON
) /* Unicode 8.0 correction */
2472 btype
= bidi_paired_bracket_type (bidi_it
->ch
);
2474 btype
= BIDI_BRACKET_NONE
;
2476 bidi_check_type (type
);
2478 bidi_copy_it (bidi_it
, &saved_it
);
2479 bidi_it
->type
= type
;
2480 bidi_it
->bracket_resolved
= 1;
2487 bidi_resolve_neutral (struct bidi_it
*bidi_it
)
2489 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2490 bidi_type_t type
= bidi_resolve_weak (bidi_it
);
2491 int current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2493 int ch
= bidi_it
->ch
;
2495 eassert (type
== STRONG_R
2500 || type
== NEUTRAL_B
2501 || type
== NEUTRAL_S
2502 || type
== NEUTRAL_WS
2503 || type
== NEUTRAL_ON
2508 eassert (prev_level
>= 0);
2509 eassert (current_level
>= 0);
2511 /* FIXME: Insert the code for N0 here. */
2512 if (type
== NEUTRAL_ON
2513 && bidi_paired_bracket_type (ch
) != BIDI_BRACKET_NONE
)
2515 if (bidi_cache_idx
> bidi_cache_start
2516 && bidi_cache_find (bidi_it
->charpos
, -1, bidi_it
) != UNKNOWN_BT
2517 && bidi_it
->bracket_resolved
)
2518 type
= bidi_it
->type
;
2520 type
= bidi_resolve_bracket_pairs (bidi_it
);
2523 is_neutral
= bidi_get_category (type
) == NEUTRAL
;
2525 if ((type
!= NEUTRAL_B
/* Don't risk entering the long loop below if
2526 we are already at paragraph end. */
2527 && (is_neutral
|| bidi_isolate_fmt_char (type
)))
2528 /* N1-N2/Retaining */
2529 || (type
== WEAK_BN
&& bidi_explicit_dir_char (bidi_it
->ch
)))
2531 if (bidi_it
->next_for_neutral
.type
!= UNKNOWN_BT
)
2532 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
2533 bidi_it
->next_for_neutral
.type
,
2535 /* The next two "else if" clauses are shortcuts for the
2536 important special case when we have a long sequence of
2537 neutral or WEAK_BN characters, such as whitespace or nulls or
2538 other control characters, on the base embedding level of the
2539 paragraph, and that sequence goes all the way to the end of
2540 the paragraph and follows a character whose resolved
2541 directionality is identical to the base embedding level.
2542 (This is what happens in a buffer with plain L2R text that
2543 happens to include long sequences of control characters.) By
2544 virtue of N1, the result of examining this long sequence will
2545 always be either STRONG_L or STRONG_R, depending on the base
2546 embedding level. So we use this fact directly instead of
2547 entering the expensive loop in the "else" clause. */
2548 else if (current_level
== 0
2549 && bidi_it
->prev_for_neutral
.type
== STRONG_L
2550 && !bidi_explicit_dir_char (bidi_it
->ch
)
2551 && !bidi_isolate_fmt_char (type
))
2552 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
2553 STRONG_L
, current_level
);
2554 else if (/* current level is 1 */
2556 /* base embedding level is also 1 */
2557 && bidi_it
->level_stack
[0].level
== 1
2558 /* previous character is one of those considered R for
2559 the purposes of W5 */
2560 && (bidi_it
->prev_for_neutral
.type
== STRONG_R
2561 || bidi_it
->prev_for_neutral
.type
== WEAK_EN
2562 || bidi_it
->prev_for_neutral
.type
== WEAK_AN
)
2563 && !bidi_explicit_dir_char (bidi_it
->ch
)
2564 && !bidi_isolate_fmt_char (type
))
2565 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
2566 STRONG_R
, current_level
);
2569 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
2570 the assumption of batch-style processing; see clauses W4,
2571 W5, and especially N1, which require to look far forward
2572 (as well as back) in the buffer/string. May the fleas of
2573 a thousand camels infest the armpits of those who design
2574 supposedly general-purpose algorithms by looking at their
2575 own implementations, and fail to consider other possible
2577 struct bidi_it saved_it
;
2578 bidi_type_t next_type
;
2579 bool adjacent_to_neutrals
= is_neutral
;
2581 if (bidi_it
->scan_dir
== -1)
2584 bidi_copy_it (&saved_it
, bidi_it
);
2585 /* Scan the text forward until we find the first non-neutral
2586 character, and then use that to resolve the neutral we
2587 are dealing with now. We also cache the scanned iterator
2588 states, to salvage some of the effort later. */
2590 int old_sidx
, new_sidx
;
2592 /* Paragraph separators have their levels fully resolved
2593 at this point, so cache them as resolved. */
2594 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
2595 /* Record the info about the previous character, so that
2596 it will be cached with this state. */
2597 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
2598 && bidi_it
->type
!= WEAK_BN
)
2599 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
2600 old_sidx
= bidi_it
->stack_idx
;
2601 type
= bidi_resolve_weak (bidi_it
);
2602 /* Skip level runs excluded from this isolating run sequence. */
2603 new_sidx
= bidi_it
->stack_idx
;
2604 if (bidi_it
->level_stack
[new_sidx
].level
> current_level
2605 && (bidi_it
->level_stack
[new_sidx
].isolate_status
2606 /* This is for when we have an isolate initiator
2607 immediately followed by an embedding or
2608 override initiator, in which case we get the
2609 level stack pushed twice by the single call to
2610 bidi_resolve_weak above. */
2611 || (new_sidx
> old_sidx
+ 1
2612 && bidi_it
->level_stack
[new_sidx
- 1].isolate_status
)))
2614 while (bidi_it
->level_stack
[bidi_it
->stack_idx
].level
2617 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
2618 type
= bidi_resolve_weak (bidi_it
);
2621 if (!adjacent_to_neutrals
2622 && (bidi_get_category (type
) == NEUTRAL
2623 || bidi_isolate_fmt_char (type
)))
2624 adjacent_to_neutrals
= true;
2625 } while (!(type
== NEUTRAL_B
2627 && bidi_get_category (type
) != NEUTRAL
2628 && !bidi_isolate_fmt_char (type
))
2629 /* This is all per level run, so stop when we
2630 reach the end of this level run. */
2631 || (bidi_it
->level_stack
[bidi_it
->stack_idx
].level
2632 != current_level
)));
2634 /* Record the character we stopped at. */
2635 bidi_remember_char (&saved_it
.next_for_neutral
, bidi_it
);
2637 if ((bidi_it
->level_stack
[bidi_it
->stack_idx
].level
!= current_level
)
2638 || type
== NEUTRAL_B
)
2640 /* Marched all the way to the end of this level run. We
2641 need to use the eos type, whose information is stored
2642 by bidi_set_sos_type in the prev_for_neutral
2644 if (adjacent_to_neutrals
)
2645 next_type
= bidi_it
->prev_for_neutral
.type
;
2648 /* This is a BN which does not adjoin neutrals.
2649 Leave its type alone. */
2650 bidi_copy_it (bidi_it
, &saved_it
);
2651 return bidi_it
->type
;
2661 /* Actually, STRONG_AL cannot happen here, because
2662 bidi_resolve_weak converts it to STRONG_R, per W3. */
2663 eassert (type
!= STRONG_AL
);
2668 /* N1: "European and Arabic numbers act as if they
2669 were R in terms of their influence on NIs." */
2670 next_type
= STRONG_R
;
2677 /* Resolve the type of all the NIs found during the above loop. */
2678 type
= bidi_resolve_neutral_1 (saved_it
.prev_for_neutral
.type
,
2679 next_type
, current_level
);
2680 /* Update next_for_neutral with the resolved type, so we
2681 could use it for all the other NIs up to the place where
2682 we exited the loop. */
2683 saved_it
.next_for_neutral
.type
= next_type
;
2684 bidi_check_type (type
);
2685 /* Update the character which caused us to enter the above loop. */
2686 saved_it
.type
= type
;
2687 bidi_check_type (next_type
);
2688 bidi_copy_it (bidi_it
, &saved_it
);
2694 /* Given an iterator state in BIDI_IT, advance one character position
2695 in the buffer/string to the next character (in the logical order),
2696 resolve the bidi type of that next character, and return that
2699 bidi_type_of_next_char (struct bidi_it
*bidi_it
)
2703 /* This should always be called during a forward scan. */
2704 if (bidi_it
->scan_dir
!= 1)
2707 type
= bidi_resolve_neutral (bidi_it
);
2712 /* Given an iterator state BIDI_IT, advance one character position in
2713 the buffer/string to the next character (in the current scan
2714 direction), resolve the embedding and implicit levels of that next
2715 character, and return the resulting level. */
2717 bidi_level_of_next_char (struct bidi_it
*bidi_it
)
2720 int level
, prev_level
= -1;
2721 struct bidi_saved_info next_for_neutral
;
2722 ptrdiff_t next_char_pos
= -2;
2723 bool need_to_update_cache
= false;
2725 if (bidi_it
->scan_dir
== 1)
2728 = ((bidi_it
->string
.s
|| STRINGP (bidi_it
->string
.lstring
))
2729 ? bidi_it
->string
.schars
: ZV
);
2731 /* There's no sense in trying to advance if we hit end of text. */
2732 if (bidi_it
->charpos
>= eob
)
2734 eassert (bidi_it
->resolved_level
>= 0);
2735 return bidi_it
->resolved_level
;
2738 /* Record the info about the previous character. */
2739 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
2740 && bidi_it
->type
!= WEAK_BN
)
2741 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
2742 if (bidi_it
->type_after_w1
== STRONG_R
2743 || bidi_it
->type_after_w1
== STRONG_L
2744 || bidi_it
->type_after_w1
== STRONG_AL
)
2745 bidi_remember_char (&bidi_it
->last_strong
, bidi_it
);
2746 /* FIXME: it sounds like we don't need both prev and
2747 prev_for_neutral members, but I'm leaving them both for now. */
2748 if (bidi_it
->type
== STRONG_R
|| bidi_it
->type
== STRONG_L
2749 || bidi_it
->type
== WEAK_EN
|| bidi_it
->type
== WEAK_AN
)
2750 bidi_remember_char (&bidi_it
->prev_for_neutral
, bidi_it
);
2752 /* If we overstepped the characters used for resolving neutrals
2753 and whitespace, invalidate their info in the iterator. */
2754 if (bidi_it
->charpos
>= bidi_it
->next_for_neutral
.charpos
)
2755 bidi_it
->next_for_neutral
.type
= UNKNOWN_BT
;
2756 if (bidi_it
->next_en_pos
>= 0
2757 && bidi_it
->charpos
>= bidi_it
->next_en_pos
)
2759 bidi_it
->next_en_pos
= 0;
2760 bidi_it
->next_en_type
= UNKNOWN_BT
;
2762 if (bidi_it
->next_for_ws
.type
!= UNKNOWN_BT
2763 && bidi_it
->charpos
>= bidi_it
->next_for_ws
.charpos
)
2764 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
2766 /* This must be taken before we fill the iterator with the info
2767 about the next char. If we scan backwards, the iterator
2768 state must be already cached, so there's no need to know the
2769 embedding level of the previous character, since we will be
2770 returning to our caller shortly. */
2771 prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2772 eassert (prev_level
>= 0);
2774 next_for_neutral
= bidi_it
->next_for_neutral
;
2776 /* Perhaps the character we want is already cached. If it is, the
2777 call to bidi_cache_find below will return a type other than
2779 if (bidi_cache_idx
> bidi_cache_start
&& !bidi_it
->first_elt
)
2781 int bob
= ((bidi_it
->string
.s
|| STRINGP (bidi_it
->string
.lstring
))
2783 bidi_type_t prev_type
= bidi_it
->type
;
2784 bidi_type_t type_for_neutral
= bidi_it
->next_for_neutral
.type
;
2785 ptrdiff_t pos_for_neutral
= bidi_it
->next_for_neutral
.charpos
;
2787 if (bidi_it
->scan_dir
> 0)
2789 if (bidi_it
->nchars
<= 0)
2791 next_char_pos
= bidi_it
->charpos
+ bidi_it
->nchars
;
2793 else if (bidi_it
->charpos
>= bob
)
2794 /* Implementation note: we allow next_char_pos to be as low as
2795 0 for buffers or -1 for strings, and that is okay because
2796 that's the "position" of the sentinel iterator state we
2797 cached at the beginning of the iteration. */
2798 next_char_pos
= bidi_it
->charpos
- 1;
2799 if (next_char_pos
>= bob
- 1)
2800 type
= bidi_cache_find (next_char_pos
, -1, bidi_it
);
2804 /* For a sequence of BN and NI, copy the type from the previous
2805 character. This is because the loop in bidi_resolve_neutral
2806 that handles such sequences caches the characters it
2807 traverses, but does not (and cannot) store the
2808 next_for_neutral member for them, because it is only known
2809 when the loop ends. So when we find them in the cache, their
2810 type needs to be updated, but we don't have next_for_neutral
2811 to do that. However, whatever type is resolved as result of
2812 that loop, it will be the same for all the traversed
2813 characters, by virtue of N1 and N2. */
2814 if (type
== WEAK_BN
&& bidi_it
->scan_dir
> 0
2815 && bidi_explicit_dir_char (bidi_it
->ch
)
2816 && type_for_neutral
!= UNKNOWN_BT
2817 && bidi_it
->charpos
< pos_for_neutral
)
2820 eassert (type
!= UNKNOWN_BT
);
2825 if (type
!= UNKNOWN_BT
)
2827 /* Don't lose the information for resolving neutrals! The
2828 cached states could have been cached before their
2829 next_for_neutral member was computed. If we are on our way
2830 forward, we can simply take the info from the previous
2832 if (bidi_it
->scan_dir
== 1
2833 && bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
2834 bidi_it
->next_for_neutral
= next_for_neutral
;
2836 /* If resolved_level is -1, it means this state was cached
2837 before it was completely resolved, so we cannot return
2839 if (bidi_it
->resolved_level
!= -1)
2841 eassert (bidi_it
->resolved_level
>= 0);
2842 return bidi_it
->resolved_level
;
2846 /* Take note when we've got a cached state that is not fully
2847 resolved, so that we could make sure we update the cache
2848 below, when we do resolve it. */
2849 need_to_update_cache
= true;
2852 if (bidi_it
->scan_dir
== -1)
2853 /* If we are going backwards, the iterator state is already cached
2854 from previous scans, and should be fully resolved. */
2857 if (type
== UNKNOWN_BT
)
2858 type
= bidi_type_of_next_char (bidi_it
);
2860 if (type
== NEUTRAL_B
)
2862 eassert (bidi_it
->resolved_level
>= 0);
2863 return bidi_it
->resolved_level
;
2866 level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
2867 if (bidi_get_category (type
) == NEUTRAL
/* && type != NEUTRAL_B */
2868 || bidi_isolate_fmt_char (type
))
2870 if (bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
2873 /* If the cached state shows a neutral character, it was not
2874 resolved by bidi_resolve_neutral, so do it now. */
2875 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
2876 bidi_it
->next_for_neutral
.type
,
2880 eassert ((type
== STRONG_R
2884 || type
== WEAK_AN
));
2885 bidi_it
->type
= type
;
2886 bidi_check_type (bidi_it
->type
);
2888 /* For L1 below, we need to know, for each WS character, whether
2889 it belongs to a sequence of WS characters preceding a newline
2890 or a TAB or a paragraph separator. */
2891 if ((bidi_it
->orig_type
== NEUTRAL_WS
2892 || bidi_isolate_fmt_char (bidi_it
->orig_type
))
2893 && bidi_it
->next_for_ws
.type
== UNKNOWN_BT
)
2896 ptrdiff_t clen
= bidi_it
->ch_len
;
2897 ptrdiff_t bpos
= bidi_it
->bytepos
;
2898 ptrdiff_t cpos
= bidi_it
->charpos
;
2899 ptrdiff_t disp_pos
= bidi_it
->disp_pos
;
2900 ptrdiff_t nc
= bidi_it
->nchars
;
2901 struct bidi_string_data bs
= bidi_it
->string
;
2903 bool fwp
= bidi_it
->frame_window_p
;
2904 int dpp
= bidi_it
->disp_prop
;
2906 if (bidi_it
->nchars
<= 0)
2909 ch
= bidi_fetch_char (cpos
+= nc
, bpos
+= clen
, &disp_pos
, &dpp
, &bs
,
2910 bidi_it
->w
, fwp
, &clen
, &nc
);
2911 chtype
= bidi_get_type (ch
, NEUTRAL_DIR
);
2912 } while (chtype
== NEUTRAL_WS
|| chtype
== WEAK_BN
2913 || bidi_isolate_fmt_char (chtype
)
2914 || bidi_explicit_dir_char (ch
)); /* L1/Retaining */
2915 bidi_it
->next_for_ws
.type
= chtype
;
2916 bidi_check_type (bidi_it
->next_for_ws
.type
);
2917 bidi_it
->next_for_ws
.charpos
= cpos
;
2918 bidi_it
->next_for_ws
.bytepos
= bpos
;
2921 /* Resolve implicit levels. */
2922 if (bidi_it
->orig_type
== NEUTRAL_B
/* L1 */
2923 || bidi_it
->orig_type
== NEUTRAL_S
2924 || bidi_it
->ch
== '\n' || bidi_it
->ch
== BIDI_EOB
2925 || (bidi_it
->orig_type
== NEUTRAL_WS
2926 && (bidi_it
->next_for_ws
.type
== NEUTRAL_B
2927 || bidi_it
->next_for_ws
.type
== NEUTRAL_S
)))
2928 level
= bidi_it
->level_stack
[0].level
;
2929 else if ((level
& 1) == 0) /* I1 */
2931 if (type
== STRONG_R
)
2933 else if (type
== WEAK_EN
|| type
== WEAK_AN
)
2938 if (type
== STRONG_L
|| type
== WEAK_EN
|| type
== WEAK_AN
)
2942 bidi_it
->resolved_level
= level
;
2943 if (need_to_update_cache
)
2944 bidi_cache_iterator_state (bidi_it
, 1);
2948 /* Move to the other edge of a level given by LEVEL. If END_FLAG,
2949 we are at the end of a level, and we need to prepare to
2950 resume the scan of the lower level.
2952 If this level's other edge is cached, we simply jump to it, filling
2953 the iterator structure with the iterator state on the other edge.
2954 Otherwise, we walk the buffer or string until we come back to the
2955 same level as LEVEL.
2957 Note: we are not talking here about a ``level run'' in the UAX#9
2958 sense of the term, but rather about a ``level'' which includes
2959 all the levels higher than it. In other words, given the levels
2962 11111112222222333333334443343222222111111112223322111
2965 and assuming we are at point A scanning left to right, this
2966 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2969 bidi_find_other_level_edge (struct bidi_it
*bidi_it
, int level
, bool end_flag
)
2971 int dir
= end_flag
? -bidi_it
->scan_dir
: bidi_it
->scan_dir
;
2974 /* Try the cache first. */
2975 if ((idx
= bidi_cache_find_level_change (level
, dir
, end_flag
))
2976 >= bidi_cache_start
)
2977 bidi_cache_fetch_state (idx
, bidi_it
);
2982 /* If we are at end of level, its edges must be cached. */
2986 bidi_cache_iterator_state (bidi_it
, 1);
2988 new_level
= bidi_level_of_next_char (bidi_it
);
2989 bidi_cache_iterator_state (bidi_it
, 1);
2990 } while (new_level
>= level
);
2995 bidi_move_to_visually_next (struct bidi_it
*bidi_it
)
2997 int old_level
, new_level
, next_level
;
2998 struct bidi_it sentinel
;
2999 struct gcpro gcpro1
;
3001 if (bidi_it
->charpos
< 0 || bidi_it
->bytepos
< 0)
3004 if (bidi_it
->scan_dir
== 0)
3006 bidi_it
->scan_dir
= 1; /* default to logical order */
3009 /* The code below can call eval, and thus cause GC. If we are
3010 iterating a Lisp string, make sure it won't be GCed. */
3011 if (STRINGP (bidi_it
->string
.lstring
))
3012 GCPRO1 (bidi_it
->string
.lstring
);
3014 /* If we just passed a newline, initialize for the next line. */
3015 if (!bidi_it
->first_elt
3016 && (bidi_it
->ch
== '\n' || bidi_it
->ch
== BIDI_EOB
))
3017 bidi_line_init (bidi_it
);
3019 /* Prepare the sentinel iterator state, and cache it. When we bump
3020 into it, scanning backwards, we'll know that the last non-base
3021 level is exhausted. */
3022 if (bidi_cache_idx
== bidi_cache_start
)
3024 bidi_copy_it (&sentinel
, bidi_it
);
3025 if (bidi_it
->first_elt
)
3027 sentinel
.charpos
--; /* cached charpos needs to be monotonic */
3029 sentinel
.ch
= '\n'; /* doesn't matter, but why not? */
3030 sentinel
.ch_len
= 1;
3031 sentinel
.nchars
= 1;
3033 bidi_cache_iterator_state (&sentinel
, 1);
3036 old_level
= bidi_it
->resolved_level
;
3037 new_level
= bidi_level_of_next_char (bidi_it
);
3039 /* Reordering of resolved levels (clause L2) is implemented by
3040 jumping to the other edge of the level and flipping direction of
3041 scanning the text whenever we find a level change. */
3042 if (new_level
!= old_level
)
3044 bool ascending
= new_level
> old_level
;
3045 int level_to_search
= ascending
? old_level
+ 1 : old_level
;
3046 int incr
= ascending
? 1 : -1;
3047 int expected_next_level
= old_level
+ incr
;
3049 /* Jump (or walk) to the other edge of this level. */
3050 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
3051 /* Switch scan direction and peek at the next character in the
3053 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
3055 /* The following loop handles the case where the resolved level
3056 jumps by more than one. This is typical for numbers inside a
3057 run of text with left-to-right embedding direction, but can
3058 also happen in other situations. In those cases the decision
3059 where to continue after a level change, and in what direction,
3060 is tricky. For example, given a text like below:
3065 (where the numbers below the text show the resolved levels),
3066 the result of reordering according to UAX#9 should be this:
3070 This is implemented by the loop below which flips direction
3071 and jumps to the other edge of the level each time it finds
3072 the new level not to be the expected one. The expected level
3073 is always one more or one less than the previous one. */
3074 next_level
= bidi_peek_at_next_level (bidi_it
);
3075 while (next_level
!= expected_next_level
)
3077 /* If next_level is -1, it means we have an unresolved level
3078 in the cache, which at this point should not happen. If
3079 it does, we will infloop. */
3080 eassert (next_level
>= 0);
3081 /* If next_level is not consistent with incr, we might
3084 ? next_level
> expected_next_level
3085 : next_level
< expected_next_level
);
3086 expected_next_level
+= incr
;
3087 level_to_search
+= incr
;
3088 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
3089 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
3090 next_level
= bidi_peek_at_next_level (bidi_it
);
3093 /* Finally, deliver the next character in the new direction. */
3094 next_level
= bidi_level_of_next_char (bidi_it
);
3097 /* Take note when we have just processed the newline that precedes
3098 the end of the paragraph. The next time we are about to be
3099 called, set_iterator_to_next will automatically reinit the
3100 paragraph direction, if needed. We do this at the newline before
3101 the paragraph separator, because the next character might not be
3102 the first character of the next paragraph, due to the bidi
3103 reordering, whereas we _must_ know the paragraph base direction
3104 _before_ we process the paragraph's text, since the base
3105 direction affects the reordering. */
3106 if (bidi_it
->scan_dir
== 1
3107 && (bidi_it
->ch
== '\n' || bidi_it
->ch
== BIDI_EOB
))
3109 /* The paragraph direction of the entire string, once
3110 determined, is in effect for the entire string. Setting the
3111 separator limit to the end of the string prevents
3112 bidi_paragraph_init from being called automatically on this
3114 if (bidi_it
->string
.s
|| STRINGP (bidi_it
->string
.lstring
))
3115 bidi_it
->separator_limit
= bidi_it
->string
.schars
;
3116 else if (bidi_it
->bytepos
< ZV_BYTE
)
3119 = bidi_at_paragraph_end (bidi_it
->charpos
+ bidi_it
->nchars
,
3120 bidi_it
->bytepos
+ bidi_it
->ch_len
);
3121 if (bidi_it
->nchars
<= 0)
3125 bidi_it
->new_paragraph
= 1;
3126 /* Record the buffer position of the last character of the
3127 paragraph separator. */
3128 bidi_it
->separator_limit
3129 = bidi_it
->charpos
+ bidi_it
->nchars
+ sep_len
;
3134 if (bidi_it
->scan_dir
== 1 && bidi_cache_idx
> bidi_cache_start
)
3136 /* If we are at paragraph's base embedding level and beyond the
3137 last cached position, the cache's job is done and we can
3139 if (bidi_it
->resolved_level
== bidi_it
->level_stack
[0].level
3140 && bidi_it
->charpos
> (bidi_cache
[bidi_cache_idx
- 1].charpos
3141 + bidi_cache
[bidi_cache_idx
- 1].nchars
- 1))
3142 bidi_cache_reset ();
3143 /* But as long as we are caching during forward scan, we must
3144 cache each state, or else the cache integrity will be
3145 compromised: it assumes cached states correspond to buffer
3148 bidi_cache_iterator_state (bidi_it
, 1);
3151 if (STRINGP (bidi_it
->string
.lstring
))
3155 /* This is meant to be called from within the debugger, whenever you
3156 wish to examine the cache contents. */
3157 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE
;
3159 bidi_dump_cached_states (void)
3164 if (bidi_cache_idx
== 0)
3166 fprintf (stderr
, "The cache is empty.\n");
3169 fprintf (stderr
, "Total of %"pD
"d state%s in cache:\n",
3170 bidi_cache_idx
, bidi_cache_idx
== 1 ? "" : "s");
3172 for (i
= bidi_cache
[bidi_cache_idx
- 1].charpos
; i
> 0; i
/= 10)
3174 fputs ("ch ", stderr
);
3175 for (i
= 0; i
< bidi_cache_idx
; i
++)
3176 fprintf (stderr
, "%*c", ndigits
, bidi_cache
[i
].ch
);
3177 fputs ("\n", stderr
);
3178 fputs ("lvl ", stderr
);
3179 for (i
= 0; i
< bidi_cache_idx
; i
++)
3180 fprintf (stderr
, "%*d", ndigits
, bidi_cache
[i
].resolved_level
);
3181 fputs ("\n", stderr
);
3182 fputs ("pos ", stderr
);
3183 for (i
= 0; i
< bidi_cache_idx
; i
++)
3184 fprintf (stderr
, "%*"pD
"d", ndigits
, bidi_cache
[i
].charpos
);
3185 fputs ("\n", stderr
);