1 /* -*- coding: utf-8 -*- */
2 /* GNU Emacs case conversion functions.
4 Copyright (C) 1985, 1994, 1997-1999, 2001-2018 Free Software Foundation,
7 This file is part of GNU Emacs.
9 GNU Emacs is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or (at
12 your option) any later version.
14 GNU Emacs is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */
26 #include "character.h"
30 #include "composite.h"
33 enum case_action
{CASE_UP
, CASE_DOWN
, CASE_CAPITALIZE
, CASE_CAPITALIZE_UP
};
35 /* State for casing individual characters. */
38 /* A char-table with title-case character mappings or nil. Non-nil implies
39 flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */
40 Lisp_Object titlecase_char_table
;
42 /* The unconditional special-casing Unicode property char tables for upper
43 casing, lower casing and title casing respectively. */
44 Lisp_Object specialcase_char_tables
[3];
46 /* User-requested action. */
47 enum case_action flag
;
49 /* If true, the function operates on a buffer as opposed to a string
50 or character. When run on a buffer, syntax_prefix_flag_p is
51 taken into account when determining whether the context is within
55 /* Whether the context is within a word. */
59 /* Initialize CTX structure for casing characters. */
61 prepare_casing_context (struct casing_context
*ctx
,
62 enum case_action flag
, bool inbuffer
)
65 ctx
->inbuffer
= inbuffer
;
67 ctx
->titlecase_char_table
68 = (flag
< CASE_CAPITALIZE
? Qnil
69 : uniprop_table (Qtitlecase
));
70 ctx
->specialcase_char_tables
[CASE_UP
]
71 = (flag
== CASE_DOWN
? Qnil
72 : uniprop_table (Qspecial_uppercase
));
73 ctx
->specialcase_char_tables
[CASE_DOWN
]
74 = (flag
== CASE_UP
? Qnil
75 : uniprop_table (Qspecial_lowercase
));
76 ctx
->specialcase_char_tables
[CASE_CAPITALIZE
]
77 = (flag
< CASE_CAPITALIZE
? Qnil
78 : uniprop_table (Qspecial_titlecase
));
80 /* If the case table is flagged as modified, rescan it. */
81 if (NILP (XCHAR_TABLE (BVAR (current_buffer
, downcase_table
))->extras
[1]))
82 Fset_case_table (BVAR (current_buffer
, downcase_table
));
84 if (inbuffer
&& flag
>= CASE_CAPITALIZE
)
85 SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
90 unsigned char data
[max (6, MAX_MULTIBYTE_LENGTH
)];
91 unsigned char len_chars
;
92 unsigned char len_bytes
;
95 /* Based on CTX, case character CH. If BUF is NULL, return cased character.
96 Otherwise, if BUF is non-NULL, save result in it and return whether the
97 character has been changed.
99 Since meaning of return value depends on arguments, it’s more convenient to
100 use case_single_character or case_character instead. */
102 case_character_impl (struct casing_str_buf
*buf
,
103 struct casing_context
*ctx
, int ch
)
105 enum case_action flag
;
109 /* Update inword state */
110 bool was_inword
= ctx
->inword
;
111 ctx
->inword
= SYNTAX (ch
) == Sword
&&
112 (!ctx
->inbuffer
|| was_inword
|| !syntax_prefix_flag_p (ch
));
114 /* Normalize flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
115 if (ctx
->flag
== CASE_CAPITALIZE
)
116 flag
= ctx
->flag
- was_inword
;
117 else if (ctx
->flag
!= CASE_CAPITALIZE_UP
)
119 else if (!was_inword
)
120 flag
= CASE_CAPITALIZE
;
127 /* Look through the special casing entries. */
128 if (buf
&& !NILP (ctx
->specialcase_char_tables
[flag
]))
130 prop
= CHAR_TABLE_REF (ctx
->specialcase_char_tables
[flag
], ch
);
133 struct Lisp_String
*str
= XSTRING (prop
);
134 if (STRING_BYTES (str
) <= sizeof buf
->data
)
136 buf
->len_chars
= str
->u
.s
.size
;
137 buf
->len_bytes
= STRING_BYTES (str
);
138 memcpy (buf
->data
, str
->u
.s
.data
, buf
->len_bytes
);
144 /* Handle simple, one-to-one case. */
145 if (flag
== CASE_DOWN
)
146 cased
= downcase (ch
);
149 bool cased_is_set
= false;
150 if (!NILP (ctx
->titlecase_char_table
))
152 prop
= CHAR_TABLE_REF (ctx
->titlecase_char_table
, ch
);
153 if (CHARACTERP (prop
))
155 cased
= XFASTINT (prop
);
163 /* And we’re done. */
168 buf
->len_bytes
= CHAR_STRING (cased
, buf
->data
);
172 /* In Greek, lower case sigma has two forms: one when used in the middle and one
173 when used at the end of a word. Below is to help handle those cases when
176 The rule does not conflict with any other casing rules so while it is
177 a conditional one, it is independent of language. */
179 enum { GREEK_CAPITAL_LETTER_SIGMA
= 0x03A3 }; /* Σ */
180 enum { GREEK_SMALL_LETTER_FINAL_SIGMA
= 0x03C2 }; /* ς */
182 /* Based on CTX, case character CH accordingly. Update CTX as necessary.
183 Return cased character.
185 Special casing rules (such as upcase(fi) = FI) are not handled. For
186 characters whose casing results in multiple code points, the character is
187 returned unchanged. */
189 case_single_character (struct casing_context
*ctx
, int ch
)
191 return case_character_impl (NULL
, ctx
, ch
);
194 /* Save in BUF result of casing character CH. Return whether casing changed the
197 If not-NULL, NEXT points to the next character in the cased string. If NULL,
198 it is assumed current character is the last one being cased. This is used to
199 apply some rules which depend on proceeding state.
201 This is like case_single_character but also handles one-to-many casing
204 case_character (struct casing_str_buf
*buf
, struct casing_context
*ctx
,
205 int ch
, const unsigned char *next
)
207 bool was_inword
= ctx
->inword
;
208 bool changed
= case_character_impl (buf
, ctx
, ch
);
210 /* If we have just down-cased a capital sigma and the next character no longer
211 has a word syntax (i.e. current character is end of word), use final
213 if (was_inword
&& ch
== GREEK_CAPITAL_LETTER_SIGMA
&& changed
214 && (!next
|| SYNTAX (STRING_CHAR (next
)) != Sword
))
216 buf
->len_bytes
= CHAR_STRING (GREEK_SMALL_LETTER_FINAL_SIGMA
, buf
->data
);
224 do_casify_natnum (struct casing_context
*ctx
, Lisp_Object obj
)
226 int flagbits
= (CHAR_ALT
| CHAR_SUPER
| CHAR_HYPER
227 | CHAR_SHIFT
| CHAR_CTL
| CHAR_META
);
228 int ch
= XFASTINT (obj
);
230 /* If the character has higher bits set above the flags, return it unchanged.
231 It is not a real character. */
232 if (UNSIGNED_CMP (ch
, >, flagbits
))
235 int flags
= ch
& flagbits
;
238 /* FIXME: Even if enable-multibyte-characters is nil, we may manipulate
239 multibyte chars. This means we have a bug for latin-1 chars since when we
240 receive an int 128-255 we can't tell whether it's an eight-bit byte or
242 bool multibyte
= (ch
>= 256
243 || !NILP (BVAR (current_buffer
,
244 enable_multibyte_characters
)));
246 MAKE_CHAR_MULTIBYTE (ch
);
247 int cased
= case_single_character (ctx
, ch
);
252 MAKE_CHAR_UNIBYTE (cased
);
253 return make_natnum (cased
| flags
);
257 do_casify_multibyte_string (struct casing_context
*ctx
, Lisp_Object obj
)
259 /* Verify that ‘data’ is the first member of struct casing_str_buf
260 so that when casting char * to struct casing_str_buf *, the
261 representation of the character is at the beginning of the
262 buffer. This is why we don’t need a separate struct
263 casing_str_buf object, and can write directly to the destination. */
264 verify (offsetof (struct casing_str_buf
, data
) == 0);
266 ptrdiff_t size
= SCHARS (obj
), n
;
268 if (INT_MULTIPLY_WRAPV (size
, MAX_MULTIBYTE_LENGTH
, &n
)
269 || INT_ADD_WRAPV (n
, sizeof (struct casing_str_buf
), &n
))
271 unsigned char *dst
= SAFE_ALLOCA (n
);
272 unsigned char *dst_end
= dst
+ n
;
273 unsigned char *o
= dst
;
275 const unsigned char *src
= SDATA (obj
);
277 for (n
= 0; size
; --size
)
279 if (dst_end
- o
< sizeof (struct casing_str_buf
))
281 int ch
= STRING_CHAR_ADVANCE (src
);
282 case_character ((struct casing_str_buf
*) o
, ctx
, ch
,
283 size
> 1 ? src
: NULL
);
284 n
+= ((struct casing_str_buf
*) o
)->len_chars
;
285 o
+= ((struct casing_str_buf
*) o
)->len_bytes
;
287 eassert (o
<= dst_end
);
288 obj
= make_multibyte_string ((char *) dst
, n
, o
- dst
);
294 do_casify_unibyte_string (struct casing_context
*ctx
, Lisp_Object obj
)
296 ptrdiff_t i
, size
= SCHARS (obj
);
299 obj
= Fcopy_sequence (obj
);
300 for (i
= 0; i
< size
; i
++)
303 MAKE_CHAR_MULTIBYTE (ch
);
304 cased
= case_single_character (ctx
, ch
);
307 MAKE_CHAR_UNIBYTE (cased
);
308 /* If the char can't be converted to a valid byte, just don't
310 if (cased
>= 0 && cased
< 256)
311 SSET (obj
, i
, cased
);
317 casify_object (enum case_action flag
, Lisp_Object obj
)
319 struct casing_context ctx
;
320 prepare_casing_context (&ctx
, flag
, false);
323 return do_casify_natnum (&ctx
, obj
);
324 else if (!STRINGP (obj
))
325 wrong_type_argument (Qchar_or_string_p
, obj
);
326 else if (!SCHARS (obj
))
328 else if (STRING_MULTIBYTE (obj
))
329 return do_casify_multibyte_string (&ctx
, obj
);
331 return do_casify_unibyte_string (&ctx
, obj
);
334 DEFUN ("upcase", Fupcase
, Supcase
, 1, 1, 0,
335 doc
: /* Convert argument to upper case and return that.
336 The argument may be a character or string. The result has the same type.
337 The argument object is not altered--the value is a copy. If argument
338 is a character, characters which map to multiple code points when
339 cased, e.g. fi, are returned unchanged.
340 See also `capitalize', `downcase' and `upcase-initials'. */)
343 return casify_object (CASE_UP
, obj
);
346 DEFUN ("downcase", Fdowncase
, Sdowncase
, 1, 1, 0,
347 doc
: /* Convert argument to lower case and return that.
348 The argument may be a character or string. The result has the same type.
349 The argument object is not altered--the value is a copy. */)
352 return casify_object (CASE_DOWN
, obj
);
355 DEFUN ("capitalize", Fcapitalize
, Scapitalize
, 1, 1, 0,
356 doc
: /* Convert argument to capitalized form and return that.
357 This means that each word's first character is converted to either
358 title case or upper case, and the rest to lower case.
359 The argument may be a character or string. The result has the same type.
360 The argument object is not altered--the value is a copy. If argument
361 is a character, characters which map to multiple code points when
362 cased, e.g. fi, are returned unchanged. */)
365 return casify_object (CASE_CAPITALIZE
, obj
);
368 /* Like Fcapitalize but change only the initials. */
370 DEFUN ("upcase-initials", Fupcase_initials
, Supcase_initials
, 1, 1, 0,
371 doc
: /* Convert the initial of each word in the argument to upper case.
372 This means that each word's first character is converted to either
373 title case or upper case, and the rest are left unchanged.
374 The argument may be a character or string. The result has the same type.
375 The argument object is not altered--the value is a copy. If argument
376 is a character, characters which map to multiple code points when
377 cased, e.g. fi, are returned unchanged. */)
380 return casify_object (CASE_CAPITALIZE_UP
, obj
);
383 /* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
385 Save first and last positions that has changed in *STARTP and *ENDP
386 respectively. If no characters were changed, save -1 to *STARTP and leave
389 Always return 0. This is so that interface of this function is the same as
390 do_casify_multibyte_region. */
392 do_casify_unibyte_region (struct casing_context
*ctx
,
393 ptrdiff_t *startp
, ptrdiff_t *endp
)
395 ptrdiff_t first
= -1, last
= -1; /* Position of first and last changes. */
396 ptrdiff_t end
= *endp
;
398 for (ptrdiff_t pos
= *startp
; pos
< end
; ++pos
)
400 int ch
= FETCH_BYTE (pos
);
401 MAKE_CHAR_MULTIBYTE (ch
);
403 int cased
= case_single_character (ctx
, ch
);
411 MAKE_CHAR_UNIBYTE (cased
);
412 FETCH_BYTE (pos
) = cased
;
420 /* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
422 Return number of added characters (may be negative if more characters were
423 deleted then inserted), save first and last positions that has changed in
424 *STARTP and *ENDP respectively. If no characters were changed, return 0,
425 save -1 to *STARTP and leave *ENDP unspecified. */
427 do_casify_multibyte_region (struct casing_context
*ctx
,
428 ptrdiff_t *startp
, ptrdiff_t *endp
)
430 ptrdiff_t first
= -1, last
= -1; /* Position of first and last changes. */
431 ptrdiff_t pos
= *startp
, pos_byte
= CHAR_TO_BYTE (pos
), size
= *endp
- pos
;
432 ptrdiff_t opoint
= PT
, added
= 0;
437 int ch
= STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte
), len
);
438 struct casing_str_buf buf
;
439 if (!case_character (&buf
, ctx
, ch
,
440 size
> 1 ? BYTE_POS_ADDR (pos_byte
+ len
) : NULL
))
447 last
= pos
+ buf
.len_chars
;
451 if (buf
.len_chars
== 1 && buf
.len_bytes
== len
)
452 memcpy (BYTE_POS_ADDR (pos_byte
), buf
.data
, len
);
455 /* Replace one character with the other(s), keeping text
456 properties the same. */
457 replace_range_2 (pos
, pos_byte
, pos
+ 1, pos_byte
+ len
,
458 (const char *) buf
.data
, buf
.len_chars
,
461 added
+= (ptrdiff_t) buf
.len_chars
- 1;
463 opoint
+= (ptrdiff_t) buf
.len_chars
- 1;
466 pos_byte
+= buf
.len_bytes
;
467 pos
+= buf
.len_chars
;
471 TEMP_SET_PT_BOTH (opoint
, CHAR_TO_BYTE (opoint
));
478 /* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. b and
479 e specify range of buffer to operate on. Return character position of the
480 end of the region after changes. */
482 casify_region (enum case_action flag
, Lisp_Object b
, Lisp_Object e
)
485 struct casing_context ctx
;
487 validate_region (&b
, &e
);
488 ptrdiff_t start
= XFASTINT (b
);
489 ptrdiff_t end
= XFASTINT (e
);
491 /* Not modifying because nothing marked. */
493 modify_text (start
, end
);
494 prepare_casing_context (&ctx
, flag
, true);
496 ptrdiff_t orig_end
= end
;
497 record_delete (start
, make_buffer_string (start
, end
, true), false);
498 if (NILP (BVAR (current_buffer
, enable_multibyte_characters
)))
500 record_insert (start
, end
- start
);
501 added
= do_casify_unibyte_region (&ctx
, &start
, &end
);
505 ptrdiff_t len
= end
- start
, ostart
= start
;
506 added
= do_casify_multibyte_region (&ctx
, &start
, &end
);
507 record_insert (ostart
, len
+ added
);
512 signal_after_change (start
, end
- start
- added
, end
- start
);
513 update_compositions (start
, end
, CHECK_ALL
);
516 return orig_end
+ added
;
519 DEFUN ("upcase-region", Fupcase_region
, Supcase_region
, 2, 3,
520 "(list (region-beginning) (region-end) (region-noncontiguous-p))",
521 doc
: /* Convert the region to upper case. In programs, wants two arguments.
522 These arguments specify the starting and ending character numbers of
523 the region to operate on. When used as a command, the text between
524 point and the mark is operated on.
525 See also `capitalize-region'. */)
526 (Lisp_Object beg
, Lisp_Object end
, Lisp_Object region_noncontiguous_p
)
528 Lisp_Object bounds
= Qnil
;
530 if (!NILP (region_noncontiguous_p
))
532 bounds
= call1 (Fsymbol_value (intern ("region-extract-function")),
535 while (CONSP (bounds
))
537 casify_region (CASE_UP
, XCAR (XCAR (bounds
)), XCDR (XCAR (bounds
)));
538 bounds
= XCDR (bounds
);
542 casify_region (CASE_UP
, beg
, end
);
547 DEFUN ("downcase-region", Fdowncase_region
, Sdowncase_region
, 2, 3,
548 "(list (region-beginning) (region-end) (region-noncontiguous-p))",
549 doc
: /* Convert the region to lower case. In programs, wants two arguments.
550 These arguments specify the starting and ending character numbers of
551 the region to operate on. When used as a command, the text between
552 point and the mark is operated on. */)
553 (Lisp_Object beg
, Lisp_Object end
, Lisp_Object region_noncontiguous_p
)
555 Lisp_Object bounds
= Qnil
;
557 if (!NILP (region_noncontiguous_p
))
559 bounds
= call1 (Fsymbol_value (intern ("region-extract-function")),
562 while (CONSP (bounds
))
564 casify_region (CASE_DOWN
, XCAR (XCAR (bounds
)), XCDR (XCAR (bounds
)));
565 bounds
= XCDR (bounds
);
569 casify_region (CASE_DOWN
, beg
, end
);
574 DEFUN ("capitalize-region", Fcapitalize_region
, Scapitalize_region
, 2, 2, "r",
575 doc
: /* Convert the region to capitalized form.
576 This means that each word's first character is converted to either
577 title case or upper case, and the rest to lower case.
578 In programs, give two arguments, the starting and ending
579 character positions to operate on. */)
580 (Lisp_Object beg
, Lisp_Object end
)
582 casify_region (CASE_CAPITALIZE
, beg
, end
);
586 /* Like Fcapitalize_region but change only the initials. */
588 DEFUN ("upcase-initials-region", Fupcase_initials_region
,
589 Supcase_initials_region
, 2, 2, "r",
590 doc
: /* Upcase the initial of each word in the region.
591 This means that each word's first character is converted to either
592 title case or upper case, and the rest are left unchanged.
593 In programs, give two arguments, the starting and ending
594 character positions to operate on. */)
595 (Lisp_Object beg
, Lisp_Object end
)
597 casify_region (CASE_CAPITALIZE_UP
, beg
, end
);
602 casify_word (enum case_action flag
, Lisp_Object arg
)
605 ptrdiff_t farend
= scan_words (PT
, XINT (arg
));
607 farend
= XINT (arg
) <= 0 ? BEGV
: ZV
;
608 SET_PT (casify_region (flag
, make_number (PT
), make_number (farend
)));
612 DEFUN ("upcase-word", Fupcase_word
, Supcase_word
, 1, 1, "p",
613 doc
: /* Convert to upper case from point to end of word, moving over.
615 If point is in the middle of a word, the part of that word before point
616 is ignored when moving forward.
618 With negative argument, convert previous words but do not move.
619 See also `capitalize-word'. */)
622 return casify_word (CASE_UP
, arg
);
625 DEFUN ("downcase-word", Fdowncase_word
, Sdowncase_word
, 1, 1, "p",
626 doc
: /* Convert to lower case from point to end of word, moving over.
628 If point is in the middle of a word, the part of that word before point
629 is ignored when moving forward.
631 With negative argument, convert previous words but do not move. */)
634 return casify_word (CASE_DOWN
, arg
);
637 DEFUN ("capitalize-word", Fcapitalize_word
, Scapitalize_word
, 1, 1, "p",
638 doc
: /* Capitalize from point to the end of word, moving over.
639 With numerical argument ARG, capitalize the next ARG-1 words as well.
640 This gives the word(s) a first character in upper case
641 and the rest lower case.
643 If point is in the middle of a word, the part of that word before point
644 is ignored when moving forward.
646 With negative argument, capitalize previous words but do not move. */)
649 return casify_word (CASE_CAPITALIZE
, arg
);
653 syms_of_casefiddle (void)
655 DEFSYM (Qidentity
, "identity");
656 DEFSYM (Qtitlecase
, "titlecase");
657 DEFSYM (Qspecial_uppercase
, "special-uppercase");
658 DEFSYM (Qspecial_lowercase
, "special-lowercase");
659 DEFSYM (Qspecial_titlecase
, "special-titlecase");
662 defsubr (&Sdowncase
);
663 defsubr (&Scapitalize
);
664 defsubr (&Supcase_initials
);
665 defsubr (&Supcase_region
);
666 defsubr (&Sdowncase_region
);
667 defsubr (&Scapitalize_region
);
668 defsubr (&Supcase_initials_region
);
669 defsubr (&Supcase_word
);
670 defsubr (&Sdowncase_word
);
671 defsubr (&Scapitalize_word
);
675 keys_of_casefiddle (void)
677 initial_define_key (control_x_map
, Ctl ('U'), "upcase-region");
678 Fput (intern ("upcase-region"), Qdisabled
, Qt
);
679 initial_define_key (control_x_map
, Ctl ('L'), "downcase-region");
680 Fput (intern ("downcase-region"), Qdisabled
, Qt
);
682 initial_define_key (meta_map
, 'u', "upcase-word");
683 initial_define_key (meta_map
, 'l', "downcase-word");
684 initial_define_key (meta_map
, 'c', "capitalize-word");