Merge branch 'master' of git.savannah.gnu.org:/srv/git/emacs
[emacs.git] / src / casefiddle.c
blob443d62b625932da941ef4e174aae75ba6bc8e6e4
1 /* -*- coding: utf-8 -*- */
2 /* GNU Emacs case conversion functions.
4 Copyright (C) 1985, 1994, 1997-1999, 2001-2017 Free Software Foundation,
5 Inc.
7 This file is part of GNU Emacs.
9 GNU Emacs is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or (at
12 your option) any later version.
14 GNU Emacs is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
23 #include <config.h>
25 #include "lisp.h"
26 #include "character.h"
27 #include "buffer.h"
28 #include "commands.h"
29 #include "syntax.h"
30 #include "composite.h"
31 #include "keymap.h"
33 enum case_action {CASE_UP, CASE_DOWN, CASE_CAPITALIZE, CASE_CAPITALIZE_UP};
35 /* State for casing individual characters. */
36 struct casing_context
38 /* A char-table with title-case character mappings or nil. Non-nil implies
39 flag is CASE_CAPITALIZE or CASE_CAPITALIZE_UP. */
40 Lisp_Object titlecase_char_table;
42 /* The unconditional special-casing Unicode property char tables for upper
43 casing, lower casing and title casing respectively. */
44 Lisp_Object specialcase_char_tables[3];
46 /* User-requested action. */
47 enum case_action flag;
49 /* If true, the function operates on a buffer as opposed to a string
50 or character. When run on a buffer, syntax_prefix_flag_p is
51 taken into account when determining whether the context is within
52 a word. */
53 bool inbuffer;
55 /* Whether the context is within a word. */
56 bool inword;
59 /* Initialize CTX structure for casing characters. */
60 static void
61 prepare_casing_context (struct casing_context *ctx,
62 enum case_action flag, bool inbuffer)
64 ctx->flag = flag;
65 ctx->inbuffer = inbuffer;
66 ctx->inword = false;
67 ctx->titlecase_char_table
68 = (flag < CASE_CAPITALIZE ? Qnil
69 : uniprop_table (Qtitlecase));
70 ctx->specialcase_char_tables[CASE_UP]
71 = (flag == CASE_DOWN ? Qnil
72 : uniprop_table (Qspecial_uppercase));
73 ctx->specialcase_char_tables[CASE_DOWN]
74 = (flag == CASE_UP ? Qnil
75 : uniprop_table (Qspecial_lowercase));
76 ctx->specialcase_char_tables[CASE_CAPITALIZE]
77 = (flag < CASE_CAPITALIZE ? Qnil
78 : uniprop_table (Qspecial_titlecase));
80 /* If the case table is flagged as modified, rescan it. */
81 if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
82 Fset_case_table (BVAR (current_buffer, downcase_table));
84 if (inbuffer && flag >= CASE_CAPITALIZE)
85 SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
88 struct casing_str_buf
90 unsigned char data[max (6, MAX_MULTIBYTE_LENGTH)];
91 unsigned char len_chars;
92 unsigned char len_bytes;
95 /* Based on CTX, case character CH. If BUF is NULL, return cased character.
96 Otherwise, if BUF is non-NULL, save result in it and return whether the
97 character has been changed.
99 Since meaning of return value depends on arguments, it’s more convenient to
100 use case_single_character or case_character instead. */
101 static int
102 case_character_impl (struct casing_str_buf *buf,
103 struct casing_context *ctx, int ch)
105 enum case_action flag;
106 Lisp_Object prop;
107 int cased;
109 /* Update inword state */
110 bool was_inword = ctx->inword;
111 ctx->inword = SYNTAX (ch) == Sword &&
112 (!ctx->inbuffer || was_inword || !syntax_prefix_flag_p (ch));
114 /* Normalize flag so its one of CASE_UP, CASE_DOWN or CASE_CAPITALIZE. */
115 if (ctx->flag == CASE_CAPITALIZE)
116 flag = ctx->flag - was_inword;
117 else if (ctx->flag != CASE_CAPITALIZE_UP)
118 flag = ctx->flag;
119 else if (!was_inword)
120 flag = CASE_CAPITALIZE;
121 else
123 cased = ch;
124 goto done;
127 /* Look through the special casing entries. */
128 if (buf && !NILP (ctx->specialcase_char_tables[flag]))
130 prop = CHAR_TABLE_REF (ctx->specialcase_char_tables[flag], ch);
131 if (STRINGP (prop))
133 struct Lisp_String *str = XSTRING (prop);
134 if (STRING_BYTES (str) <= sizeof buf->data)
136 buf->len_chars = str->size;
137 buf->len_bytes = STRING_BYTES (str);
138 memcpy (buf->data, str->data, buf->len_bytes);
139 return 1;
144 /* Handle simple, one-to-one case. */
145 if (flag == CASE_DOWN)
146 cased = downcase (ch);
147 else
149 bool cased_is_set = false;
150 if (!NILP (ctx->titlecase_char_table))
152 prop = CHAR_TABLE_REF (ctx->titlecase_char_table, ch);
153 if (CHARACTERP (prop))
155 cased = XFASTINT (prop);
156 cased_is_set = true;
159 if (!cased_is_set)
160 cased = upcase (ch);
163 /* And we’re done. */
164 done:
165 if (!buf)
166 return cased;
167 buf->len_chars = 1;
168 buf->len_bytes = CHAR_STRING (cased, buf->data);
169 return cased != ch;
172 /* In Greek, lower case sigma has two forms: one when used in the middle and one
173 when used at the end of a word. Below is to help handle those cases when
174 casing.
176 The rule does not conflict with any other casing rules so while it is
177 a conditional one, it is independent of language. */
179 enum { GREEK_CAPITAL_LETTER_SIGMA = 0x03A3 }; /* Σ */
180 enum { GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2 }; /* ς */
182 /* Based on CTX, case character CH accordingly. Update CTX as necessary.
183 Return cased character.
185 Special casing rules (such as upcase(fi) = FI) are not handled. For
186 characters whose casing results in multiple code points, the character is
187 returned unchanged. */
188 static inline int
189 case_single_character (struct casing_context *ctx, int ch)
191 return case_character_impl (NULL, ctx, ch);
194 /* Save in BUF result of casing character CH. Return whether casing changed the
195 character.
197 If not-NULL, NEXT points to the next character in the cased string. If NULL,
198 it is assumed current character is the last one being cased. This is used to
199 apply some rules which depend on proceeding state.
201 This is like case_single_character but also handles one-to-many casing
202 rules. */
203 static bool
204 case_character (struct casing_str_buf *buf, struct casing_context *ctx,
205 int ch, const unsigned char *next)
207 bool was_inword = ctx->inword;
208 bool changed = case_character_impl (buf, ctx, ch);
210 /* If we have just down-cased a capital sigma and the next character no longer
211 has a word syntax (i.e. current character is end of word), use final
212 sigma. */
213 if (was_inword && ch == GREEK_CAPITAL_LETTER_SIGMA && changed
214 && (!next || SYNTAX (STRING_CHAR (next)) != Sword))
216 buf->len_bytes = CHAR_STRING (GREEK_SMALL_LETTER_FINAL_SIGMA, buf->data);
217 buf->len_chars = 1;
220 return changed;
223 static Lisp_Object
224 do_casify_natnum (struct casing_context *ctx, Lisp_Object obj)
226 int flagbits = (CHAR_ALT | CHAR_SUPER | CHAR_HYPER
227 | CHAR_SHIFT | CHAR_CTL | CHAR_META);
228 int ch = XFASTINT (obj);
230 /* If the character has higher bits set above the flags, return it unchanged.
231 It is not a real character. */
232 if (UNSIGNED_CMP (ch, >, flagbits))
233 return obj;
235 int flags = ch & flagbits;
236 ch = ch & ~flagbits;
238 /* FIXME: Even if enable-multibyte-characters is nil, we may manipulate
239 multibyte chars. This means we have a bug for latin-1 chars since when we
240 receive an int 128-255 we can't tell whether it's an eight-bit byte or
241 a latin-1 char. */
242 bool multibyte = (ch >= 256
243 || !NILP (BVAR (current_buffer,
244 enable_multibyte_characters)));
245 if (! multibyte)
246 MAKE_CHAR_MULTIBYTE (ch);
247 int cased = case_single_character (ctx, ch);
248 if (cased == ch)
249 return obj;
251 if (! multibyte)
252 MAKE_CHAR_UNIBYTE (cased);
253 return make_natnum (cased | flags);
256 static Lisp_Object
257 do_casify_multibyte_string (struct casing_context *ctx, Lisp_Object obj)
259 /* Verify that ‘data’ is the first member of struct casing_str_buf
260 so that when casting char * to struct casing_str_buf *, the
261 representation of the character is at the beginning of the
262 buffer. This is why we don’t need a separate struct
263 casing_str_buf object, and can write directly to the destination. */
264 verify (offsetof (struct casing_str_buf, data) == 0);
266 ptrdiff_t size = SCHARS (obj), n;
267 USE_SAFE_ALLOCA;
268 if (INT_MULTIPLY_WRAPV (size, MAX_MULTIBYTE_LENGTH, &n)
269 || INT_ADD_WRAPV (n, sizeof (struct casing_str_buf), &n))
270 n = PTRDIFF_MAX;
271 unsigned char *dst = SAFE_ALLOCA (n);
272 unsigned char *dst_end = dst + n;
273 unsigned char *o = dst;
275 const unsigned char *src = SDATA (obj);
277 for (n = 0; size; --size)
279 if (dst_end - o < sizeof (struct casing_str_buf))
280 string_overflow ();
281 int ch = STRING_CHAR_ADVANCE (src);
282 case_character ((struct casing_str_buf *) o, ctx, ch,
283 size > 1 ? src : NULL);
284 n += ((struct casing_str_buf *) o)->len_chars;
285 o += ((struct casing_str_buf *) o)->len_bytes;
287 eassert (o <= dst_end);
288 obj = make_multibyte_string ((char *) dst, n, o - dst);
289 SAFE_FREE ();
290 return obj;
293 static Lisp_Object
294 do_casify_unibyte_string (struct casing_context *ctx, Lisp_Object obj)
296 ptrdiff_t i, size = SCHARS (obj);
297 int ch, cased;
299 obj = Fcopy_sequence (obj);
300 for (i = 0; i < size; i++)
302 ch = SREF (obj, i);
303 MAKE_CHAR_MULTIBYTE (ch);
304 cased = case_single_character (ctx, ch);
305 if (ch == cased)
306 continue;
307 MAKE_CHAR_UNIBYTE (cased);
308 /* If the char can't be converted to a valid byte, just don't
309 change it. */
310 if (cased >= 0 && cased < 256)
311 SSET (obj, i, cased);
313 return obj;
316 static Lisp_Object
317 casify_object (enum case_action flag, Lisp_Object obj)
319 struct casing_context ctx;
320 prepare_casing_context (&ctx, flag, false);
322 if (NATNUMP (obj))
323 return do_casify_natnum (&ctx, obj);
324 else if (!STRINGP (obj))
325 wrong_type_argument (Qchar_or_string_p, obj);
326 else if (!SCHARS (obj))
327 return obj;
328 else if (STRING_MULTIBYTE (obj))
329 return do_casify_multibyte_string (&ctx, obj);
330 else
331 return do_casify_unibyte_string (&ctx, obj);
334 DEFUN ("upcase", Fupcase, Supcase, 1, 1, 0,
335 doc: /* Convert argument to upper case and return that.
336 The argument may be a character or string. The result has the same type.
337 The argument object is not altered--the value is a copy. If argument
338 is a character, characters which map to multiple code points when
339 cased, e.g. fi, are returned unchanged.
340 See also `capitalize', `downcase' and `upcase-initials'. */)
341 (Lisp_Object obj)
343 return casify_object (CASE_UP, obj);
346 DEFUN ("downcase", Fdowncase, Sdowncase, 1, 1, 0,
347 doc: /* Convert argument to lower case and return that.
348 The argument may be a character or string. The result has the same type.
349 The argument object is not altered--the value is a copy. */)
350 (Lisp_Object obj)
352 return casify_object (CASE_DOWN, obj);
355 DEFUN ("capitalize", Fcapitalize, Scapitalize, 1, 1, 0,
356 doc: /* Convert argument to capitalized form and return that.
357 This means that each word's first character is converted to either
358 title case or upper case, and the rest to lower case.
359 The argument may be a character or string. The result has the same type.
360 The argument object is not altered--the value is a copy. If argument
361 is a character, characters which map to multiple code points when
362 cased, e.g. fi, are returned unchanged. */)
363 (Lisp_Object obj)
365 return casify_object (CASE_CAPITALIZE, obj);
368 /* Like Fcapitalize but change only the initials. */
370 DEFUN ("upcase-initials", Fupcase_initials, Supcase_initials, 1, 1, 0,
371 doc: /* Convert the initial of each word in the argument to upper case.
372 This means that each word's first character is converted to either
373 title case or upper case, and the rest are left unchanged.
374 The argument may be a character or string. The result has the same type.
375 The argument object is not altered--the value is a copy. If argument
376 is a character, characters which map to multiple code points when
377 cased, e.g. fi, are returned unchanged. */)
378 (Lisp_Object obj)
380 return casify_object (CASE_CAPITALIZE_UP, obj);
383 /* Based on CTX, case region in a unibyte buffer from *STARTP to *ENDP.
385 Save first and last positions that has changed in *STARTP and *ENDP
386 respectively. If no characters were changed, save -1 to *STARTP and leave
387 *ENDP unspecified.
389 Always return 0. This is so that interface of this function is the same as
390 do_casify_multibyte_region. */
391 static ptrdiff_t
392 do_casify_unibyte_region (struct casing_context *ctx,
393 ptrdiff_t *startp, ptrdiff_t *endp)
395 ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
396 ptrdiff_t end = *endp;
398 for (ptrdiff_t pos = *startp; pos < end; ++pos)
400 int ch = FETCH_BYTE (pos);
401 MAKE_CHAR_MULTIBYTE (ch);
403 int cased = case_single_character (ctx, ch);
404 if (cased == ch)
405 continue;
407 last = pos + 1;
408 if (first < 0)
409 first = pos;
411 MAKE_CHAR_UNIBYTE (cased);
412 FETCH_BYTE (pos) = cased;
415 *startp = first;
416 *endp = last;
417 return 0;
420 /* Based on CTX, case region in a multibyte buffer from *STARTP to *ENDP.
422 Return number of added characters (may be negative if more characters were
423 deleted then inserted), save first and last positions that has changed in
424 *STARTP and *ENDP respectively. If no characters were changed, return 0,
425 save -1 to *STARTP and leave *ENDP unspecified. */
426 static ptrdiff_t
427 do_casify_multibyte_region (struct casing_context *ctx,
428 ptrdiff_t *startp, ptrdiff_t *endp)
430 ptrdiff_t first = -1, last = -1; /* Position of first and last changes. */
431 ptrdiff_t pos = *startp, pos_byte = CHAR_TO_BYTE (pos), size = *endp - pos;
432 ptrdiff_t opoint = PT, added = 0;
434 for (; size; --size)
436 int len;
437 int ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (pos_byte), len);
438 struct casing_str_buf buf;
439 if (!case_character (&buf, ctx, ch,
440 size > 1 ? BYTE_POS_ADDR (pos_byte + len) : NULL))
442 pos_byte += len;
443 ++pos;
444 continue;
447 last = pos + buf.len_chars;
448 if (first < 0)
449 first = pos;
451 if (buf.len_chars == 1 && buf.len_bytes == len)
452 memcpy (BYTE_POS_ADDR (pos_byte), buf.data, len);
453 else
455 /* Replace one character with the other(s), keeping text
456 properties the same. */
457 replace_range_2 (pos, pos_byte, pos + 1, pos_byte + len,
458 (const char *) buf.data, buf.len_chars,
459 buf.len_bytes,
461 added += (ptrdiff_t) buf.len_chars - 1;
462 if (opoint > pos)
463 opoint += (ptrdiff_t) buf.len_chars - 1;
466 pos_byte += buf.len_bytes;
467 pos += buf.len_chars;
470 if (PT != opoint)
471 TEMP_SET_PT_BOTH (opoint, CHAR_TO_BYTE (opoint));
473 *startp = first;
474 *endp = last;
475 return added;
478 /* flag is CASE_UP, CASE_DOWN or CASE_CAPITALIZE or CASE_CAPITALIZE_UP. b and
479 e specify range of buffer to operate on. Return character position of the
480 end of the region after changes. */
481 static ptrdiff_t
482 casify_region (enum case_action flag, Lisp_Object b, Lisp_Object e)
484 ptrdiff_t added;
485 struct casing_context ctx;
487 validate_region (&b, &e);
488 ptrdiff_t start = XFASTINT (b);
489 ptrdiff_t end = XFASTINT (e);
490 if (start == end)
491 /* Not modifying because nothing marked. */
492 return end;
493 modify_text (start, end);
494 prepare_casing_context (&ctx, flag, true);
496 ptrdiff_t orig_end = end;
497 record_delete (start, make_buffer_string (start, end, true), false);
498 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
500 record_insert (start, end - start);
501 added = do_casify_unibyte_region (&ctx, &start, &end);
503 else
505 ptrdiff_t len = end - start, ostart = start;
506 added = do_casify_multibyte_region (&ctx, &start, &end);
507 record_insert (ostart, len + added);
510 if (start >= 0)
512 signal_after_change (start, end - start - added, end - start);
513 update_compositions (start, end, CHECK_ALL);
516 return orig_end + added;
519 DEFUN ("upcase-region", Fupcase_region, Supcase_region, 2, 3,
520 "(list (region-beginning) (region-end) (region-noncontiguous-p))",
521 doc: /* Convert the region to upper case. In programs, wants two arguments.
522 These arguments specify the starting and ending character numbers of
523 the region to operate on. When used as a command, the text between
524 point and the mark is operated on.
525 See also `capitalize-region'. */)
526 (Lisp_Object beg, Lisp_Object end, Lisp_Object region_noncontiguous_p)
528 Lisp_Object bounds = Qnil;
530 if (!NILP (region_noncontiguous_p))
532 bounds = call1 (Fsymbol_value (intern ("region-extract-function")),
533 intern ("bounds"));
535 while (CONSP (bounds))
537 casify_region (CASE_UP, XCAR (XCAR (bounds)), XCDR (XCAR (bounds)));
538 bounds = XCDR (bounds);
541 else
542 casify_region (CASE_UP, beg, end);
544 return Qnil;
547 DEFUN ("downcase-region", Fdowncase_region, Sdowncase_region, 2, 3,
548 "(list (region-beginning) (region-end) (region-noncontiguous-p))",
549 doc: /* Convert the region to lower case. In programs, wants two arguments.
550 These arguments specify the starting and ending character numbers of
551 the region to operate on. When used as a command, the text between
552 point and the mark is operated on. */)
553 (Lisp_Object beg, Lisp_Object end, Lisp_Object region_noncontiguous_p)
555 Lisp_Object bounds = Qnil;
557 if (!NILP (region_noncontiguous_p))
559 bounds = call1 (Fsymbol_value (intern ("region-extract-function")),
560 intern ("bounds"));
562 while (CONSP (bounds))
564 casify_region (CASE_DOWN, XCAR (XCAR (bounds)), XCDR (XCAR (bounds)));
565 bounds = XCDR (bounds);
568 else
569 casify_region (CASE_DOWN, beg, end);
571 return Qnil;
574 DEFUN ("capitalize-region", Fcapitalize_region, Scapitalize_region, 2, 2, "r",
575 doc: /* Convert the region to capitalized form.
576 This means that each word's first character is converted to either
577 title case or upper case, and the rest to lower case.
578 In programs, give two arguments, the starting and ending
579 character positions to operate on. */)
580 (Lisp_Object beg, Lisp_Object end)
582 casify_region (CASE_CAPITALIZE, beg, end);
583 return Qnil;
586 /* Like Fcapitalize_region but change only the initials. */
588 DEFUN ("upcase-initials-region", Fupcase_initials_region,
589 Supcase_initials_region, 2, 2, "r",
590 doc: /* Upcase the initial of each word in the region.
591 This means that each word's first character is converted to either
592 title case or upper case, and the rest are left unchanged.
593 In programs, give two arguments, the starting and ending
594 character positions to operate on. */)
595 (Lisp_Object beg, Lisp_Object end)
597 casify_region (CASE_CAPITALIZE_UP, beg, end);
598 return Qnil;
601 static Lisp_Object
602 casify_word (enum case_action flag, Lisp_Object arg)
604 CHECK_NUMBER (arg);
605 ptrdiff_t farend = scan_words (PT, XINT (arg));
606 if (!farend)
607 farend = XINT (arg) <= 0 ? BEGV : ZV;
608 SET_PT (casify_region (flag, make_number (PT), make_number (farend)));
609 return Qnil;
612 DEFUN ("upcase-word", Fupcase_word, Supcase_word, 1, 1, "p",
613 doc: /* Convert to upper case from point to end of word, moving over.
615 If point is in the middle of a word, the part of that word before point
616 is ignored when moving forward.
618 With negative argument, convert previous words but do not move.
619 See also `capitalize-word'. */)
620 (Lisp_Object arg)
622 return casify_word (CASE_UP, arg);
625 DEFUN ("downcase-word", Fdowncase_word, Sdowncase_word, 1, 1, "p",
626 doc: /* Convert to lower case from point to end of word, moving over.
628 If point is in the middle of a word, the part of that word before point
629 is ignored when moving forward.
631 With negative argument, convert previous words but do not move. */)
632 (Lisp_Object arg)
634 return casify_word (CASE_DOWN, arg);
637 DEFUN ("capitalize-word", Fcapitalize_word, Scapitalize_word, 1, 1, "p",
638 doc: /* Capitalize from point to the end of word, moving over.
639 With numerical argument ARG, capitalize the next ARG-1 words as well.
640 This gives the word(s) a first character in upper case
641 and the rest lower case.
643 If point is in the middle of a word, the part of that word before point
644 is ignored when moving forward.
646 With negative argument, capitalize previous words but do not move. */)
647 (Lisp_Object arg)
649 return casify_word (CASE_CAPITALIZE, arg);
652 void
653 syms_of_casefiddle (void)
655 DEFSYM (Qidentity, "identity");
656 DEFSYM (Qtitlecase, "titlecase");
657 DEFSYM (Qspecial_uppercase, "special-uppercase");
658 DEFSYM (Qspecial_lowercase, "special-lowercase");
659 DEFSYM (Qspecial_titlecase, "special-titlecase");
661 defsubr (&Supcase);
662 defsubr (&Sdowncase);
663 defsubr (&Scapitalize);
664 defsubr (&Supcase_initials);
665 defsubr (&Supcase_region);
666 defsubr (&Sdowncase_region);
667 defsubr (&Scapitalize_region);
668 defsubr (&Supcase_initials_region);
669 defsubr (&Supcase_word);
670 defsubr (&Sdowncase_word);
671 defsubr (&Scapitalize_word);
674 void
675 keys_of_casefiddle (void)
677 initial_define_key (control_x_map, Ctl ('U'), "upcase-region");
678 Fput (intern ("upcase-region"), Qdisabled, Qt);
679 initial_define_key (control_x_map, Ctl ('L'), "downcase-region");
680 Fput (intern ("downcase-region"), Qdisabled, Qt);
682 initial_define_key (meta_map, 'u', "upcase-word");
683 initial_define_key (meta_map, 'l', "downcase-word");
684 initial_define_key (meta_map, 'c', "capitalize-word");