From 3074a9fad1c7c57948521125ee947bfa11ae185b Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 15 Apr 2015 23:45:08 -0700 Subject: [PATCH] '[:graph:]' now excludes whitespace, not just ' ' * doc/lispref/searching.texi (Char Classes): * lisp/emacs-lisp/rx.el (rx): Document [:graph:] to be [:print:] sans whitespace (not sans space). * src/character.c (graphicp): Exclude all Unicode whitespace chars, not just space. * src/regex.c (ISGRAPH): Exclude U+00A0 (NO-BREAK SPACE). --- doc/lispref/searching.texi | 4 ++-- lisp/emacs-lisp/rx.el | 4 ++-- src/character.c | 25 ++++++++++++++++--------- src/regex.c | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 10ea411d436..5a05c7c729d 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -558,7 +558,7 @@ This matches any @acronym{ASCII} control character. This matches @samp{0} through @samp{9}. Thus, @samp{[-+[:digit:]]} matches any digit, as well as @samp{+} and @samp{-}. @item [:graph:] -This matches graphic characters---everything except space, +This matches graphic characters---everything except whitespace, @acronym{ASCII} and non-@acronym{ASCII} control characters, surrogates, and codepoints unassigned by Unicode, as indicated by the Unicode @samp{general-category} property (@pxref{Character @@ -572,7 +572,7 @@ This matches any multibyte character (@pxref{Text Representations}). @item [:nonascii:] This matches any non-@acronym{ASCII} character. @item [:print:] -This matches any printing character---either space, or a graphic +This matches any printing character---either whitespace, or a graphic character matched by @samp{[:graph:]}. @item [:punct:] This matches any punctuation character. (At present, for multibyte diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el index ab9beb60928..520210614f5 100644 --- a/lisp/emacs-lisp/rx.el +++ b/lisp/emacs-lisp/rx.el @@ -965,12 +965,12 @@ CHAR matches space and tab only. `graphic', `graph' - matches graphic characters--everything except space, ASCII + matches graphic characters--everything except whitespace, ASCII and non-ASCII control characters, surrogates, and codepoints unassigned by Unicode. `printing', `print' - matches space and graphic characters. + matches whitespace and graphic characters. `alphanumeric', `alnum' matches alphabetic characters and digits. (For multibyte characters, diff --git a/src/character.c b/src/character.c index ea98cf68e6c..c143c0f0e3e 100644 --- a/src/character.c +++ b/src/character.c @@ -984,8 +984,7 @@ character is not ASCII nor 8-bit character, an error is signaled. */) #ifdef emacs -/* Return 'true' if C is an alphabetic character as defined by its - Unicode properties. */ +/* Return true if C is an alphabetic character. */ bool alphabeticp (int c) { @@ -1008,8 +1007,7 @@ alphabeticp (int c) || gen_cat == UNICODE_CATEGORY_Nl); } -/* Return 'true' if C is an decimal-number character as defined by its - Unicode properties. */ +/* Return true if C is a decimal-number character. */ bool decimalnump (int c) { @@ -1022,16 +1020,25 @@ decimalnump (int c) return gen_cat == UNICODE_CATEGORY_Nd; } -/* Return 'true' if C is a graphic character as defined by its - Unicode properties. */ +/* Return true if C is a graphic character. */ bool graphicp (int c) { - return c == ' ' || printablep (c); + Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c); + if (! INTEGERP (category)) + return false; + EMACS_INT gen_cat = XINT (category); + + /* See UTS #18. */ + return (!(gen_cat == UNICODE_CATEGORY_Zs /* space separator */ + || gen_cat == UNICODE_CATEGORY_Zl /* line separator */ + || gen_cat == UNICODE_CATEGORY_Zp /* paragraph separator */ + || gen_cat == UNICODE_CATEGORY_Cc /* control */ + || gen_cat == UNICODE_CATEGORY_Cs /* surrogate */ + || gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */ } -/* Return 'true' if C is a printable character as defined by its - Unicode properties. */ +/* Return true if C is a printable character. */ bool printablep (int c) { diff --git a/src/regex.c b/src/regex.c index 4af70c62cf5..38c5e350541 100644 --- a/src/regex.c +++ b/src/regex.c @@ -313,7 +313,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; /* The rest must handle multibyte characters. */ # define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \ - ? (c) > ' ' && !((c) >= 0177 && (c) <= 0237) \ + ? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \ : graphicp (c)) # define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \ -- 2.11.4.GIT