lisp/international/characters.el

   1 ;;; characters.el --- set syntax and category for multibyte characters
   2
   3 ;; Copyright (C) 1997, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4 ;;   Free Software Foundation, Inc.
   5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   6 ;;   2005, 2006, 2007, 2008
   7 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   8 ;;   Registration Number H14PRO021
   9 ;; Copyright (C) 2003
  10 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
  11 ;;   Registration Number H13PRO009
  12
  13 ;; Keywords: multibyte character, character set, syntax, category
  14
  15 ;; This file is part of GNU Emacs.
  16
  17 ;; GNU Emacs is free software: you can redistribute it and/or modify
  18 ;; it under the terms of the GNU General Public License as published by
  19 ;; the Free Software Foundation, either version 3 of the License, or
  20 ;; (at your option) any later version.
  21
  22 ;; GNU Emacs is distributed in the hope that it will be useful,
  23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25 ;; GNU General Public License for more details.
  26
  27 ;; You should have received a copy of the GNU General Public License
  28 ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  29
  30 ;;; Commentary:
  31
  32 ;;; Code:
  33
  34 ;;; Predefined categories.
  35
  36 ;; For each character set.
  37
  38 (define-category ?a "ASCII graphic characters 32-126 (ISO646 IRV:1983[4/0])")
  39 (define-category ?l "Latin")
  40 (define-category ?t "Thai")
  41 (define-category ?g "Greek")
  42 (define-category ?b "Arabic")
  43 (define-category ?w "Hebrew")
  44 (define-category ?y "Cyrillic")
  45 (define-category ?k "Japanese katakana")
  46 (define-category ?r "Japanese roman")
  47 (define-category ?c "Chinese")
  48 (define-category ?j "Japanese")
  49 (define-category ?h "Korean")
  50 (define-category ?e "Ethiopic (Ge'ez)")
  51 (define-category ?v "Vietnamese")
  52 (define-category ?i "Indian")
  53 (define-category ?o "Lao")
  54 (define-category ?q "Tibetan")
  55
  56 ;; For each group (row) of 2-byte character sets.
  57
  58 (define-category ?A "Alpha-numeric characters of 2-byte character sets")
  59 (define-category ?C "Chinese (Han) characters of 2-byte character sets")
  60 (define-category ?G "Greek characters of 2-byte character sets")
  61 (define-category ?H "Japanese Hiragana characters of 2-byte character sets")
  62 (define-category ?K "Japanese Katakana characters of 2-byte character sets")
  63 (define-category ?N "Korean Hangul characters of 2-byte character sets")
  64 (define-category ?Y "Cyrillic characters of 2-byte character sets")
  65 (define-category ?I "Indian Glyphs")
  66
  67 ;; For phonetic classifications.
  68
  69 (define-category ?0 "consonant")
  70 (define-category ?1 "base (independent) vowel")
  71 (define-category ?2 "upper diacritical mark (including upper vowel)")
  72 (define-category ?3 "lower diacritical mark (including lower vowel)")
  73 (define-category ?4 "combining tone mark")
  74 (define-category ?5 "symbol")
  75 (define-category ?6 "digit")
  76 (define-category ?7 "vowel-modifying diacritical mark")
  77 (define-category ?8 "vowel-signs")
  78 (define-category ?9 "semivowel lower")
  79
  80 ;; For filling.
  81 (define-category ?| "While filling, we can break a line at this character.")
  82
  83 ;; For indentation calculation.
  84 (define-category ?\s
  85   "This character counts as a space for indentation purposes.")
  86
  87 ;; Keep the following for `kinsoku' processing.  See comments in
  88 ;; kinsoku.el.
  89 (define-category ?> "A character which can't be placed at beginning of line.")
  90 (define-category ?< "A character which can't be placed at end of line.")
  91
  92 ;; Combining
  93 (define-category ?^ "Combining diacritic or mark")
  94 \f
  95 ;;; Setting syntax and category.
  96
  97 ;; ASCII
  98
  99 ;; Codes 32..127 get both category `a' (ASCII) and category `l' (Latin).
 100 (dolist (category '(?a ?l))
 101   (modify-category-entry '(32 . 127) category))
 102
 103 ;; Deal with the CJK charsets first.  Since the syntax of blocks is
 104 ;; defined per charset, and the charsets may contain e.g. Latin
 105 ;; characters, we end up with the wrong syntax definitions if we're
 106 ;; not careful.
 107
 108 ;; Chinese characters (Unicode)
 109 (modify-category-entry '(#x2E80 . #x312F) ?|)
 110 (modify-category-entry '(#x3190 . #x33FF) ?|)
 111 (modify-category-entry '(#x3400 . #x9FAF) ?C)
 112 (modify-category-entry '(#x3400 . #x9FAF) ?c)
 113 (modify-category-entry '(#x3400 . #x9FAF) ?|)
 114 (modify-category-entry '(#xF900 . #xFAFF) ?C)
 115 (modify-category-entry '(#xF900 . #xFAFF) ?c)
 116 (modify-category-entry '(#xF900 . #xFAFF) ?|)
 117 (modify-category-entry '(#x20000 . #x2AFFF) ?|)
 118 (modify-category-entry '(#x2F800 . #x2FFFF) ?|)
 119
 120
 121 ;; Chinese character set (GB2312)
 122
 123 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2121 #x217E)
 124 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2221 #x227E)
 125 (map-charset-chars #'modify-syntax-entry 'chinese-gb2312 "_" #x2921 #x297E)
 126
 127 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)
 128 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2330 #x2339)
 129 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2341 #x235A)
 130 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?A #x2361 #x237A)
 131 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?H #x2421 #x247E)
 132 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?K #x2521 #x257E)
 133 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?G #x2621 #x267E)
 134 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?Y #x2721 #x277E)
 135 (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?C #x3021 #x7E7E)
 136
 137 ;; Chinese character set (BIG5)
 138
 139 (map-charset-chars #'modify-category-entry 'big5 ?c)
 140 (map-charset-chars #'modify-category-entry 'big5 ?C #xA259 #xA25F)
 141 (map-charset-chars #'modify-category-entry 'big5 ?C #xA440 #xC67E)
 142 (map-charset-chars #'modify-category-entry 'big5 ?C #xC940 #xF9DF)
 143
 144 ;; Chinese character set (CNS11643)
 145
 146 (dolist (plane '(chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
 147                  chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
 148                  chinese-cns11643-7))
 149   (map-charset-chars #'modify-category-entry plane ?c)   ; whole plane: `c'
 150   (cond ((eq plane 'chinese-cns11643-1)   ; plane 1: `C' only for #x4421..#x7E7E
 151          (map-charset-chars #'modify-category-entry plane ?C #x4421 #x7E7E))
 152         (t (map-charset-chars #'modify-category-entry plane ?C))))
 153
 154 ;; Japanese character set (JISX0201, JISX0208, JISX0212, JISX0213)
 155
 156 (map-charset-chars #'modify-category-entry 'katakana-jisx0201 ?k)
 157
 158 (map-charset-chars #'modify-category-entry 'latin-jisx0201 ?r)
 159
 160 (dolist (l '(katakana-jisx0201 japanese-jisx0208 japanese-jisx0212
 161                                japanese-jisx0213-1 japanese-jisx0213-2))
 162   (map-charset-chars #'modify-category-entry l ?j))
 163
 164 ;; Unicode equivalents of JISX0201-kana
 165 (dolist (category '(?k         ; Japanese katakana
 166                     ?j         ; Japanese
 167                     ?\|))      ; a line may be broken here when filling
 168   (modify-category-entry '(#xff61 . #xff9f) category))
 169
 170 ;; Katakana block
 171 (dolist (category '(?K ?\|))
 172   ;; ?K (Katakana of 2-byte character sets) is double width; ?k
 173   ;; isn't specified here.  ?| allows a line break when filling.
 174   (modify-category-entry '(#x30a0 . #x30ff) category))
 175
 176 ;; Hiragana block
 177 (let ((range '(#x3040 . #x309d)))
 178   ;; ?H is actually defined to be double width
 179   ;;(modify-category-entry range ?H)
 180   (modify-category-entry range ?\|)
 181   )
 182
 183 ;; JISX0208
 184 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2121 #x227E)
 185 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0208 "_" #x2821 #x287E)
 186 (dolist (ch '(?ー ?゛ ?゜ ?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
 187   ;; Give EVERY listed mark word syntax; the old loop passed (car chars) and so only ever re-set the first one.
 188   (modify-syntax-entry ch "w"))
 189
 190 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?A #x2321 #x237E)
 191 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?H #x2421 #x247E)
 192 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?K #x2521 #x257E)
 193 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?G #x2621 #x267E)
 194 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?Y #x2721 #x277E)
 195 (map-charset-chars #'modify-category-entry 'japanese-jisx0208 ?C #x3021 #x7E7E)
 196 (modify-category-entry ?ー ?K)
 197 ;; These two marks belong to both the Katakana (`K') and the
 198 ;; Hiragana (`H') category.
 199 (dolist (mark '(?゛ ?゜))
 200   (modify-category-entry mark ?K)
 201   (modify-category-entry mark ?H))
 202 ;; The marks below are categorized with the Chinese (Han)
 203 ;; characters (`C').
 204 (dolist (mark '(?ヽ ?ヾ ?ゝ ?ゞ ?〃 ?仝 ?々 ?〆 ?〇))
 205   (modify-category-entry mark ?C))
 206
 207 ;; JISX0212
 208
 209 (map-charset-chars #'modify-syntax-entry 'japanese-jisx0212 "_" #x2121 #x237E)
 210
 211 ;; JISX0201-Kana
 212
 213 (dolist (punct '(?｡ ?､ ?･))
 214   ;; Each of these three characters becomes punctuation; no table
 215   ;; argument is passed, so the current syntax table is modified.
 216   (modify-syntax-entry punct "."))
 217
 218 (modify-syntax-entry ?\｢ "(｣")         ; opening delimiter, matched by ｣
 219 (modify-syntax-entry ?\｣ ")｢")         ; closing delimiter (was wrongly class "("), matched by ｢
 220
 221 ;; Korean character set (KSC5601)
 222
 223 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?h)
 224
 225 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2121 #x227E)
 226 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2621 #x277E)
 227 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2830 #x287E)
 228 (map-charset-chars #'modify-syntax-entry 'korean-ksc5601 "_" #x2930 #x297E)
 229 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2330 #x2339)
 230 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2341 #x235A)
 231 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?A #x2361 #x237A)
 232 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?G #x2521 #x257E)
 233 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?H #x2A21 #x2A7E)
 234 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?K #x2B21 #x2B7E)
 235 (map-charset-chars #'modify-category-entry 'korean-ksc5601 ?Y #x2C21 #x2C7E)
 236
 237 ;; These are in more than one charset.
 238 (let ((parens (concat "〈〉《》「」『』【】〔〕〖〗〘〙〚〛"
 239                       "︵︶︷︸︹︺︻︼︽︾︿﹀﹁﹂﹃﹄"
 240                       "（）［］｛｝"))
 241       open close)
 242   (dotimes (pair (/ (length parens) 2))
 243     (setq open  (aref parens (* 2 pair))         ; even index: opening char
 244           close (aref parens (1+ (* 2 pair))))   ; odd index: its closing partner
 245     (modify-syntax-entry open  (string ?\( close))
 246     (modify-syntax-entry close (string ?\) open))))
 247
 248 ;; Arabic character set
 249
 250 (dolist (charset '(arabic-iso8859-6
 251                    arabic-digit
 252                    arabic-1-column
 253                    arabic-2-column))
 254   ;; Walk the whole charset (no code range is given) and put each
 255   ;; of its characters into category `b' (Arabic).
 256   (map-charset-chars #'modify-category-entry charset ?b))
 257 (modify-category-entry '(#x600 . #x6ff) ?b)
 258 (modify-category-entry '(#xfb50 . #xfdff) ?b)
 259 (modify-category-entry '(#xfe70 . #xfefe) ?b)
 260
 261 ;; Cyrillic character set (ISO-8859-5)
 262
 263 (modify-syntax-entry ?№ ".")
 264
 265 ;; Ethiopic character set
 266
 267 (modify-category-entry '(#x1200 . #x1399) ?e)
 268 (modify-category-entry '(#x2d80 . #x2dde) ?e)
 269 (dolist (mark '(?፡ ?። ?፣ ?፤ ?፥ ?፦ ?፧ ?፨))
 270   ;; Each of these Ethiopic marks is given punctuation syntax; no
 271   ;; table argument is passed, so the current syntax table is changed.
 272   (modify-syntax-entry mark "."))
 273 (map-charset-chars #'modify-category-entry 'ethiopic ?e)
 274
 275 ;; Hebrew character set (ISO-8859-8)
 276
 277 (modify-syntax-entry #x5be ".") ; MAQAF
 278 (modify-syntax-entry #x5c0 ".") ; PASEQ
 279 (modify-syntax-entry #x5c3 ".") ; SOF PASUQ
 280 (modify-syntax-entry #x5f3 ".") ; GERESH
 281 (modify-syntax-entry #x5f4 ".") ; GERSHAYIM
 282
 283 ;; Indian character set (IS 13194 and other Emacs original Indian charsets)
 284
 285 (modify-category-entry '(#x901 . #x970) ?i)
 286 (map-charset-chars #'modify-category-entry 'indian-is13194 ?i)
 287 (map-charset-chars #'modify-category-entry 'indian-2-column ?i)
 288
 289 ;; Lao character set
 290
 291 (modify-category-entry '(#xe80 . #xeff) ?o)
 292 (map-charset-chars #'modify-category-entry 'lao ?o)
 293
 294 (let ((deflist  '(("ກ-ຮ"    "w"     ?0) ; consonant
 295                   ("ະາຳຽເ-ໄ"        "w"     ?1) ; vowel base
 296                   ("ັິ-ືົໍ"   "w"     ?2) ; vowel upper
 297                   ("ຸູ"     "w"     ?3) ; vowel lower
 298                   ("່-໋"    "w"     ?4) ; tone mark
 299                   ("ຼຽ"     "w"     ?9) ; semivowel lower
 300                   ("໐-໙"    "w"     ?6) ; digit
 301                   ("ຯໆ"     "_"     ?5) ; symbol
 302                   ))
 303       elm chars len syntax category to ch i)
 304   (while deflist                       ; one (CHARS SYNTAX CATEGORY) entry per pass
 305     (setq elm (car deflist))
 306     (setq chars (car elm)              ; string of single chars and FROM-TO ranges
 307           len (length chars)
 308           syntax (nth 1 elm)
 309           category (nth 2 elm)
 310           i 0)
 311     (while (< i len)
 312       (if (= (aref chars i) ?-)
 313           (setq i (1+ i)               ; skip the `-'; the char after it ends the range
 314                 to (aref chars i))     ; `ch' was left one past the range start
 315         (setq ch (aref chars i)        ; plain char: a one-element range
 316               to ch))
 317       (while (<= ch to)
 318         (unless (string-equal syntax "w")   ; "w" entries never touch the syntax table
 319           (modify-syntax-entry ch syntax))
 320         (modify-category-entry ch category)
 321         (setq ch (1+ ch)))             ; ends with ch = to + 1, which a following `-' relies on
 322       (setq i (1+ i)))
 323     (setq deflist (cdr deflist))))
 324
 325 ;; Thai character set (TIS620)
 326
 327 (modify-category-entry '(#xe00 . #xe7f) ?t)
 328 (map-charset-chars #'modify-category-entry 'thai-tis620 ?t)
 329
 330 (let ((deflist  '(;; chars      syntax  category
 331                   ("ก-รลว-ฮ"  "w"     ?0) ; consonant
 332                   ("ฤฦะาำเ-ๅ"     "w"     ?1) ; vowel base
 333                   ("ัิ-ื็๎"   "w"     ?2) ; vowel upper
 334                   ("ุ-ฺ"    "w"     ?3) ; vowel lower
 335                   ("่-ํ"    "w"     ?4) ; tone mark
 336                   ("๐-๙"    "w"     ?6) ; digit
 337                   ("ฯๆ฿๏๚๛" "_"     ?5) ; symbol
 338                   ))
 339       elm chars len syntax category to ch i)
 340   (while deflist                       ; one (CHARS SYNTAX CATEGORY) entry per pass
 341     (setq elm (car deflist))
 342     (setq chars (car elm)              ; string of single chars and FROM-TO ranges
 343           len (length chars)
 344           syntax (nth 1 elm)
 345           category (nth 2 elm)
 346           i 0)
 347     (while (< i len)
 348       (if (= (aref chars i) ?-)
 349           (setq i (1+ i)               ; skip the `-'; the char after it ends the range
 350                 to (aref chars i))     ; `ch' was left one past the range start
 351         (setq ch (aref chars i)        ; plain char: a one-element range
 352               to ch))
 353       (while (<= ch to)
 354         (unless (string-equal syntax "w")   ; "w" entries never touch the syntax table
 355           (modify-syntax-entry ch syntax))
 356         (modify-category-entry ch category)
 357         (setq ch (1+ ch)))             ; ends with ch = to + 1, which a following `-' relies on
 358       (setq i (1+ i)))
 359     (setq deflist (cdr deflist))))
 360
 361 ;; Tibetan character set
 362
 363 (modify-category-entry '(#xf00 . #xfff) ?q)
 364 (map-charset-chars #'modify-category-entry 'tibetan ?q)
 365 (map-charset-chars #'modify-category-entry 'tibetan-1-column ?q)
 366
 367 (let ((deflist  '(;; chars             syntax category
 368                   ("ཀ-ཀྵཪ"         "w"     ?0) ; consonant
 369                   ("ྐ-ྐྵྺྻྼ"       "w"     ?0) ;
 370                   ("ིེཻོཽྀ"       "w"       ?2) ; upper vowel
 371                   ("ཾྂྃ྆྇ྈྉྊྋ" "w"    ?2) ; upper modifier
 372                   ("྄ཱུ༙༵༷"       "w"       ?3) ; lowel vowel/modifier
 373                   ("཰"                "w" ?3)             ; invisible vowel a
 374                   ("༠-༩༪-༳"             "w"     ?6) ; digit
 375                   ("་།-༒༔ཿ"        "."     ?|) ; line-break char
 376                   ("་།༏༐༑༔ཿ"            "."     ?|) ;
 377                   ("༈་།-༒༔ཿ༽༴"  "."     ?>) ; prohibition
 378                   ("་།༏༐༑༔ཿ"            "."     ?>) ;
 379                   ("ༀ-༊༼࿁࿂྅"      "."     ?<) ; prohibition
 380                   ("༓༕-༘༚-༟༶༸-༻༾༿྾྿-࿏" "." ?q) ; others
 381                   ))
 382       elm chars len syntax category to ch i)
 383   (while deflist                       ; one (CHARS SYNTAX CATEGORY) entry per pass
 384     (setq elm (car deflist))
 385     (setq chars (car elm)              ; string of single chars and FROM-TO ranges
 386           len (length chars)
 387           syntax (nth 1 elm)
 388           category (nth 2 elm)
 389           i 0)
 390     (while (< i len)
 391       (if (= (aref chars i) ?-)
 392           (setq i (1+ i)               ; skip the `-'; the char after it ends the range
 393                 to (aref chars i))     ; `ch' was left one past the range start
 394         (setq ch (aref chars i)        ; plain char: a one-element range
 395               to ch))
 396       (while (<= ch to)
 397         (unless (string-equal syntax "w")   ; "w" entries never touch the syntax table
 398           (modify-syntax-entry ch syntax))
 399         (modify-category-entry ch category)
 400         (setq ch (1+ ch)))             ; ends with ch = to + 1, which a following `-' relies on
 401       (setq i (1+ i)))
 402     (setq deflist (cdr deflist))))
 403
 404 ;; Vietnamese character set
 405
 406 ;; To make a word with Latin characters
 407 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?l)
 408 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-lower ?v)
 409
 410 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?l)
 411 (map-charset-chars #'modify-category-entry 'vietnamese-viscii-upper ?v)
 412
 413 (let ((case-table (standard-case-table))
 414       (code 32))
 415   ;; Codes 32..127 index the upper and lower VISCII charsets in parallel.
 416   (while (< code 128)
 417     (let* ((upper (decode-char 'vietnamese-viscii-upper code))
 418            (lower (decode-char 'vietnamese-viscii-lower code))
 419            (upper-ucs (encode-char upper 'ucs))
 420            (lower-ucs (encode-char lower 'ucs)))
 421       (set-case-syntax-pair upper lower case-table)
 422       (when upper-ucs (modify-category-entry upper-ucs ?v))
 423       (when lower-ucs (modify-category-entry lower-ucs ?v)))
 424     (setq code (1+ code))))
 425
 426 ;; Tai Viet
 427 (let ((deflist '(;; chars       syntax  category
 428                  ((?ꪀ.  ?ꪯ) "w"     ?0) ; cosonant
 429                  ("ꪱꪵꪶ"           "w"     ?1) ; vowel base
 430                  ((?ꪹ . ?ꪽ) "w"     ?1) ; vowel base
 431                  ("ꪰꪲꪳꪷꪸꪾ"  "w"     ?2) ; vowel upper
 432                  ("ꪴ"         "w"     ?3) ; vowel lower
 433                  ("ꫀꫂ"              "w"     ?1) ; non-combining tone-mark
 434                  ("꪿꫁"              "w"     ?4) ; combining tone-mark
 435                  ((?ꫛ . ?꫟) "_"     ?5) ; symbol
 436                  )))
 437   (dolist (entry deflist)
 438     (let ((spec (car entry))          ; cons (FROM . TO) or a string of chars
 439           (syntax (nth 1 entry))
 440           (category (nth 2 entry)))
 441       ;; Syntax is applied for every entry here, "w" included.
 442       (cond ((consp spec)              ; a range goes through in one call each
 443              (modify-syntax-entry spec syntax)
 444              (modify-category-entry spec category))
 445             (t                         ; a string is visited char by char
 446              (dolist (ch (append spec nil))
 447                (modify-syntax-entry ch syntax)
 448                (modify-category-entry ch category)))))))
 449
 450 ;; Latin
 451
 452 (modify-category-entry '(#x80 . #x024F) ?l)
 453
 454 (let ((tbl (standard-case-table)) c)
 455
 456   ;; Latin-1
 457
 458   ;; Fixme: Some of the non-word syntaxes here perhaps should be
 459   ;; reviewed.  (Note that the following all implicitly have word
 460   ;; syntax: ¢£¤¥¨ª¯²³´¶¸¹º.)  There should be a well-defined way of
 461   ;; relating Unicode categories to Emacs syntax codes.
 462
 463   ;; NBSP isn't semantically interchangeable with other whitespace chars,
 464   ;; so it's more like punctuation.
 465   (set-case-syntax ?  "." tbl)
 466   (set-case-syntax ?¡ "." tbl)
 467   (set-case-syntax ?¦ "_" tbl)
 468   (set-case-syntax ?§ "." tbl)
 469   (set-case-syntax ?© "_" tbl)
 470   (set-case-syntax-delims 171 187 tbl)  ; « »
 471   (set-case-syntax ?¬ "_" tbl)
 472   (set-case-syntax ? "_" tbl)
 473   (set-case-syntax ?® "_" tbl)
 474   (set-case-syntax ?° "_" tbl)
 475   (set-case-syntax ?± "_" tbl)
 476   (set-case-syntax ?µ "_" tbl)
 477   (set-case-syntax ?· "_" tbl)
 478   (set-case-syntax ?¼ "_" tbl)
 479   (set-case-syntax ?½ "_" tbl)
 480   (set-case-syntax ?¾ "_" tbl)
 481   (set-case-syntax ?¿ "." tbl)
 482   (dotimes (offset 31)                 ; codes 192..222 inclusive
 483     (let ((upper (+ 192 offset)))
 484       ;; The lower-case partner is the code 32 above the upper one.
 485       (set-case-syntax-pair upper (+ upper 32) tbl)))
 486   (set-case-syntax ?× "_" tbl)
 487   (set-case-syntax ?ß "w" tbl)
 488   (set-case-syntax ?÷ "_" tbl)
 489   ;; See below for ÿ.
 490
 491   ;; Latin Extended-A, Latin Extended-B
 492   (setq c #x0100)
 493   (while (<= c #x02B8)
 494     (modify-category-entry c ?l)
 495     (setq c (1+ c)))
 496
 497   (let ((pair-ranges '((#x0100 . #x012F)
 498                        (#x0132 . #x0137)
 499                        (#x0139 . #x0148)
 500                        (#x014a . #x0177)
 501                        (#x0179 . #x017E)
 502                        (#x0182 . #x0185)
 503                        (#x0187 . #x018C)
 504                        (#x0191 . #x0192)
 505                        (#x0198 . #x0199)
 506                        (#x01A0 . #x01A5)
 507                        (#x01A7 . #x01A8)
 508                        (#x01AC . #x01AD)
 509                        (#x01AF . #x01B0)
 510                        (#x01B3 . #x01B6)
 511                        (#x01BC . #x01BD)
 512                        (#x01CD . #x01DC)
 513                        (#x01DE . #x01EF)
 514                        (#x01F4 . #x01F5)
 515                        (#x01F8 . #x021F)
 516                        (#x0222 . #x0233)
 517                        (#x023B . #x023C)
 518                        (#x0241 . #x0242)
 519                        (#x0246 . #x024F))))
 520     (dolist (elt pair-ranges)
 521       (let ((from (car elt)) (to (cdr elt)))
 522         (while (< from to)
 523           (set-case-syntax-pair from (1+ from) tbl)
 524           (setq from (+ from 2))))))
 525
 526   ;; In some languages, such as Turkish, U+0049 LATIN CAPITAL LETTER I
 527   ;; and U+0131 LATIN SMALL LETTER DOTLESS I make a case pair, and so
 528   ;; do U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+0069 LATIN
 529   ;; SMALL LETTER I.
 530
 531   ;; We used to set up half of those correspondence unconditionally,
 532   ;; but that makes searches slow.  So now we don't set up either half
 533   ;; of these correspondences by default.
 534
 535   ;; (set-downcase-syntax  ?İ ?i tbl)
 536   ;; (set-upcase-syntax    ?I ?ı tbl)
 537
 538   (set-case-syntax-pair ?Ǆ ?ǆ tbl)
 539   (set-case-syntax-pair ?ǅ ?ǆ tbl)
 540   (set-case-syntax-pair ?Ǉ ?ǉ tbl)
 541   (set-case-syntax-pair ?ǈ ?ǉ tbl)
 542   (set-case-syntax-pair ?Ǌ ?ǌ tbl)
 543   (set-case-syntax-pair ?ǋ ?ǌ tbl)
 544
 545   ;; 01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
 546   (set-case-syntax-pair ?Ǳ ?ǳ tbl)
 547   (set-case-syntax-pair ?ǲ ?ǳ tbl)
 548   (set-case-syntax-pair ?Ƕ ?ƕ tbl)
 549   (set-case-syntax-pair ?Ƿ ?ƿ tbl)
 550
 551   ;; Latin Extended Additional
 552   (modify-category-entry '(#x1e00 . #x1ef9) ?l)
 553   (setq c #x1e00)
 554   (while (<= c #x1ef9)
 555     (and (zerop (% c 2))
 556          (or (<= c #x1e94) (>= c #x1ea0))
 557          (set-case-syntax-pair c (1+ c) tbl))
 558     (setq c (1+ c)))
 559
 560   ;; Greek
 561   (modify-category-entry '(#x0370 . #x03ff) ?g)
 562   (setq c #x0370)
 563   (while (<= c #x03ff)
 564     (if (or (and (>= c #x0391) (<= c #x03a1))
 565             (and (>= c #x03a3) (<= c #x03ab)))
 566         (set-case-syntax-pair c (+ c 32) tbl))
 567     (and (>= c #x03da)
 568          (<= c #x03ee)
 569          (zerop (% c 2))
 570          (set-case-syntax-pair c (1+ c) tbl))
 571     (setq c (1+ c)))
 572   (set-case-syntax-pair ?Ά ?ά tbl)
 573   (set-case-syntax-pair ?Έ ?έ tbl)
 574   (set-case-syntax-pair ?Ή ?ή tbl)
 575   (set-case-syntax-pair ?Ί ?ί tbl)
 576   (set-case-syntax-pair ?Ό ?ό tbl)
 577   (set-case-syntax-pair ?Ύ ?ύ tbl)
 578   (set-case-syntax-pair ?Ώ ?ώ tbl)
 579
 580   ;; Armenian
 581   (setq c #x531)
 582   (while (<= c #x556)
 583     (set-case-syntax-pair c (+ c #x30) tbl)
 584     (setq c (1+ c)))
 585
 586   ;; Greek Extended
 587   (modify-category-entry '(#x1f00 . #x1fff) ?g)
 588   (setq c #x1f00)
 589   (while (<= c #x1fff)
 590     (and (<= (logand c #x000f) 7)     ; low nibble 0..7: c is the small letter, c+8 its capital
 591          (<= c #x1fa7)
 592          (not (memq c '(#x1f50 #x1f52 #x1f54 #x1f56)))   ; no pair is set up for these
 593          (/= (logand c #x00f0) #x70)  ; skip row #x1f7x; the old comparison with 7 could never be false
 594          (set-case-syntax-pair (+ c 8) c tbl))
 595     (setq c (1+ c)))
 596   (set-case-syntax-pair ?Ᾰ ?ᾰ tbl)
 597   (set-case-syntax-pair ?Ᾱ ?ᾱ tbl)
 598   (set-case-syntax-pair ?Ὰ ?ὰ tbl)
 599   (set-case-syntax-pair ?Ά ?ά tbl)
 600   (set-case-syntax-pair ?ᾼ ?ᾳ tbl)
 601   (set-case-syntax-pair ?Ὲ ?ὲ tbl)
 602   (set-case-syntax-pair ?Έ ?έ tbl)
 603   (set-case-syntax-pair ?Ὴ ?ὴ tbl)
 604   (set-case-syntax-pair ?Ή ?ή tbl)
 605   (set-case-syntax-pair ?ῌ ?ῃ tbl)
 606   (set-case-syntax-pair ?Ῐ ?ῐ tbl)
 607   (set-case-syntax-pair ?Ῑ ?ῑ tbl)
 608   (set-case-syntax-pair ?Ὶ ?ὶ tbl)
 609   (set-case-syntax-pair ?Ί ?ί tbl)
 610   (set-case-syntax-pair ?Ῠ ?ῠ tbl)
 611   (set-case-syntax-pair ?Ῡ ?ῡ tbl)
 612   (set-case-syntax-pair ?Ὺ ?ὺ tbl)
 613   (set-case-syntax-pair ?Ύ ?ύ tbl)
 614   (set-case-syntax-pair ?Ῥ ?ῥ tbl)
 615   (set-case-syntax-pair ?Ὸ ?ὸ tbl)
 616   (set-case-syntax-pair ?Ό ?ό tbl)
 617   (set-case-syntax-pair ?Ὼ ?ὼ tbl)
 618   (set-case-syntax-pair ?Ώ ?ώ tbl)
 619   (set-case-syntax-pair ?ῼ ?ῳ tbl)
 620
 621   ;; cyrillic
 622   (modify-category-entry '(#x0400 . #x04FF) ?y)
 623   (setq c #x0400)
 624   (while (<= c #x04ff)
 625     (and (>= c #x0400)
 626          (<= c #x040f)
 627          (set-case-syntax-pair c (+ c 80) tbl))
 628     (and (>= c #x0410)
 629          (<= c #x042f)
 630          (set-case-syntax-pair c (+ c 32) tbl))
 631     (and (zerop (% c 2))
 632          (or (and (>= c #x0460) (<= c #x0480))
 633              (and (>= c #x048c) (<= c #x04be))
 634              (and (>= c #x04d0) (<= c #x04f4)))
 635          (set-case-syntax-pair c (1+ c) tbl))
 636     (setq c (1+ c)))
 637   (set-case-syntax-pair ?Ӂ ?ӂ tbl)
 638   (set-case-syntax-pair ?Ӄ ?ӄ tbl)
 639   (set-case-syntax-pair ?Ӈ ?ӈ tbl)
 640   (set-case-syntax-pair ?Ӌ ?ӌ tbl)
 641   (set-case-syntax-pair ?Ӹ ?ӹ tbl)
 642
 643   ;; general punctuation
 644   (setq c #x2000)
 645   (while (<= c #x200b)                 ; #x2000..#x200b: whitespace syntax
 646     (set-case-syntax c " " tbl)
 647     (setq c (1+ c)))
 648   (while (<= c #x200F)                 ; resumes at #x200c, where the loop above stopped
 649     (set-case-syntax c "." tbl)
 650     (setq c (1+ c)))
 651   ;; Fixme: These aren't all right:
 652   (setq c #x2010)
 653   (while (<= c #x2016)                 ; #x2010..#x2016: symbol syntax
 654     (set-case-syntax c "_" tbl)
 655     (setq c (1+ c)))
 656   ;; Punctuation syntax for quotation marks (like `)
 657   (while (<= c #x201f)                 ; resumes at #x2017
 658     (set-case-syntax  c "." tbl)
 659     (setq c (1+ c)))
 660   ;; Fixme: These aren't all right:
 661   (while (<= c #x2027)                 ; resumes at #x2020: symbol syntax
 662     (set-case-syntax c "_" tbl)
 663     (setq c (1+ c)))
 664   (while (<= c #x206F)                 ; resumes at #x2028: punctuation syntax
 665     (set-case-syntax c "." tbl)
 666     (setq c (1+ c)))
 667
 668   ;; Roman numerals
 669   (setq c #x2160)
 670   (while (<= c #x216f)
 671     (set-case-syntax-pair c (+ c #x10) tbl)
 672     (setq c (1+ c)))
 673
 674   ;; Fixme: The following blocks might be better as symbol rather than
 675   ;; punctuation.
 676   ;; Arrows
 677   (setq c #x2190)
 678   (while (<= c #x21FF)                 ; #x2190..#x21FF: punctuation syntax
 679     (set-case-syntax c "." tbl)
 680     (setq c (1+ c)))
 681   ;; Mathematical Operators
 682   (while (<= c #x22FF)                 ; resumes at #x2200; `c' is not reset between loops
 683     (set-case-syntax c "." tbl)
 684     (setq c (1+ c)))
 685   ;; Miscellaneous Technical
 686   (while (<= c #x23FF)                 ; resumes at #x2300
 687     (set-case-syntax c "." tbl)
 688     (setq c (1+ c)))
 689   ;; Control Pictures
 690   (while (<= c #x243F)                 ; resumes at #x2400; symbol syntax, unlike the loops above
 691     (set-case-syntax c "_" tbl)
 692     (setq c (1+ c)))
 693
 694   ;; Circled Latin
 695   (setq c #x24b6)
 696   (while (<= c #x24cf)
 697     (let ((lower (+ c 26)))            ; the small form is 26 code points higher
 698       (set-case-syntax-pair c lower tbl)
 699       (dolist (ch (list c lower)) (modify-category-entry ch ?l)))
 700     (setq c (1+ c)))
 701
 702   ;; Fullwidth Latin
 703   (setq c #xff21)
 704   (while (<= c #xff3a)
 705     (let ((lower (+ c #x20)))          ; the small form is #x20 code points higher
 706       (set-case-syntax-pair c lower tbl)
 707       (dolist (ch (list c lower)) (modify-category-entry ch ?l)))
 708     (setq c (1+ c)))
 709
 710   ;; Combining diacritics
 711   (modify-category-entry '(#x300 . #x362) ?^)
 712   ;; Combining marks
 713   (modify-category-entry '(#x20d0 . #x20e3) ?^)
 714
 715   ;; Fixme: syntax for symbols &c
 716   )
 717
 718 (let ((pairs
 719        '("⁅⁆"                               ; U+2045 U+2046
 720          "⁽⁾"                               ; U+207D U+207E
 721          "₍₎"                               ; U+208D U+208E
 722          "〈〉"                               ; U+2329 U+232A
 723          "⎴⎵"                               ; U+23B4 U+23B5
 724          "❨❩"                               ; U+2768 U+2769
 725          "❪❫"                               ; U+276A U+276B
 726          "❬❭"                               ; U+276C U+276D
 727          "❰❱"                               ; U+2770 U+2771
 728          "❲❳"                               ; U+2772 U+2773
 729          "❴❵"                               ; U+2774 U+2775
 730          "⟦⟧"                               ; U+27E6 U+27E7
 731          "⟨⟩"                               ; U+27E8 U+27E9
 732          "⟪⟫"                               ; U+27EA U+27EB
 733          "⦃⦄"                               ; U+2983 U+2984
 734          "⦅⦆"                               ; U+2985 U+2986
 735          "⦇⦈"                               ; U+2987 U+2988
 736          "⦉⦊"                               ; U+2989 U+298A
 737          "⦋⦌"                               ; U+298B U+298C
 738          "⦍⦎"                               ; U+298D U+298E
 739          "⦏⦐"                               ; U+298F U+2990
 740          "⦑⦒"                               ; U+2991 U+2992
 741          "⦓⦔"                               ; U+2993 U+2994
 742          "⦕⦖"                               ; U+2995 U+2996
 743          "⦗⦘"                               ; U+2997 U+2998
 744          "⧼⧽"                               ; U+29FC U+29FD
 745          "〈〉"                               ; U+3008 U+3009
 746          "《》"                               ; U+300A U+300B
 747          "「」"                               ; U+300C U+300D
 748          "『』"                               ; U+300E U+300F
 749          "【】"                               ; U+3010 U+3011
 750          "〔〕"                               ; U+3014 U+3015
 751          "〖〗"                               ; U+3016 U+3017
 752          "〘〙"                               ; U+3018 U+3019
 753          "〚〛"                               ; U+301A U+301B
 754          "﴾﴿"                               ; U+FD3E U+FD3F
 755          "︵︶"                               ; U+FE35 U+FE36
 756          "︷︸"                               ; U+FE37 U+FE38
 757          "︹︺"                               ; U+FE39 U+FE3A
 758          "︻︼"                               ; U+FE3B U+FE3C
 759          "︽︾"                               ; U+FE3D U+FE3E
 760          "︿﹀"                               ; U+FE3F U+FE40
 761          "﹁﹂"                               ; U+FE41 U+FE42
 762          "﹃﹄"                               ; U+FE43 U+FE44
 763          "﹙﹚"                               ; U+FE59 U+FE5A
 764          "﹛﹜"                               ; U+FE5B U+FE5C
 765          "﹝﹞"                               ; U+FE5D U+FE5E
 766          "（）"                               ; U+FF08 U+FF09
 767          "［］"                               ; U+FF3B U+FF3D
 768          "｛｝"                               ; U+FF5B U+FF5D
 769          "｟｠"                               ; U+FF5F U+FF60
 770          "｢｣"                               ; U+FF62 U+FF63
 771          )))
 772   (dolist (elt pairs)
 773     (modify-syntax-entry (aref elt 0) (string ?\( (aref elt 1)))
 774     (modify-syntax-entry (aref elt 1) (string ?\) (aref elt 0)))))
 775
 776 \f
 777 ;; For each character set, record the coding system best suited to
 778 ;; encoding it in the charset's `preferred-coding-system' property.
 779
 780 ;; Fixme: should this be junked?
 781 (let ((prefs
 782        '((latin-iso8859-1     . iso-latin-1)
 783          (latin-iso8859-2     . iso-latin-2)
 784          (latin-iso8859-3     . iso-latin-3)
 785          (latin-iso8859-4     . iso-latin-4)
 786          (thai-tis620         . thai-tis620)
 787          (greek-iso8859-7     . greek-iso-8bit)
 788          (arabic-iso8859-6    . iso-2022-7bit)
 789          (hebrew-iso8859-8    . hebrew-iso-8bit)
 790          (katakana-jisx0201   . japanese-shift-jis)
 791          (latin-jisx0201      . japanese-shift-jis)
 792          (cyrillic-iso8859-5  . cyrillic-iso-8bit)
 793          (latin-iso8859-9     . iso-latin-5)
 794          (japanese-jisx0208-1978 . iso-2022-jp)
 795          (chinese-gb2312      . chinese-iso-8bit)
 796          (chinese-gbk         . chinese-gbk)
 797          (gb18030-2-byte      . chinese-gb18030)
 798          (gb18030-4-byte-bmp  . chinese-gb18030)
 799          (gb18030-4-byte-smp  . chinese-gb18030)
 800          (gb18030-4-byte-ext-1 . chinese-gb18030)
 801          (gb18030-4-byte-ext-2 . chinese-gb18030)
 802          (japanese-jisx0208   . iso-2022-jp)
 803          (korean-ksc5601      . iso-2022-kr)
 804          (japanese-jisx0212   . iso-2022-jp)
 805          (chinese-big5-1      . chinese-big5)
 806          (chinese-big5-2      . chinese-big5)
 807          (chinese-sisheng     . iso-2022-7bit)
 808          (ipa                 . iso-2022-7bit)
 809          (vietnamese-viscii-lower . vietnamese-viscii)
 810          (vietnamese-viscii-upper . vietnamese-viscii)
 811          (arabic-digit        . iso-2022-7bit)
 812          (arabic-1-column     . iso-2022-7bit)
 813          (lao                 . lao)
 814          (arabic-2-column     . iso-2022-7bit)
 815          (indian-is13194      . devanagari)
 816          (indian-glyph        . devanagari)
 817          (tibetan-1-column    . tibetan)
 818          (ethiopic            . iso-2022-7bit)
 819          (chinese-cns11643-1  . iso-2022-cn)
 820          (chinese-cns11643-2  . iso-2022-cn)
 821          (chinese-cns11643-3  . iso-2022-cn)
 822          (chinese-cns11643-4  . iso-2022-cn)
 823          (chinese-cns11643-5  . iso-2022-cn)
 824          (chinese-cns11643-6  . iso-2022-cn)
 825          (chinese-cns11643-7  . iso-2022-cn)
 826          (indian-2-column     . devanagari)
 827          (tibetan             . tibetan)
 828          (latin-iso8859-14    . iso-latin-8)
 829          (latin-iso8859-15    . iso-latin-9))))
 830   (dolist (pref prefs)                 ; PREF is (CHARSET . CODING-SYSTEM)
 831     (put-charset-property (car pref) 'preferred-coding-system (cdr pref))))
 832
 833 \f
 834 ;; Set up auto-fill-chars for charsets that should invoke auto-filling.
 835 ;; SPACE and NEWLINE are already set.
 836
 837 (dolist (range '((#x3041 . #x30FF) (#x3400 . #x4DB5)
 838                  (#x4e00 . #x9fbb) (#xF900 . #xFAFF)
 839                  (#xFF00 . #xFF9F) (#x20000 . #x2FFFF)))
 840   ;; RANGE is an inclusive (FROM . TO) pair of code points; flag every
 841   ;; character in it as one that should invoke auto-filling.
 842   (set-char-table-range auto-fill-chars range t))
 843
 844 \f
 845 ;;; Setting char-width-table.  The default is 1.
 846
 847 ;; 0: non-spacing, enclosing combining, formatting, Hangul Jamo medial
 848 ;;    and final characters.
 849 (let ((l '((#x0300 . #x036F)           ; L: inclusive (FROM . TO) ranges
 850            (#x0483 . #x0489)
 851            (#x0591 . #x05BD)
 852            (#x05BF . #x05BF)
 853            (#x05C1 . #x05C2)
 854            (#x05C4 . #x05C5)
 855            (#x05C7 . #x05C7)
 856            (#x0600 . #x0603)
 857            (#x0610 . #x0615)
 858            (#x064B . #x065E)
 859            (#x0670 . #x0670)
 860            (#x06D6 . #x06E4)
 861            (#x06E7 . #x06E8)
 862            (#x06EA . #x06ED)
 863            (#x070F . #x070F)
 864            (#x0711 . #x0711)
 865            (#x0730 . #x074A)
 866            (#x07A6 . #x07B0)
 867            (#x07EB . #x07F3)
 868            (#x0901 . #x0902)
 869            (#x093C . #x093C)
 870            (#x0941 . #x0948)
 871            (#x094D . #x094D)
 872            (#x0951 . #x0954)
 873            (#x0962 . #x0963)
 874            (#x0981 . #x0981)
 875            (#x09BC . #x09BC)
 876            (#x09C1 . #x09C4)
 877            (#x09CD . #x09CD)
 878            (#x09E2 . #x09E3)
 879            (#x0A01 . #x0A02)
 880            (#x0A3C . #x0A3C)
 881            (#x0A41 . #x0A4D)
 882            (#x0A70 . #x0A71)
 883            (#x0A81 . #x0A82)
 884            (#x0ABC . #x0ABC)
 885            (#x0AC1 . #x0AC8)
 886            (#x0ACD . #x0ACD)
 887            (#x0AE2 . #x0AE3)
 888            (#x0B01 . #x0B01)
 889            (#x0B3C . #x0B3C)
 890            (#x0B3F . #x0B3F)
 891            (#x0B41 . #x0B43)
 892            (#x0B4D . #x0B56)
 893            (#x0B82 . #x0B82)
 894            (#x0BC0 . #x0BC0)
 895            (#x0BCD . #x0BCD)
 896            (#x0C3E . #x0C40)
 897            (#x0C46 . #x0C56)
 898            (#x0CBC . #x0CBC)
 899            (#x0CBF . #x0CBF)
 900            (#x0CC6 . #x0CC6)
 901            (#x0CCC . #x0CCD)
 902            (#x0CE2 . #x0CE3)
 903            (#x0D41 . #x0D43)
 904            (#x0D4D . #x0D4D)
 905            (#x0DCA . #x0DCA)
 906            (#x0DD2 . #x0DD6)
 907            (#x0E31 . #x0E31)
 908            (#x0E34 . #x0E3A)
 909            (#x0E47 . #x0E4E)
 910            (#x0EB1 . #x0EB1)
 911            (#x0EB4 . #x0EBC)
 912            (#x0EC8 . #x0ECD)
 913            (#x0F18 . #x0F19)
 914            (#x0F35 . #x0F35)
 915            (#x0F37 . #x0F37)
 916            (#x0F39 . #x0F39)
 917            (#x0F71 . #x0F7E)
 918            (#x0F80 . #x0F84)
 919            (#x0F86 . #x0F87)
 920            (#x0F90 . #x0FBC)
 921            (#x0FC6 . #x0FC6)
 922            (#x102D . #x1030)
 923            (#x1032 . #x1037)
 924            (#x1039 . #x1039)
 925            (#x1058 . #x1059)
 926            (#x1160 . #x11FF)
 927            (#x135F . #x135F)
 928            (#x1712 . #x1714)
 929            (#x1732 . #x1734)
 930            (#x1752 . #x1753)
 931            (#x1772 . #x1773)
 932            (#x17B4 . #x17B5)
 933            (#x17B7 . #x17BD)
 934            (#x17C6 . #x17C6)
 935            (#x17C9 . #x17D3)
 936            (#x17DD . #x17DD)
 937            (#x180B . #x180D)
 938            (#x18A9 . #x18A9)
 939            (#x1920 . #x1922)
 940            (#x1927 . #x1928)
 941            (#x1932 . #x1932)
 942            (#x1939 . #x193B)
 943            (#x1A17 . #x1A18)
 944            (#x1B00 . #x1B03)
 945            (#x1B34 . #x1B34)
 946            (#x1B36 . #x1B3A)
 947            (#x1B3C . #x1B3C)
 948            (#x1B42 . #x1B42)
 949            (#x1B6B . #x1B73)
 950            (#x1DC0 . #x1DFF)
 951            (#x200B . #x200F)
 952            (#x202A . #x202E)
 953            (#x2060 . #x206F)
 954            (#x20D0 . #x20EF)
 955            (#x302A . #x302F)
 956            (#x3099 . #x309A)
 957            (#xA806 . #xA806)
 958            (#xA80B . #xA80B)
 959            (#xA825 . #xA826)
 960            (#xFB1E . #xFB1E)
 961            (#xFE00 . #xFE0F)
 962            (#xFE20 . #xFE23)
 963            (#xFEFF . #xFEFF)
 964            (#xFFF9 . #xFFFB)
 965            (#x10A01 . #x10A0F)
 966            (#x10A38 . #x10A3F)
 967            (#x1D167 . #x1D169)
 968            (#x1D173 . #x1D182)
 969            (#x1D185 . #x1D18B)
 970            (#x1D1AA . #x1D1AD)
 971            (#x1D242 . #x1D244)
 972            (#xE0001 . #xE01EF))))
 973   (dolist (elt l)                       ; ELT is one (FROM . TO) range
 974     (set-char-table-range char-width-table elt 0))) ; whole range: width 0
 975
 976 ;; 2: East Asian Wide and Full-width characters.
 977 (dolist (range '((#x1100 . #x115F)
 978                  (#x2329 . #x232A)
 979                  (#x2E80 . #x303E)
 980                  (#x3040 . #xA4CF)
 981                  (#xAC00 . #xD7A3)
 982                  (#xF900 . #xFAFF)
 983                  (#xFE30 . #xFE6F)
 984                  (#xFF01 . #xFF60)
 985                  (#xFFE0 . #xFFE6)
 986                  (#x20000 . #x2FFFF)
 987                  (#x30000 . #x3FFFF)))
 988   ;; RANGE is an inclusive (FROM . TO) pair; give all of it width 2.
 989   (set-char-table-range char-width-table range 2))
 990
 991 ;; Other double width
 992 ;;(map-charset-chars
 993 ;; (lambda (range ignore) (set-char-table-range char-width-table range 2))
 994 ;; 'ethiopic)
 995 ;; (map-charset-chars
 996 ;;  (lambda (range ignore) (set-char-table-range char-width-table range 2))
 997 ;; 'tibetan)
 998 ;; Every character of these two-column charsets gets width 2.
 999 (dolist (charset '(indian-2-column arabic-2-column))
1000   (map-charset-chars
1001    #'(lambda (range ignore)
1002        (set-char-table-range char-width-table range 2))
1003    charset))
1004
1005 (defvar cjk-char-width-table
1006   (let ((cjk-widths (make-char-table nil)))
1007     (dolist (cs '(big5 chinese-gb2312 chinese-cns11643-1
1008                        japanese-jisx0208 korean-ksc5601))
1009       (map-charset-chars                ; width 2 for all of charset CS
1010        #'(lambda (range ignore) (set-char-table-range cjk-widths range 2))
1011        cs))
1012     (optimize-char-table cjk-widths)
1013     (set-char-table-parent cjk-widths char-width-table) ; inherit the rest
1014     cjk-widths)
1015   "Character width table used in CJK language environment.")
1016
1017 (defun use-cjk-char-width-table ()
1018   "Make `cjk-char-width-table' the current `char-width-table'.
1019 Internal use only; meant for a CJK language environment."
1020   (setq char-width-table cjk-char-width-table))
1021
1022 (defun use-default-char-width-table ()
1023   "Make the parent of `cjk-char-width-table' the current `char-width-table'.
1024 Internal use only; meant for a non-CJK language environment."
1025   (setq char-width-table (char-table-parent cjk-char-width-table)))
1026
1027 (mapc #'optimize-char-table            ; compact each standard table
1028       (list (standard-case-table) (standard-category-table)
1029             (standard-syntax-table)))
1030
1031 \f
1032 ;; Setting char-script-table.
1033
1034 ;; The Unicode blocks actually extend past some of these ranges with
1035 ;; undefined codepoints.
1036 (let ((script-list nil))                ; scripts, in order of first mention
1037   (dolist
1038       (elt                              ; ELT is (FROM TO SCRIPT)
1039        '((#x0000 #x007F latin)
1040          (#x00A0 #x024F latin)
1041          (#x0250 #x02AF phonetic)
1042          (#x02B0 #x036F latin)
1043          (#x0370 #x03E1 greek)
1044          (#x03E2 #x03EF coptic)
1045          (#x03F0 #x03F3 greek)
1046          (#x0400 #x04FF cyrillic)
1047          (#x0530 #x058F armenian)
1048          (#x0590 #x05FF hebrew)
1049          (#x0600 #x06FF arabic)
1050          (#x0700 #x074F syriac)
1051          (#x07C0 #x07FA nko)
1052          (#x0780 #x07BF thaana)
1053          (#x0900 #x097F devanagari)
1054          (#x0980 #x09FF bengali)
1055          (#x0A00 #x0A7F gurmukhi)
1056          (#x0A80 #x0AFF gujarati)
1057          (#x0B00 #x0B7F oriya)
1058          (#x0B80 #x0BFF tamil)
1059          (#x0C00 #x0C7F telugu)
1060          (#x0C80 #x0CFF kannada)
1061          (#x0D00 #x0D7F malayalam)
1062          (#x0D80 #x0DFF sinhala)
1063          (#x0E00 #x0E5F thai)
1064          (#x0E80 #x0EDF lao)
1065          (#x0F00 #x0FFF tibetan)
1066          (#x1000 #x105F myanmar)
1067          (#x10A0 #x10FF georgian)
1068          (#x1100 #x11FF hangul)
1069          (#x1200 #x139F ethiopic)
1070          (#x13A0 #x13FF cherokee)
1071          (#x1400 #x167F canadian-aboriginal)
1072          (#x1680 #x169F ogham)
1073          (#x16A0 #x16FF runic)
1074          (#x1780 #x17FF khmer)
1075          (#x1800 #x18AF mongolian)
1076          (#x1D00 #x1DFF phonetic)
1077          (#x1E00 #x1EFF latin)
1078          (#x1F00 #x1FFF greek)
1079          (#x2000 #x27FF symbol)
1080          (#x2800 #x28FF braille)
1081          (#x2D80 #x2DDF ethiopic)
1082          (#x2E80 #x2FDF han)
1083          (#x2FF0 #x2FFF ideographic-description)
1084          (#x3000 #x303F cjk-misc)
1085          (#x3040 #x30FF kana)
1086          (#x3100 #x312F bopomofo)
1087          (#x3130 #x318F hangul)
1088          (#x3190 #x319F kanbun)
1089          (#x31A0 #x31BF bopomofo)
1090          (#x3400 #x9FAF han)
1091          (#xA000 #xA4CF yi)
1092          (#xAA00 #xAA5F cham)
1093          (#xAA80 #xAADF tai-viet)
1094          (#xAC00 #xD7AF hangul)
1095          (#xF900 #xFAFF han)
1096          (#xFB1D #xFB4F hebrew)
1097          (#xFB50 #xFDFF arabic)
1098          (#xFE70 #xFEFC arabic)
1099          (#xFF00 #xFF5F cjk-misc)
1100          (#xFF61 #xFF9F kana)
1101          (#xFFE0 #xFFE6 cjk-misc)
1102          (#x1D000 #x1D0FF byzantine-musical-symbol)
1103          (#x1D100 #x1D1FF musical-symbol)
1104          (#x1D400 #x1D7FF mathematical)
1105          (#x20000 #x2AFFF han)
1106          (#x2F800 #x2FFFF han)))
1107     (set-char-table-range char-script-table ; tag FROM..TO with SCRIPT
1108                           (cons (car elt) (nth 1 elt)) (nth 2 elt))
1109     (or (memq (nth 2 elt) script-list)  ; record each script only once
1110         (setq script-list (cons (nth 2 elt) script-list))))
1111   (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))
1112
1113 (map-charset-chars                     ; tag the whole `tibetan' charset
1114  (function (lambda (chars ignore)
1115              (set-char-table-range char-script-table chars 'tibetan)))
1116  'tibetan)
1117
1118 \f
1119 ;;; Setting unicode-category-table.
1120
1121 ;; This macro is to build unicode-category-table at compile time so
1122 ;; that C code can access the table efficiently.
1123 (defmacro build-unicode-category-table ()
1124   "Build the table of Unicode general categories at macro-expansion time."
1125   (let ((table (make-char-table 'unicode-category-table nil)))
1126     (dotimes (i #x110000)
1127       (if (or (< i #xD800) (and (>= i #xF900) (< i #x30000))
1128               (and (>= i #xE0000) (< i #xE0200))) ; lower bounds inclusive
1129           (aset table i (get-char-code-property i 'general-category))))
1130     (set-char-table-range table '(#xE000 . #xF8FF) 'Co)
1131     (set-char-table-range table '(#xF0000 . #xFFFFD) 'Co)
1132     (set-char-table-range table '(#x100000 . #x10FFFD) 'Co)
1133     (optimize-char-table table 'eq)
1134     table))
1135
1136 (setq unicode-category-table (build-unicode-category-table)) ; macro result
1137
1138 \f
1139 ;;; Setting word boundary.
1140
1141 (defun next-word-boundary-han (pos limit)
1142   "Return the boundary of the Han word at POS, scanning toward LIMIT."
1143   (if (<= pos limit)
1144       (save-excursion                   ; forward: category C, then H
1145         (goto-char pos)
1146         ;; Move only on a successful match; otherwise match data is stale.
1147         (if (looking-at "\\cC+") (goto-char (match-end 0)))
1148         (if (looking-at "\\cH+") (goto-char (match-end 0)))
1149         (point))
1150     (while (and (> pos limit)           ; backward: over preceding han chars
1151                 (eq (aref char-script-table (char-after (1- pos))) 'han))
1152       (setq pos (1- pos)))
1153     pos))
1154
1155 (defun next-word-boundary-kana (pos limit) ; word boundary for kana text
1156   (if (<= pos limit)                        ; forward scan from POS
1157       (save-excursion
1158         (goto-char pos)
1159         (if (looking-at "\\cK+")            ; skip a run of category K
1160             (goto-char (match-end 0)))
1161         (if (looking-at "\\cH+")            ; then a run of category H
1162             (goto-char (match-end 0)))
1163         (if (looking-at "\\ck+")            ; then a run of category k
1164             (goto-char (match-end 0)))
1165         (point))
1166     (let ((category-set (char-category-set (char-after pos))) ; backward scan
1167           category)
1168       (if (or (aref category-set ?K) (aref category-set ?k))
1169           (while (and (> pos limit)         ; back over preceding K/k chars
1170                       (setq category-set
1171                             (char-category-set (char-after (1- pos))))
1172                       (or (aref category-set ?K) (aref category-set ?k)))
1173             (setq pos (1- pos)))
1174         (while (and (> pos limit)           ; back over preceding H chars
1175                     (aref (setq category-set
1176                                 (char-category-set (char-after (1- pos)))) ?H))
1177           (setq pos (1- pos)))
1178         (setq category (cond ((aref category-set ?C) ?C) ; last set examined
1179                              ((aref category-set ?K) ?K)
1180                              ((aref category-set ?A) ?A)))
1181         (when category
1182           (setq pos (1- pos))               ; step back one character, then
1183           (while (and (> pos limit)         ; over the rest of that run
1184                       (aref (char-category-set (char-after (1- pos)))
1185                             category))
1186             (setq pos (1- pos)))))
1187       pos)))
1188
1189 (map-char-table
1190  #'(lambda (char script)
1191      ;; Pick the boundary finder for SCRIPT; other scripts are left alone.
1192      (let ((finder (cond ((eq script 'han) #'next-word-boundary-han)
1193                          ((eq script 'kana) #'next-word-boundary-kana))))
1194        (if finder
1195            (set-char-table-range find-word-boundary-function-table
1196                                  char finder))))
1197  char-script-table)
1198
1199 (setq word-combining-categories
1200       '((?l . ?l)                       ; Latin - Latin
1201         (?C . ?C)                       ; Chinese - Chinese
1202         (?C . ?H)                       ; Chinese - Hiragana
1203         (?C . ?K)))                     ; Chinese - Katakana
1204
1205 (setq word-separating-categories        ; (2-byte character sets)
1206       '((?A . ?K)                       ; Alphanumeric - Katakana
1207         (?A . ?C)                       ; Alphanumeric - Chinese
1208         (?H . ?A)                       ; Hiragana - Alphanumeric
1209         (?H . ?K)                       ; Hiragana - Katakana
1210         (?H . ?C)                       ; Hiragana - Chinese
1211         (?K . ?A)                       ; Katakana - Alphanumeric
1212         (?K . ?C)                       ; Katakana - Chinese
1213         (?C . ?A)                       ; Chinese - Alphanumeric
1214         (?C . ?K)                       ; Chinese - Katakana (also combining?)
1215         ))
1216
1217 ;; Local Variables:
1218 ;; coding: utf-8
1219 ;; End:
1220
1221 ;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
1222 ;;; characters.el ends here