1 ;;; thai-word.el -- find Thai word boundaries
3 ;; Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005
4 ;; National Institute of Advanced Industrial Science and Technology (AIST)
5 ;; Registration Number H14PRO021
7 ;; Author: Kenichi HANDA <handa@etl.go.jp>
9 ;; Keywords: thai, word break, emacs
11 ;; This program is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; This program is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to
23 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
24 ;; Boston, MA 02110-1301, USA.
26 ;; The used Thai word list has been taken from IBM's ICU4J project
27 ;; (file `thai6.ucs', version 1.4, converted to TIS encoding, with
28 ;; removal of three incorrect entries) to which the following license
31 ;; COPYRIGHT AND PERMISSION NOTICE
34 ;; Copyright (c) 1995-2001 International Business Machines
35 ;; Corporation and others
37 ;; All rights reserved.
40 ;; Permission is hereby granted, free of charge, to any person
41 ;; obtaining a copy of this software and associated documentation
42 ;; files (the "Software"), to deal in the Software without
43 ;; restriction, including without limitation the rights to use,
44 ;; copy, modify, merge, publish, distribute, and/or sell copies of
45 ;; the Software, and to permit persons to whom the Software is
46 ;; furnished to do so, provided that the above copyright notice(s)
47 ;; and this permission notice appear in all copies of the Software
48 ;; and that both the above copyright notice(s) and this permission
49 ;; notice appear in supporting documentation.
51 ;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
52 ;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
53 ;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
54 ;; NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE
55 ;; COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE
56 ;; FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
57 ;; OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
58 ;; PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
59 ;; TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
60 ;; PERFORMANCE OF THIS SOFTWARE.
62 ;; Except as contained in this notice, the name of a copyright
63 ;; holder shall not be used in advertising or otherwise to promote
64 ;; the sale, use or other dealings in this Software without prior
65 ;; written authorization of the copyright holder.
68 ;; This file implements an algorithm to find Thai word breaks using a
69 ;; dictionary. It is based on the C program `cttex' written by
70 ;; Vuthichai Ampornaramveth <vuthi@nii.ac.jp>.
73 ;; Table of Thai words. It is a nested alist (see `nested-alist-p'),
74 ;; which means that you can easily index the list character by
77 (defvar thai-word-table
78 (let ((table (list 'thai-words
)))
80 ;;; The following is indented as this to minimize this file size.
6225 "ÊÁºÙóÒÒÊÔ·¸ÔÃÒªÂì"
10735 (set-nested-alist elt
1 table
))
10737 "Nested alist of Thai words.")
10740 (defun thai-update-word-table (file &optional append
)
10741 "Update Thai word table by replacing the current word list with
10742 FILE. If called with a prefix argument, FILE is appended instead to
10743 the current word list."
10744 (interactive "FThai word table file: \nP")
10745 (let ((buf (generate-new-buffer "*thai-work*"))
10746 (coding-system-for-read 'thai-tis620
)
10747 (table (if append thai-word-table
(list 'thai-words
))))
10751 (insert-file-contents file
)
10752 (goto-char (point-min))
10753 (while (re-search-forward "\\ct+" nil t
)
10754 (set-nested-alist (match-string 0) 1 table
)))
10756 (setq thai-word-table table
)))
10759 ;; Two special Thai characters regarded as suffix of words.
10761 (defconst thai-MaiYaMok
(make-char 'thai-tis620 ?
\xE6))
10762 (defconst thai-PaiYanNoi
(make-char 'thai-tis620 ?
\xCF))
10765 ;; Find Thai words starting at POS and return a list of positions of
10766 ;; the Thai word ends. It doesn't move point. LIMIT limits the
10767 ;; maximum position. IGNORE is a list of positions to ignore. It is
10768 ;; assumed that all following characters to LIMIT are Thai. If the
10769 ;; following char is not Thai (i.e., POS is equal to LIMIT), return t.
10771 ;; Note that the longest word position comes first.
10773 (defun thai-find-word-ends (pos limit
&optional ignore
)
10776 (let* ((char (char-after pos
))
10777 (this (cdr (assq char
(cdr thai-word-table
))))
10780 ;; Look up the following character sequence in `thai-word-table'
10781 ;; character by character.
10784 char
(or (char-after pos
) 0)
10785 category-set
(char-category-set char
))
10786 ;; If the current sequence is recorded in `thai-word-table'
10787 ;; (i.e. (car THIS) is 1) and the following Thai character is
10788 ;; not an upper-vowel, lower-vowel, or tone-mark, we have
10789 ;; found a possible word ending position.
10790 (if (and (eq (car this
) 1)
10791 (not (or (aref category-set ?
2)
10792 (aref category-set ?
3)
10793 (aref category-set ?
4))))
10795 ;; Skip possible Thai suffices.
10796 (while (or (eq char thai-MaiYaMok
) (eq char thai-PaiYanNoi
))
10798 char
(char-after pos
)))
10799 ;; Skip character positions in IGNORE list.
10800 (or (memq pos ignore
)
10801 (setq positions
(cons pos positions
)))))
10802 ;; Set up next loop.
10803 (setq this
(and (< pos limit
) (cdr (assq char this
)))))
10807 ;; Move point forward to the end of Thai word which follows point and
10808 ;; update VEC. VEC is a vector of three elements used to cache word
10809 ;; end positions. The Nth element, if non-nil, is a list of end
10810 ;; points of the Nth word, or t indicating that there is no Thai
10811 ;; character. LIMIT limits the point movement.
10813 (defun thai-forward-word-update-info (vec limit
)
10814 (let ((pos (point))
10820 ;; If four succeeding Thai words are found, throw t, otherwise
10823 ;; Start with first vector element.
10824 (setq v0
(aref vec
0))
10826 ;; Update VEC if V0 is empty.
10827 (setq v0
(thai-find-word-ends pos limit
))
10830 ;; In case we haven't found any wordbreaks resp. point has
10831 ;; reached LIMIT, exit the catch body.
10832 (if (symbolp v0
) ; i.e. nil or t?
10834 ;; OK, V0 holds possible word ends for the current position.
10835 ;; We save V0 for later reference.
10837 (setq v1
(aref vec
1))
10838 ;; Now we try all end word positions to find the next word.
10840 (setq pos
(car v0
))
10842 ;; Update VEC if V1 is empty, ignoring positions already
10843 ;; found -- for Thai, we need the longest match, so if
10846 ;; start(long-word) = start(short-word1)
10847 ;; end(short-word1) = start(short-word2)
10848 ;; end(short-word2) = end(long-word)
10850 ;; only long-word is used.
10851 (setq v1
(thai-find-word-ends pos limit tried
))
10854 ;; If point has reached LIMIT, exit the catch body.
10857 ;; Save SECOND-BEST, if this hasn't been done already.
10858 ;; The `second best' solution is the end position of the
10859 ;; longest first word followed by the longest second word.
10862 (setq second-best
(cons v0 v1
)))
10863 ;; Update the already tried end word positions.
10864 (setq tried
(append tried v1
))
10865 ;; Now repeat the whole process to find a third word.
10867 (setq v2
(aref vec
2))
10869 (setq pos
(car v1
))
10871 (setq v2
(thai-find-word-ends pos limit tried
))
10875 (setq tried
(append tried v2
))
10876 ;; And the same for a fourth word.
10878 (setq pos
(car v2
))
10879 (setq v3
(thai-find-word-ends pos limit tried
))
10882 (setq v2
(cdr v2
)))
10883 (setq v1
(cdr v1
))))
10884 (setq v0
(cdr v0
)))
10888 ;; We found four succeeding Thai words (or LIMIT has been
10889 ;; reached). Move to the end of the first word.
10890 (goto-char (car v0
))
10891 ;; Update VEC for the next function call. If no larger word
10892 ;; positions have been found, set the corresponding vector
10894 (if (and (consp v1
) (< (car v1
) (car (aref vec
1))))
10897 (if (and (consp v2
) (< (car v2
) (car (aref vec
2))))
10900 (aset vec
2 v3
)))) ; exit function successfully
10902 ;; We didn't find four consecutive words. If we have found a
10903 ;; `second best' solution and the length of those two words is
10904 ;; longer than the longest word we can see at the current point,
10905 ;; adopt the second best solution. This decision is based on
10906 ;; heuristic tests.
10907 (if (and second-best
10908 (< (car (aref vec
0)) (car (cdr second-best
))))
10910 (goto-char (car (car second-best
)))
10911 (aset vec
0 (cdr second-best
)))
10912 ;; We finally failed to find a word break. For Thai, the best
10913 ;; solution is to extend the first longest word so that the
10914 ;; end point starts a second word.
10915 (setq pos
(or (car (aref vec
0)) pos
))
10916 (while (and (< pos limit
)
10917 (not (setq positions
(thai-find-word-ends pos limit
))))
10918 (setq pos
(1+ pos
)))
10920 (aset vec
0 positions
))
10925 ;; Return a list of Thai word boundary positions after the current
10926 ;; point. LIMIT, if non-nil, limits the region to check.
10928 (defun thai-find-word-boundaries (&optional limit
)
10930 (setq limit
(point-max)))
10932 (let ((vec (make-vector 3 nil
))
10935 ;; Loop over all (consecutive) Thai regions by using the
10936 ;; character property `t' until LIMIT is reached.
10937 (while (and (< (point) limit
)
10938 (re-search-forward "\\ct+" nil t
))
10939 (setq this-limit
(point))
10940 (goto-char (match-beginning 0))
10941 (fillarray vec nil
)
10942 ;; Check the first word, initializing VEC.
10943 (thai-forward-word-update-info vec this-limit
)
10944 ;; Then loop over the remaining words in the current Thai
10945 ;; region, collecting the boundaries.
10946 (while (< (point) this-limit
)
10947 (setq boundaries
(cons (point) boundaries
))
10948 (thai-forward-word-update-info vec this-limit
)))
10952 (defun thai-break-words (separator &optional limit
)
10953 "Break Thai words by inserting a separator string at word boundaries."
10954 (interactive "sSeparator: ")
10956 (let ((boundaries (thai-find-word-boundaries limit
)))
10958 (goto-char (car boundaries
))
10960 (setq boundaries
(cdr boundaries
))))))
10963 (defun thai-forward-word (count)
10964 "Move point forward COUNT words considering Thai word boundaries.
10965 If COUNT is negative, move point backward (- COUNT) words."
10969 (skip-syntax-forward "^w")
10970 (if (looking-at "\\ct+")
10971 ;; We have reached a Thai region, so we must do something
10972 ;; special instead of using forward-word.
10973 (let ((start (point))
10974 (limit (match-end 0))
10977 ;; If thai-forward-word has been called within a Thai
10978 ;; region, we must go back until the Thai region starts
10979 ;; to do the contextual analysis for finding word
10981 (while (aref (char-category-set (preceding-char)) ?t
)
10983 ;; OK, we ask for the list of word boundaries in
10985 (setq boundaries
(nreverse (thai-find-word-boundaries limit
)))
10986 ;; Now we search for the next boundary after START.
10987 (while (and boundaries
(<= (car boundaries
) start
))
10988 (setq boundaries
(cdr boundaries
)))
10989 ;; Adjust loop for next while loop.
10990 (setq count
(1- count
))
10991 ;; Now we skip Thai words until the BOUNDARIES list is
10992 ;; empty or count-1 words have been passed.
10993 (if (not boundaries
)
10995 (while (and (> count
0)
10997 (setq boundaries
(cdr boundaries
)
10999 ;; If BOUNDARIES is empty, the word counter is not
11000 ;; zero yet (remember that we have decreased COUNT by
11001 ;; one), so we go to LIMIT. Otherwise go to next
11004 (goto-char (car boundaries
))
11005 (goto-char limit
))))
11008 (setq count
(1- count
))))
11009 ;; The symmetrical action for negative values.
11012 (skip-syntax-backward "^w")
11013 (if (aref (char-category-set (preceding-char)) ?t
)
11014 (let ((start (point))
11015 (limit (if (looking-at "\\ct+") (match-end 0)
11019 (while (aref (char-category-set (preceding-char)) ?t
)
11021 (setq boundaries
(thai-find-word-boundaries limit
))
11022 (while (and boundaries
(>= (car boundaries
) start
))
11023 (setq boundaries
(cdr boundaries
)))
11024 (setq count
(1+ count
))
11027 (while (and (< count
0) boundaries
)
11028 (setq boundaries
(cdr boundaries
)
11031 (goto-char (car boundaries
))))))
11033 (setq count
(1+ count
)))))))
11036 (defun thai-backward-word (count)
11037 "Move point backward COUNT words considering Thai word boundaries.
11038 If COUNT is negative, move point forward (- COUNT) words."
11040 (thai-forward-word (- count
)))
11043 (defun thai-kill-word (arg)
11044 "Like kill-word but pay attention to Thai word boundaries.
11045 With argument, do this that many times."
11047 (kill-region (point) (progn (thai-forward-word arg
) (point))))
11050 (defun thai-backward-kill-word (arg)
11051 "Like backward-kill-word but pay attention to Thai word boundaries."
11053 (thai-kill-word (- arg
)))
11056 (defun thai-transpose-words (arg)
11057 "Like transpose-words but pay attention to Thai word boundaries."
11059 (transpose-subr 'thai-forward-word arg
))
11061 (defun thai-fill-find-break-point (linebeg)
11062 "Go to a line breaking position near point considering Thai word boundaries."
11063 (let ((pos (point)))
11064 (thai-forward-word -
1)
11065 (when (<= (point) linebeg
)
11067 (thai-forward-word 1))
11068 (kinsoku linebeg
)))
11070 (provide 'thai-word
)
11073 ;; Local Variables:
11077 ;; end of thai-word.el
11079 ;; arch-tag: 29927f02-e177-4224-a270-7e67210b038a