1 ;;; coding-tests.el --- tests for text encoding and decoding
3 ;; Copyright (C) 2013-2017 Free Software Foundation, Inc.
5 ;; Author: Eli Zaretskii <eliz@gnu.org>
6 ;; Author: Kenichi Handa <handa@gnu.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
(defvar coding-tests-workdir
  (let ((parent temporary-file-directory))
    (expand-file-name "coding-tests" parent))
  "Directory that holds the data files generated by these tests.
It is the entry named \"coding-tests\" under `temporary-file-directory'.")
(defun coding-tests-remove-files ()
  "Delete `coding-tests-workdir' recursively, with every file in it."
  (let ((recursive t))
    (delete-directory coding-tests-workdir recursive)))
(ert-deftest ert-test-coding-bogus-coding-systems ()
  "Reading or writing with an invalid coding system must signal an error.
The work directory is removed afterwards, whether or not the checks pass."
  (unwind-protect
      (let (test-file)
        (or (file-directory-p coding-tests-workdir)
            (mkdir coding-tests-workdir t))
        (setq test-file (expand-file-name "nonexistent" coding-tests-workdir))
        ;; Make sure the file to be read really does not exist.
        (when (file-exists-p test-file)
          (delete-file test-file))
        (should-error
         (let ((coding-system-for-read 'bogus))
           (insert-file-contents test-file)))
        ;; The interned symbol name contains literal double-quote
        ;; characters; writing with it is expected to signal an error.
        (setq test-file (expand-file-name "writing" coding-tests-workdir))
        (should-error
         (let ((coding-system-for-write (intern "\"us-ascii\"")))
           (write-region "some text" nil test-file))))
    (coding-tests-remove-files)))
(ert-deftest ert-test-unibyte-buffer-dos-eol-decode ()
  "Decoding CR LF text in a unibyte buffer must not leave a CR behind.
A temporary unibyte buffer receives an euc-jp encoded character followed
by CR LF, the region is decoded as `euc-jp-dos', and the resulting
buffer text is checked for a remaining carriage return."
  (with-temp-buffer
    (set-buffer-multibyte nil)
    (insert (encode-coding-string "あ" 'euc-jp) "\xd" "\n")
    (decode-coding-region (point-min) (point-max) 'euc-jp-dos)
    (should-not (string-match-p "\^M" (buffer-string)))))
(defun coding-tests-file-contents (content-type)
  "Return the text used as the contents of a test file of CONTENT-TYPE.
CONTENT-TYPE is one of the symbols `ascii', `latin' or `binary'.
Any other value signals an error."
  (let* ((ascii "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
         (latin (concat ascii "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ\n"))
         ;; The bytes of LATIN as returned by `string-as-unibyte',
         ;; followed by the raw bytes #xC0 #xC1 and a newline, turned
         ;; back into a multibyte string.
         (binary (string-to-multibyte
                  (concat (string-as-unibyte latin)
                          (unibyte-string #xC0 #xC1 ?\n)))))
    (cond ((eq content-type 'ascii) ascii)
          ((eq content-type 'latin) latin)
          ((eq content-type 'binary) binary)
          ;; Explicit default clause: reject unknown content types.
          (t
           (error "Invalid file content type: %s" content-type)))))
(defun coding-tests-gen-file (file contents coding-system)
  "Write CONTENTS to FILE, encoded by CODING-SYSTEM, and return its name.
FILE is expanded relative to `coding-tests-workdir', which is created
first if it does not exist yet.  The value is the expanded file name,
so callers can pass it straight to the reading helpers."
  (or (file-directory-p coding-tests-workdir)
      (mkdir coding-tests-workdir t))
  (setq file (expand-file-name file coding-tests-workdir))
  (with-temp-file file
    ;; Select the encoding used when the temporary buffer is saved.
    (set-buffer-file-coding-system coding-system)
    (insert contents))
  file)
;;; The following three functions are filters for the contents of a
;;; test file.
(defun coding-tests-lf-to-crlf (str)
  "Return a copy of STR in which every LF is replaced by a CR LF pair."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      ;; Replace the LF just matched; point ends up after the new pair.
      (replace-match "\r\n" t t))
    (buffer-string)))
(defun coding-tests-lf-to-cr (str)
  "Return a copy of STR in which every LF is replaced by a CR."
  (with-temp-buffer
    (insert str)
    (subst-char-in-region (point-min) (point-max) ?\n ?\r)
    (buffer-string)))
(defun coding-tests-lf-to-lflf (str)
  "Return a copy of STR in which every LF is doubled to LF LF."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      ;; Point is just after the LF found; add a second one there.
      (insert "\n"))
    (buffer-string)))
(defun coding-tests-add-bom (str)
  "Return STR with the byte order mark character (U+FEFF) prepended."
  (let ((bom "\xfeff"))
    (concat bom str)))
(defun coding-tests-filename (content-type coding-system &optional ext)
  "Return the name of the test file for CONTENT-TYPE and CODING-SYSTEM.
The base name is \"CONTENT-TYPE-CODING-SYSTEM\", with \".EXT\" appended
when EXT is non-nil, expanded relative to `coding-tests-workdir'."
  (expand-file-name
   (if ext
       (format "%s-%s.%s" content-type coding-system ext)
     (format "%s-%s" content-type coding-system))
   coding-tests-workdir))
129 ;;; Check ASCII optimizing decoder
(defun coding-tests-ao-gen-file (content-type coding-system)
  "Generate the test file for CONTENT-TYPE encoded by CODING-SYSTEM.
The file name comes from `coding-tests-filename' and the text from
`coding-tests-file-contents'.  The value is whatever
`coding-tests-gen-file' returns."
  (let ((file (coding-tests-filename content-type coding-system)))
    (coding-tests-gen-file file
                           (coding-tests-file-contents content-type)
                           coding-system)))
(defun coding-tests (content-type write-coding read-coding detected-coding
                     &optional translator)
  "Check the decoding of the test file for CONTENT-TYPE and WRITE-CODING.
The file is read with `coding-system-for-read' bound to READ-CODING.
The check passes when the resulting `buffer-file-coding-system' is
equal to DETECTED-CODING and the buffer text matches the expected
contents.

Optional TRANSLATOR is a function applied to the original contents to
produce the expected decoding result.  For instance, when a file of
dos eol-type is read as unix eol-type, `coding-tests-lf-to-crlf' must
be specified.

Return nil on success.  On failure return a list of the detected
coding system, the buffer text as a list of characters, and the
expected text as a list of characters.

As a side effect, call `prefer-coding-system' with `utf-8-auto'."
  (prefer-coding-system 'utf-8-auto)
  (let ((filename (coding-tests-filename content-type write-coding)))
    ;; Read into a scratch buffer so the caller's buffer is untouched.
    (with-temp-buffer
      (let ((coding-system-for-read read-coding)
            (contents (coding-tests-file-contents content-type))
            (disable-ascii-optimization nil))
        (when translator
          (setq contents (funcall translator contents)))
        (insert-file-contents filename)
        (if (and (coding-system-equal buffer-file-coding-system
                                      detected-coding)
                 (string= (buffer-string) contents))
            nil
          (list buffer-file-coding-system
                (string-to-list (buffer-string))
                (string-to-list contents)))))))
(ert-deftest ert-test-coding-ascii ()
  "Check decoding of ASCII files written with each EOL convention.
Every `coding-tests' call must return nil.  The generated files are
removed afterwards, whether or not the checks pass."
  (unwind-protect
      (progn
        (dolist (eol-type '(unix dos mac))
          (coding-tests-ao-gen-file 'ascii eol-type))
        ;; Arguments: content type, write coding, read coding, expected
        ;; detected coding, and an optional translator for the contents.
        (should-not (coding-tests 'ascii 'unix 'undecided 'unix))
        (should-not (coding-tests 'ascii 'dos 'undecided 'dos))
        (should-not (coding-tests 'ascii 'dos 'dos 'dos))
        (should-not (coding-tests 'ascii 'mac 'undecided 'mac))
        (should-not (coding-tests 'ascii 'mac 'mac 'mac))
        (should-not (coding-tests 'ascii 'dos 'utf-8 'utf-8-dos))
        (should-not (coding-tests 'ascii 'dos 'unix 'unix
                                  'coding-tests-lf-to-crlf))
        (should-not (coding-tests 'ascii 'mac 'dos 'dos
                                  'coding-tests-lf-to-cr))
        (should-not (coding-tests 'ascii 'dos 'mac 'mac
                                  'coding-tests-lf-to-lflf)))
    (coding-tests-remove-files)))
(ert-deftest ert-test-coding-latin ()
  "Check decoding of Latin files written as UTF-8, with and without BOM.
Files are generated for `utf-8' and `utf-8-with-signature' in each of
the unix, dos and mac EOL variants.  Every `coding-tests' call must
return nil.  The generated files are removed afterwards, whether or
not the checks pass."
  (unwind-protect
      (progn
        (dolist (coding '("utf-8" "utf-8-with-signature"))
          (dolist (eol-type '("unix" "dos" "mac"))
            (coding-tests-ao-gen-file 'latin
                                      (intern (concat coding "-" eol-type)))))
        ;; Arguments: content type, write coding, read coding, expected
        ;; detected coding, and an optional translator for the contents.
        (should-not (coding-tests 'latin 'utf-8-unix 'undecided 'utf-8-unix))
        (should-not (coding-tests 'latin 'utf-8-unix 'utf-8-unix 'utf-8-unix))
        (should-not (coding-tests 'latin 'utf-8-dos 'undecided 'utf-8-dos))
        (should-not (coding-tests 'latin 'utf-8-dos 'utf-8-dos 'utf-8-dos))
        (should-not (coding-tests 'latin 'utf-8-mac 'undecided 'utf-8-mac))
        (should-not (coding-tests 'latin 'utf-8-mac 'utf-8-mac 'utf-8-mac))
        (should-not (coding-tests 'latin 'utf-8-dos 'unix 'utf-8-unix
                                  'coding-tests-lf-to-crlf))
        (should-not (coding-tests 'latin 'utf-8-mac 'dos 'utf-8-dos
                                  'coding-tests-lf-to-cr))
        (should-not (coding-tests 'latin 'utf-8-dos 'mac 'utf-8-mac
                                  'coding-tests-lf-to-lflf))
        (should-not (coding-tests 'latin 'utf-8-with-signature-unix 'undecided
                                  'utf-8-with-signature-unix))
        (should-not (coding-tests 'latin 'utf-8-with-signature-unix 'utf-8-auto
                                  'utf-8-with-signature-unix))
        (should-not (coding-tests 'latin 'utf-8-with-signature-dos 'undecided
                                  'utf-8-with-signature-dos))
        ;; Reading a BOM-prefixed file as plain utf-8 keeps the BOM in
        ;; the text, so the expected contents get it prepended too.
        (should-not (coding-tests 'latin 'utf-8-with-signature-unix 'utf-8
                                  'utf-8-unix 'coding-tests-add-bom)))
    (coding-tests-remove-files)))
(ert-deftest ert-test-coding-binary ()
  "Check decoding of binary files written as raw-text in each EOL variant.
Every `coding-tests' call must return nil.  The generated files are
removed afterwards, whether or not the checks pass."
  (unwind-protect
      (progn
        (dolist (eol-type '("unix" "dos" "mac"))
          (coding-tests-ao-gen-file 'binary
                                    (intern (concat "raw-text" "-" eol-type))))
        ;; Arguments: content type, write coding, read coding, expected
        ;; detected coding, and an optional translator for the contents.
        (should-not (coding-tests 'binary 'raw-text-unix 'undecided
                                  'raw-text-unix))
        (should-not (coding-tests 'binary 'raw-text-dos 'undecided
                                  'raw-text-dos))
        (should-not (coding-tests 'binary 'raw-text-mac 'undecided
                                  'raw-text-mac))
        (should-not (coding-tests 'binary 'raw-text-dos 'unix
                                  'raw-text-unix 'coding-tests-lf-to-crlf))
        (should-not (coding-tests 'binary 'raw-text-mac 'dos
                                  'raw-text-dos 'coding-tests-lf-to-cr))
        (should-not (coding-tests 'binary 'raw-text-dos 'mac
                                  'raw-text-mac 'coding-tests-lf-to-lflf)))
    (coding-tests-remove-files)))
237 ;;; Check the coding system `prefer-utf-8'.
(defun coding-tests-prefer-utf-8-read (file detect prefer)
  "Read FILE and check that its encoding was detected as DETECT.
If PREFER is non-nil, that coding system is given priority while FILE
is read.  Return nil when `buffer-file-coding-system' is `eq' to
DETECT, otherwise a string describing the coding system detected."
  ;; Read into a scratch buffer so the caller's buffer is untouched.
  (with-temp-buffer
    (with-coding-priority (if prefer (list prefer))
      (insert-file-contents file))
    (if (eq buffer-file-coding-system detect)
        nil
      (format "Invalid detection: %s" buffer-file-coding-system))))
(defun coding-tests-prefer-utf-8-write (file coding-tag coding
                                             &optional str)
  "Read FILE, modify it, write it, and check the coding system used.
If CODING-TAG is non-nil, insert a coding tag naming it at the start of
the text before writing; otherwise insert a bare comment line.  If STR
is non-nil, insert it as well before writing.  The text is written to
the file named by (coding-tests-filename \\='test \\='test \"el\").

Return nil when `buffer-file-coding-system' after writing is equal to
CODING, otherwise a string reporting `last-coding-system-used'."
  (with-temp-buffer
    (insert-file-contents file)
    (goto-char (point-min))
    (if coding-tag
        (insert (format ";; -*- coding: %s; -*-\n" coding-tag))
      ;; NOTE(review): the exact text inserted when no tag is given is
      ;; reconstructed -- confirm against the full file.
      (insert ";;\n"))
    (when str
      (insert str))
    (write-file (coding-tests-filename 'test 'test "el"))
    (if (coding-system-equal buffer-file-coding-system coding)
        nil
      (format "Incorrect encoding: %s" last-coding-system-used))))
(ert-deftest ert-test-coding-prefer-utf-8 ()
  "Check detection and writing behavior of the `prefer-utf-8' coding system.
An ASCII file and a Latin UTF-8 file are generated, read back with
various preferred coding systems, and rewritten with and without a
coding tag.  Every helper call must return nil.  The generated files
are removed afterwards, whether or not the checks pass."
  (unwind-protect
      (let ((ascii (coding-tests-gen-file "ascii.el"
                                          (coding-tests-file-contents 'ascii)
                                          'unix))
            (latin (coding-tests-gen-file "utf-8.el"
                                          (coding-tests-file-contents 'latin)
                                          'utf-8-unix)))
        ;; Reading: arguments are file, expected detection, preference.
        (should-not (coding-tests-prefer-utf-8-read
                     ascii 'prefer-utf-8-unix nil))
        (should-not (coding-tests-prefer-utf-8-read
                     latin 'utf-8-unix nil))
        (should-not (coding-tests-prefer-utf-8-read
                     latin 'utf-8-unix 'iso-8859-1))
        (should-not (coding-tests-prefer-utf-8-read
                     latin 'utf-8-unix 'sjis))
        ;; Writing: arguments are file, coding tag, expected coding,
        ;; and optional extra text.
        (should-not (coding-tests-prefer-utf-8-write
                     ascii nil 'prefer-utf-8-unix))
        (should-not (coding-tests-prefer-utf-8-write
                     ascii 'iso-8859-1 'iso-8859-1-unix))
        (should-not (coding-tests-prefer-utf-8-write
                     ascii nil 'utf-8-unix "À")))
    (coding-tests-remove-files)))
295 ;;; The following is for benchmark testing of the new optimized
296 ;;; decoder, not for regression testing.
;; NOTE(review): only the opening line of this definition is visible in
;; this excerpt; its body is not shown.  The symbol heads the first
;; group of `test-file-list' and is funcalled with no arguments by
;; `generate-benchmark-test-file' -- presumably it inserts the sample
;; text into the current buffer; confirm against the full file.
298 (defun generate-ascii-file ()
;; NOTE(review): only the opening line of this definition is visible in
;; this excerpt; its body is not shown.  The symbol heads the second
;; group of `test-file-list' and is funcalled with no arguments by
;; `generate-benchmark-test-file' -- presumably it inserts the sample
;; text into the current buffer; confirm against the full file.
303 (defun generate-rarely-nonascii-file ()
;; NOTE(review): only the opening line of this definition is visible in
;; this excerpt; its body is not shown.  The symbol heads the third
;; group of `test-file-list' and is funcalled with no arguments by
;; `generate-benchmark-test-file' -- presumably it inserts the sample
;; text into the current buffer; confirm against the full file.
311 (defun generate-mostly-nonascii-file ()
(defvar test-file-list
  '((generate-ascii-file
     ("~/ascii-tag-utf-8-unix.unix"
      ";; -*- coding: utf-8-unix; -*-" unix)
     ("~/ascii-tag-utf-8.unix"
      ";; -*- coding: utf-8; -*-" unix)
     ("~/ascii-tag-none.unix"
      "" unix)
     ("~/ascii-tag-utf-8-dos.dos"
      ";; -*- coding: utf-8-dos; -*-" dos)
     ("~/ascii-tag-utf-8.dos"
      ";; -*- coding: utf-8; -*-" dos)
     ("~/ascii-tag-none.dos"
      "" dos))
    (generate-rarely-nonascii-file
     ("~/utf-8-r-tag-utf-8-unix.unix"
      ";; -*- coding: utf-8-unix; -*-" utf-8-unix)
     ("~/utf-8-r-tag-utf-8.unix"
      ";; -*- coding: utf-8; -*-" utf-8-unix)
     ("~/utf-8-r-tag-none.unix"
      "" utf-8-unix)
     ("~/utf-8-r-tag-utf-8-dos.dos"
      ";; -*- coding: utf-8-dos; -*-" utf-8-dos)
     ("~/utf-8-r-tag-utf-8.dos"
      ";; -*- coding: utf-8; -*-" utf-8-dos)
     ("~/utf-8-r-tag-none.dos"
      "" utf-8-dos))
    (generate-mostly-nonascii-file
     ("~/utf-8-m-tag-utf-8-unix.unix"
      ";; -*- coding: utf-8-unix; -*-" utf-8-unix)
     ("~/utf-8-m-tag-utf-8.unix"
      ";; -*- coding: utf-8; -*-" utf-8-unix)
     ("~/utf-8-m-tag-none.unix"
      "" utf-8-unix)
     ("~/utf-8-m-tag-utf-8-dos.dos"
      ";; -*- coding: utf-8-dos; -*-" utf-8-dos)
     ("~/utf-8-m-tag-utf-8.dos"
      ";; -*- coding: utf-8; -*-" utf-8-dos)
     ("~/utf-8-m-tag-none.dos"
      "" utf-8-dos)))
  "Benchmark file specifications, grouped by generator function.
Each element has the form (GENERATOR ENTRY...), where GENERATOR is a
function symbol and each ENTRY is a list (FILE TAG CODING): the file
name to write, the first line to put in the file, and the coding
system to write it with.")
(defun generate-benchmark-test-file ()
  "Write every benchmark file described by `test-file-list'.
For each group, the scratch buffer is emptied and refilled by calling
the group's generator function.  Then, for each file entry, the entry's
tag line is inserted at the start of the buffer, the buffer is written
to the entry's file with the entry's coding system, and the tag line
is deleted again so the text can be reused for the next entry."
  ;; NOTE(review): `interactive' is restored on the assumption that this
  ;; is meant to be run as a command -- confirm against the full file.
  (interactive)
  (with-temp-buffer
    (message "Generating data...")
    (dolist (files test-file-list)
      (delete-region (point-min) (point-max))
      (funcall (car files))
      (dolist (file (cdr files))
        (message "Writing %s..." (car file))
        (goto-char (point-min))
        (insert (nth 1 file) "\n")
        (let ((coding-system-for-write (nth 2 file)))
          (write-region (point-min) (point-max) (car file)))
        ;; Point is just after the inserted tag line; remove it.
        (delete-region (point-min) (point))))))
(defun benchmark-decoder ()
  "Insert decoding timings for the files in `test-file-list' at point.
Two passes are made, first with `disable-ascii-optimization' bound to
t and then to nil.  Each pass inserts a heading followed by one line
per file holding the file name and the value of `benchmark-run' for
ten readings of that file into a temporary buffer."
  (let ((gc-cons-threshold 4000000))
    ;; Each pass is (DISABLE-FLAG . HEADING).
    (dolist (pass '((t . "Without optimization:\n")
                    (nil . "With optimization:\n")))
      (insert (cdr pass))
      (dolist (group test-file-list)
        (dolist (entry (cdr group))
          (let* ((disable-ascii-optimization (car pass))
                 (timing (benchmark-run 10
                           (with-temp-buffer
                             (insert-file-contents (car entry))))))
            (insert (format "%s: %s\n" (car entry) timing))))))))
;; Local Variables:
;; byte-compile-warnings: (not obsolete)
;; End:
;; Announce the feature so that (require 'coding-tests) succeeds.
(provide 'coding-tests)
383 ;; coding-tests.el ends here