decoder-tests.el (decoder-tests-gen-file): New arg FILE.
[emacs.git] / test / automated / decoder-tests.el
blobe1b05faf3c01d44bf28e81e4c445b1a15a78b9b5
1 ;;; decoder-tests.el --- test for text decoder
3 ;; Copyright (C) 2013 Free Software Foundation, Inc.
5 ;; Author: Kenichi Handa <handa@gnu.org>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software: you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation, either version 3 of the License, or
12 ;; (at your option) any later version.
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22 ;;; Code:
24 (require 'ert)
26 ;; Directory to hold test data files.
(defvar decoder-tests-workdir
  (expand-file-name "decoder-tests" temporary-file-directory)
  "Directory in which the generated test data files are stored.")
30 ;; Remove all generated test files.
(defun decoder-tests-remove-files ()
  "Delete `decoder-tests-workdir' together with everything inside it.
Do nothing when the directory does not exist.  This function is used
as the cleanup form of `unwind-protect', so it must not signal an
error of its own when file generation failed before the directory
was created; that would hide the original failure."
  (when (file-directory-p decoder-tests-workdir)
    (delete-directory decoder-tests-workdir t)))
34 ;; Return the contents (specified by CONTENT-TYPE; ascii, latin, or
35 ;; binary) of a test file.
(defun decoder-tests-file-contents (content-type)
  "Return the contents of a test file of kind CONTENT-TYPE.
CONTENT-TYPE is one of the symbols `ascii', `latin' or `binary'.
Signal an error for any other value."
  (let* ((ascii "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
         (latin (concat ascii "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ\n"))
         ;; The unibyte form of LATIN followed by the bytes #xC0 #xC1
         ;; and a newline, converted back to a multibyte string.
         (binary (string-to-multibyte
                  (concat (string-as-unibyte latin)
                          (unibyte-string #xC0 #xC1 ?\n)))))
    (cond ((eq content-type 'ascii) ascii)
          ((eq content-type 'latin) latin)
          ((eq content-type 'binary) binary)
          ;; Catch-all clause: without the `t' head the `error' form
          ;; would itself be read as a (CONDITION BODY...) clause.
          (t
           (error "Invalid file content type: %s" content-type)))))
;; Generate FILE (a name relative to `decoder-tests-workdir') with
;; CONTENTS encoded by CODING-SYSTEM.  Return the expanded file name.
(defun decoder-tests-gen-file (file contents coding-system)
  "Write CONTENTS to FILE, encoded by CODING-SYSTEM.
FILE is expanded relative to `decoder-tests-workdir', which is
created first when it does not exist yet.  Return the expanded
file name."
  (unless (file-directory-p decoder-tests-workdir)
    (make-directory decoder-tests-workdir t))
  (let ((target (expand-file-name file decoder-tests-workdir)))
    (with-temp-file target
      (set-buffer-file-coding-system coding-system)
      (insert contents))
    target))
59 ;;; The following three functions are filters for contents of a test
60 ;;; file.
62 ;; Convert all LFs to CR LF sequences in the string STR.
(defun decoder-tests-lf-to-crlf (str)
  "Return a copy of STR in which every LF is replaced by CR LF."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      ;; Literal, fixed-case replacement of the LF just matched.
      (replace-match "\r\n" t t))
    (buffer-string)))
72 ;; Convert all LFs to CRs in the string STR.
(defun decoder-tests-lf-to-cr (str)
  "Return a copy of STR in which every LF is replaced by CR."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      (replace-match "\r" t t))
    (buffer-string)))
79 ;; Convert all LFs to LF LF sequences in the string STR.
(defun decoder-tests-lf-to-lflf (str)
  "Return a copy of STR in which every LF is doubled to LF LF."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      ;; Point ends up after the replacement, so the added LF is
      ;; never matched again.
      (replace-match "\n\n" t t))
    (buffer-string)))
88 ;; Prepend the UTF-8 BOM to STR.
(defun decoder-tests-add-bom (str)
  "Return STR with the character U+FEFF (byte order mark) prepended."
  (concat (string #xFEFF) str))
92 ;; Return the name of test file whose contents specified by
93 ;; CONTENT-TYPE and whose encoding specified by CODING-SYSTEM.
(defun decoder-tests-filename (content-type coding-system &optional ext)
  "Return the test file name for CONTENT-TYPE and CODING-SYSTEM.
The name is \"CONTENT-TYPE-CODING-SYSTEM\", with \".EXT\" appended
when EXT is non-nil, expanded in `decoder-tests-workdir'."
  (expand-file-name
   (if ext
       (format "%s-%s.%s" content-type coding-system ext)
     (format "%s-%s" content-type coding-system))
   decoder-tests-workdir))
102 ;;; Check ASCII optimizing decoder
104 ;; Generate a test file whose contents specified by CONTENT-TYPE and
105 ;; whose encoding specified by CODING-SYSTEM.
(defun decoder-tests-ao-gen-file (content-type coding-system)
  "Generate the test file for CONTENT-TYPE encoded by CODING-SYSTEM.
The file name comes from `decoder-tests-filename' and the contents
from `decoder-tests-file-contents'.  Return the value of
`decoder-tests-gen-file'."
  (decoder-tests-gen-file
   (decoder-tests-filename content-type coding-system)
   (decoder-tests-file-contents content-type)
   coding-system))
112 ;; Test the decoding of a file whose contents and encoding are
113 ;; specified by CONTENT-TYPE and WRITE-CODING. The test passes if the
114 ;; file is read by READ-CODING and detected as DETECTED-CODING and the
115 ;; contents is correctly decoded.
116 ;; Optional 5th arg TRANSLATOR is a function to translate the original
117 ;; file contents to match with the expected result of decoding. For
118 ;; instance, when a file of dos eol-type is read by unix eol-type,
;; `decoder-tests-lf-to-crlf' must be specified.
(defun decoder-tests (content-type write-coding read-coding detected-coding
                      &optional translator)
  "Check the decoding of the test file for CONTENT-TYPE and WRITE-CODING.
The file named by `decoder-tests-filename' is read with
`coding-system-for-read' bound to READ-CODING.  The check succeeds
when `buffer-file-coding-system' is then equal to DETECTED-CODING
and the buffer text equals the expected contents.  If TRANSLATOR
is non-nil, it is called with the original contents and its value
is used as the expected contents.

Return nil on success.  On failure, return a list of the detected
coding system, the buffer text as a list of characters, and the
expected text as a list of characters.  Callers wrap the call in
`should-not', so the list shows up in the failure report."
  (prefer-coding-system 'utf-8-auto)
  (let ((filename (decoder-tests-filename content-type write-coding)))
    (with-temp-buffer
      (let ((coding-system-for-read read-coding)
            (contents (decoder-tests-file-contents content-type))
            (disable-ascii-optimization nil))
        (if translator
            (setq contents (funcall translator contents)))
        (insert-file-contents filename)
        (if (and (coding-system-equal buffer-file-coding-system detected-coding)
                 (string= (buffer-string) contents))
            ;; Success must yield nil; the diagnostic list below is
            ;; only for the failing case.
            nil
          (list buffer-file-coding-system
                (string-to-list (buffer-string))
                (string-to-list contents)))))))
(ert-deftest ert-test-decoder-ascii ()
  "Check decoding of ASCII files written with each EOL type."
  (unwind-protect
      (progn
        (mapc (lambda (eol-type)
                (decoder-tests-ao-gen-file 'ascii eol-type))
              '(unix dos mac))
        ;; Each entry is (WRITE-CODING READ-CODING DETECTED-CODING
        ;; [TRANSLATOR]), passed on to `decoder-tests'.
        (dolist (spec '((unix undecided unix)
                        (dos undecided dos)
                        (dos dos dos)
                        (mac undecided mac)
                        (mac mac mac)
                        (dos utf-8 utf-8-dos)
                        (dos unix unix decoder-tests-lf-to-crlf)
                        (mac dos dos decoder-tests-lf-to-cr)
                        (dos mac mac decoder-tests-lf-to-lflf)))
          (should-not (apply #'decoder-tests 'ascii spec))))
    (decoder-tests-remove-files)))
(ert-deftest ert-test-decoder-latin ()
  "Check decoding of UTF-8 files, with and without signature."
  (unwind-protect
      (progn
        (dolist (coding '("utf-8" "utf-8-with-signature"))
          (dolist (eol-type '("unix" "dos" "mac"))
            (decoder-tests-ao-gen-file
             'latin (intern (format "%s-%s" coding eol-type)))))
        ;; Each entry is (WRITE-CODING READ-CODING DETECTED-CODING
        ;; [TRANSLATOR]), passed on to `decoder-tests'.  The final
        ;; entry used to appear twice verbatim; the redundant copy
        ;; has been dropped.
        (dolist (spec '((utf-8-unix undecided utf-8-unix)
                        (utf-8-unix utf-8-unix utf-8-unix)
                        (utf-8-dos undecided utf-8-dos)
                        (utf-8-dos utf-8-dos utf-8-dos)
                        (utf-8-mac undecided utf-8-mac)
                        (utf-8-mac utf-8-mac utf-8-mac)
                        (utf-8-dos unix utf-8-unix decoder-tests-lf-to-crlf)
                        (utf-8-mac dos utf-8-dos decoder-tests-lf-to-cr)
                        (utf-8-dos mac utf-8-mac decoder-tests-lf-to-lflf)
                        (utf-8-with-signature-unix undecided
                         utf-8-with-signature-unix)
                        (utf-8-with-signature-unix utf-8-auto
                         utf-8-with-signature-unix)
                        (utf-8-with-signature-dos undecided
                         utf-8-with-signature-dos)
                        (utf-8-with-signature-unix utf-8
                         utf-8-unix decoder-tests-add-bom)))
          (should-not (apply #'decoder-tests 'latin spec))))
    (decoder-tests-remove-files)))
(ert-deftest ert-test-decoder-binary ()
  "Check decoding of binary files written as raw-text."
  (unwind-protect
      (progn
        (dolist (coding '(raw-text-unix raw-text-dos raw-text-mac))
          (decoder-tests-ao-gen-file 'binary coding))
        ;; Each entry is (WRITE-CODING READ-CODING DETECTED-CODING
        ;; [TRANSLATOR]), passed on to `decoder-tests'.
        (dolist (spec '((raw-text-unix undecided raw-text-unix)
                        (raw-text-dos undecided raw-text-dos)
                        (raw-text-mac undecided raw-text-mac)
                        (raw-text-dos unix
                         raw-text-unix decoder-tests-lf-to-crlf)
                        (raw-text-mac dos
                         raw-text-dos decoder-tests-lf-to-cr)
                        (raw-text-dos mac
                         raw-text-mac decoder-tests-lf-to-lflf)))
          (should-not (apply #'decoder-tests 'binary spec))))
    (decoder-tests-remove-files)))
210 ;;; Check the coding system `prefer-utf-8'.
212 ;; Read FILE. Check if the encoding was detected as DETECT. If
213 ;; PREFER is non-nil, prefer that coding system before reading.
(defun decoder-tests-prefer-utf-8-read (file detect prefer)
  "Read FILE and check that its coding system is detected as DETECT.
If PREFER is non-nil, call `prefer-coding-system' with it before
reading.  Return nil when `buffer-file-coding-system' is `eq' to
DETECT after reading; otherwise return a string describing the
coding system that was detected instead."
  (if prefer
      (prefer-coding-system prefer))
  (with-temp-buffer
    (insert-file-contents file)
    (if (eq buffer-file-coding-system detect)
        ;; Success must yield nil; callers wrap this in `should-not'.
        nil
      (format "Invalid detection: %s" buffer-file-coding-system))))
224 ;; Read FILE, modify it, and write it. Check if the coding system
225 ;; used for writing was CODING. If CODING-TAG is non-nil, insert
226 ;; coding tag with it before writing. If STR is non-nil, insert it
227 ;; before writing.
(defun decoder-tests-prefer-utf-8-write (file coding-tag coding
                                         &optional str)
  "Read FILE, modify it, write it, and check the resulting coding system.
At the start of the text insert a coding cookie line naming
CODING-TAG if CODING-TAG is non-nil, or a plain \";;\" line
otherwise.  If STR is non-nil, insert it after that line.  The
buffer is then written to the file named by
\(decoder-tests-filename \\='test \\='test \"el\").

Return nil when `buffer-file-coding-system' is equal to CODING
after writing; otherwise return a string that reports
`last-coding-system-used'."
  (with-temp-buffer
    (insert-file-contents file)
    (goto-char (point-min))
    (if coding-tag
        (insert (format ";; -*- coding: %s; -*-\n" coding-tag))
      (insert ";;\n"))
    (if str
        (insert str))
    (write-file (decoder-tests-filename 'test 'test "el"))
    (if (coding-system-equal buffer-file-coding-system coding)
        ;; Success must yield nil; callers wrap this in `should-not'.
        nil
      (format "Incorrect encoding: %s" last-coding-system-used))))
(ert-deftest ert-test-decoder-prefer-utf-8 ()
  "Check reading and writing under the coding system `prefer-utf-8'."
  (unwind-protect
      (let* ((ascii (decoder-tests-gen-file
                     "ascii.el" (decoder-tests-file-contents 'ascii) 'unix))
             (latin (decoder-tests-gen-file
                     "utf-8.el" (decoder-tests-file-contents 'latin) 'utf-8)))
        ;; Read checks: (FILE DETECT PREFER).
        (dolist (spec (list (list ascii 'prefer-utf-8-unix nil)
                            (list latin 'utf-8-unix nil)
                            (list latin 'utf-8-unix 'iso-8859-1)
                            (list latin 'utf-8-unix 'sjis)))
          (should-not (apply #'decoder-tests-prefer-utf-8-read spec)))
        ;; Write checks on the ASCII file: (CODING-TAG CODING [STR]).
        (dolist (spec '((nil prefer-utf-8-unix)
                        (iso-8859-1 iso-8859-1-unix)
                        (nil utf-8-unix "À")))
          (should-not (apply #'decoder-tests-prefer-utf-8-write ascii spec))))
    (decoder-tests-remove-files)))
269 ;;; The following is for benchmark testing of the new optimized
270 ;;; decoder, not for regression testing.
(defun generate-ascii-file ()
  "Insert 100000 lines of 80 `a' characters each at point."
  (let ((line (concat (make-string 80 ?a) "\n")))
    (dotimes (_ 100000)
      (insert line))))
(defun generate-rarely-nonascii-file ()
  "Insert 100000 lines of 80 characters at point, almost all ASCII.
Every line consists of `a' characters, except that the line for
loop index 50000 starts with one non-ASCII character followed by
79 `a' characters."
  (dotimes (i 100000)
    (if (/= i 50000)
        (insert-char ?a 80)
      ;; The single non-ASCII character of the file.  `insert' with
      ;; no argument inserts nothing, which would leave the file
      ;; pure ASCII.  `À' is the first non-ASCII character of the
      ;; latin test contents used elsewhere in this file.
      (insert ?À)
      (insert-char ?a 79))
    (insert "\n")))
(defun generate-mostly-nonascii-file ()
  "Insert 60000 lines of 80 characters at point, half of them non-ASCII.
The first 30000 lines consist of `a', the next 20000 of `À', and
the last 10000 of `あ'."
  (dotimes (i 30000)
    (insert-char ?a 80)
    (insert "\n"))
  (dotimes (i 20000)
    ;; `insert-char' needs both the character and the count; the
    ;; fused symbol `insert-char80' names no function.  `À' is the
    ;; first non-ASCII character of the latin test contents used
    ;; elsewhere in this file.
    (insert-char ?À 80)
    (insert "\n"))
  (dotimes (i 10000)
    (insert-char ?あ 80)
    (insert "\n")))
(defvar test-file-list
  '((generate-ascii-file
     ("~/ascii-tag-utf-8-unix.unix" ";; -*- coding: utf-8-unix; -*-" unix)
     ("~/ascii-tag-utf-8.unix" ";; -*- coding: utf-8; -*-" unix)
     ("~/ascii-tag-none.unix" "" unix)
     ("~/ascii-tag-utf-8-dos.dos" ";; -*- coding: utf-8-dos; -*-" dos)
     ("~/ascii-tag-utf-8.dos" ";; -*- coding: utf-8; -*-" dos)
     ("~/ascii-tag-none.dos" "" dos))
    (generate-rarely-nonascii-file
     ("~/utf-8-r-tag-utf-8-unix.unix" ";; -*- coding: utf-8-unix; -*-" utf-8-unix)
     ("~/utf-8-r-tag-utf-8.unix" ";; -*- coding: utf-8; -*-" utf-8-unix)
     ("~/utf-8-r-tag-none.unix" "" utf-8-unix)
     ("~/utf-8-r-tag-utf-8-dos.dos" ";; -*- coding: utf-8-dos; -*-" utf-8-dos)
     ("~/utf-8-r-tag-utf-8.dos" ";; -*- coding: utf-8; -*-" utf-8-dos)
     ("~/utf-8-r-tag-none.dos" "" utf-8-dos))
    (generate-mostly-nonascii-file
     ("~/utf-8-m-tag-utf-8-unix.unix" ";; -*- coding: utf-8-unix; -*-" utf-8-unix)
     ("~/utf-8-m-tag-utf-8.unix" ";; -*- coding: utf-8; -*-" utf-8-unix)
     ("~/utf-8-m-tag-none.unix" "" utf-8-unix)
     ("~/utf-8-m-tag-utf-8-dos.dos" ";; -*- coding: utf-8-dos; -*-" utf-8-dos)
     ("~/utf-8-m-tag-utf-8.dos" ";; -*- coding: utf-8; -*-" utf-8-dos)
     ("~/utf-8-m-tag-none.dos" "" utf-8-dos)))
  "Description of the benchmark data files.
Each element has the form (GENERATOR ENTRY...).  GENERATOR is a
function that inserts the file body into the current buffer.
Each ENTRY is a list (FILE TAG CODING): FILE is the name of the
file to write, TAG is the first line to prepend (possibly empty),
and CODING is the coding system used for writing.")
(defun generate-benchmark-test-file ()
  "Write every benchmark data file described by `test-file-list'.
For each group, fill a temporary buffer by calling the group's
generator function.  Then, for each file entry of the group,
prepend the entry's tag line, write the buffer to the entry's file
with the entry's coding system, and remove the tag line again."
  (interactive)
  (with-temp-buffer
    (message "Generating data...")
    (dolist (group test-file-list)
      (erase-buffer)
      (funcall (car group))
      (dolist (entry (cdr group))
        (let ((name (car entry))
              (tag (nth 1 entry))
              (coding (nth 2 entry)))
          (message "Writing %s..." name)
          (goto-char (point-min))
          (insert tag "\n")
          (let ((coding-system-for-write coding))
            (write-region (point-min) (point-max) name))
          ;; Point is just after the tag line; drop it so the next
          ;; entry starts from the bare generated body.
          (delete-region (point-min) (point)))))))
(defun benchmark-decoder ()
  "Insert decoding timings for each file in `test-file-list' at point.
Every file is read 10 times with `insert-file-contents' under
`benchmark-run', first with `disable-ascii-optimization' bound to
t and then with it bound to nil.  The files are presumably the
ones written by `generate-benchmark-test-file'."
  (let ((gc-cons-threshold 4000000))
    ;; Each pass is (HEADING . VALUE-OF-disable-ascii-optimization).
    (dolist (pass '(("Without optimization:\n" . t)
                    ("With optimization:\n" . nil)))
      (insert (car pass))
      (dolist (group test-file-list)
        (dolist (entry (cdr group))
          (let* ((disable-ascii-optimization (cdr pass))
                 (name (car entry))
                 (result (benchmark-run 10
                           (with-temp-buffer
                             (insert-file-contents name)))))
            (insert (format "%s: %s\n" name result))))))))