added oct package for long-long arith
[CommonLispStat.git] / external / oct / qd-extra.lisp
blob2a77e886cf1bbbf79032ba21c5206cf326b856c2
1 ;;;; -*- Mode: lisp -*-
2 ;;;;
3 ;;;; Copyright (c) 2007 Raymond Toy
4 ;;;;
5 ;;;; Permission is hereby granted, free of charge, to any person
6 ;;;; obtaining a copy of this software and associated documentation
7 ;;;; files (the "Software"), to deal in the Software without
8 ;;;; restriction, including without limitation the rights to use,
9 ;;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
10 ;;;; copies of the Software, and to permit persons to whom the
11 ;;;; Software is furnished to do so, subject to the following
12 ;;;; conditions:
13 ;;;;
14 ;;;; The above copyright notice and this permission notice shall be
15 ;;;; included in all copies or substantial portions of the Software.
16 ;;;;
17 ;;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 ;;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 ;;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 ;;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 ;;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 ;;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 ;;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 ;;;; OTHER DEALINGS IN THE SOFTWARE.
26 ;;; This file contains various possible implementations of some of the
27 ;;; core routines. These were experiments on faster and/or more
28 ;;; accurate implementations. The routines inf qd-fun.lisp are the
29 ;;; default, but you can select a different implementation from here
30 ;;; if you want.
31 ;;;
32 ;;; The end of the file also includes some tests of the different
33 ;;; implementations for speed.
35 (in-package #:qdi)
37 ;; This works but seems rather slow, so we don't even compile it.
38 #+(or)
39 (defun exp-qd/newton (a)
40 (declare (type %quad-double a))
41 ;; Newton iteration
43 ;; f(x) = log(x) - a
45 ;; x' = x - (log(x) - a)/(1/x)
46 ;; = x - x*(log(x) - a)
47 ;; = x*(1 + a - log(x))
48 (let ((a1 (add-qd-d a 1d0))
49 (x (make-qd-d (exp (qd-0 a)))))
50 (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
51 (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
52 (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
53 x))
55 (defun expm1-qd/series (a)
56 (declare (type %quad-double a))
57 ;; Compute exp(x) - 1.
59 ;; D(x) = exp(x) - 1
61 ;; First, write x = s*log(2) + r*k where s is an integer and |r*k| <
62 ;; log(2)/2.
64 ;; Then D(x) = D(s*log(2)+r*k) = 2^s*exp(r*k) - 1
65 ;; = 2^s*(exp(r*k)-1) - 1 + 2^s
66 ;; = 2^s*D(r*k)+2^s-1
67 ;; But
68 ;; exp(r*k) = exp(r)^k
69 ;; = (D(r) + 1)^k
71 ;; So
72 ;; D(r*k) = (D(r) + 1)^k - 1
74 ;; For small r, D(r) can be computed using the Taylor series around
75 ;; zero. To compute D(r*k) = (D(r) + 1)^k - 1, we use the binomial
76 ;; theorem to expand out the power and to exactly cancel out the -1
77 ;; term, which is the source of inaccuracy.
79 ;; We want to have small r so the Taylor series converges quickly,
80 ;; but that means k is large, which means the binomial expansion is
81 ;; long. We need to compromise. Let use choose k = 8. Then |r| <
82 ;; log(2)/16 = 0.0433. For this range, the Taylor series converges
83 ;; to 212 bits of accuracy with about 28 terms.
86 (flet ((taylor (x)
87 (declare (type %quad-double x))
88 ;; Taylor series for exp(x)-1
89 ;; = x+x^2/2!+x^3/3!+x^4/4!+...
90 ;; = x*(1+x/2!+x^2/3!+x^3/4!+...)
91 (let ((sum +qd-one+)
92 (term +qd-one+))
93 (dotimes (k 28)
94 (setf term (div-qd-d (mul-qd term x) (float (cl:+ k 2) 1d0)))
95 (setf sum (add-qd sum term)))
96 (mul-qd x sum)))
97 (binom (x)
98 (declare (type %quad-double x))
99 ;; (1+x)^8-1
100 ;; = x*(8 + 28*x + 56*x^2 + 70*x^3 + 56*x^4 + 28*x^5 + 8*x^6 + x^7)
101 ;; = x (x (x (x (x (x (x (x + 8) + 28) + 56) + 70) + 56) + 28) + 8)
102 (mul-qd
104 (add-qd-d
105 (mul-qd x
106 (add-qd-d
107 (mul-qd x
108 (add-qd-d
109 (mul-qd x
110 (add-qd-d
111 (mul-qd x
112 (add-qd-d
113 (mul-qd x
114 (add-qd-d
115 (mul-qd x
116 (add-qd-d x 8d0))
117 28d0))
118 56d0))
119 70d0))
120 56d0))
121 28d0))
122 8d0)))
123 (arg-reduce (x)
124 (declare (type %quad-double x))
125 ;; Write x = s*log(2) + r*k where s is an integer and |r*k|
126 ;; < log(2)/2, and k = 8.
127 (let* ((s (truncate (qd-0 (nint-qd (div-qd a +qd-log2+)))))
128 (r*k (sub-qd x (mul-qd-d +qd-log2+ (float s 1d0))))
129 (r (div-qd-d r*k 8d0)))
130 (values s r))))
131 (multiple-value-bind (s r)
132 (arg-reduce a)
133 (let* ((d (taylor r))
134 (dr (binom d)))
135 (add-qd-d (scale-float-qd dr s)
136 (cl:- (scale-float 1d0 s) 1))))))
138 (defun log-qd/newton (a)
139 (declare (type %quad-double a))
140 ;; The Taylor series for log converges rather slowly. Hence, this
141 ;; routine tries to determine the root of the function
143 ;; f(x) = exp(x) - a
145 ;; using Newton iteration. The iteration is
147 ;; x' = x - f(x) / f'(x)
148 ;; = x - (1 - a * exp(-x))
149 ;; = x + a * exp(-x) - 1
151 ;; Two iterations are needed.
152 (let ((x (make-qd-d (log (qd-0 a)))))
153 (dotimes (k 3)
154 (setf x (sub-qd-d (add-qd x (mul-qd a (exp-qd (neg-qd x))))
155 1d0)))
159 ;;(declaim (inline agm-qd))
161 (defun agm-qd (x y)
162 (declare (type %quad-double x y)
163 (optimize (speed 3)))
164 (let ((diff (qd-0 (abs-qd (sub-qd x y)))))
165 (cond ((< diff +qd-eps+)
168 (let ((a-mean (div-qd-d (add-qd x y) 2d0))
169 (g-mean (sqrt-qd (mul-qd x y))))
170 (agm-qd a-mean g-mean))))))
172 #+(or)
173 (defun agm-qd (x y)
174 (declare (type %quad-double x y)
175 (optimize (speed 3) (space 0) (safety 0)))
176 (let ((diff (qd-0 (abs-qd (sub-qd x y))))
177 (x x)
178 (y y))
179 (declare (double-float diff))
180 (loop while (> diff +qd-eps+)
182 (let ((a-mean (scale-float-qd (add-qd x y) -1))
183 (g-mean (sqrt-qd (mul-qd x y))))
184 (setf x a-mean)
185 (setf y g-mean)
186 (setf diff (qd-0 (abs-qd (sub-qd x y))))))
189 (defun log-qd/agm (x)
190 (declare (type %quad-double x))
191 ;; log(x) ~ pi/2/agm(1,4/x)*(1+O(1/x^2))
193 ;; Need to make x >= 2^(d/2) to get d bits of precision. We use
195 ;; log(2^k*x) = k*log(2)+log(x)
197 ;; to compute log(x). log(2^k*x) is computed using AGM.
199 (multiple-value-bind (frac exp)
200 (decode-float (qd-0 x))
201 (declare (ignore frac))
202 (cond ((>= exp 106)
203 ;; Big enough to use AGM
204 (div-qd +qd-pi/2+
205 (agm-qd +qd-one+
206 (div-qd (make-qd-d 4d0)
207 x))))
209 ;; log(x) = log(2^k*x) - k * log(2)
210 (let* ((k (cl:- 107 exp))
211 (big-x (scale-float-qd x k)))
212 ;; Compute k*log(2) using extra precision by writing
213 ;; log(2) = a + b, where a is the quad-double
214 ;; approximation and b the rest.
215 (sub-qd (log-qd/agm big-x)
216 (add-qd (mul-qd-d +qd-log2+ (float k 1d0))
217 (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
219 (defun log-qd/agm2 (x)
220 (declare (type %quad-double x))
221 ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
223 ;; where q = 1/x
225 ;; Need to make x >= 2^(d/36) to get d bits of precision. We use
227 ;; log(2^k*x) = k*log(2)+log(x)
229 ;; to compute log(x). log(2^k*x) is computed using AGM.
231 (multiple-value-bind (frac exp)
232 (decode-float (qd-0 x))
233 (declare (ignore frac))
234 (cond ((>= exp 7)
235 ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
236 (let* ((q (div-qd +qd-one+
238 (q^4 (npow q 4))
239 (q^8 (sqr-qd q^4))
240 ;; theta2(q^4) = 2*q*(1+q^8+q^24)
241 ;; = 2*q*(1+q^8+(q^8)^3)
242 (theta2 (mul-qd-d
243 (mul-qd
245 (add-qd-d
246 (add-qd q^8
247 (npow q^8 3))
248 1d0))
249 2d0))
250 ;; theta3(q^4) = 1+2*(q^4+q^16)
251 ;; = 1+2*(q^4+(q^4)^4)
252 (theta3 (add-qd-d
253 (mul-qd-d
254 (add-qd q^4
255 (npow q^4 4))
256 2d0)
257 1d0)))
258 (div-qd +qd-pi/4+
259 (agm-qd (sqr-qd theta2)
260 (sqr-qd theta3)))))
262 ;; log(x) = log(2^k*x) - k * log(2)
263 (let* ((k (cl:- 7 exp))
264 (big-x (scale-float-qd x k)))
265 (sub-qd (log-qd/agm2 big-x)
266 (add-qd (mul-qd-d +qd-log2+ (float k 1d0))
267 (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
269 (defun log-qd/agm3 (x)
270 (declare (type %quad-double x))
271 ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
273 ;; where q = 1/x
275 ;; Need to make x >= 2^(d/36) to get d bits of precision. We use
277 ;; log(2^k*x) = k*log(2)+log(x)
279 ;; to compute log(x). log(2^k*x) is computed using AGM.
281 (multiple-value-bind (frac exp)
282 (decode-float (qd-0 x))
283 (declare (ignore frac))
284 (cond ((>= exp 7)
285 ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
286 (let* ((q (div-qd +qd-one+
288 (q^4 (npow q 4))
289 (q^8 (sqr-qd q^4))
290 ;; theta2(q^4) = 2*q*(1+q^8+q^24)
291 ;; = 2*q*(1+q^8+(q^8)^3)
292 (theta2 (mul-qd-d
293 (mul-qd
295 (add-qd-d
296 (add-qd q^8
297 (npow q^8 3))
298 1d0))
299 2d0))
300 ;; theta3(q^4) = 1+2*(q^4+q^16)
301 ;; = 1+2*(q^4+(q^4)^4)
302 (theta3 (add-qd-d
303 (mul-qd-d
304 (add-qd q^4
305 (npow q^4 4))
306 2d0)
307 1d0)))
308 ;; Note that agm(theta2^2,theta3^2) = agm(2*theta2*theta3,theta2^2+theta3^2)/2
309 (div-qd +qd-pi/4+
310 (scale-float-qd
311 (agm-qd (scale-float-qd (mul-qd theta2 theta3) 1)
312 (add-qd (sqr-qd theta2)
313 (sqr-qd theta3)))
314 -1))))
316 ;; log(x) = log(2^k*x) - k * log(2)
317 (let* ((k (cl:- 7 exp))
318 (big-x (scale-float-qd x k)))
319 (sub-qd (log-qd/agm3 big-x)
320 (add-qd
321 (mul-qd-d +qd-log2+ (float k 1d0))
322 (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
324 #+(or)
325 (defun atan-d (y x)
326 (let* ((r (abs (complex x y)))
327 (xx (cl:/ x r))
328 (yy (cl:/ y r)))
329 (let ((z (atan (float y 1f0) (float x 1f0)))
330 (sinz 0d0)
331 (cosz 0d0))
332 (format t "z = ~A~%" z)
333 (cond ((> xx yy)
334 (format t "xx > yy~%")
335 (dotimes (k 5)
336 (let* ((sinz (sin z))
337 (cosz (cos z))
338 (delta (cl:/ (cl:- yy sinz)
339 cosz)))
340 (format t "sz, dz = ~A ~A~%" sinz cosz)
341 (format t "delta = ~A~%" delta)
342 (setf z (cl:+ z delta))
343 (format t "z = ~A~%" z))))
345 (dotimes (k 20)
346 (let ((sinz (sin z))
347 (cosz (cos z)))
348 (format t "sz, dz = ~A ~A~%" sinz cosz)
350 (setf z (cl:- z (cl:/ (cl:- xx cosz)
351 sinz)))
352 (format t "z = ~A~%" z)))))
353 z)))
356 (defvar *table*)
357 (defvar *ttable*)
358 (defvar *cordic-scale*)
360 #+nil
361 (defun setup-cordic ()
362 (let ((table (make-array 34))
363 (ttable (make-array 34)))
364 (setf (aref table 0) 1d0)
365 (setf (aref table 1) 1d0)
366 (setf (aref table 2) 1d0)
367 (setf (aref ttable 0) (cl:/ pi 4))
368 (setf (aref ttable 1) (cl:/ pi 4))
369 (setf (aref ttable 2) (cl:/ pi 4))
370 (loop for k from 3 below 34 do
371 (setf (aref table k) (cl:* 0.5d0 (aref table (cl:1- k))))
372 (setf (aref ttable k) (atan (aref table k))))
373 (setf *table* table)
374 (setf *ttable* ttable)))
376 (defun setup-cordic ()
377 (let ((table (make-array 34))
378 (ttable (make-array 34)))
379 (setf (aref table 0) 4d0)
380 (setf (aref table 1) 2d0)
381 (setf (aref table 2) 1d0)
382 (setf (aref ttable 0) (atan 4d0))
383 (setf (aref ttable 1) (atan 2d0))
384 (setf (aref ttable 2) (cl:/ pi 4))
385 (loop for k from 3 below 34 do
386 (setf (aref table k) (cl:* 0.5d0 (aref table (cl:1- k))))
387 (setf (aref ttable k) (atan (aref table k))))
388 (setf *table* table)
389 (setf *ttable* ttable)))
391 (defun setup-cordic ()
392 (let ((table (make-array 34))
393 (ttable (make-array 34))
394 (scale 1d0))
395 (loop for k from 0 below 34 do
396 (setf (aref table k) (scale-float 1d0 (cl:- 2 k)))
397 (setf (aref ttable k) (atan (aref table k)))
398 (setf scale (cl:* scale (cos (aref ttable k)))))
399 (setf *table* table)
400 (setf *ttable* ttable)
401 (setf *cordic-scale* scale)))
404 (defun cordic-rot (x y)
405 (let ((z 0))
406 (dotimes (k (length *table*))
407 (cond ((plusp y)
408 (psetq x (cl:+ x (cl:* y (aref *table* k)))
409 y (cl:- y (cl:* x (aref *table* k))))
410 (incf z (aref *ttable* k)))
412 (psetq x (cl:- x (cl:* y (aref *table* k)))
413 y (cl:+ y (cl:* x (aref *table* k))))
414 (decf z (aref *ttable* k)))
416 (values z x y)))
418 (defun cordic-vec (z)
419 (let ((x 1d0)
420 (y 0d0)
421 (scale 1d0))
422 (dotimes (k 12 (length *table*))
423 (setf scale (cl:* scale (cos (aref *ttable* k))))
424 (cond ((minusp z)
425 (psetq x (cl:+ x (cl:* y (aref *table* k)))
426 y (cl:- y (cl:* x (aref *table* k))))
427 (incf z (aref *ttable* k)))
429 (psetq x (cl:- x (cl:* y (aref *table* k)))
430 y (cl:+ y (cl:* x (aref *table* k))))
431 (decf z (aref *ttable* k)))
433 (values x y z scale)))
435 (defun atan2-d (y x)
436 (multiple-value-bind (z dx dy)
437 (cordic-rot x y)
438 (let ((theta (cl:/ dy dx)))
439 (format t "theta = ~A~%" theta)
440 (let ((corr (cl:+ theta
441 (cl:- (cl:/ (expt theta 3)
443 (cl:/ (expt theta 5)
444 5))))
445 (format t "corr = ~A~%" corr)
446 (cl:+ z corr)))))
448 (defun tan-d (r)
449 (multiple-value-bind (x y z)
450 (cordic-vec r)
451 (setf x (cl:* x *cordic-scale*))
452 (setf y (cl:* y *cordic-scale*))
453 (format t "x = ~A~%" x)
454 (format t "y = ~A~%" y)
455 (format t "z = ~A~%" z)
456 ;; Need to finish of the rotation
457 (let ((st (sin z))
458 (ct (cos z)))
459 (format t "st, ct = ~A ~A~%" st ct)
460 (psetq x (cl:- (cl:* x ct) (cl:* y st))
461 y (cl:+ (cl:* y ct) (cl:* x st)))
462 (format t "x = ~A~%" x)
463 (format t "y = ~A~%" y)
464 (cl:/ y x)
467 (defun sin-d (r)
468 (declare (type double-float r))
469 (multiple-value-bind (x y z s)
470 (cordic-vec r)
472 ;; Need to finish the rotation
473 (let ((st (sin z))
474 (ct (cos z)))
475 (psetq x (cl:- (cl:* x ct) (cl:* y st))
476 y (cl:+ (cl:* y ct) (cl:* x st)))
477 (cl:* s y))))
480 ;; This is the basic CORDIC rotation. Based on code from
481 ;; http://www.voidware.com/cordic.htm and
482 ;; http://www.dspcsp.com/progs/cordic.c.txt.
484 ;; The only difference between this version and the typical CORDIC
485 ;; implementation is that the first 3 rotations are all by pi/4. This
486 ;; makes sense. If the angle is greater than pi/4, the rotations will
487 ;; reduce it to at most pi/4. If the angle is less than pi/4, the 3
488 ;; rotations by pi/4 will cause us to end back at the same place.
489 ;; (Should we try to be smarter?)
490 (defun cordic-rot-qd (x y)
491 (declare (type %quad-double y x)
492 (optimize (speed 3)))
493 (let* ((zero +qd-zero+)
494 (z zero))
495 (declare (type %quad-double zero z))
496 (dotimes (k (length +atan-table+))
497 (declare (fixnum k))
498 (cond ((qd-> y zero)
499 (psetq x (add-qd x (mul-qd-d y (aref +atan-power-table+ k)))
500 y (sub-qd y (mul-qd-d x (aref +atan-power-table+ k))))
501 (setf z (add-qd z (aref +atan-table+ k))))
503 (psetq x (sub-qd x (mul-qd-d y (aref +atan-power-table+ k)))
504 y (add-qd y (mul-qd-d x (aref +atan-power-table+ k))))
505 (setf z (sub-qd z (aref +atan-table+ k))))))
506 (values z x y)))
508 (defun atan2-qd/cordic (y x)
509 (declare (type %quad-double y x))
510 ;; Use the CORDIC rotation to get us to a small angle. Then use the
511 ;; Taylor series for atan to finish the computation.
512 (multiple-value-bind (z dx dy)
513 (cordic-rot-qd x y)
514 ;; Use Taylor series to finish off the computation
515 (let* ((arg (div-qd dy dx))
516 (sq (neg-qd (sqr-qd arg)))
517 (sum +qd-one+))
518 ;; atan(x) = x - x^3/3 + x^5/5 - ...
519 ;; = x*(1-x^2/3+x^4/5-x^6/7+...)
520 (do ((k 3d0 (cl:+ k 2d0))
521 (term sq))
522 ((< (abs (qd-0 term)) +qd-eps+))
523 (setf sum (add-qd sum (div-qd-d term k)))
524 (setf term (mul-qd term sq)))
525 (setf sum (mul-qd arg sum))
526 (add-qd z sum))))
528 (defun atan-qd/cordic (y)
529 (declare (type %quad-double y))
530 (atan2-qd/cordic y +qd-one+))
532 (defun atan-qd/duplication (y)
533 (declare (type %quad-double y)
534 (optimize (speed 3) (space 0)))
535 (cond ((< (abs (qd-0 y)) 1d-4)
536 ;; Series
537 (let* ((arg y)
538 (sq (neg-qd (sqr-qd arg)))
539 (sum +qd-one+))
540 ;; atan(x) = x - x^3/3 + x^5/5 - ...
541 ;; = x*(1-x^2/3+x^4/5-x^6/7+...)
542 (do ((k 3d0 (cl:+ k 2d0))
543 (term sq))
544 ((< (abs (qd-0 term)) +qd-eps+))
545 (setf sum (add-qd sum (div-qd-d term k)))
546 (setf term (mul-qd term sq)))
547 (mul-qd arg sum)))
549 ;; atan(x) = 2*atan(x/(1 + sqrt(1 + x^2)))
550 (let ((x (div-qd y
551 (add-qd-d (sqrt-qd (add-qd-d (sqr-qd y) 1d0))
552 1d0))))
553 (scale-float-qd (atan-qd/duplication x) 1)))))
555 (defun cordic-vec-qd (z)
556 (declare (type %quad-double z)
557 (optimize (speed 3)))
558 (let* ((x +qd-one+)
559 (y +qd-zero+)
560 (zero +qd-zero+))
561 (declare (type %quad-double zero x y))
562 (dotimes (k 30 (length +atan-table+))
563 (declare (fixnum k)
564 (inline mul-qd-d sub-qd add-qd))
565 (cond ((qd-> z zero)
566 (psetq x (sub-qd x (mul-qd-d y (aref +atan-power-table+ k)))
567 y (add-qd y (mul-qd-d x (aref +atan-power-table+ k))))
568 (setf z (sub-qd z (aref +atan-table+ k))))
570 (psetq x (add-qd x (mul-qd-d y (aref +atan-power-table+ k)))
571 y (sub-qd y (mul-qd-d x (aref +atan-power-table+ k))))
572 (setf z (add-qd z (aref +atan-table+ k))))))
573 (values z x y)))
575 (defun tan-qd/cordic (r)
576 (declare (type %quad-double r))
577 (multiple-value-bind (z x y)
578 (cordic-vec-qd r)
579 ;; Need to finish the rotation
580 (multiple-value-bind (st ct)
581 (sincos-taylor z)
582 (psetq x (sub-qd (mul-qd x ct) (mul-qd y st))
583 y (add-qd (mul-qd y ct) (mul-qd x st)))
584 (div-qd y x))))
587 (defun sin-qd/cordic (r)
588 (declare (type %quad-double r))
589 (multiple-value-bind (z x y)
590 (cordic-vec-qd r)
591 #+nil
592 (progn
593 (format t "~&x = ~/qd::qd-format/~%" x)
594 (format t "~&y = ~/qd::qd-format/~%" y)
595 (format t "~&z = ~/qd::qd-format/~%" z)
596 (format t "~&s = ~/qd::qd-format/~%" s))
597 ;; Need to finish the rotation
598 (multiple-value-bind (st ct)
599 (sincos-taylor z)
600 #+nil
601 (progn
602 (format t "~&st = ~/qd::qd-format/~%" st)
603 (format t "~&ct = ~/qd::qd-format/~%" ct)
604 (format t "~&y = ~/qd::qd-format/~%" (mul-qd +cordic-scale+ y)))
606 (psetq x (sub-qd (mul-qd x ct) (mul-qd y st))
607 y (add-qd (mul-qd y ct) (mul-qd x st)))
608 (mul-qd +cordic-scale+ y))))
611 ;; Some timing and consing tests.
613 ;; The tests are run using the following:
615 ;; Sparc: 1.5 GHz Ultrasparc IIIi
616 ;; Sparc2: 450 MHz Ultrasparc II
617 ;; PPC: 1.42 GHz
618 ;; x86: 866 MHz Pentium 3
619 ;; PPC(fma): 1.42 GHz with cmucl with fused-multiply-add double-double.
622 ;; (time-exp #c(2w0 0) 50000)
624 ;; Time Sparc PPC x86 PPC (fma) Sparc2
625 ;; exp-qd/reduce 2.06 3.18 10.46 2.76 6.12
626 ;; expm1-qd/series 8.81 12.24 18.87 3.26 29.0
627 ;; expm1-qd/dup 5.68 4.34 18.47 3.64 18.78
629 ;; Consing (MB) Sparc
630 ;; exp-qd/reduce 45 45 638 44.4 45
631 ;; expm1-qd/series 519 519 1201 14.8 519
632 ;; expm1-qd/dup 32 32 1224 32.0 32
634 ;; Speeds seem to vary quite a bit between architectures.
636 ;; Timing without inlining all the basic functions everywhere. (That
637 ;; is, :qd-inline is not a feature.)
639 ;; (time-exp #c(2w0 0) 50000)
641 ;; Time Sparc PPC x86 PPC (fma)
642 ;; exp-qd/reduce 5.83 0.67 10.67 0.98
643 ;; expm1-qd/series 10.65 1.45 21.06 1.35
644 ;; expm1-qd/dup 11.17 1.36 24.01 1.25
646 ;; Consing Sparc
647 ;; exp-qd/reduce 638 93 638 93
648 ;; expm1-qd/series 1203 120 1201 120
649 ;; expm1-qd/dup 1224 122 1224 122
651 ;; So inlining speeds things up by a factor of about 3 for sparc,
652 ;; 1.5-4 for ppc. Strangely, x86 slows down on some but speeds up on
653 ;; others.
654 (defun time-exp (x n)
655 (declare (type %quad-double x)
656 (fixnum n))
657 (let ((y +qd-zero+))
658 (declare (type %quad-double y))
659 #+cmu (gc :full t)
660 (format t "exp-qd/reduce~%")
661 (time (dotimes (k n)
662 (declare (fixnum k))
663 (setf y (exp-qd/reduce x))))
664 #+cmu (gc :full t)
665 (format t "expm1-qd/series~%")
666 (time (dotimes (k n)
667 (declare (fixnum k))
668 (setf y (expm1-qd/series x))))
669 #+cmu (gc :full t)
670 (format t "expm1-qd/duplication~%")
671 (time (dotimes (k n)
672 (declare (fixnum k))
673 (setf y (expm1-qd/duplication x))))
677 ;; (time-log #c(3w0 0) 50000)
679 ;; Time (s) Sparc PPC x86 PPC (fma) Sparc2
680 ;; log-qd/newton 7.08 10.23 35.74 8.82 21.77
681 ;; log1p-qd/dup 5.87 8.41 27.32 6.65 20.73
682 ;; log-qd/agm 6.58 8.0 27.2 6.87 24.62
683 ;; log-qd/agm2 5.8 6.93 22.89 6.07 18.44
684 ;; log-qd/agm3 5.45 6.57 20.97 6.18 20.34
685 ;; log-qd/halley 4.96 6.8 25.11 7.01 16.13
687 ;; Consing (MB) Sparc PPC x86 PPC (fma)
688 ;; log-qd/newton 150 150 2194 148 150
689 ;; log1p-qd/dup 56 56 1564 56 56
690 ;; log-qd/agm 81 11 1434 81 81
691 ;; log-qd/agm2 87 35 1184 87 87
692 ;; log-qd/agm3 82 36 1091 81 82
693 ;; log-qd/halley 101 101 1568 100 101
695 ;; Based on these results, it's not really clear what is the fastest.
696 ;; But Halley's iteration is probably a good tradeoff for log.
698 ;; However, consider log(1+2^(-100)). Use log1p as a reference:
699 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649115711502410110281q-31
701 ;; We have
702 ;; log-qd
703 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649133878797727478488q-31
704 ;; log-agm
705 ;; 7.88860905221011805411728565282514580471135738786455290255431302193794546609432q-31
706 ;; log-agm2
707 ;; 7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
708 ;; log-agm3
709 ;; 7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
710 ;; log-halley
711 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649120253326239452326q-31
713 ;; We can see that the AGM methods are grossly inaccurate, but log-qd
714 ;; and log-halley are quite good.
716 ;; Timing results without inlining everything:
718 ;; Time Sparc PPC x86 PPC (fma)
719 ;; log-qd/newton 21.37 0.87 41.49 0.62
720 ;; log1p-qd/dup 12.58 0.41 31.86 0.28
721 ;; log-qd/agm 7.17 0.23 34.86 0.16
722 ;; log-qd/agm2 6.35 0.22 27.53 0.15
723 ;; log-qd/agm3 7.49 0.17 24.92 0.14
724 ;; log-qd/halley 14.38 0.56 30.2 0.65
726 ;; Consing
727 ;; Sparc PPC x86 PPC (fma)
728 ;; log-qd/newton 2194 60.7 2194 61
729 ;; log1p-qd/dup 1114 22.6 1564 23
730 ;; log-qd/agm 371 7.9 1434 7.9
731 ;; log-qd/agm2 371 7.8 1185 7.8
732 ;; log-qd/agm3 373 7.8 1091 7.8
733 ;; log-qd/halley 1554 42.3 1567 42.3
735 (defun time-log (x n)
736 (declare (type %quad-double x)
737 (fixnum n))
738 (let ((y +qd-zero+))
739 (declare (type %quad-double y))
740 #+cmu (gc :full t)
741 (format t "log-qd/newton~%")
742 (time (dotimes (k n)
743 (declare (fixnum k))
744 (setf y (log-qd/newton x))))
745 #+cmu (gc :full t)
746 (format t "log1p-qd/duplication~%")
747 (time (dotimes (k n)
748 (declare (fixnum k))
749 (setf y (log1p-qd/duplication x))))
750 #+cmu (gc :full t)
751 (format t "log-qd/agm~%")
752 (time (dotimes (k n)
753 (declare (fixnum k))
754 (setf y (log-qd/agm x))))
756 #+cmu (gc :full t)
757 (format t "log-qd/agm2~%")
758 (time (dotimes (k n)
759 (declare (fixnum k))
760 (setf y (log-qd/agm2 x))))
761 #+cmu (gc :full t)
762 (format t "log-qd/agm3~%")
763 (time (dotimes (k n)
764 (declare (fixnum k))
765 (setf y (log-qd/agm3 x))))
766 #+cmu (gc :full t)
767 (format t "log-qd/halley~%")
768 (time (dotimes (k n)
769 (declare (fixnum k))
770 (setf y (log-qd/halley x))))
774 ;; (time-atan2 #c(10w0 0) 10000)
776 ;; Time
777 ;; PPC Sparc x86 PPC (fma) Sparc2
778 ;; atan2-qd/newton 2.91 1.91 8.06 2.16 7.55
779 ;; atan2-qd/cordic 1.22 0.89 6.68 1.43 2.47
780 ;; atan-qd/duplication 2.51 2.14 5.63 1.76 5.94
782 ;; Consing
783 ;; atan2-qd/newton 44.4 44.4 481 44.4 44.4
784 ;; atan2-qd/cordic 1.6 1.6 482 1.6 1.6
785 ;; atan-qd/duplication 17.2 6.0 281 6.0 6.0
787 ;; Don't know why x86 is 10 times slower than sparc/ppc for
788 ;; atan2-qd/newton. Consing is much more too. Not enough registers?
790 ;; atan2-qd/cordic is by far the fastest on all archs.
792 ;; Timing results without inlining everything:
793 ;; Time
794 ;; PPC Sparc x86 PPC (fma)
795 ;; atan2-qd/newton 6.56 4.48 9.75 6.15
796 ;; atan2-qd/cordic 6.02 4.24 7.06 5.01
797 ;; atan-qd/duplication 3.28 1.94 5.72 2.46
799 ;; Consing
800 ;; atan2-qd/newton 443 441 482 443
801 ;; atan2-qd/cordic 482 482 482 482
802 ;; atan-qd/duplication 87 81 281 87
805 (defun time-atan2 (x n)
806 (declare (type %quad-double x)
807 (fixnum n))
808 (let ((y +qd-zero+)
809 (one +qd-one+))
810 #+cmu (gc :full t)
811 (format t "atan2-qd/newton~%")
812 (time (dotimes (k n)
813 (declare (fixnum k))
814 (setf y (atan2-qd/newton x one))))
815 #+cmu (gc :full t)
816 (format t "atan2-qd/cordic~%")
817 (time (dotimes (k n)
818 (declare (fixnum k))
819 (setf y (atan2-qd/cordic x one))))
820 #+cmu (gc :full t)
821 (format t "atan-qd/duplication~%")
822 (time (dotimes (k n)
823 (declare (fixnum k))
824 (setf y (atan-qd/duplication x))))
827 ;; (time-tan #c(10w0 0) 10000)
829 ;; Time
830 ;; PPC Sparc x86 PPC (fma) Sparc2
831 ;; tan-qd/cordic 2.12 1.51 8.26 1.77 4.61
832 ;; tan-qd/sincos 0.68 0.57 2.39 0.54 2.56
834 ;; Consing
835 ;; tan-qd/cordic 23.0 23.0 473 23.0 23.0
836 ;; tan-qd/sincos 14.8 14.8 147 14.8 14.8
838 ;; Don't know why x86 is so much slower for tan-qd/cordic.
840 ;; Without inlining everything
841 ;; PPC Sparc x86 PPC (fma)
842 ;; tan-qd/cordic 7.72 4.56 17.08 5.96
843 ;; tan-qd/sincos 2.32 1.4 4.91 1.87
845 ;; Consing
846 ;; tan-qd/cordic 463 463 472 463
847 ;; tan-qd/sincos 137 136 146 137
849 (defun time-tan (x n)
850 (declare (type %quad-double x)
851 (fixnum n))
852 (let ((y +qd-zero+))
853 #+cmu (gc :full t)
854 (format t "tan-qd/cordic~%")
855 (time (dotimes (k n)
856 (declare (fixnum k))
857 (setf y (tan-qd/cordic x))))
858 #+cmu (gc :full t)
859 (format t "tan-qd/sincos~%")
860 (time (dotimes (k n)
861 (declare (fixnum k))
862 (setf y (tan-qd/sincos x))))))