1 ;;;; -*- Mode: lisp -*-
3 ;;;; Copyright (c) 2007 Raymond Toy
5 ;;;; Permission is hereby granted, free of charge, to any person
6 ;;;; obtaining a copy of this software and associated documentation
7 ;;;; files (the "Software"), to deal in the Software without
8 ;;;; restriction, including without limitation the rights to use,
9 ;;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
10 ;;;; copies of the Software, and to permit persons to whom the
11 ;;;; Software is furnished to do so, subject to the following
14 ;;;; The above copyright notice and this permission notice shall be
15 ;;;; included in all copies or substantial portions of the Software.
17 ;;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 ;;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 ;;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 ;;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 ;;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 ;;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 ;;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 ;;;; OTHER DEALINGS IN THE SOFTWARE.
26 ;;; This file contains various possible implementations of some of the
27 ;;; core routines. These were experiments on faster and/or more
28 ;;; accurate implementations. The routines inf qd-fun.lisp are the
29 ;;; default, but you can select a different implementation from here
32 ;;; The end of the file also includes some tests of the different
33 ;;; implementations for speed.
37 ;; This works but seems rather slow, so we don't even compile it.
39 (defun exp-qd/newton
(a)
40 (declare (type %quad-double a
))
45 ;; x' = x - (log(x) - a)/(1/x)
46 ;; = x - x*(log(x) - a)
47 ;; = x*(1 + a - log(x))
48 (let ((a1 (add-qd-d a
1d0
))
49 (x (make-qd-d (exp (qd-0 a
)))))
50 (setf x
(mul-qd x
(sub-qd a1
(log-qd/agm x
))))
51 (setf x
(mul-qd x
(sub-qd a1
(log-qd/agm x
))))
52 (setf x
(mul-qd x
(sub-qd a1
(log-qd/agm x
))))
55 (defun expm1-qd/series
(a)
56 (declare (type %quad-double a
))
57 ;; Compute exp(x) - 1.
61 ;; First, write x = s*log(2) + r*k where s is an integer and |r*k| <
64 ;; Then D(x) = D(s*log(2)+r*k) = 2^s*exp(r*k) - 1
65 ;; = 2^s*(exp(r*k)-1) - 1 + 2^s
68 ;; exp(r*k) = exp(r)^k
72 ;; D(r*k) = (D(r) + 1)^k - 1
74 ;; For small r, D(r) can be computed using the Taylor series around
75 ;; zero. To compute D(r*k) = (D(r) + 1)^k - 1, we use the binomial
76 ;; theorem to expand out the power and to exactly cancel out the -1
77 ;; term, which is the source of inaccuracy.
79 ;; We want to have small r so the Taylor series converges quickly,
80 ;; but that means k is large, which means the binomial expansion is
81 ;; long. We need to compromise. Let use choose k = 8. Then |r| <
82 ;; log(2)/16 = 0.0433. For this range, the Taylor series converges
83 ;; to 212 bits of accuracy with about 28 terms.
87 (declare (type %quad-double x
))
88 ;; Taylor series for exp(x)-1
89 ;; = x+x^2/2!+x^3/3!+x^4/4!+...
90 ;; = x*(1+x/2!+x^2/3!+x^3/4!+...)
94 (setf term
(div-qd-d (mul-qd term x
) (float (cl:+ k
2) 1d0
)))
95 (setf sum
(add-qd sum term
)))
98 (declare (type %quad-double x
))
100 ;; = x*(8 + 28*x + 56*x^2 + 70*x^3 + 56*x^4 + 28*x^5 + 8*x^6 + x^7)
101 ;; = x (x (x (x (x (x (x (x + 8) + 28) + 56) + 70) + 56) + 28) + 8)
124 (declare (type %quad-double x
))
125 ;; Write x = s*log(2) + r*k where s is an integer and |r*k|
126 ;; < log(2)/2, and k = 8.
127 (let* ((s (truncate (qd-0 (nint-qd (div-qd a
+qd-log2
+)))))
128 (r*k
(sub-qd x
(mul-qd-d +qd-log2
+ (float s
1d0
))))
129 (r (div-qd-d r
*k
8d0
)))
131 (multiple-value-bind (s r
)
133 (let* ((d (taylor r
))
135 (add-qd-d (scale-float-qd dr s
)
136 (cl:-
(scale-float 1d0 s
) 1))))))
138 (defun log-qd/newton
(a)
139 (declare (type %quad-double a
))
140 ;; The Taylor series for log converges rather slowly. Hence, this
141 ;; routine tries to determine the root of the function
145 ;; using Newton iteration. The iteration is
147 ;; x' = x - f(x) / f'(x)
148 ;; = x - (1 - a * exp(-x))
149 ;; = x + a * exp(-x) - 1
151 ;; Two iterations are needed.
152 (let ((x (make-qd-d (log (qd-0 a
)))))
154 (setf x
(sub-qd-d (add-qd x
(mul-qd a
(exp-qd (neg-qd x
))))
159 ;;(declaim (inline agm-qd))
162 (declare (type %quad-double x y
)
163 (optimize (speed 3)))
164 (let ((diff (qd-0 (abs-qd (sub-qd x y
)))))
165 (cond ((< diff
+qd-eps
+)
168 (let ((a-mean (div-qd-d (add-qd x y
) 2d0
))
169 (g-mean (sqrt-qd (mul-qd x y
))))
170 (agm-qd a-mean g-mean
))))))
174 (declare (type %quad-double x y
)
175 (optimize (speed 3) (space 0) (safety 0)))
176 (let ((diff (qd-0 (abs-qd (sub-qd x y
))))
179 (declare (double-float diff
))
180 (loop while
(> diff
+qd-eps
+)
182 (let ((a-mean (scale-float-qd (add-qd x y
) -
1))
183 (g-mean (sqrt-qd (mul-qd x y
))))
186 (setf diff
(qd-0 (abs-qd (sub-qd x y
))))))
189 (defun log-qd/agm
(x)
190 (declare (type %quad-double x
))
191 ;; log(x) ~ pi/2/agm(1,4/x)*(1+O(1/x^2))
193 ;; Need to make x >= 2^(d/2) to get d bits of precision. We use
195 ;; log(2^k*x) = k*log(2)+log(x)
197 ;; to compute log(x). log(2^k*x) is computed using AGM.
199 (multiple-value-bind (frac exp
)
200 (decode-float (qd-0 x
))
201 (declare (ignore frac
))
203 ;; Big enough to use AGM
206 (div-qd (make-qd-d 4d0
)
209 ;; log(x) = log(2^k*x) - k * log(2)
210 (let* ((k (cl:-
107 exp
))
211 (big-x (scale-float-qd x k
)))
212 ;; Compute k*log(2) using extra precision by writing
213 ;; log(2) = a + b, where a is the quad-double
214 ;; approximation and b the rest.
215 (sub-qd (log-qd/agm big-x
)
216 (add-qd (mul-qd-d +qd-log2
+ (float k
1d0
))
217 (mul-qd-d +qd-log2-extra
+ (float k
1d0
)))))))))
219 (defun log-qd/agm2
(x)
220 (declare (type %quad-double x
))
221 ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
225 ;; Need to make x >= 2^(d/36) to get d bits of precision. We use
227 ;; log(2^k*x) = k*log(2)+log(x)
229 ;; to compute log(x). log(2^k*x) is computed using AGM.
231 (multiple-value-bind (frac exp
)
232 (decode-float (qd-0 x
))
233 (declare (ignore frac
))
235 ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
236 (let* ((q (div-qd +qd-one
+
240 ;; theta2(q^4) = 2*q*(1+q^8+q^24)
241 ;; = 2*q*(1+q^8+(q^8)^3)
250 ;; theta3(q^4) = 1+2*(q^4+q^16)
251 ;; = 1+2*(q^4+(q^4)^4)
259 (agm-qd (sqr-qd theta2
)
262 ;; log(x) = log(2^k*x) - k * log(2)
263 (let* ((k (cl:-
7 exp
))
264 (big-x (scale-float-qd x k
)))
265 (sub-qd (log-qd/agm2 big-x
)
266 (add-qd (mul-qd-d +qd-log2
+ (float k
1d0
))
267 (mul-qd-d +qd-log2-extra
+ (float k
1d0
)))))))))
269 (defun log-qd/agm3
(x)
270 (declare (type %quad-double x
))
271 ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
275 ;; Need to make x >= 2^(d/36) to get d bits of precision. We use
277 ;; log(2^k*x) = k*log(2)+log(x)
279 ;; to compute log(x). log(2^k*x) is computed using AGM.
281 (multiple-value-bind (frac exp
)
282 (decode-float (qd-0 x
))
283 (declare (ignore frac
))
285 ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
286 (let* ((q (div-qd +qd-one
+
290 ;; theta2(q^4) = 2*q*(1+q^8+q^24)
291 ;; = 2*q*(1+q^8+(q^8)^3)
300 ;; theta3(q^4) = 1+2*(q^4+q^16)
301 ;; = 1+2*(q^4+(q^4)^4)
308 ;; Note that agm(theta2^2,theta3^2) = agm(2*theta2*theta3,theta2^2+theta3^2)/2
311 (agm-qd (scale-float-qd (mul-qd theta2 theta3
) 1)
312 (add-qd (sqr-qd theta2
)
316 ;; log(x) = log(2^k*x) - k * log(2)
317 (let* ((k (cl:-
7 exp
))
318 (big-x (scale-float-qd x k
)))
319 (sub-qd (log-qd/agm3 big-x
)
321 (mul-qd-d +qd-log2
+ (float k
1d0
))
322 (mul-qd-d +qd-log2-extra
+ (float k
1d0
)))))))))
326 (let* ((r (abs (complex x y
)))
329 (let ((z (atan (float y
1f0
) (float x
1f0
)))
332 (format t
"z = ~A~%" z
)
334 (format t
"xx > yy~%")
336 (let* ((sinz (sin z
))
338 (delta (cl:/ (cl:- yy sinz
)
340 (format t
"sz, dz = ~A ~A~%" sinz cosz
)
341 (format t
"delta = ~A~%" delta
)
342 (setf z
(cl:+ z delta
))
343 (format t
"z = ~A~%" z
))))
348 (format t
"sz, dz = ~A ~A~%" sinz cosz
)
350 (setf z
(cl:- z
(cl:/ (cl:- xx cosz
)
352 (format t
"z = ~A~%" z
)))))
358 (defvar *cordic-scale
*)
361 (defun setup-cordic ()
362 (let ((table (make-array 34))
363 (ttable (make-array 34)))
364 (setf (aref table
0) 1d0
)
365 (setf (aref table
1) 1d0
)
366 (setf (aref table
2) 1d0
)
367 (setf (aref ttable
0) (cl:/ pi
4))
368 (setf (aref ttable
1) (cl:/ pi
4))
369 (setf (aref ttable
2) (cl:/ pi
4))
370 (loop for k from
3 below
34 do
371 (setf (aref table k
) (cl:* 0.5d0
(aref table
(cl:1- k
))))
372 (setf (aref ttable k
) (atan (aref table k
))))
374 (setf *ttable
* ttable
)))
376 (defun setup-cordic ()
377 (let ((table (make-array 34))
378 (ttable (make-array 34)))
379 (setf (aref table
0) 4d0
)
380 (setf (aref table
1) 2d0
)
381 (setf (aref table
2) 1d0
)
382 (setf (aref ttable
0) (atan 4d0
))
383 (setf (aref ttable
1) (atan 2d0
))
384 (setf (aref ttable
2) (cl:/ pi
4))
385 (loop for k from
3 below
34 do
386 (setf (aref table k
) (cl:* 0.5d0
(aref table
(cl:1- k
))))
387 (setf (aref ttable k
) (atan (aref table k
))))
389 (setf *ttable
* ttable
)))
391 (defun setup-cordic ()
392 (let ((table (make-array 34))
393 (ttable (make-array 34))
395 (loop for k from
0 below
34 do
396 (setf (aref table k
) (scale-float 1d0
(cl:-
2 k
)))
397 (setf (aref ttable k
) (atan (aref table k
)))
398 (setf scale
(cl:* scale
(cos (aref ttable k
)))))
400 (setf *ttable
* ttable
)
401 (setf *cordic-scale
* scale
)))
404 (defun cordic-rot (x y
)
406 (dotimes (k (length *table
*))
408 (psetq x
(cl:+ x
(cl:* y
(aref *table
* k
)))
409 y
(cl:- y
(cl:* x
(aref *table
* k
))))
410 (incf z
(aref *ttable
* k
)))
412 (psetq x
(cl:- x
(cl:* y
(aref *table
* k
)))
413 y
(cl:+ y
(cl:* x
(aref *table
* k
))))
414 (decf z
(aref *ttable
* k
)))
418 (defun cordic-vec (z)
422 (dotimes (k 12 (length *table
*))
423 (setf scale
(cl:* scale
(cos (aref *ttable
* k
))))
425 (psetq x
(cl:+ x
(cl:* y
(aref *table
* k
)))
426 y
(cl:- y
(cl:* x
(aref *table
* k
))))
427 (incf z
(aref *ttable
* k
)))
429 (psetq x
(cl:- x
(cl:* y
(aref *table
* k
)))
430 y
(cl:+ y
(cl:* x
(aref *table
* k
))))
431 (decf z
(aref *ttable
* k
)))
433 (values x y z scale
)))
436 (multiple-value-bind (z dx dy
)
438 (let ((theta (cl:/ dy dx
)))
439 (format t
"theta = ~A~%" theta
)
440 (let ((corr (cl:+ theta
441 (cl:-
(cl:/ (expt theta
3)
445 (format t
"corr = ~A~%" corr
)
449 (multiple-value-bind (x y z
)
451 (setf x
(cl:* x
*cordic-scale
*))
452 (setf y
(cl:* y
*cordic-scale
*))
453 (format t
"x = ~A~%" x
)
454 (format t
"y = ~A~%" y
)
455 (format t
"z = ~A~%" z
)
456 ;; Need to finish of the rotation
459 (format t
"st, ct = ~A ~A~%" st ct
)
460 (psetq x
(cl:-
(cl:* x ct
) (cl:* y st
))
461 y
(cl:+ (cl:* y ct
) (cl:* x st
)))
462 (format t
"x = ~A~%" x
)
463 (format t
"y = ~A~%" y
)
468 (declare (type double-float r
))
469 (multiple-value-bind (x y z s
)
472 ;; Need to finish the rotation
475 (psetq x
(cl:-
(cl:* x ct
) (cl:* y st
))
476 y
(cl:+ (cl:* y ct
) (cl:* x st
)))
480 ;; This is the basic CORDIC rotation. Based on code from
481 ;; http://www.voidware.com/cordic.htm and
482 ;; http://www.dspcsp.com/progs/cordic.c.txt.
484 ;; The only difference between this version and the typical CORDIC
485 ;; implementation is that the first 3 rotations are all by pi/4. This
486 ;; makes sense. If the angle is greater than pi/4, the rotations will
487 ;; reduce it to at most pi/4. If the angle is less than pi/4, the 3
488 ;; rotations by pi/4 will cause us to end back at the same place.
489 ;; (Should we try to be smarter?)
490 (defun cordic-rot-qd (x y
)
491 (declare (type %quad-double y x
)
492 (optimize (speed 3)))
493 (let* ((zero +qd-zero
+)
495 (declare (type %quad-double zero z
))
496 (dotimes (k (length +atan-table
+))
499 (psetq x
(add-qd x
(mul-qd-d y
(aref +atan-power-table
+ k
)))
500 y
(sub-qd y
(mul-qd-d x
(aref +atan-power-table
+ k
))))
501 (setf z
(add-qd z
(aref +atan-table
+ k
))))
503 (psetq x
(sub-qd x
(mul-qd-d y
(aref +atan-power-table
+ k
)))
504 y
(add-qd y
(mul-qd-d x
(aref +atan-power-table
+ k
))))
505 (setf z
(sub-qd z
(aref +atan-table
+ k
))))))
508 (defun atan2-qd/cordic
(y x
)
509 (declare (type %quad-double y x
))
510 ;; Use the CORDIC rotation to get us to a small angle. Then use the
511 ;; Taylor series for atan to finish the computation.
512 (multiple-value-bind (z dx dy
)
514 ;; Use Taylor series to finish off the computation
515 (let* ((arg (div-qd dy dx
))
516 (sq (neg-qd (sqr-qd arg
)))
518 ;; atan(x) = x - x^3/3 + x^5/5 - ...
519 ;; = x*(1-x^2/3+x^4/5-x^6/7+...)
520 (do ((k 3d0
(cl:+ k
2d0
))
522 ((< (abs (qd-0 term
)) +qd-eps
+))
523 (setf sum
(add-qd sum
(div-qd-d term k
)))
524 (setf term
(mul-qd term sq
)))
525 (setf sum
(mul-qd arg sum
))
528 (defun atan-qd/cordic
(y)
529 (declare (type %quad-double y
))
530 (atan2-qd/cordic y
+qd-one
+))
532 (defun atan-qd/duplication
(y)
533 (declare (type %quad-double y
)
534 (optimize (speed 3) (space 0)))
535 (cond ((< (abs (qd-0 y
)) 1d-4
)
538 (sq (neg-qd (sqr-qd arg
)))
540 ;; atan(x) = x - x^3/3 + x^5/5 - ...
541 ;; = x*(1-x^2/3+x^4/5-x^6/7+...)
542 (do ((k 3d0
(cl:+ k
2d0
))
544 ((< (abs (qd-0 term
)) +qd-eps
+))
545 (setf sum
(add-qd sum
(div-qd-d term k
)))
546 (setf term
(mul-qd term sq
)))
549 ;; atan(x) = 2*atan(x/(1 + sqrt(1 + x^2)))
551 (add-qd-d (sqrt-qd (add-qd-d (sqr-qd y
) 1d0
))
553 (scale-float-qd (atan-qd/duplication x
) 1)))))
555 (defun cordic-vec-qd (z)
556 (declare (type %quad-double z
)
557 (optimize (speed 3)))
561 (declare (type %quad-double zero x y
))
562 (dotimes (k 30 (length +atan-table
+))
564 (inline mul-qd-d sub-qd add-qd
))
566 (psetq x
(sub-qd x
(mul-qd-d y
(aref +atan-power-table
+ k
)))
567 y
(add-qd y
(mul-qd-d x
(aref +atan-power-table
+ k
))))
568 (setf z
(sub-qd z
(aref +atan-table
+ k
))))
570 (psetq x
(add-qd x
(mul-qd-d y
(aref +atan-power-table
+ k
)))
571 y
(sub-qd y
(mul-qd-d x
(aref +atan-power-table
+ k
))))
572 (setf z
(add-qd z
(aref +atan-table
+ k
))))))
575 (defun tan-qd/cordic
(r)
576 (declare (type %quad-double r
))
577 (multiple-value-bind (z x y
)
579 ;; Need to finish the rotation
580 (multiple-value-bind (st ct
)
582 (psetq x
(sub-qd (mul-qd x ct
) (mul-qd y st
))
583 y
(add-qd (mul-qd y ct
) (mul-qd x st
)))
587 (defun sin-qd/cordic
(r)
588 (declare (type %quad-double r
))
589 (multiple-value-bind (z x y
)
593 (format t
"~&x = ~/qd::qd-format/~%" x
)
594 (format t
"~&y = ~/qd::qd-format/~%" y
)
595 (format t
"~&z = ~/qd::qd-format/~%" z
)
596 (format t
"~&s = ~/qd::qd-format/~%" s
))
597 ;; Need to finish the rotation
598 (multiple-value-bind (st ct
)
602 (format t
"~&st = ~/qd::qd-format/~%" st
)
603 (format t
"~&ct = ~/qd::qd-format/~%" ct
)
604 (format t
"~&y = ~/qd::qd-format/~%" (mul-qd +cordic-scale
+ y
)))
606 (psetq x
(sub-qd (mul-qd x ct
) (mul-qd y st
))
607 y
(add-qd (mul-qd y ct
) (mul-qd x st
)))
608 (mul-qd +cordic-scale
+ y
))))
611 ;; Some timing and consing tests.
613 ;; The tests are run using the following:
615 ;; Sparc: 1.5 GHz Ultrasparc IIIi
616 ;; Sparc2: 450 MHz Ultrasparc II
618 ;; x86: 866 MHz Pentium 3
619 ;; PPC(fma): 1.42 GHz with cmucl with fused-multiply-add double-double.
622 ;; (time-exp #c(2w0 0) 50000)
624 ;; Time Sparc PPC x86 PPC (fma) Sparc2
625 ;; exp-qd/reduce 2.06 3.18 10.46 2.76 6.12
626 ;; expm1-qd/series 8.81 12.24 18.87 3.26 29.0
627 ;; expm1-qd/dup 5.68 4.34 18.47 3.64 18.78
629 ;; Consing (MB) Sparc
630 ;; exp-qd/reduce 45 45 638 44.4 45
631 ;; expm1-qd/series 519 519 1201 14.8 519
632 ;; expm1-qd/dup 32 32 1224 32.0 32
634 ;; Speeds seem to vary quite a bit between architectures.
636 ;; Timing without inlining all the basic functions everywhere. (That
637 ;; is, :qd-inline is not a feature.)
639 ;; (time-exp #c(2w0 0) 50000)
641 ;; Time Sparc PPC x86 PPC (fma)
642 ;; exp-qd/reduce 5.83 0.67 10.67 0.98
643 ;; expm1-qd/series 10.65 1.45 21.06 1.35
644 ;; expm1-qd/dup 11.17 1.36 24.01 1.25
647 ;; exp-qd/reduce 638 93 638 93
648 ;; expm1-qd/series 1203 120 1201 120
649 ;; expm1-qd/dup 1224 122 1224 122
651 ;; So inlining speeds things up by a factor of about 3 for sparc,
652 ;; 1.5-4 for ppc. Strangely, x86 slows down on some but speeds up on
654 (defun time-exp (x n
)
655 (declare (type %quad-double x
)
658 (declare (type %quad-double y
))
660 (format t
"exp-qd/reduce~%")
663 (setf y
(exp-qd/reduce x
))))
665 (format t
"expm1-qd/series~%")
668 (setf y
(expm1-qd/series x
))))
670 (format t
"expm1-qd/duplication~%")
673 (setf y
(expm1-qd/duplication x
))))
677 ;; (time-log #c(3w0 0) 50000)
679 ;; Time (s) Sparc PPC x86 PPC (fma) Sparc2
680 ;; log-qd/newton 7.08 10.23 35.74 8.82 21.77
681 ;; log1p-qd/dup 5.87 8.41 27.32 6.65 20.73
682 ;; log-qd/agm 6.58 8.0 27.2 6.87 24.62
683 ;; log-qd/agm2 5.8 6.93 22.89 6.07 18.44
684 ;; log-qd/agm3 5.45 6.57 20.97 6.18 20.34
685 ;; log-qd/halley 4.96 6.8 25.11 7.01 16.13
687 ;; Consing (MB) Sparc PPC x86 PPC (fma)
688 ;; log-qd/newton 150 150 2194 148 150
689 ;; log1p-qd/dup 56 56 1564 56 56
690 ;; log-qd/agm 81 11 1434 81 81
691 ;; log-qd/agm2 87 35 1184 87 87
692 ;; log-qd/agm3 82 36 1091 81 82
693 ;; log-qd/halley 101 101 1568 100 101
695 ;; Based on these results, it's not really clear what is the fastest.
696 ;; But Halley's iteration is probably a good tradeoff for log.
698 ;; However, consider log(1+2^(-100)). Use log1p as a reference:
699 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649115711502410110281q-31
703 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649133878797727478488q-31
705 ;; 7.88860905221011805411728565282514580471135738786455290255431302193794546609432q-31
707 ;; 7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
709 ;; 7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
711 ;; 7.88860905221011805411728565282475078909313378023665801567590088088481830649120253326239452326q-31
713 ;; We can see that the AGM methods are grossly inaccurate, but log-qd
714 ;; and log-halley are quite good.
716 ;; Timing results without inlining everything:
718 ;; Time Sparc PPC x86 PPC (fma)
719 ;; log-qd/newton 21.37 0.87 41.49 0.62
720 ;; log1p-qd/dup 12.58 0.41 31.86 0.28
721 ;; log-qd/agm 7.17 0.23 34.86 0.16
722 ;; log-qd/agm2 6.35 0.22 27.53 0.15
723 ;; log-qd/agm3 7.49 0.17 24.92 0.14
724 ;; log-qd/halley 14.38 0.56 30.2 0.65
727 ;; Sparc PPC x86 PPC (fma)
728 ;; log-qd/newton 2194 60.7 2194 61
729 ;; log1p-qd/dup 1114 22.6 1564 23
730 ;; log-qd/agm 371 7.9 1434 7.9
731 ;; log-qd/agm2 371 7.8 1185 7.8
732 ;; log-qd/agm3 373 7.8 1091 7.8
733 ;; log-qd/halley 1554 42.3 1567 42.3
735 (defun time-log (x n
)
736 (declare (type %quad-double x
)
739 (declare (type %quad-double y
))
741 (format t
"log-qd/newton~%")
744 (setf y
(log-qd/newton x
))))
746 (format t
"log1p-qd/duplication~%")
749 (setf y
(log1p-qd/duplication x
))))
751 (format t
"log-qd/agm~%")
754 (setf y
(log-qd/agm x
))))
757 (format t
"log-qd/agm2~%")
760 (setf y
(log-qd/agm2 x
))))
762 (format t
"log-qd/agm3~%")
765 (setf y
(log-qd/agm3 x
))))
767 (format t
"log-qd/halley~%")
770 (setf y
(log-qd/halley x
))))
774 ;; (time-atan2 #c(10w0 0) 10000)
777 ;; PPC Sparc x86 PPC (fma) Sparc2
778 ;; atan2-qd/newton 2.91 1.91 8.06 2.16 7.55
779 ;; atan2-qd/cordic 1.22 0.89 6.68 1.43 2.47
780 ;; atan-qd/duplication 2.51 2.14 5.63 1.76 5.94
783 ;; atan2-qd/newton 44.4 44.4 481 44.4 44.4
784 ;; atan2-qd/cordic 1.6 1.6 482 1.6 1.6
785 ;; atan-qd/duplication 17.2 6.0 281 6.0 6.0
787 ;; Don't know why x86 is 10 times slower than sparc/ppc for
788 ;; atan2-qd/newton. Consing is much more too. Not enough registers?
790 ;; atan2-qd/cordic is by far the fastest on all archs.
792 ;; Timing results without inlining everything:
794 ;; PPC Sparc x86 PPC (fma)
795 ;; atan2-qd/newton 6.56 4.48 9.75 6.15
796 ;; atan2-qd/cordic 6.02 4.24 7.06 5.01
797 ;; atan-qd/duplication 3.28 1.94 5.72 2.46
800 ;; atan2-qd/newton 443 441 482 443
801 ;; atan2-qd/cordic 482 482 482 482
802 ;; atan-qd/duplication 87 81 281 87
805 (defun time-atan2 (x n
)
806 (declare (type %quad-double x
)
811 (format t
"atan2-qd/newton~%")
814 (setf y
(atan2-qd/newton x one
))))
816 (format t
"atan2-qd/cordic~%")
819 (setf y
(atan2-qd/cordic x one
))))
821 (format t
"atan-qd/duplication~%")
824 (setf y
(atan-qd/duplication x
))))
827 ;; (time-tan #c(10w0 0) 10000)
830 ;; PPC Sparc x86 PPC (fma) Sparc2
831 ;; tan-qd/cordic 2.12 1.51 8.26 1.77 4.61
832 ;; tan-qd/sincos 0.68 0.57 2.39 0.54 2.56
835 ;; tan-qd/cordic 23.0 23.0 473 23.0 23.0
836 ;; tan-qd/sincos 14.8 14.8 147 14.8 14.8
838 ;; Don't know why x86 is so much slower for tan-qd/cordic.
840 ;; Without inlining everything
841 ;; PPC Sparc x86 PPC (fma)
842 ;; tan-qd/cordic 7.72 4.56 17.08 5.96
843 ;; tan-qd/sincos 2.32 1.4 4.91 1.87
846 ;; tan-qd/cordic 463 463 472 463
847 ;; tan-qd/sincos 137 136 146 137
849 (defun time-tan (x n
)
850 (declare (type %quad-double x
)
854 (format t
"tan-qd/cordic~%")
857 (setf y
(tan-qd/cordic x
))))
859 (format t
"tan-qd/sincos~%")
862 (setf y
(tan-qd/sincos x
))))))