external/oct/qd-extra.lisp

   1 ;;;; -*- Mode: lisp -*-
   2 ;;;;
   3 ;;;; Copyright (c) 2007 Raymond Toy
   4 ;;;;
   5 ;;;; Permission is hereby granted, free of charge, to any person
   6 ;;;; obtaining a copy of this software and associated documentation
   7 ;;;; files (the "Software"), to deal in the Software without
   8 ;;;; restriction, including without limitation the rights to use,
   9 ;;;; copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 ;;;; copies of the Software, and to permit persons to whom the
  11 ;;;; Software is furnished to do so, subject to the following
  12 ;;;; conditions:
  13 ;;;;
  14 ;;;; The above copyright notice and this permission notice shall be
  15 ;;;; included in all copies or substantial portions of the Software.
  16 ;;;;
  17 ;;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18 ;;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  19 ;;;; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  20 ;;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  21 ;;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  22 ;;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  23 ;;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  24 ;;;; OTHER DEALINGS IN THE SOFTWARE.
  25
  26 ;;; This file contains various possible implementations of some of the
  27 ;;; core routines.  These were experiments on faster and/or more
  28 ;;; accurate implementations.  The routines inf qd-fun.lisp are the
  29 ;;; default, but you can select a different implementation from here
  30 ;;; if you want.
  31 ;;;
  32 ;;; The end of the file also includes some tests of the different
  33 ;;; implementations for speed.
  34
  35 (in-package #:qdi)
  36
  37 ;; This works but seems rather slow, so we don't even compile it.
  38 #+(or)
  39 (defun exp-qd/newton (a)
  40   (declare (type %quad-double a))
  41   ;; Newton iteration
  42   ;;
  43   ;; f(x) = log(x) - a
  44   ;;
  45   ;; x' = x - (log(x) - a)/(1/x)
  46   ;;    = x - x*(log(x) - a)
  47   ;;    = x*(1 + a - log(x))
  48   (let ((a1 (add-qd-d a 1d0))
  49         (x (make-qd-d (exp (qd-0 a)))))
  50     (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
  51     (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
  52     (setf x (mul-qd x (sub-qd a1 (log-qd/agm x))))
  53     x))
  54
  55 (defun expm1-qd/series (a)
  56   (declare (type %quad-double a))
  57   ;; Compute exp(x) - 1.
  58   ;;
  59   ;; D(x) = exp(x) - 1
  60   ;;
  61   ;; First, write x = s*log(2) + r*k where s is an integer and |r*k| <
  62   ;; log(2)/2.
  63   ;;
  64   ;; Then D(x) = D(s*log(2)+r*k) = 2^s*exp(r*k) - 1
  65   ;;           = 2^s*(exp(r*k)-1) - 1 + 2^s
  66   ;;           = 2^s*D(r*k)+2^s-1
  67   ;; But
  68   ;; exp(r*k) = exp(r)^k
  69   ;;          = (D(r) + 1)^k
  70   ;;
  71   ;; So
  72   ;; D(r*k) = (D(r) + 1)^k - 1
  73   ;;
  74   ;; For small r, D(r) can be computed using the Taylor series around
  75   ;; zero.  To compute D(r*k) = (D(r) + 1)^k - 1, we use the binomial
  76   ;; theorem to expand out the power and to exactly cancel out the -1
  77   ;; term, which is the source of inaccuracy.
  78   ;;
  79   ;; We want to have small r so the Taylor series converges quickly,
  80   ;; but that means k is large, which means the binomial expansion is
  81   ;; long.  We need to compromise.  Let use choose k = 8.  Then |r| <
  82   ;; log(2)/16 = 0.0433.  For this range, the Taylor series converges
  83   ;; to 212 bits of accuracy with about 28 terms.
  84   ;;
  85   ;;
  86   (flet ((taylor (x)
  87            (declare (type %quad-double x))
  88            ;; Taylor series for exp(x)-1
  89            ;; = x+x^2/2!+x^3/3!+x^4/4!+...
  90            ;; = x*(1+x/2!+x^2/3!+x^3/4!+...)
  91            (let ((sum +qd-one+)
  92                  (term +qd-one+))
  93              (dotimes (k 28)
  94                (setf term (div-qd-d (mul-qd term x) (float (cl:+ k 2) 1d0)))
  95                (setf sum (add-qd sum term)))
  96              (mul-qd x sum)))
  97          (binom (x)
  98            (declare (type %quad-double x))
  99            ;; (1+x)^8-1
 100            ;; = x*(8 + 28*x + 56*x^2 + 70*x^3 + 56*x^4 + 28*x^5 + 8*x^6 + x^7)
 101            ;; = x (x (x (x (x (x (x (x + 8) + 28) + 56) + 70) + 56) + 28) + 8)
 102            (mul-qd
 103             x
 104             (add-qd-d
 105              (mul-qd x
 106                      (add-qd-d
 107                       (mul-qd x
 108                               (add-qd-d
 109                                (mul-qd x
 110                                        (add-qd-d
 111                                         (mul-qd x
 112                                                 (add-qd-d
 113                                                  (mul-qd x
 114                                                          (add-qd-d
 115                                                           (mul-qd x
 116                                                                   (add-qd-d x 8d0))
 117                                                           28d0))
 118                                                  56d0))
 119                                         70d0))
 120                                56d0))
 121                       28d0))
 122              8d0)))
 123          (arg-reduce (x)
 124            (declare (type %quad-double x))
 125            ;; Write x = s*log(2) + r*k where s is an integer and |r*k|
 126            ;; < log(2)/2, and k = 8.
 127            (let* ((s (truncate (qd-0 (nint-qd (div-qd a +qd-log2+)))))
 128                   (r*k (sub-qd x (mul-qd-d +qd-log2+ (float s 1d0))))
 129                   (r (div-qd-d r*k 8d0)))
 130              (values s r))))
 131     (multiple-value-bind (s r)
 132         (arg-reduce a)
 133       (let* ((d (taylor r))
 134              (dr (binom d)))
 135         (add-qd-d (scale-float-qd dr s)
 136                   (cl:- (scale-float 1d0 s) 1))))))
 137
 138 (defun log-qd/newton (a)
 139   (declare (type %quad-double a))
 140   ;; The Taylor series for log converges rather slowly.  Hence, this
 141   ;; routine tries to determine the root of the function
 142   ;;
 143   ;; f(x) = exp(x) - a
 144   ;;
 145   ;; using Newton iteration.  The iteration is
 146   ;;
 147   ;; x' = x - f(x) / f'(x)
 148   ;;    = x - (1 - a * exp(-x))
 149   ;;    = x + a * exp(-x) - 1
 150   ;;
 151   ;; Two iterations are needed.
 152   (let ((x (make-qd-d (log (qd-0 a)))))
 153     (dotimes (k 3)
 154       (setf x (sub-qd-d (add-qd x (mul-qd a (exp-qd (neg-qd x))))
 155                         1d0)))
 156     x))
 157
 158
 159 ;;(declaim (inline agm-qd))
 160
 161 (defun agm-qd (x y)
 162   (declare (type %quad-double x y)
 163            (optimize (speed 3)))
 164   (let ((diff (qd-0 (abs-qd (sub-qd x y)))))
 165     (cond ((< diff +qd-eps+)
 166            x)
 167           (t
 168            (let ((a-mean (div-qd-d (add-qd x y) 2d0))
 169                  (g-mean (sqrt-qd (mul-qd x y))))
 170              (agm-qd a-mean g-mean))))))
 171
 172 #+(or)
 173 (defun agm-qd (x y)
 174   (declare (type %quad-double x y)
 175            (optimize (speed 3) (space 0) (safety 0)))
 176   (let ((diff (qd-0 (abs-qd (sub-qd x y))))
 177         (x x)
 178         (y y))
 179     (declare (double-float diff))
 180     (loop while (> diff +qd-eps+)
 181       do
 182       (let ((a-mean (scale-float-qd (add-qd x y) -1))
 183             (g-mean (sqrt-qd (mul-qd x y))))
 184         (setf x a-mean)
 185         (setf y g-mean)
 186         (setf diff (qd-0 (abs-qd (sub-qd x y))))))
 187     x))
 188
 189 (defun log-qd/agm (x)
 190   (declare (type %quad-double x))
 191   ;; log(x) ~ pi/2/agm(1,4/x)*(1+O(1/x^2))
 192   ;;
 193   ;; Need to make x >= 2^(d/2) to get d bits of precision.  We use
 194   ;;
 195   ;; log(2^k*x) = k*log(2)+log(x)
 196   ;;
 197   ;; to compute log(x).  log(2^k*x) is computed using AGM.
 198   ;;
 199   (multiple-value-bind (frac exp)
 200       (decode-float (qd-0 x))
 201     (declare (ignore frac))
 202     (cond ((>= exp 106)
 203            ;; Big enough to use AGM
 204            (div-qd +qd-pi/2+
 205                    (agm-qd +qd-one+
 206                            (div-qd (make-qd-d 4d0)
 207                                    x))))
 208           (t
 209            ;; log(x) = log(2^k*x) - k * log(2)
 210            (let* ((k (cl:- 107 exp))
 211                   (big-x (scale-float-qd x k)))
 212              ;; Compute k*log(2) using extra precision by writing
 213              ;; log(2) = a + b, where a is the quad-double
 214              ;; approximation and b the rest.
 215              (sub-qd (log-qd/agm big-x)
 216                      (add-qd (mul-qd-d +qd-log2+ (float k 1d0))
 217                              (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
 218
 219 (defun log-qd/agm2 (x)
 220   (declare (type %quad-double x))
 221   ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
 222   ;;
 223   ;; where q = 1/x
 224   ;;
 225   ;; Need to make x >= 2^(d/36) to get d bits of precision.  We use
 226   ;;
 227   ;; log(2^k*x) = k*log(2)+log(x)
 228   ;;
 229   ;; to compute log(x).  log(2^k*x) is computed using AGM.
 230   ;;
 231   (multiple-value-bind (frac exp)
 232       (decode-float (qd-0 x))
 233     (declare (ignore frac))
 234     (cond ((>= exp 7)
 235            ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
 236            (let* ((q (div-qd +qd-one+
 237                              x))
 238                   (q^4 (npow q 4))
 239                   (q^8 (sqr-qd q^4))
 240                   ;; theta2(q^4) = 2*q*(1+q^8+q^24)
 241                   ;;             = 2*q*(1+q^8+(q^8)^3)
 242                   (theta2 (mul-qd-d
 243                            (mul-qd
 244                             q
 245                             (add-qd-d
 246                              (add-qd q^8
 247                                      (npow q^8 3))
 248                              1d0))
 249                            2d0))
 250                   ;; theta3(q^4) = 1+2*(q^4+q^16)
 251                   ;;             = 1+2*(q^4+(q^4)^4)
 252                   (theta3 (add-qd-d
 253                            (mul-qd-d
 254                             (add-qd q^4
 255                                     (npow q^4 4))
 256                             2d0)
 257                            1d0)))
 258              (div-qd +qd-pi/4+
 259                      (agm-qd (sqr-qd theta2)
 260                              (sqr-qd theta3)))))
 261           (t
 262            ;; log(x) = log(2^k*x) - k * log(2)
 263            (let* ((k (cl:- 7 exp))
 264                   (big-x (scale-float-qd x k)))
 265              (sub-qd (log-qd/agm2 big-x)
 266                      (add-qd (mul-qd-d +qd-log2+ (float k 1d0))
 267                              (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
 268
 269 (defun log-qd/agm3 (x)
 270   (declare (type %quad-double x))
 271   ;; log(x) ~ pi/4/agm(theta2(q^4)^2,theta3(q^4)^2)
 272   ;;
 273   ;; where q = 1/x
 274   ;;
 275   ;; Need to make x >= 2^(d/36) to get d bits of precision.  We use
 276   ;;
 277   ;; log(2^k*x) = k*log(2)+log(x)
 278   ;;
 279   ;; to compute log(x).  log(2^k*x) is computed using AGM.
 280   ;;
 281   (multiple-value-bind (frac exp)
 282       (decode-float (qd-0 x))
 283     (declare (ignore frac))
 284     (cond ((>= exp 7)
 285            ;; Big enough to use AGM (because d = 212 so x >= 2^5.8888)
 286            (let* ((q (div-qd +qd-one+
 287                              x))
 288                   (q^4 (npow q 4))
 289                   (q^8 (sqr-qd q^4))
 290                   ;; theta2(q^4) = 2*q*(1+q^8+q^24)
 291                   ;;             = 2*q*(1+q^8+(q^8)^3)
 292                   (theta2 (mul-qd-d
 293                            (mul-qd
 294                             q
 295                             (add-qd-d
 296                              (add-qd q^8
 297                                      (npow q^8 3))
 298                              1d0))
 299                            2d0))
 300                   ;; theta3(q^4) = 1+2*(q^4+q^16)
 301                   ;;             = 1+2*(q^4+(q^4)^4)
 302                   (theta3 (add-qd-d
 303                            (mul-qd-d
 304                             (add-qd q^4
 305                                     (npow q^4 4))
 306                             2d0)
 307                            1d0)))
 308              ;; Note that agm(theta2^2,theta3^2) = agm(2*theta2*theta3,theta2^2+theta3^2)/2
 309              (div-qd +qd-pi/4+
 310                      (scale-float-qd
 311                       (agm-qd (scale-float-qd (mul-qd theta2 theta3) 1)
 312                               (add-qd (sqr-qd theta2)
 313                                       (sqr-qd theta3)))
 314                       -1))))
 315           (t
 316            ;; log(x) = log(2^k*x) - k * log(2)
 317            (let* ((k (cl:- 7 exp))
 318                   (big-x (scale-float-qd x k)))
 319              (sub-qd (log-qd/agm3 big-x)
 320                      (add-qd
 321                       (mul-qd-d +qd-log2+ (float k 1d0))
 322                       (mul-qd-d +qd-log2-extra+ (float k 1d0)))))))))
 323
 324 #+(or)
 325 (defun atan-d (y x)
 326   (let* ((r (abs (complex x y)))
 327          (xx (cl:/ x r))
 328          (yy (cl:/ y r)))
 329     (let ((z (atan (float y 1f0) (float x 1f0)))
 330           (sinz 0d0)
 331           (cosz 0d0))
 332       (format t "z = ~A~%" z)
 333       (cond ((> xx yy)
 334              (format t "xx > yy~%")
 335              (dotimes (k 5)
 336                (let* ((sinz (sin z))
 337                       (cosz (cos z))
 338                       (delta (cl:/ (cl:- yy sinz)
 339                                    cosz)))
 340                  (format t "sz, dz = ~A ~A~%" sinz cosz)
 341                  (format t "delta  = ~A~%" delta)
 342                  (setf z (cl:+ z delta))
 343                  (format t "z = ~A~%" z))))
 344             (t
 345              (dotimes (k 20)
 346                (let ((sinz (sin z))
 347                      (cosz (cos z)))
 348                  (format t "sz, dz = ~A ~A~%" sinz cosz)
 349
 350                  (setf z (cl:- z (cl:/ (cl:- xx cosz)
 351                                        sinz)))
 352                  (format t "z = ~A~%" z)))))
 353       z)))
 354
 355 #||
 356 (defvar *table*)
 357 (defvar *ttable*)
 358 (defvar *cordic-scale*)
 359
 360 #+nil
 361 (defun setup-cordic ()
 362   (let ((table (make-array 34))
 363         (ttable (make-array 34)))
 364     (setf (aref table 0) 1d0)
 365     (setf (aref table 1) 1d0)
 366     (setf (aref table 2) 1d0)
 367     (setf (aref ttable 0) (cl:/ pi 4))
 368     (setf (aref ttable 1) (cl:/ pi 4))
 369     (setf (aref ttable 2) (cl:/ pi 4))
 370     (loop for k from 3 below 34 do
 371          (setf (aref table k) (cl:* 0.5d0 (aref table (cl:1- k))))
 372          (setf (aref ttable k) (atan (aref table k))))
 373     (setf *table* table)
 374     (setf *ttable* ttable)))
 375
 376 (defun setup-cordic ()
 377   (let ((table (make-array 34))
 378         (ttable (make-array 34)))
 379     (setf (aref table 0) 4d0)
 380     (setf (aref table 1) 2d0)
 381     (setf (aref table 2) 1d0)
 382     (setf (aref ttable 0) (atan 4d0))
 383     (setf (aref ttable 1) (atan 2d0))
 384     (setf (aref ttable 2) (cl:/ pi 4))
 385     (loop for k from 3 below 34 do
 386          (setf (aref table k) (cl:* 0.5d0 (aref table (cl:1- k))))
 387          (setf (aref ttable k) (atan (aref table k))))
 388     (setf *table* table)
 389     (setf *ttable* ttable)))
 390
 391 (defun setup-cordic ()
 392   (let ((table (make-array 34))
 393         (ttable (make-array 34))
 394         (scale 1d0))
 395     (loop for k from 0 below 34 do
 396          (setf (aref table k) (scale-float 1d0 (cl:- 2 k)))
 397          (setf (aref ttable k) (atan (aref table k)))
 398          (setf scale (cl:* scale (cos (aref ttable k)))))
 399     (setf *table* table)
 400     (setf *ttable* ttable)
 401     (setf *cordic-scale* scale)))
 402
 403
 404 (defun cordic-rot (x y)
 405   (let ((z 0))
 406     (dotimes (k (length *table*))
 407       (cond ((plusp y)
 408              (psetq x (cl:+ x (cl:* y (aref *table* k)))
 409                     y (cl:- y (cl:* x (aref *table* k))))
 410              (incf z (aref *ttable* k)))
 411             (t
 412              (psetq x (cl:- x (cl:* y (aref *table* k)))
 413                     y (cl:+ y (cl:* x (aref *table* k))))
 414              (decf z (aref *ttable* k)))
 415             ))
 416     (values z x y)))
 417
 418 (defun cordic-vec (z)
 419   (let ((x 1d0)
 420         (y 0d0)
 421         (scale 1d0))
 422     (dotimes (k 12 (length *table*))
 423       (setf scale (cl:* scale (cos (aref *ttable* k))))
 424       (cond ((minusp z)
 425              (psetq x (cl:+ x (cl:* y (aref *table* k)))
 426                     y (cl:- y (cl:* x (aref *table* k))))
 427              (incf z (aref *ttable* k)))
 428             (t
 429              (psetq x (cl:- x (cl:* y (aref *table* k)))
 430                     y (cl:+ y (cl:* x (aref *table* k))))
 431              (decf z (aref *ttable* k)))
 432             ))
 433     (values x y z scale)))
 434
 435 (defun atan2-d (y x)
 436   (multiple-value-bind (z dx dy)
 437       (cordic-rot x y)
 438     (let ((theta (cl:/ dy dx)))
 439       (format t "theta = ~A~%" theta)
 440       (let ((corr (cl:+ theta
 441                      (cl:- (cl:/ (expt theta 3)
 442                            3))
 443                      (cl:/ (expt theta 5)
 444                         5))))
 445         (format t "corr = ~A~%" corr)
 446         (cl:+ z corr)))))
 447
 448 (defun tan-d (r)
 449   (multiple-value-bind (x y z)
 450       (cordic-vec r)
 451     (setf x (cl:* x *cordic-scale*))
 452     (setf y (cl:* y *cordic-scale*))
 453     (format t "x = ~A~%" x)
 454     (format t "y = ~A~%" y)
 455     (format t "z = ~A~%" z)
 456     ;; Need to finish of the rotation
 457     (let ((st (sin z))
 458           (ct (cos z)))
 459       (format t "st, ct = ~A ~A~%" st ct)
 460       (psetq x (cl:- (cl:* x ct) (cl:* y st))
 461              y (cl:+ (cl:* y ct) (cl:* x st)))
 462       (format t "x = ~A~%" x)
 463       (format t "y = ~A~%" y)
 464       (cl:/ y x)
 465       )))
 466
 467 (defun sin-d (r)
 468   (declare (type double-float r))
 469   (multiple-value-bind (x y z s)
 470       (cordic-vec r)
 471
 472     ;; Need to finish the rotation
 473     (let ((st (sin z))
 474           (ct (cos z)))
 475       (psetq x (cl:- (cl:* x ct) (cl:* y st))
 476              y (cl:+ (cl:* y ct) (cl:* x st)))
 477       (cl:* s y))))
 478 ||#
 479
 480 ;; This is the basic CORDIC rotation.  Based on code from
 481 ;; http://www.voidware.com/cordic.htm and
 482 ;; http://www.dspcsp.com/progs/cordic.c.txt.
 483 ;;
 484 ;; The only difference between this version and the typical CORDIC
 485 ;; implementation is that the first 3 rotations are all by pi/4.  This
 486 ;; makes sense.  If the angle is greater than pi/4, the rotations will
 487 ;; reduce it to at most pi/4.  If the angle is less than pi/4, the 3
 488 ;; rotations by pi/4 will cause us to end back at the same place.
 489 ;; (Should we try to be smarter?)
 490 (defun cordic-rot-qd (x y)
 491   (declare (type %quad-double y x)
 492            (optimize (speed 3)))
 493   (let* ((zero +qd-zero+)
 494          (z zero))
 495     (declare (type %quad-double zero z))
 496     (dotimes (k (length +atan-table+))
 497       (declare (fixnum k))
 498       (cond ((qd-> y zero)
 499              (psetq x (add-qd x (mul-qd-d y (aref +atan-power-table+ k)))
 500                     y (sub-qd y (mul-qd-d x (aref +atan-power-table+ k))))
 501              (setf z (add-qd z (aref +atan-table+ k))))
 502             (t
 503              (psetq x (sub-qd x (mul-qd-d y (aref +atan-power-table+ k)))
 504                     y (add-qd y (mul-qd-d x (aref +atan-power-table+ k))))
 505              (setf z (sub-qd z (aref +atan-table+ k))))))
 506     (values z x y)))
 507
 508 (defun atan2-qd/cordic (y x)
 509   (declare (type %quad-double y x))
 510   ;; Use the CORDIC rotation to get us to a small angle.  Then use the
 511   ;; Taylor series for atan to finish the computation.
 512   (multiple-value-bind (z dx dy)
 513       (cordic-rot-qd x y)
 514     ;; Use Taylor series to finish off the computation
 515     (let* ((arg (div-qd dy dx))
 516            (sq (neg-qd (sqr-qd arg)))
 517            (sum +qd-one+))
 518       ;; atan(x) = x - x^3/3 + x^5/5 - ...
 519       ;;         = x*(1-x^2/3+x^4/5-x^6/7+...)
 520       (do ((k 3d0 (cl:+ k 2d0))
 521            (term sq))
 522           ((< (abs (qd-0 term)) +qd-eps+))
 523         (setf sum (add-qd sum (div-qd-d term k)))
 524         (setf term (mul-qd term sq)))
 525       (setf sum (mul-qd arg sum))
 526       (add-qd z sum))))
 527
 528 (defun atan-qd/cordic (y)
 529   (declare (type %quad-double y))
 530   (atan2-qd/cordic y +qd-one+))
 531
 532 (defun atan-qd/duplication (y)
 533   (declare (type %quad-double y)
 534            (optimize (speed 3) (space 0)))
 535   (cond ((< (abs (qd-0 y)) 1d-4)
 536          ;; Series
 537          (let* ((arg y)
 538                 (sq (neg-qd (sqr-qd arg)))
 539                 (sum +qd-one+))
 540            ;; atan(x) = x - x^3/3 + x^5/5 - ...
 541            ;;         = x*(1-x^2/3+x^4/5-x^6/7+...)
 542            (do ((k 3d0 (cl:+ k 2d0))
 543                 (term sq))
 544                ((< (abs (qd-0 term)) +qd-eps+))
 545              (setf sum (add-qd sum (div-qd-d term k)))
 546              (setf term (mul-qd term sq)))
 547            (mul-qd arg sum)))
 548         (t
 549          ;; atan(x) = 2*atan(x/(1 + sqrt(1 + x^2)))
 550          (let ((x (div-qd y
 551                           (add-qd-d (sqrt-qd (add-qd-d (sqr-qd y) 1d0))
 552                                     1d0))))
 553            (scale-float-qd (atan-qd/duplication x) 1)))))
 554
 555 (defun cordic-vec-qd (z)
 556   (declare (type %quad-double z)
 557            (optimize (speed 3)))
 558   (let* ((x +qd-one+)
 559          (y +qd-zero+)
 560          (zero +qd-zero+))
 561     (declare (type %quad-double zero x y))
 562     (dotimes (k 30 (length +atan-table+))
 563       (declare (fixnum k)
 564                (inline mul-qd-d sub-qd add-qd))
 565       (cond ((qd-> z zero)
 566              (psetq x (sub-qd x (mul-qd-d y (aref +atan-power-table+ k)))
 567                     y (add-qd y (mul-qd-d x (aref +atan-power-table+ k))))
 568              (setf z (sub-qd z (aref +atan-table+ k))))
 569             (t
 570              (psetq x (add-qd x (mul-qd-d y (aref +atan-power-table+ k)))
 571                     y (sub-qd y (mul-qd-d x (aref +atan-power-table+ k))))
 572              (setf z (add-qd z (aref +atan-table+ k))))))
 573     (values z x y)))
 574
 575 (defun tan-qd/cordic (r)
 576   (declare (type %quad-double r))
 577   (multiple-value-bind (z x y)
 578       (cordic-vec-qd r)
 579     ;; Need to finish the rotation
 580     (multiple-value-bind (st ct)
 581         (sincos-taylor z)
 582       (psetq x (sub-qd (mul-qd x ct) (mul-qd y st))
 583              y (add-qd (mul-qd y ct) (mul-qd x st)))
 584       (div-qd y x))))
 585
 586
 587 (defun sin-qd/cordic (r)
 588   (declare (type %quad-double r))
 589   (multiple-value-bind (z x y)
 590       (cordic-vec-qd r)
 591     #+nil
 592     (progn
 593       (format t "~&x = ~/qd::qd-format/~%" x)
 594       (format t "~&y = ~/qd::qd-format/~%" y)
 595       (format t "~&z = ~/qd::qd-format/~%" z)
 596       (format t "~&s = ~/qd::qd-format/~%" s))
 597     ;; Need to finish the rotation
 598     (multiple-value-bind (st ct)
 599         (sincos-taylor z)
 600       #+nil
 601       (progn
 602         (format t "~&st = ~/qd::qd-format/~%" st)
 603         (format t "~&ct = ~/qd::qd-format/~%" ct)
 604         (format t "~&y  = ~/qd::qd-format/~%" (mul-qd +cordic-scale+ y)))
 605
 606       (psetq x (sub-qd (mul-qd x ct) (mul-qd y st))
 607              y (add-qd (mul-qd y ct) (mul-qd x st)))
 608       (mul-qd +cordic-scale+ y))))
 609
 610 \f
 611 ;; Some timing and consing tests.
 612 ;;
 613 ;; The tests are run using the following:
 614 ;;
 615 ;; Sparc:       1.5 GHz Ultrasparc IIIi
 616 ;; Sparc2:      450 MHz Ultrasparc II
 617 ;; PPC:         1.42 GHz
 618 ;; x86:         866 MHz Pentium 3
 619 ;; PPC(fma):    1.42 GHz with cmucl with fused-multiply-add double-double.
 620 ;;
 621
 622 ;; (time-exp #c(2w0 0) 50000)
 623 ;;
 624 ;; Time                 Sparc   PPC     x86     PPC (fma)       Sparc2
 625 ;; exp-qd/reduce        2.06     3.18   10.46   2.76             6.12
 626 ;; expm1-qd/series      8.81    12.24   18.87   3.26            29.0
 627 ;; expm1-qd/dup         5.68     4.34   18.47   3.64            18.78
 628 ;;
 629 ;; Consing (MB)         Sparc
 630 ;; exp-qd/reduce         45      45      638    44.4             45
 631 ;; expm1-qd/series      519     519     1201    14.8            519
 632 ;; expm1-qd/dup          32      32     1224    32.0             32
 633 ;;
 634 ;; Speeds seem to vary quite a bit between architectures.
 635 ;;
 636 ;; Timing without inlining all the basic functions everywhere.  (That
 637 ;; is, :qd-inline is not a feature.)
 638 ;;
 639 ;; (time-exp #c(2w0 0) 50000)
 640 ;;
 641 ;; Time                 Sparc   PPC     x86     PPC (fma)
 642 ;; exp-qd/reduce         5.83   0.67    10.67   0.98
 643 ;; expm1-qd/series      10.65   1.45    21.06   1.35
 644 ;; expm1-qd/dup         11.17   1.36    24.01   1.25
 645 ;;
 646 ;; Consing              Sparc
 647 ;; exp-qd/reduce         638     93      638     93
 648 ;; expm1-qd/series      1203    120     1201    120
 649 ;; expm1-qd/dup         1224    122     1224    122
 650 ;;
 651 ;; So inlining speeds things up by a factor of about 3 for sparc,
 652 ;; 1.5-4 for ppc.  Strangely, x86 slows down on some but speeds up on
 653 ;; others.
 654 (defun time-exp (x n)
 655   (declare (type %quad-double x)
 656            (fixnum n))
 657   (let ((y +qd-zero+))
 658     (declare (type %quad-double y))
 659     #+cmu (gc :full t)
 660     (format t "exp-qd/reduce~%")
 661     (time (dotimes (k n)
 662             (declare (fixnum k))
 663             (setf y (exp-qd/reduce x))))
 664     #+cmu (gc :full t)
 665     (format t "expm1-qd/series~%")
 666     (time (dotimes (k n)
 667             (declare (fixnum k))
 668             (setf y (expm1-qd/series x))))
 669     #+cmu (gc :full t)
 670     (format t "expm1-qd/duplication~%")
 671     (time (dotimes (k n)
 672             (declare (fixnum k))
 673             (setf y (expm1-qd/duplication x))))
 674
 675     ))
 676
 677 ;; (time-log #c(3w0 0) 50000)
 678 ;;
 679 ;; Time (s)             Sparc   PPC     x86     PPC (fma)       Sparc2
 680 ;; log-qd/newton        7.08    10.23   35.74   8.82            21.77
 681 ;; log1p-qd/dup         5.87     8.41   27.32   6.65            20.73
 682 ;; log-qd/agm           6.58     8.0    27.2    6.87            24.62
 683 ;; log-qd/agm2          5.8      6.93   22.89   6.07            18.44
 684 ;; log-qd/agm3          5.45     6.57   20.97   6.18            20.34
 685 ;; log-qd/halley        4.96     6.8    25.11   7.01            16.13
 686 ;;
 687 ;; Consing (MB)         Sparc   PPC     x86     PPC (fma)
 688 ;; log-qd/newton        150     150     2194    148             150
 689 ;; log1p-qd/dup          56      56     1564     56              56
 690 ;; log-qd/agm            81      11     1434     81              81
 691 ;; log-qd/agm2           87      35     1184     87              87
 692 ;; log-qd/agm3           82      36     1091     81              82
 693 ;; log-qd/halley        101     101     1568    100             101
 694 ;;
 695 ;; Based on these results, it's not really clear what is the fastest.
 696 ;; But Halley's iteration is probably a good tradeoff for log.
 697 ;;
 698 ;; However, consider log(1+2^(-100)).  Use log1p as a reference:
 699 ;;  7.88860905221011805411728565282475078909313378023665801567590088088481830649115711502410110281q-31
 700 ;;
 701 ;; We have
 702 ;; log-qd
 703 ;;  7.88860905221011805411728565282475078909313378023665801567590088088481830649133878797727478488q-31
 704 ;; log-agm
 705 ;;  7.88860905221011805411728565282514580471135738786455290255431302193794546609432q-31
 706 ;; log-agm2
 707 ;;  7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
 708 ;; log-agm3
 709 ;;  7.88860905221011805411728565282474926980229445866885841995713611460718519856111q-31
 710 ;; log-halley
 711 ;;  7.88860905221011805411728565282475078909313378023665801567590088088481830649120253326239452326q-31
 712 ;;
 713 ;; We can see that the AGM methods are grossly inaccurate, but log-qd
 714 ;; and log-halley are quite good.
 715 ;;
 716 ;; Timing results without inlining everything:
 717 ;;
 718 ;; Time                 Sparc   PPC     x86     PPC (fma)
 719 ;; log-qd/newton        21.37   0.87    41.49   0.62
 720 ;; log1p-qd/dup         12.58   0.41    31.86   0.28
 721 ;; log-qd/agm            7.17   0.23    34.86   0.16
 722 ;; log-qd/agm2           6.35   0.22    27.53   0.15
 723 ;; log-qd/agm3           7.49   0.17    24.92   0.14
 724 ;; log-qd/halley        14.38   0.56    30.2    0.65
 725 ;;
 726 ;; Consing
 727 ;;                      Sparc   PPC     x86     PPC (fma)
 728 ;; log-qd/newton        2194    60.7    2194    61
 729 ;; log1p-qd/dup         1114    22.6    1564    23
 730 ;; log-qd/agm            371     7.9    1434     7.9
 731 ;; log-qd/agm2           371     7.8    1185     7.8
 732 ;; log-qd/agm3           373     7.8    1091     7.8
 733 ;; log-qd/halley        1554    42.3    1567    42.3
 734
 735 (defun time-log (x n)
 736   (declare (type %quad-double x)
 737            (fixnum n))
 738   (let ((y +qd-zero+))
 739     (declare (type %quad-double y))
 740     #+cmu (gc :full t)
 741     (format t "log-qd/newton~%")
 742     (time (dotimes (k n)
 743             (declare (fixnum k))
 744             (setf y (log-qd/newton x))))
 745     #+cmu (gc :full t)
 746     (format t "log1p-qd/duplication~%")
 747     (time (dotimes (k n)
 748             (declare (fixnum k))
 749             (setf y (log1p-qd/duplication x))))
 750     #+cmu (gc :full t)
 751     (format t "log-qd/agm~%")
 752     (time (dotimes (k n)
 753             (declare (fixnum k))
 754             (setf y (log-qd/agm x))))
 755
 756     #+cmu (gc :full t)
 757     (format t "log-qd/agm2~%")
 758     (time (dotimes (k n)
 759             (declare (fixnum k))
 760             (setf y (log-qd/agm2 x))))
 761     #+cmu (gc :full t)
 762     (format t "log-qd/agm3~%")
 763     (time (dotimes (k n)
 764             (declare (fixnum k))
 765             (setf y (log-qd/agm3 x))))
 766     #+cmu (gc :full t)
 767     (format t "log-qd/halley~%")
 768     (time (dotimes (k n)
 769             (declare (fixnum k))
 770             (setf y (log-qd/halley x))))
 771     ))
 772
 773
 774 ;; (time-atan2 #c(10w0 0) 10000)
 775 ;;
 776 ;; Time
 777 ;;                      PPC     Sparc   x86     PPC (fma)       Sparc2
 778 ;; atan2-qd/newton      2.91     1.91    8.06   2.16            7.55
 779 ;; atan2-qd/cordic      1.22     0.89    6.68   1.43            2.47
 780 ;; atan-qd/duplication  2.51     2.14    5.63   1.76            5.94
 781 ;;
 782 ;; Consing
 783 ;; atan2-qd/newton      44.4    44.4    481     44.4            44.4
 784 ;; atan2-qd/cordic       1.6     1.6    482      1.6             1.6
 785 ;; atan-qd/duplication  17.2     6.0    281      6.0             6.0
 786 ;;
 787 ;; Don't know why x86 is 10 times slower than sparc/ppc for
 788 ;; atan2-qd/newton.  Consing is much more too.  Not enough registers?
 789 ;;
 790 ;; atan2-qd/cordic is by far the fastest on all archs.
 791 ;;
 792 ;; Timing results without inlining everything:
 793 ;; Time
 794 ;;                      PPC     Sparc   x86     PPC (fma)
 795 ;; atan2-qd/newton      6.56     4.48   9.75    6.15
 796 ;; atan2-qd/cordic      6.02     4.24   7.06    5.01
 797 ;; atan-qd/duplication  3.28     1.94   5.72    2.46
 798 ;;
 799 ;; Consing
 800 ;; atan2-qd/newton      443     441     482     443
 801 ;; atan2-qd/cordic      482     482     482     482
 802 ;; atan-qd/duplication   87      81     281     87
 803 ;;
 804
 805 (defun time-atan2 (x n)
 806   (declare (type %quad-double x)
 807            (fixnum n))
 808   (let ((y +qd-zero+)
 809         (one +qd-one+))
 810     #+cmu (gc :full t)
 811     (format t "atan2-qd/newton~%")
 812     (time (dotimes (k n)
 813             (declare (fixnum k))
 814             (setf y (atan2-qd/newton x one))))
 815     #+cmu (gc :full t)
 816     (format t "atan2-qd/cordic~%")
 817     (time (dotimes (k n)
 818             (declare (fixnum k))
 819             (setf y (atan2-qd/cordic x one))))
 820     #+cmu (gc :full t)
 821     (format t "atan-qd/duplication~%")
 822     (time (dotimes (k n)
 823             (declare (fixnum k))
 824             (setf y (atan-qd/duplication x))))
 825     ))
 826
 827 ;; (time-tan #c(10w0 0) 10000)
 828 ;;
 829 ;; Time
 830 ;;                      PPC     Sparc   x86     PPC (fma)       Sparc2
 831 ;; tan-qd/cordic        2.12     1.51    8.26   1.77            4.61
 832 ;; tan-qd/sincos        0.68     0.57    2.39   0.54            2.56
 833 ;;
 834 ;; Consing
 835 ;; tan-qd/cordic        23.0    23.0    473     23.0            23.0
 836 ;; tan-qd/sincos        14.8    14.8    147     14.8            14.8
 837 ;;
 838 ;; Don't know why x86 is so much slower for tan-qd/cordic.
 839 ;;
 840 ;; Without inlining everything
 841 ;;                      PPC     Sparc   x86     PPC (fma)
 842 ;; tan-qd/cordic        7.72    4.56    17.08   5.96
 843 ;; tan-qd/sincos        2.32    1.4      4.91   1.87
 844 ;;
 845 ;; Consing
 846 ;; tan-qd/cordic        463     463     472     463
 847 ;; tan-qd/sincos        137     136     146     137
 848
 849 (defun time-tan (x n)
 850   (declare (type %quad-double x)
 851            (fixnum n))
 852   (let ((y +qd-zero+))
 853     #+cmu (gc :full t)
 854     (format t "tan-qd/cordic~%")
 855     (time (dotimes (k n)
 856             (declare (fixnum k))
 857             (setf y (tan-qd/cordic x))))
 858     #+cmu (gc :full t)
 859     (format t "tan-qd/sincos~%")
 860     (time (dotimes (k n)
 861             (declare (fixnum k))
 862             (setf y (tan-qd/sincos x))))))
 863