src/assembly/x86-64/arith.lisp

   1 ;;;; simple cases for generic arithmetic
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!VM")
  13
  14 (defun !both-fixnum-p (temp x y)
  15   (inst mov (reg-in-size temp :dword)
  16         (reg-in-size x :dword))
  17   (inst or (reg-in-size temp :dword)
  18         (reg-in-size y :dword))
  19   (inst test (reg-in-size temp :byte)
  20         fixnum-tag-mask))
  21
  22 (defun !some-fixnum-p (temp x y)
  23   (inst mov (reg-in-size temp :dword)
  24         (reg-in-size x :dword))
  25   (inst and (reg-in-size temp :dword)
  26         (reg-in-size y :dword))
  27   (inst test (reg-in-size temp :byte)
  28         fixnum-tag-mask))
  29
  30 \f
  31 ;;;; addition, subtraction, and multiplication
  32
  33 (macrolet ((define-generic-arith-routine ((fun cost) &body body)
  34              `(define-assembly-routine (,(symbolicate "GENERIC-" fun)
  35                                         (:cost ,cost)
  36                                         (:return-style :full-call)
  37                                         (:translate ,fun)
  38                                         (:policy :safe)
  39                                         (:save-p t))
  40                 ((:arg x (descriptor-reg any-reg) rdx-offset)
  41                  (:arg y (descriptor-reg any-reg) rdi-offset)
  42
  43                  (:res res (descriptor-reg any-reg) rdx-offset)
  44
  45                  (:temp rax unsigned-reg rax-offset)
  46                  (:temp rcx unsigned-reg rcx-offset))
  47                 (!both-fixnum-p rax x y)
  48                 (inst jmp :nz DO-STATIC-FUN)    ; no - do generic
  49
  50                 ,@body
  51                 (inst clc) ; single-value return
  52                 (inst ret)
  53
  54                 DO-STATIC-FUN
  55                 ;; Same as: (inst enter (* n-word-bytes 1))
  56                 (inst push rbp-tn)
  57                 (inst mov rbp-tn rsp-tn)
  58                 (inst sub rsp-tn (* n-word-bytes 1))
  59                 (inst push (make-ea :qword :base rbp-tn
  60                             :disp (frame-byte-offset return-pc-save-offset)))
  61                 (inst mov rcx (fixnumize 2)) ; arg count
  62                 (inst jmp
  63                       (make-ea :qword
  64                                :disp (+ nil-value
  65                                         (static-fun-offset
  66                                          ',(symbolicate "TWO-ARG-" fun))))))))
  67
  68   #.`
  69   (define-generic-arith-routine (+ 10)
  70     (move res x)
  71     (inst add res y)
  72     (inst jmp :no OKAY)
  73     ;; Unbox the overflowed result, recovering the correct sign from
  74     ;; the carry flag, then re-box as a bignum.
  75     (inst rcr res 1)
  76     ,@(when (> n-fixnum-tag-bits 1)   ; don't shift by 0
  77             '((inst sar res (1- n-fixnum-tag-bits))))
  78
  79     (move rcx res)
  80
  81     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
  82       (storew rcx res bignum-digits-offset other-pointer-lowtag))
  83
  84     OKAY)
  85
  86   #.`
  87   (define-generic-arith-routine (- 10)
  88     (move res x)
  89     (inst sub res y)
  90     (inst jmp :no OKAY)
  91     ;; Unbox the overflowed result, recovering the correct sign from
  92     ;; the carry flag, then re-box as a bignum.
  93     (inst cmc)                        ; carry has correct sign now
  94     (inst rcr res 1)
  95     ,@(when (> n-fixnum-tag-bits 1)   ; don't shift by 0
  96             '((inst sar res (1- n-fixnum-tag-bits))))
  97
  98     (move rcx res)
  99
 100     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 101       (storew rcx res bignum-digits-offset other-pointer-lowtag))
 102     OKAY)
 103
 104   (define-generic-arith-routine (* 30)
 105     (move rax x)                     ; must use eax for 64-bit result
 106     (inst sar rax n-fixnum-tag-bits) ; remove *8 fixnum bias
 107     (inst imul y)                    ; result in edx:eax
 108     (inst jmp :no OKAY)              ; still fixnum
 109
 110     (inst shrd rax x n-fixnum-tag-bits) ; high bits from edx
 111     (inst sar x n-fixnum-tag-bits)      ; now shift edx too
 112
 113     (move rcx x)                   ; save high bits from cqo
 114     (inst cqo)                     ; edx:eax <- sign-extend of eax
 115     (inst cmp x rcx)
 116     (inst jmp :e SINGLE-WORD-BIGNUM)
 117
 118     (with-fixed-allocation (res bignum-widetag (+ bignum-digits-offset 2))
 119       (storew rax res bignum-digits-offset other-pointer-lowtag)
 120       (storew rcx res (1+ bignum-digits-offset) other-pointer-lowtag))
 121     (inst jmp DONE)
 122
 123     SINGLE-WORD-BIGNUM
 124
 125     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 126       (storew rax res bignum-digits-offset other-pointer-lowtag))
 127     (inst jmp DONE)
 128
 129     OKAY
 130     (move res rax)
 131     DONE))
 132 \f
 133 ;;;; negation
 134
 135 (define-assembly-routine (generic-negate
 136                           (:cost 10)
 137                           (:return-style :full-call)
 138                           (:policy :safe)
 139                           (:translate %negate)
 140                           (:save-p t))
 141                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 142                           (:res res (descriptor-reg any-reg) rdx-offset)
 143
 144                           (:temp rax unsigned-reg rax-offset)
 145                           (:temp rcx unsigned-reg rcx-offset))
 146   (inst test (reg-in-size x :byte) fixnum-tag-mask)
 147   (inst jmp :z FIXNUM)
 148
 149   (inst push rbp-tn)
 150   (inst mov rbp-tn rsp-tn)
 151   (inst sub rsp-tn n-word-bytes)
 152   (inst push (make-ea :qword :base rbp-tn
 153                       :disp (frame-byte-offset return-pc-save-offset)))
 154   (inst mov rcx (fixnumize 1))    ; arg count
 155   (inst jmp (make-ea :qword
 156                      :disp (+ nil-value (static-fun-offset '%negate))))
 157
 158   FIXNUM
 159   (move res x)
 160   (inst neg res)                        ; (- most-negative-fixnum) is BIGNUM
 161   (inst jmp :no OKAY)
 162   (inst shr res n-fixnum-tag-bits)      ; sign bit is data - remove type bits
 163   (move rcx res)
 164
 165   (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 166     (storew rcx res bignum-digits-offset other-pointer-lowtag))
 167
 168   OKAY)
 169 \f
 170 ;;;; comparison
 171
 172 (macrolet ((define-cond-assem-rtn (name translate static-fn test)
 173                (declare (ignorable translate static-fn))
 174              #+sb-assembling
 175              `(define-assembly-routine (,name
 176                                         (:return-style :none))
 177                   ((:arg x (descriptor-reg any-reg) rdx-offset)
 178                    (:arg y (descriptor-reg any-reg) rdi-offset)
 179
 180                    (:temp rcx unsigned-reg rcx-offset))
 181
 182                 (!both-fixnum-p rcx x y)
 183                 (inst jmp :nz DO-STATIC-FUN)
 184
 185                 (inst cmp x y)
 186                 (inst ret)
 187
 188                 DO-STATIC-FUN
 189                 (inst push rbp-tn)
 190                 (inst mov rbp-tn rsp-tn)
 191                 (inst sub rsp-tn (* n-word-bytes 3))
 192                 (inst mov (make-ea :qword :base rsp-tn
 193                                    :disp (frame-byte-offset
 194                                           (+ sp->fp-offset
 195                                              -3
 196                                              ocfp-save-offset)))
 197                       rbp-tn)
 198                 (inst lea rbp-tn (make-ea :qword :base rsp-tn
 199                                           :disp (frame-byte-offset
 200                                           (+ sp->fp-offset
 201                                              -3
 202                                              ocfp-save-offset))))
 203                 (inst mov rcx (fixnumize 2))
 204                 (inst call (make-ea :qword
 205                                     :disp (+ nil-value
 206                                              (static-fun-offset ',static-fn))))
 207                 ;; HACK: We depend on NIL having the lowest address of all
 208                 ;; static symbols (including T)
 209                 ,@(ecase test
 210                     (:l `((inst mov y (1+ nil-value))
 211                           (inst cmp y x)))
 212                     (:g `((inst cmp x (1+ nil-value)))))
 213                 (inst pop rbp-tn)
 214                 (inst ret))
 215              #-sb-assembling
 216              `(define-vop (,name)
 217                 (:translate ,translate)
 218                 (:policy :safe)
 219                 (:save-p t)
 220                 (:args (x :scs (descriptor-reg any-reg) :target rdx)
 221                        (y :scs (descriptor-reg any-reg) :target rdi))
 222
 223                 (:temporary (:sc unsigned-reg :offset rdx-offset
 224                                  :from (:argument 0))
 225                             rdx)
 226                 (:temporary (:sc unsigned-reg :offset rdi-offset
 227                                  :from (:argument 1))
 228                             rdi)
 229
 230                 (:temporary (:sc unsigned-reg :offset rcx-offset
 231                                  :from :eval)
 232                             rcx)
 233                 (:conditional ,test)
 234                 (:generator 10
 235                    (move rdx x)
 236                    (move rdi y)
 237                    (inst mov rcx (make-fixup ',name :assembly-routine))
 238                    (inst call rcx)))))
 239
 240   (define-cond-assem-rtn generic-< < two-arg-< :l)
 241   (define-cond-assem-rtn generic-> > two-arg-> :g))
 242
 243 #+sb-assembling
 244 (define-assembly-routine (generic-eql
 245                           (:return-style :none))
 246                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 247                           (:arg y (descriptor-reg any-reg) rdi-offset)
 248
 249                           (:temp rcx unsigned-reg rcx-offset))
 250
 251   (!some-fixnum-p rcx x y)
 252   (inst jmp :nz DO-STATIC-FUN)
 253
 254   ;; At least one fixnum
 255   (inst cmp x y)
 256   (inst ret)
 257
 258   DO-STATIC-FUN
 259   (inst push rbp-tn)
 260   (inst mov rbp-tn rsp-tn)
 261   (inst sub rsp-tn (* n-word-bytes 3))
 262   (inst mov (make-ea :qword :base rsp-tn
 263                      :disp (frame-byte-offset
 264                             (+ sp->fp-offset
 265                                -3
 266                                ocfp-save-offset)))
 267         rbp-tn)
 268   (inst lea rbp-tn (make-ea :qword :base rsp-tn
 269                             :disp (frame-byte-offset
 270                                    (+ sp->fp-offset
 271                                       -3
 272                                       ocfp-save-offset))))
 273   (inst mov rcx (fixnumize 2))
 274   (inst call (make-ea :qword
 275                       :disp (+ nil-value (static-fun-offset 'eql))))
 276   (inst cmp x (+ nil-value (static-symbol-offset t)))
 277   (inst pop rbp-tn)
 278   (inst ret))
 279
 280 #-sb-assembling
 281 (define-vop (generic-eql)
 282   (:translate eql)
 283   (:policy :safe)
 284   (:save-p t)
 285   (:args (x :scs (descriptor-reg any-reg) :target rdx)
 286          (y :scs (descriptor-reg any-reg) :target rdi))
 287
 288   (:temporary (:sc unsigned-reg :offset rdx-offset
 289                :from (:argument 0))
 290               rdx)
 291   (:temporary (:sc unsigned-reg :offset rdi-offset
 292                :from (:argument 1))
 293               rdi)
 294
 295   (:temporary (:sc unsigned-reg :offset rcx-offset
 296                :from :eval)
 297               rcx)
 298   (:conditional :e)
 299   (:generator 10
 300     (move rdx x)
 301     (move rdi y)
 302     (inst mov rcx (make-fixup 'generic-eql :assembly-routine))
 303     (inst call rcx)))
 304
 305 #+sb-assembling
 306 (define-assembly-routine (generic-=
 307                           (:return-style :none))
 308                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 309                           (:arg y (descriptor-reg any-reg) rdi-offset)
 310
 311                           (:temp rcx unsigned-reg rcx-offset))
 312   (!both-fixnum-p rcx x y)
 313   (inst jmp :nz DO-STATIC-FUN)
 314
 315   ;; Both fixnums
 316   (inst cmp x y)
 317   (inst ret)
 318
 319   DO-STATIC-FUN
 320   (inst push rbp-tn)
 321   (inst mov rbp-tn rsp-tn)
 322   (inst sub rsp-tn (* n-word-bytes 3))
 323   (inst mov (make-ea :qword :base rsp-tn
 324                      :disp (frame-byte-offset
 325                             (+ sp->fp-offset
 326                                -3
 327                                ocfp-save-offset)))
 328         rbp-tn)
 329   (inst lea rbp-tn (make-ea :qword :base rsp-tn
 330                             :disp (frame-byte-offset
 331                                    (+ sp->fp-offset
 332                                       -3
 333                                       ocfp-save-offset))))
 334
 335   (inst mov rcx (fixnumize 2))
 336   (inst call (make-ea :qword
 337                       :disp (+ nil-value (static-fun-offset 'two-arg-=))))
 338   (inst cmp x (+ nil-value (static-symbol-offset t)))
 339   (inst pop rbp-tn)
 340   (inst ret))
 341
 342 #-sb-assembling
 343 (define-vop (generic-=)
 344   (:translate =)
 345   (:policy :safe)
 346   (:save-p t)
 347   (:args (x :scs (descriptor-reg any-reg) :target rdx)
 348          (y :scs (descriptor-reg any-reg) :target rdi))
 349
 350   (:temporary (:sc unsigned-reg :offset rdx-offset
 351                :from (:argument 0))
 352               rdx)
 353   (:temporary (:sc unsigned-reg :offset rdi-offset
 354                :from (:argument 1))
 355               rdi)
 356
 357   (:temporary (:sc unsigned-reg :offset rcx-offset
 358                :from :eval)
 359               rcx)
 360   (:conditional :e)
 361   (:generator 10
 362     (move rdx x)
 363     (move rdi y)
 364     (inst mov rcx (make-fixup 'generic-= :assembly-routine))
 365     (inst call rcx)))
 366
 367 #+sb-assembling
 368 (define-assembly-routine (logcount)
 369                          ((:arg arg (descriptor-reg any-reg) rdx-offset)
 370                           (:temp mask unsigned-reg rcx-offset)
 371                           (:temp temp unsigned-reg rax-offset))
 372   (inst push temp) ; save RAX
 373   (let ((result arg))
 374     ;; See the comments below for how the algorithm works. The tricks
 375     ;; used can be found for example in AMD's software optimization
 376     ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
 377     ;; function "pop1", for 32-bit words. The extension to 64 bits is
 378     ;; straightforward.
 379     ;; Calculate 2-bit sums. Note that the value of a two-digit binary
 380     ;; number is the sum of the right digit and twice the left digit.
 381     ;; Thus we can calculate the sum of the two digits by shifting the
 382     ;; left digit to the right position and doing a two-bit subtraction.
 383     ;; This subtraction will never create a borrow and thus can be made
 384     ;; on all 32 2-digit numbers at once.
 385     (move temp arg)
 386     (inst shr result 1)
 387     (inst mov mask #x5555555555555555)
 388     (inst and result mask)
 389     (inst sub temp result)
 390     ;; Calculate 4-bit sums by straightforward shift, mask and add.
 391     ;; Note that we shift the source operand of the MOV and not its
 392     ;; destination so that the SHR and the MOV can execute in the same
 393     ;; clock cycle.
 394     (inst mov result temp)
 395     (inst shr temp 2)
 396     (inst mov mask #x3333333333333333)
 397     (inst and result mask)
 398     (inst and temp mask)
 399     (inst add result temp)
 400     ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
 401     ;; into 4 bits, we can apply the mask after the addition, saving one
 402     ;; instruction.
 403     (inst mov temp result)
 404     (inst shr result 4)
 405     (inst add result temp)
 406     (inst mov mask #x0f0f0f0f0f0f0f0f)
 407     (inst and result mask)
 408     ;; Add all 8 bytes at once by multiplying with #256r11111111.
 409     ;; We need to calculate only the lower 8 bytes of the product.
 410     ;; Of these the most significant byte contains the final result.
 411     ;; Note that there can be no overflow from one byte to the next
 412     ;; as the sum is at most 64 which needs only 7 bits.
 413     (inst mov mask #x0101010101010101)
 414     (inst imul result mask)
 415     (inst shr result 56))
 416   (inst pop temp)) ; restore RAX
 417
 418 ;; To perform logcount on small integers, we test whether to use the
 419 ;; builtin opcode, or an assembly routine. I benchmarked this against
 420 ;; an approach that always used the assembly routine via "call [addr]"
 421 ;; where the contents of the address reflected one implementation
 422 ;; or the other, chosen at startup - and this is faster.
 423 #-sb-assembling
 424 (macrolet
 425     ((def-it (name cost arg-sc arg-type)
 426       `(define-vop (,name)
 427          (:translate logcount)
 428          (:note ,(format nil "inline ~a logcount" arg-type))
 429          (:policy :fast-safe)
 430          (:args (arg :scs (,arg-sc)))
 431          (:arg-types ,arg-type)
 432          (:results (result :scs (unsigned-reg)))
 433          (:result-types positive-fixnum)
 434          ;; input/output of assembly routine
 435          (:temporary (:sc unsigned-reg :offset rdx-offset
 436                           :from (:argument 0) :to (:result 0)) rdx)
 437          ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
 438          ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
 439          ;; be sure to update the subroutine to push and pop RCX.
 440          (:temporary (:sc unsigned-reg :offset rcx-offset) rcx)
 441          (:generator ,cost
 442            ;; FIXME: As I've got no way to set the cpuid feature bits,
 443            ;; don't try testing here whether to use popcnt, until it has been
 444            ;; verified that the __cpuid() intrinsic works for win32.
 445            #!-win32
 446            (progn
 447              ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
 448              (inst test
 449                    (make-ea :byte :disp (make-fixup "cpuid_fn1_ecx" :foreign 2))
 450                    #x80)
 451              (inst jmp :z slow)
 452              ;; Intel's implementation of POPCNT on some models treats it as
 453              ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
 454              ;; it falsely appears to need data from the destination register.
 455              ;; The workaround is to clear the destination.
 456              ;; See http://stackoverflow.com/questions/25078285
 457              (unless (location= result arg)
 458                ;; We only break the spurious dep. chain if result isn't the same
 459                ;; register as arg. (If they're location=, don't trash the arg!)
 460                (inst xor result result))
 461              (inst popcnt result arg)
 462              (inst jmp done))
 463          slow
 464            (move rdx arg)
 465            (inst mov rcx (make-fixup 'logcount :assembly-routine))
 466            (inst call rcx)
 467            (move result rdx)
 468          done))))
 469   (def-it unsigned-byte-64-count 14 unsigned-reg unsigned-num)
 470   (def-it positive-fixnum-count 13 any-reg positive-fixnum))