src/assembly/x86-64/arith.lisp

   1 ;;;; simple cases for generic arithmetic
   2
   3 ;;;; This software is part of the SBCL system. See the README file for
   4 ;;;; more information.
   5 ;;;;
   6 ;;;; This software is derived from the CMU CL system, which was
   7 ;;;; written at Carnegie Mellon University and released into the
   8 ;;;; public domain. The software is in the public domain and is
   9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
  10 ;;;; files for more information.
  11
  12 (in-package "SB!VM")
  13
  14 #-sb-assembling ; avoid redefinition warning
  15 (progn
  16 (defun !both-fixnum-p (temp x y)
  17   (inst mov (reg-in-size temp :dword)
  18         (reg-in-size x :dword))
  19   (inst or (reg-in-size temp :dword)
  20         (reg-in-size y :dword))
  21   (inst test (reg-in-size temp :byte)
  22         fixnum-tag-mask))
  23
  24 (defun !some-fixnum-p (temp x y)
  25   (inst mov (reg-in-size temp :dword)
  26         (reg-in-size x :dword))
  27   (inst and (reg-in-size temp :dword)
  28         (reg-in-size y :dword))
  29   (inst test (reg-in-size temp :byte)
  30         fixnum-tag-mask))
  31 )
  32
  33 #+sb-xc-host
  34 (defmacro static-fun-addr (name)
  35   `(make-ea :qword :disp (+ nil-value (static-fun-offset ,name))))
  36 \f
  37 ;;;; addition, subtraction, and multiplication
  38
  39 (macrolet ((define-generic-arith-routine ((fun cost) &body body)
  40              `(define-assembly-routine (,(symbolicate "GENERIC-" fun)
  41                                         (:cost ,cost)
  42                                         (:return-style :full-call)
  43                                         (:translate ,fun)
  44                                         (:policy :safe)
  45                                         (:save-p t))
  46                 ((:arg x (descriptor-reg any-reg) rdx-offset)
  47                  (:arg y (descriptor-reg any-reg) rdi-offset)
  48
  49                  (:res res (descriptor-reg any-reg) rdx-offset)
  50
  51                  (:temp rax unsigned-reg rax-offset)
  52                  (:temp rcx unsigned-reg rcx-offset))
  53                 (!both-fixnum-p rax x y)
  54                 (inst jmp :nz DO-STATIC-FUN)    ; no - do generic
  55
  56                 ,@body
  57                 (inst clc) ; single-value return
  58                 (inst ret)
  59
  60                 DO-STATIC-FUN
  61                 ;; Same as: (inst enter (* n-word-bytes 1))
  62                 (inst push rbp-tn)
  63                 (inst mov rbp-tn rsp-tn)
  64                 (inst sub rsp-tn (* n-word-bytes 1))
  65                 (inst push (make-ea :qword :base rbp-tn
  66                             :disp (frame-byte-offset return-pc-save-offset)))
  67                 (inst mov rcx (fixnumize 2)) ; arg count
  68                 (inst jmp (static-fun-addr ',(symbolicate "TWO-ARG-" fun))))))
  69
  70   #.`
  71   (define-generic-arith-routine (+ 10)
  72     (move res x)
  73     (inst add res y)
  74     (inst jmp :no OKAY)
  75     ;; Unbox the overflowed result, recovering the correct sign from
  76     ;; the carry flag, then re-box as a bignum.
  77     (inst rcr res 1)
  78     ,@(when (> n-fixnum-tag-bits 1)   ; don't shift by 0
  79             '((inst sar res (1- n-fixnum-tag-bits))))
  80
  81     (move rcx res)
  82
  83     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
  84       (storew rcx res bignum-digits-offset other-pointer-lowtag))
  85
  86     OKAY)
  87
  88   #.`
  89   (define-generic-arith-routine (- 10)
  90     (move res x)
  91     (inst sub res y)
  92     (inst jmp :no OKAY)
  93     ;; Unbox the overflowed result, recovering the correct sign from
  94     ;; the carry flag, then re-box as a bignum.
  95     (inst cmc)                        ; carry has correct sign now
  96     (inst rcr res 1)
  97     ,@(when (> n-fixnum-tag-bits 1)   ; don't shift by 0
  98             '((inst sar res (1- n-fixnum-tag-bits))))
  99
 100     (move rcx res)
 101
 102     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 103       (storew rcx res bignum-digits-offset other-pointer-lowtag))
 104     OKAY)
 105
 106   (define-generic-arith-routine (* 30)
 107     (move rax x)                     ; must use eax for 64-bit result
 108     (inst sar rax n-fixnum-tag-bits) ; remove *8 fixnum bias
 109     (inst imul y)                    ; result in edx:eax
 110     (inst jmp :no OKAY)              ; still fixnum
 111
 112     (inst shrd rax x n-fixnum-tag-bits) ; high bits from edx
 113     (inst sar x n-fixnum-tag-bits)      ; now shift edx too
 114
 115     (move rcx x)                   ; save high bits from cqo
 116     (inst cqo)                     ; edx:eax <- sign-extend of eax
 117     (inst cmp x rcx)
 118     (inst jmp :e SINGLE-WORD-BIGNUM)
 119
 120     (with-fixed-allocation (res bignum-widetag (+ bignum-digits-offset 2))
 121       (storew rax res bignum-digits-offset other-pointer-lowtag)
 122       (storew rcx res (1+ bignum-digits-offset) other-pointer-lowtag))
 123     (inst jmp DONE)
 124
 125     SINGLE-WORD-BIGNUM
 126
 127     (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 128       (storew rax res bignum-digits-offset other-pointer-lowtag))
 129     (inst jmp DONE)
 130
 131     OKAY
 132     (move res rax)
 133     DONE))
 134 \f
 135 ;;;; negation
 136
 137 (define-assembly-routine (generic-negate
 138                           (:cost 10)
 139                           (:return-style :full-call)
 140                           (:policy :safe)
 141                           (:translate %negate)
 142                           (:save-p t))
 143                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 144                           (:res res (descriptor-reg any-reg) rdx-offset)
 145                           (:temp rcx unsigned-reg rcx-offset))
 146   (inst test (reg-in-size x :byte) fixnum-tag-mask)
 147   (inst jmp :z FIXNUM)
 148
 149   (inst push rbp-tn)
 150   (inst mov rbp-tn rsp-tn)
 151   (inst sub rsp-tn n-word-bytes)
 152   (inst push (make-ea :qword :base rbp-tn
 153                       :disp (frame-byte-offset return-pc-save-offset)))
 154   (inst mov rcx (fixnumize 1))    ; arg count
 155   (inst jmp (static-fun-addr '%negate))
 156
 157   FIXNUM
 158   (move res x)
 159   (inst neg res)                        ; (- most-negative-fixnum) is BIGNUM
 160   (inst jmp :no OKAY)
 161   (inst shr res n-fixnum-tag-bits)      ; sign bit is data - remove type bits
 162   (move rcx res)
 163
 164   (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
 165     (storew rcx res bignum-digits-offset other-pointer-lowtag))
 166
 167   OKAY)
 168 \f
 169 ;;;; comparison
 170
 171 (macrolet ((define-cond-assem-rtn (name translate static-fn test)
 172              `(define-assembly-routine (,name
 173                                         (:translate ,translate)
 174                                         (:policy :safe)
 175                                         (:save-p t)
 176                                         (:conditional ,test)
 177                                         (:cost 10)
 178                                         (:call-temps rcx))
 179                   ((:arg x (descriptor-reg any-reg) rdx-offset)
 180                    (:arg y (descriptor-reg any-reg) rdi-offset)
 181
 182                    (:temp rcx unsigned-reg rcx-offset))
 183
 184                 (!both-fixnum-p rcx x y)
 185                 (inst jmp :nz DO-STATIC-FUN)
 186
 187                 (inst cmp x y)
 188                 (inst ret)
 189
 190                 DO-STATIC-FUN
 191                 (inst push rbp-tn)
 192                 (inst mov rbp-tn rsp-tn)
 193                 (inst sub rsp-tn (* n-word-bytes 3))
 194                 (inst mov (make-ea :qword :base rsp-tn
 195                                    :disp (frame-byte-offset
 196                                           (+ sp->fp-offset
 197                                              -3
 198                                              ocfp-save-offset)))
 199                       rbp-tn)
 200                 (inst lea rbp-tn (make-ea :qword :base rsp-tn
 201                                           :disp (frame-byte-offset
 202                                           (+ sp->fp-offset
 203                                              -3
 204                                              ocfp-save-offset))))
 205                 (inst mov rcx (fixnumize 2))
 206                 (inst call (static-fun-addr ',static-fn))
 207                 ;; HACK: We depend on NIL having the lowest address of all
 208                 ;; static symbols (including T)
 209                 ,@(ecase test
 210                     (:l `((inst mov y (1+ nil-value))
 211                           (inst cmp y x)))
 212                     (:g `((inst cmp x (1+ nil-value)))))
 213                 (inst pop rbp-tn))))
 214   (define-cond-assem-rtn generic-< < two-arg-< :l)
 215   (define-cond-assem-rtn generic-> > two-arg-> :g))
 216
 217 (define-assembly-routine (generic-eql
 218                           (:translate eql)
 219                           (:policy :safe)
 220                           (:save-p t)
 221                           (:conditional :e)
 222                           (:cost 10)
 223                           (:call-temps rcx))
 224                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 225                           (:arg y (descriptor-reg any-reg) rdi-offset)
 226
 227                           (:temp rcx unsigned-reg rcx-offset))
 228
 229   (!some-fixnum-p rcx x y)
 230   (inst jmp :nz DO-STATIC-FUN)
 231
 232   ;; At least one fixnum
 233   (inst cmp x y)
 234   (inst ret)
 235
 236   DO-STATIC-FUN
 237   (inst push rbp-tn)
 238   (inst mov rbp-tn rsp-tn)
 239   (inst sub rsp-tn (* n-word-bytes 3))
 240   (inst mov (make-ea :qword :base rsp-tn
 241                      :disp (frame-byte-offset
 242                             (+ sp->fp-offset
 243                                -3
 244                                ocfp-save-offset)))
 245         rbp-tn)
 246   (inst lea rbp-tn (make-ea :qword :base rsp-tn
 247                             :disp (frame-byte-offset
 248                                    (+ sp->fp-offset
 249                                       -3
 250                                       ocfp-save-offset))))
 251   (inst mov rcx (fixnumize 2))
 252   (inst call (static-fun-addr 'eql))
 253   (inst cmp x (+ nil-value (static-symbol-offset t)))
 254   (inst pop rbp-tn))
 255
 256 (define-assembly-routine (generic-=
 257                           (:translate =)
 258                           (:policy :safe)
 259                           (:save-p t)
 260                           (:conditional :e)
 261                           (:cost 10)
 262                           (:call-temps rcx))
 263                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 264                           (:arg y (descriptor-reg any-reg) rdi-offset)
 265
 266                           (:temp rcx unsigned-reg rcx-offset))
 267   (!both-fixnum-p rcx x y)
 268   (inst jmp :nz DO-STATIC-FUN)
 269
 270   ;; Both fixnums
 271   (inst cmp x y)
 272   (inst ret)
 273
 274   DO-STATIC-FUN
 275   (inst push rbp-tn)
 276   (inst mov rbp-tn rsp-tn)
 277   (inst sub rsp-tn (* n-word-bytes 3))
 278   (inst mov (make-ea :qword :base rsp-tn
 279                      :disp (frame-byte-offset
 280                             (+ sp->fp-offset
 281                                -3
 282                                ocfp-save-offset)))
 283         rbp-tn)
 284   (inst lea rbp-tn (make-ea :qword :base rsp-tn
 285                             :disp (frame-byte-offset
 286                                    (+ sp->fp-offset
 287                                       -3
 288                                       ocfp-save-offset))))
 289
 290   (inst mov rcx (fixnumize 2))
 291   (inst call (static-fun-addr 'two-arg-=))
 292   (inst cmp x (+ nil-value (static-symbol-offset t)))
 293   (inst pop rbp-tn))
 294
 295 #+sb-assembling
 296 (define-assembly-routine (logcount)
 297                          ((:arg arg (descriptor-reg any-reg) rdx-offset)
 298                           (:temp mask unsigned-reg rcx-offset)
 299                           (:temp temp unsigned-reg rax-offset))
 300   (inst push temp) ; save RAX
 301   (let ((result arg))
 302     ;; See the comments below for how the algorithm works. The tricks
 303     ;; used can be found for example in AMD's software optimization
 304     ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
 305     ;; function "pop1", for 32-bit words. The extension to 64 bits is
 306     ;; straightforward.
 307     ;; Calculate 2-bit sums. Note that the value of a two-digit binary
 308     ;; number is the sum of the right digit and twice the left digit.
 309     ;; Thus we can calculate the sum of the two digits by shifting the
 310     ;; left digit to the right position and doing a two-bit subtraction.
 311     ;; This subtraction will never create a borrow and thus can be made
 312     ;; on all 32 2-digit numbers at once.
 313     (move temp arg)
 314     (inst shr result 1)
 315     (inst mov mask #x5555555555555555)
 316     (inst and result mask)
 317     (inst sub temp result)
 318     ;; Calculate 4-bit sums by straightforward shift, mask and add.
 319     ;; Note that we shift the source operand of the MOV and not its
 320     ;; destination so that the SHR and the MOV can execute in the same
 321     ;; clock cycle.
 322     (inst mov result temp)
 323     (inst shr temp 2)
 324     (inst mov mask #x3333333333333333)
 325     (inst and result mask)
 326     (inst and temp mask)
 327     (inst add result temp)
 328     ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
 329     ;; into 4 bits, we can apply the mask after the addition, saving one
 330     ;; instruction.
 331     (inst mov temp result)
 332     (inst shr result 4)
 333     (inst add result temp)
 334     (inst mov mask #x0f0f0f0f0f0f0f0f)
 335     (inst and result mask)
 336     ;; Add all 8 bytes at once by multiplying with #256r11111111.
 337     ;; We need to calculate only the lower 8 bytes of the product.
 338     ;; Of these the most significant byte contains the final result.
 339     ;; Note that there can be no overflow from one byte to the next
 340     ;; as the sum is at most 64 which needs only 7 bits.
 341     (inst mov mask #x0101010101010101)
 342     (inst imul result mask)
 343     (inst shr result 56))
 344   (inst pop temp)) ; restore RAX
 345
 346 #-sb-assembling ; avoid redefinition warning
 347 (defun emit-foreign-logbitp (index foreign-symbol temp-reg) ; result in Z flag
 348   (declare (ignorable temp-reg))
 349   (multiple-value-bind (byte bit) (floor index 8)
 350     #!-sb-dynamic-core
 351     (inst test
 352           (make-ea :byte :disp (make-fixup foreign-symbol :foreign byte))
 353           (ash 1 bit))
 354     #!+sb-dynamic-core
 355     (progn
 356       (inst mov temp-reg
 357             (make-ea :qword :disp (make-fixup foreign-symbol :foreign-dataref)))
 358       (inst test (make-ea :byte :base temp-reg :disp byte) (ash 1 bit)))))
 359
 360 ;; To perform logcount on small integers, we test whether to use the
 361 ;; builtin opcode, or an assembly routine. I benchmarked this against
 362 ;; an approach that always used the assembly routine via "call [addr]"
 363 ;; where the contents of the address reflected one implementation
 364 ;; or the other, chosen at startup - and this is faster.
 365 #-sb-assembling
 366 (macrolet
 367     ((def-it (name cost arg-sc arg-type)
 368       `(define-vop (,name)
 369          (:translate logcount)
 370          (:note ,(format nil "inline ~a logcount" arg-type))
 371          (:policy :fast-safe)
 372          (:args (arg :scs (,arg-sc)))
 373          (:arg-types ,arg-type)
 374          (:results (result :scs (unsigned-reg)))
 375          (:result-types positive-fixnum)
 376          ;; input/output of assembly routine
 377          (:temporary (:sc unsigned-reg :offset rdx-offset
 378                           :from (:argument 0) :to (:result 0)) rdx)
 379          ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
 380          ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
 381          ;; be sure to update the subroutine to push and pop RCX.
 382          (:temporary (:sc unsigned-reg :offset rcx-offset) rcx)
 383          (:vop-var vop)
 384          (:generator ,cost
 385            (progn
 386              ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
 387              ;; this use of 'rcx' is as the temporary register for performing
 388              ;; a reference to foreign data with dynamic core. It has to be
 389              ;; a register that conflicts with 'arg' lest we clobber it.
 390              (emit-foreign-logbitp 23 "cpuid_fn1_ecx" rcx)
 391              (inst jmp :z slow)
 392              ;; Intel's implementation of POPCNT on some models treats it as
 393              ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
 394              ;; it falsely appears to need data from the destination register.
 395              ;; The workaround is to clear the destination.
 396              ;; See http://stackoverflow.com/questions/25078285
 397              (unless (location= result arg)
 398                ;; We only break the spurious dep. chain if result isn't the same
 399                ;; register as arg. (If they're location=, don't trash the arg!)
 400                (inst xor result result))
 401              (inst popcnt result arg)
 402              (inst jmp done))
 403          slow
 404            (move rdx arg)
 405            (invoke-asm-routine 'call 'logcount vop rcx)
 406            (move result rdx)
 407          done))))
 408   (def-it unsigned-byte-64-count 14 unsigned-reg unsigned-num)
 409   (def-it positive-fixnum-count 13 any-reg positive-fixnum))
 410
 411 ;;; EQL for integers that are either fixnum or bignum
 412
 413 ;; The restriction on use of this assembly routine can't be expressed a
 414 ;; constraints on vop args: it may be called when at *least* one arg
 415 ;; is known to be an integer; the other can be anything.
 416 ;
 417 ;; Logic: we succeed quickly in the EQ case when possible.
 418 ;; Otherwise, check if both are OTHER-POINTER objects, failing if not.
 419 ;; Given that at least one is an integer, and both are OTHER-POINTERs,
 420 ;; then if their widetags match, both are BIGNUMs to be compared word-for-word.
 421 ;;
 422 ;; If you call this with two other-pointer objects with
 423 ;; the same widetag, but not bignum-widetag, the behavior is undefined.
 424 ;;
 425 (define-assembly-routine (%eql/integer
 426                           (:translate %eql/integer)
 427                           ;; :safe would imply signaling an error
 428                           ;; if the args are not integer, which this doesn't.
 429                           (:policy :fast-safe)
 430                           (:conditional :e)
 431                           (:cost 10)
 432                           (:call-temps rcx))
 433                          ((:arg x (descriptor-reg any-reg) rdx-offset)
 434                           (:arg y (descriptor-reg any-reg) rdi-offset)
 435                           (:temp rcx unsigned-reg rcx-offset)
 436                           (:temp rax unsigned-reg rax-offset))
 437   (inst cmp x y)
 438   (inst jmp :e done) ; Z condition flag contains the answer
 439   ;; check that both have other-pointer-lowtag
 440   (inst lea (reg-in-size rax :dword)
 441         (make-ea :dword :base x :disp (- other-pointer-lowtag)))
 442   (inst lea (reg-in-size rcx :dword)
 443         (make-ea :dword :base y :disp (- other-pointer-lowtag)))
 444   (inst or (reg-in-size rax :dword) (reg-in-size rcx :dword))
 445   (inst test (reg-in-size rax :byte) lowtag-mask)
 446   (inst jmp :ne done)
 447   ;; Compare the entire header word, ensuring that if at least one
 448   ;; argument is a bignum, then both are.
 449   (inst mov rcx (make-ea :qword :base x :disp (- other-pointer-lowtag)))
 450   (inst cmp rcx (make-ea :qword :base y :disp (- other-pointer-lowtag)))
 451   (inst jmp :ne done)
 452   (inst shr rcx n-widetag-bits)
 453   ;; can you have 0 payload words? probably not, but let's be safe here.
 454   (inst jrcxz done)
 455   loop
 456   (inst mov rax (make-ea :qword :base x :disp (- other-pointer-lowtag)
 457                          :index rcx :scale 8))
 458   (inst cmp rax (make-ea :qword :base y :disp (- other-pointer-lowtag)
 459                          :index rcx :scale 8))
 460   ;; These next 3 instructions are the equivalent of "LOOPNZ LOOP"
 461   ;; but had significantly better performance for me, consistent with claims
 462   ;; of most optimization guides saying that LOOP was deliberately pessimized
 463   ;; because of its use in timing-related code in the win32 kernel.
 464   (inst jmp :ne done)
 465   (inst dec rcx)
 466   (inst jmp :ne loop)
 467   ;; If the Z flag is set, the integers were EQL
 468   done)