1 ;;;; simple cases for generic arithmetic
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
14 (defun !both-fixnum-p
(temp x y
)
15 (inst mov
(reg-in-size temp
:dword
)
16 (reg-in-size x
:dword
))
17 (inst or
(reg-in-size temp
:dword
)
18 (reg-in-size y
:dword
))
19 (inst test
(reg-in-size temp
:byte
)
22 (defun !some-fixnum-p
(temp x y
)
23 (inst mov
(reg-in-size temp
:dword
)
24 (reg-in-size x
:dword
))
25 (inst and
(reg-in-size temp
:dword
)
26 (reg-in-size y
:dword
))
27 (inst test
(reg-in-size temp
:byte
)
31 ;;;; addition, subtraction, and multiplication
33 (macrolet ((define-generic-arith-routine ((fun cost
) &body body
)
34 `(define-assembly-routine (,(symbolicate "GENERIC-" fun
)
36 (:return-style
:full-call
)
40 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
41 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
43 (:res res
(descriptor-reg any-reg
) rdx-offset
)
45 (:temp rax unsigned-reg rax-offset
)
46 (:temp rcx unsigned-reg rcx-offset
))
47 (!both-fixnum-p rax x y
)
48 (inst jmp
:nz DO-STATIC-FUN
) ; no - do generic
51 (inst clc
) ; single-value return
55 ;; Same as: (inst enter (* n-word-bytes 1))
57 (inst mov rbp-tn rsp-tn
)
58 (inst sub rsp-tn
(* n-word-bytes
1))
59 (inst push
(make-ea :qword
:base rbp-tn
60 :disp
(frame-byte-offset return-pc-save-offset
)))
61 (inst mov rcx
(fixnumize 2)) ; arg count
66 ',(symbolicate "TWO-ARG-" fun
))))))))
69 (define-generic-arith-routine (+ 10)
73 ;; Unbox the overflowed result, recovering the correct sign from
74 ;; the carry flag, then re-box as a bignum.
76 ,@(when (> n-fixnum-tag-bits
1) ; don't shift by 0
77 '((inst sar res
(1- n-fixnum-tag-bits
))))
81 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
82 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
87 (define-generic-arith-routine (- 10)
91 ;; Unbox the overflowed result, recovering the correct sign from
92 ;; the carry flag, then re-box as a bignum.
93 (inst cmc
) ; carry has correct sign now
95 ,@(when (> n-fixnum-tag-bits
1) ; don't shift by 0
96 '((inst sar res
(1- n-fixnum-tag-bits
))))
100 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
101 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
104 (define-generic-arith-routine (* 30)
105 (move rax x
) ; must use eax for 64-bit result
106 (inst sar rax n-fixnum-tag-bits
) ; remove *8 fixnum bias
107 (inst imul y
) ; result in edx:eax
108 (inst jmp
:no OKAY
) ; still fixnum
110 (inst shrd rax x n-fixnum-tag-bits
) ; high bits from edx
111 (inst sar x n-fixnum-tag-bits
) ; now shift edx too
113 (move rcx x
) ; save high bits from cqo
114 (inst cqo
) ; edx:eax <- sign-extend of eax
116 (inst jmp
:e SINGLE-WORD-BIGNUM
)
118 (with-fixed-allocation (res bignum-widetag
(+ bignum-digits-offset
2))
119 (storew rax res bignum-digits-offset other-pointer-lowtag
)
120 (storew rcx res
(1+ bignum-digits-offset
) other-pointer-lowtag
))
125 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
126 (storew rax res bignum-digits-offset other-pointer-lowtag
))
135 (define-assembly-routine (generic-negate
137 (:return-style
:full-call
)
141 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
142 (:res res
(descriptor-reg any-reg
) rdx-offset
)
144 (:temp rax unsigned-reg rax-offset
)
145 (:temp rcx unsigned-reg rcx-offset
))
146 (inst test
(reg-in-size x
:byte
) fixnum-tag-mask
)
150 (inst mov rbp-tn rsp-tn
)
151 (inst sub rsp-tn n-word-bytes
)
152 (inst push
(make-ea :qword
:base rbp-tn
153 :disp
(frame-byte-offset return-pc-save-offset
)))
154 (inst mov rcx
(fixnumize 1)) ; arg count
155 (inst jmp
(make-ea :qword
156 :disp
(+ nil-value
(static-fun-offset '%negate
))))
160 (inst neg res
) ; (- most-negative-fixnum) is BIGNUM
162 (inst shr res n-fixnum-tag-bits
) ; sign bit is data - remove type bits
165 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
166 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
172 (macrolet ((define-cond-assem-rtn (name translate static-fn test
)
173 (declare (ignorable translate static-fn
))
175 `(define-assembly-routine (,name
176 (:return-style
:none
))
177 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
178 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
180 (:temp rcx unsigned-reg rcx-offset
))
182 (!both-fixnum-p rcx x y
)
183 (inst jmp
:nz DO-STATIC-FUN
)
190 (inst mov rbp-tn rsp-tn
)
191 (inst sub rsp-tn
(* n-word-bytes
3))
192 (inst mov
(make-ea :qword
:base rsp-tn
193 :disp
(frame-byte-offset
198 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
199 :disp
(frame-byte-offset
203 (inst mov rcx
(fixnumize 2))
204 (inst call
(make-ea :qword
206 (static-fun-offset ',static-fn
))))
207 ;; HACK: We depend on NIL having the lowest address of all
208 ;; static symbols (including T)
210 (:l
`((inst mov y
(1+ nil-value
))
212 (:g
`((inst cmp x
(1+ nil-value
)))))
217 (:translate
,translate
)
220 (:args
(x :scs
(descriptor-reg any-reg
) :target rdx
)
221 (y :scs
(descriptor-reg any-reg
) :target rdi
))
223 (:temporary
(:sc unsigned-reg
:offset rdx-offset
226 (:temporary
(:sc unsigned-reg
:offset rdi-offset
230 (:temporary
(:sc unsigned-reg
:offset rcx-offset
237 (inst mov rcx
(make-fixup ',name
:assembly-routine
))
240 (define-cond-assem-rtn generic-
< < two-arg-
< :l
)
241 (define-cond-assem-rtn generic-
> > two-arg-
> :g
))
244 (define-assembly-routine (generic-eql
245 (:return-style
:none
))
246 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
247 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
249 (:temp rcx unsigned-reg rcx-offset
))
251 (!some-fixnum-p rcx x y
)
252 (inst jmp
:nz DO-STATIC-FUN
)
254 ;; At least one fixnum
260 (inst mov rbp-tn rsp-tn
)
261 (inst sub rsp-tn
(* n-word-bytes
3))
262 (inst mov
(make-ea :qword
:base rsp-tn
263 :disp
(frame-byte-offset
268 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
269 :disp
(frame-byte-offset
273 (inst mov rcx
(fixnumize 2))
274 (inst call
(make-ea :qword
275 :disp
(+ nil-value
(static-fun-offset 'eql
))))
276 (inst cmp x
(+ nil-value
(static-symbol-offset t
)))
281 (define-vop (generic-eql)
285 (:args
(x :scs
(descriptor-reg any-reg
) :target rdx
)
286 (y :scs
(descriptor-reg any-reg
) :target rdi
))
288 (:temporary
(:sc unsigned-reg
:offset rdx-offset
291 (:temporary
(:sc unsigned-reg
:offset rdi-offset
295 (:temporary
(:sc unsigned-reg
:offset rcx-offset
302 (inst mov rcx
(make-fixup 'generic-eql
:assembly-routine
))
306 (define-assembly-routine (generic-=
307 (:return-style
:none
))
308 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
309 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
311 (:temp rcx unsigned-reg rcx-offset
))
312 (!both-fixnum-p rcx x y
)
313 (inst jmp
:nz DO-STATIC-FUN
)
321 (inst mov rbp-tn rsp-tn
)
322 (inst sub rsp-tn
(* n-word-bytes
3))
323 (inst mov
(make-ea :qword
:base rsp-tn
324 :disp
(frame-byte-offset
329 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
330 :disp
(frame-byte-offset
335 (inst mov rcx
(fixnumize 2))
336 (inst call
(make-ea :qword
337 :disp
(+ nil-value
(static-fun-offset 'two-arg-
=))))
338 (inst cmp x
(+ nil-value
(static-symbol-offset t
)))
343 (define-vop (generic-=)
347 (:args
(x :scs
(descriptor-reg any-reg
) :target rdx
)
348 (y :scs
(descriptor-reg any-reg
) :target rdi
))
350 (:temporary
(:sc unsigned-reg
:offset rdx-offset
353 (:temporary
(:sc unsigned-reg
:offset rdi-offset
357 (:temporary
(:sc unsigned-reg
:offset rcx-offset
364 (inst mov rcx
(make-fixup 'generic-
= :assembly-routine
))
368 (define-assembly-routine (logcount)
369 ((:arg arg
(descriptor-reg any-reg
) rdx-offset
)
370 (:temp mask unsigned-reg rcx-offset
)
371 (:temp temp unsigned-reg rax-offset
))
372 (inst push temp
) ; save RAX
374 ;; See the comments below for how the algorithm works. The tricks
375 ;; used can be found for example in AMD's software optimization
376 ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
377 ;; function "pop1", for 32-bit words. The extension to 64 bits is
379 ;; Calculate 2-bit sums. Note that the value of a two-digit binary
380 ;; number is the sum of the right digit and twice the left digit.
381 ;; Thus we can calculate the sum of the two digits by shifting the
382 ;; left digit to the right position and doing a two-bit subtraction.
383 ;; This subtraction will never create a borrow and thus can be made
384 ;; on all 32 2-digit numbers at once.
387 (inst mov mask
#x5555555555555555
)
388 (inst and result mask
)
389 (inst sub temp result
)
390 ;; Calculate 4-bit sums by straightforward shift, mask and add.
391 ;; Note that we shift the source operand of the MOV and not its
392 ;; destination so that the SHR and the MOV can execute in the same
394 (inst mov result temp
)
396 (inst mov mask
#x3333333333333333
)
397 (inst and result mask
)
399 (inst add result temp
)
400 ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
401 ;; into 4 bits, we can apply the mask after the addition, saving one
403 (inst mov temp result
)
405 (inst add result temp
)
406 (inst mov mask
#x0f0f0f0f0f0f0f0f
)
407 (inst and result mask
)
408 ;; Add all 8 bytes at once by multiplying with #256r11111111.
409 ;; We need to calculate only the lower 8 bytes of the product.
410 ;; Of these the most significant byte contains the final result.
411 ;; Note that there can be no overflow from one byte to the next
412 ;; as the sum is at most 64 which needs only 7 bits.
413 (inst mov mask
#x0101010101010101
)
414 (inst imul result mask
)
415 (inst shr result
56))
416 (inst pop temp
)) ; restore RAX
418 ;; To perform logcount on small integers, we test whether to use the
419 ;; builtin opcode, or an assembly routine. I benchmarked this against
420 ;; an approach that always used the assembly routine via "call [addr]"
421 ;; where the contents of the address reflected one implementation
422 ;; or the other, chosen at startup - and this is faster.
425 ((def-it (name cost arg-sc arg-type
)
427 (:translate logcount
)
428 (:note
,(format nil
"inline ~a logcount" arg-type
))
430 (:args
(arg :scs
(,arg-sc
)))
431 (:arg-types
,arg-type
)
432 (:results
(result :scs
(unsigned-reg)))
433 (:result-types positive-fixnum
)
434 ;; input/output of assembly routine
435 (:temporary
(:sc unsigned-reg
:offset rdx-offset
436 :from
(:argument
0) :to
(:result
0)) rdx
)
437 ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
438 ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
439 ;; be sure to update the subroutine to push and pop RCX.
440 (:temporary
(:sc unsigned-reg
:offset rcx-offset
) rcx
)
442 ;; FIXME: As I've got no way to set the cpuid feature bits,
443 ;; don't try testing here whether to use popcnt, until it has been
444 ;; verified that the __cpuid() intrinsic works for win32.
447 ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
449 (make-ea :byte
:disp
(make-fixup "cpuid_fn1_ecx" :foreign
2))
452 ;; Intel's implementation of POPCNT on some models treats it as
453 ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
454 ;; it falsely appears to need data from the destination register.
455 ;; The workaround is to clear the destination.
456 ;; See http://stackoverflow.com/questions/25078285
457 (unless (location= result arg
)
458 ;; We only break the spurious dep. chain if result isn't the same
459 ;; register as arg. (If they're location=, don't trash the arg!)
460 (inst xor result result
))
461 (inst popcnt result arg
)
465 (inst mov rcx
(make-fixup 'logcount
:assembly-routine
))
469 (def-it unsigned-byte-64-count
14 unsigned-reg unsigned-num
)
470 (def-it positive-fixnum-count
13 any-reg positive-fixnum
))