1 ;;;; simple cases for generic arithmetic
3 ;;;; This software is part of the SBCL system. See the README file for
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
14 #-sb-assembling
; avoid redefinition warning
16 (defun !both-fixnum-p
(temp x y
)
17 (inst mov
(reg-in-size temp
:dword
)
18 (reg-in-size x
:dword
))
19 (inst or
(reg-in-size temp
:dword
)
20 (reg-in-size y
:dword
))
21 (inst test
(reg-in-size temp
:byte
)
24 (defun !some-fixnum-p
(temp x y
)
25 (inst mov
(reg-in-size temp
:dword
)
26 (reg-in-size x
:dword
))
27 (inst and
(reg-in-size temp
:dword
)
28 (reg-in-size y
:dword
))
29 (inst test
(reg-in-size temp
:byte
)
34 (defmacro static-fun-addr
(name)
35 `(make-ea :qword
:disp
(+ nil-value
(static-fun-offset ,name
))))
37 ;;;; addition, subtraction, and multiplication
39 (macrolet ((define-generic-arith-routine ((fun cost
) &body body
)
40 `(define-assembly-routine (,(symbolicate "GENERIC-" fun
)
42 (:return-style
:full-call
)
46 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
47 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
49 (:res res
(descriptor-reg any-reg
) rdx-offset
)
51 (:temp rax unsigned-reg rax-offset
)
52 (:temp rcx unsigned-reg rcx-offset
))
53 (!both-fixnum-p rax x y
)
54 (inst jmp
:nz DO-STATIC-FUN
) ; no - do generic
57 (inst clc
) ; single-value return
61 ;; Same as: (inst enter (* n-word-bytes 1))
63 (inst mov rbp-tn rsp-tn
)
64 (inst sub rsp-tn
(* n-word-bytes
1))
65 (inst push
(make-ea :qword
:base rbp-tn
66 :disp
(frame-byte-offset return-pc-save-offset
)))
67 (inst mov rcx
(fixnumize 2)) ; arg count
68 (inst jmp
(static-fun-addr ',(symbolicate "TWO-ARG-" fun
))))))
71 (define-generic-arith-routine (+ 10)
75 ;; Unbox the overflowed result, recovering the correct sign from
76 ;; the carry flag, then re-box as a bignum.
78 ,@(when (> n-fixnum-tag-bits
1) ; don't shift by 0
79 '((inst sar res
(1- n-fixnum-tag-bits
))))
83 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
84 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
89 (define-generic-arith-routine (- 10)
93 ;; Unbox the overflowed result, recovering the correct sign from
94 ;; the carry flag, then re-box as a bignum.
95 (inst cmc
) ; carry has correct sign now
97 ,@(when (> n-fixnum-tag-bits
1) ; don't shift by 0
98 '((inst sar res
(1- n-fixnum-tag-bits
))))
102 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
103 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
106 (define-generic-arith-routine (* 30)
107 (move rax x
) ; must use eax for 64-bit result
108 (inst sar rax n-fixnum-tag-bits
) ; remove *8 fixnum bias
109 (inst imul y
) ; result in edx:eax
110 (inst jmp
:no OKAY
) ; still fixnum
112 (inst shrd rax x n-fixnum-tag-bits
) ; high bits from edx
113 (inst sar x n-fixnum-tag-bits
) ; now shift edx too
115 (move rcx x
) ; save high bits from cqo
116 (inst cqo
) ; edx:eax <- sign-extend of eax
118 (inst jmp
:e SINGLE-WORD-BIGNUM
)
120 (with-fixed-allocation (res bignum-widetag
(+ bignum-digits-offset
2))
121 (storew rax res bignum-digits-offset other-pointer-lowtag
)
122 (storew rcx res
(1+ bignum-digits-offset
) other-pointer-lowtag
))
127 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
128 (storew rax res bignum-digits-offset other-pointer-lowtag
))
137 (define-assembly-routine (generic-negate
139 (:return-style
:full-call
)
143 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
144 (:res res
(descriptor-reg any-reg
) rdx-offset
)
145 (:temp rcx unsigned-reg rcx-offset
))
146 (inst test
(reg-in-size x
:byte
) fixnum-tag-mask
)
150 (inst mov rbp-tn rsp-tn
)
151 (inst sub rsp-tn n-word-bytes
)
152 (inst push
(make-ea :qword
:base rbp-tn
153 :disp
(frame-byte-offset return-pc-save-offset
)))
154 (inst mov rcx
(fixnumize 1)) ; arg count
155 (inst jmp
(static-fun-addr '%negate
))
159 (inst neg res
) ; (- most-negative-fixnum) is BIGNUM
161 (inst shr res n-fixnum-tag-bits
) ; sign bit is data - remove type bits
164 (with-fixed-allocation (res bignum-widetag
(1+ bignum-digits-offset
))
165 (storew rcx res bignum-digits-offset other-pointer-lowtag
))
171 (macrolet ((define-cond-assem-rtn (name translate static-fn test
)
172 `(define-assembly-routine (,name
173 (:translate
,translate
)
179 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
180 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
182 (:temp rcx unsigned-reg rcx-offset
))
184 (!both-fixnum-p rcx x y
)
185 (inst jmp
:nz DO-STATIC-FUN
)
192 (inst mov rbp-tn rsp-tn
)
193 (inst sub rsp-tn
(* n-word-bytes
3))
194 (inst mov
(make-ea :qword
:base rsp-tn
195 :disp
(frame-byte-offset
200 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
201 :disp
(frame-byte-offset
205 (inst mov rcx
(fixnumize 2))
206 (inst call
(static-fun-addr ',static-fn
))
207 ;; HACK: We depend on NIL having the lowest address of all
208 ;; static symbols (including T)
210 (:l
`((inst mov y
(1+ nil-value
))
212 (:g
`((inst cmp x
(1+ nil-value
)))))
214 (define-cond-assem-rtn generic-
< < two-arg-
< :l
)
215 (define-cond-assem-rtn generic-
> > two-arg-
> :g
))
217 (define-assembly-routine (generic-eql
224 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
225 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
227 (:temp rcx unsigned-reg rcx-offset
))
229 (!some-fixnum-p rcx x y
)
230 (inst jmp
:nz DO-STATIC-FUN
)
232 ;; At least one fixnum
238 (inst mov rbp-tn rsp-tn
)
239 (inst sub rsp-tn
(* n-word-bytes
3))
240 (inst mov
(make-ea :qword
:base rsp-tn
241 :disp
(frame-byte-offset
246 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
247 :disp
(frame-byte-offset
251 (inst mov rcx
(fixnumize 2))
252 (inst call
(static-fun-addr 'eql
))
253 (inst cmp x
(+ nil-value
(static-symbol-offset t
)))
256 (define-assembly-routine (generic-=
263 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
264 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
266 (:temp rcx unsigned-reg rcx-offset
))
267 (!both-fixnum-p rcx x y
)
268 (inst jmp
:nz DO-STATIC-FUN
)
276 (inst mov rbp-tn rsp-tn
)
277 (inst sub rsp-tn
(* n-word-bytes
3))
278 (inst mov
(make-ea :qword
:base rsp-tn
279 :disp
(frame-byte-offset
284 (inst lea rbp-tn
(make-ea :qword
:base rsp-tn
285 :disp
(frame-byte-offset
290 (inst mov rcx
(fixnumize 2))
291 (inst call
(static-fun-addr 'two-arg-
=))
292 (inst cmp x
(+ nil-value
(static-symbol-offset t
)))
296 (define-assembly-routine (logcount)
297 ((:arg arg
(descriptor-reg any-reg
) rdx-offset
)
298 (:temp mask unsigned-reg rcx-offset
)
299 (:temp temp unsigned-reg rax-offset
))
300 (inst push temp
) ; save RAX
302 ;; See the comments below for how the algorithm works. The tricks
303 ;; used can be found for example in AMD's software optimization
304 ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
305 ;; function "pop1", for 32-bit words. The extension to 64 bits is
307 ;; Calculate 2-bit sums. Note that the value of a two-digit binary
308 ;; number is the sum of the right digit and twice the left digit.
309 ;; Thus we can calculate the sum of the two digits by shifting the
310 ;; left digit to the right position and doing a two-bit subtraction.
311 ;; This subtraction will never create a borrow and thus can be made
312 ;; on all 32 2-digit numbers at once.
315 (inst mov mask
#x5555555555555555
)
316 (inst and result mask
)
317 (inst sub temp result
)
318 ;; Calculate 4-bit sums by straightforward shift, mask and add.
319 ;; Note that we shift the source operand of the MOV and not its
320 ;; destination so that the SHR and the MOV can execute in the same
322 (inst mov result temp
)
324 (inst mov mask
#x3333333333333333
)
325 (inst and result mask
)
327 (inst add result temp
)
328 ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
329 ;; into 4 bits, we can apply the mask after the addition, saving one
331 (inst mov temp result
)
333 (inst add result temp
)
334 (inst mov mask
#x0f0f0f0f0f0f0f0f
)
335 (inst and result mask
)
336 ;; Add all 8 bytes at once by multiplying with #256r11111111.
337 ;; We need to calculate only the lower 8 bytes of the product.
338 ;; Of these the most significant byte contains the final result.
339 ;; Note that there can be no overflow from one byte to the next
340 ;; as the sum is at most 64 which needs only 7 bits.
341 (inst mov mask
#x0101010101010101
)
342 (inst imul result mask
)
343 (inst shr result
56))
344 (inst pop temp
)) ; restore RAX
346 #-sb-assembling
; avoid redefinition warning
347 (defun emit-foreign-logbitp (index foreign-symbol temp-reg
) ; result in Z flag
348 (declare (ignorable temp-reg
))
349 (multiple-value-bind (byte bit
) (floor index
8)
352 (make-ea :byte
:disp
(make-fixup foreign-symbol
:foreign byte
))
357 (make-ea :qword
:disp
(make-fixup foreign-symbol
:foreign-dataref
)))
358 (inst test
(make-ea :byte
:base temp-reg
:disp byte
) (ash 1 bit
)))))
360 ;; To perform logcount on small integers, we test whether to use the
361 ;; builtin opcode, or an assembly routine. I benchmarked this against
362 ;; an approach that always used the assembly routine via "call [addr]"
363 ;; where the contents of the address reflected one implementation
364 ;; or the other, chosen at startup - and this is faster.
367 ((def-it (name cost arg-sc arg-type
)
369 (:translate logcount
)
370 (:note
,(format nil
"inline ~a logcount" arg-type
))
372 (:args
(arg :scs
(,arg-sc
)))
373 (:arg-types
,arg-type
)
374 (:results
(result :scs
(unsigned-reg)))
375 (:result-types positive-fixnum
)
376 ;; input/output of assembly routine
377 (:temporary
(:sc unsigned-reg
:offset rdx-offset
378 :from
(:argument
0) :to
(:result
0)) rdx
)
379 ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
380 ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
381 ;; be sure to update the subroutine to push and pop RCX.
382 (:temporary
(:sc unsigned-reg
:offset rcx-offset
) rcx
)
386 ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
387 ;; this use of 'rcx' is as the temporary register for performing
388 ;; a reference to foreign data with dynamic core. It has to be
389 ;; a register that conflicts with 'arg' lest we clobber it.
390 (emit-foreign-logbitp 23 "cpuid_fn1_ecx" rcx
)
392 ;; Intel's implementation of POPCNT on some models treats it as
393 ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
394 ;; it falsely appears to need data from the destination register.
395 ;; The workaround is to clear the destination.
396 ;; See http://stackoverflow.com/questions/25078285
397 (unless (location= result arg
)
398 ;; We only break the spurious dep. chain if result isn't the same
399 ;; register as arg. (If they're location=, don't trash the arg!)
400 (inst xor result result
))
401 (inst popcnt result arg
)
405 (invoke-asm-routine 'call
'logcount vop rcx
)
408 (def-it unsigned-byte-64-count
14 unsigned-reg unsigned-num
)
409 (def-it positive-fixnum-count
13 any-reg positive-fixnum
))
411 ;;; EQL for integers that are either fixnum or bignum
413 ;; The restriction on use of this assembly routine can't be expressed a
414 ;; constraints on vop args: it may be called when at *least* one arg
415 ;; is known to be an integer; the other can be anything.
417 ;; Logic: we succeed quickly in the EQ case when possible.
418 ;; Otherwise, check if both are OTHER-POINTER objects, failing if not.
419 ;; Given that at least one is an integer, and both are OTHER-POINTERs,
420 ;; then if their widetags match, both are BIGNUMs to be compared word-for-word.
422 ;; If you call this with two other-pointer objects with
423 ;; the same widetag, but not bignum-widetag, the behavior is undefined.
425 (define-assembly-routine (%eql
/integer
426 (:translate %eql
/integer
)
427 ;; :safe would imply signaling an error
428 ;; if the args are not integer, which this doesn't.
433 ((:arg x
(descriptor-reg any-reg
) rdx-offset
)
434 (:arg y
(descriptor-reg any-reg
) rdi-offset
)
435 (:temp rcx unsigned-reg rcx-offset
)
436 (:temp rax unsigned-reg rax-offset
))
438 (inst jmp
:e done
) ; Z condition flag contains the answer
439 ;; check that both have other-pointer-lowtag
440 (inst lea
(reg-in-size rax
:dword
)
441 (make-ea :dword
:base x
:disp
(- other-pointer-lowtag
)))
442 (inst lea
(reg-in-size rcx
:dword
)
443 (make-ea :dword
:base y
:disp
(- other-pointer-lowtag
)))
444 (inst or
(reg-in-size rax
:dword
) (reg-in-size rcx
:dword
))
445 (inst test
(reg-in-size rax
:byte
) lowtag-mask
)
447 ;; Compare the entire header word, ensuring that if at least one
448 ;; argument is a bignum, then both are.
449 (inst mov rcx
(make-ea :qword
:base x
:disp
(- other-pointer-lowtag
)))
450 (inst cmp rcx
(make-ea :qword
:base y
:disp
(- other-pointer-lowtag
)))
452 (inst shr rcx n-widetag-bits
)
453 ;; can you have 0 payload words? probably not, but let's be safe here.
456 (inst mov rax
(make-ea :qword
:base x
:disp
(- other-pointer-lowtag
)
457 :index rcx
:scale
8))
458 (inst cmp rax
(make-ea :qword
:base y
:disp
(- other-pointer-lowtag
)
459 :index rcx
:scale
8))
460 ;; These next 3 instructions are the equivalent of "LOOPNZ LOOP"
461 ;; but had significantly better performance for me, consistent with claims
462 ;; of most optimization guides saying that LOOP was deliberately pessimized
463 ;; because of its use in timing-related code in the win32 kernel.
467 ;; If the Z flag is set, the integers were EQL