Add thin wrapper for calling assem code on x86-64
[sbcl.git] / src / assembly / x86-64 / arith.lisp
blob8fb7222c08a8c0adddac90e4962a178dd1de4a6b
1 ;;;; simple cases for generic arithmetic
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!VM")
14 #-sb-assembling ; avoid redefinition warning
15 (progn
16 (defun !both-fixnum-p (temp x y)
17 (inst mov (reg-in-size temp :dword)
18 (reg-in-size x :dword))
19 (inst or (reg-in-size temp :dword)
20 (reg-in-size y :dword))
21 (inst test (reg-in-size temp :byte)
22 fixnum-tag-mask))
24 (defun !some-fixnum-p (temp x y)
25 (inst mov (reg-in-size temp :dword)
26 (reg-in-size x :dword))
27 (inst and (reg-in-size temp :dword)
28 (reg-in-size y :dword))
29 (inst test (reg-in-size temp :byte)
30 fixnum-tag-mask))
33 #+sb-xc-host
34 (defmacro static-fun-addr (name)
35 `(make-ea :qword :disp (+ nil-value (static-fun-offset ,name))))
37 ;;;; addition, subtraction, and multiplication
39 (macrolet ((define-generic-arith-routine ((fun cost) &body body)
40 `(define-assembly-routine (,(symbolicate "GENERIC-" fun)
41 (:cost ,cost)
42 (:return-style :full-call)
43 (:translate ,fun)
44 (:policy :safe)
45 (:save-p t))
46 ((:arg x (descriptor-reg any-reg) rdx-offset)
47 (:arg y (descriptor-reg any-reg) rdi-offset)
49 (:res res (descriptor-reg any-reg) rdx-offset)
51 (:temp rax unsigned-reg rax-offset)
52 (:temp rcx unsigned-reg rcx-offset))
53 (!both-fixnum-p rax x y)
54 (inst jmp :nz DO-STATIC-FUN) ; no - do generic
56 ,@body
57 (inst clc) ; single-value return
58 (inst ret)
60 DO-STATIC-FUN
61 ;; Same as: (inst enter (* n-word-bytes 1))
62 (inst push rbp-tn)
63 (inst mov rbp-tn rsp-tn)
64 (inst sub rsp-tn (* n-word-bytes 1))
65 (inst push (make-ea :qword :base rbp-tn
66 :disp (frame-byte-offset return-pc-save-offset)))
67 (inst mov rcx (fixnumize 2)) ; arg count
68 (inst jmp (static-fun-addr ',(symbolicate "TWO-ARG-" fun))))))
70 #.`
71 (define-generic-arith-routine (+ 10)
72 (move res x)
73 (inst add res y)
74 (inst jmp :no OKAY)
75 ;; Unbox the overflowed result, recovering the correct sign from
76 ;; the carry flag, then re-box as a bignum.
77 (inst rcr res 1)
78 ,@(when (> n-fixnum-tag-bits 1) ; don't shift by 0
79 '((inst sar res (1- n-fixnum-tag-bits))))
81 (move rcx res)
83 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
84 (storew rcx res bignum-digits-offset other-pointer-lowtag))
86 OKAY)
88 #.`
89 (define-generic-arith-routine (- 10)
90 (move res x)
91 (inst sub res y)
92 (inst jmp :no OKAY)
93 ;; Unbox the overflowed result, recovering the correct sign from
94 ;; the carry flag, then re-box as a bignum.
95 (inst cmc) ; carry has correct sign now
96 (inst rcr res 1)
97 ,@(when (> n-fixnum-tag-bits 1) ; don't shift by 0
98 '((inst sar res (1- n-fixnum-tag-bits))))
100 (move rcx res)
102 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
103 (storew rcx res bignum-digits-offset other-pointer-lowtag))
104 OKAY)
106 (define-generic-arith-routine (* 30)
107 (move rax x) ; must use eax for 64-bit result
108 (inst sar rax n-fixnum-tag-bits) ; remove *8 fixnum bias
109 (inst imul y) ; result in edx:eax
110 (inst jmp :no OKAY) ; still fixnum
112 (inst shrd rax x n-fixnum-tag-bits) ; high bits from edx
113 (inst sar x n-fixnum-tag-bits) ; now shift edx too
115 (move rcx x) ; save high bits from cqo
116 (inst cqo) ; edx:eax <- sign-extend of eax
117 (inst cmp x rcx)
118 (inst jmp :e SINGLE-WORD-BIGNUM)
120 (with-fixed-allocation (res bignum-widetag (+ bignum-digits-offset 2))
121 (storew rax res bignum-digits-offset other-pointer-lowtag)
122 (storew rcx res (1+ bignum-digits-offset) other-pointer-lowtag))
123 (inst jmp DONE)
125 SINGLE-WORD-BIGNUM
127 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
128 (storew rax res bignum-digits-offset other-pointer-lowtag))
129 (inst jmp DONE)
131 OKAY
132 (move res rax)
133 DONE))
135 ;;;; negation
137 (define-assembly-routine (generic-negate
138 (:cost 10)
139 (:return-style :full-call)
140 (:policy :safe)
141 (:translate %negate)
142 (:save-p t))
143 ((:arg x (descriptor-reg any-reg) rdx-offset)
144 (:res res (descriptor-reg any-reg) rdx-offset)
145 (:temp rcx unsigned-reg rcx-offset))
146 (inst test (reg-in-size x :byte) fixnum-tag-mask)
147 (inst jmp :z FIXNUM)
149 (inst push rbp-tn)
150 (inst mov rbp-tn rsp-tn)
151 (inst sub rsp-tn n-word-bytes)
152 (inst push (make-ea :qword :base rbp-tn
153 :disp (frame-byte-offset return-pc-save-offset)))
154 (inst mov rcx (fixnumize 1)) ; arg count
155 (inst jmp (static-fun-addr '%negate))
157 FIXNUM
158 (move res x)
159 (inst neg res) ; (- most-negative-fixnum) is BIGNUM
160 (inst jmp :no OKAY)
161 (inst shr res n-fixnum-tag-bits) ; sign bit is data - remove type bits
162 (move rcx res)
164 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
165 (storew rcx res bignum-digits-offset other-pointer-lowtag))
167 OKAY)
169 ;;;; comparison
171 (macrolet ((define-cond-assem-rtn (name translate static-fn test)
172 `(define-assembly-routine (,name
173 (:translate ,translate)
174 (:policy :safe)
175 (:save-p t)
176 (:conditional ,test)
177 (:cost 10)
178 (:call-temps rcx))
179 ((:arg x (descriptor-reg any-reg) rdx-offset)
180 (:arg y (descriptor-reg any-reg) rdi-offset)
182 (:temp rcx unsigned-reg rcx-offset))
184 (!both-fixnum-p rcx x y)
185 (inst jmp :nz DO-STATIC-FUN)
187 (inst cmp x y)
188 (inst ret)
190 DO-STATIC-FUN
191 (inst push rbp-tn)
192 (inst mov rbp-tn rsp-tn)
193 (inst sub rsp-tn (* n-word-bytes 3))
194 (inst mov (make-ea :qword :base rsp-tn
195 :disp (frame-byte-offset
196 (+ sp->fp-offset
198 ocfp-save-offset)))
199 rbp-tn)
200 (inst lea rbp-tn (make-ea :qword :base rsp-tn
201 :disp (frame-byte-offset
202 (+ sp->fp-offset
204 ocfp-save-offset))))
205 (inst mov rcx (fixnumize 2))
206 (inst call (static-fun-addr ',static-fn))
207 ;; HACK: We depend on NIL having the lowest address of all
208 ;; static symbols (including T)
209 ,@(ecase test
210 (:l `((inst mov y (1+ nil-value))
211 (inst cmp y x)))
212 (:g `((inst cmp x (1+ nil-value)))))
213 (inst pop rbp-tn))))
214 (define-cond-assem-rtn generic-< < two-arg-< :l)
215 (define-cond-assem-rtn generic-> > two-arg-> :g))
217 (define-assembly-routine (generic-eql
218 (:translate eql)
219 (:policy :safe)
220 (:save-p t)
221 (:conditional :e)
222 (:cost 10)
223 (:call-temps rcx))
224 ((:arg x (descriptor-reg any-reg) rdx-offset)
225 (:arg y (descriptor-reg any-reg) rdi-offset)
227 (:temp rcx unsigned-reg rcx-offset))
229 (!some-fixnum-p rcx x y)
230 (inst jmp :nz DO-STATIC-FUN)
232 ;; At least one fixnum
233 (inst cmp x y)
234 (inst ret)
236 DO-STATIC-FUN
237 (inst push rbp-tn)
238 (inst mov rbp-tn rsp-tn)
239 (inst sub rsp-tn (* n-word-bytes 3))
240 (inst mov (make-ea :qword :base rsp-tn
241 :disp (frame-byte-offset
242 (+ sp->fp-offset
244 ocfp-save-offset)))
245 rbp-tn)
246 (inst lea rbp-tn (make-ea :qword :base rsp-tn
247 :disp (frame-byte-offset
248 (+ sp->fp-offset
250 ocfp-save-offset))))
251 (inst mov rcx (fixnumize 2))
252 (inst call (static-fun-addr 'eql))
253 (inst cmp x (+ nil-value (static-symbol-offset t)))
254 (inst pop rbp-tn))
256 (define-assembly-routine (generic-=
257 (:translate =)
258 (:policy :safe)
259 (:save-p t)
260 (:conditional :e)
261 (:cost 10)
262 (:call-temps rcx))
263 ((:arg x (descriptor-reg any-reg) rdx-offset)
264 (:arg y (descriptor-reg any-reg) rdi-offset)
266 (:temp rcx unsigned-reg rcx-offset))
267 (!both-fixnum-p rcx x y)
268 (inst jmp :nz DO-STATIC-FUN)
270 ;; Both fixnums
271 (inst cmp x y)
272 (inst ret)
274 DO-STATIC-FUN
275 (inst push rbp-tn)
276 (inst mov rbp-tn rsp-tn)
277 (inst sub rsp-tn (* n-word-bytes 3))
278 (inst mov (make-ea :qword :base rsp-tn
279 :disp (frame-byte-offset
280 (+ sp->fp-offset
282 ocfp-save-offset)))
283 rbp-tn)
284 (inst lea rbp-tn (make-ea :qword :base rsp-tn
285 :disp (frame-byte-offset
286 (+ sp->fp-offset
288 ocfp-save-offset))))
290 (inst mov rcx (fixnumize 2))
291 (inst call (static-fun-addr 'two-arg-=))
292 (inst cmp x (+ nil-value (static-symbol-offset t)))
293 (inst pop rbp-tn))
295 #+sb-assembling
296 (define-assembly-routine (logcount)
297 ((:arg arg (descriptor-reg any-reg) rdx-offset)
298 (:temp mask unsigned-reg rcx-offset)
299 (:temp temp unsigned-reg rax-offset))
300 (inst push temp) ; save RAX
301 (let ((result arg))
302 ;; See the comments below for how the algorithm works. The tricks
303 ;; used can be found for example in AMD's software optimization
304 ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
305 ;; function "pop1", for 32-bit words. The extension to 64 bits is
306 ;; straightforward.
307 ;; Calculate 2-bit sums. Note that the value of a two-digit binary
308 ;; number is the sum of the right digit and twice the left digit.
309 ;; Thus we can calculate the sum of the two digits by shifting the
310 ;; left digit to the right position and doing a two-bit subtraction.
311 ;; This subtraction will never create a borrow and thus can be made
312 ;; on all 32 2-digit numbers at once.
313 (move temp arg)
314 (inst shr result 1)
315 (inst mov mask #x5555555555555555)
316 (inst and result mask)
317 (inst sub temp result)
318 ;; Calculate 4-bit sums by straightforward shift, mask and add.
319 ;; Note that we shift the source operand of the MOV and not its
320 ;; destination so that the SHR and the MOV can execute in the same
321 ;; clock cycle.
322 (inst mov result temp)
323 (inst shr temp 2)
324 (inst mov mask #x3333333333333333)
325 (inst and result mask)
326 (inst and temp mask)
327 (inst add result temp)
328 ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
329 ;; into 4 bits, we can apply the mask after the addition, saving one
330 ;; instruction.
331 (inst mov temp result)
332 (inst shr result 4)
333 (inst add result temp)
334 (inst mov mask #x0f0f0f0f0f0f0f0f)
335 (inst and result mask)
336 ;; Add all 8 bytes at once by multiplying with #256r11111111.
337 ;; We need to calculate only the lower 8 bytes of the product.
338 ;; Of these the most significant byte contains the final result.
339 ;; Note that there can be no overflow from one byte to the next
340 ;; as the sum is at most 64 which needs only 7 bits.
341 (inst mov mask #x0101010101010101)
342 (inst imul result mask)
343 (inst shr result 56))
344 (inst pop temp)) ; restore RAX
346 #-sb-assembling ; avoid redefinition warning
347 (defun emit-foreign-logbitp (index foreign-symbol temp-reg) ; result in Z flag
348 (declare (ignorable temp-reg))
349 (multiple-value-bind (byte bit) (floor index 8)
350 #!-sb-dynamic-core
351 (inst test
352 (make-ea :byte :disp (make-fixup foreign-symbol :foreign byte))
353 (ash 1 bit))
354 #!+sb-dynamic-core
355 (progn
356 (inst mov temp-reg
357 (make-ea :qword :disp (make-fixup foreign-symbol :foreign-dataref)))
358 (inst test (make-ea :byte :base temp-reg :disp byte) (ash 1 bit)))))
360 ;; To perform logcount on small integers, we test whether to use the
361 ;; builtin opcode, or an assembly routine. I benchmarked this against
362 ;; an approach that always used the assembly routine via "call [addr]"
363 ;; where the contents of the address reflected one implementation
364 ;; or the other, chosen at startup - and this is faster.
365 #-sb-assembling
366 (macrolet
367 ((def-it (name cost arg-sc arg-type)
368 `(define-vop (,name)
369 (:translate logcount)
370 (:note ,(format nil "inline ~a logcount" arg-type))
371 (:policy :fast-safe)
372 (:args (arg :scs (,arg-sc)))
373 (:arg-types ,arg-type)
374 (:results (result :scs (unsigned-reg)))
375 (:result-types positive-fixnum)
376 ;; input/output of assembly routine
377 (:temporary (:sc unsigned-reg :offset rdx-offset
378 :from (:argument 0) :to (:result 0)) rdx)
379 ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
380 ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
381 ;; be sure to update the subroutine to push and pop RCX.
382 (:temporary (:sc unsigned-reg :offset rcx-offset) rcx)
383 (:vop-var vop)
384 (:generator ,cost
385 (progn
386 ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
387 ;; this use of 'rcx' is as the temporary register for performing
388 ;; a reference to foreign data with dynamic core. It has to be
389 ;; a register that conflicts with 'arg' lest we clobber it.
390 (emit-foreign-logbitp 23 "cpuid_fn1_ecx" rcx)
391 (inst jmp :z slow)
392 ;; Intel's implementation of POPCNT on some models treats it as
393 ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
394 ;; it falsely appears to need data from the destination register.
395 ;; The workaround is to clear the destination.
396 ;; See http://stackoverflow.com/questions/25078285
397 (unless (location= result arg)
398 ;; We only break the spurious dep. chain if result isn't the same
399 ;; register as arg. (If they're location=, don't trash the arg!)
400 (inst xor result result))
401 (inst popcnt result arg)
402 (inst jmp done))
403 slow
404 (move rdx arg)
405 (invoke-asm-routine 'call 'logcount vop rcx)
406 (move result rdx)
407 done))))
408 (def-it unsigned-byte-64-count 14 unsigned-reg unsigned-num)
409 (def-it positive-fixnum-count 13 any-reg positive-fixnum))
411 ;;; EQL for integers that are either fixnum or bignum
413 ;; The restriction on use of this assembly routine can't be expressed a
414 ;; constraints on vop args: it may be called when at *least* one arg
415 ;; is known to be an integer; the other can be anything.
417 ;; Logic: we succeed quickly in the EQ case when possible.
418 ;; Otherwise, check if both are OTHER-POINTER objects, failing if not.
419 ;; Given that at least one is an integer, and both are OTHER-POINTERs,
420 ;; then if their widetags match, both are BIGNUMs to be compared word-for-word.
422 ;; If you call this with two other-pointer objects with
423 ;; the same widetag, but not bignum-widetag, the behavior is undefined.
425 (define-assembly-routine (%eql/integer
426 (:translate %eql/integer)
427 ;; :safe would imply signaling an error
428 ;; if the args are not integer, which this doesn't.
429 (:policy :fast-safe)
430 (:conditional :e)
431 (:cost 10)
432 (:call-temps rcx))
433 ((:arg x (descriptor-reg any-reg) rdx-offset)
434 (:arg y (descriptor-reg any-reg) rdi-offset)
435 (:temp rcx unsigned-reg rcx-offset)
436 (:temp rax unsigned-reg rax-offset))
437 (inst cmp x y)
438 (inst jmp :e done) ; Z condition flag contains the answer
439 ;; check that both have other-pointer-lowtag
440 (inst lea (reg-in-size rax :dword)
441 (make-ea :dword :base x :disp (- other-pointer-lowtag)))
442 (inst lea (reg-in-size rcx :dword)
443 (make-ea :dword :base y :disp (- other-pointer-lowtag)))
444 (inst or (reg-in-size rax :dword) (reg-in-size rcx :dword))
445 (inst test (reg-in-size rax :byte) lowtag-mask)
446 (inst jmp :ne done)
447 ;; Compare the entire header word, ensuring that if at least one
448 ;; argument is a bignum, then both are.
449 (inst mov rcx (make-ea :qword :base x :disp (- other-pointer-lowtag)))
450 (inst cmp rcx (make-ea :qword :base y :disp (- other-pointer-lowtag)))
451 (inst jmp :ne done)
452 (inst shr rcx n-widetag-bits)
453 ;; can you have 0 payload words? probably not, but let's be safe here.
454 (inst jrcxz done)
455 loop
456 (inst mov rax (make-ea :qword :base x :disp (- other-pointer-lowtag)
457 :index rcx :scale 8))
458 (inst cmp rax (make-ea :qword :base y :disp (- other-pointer-lowtag)
459 :index rcx :scale 8))
460 ;; These next 3 instructions are the equivalent of "LOOPNZ LOOP"
461 ;; but had significantly better performance for me, consistent with claims
462 ;; of most optimization guides saying that LOOP was deliberately pessimized
463 ;; because of its use in timing-related code in the win32 kernel.
464 (inst jmp :ne done)
465 (inst dec rcx)
466 (inst jmp :ne loop)
467 ;; If the Z flag is set, the integers were EQL
468 done)