Implement suggestions of Paul Khuong re. POPCNT
[sbcl.git] / src / assembly / x86-64 / arith.lisp
blob5b3028f9c88effe2e3136e9c057b270f8492873b
1 ;;;; simple cases for generic arithmetic
3 ;;;; This software is part of the SBCL system. See the README file for
4 ;;;; more information.
5 ;;;;
6 ;;;; This software is derived from the CMU CL system, which was
7 ;;;; written at Carnegie Mellon University and released into the
8 ;;;; public domain. The software is in the public domain and is
9 ;;;; provided with absolutely no warranty. See the COPYING and CREDITS
10 ;;;; files for more information.
12 (in-package "SB!VM")
14 (defun !both-fixnum-p (temp x y)
15 (inst mov (reg-in-size temp :dword)
16 (reg-in-size x :dword))
17 (inst or (reg-in-size temp :dword)
18 (reg-in-size y :dword))
19 (inst test (reg-in-size temp :byte)
20 fixnum-tag-mask))
22 (defun !some-fixnum-p (temp x y)
23 (inst mov (reg-in-size temp :dword)
24 (reg-in-size x :dword))
25 (inst and (reg-in-size temp :dword)
26 (reg-in-size y :dword))
27 (inst test (reg-in-size temp :byte)
28 fixnum-tag-mask))
31 ;;;; addition, subtraction, and multiplication
33 (macrolet ((define-generic-arith-routine ((fun cost) &body body)
34 `(define-assembly-routine (,(symbolicate "GENERIC-" fun)
35 (:cost ,cost)
36 (:return-style :full-call)
37 (:translate ,fun)
38 (:policy :safe)
39 (:save-p t))
40 ((:arg x (descriptor-reg any-reg) rdx-offset)
41 (:arg y (descriptor-reg any-reg) rdi-offset)
43 (:res res (descriptor-reg any-reg) rdx-offset)
45 (:temp rax unsigned-reg rax-offset)
46 (:temp rcx unsigned-reg rcx-offset))
47 (!both-fixnum-p rax x y)
48 (inst jmp :nz DO-STATIC-FUN) ; no - do generic
50 ,@body
51 (inst clc) ; single-value return
52 (inst ret)
54 DO-STATIC-FUN
55 ;; Same as: (inst enter (* n-word-bytes 1))
56 (inst push rbp-tn)
57 (inst mov rbp-tn rsp-tn)
58 (inst sub rsp-tn (* n-word-bytes 1))
59 (inst push (make-ea :qword :base rbp-tn
60 :disp (frame-byte-offset return-pc-save-offset)))
61 (inst mov rcx (fixnumize 2)) ; arg count
62 (inst jmp
63 (make-ea :qword
64 :disp (+ nil-value
65 (static-fun-offset
66 ',(symbolicate "TWO-ARG-" fun))))))))
68 #.`
69 (define-generic-arith-routine (+ 10)
70 (move res x)
71 (inst add res y)
72 (inst jmp :no OKAY)
73 ;; Unbox the overflowed result, recovering the correct sign from
74 ;; the carry flag, then re-box as a bignum.
75 (inst rcr res 1)
76 ,@(when (> n-fixnum-tag-bits 1) ; don't shift by 0
77 '((inst sar res (1- n-fixnum-tag-bits))))
79 (move rcx res)
81 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
82 (storew rcx res bignum-digits-offset other-pointer-lowtag))
84 OKAY)
86 #.`
87 (define-generic-arith-routine (- 10)
88 (move res x)
89 (inst sub res y)
90 (inst jmp :no OKAY)
91 ;; Unbox the overflowed result, recovering the correct sign from
92 ;; the carry flag, then re-box as a bignum.
93 (inst cmc) ; carry has correct sign now
94 (inst rcr res 1)
95 ,@(when (> n-fixnum-tag-bits 1) ; don't shift by 0
96 '((inst sar res (1- n-fixnum-tag-bits))))
98 (move rcx res)
100 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
101 (storew rcx res bignum-digits-offset other-pointer-lowtag))
102 OKAY)
104 (define-generic-arith-routine (* 30)
105 (move rax x) ; must use eax for 64-bit result
106 (inst sar rax n-fixnum-tag-bits) ; remove *8 fixnum bias
107 (inst imul y) ; result in edx:eax
108 (inst jmp :no OKAY) ; still fixnum
110 (inst shrd rax x n-fixnum-tag-bits) ; high bits from edx
111 (inst sar x n-fixnum-tag-bits) ; now shift edx too
113 (move rcx x) ; save high bits from cqo
114 (inst cqo) ; edx:eax <- sign-extend of eax
115 (inst cmp x rcx)
116 (inst jmp :e SINGLE-WORD-BIGNUM)
118 (with-fixed-allocation (res bignum-widetag (+ bignum-digits-offset 2))
119 (storew rax res bignum-digits-offset other-pointer-lowtag)
120 (storew rcx res (1+ bignum-digits-offset) other-pointer-lowtag))
121 (inst jmp DONE)
123 SINGLE-WORD-BIGNUM
125 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
126 (storew rax res bignum-digits-offset other-pointer-lowtag))
127 (inst jmp DONE)
129 OKAY
130 (move res rax)
131 DONE))
133 ;;;; negation
135 (define-assembly-routine (generic-negate
136 (:cost 10)
137 (:return-style :full-call)
138 (:policy :safe)
139 (:translate %negate)
140 (:save-p t))
141 ((:arg x (descriptor-reg any-reg) rdx-offset)
142 (:res res (descriptor-reg any-reg) rdx-offset)
144 (:temp rax unsigned-reg rax-offset)
145 (:temp rcx unsigned-reg rcx-offset))
146 (inst test (reg-in-size x :byte) fixnum-tag-mask)
147 (inst jmp :z FIXNUM)
149 (inst push rbp-tn)
150 (inst mov rbp-tn rsp-tn)
151 (inst sub rsp-tn n-word-bytes)
152 (inst push (make-ea :qword :base rbp-tn
153 :disp (frame-byte-offset return-pc-save-offset)))
154 (inst mov rcx (fixnumize 1)) ; arg count
155 (inst jmp (make-ea :qword
156 :disp (+ nil-value (static-fun-offset '%negate))))
158 FIXNUM
159 (move res x)
160 (inst neg res) ; (- most-negative-fixnum) is BIGNUM
161 (inst jmp :no OKAY)
162 (inst shr res n-fixnum-tag-bits) ; sign bit is data - remove type bits
163 (move rcx res)
165 (with-fixed-allocation (res bignum-widetag (1+ bignum-digits-offset))
166 (storew rcx res bignum-digits-offset other-pointer-lowtag))
168 OKAY)
170 ;;;; comparison
172 (macrolet ((define-cond-assem-rtn (name translate static-fn test)
173 (declare (ignorable translate static-fn))
174 #+sb-assembling
175 `(define-assembly-routine (,name
176 (:return-style :none))
177 ((:arg x (descriptor-reg any-reg) rdx-offset)
178 (:arg y (descriptor-reg any-reg) rdi-offset)
180 (:temp rcx unsigned-reg rcx-offset))
182 (!both-fixnum-p rcx x y)
183 (inst jmp :nz DO-STATIC-FUN)
185 (inst cmp x y)
186 (inst ret)
188 DO-STATIC-FUN
189 (inst push rbp-tn)
190 (inst mov rbp-tn rsp-tn)
191 (inst sub rsp-tn (* n-word-bytes 3))
192 (inst mov (make-ea :qword :base rsp-tn
193 :disp (frame-byte-offset
194 (+ sp->fp-offset
196 ocfp-save-offset)))
197 rbp-tn)
198 (inst lea rbp-tn (make-ea :qword :base rsp-tn
199 :disp (frame-byte-offset
200 (+ sp->fp-offset
202 ocfp-save-offset))))
203 (inst mov rcx (fixnumize 2))
204 (inst call (make-ea :qword
205 :disp (+ nil-value
206 (static-fun-offset ',static-fn))))
207 ;; HACK: We depend on NIL having the lowest address of all
208 ;; static symbols (including T)
209 ,@(ecase test
210 (:l `((inst mov y (1+ nil-value))
211 (inst cmp y x)))
212 (:g `((inst cmp x (1+ nil-value)))))
213 (inst pop rbp-tn)
214 (inst ret))
215 #-sb-assembling
216 `(define-vop (,name)
217 (:translate ,translate)
218 (:policy :safe)
219 (:save-p t)
220 (:args (x :scs (descriptor-reg any-reg) :target rdx)
221 (y :scs (descriptor-reg any-reg) :target rdi))
223 (:temporary (:sc unsigned-reg :offset rdx-offset
224 :from (:argument 0))
225 rdx)
226 (:temporary (:sc unsigned-reg :offset rdi-offset
227 :from (:argument 1))
228 rdi)
230 (:temporary (:sc unsigned-reg :offset rcx-offset
231 :from :eval)
232 rcx)
233 (:conditional ,test)
234 (:generator 10
235 (move rdx x)
236 (move rdi y)
237 (inst mov rcx (make-fixup ',name :assembly-routine))
238 (inst call rcx)))))
240 (define-cond-assem-rtn generic-< < two-arg-< :l)
241 (define-cond-assem-rtn generic-> > two-arg-> :g))
243 #+sb-assembling
244 (define-assembly-routine (generic-eql
245 (:return-style :none))
246 ((:arg x (descriptor-reg any-reg) rdx-offset)
247 (:arg y (descriptor-reg any-reg) rdi-offset)
249 (:temp rcx unsigned-reg rcx-offset))
251 (!some-fixnum-p rcx x y)
252 (inst jmp :nz DO-STATIC-FUN)
254 ;; At least one fixnum
255 (inst cmp x y)
256 (inst ret)
258 DO-STATIC-FUN
259 (inst push rbp-tn)
260 (inst mov rbp-tn rsp-tn)
261 (inst sub rsp-tn (* n-word-bytes 3))
262 (inst mov (make-ea :qword :base rsp-tn
263 :disp (frame-byte-offset
264 (+ sp->fp-offset
266 ocfp-save-offset)))
267 rbp-tn)
268 (inst lea rbp-tn (make-ea :qword :base rsp-tn
269 :disp (frame-byte-offset
270 (+ sp->fp-offset
272 ocfp-save-offset))))
273 (inst mov rcx (fixnumize 2))
274 (inst call (make-ea :qword
275 :disp (+ nil-value (static-fun-offset 'eql))))
276 (inst cmp x (+ nil-value (static-symbol-offset t)))
277 (inst pop rbp-tn)
278 (inst ret))
280 #-sb-assembling
281 (define-vop (generic-eql)
282 (:translate eql)
283 (:policy :safe)
284 (:save-p t)
285 (:args (x :scs (descriptor-reg any-reg) :target rdx)
286 (y :scs (descriptor-reg any-reg) :target rdi))
288 (:temporary (:sc unsigned-reg :offset rdx-offset
289 :from (:argument 0))
290 rdx)
291 (:temporary (:sc unsigned-reg :offset rdi-offset
292 :from (:argument 1))
293 rdi)
295 (:temporary (:sc unsigned-reg :offset rcx-offset
296 :from :eval)
297 rcx)
298 (:conditional :e)
299 (:generator 10
300 (move rdx x)
301 (move rdi y)
302 (inst mov rcx (make-fixup 'generic-eql :assembly-routine))
303 (inst call rcx)))
305 #+sb-assembling
306 (define-assembly-routine (generic-=
307 (:return-style :none))
308 ((:arg x (descriptor-reg any-reg) rdx-offset)
309 (:arg y (descriptor-reg any-reg) rdi-offset)
311 (:temp rcx unsigned-reg rcx-offset))
312 (!both-fixnum-p rcx x y)
313 (inst jmp :nz DO-STATIC-FUN)
315 ;; Both fixnums
316 (inst cmp x y)
317 (inst ret)
319 DO-STATIC-FUN
320 (inst push rbp-tn)
321 (inst mov rbp-tn rsp-tn)
322 (inst sub rsp-tn (* n-word-bytes 3))
323 (inst mov (make-ea :qword :base rsp-tn
324 :disp (frame-byte-offset
325 (+ sp->fp-offset
327 ocfp-save-offset)))
328 rbp-tn)
329 (inst lea rbp-tn (make-ea :qword :base rsp-tn
330 :disp (frame-byte-offset
331 (+ sp->fp-offset
333 ocfp-save-offset))))
335 (inst mov rcx (fixnumize 2))
336 (inst call (make-ea :qword
337 :disp (+ nil-value (static-fun-offset 'two-arg-=))))
338 (inst cmp x (+ nil-value (static-symbol-offset t)))
339 (inst pop rbp-tn)
340 (inst ret))
342 #-sb-assembling
343 (define-vop (generic-=)
344 (:translate =)
345 (:policy :safe)
346 (:save-p t)
347 (:args (x :scs (descriptor-reg any-reg) :target rdx)
348 (y :scs (descriptor-reg any-reg) :target rdi))
350 (:temporary (:sc unsigned-reg :offset rdx-offset
351 :from (:argument 0))
352 rdx)
353 (:temporary (:sc unsigned-reg :offset rdi-offset
354 :from (:argument 1))
355 rdi)
357 (:temporary (:sc unsigned-reg :offset rcx-offset
358 :from :eval)
359 rcx)
360 (:conditional :e)
361 (:generator 10
362 (move rdx x)
363 (move rdi y)
364 (inst mov rcx (make-fixup 'generic-= :assembly-routine))
365 (inst call rcx)))
367 #+sb-assembling
368 (define-assembly-routine (logcount)
369 ((:arg arg (descriptor-reg any-reg) rdx-offset)
370 (:temp mask unsigned-reg rcx-offset)
371 (:temp temp unsigned-reg rax-offset))
372 (inst push temp) ; save RAX
373 (let ((result arg))
374 ;; See the comments below for how the algorithm works. The tricks
375 ;; used can be found for example in AMD's software optimization
376 ;; guide or at "http://www.hackersdelight.org/HDcode/pop.cc" in the
377 ;; function "pop1", for 32-bit words. The extension to 64 bits is
378 ;; straightforward.
379 ;; Calculate 2-bit sums. Note that the value of a two-digit binary
380 ;; number is the sum of the right digit and twice the left digit.
381 ;; Thus we can calculate the sum of the two digits by shifting the
382 ;; left digit to the right position and doing a two-bit subtraction.
383 ;; This subtraction will never create a borrow and thus can be made
384 ;; on all 32 2-digit numbers at once.
385 (move temp arg)
386 (inst shr result 1)
387 (inst mov mask #x5555555555555555)
388 (inst and result mask)
389 (inst sub temp result)
390 ;; Calculate 4-bit sums by straightforward shift, mask and add.
391 ;; Note that we shift the source operand of the MOV and not its
392 ;; destination so that the SHR and the MOV can execute in the same
393 ;; clock cycle.
394 (inst mov result temp)
395 (inst shr temp 2)
396 (inst mov mask #x3333333333333333)
397 (inst and result mask)
398 (inst and temp mask)
399 (inst add result temp)
400 ;; Calculate 8-bit sums. Since each sum is at most 8, which fits
401 ;; into 4 bits, we can apply the mask after the addition, saving one
402 ;; instruction.
403 (inst mov temp result)
404 (inst shr result 4)
405 (inst add result temp)
406 (inst mov mask #x0f0f0f0f0f0f0f0f)
407 (inst and result mask)
408 ;; Add all 8 bytes at once by multiplying with #256r11111111.
409 ;; We need to calculate only the lower 8 bytes of the product.
410 ;; Of these the most significant byte contains the final result.
411 ;; Note that there can be no overflow from one byte to the next
412 ;; as the sum is at most 64 which needs only 7 bits.
413 (inst mov mask #x0101010101010101)
414 (inst imul result mask)
415 (inst shr result 56))
416 (inst pop temp)) ; restore RAX
418 ;; To perform logcount on small integers, we test whether to use the
419 ;; builtin opcode, or an assembly routine. I benchmarked this against
420 ;; an approach that always used the assembly routine via "call [addr]"
421 ;; where the contents of the address reflected one implementation
422 ;; or the other, chosen at startup - and this is faster.
423 #-sb-assembling
424 (macrolet
425 ((def-it (name cost arg-sc arg-type)
426 `(define-vop (,name)
427 (:translate logcount)
428 (:note ,(format nil "inline ~a logcount" arg-type))
429 (:policy :fast-safe)
430 (:args (arg :scs (,arg-sc)))
431 (:arg-types ,arg-type)
432 (:results (result :scs (unsigned-reg)))
433 (:result-types positive-fixnum)
434 ;; input/output of assembly routine
435 (:temporary (:sc unsigned-reg :offset rdx-offset
436 :from (:argument 0) :to (:result 0)) rdx)
437 ;; Assembly routine clobbers RAX and RCX but only needs to save RAX,
438 ;; as this vop clobbers RCX in the call. If changed to "CALL [ADDR]"
439 ;; be sure to update the subroutine to push and pop RCX.
440 (:temporary (:sc unsigned-reg :offset rcx-offset) rcx)
441 (:generator ,cost
442 ;; FIXME: As I've got no way to set the cpuid feature bits,
443 ;; don't try testing here whether to use popcnt, until it has been
444 ;; verified that the __cpuid() intrinsic works for win32.
445 #!-win32
446 (progn
447 ;; POPCNT = ECX bit 23 = bit 7 of byte index 2
448 (inst test
449 (make-ea :byte :disp (make-fixup "cpuid_fn1_ecx" :foreign 2))
450 #x80)
451 (inst jmp :z slow)
452 ;; Intel's implementation of POPCNT on some models treats it as
453 ;; a 2-operand ALU op in the manner of ADD,SUB,etc which means that
454 ;; it falsely appears to need data from the destination register.
455 ;; The workaround is to clear the destination.
456 ;; See http://stackoverflow.com/questions/25078285
457 (unless (location= result arg)
458 ;; We only break the spurious dep. chain if result isn't the same
459 ;; register as arg. (If they're location=, don't trash the arg!)
460 (inst xor result result))
461 (inst popcnt result arg)
462 (inst jmp done))
463 slow
464 (move rdx arg)
465 (inst mov rcx (make-fixup 'logcount :assembly-routine))
466 (inst call rcx)
467 (move result rdx)
468 done))))
469 (def-it unsigned-byte-64-count 14 unsigned-reg unsigned-num)
470 (def-it positive-fixnum-count 13 any-reg positive-fixnum))