db99a8a8bc356bc7ae26a2fd394dab1dd0335211
[sb-simd.git] / scratch / sse.lisp
blobdb99a8a8bc356bc7ae26a2fd394dab1dd0335211
1 ;;; From sb-devel post by Christophe Rhodes
2 (in-package :cl-user)
4 (defun %vector+ (result vector1 vector2)
5 (loop for x across vector1
6 for y across vector2
7 for i from 0
8 do (setf (aref result i) (+ x y))))
10 (in-package :sb-c)
12 ;; kludge
13 (ignore-errors (defknown cl-user::%vector+ (vector vector vector) vector))
15 (in-package :sb-vm)
17 (pushnew :sse2 *backend-subfeatures*)
19 (define-vop (vector+/simple-array-signed-byte-30)
20 (:translate cl-user::%vector+)
21 (:policy :fast)
22 (:args (result :scs (descriptor-reg))
23 (vector1 :scs (descriptor-reg))
24 (vector2 :scs (descriptor-reg)))
25 (:arg-types simple-array-signed-byte-30 simple-array-signed-byte-30 simple-array-signed-byte-30)
26 (:temporary (:sc any-reg) temp)
27 (:temporary (:sc unsigned-reg) index)
28 (:temporary (:sc unsigned-reg) length)
29 (:generator 30
30 (let ((top (gen-label))
31 (end (gen-label)))
32 (loadw length result vector-length-slot other-pointer-lowtag)
33 ;; check that the result vector doesn't have length 0
34 (inst cmp length 0)
35 (inst jmp :e end)
36 ;; zero the index
37 (inst xor index index)
38 (emit-label top)
39 ;; this is wasteful if one of the arguments is the same as the
40 ;; result; we can save a mov
41 (inst mov temp (make-ea :dword :base vector1 :index index
42 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
43 (inst add temp (make-ea :dword :base vector2 :index index
44 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
45 (inst mov (make-ea :dword :base result :index index
46 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)) temp)
47 (inst add index (fixnumize 1))
48 (inst cmp index length)
49 (inst jmp :ne top)
50 (emit-label end))))
52 (define-vop (vector+/simple-array-signed-byte-30-sse)
53 (:translate cl-user::%vector+)
54 (:policy :fast)
55 (:args (result :scs (descriptor-reg))
56 (vector1 :scs (descriptor-reg))
57 (vector2 :scs (descriptor-reg)))
58 (:arg-types simple-array-signed-byte-30 simple-array-signed-byte-30 simple-array-signed-byte-30)
59 (:temporary (:sc any-reg) temp)
60 (:temporary (:sc sse-reg) sse-temp1)
61 (:temporary (:sc sse-reg) sse-temp2)
62 (:temporary (:sc unsigned-reg) index)
63 (:temporary (:sc unsigned-reg) length)
64 (:guard (member :sse2 *backend-subfeatures*))
65 (:generator 25
66 (let ((top (gen-label))
67 (two (gen-label))
68 (four (gen-label))
69 (end (gen-label)))
70 (loadw length result vector-length-slot other-pointer-lowtag)
71 ;; check that the result vector doesn't have length 0
72 (inst cmp length 0)
73 (inst jmp :e end)
74 ;; zero the index
75 (inst xor index index)
76 (emit-label top)
77 (inst test length 1)
78 (inst jmp :z two)
79 (inst mov temp (make-ea :dword :base vector1 :index index
80 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
81 (inst add temp (make-ea :dword :base vector2 :index index
82 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
83 (inst mov (make-ea :dword :base result :index index
84 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)) temp)
85 (inst add index (fixnumize 1))
86 (inst cmp index length)
87 (inst jmp :e end)
88 (emit-label two)
89 ;; eventually at this point we put in a quadword add, but that
90 ;; would be one more instruction to write.
91 (inst test length 2)
92 (inst mov temp (make-ea :dword :base vector1 :index index
93 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
94 (inst add temp (make-ea :dword :base vector2 :index index
95 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
96 (inst mov (make-ea :dword :base result :index index
97 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)) temp)
98 (inst add index (fixnumize 1))
99 (inst cmp index length)
100 (inst jmp :e end)
101 (inst mov temp (make-ea :dword :base vector1 :index index
102 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
103 (inst add temp (make-ea :dword :base vector2 :index index
104 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
105 (inst mov (make-ea :dword :base result :index index
106 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)) temp)
107 (inst add index (fixnumize 1))
108 (inst cmp index length)
109 (inst jmp :e end)
110 (emit-label four)
111 ;; here, we do double quadword additions until we hit the end of
112 ;; the computation. No guarantees about alignment, so we have to
113 ;; use movdqu.
114 (inst movdqu sse-temp1 (make-ea :dword :base vector1 :index index
115 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
116 ;; KLUDGE: We're using :dword EAs here. This is possibly non-optimal.
117 (inst movdqu sse-temp2 (make-ea :dword :base vector2 :index index
118 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
119 (inst paddd sse-temp1 sse-temp2)
120 (inst movdqu sse-temp1 (make-ea :dword :base result :index index
121 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
122 (inst add index (fixnumize 4))
123 (inst cmp index length)
124 (inst jmp :ne four)
125 (emit-label end))))