..
[sb-simd.git] / scratch / foo.lisp
blob0f54018e735371ecb3383b33bef5a2600fea7ece
1 (in-package :sb-vm)
;; Scratch VOP used to probe how SSE instructions get encoded.  It reads
;; 16 bytes from the data area of each argument vector with MOVUPS, adds
;; the two registers with ADDPS, and writes back into VECTOR1.  No :results
;; are declared, so the only visible effect is the mutation of VECTOR1.
;; NOTE(review): the closing parentheses of :generator and define-vop do
;; not appear in this listing -- confirm against the original file.
3 (define-vop (my-vop)
4 (:policy :fast-safe)
;; Both arguments are taken in descriptor registers; the effective
;; addresses below subtract other-pointer-lowtag from the base, so the
;; registers presumably hold tagged pointers -- TODO confirm.
6 (:args (vector1 :scs (descriptor-reg))
7 (vector2 :scs (descriptor-reg)))
8 (:arg-types simple-array-single-float simple-array-single-float)
;; INDEX is the index register of every effective address below.  No
;; :scale is supplied, so it appears to act as a plain byte offset.
10 (:temporary (:sc unsigned-reg) index)
12 ;; (:temporary (:sc unsigned-reg) temp1)
13 ;; (:temporary (:sc unsigned-reg) temp2)
15 (:temporary (:sc sse-reg) sse-temp1)
16 (:temporary (:sc sse-reg) sse-temp2)
18 (:generator 10
;; index := 0
20 (inst xor index index)
;; sse-temp1 := 16 bytes starting at VECTOR1's data + index
23 (inst movups sse-temp1
24 (make-ea :dword :base vector1 :index index
25 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
;; sse-temp2 := 16 bytes starting at VECTOR2's data + index
26 (inst movups sse-temp2
27 (make-ea :dword :base vector2 :index index
28 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag)))
;; Packed single-float add; the sum is stored from sse-temp1 below.
30 (inst addps sse-temp1 sse-temp2)
32 ;; (inst add index 4)
;; Store the sum over the 16 bytes of VECTOR1 that were just read
;; (index is still 0 here).
34 (inst movups
35 (make-ea :dword :base vector1 :index index
36 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag))
37 sse-temp1)
;; index := 4, then store the INDEX register itself into VECTOR1's data
;; at offset index.  NOTE(review): this overwrites part of the sum just
;; stored with a raw integer; looks like a debugging probe -- confirm.
39 (inst add index 4)
40 (inst mov
41 (make-ea :dword :base vector1 :index index
42 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag))
43 index)
;; index := 8, then store sse-temp2 (VECTOR2's unmodified 16 bytes) into
;; VECTOR1's data at offset index, overlapping the tail of the first store.
45 (inst add index 4)
46 (inst movups
47 (make-ea :dword :base vector1 :index index
48 :disp (- (* vector-data-offset n-word-bytes) other-pointer-lowtag))
49 sse-temp2)
54 00000000 <my_func>:
55 0: 31 c0 xor %eax,%eax 2: 0f 10 04 c6 movups (%esi,%eax,8),%xmm0
56 6: 0f 10 0c c7 movups (%edi,%eax,8),%xmm1
57 a: 0f 58 c1 addps %xmm1,%xmm0
58 d: 0f 11 44 c5 00 movups %xmm0,0x0(%ebp,%eax,8)
60 --- v2:
61 0: 31 c0 xor %eax,%eax
62 2: 0f 10 44 03 01 movups 0x1(%ebx,%eax,1),%xmm0
63 7: 0f 10 4c 01 01 movups 0x1(%ecx,%eax,1),%xmm1
64 c: 0f 58 c1 addps %xmm1,%xmm0
65 f: 0f 11 44 01 01 movups %xmm0,0x1(%ecx,%eax,1)
67 --- v3:
68 2: 0f 10 43 01 movups 0x1(%ebx),%xmm0
69 6: 0f 10 49 01 movups 0x1(%ecx),%xmm1
70 a: 0f 58 c1 addps %xmm1,%xmm0
71 d: 0f 11 41 01 movups %xmm0,0x1(%ecx)
73 --- v4:
74 2: 0f 10 44 1a 01 movups 0x1(%edx,%ebx,1),%xmm0
75 7: 0f 10 4c 0e 01 movups 0x1(%esi,%ecx,1),%xmm1
76 c: 0f 58 c1 addps %xmm1,%xmm0
77 f: 0f 11 41 01 movups %xmm0,0x1(%ecx)
79 10h = MOVUPS Vps, Wps
80 11h = MOVUPS Wps, Vps
82 V = 128bit xmm reg specified by the modrm reg field.
83 W = 128bit xmm register or mem op specified by the modrm byte.
84 ps = 128bit single-precision float operand
86 movups xmm0, [ebx + eax*1 + 01]
87 movups md reg r/m sc idx bse disp8
88 0f 10 01 000 100 00 000 011 01
89 +d8 xm0 sib *1 eax ebx +01
92 ; 43E: L4: 31C0 XOR EAX, EAX
94 7 6 5 4 3 2 1 0
95 m d r e g r / m
97 44h = b 0 1 0 0 0 1 0 0
98 4Ch = b 0 1 0 0 1 1 0 0
99 64h = b 0 1 1 0 0 1 0 0
100 E0h = b 1 1 1 0 0 0 0 0
101 C1h = b 1 1 0 0 0 0 0 1
102 43h = b 0 1 0 0 0 0 1 1
103 49h = b 0 1 0 0 1 0 0 1
105 r/m b100 => has sib byte
107 modrm md+r/m field:
108 r/m= 000, 001, 010, 011, 100, 101, 110 , 111
110 00 = ax, cx, dx, bx, sib, d32 (rip+d32 only in 64-bit mode), si, di
111 01 = --||-- + disp8 , bp+disp8, ..
112 10 = --||-- + disp32
113 11 = al/ax/eax/mmx0/xmm0, 1, 2, 3, 4, 5, 6, 7
115 modrm reg:
116 000 001 010 011 100 101 110 111
117 reg32 eax ecx edx ebx esp ebp esi edi
118 xmm xm0 xm1 xm2 xm3 xm4 xm5 xm6 xm7 ;; actually xmm0..xmm7
121 44h = md 01, r/m 100, reg 000, => xmm0, [sib + disp8] => 44 03 01 : xmm0, [ebx + eax + 01], 44 01 01 : xmm0, [ecx + eax + 01]
122 64h = md 01, r/m 100, reg 100, => xmm4, [sib + disp8] => 64 03 01 : xmm4, [ebx + eax + 01]
123 04 C6 = md 00, r/m 100, reg 000, => xmm0, [sib] => xmm0, [esi + eax*8]
124 4C 01 01 = md 01, reg 001, r/m 100 => xmm1, [sib + disp8] => xmm1, [ecx + eax + 01]
125 43h = md 01, reg 0, r/m 011 => xmm0, [ebx + 01]
126 49h = md 01, reg 1, r/m 001 => xmm1, [ecx + 01]
129 7 6 5 4 3 2 1 0
130 s c i d x b a s
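132 03h = b 0 0 0 0 0 0 1 1 = ebx + eax*1 (base ebx, index eax)
133 01h = b 0 0 0 0 0 0 0 1 = ecx + eax*1 (base ecx, index eax)
134 C6h = b 1 1 0 0 0 1 1 0 = esi + eax*8 (base esi, index eax)
135 1Ah = b 0 0 0 1 1 0 1 0 = edx + ebx*1 (base edx, index ebx)
136 0Eh = b 0 0 0 0 1 1 1 0 = esi + ecx*1 (base esi, index ecx)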
139 ;; movups xmm0, ea 0F 10 44 03 01
140 ; 440: 0F 10 44 01 01 movups xmm0, [eax + ecx + 01]
142 ;; movups xmm1, ea 0F 10 4C 01 01
143 ; 445: 0F 10 64 03 01 movups xmm4, [eax + ebx + 01]
145 ;; addps xmm0, xmm1 0F 58 C1
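146 ; 44A: 0F 58 E0 addps xmm0, xmm4 ;; as written; E0 = md 11, reg 100, r/m 000, which decodes as addps xmm4, xmm0 (the reg field is the destination)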
148 ;; movups ea, xmm0 0f 11 44 01 01
149 0F 11 44 01 01 movups [eax + ecx + 01], xmm0
151 ; 452: 83C004 ADD EAX, 4
156 c: 0f 58 c4 addps %xmm4,%xmm0
157 f: 0f 58 e0 addps %xmm0,%xmm4