1 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
2 ; RUN: llc -mtriple=arm-eabi -mattr=-neon -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NONEON
; Factor-2 interleaved load: one <16 x i8> load is split by two shufflevectors
; into its even-indexed lanes (0, 2, ..., 14) and odd-indexed lanes
; (1, 3, ..., 15), and the two halves are added. The +neon run expects a
; single vld2.8 from [r0].
4 ; NEON-LABEL: load_factor2:
5 ; NEON: vld2.8 {d16, d17}, [r0]
6 ; NONEON-LABEL: load_factor2:
8 define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
9 %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
10 %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
11 %strided.v1 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
12 %add = add nsw <8 x i8> %strided.v0, %strided.v1
; Factor-3 interleaved load: %ptr is reinterpreted as a <6 x i32> pointer, the
; wide vector is loaded, and only two of the three stride-3 sub-vectors are
; extracted (lanes 2, 5 and lanes 1, 4) before being added. The +neon run
; expects a single vld3.32 from [r0].
16 ; NEON-LABEL: load_factor3:
17 ; NEON: vld3.32 {d16, d17, d18}, [r0]
18 ; NONEON-LABEL: load_factor3:
20 define <2 x i32> @load_factor3(i32* %ptr) {
21 %base = bitcast i32* %ptr to <6 x i32>*
22 %wide.vec = load <6 x i32>, <6 x i32>* %base, align 4
23 %strided.v2 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
24 %strided.v1 = shufflevector <6 x i32> %wide.vec, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
25 %add = add nsw <2 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load: a <16 x i32> vector is loaded through a bitcast
; of %ptr and two of the four stride-4 sub-vectors are extracted (lanes
; 0, 4, 8, 12 and lanes 2, 6, 10, 14) and added. The +neon run expects two
; vld4.32 instructions, the first using the `[r0]!` addressing form.
29 ; NEON-LABEL: load_factor4:
30 ; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
31 ; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
32 ; NONEON-LABEL: load_factor4:
34 define <4 x i32> @load_factor4(i32* %ptr) {
35 %base = bitcast i32* %ptr to <16 x i32>*
36 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
37 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
38 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
39 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store: %v0 and %v1 are interleaved element by element
; (mask 0, 8, 1, 9, ...; indices 8-15 select from %v1) into one <16 x i8>
; vector that is stored to %ptr. The +neon run expects a single vst2.8.
43 ; NEON-LABEL: store_factor2:
44 ; NEON: vst2.8 {d16, d17}, [r0]
45 ; NONEON-LABEL: store_factor2:
47 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
48 %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
49 store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
; Factor-3 interleaved store: %v0 and %v1 are concatenated into %v0_v1, %v2 is
; widened to 8 lanes with undef padding, and the final shuffle interleaves the
; three 4-element inputs (mask 0, 4, 8, 1, 5, 9, ...) into a <12 x i32> that
; is stored through a bitcast of %ptr. The +neon run expects two vst3.32
; instructions, the first using the `[r0]!` addressing form.
53 ; NEON-LABEL: store_factor3:
54 ; NEON: vst3.32 {d16, d18, d20}, [r0]!
55 ; NEON: vst3.32 {d17, d19, d21}, [r0]
56 ; NONEON-LABEL: store_factor3:
58 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
59 %base = bitcast i32* %ptr to <12 x i32>*
60 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
61 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
62 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
63 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store: %v0/%v1 and %v2/%v3 are concatenated pairwise,
; then interleaved with mask 0, 4, 8, 12, 1, 5, 9, 13, ... into a <16 x i32>
; that is stored through a bitcast of %ptr. The +neon run expects two vst4.32
; instructions, the first using the `[r0]!` addressing form.
67 ; NEON-LABEL: store_factor4:
68 ; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
69 ; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
70 ; NONEON-LABEL: store_factor4:
72 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
73 %base = bitcast i32* %ptr to <16 x i32>*
74 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
75 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
76 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
77 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
81 ; The following cases test that interleaved accesses of pointer vectors can be
82 ; matched to ldN/stN instructions.
; Factor-2 interleaved load of a pointer vector: a <4 x i32*> is loaded
; through a bitcast of %ptr and only the even lanes (0, 2) are extracted and
; returned. The +neon run expects a single vld2.32.
84 ; NEON-LABEL: load_ptrvec_factor2:
85 ; NEON: vld2.32 {d16, d17}, [r0]
86 ; NONEON-LABEL: load_ptrvec_factor2:
88 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
89 %base = bitcast i32** %ptr to <4 x i32*>*
90 %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
91 %strided.v0 = shufflevector <4 x i32*> %wide.vec, <4 x i32*> undef, <2 x i32> <i32 0, i32 2>
92 ret <2 x i32*> %strided.v0
; Factor-3 interleaved load of a pointer vector: a <6 x i32*> is loaded and
; the sub-vectors at lanes 2, 5 and lanes 1, 4 are extracted and stored to
; %ptr1 and %ptr2 respectively. The +neon run expects a single vld3.32.
95 ; NEON-LABEL: load_ptrvec_factor3:
96 ; NEON: vld3.32 {d16, d17, d18}, [r0]
97 ; NONEON-LABEL: load_ptrvec_factor3:
99 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
100 %base = bitcast i32** %ptr to <6 x i32*>*
101 %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
102 %strided.v2 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 2, i32 5>
103 store <2 x i32*> %strided.v2, <2 x i32*>* %ptr1
104 %strided.v1 = shufflevector <6 x i32*> %wide.vec, <6 x i32*> undef, <2 x i32> <i32 1, i32 4>
105 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr2
; Factor-4 interleaved load of a pointer vector: an <8 x i32*> is loaded and
; the sub-vectors at lanes 1, 5 and lanes 3, 7 are extracted and stored to
; %ptr1 and %ptr2 respectively. The +neon run expects a single vld4.32.
109 ; NEON-LABEL: load_ptrvec_factor4:
110 ; NEON: vld4.32 {d16, d17, d18, d19}, [r0]
111 ; NONEON-LABEL: load_ptrvec_factor4:
113 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
114 %base = bitcast i32** %ptr to <8 x i32*>*
115 %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
116 %strided.v1 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 1, i32 5>
117 %strided.v3 = shufflevector <8 x i32*> %wide.vec, <8 x i32*> undef, <2 x i32> <i32 3, i32 7>
118 store <2 x i32*> %strided.v1, <2 x i32*>* %ptr1
119 store <2 x i32*> %strided.v3, <2 x i32*>* %ptr2
; Factor-2 interleaved store of pointer vectors: %v0 and %v1 are interleaved
; with mask 0, 2, 1, 3 into a <4 x i32*> that is stored through a bitcast of
; %ptr. The +neon run expects a single vst2.32.
123 ; NEON-LABEL: store_ptrvec_factor2:
124 ; NEON: vst2.32 {d16, d17}, [r0]
125 ; NONEON-LABEL: store_ptrvec_factor2:
127 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
128 %base = bitcast i32** %ptr to <4 x i32*>*
129 %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
130 store <4 x i32*> %interleaved.vec, <4 x i32*>* %base, align 4
; Factor-3 interleaved store of pointer vectors: %v0 and %v1 are concatenated,
; %v2 is widened to 4 lanes with undef padding, and the final shuffle
; interleaves them with mask 0, 2, 4, 1, 3, 5 into a <6 x i32*> that is stored
; through a bitcast of %ptr. The +neon run expects a single vst3.32.
134 ; NEON-LABEL: store_ptrvec_factor3:
135 ; NEON: vst3.32 {d16, d17, d18}, [r0]
136 ; NONEON-LABEL: store_ptrvec_factor3:
138 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
139 %base = bitcast i32** %ptr to <6 x i32*>*
140 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
141 %v2_u = shufflevector <2 x i32*> %v2, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
142 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_u, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
143 store <6 x i32*> %interleaved.vec, <6 x i32*>* %base, align 4
; Factor-4 interleaved store of pointer vectors: %v0/%v1 and %v2/%v3 are
; concatenated pairwise and interleaved with mask 0, 2, 4, 6, 1, 3, 5, 7 into
; an <8 x i32*> that is stored through a bitcast of %ptr. Note that %ptr is
; declared as i32* here (the other pointer-vector tests take i32**); the
; bitcast makes the types line up. The +neon run expects a single vst4.32.
147 ; NEON-LABEL: store_ptrvec_factor4:
148 ; NEON: vst4.32 {d16, d17, d18, d19}, [r0]
149 ; NONEON-LABEL: store_ptrvec_factor4:
151 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
152 %base = bitcast i32* %ptr to <8 x i32*>*
153 %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
154 %v2_v3 = shufflevector <2 x i32*> %v2, <2 x i32*> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
155 %interleaved.vec = shufflevector <4 x i32*> %v0_v1, <4 x i32*> %v2_v3, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
156 store <8 x i32*> %interleaved.vec, <8 x i32*>* %base, align 4
160 ; The following cases check that shuffle masks with undef indices can be
161 ; matched to ldN/stN instructions.
; Factor-2 interleaved load whose shuffle masks contain undef lanes: the
; defined indices (2, 6 and 3, 7) sit at the positions they would occupy in
; the full stride-2 masks 0, 2, 4, 6 and 1, 3, 5, 7. The +neon run still
; expects a single vld2.32.
163 ; NEON-LABEL: load_undef_mask_factor2:
164 ; NEON: vld2.32 {d16, d17, d18, d19}, [r0]
165 ; NONEON-LABEL: load_undef_mask_factor2:
167 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
168 %base = bitcast i32* %ptr to <8 x i32>*
169 %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
170 %strided.v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
171 %strided.v1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 7>
172 %add = add nsw <4 x i32> %strided.v0, %strided.v1
; Factor-3 interleaved load with a mostly-undef mask: %strided.v2 defines only
; its first lane (index 2), while %strided.v1 uses the full stride-3 mask
; 1, 4, 7, 10. The +neon run expects two vld3.32 instructions, the first using
; the `[r0]!` addressing form.
176 ; NEON-LABEL: load_undef_mask_factor3:
177 ; NEON: vld3.32 {d16, d18, d20}, [r0]!
178 ; NEON: vld3.32 {d17, d19, d21}, [r0]
179 ; NONEON-LABEL: load_undef_mask_factor3:
181 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
182 %base = bitcast i32* %ptr to <12 x i32>*
183 %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
184 %strided.v2 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
185 %strided.v1 = shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
186 %add = add nsw <4 x i32> %strided.v2, %strided.v1
; Factor-4 interleaved load with undef tail lanes: both shuffles define only
; their first two lanes (0, 4 and 2, 6) and leave the last two undef. The
; +neon run expects two vld4.32 instructions, the first using the `[r0]!`
; addressing form.
190 ; NEON-LABEL: load_undef_mask_factor4:
191 ; NEON: vld4.32 {d16, d18, d20, d22}, [r0]!
192 ; NEON: vld4.32 {d17, d19, d21, d23}, [r0]
193 ; NONEON-LABEL: load_undef_mask_factor4:
195 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
196 %base = bitcast i32* %ptr to <16 x i32>*
197 %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
198 %strided.v0 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
199 %strided.v2 = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 undef, i32 undef>
200 %add = add nsw <4 x i32> %strided.v0, %strided.v2
; Factor-2 interleaved store whose mask has its first four lanes undef; the
; remaining lanes (2, 6, 3, 7) match the tail of the full interleave mask
; 0, 4, 1, 5, 2, 6, 3, 7. The +neon run still expects a single vst2.32.
204 ; NEON-LABEL: store_undef_mask_factor2:
205 ; NEON: vst2.32 {d16, d17, d18, d19}, [r0]
206 ; NONEON-LABEL: store_undef_mask_factor2:
208 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
209 %base = bitcast i32* %ptr to <8 x i32>*
210 %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
211 store <8 x i32> %interleaved.vec, <8 x i32>* %base, align 4
; Factor-3 interleaved store with undef lanes in the interleave mask: it is
; the same as the full mask 0, 4, 8, 1, 5, 9, ... except that positions 2 and
; 4 (which would be 8 and 5) are undef. The +neon run expects two vst3.32
; instructions, the first using the `[r0]!` addressing form.
215 ; NEON-LABEL: store_undef_mask_factor3:
216 ; NEON: vst3.32 {d16, d18, d20}, [r0]!
217 ; NEON: vst3.32 {d17, d19, d21}, [r0]
218 ; NONEON-LABEL: store_undef_mask_factor3:
220 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
221 %base = bitcast i32* %ptr to <12 x i32>*
222 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
223 %v2_u = shufflevector <4 x i32> %v2, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
224 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_u, <12 x i32> <i32 0, i32 4, i32 undef, i32 1, i32 undef, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
225 store <12 x i32> %interleaved.vec, <12 x i32>* %base, align 4
; Factor-4 interleaved store with undef lanes in the interleave mask: it is
; the same as the full mask 0, 4, 8, 12, 1, 5, 9, 13, ... except that
; positions 3 and 4 (which would be 12 and 1) are undef. The +neon run expects
; two vst4.32 instructions, the first using the `[r0]!` addressing form.
229 ; NEON-LABEL: store_undef_mask_factor4:
230 ; NEON: vst4.32 {d16, d18, d20, d22}, [r0]!
231 ; NEON: vst4.32 {d17, d19, d21, d23}, [r0]
232 ; NONEON-LABEL: store_undef_mask_factor4:
234 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
235 %base = bitcast i32* %ptr to <16 x i32>*
236 %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
237 %v2_v3 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
238 %interleaved.vec = shufflevector <8 x i32> %v0_v1, <8 x i32> %v2_v3, <16 x i32> <i32 0, i32 4, i32 8, i32 undef, i32 undef, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
239 store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
243 ; The following test cases check that address spaces are properly handled
; Load through a non-default address space: a <4 x i32> is loaded from an
; addrspace(1) pointer, lanes 0 and 3 are extracted, and the result is stored
; to %B. The -LABEL patterns end in ':' like every other test in this file, so
; they anchor on the symbol definition rather than on an earlier directive
; that merely mentions the symbol name.
245 ; NEON-LABEL: load_address_space:
247 ; NONEON-LABEL: load_address_space:
249 define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
250 %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
251 %interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
252 store <2 x i32> %interleaved, <2 x i32>* %B
; Store through a non-default address space: two <2 x i32> vectors are loaded
; from %A and %B, interleaved with mask 0, 2, 1, 3, and stored as a <4 x i32>
; to the addrspace(1) pointer %C. The -LABEL patterns end in ':' like every
; other test in this file, so they anchor on the symbol definition rather than
; on an earlier directive that merely mentions the symbol name.
256 ; NEON-LABEL: store_address_space:
258 ; NONEON-LABEL: store_address_space:
260 define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
261 %tmp0 = load <2 x i32>, <2 x i32>* %A
262 %tmp1 = load <2 x i32>, <2 x i32>* %B
263 %interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
264 store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
268 ; Check that we do something sane with illegal types.
; Stride-2-looking shuffle (mask 0, 2, undef) applied to a loaded <3 x float>.
; Neither run expects a vldN here: with +neon the checked sequence is a
; vld1.64 followed by vuzp.32 and vmov, and without it two scalar ldr
; instructions plus a mov. The -NEXT checks pin the exact instruction order.
270 ; NEON-LABEL: load_illegal_factor2:
272 ; NEON-NEXT: vld1.64 {d16, d17}, [r0:128]
273 ; NEON-NEXT: vuzp.32 q8, {{.*}}
274 ; NEON-NEXT: vmov r0, r1, d16
275 ; NEON-NEXT: vmov r2, r3, {{.*}}
276 ; NEON-NEXT: mov pc, lr
277 ; NONEON-LABEL: load_illegal_factor2:
279 ; NONEON-NEXT: ldr [[ELT0:r[0-9]+]], [r0]
280 ; NONEON-NEXT: ldr r1, [r0, #8]
281 ; NONEON-NEXT: mov r0, [[ELT0]]
282 ; NONEON-NEXT: mov pc, lr
283 define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
284 %tmp1 = load <3 x float>, <3 x float>* %p, align 16
285 %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
286 ret <3 x float> %tmp2
289 ; This lowering isn't great, but it's at least correct.
; Store counterpart of the <3 x float> case: %v is shuffled with mask
; 0, 2, undef and the result is stored to %p. Neither run expects a vstN: with
; +neon the checked sequence is vldr/vmov/vuzp.32/vstr, and without it a
; single stm of r1 and r3. The -NEXT checks pin the exact instruction order.
291 ; NEON-LABEL: store_illegal_factor2:
293 ; NEON-NEXT: vldr d17, [sp]
294 ; NEON-NEXT: vmov d16, r2, r3
295 ; NEON-NEXT: vuzp.32 q8, {{.*}}
296 ; NEON-NEXT: vstr d16, [r0]
297 ; NEON-NEXT: mov pc, lr
298 ; NONEON-LABEL: store_illegal_factor2:
300 ; NONEON-NEXT: stm r0, {r1, r3}
301 ; NONEON-NEXT: mov pc, lr
302 define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
303 %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
304 store <3 x float> %tmp1, <3 x float>* %p, align 16
308 ; NEON-LABEL: load_factor2_with_extract_user:
309 ; NEON: vld2.32 {d16, d17, d18, d19}, [r0:64]
310 ; NEON: vmov.32 r0, d16[1]
311 ; NONEON-LABEL: load_factor2_with_extract_user:
313 define i32 @load_factor2_with_extract_user(<8 x i32>* %a) {
314 %1 = load <8 x i32>, <8 x i32>* %a, align 8
315 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
316 %3 = extractelement <8 x i32> %1, i32 2