Contribs: fix dav1d issue on Android/x86
contrib/src/dav1d/0001-SSE2-PIC-464ca6c2.patch
From 464ca6c2f37b93180cc27ea41889ffaf1eab388e Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Thu, 25 Jun 2020 01:27:28 +0200
Subject: [PATCH] x86: Fix 32-bit build with PIC enabled

---
 src/x86/mc_sse.asm | 147 +++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 91 deletions(-)

diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
index d98ac621..5d5c5e3f 100644
--- a/src/x86/mc_sse.asm
+++ b/src/x86/mc_sse.asm
@@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
%if ARCH_X86_64
mova m8, [pw_8]
%else
- %define m8 [pw_8]
+ %define m8 [t1-prep_sse2+pw_8]
%endif
pxor m7, m7
%endif
@@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
pshuflw m6, m6, q0000
%if cpuflag(ssse3)
punpcklqdq m6, m6
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
psrlw m0, m8, 3
punpcklwd m6, m0
- %else
+%else
punpcklwd m6, [base+pw_1]
- %endif
%endif
%if ARCH_X86_32
mov t1, t2 ; save base reg for w4
@@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
PUSH r7
%endif
mov r7, tmpq
+ mov r5, srcq
%endif
- mov t1, srcq
.hv_w16_hloop:
movu m0, [srcq+strideq*0+8*0]
movu m1, [srcq+strideq*0+8*1]
@@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
sub hd, 2
jg .hv_w16_vloop
movzx hd, t2w
- add t1, 16
- mov srcq, t1
%if ARCH_X86_64
+ add r5, 16
add r7, 2*16
+ mov srcq, r5
mov tmpq, r7
%else
+ mov srcq, srcmp
mov tmpq, tmpmp
+ add srcq, 16
add tmpq, 2*16
+ mov srcmp, srcq
mov tmpmp, tmpq
%endif
sub t2d, 1<<16
@@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
%if cpuflag(ssse3)
phaddw %1, %2
- %else
- %ifnidn %1, %2
+ %elifnidn %1, %2
%if %4 == 1
- mova %3, [pw_1]
+ mova %3, [base+pw_1]
%endif
pmaddwd %1, %3
pmaddwd %2, %3
packssdw %1, %2
- %else
+ %else
%if %4 == 1
- pmaddwd %1, [pw_1]
+ pmaddwd %1, [base+pw_1]
%else
pmaddwd %1, %3
%endif
packssdw %1, %1
- %endif
%endif
%endmacro
@@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
%if ARCH_X86_32
%define base_reg r2
%define base base_reg-prep%+SUFFIX
- %define W32_RESTORE_SSQ mov strideq, stridem
%else
%define base_reg r7
%define base 0
- %define W32_RESTORE_SSQ
%endif
cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign org_stack_offset stack_offset
@@ -2834,6 +2831,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
WIN64_SPILL_XMM 12
%else
WIN64_SPILL_XMM 16
+%endif
+%if ARCH_X86_32
+ %define strideq r6
+ mov strideq, stridem
%endif
cmp wd, 4
je .h_w4
@@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklbw m4, m4
psraw m4, 8
%endif
- W32_RESTORE_SSQ
%if ARCH_X86_64
lea stride3q, [strideq*3]
%endif
@@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
pshufb m1, m5
pshufb m2, m5
pshufb m3, m5
-%else
- %if ARCH_X86_64
+%elif ARCH_X86_64
movd m0, [srcq+strideq*0+0]
movd m12, [srcq+strideq*0+1]
movd m1, [srcq+strideq*1+0]
@@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
punpcklqdq m1, m5 ; 1
punpcklqdq m2, m13 ; 2
punpcklqdq m3, m7 ; 3
- %else
+%else
movd m0, [srcq+strideq*0+0]
movd m1, [srcq+strideq*0+1]
movd m2, [srcq+strideq*0+2]
@@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
lea srcq, [srcq+strideq*2]
punpckldq m7, m5
punpcklqdq m3, m7 ; 3
- %endif
%endif
PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
PMADDUBSW m1, m4, m5, m7, 0
@@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
sub hd, 4
jg .h_w4_loop
.h_w8:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
- W32_RESTORE_SSQ
-%endif
-.h_w8_loop:
%if cpuflag(ssse3)
PREP_8TAP_H 0, srcq+strideq*0
PREP_8TAP_H 1, srcq+strideq*1
@@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
add tmpq, 16
dec hd
%endif
- jg .h_w8_loop
+ jg .h_w8
.h_w16:
- mov r6, -16*1
+ mov r3, -16*1
jmp .h_start
.h_w32:
- mov r6, -16*2
+ mov r3, -16*2
jmp .h_start
.h_w64:
- mov r6, -16*4
+ mov r3, -16*4
jmp .h_start
.h_w128:
- mov r6, -16*8
+ mov r3, -16*8
.h_start:
-%if ARCH_X86_32
- mov r3, r2
- %define base_reg r3
-%endif
- sub srcq, r6
- mov r5, r6
- W32_RESTORE_SSQ
+ sub srcq, r3
+ mov r5, r3
.h_loop:
%if cpuflag(ssse3)
- PREP_8TAP_H 0, srcq+r6+8*0
- PREP_8TAP_H 1, srcq+r6+8*1
+ PREP_8TAP_H 0, srcq+r3+8*0
+ PREP_8TAP_H 1, srcq+r3+8*1
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 32
- add r6, 16
+ add r3, 16
%else
- PREP_8TAP_H 0, srcq+r6
+ PREP_8TAP_H 0, srcq+r3
mova [tmpq], m0
add tmpq, 16
- add r6, 8
+ add r3, 8
%endif
jl .h_loop
add srcq, strideq
- mov r6, r5
+ mov r3, r5
dec hd
jg .h_loop
-%if ARCH_X86_32
- %define base_reg r2
-%endif
LEA base_reg, prep%+SUFFIX
%if ARCH_X86_32
@@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define subpel1 [rsp+mmsize*1]
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
-%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
%if cpuflag(ssse3)
ALLOC_STACK -mmsize*4
%else
@@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
movd m0, [myq+6]
PSHUFB_0X1X m0, m2
mova subpel3, m0
- %if notcpuflag(ssse3)
- mov r6, base_reg
- %define base_reg r6
- %endif
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3]
- sub [rstk+stack_offset+gprsize*2], strideq
mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
+ lea r5, [strideq*3]
+ sub srcq, r5
%else
%define subpel0 m8
%define subpel1 m9
@@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jg .v_w4_loop0
%endif
-%if ARCH_X86_32 && notcpuflag(ssse3)
- %define base_reg r2
-%endif
%if ARCH_X86_64
.v_w8:
lea r5d, [wq - 8] ; horizontal loop
@@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
cmp hd, 6
cmovs myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
- mov r5, r2; use as new base
- %define base_reg r5
- %assign regs_used 2
+ mov strideq, stridem
+ %assign regs_used 6
ALLOC_STACK -mmsize*14
%assign regs_used 7
- mov strideq, [rstk+stack_offset+gprsize*3]
- lea strideq, [strideq*3 + 1]
- sub [rstk+stack_offset+gprsize*2], strideq
- mov strideq, [rstk+stack_offset+gprsize*3]
- mov srcq, [rstk+stack_offset+gprsize*2]
+ lea r5, [strideq*3+1]
+ sub srcq, r5
%define subpelv0 [rsp+mmsize*0]
%define subpelv1 [rsp+mmsize*1]
%define subpelv2 [rsp+mmsize*2]
@@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define hv4_line_1_3 13
%if ARCH_X86_32
%if cpuflag(ssse3)
- %define w8192reg [base+pw_8192]
+ %define w8192reg [base+pw_8192]
%else
- %define w8192reg [base+pw_2]
+ %define w8192reg [base+pw_2]
%endif
%define d32reg [base+pd_32]
%else
@@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define hv8_line_6 4
shr mxd, 16
%if ARCH_X86_32
- %define base_reg r2
%define subpelh0 [rsp+mmsize*5]
%define subpelh1 [rsp+mmsize*6]
%define subpelv0 [rsp+mmsize*7]
@@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
cmp hd, 6
cmovs myd, mxd
movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
- ALLOC_STACK -mmsize*13
+ mov strideq, stridem
+ %assign regs_used 6
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
%if STACK_ALIGNMENT < mmsize
- mov rstk, r2m
- %define tmpm [rsp+mmsize*13+gprsize*1]
- %define srcm [rsp+mmsize*13+gprsize*2]
- %define stridem [rsp+mmsize*13+gprsize*3]
- mov stridem, rstk
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov stridem, strideq
%endif
- mov r6, r2
- %define base_reg r6
pshufd m0, m1, q0000
pshufd m1, m1, q1111
punpcklbw m5, m5
@@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
mova subpelv1, m3
mova subpelv2, m4
mova subpelv3, m5
- W32_RESTORE_SSQ
- lea strided, [strided*3]
- sub srcd, strided
- sub srcd, 3
- mov srcm, srcd
- W32_RESTORE_SSQ
+ lea r5, [strideq*3+3]
+ sub srcq, r5
+ mov srcm, srcq
%else
ALLOC_STACK mmsize*5, 16
%define subpelh0 m10
@@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%if notcpuflag(ssse3)
mova m7, [base+pw_2]
%endif
- lea stride3q, [strideq*3]
+ lea stride3q, [strideq*3]
sub srcq, 3
sub srcq, stride3q
mov r6, srcq
@@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
.hv_w8_outer:
movzx hd, r5w
%if ARCH_X86_32
- add dword tmpm, 8
- mov tmpq, tmpm
mov srcq, srcm
+ mov tmpq, tmpm
add srcq, 4
+ add tmpq, 8
mov srcm, srcq
+ mov tmpm, tmpq
%else
add r8, 8
mov tmpq, r8
-- 
2.26.2