1 From 464ca6c2f37b93180cc27ea41889ffaf1eab388e Mon Sep 17 00:00:00 2001
2 From: Henrik Gramner <gramner@twoorioles.com>
3 Date: Thu, 25 Jun 2020 01:27:28 +0200
4 Subject: [PATCH] x86: Fix 32-bit build with PIC enabled
7 src/x86/mc_sse.asm | 147 +++++++++++++++++----------------------------
8 1 file changed, 56 insertions(+), 91 deletions(-)
10 diff --git a/src/x86/mc_sse.asm b/src/x86/mc_sse.asm
11 index d98ac621..5d5c5e3f 100644
12 --- a/src/x86/mc_sse.asm
13 +++ b/src/x86/mc_sse.asm
14 @@ -1263,7 +1263,7 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
19 + %define m8 [t1-prep_sse2+pw_8]
23 @@ -1272,13 +1272,11 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
34 punpcklwd m6, [base+pw_1]
38 mov t1, t2 ; save base reg for w4
39 @@ -1396,8 +1394,8 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
47 movu m0, [srcq+strideq*0+8*0]
48 movu m1, [srcq+strideq*0+8*1]
49 @@ -1440,14 +1438,17 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
69 @@ -2624,22 +2625,20 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
70 %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1
78 + mova %3, [base+pw_1]
87 + pmaddwd %1, [base+pw_1]
96 @@ -2795,11 +2794,9 @@ PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
99 %define base base_reg-prep%+SUFFIX
100 - %define W32_RESTORE_SSQ mov strideq, stridem
104 - %define W32_RESTORE_SSQ
106 cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
107 %assign org_stack_offset stack_offset
108 @@ -2834,6 +2831,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
115 + mov strideq, stridem
119 @@ -2894,7 +2895,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
125 lea stride3q, [strideq*3]
127 @@ -2916,8 +2916,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
134 movd m0, [srcq+strideq*0+0]
135 movd m12, [srcq+strideq*0+1]
136 movd m1, [srcq+strideq*1+0]
137 @@ -2947,7 +2946,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
138 punpcklqdq m1, m5 ; 1
139 punpcklqdq m2, m13 ; 2
140 punpcklqdq m3, m7 ; 3
143 movd m0, [srcq+strideq*0+0]
144 movd m1, [srcq+strideq*0+1]
145 movd m2, [srcq+strideq*0+2]
146 @@ -2978,7 +2977,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
147 lea srcq, [srcq+strideq*2]
149 punpcklqdq m3, m7 ; 3
152 PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2
153 PMADDUBSW m1, m4, m5, m7, 0
154 @@ -2994,14 +2992,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
162 - %define base_reg r3
167 PREP_8TAP_H 0, srcq+strideq*0
168 PREP_8TAP_H 1, srcq+strideq*1
169 @@ -3017,51 +3008,42 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
194 - %define base_reg r3
203 - PREP_8TAP_H 0, srcq+r6+8*0
204 - PREP_8TAP_H 1, srcq+r6+8*1
205 + PREP_8TAP_H 0, srcq+r3+8*0
206 + PREP_8TAP_H 1, srcq+r3+8*1
213 - PREP_8TAP_H 0, srcq+r6
214 + PREP_8TAP_H 0, srcq+r3
228 - %define base_reg r2
232 LEA base_reg, prep%+SUFFIX
234 @@ -3086,7 +3068,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
235 %define subpel1 [rsp+mmsize*1]
236 %define subpel2 [rsp+mmsize*2]
237 %define subpel3 [rsp+mmsize*3]
238 -%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
239 +%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
241 ALLOC_STACK -mmsize*4
243 @@ -3105,15 +3087,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
247 - %if notcpuflag(ssse3)
249 - %define base_reg r6
251 - mov strideq, [rstk+stack_offset+gprsize*3]
252 - lea strideq, [strideq*3]
253 - sub [rstk+stack_offset+gprsize*2], strideq
254 mov strideq, [rstk+stack_offset+gprsize*3]
255 - mov srcq, [rstk+stack_offset+gprsize*2]
256 + lea r5, [strideq*3]
261 @@ -3245,10 +3221,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
265 -%if ARCH_X86_32 && notcpuflag(ssse3)
266 - %define base_reg r2
271 lea r5d, [wq - 8] ; horizontal loop
272 @@ -3373,16 +3345,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
275 movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
276 - mov r5, r2; use as new base
277 - %define base_reg r5
278 - %assign regs_used 2
279 + mov strideq, stridem
280 + %assign regs_used 6
281 ALLOC_STACK -mmsize*14
283 - mov strideq, [rstk+stack_offset+gprsize*3]
284 - lea strideq, [strideq*3 + 1]
285 - sub [rstk+stack_offset+gprsize*2], strideq
286 - mov strideq, [rstk+stack_offset+gprsize*3]
287 - mov srcq, [rstk+stack_offset+gprsize*2]
288 + lea r5, [strideq*3+1]
290 %define subpelv0 [rsp+mmsize*0]
291 %define subpelv1 [rsp+mmsize*1]
292 %define subpelv2 [rsp+mmsize*2]
293 @@ -3445,9 +3413,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
294 %define hv4_line_1_3 13
297 - %define w8192reg [base+pw_8192]
298 + %define w8192reg [base+pw_8192]
300 - %define w8192reg [base+pw_2]
301 + %define w8192reg [base+pw_2]
303 %define d32reg [base+pd_32]
305 @@ -3676,7 +3644,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
309 - %define base_reg r2
310 %define subpelh0 [rsp+mmsize*5]
311 %define subpelh1 [rsp+mmsize*6]
312 %define subpelv0 [rsp+mmsize*7]
313 @@ -3692,16 +3659,16 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
316 movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
317 - ALLOC_STACK -mmsize*13
318 + mov strideq, stridem
319 + %assign regs_used 6
320 + ALLOC_STACK -mmsize*14
321 + %assign regs_used 7
322 %if STACK_ALIGNMENT < mmsize
324 - %define tmpm [rsp+mmsize*13+gprsize*1]
325 - %define srcm [rsp+mmsize*13+gprsize*2]
326 - %define stridem [rsp+mmsize*13+gprsize*3]
328 + %define tmpm [rsp+mmsize*13+gprsize*1]
329 + %define srcm [rsp+mmsize*13+gprsize*2]
330 + %define stridem [rsp+mmsize*13+gprsize*3]
331 + mov stridem, strideq
334 - %define base_reg r6
338 @@ -3724,12 +3691,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
343 - lea strided, [strided*3]
348 + lea r5, [strideq*3+3]
352 ALLOC_STACK mmsize*5, 16
354 @@ -3765,7 +3729,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
355 %if notcpuflag(ssse3)
358 - lea stride3q, [strideq*3]
359 + lea stride3q, [strideq*3]
363 @@ -3939,11 +3903,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3