gfx/cairo/pixman-arm64-clang.patch
1 https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71
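
Overview of the changes below: the aarch64 NEON sources are rewritten from GNU-as-only macro syntax into forms that clang's integrated assembler also accepts. Macro arguments are referenced explicitly as \arg and joined to register suffixes with the \() separator instead of the &arg& concatenation form the old code relied on; the mnemonic handed to the PF prefetch helper macro gains a trailing comma so it is parsed as its own argument; .func/.endfunc and the leading-underscore symbol prefix move behind the ASM_HAVE_FUNC_DIRECTIVE and ASM_LEADING_UNDERSCORE guards added to pixman-arm-asm.h; and a few multi-line macro invocations gain the continuation commas they were missing. The short fragment below is an illustration only -- clear_reg / clear_reg_portable are made-up macro names, not code from the patch, and PF is assumed to be the prefetch macro defined elsewhere in the pixman sources.

/* Illustration only: a hypothetical macro in the old, GNU-as-only style. */
.macro clear_reg reg
    movi    &reg&.8b, #0           /* '&' splices the argument onto ".8b" */
    PF add PF_X, PF_X, #8          /* mnemonic and first operand separated only by a space */
.endm

/* The same macro in the portable style used throughout the hunks below. */
.macro clear_reg_portable reg
    movi    \()\reg\().8b, #0      /* explicit \reg, with "\()" ending the argument name */
    PF add, PF_X, PF_X, #8         /* the comma makes "add" a separate PF argument */
.endm
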
3 diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h
4 --- a/gfx/cairo/libpixman/src/pixman-arm-asm.h
5 +++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h
6 @@ -21,17 +21,33 @@
7 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
8 * SOFTWARE.
10 * Author: Jeff Muizelaar (jeff@infidigm.net)
14 /* Supplementary macro for setting function attributes */
15 -.macro pixman_asm_function fname
16 - .func fname
17 - .global fname
18 +.macro pixman_asm_function_impl fname
19 +#ifdef ASM_HAVE_FUNC_DIRECTIVE
20 + .func \fname
21 +#endif
22 + .global \fname
23 #ifdef __ELF__
24 - .hidden fname
25 - .type fname, %function
26 + .hidden \fname
27 + .type \fname, %function
28 #endif
29 -fname:
30 +\fname:
31 .endm
33 +.macro pixman_asm_function fname
34 +#ifdef ASM_LEADING_UNDERSCORE
35 + pixman_asm_function_impl _\fname
36 +#else
37 + pixman_asm_function_impl \fname
38 +#endif
39 +.endm
41 +.macro pixman_end_asm_function
42 +#ifdef ASM_HAVE_FUNC_DIRECTIVE
43 + .endfunc
44 +#endif
45 +.endm
46 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
47 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
48 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
49 @@ -72,219 +72,219 @@
50 * format conversion, and interpolation as separate macros which can be used
51 * as the basic building blocks for constructing bilinear scanline functions.
54 .macro bilinear_load_8888 reg1, reg2, tmp
55 asr WTMP1, X, #16
56 add X, X, UX
57 add TMP1, TOP, TMP1, lsl #2
58 - ld1 {&reg1&.2s}, [TMP1], STRIDE
59 - ld1 {&reg2&.2s}, [TMP1]
60 + ld1 {\()\reg1\().2s}, [TMP1], STRIDE
61 + ld1 {\()\reg2\().2s}, [TMP1]
62 .endm
64 .macro bilinear_load_0565 reg1, reg2, tmp
65 asr WTMP1, X, #16
66 add X, X, UX
67 add TMP1, TOP, TMP1, lsl #1
68 - ld1 {&reg2&.s}[0], [TMP1], STRIDE
69 - ld1 {&reg2&.s}[1], [TMP1]
70 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
71 + ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
72 + ld1 {\()\reg2\().s}[1], [TMP1]
73 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
74 .endm
76 .macro bilinear_load_and_vertical_interpolate_two_8888 \
77 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
79 - bilinear_load_8888 reg1, reg2, tmp1
80 - umull &acc1&.8h, &reg1&.8b, v28.8b
81 - umlal &acc1&.8h, &reg2&.8b, v29.8b
82 - bilinear_load_8888 reg3, reg4, tmp2
83 - umull &acc2&.8h, &reg3&.8b, v28.8b
84 - umlal &acc2&.8h, &reg4&.8b, v29.8b
85 + bilinear_load_8888 \reg1, \reg2, \tmp1
86 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
87 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
88 + bilinear_load_8888 \reg3, \reg4, \tmp2
89 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
90 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
91 .endm
93 .macro bilinear_load_and_vertical_interpolate_four_8888 \
94 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
95 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
96 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
98 bilinear_load_and_vertical_interpolate_two_8888 \
99 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
 100 +                \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
101 bilinear_load_and_vertical_interpolate_two_8888 \
102 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
103 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
104 .endm
106 .macro vzip reg1, reg2
107 - zip1 v24.8b, reg1, reg2
108 - zip2 reg2, reg1, reg2
109 - mov reg1, v24.8b
110 + zip1 v24.8b, \reg1, \reg2
111 + zip2 \reg2, \reg1, \reg2
112 + mov \reg1, v24.8b
113 .endm
115 .macro vuzp reg1, reg2
116 - uzp1 v24.8b, reg1, reg2
117 - uzp2 reg2, reg1, reg2
118 - mov reg1, v24.8b
119 + uzp1 v24.8b, \reg1, \reg2
120 + uzp2 \reg2, \reg1, \reg2
121 + mov \reg1, v24.8b
122 .endm
124 .macro bilinear_load_and_vertical_interpolate_two_0565 \
125 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
126 asr WTMP1, X, #16
127 add X, X, UX
128 add TMP1, TOP, TMP1, lsl #1
129 asr WTMP2, X, #16
130 add X, X, UX
131 add TMP2, TOP, TMP2, lsl #1
132 - ld1 {&acc2&.s}[0], [TMP1], STRIDE
133 - ld1 {&acc2&.s}[2], [TMP2], STRIDE
134 - ld1 {&acc2&.s}[1], [TMP1]
135 - ld1 {&acc2&.s}[3], [TMP2]
136 - convert_0565_to_x888 acc2, reg3, reg2, reg1
137 - vzip &reg1&.8b, &reg3&.8b
138 - vzip &reg2&.8b, &reg4&.8b
139 - vzip &reg3&.8b, &reg4&.8b
140 - vzip &reg1&.8b, &reg2&.8b
141 - umull &acc1&.8h, &reg1&.8b, v28.8b
142 - umlal &acc1&.8h, &reg2&.8b, v29.8b
143 - umull &acc2&.8h, &reg3&.8b, v28.8b
144 - umlal &acc2&.8h, &reg4&.8b, v29.8b
145 + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
146 + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
147 + ld1 {\()\acc2\().s}[1], [TMP1]
148 + ld1 {\()\acc2\().s}[3], [TMP2]
149 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
150 + vzip \()\reg1\().8b, \()\reg3\().8b
151 + vzip \()\reg2\().8b, \()\reg4\().8b
152 + vzip \()\reg3\().8b, \()\reg4\().8b
153 + vzip \()\reg1\().8b, \()\reg2\().8b
154 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
155 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
156 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
157 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
158 .endm
160 .macro bilinear_load_and_vertical_interpolate_four_0565 \
161 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
162 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
163 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
165 asr WTMP1, X, #16
166 add X, X, UX
167 add TMP1, TOP, TMP1, lsl #1
168 asr WTMP2, X, #16
169 add X, X, UX
170 add TMP2, TOP, TMP2, lsl #1
171 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE
172 - ld1 {&xacc2&.s}[2], [TMP2], STRIDE
173 - ld1 {&xacc2&.s}[1], [TMP1]
174 - ld1 {&xacc2&.s}[3], [TMP2]
175 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
176 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
177 + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
178 + ld1 {\()\xacc2\().s}[1], [TMP1]
179 + ld1 {\()\xacc2\().s}[3], [TMP2]
180 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
181 asr WTMP1, X, #16
182 add X, X, UX
183 add TMP1, TOP, TMP1, lsl #1
184 asr WTMP2, X, #16
185 add X, X, UX
186 add TMP2, TOP, TMP2, lsl #1
187 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE
188 - vzip &xreg1&.8b, &xreg3&.8b
189 - ld1 {&yacc2&.s}[2], [TMP2], STRIDE
190 - vzip &xreg2&.8b, &xreg4&.8b
191 - ld1 {&yacc2&.s}[1], [TMP1]
192 - vzip &xreg3&.8b, &xreg4&.8b
193 - ld1 {&yacc2&.s}[3], [TMP2]
194 - vzip &xreg1&.8b, &xreg2&.8b
195 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
196 - umull &xacc1&.8h, &xreg1&.8b, v28.8b
197 - vzip &yreg1&.8b, &yreg3&.8b
198 - umlal &xacc1&.8h, &xreg2&.8b, v29.8b
199 - vzip &yreg2&.8b, &yreg4&.8b
200 - umull &xacc2&.8h, &xreg3&.8b, v28.8b
201 - vzip &yreg3&.8b, &yreg4&.8b
202 - umlal &xacc2&.8h, &xreg4&.8b, v29.8b
203 - vzip &yreg1&.8b, &yreg2&.8b
204 - umull &yacc1&.8h, &yreg1&.8b, v28.8b
205 - umlal &yacc1&.8h, &yreg2&.8b, v29.8b
206 - umull &yacc2&.8h, &yreg3&.8b, v28.8b
207 - umlal &yacc2&.8h, &yreg4&.8b, v29.8b
208 + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
209 + vzip \()\xreg1\().8b, \()\xreg3\().8b
210 + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
211 + vzip \()\xreg2\().8b, \()\xreg4\().8b
212 + ld1 {\()\yacc2\().s}[1], [TMP1]
213 + vzip \()\xreg3\().8b, \()\xreg4\().8b
214 + ld1 {\()\yacc2\().s}[3], [TMP2]
215 + vzip \()\xreg1\().8b, \()\xreg2\().8b
216 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
217 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
218 + vzip \()\yreg1\().8b, \()\yreg3\().8b
219 + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
220 + vzip \()\yreg2\().8b, \()\yreg4\().8b
221 + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
222 + vzip \()\yreg3\().8b, \()\yreg4\().8b
223 + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
224 + vzip \()\yreg1\().8b, \()\yreg2\().8b
225 + umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
226 + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
227 + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
228 + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
229 .endm
231 .macro bilinear_store_8888 numpix, tmp1, tmp2
232 -.if numpix == 4
233 +.if \numpix == 4
234 st1 {v0.2s, v1.2s}, [OUT], #16
235 -.elseif numpix == 2
236 +.elseif \numpix == 2
237 st1 {v0.2s}, [OUT], #8
238 -.elseif numpix == 1
239 +.elseif \numpix == 1
240 st1 {v0.s}[0], [OUT], #4
241 .else
242 - .error bilinear_store_8888 numpix is unsupported
243 + .error bilinear_store_8888 \numpix is unsupported
244 .endif
245 .endm
247 .macro bilinear_store_0565 numpix, tmp1, tmp2
248 vuzp v0.8b, v1.8b
249 vuzp v2.8b, v3.8b
250 vuzp v1.8b, v3.8b
251 vuzp v0.8b, v2.8b
252 - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
253 -.if numpix == 4
254 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
255 +.if \numpix == 4
256 st1 {v1.4h}, [OUT], #8
257 -.elseif numpix == 2
258 +.elseif \numpix == 2
259 st1 {v1.s}[0], [OUT], #4
260 -.elseif numpix == 1
261 +.elseif \numpix == 1
262 st1 {v1.h}[0], [OUT], #2
263 .else
264 - .error bilinear_store_0565 numpix is unsupported
265 + .error bilinear_store_0565 \numpix is unsupported
266 .endif
267 .endm
271 * Macros for loading mask pixels into register 'mask'.
272 * dup must be done in somewhere else.
274 .macro bilinear_load_mask_x numpix, mask
275 .endm
277 .macro bilinear_load_mask_8 numpix, mask
278 -.if numpix == 4
279 - ld1 {&mask&.s}[0], [MASK], #4
280 -.elseif numpix == 2
281 - ld1 {&mask&.h}[0], [MASK], #2
282 -.elseif numpix == 1
283 - ld1 {&mask&.b}[0], [MASK], #1
284 +.if \numpix == 4
285 + ld1 {\()\mask\().s}[0], [MASK], #4
286 +.elseif \numpix == 2
287 + ld1 {\()\mask\().h}[0], [MASK], #2
288 +.elseif \numpix == 1
289 + ld1 {\()\mask\().b}[0], [MASK], #1
290 .else
291 - .error bilinear_load_mask_8 numpix is unsupported
292 + .error bilinear_load_mask_8 \numpix is unsupported
293 .endif
294 - prfm PREFETCH_MODE, [MASK, #prefetch_offset]
295 + prfum PREFETCH_MODE, [MASK, #(prefetch_offset)]
296 .endm
298 .macro bilinear_load_mask mask_fmt, numpix, mask
299 - bilinear_load_mask_&mask_fmt numpix, mask
300 + bilinear_load_mask_\mask_fmt \numpix, \mask
301 .endm
305 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
306 * Interleave should be done somewhere else.
308 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
309 .endm
311 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
312 .endm
314 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
315 -.if numpix == 4
316 - ld1 {&dst0&.2s, &dst1&.2s}, [OUT]
317 -.elseif numpix == 2
318 - ld1 {&dst0&.2s}, [OUT]
319 -.elseif numpix == 1
320 - ld1 {&dst0&.s}[0], [OUT]
321 +.if \numpix == 4
322 + ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
323 +.elseif \numpix == 2
324 + ld1 {\()\dst0\().2s}, [OUT]
325 +.elseif \numpix == 1
326 + ld1 {\()\dst0\().s}[0], [OUT]
327 .else
328 - .error bilinear_load_dst_8888 numpix is unsupported
329 + .error bilinear_load_dst_8888 \numpix is unsupported
330 .endif
331 - mov &dst01&.d[0], &dst0&.d[0]
332 - mov &dst01&.d[1], &dst1&.d[0]
333 + mov \()\dst01\().d[0], \()\dst0\().d[0]
334 + mov \()\dst01\().d[1], \()\dst1\().d[0]
335 prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
336 .endm
338 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
339 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
340 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
341 .endm
343 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
344 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
345 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
346 .endm
348 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
349 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
350 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
351 .endm
354 * Macros for duplicating partially loaded mask to fill entire register.
355 * We will apply mask to interleaved source pixels, that is
356 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
357 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
358 * So, we need to duplicate loaded mask into whole register.
359 @@ -293,84 +293,85 @@
360 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
361 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
362 * We can do some optimizations for this including last pixel cases.
364 .macro bilinear_duplicate_mask_x numpix, mask
365 .endm
367 .macro bilinear_duplicate_mask_8 numpix, mask
368 -.if numpix == 4
369 - dup &mask&.2s, &mask&.s[0]
370 -.elseif numpix == 2
371 - dup &mask&.4h, &mask&.h[0]
372 -.elseif numpix == 1
373 - dup &mask&.8b, &mask&.b[0]
374 +.if \numpix == 4
375 + dup \()\mask\().2s, \()\mask\().s[0]
376 +.elseif \numpix == 2
377 + dup \()\mask\().4h, \()\mask\().h[0]
378 +.elseif \numpix == 1
379 + dup \()\mask\().8b, \()\mask\().b[0]
380 .else
381 - .error bilinear_duplicate_mask_8 is unsupported
382 + .error bilinear_duplicate_\mask_8 is unsupported
383 .endif
384 .endm
386 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
387 - bilinear_duplicate_mask_&mask_fmt numpix, mask
388 + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
389 .endm
392 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
393 * Interleave should be done when maks is enabled or operator is 'over'.
395 .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
396 - vuzp &src0&.8b, &src1&.8b
397 - vuzp &dst0&.8b, &dst1&.8b
398 - vuzp &src0&.8b, &src1&.8b
399 - vuzp &dst0&.8b, &dst1&.8b
400 - mov &src01&.d[1], &src1&.d[0]
401 - mov &src01&.d[0], &src0&.d[0]
402 - mov &dst01&.d[1], &dst1&.d[0]
403 - mov &dst01&.d[0], &dst0&.d[0]
404 + vuzp \()\src0\().8b, \()\src1\().8b
405 + vuzp \()\dst0\().8b, \()\dst1\().8b
406 + vuzp \()\src0\().8b, \()\src1\().8b
407 + vuzp \()\dst0\().8b, \()\dst1\().8b
408 + mov \()\src01\().d[1], \()\src1\().d[0]
409 + mov \()\src01\().d[0], \()\src0\().d[0]
410 + mov \()\dst01\().d[1], \()\dst1\().d[0]
411 + mov \()\dst01\().d[0], \()\dst0\().d[0]
412 .endm
414 .macro bilinear_interleave_src_dst_x_src \
415 numpix, src0, src1, src01, dst0, dst1, dst01
416 .endm
418 .macro bilinear_interleave_src_dst_x_over \
419 numpix, src0, src1, src01, dst0, dst1, dst01
421 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
422 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
423 .endm
425 .macro bilinear_interleave_src_dst_x_add \
426 numpix, src0, src1, src01, dst0, dst1, dst01
427 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
429 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
430 .endm
432 .macro bilinear_interleave_src_dst_8_src \
433 numpix, src0, src1, src01, dst0, dst1, dst01
435 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
436 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
437 .endm
439 .macro bilinear_interleave_src_dst_8_over \
440 numpix, src0, src1, src01, dst0, dst1, dst01
442 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
443 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
444 .endm
446 .macro bilinear_interleave_src_dst_8_add \
447 numpix, src0, src1, src01, dst0, dst1, dst01
449 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
450 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
451 .endm
453 .macro bilinear_interleave_src_dst \
454 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
456 - bilinear_interleave_src_dst_&mask_fmt&_&op \
457 - numpix, src0, src1, src01, dst0, dst1, dst01
458 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
459 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
460 .endm
464 * Macros for applying masks to src pixels. (see combine_mask_u() function)
465 * src, dst should be in interleaved form.
466 * mask register should be in form (m0, m1, m2, m3).
468 @@ -378,191 +379,191 @@
469 numpix, src0, src1, src01, mask, \
470 tmp01, tmp23, tmp45, tmp67
471 .endm
473 .macro bilinear_apply_mask_to_src_8 \
474 numpix, src0, src1, src01, mask, \
475 tmp01, tmp23, tmp45, tmp67
477 - umull &tmp01&.8h, &src0&.8b, &mask&.8b
478 - umull &tmp23&.8h, &src1&.8b, &mask&.8b
479 + umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
480 + umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
481 /* bubbles */
482 - urshr &tmp45&.8h, &tmp01&.8h, #8
483 - urshr &tmp67&.8h, &tmp23&.8h, #8
484 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
485 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
486 /* bubbles */
487 - raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h
488 - raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h
489 - mov &src01&.d[0], &src0&.d[0]
490 - mov &src01&.d[1], &src1&.d[0]
491 + raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
492 + raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
493 + mov \()\src01\().d[0], \()\src0\().d[0]
494 + mov \()\src01\().d[1], \()\src1\().d[0]
495 .endm
497 .macro bilinear_apply_mask_to_src \
498 mask_fmt, numpix, src0, src1, src01, mask, \
499 tmp01, tmp23, tmp45, tmp67
501 - bilinear_apply_mask_to_src_&mask_fmt \
502 - numpix, src0, src1, src01, mask, \
503 - tmp01, tmp23, tmp45, tmp67
504 + bilinear_apply_mask_to_src_\()\mask_fmt \
505 + \numpix, \src0, \src1, \src01, \mask, \
506 + \tmp01, \tmp23, \tmp45, \tmp67
507 .endm
511 * Macros for combining src and destination pixels.
512 * Interleave or not is depending on operator 'op'.
514 .macro bilinear_combine_src \
515 numpix, src0, src1, src01, dst0, dst1, dst01, \
516 tmp01, tmp23, tmp45, tmp67, tmp8
517 .endm
519 .macro bilinear_combine_over \
520 numpix, src0, src1, src01, dst0, dst1, dst01, \
521 tmp01, tmp23, tmp45, tmp67, tmp8
523 - dup &tmp8&.2s, &src1&.s[1]
524 + dup \()\tmp8\().2s, \()\src1\().s[1]
525 /* bubbles */
526 - mvn &tmp8&.8b, &tmp8&.8b
527 + mvn \()\tmp8\().8b, \()\tmp8\().8b
528 /* bubbles */
529 - umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b
530 + umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
531 /* bubbles */
532 - umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b
533 + umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
534 /* bubbles */
535 - urshr &tmp45&.8h, &tmp01&.8h, #8
536 - urshr &tmp67&.8h, &tmp23&.8h, #8
537 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
538 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
539 /* bubbles */
540 - raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h
541 - raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h
542 - mov &dst01&.d[0], &dst0&.d[0]
543 - mov &dst01&.d[1], &dst1&.d[0]
544 + raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
545 + raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
546 + mov \()\dst01\().d[0], \()\dst0\().d[0]
547 + mov \()\dst01\().d[1], \()\dst1\().d[0]
548 /* bubbles */
549 - uqadd &src0&.8b, &dst0&.8b, &src0&.8b
550 - uqadd &src1&.8b, &dst1&.8b, &src1&.8b
551 - mov &src01&.d[0], &src0&.d[0]
552 - mov &src01&.d[1], &src1&.d[0]
553 + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
554 + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
555 + mov \()\src01\().d[0], \()\src0\().d[0]
556 + mov \()\src01\().d[1], \()\src1\().d[0]
557 .endm
559 .macro bilinear_combine_add \
560 numpix, src0, src1, src01, dst0, dst1, dst01, \
561 tmp01, tmp23, tmp45, tmp67, tmp8
563 - uqadd &src0&.8b, &dst0&.8b, &src0&.8b
564 - uqadd &src1&.8b, &dst1&.8b, &src1&.8b
565 - mov &src01&.d[0], &src0&.d[0]
566 - mov &src01&.d[1], &src1&.d[0]
567 + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
568 + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
569 + mov \()\src01\().d[0], \()\src0\().d[0]
570 + mov \()\src01\().d[1], \()\src1\().d[0]
571 .endm
573 .macro bilinear_combine \
574 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
575 tmp01, tmp23, tmp45, tmp67, tmp8
577 - bilinear_combine_&op \
578 - numpix, src0, src1, src01, dst0, dst1, dst01, \
579 - tmp01, tmp23, tmp45, tmp67, tmp8
580 + bilinear_combine_\()\op \
581 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
582 + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
583 .endm
586 * Macros for final deinterleaving of destination pixels if needed.
588 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
589 - vuzp &dst0&.8b, &dst1&.8b
590 + vuzp \()\dst0\().8b, \()\dst1\().8b
591 /* bubbles */
592 - vuzp &dst0&.8b, &dst1&.8b
593 - mov &dst01&.d[0], &dst0&.d[0]
594 - mov &dst01&.d[1], &dst1&.d[0]
595 + vuzp \()\dst0\().8b, \()\dst1\().8b
596 + mov \()\dst01\().d[0], \()\dst0\().d[0]
597 + mov \()\dst01\().d[1], \()\dst1\().d[0]
598 .endm
600 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
601 .endm
603 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
604 - bilinear_deinterleave numpix, dst0, dst1, dst01
605 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
606 .endm
608 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
609 - bilinear_deinterleave numpix, dst0, dst1, dst01
610 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
611 .endm
613 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
614 - bilinear_deinterleave numpix, dst0, dst1, dst01
615 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
616 .endm
618 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
619 - bilinear_deinterleave numpix, dst0, dst1, dst01
620 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
621 .endm
623 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
624 - bilinear_deinterleave numpix, dst0, dst1, dst01
625 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
626 .endm
628 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
629 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
630 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
631 .endm
634 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
635 - bilinear_load_&src_fmt v0, v1, v2
636 - bilinear_load_mask mask_fmt, 1, v4
637 - bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
638 + bilinear_load_\()\src_fmt v0, v1, v2
639 + bilinear_load_mask \mask_fmt, 1, v4
640 + bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
641 umull v2.8h, v0.8b, v28.8b
642 umlal v2.8h, v1.8b, v29.8b
643 /* 5 cycles bubble */
644 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
645 umlsl v0.4s, v2.4h, v15.h[0]
646 umlal2 v0.4s, v2.8h, v15.h[0]
647 /* 5 cycles bubble */
648 - bilinear_duplicate_mask mask_fmt, 1, v4
649 + bilinear_duplicate_mask \mask_fmt, 1, v4
650 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
651 /* 3 cycles bubble */
652 xtn v0.8b, v0.8h
653 /* 1 cycle bubble */
654 bilinear_interleave_src_dst \
655 - mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
656 + \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
657 bilinear_apply_mask_to_src \
658 - mask_fmt, 1, v0, v1, v0, v4, \
659 + \mask_fmt, 1, v0, v1, v0, v4, \
660 v3, v8, v10, v11
661 bilinear_combine \
662 - op, 1, v0, v1, v0, v18, v19, v9, \
663 + \op, 1, v0, v1, v0, v18, v19, v9, \
664 v3, v8, v10, v11, v5
665 - bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
666 - bilinear_store_&dst_fmt 1, v17, v18
667 + bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
668 + bilinear_store_\()\dst_fmt 1, v17, v18
669 .endm
671 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
672 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
673 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
674 v1, v11, v18, v19, v20, v21, v22, v23
675 - bilinear_load_mask mask_fmt, 2, v4
676 - bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
677 + bilinear_load_mask \mask_fmt, 2, v4
678 + bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
679 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
680 umlsl v0.4s, v1.4h, v15.h[0]
681 umlal2 v0.4s, v1.8h, v15.h[0]
682 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
683 umlsl v10.4s, v11.4h, v15.h[4]
684 umlal2 v10.4s, v11.8h, v15.h[4]
685 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
686 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
687 - bilinear_duplicate_mask mask_fmt, 2, v4
688 + bilinear_duplicate_mask \mask_fmt, 2, v4
689 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
690 add v12.8h, v12.8h, v13.8h
691 xtn v0.8b, v0.8h
692 bilinear_interleave_src_dst \
693 - mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
694 + \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
695 bilinear_apply_mask_to_src \
696 - mask_fmt, 2, v0, v1, v0, v4, \
697 + \mask_fmt, 2, v0, v1, v0, v4, \
698 v3, v8, v10, v11
699 bilinear_combine \
700 - op, 2, v0, v1, v0, v18, v19, v9, \
701 + \op, 2, v0, v1, v0, v18, v19, v9, \
702 v3, v8, v10, v11, v5
703 - bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
704 - bilinear_store_&dst_fmt 2, v16, v17
705 + bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
706 + bilinear_store_\()\dst_fmt 2, v16, v17
707 .endm
709 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
710 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
711 - v1, v11, v4, v5, v6, v7, v22, v23 \
712 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
713 + v1, v11, v4, v5, v6, v7, v22, v23, \
714 v3, v9, v16, v17, v20, v21, v18, v19
715 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
716 sub TMP1, TMP1, STRIDE
717 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
718 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
719 umlsl v0.4s, v1.4h, v15.h[0]
720 umlal2 v0.4s, v1.8h, v15.h[0]
721 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
722 @@ -575,33 +576,33 @@
723 ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
724 umlsl v8.4s, v9.4h, v15.h[4]
725 umlal2 v8.4s, v9.8h, v15.h[4]
726 add v12.8h, v12.8h, v13.8h
727 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
728 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
729 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
730 shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
731 - bilinear_load_mask mask_fmt, 4, v4
732 - bilinear_duplicate_mask mask_fmt, 4, v4
733 + bilinear_load_mask \mask_fmt, 4, v4
734 + bilinear_duplicate_mask \mask_fmt, 4, v4
735 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
736 xtn v0.8b, v0.8h
737 xtn v1.8b, v2.8h
738 add v12.8h, v12.8h, v13.8h
739 - bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
740 + bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
741 bilinear_interleave_src_dst \
742 - mask_fmt, op, 4, v0, v1, v0, v2, v3, v11
743 + \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
744 bilinear_apply_mask_to_src \
745 - mask_fmt, 4, v0, v1, v0, v4, \
746 + \mask_fmt, 4, v0, v1, v0, v4, \
747 v6, v8, v9, v10
748 bilinear_combine \
749 - op, 4, v0, v1, v0, v2, v3, v1, \
750 + \op, 4, v0, v1, v0, v2, v3, v1, \
751 v6, v8, v9, v10, v23
752 - bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
753 - bilinear_store_&dst_fmt 4, v6, v7
754 + bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
755 + bilinear_store_\()\dst_fmt 4, v6, v7
756 .endm
758 .set BILINEAR_FLAG_USE_MASK, 1
759 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
762 * Main template macro for generating NEON optimized bilinear scanline functions.
764 @@ -631,24 +632,24 @@
765 bilinear_process_four_pixels, \
766 bilinear_process_pixblock_head, \
767 bilinear_process_pixblock_tail, \
768 bilinear_process_pixblock_tail_head, \
769 pixblock_size, \
770 prefetch_distance, \
771 flags
773 -pixman_asm_function fname
774 -.if pixblock_size == 8
775 -.elseif pixblock_size == 4
776 +pixman_asm_function \fname
777 +.if \pixblock_size == 8
778 +.elseif \pixblock_size == 4
779 .else
780 .error unsupported pixblock size
781 .endif
783 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
784 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
785 OUT .req x0
786 TOP .req x1
787 BOTTOM .req x2
788 WT .req x3
789 WWT .req w3
790 WB .req x4
791 WWB .req w4
792 X .req w5
793 @@ -694,32 +695,32 @@ pixman_asm_function fname
794 PF_OFFS .req x12
795 TMP3 .req x13
796 WTMP3 .req w13
797 TMP4 .req x14
798 WTMP4 .req w14
799 STRIDE .req x15
800 DUMMY .req x30
802 - .set prefetch_offset, prefetch_distance
803 + .set prefetch_offset, \prefetch_distance
805 stp x29, x30, [sp, -16]!
806 mov x29, sp
807 sub x29, x29, 64
808 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
809 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
810 stp x10, x11, [x29, -80]
811 stp x12, x13, [x29, -96]
812 stp x14, x15, [x29, -112]
813 str x8, [x29, -120]
814 ldr w8, [x29, 16]
815 sub sp, sp, 120
816 .endif
818 - mov WTMP1, #prefetch_distance
819 + mov WTMP1, #\prefetch_distance
820 umull PF_OFFS, WTMP1, UX
822 sub STRIDE, BOTTOM, TOP
823 .unreq BOTTOM
825 cmp WIDTH, #0
826 ble 300f
828 @@ -730,73 +731,73 @@ pixman_asm_function fname
829 mov v25.d[0], v12.d[1]
830 mov v26.d[0], v13.d[0]
831 add v25.4h, v25.4h, v26.4h
832 mov v12.d[1], v25.d[0]
834 /* ensure good destination alignment */
835 cmp WIDTH, #1
836 blt 100f
837 - tst OUT, #(1 << dst_bpp_shift)
838 + tst OUT, #(1 << \dst_bpp_shift)
839 beq 100f
840 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
841 add v12.8h, v12.8h, v13.8h
842 - bilinear_process_last_pixel
843 + \bilinear_process_last_pixel
844 sub WIDTH, WIDTH, #1
845 100:
846 add v13.8h, v13.8h, v13.8h
847 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
848 add v12.8h, v12.8h, v13.8h
850 cmp WIDTH, #2
851 blt 100f
852 - tst OUT, #(1 << (dst_bpp_shift + 1))
853 + tst OUT, #(1 << (\dst_bpp_shift + 1))
854 beq 100f
855 - bilinear_process_two_pixels
856 + \bilinear_process_two_pixels
857 sub WIDTH, WIDTH, #2
858 100:
859 -.if pixblock_size == 8
860 +.if \pixblock_size == 8
861 cmp WIDTH, #4
862 blt 100f
863 - tst OUT, #(1 << (dst_bpp_shift + 2))
864 + tst OUT, #(1 << (\dst_bpp_shift + 2))
865 beq 100f
866 - bilinear_process_four_pixels
867 + \bilinear_process_four_pixels
868 sub WIDTH, WIDTH, #4
869 100:
870 .endif
871 - subs WIDTH, WIDTH, #pixblock_size
872 + subs WIDTH, WIDTH, #\pixblock_size
873 blt 100f
874 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
875 - bilinear_process_pixblock_head
876 - subs WIDTH, WIDTH, #pixblock_size
877 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
878 + \bilinear_process_pixblock_head
879 + subs WIDTH, WIDTH, #\pixblock_size
880 blt 500f
882 - bilinear_process_pixblock_tail_head
883 - subs WIDTH, WIDTH, #pixblock_size
884 + \bilinear_process_pixblock_tail_head
885 + subs WIDTH, WIDTH, #\pixblock_size
886 bge 0b
887 500:
888 - bilinear_process_pixblock_tail
889 + \bilinear_process_pixblock_tail
890 100:
891 -.if pixblock_size == 8
892 +.if \pixblock_size == 8
893 tst WIDTH, #4
894 beq 200f
895 - bilinear_process_four_pixels
896 + \bilinear_process_four_pixels
897 200:
898 .endif
899 /* handle the remaining trailing pixels */
900 tst WIDTH, #2
901 beq 200f
902 - bilinear_process_two_pixels
903 + \bilinear_process_two_pixels
904 200:
905 tst WIDTH, #1
906 beq 300f
907 - bilinear_process_last_pixel
908 + \bilinear_process_last_pixel
909 300:
911 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
912 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
913 sub x29, x29, 64
914 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
915 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
916 ldp x10, x11, [x29, -80]
917 ldp x12, x13, [x29, -96]
918 ldp x14, x15, [x29, -112]
919 mov sp, x29
920 ldp x29, x30, [sp], 16
921 @@ -824,21 +825,21 @@ 300:
922 .unreq WIDTH
923 .unreq TMP1
924 .unreq WTMP1
925 .unreq TMP2
926 .unreq PF_OFFS
927 .unreq TMP3
928 .unreq TMP4
929 .unreq STRIDE
930 -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
931 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
932 .unreq MASK
933 .endif
935 -.endfunc
936 +pixman_end_asm_function
938 .endm
940 /* src_8888_8_8888 */
941 .macro bilinear_src_8888_8_8888_process_last_pixel
942 bilinear_interpolate_last_pixel 8888, 8, 8888, src
943 .endm
945 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
946 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
947 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
948 @@ -262,64 +262,64 @@
949 uqadd v18.8b, v0.8b, v22.8b
950 uqadd v19.8b, v1.8b, v23.8b
951 shrn v6.8b, v4.8h, #8
952 fetch_src_pixblock
953 shrn v7.8b, v4.8h, #3
954 sli v4.8h, v4.8h, #5
955 ushll v14.8h, v17.8b, #7
956 sli v14.8h, v14.8h, #1
957 - PF add PF_X, PF_X, #8
958 + PF add, PF_X, PF_X, #8
959 ushll v8.8h, v19.8b, #7
960 sli v8.8h, v8.8h, #1
961 - PF tst PF_CTL, #0xF
962 + PF tst, PF_CTL, #0xF
963 sri v6.8b, v6.8b, #5
964 - PF beq 10f
965 - PF add PF_X, PF_X, #8
966 + PF beq, 10f
967 + PF add, PF_X, PF_X, #8
969 mvn v3.8b, v3.8b
970 - PF beq 10f
971 - PF sub PF_CTL, PF_CTL, #1
972 + PF beq, 10f
973 + PF sub, PF_CTL, PF_CTL, #1
975 sri v7.8b, v7.8b, #6
976 shrn v30.8b, v4.8h, #2
977 umull v10.8h, v3.8b, v6.8b
978 - PF lsl DUMMY, PF_X, #src_bpp_shift
979 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
980 + PF lsl, DUMMY, PF_X, #src_bpp_shift
981 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
982 umull v11.8h, v3.8b, v7.8b
983 umull v12.8h, v3.8b, v30.8b
984 - PF lsl DUMMY, PF_X, #dst_bpp_shift
985 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
986 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
987 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
988 sri v14.8h, v8.8h, #5
989 - PF cmp PF_X, ORIG_W
990 + PF cmp, PF_X, ORIG_W
991 ushll v9.8h, v18.8b, #7
992 sli v9.8h, v9.8h, #1
993 urshr v17.8h, v10.8h, #8
994 - PF ble 10f
995 - PF sub PF_X, PF_X, ORIG_W
996 + PF ble, 10f
997 + PF sub, PF_X, PF_X, ORIG_W
999 urshr v19.8h, v11.8h, #8
1000 urshr v18.8h, v12.8h, #8
1001 - PF ble 10f
1002 - PF subs PF_CTL, PF_CTL, #0x10
1003 + PF ble, 10f
1004 + PF subs, PF_CTL, PF_CTL, #0x10
1006 sri v14.8h, v9.8h, #11
1007 mov v28.d[0], v14.d[0]
1008 mov v29.d[0], v14.d[1]
1009 - PF ble 10f
1010 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1011 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1012 - PF add PF_SRC, PF_SRC, #1
1013 + PF ble, 10f
1014 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1015 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1016 + PF add, PF_SRC, PF_SRC, #1
1018 raddhn v20.8b, v10.8h, v17.8h
1019 raddhn v23.8b, v11.8h, v19.8h
1020 - PF ble 10f
1021 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1022 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1023 - PF add PF_DST, PF_SRC, #1
1024 + PF ble, 10f
1025 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1026 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1027 + PF add, PF_DST, PF_SRC, #1
1029 raddhn v22.8b, v12.8h, v18.8h
1030 st1 {v14.8h}, [DST_W], #16
1031 .endm
1033 #else
1035 /* If we did not care much about the performance, we would just use this... */
1036 @@ -469,42 +469,42 @@ generate_composite_function \
1037 sri v14.8h, v8.8h, #5
1038 sri v14.8h, v9.8h, #11
1039 mov v28.d[0], v14.d[0]
1040 mov v29.d[0], v14.d[1]
1041 .endm
1043 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
1044 sri v14.8h, v8.8h, #5
1045 - PF add PF_X, PF_X, #8
1046 - PF tst PF_CTL, #0xF
1047 + PF add, PF_X, PF_X, #8
1048 + PF tst, PF_CTL, #0xF
1049 fetch_src_pixblock
1050 - PF beq 10f
1051 - PF add PF_X, PF_X, #8
1052 - PF sub PF_CTL, PF_CTL, #1
1053 + PF beq, 10f
1054 + PF add, PF_X, PF_X, #8
1055 + PF sub, PF_CTL, PF_CTL, #1
1057 sri v14.8h, v9.8h, #11
1058 mov v28.d[0], v14.d[0]
1059 mov v29.d[0], v14.d[1]
1060 - PF cmp PF_X, ORIG_W
1061 - PF lsl DUMMY, PF_X, #src_bpp_shift
1062 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1063 + PF cmp, PF_X, ORIG_W
1064 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1065 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1066 ushll v8.8h, v1.8b, #7
1067 sli v8.8h, v8.8h, #1
1068 st1 {v14.8h}, [DST_W], #16
1069 - PF ble 10f
1070 - PF sub PF_X, PF_X, ORIG_W
1071 - PF subs PF_CTL, PF_CTL, #0x10
1072 + PF ble, 10f
1073 + PF sub, PF_X, PF_X, ORIG_W
1074 + PF subs, PF_CTL, PF_CTL, #0x10
1076 ushll v14.8h, v2.8b, #7
1077 sli v14.8h, v14.8h, #1
1078 - PF ble 10f
1079 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1080 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1081 - PF add PF_SRC, PF_SRC, #1
1082 + PF ble, 10f
1083 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1084 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1085 + PF add, PF_SRC, PF_SRC, #1
1087 ushll v9.8h, v0.8b, #7
1088 sli v9.8h, v9.8h, #1
1089 .endm
1091 generate_composite_function \
1092 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
1093 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1094 @@ -561,41 +561,41 @@ generate_composite_function \
1095 uqadd v31.8b, v3.8b, v7.8b
1096 .endm
1098 .macro pixman_composite_add_8_8_process_pixblock_tail
1099 .endm
1101 .macro pixman_composite_add_8_8_process_pixblock_tail_head
1102 fetch_src_pixblock
1103 - PF add PF_X, PF_X, #32
1104 - PF tst PF_CTL, #0xF
1105 + PF add, PF_X, PF_X, #32
1106 + PF tst, PF_CTL, #0xF
1107 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1108 - PF beq 10f
1109 - PF add PF_X, PF_X, #32
1110 - PF sub PF_CTL, PF_CTL, #1
1111 + PF beq, 10f
1112 + PF add, PF_X, PF_X, #32
1113 + PF sub, PF_CTL, PF_CTL, #1
1115 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1116 - PF cmp PF_X, ORIG_W
1117 - PF lsl DUMMY, PF_X, #src_bpp_shift
1118 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1119 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1120 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1121 - PF ble 10f
1122 - PF sub PF_X, PF_X, ORIG_W
1123 - PF subs PF_CTL, PF_CTL, #0x10
1124 + PF cmp, PF_X, ORIG_W
1125 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1126 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1127 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1128 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1129 + PF ble, 10f
1130 + PF sub, PF_X, PF_X, ORIG_W
1131 + PF subs, PF_CTL, PF_CTL, #0x10
1133 uqadd v28.8b, v0.8b, v4.8b
1134 - PF ble 10f
1135 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1136 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1137 - PF add PF_SRC, PF_SRC, #1
1138 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1139 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1140 - PF add PF_DST, PF_DST, #1
1141 + PF ble, 10f
1142 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1143 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1144 + PF add, PF_SRC, PF_SRC, #1
1145 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1146 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1147 + PF add, PF_DST, PF_DST, #1
1149 uqadd v29.8b, v1.8b, v5.8b
1150 uqadd v30.8b, v2.8b, v6.8b
1151 uqadd v31.8b, v3.8b, v7.8b
1152 .endm
1154 generate_composite_function \
1155 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
1156 @@ -607,41 +607,41 @@ generate_composite_function \
1157 pixman_composite_add_8_8_process_pixblock_head, \
1158 pixman_composite_add_8_8_process_pixblock_tail, \
1159 pixman_composite_add_8_8_process_pixblock_tail_head
1161 /******************************************************************************/
1163 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
1164 fetch_src_pixblock
1165 - PF add PF_X, PF_X, #8
1166 - PF tst PF_CTL, #0xF
1167 + PF add, PF_X, PF_X, #8
1168 + PF tst, PF_CTL, #0xF
1169 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1170 - PF beq 10f
1171 - PF add PF_X, PF_X, #8
1172 - PF sub PF_CTL, PF_CTL, #1
1173 + PF beq, 10f
1174 + PF add, PF_X, PF_X, #8
1175 + PF sub, PF_CTL, PF_CTL, #1
1177 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1178 - PF cmp PF_X, ORIG_W
1179 - PF lsl DUMMY, PF_X, #src_bpp_shift
1180 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1181 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1182 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1183 - PF ble 10f
1184 - PF sub PF_X, PF_X, ORIG_W
1185 - PF subs PF_CTL, PF_CTL, #0x10
1186 + PF cmp, PF_X, ORIG_W
1187 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1188 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1189 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1190 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1191 + PF ble, 10f
1192 + PF sub, PF_X, PF_X, ORIG_W
1193 + PF subs, PF_CTL, PF_CTL, #0x10
1195 uqadd v28.8b, v0.8b, v4.8b
1196 - PF ble 10f
1197 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1198 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1199 - PF add PF_SRC, PF_SRC, #1
1200 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1201 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1202 - PF add PF_DST, PF_DST, #1
1203 + PF ble, 10f
1204 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1205 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1206 + PF add, PF_SRC, PF_SRC, #1
1207 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1208 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1209 + PF add, PF_DST, PF_DST, #1
1211 uqadd v29.8b, v1.8b, v5.8b
1212 uqadd v30.8b, v2.8b, v6.8b
1213 uqadd v31.8b, v3.8b, v7.8b
1214 .endm
1216 generate_composite_function \
1217 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
1218 @@ -684,55 +684,55 @@ generate_composite_function_single_scanl
1219 raddhn v29.8b, v15.8h, v9.8h
1220 raddhn v30.8b, v16.8h, v10.8h
1221 raddhn v31.8b, v17.8h, v11.8h
1222 .endm
1224 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1225 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1226 urshr v14.8h, v8.8h, #8
1227 - PF add PF_X, PF_X, #8
1228 - PF tst PF_CTL, #0xF
1229 + PF add, PF_X, PF_X, #8
1230 + PF tst, PF_CTL, #0xF
1231 urshr v15.8h, v9.8h, #8
1232 urshr v16.8h, v10.8h, #8
1233 urshr v17.8h, v11.8h, #8
1234 - PF beq 10f
1235 - PF add PF_X, PF_X, #8
1236 - PF sub PF_CTL, PF_CTL, #1
1237 + PF beq, 10f
1238 + PF add, PF_X, PF_X, #8
1239 + PF sub, PF_CTL, PF_CTL, #1
1241 raddhn v28.8b, v14.8h, v8.8h
1242 raddhn v29.8b, v15.8h, v9.8h
1243 - PF cmp PF_X, ORIG_W
1244 + PF cmp, PF_X, ORIG_W
1245 raddhn v30.8b, v16.8h, v10.8h
1246 raddhn v31.8b, v17.8h, v11.8h
1247 fetch_src_pixblock
1248 - PF lsl DUMMY, PF_X, #src_bpp_shift
1249 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1250 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1251 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1252 mvn v22.8b, v3.8b
1253 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1254 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1255 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1256 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1257 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1258 - PF ble 10f
1259 - PF sub PF_X, PF_X, ORIG_W
1260 + PF ble, 10f
1261 + PF sub, PF_X, PF_X, ORIG_W
1263 umull v8.8h, v22.8b, v4.8b
1264 - PF ble 10f
1265 - PF subs PF_CTL, PF_CTL, #0x10
1266 + PF ble, 10f
1267 + PF subs, PF_CTL, PF_CTL, #0x10
1269 umull v9.8h, v22.8b, v5.8b
1270 - PF ble 10f
1271 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1272 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1273 - PF add PF_SRC, PF_SRC, #1
1274 + PF ble, 10f
1275 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1276 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1277 + PF add, PF_SRC, PF_SRC, #1
1279 umull v10.8h, v22.8b, v6.8b
1280 - PF ble 10f
1281 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1282 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1283 - PF add PF_DST, PF_DST, #1
1284 + PF ble, 10f
1285 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1286 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1287 + PF add, PF_DST, PF_DST, #1
1289 umull v11.8h, v22.8b, v7.8b
1290 .endm
1292 generate_composite_function_single_scanline \
1293 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
1294 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1295 8, /* number of pixels, processed in a single block */ \
1296 @@ -754,59 +754,59 @@ generate_composite_function_single_scanl
1297 uqadd v29.8b, v1.8b, v29.8b
1298 uqadd v30.8b, v2.8b, v30.8b
1299 uqadd v31.8b, v3.8b, v31.8b
1300 .endm
1302 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
1303 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1304 urshr v14.8h, v8.8h, #8
1305 - PF add PF_X, PF_X, #8
1306 - PF tst PF_CTL, #0xF
1307 + PF add, PF_X, PF_X, #8
1308 + PF tst, PF_CTL, #0xF
1309 urshr v15.8h, v9.8h, #8
1310 urshr v16.8h, v10.8h, #8
1311 urshr v17.8h, v11.8h, #8
1312 - PF beq 10f
1313 - PF add PF_X, PF_X, #8
1314 - PF sub PF_CTL, PF_CTL, #1
1315 + PF beq, 10f
1316 + PF add, PF_X, PF_X, #8
1317 + PF sub, PF_CTL, PF_CTL, #1
1319 raddhn v28.8b, v14.8h, v8.8h
1320 raddhn v29.8b, v15.8h, v9.8h
1321 - PF cmp PF_X, ORIG_W
1322 + PF cmp, PF_X, ORIG_W
1323 raddhn v30.8b, v16.8h, v10.8h
1324 raddhn v31.8b, v17.8h, v11.8h
1325 uqadd v28.8b, v0.8b, v28.8b
1326 uqadd v29.8b, v1.8b, v29.8b
1327 uqadd v30.8b, v2.8b, v30.8b
1328 uqadd v31.8b, v3.8b, v31.8b
1329 fetch_src_pixblock
1330 - PF lsl DUMMY, PF_X, #src_bpp_shift
1331 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1332 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1333 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1334 mvn v22.8b, v3.8b
1335 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1336 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1337 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1338 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1339 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1340 - PF ble 10f
1341 - PF sub PF_X, PF_X, ORIG_W
1342 + PF ble, 10f
1343 + PF sub, PF_X, PF_X, ORIG_W
1345 umull v8.8h, v22.8b, v4.8b
1346 - PF ble 10f
1347 - PF subs PF_CTL, PF_CTL, #0x10
1348 + PF ble, 10f
1349 + PF subs, PF_CTL, PF_CTL, #0x10
1351 umull v9.8h, v22.8b, v5.8b
1352 - PF ble 10f
1353 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1354 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1355 - PF add PF_SRC, PF_SRC, #1
1356 + PF ble, 10f
1357 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1358 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1359 + PF add, PF_SRC, PF_SRC, #1
1361 umull v10.8h, v22.8b, v6.8b
1362 - PF ble 10f
1363 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1364 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1365 - PF add PF_DST, PF_DST, #1
1366 + PF ble, 10f
1367 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1368 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1369 + PF add, PF_DST, PF_DST, #1
1371 umull v11.8h, v22.8b, v7.8b
1372 .endm
1374 generate_composite_function \
1375 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
1376 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1377 8, /* number of pixels, processed in a single block */ \
1378 @@ -860,40 +860,40 @@ generate_composite_function_single_scanl
1379 urshr v16.8h, v10.8h, #8
1380 urshr v17.8h, v11.8h, #8
1381 raddhn v28.8b, v14.8h, v8.8h
1382 raddhn v29.8b, v15.8h, v9.8h
1383 raddhn v30.8b, v16.8h, v10.8h
1384 raddhn v31.8b, v17.8h, v11.8h
1385 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1386 uqadd v28.8b, v0.8b, v28.8b
1387 - PF add PF_X, PF_X, #8
1388 - PF tst PF_CTL, #0x0F
1389 - PF beq 10f
1390 - PF add PF_X, PF_X, #8
1391 - PF sub PF_CTL, PF_CTL, #1
1392 + PF add, PF_X, PF_X, #8
1393 + PF tst, PF_CTL, #0x0F
1394 + PF beq, 10f
1395 + PF add, PF_X, PF_X, #8
1396 + PF sub, PF_CTL, PF_CTL, #1
1398 uqadd v29.8b, v1.8b, v29.8b
1399 uqadd v30.8b, v2.8b, v30.8b
1400 uqadd v31.8b, v3.8b, v31.8b
1401 - PF cmp PF_X, ORIG_W
1402 + PF cmp, PF_X, ORIG_W
1403 umull v8.8h, v24.8b, v4.8b
1404 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1405 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1406 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1407 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1408 umull v9.8h, v24.8b, v5.8b
1409 - PF ble 10f
1410 - PF sub PF_X, PF_X, ORIG_W
1411 + PF ble, 10f
1412 + PF sub, PF_X, PF_X, ORIG_W
1414 umull v10.8h, v24.8b, v6.8b
1415 - PF subs PF_CTL, PF_CTL, #0x10
1416 + PF subs, PF_CTL, PF_CTL, #0x10
1417 umull v11.8h, v24.8b, v7.8b
1418 - PF ble 10f
1419 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1420 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1421 - PF add PF_DST, PF_DST, #1
1422 + PF ble, 10f
1423 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1424 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1425 + PF add, PF_DST, PF_DST, #1
1427 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1428 .endm
1430 .macro pixman_composite_over_n_8888_init
1431 mov v3.s[0], w4
1432 dup v0.8b, v3.b[0]
1433 dup v1.8b, v3.b[1]
1434 @@ -912,52 +912,52 @@ generate_composite_function \
1435 pixman_composite_over_8888_8888_process_pixblock_head, \
1436 pixman_composite_over_8888_8888_process_pixblock_tail, \
1437 pixman_composite_over_n_8888_process_pixblock_tail_head
1439 /******************************************************************************/
1441 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
1442 urshr v14.8h, v8.8h, #8
1443 - PF add PF_X, PF_X, #8
1444 - PF tst PF_CTL, #0xF
1445 + PF add, PF_X, PF_X, #8
1446 + PF tst, PF_CTL, #0xF
1447 urshr v15.8h, v9.8h, #8
1448 urshr v12.8h, v10.8h, #8
1449 urshr v13.8h, v11.8h, #8
1450 - PF beq 10f
1451 - PF add PF_X, PF_X, #8
1452 - PF sub PF_CTL, PF_CTL, #1
1453 + PF beq, 10f
1454 + PF add, PF_X, PF_X, #8
1455 + PF sub, PF_CTL, PF_CTL, #1
1457 raddhn v28.8b, v14.8h, v8.8h
1458 raddhn v29.8b, v15.8h, v9.8h
1459 - PF cmp PF_X, ORIG_W
1460 + PF cmp, PF_X, ORIG_W
1461 raddhn v30.8b, v12.8h, v10.8h
1462 raddhn v31.8b, v13.8h, v11.8h
1463 uqadd v28.8b, v0.8b, v28.8b
1464 uqadd v29.8b, v1.8b, v29.8b
1465 uqadd v30.8b, v2.8b, v30.8b
1466 uqadd v31.8b, v3.8b, v31.8b
1467 ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
1468 mvn v22.8b, v3.8b
1469 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1470 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1471 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1472 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1473 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1474 - PF blt 10f
1475 - PF sub PF_X, PF_X, ORIG_W
1476 + PF blt, 10f
1477 + PF sub, PF_X, PF_X, ORIG_W
1479 umull v8.8h, v22.8b, v4.8b
1480 - PF blt 10f
1481 - PF subs PF_CTL, PF_CTL, #0x10
1482 + PF blt, 10f
1483 + PF subs, PF_CTL, PF_CTL, #0x10
1485 umull v9.8h, v22.8b, v5.8b
1486 umull v10.8h, v22.8b, v6.8b
1487 - PF blt 10f
1488 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1489 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1490 - PF add PF_DST, PF_DST, #1
1491 + PF blt, 10f
1492 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1493 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1494 + PF add, PF_DST, PF_DST, #1
1496 umull v11.8h, v22.8b, v7.8b
1497 .endm
1499 .macro pixman_composite_over_reverse_n_8888_init
1500 mov v7.s[0], w4
1501 dup v4.8b, v7.b[0]
1502 dup v5.8b, v7.b[1]
1503 @@ -1405,45 +1405,45 @@ generate_composite_function \
1504 rshrn v28.8b, v8.8h, #8
1505 rshrn v29.8b, v9.8h, #8
1506 rshrn v30.8b, v10.8h, #8
1507 rshrn v31.8b, v11.8h, #8
1508 .endm
1510 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1511 fetch_mask_pixblock
1512 - PF add PF_X, PF_X, #8
1513 + PF add, PF_X, PF_X, #8
1514 rshrn v28.8b, v8.8h, #8
1515 - PF tst PF_CTL, #0x0F
1516 + PF tst, PF_CTL, #0x0F
1517 rshrn v29.8b, v9.8h, #8
1518 - PF beq 10f
1519 - PF add PF_X, PF_X, #8
1520 + PF beq, 10f
1521 + PF add, PF_X, PF_X, #8
1523 rshrn v30.8b, v10.8h, #8
1524 - PF beq 10f
1525 - PF sub PF_CTL, PF_CTL, #1
1526 + PF beq, 10f
1527 + PF sub, PF_CTL, PF_CTL, #1
1529 rshrn v31.8b, v11.8h, #8
1530 - PF cmp PF_X, ORIG_W
1531 + PF cmp, PF_X, ORIG_W
1532 umull v8.8h, v24.8b, v0.8b
1533 - PF lsl DUMMY, PF_X, #mask_bpp_shift
1534 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1535 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
1536 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1537 umull v9.8h, v24.8b, v1.8b
1538 - PF ble 10f
1539 - PF sub PF_X, PF_X, ORIG_W
1540 + PF ble, 10f
1541 + PF sub, PF_X, PF_X, ORIG_W
1543 umull v10.8h, v24.8b, v2.8b
1544 - PF ble 10f
1545 - PF subs PF_CTL, PF_CTL, #0x10
1546 + PF ble, 10f
1547 + PF subs, PF_CTL, PF_CTL, #0x10
1549 umull v11.8h, v24.8b, v3.8b
1550 - PF ble 10f
1551 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1552 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1553 - PF add PF_MASK, PF_MASK, #1
1554 + PF ble, 10f
1555 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1556 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1557 + PF add, PF_MASK, PF_MASK, #1
1559 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1560 ursra v8.8h, v8.8h, #8
1561 ursra v9.8h, v9.8h, #8
1562 ursra v10.8h, v10.8h, #8
1563 ursra v11.8h, v11.8h, #8
1564 .endm
1566 @@ -1486,45 +1486,45 @@ generate_composite_function \
1567 rshrn v28.8b, v0.8h, #8
1568 rshrn v29.8b, v1.8h, #8
1569 rshrn v30.8b, v2.8h, #8
1570 rshrn v31.8b, v3.8h, #8
1571 .endm
1573 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1574 fetch_mask_pixblock
1575 - PF add PF_X, PF_X, #8
1576 + PF add, PF_X, PF_X, #8
1577 rshrn v28.8b, v0.8h, #8
1578 - PF tst PF_CTL, #0x0F
1579 + PF tst, PF_CTL, #0x0F
1580 rshrn v29.8b, v1.8h, #8
1581 - PF beq 10f
1582 - PF add PF_X, PF_X, #8
1583 + PF beq, 10f
1584 + PF add, PF_X, PF_X, #8
1586 rshrn v30.8b, v2.8h, #8
1587 - PF beq 10f
1588 - PF sub PF_CTL, PF_CTL, #1
1589 + PF beq, 10f
1590 + PF sub, PF_CTL, PF_CTL, #1
1592 rshrn v31.8b, v3.8h, #8
1593 - PF cmp PF_X, ORIG_W
1594 + PF cmp, PF_X, ORIG_W
1595 umull v0.8h, v24.8b, v16.8b
1596 - PF lsl DUMMY, PF_X, mask_bpp_shift
1597 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1598 + PF lsl, DUMMY, PF_X, mask_bpp_shift
1599 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1600 umull v1.8h, v25.8b, v16.8b
1601 - PF ble 10f
1602 - PF sub PF_X, PF_X, ORIG_W
1603 + PF ble, 10f
1604 + PF sub, PF_X, PF_X, ORIG_W
1606 umull v2.8h, v26.8b, v16.8b
1607 - PF ble 10f
1608 - PF subs PF_CTL, PF_CTL, #0x10
1609 + PF ble, 10f
1610 + PF subs, PF_CTL, PF_CTL, #0x10
1612 umull v3.8h, v27.8b, v16.8b
1613 - PF ble 10f
1614 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1615 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1616 - PF add PF_MASK, PF_MASK, #1
1617 + PF ble, 10f
1618 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1619 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1620 + PF add, PF_MASK, PF_MASK, #1
1622 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1623 ursra v0.8h, v0.8h, #8
1624 ursra v1.8h, v1.8h, #8
1625 ursra v2.8h, v2.8h, #8
1626 ursra v3.8h, v3.8h, #8
1627 .endm
1629 @@ -1594,54 +1594,54 @@ generate_composite_function \
1630 .endm
1632 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1633 urshr v16.8h, v12.8h, #8
1634 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1635 urshr v17.8h, v13.8h, #8
1636 fetch_mask_pixblock
1637 urshr v18.8h, v14.8h, #8
1638 - PF add PF_X, PF_X, #8
1639 + PF add, PF_X, PF_X, #8
1640 urshr v19.8h, v15.8h, #8
1641 - PF tst PF_CTL, #0x0F
1642 + PF tst, PF_CTL, #0x0F
1643 raddhn v28.8b, v16.8h, v12.8h
1644 - PF beq 10f
1645 - PF add PF_X, PF_X, #8
1646 + PF beq, 10f
1647 + PF add, PF_X, PF_X, #8
1649 raddhn v29.8b, v17.8h, v13.8h
1650 - PF beq 10f
1651 - PF sub PF_CTL, PF_CTL, #1
1652 + PF beq, 10f
1653 + PF sub, PF_CTL, PF_CTL, #1
1655 raddhn v30.8b, v18.8h, v14.8h
1656 - PF cmp PF_X, ORIG_W
1657 + PF cmp, PF_X, ORIG_W
1658 raddhn v31.8b, v19.8h, v15.8h
1659 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1660 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1661 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1662 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1663 umull v16.8h, v24.8b, v8.8b
1664 - PF lsl DUMMY, PF_X, #mask_bpp_shift
1665 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1666 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
1667 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1668 umull v17.8h, v24.8b, v9.8b
1669 - PF ble 10f
1670 - PF sub PF_X, PF_X, ORIG_W
1671 + PF ble, 10f
1672 + PF sub, PF_X, PF_X, ORIG_W
1674 umull v18.8h, v24.8b, v10.8b
1675 - PF ble 10f
1676 - PF subs PF_CTL, PF_CTL, #0x10
1677 + PF ble, 10f
1678 + PF subs, PF_CTL, PF_CTL, #0x10
1680 umull v19.8h, v24.8b, v11.8b
1681 - PF ble 10f
1682 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1683 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1684 - PF add PF_DST, PF_DST, #1
1685 + PF ble, 10f
1686 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1687 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1688 + PF add, PF_DST, PF_DST, #1
1690 uqadd v28.8b, v0.8b, v28.8b
1691 - PF ble 10f
1692 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1693 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1694 - PF add PF_MASK, PF_MASK, #1
1695 + PF ble, 10f
1696 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1697 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1698 + PF add, PF_MASK, PF_MASK, #1
1700 uqadd v29.8b, v1.8b, v29.8b
1701 uqadd v30.8b, v2.8b, v30.8b
1702 uqadd v31.8b, v3.8b, v31.8b
1703 urshr v12.8h, v16.8h, #8
1704 urshr v13.8h, v17.8h, #8
1705 urshr v14.8h, v18.8h, #8
1706 urshr v15.8h, v19.8h, #8
1707 @@ -2407,17 +2407,17 @@ generate_composite_function \
1708 generate_composite_function_single_scanline \
1709 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1710 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1711 8, /* number of pixels, processed in a single block */ \
1712 default_init_need_all_regs, \
1713 default_cleanup_need_all_regs, \
1714 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1715 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1716 - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1717 + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
1718 28, /* dst_w_basereg */ \
1719 4, /* dst_r_basereg */ \
1720 0, /* src_basereg */ \
1721 12 /* mask_basereg */
1723 /******************************************************************************/
1725 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1726 @@ -2482,31 +2482,31 @@ generate_composite_function \
1727 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1729 8, /* number of pixels, processed in a single block */ \
1730 5, /* prefetch distance */ \
1731 default_init_need_all_regs, \
1732 default_cleanup_need_all_regs, \
1733 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1734 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1735 - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1736 + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1737 28, /* dst_w_basereg */ \
1738 4, /* dst_r_basereg */ \
1739 0, /* src_basereg */ \
1740 12 /* mask_basereg */
1742 generate_composite_function_single_scanline \
1743 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1744 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1745 8, /* number of pixels, processed in a single block */ \
1746 default_init_need_all_regs, \
1747 default_cleanup_need_all_regs, \
1748 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1749 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1750 - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1751 + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1752 28, /* dst_w_basereg */ \
1753 4, /* dst_r_basereg */ \
1754 0, /* src_basereg */ \
1755 12 /* mask_basereg */
1757 /******************************************************************************/
1759 /* TODO: expand macros and do better instructions scheduling */
1760 @@ -2524,17 +2524,17 @@ generate_composite_function \
1761 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1762 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1763 8, /* number of pixels, processed in a single block */ \
1764 5, /* prefetch distance */ \
1765 default_init_need_all_regs, \
1766 default_cleanup_need_all_regs, \
1767 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1768 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1769 - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1770 + pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
1771 28, /* dst_w_basereg */ \
1772 4, /* dst_r_basereg */ \
1773 0, /* src_basereg */ \
1774 15 /* mask_basereg */
1776 /******************************************************************************/
1778 .macro pixman_composite_src_0888_0888_process_pixblock_head
1779 @@ -2675,38 +2675,38 @@ generate_composite_function \
1780 urshr v11.8h, v8.8h, #8
1781 mov v30.8b, v31.8b
1782 mov v31.8b, v3.8b
1783 mov v3.8b, v31.8b
1784 urshr v12.8h, v9.8h, #8
1785 urshr v13.8h, v10.8h, #8
1786 fetch_src_pixblock
1787 raddhn v30.8b, v11.8h, v8.8h
1788 - PF add PF_X, PF_X, #8
1789 - PF tst PF_CTL, #0xF
1790 - PF beq 10f
1791 - PF add PF_X, PF_X, #8
1792 - PF sub PF_CTL, PF_CTL, #1
1793 + PF add, PF_X, PF_X, #8
1794 + PF tst, PF_CTL, #0xF
1795 + PF beq, 10f
1796 + PF add, PF_X, PF_X, #8
1797 + PF sub, PF_CTL, PF_CTL, #1
1799 raddhn v29.8b, v12.8h, v9.8h
1800 raddhn v28.8b, v13.8h, v10.8h
1801 umull v8.8h, v3.8b, v0.8b
1802 umull v9.8h, v3.8b, v1.8b
1803 umull v10.8h, v3.8b, v2.8b
1804 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1805 - PF cmp PF_X, ORIG_W
1806 - PF lsl DUMMY, PF_X, src_bpp_shift
1807 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1808 - PF ble 10f
1809 - PF sub PF_X, PF_X, ORIG_W
1810 - PF subs PF_CTL, PF_CTL, #0x10
1811 - PF ble 10f
1812 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1813 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1814 - PF add PF_SRC, PF_SRC, #1
1815 + PF cmp, PF_X, ORIG_W
1816 + PF lsl, DUMMY, PF_X, src_bpp_shift
1817 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1818 + PF ble, 10f
1819 + PF sub, PF_X, PF_X, ORIG_W
1820 + PF subs, PF_CTL, PF_CTL, #0x10
1821 + PF ble, 10f
1822 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1823 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1824 + PF add, PF_SRC, PF_SRC, #1
1826 .endm
1828 generate_composite_function \
1829 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1830 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1831 8, /* number of pixels, processed in a single block */ \
1832 10, /* prefetch distance */ \
1833 @@ -2744,38 +2744,38 @@ generate_composite_function \
1834 urshr v11.8h, v8.8h, #8
1835 mov v30.8b, v31.8b
1836 mov v31.8b, v3.8b
1837 mov v3.8b, v30.8b
1838 urshr v12.8h, v9.8h, #8
1839 urshr v13.8h, v10.8h, #8
1840 fetch_src_pixblock
1841 raddhn v28.8b, v11.8h, v8.8h
1842 - PF add PF_X, PF_X, #8
1843 - PF tst PF_CTL, #0xF
1844 - PF beq 10f
1845 - PF add PF_X, PF_X, #8
1846 - PF sub PF_CTL, PF_CTL, #1
1847 + PF add, PF_X, PF_X, #8
1848 + PF tst, PF_CTL, #0xF
1849 + PF beq, 10f
1850 + PF add, PF_X, PF_X, #8
1851 + PF sub, PF_CTL, PF_CTL, #1
1853 raddhn v29.8b, v12.8h, v9.8h
1854 raddhn v30.8b, v13.8h, v10.8h
1855 umull v8.8h, v3.8b, v0.8b
1856 umull v9.8h, v3.8b, v1.8b
1857 umull v10.8h, v3.8b, v2.8b
1858 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1859 - PF cmp PF_X, ORIG_W
1860 - PF lsl DUMMY, PF_X, src_bpp_shift
1861 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1862 - PF ble 10f
1863 - PF sub PF_X, PF_X, ORIG_W
1864 - PF subs PF_CTL, PF_CTL, #0x10
1865 - PF ble 10f
1866 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1867 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1868 - PF add PF_SRC, PF_SRC, #1
1869 + PF cmp, PF_X, ORIG_W
1870 + PF lsl, DUMMY, PF_X, src_bpp_shift
1871 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1872 + PF ble, 10f
1873 + PF sub, PF_X, PF_X, ORIG_W
1874 + PF subs, PF_CTL, PF_CTL, #0x10
1875 + PF ble, 10f
1876 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1877 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1878 + PF add, PF_SRC, PF_SRC, #1
1880 .endm
1882 generate_composite_function \
1883 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
1884 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1885 8, /* number of pixels, processed in a single block */ \
1886 10, /* prefetch distance */ \
1887 @@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan
1888 * format conversion, and interpolation as separate macros which can be used
1889 * as the basic building blocks for constructing bilinear scanline functions.
1892 .macro bilinear_load_8888 reg1, reg2, tmp
1893 asr TMP1, X, #16
1894 add X, X, UX
1895 add TMP1, TOP, TMP1, lsl #2
1896 - ld1 {&reg1&.2s}, [TMP1], STRIDE
1897 - ld1 {&reg2&.2s}, [TMP1]
1898 + ld1 {\()\reg1\().2s}, [TMP1], STRIDE
1899 + ld1 {\()\reg2\().2s}, [TMP1]
1900 .endm
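This hunk restates the whole bilinear section with portable macro-argument syntax: \reg1 substitutes the argument and \() is an empty separator that ends the argument name so the .2s suffix can follow, replacing the older &reg1& concatenation form that binutils gas tolerated (presumably via altmacro-style processing) but clang's integrated assembler rejects. A minimal sketch with a hypothetical macro, assuming x0 points at pixel data:

    .macro demo_load reg1, reg2
        ld1     {\()\reg1\().2s}, [x0], #8    /* \reg1 is substituted, \() ends the name */
        ld1     {\()\reg2\().2s}, [x0]
    .endm

    demo_load v0, v1    /* expands to: ld1 {v0.2s}, [x0], #8  then  ld1 {v1.2s}, [x0] */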
1902 .macro bilinear_load_0565 reg1, reg2, tmp
1903 asr TMP1, X, #16
1904 add X, X, UX
1905 add TMP1, TOP, TMP1, lsl #1
1906 - ld1 {&reg2&.s}[0], [TMP1], STRIDE
1907 - ld1 {&reg2&.s}[1], [TMP1]
1908 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
1909 + ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
1910 + ld1 {\()\reg2\().s}[1], [TMP1]
1911 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
1912 .endm
1914 .macro bilinear_load_and_vertical_interpolate_two_8888 \
1915 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
1917 - bilinear_load_8888 reg1, reg2, tmp1
1918 - umull &acc1&.8h, &reg1&.8b, v28.8b
1919 - umlal &acc1&.8h, &reg2&.8b, v29.8b
1920 - bilinear_load_8888 reg3, reg4, tmp2
1921 - umull &acc2&.8h, &reg3&.8b, v28.8b
1922 - umlal &acc2&.8h, &reg4&.8b, v29.8b
1923 + bilinear_load_8888 \reg1, \reg2, \tmp1
1924 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
1925 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
1926 + bilinear_load_8888 \reg3, \reg4, \tmp2
1927 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
1928 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
1929 .endm
1931 .macro bilinear_load_and_vertical_interpolate_four_8888 \
1932 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1933 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
1934 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1936 bilinear_load_and_vertical_interpolate_two_8888 \
1937 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
1938 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
1939 bilinear_load_and_vertical_interpolate_two_8888 \
1940 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1941 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
1942 .endm
1944 .macro vzip reg1, reg2
1945 umov TMP4, v31.d[0]
1946 - zip1 v31.8b, reg1, reg2
1947 - zip2 reg2, reg1, reg2
1948 - mov reg1, v31.8b
1949 + zip1 v31.8b, \reg1, \reg2
1950 + zip2 \reg2, \reg1, \reg2
1951 + mov \reg1, v31.8b
1952 mov v31.d[0], TMP4
1953 .endm
1955 .macro vuzp reg1, reg2
1956 umov TMP4, v31.d[0]
1957 - uzp1 v31.8b, reg1, reg2
1958 - uzp2 reg2, reg1, reg2
1959 - mov reg1, v31.8b
1960 + uzp1 v31.8b, \reg1, \reg2
1961 + uzp2 \reg2, \reg1, \reg2
1962 + mov \reg1, v31.8b
1963 mov v31.d[0], TMP4
1964 .endm
1966 .macro bilinear_load_and_vertical_interpolate_two_0565 \
1967 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
1968 asr TMP1, X, #16
1969 add X, X, UX
1970 add TMP1, TOP, TMP1, lsl #1
1971 asr TMP2, X, #16
1972 add X, X, UX
1973 add TMP2, TOP, TMP2, lsl #1
1974 - ld1 {&acc2&.s}[0], [TMP1], STRIDE
1975 - ld1 {&acc2&.s}[2], [TMP2], STRIDE
1976 - ld1 {&acc2&.s}[1], [TMP1]
1977 - ld1 {&acc2&.s}[3], [TMP2]
1978 - convert_0565_to_x888 acc2, reg3, reg2, reg1
1979 - vzip &reg1&.8b, &reg3&.8b
1980 - vzip &reg2&.8b, &reg4&.8b
1981 - vzip &reg3&.8b, &reg4&.8b
1982 - vzip &reg1&.8b, &reg2&.8b
1983 - umull &acc1&.8h, &reg1&.8b, v28.8b
1984 - umlal &acc1&.8h, &reg2&.8b, v29.8b
1985 - umull &acc2&.8h, &reg3&.8b, v28.8b
1986 - umlal &acc2&.8h, &reg4&.8b, v29.8b
1987 + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
1988 + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
1989 + ld1 {\()\acc2\().s}[1], [TMP1]
1990 + ld1 {\()\acc2\().s}[3], [TMP2]
1991 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
1992 + vzip \()\reg1\().8b, \()\reg3\().8b
1993 + vzip \()\reg2\().8b, \()\reg4\().8b
1994 + vzip \()\reg3\().8b, \()\reg4\().8b
1995 + vzip \()\reg1\().8b, \()\reg2\().8b
1996 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
1997 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
1998 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
1999 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
2000 .endm
2002 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2003 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2004 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
2005 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2006 asr TMP1, X, #16
2007 add X, X, UX
2008 add TMP1, TOP, TMP1, lsl #1
2009 asr TMP2, X, #16
2010 add X, X, UX
2011 add TMP2, TOP, TMP2, lsl #1
2012 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE
2013 - ld1 {&xacc2&.s}[2], [TMP2], STRIDE
2014 - ld1 {&xacc2&.s}[1], [TMP1]
2015 - ld1 {&xacc2&.s}[3], [TMP2]
2016 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2017 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
2018 + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
2019 + ld1 {\()\xacc2\().s}[1], [TMP1]
2020 + ld1 {\()\xacc2\().s}[3], [TMP2]
2021 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
2022 asr TMP1, X, #16
2023 add X, X, UX
2024 add TMP1, TOP, TMP1, lsl #1
2025 asr TMP2, X, #16
2026 add X, X, UX
2027 add TMP2, TOP, TMP2, lsl #1
2028 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE
2029 - vzip &xreg1&.8b, &xreg3&.8b
2030 - ld1 {&yacc2&.s}[2], [TMP2], STRIDE
2031 - vzip &xreg2&.8b, &xreg4&.8b
2032 - ld1 {&yacc2&.s}[1], [TMP1]
2033 - vzip &xreg3&.8b, &xreg4&.8b
2034 - ld1 {&yacc2&.s}[3], [TMP2]
2035 - vzip &xreg1&.8b, &xreg2&.8b
2036 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2037 - umull &xacc1&.8h, &xreg1&.8b, v28.8b
2038 - vzip &yreg1&.8b, &yreg3&.8b
2039 - umlal &xacc1&.8h, &xreg2&.8b, v29.8b
2040 - vzip &yreg2&.8b, &yreg4&.8b
2041 - umull &xacc2&.8h, &xreg3&.8b, v28.8b
2042 - vzip &yreg3&.8b, &yreg4&.8b
2043 - umlal &xacc2&.8h, &xreg4&.8b, v29.8b
2044 - vzip &yreg1&.8b, &yreg2&.8b
2045 - umull &yacc1&.8h, &yreg1&.8b, v28.8b
2046 - umlal &yacc1&.8h, &yreg2&.8b, v29.8b
2047 - umull &yacc2&.8h, &yreg3&.8b, v28.8b
2048 - umlal &yacc2&.8h, &yreg4&.8b, v29.8b
2049 + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
2050 + vzip \()\xreg1\().8b, \()\xreg3\().8b
2051 + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
2052 + vzip \()\xreg2\().8b, \()\xreg4\().8b
2053 + ld1 {\()\yacc2\().s}[1], [TMP1]
2054 + vzip \()\xreg3\().8b, \()\xreg4\().8b
2055 + ld1 {\()\yacc2\().s}[3], [TMP2]
2056 + vzip \()\xreg1\().8b, \()\xreg2\().8b
2057 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
2058 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
2059 + vzip \()\yreg1\().8b, \()\yreg3\().8b
2060 + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
2061 + vzip \()\yreg2\().8b, \()\yreg4\().8b
2062 + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
2063 + vzip \()\yreg3\().8b, \()\yreg4\().8b
2064 + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
2065 + vzip \()\yreg1\().8b, \()\yreg2\().8b
2066 + umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
2067 + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
2068 + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
2069 + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
2070 .endm
2072 .macro bilinear_store_8888 numpix, tmp1, tmp2
2073 -.if numpix == 4
2074 +.if \numpix == 4
2075 st1 {v0.2s, v1.2s}, [OUT], #16
2076 -.elseif numpix == 2
2077 +.elseif \numpix == 2
2078 st1 {v0.2s}, [OUT], #8
2079 -.elseif numpix == 1
2080 +.elseif \numpix == 1
2081 st1 {v0.s}[0], [OUT], #4
2082 .else
2083 - .error bilinear_store_8888 numpix is unsupported
2084 + .error bilinear_store_8888 \numpix is unsupported
2085 .endif
2086 .endm
2088 .macro bilinear_store_0565 numpix, tmp1, tmp2
2089 vuzp v0.8b, v1.8b
2090 vuzp v2.8b, v3.8b
2091 vuzp v1.8b, v3.8b
2092 vuzp v0.8b, v2.8b
2093 - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
2094 -.if numpix == 4
2095 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
2096 +.if \numpix == 4
2097 st1 {v1.4h}, [OUT], #8
2098 -.elseif numpix == 2
2099 +.elseif \numpix == 2
2100 st1 {v1.s}[0], [OUT], #4
2101 -.elseif numpix == 1
2102 +.elseif \numpix == 1
2103 st1 {v1.h}[0], [OUT], #2
2104 .else
2105 - .error bilinear_store_0565 numpix is unsupported
2106 + .error bilinear_store_0565 \numpix is unsupported
2107 .endif
2108 .endm
2110 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2111 - bilinear_load_&src_fmt v0, v1, v2
2112 + bilinear_load_\()\src_fmt v0, v1, v2
2113 umull v2.8h, v0.8b, v28.8b
2114 umlal v2.8h, v1.8b, v29.8b
2115 /* 5 cycles bubble */
2116 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
2117 umlsl v0.4s, v2.4h, v15.h[0]
2118 umlal2 v0.4s, v2.8h, v15.h[0]
2119 /* 5 cycles bubble */
2120 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2121 /* 3 cycles bubble */
2122 xtn v0.8b, v0.8h
2123 /* 1 cycle bubble */
2124 - bilinear_store_&dst_fmt 1, v3, v4
2125 + bilinear_store_\()\dst_fmt 1, v3, v4
2126 .endm
2128 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2129 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
2130 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
2131 v1, v11, v2, v3, v20, v21, v22, v23
2132 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
2133 umlsl v0.4s, v1.4h, v15.h[0]
2134 umlal2 v0.4s, v1.8h, v15.h[0]
2135 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
2136 umlsl v10.4s, v11.4h, v15.h[4]
2137 umlal2 v10.4s, v11.8h, v15.h[4]
2138 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2139 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2140 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2141 add v12.8h, v12.8h, v13.8h
2142 xtn v0.8b, v0.8h
2143 - bilinear_store_&dst_fmt 2, v3, v4
2144 + bilinear_store_\()\dst_fmt 2, v3, v4
2145 .endm
2147 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2148 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
2149 - v1, v11, v14, v20, v16, v17, v22, v23 \
2150 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
2151 + v1, v11, v14, v20, v16, v17, v22, v23, \
2152 v3, v9, v24, v25, v26, v27, v18, v19
2153 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
2154 sub TMP1, TMP1, STRIDE
2155 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
2156 umlsl v0.4s, v1.4h, v15.h[0]
2157 umlal2 v0.4s, v1.8h, v15.h[0]
2158 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
2159 umlsl v10.4s, v11.4h, v15.h[4]
2160 @@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan
2161 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2162 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2163 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2164 shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2165 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2166 xtn v0.8b, v0.8h
2167 xtn v1.8b, v2.8h
2168 add v12.8h, v12.8h, v13.8h
2169 - bilinear_store_&dst_fmt 4, v3, v4
2170 + bilinear_store_\()\dst_fmt 4, v3, v4
2171 .endm
2173 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2174 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2175 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2176 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2177 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
2178 .else
2179 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2180 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2181 .endif
2182 .endm
2184 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2185 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2186 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2187 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2188 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
2189 .endif
2190 .endm
2192 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2193 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2194 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2195 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2196 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
2197 .else
2198 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2199 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2200 .endif
2201 .endm
2203 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2204 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2205 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
2206 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2207 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
2208 .else
2209 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2210 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2211 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
2212 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2213 .endif
2214 .endm
2216 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2217 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2218 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
2219 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2220 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
2221 .else
2222 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2223 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
2224 .endif
2225 .endm
2227 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2228 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2229 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
2230 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2231 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
2232 .else
2233 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2234 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2235 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2236 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2237 .endif
2238 .endm
2240 .set BILINEAR_FLAG_UNROLL_4, 0
2241 .set BILINEAR_FLAG_UNROLL_8, 1
2242 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
2245 @@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan
2246 * prefetch_distance - prefetch in the source image by that many
2247 * pixels ahead
2250 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2251 src_bpp_shift, dst_bpp_shift, \
2252 prefetch_distance, flags
2254 -pixman_asm_function fname
2255 +pixman_asm_function \fname
2256 OUT .req x0
2257 TOP .req x1
2258 BOTTOM .req x2
2259 WT .req x3
2260 WB .req x4
2261 X .req x5
2262 UX .req x6
2263 WIDTH .req x7
2264 @@ -3437,17 +3437,17 @@ pixman_asm_function fname
2265 sub sp, sp, 112 /* push all registers */
2266 sub x29, x29, 64
2267 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
2268 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
2269 stp x8, x9, [x29, -80]
2270 stp x10, x11, [x29, -96]
2271 stp x12, x13, [x29, -112]
2273 - mov PF_OFFS, #prefetch_distance
2274 + mov PF_OFFS, #\prefetch_distance
2275 mul PF_OFFS, PF_OFFS, UX
2277 subs STRIDE, BOTTOM, TOP
2278 .unreq BOTTOM
2280 cmp WIDTH, #0
2281 ble 300f
2283 @@ -3458,85 +3458,85 @@ pixman_asm_function fname
2284 mov v25.d[0], v12.d[1]
2285 mov v26.d[0], v13.d[0]
2286 add v25.4h, v25.4h, v26.4h
2287 mov v12.d[1], v25.d[0]
2289 /* ensure good destination alignment */
2290 cmp WIDTH, #1
2291 blt 100f
2292 - tst OUT, #(1 << dst_bpp_shift)
2293 + tst OUT, #(1 << \dst_bpp_shift)
2294 beq 100f
2295 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2296 add v12.8h, v12.8h, v13.8h
2297 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
2298 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
2299 sub WIDTH, WIDTH, #1
2300 100:
2301 add v13.8h, v13.8h, v13.8h
2302 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2303 add v12.8h, v12.8h, v13.8h
2305 cmp WIDTH, #2
2306 blt 100f
2307 - tst OUT, #(1 << (dst_bpp_shift + 1))
2308 + tst OUT, #(1 << (\dst_bpp_shift + 1))
2309 beq 100f
2310 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
2311 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2312 sub WIDTH, WIDTH, #2
2313 100:
2314 -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
2315 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
2316 /*********** 8 pixels per iteration *****************/
2317 cmp WIDTH, #4
2318 blt 100f
2319 - tst OUT, #(1 << (dst_bpp_shift + 2))
2320 + tst OUT, #(1 << (\dst_bpp_shift + 2))
2321 beq 100f
2322 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2323 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2324 sub WIDTH, WIDTH, #4
2325 100:
2326 subs WIDTH, WIDTH, #8
2327 blt 100f
2328 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
2329 - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2330 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
2331 + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
2332 subs WIDTH, WIDTH, #8
2333 blt 500f
2334 1000:
2335 - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2336 + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
2337 subs WIDTH, WIDTH, #8
2338 bge 1000b
2339 500:
2340 - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2341 + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
2342 100:
2343 tst WIDTH, #4
2344 beq 200f
2345 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2346 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2347 200:
2348 .else
2349 /*********** 4 pixels per iteration *****************/
2350 subs WIDTH, WIDTH, #4
2351 blt 100f
2352 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
2353 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2354 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
2355 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
2356 subs WIDTH, WIDTH, #4
2357 blt 500f
2358 1000:
2359 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2360 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2361 subs WIDTH, WIDTH, #4
2362 bge 1000b
2363 500:
2364 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2365 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
2366 100:
2367 /****************************************************/
2368 .endif
2369 /* handle the remaining trailing pixels */
2370 tst WIDTH, #2
2371 beq 200f
2372 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
2373 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2374 200:
2375 tst WIDTH, #1
2376 beq 300f
2377 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
2378 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
2379 300:
2380 sub x29, x29, 64
2381 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
2382 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
2383 ldp x8, x9, [x29, -80]
2384 ldp x10, x11, [x29, -96]
2385 ldp x12, x13, [x29, -104]
2386 mov sp, x29
2387 @@ -3551,17 +3551,17 @@ 300:
2388 .unreq UX
2389 .unreq WIDTH
2390 .unreq TMP1
2391 .unreq TMP2
2392 .unreq PF_OFFS
2393 .unreq TMP3
2394 .unreq TMP4
2395 .unreq STRIDE
2396 -.endfunc
2397 +pixman_end_asm_function
2399 .endm
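The change just above retires the bare .endfunc in favour of the pixman_end_asm_function macro added by this patch, since the .func/.endfunc bookkeeping directives are not implemented by clang's integrated assembler. The general shape of that pattern, sketched here with a hypothetical feature-test name rather than the one the patch really uses:

    .macro demo_end_function
    #ifdef HAVE_DOT_FUNC        /* hypothetical define supplied by the build system */
        .endfunc                /* only emitted when the assembler understands it */
    #endif
    .endm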
2401 /*****************************************************************************/
2403 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
2405 .macro bilinear_interpolate_four_pixels_8888_8888_head
2406 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2407 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2408 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2409 @@ -75,340 +75,340 @@
2410 #define PREFETCH_MODE pldl1keep
2413 * Definitions of supplementary pixld/pixst macros (for partial load/store of
2414 * pixel data).
2417 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
2418 - op {v&reg1&.&elem_size}, [&mem_operand&], #8
2419 + \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
2420 .endm
2422 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
2423 - op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
2424 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
2425 .endm
2427 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
2428 - op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
2429 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
2430 .endm
2432 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
2433 - op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
2434 + \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
2435 .endm
2437 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
2438 - op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
2439 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
2440 .endm
2442 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
2443 - op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
2444 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
2445 .endm
2447 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
2448 -.if numbytes == 32
2449 - .if elem_size==32
2450 - pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
2451 - %(basereg+6), %(basereg+7), mem_operand, abits
2452 - .elseif elem_size==16
2453 - pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
2454 - %(basereg+6), %(basereg+7), mem_operand, abits
2455 +.if \numbytes == 32
2456 + .if \elem_size==32
2457 + pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
2458 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2459 + .elseif \elem_size==16
2460 + pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
2461 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2462 .else
2463 - pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
2464 - %(basereg+6), %(basereg+7), mem_operand, abits
2465 + pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
2466 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2467 .endif
2468 -.elseif numbytes == 16
2469 - .if elem_size==32
2470 - pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
2471 - .elseif elem_size==16
2472 - pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
2473 +.elseif \numbytes == 16
2474 + .if \elem_size==32
2475 + pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2476 + .elseif \elem_size==16
2477 + pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2478 .else
2479 - pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
2480 + pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2481 .endif
2482 -.elseif numbytes == 8
2483 - .if elem_size==32
2484 - pixldst1 op, 2s, %(basereg+1), mem_operand, abits
2485 - .elseif elem_size==16
2486 - pixldst1 op, 4h, %(basereg+1), mem_operand, abits
2487 +.elseif \numbytes == 8
2488 + .if \elem_size==32
2489 + pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
2490 + .elseif \elem_size==16
2491 + pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
2492 .else
2493 - pixldst1 op, 8b, %(basereg+1), mem_operand, abits
2494 + pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
2495 .endif
2496 -.elseif numbytes == 4
2497 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
2498 - pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
2499 - .elseif elem_size == 16
2500 - pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
2501 - pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
2502 +.elseif \numbytes == 4
2503 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
2504 + pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
2505 + .elseif \elem_size == 16
2506 + pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
2507 + pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
2508 .else
2509 - pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
2510 - pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
2511 - pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
2512 - pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
2513 + pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
2514 + pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
2515 + pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
2516 + pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
2517 .endif
2518 -.elseif numbytes == 2
2519 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
2520 - pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
2521 +.elseif \numbytes == 2
2522 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
2523 + pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
2524 .else
2525 - pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
2526 - pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
2527 + pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
2528 + pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
2529 .endif
2530 -.elseif numbytes == 1
2531 - pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
2532 +.elseif \numbytes == 1
2533 + pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
2534 .else
2535 - .error "unsupported size: numbytes"
2536 + .error "unsupported size: \numbytes"
2537 .endif
2538 .endm
2540 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
2541 -.if bpp > 0
2542 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2543 - pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
2544 - %(basereg+6), %(basereg+7), mem_operand, abits
2545 -.elseif (bpp == 24) && (numpix == 8)
2546 - pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2547 -.elseif (bpp == 24) && (numpix == 4)
2548 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2549 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2550 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2551 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2552 -.elseif (bpp == 24) && (numpix == 2)
2553 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2554 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2555 -.elseif (bpp == 24) && (numpix == 1)
2556 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2557 +.if \bpp > 0
2558 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2559 + pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
2560 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2561 +.elseif (\bpp == 24) && (\numpix == 8)
2562 + pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2563 +.elseif (\bpp == 24) && (\numpix == 4)
2564 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2565 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2566 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2567 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2568 +.elseif (\bpp == 24) && (\numpix == 2)
2569 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2570 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2571 +.elseif (\bpp == 24) && (\numpix == 1)
2572 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2573 .else
2574 - pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
2575 + pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
2576 .endif
2577 .endif
2578 .endm
2580 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
2581 -.if bpp > 0
2582 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2583 - pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
2584 - %(basereg+6), %(basereg+7), mem_operand, abits
2585 -.elseif (bpp == 24) && (numpix == 8)
2586 - pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2587 -.elseif (bpp == 24) && (numpix == 4)
2588 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2589 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2590 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2591 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2592 -.elseif (bpp == 24) && (numpix == 2)
2593 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2594 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2595 -.elseif (bpp == 24) && (numpix == 1)
2596 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2597 -.elseif numpix * bpp == 32 && abits == 32
2598 - pixldst 4, st1, 32, basereg, mem_operand, abits
2599 -.elseif numpix * bpp == 16 && abits == 16
2600 - pixldst 2, st1, 16, basereg, mem_operand, abits
2601 +.if \bpp > 0
2602 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2603 + pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
2604 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2605 +.elseif (\bpp == 24) && (\numpix == 8)
2606 + pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2607 +.elseif (\bpp == 24) && (\numpix == 4)
2608 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2609 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2610 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2611 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2612 +.elseif (\bpp == 24) && (\numpix == 2)
2613 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2614 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2615 +.elseif (\bpp == 24) && (\numpix == 1)
2616 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2617 +.elseif \numpix * \bpp == 32 && \abits == 32
2618 + pixldst 4, st1, 32, \basereg, \mem_operand, \abits
2619 +.elseif \numpix * \bpp == 16 && \abits == 16
2620 + pixldst 2, st1, 16, \basereg, \mem_operand, \abits
2621 .else
2622 - pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
2623 + pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
2624 .endif
2625 .endif
2626 .endm
2628 .macro pixld_a numpix, bpp, basereg, mem_operand
2629 -.if (bpp * numpix) <= 128
2630 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2631 +.if (\bpp * \numpix) <= 128
2632 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2633 .else
2634 - pixld numpix, bpp, basereg, mem_operand, 128
2635 + pixld \numpix, \bpp, \basereg, \mem_operand, 128
2636 .endif
2637 .endm
2639 .macro pixst_a numpix, bpp, basereg, mem_operand
2640 -.if (bpp * numpix) <= 128
2641 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2642 +.if (\bpp * \numpix) <= 128
2643 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2644 .else
2645 - pixst numpix, bpp, basereg, mem_operand, 128
2646 + pixst \numpix, \bpp, \basereg, \mem_operand, 128
2647 .endif
2648 .endm
2651 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
2652 * aliases to be defined)
2654 .macro pixld1_s elem_size, reg1, mem_operand
2655 -.if elem_size == 16
2656 +.if \elem_size == 16
2657 asr TMP1, VX, #16
2658 adds VX, VX, UNIT_X
2659 bmi 55f
2660 5: subs VX, VX, SRC_WIDTH_FIXED
2661 bpl 5b
2663 - add TMP1, mem_operand, TMP1, lsl #1
2664 + add TMP1, \mem_operand, TMP1, lsl #1
2665 asr TMP2, VX, #16
2666 adds VX, VX, UNIT_X
2667 bmi 55f
2668 5: subs VX, VX, SRC_WIDTH_FIXED
2669 bpl 5b
2671 - add TMP2, mem_operand, TMP2, lsl #1
2672 - ld1 {v&reg1&.h}[0], [TMP1]
2673 + add TMP2, \mem_operand, TMP2, lsl #1
2674 + ld1 {v\()\reg1\().h}[0], [TMP1]
2675 asr TMP1, VX, #16
2676 adds VX, VX, UNIT_X
2677 bmi 55f
2678 5: subs VX, VX, SRC_WIDTH_FIXED
2679 bpl 5b
2681 - add TMP1, mem_operand, TMP1, lsl #1
2682 - ld1 {v&reg1&.h}[1], [TMP2]
2683 + add TMP1, \mem_operand, TMP1, lsl #1
2684 + ld1 {v\()\reg1\().h}[1], [TMP2]
2685 asr TMP2, VX, #16
2686 adds VX, VX, UNIT_X
2687 bmi 55f
2688 5: subs VX, VX, SRC_WIDTH_FIXED
2689 bpl 5b
2691 - add TMP2, mem_operand, TMP2, lsl #1
2692 - ld1 {v&reg1&.h}[2], [TMP1]
2693 - ld1 {v&reg1&.h}[3], [TMP2]
2694 -.elseif elem_size == 32
2695 + add TMP2, \mem_operand, TMP2, lsl #1
2696 + ld1 {v\()\reg1\().h}[2], [TMP1]
2697 + ld1 {v\()\reg1\().h}[3], [TMP2]
2698 +.elseif \elem_size == 32
2699 asr TMP1, VX, #16
2700 adds VX, VX, UNIT_X
2701 bmi 55f
2702 5: subs VX, VX, SRC_WIDTH_FIXED
2703 bpl 5b
2705 - add TMP1, mem_operand, TMP1, lsl #2
2706 + add TMP1, \mem_operand, TMP1, lsl #2
2707 asr TMP2, VX, #16
2708 adds VX, VX, UNIT_X
2709 bmi 55f
2710 5: subs VX, VX, SRC_WIDTH_FIXED
2711 bpl 5b
2713 - add TMP2, mem_operand, TMP2, lsl #2
2714 - ld1 {v&reg1&.s}[0], [TMP1]
2715 - ld1 {v&reg1&.s}[1], [TMP2]
2716 + add TMP2, \mem_operand, TMP2, lsl #2
2717 + ld1 {v\()\reg1\().s}[0], [TMP1]
2718 + ld1 {v\()\reg1\().s}[1], [TMP2]
2719 .else
2720 .error "unsupported"
2721 .endif
2722 .endm
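pixld1_s above is the nearest-neighbour fetch path: VX holds the source x coordinate in pixman's 16.16 fixed-point format, UNIT_X is the per-destination-pixel step, and the small 5:/55: loops appear to wrap the coordinate by SRC_WIDTH_FIXED for repeat handling (the 55: target itself lies outside this hunk). The address arithmetic is just the integer part of the coordinate scaled by the pixel size. A worked example for the 32bpp (elem_size == 32) branch, with an illustrative coordinate value:

    /*  VX   = 0x00028000          -> asr #16 gives pixel index 2 (x = 2.5)
     *  TMP1 = mem_operand + (2 << 2) = mem_operand + 8 bytes
     *  ld1 {vN.s}[0], [TMP1]      loads that single 32bpp source pixel
     */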
2724 .macro pixld2_s elem_size, reg1, reg2, mem_operand
2725 -.if 0 /* elem_size == 32 */
2726 +.if 0 /* \elem_size == 32 */
2727 mov TMP1, VX, asr #16
2728 add VX, VX, UNIT_X, asl #1
2729 - add TMP1, mem_operand, TMP1, asl #2
2730 + add TMP1, \mem_operand, TMP1, asl #2
2731 mov TMP2, VX, asr #16
2732 sub VX, VX, UNIT_X
2733 - add TMP2, mem_operand, TMP2, asl #2
2734 - ld1 {v&reg1&.s}[0], [TMP1]
2735 + add TMP2, \mem_operand, TMP2, asl #2
2736 + ld1 {v\()\reg1\().s}[0], [TMP1]
2737 mov TMP1, VX, asr #16
2738 add VX, VX, UNIT_X, asl #1
2739 - add TMP1, mem_operand, TMP1, asl #2
2740 - ld1 {v&reg2&.s}[0], [TMP2, :32]
2741 + add TMP1, \mem_operand, TMP1, asl #2
2742 + ld1 {v\()\reg2\().s}[0], [TMP2, :32]
2743 mov TMP2, VX, asr #16
2744 add VX, VX, UNIT_X
2745 - add TMP2, mem_operand, TMP2, asl #2
2746 - ld1 {v&reg1&.s}[1], [TMP1]
2747 - ld1 {v&reg2&.s}[1], [TMP2]
2748 + add TMP2, \mem_operand, TMP2, asl #2
2749 + ld1 {v\()\reg1\().s}[1], [TMP1]
2750 + ld1 {v\()\reg2\().s}[1], [TMP2]
2751 .else
2752 - pixld1_s elem_size, reg1, mem_operand
2753 - pixld1_s elem_size, reg2, mem_operand
2754 + pixld1_s \elem_size, \reg1, \mem_operand
2755 + pixld1_s \elem_size, \reg2, \mem_operand
2756 .endif
2757 .endm
2759 .macro pixld0_s elem_size, reg1, idx, mem_operand
2760 -.if elem_size == 16
2761 +.if \elem_size == 16
2762 asr TMP1, VX, #16
2763 adds VX, VX, UNIT_X
2764 bmi 55f
2765 5: subs VX, VX, SRC_WIDTH_FIXED
2766 bpl 5b
2768 - add TMP1, mem_operand, TMP1, lsl #1
2769 - ld1 {v&reg1&.h}[idx], [TMP1]
2770 -.elseif elem_size == 32
2771 + add TMP1, \mem_operand, TMP1, lsl #1
2772 + ld1 {v\()\reg1\().h}[\idx], [TMP1]
2773 +.elseif \elem_size == 32
2774 asr DUMMY, VX, #16
2775 mov TMP1, DUMMY
2776 adds VX, VX, UNIT_X
2777 bmi 55f
2778 5: subs VX, VX, SRC_WIDTH_FIXED
2779 bpl 5b
2781 - add TMP1, mem_operand, TMP1, lsl #2
2782 - ld1 {v&reg1&.s}[idx], [TMP1]
2783 + add TMP1, \mem_operand, TMP1, lsl #2
2784 + ld1 {v\()\reg1\().s}[\idx], [TMP1]
2785 .endif
2786 .endm
2788 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
2789 -.if numbytes == 32
2790 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
2791 - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
2792 - pixdeinterleave elem_size, %(basereg+4)
2793 -.elseif numbytes == 16
2794 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
2795 -.elseif numbytes == 8
2796 - pixld1_s elem_size, %(basereg+1), mem_operand
2797 -.elseif numbytes == 4
2798 - .if elem_size == 32
2799 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2800 - .elseif elem_size == 16
2801 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2802 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2803 +.if \numbytes == 32
2804 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
2805 + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
2806 + pixdeinterleave \elem_size, %(\basereg+4)
2807 +.elseif \numbytes == 16
2808 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
2809 +.elseif \numbytes == 8
2810 + pixld1_s \elem_size, %(\basereg+1), \mem_operand
2811 +.elseif \numbytes == 4
2812 + .if \elem_size == 32
2813 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2814 + .elseif \elem_size == 16
2815 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2816 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2817 .else
2818 - pixld0_s elem_size, %(basereg+0), 4, mem_operand
2819 - pixld0_s elem_size, %(basereg+0), 5, mem_operand
2820 - pixld0_s elem_size, %(basereg+0), 6, mem_operand
2821 - pixld0_s elem_size, %(basereg+0), 7, mem_operand
2822 + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
2823 + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
2824 + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
2825 + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
2826 .endif
2827 -.elseif numbytes == 2
2828 - .if elem_size == 16
2829 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2830 +.elseif \numbytes == 2
2831 + .if \elem_size == 16
2832 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2833 .else
2834 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2835 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2836 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2837 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2838 .endif
2839 -.elseif numbytes == 1
2840 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2841 +.elseif \numbytes == 1
2842 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2843 .else
2844 - .error "unsupported size: numbytes"
2845 + .error "unsupported size: \numbytes"
2846 .endif
2847 .endm
2849 .macro pixld_s numpix, bpp, basereg, mem_operand
2850 -.if bpp > 0
2851 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
2852 +.if \bpp > 0
2853 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
2854 .endif
2855 .endm
2857 .macro vuzp8 reg1, reg2
2858 umov DUMMY, v16.d[0]
2859 - uzp1 v16.8b, v&reg1&.8b, v&reg2&.8b
2860 - uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
2861 - mov v&reg1&.8b, v16.8b
2862 + uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
2863 + uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
2864 + mov v\()\reg1\().8b, v16.8b
2865 mov v16.d[0], DUMMY
2866 .endm
2868 .macro vzip8 reg1, reg2
2869 umov DUMMY, v16.d[0]
2870 - zip1 v16.8b, v&reg1&.8b, v&reg2&.8b
2871 - zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
2872 - mov v&reg1&.8b, v16.8b
2873 + zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
2874 + zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
2875 + mov v\()\reg1\().8b, v16.8b
2876 mov v16.d[0], DUMMY
2877 .endm
2879 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2880 .macro pixdeinterleave bpp, basereg
2881 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2882 - vuzp8 %(basereg+0), %(basereg+1)
2883 - vuzp8 %(basereg+2), %(basereg+3)
2884 - vuzp8 %(basereg+1), %(basereg+3)
2885 - vuzp8 %(basereg+0), %(basereg+2)
2886 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2887 + vuzp8 %(\basereg+0), %(\basereg+1)
2888 + vuzp8 %(\basereg+2), %(\basereg+3)
2889 + vuzp8 %(\basereg+1), %(\basereg+3)
2890 + vuzp8 %(\basereg+0), %(\basereg+2)
2891 .endif
2892 .endm
2894 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2895 .macro pixinterleave bpp, basereg
2896 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2897 - vzip8 %(basereg+0), %(basereg+2)
2898 - vzip8 %(basereg+1), %(basereg+3)
2899 - vzip8 %(basereg+2), %(basereg+3)
2900 - vzip8 %(basereg+0), %(basereg+1)
2901 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2902 + vzip8 %(\basereg+0), %(\basereg+2)
2903 + vzip8 %(\basereg+1), %(\basereg+3)
2904 + vzip8 %(\basereg+2), %(\basereg+3)
2905 + vzip8 %(\basereg+0), %(\basereg+1)
2906 .endif
2907 .endm
2910 * This is a macro for implementing cache preload. The main idea is that
2911 * cache preload logic is mostly independent from the rest of pixels
2912 * processing code. It starts at the top left pixel and moves forward
2913 * across pixels and can jump across scanlines. Prefetch distance is
2914 @@ -432,62 +432,62 @@ 55:
2915 * for almost zero cost!
2917 * (*) The overhead of the prefetcher is visible when running some trivial
2918 * pixels processing like simple copy. Anyway, having prefetch is a must
2919 * when working with the graphics data.
2921 .macro PF a, x:vararg
2922 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
2923 - a x
2924 + \a \x
2925 .endif
2926 .endm
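The PF macro shown above takes the instruction mnemonic as its first argument and the whole operand list as the x vararg, and expands it only when the advanced prefetcher is selected. binutils gas also accepts blanks as macro-argument separators, so the old "PF beq 10f" spelling assembled, but clang's integrated assembler reportedly insists on commas, which is why every prefetch line in this patch gains a comma after the mnemonic. The expansion itself is unchanged, for example:

    PF add, PF_X, PF_X, #8    /* a = "add", x = "PF_X, PF_X, #8"            */
                              /* -> expands to:  add PF_X, PF_X, #8         */
                              /* (or to nothing when ADVANCED prefetch is   */
                              /*  not selected for this function)           */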
2928 .macro cache_preload std_increment, boost_increment
2929 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
2930 -.if std_increment != 0
2931 - PF add PF_X, PF_X, #std_increment
2932 +.if \std_increment != 0
2933 + PF add, PF_X, PF_X, #\std_increment
2934 .endif
2935 - PF tst PF_CTL, #0xF
2936 - PF beq 71f
2937 - PF add PF_X, PF_X, #boost_increment
2938 - PF sub PF_CTL, PF_CTL, #1
2939 + PF tst, PF_CTL, #0xF
2940 + PF beq, 71f
2941 + PF add, PF_X, PF_X, #\boost_increment
2942 + PF sub, PF_CTL, PF_CTL, #1
2944 - PF cmp PF_X, ORIG_W
2945 + PF cmp, PF_X, ORIG_W
2946 .if src_bpp_shift >= 0
2947 - PF lsl DUMMY, PF_X, #src_bpp_shift
2948 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
2949 + PF lsl, DUMMY, PF_X, #src_bpp_shift
2950 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
2951 .endif
2952 .if dst_r_bpp != 0
2953 - PF lsl DUMMY, PF_X, #dst_bpp_shift
2954 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
2955 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
2956 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
2957 .endif
2958 .if mask_bpp_shift >= 0
2959 - PF lsl DUMMY, PF_X, #mask_bpp_shift
2960 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
2961 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
2962 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
2963 .endif
2964 - PF ble 71f
2965 - PF sub PF_X, PF_X, ORIG_W
2966 - PF subs PF_CTL, PF_CTL, #0x10
2967 + PF ble, 71f
2968 + PF sub, PF_X, PF_X, ORIG_W
2969 + PF subs, PF_CTL, PF_CTL, #0x10
2971 - PF ble 72f
2972 + PF ble, 72f
2973 .if src_bpp_shift >= 0
2974 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
2975 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
2976 - PF add PF_SRC, PF_SRC, #1
2977 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
2978 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
2979 + PF add, PF_SRC, PF_SRC, #1
2980 .endif
2981 .if dst_r_bpp != 0
2982 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
2983 - PF ldrsb DUMMY, [PF_DST, DUMMY]
2984 - PF add PF_DST, PF_DST, #1
2985 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
2986 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
2987 + PF add, PF_DST, PF_DST, #1
2988 .endif
2989 .if mask_bpp_shift >= 0
2990 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
2991 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
2992 - PF add PF_MASK, PF_MASK, #1
2993 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
2994 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
2995 + PF add, PF_MASK, PF_MASK, #1
2996 .endif
2998 .endif
2999 .endm
3001 .macro cache_preload_simple
3002 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
3003 .if src_bpp > 0
3004 @@ -516,56 +516,56 @@ 72:
3005 process_pixblock_tail, \
3006 process_pixblock_tail_head
3007 .if dst_w_bpp != 24
3008 tst DST_R, #0xF
3009 beq 52f
3011 .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
3012 .irp lowbit, 1, 2, 4, 8, 16
3013 -local skip1
3014 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
3015 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
3016 - tst DST_R, #lowbit
3018 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
3019 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
3020 + tst DST_R, #\lowbit
3021 beq 51f
3022 .endif
3023 - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
3024 - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
3025 + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
3026 + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
3027 .if dst_r_bpp > 0
3028 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
3029 + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
3030 .else
3031 - add DST_R, DST_R, #lowbit
3032 + add DST_R, DST_R, #\lowbit
3033 .endif
3034 - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
3035 - sub W, W, #(lowbit * 8 / dst_w_bpp)
3036 + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
3037 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
3039 .endif
3040 .endr
3041 .endif
3042 pixdeinterleave src_bpp, src_basereg
3043 pixdeinterleave mask_bpp, mask_basereg
3044 pixdeinterleave dst_r_bpp, dst_r_basereg
3046 - process_pixblock_head
3047 + \process_pixblock_head
3048 cache_preload 0, pixblock_size
3049 cache_preload_simple
3050 - process_pixblock_tail
3051 + \process_pixblock_tail
3053 pixinterleave dst_w_bpp, dst_w_basereg
3055 .irp lowbit, 1, 2, 4, 8, 16
3056 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
3057 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
3058 - tst DST_W, #lowbit
3059 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
3060 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
3061 + tst DST_W, #\lowbit
3062 beq 51f
3063 .endif
3064 .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
3065 - sub W, W, #(lowbit * 8 / dst_w_bpp)
3066 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
3067 .endif
3068 - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
3069 + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
3071 .endif
3072 .endr
3073 .endif
3075 .endm
3078 @@ -587,52 +587,52 @@ 52:
3079 dst_aligned_flag, \
3080 process_pixblock_head, \
3081 process_pixblock_tail, \
3082 process_pixblock_tail_head
3083 tst W, #(pixblock_size - 1)
3084 beq 52f
3085 .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
3086 .irp chunk_size, 16, 8, 4, 2, 1
3087 -.if pixblock_size > chunk_size
3088 - tst W, #chunk_size
3089 +.if pixblock_size > \chunk_size
3090 + tst W, #\chunk_size
3091 beq 51f
3092 - pixld_src chunk_size, src_bpp, src_basereg, SRC
3093 - pixld chunk_size, mask_bpp, mask_basereg, MASK
3094 -.if dst_aligned_flag != 0
3095 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3096 + pixld_src \chunk_size, src_bpp, src_basereg, SRC
3097 + pixld \chunk_size, mask_bpp, mask_basereg, MASK
3098 +.if \dst_aligned_flag != 0
3099 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3100 .else
3101 - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3102 + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3103 .endif
3104 -.if cache_preload_flag != 0
3105 - PF add PF_X, PF_X, #chunk_size
3106 +.if \cache_preload_flag != 0
3107 + PF add, PF_X, PF_X, #\chunk_size
3108 .endif
3110 .endif
3111 .endr
3112 .endif
3113 pixdeinterleave src_bpp, src_basereg
3114 pixdeinterleave mask_bpp, mask_basereg
3115 pixdeinterleave dst_r_bpp, dst_r_basereg
3117 - process_pixblock_head
3118 -.if cache_preload_flag != 0
3119 + \process_pixblock_head
3120 +.if \cache_preload_flag != 0
3121 cache_preload 0, pixblock_size
3122 cache_preload_simple
3123 .endif
3124 - process_pixblock_tail
3125 + \process_pixblock_tail
3126 pixinterleave dst_w_bpp, dst_w_basereg
3127 .irp chunk_size, 16, 8, 4, 2, 1
3128 -.if pixblock_size > chunk_size
3129 - tst W, #chunk_size
3130 +.if pixblock_size > \chunk_size
3131 + tst W, #\chunk_size
3132 beq 51f
3133 -.if dst_aligned_flag != 0
3134 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3135 +.if \dst_aligned_flag != 0
3136 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3137 .else
3138 - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3139 + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3140 .endif
3142 .endif
3143 .endr
3145 .endm
3148 @@ -655,17 +655,17 @@ 52:
3149 .if (src_bpp != 24) && (src_bpp != 0)
3150 sub SRC, SRC, W, lsl #src_bpp_shift
3151 .endif
3152 .if (mask_bpp != 24) && (mask_bpp != 0)
3153 sub MASK, MASK, W, lsl #mask_bpp_shift
3154 .endif
3155 subs H, H, #1
3156 mov DST_R, DST_W
3157 - bge start_of_loop_label
3158 + bge \start_of_loop_label
3159 .endm
3162 * Registers are allocated in the following way by default:
3163 * v0, v1, v2, v3 - reserved for loading source pixel data
3164 * v4, v5, v6, v7 - reserved for loading destination pixel data
3165 * v24, v25, v26, v27 - reserved for loading mask pixel data
3166 * v28, v29, v30, v31 - final destination pixel data for writeback to memory
3167 @@ -682,17 +682,17 @@ 52:
3168 process_pixblock_head, \
3169 process_pixblock_tail, \
3170 process_pixblock_tail_head, \
3171 dst_w_basereg_ = 28, \
3172 dst_r_basereg_ = 4, \
3173 src_basereg_ = 0, \
3174 mask_basereg_ = 24
3176 - pixman_asm_function fname
3177 + pixman_asm_function \fname
3178 stp x29, x30, [sp, -16]!
3179 mov x29, sp
3180 sub sp, sp, 232 /* push all registers */
3181 sub x29, x29, 64
3182 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
3183 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
3184 stp x8, x9, [x29, -80]
3185 stp x10, x11, [x29, -96]
3186 @@ -707,38 +707,38 @@ 52:
3187 str x28, [x29, -232]
3190 * Select prefetch type for this function. If prefetch distance is
3191 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
3192 * has to be used instead of ADVANCED.
3194 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
3195 -.if prefetch_distance == 0
3196 +.if \prefetch_distance == 0
3197 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
3198 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
3199 - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
3200 + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
3201 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
3202 .endif
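This selection feeds directly into the PF macro shown earlier in this header: PF lines only expand when PREFETCH_TYPE_CURRENT ends up as ADVANCED, while cache_preload_simple only does anything for SIMPLE. An illustrative case, assuming the build default is ADVANCED:

    /*  src_bpp_ = 24, prefetch_distance = 5 (hypothetical values)
     *    -> the .elseif above matches, so PREFETCH_TYPE_CURRENT becomes SIMPLE
     *    -> every "PF op, ..." line in this function expands to nothing
     *    -> cache_preload_simple presumably falls back to a plain
     *       prfm PREFETCH_MODE, [SRC, #...] issued once per pixel block
     */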
3205 * Make some macro arguments globally visible and accessible
3206 * from other macros
3208 - .set src_bpp, src_bpp_
3209 - .set mask_bpp, mask_bpp_
3210 - .set dst_w_bpp, dst_w_bpp_
3211 - .set pixblock_size, pixblock_size_
3212 - .set dst_w_basereg, dst_w_basereg_
3213 - .set dst_r_basereg, dst_r_basereg_
3214 - .set src_basereg, src_basereg_
3215 - .set mask_basereg, mask_basereg_
3216 + .set src_bpp, \src_bpp_
3217 + .set mask_bpp, \mask_bpp_
3218 + .set dst_w_bpp, \dst_w_bpp_
3219 + .set pixblock_size, \pixblock_size_
3220 + .set dst_w_basereg, \dst_w_basereg_
3221 + .set dst_r_basereg, \dst_r_basereg_
3222 + .set src_basereg, \src_basereg_
3223 + .set mask_basereg, \mask_basereg_
3225 .macro pixld_src x:vararg
3226 - pixld x
3227 + pixld \x
3228 .endm
3229 .macro fetch_src_pixblock
3230 pixld_src pixblock_size, src_bpp, \
3231 (src_basereg - pixblock_size * src_bpp / 64), SRC
3232 .endm
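fetch_src_pixblock above also shows where the default register allocation described in the comment block further up comes from: the base register passed down to pixld_src is rebased so that the registers actually touched by an eight-pixel load land on the documented group. A worked example with the usual defaults for the 32bpp functions in this file:

    /*  pixblock_size = 8, src_bpp = 32, src_basereg = 0:
     *
     *    basereg passed to pixld = src_basereg - pixblock_size * src_bpp / 64
     *                            = 0 - 8 * 32 / 64 = -4
     *    an 8-pixel, 32bpp load then uses %(basereg+4) .. %(basereg+7)
     *                            = v0, v1, v2, v3
     *
     *  which matches "v0, v1, v2, v3 - reserved for loading source pixel data".
     */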
3234 * Assign symbolic names to registers
3236 @@ -805,32 +805,32 @@ 52:
3237 .elseif dst_w_bpp == 16
3238 .set dst_bpp_shift, 1
3239 .elseif dst_w_bpp == 8
3240 .set dst_bpp_shift, 0
3241 .else
3242 .error "requested dst bpp (dst_w_bpp) is not supported"
3243 .endif
3245 -.if (((flags) & FLAG_DST_READWRITE) != 0)
3246 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
3247 .set dst_r_bpp, dst_w_bpp
3248 .else
3249 .set dst_r_bpp, 0
3250 .endif
3251 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3252 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3253 .set DEINTERLEAVE_32BPP_ENABLED, 1
3254 .else
3255 .set DEINTERLEAVE_32BPP_ENABLED, 0
3256 .endif
3258 -.if prefetch_distance < 0 || prefetch_distance > 15
3259 - .error "invalid prefetch distance (prefetch_distance)"
3260 +.if \prefetch_distance < 0 || \prefetch_distance > 15
3261 + .error "invalid prefetch distance (\prefetch_distance)"
3262 .endif
3264 - PF mov PF_X, #0
3265 + PF mov, PF_X, #0
3266 mov DST_R, DST_W
3268 .if src_bpp == 24
3269 sub SRC_STRIDE, SRC_STRIDE, W
3270 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
3271 .endif
3272 .if mask_bpp == 24
3273 sub MASK_STRIDE, MASK_STRIDE, W
3274 @@ -839,71 +839,71 @@ 52:
3275 .if dst_w_bpp == 24
3276 sub DST_STRIDE, DST_STRIDE, W
3277 sub DST_STRIDE, DST_STRIDE, W, lsl #1
3278 .endif
3281 * Setup advanced prefetcher initial state
3283 - PF mov PF_SRC, SRC
3284 - PF mov PF_DST, DST_R
3285 - PF mov PF_MASK, MASK
3286 - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
3287 - PF lsl DUMMY, H, #4
3288 - PF mov PF_CTL, DUMMY
3289 - PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
3290 + PF mov, PF_SRC, SRC
3291 + PF mov, PF_DST, DST_R
3292 + PF mov, PF_MASK, MASK
3293 + /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
3294 + PF lsl, DUMMY, H, #4
3295 + PF mov, PF_CTL, DUMMY
3296 + PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
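
(Editorial aside, not part of the patch.) The three PF-guarded instructions above compute PF_CTL = (H << 4) + prefetch_distance - 0x10, which is the same value as the prefetch_distance | ((h - 1) << 4) stated in the comment, because prefetch_distance has already been range-checked to 0..15 a few lines earlier. A small C sketch checking the identity; the function name is illustrative only.

#include <assert.h>
#include <stdint.h>

/* DUMMY = H << 4;  PF_CTL = DUMMY;  PF_CTL += prefetch_distance - 0x10 */
static uint32_t pf_ctl(uint32_t h, uint32_t prefetch_distance)
{
    uint32_t dummy = h << 4;
    uint32_t ctl   = dummy;
    ctl += prefetch_distance - 0x10;
    return ctl;
}

int main(void)
{
    /* prefetch_distance was range-checked to 0..15 above, so adding it
     * is the same as OR-ing it into the low nibble */
    for (uint32_t h = 1; h <= 1024; h++)
        for (uint32_t pd = 0; pd <= 15; pd++)
            assert(pf_ctl(h, pd) == (pd | ((h - 1) << 4)));
    return 0;
}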
3298 - init
3299 + \init
3300 subs H, H, #1
3301 mov ORIG_W, W
3302 blt 9f
3303 cmp W, #(pixblock_size * 2)
3304 blt 800f
3306 * This is the start of the pipelined loop, which is optimized for
3307 * long scanlines
3310 - ensure_destination_ptr_alignment process_pixblock_head, \
3311 - process_pixblock_tail, \
3312 - process_pixblock_tail_head
3313 + ensure_destination_ptr_alignment \process_pixblock_head, \
3314 + \process_pixblock_tail, \
3315 + \process_pixblock_tail_head
3317 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
3318 pixld_a pixblock_size, dst_r_bpp, \
3319 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3320 fetch_src_pixblock
3321 pixld pixblock_size, mask_bpp, \
3322 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3323 - PF add PF_X, PF_X, #pixblock_size
3324 - process_pixblock_head
3325 + PF add, PF_X, PF_X, #pixblock_size
3326 + \process_pixblock_head
3327 cache_preload 0, pixblock_size
3328 cache_preload_simple
3329 subs W, W, #(pixblock_size * 2)
3330 blt 200f
3332 100:
3333 - process_pixblock_tail_head
3334 + \process_pixblock_tail_head
3335 cache_preload_simple
3336 subs W, W, #pixblock_size
3337 bge 100b
3339 200:
3340 - process_pixblock_tail
3341 + \process_pixblock_tail
3342 pixst_a pixblock_size, dst_w_bpp, \
3343 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
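
(Editorial aside, not part of the patch.) This is the software-pipelined "head (tail_head) ... (tail_head) tail" pattern named in the comment above: the head starts the first pixel block, each tail_head iteration at label 100: finishes one block while already fetching and starting the next, and the tail at label 200: retires the last block. A rough C-shaped sketch of that control flow, with stub functions standing in for the assembler macro arguments (all names illustrative):

#include <stdio.h>

/* Stubs standing in for process_pixblock_head/tail/tail_head. */
static void head(void)      { puts("load block, start computing it"); }
static void tail(void)      { puts("finish block, store it"); }
static void tail_head(void) { tail(); head(); }   /* finish block i, start block i+1 */

/* Shape of the pipelined section above: head, then the 100: loop, then 200:. */
static void process_scanline(int nblocks)
{
    if (nblocks <= 0)
        return;
    head();                          /* prologue: first block in flight     */
    for (int i = 1; i < nblocks; i++)
        tail_head();                 /* steady state: two blocks overlapped */
    tail();                          /* epilogue: retire the last block     */
}

int main(void) { process_scanline(4); return 0; }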
3345 /* Process the remaining trailing pixels in the scanline */
3346 process_trailing_pixels 1, 1, \
3347 - process_pixblock_head, \
3348 - process_pixblock_tail, \
3349 - process_pixblock_tail_head
3350 + \process_pixblock_head, \
3351 + \process_pixblock_tail, \
3352 + \process_pixblock_tail_head
3353 advance_to_next_scanline 0b
3355 - cleanup
3356 + \cleanup
3357 1000:
3358 /* pop all registers */
3359 sub x29, x29, 64
3360 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3361 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3362 ldp x8, x9, [x29, -80]
3363 ldp x10, x11, [x29, -96]
3364 ldp x12, x13, [x29, -112]
3365 @@ -920,48 +920,48 @@ 1000:
3366 ret /* exit */
3368 * This is the start of the loop, designed to process images with small width
3369 * (less than pixblock_size * 2 pixels). In this case neither pipelining
3370 * nor prefetch is used.
3372 800:
3373 .if src_bpp_shift >= 0
3374 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
3375 - PF prfm PREFETCH_MODE, [SRC, DUMMY]
3376 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
3377 + PF prfm, PREFETCH_MODE, [SRC, DUMMY]
3378 .endif
3379 .if dst_r_bpp != 0
3380 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
3381 - PF prfm PREFETCH_MODE, [DST_R, DUMMY]
3382 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
3383 + PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
3384 .endif
3385 .if mask_bpp_shift >= 0
3386 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
3387 - PF prfm PREFETCH_MODE, [MASK, DUMMY]
3388 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
3389 + PF prfm, PREFETCH_MODE, [MASK, DUMMY]
3390 .endif
3391 /* Process exactly pixblock_size pixels if needed */
3392 tst W, #pixblock_size
3393 beq 100f
3394 pixld pixblock_size, dst_r_bpp, \
3395 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3396 fetch_src_pixblock
3397 pixld pixblock_size, mask_bpp, \
3398 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3399 - process_pixblock_head
3400 - process_pixblock_tail
3401 + \process_pixblock_head
3402 + \process_pixblock_tail
3403 pixst pixblock_size, dst_w_bpp, \
3404 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3405 100:
3406 /* Process the remaining trailing pixels in the scanline */
3407 process_trailing_pixels 0, 0, \
3408 - process_pixblock_head, \
3409 - process_pixblock_tail, \
3410 - process_pixblock_tail_head
3411 + \process_pixblock_head, \
3412 + \process_pixblock_tail, \
3413 + \process_pixblock_tail_head
3414 advance_to_next_scanline 800b
3416 - cleanup
3417 + \cleanup
3418 /* pop all registers */
3419 sub x29, x29, 64
3420 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3421 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3422 ldp x8, x9, [x29, -80]
3423 ldp x10, x11, [x29, -96]
3424 ldp x12, x13, [x29, -112]
3425 ldp x14, x15, [x29, -128]
3426 @@ -990,17 +990,17 @@ 9:
3427 .unreq DST_STRIDE
3428 .unreq MASK_STRIDE
3429 .unreq PF_CTL
3430 .unreq PF_X
3431 .unreq PF_SRC
3432 .unreq PF_DST
3433 .unreq PF_MASK
3434 .unreq DUMMY
3435 - .endfunc
3436 + pixman_end_asm_function
3437 .endm
3440 * A simplified variant of the function generation template for single
3441 * scanline processing (for implementing pixman combine functions)
3443 .macro generate_composite_function_scanline use_nearest_scaling, \
3444 fname, \
3445 @@ -1014,50 +1014,50 @@ 9:
3446 process_pixblock_head, \
3447 process_pixblock_tail, \
3448 process_pixblock_tail_head, \
3449 dst_w_basereg_ = 28, \
3450 dst_r_basereg_ = 4, \
3451 src_basereg_ = 0, \
3452 mask_basereg_ = 24
3454 - pixman_asm_function fname
3455 + pixman_asm_function \fname
3456 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
3459 * Make some macro arguments globally visible and accessible
3460 * from other macros
3462 - .set src_bpp, src_bpp_
3463 - .set mask_bpp, mask_bpp_
3464 - .set dst_w_bpp, dst_w_bpp_
3465 - .set pixblock_size, pixblock_size_
3466 - .set dst_w_basereg, dst_w_basereg_
3467 - .set dst_r_basereg, dst_r_basereg_
3468 - .set src_basereg, src_basereg_
3469 - .set mask_basereg, mask_basereg_
3470 + .set src_bpp, \src_bpp_
3471 + .set mask_bpp, \mask_bpp_
3472 + .set dst_w_bpp, \dst_w_bpp_
3473 + .set pixblock_size, \pixblock_size_
3474 + .set dst_w_basereg, \dst_w_basereg_
3475 + .set dst_r_basereg, \dst_r_basereg_
3476 + .set src_basereg, \src_basereg_
3477 + .set mask_basereg, \mask_basereg_
3479 -.if use_nearest_scaling != 0
3480 +.if \use_nearest_scaling != 0
3482 * Assign symbolic names to registers for nearest scaling
3484 W .req x0
3485 DST_W .req x1
3486 SRC .req x2
3487 VX .req x3
3488 UNIT_X .req x4
3489 SRC_WIDTH_FIXED .req x5
3490 MASK .req x6
3491 TMP1 .req x8
3492 TMP2 .req x9
3493 DST_R .req x10
3494 DUMMY .req x30
3496 .macro pixld_src x:vararg
3497 - pixld_s x
3498 + pixld_s \x
3499 .endm
3501 sxtw x0, w0
3502 sxtw x3, w3
3503 sxtw x4, w4
3504 sxtw x5, w5
3506 stp x29, x30, [sp, -16]!
3507 @@ -1075,84 +1075,84 @@ 9:
3508 W .req x0 /* width (is updated during processing) */
3509 DST_W .req x1 /* destination buffer pointer for writes */
3510 SRC .req x2 /* source buffer pointer */
3511 MASK .req x3 /* mask pointer */
3512 DST_R .req x4 /* destination buffer pointer for reads */
3513 DUMMY .req x30
3515 .macro pixld_src x:vararg
3516 - pixld x
3517 + pixld \x
3518 .endm
3520 sxtw x0, w0
3522 stp x29, x30, [sp, -16]!
3523 mov x29, sp
3524 sub sp, sp, 64
3525 sub x29, x29, 64
3526 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3527 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3528 .endif
3530 -.if (((flags) & FLAG_DST_READWRITE) != 0)
3531 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
3532 .set dst_r_bpp, dst_w_bpp
3533 .else
3534 .set dst_r_bpp, 0
3535 .endif
3536 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3537 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3538 .set DEINTERLEAVE_32BPP_ENABLED, 1
3539 .else
3540 .set DEINTERLEAVE_32BPP_ENABLED, 0
3541 .endif
3543 .macro fetch_src_pixblock
3544 pixld_src pixblock_size, src_bpp, \
3545 (src_basereg - pixblock_size * src_bpp / 64), SRC
3546 .endm
3548 - init
3549 + \init
3550 mov DST_R, DST_W
3552 cmp W, #pixblock_size
3553 blt 800f
3555 - ensure_destination_ptr_alignment process_pixblock_head, \
3556 - process_pixblock_tail, \
3557 - process_pixblock_tail_head
3558 + ensure_destination_ptr_alignment \process_pixblock_head, \
3559 + \process_pixblock_tail, \
3560 + \process_pixblock_tail_head
3562 subs W, W, #pixblock_size
3563 blt 700f
3565 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
3566 pixld_a pixblock_size, dst_r_bpp, \
3567 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3568 fetch_src_pixblock
3569 pixld pixblock_size, mask_bpp, \
3570 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3571 - process_pixblock_head
3572 + \process_pixblock_head
3573 subs W, W, #pixblock_size
3574 blt 200f
3575 100:
3576 - process_pixblock_tail_head
3577 + \process_pixblock_tail_head
3578 subs W, W, #pixblock_size
3579 bge 100b
3580 200:
3581 - process_pixblock_tail
3582 + \process_pixblock_tail
3583 pixst_a pixblock_size, dst_w_bpp, \
3584 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3585 700:
3586 /* Process the remaining trailing pixels in the scanline (dst aligned) */
3587 process_trailing_pixels 0, 1, \
3588 - process_pixblock_head, \
3589 - process_pixblock_tail, \
3590 - process_pixblock_tail_head
3591 + \process_pixblock_head, \
3592 + \process_pixblock_tail, \
3593 + \process_pixblock_tail_head
3595 - cleanup
3596 -.if use_nearest_scaling != 0
3597 + \cleanup
3598 +.if \use_nearest_scaling != 0
3599 sub x29, x29, 64
3600 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3601 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3602 ldp x8, x9, [x29, -80]
3603 ldr x10, [x29, -96]
3604 mov sp, x29
3605 ldp x29, x30, [sp], 16
3606 ret /* exit */
3607 @@ -1162,22 +1162,22 @@ 700:
3608 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3609 mov sp, x29
3610 ldp x29, x30, [sp], 16
3611 ret /* exit */
3612 .endif
3613 800:
3614 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
3615 process_trailing_pixels 0, 0, \
3616 - process_pixblock_head, \
3617 - process_pixblock_tail, \
3618 - process_pixblock_tail_head
3619 + \process_pixblock_head, \
3620 + \process_pixblock_tail, \
3621 + \process_pixblock_tail_head
3623 - cleanup
3624 -.if use_nearest_scaling != 0
3625 + \cleanup
3626 +.if \use_nearest_scaling != 0
3627 sub x29, x29, 64
3628 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3629 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3630 ldp x8, x9, [x29, -80]
3631 ldr x10, [x29, -88]
3632 mov sp, x29
3633 ldp x29, x30, [sp], 16
3634 ret /* exit */
3635 @@ -1208,25 +1208,25 @@ 800:
3636 .unreq DST_R
3637 .unreq DST_W
3638 .unreq W
3639 .endif
3641 .purgem fetch_src_pixblock
3642 .purgem pixld_src
3644 - .endfunc
3645 + pixman_end_asm_function
3646 .endm
3648 .macro generate_composite_function_single_scanline x:vararg
3649 - generate_composite_function_scanline 0, x
3650 + generate_composite_function_scanline 0, \x
3651 .endm
3653 .macro generate_composite_function_nearest_scanline x:vararg
3654 - generate_composite_function_scanline 1, x
3655 + generate_composite_function_scanline 1, \x
3656 .endm
3658 /* Default prologue/epilogue; nothing special needs to be done */
3660 .macro default_init
3661 .endm
3663 .macro default_cleanup
3664 @@ -1250,61 +1250,61 @@ 800:
3665 * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
3666 * into a planar a8r8g8b8 format (with a, r, g, b color components
3667 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
3669 * Warning: the conversion is destructive and the original
3670 * value (in) is lost.
3672 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
3673 - shrn &out_r&.8b, &in&.8h, #8
3674 - shrn &out_g&.8b, &in&.8h, #3
3675 - sli &in&.8h, &in&.8h, #5
3676 - movi &out_a&.8b, #255
3677 - sri &out_r&.8b, &out_r&.8b, #5
3678 - sri &out_g&.8b, &out_g&.8b, #6
3679 - shrn &out_b&.8b, &in&.8h, #2
3680 + shrn \()\out_r\().8b, \()\in\().8h, #8
3681 + shrn \()\out_g\().8b, \()\in\().8h, #3
3682 + sli \()\in\().8h, \()\in\().8h, #5
3683 + movi \()\out_a\().8b, #255
3684 + sri \()\out_r\().8b, \()\out_r\().8b, #5
3685 + sri \()\out_g\().8b, \()\out_g\().8b, #6
3686 + shrn \()\out_b\().8b, \()\in\().8h, #2
3687 .endm
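
(Editorial aside, not part of the patch.) convert_0565_to_8888 above expands eight r5g6b5 pixels per call, keeping the channels planar (one vector register per channel) and filling the low bits of each channel by replicating its high bits (the sri instructions, and the sli #5 + shrn #2 trick for blue). Below is a scalar C sketch of the same expansion for a single pixel, packed into one 32-bit word here purely for readability; the function name is illustrative only. convert_0565_to_x888, which follows, is the identical conversion minus the movi that forces alpha to 255.

#include <stdint.h>
#include <stdio.h>

/* One-pixel scalar version: widen each channel by replicating its top bits
 * into the freshly opened low bits, and force alpha to 0xff. */
static uint32_t expand_0565_to_8888(uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1f;
    uint32_t g6 = (p >> 5)  & 0x3f;
    uint32_t b5 =  p        & 0x1f;

    uint32_t r8 = (r5 << 3) | (r5 >> 2);   /* matches the sri #5 on out_r  */
    uint32_t g8 = (g6 << 2) | (g6 >> 4);   /* matches the sri #6 on out_g  */
    uint32_t b8 = (b5 << 3) | (b5 >> 2);   /* matches the sli #5 + shrn #2 */

    return 0xff000000u | (r8 << 16) | (g8 << 8) | b8;   /* packed a8r8g8b8 */
}

int main(void)
{
    printf("%08x\n", expand_0565_to_8888(0xf800));   /* pure red   -> ffff0000 */
    printf("%08x\n", expand_0565_to_8888(0x07e0));   /* pure green -> ff00ff00 */
    printf("%08x\n", expand_0565_to_8888(0x001f));   /* pure blue  -> ff0000ff */
    return 0;
}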
3689 .macro convert_0565_to_x888 in, out_r, out_g, out_b
3690 - shrn &out_r&.8b, &in&.8h, #8
3691 - shrn &out_g&.8b, &in&.8h, #3
3692 - sli &in&.8h, &in&.8h, #5
3693 - sri &out_r&.8b, &out_r&.8b, #5
3694 - sri &out_g&.8b, &out_g&.8b, #6
3695 - shrn &out_b&.8b, &in&.8h, #2
3696 + shrn \()\out_r\().8b, \()\in\().8h, #8
3697 + shrn \()\out_g\().8b, \()\in\().8h, #3
3698 + sli \()\in\().8h, \()\in\().8h, #5
3699 + sri \()\out_r\().8b, \()\out_r\().8b, #5
3700 + sri \()\out_g\().8b, \()\out_g\().8b, #6
3701 + shrn \()\out_b\().8b, \()\in\().8h, #2
3702 .endm
3705 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
3706 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
3707 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
3708 * registers (tmp1, tmp2)
3710 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
3711 - ushll &tmp1&.8h, &in_g&.8b, #7
3712 - shl &tmp1&.8h, &tmp1&.8h, #1
3713 - ushll &out&.8h, &in_r&.8b, #7
3714 - shl &out&.8h, &out&.8h, #1
3715 - ushll &tmp2&.8h, &in_b&.8b, #7
3716 - shl &tmp2&.8h, &tmp2&.8h, #1
3717 - sri &out&.8h, &tmp1&.8h, #5
3718 - sri &out&.8h, &tmp2&.8h, #11
3719 + ushll \()\tmp1\().8h, \()\in_g\().8b, #7
3720 + shl \()\tmp1\().8h, \()\tmp1\().8h, #1
3721 + ushll \()\out\().8h, \()\in_r\().8b, #7
3722 + shl \()\out\().8h, \()\out\().8h, #1
3723 + ushll \()\tmp2\().8h, \()\in_b\().8b, #7
3724 + shl \()\tmp2\().8h, \()\tmp2\().8h, #1
3725 + sri \()\out\().8h, \()\tmp1\().8h, #5
3726 + sri \()\out\().8h, \()\tmp2\().8h, #11
3727 .endm
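
(Editorial aside, not part of the patch.) convert_8888_to_0565 keeps only the top 5/6/5 bits of r/g/b: the ushll #7 followed by shl #1 is just a widening left shift by 8 (USHLL's immediate is limited to 7 for byte elements), and the two sri instructions merge the shifted green and blue fields under the red bits. A scalar C equivalent for one pixel, with an illustrative function name:

#include <stdint.h>
#include <stdio.h>

/* One-pixel scalar version: truncate r/g/b to their top 5/6/5 bits and pack
 * them as r5g6b5; the alpha byte of the a8r8g8b8 input is simply dropped. */
static uint16_t pack_8888_to_0565(uint32_t argb)
{
    uint32_t r = (argb >> 16) & 0xff;
    uint32_t g = (argb >> 8)  & 0xff;
    uint32_t b =  argb        & 0xff;

    return (uint16_t)(((r >> 3) << 11) |   /* R -> bits 15..11 */
                      ((g >> 2) << 5)  |   /* G -> bits 10..5  */
                       (b >> 3));          /* B -> bits 4..0   */
}

int main(void)
{
    printf("%04x\n", pack_8888_to_0565(0xffff0000));   /* red   -> f800 */
    printf("%04x\n", pack_8888_to_0565(0xff00ff00));   /* green -> 07e0 */
    printf("%04x\n", pack_8888_to_0565(0xff0000ff));   /* blue  -> 001f */
    return 0;
}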
3730 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
3731 * returned in the (out0, out1) register pair. Requires one temporary
3732 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
3733 * value from 'in' is lost
3735 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
3736 - shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
3737 - shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
3738 - sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
3739 - sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
3740 - sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
3741 - ushr &out1&.4h, &in&.4h, #8 /* R is in place */
3742 - sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
3743 - zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
3744 - zip2 &out1&.4h, &out0&.4h, &out1&.4h
3745 - mov &out0&.d[0], &tmp&.d[0]
3746 + shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
3747 + shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
3748 + sri \()\in\().4h, \()\in\().4h, #5 /* R is ready in top bits */
3749 + sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready in top bits */
3750 + sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready in top bits */
3751 + ushr \()\out1\().4h, \()\in\().4h, #8 /* R is in place */
3752 + sri \()\out0\().4h, \()\tmp\().4h, #8 /* G & B is in place */
3753 + zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is in place */
3754 + zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
3755 + mov \()\out0\().d[0], \()\tmp\().d[0]
3756 .endm
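
(Editorial aside, not part of the patch.) convert_four_0565_to_x888_packed produces packed x8r8g8b8 output rather than planar channels: per pixel it builds one halfword holding g8:b8 (in out0) and one holding 00:r8 (in out1), then zip1/zip2 interleave them into two 32-bit pixels per 64-bit register. A scalar C sketch of the end result, with illustrative names; the padding byte comes out as zero.

#include <stdint.h>
#include <stdio.h>

/* Scalar version of the packed four-pixel expansion: for each pixel, build
 * the g8:b8 halfword (out0 lane) and the 00:r8 halfword (out1 lane), then
 * combine them into one 32-bit x8r8g8b8 value (the x byte ends up as 0). */
static void expand_four_0565_to_x888(const uint16_t in[4], uint32_t out[4])
{
    for (int i = 0; i < 4; i++) {
        uint32_t r5 = (in[i] >> 11) & 0x1f;
        uint32_t g6 = (in[i] >> 5)  & 0x3f;
        uint32_t b5 =  in[i]        & 0x1f;

        uint32_t hi = (r5 << 3) | (r5 >> 2);              /* 00:r8 halfword */
        uint32_t lo = (((g6 << 2) | (g6 >> 4)) << 8)      /* g8:b8 halfword */
                    |  ((b5 << 3) | (b5 >> 2));

        out[i] = (hi << 16) | lo;
    }
}

int main(void)
{
    const uint16_t px[4] = { 0xf800, 0x07e0, 0x001f, 0xffff };
    uint32_t rgb[4];
    expand_four_0565_to_x888(px, rgb);
    for (int i = 0; i < 4; i++)
        printf("%08x\n", rgb[i]);   /* 00ff0000 0000ff00 000000ff 00ffffff */
    return 0;
}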