1 https://gitlab.freedesktop.org/pixman/pixman/-/merge_requests/71
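The paths below are Firefox's bundled copy of pixman (gfx/cairo/libpixman); the upstream merge request is linked above for reference. The change applied throughout is syntactic: macro arguments that were previously written as bare names or with '&name&' concatenation — a spelling some assemblers reject — are rewritten as '\name', with '\()' marking where the argument name ends before a register-lane suffix, and the 'PF' prefetch helper now takes the instruction mnemonic as an explicit first argument followed by a comma. As a minimal sketch of the argument-spelling change only (the 'demo_scale_*' macros are invented for illustration and are not part of the patch):

/* Old spelling, as on the '-' lines below: bare parameter names and
 * '&name&' concatenation. */
.macro demo_scale_old acc, reg
    umull   &acc&.8h, &reg&.8b, v28.8b
.endm

/* New spelling, as on the '+' lines: every argument is written as '\arg',
 * and '\()' tells the assembler where the argument name stops so that the
 * '.8h'/'.8b' lane suffix can follow directly. */
.macro demo_scale_new acc, reg
    umull   \()\acc\().8h, \()\reg\().8b, v28.8b
.endm

    demo_scale_new v2, v0   /* expands to: umull v2.8h, v0.8b, v28.8b */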
3 diff --git a/gfx/cairo/libpixman/src/pixman-arm-asm.h b/gfx/cairo/libpixman/src/pixman-arm-asm.h
4 --- a/gfx/cairo/libpixman/src/pixman-arm-asm.h
5 +++ b/gfx/cairo/libpixman/src/pixman-arm-asm.h
7 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
10 * Author: Jeff Muizelaar (jeff@infidigm.net)
14 /* Supplementary macro for setting function attributes */
15 -.macro pixman_asm_function fname
18 +.macro pixman_asm_function_impl fname
19 +#ifdef ASM_HAVE_FUNC_DIRECTIVE
25 - .type fname, %function
27 + .type \fname, %function
33 +.macro pixman_asm_function fname
34 +#ifdef ASM_LEADING_UNDERSCORE
35 + pixman_asm_function_impl _\fname
37 + pixman_asm_function_impl \fname
41 +.macro pixman_end_asm_function
42 +#ifdef ASM_HAVE_FUNC_DIRECTIVE
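In the .S files below, functions are bracketed by these wrappers: pixman_asm_function resolves the platform symbol name (prepending an underscore when ASM_LEADING_UNDERSCORE is defined) before delegating to pixman_asm_function_impl, and the new pixman_end_asm_function closes the function, with assembler-specific directives guarded by ASM_HAVE_FUNC_DIRECTIVE. A call site then looks like this sketch (the function name is made up, not from the patch):

pixman_asm_function my_fast_path_asm_neon  /* emitted as _my_fast_path_asm_neon
                                             * under ASM_LEADING_UNDERSCORE */
    /* ... function body ... */
    ret
pixman_end_asm_function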
46 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
47 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
48 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm-bilinear.S
50 * format conversion, and interpolation as separate macros which can be used
51 * as the basic building blocks for constructing bilinear scanline functions.
54 .macro bilinear_load_8888 reg1, reg2, tmp
57 add TMP1, TOP, TMP1, lsl #2
58 - ld1 {&reg1&.2s}, [TMP1], STRIDE
59 - ld1 {&reg2&.2s}, [TMP1]
60 + ld1 {\()\reg1\().2s}, [TMP1], STRIDE
61 + ld1 {\()\reg2\().2s}, [TMP1]
64 .macro bilinear_load_0565 reg1, reg2, tmp
67 add TMP1, TOP, TMP1, lsl #1
68 - ld1 {&reg2&.s}[0], [TMP1], STRIDE
69 - ld1 {&reg2&.s}[1], [TMP1]
70 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
71 + ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
72 + ld1 {\()\reg2\().s}[1], [TMP1]
73 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
76 .macro bilinear_load_and_vertical_interpolate_two_8888 \
77 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
79 - bilinear_load_8888 reg1, reg2, tmp1
80 - umull &acc1&.8h, &reg1&.8b, v28.8b
81 - umlal &acc1&.8h, &reg2&.8b, v29.8b
82 - bilinear_load_8888 reg3, reg4, tmp2
83 - umull &acc2&.8h, &reg3&.8b, v28.8b
84 - umlal &acc2&.8h, &reg4&.8b, v29.8b
85 + bilinear_load_8888 \reg1, \reg2, \tmp1
86 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
87 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
88 + bilinear_load_8888 \reg3, \reg4, \tmp2
89 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
90 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
93 .macro bilinear_load_and_vertical_interpolate_four_8888 \
94 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
95 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
96 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
98 bilinear_load_and_vertical_interpolate_two_8888 \
99 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
100 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
101 bilinear_load_and_vertical_interpolate_two_8888 \
102 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
103 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
106 .macro vzip reg1, reg2
107 - zip1 v24.8b, reg1, reg2
108 - zip2 reg2, reg1, reg2
110 + zip1 v24.8b, \reg1, \reg2
111 + zip2 \reg2, \reg1, \reg2
115 .macro vuzp reg1, reg2
116 - uzp1 v24.8b, reg1, reg2
117 - uzp2 reg2, reg1, reg2
119 + uzp1 v24.8b, \reg1, \reg2
120 + uzp2 \reg2, \reg1, \reg2
124 .macro bilinear_load_and_vertical_interpolate_two_0565 \
125 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
128 add TMP1, TOP, TMP1, lsl #1
131 add TMP2, TOP, TMP2, lsl #1
132 - ld1 {&acc2&.s}[0], [TMP1], STRIDE
133 - ld1 {&acc2&.s}[2], [TMP2], STRIDE
134 - ld1 {&acc2&.s}[1], [TMP1]
135 - ld1 {&acc2&.s}[3], [TMP2]
136 - convert_0565_to_x888 acc2, reg3, reg2, reg1
137 - vzip &reg1&.8b, &reg3&.8b
138 - vzip &reg2&.8b, &reg4&.8b
139 - vzip &reg3&.8b, &reg4&.8b
140 - vzip &reg1&.8b, &reg2&.8b
141 - umull &acc1&.8h, &reg1&.8b, v28.8b
142 - umlal &acc1&.8h, &reg2&.8b, v29.8b
143 - umull &acc2&.8h, &reg3&.8b, v28.8b
144 - umlal &acc2&.8h, &reg4&.8b, v29.8b
145 + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
146 + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
147 + ld1 {\()\acc2\().s}[1], [TMP1]
148 + ld1 {\()\acc2\().s}[3], [TMP2]
149 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
150 + vzip \()\reg1\().8b, \()\reg3\().8b
151 + vzip \()\reg2\().8b, \()\reg4\().8b
152 + vzip \()\reg3\().8b, \()\reg4\().8b
153 + vzip \()\reg1\().8b, \()\reg2\().8b
154 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
155 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
156 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
157 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
160 .macro bilinear_load_and_vertical_interpolate_four_0565 \
161 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
162 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
163 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
167 add TMP1, TOP, TMP1, lsl #1
170 add TMP2, TOP, TMP2, lsl #1
171 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE
172 - ld1 {&xacc2&.s}[2], [TMP2], STRIDE
173 - ld1 {&xacc2&.s}[1], [TMP1]
174 - ld1 {&xacc2&.s}[3], [TMP2]
175 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
176 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
177 + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
178 + ld1 {\()\xacc2\().s}[1], [TMP1]
179 + ld1 {\()\xacc2\().s}[3], [TMP2]
180 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
183 add TMP1, TOP, TMP1, lsl #1
186 add TMP2, TOP, TMP2, lsl #1
187 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE
188 - vzip &xreg1&.8b, &xreg3&.8b
189 - ld1 {&yacc2&.s}[2], [TMP2], STRIDE
190 - vzip &xreg2&.8b, &xreg4&.8b
191 - ld1 {&yacc2&.s}[1], [TMP1]
192 - vzip &xreg3&.8b, &xreg4&.8b
193 - ld1 {&yacc2&.s}[3], [TMP2]
194 - vzip &xreg1&.8b, &xreg2&.8b
195 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
196 - umull &xacc1&.8h, &xreg1&.8b, v28.8b
197 - vzip &yreg1&.8b, &yreg3&.8b
198 - umlal &xacc1&.8h, &xreg2&.8b, v29.8b
199 - vzip &yreg2&.8b, &yreg4&.8b
200 - umull &xacc2&.8h, &xreg3&.8b, v28.8b
201 - vzip &yreg3&.8b, &yreg4&.8b
202 - umlal &xacc2&.8h, &xreg4&.8b, v29.8b
203 - vzip &yreg1&.8b, &yreg2&.8b
204 - umull &yacc1&.8h, &yreg1&.8b, v28.8b
205 - umlal &yacc1&.8h, &yreg2&.8b, v29.8b
206 - umull &yacc2&.8h, &yreg3&.8b, v28.8b
207 - umlal &yacc2&.8h, &yreg4&.8b, v29.8b
208 + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
209 + vzip \()\xreg1\().8b, \()\xreg3\().8b
210 + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
211 + vzip \()\xreg2\().8b, \()\xreg4\().8b
212 + ld1 {\()\yacc2\().s}[1], [TMP1]
213 + vzip \()\xreg3\().8b, \()\xreg4\().8b
214 + ld1 {\()\yacc2\().s}[3], [TMP2]
215 + vzip \()\xreg1\().8b, \()\xreg2\().8b
216 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
217 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
218 + vzip \()\yreg1\().8b, \()\yreg3\().8b
219 + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
220 + vzip \()\yreg2\().8b, \()\yreg4\().8b
221 + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
222 + vzip \()\yreg3\().8b, \()\yreg4\().8b
223 + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
224 + vzip \()\yreg1\().8b, \()\yreg2\().8b
225 + umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
226 + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
227 + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
228 + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
231 .macro bilinear_store_8888 numpix, tmp1, tmp2
234 st1 {v0.2s, v1.2s}, [OUT], #16
236 +.elseif \numpix == 2
237 st1 {v0.2s}, [OUT], #8
239 +.elseif \numpix == 1
240 st1 {v0.s}[0], [OUT], #4
242 - .error bilinear_store_8888 numpix is unsupported
243 + .error bilinear_store_8888 \numpix is unsupported
247 .macro bilinear_store_0565 numpix, tmp1, tmp2
252 - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
254 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
256 st1 {v1.4h}, [OUT], #8
258 +.elseif \numpix == 2
259 st1 {v1.s}[0], [OUT], #4
261 +.elseif \numpix == 1
262 st1 {v1.h}[0], [OUT], #2
264 - .error bilinear_store_0565 numpix is unsupported
265 + .error bilinear_store_0565 \numpix is unsupported
271 * Macros for loading mask pixels into register 'mask'.
272 * dup must be done in somewhere else.
274 .macro bilinear_load_mask_x numpix, mask
277 .macro bilinear_load_mask_8 numpix, mask
279 - ld1 {&mask&.s}[0], [MASK], #4
281 - ld1 {&mask&.h}[0], [MASK], #2
283 - ld1 {&mask&.b}[0], [MASK], #1
285 + ld1 {\()\mask\().s}[0], [MASK], #4
286 +.elseif \numpix == 2
287 + ld1 {\()\mask\().h}[0], [MASK], #2
288 +.elseif \numpix == 1
289 + ld1 {\()\mask\().b}[0], [MASK], #1
291 - .error bilinear_load_mask_8 numpix is unsupported
292 + .error bilinear_load_mask_8 \numpix is unsupported
294 - prfm PREFETCH_MODE, [MASK, #prefetch_offset]
295 + prfum PREFETCH_MODE, [MASK, #(prefetch_offset)]
298 .macro bilinear_load_mask mask_fmt, numpix, mask
299 - bilinear_load_mask_&mask_fmt numpix, mask
300 + bilinear_load_mask_\mask_fmt \numpix, \mask
305 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
306 * Interleave should be done somewhere else.
308 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
311 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
314 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
316 - ld1 {&dst0&.2s, &dst1&.2s}, [OUT]
318 - ld1 {&dst0&.2s}, [OUT]
320 - ld1 {&dst0&.s}[0], [OUT]
322 + ld1 {\()\dst0\().2s, \()\dst1\().2s}, [OUT]
323 +.elseif \numpix == 2
324 + ld1 {\()\dst0\().2s}, [OUT]
325 +.elseif \numpix == 1
326 + ld1 {\()\dst0\().s}[0], [OUT]
328 - .error bilinear_load_dst_8888 numpix is unsupported
329 + .error bilinear_load_dst_8888 \numpix is unsupported
331 - mov &dst01&.d[0], &dst0&.d[0]
332 - mov &dst01&.d[1], &dst1&.d[0]
333 + mov \()\dst01\().d[0], \()\dst0\().d[0]
334 + mov \()\dst01\().d[1], \()\dst1\().d[0]
335 prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)]
338 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
339 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
340 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
343 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
344 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
345 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
348 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
349 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
350 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
354 * Macros for duplicating partially loaded mask to fill entire register.
355 * We will apply mask to interleaved source pixels, that is
356 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
357 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
358 * So, we need to duplicate loaded mask into whole register.
359 @@ -293,84 +293,85 @@
360 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
361 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
362 * We can do some optimizations for this including last pixel cases.
364 .macro bilinear_duplicate_mask_x numpix, mask
367 .macro bilinear_duplicate_mask_8 numpix, mask
369 - dup &mask&.2s, &mask&.s[0]
371 - dup &mask&.4h, &mask&.h[0]
373 - dup &mask&.8b, &mask&.b[0]
375 + dup \()\mask\().2s, \()\mask\().s[0]
376 +.elseif \numpix == 2
377 + dup \()\mask\().4h, \()\mask\().h[0]
378 +.elseif \numpix == 1
379 + dup \()\mask\().8b, \()\mask\().b[0]
381 - .error bilinear_duplicate_mask_8 is unsupported
382 + .error bilinear_duplicate_\mask_8 is unsupported
386 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
387 - bilinear_duplicate_mask_&mask_fmt numpix, mask
388 + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
392 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
393 * Interleave should be done when maks is enabled or operator is 'over'.
395 .macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01
396 - vuzp &src0&.8b, &src1&.8b
397 - vuzp &dst0&.8b, &dst1&.8b
398 - vuzp &src0&.8b, &src1&.8b
399 - vuzp &dst0&.8b, &dst1&.8b
400 - mov &src01&.d[1], &src1&.d[0]
401 - mov &src01&.d[0], &src0&.d[0]
402 - mov &dst01&.d[1], &dst1&.d[0]
403 - mov &dst01&.d[0], &dst0&.d[0]
404 + vuzp \()\src0\().8b, \()\src1\().8b
405 + vuzp \()\dst0\().8b, \()\dst1\().8b
406 + vuzp \()\src0\().8b, \()\src1\().8b
407 + vuzp \()\dst0\().8b, \()\dst1\().8b
408 + mov \()\src01\().d[1], \()\src1\().d[0]
409 + mov \()\src01\().d[0], \()\src0\().d[0]
410 + mov \()\dst01\().d[1], \()\dst1\().d[0]
411 + mov \()\dst01\().d[0], \()\dst0\().d[0]
414 .macro bilinear_interleave_src_dst_x_src \
415 numpix, src0, src1, src01, dst0, dst1, dst01
418 .macro bilinear_interleave_src_dst_x_over \
419 numpix, src0, src1, src01, dst0, dst1, dst01
421 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
422 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
425 .macro bilinear_interleave_src_dst_x_add \
426 numpix, src0, src1, src01, dst0, dst1, dst01
427 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
429 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
432 .macro bilinear_interleave_src_dst_8_src \
433 numpix, src0, src1, src01, dst0, dst1, dst01
435 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
436 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
439 .macro bilinear_interleave_src_dst_8_over \
440 numpix, src0, src1, src01, dst0, dst1, dst01
442 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
443 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
446 .macro bilinear_interleave_src_dst_8_add \
447 numpix, src0, src1, src01, dst0, dst1, dst01
449 - bilinear_interleave src0, src1, src01, dst0, dst1, dst01
450 + bilinear_interleave \src0, \src1, \src01, \dst0, \dst1, \dst01
453 .macro bilinear_interleave_src_dst \
454 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
456 - bilinear_interleave_src_dst_&mask_fmt&_&op \
457 - numpix, src0, src1, src01, dst0, dst1, dst01
458 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
459 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
464 * Macros for applying masks to src pixels. (see combine_mask_u() function)
465 * src, dst should be in interleaved form.
466 * mask register should be in form (m0, m1, m2, m3).
468 @@ -378,191 +379,191 @@
469 numpix, src0, src1, src01, mask, \
470 tmp01, tmp23, tmp45, tmp67
473 .macro bilinear_apply_mask_to_src_8 \
474 numpix, src0, src1, src01, mask, \
475 tmp01, tmp23, tmp45, tmp67
477 - umull &tmp01&.8h, &src0&.8b, &mask&.8b
478 - umull &tmp23&.8h, &src1&.8b, &mask&.8b
479 + umull \()\tmp01\().8h, \()\src0\().8b, \()\mask\().8b
480 + umull \()\tmp23\().8h, \()\src1\().8b, \()\mask\().8b
482 - urshr &tmp45&.8h, &tmp01&.8h, #8
483 - urshr &tmp67&.8h, &tmp23&.8h, #8
484 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
485 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
487 - raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h
488 - raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h
489 - mov &src01&.d[0], &src0&.d[0]
490 - mov &src01&.d[1], &src1&.d[0]
491 + raddhn \()\src0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
492 + raddhn \()\src1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
493 + mov \()\src01\().d[0], \()\src0\().d[0]
494 + mov \()\src01\().d[1], \()\src1\().d[0]
497 .macro bilinear_apply_mask_to_src \
498 mask_fmt, numpix, src0, src1, src01, mask, \
499 tmp01, tmp23, tmp45, tmp67
501 - bilinear_apply_mask_to_src_&mask_fmt \
502 - numpix, src0, src1, src01, mask, \
503 - tmp01, tmp23, tmp45, tmp67
504 + bilinear_apply_mask_to_src_\()\mask_fmt \
505 + \numpix, \src0, \src1, \src01, \mask, \
506 + \tmp01, \tmp23, \tmp45, \tmp67
511 * Macros for combining src and destination pixels.
512 * Interleave or not is depending on operator 'op'.
514 .macro bilinear_combine_src \
515 numpix, src0, src1, src01, dst0, dst1, dst01, \
516 tmp01, tmp23, tmp45, tmp67, tmp8
519 .macro bilinear_combine_over \
520 numpix, src0, src1, src01, dst0, dst1, dst01, \
521 tmp01, tmp23, tmp45, tmp67, tmp8
523 - dup &tmp8&.2s, &src1&.s[1]
524 + dup \()\tmp8\().2s, \()\src1\().s[1]
526 - mvn &tmp8&.8b, &tmp8&.8b
527 + mvn \()\tmp8\().8b, \()\tmp8\().8b
529 - umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b
530 + umull \()\tmp01\().8h, \()\dst0\().8b, \()\tmp8\().8b
532 - umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b
533 + umull \()\tmp23\().8h, \()\dst1\().8b, \()\tmp8\().8b
535 - urshr &tmp45&.8h, &tmp01&.8h, #8
536 - urshr &tmp67&.8h, &tmp23&.8h, #8
537 + urshr \()\tmp45\().8h, \()\tmp01\().8h, #8
538 + urshr \()\tmp67\().8h, \()\tmp23\().8h, #8
540 - raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h
541 - raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h
542 - mov &dst01&.d[0], &dst0&.d[0]
543 - mov &dst01&.d[1], &dst1&.d[0]
544 + raddhn \()\dst0\().8b, \()\tmp45\().8h, \()\tmp01\().8h
545 + raddhn \()\dst1\().8b, \()\tmp67\().8h, \()\tmp23\().8h
546 + mov \()\dst01\().d[0], \()\dst0\().d[0]
547 + mov \()\dst01\().d[1], \()\dst1\().d[0]
549 - uqadd &src0&.8b, &dst0&.8b, &src0&.8b
550 - uqadd &src1&.8b, &dst1&.8b, &src1&.8b
551 - mov &src01&.d[0], &src0&.d[0]
552 - mov &src01&.d[1], &src1&.d[0]
553 + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
554 + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
555 + mov \()\src01\().d[0], \()\src0\().d[0]
556 + mov \()\src01\().d[1], \()\src1\().d[0]
559 .macro bilinear_combine_add \
560 numpix, src0, src1, src01, dst0, dst1, dst01, \
561 tmp01, tmp23, tmp45, tmp67, tmp8
563 - uqadd &src0&.8b, &dst0&.8b, &src0&.8b
564 - uqadd &src1&.8b, &dst1&.8b, &src1&.8b
565 - mov &src01&.d[0], &src0&.d[0]
566 - mov &src01&.d[1], &src1&.d[0]
567 + uqadd \()\src0\().8b, \()\dst0\().8b, \()\src0\().8b
568 + uqadd \()\src1\().8b, \()\dst1\().8b, \()\src1\().8b
569 + mov \()\src01\().d[0], \()\src0\().d[0]
570 + mov \()\src01\().d[1], \()\src1\().d[0]
573 .macro bilinear_combine \
574 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
575 tmp01, tmp23, tmp45, tmp67, tmp8
577 - bilinear_combine_&op \
578 - numpix, src0, src1, src01, dst0, dst1, dst01, \
579 - tmp01, tmp23, tmp45, tmp67, tmp8
580 + bilinear_combine_\()\op \
581 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
582 + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
586 * Macros for final deinterleaving of destination pixels if needed.
588 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
589 - vuzp &dst0&.8b, &dst1&.8b
590 + vuzp \()\dst0\().8b, \()\dst1\().8b
592 - vuzp &dst0&.8b, &dst1&.8b
593 - mov &dst01&.d[0], &dst0&.d[0]
594 - mov &dst01&.d[1], &dst1&.d[0]
595 + vuzp \()\dst0\().8b, \()\dst1\().8b
596 + mov \()\dst01\().d[0], \()\dst0\().d[0]
597 + mov \()\dst01\().d[1], \()\dst1\().d[0]
600 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
603 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
604 - bilinear_deinterleave numpix, dst0, dst1, dst01
605 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
608 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
609 - bilinear_deinterleave numpix, dst0, dst1, dst01
610 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
613 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
614 - bilinear_deinterleave numpix, dst0, dst1, dst01
615 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
618 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
619 - bilinear_deinterleave numpix, dst0, dst1, dst01
620 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
623 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
624 - bilinear_deinterleave numpix, dst0, dst1, dst01
625 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
628 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
629 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
630 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
634 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
635 - bilinear_load_&src_fmt v0, v1, v2
636 - bilinear_load_mask mask_fmt, 1, v4
637 - bilinear_load_dst dst_fmt, op, 1, v18, v19, v9
638 + bilinear_load_\()\src_fmt v0, v1, v2
639 + bilinear_load_mask \mask_fmt, 1, v4
640 + bilinear_load_dst \dst_fmt, \op, 1, v18, v19, v9
641 umull v2.8h, v0.8b, v28.8b
642 umlal v2.8h, v1.8b, v29.8b
643 /* 5 cycles bubble */
644 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
645 umlsl v0.4s, v2.4h, v15.h[0]
646 umlal2 v0.4s, v2.8h, v15.h[0]
647 /* 5 cycles bubble */
648 - bilinear_duplicate_mask mask_fmt, 1, v4
649 + bilinear_duplicate_mask \mask_fmt, 1, v4
650 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
651 /* 3 cycles bubble */
654 bilinear_interleave_src_dst \
655 - mask_fmt, op, 1, v0, v1, v0, v18, v19, v9
656 + \mask_fmt, \op, 1, v0, v1, v0, v18, v19, v9
657 bilinear_apply_mask_to_src \
658 - mask_fmt, 1, v0, v1, v0, v4, \
659 + \mask_fmt, 1, v0, v1, v0, v4, \
662 - op, 1, v0, v1, v0, v18, v19, v9, \
663 + \op, 1, v0, v1, v0, v18, v19, v9, \
665 - bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0
666 - bilinear_store_&dst_fmt 1, v17, v18
667 + bilinear_deinterleave_dst \mask_fmt, \op, 1, v0, v1, v0
668 + bilinear_store_\()\dst_fmt 1, v17, v18
671 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
672 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
673 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
674 v1, v11, v18, v19, v20, v21, v22, v23
675 - bilinear_load_mask mask_fmt, 2, v4
676 - bilinear_load_dst dst_fmt, op, 2, v18, v19, v9
677 + bilinear_load_mask \mask_fmt, 2, v4
678 + bilinear_load_dst \dst_fmt, \op, 2, v18, v19, v9
679 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
680 umlsl v0.4s, v1.4h, v15.h[0]
681 umlal2 v0.4s, v1.8h, v15.h[0]
682 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
683 umlsl v10.4s, v11.4h, v15.h[4]
684 umlal2 v10.4s, v11.8h, v15.h[4]
685 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
686 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
687 - bilinear_duplicate_mask mask_fmt, 2, v4
688 + bilinear_duplicate_mask \mask_fmt, 2, v4
689 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
690 add v12.8h, v12.8h, v13.8h
692 bilinear_interleave_src_dst \
693 - mask_fmt, op, 2, v0, v1, v0, v18, v19, v9
694 + \mask_fmt, \op, 2, v0, v1, v0, v18, v19, v9
695 bilinear_apply_mask_to_src \
696 - mask_fmt, 2, v0, v1, v0, v4, \
697 + \mask_fmt, 2, v0, v1, v0, v4, \
700 - op, 2, v0, v1, v0, v18, v19, v9, \
701 + \op, 2, v0, v1, v0, v18, v19, v9, \
703 - bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0
704 - bilinear_store_&dst_fmt 2, v16, v17
705 + bilinear_deinterleave_dst \mask_fmt, \op, 2, v0, v1, v0
706 + bilinear_store_\()\dst_fmt 2, v16, v17
709 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
710 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
711 - v1, v11, v4, v5, v6, v7, v22, v23 \
712 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
713 + v1, v11, v4, v5, v6, v7, v22, v23, \
714 v3, v9, v16, v17, v20, v21, v18, v19
715 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
716 sub TMP1, TMP1, STRIDE
717 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
718 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
719 umlsl v0.4s, v1.4h, v15.h[0]
720 umlal2 v0.4s, v1.8h, v15.h[0]
721 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
722 @@ -575,33 +576,33 @@
723 ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
724 umlsl v8.4s, v9.4h, v15.h[4]
725 umlal2 v8.4s, v9.8h, v15.h[4]
726 add v12.8h, v12.8h, v13.8h
727 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
728 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
729 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
730 shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
731 - bilinear_load_mask mask_fmt, 4, v4
732 - bilinear_duplicate_mask mask_fmt, 4, v4
733 + bilinear_load_mask \mask_fmt, 4, v4
734 + bilinear_duplicate_mask \mask_fmt, 4, v4
735 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
738 add v12.8h, v12.8h, v13.8h
739 - bilinear_load_dst dst_fmt, op, 4, v2, v3, v21
740 + bilinear_load_dst \dst_fmt, \op, 4, v2, v3, v21
741 bilinear_interleave_src_dst \
742 - mask_fmt, op, 4, v0, v1, v0, v2, v3, v11
743 + \mask_fmt, \op, 4, v0, v1, v0, v2, v3, v11
744 bilinear_apply_mask_to_src \
745 - mask_fmt, 4, v0, v1, v0, v4, \
746 + \mask_fmt, 4, v0, v1, v0, v4, \
749 - op, 4, v0, v1, v0, v2, v3, v1, \
750 + \op, 4, v0, v1, v0, v2, v3, v1, \
752 - bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0
753 - bilinear_store_&dst_fmt 4, v6, v7
754 + bilinear_deinterleave_dst \mask_fmt, \op, 4, v0, v1, v0
755 + bilinear_store_\()\dst_fmt 4, v6, v7
758 .set BILINEAR_FLAG_USE_MASK, 1
759 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
762 * Main template macro for generating NEON optimized bilinear scanline functions.
764 @@ -631,24 +632,24 @@
765 bilinear_process_four_pixels, \
766 bilinear_process_pixblock_head, \
767 bilinear_process_pixblock_tail, \
768 bilinear_process_pixblock_tail_head, \
773 -pixman_asm_function fname
774 -.if pixblock_size == 8
775 -.elseif pixblock_size == 4
776 +pixman_asm_function \fname
777 +.if \pixblock_size == 8
778 +.elseif \pixblock_size == 4
780 .error unsupported pixblock size
783 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
784 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
793 @@ -694,32 +695,32 @@ pixman_asm_function fname
802 - .set prefetch_offset, prefetch_distance
803 + .set prefetch_offset, \prefetch_distance
805 stp x29, x30, [sp, -16]!
808 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
809 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
810 stp x10, x11, [x29, -80]
811 stp x12, x13, [x29, -96]
812 stp x14, x15, [x29, -112]
818 - mov WTMP1, #prefetch_distance
819 + mov WTMP1, #\prefetch_distance
820 umull PF_OFFS, WTMP1, UX
822 sub STRIDE, BOTTOM, TOP
828 @@ -730,73 +731,73 @@ pixman_asm_function fname
829 mov v25.d[0], v12.d[1]
830 mov v26.d[0], v13.d[0]
831 add v25.4h, v25.4h, v26.4h
832 mov v12.d[1], v25.d[0]
834 /* ensure good destination alignment */
837 - tst OUT, #(1 << dst_bpp_shift)
838 + tst OUT, #(1 << \dst_bpp_shift)
840 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
841 add v12.8h, v12.8h, v13.8h
842 - bilinear_process_last_pixel
843 + \bilinear_process_last_pixel
846 add v13.8h, v13.8h, v13.8h
847 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
848 add v12.8h, v12.8h, v13.8h
852 - tst OUT, #(1 << (dst_bpp_shift + 1))
853 + tst OUT, #(1 << (\dst_bpp_shift + 1))
855 - bilinear_process_two_pixels
856 + \bilinear_process_two_pixels
859 -.if pixblock_size == 8
860 +.if \pixblock_size == 8
863 - tst OUT, #(1 << (dst_bpp_shift + 2))
864 + tst OUT, #(1 << (\dst_bpp_shift + 2))
866 - bilinear_process_four_pixels
867 + \bilinear_process_four_pixels
871 - subs WIDTH, WIDTH, #pixblock_size
872 + subs WIDTH, WIDTH, #\pixblock_size
874 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
875 - bilinear_process_pixblock_head
876 - subs WIDTH, WIDTH, #pixblock_size
877 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
878 + \bilinear_process_pixblock_head
879 + subs WIDTH, WIDTH, #\pixblock_size
882 - bilinear_process_pixblock_tail_head
883 - subs WIDTH, WIDTH, #pixblock_size
884 + \bilinear_process_pixblock_tail_head
885 + subs WIDTH, WIDTH, #\pixblock_size
888 - bilinear_process_pixblock_tail
889 + \bilinear_process_pixblock_tail
891 -.if pixblock_size == 8
892 +.if \pixblock_size == 8
895 - bilinear_process_four_pixels
896 + \bilinear_process_four_pixels
899 /* handle the remaining trailing pixels */
902 - bilinear_process_two_pixels
903 + \bilinear_process_two_pixels
907 - bilinear_process_last_pixel
908 + \bilinear_process_last_pixel
911 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
912 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
914 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
915 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
916 ldp x10, x11, [x29, -80]
917 ldp x12, x13, [x29, -96]
918 ldp x14, x15, [x29, -112]
920 ldp x29, x30, [sp], 16
921 @@ -824,21 +825,21 @@ 300:
930 -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
931 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
936 +pixman_end_asm_function
940 /* src_8888_8_8888 */
941 .macro bilinear_src_8888_8_8888_process_last_pixel
942 bilinear_interpolate_last_pixel 8888, 8, 8888, src
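The next file is dominated by the PF prefetch lines gaining a comma after the mnemonic. PF is a helper macro defined in the companion header (not part of this diff) that conditionally re-emits the instruction it is handed; passing the mnemonic as a distinct first argument lets a strict macro parser separate it from the operand list. The sketch below shows only the calling shape — the guard-free body and the PF_X alias are stand-ins, not the real definitions:

/* Stand-in sketch of the PF helper's new calling convention. */
PF_X    .req    w12                     /* stand-in register alias */

.macro PF op, operands:vararg
        \op     \operands               /* forward mnemonic and operands */
.endm

        PF add, PF_X, PF_X, #8          /* expands to: add w12, w12, #8 */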
945 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
946 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
947 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.S
948 @@ -262,64 +262,64 @@
949 uqadd v18.8b, v0.8b, v22.8b
950 uqadd v19.8b, v1.8b, v23.8b
951 shrn v6.8b, v4.8h, #8
953 shrn v7.8b, v4.8h, #3
955 ushll v14.8h, v17.8b, #7
956 sli v14.8h, v14.8h, #1
957 - PF add PF_X, PF_X, #8
958 + PF add, PF_X, PF_X, #8
959 ushll v8.8h, v19.8b, #7
961 - PF tst PF_CTL, #0xF
962 + PF tst, PF_CTL, #0xF
965 - PF add PF_X, PF_X, #8
967 + PF add, PF_X, PF_X, #8
971 - PF sub PF_CTL, PF_CTL, #1
973 + PF sub, PF_CTL, PF_CTL, #1
976 shrn v30.8b, v4.8h, #2
977 umull v10.8h, v3.8b, v6.8b
978 - PF lsl DUMMY, PF_X, #src_bpp_shift
979 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
980 + PF lsl, DUMMY, PF_X, #src_bpp_shift
981 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
982 umull v11.8h, v3.8b, v7.8b
983 umull v12.8h, v3.8b, v30.8b
984 - PF lsl DUMMY, PF_X, #dst_bpp_shift
985 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
986 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
987 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
988 sri v14.8h, v8.8h, #5
989 - PF cmp PF_X, ORIG_W
990 + PF cmp, PF_X, ORIG_W
991 ushll v9.8h, v18.8b, #7
993 urshr v17.8h, v10.8h, #8
995 - PF sub PF_X, PF_X, ORIG_W
997 + PF sub, PF_X, PF_X, ORIG_W
999 urshr v19.8h, v11.8h, #8
1000 urshr v18.8h, v12.8h, #8
1002 - PF subs PF_CTL, PF_CTL, #0x10
1004 + PF subs, PF_CTL, PF_CTL, #0x10
1006 sri v14.8h, v9.8h, #11
1007 mov v28.d[0], v14.d[0]
1008 mov v29.d[0], v14.d[1]
1010 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1011 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1012 - PF add PF_SRC, PF_SRC, #1
1014 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1015 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1016 + PF add, PF_SRC, PF_SRC, #1
1018 raddhn v20.8b, v10.8h, v17.8h
1019 raddhn v23.8b, v11.8h, v19.8h
1021 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1022 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1023 - PF add PF_DST, PF_SRC, #1
1025 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1026 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1027 + PF add, PF_DST, PF_SRC, #1
1029 raddhn v22.8b, v12.8h, v18.8h
1030 st1 {v14.8h}, [DST_W], #16
1035 /* If we did not care much about the performance, we would just use this... */
1036 @@ -469,42 +469,42 @@ generate_composite_function \
1037 sri v14.8h, v8.8h, #5
1038 sri v14.8h, v9.8h, #11
1039 mov v28.d[0], v14.d[0]
1040 mov v29.d[0], v14.d[1]
1043 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
1044 sri v14.8h, v8.8h, #5
1045 - PF add PF_X, PF_X, #8
1046 - PF tst PF_CTL, #0xF
1047 + PF add, PF_X, PF_X, #8
1048 + PF tst, PF_CTL, #0xF
1051 - PF add PF_X, PF_X, #8
1052 - PF sub PF_CTL, PF_CTL, #1
1054 + PF add, PF_X, PF_X, #8
1055 + PF sub, PF_CTL, PF_CTL, #1
1057 sri v14.8h, v9.8h, #11
1058 mov v28.d[0], v14.d[0]
1059 mov v29.d[0], v14.d[1]
1060 - PF cmp PF_X, ORIG_W
1061 - PF lsl DUMMY, PF_X, #src_bpp_shift
1062 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1063 + PF cmp, PF_X, ORIG_W
1064 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1065 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1066 ushll v8.8h, v1.8b, #7
1067 sli v8.8h, v8.8h, #1
1068 st1 {v14.8h}, [DST_W], #16
1070 - PF sub PF_X, PF_X, ORIG_W
1071 - PF subs PF_CTL, PF_CTL, #0x10
1073 + PF sub, PF_X, PF_X, ORIG_W
1074 + PF subs, PF_CTL, PF_CTL, #0x10
1076 ushll v14.8h, v2.8b, #7
1077 sli v14.8h, v14.8h, #1
1079 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1080 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1081 - PF add PF_SRC, PF_SRC, #1
1083 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1084 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1085 + PF add, PF_SRC, PF_SRC, #1
1087 ushll v9.8h, v0.8b, #7
1088 sli v9.8h, v9.8h, #1
1091 generate_composite_function \
1092 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
1093 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1094 @@ -561,41 +561,41 @@ generate_composite_function \
1095 uqadd v31.8b, v3.8b, v7.8b
1098 .macro pixman_composite_add_8_8_process_pixblock_tail
1101 .macro pixman_composite_add_8_8_process_pixblock_tail_head
1103 - PF add PF_X, PF_X, #32
1104 - PF tst PF_CTL, #0xF
1105 + PF add, PF_X, PF_X, #32
1106 + PF tst, PF_CTL, #0xF
1107 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1109 - PF add PF_X, PF_X, #32
1110 - PF sub PF_CTL, PF_CTL, #1
1112 + PF add, PF_X, PF_X, #32
1113 + PF sub, PF_CTL, PF_CTL, #1
1115 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1116 - PF cmp PF_X, ORIG_W
1117 - PF lsl DUMMY, PF_X, #src_bpp_shift
1118 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1119 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1120 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1122 - PF sub PF_X, PF_X, ORIG_W
1123 - PF subs PF_CTL, PF_CTL, #0x10
1124 + PF cmp, PF_X, ORIG_W
1125 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1126 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1127 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1128 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1130 + PF sub, PF_X, PF_X, ORIG_W
1131 + PF subs, PF_CTL, PF_CTL, #0x10
1133 uqadd v28.8b, v0.8b, v4.8b
1135 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1136 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1137 - PF add PF_SRC, PF_SRC, #1
1138 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1139 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1140 - PF add PF_DST, PF_DST, #1
1142 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1143 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1144 + PF add, PF_SRC, PF_SRC, #1
1145 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1146 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1147 + PF add, PF_DST, PF_DST, #1
1149 uqadd v29.8b, v1.8b, v5.8b
1150 uqadd v30.8b, v2.8b, v6.8b
1151 uqadd v31.8b, v3.8b, v7.8b
1154 generate_composite_function \
1155 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
1156 @@ -607,41 +607,41 @@ generate_composite_function \
1157 pixman_composite_add_8_8_process_pixblock_head, \
1158 pixman_composite_add_8_8_process_pixblock_tail, \
1159 pixman_composite_add_8_8_process_pixblock_tail_head
1161 /******************************************************************************/
1163 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
1165 - PF add PF_X, PF_X, #8
1166 - PF tst PF_CTL, #0xF
1167 + PF add, PF_X, PF_X, #8
1168 + PF tst, PF_CTL, #0xF
1169 ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1171 - PF add PF_X, PF_X, #8
1172 - PF sub PF_CTL, PF_CTL, #1
1174 + PF add, PF_X, PF_X, #8
1175 + PF sub, PF_CTL, PF_CTL, #1
1177 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1178 - PF cmp PF_X, ORIG_W
1179 - PF lsl DUMMY, PF_X, #src_bpp_shift
1180 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1181 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1182 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1184 - PF sub PF_X, PF_X, ORIG_W
1185 - PF subs PF_CTL, PF_CTL, #0x10
1186 + PF cmp, PF_X, ORIG_W
1187 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1188 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1189 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1190 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1192 + PF sub, PF_X, PF_X, ORIG_W
1193 + PF subs, PF_CTL, PF_CTL, #0x10
1195 uqadd v28.8b, v0.8b, v4.8b
1197 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1198 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1199 - PF add PF_SRC, PF_SRC, #1
1200 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1201 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1202 - PF add PF_DST, PF_DST, #1
1204 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1205 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1206 + PF add, PF_SRC, PF_SRC, #1
1207 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1208 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1209 + PF add, PF_DST, PF_DST, #1
1211 uqadd v29.8b, v1.8b, v5.8b
1212 uqadd v30.8b, v2.8b, v6.8b
1213 uqadd v31.8b, v3.8b, v7.8b
1216 generate_composite_function \
1217 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
1218 @@ -684,55 +684,55 @@ generate_composite_function_single_scanl
1219 raddhn v29.8b, v15.8h, v9.8h
1220 raddhn v30.8b, v16.8h, v10.8h
1221 raddhn v31.8b, v17.8h, v11.8h
1224 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1225 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1226 urshr v14.8h, v8.8h, #8
1227 - PF add PF_X, PF_X, #8
1228 - PF tst PF_CTL, #0xF
1229 + PF add, PF_X, PF_X, #8
1230 + PF tst, PF_CTL, #0xF
1231 urshr v15.8h, v9.8h, #8
1232 urshr v16.8h, v10.8h, #8
1233 urshr v17.8h, v11.8h, #8
1235 - PF add PF_X, PF_X, #8
1236 - PF sub PF_CTL, PF_CTL, #1
1238 + PF add, PF_X, PF_X, #8
1239 + PF sub, PF_CTL, PF_CTL, #1
1241 raddhn v28.8b, v14.8h, v8.8h
1242 raddhn v29.8b, v15.8h, v9.8h
1243 - PF cmp PF_X, ORIG_W
1244 + PF cmp, PF_X, ORIG_W
1245 raddhn v30.8b, v16.8h, v10.8h
1246 raddhn v31.8b, v17.8h, v11.8h
1248 - PF lsl DUMMY, PF_X, #src_bpp_shift
1249 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1250 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1251 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1253 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1254 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1255 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1256 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1257 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1259 - PF sub PF_X, PF_X, ORIG_W
1261 + PF sub, PF_X, PF_X, ORIG_W
1263 umull v8.8h, v22.8b, v4.8b
1265 - PF subs PF_CTL, PF_CTL, #0x10
1267 + PF subs, PF_CTL, PF_CTL, #0x10
1269 umull v9.8h, v22.8b, v5.8b
1271 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1272 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1273 - PF add PF_SRC, PF_SRC, #1
1275 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1276 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1277 + PF add, PF_SRC, PF_SRC, #1
1279 umull v10.8h, v22.8b, v6.8b
1281 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1282 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1283 - PF add PF_DST, PF_DST, #1
1285 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1286 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1287 + PF add, PF_DST, PF_DST, #1
1289 umull v11.8h, v22.8b, v7.8b
1292 generate_composite_function_single_scanline \
1293 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
1294 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1295 8, /* number of pixels, processed in a single block */ \
1296 @@ -754,59 +754,59 @@ generate_composite_function_single_scanl
1297 uqadd v29.8b, v1.8b, v29.8b
1298 uqadd v30.8b, v2.8b, v30.8b
1299 uqadd v31.8b, v3.8b, v31.8b
1302 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
1303 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1304 urshr v14.8h, v8.8h, #8
1305 - PF add PF_X, PF_X, #8
1306 - PF tst PF_CTL, #0xF
1307 + PF add, PF_X, PF_X, #8
1308 + PF tst, PF_CTL, #0xF
1309 urshr v15.8h, v9.8h, #8
1310 urshr v16.8h, v10.8h, #8
1311 urshr v17.8h, v11.8h, #8
1313 - PF add PF_X, PF_X, #8
1314 - PF sub PF_CTL, PF_CTL, #1
1316 + PF add, PF_X, PF_X, #8
1317 + PF sub, PF_CTL, PF_CTL, #1
1319 raddhn v28.8b, v14.8h, v8.8h
1320 raddhn v29.8b, v15.8h, v9.8h
1321 - PF cmp PF_X, ORIG_W
1322 + PF cmp, PF_X, ORIG_W
1323 raddhn v30.8b, v16.8h, v10.8h
1324 raddhn v31.8b, v17.8h, v11.8h
1325 uqadd v28.8b, v0.8b, v28.8b
1326 uqadd v29.8b, v1.8b, v29.8b
1327 uqadd v30.8b, v2.8b, v30.8b
1328 uqadd v31.8b, v3.8b, v31.8b
1330 - PF lsl DUMMY, PF_X, #src_bpp_shift
1331 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1332 + PF lsl, DUMMY, PF_X, #src_bpp_shift
1333 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1335 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1336 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1337 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1338 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1339 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1341 - PF sub PF_X, PF_X, ORIG_W
1343 + PF sub, PF_X, PF_X, ORIG_W
1345 umull v8.8h, v22.8b, v4.8b
1347 - PF subs PF_CTL, PF_CTL, #0x10
1349 + PF subs, PF_CTL, PF_CTL, #0x10
1351 umull v9.8h, v22.8b, v5.8b
1353 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1354 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1355 - PF add PF_SRC, PF_SRC, #1
1357 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1358 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1359 + PF add, PF_SRC, PF_SRC, #1
1361 umull v10.8h, v22.8b, v6.8b
1363 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1364 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1365 - PF add PF_DST, PF_DST, #1
1367 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1368 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1369 + PF add, PF_DST, PF_DST, #1
1371 umull v11.8h, v22.8b, v7.8b
1374 generate_composite_function \
1375 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
1376 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1377 8, /* number of pixels, processed in a single block */ \
1378 @@ -860,40 +860,40 @@ generate_composite_function_single_scanl
1379 urshr v16.8h, v10.8h, #8
1380 urshr v17.8h, v11.8h, #8
1381 raddhn v28.8b, v14.8h, v8.8h
1382 raddhn v29.8b, v15.8h, v9.8h
1383 raddhn v30.8b, v16.8h, v10.8h
1384 raddhn v31.8b, v17.8h, v11.8h
1385 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1386 uqadd v28.8b, v0.8b, v28.8b
1387 - PF add PF_X, PF_X, #8
1388 - PF tst PF_CTL, #0x0F
1390 - PF add PF_X, PF_X, #8
1391 - PF sub PF_CTL, PF_CTL, #1
1392 + PF add, PF_X, PF_X, #8
1393 + PF tst, PF_CTL, #0x0F
1395 + PF add, PF_X, PF_X, #8
1396 + PF sub, PF_CTL, PF_CTL, #1
1398 uqadd v29.8b, v1.8b, v29.8b
1399 uqadd v30.8b, v2.8b, v30.8b
1400 uqadd v31.8b, v3.8b, v31.8b
1401 - PF cmp PF_X, ORIG_W
1402 + PF cmp, PF_X, ORIG_W
1403 umull v8.8h, v24.8b, v4.8b
1404 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1405 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1406 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1407 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1408 umull v9.8h, v24.8b, v5.8b
1410 - PF sub PF_X, PF_X, ORIG_W
1412 + PF sub, PF_X, PF_X, ORIG_W
1414 umull v10.8h, v24.8b, v6.8b
1415 - PF subs PF_CTL, PF_CTL, #0x10
1416 + PF subs, PF_CTL, PF_CTL, #0x10
1417 umull v11.8h, v24.8b, v7.8b
1419 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1420 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1421 - PF add PF_DST, PF_DST, #1
1423 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1424 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1425 + PF add, PF_DST, PF_DST, #1
1427 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1430 .macro pixman_composite_over_n_8888_init
1434 @@ -912,52 +912,52 @@ generate_composite_function \
1435 pixman_composite_over_8888_8888_process_pixblock_head, \
1436 pixman_composite_over_8888_8888_process_pixblock_tail, \
1437 pixman_composite_over_n_8888_process_pixblock_tail_head
1439 /******************************************************************************/
1441 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
1442 urshr v14.8h, v8.8h, #8
1443 - PF add PF_X, PF_X, #8
1444 - PF tst PF_CTL, #0xF
1445 + PF add, PF_X, PF_X, #8
1446 + PF tst, PF_CTL, #0xF
1447 urshr v15.8h, v9.8h, #8
1448 urshr v12.8h, v10.8h, #8
1449 urshr v13.8h, v11.8h, #8
1451 - PF add PF_X, PF_X, #8
1452 - PF sub PF_CTL, PF_CTL, #1
1454 + PF add, PF_X, PF_X, #8
1455 + PF sub, PF_CTL, PF_CTL, #1
1457 raddhn v28.8b, v14.8h, v8.8h
1458 raddhn v29.8b, v15.8h, v9.8h
1459 - PF cmp PF_X, ORIG_W
1460 + PF cmp, PF_X, ORIG_W
1461 raddhn v30.8b, v12.8h, v10.8h
1462 raddhn v31.8b, v13.8h, v11.8h
1463 uqadd v28.8b, v0.8b, v28.8b
1464 uqadd v29.8b, v1.8b, v29.8b
1465 uqadd v30.8b, v2.8b, v30.8b
1466 uqadd v31.8b, v3.8b, v31.8b
1467 ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
1469 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1470 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1471 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1472 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1473 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1475 - PF sub PF_X, PF_X, ORIG_W
1477 + PF sub, PF_X, PF_X, ORIG_W
1479 umull v8.8h, v22.8b, v4.8b
1481 - PF subs PF_CTL, PF_CTL, #0x10
1483 + PF subs, PF_CTL, PF_CTL, #0x10
1485 umull v9.8h, v22.8b, v5.8b
1486 umull v10.8h, v22.8b, v6.8b
1488 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1489 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1490 - PF add PF_DST, PF_DST, #1
1492 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1493 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1494 + PF add, PF_DST, PF_DST, #1
1496 umull v11.8h, v22.8b, v7.8b
1499 .macro pixman_composite_over_reverse_n_8888_init
1503 @@ -1405,45 +1405,45 @@ generate_composite_function \
1504 rshrn v28.8b, v8.8h, #8
1505 rshrn v29.8b, v9.8h, #8
1506 rshrn v30.8b, v10.8h, #8
1507 rshrn v31.8b, v11.8h, #8
1510 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1512 - PF add PF_X, PF_X, #8
1513 + PF add, PF_X, PF_X, #8
1514 rshrn v28.8b, v8.8h, #8
1515 - PF tst PF_CTL, #0x0F
1516 + PF tst, PF_CTL, #0x0F
1517 rshrn v29.8b, v9.8h, #8
1519 - PF add PF_X, PF_X, #8
1521 + PF add, PF_X, PF_X, #8
1523 rshrn v30.8b, v10.8h, #8
1525 - PF sub PF_CTL, PF_CTL, #1
1527 + PF sub, PF_CTL, PF_CTL, #1
1529 rshrn v31.8b, v11.8h, #8
1530 - PF cmp PF_X, ORIG_W
1531 + PF cmp, PF_X, ORIG_W
1532 umull v8.8h, v24.8b, v0.8b
1533 - PF lsl DUMMY, PF_X, #mask_bpp_shift
1534 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1535 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
1536 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1537 umull v9.8h, v24.8b, v1.8b
1539 - PF sub PF_X, PF_X, ORIG_W
1541 + PF sub, PF_X, PF_X, ORIG_W
1543 umull v10.8h, v24.8b, v2.8b
1545 - PF subs PF_CTL, PF_CTL, #0x10
1547 + PF subs, PF_CTL, PF_CTL, #0x10
1549 umull v11.8h, v24.8b, v3.8b
1551 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1552 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1553 - PF add PF_MASK, PF_MASK, #1
1555 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1556 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1557 + PF add, PF_MASK, PF_MASK, #1
1559 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1560 ursra v8.8h, v8.8h, #8
1561 ursra v9.8h, v9.8h, #8
1562 ursra v10.8h, v10.8h, #8
1563 ursra v11.8h, v11.8h, #8
1566 @@ -1486,45 +1486,45 @@ generate_composite_function \
1567 rshrn v28.8b, v0.8h, #8
1568 rshrn v29.8b, v1.8h, #8
1569 rshrn v30.8b, v2.8h, #8
1570 rshrn v31.8b, v3.8h, #8
1573 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1575 - PF add PF_X, PF_X, #8
1576 + PF add, PF_X, PF_X, #8
1577 rshrn v28.8b, v0.8h, #8
1578 - PF tst PF_CTL, #0x0F
1579 + PF tst, PF_CTL, #0x0F
1580 rshrn v29.8b, v1.8h, #8
1582 - PF add PF_X, PF_X, #8
1584 + PF add, PF_X, PF_X, #8
1586 rshrn v30.8b, v2.8h, #8
1588 - PF sub PF_CTL, PF_CTL, #1
1590 + PF sub, PF_CTL, PF_CTL, #1
1592 rshrn v31.8b, v3.8h, #8
1593 - PF cmp PF_X, ORIG_W
1594 + PF cmp, PF_X, ORIG_W
1595 umull v0.8h, v24.8b, v16.8b
1596 - PF lsl DUMMY, PF_X, mask_bpp_shift
1597 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1598 + PF lsl, DUMMY, PF_X, mask_bpp_shift
1599 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1600 umull v1.8h, v25.8b, v16.8b
1602 - PF sub PF_X, PF_X, ORIG_W
1604 + PF sub, PF_X, PF_X, ORIG_W
1606 umull v2.8h, v26.8b, v16.8b
1608 - PF subs PF_CTL, PF_CTL, #0x10
1610 + PF subs, PF_CTL, PF_CTL, #0x10
1612 umull v3.8h, v27.8b, v16.8b
1614 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1615 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1616 - PF add PF_MASK, PF_MASK, #1
1618 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1619 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1620 + PF add, PF_MASK, PF_MASK, #1
1622 st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1623 ursra v0.8h, v0.8h, #8
1624 ursra v1.8h, v1.8h, #8
1625 ursra v2.8h, v2.8h, #8
1626 ursra v3.8h, v3.8h, #8
1629 @@ -1594,54 +1594,54 @@ generate_composite_function \
1632 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1633 urshr v16.8h, v12.8h, #8
1634 ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
1635 urshr v17.8h, v13.8h, #8
1637 urshr v18.8h, v14.8h, #8
1638 - PF add PF_X, PF_X, #8
1639 + PF add, PF_X, PF_X, #8
1640 urshr v19.8h, v15.8h, #8
1641 - PF tst PF_CTL, #0x0F
1642 + PF tst, PF_CTL, #0x0F
1643 raddhn v28.8b, v16.8h, v12.8h
1645 - PF add PF_X, PF_X, #8
1647 + PF add, PF_X, PF_X, #8
1649 raddhn v29.8b, v17.8h, v13.8h
1651 - PF sub PF_CTL, PF_CTL, #1
1653 + PF sub, PF_CTL, PF_CTL, #1
1655 raddhn v30.8b, v18.8h, v14.8h
1656 - PF cmp PF_X, ORIG_W
1657 + PF cmp, PF_X, ORIG_W
1658 raddhn v31.8b, v19.8h, v15.8h
1659 - PF lsl DUMMY, PF_X, #dst_bpp_shift
1660 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
1661 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
1662 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
1663 umull v16.8h, v24.8b, v8.8b
1664 - PF lsl DUMMY, PF_X, #mask_bpp_shift
1665 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
1666 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
1667 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
1668 umull v17.8h, v24.8b, v9.8b
1670 - PF sub PF_X, PF_X, ORIG_W
1672 + PF sub, PF_X, PF_X, ORIG_W
1674 umull v18.8h, v24.8b, v10.8b
1676 - PF subs PF_CTL, PF_CTL, #0x10
1678 + PF subs, PF_CTL, PF_CTL, #0x10
1680 umull v19.8h, v24.8b, v11.8b
1682 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
1683 - PF ldrsb DUMMY, [PF_DST, DUMMY]
1684 - PF add PF_DST, PF_DST, #1
1686 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
1687 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
1688 + PF add, PF_DST, PF_DST, #1
1690 uqadd v28.8b, v0.8b, v28.8b
1692 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
1693 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
1694 - PF add PF_MASK, PF_MASK, #1
1696 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
1697 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
1698 + PF add, PF_MASK, PF_MASK, #1
1700 uqadd v29.8b, v1.8b, v29.8b
1701 uqadd v30.8b, v2.8b, v30.8b
1702 uqadd v31.8b, v3.8b, v31.8b
1703 urshr v12.8h, v16.8h, #8
1704 urshr v13.8h, v17.8h, #8
1705 urshr v14.8h, v18.8h, #8
1706 urshr v15.8h, v19.8h, #8
1707 @@ -2407,17 +2407,17 @@ generate_composite_function \
1708 generate_composite_function_single_scanline \
1709 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
1710 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1711 8, /* number of pixels, processed in a single block */ \
1712 default_init_need_all_regs, \
1713 default_cleanup_need_all_regs, \
1714 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
1715 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
1716 - pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
1717 + pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
1718 28, /* dst_w_basereg */ \
1719 4, /* dst_r_basereg */ \
1720 0, /* src_basereg */ \
1721 12 /* mask_basereg */
1723 /******************************************************************************/
1725 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
1726 @@ -2482,31 +2482,31 @@ generate_composite_function \
1727 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
1728 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1729 8, /* number of pixels, processed in a single block */ \
1730 5, /* prefetch distance */ \
1731 default_init_need_all_regs, \
1732 default_cleanup_need_all_regs, \
1733 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1734 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1735 - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1736 + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1737 28, /* dst_w_basereg */ \
1738 4, /* dst_r_basereg */ \
1739 0, /* src_basereg */ \
1740 12 /* mask_basereg */
1742 generate_composite_function_single_scanline \
1743 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
1744 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1745 8, /* number of pixels, processed in a single block */ \
1746 default_init_need_all_regs, \
1747 default_cleanup_need_all_regs, \
1748 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1749 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1750 - pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
1751 + pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
1752 28, /* dst_w_basereg */ \
1753 4, /* dst_r_basereg */ \
1754 0, /* src_basereg */ \
1755 12 /* mask_basereg */
1757 /******************************************************************************/
1759 /* TODO: expand macros and do better instructions scheduling */
1760 @@ -2524,17 +2524,17 @@ generate_composite_function \
1761 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
1762 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1763 8, /* number of pixels, processed in a single block */ \
1764 5, /* prefetch distance */ \
1765 default_init_need_all_regs, \
1766 default_cleanup_need_all_regs, \
1767 pixman_composite_over_8888_n_8888_process_pixblock_head, \
1768 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
1769 - pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
1770 + pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
1771 28, /* dst_w_basereg */ \
1772 4, /* dst_r_basereg */ \
1773 0, /* src_basereg */ \
1774 15 /* mask_basereg */
1776 /******************************************************************************/
1778 .macro pixman_composite_src_0888_0888_process_pixblock_head
1779 @@ -2675,38 +2675,38 @@ generate_composite_function \
1780 urshr v11.8h, v8.8h, #8
1784 urshr v12.8h, v9.8h, #8
1785 urshr v13.8h, v10.8h, #8
1787 raddhn v30.8b, v11.8h, v8.8h
1788 - PF add PF_X, PF_X, #8
1789 - PF tst PF_CTL, #0xF
1791 - PF add PF_X, PF_X, #8
1792 - PF sub PF_CTL, PF_CTL, #1
1793 + PF add, PF_X, PF_X, #8
1794 + PF tst, PF_CTL, #0xF
1796 + PF add, PF_X, PF_X, #8
1797 + PF sub, PF_CTL, PF_CTL, #1
1799 raddhn v29.8b, v12.8h, v9.8h
1800 raddhn v28.8b, v13.8h, v10.8h
1801 umull v8.8h, v3.8b, v0.8b
1802 umull v9.8h, v3.8b, v1.8b
1803 umull v10.8h, v3.8b, v2.8b
1804 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1805 - PF cmp PF_X, ORIG_W
1806 - PF lsl DUMMY, PF_X, src_bpp_shift
1807 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1809 - PF sub PF_X, PF_X, ORIG_W
1810 - PF subs PF_CTL, PF_CTL, #0x10
1812 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1813 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1814 - PF add PF_SRC, PF_SRC, #1
1815 + PF cmp, PF_X, ORIG_W
1816 + PF lsl, DUMMY, PF_X, src_bpp_shift
1817 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1819 + PF sub, PF_X, PF_X, ORIG_W
1820 + PF subs, PF_CTL, PF_CTL, #0x10
1822 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1823 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1824 + PF add, PF_SRC, PF_SRC, #1
1828 generate_composite_function \
1829 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1830 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1831 8, /* number of pixels, processed in a single block */ \
1832 10, /* prefetch distance */ \
1833 @@ -2744,38 +2744,38 @@ generate_composite_function \
1834 urshr v11.8h, v8.8h, #8
1838 urshr v12.8h, v9.8h, #8
1839 urshr v13.8h, v10.8h, #8
1841 raddhn v28.8b, v11.8h, v8.8h
1842 - PF add PF_X, PF_X, #8
1843 - PF tst PF_CTL, #0xF
1845 - PF add PF_X, PF_X, #8
1846 - PF sub PF_CTL, PF_CTL, #1
1847 + PF add, PF_X, PF_X, #8
1848 + PF tst, PF_CTL, #0xF
1850 + PF add, PF_X, PF_X, #8
1851 + PF sub, PF_CTL, PF_CTL, #1
1853 raddhn v29.8b, v12.8h, v9.8h
1854 raddhn v30.8b, v13.8h, v10.8h
1855 umull v8.8h, v3.8b, v0.8b
1856 umull v9.8h, v3.8b, v1.8b
1857 umull v10.8h, v3.8b, v2.8b
1858 st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
1859 - PF cmp PF_X, ORIG_W
1860 - PF lsl DUMMY, PF_X, src_bpp_shift
1861 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
1863 - PF sub PF_X, PF_X, ORIG_W
1864 - PF subs PF_CTL, PF_CTL, #0x10
1866 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
1867 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
1868 - PF add PF_SRC, PF_SRC, #1
1869 + PF cmp, PF_X, ORIG_W
1870 + PF lsl, DUMMY, PF_X, src_bpp_shift
1871 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
1873 + PF sub, PF_X, PF_X, ORIG_W
1874 + PF subs, PF_CTL, PF_CTL, #0x10
1876 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
1877 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
1878 + PF add, PF_SRC, PF_SRC, #1
1882 generate_composite_function \
1883 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
1884 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1885 8, /* number of pixels, processed in a single block */ \
1886 10, /* prefetch distance */ \
1887 @@ -3126,197 +3126,197 @@ generate_composite_function_nearest_scan
1888 * format conversion, and interpolation as separate macros which can be used
1889 * as the basic building blocks for constructing bilinear scanline functions.
1892 .macro bilinear_load_8888 reg1, reg2, tmp
1895 add TMP1, TOP, TMP1, lsl #2
1896 - ld1 {&reg1&.2s}, [TMP1], STRIDE
1897 - ld1 {&reg2&.2s}, [TMP1]
1898 + ld1 {\()\reg1\().2s}, [TMP1], STRIDE
1899 + ld1 {\()\reg2\().2s}, [TMP1]
1902 .macro bilinear_load_0565 reg1, reg2, tmp
1905 add TMP1, TOP, TMP1, lsl #1
1906 - ld1 {&reg2&.s}[0], [TMP1], STRIDE
1907 - ld1 {&reg2&.s}[1], [TMP1]
1908 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
1909 + ld1 {\()\reg2\().s}[0], [TMP1], STRIDE
1910 + ld1 {\()\reg2\().s}[1], [TMP1]
1911 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
1914 .macro bilinear_load_and_vertical_interpolate_two_8888 \
1915 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
1917 - bilinear_load_8888 reg1, reg2, tmp1
1918 - umull &acc1&.8h, &reg1&.8b, v28.8b
1919 - umlal &acc1&.8h, &reg2&.8b, v29.8b
1920 - bilinear_load_8888 reg3, reg4, tmp2
1921 - umull &acc2&.8h, &reg3&.8b, v28.8b
1922 - umlal &acc2&.8h, &reg4&.8b, v29.8b
1923 + bilinear_load_8888 \reg1, \reg2, \tmp1
1924 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
1925 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
1926 + bilinear_load_8888 \reg3, \reg4, \tmp2
1927 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
1928 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
1931 .macro bilinear_load_and_vertical_interpolate_four_8888 \
1932 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1933 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
1934 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1936 bilinear_load_and_vertical_interpolate_two_8888 \
1937 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
1938 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
1939 bilinear_load_and_vertical_interpolate_two_8888 \
1940 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1941 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
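
  [Sketch, not from the patch; hypothetical macro names.] The hunk above makes two
  kinds of change: arguments are forwarded to the inner macro with a leading
  backslash instead of as bare names, and a comma is added before the line
  continuation so both halves of the parameter list are parsed as one list.

      .macro inner a1, a2
          add     x0, \a1, \a2          /* \a1/\a2 substitute the caller's values */
      .endm

      /* the ',' before the '\' continuation keeps b1, b2 in the parameter list */
      .macro outer a1, a2, \
                   b1, b2
          inner   \a1, \a2              /* forward with '\', not the bare names */
          inner   \b1, \b2
      .endm

          outer   x1, x2, x3, x4        /* -> add x0, x1, x2 ; add x0, x3, x4 */
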
1944 .macro vzip reg1, reg2
1946 - zip1 v31.8b, reg1, reg2
1947 - zip2 reg2, reg1, reg2
1949 + zip1 v31.8b, \reg1, \reg2
1950 + zip2 \reg2, \reg1, \reg2
1955 .macro vuzp reg1, reg2
1957 - uzp1 v31.8b, reg1, reg2
1958 - uzp2 reg2, reg1, reg2
1960 + uzp1 v31.8b, \reg1, \reg2
1961 + uzp2 \reg2, \reg1, \reg2
1966 .macro bilinear_load_and_vertical_interpolate_two_0565 \
1967 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
1970 add TMP1, TOP, TMP1, lsl #1
1973 add TMP2, TOP, TMP2, lsl #1
1974 - ld1 {&acc2&.s}[0], [TMP1], STRIDE
1975 - ld1 {&acc2&.s}[2], [TMP2], STRIDE
1976 - ld1 {&acc2&.s}[1], [TMP1]
1977 - ld1 {&acc2&.s}[3], [TMP2]
1978 - convert_0565_to_x888 acc2, reg3, reg2, reg1
1979 -    vzip      &reg1&.8b, &reg3&.8b
1980 -    vzip      &reg2&.8b, &reg4&.8b
1981 -    vzip      &reg3&.8b, &reg4&.8b
1982 -    vzip      &reg1&.8b, &reg2&.8b
1983 -    umull     &acc1&.8h, &reg1&.8b, v28.8b
1984 -    umlal     &acc1&.8h, &reg2&.8b, v29.8b
1985 -    umull     &acc2&.8h, &reg3&.8b, v28.8b
1986 -    umlal     &acc2&.8h, &reg4&.8b, v29.8b
1987 + ld1 {\()\acc2\().s}[0], [TMP1], STRIDE
1988 + ld1 {\()\acc2\().s}[2], [TMP2], STRIDE
1989 + ld1 {\()\acc2\().s}[1], [TMP1]
1990 + ld1 {\()\acc2\().s}[3], [TMP2]
1991 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
1992 + vzip \()\reg1\().8b, \()\reg3\().8b
1993 + vzip \()\reg2\().8b, \()\reg4\().8b
1994 + vzip \()\reg3\().8b, \()\reg4\().8b
1995 + vzip \()\reg1\().8b, \()\reg2\().8b
1996 + umull \()\acc1\().8h, \()\reg1\().8b, v28.8b
1997 + umlal \()\acc1\().8h, \()\reg2\().8b, v29.8b
1998 + umull \()\acc2\().8h, \()\reg3\().8b, v28.8b
1999 + umlal \()\acc2\().8h, \()\reg4\().8b, v29.8b
2002 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2003 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2004 + xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi, \
2005 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2008 add TMP1, TOP, TMP1, lsl #1
2011 add TMP2, TOP, TMP2, lsl #1
2012 - ld1 {&xacc2&.s}[0], [TMP1], STRIDE
2013 - ld1 {&xacc2&.s}[2], [TMP2], STRIDE
2014 - ld1 {&xacc2&.s}[1], [TMP1]
2015 - ld1 {&xacc2&.s}[3], [TMP2]
2016 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2017 + ld1 {\()\xacc2\().s}[0], [TMP1], STRIDE
2018 + ld1 {\()\xacc2\().s}[2], [TMP2], STRIDE
2019 + ld1 {\()\xacc2\().s}[1], [TMP1]
2020 + ld1 {\()\xacc2\().s}[3], [TMP2]
2021 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
2024 add TMP1, TOP, TMP1, lsl #1
2027 add TMP2, TOP, TMP2, lsl #1
2028 - ld1 {&yacc2&.s}[0], [TMP1], STRIDE
2029 - vzip &xreg1&.8b, &xreg3&.8b
2030 - ld1 {&yacc2&.s}[2], [TMP2], STRIDE
2031 - vzip &xreg2&.8b, &xreg4&.8b
2032 - ld1 {&yacc2&.s}[1], [TMP1]
2033 - vzip &xreg3&.8b, &xreg4&.8b
2034 - ld1 {&yacc2&.s}[3], [TMP2]
2035 - vzip &xreg1&.8b, &xreg2&.8b
2036 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2037 - umull &xacc1&.8h, &xreg1&.8b, v28.8b
2038 - vzip &yreg1&.8b, &yreg3&.8b
2039 - umlal &xacc1&.8h, &xreg2&.8b, v29.8b
2040 - vzip &yreg2&.8b, &yreg4&.8b
2041 - umull &xacc2&.8h, &xreg3&.8b, v28.8b
2042 - vzip &yreg3&.8b, &yreg4&.8b
2043 - umlal &xacc2&.8h, &xreg4&.8b, v29.8b
2044 - vzip &yreg1&.8b, &yreg2&.8b
2045 - umull &yacc1&.8h, &yreg1&.8b, v28.8b
2046 - umlal &yacc1&.8h, &yreg2&.8b, v29.8b
2047 - umull &yacc2&.8h, &yreg3&.8b, v28.8b
2048 - umlal &yacc2&.8h, &yreg4&.8b, v29.8b
2049 + ld1 {\()\yacc2\().s}[0], [TMP1], STRIDE
2050 + vzip \()\xreg1\().8b, \()\xreg3\().8b
2051 + ld1 {\()\yacc2\().s}[2], [TMP2], STRIDE
2052 + vzip \()\xreg2\().8b, \()\xreg4\().8b
2053 + ld1 {\()\yacc2\().s}[1], [TMP1]
2054 + vzip \()\xreg3\().8b, \()\xreg4\().8b
2055 + ld1 {\()\yacc2\().s}[3], [TMP2]
2056 + vzip \()\xreg1\().8b, \()\xreg2\().8b
2057 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
2058 + umull \()\xacc1\().8h, \()\xreg1\().8b, v28.8b
2059 + vzip \()\yreg1\().8b, \()\yreg3\().8b
2060 + umlal \()\xacc1\().8h, \()\xreg2\().8b, v29.8b
2061 + vzip \()\yreg2\().8b, \()\yreg4\().8b
2062 + umull \()\xacc2\().8h, \()\xreg3\().8b, v28.8b
2063 + vzip \()\yreg3\().8b, \()\yreg4\().8b
2064 + umlal \()\xacc2\().8h, \()\xreg4\().8b, v29.8b
2065 + vzip \()\yreg1\().8b, \()\yreg2\().8b
2066 + umull \()\yacc1\().8h, \()\yreg1\().8b, v28.8b
2067 + umlal \()\yacc1\().8h, \()\yreg2\().8b, v29.8b
2068 + umull \()\yacc2\().8h, \()\yreg3\().8b, v28.8b
2069 + umlal \()\yacc2\().8h, \()\yreg4\().8b, v29.8b
2072 .macro bilinear_store_8888 numpix, tmp1, tmp2
2075 st1 {v0.2s, v1.2s}, [OUT], #16
2076 -.elseif numpix == 2
2077 +.elseif \numpix == 2
2078 st1 {v0.2s}, [OUT], #8
2079 -.elseif numpix == 1
2080 +.elseif \numpix == 1
2081 st1 {v0.s}[0], [OUT], #4
2083 - .error bilinear_store_8888 numpix is unsupported
2084 + .error bilinear_store_8888 \numpix is unsupported
2088 .macro bilinear_store_0565 numpix, tmp1, tmp2
2093 - convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
2095 + convert_8888_to_0565 v2, v1, v0, v1, \tmp1, \tmp2
2097 st1 {v1.4h}, [OUT], #8
2098 -.elseif numpix == 2
2099 +.elseif \numpix == 2
2100 st1 {v1.s}[0], [OUT], #4
2101 -.elseif numpix == 1
2102 +.elseif \numpix == 1
2103 st1 {v1.h}[0], [OUT], #2
2105 - .error bilinear_store_0565 numpix is unsupported
2106 + .error bilinear_store_0565 \numpix is unsupported
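
  [Sketch, not from the patch; hypothetical macro name.] The store macros above
  dispatch on their numpix argument at assembly time; with the escaped form the
  condition reads the argument rather than an undefined bare symbol on assemblers
  without gas's .altmacro extensions.

      .macro store_pixels numpix
      .if \numpix == 4
          st1     {v0.2s, v1.2s}, [x1], #16
      .elseif \numpix == 2
          st1     {v0.2s}, [x1], #8
      .elseif \numpix == 1
          st1     {v0.s}[0], [x1], #4
      .else
          .error  "store_pixels: unsupported \numpix"
      .endif
      .endm

          store_pixels 2                /* emits only: st1 {v0.2s}, [x1], #8 */
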
2110 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2111 - bilinear_load_&src_fmt v0, v1, v2
2112 + bilinear_load_\()\src_fmt v0, v1, v2
2113 umull v2.8h, v0.8b, v28.8b
2114 umlal v2.8h, v1.8b, v29.8b
2115 /* 5 cycles bubble */
2116 ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
2117 umlsl v0.4s, v2.4h, v15.h[0]
2118 umlal2 v0.4s, v2.8h, v15.h[0]
2119 /* 5 cycles bubble */
2120 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2121 /* 3 cycles bubble */
2123 /* 1 cycle bubble */
2124 - bilinear_store_&dst_fmt 1, v3, v4
2125 + bilinear_store_\()\dst_fmt 1, v3, v4
2128 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2129 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
2130 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
2131 v1, v11, v2, v3, v20, v21, v22, v23
2132 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
2133 umlsl v0.4s, v1.4h, v15.h[0]
2134 umlal2 v0.4s, v1.8h, v15.h[0]
2135 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
2136 umlsl v10.4s, v11.4h, v15.h[4]
2137 umlal2 v10.4s, v11.8h, v15.h[4]
2138 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2139 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2140 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2141 add v12.8h, v12.8h, v13.8h
2143 - bilinear_store_&dst_fmt 2, v3, v4
2144 + bilinear_store_\()\dst_fmt 2, v3, v4
2147 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
2148 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
2149 - v1, v11, v14, v20, v16, v17, v22, v23 \
2150 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
2151 + v1, v11, v14, v20, v16, v17, v22, v23, \
2152 v3, v9, v24, v25, v26, v27, v18, v19
2153 prfm PREFETCH_MODE, [TMP1, PF_OFFS]
2154 sub TMP1, TMP1, STRIDE
2155 ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
2156 umlsl v0.4s, v1.4h, v15.h[0]
2157 umlal2 v0.4s, v1.8h, v15.h[0]
2158 ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
2159 umlsl v10.4s, v11.4h, v15.h[4]
2160 @@ -3333,64 +3333,64 @@ generate_composite_function_nearest_scan
2161 shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2162 shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2163 shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2164 shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
2165 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2168 add v12.8h, v12.8h, v13.8h
2169 - bilinear_store_&dst_fmt 4, v3, v4
2170 + bilinear_store_\()\dst_fmt 4, v3, v4
2173 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2174 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2175 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
2176 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2177 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
2179 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2180 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2184 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2185 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2186 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
2187 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2188 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
2192 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2193 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
2194 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
2195 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
2196 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
2198 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2199 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2203 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2204 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2205 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
2206 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2207 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
2209 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2210 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2211 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
2212 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2216 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2217 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2218 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
2219 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2220 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
2222 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2223 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
2227 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2228 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
2229 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
2230 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
2231 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
2233 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2234 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2235 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2236 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2240 .set BILINEAR_FLAG_UNROLL_4, 0
2241 .set BILINEAR_FLAG_UNROLL_8, 1
2242 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
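
  [Sketch, not from the patch; hypothetical names.] The _head/_tail dispatch above
  pastes src_fmt/dst_fmt into a macro name. The old spelling relied on .altmacro's
  '&' concatenation (name_&src_fmt&_&dst_fmt); the portable spelling delimits each
  substituted argument with \().

      .macro blit_8888_0565_head
          /* format-specific fast path would go here */
      .endm
      .set have_blit_8888_0565, 1

      .macro blit_head src_fmt, dst_fmt
      .ifdef have_blit_\()\src_fmt\()_\()\dst_fmt
          blit_\()\src_fmt\()_\()\dst_fmt\()_head
      .else
          /* generic fallback would go here */
      .endif
      .endm

          blit_head 8888, 0565          /* expands to: blit_8888_0565_head */
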
2245 @@ -3405,17 +3405,17 @@ generate_composite_function_nearest_scan
2246 * prefetch_distance - prefetch in the source image by that many
2250 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
2251 src_bpp_shift, dst_bpp_shift, \
2252 prefetch_distance, flags
2254 -pixman_asm_function fname
2255 +pixman_asm_function \fname
2264 @@ -3437,17 +3437,17 @@ pixman_asm_function fname
2265 sub sp, sp, 112 /* push all registers */
2267 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
2268 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
2269 stp x8, x9, [x29, -80]
2270 stp x10, x11, [x29, -96]
2271 stp x12, x13, [x29, -112]
2273 - mov PF_OFFS, #prefetch_distance
2274 + mov PF_OFFS, #\prefetch_distance
2275 mul PF_OFFS, PF_OFFS, UX
2277 subs STRIDE, BOTTOM, TOP
2283 @@ -3458,85 +3458,85 @@ pixman_asm_function fname
2284 mov v25.d[0], v12.d[1]
2285 mov v26.d[0], v13.d[0]
2286 add v25.4h, v25.4h, v26.4h
2287 mov v12.d[1], v25.d[0]
2289 /* ensure good destination alignment */
2292 - tst OUT, #(1 << dst_bpp_shift)
2293 + tst OUT, #(1 << \dst_bpp_shift)
2295 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2296 add v12.8h, v12.8h, v13.8h
2297 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
2298 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
2299 sub WIDTH, WIDTH, #1
2301 add v13.8h, v13.8h, v13.8h
2302 ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
2303 add v12.8h, v12.8h, v13.8h
2307 - tst OUT, #(1 << (dst_bpp_shift + 1))
2308 + tst OUT, #(1 << (\dst_bpp_shift + 1))
2310 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
2311 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2312 sub WIDTH, WIDTH, #2
2314 -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
2315 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
2316 /*********** 8 pixels per iteration *****************/
2319 - tst OUT, #(1 << (dst_bpp_shift + 2))
2320 + tst OUT, #(1 << (\dst_bpp_shift + 2))
2322 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2323 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2324 sub WIDTH, WIDTH, #4
2326 subs WIDTH, WIDTH, #8
2328 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
2329 - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
2330 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
2331 + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
2332 subs WIDTH, WIDTH, #8
2335 - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
2336 + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
2337 subs WIDTH, WIDTH, #8
2340 - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
2341 + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
2345 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
2346 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
2349 /*********** 4 pixels per iteration *****************/
2350 subs WIDTH, WIDTH, #4
2352 - asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
2353 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
2354 + asr PF_OFFS, PF_OFFS, #(16 - \src_bpp_shift)
2355 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
2356 subs WIDTH, WIDTH, #4
2359 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
2360 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
2361 subs WIDTH, WIDTH, #4
2364 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
2365 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
2367 /****************************************************/
2369 /* handle the remaining trailing pixels */
2372 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
2373 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
2377 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
2378 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
2381 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
2382 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
2383 ldp x8, x9, [x29, -80]
2384 ldp x10, x11, [x29, -96]
2385 ldp x12, x13, [x29, -104]
2387 @@ -3551,17 +3551,17 @@ 300:
2397 +pixman_end_asm_function
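
  [Usage sketch, not from the patch; hypothetical function name.] With \fname now
  escaped, the template expands the wrapper macros from pixman-arm-asm.h with the
  concrete name; those wrappers are assumed to emit the .globl/.type (and optional
  .func/.endfunc and leading-underscore) boilerplate:

      pixman_asm_function pixman_demo_noop_asm_neon
          ret
      pixman_end_asm_function
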
2401 /*****************************************************************************/
2403 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
2405 .macro bilinear_interpolate_four_pixels_8888_8888_head
2406 diff --git a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2407 --- a/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2408 +++ b/gfx/cairo/libpixman/src/pixman-arma64-neon-asm.h
2409 @@ -75,340 +75,340 @@
2410 #define PREFETCH_MODE pldl1keep
2413 * Definitions of supplementary pixld/pixst macros (for partial load/store of
2417 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
2418 -    op {v&reg1&.&elem_size}, [&mem_operand&], #8
2419 + \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
2422 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
2423 -    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
2424 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
2427 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
2428 -    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
2429 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
2432 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
2433 -    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
2434 + \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
2437 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
2438 -    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
2439 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
2442 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
2443 -    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
2444 + \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
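
  [Sketch, not from the patch; hypothetical macro name.] The pixldst* helpers above
  assemble a vector operand such as v4.8b out of a register number and an element
  size; \() marks where each substituted argument ends.

      .macro load_one reg1, elem_size, mem_operand
          ld1     {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
      .endm

          load_one 4, 8b, x2            /* expands to: ld1 {v4.8b}, [x2], #8 */
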
2447 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
2450 - pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
2451 - %(basereg+6), %(basereg+7), mem_operand, abits
2452 - .elseif elem_size==16
2453 - pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
2454 - %(basereg+6), %(basereg+7), mem_operand, abits
2455 +.if \numbytes == 32
2456 + .if \elem_size==32
2457 + pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
2458 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2459 + .elseif \elem_size==16
2460 + pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
2461 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2463 - pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
2464 - %(basereg+6), %(basereg+7), mem_operand, abits
2465 + pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
2466 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2468 -.elseif numbytes == 16
2470 - pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
2471 - .elseif elem_size==16
2472 - pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
2473 +.elseif \numbytes == 16
2474 + .if \elem_size==32
2475 + pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2476 + .elseif \elem_size==16
2477 + pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2479 - pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
2480 + pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2482 -.elseif numbytes == 8
2484 - pixldst1 op, 2s, %(basereg+1), mem_operand, abits
2485 - .elseif elem_size==16
2486 - pixldst1 op, 4h, %(basereg+1), mem_operand, abits
2487 +.elseif \numbytes == 8
2488 + .if \elem_size==32
2489 + pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
2490 + .elseif \elem_size==16
2491 + pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
2493 - pixldst1 op, 8b, %(basereg+1), mem_operand, abits
2494 + pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
2496 -.elseif numbytes == 4
2497 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
2498 - pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
2499 - .elseif elem_size == 16
2500 - pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
2501 - pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
2502 +.elseif \numbytes == 4
2503 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
2504 + pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
2505 + .elseif \elem_size == 16
2506 + pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
2507 + pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
2509 - pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
2510 - pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
2511 - pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
2512 - pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
2513 + pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
2514 + pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
2515 + pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
2516 + pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
2518 -.elseif numbytes == 2
2519 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
2520 - pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
2521 +.elseif \numbytes == 2
2522 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
2523 + pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
2525 - pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
2526 - pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
2527 + pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
2528 + pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
2530 -.elseif numbytes == 1
2531 - pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
2532 +.elseif \numbytes == 1
2533 + pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
2535 - .error "unsupported size: numbytes"
2536 + .error "unsupported size: \numbytes"
2540 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
2542 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2543 - pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
2544 - %(basereg+6), %(basereg+7), mem_operand, abits
2545 -.elseif (bpp == 24) && (numpix == 8)
2546 - pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2547 -.elseif (bpp == 24) && (numpix == 4)
2548 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2549 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2550 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2551 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2552 -.elseif (bpp == 24) && (numpix == 2)
2553 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2554 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2555 -.elseif (bpp == 24) && (numpix == 1)
2556 - pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2558 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2559 + pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
2560 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2561 +.elseif (\bpp == 24) && (\numpix == 8)
2562 + pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2563 +.elseif (\bpp == 24) && (\numpix == 4)
2564 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2565 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2566 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2567 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2568 +.elseif (\bpp == 24) && (\numpix == 2)
2569 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2570 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2571 +.elseif (\bpp == 24) && (\numpix == 1)
2572 + pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2574 - pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
2575 + pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
2580 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
2582 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2583 - pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
2584 - %(basereg+6), %(basereg+7), mem_operand, abits
2585 -.elseif (bpp == 24) && (numpix == 8)
2586 - pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2587 -.elseif (bpp == 24) && (numpix == 4)
2588 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2589 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2590 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2591 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2592 -.elseif (bpp == 24) && (numpix == 2)
2593 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2594 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2595 -.elseif (bpp == 24) && (numpix == 1)
2596 - pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2597 -.elseif numpix * bpp == 32 && abits == 32
2598 - pixldst 4, st1, 32, basereg, mem_operand, abits
2599 -.elseif numpix * bpp == 16 && abits == 16
2600 - pixldst 2, st1, 16, basereg, mem_operand, abits
2602 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2603 + pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
2604 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2605 +.elseif (\bpp == 24) && (\numpix == 8)
2606 + pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2607 +.elseif (\bpp == 24) && (\numpix == 4)
2608 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2609 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2610 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2611 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2612 +.elseif (\bpp == 24) && (\numpix == 2)
2613 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2614 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2615 +.elseif (\bpp == 24) && (\numpix == 1)
2616 + pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2617 +.elseif \numpix * \bpp == 32 && \abits == 32
2618 + pixldst 4, st1, 32, \basereg, \mem_operand, \abits
2619 +.elseif \numpix * \bpp == 16 && \abits == 16
2620 + pixldst 2, st1, 16, \basereg, \mem_operand, \abits
2622 - pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
2623 + pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
2628 .macro pixld_a numpix, bpp, basereg, mem_operand
2629 -.if (bpp * numpix) <= 128
2630 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2631 +.if (\bpp * \numpix) <= 128
2632 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2634 - pixld numpix, bpp, basereg, mem_operand, 128
2635 + pixld \numpix, \bpp, \basereg, \mem_operand, 128
2639 .macro pixst_a numpix, bpp, basereg, mem_operand
2640 -.if (bpp * numpix) <= 128
2641 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2642 +.if (\bpp * \numpix) <= 128
2643 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2645 - pixst numpix, bpp, basereg, mem_operand, 128
2646 + pixst \numpix, \bpp, \basereg, \mem_operand, 128
2651 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
2652 * aliases to be defined)
2654 .macro pixld1_s elem_size, reg1, mem_operand
2655 -.if elem_size == 16
2656 +.if \elem_size == 16
2660 5: subs VX, VX, SRC_WIDTH_FIXED
2663 - add TMP1, mem_operand, TMP1, lsl #1
2664 + add TMP1, \mem_operand, TMP1, lsl #1
2668 5: subs VX, VX, SRC_WIDTH_FIXED
2671 - add TMP2, mem_operand, TMP2, lsl #1
2672 -    ld1       {v&reg1&.h}[0], [TMP1]
2673 + add TMP2, \mem_operand, TMP2, lsl #1
2674 + ld1 {v\()\reg1\().h}[0], [TMP1]
2678 5: subs VX, VX, SRC_WIDTH_FIXED
2681 - add TMP1, mem_operand, TMP1, lsl #1
2682 -    ld1       {v&reg1&.h}[1], [TMP2]
2683 + add TMP1, \mem_operand, TMP1, lsl #1
2684 + ld1 {v\()\reg1\().h}[1], [TMP2]
2688 5: subs VX, VX, SRC_WIDTH_FIXED
2691 - add TMP2, mem_operand, TMP2, lsl #1
2692 -    ld1       {v&reg1&.h}[2], [TMP1]
2693 -    ld1       {v&reg1&.h}[3], [TMP2]
2694 -.elseif elem_size == 32
2695 + add TMP2, \mem_operand, TMP2, lsl #1
2696 + ld1 {v\()\reg1\().h}[2], [TMP1]
2697 + ld1 {v\()\reg1\().h}[3], [TMP2]
2698 +.elseif \elem_size == 32
2702 5: subs VX, VX, SRC_WIDTH_FIXED
2705 - add TMP1, mem_operand, TMP1, lsl #2
2706 + add TMP1, \mem_operand, TMP1, lsl #2
2710 5: subs VX, VX, SRC_WIDTH_FIXED
2713 - add TMP2, mem_operand, TMP2, lsl #2
2714 -    ld1       {v&reg1&.s}[0], [TMP1]
2715 -    ld1       {v&reg1&.s}[1], [TMP2]
2716 + add TMP2, \mem_operand, TMP2, lsl #2
2717 + ld1 {v\()\reg1\().s}[0], [TMP1]
2718 + ld1 {v\()\reg1\().s}[1], [TMP2]
2720 .error "unsupported"
2724 .macro pixld2_s elem_size, reg1, reg2, mem_operand
2725 -.if 0 /* elem_size == 32 */
2726 +.if 0 /* \elem_size == 32 */
2727 mov TMP1, VX, asr #16
2728 add VX, VX, UNIT_X, asl #1
2729 - add TMP1, mem_operand, TMP1, asl #2
2730 + add TMP1, \mem_operand, TMP1, asl #2
2731 mov TMP2, VX, asr #16
2733 - add TMP2, mem_operand, TMP2, asl #2
2734 -    ld1       {v&reg1&.s}[0], [TMP1]
2735 + add TMP2, \mem_operand, TMP2, asl #2
2736 + ld1 {v\()\reg1\().s}[0], [TMP1]
2737 mov TMP1, VX, asr #16
2738 add VX, VX, UNIT_X, asl #1
2739 - add TMP1, mem_operand, TMP1, asl #2
2740 -    ld1       {v&reg2&.s}[0], [TMP2, :32]
2741 + add TMP1, \mem_operand, TMP1, asl #2
2742 + ld1 {v\()\reg2\().s}[0], [TMP2, :32]
2743 mov TMP2, VX, asr #16
2745 - add TMP2, mem_operand, TMP2, asl #2
2746 -    ld1       {v&reg1&.s}[1], [TMP1]
2747 -    ld1       {v&reg2&.s}[1], [TMP2]
2748 + add TMP2, \mem_operand, TMP2, asl #2
2749 + ld1 {v\()\reg1\().s}[1], [TMP1]
2750 + ld1 {v\()\reg2\().s}[1], [TMP2]
2752 - pixld1_s elem_size, reg1, mem_operand
2753 - pixld1_s elem_size, reg2, mem_operand
2754 + pixld1_s \elem_size, \reg1, \mem_operand
2755 + pixld1_s \elem_size, \reg2, \mem_operand
2759 .macro pixld0_s elem_size, reg1, idx, mem_operand
2760 -.if elem_size == 16
2761 +.if \elem_size == 16
2765 5: subs VX, VX, SRC_WIDTH_FIXED
2768 - add TMP1, mem_operand, TMP1, lsl #1
2769 -    ld1       {v&reg1&.h}[idx], [TMP1]
2770 -.elseif elem_size == 32
2771 + add TMP1, \mem_operand, TMP1, lsl #1
2772 + ld1 {v\()\reg1\().h}[\idx], [TMP1]
2773 +.elseif \elem_size == 32
2778 5: subs VX, VX, SRC_WIDTH_FIXED
2781 - add TMP1, mem_operand, TMP1, lsl #2
2782 -    ld1       {v&reg1&.s}[idx], [TMP1]
2783 + add TMP1, \mem_operand, TMP1, lsl #2
2784 + ld1 {v\()\reg1\().s}[\idx], [TMP1]
2788 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
2790 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
2791 - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
2792 - pixdeinterleave elem_size, %(basereg+4)
2793 -.elseif numbytes == 16
2794 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
2795 -.elseif numbytes == 8
2796 - pixld1_s elem_size, %(basereg+1), mem_operand
2797 -.elseif numbytes == 4
2798 - .if elem_size == 32
2799 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2800 - .elseif elem_size == 16
2801 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2802 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2803 +.if \numbytes == 32
2804 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
2805 + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
2806 + pixdeinterleave \elem_size, %(\basereg+4)
2807 +.elseif \numbytes == 16
2808 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
2809 +.elseif \numbytes == 8
2810 + pixld1_s \elem_size, %(\basereg+1), \mem_operand
2811 +.elseif \numbytes == 4
2812 + .if \elem_size == 32
2813 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2814 + .elseif \elem_size == 16
2815 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2816 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2818 - pixld0_s elem_size, %(basereg+0), 4, mem_operand
2819 - pixld0_s elem_size, %(basereg+0), 5, mem_operand
2820 - pixld0_s elem_size, %(basereg+0), 6, mem_operand
2821 - pixld0_s elem_size, %(basereg+0), 7, mem_operand
2822 + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
2823 + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
2824 + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
2825 + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
2827 -.elseif numbytes == 2
2828 - .if elem_size == 16
2829 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2830 +.elseif \numbytes == 2
2831 + .if \elem_size == 16
2832 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2834 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2835 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2836 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2837 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2839 -.elseif numbytes == 1
2840 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2841 +.elseif \numbytes == 1
2842 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2844 - .error "unsupported size: numbytes"
2845 + .error "unsupported size: \numbytes"
2849 .macro pixld_s numpix, bpp, basereg, mem_operand
2851 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
2853 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
2857 .macro vuzp8 reg1, reg2
2858 umov DUMMY, v16.d[0]
2859 -    uzp1      v16.8b, v&reg1&.8b, v&reg2&.8b
2860 -    uzp2      v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
2861 -    mov       v&reg1&.8b, v16.8b
2862 + uzp1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
2863 + uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
2864 + mov v\()\reg1\().8b, v16.8b
2868 .macro vzip8 reg1, reg2
2869 umov DUMMY, v16.d[0]
2870 -    zip1      v16.8b, v&reg1&.8b, v&reg2&.8b
2871 -    zip2      v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
2872 -    mov       v&reg1&.8b, v16.8b
2873 + zip1 v16.8b, v\()\reg1\().8b, v\()\reg2\().8b
2874 + zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
2875 + mov v\()\reg1\().8b, v16.8b
2879 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2880 .macro pixdeinterleave bpp, basereg
2881 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2882 - vuzp8 %(basereg+0), %(basereg+1)
2883 - vuzp8 %(basereg+2), %(basereg+3)
2884 - vuzp8 %(basereg+1), %(basereg+3)
2885 - vuzp8 %(basereg+0), %(basereg+2)
2886 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2887 + vuzp8 %(\basereg+0), %(\basereg+1)
2888 + vuzp8 %(\basereg+2), %(\basereg+3)
2889 + vuzp8 %(\basereg+1), %(\basereg+3)
2890 + vuzp8 %(\basereg+0), %(\basereg+2)
2894 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2895 .macro pixinterleave bpp, basereg
2896 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2897 - vzip8 %(basereg+0), %(basereg+2)
2898 - vzip8 %(basereg+1), %(basereg+3)
2899 - vzip8 %(basereg+2), %(basereg+3)
2900 - vzip8 %(basereg+0), %(basereg+1)
2901 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2902 + vzip8 %(\basereg+0), %(\basereg+2)
2903 + vzip8 %(\basereg+1), %(\basereg+3)
2904 + vzip8 %(\basereg+2), %(\basereg+3)
2905 + vzip8 %(\basereg+0), %(\basereg+1)
2910 * This is a macro for implementing cache preload. The main idea is that
2911 * cache preload logic is mostly independent from the rest of pixels
2912 * processing code. It starts at the top left pixel and moves forward
2913 * across pixels and can jump across scanlines. Prefetch distance is
2914 @@ -432,62 +432,62 @@ 55:
2915 * for almost zero cost!
2917 * (*) The overhead of the prefetcher is visible when running some trivial
2918 * pixels processing like simple copy. Anyway, having prefetch is a must
2919 * when working with the graphics data.
2921 .macro PF a, x:vararg
2922 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
2928 .macro cache_preload std_increment, boost_increment
2929 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
2930 -.if std_increment != 0
2931 - PF add PF_X, PF_X, #std_increment
2932 +.if \std_increment != 0
2933 + PF add, PF_X, PF_X, #\std_increment
2935 - PF tst PF_CTL, #0xF
2937 - PF add PF_X, PF_X, #boost_increment
2938 - PF sub PF_CTL, PF_CTL, #1
2939 + PF tst, PF_CTL, #0xF
2941 + PF add, PF_X, PF_X, #\boost_increment
2942 + PF sub, PF_CTL, PF_CTL, #1
2944 - PF cmp PF_X, ORIG_W
2945 + PF cmp, PF_X, ORIG_W
2946 .if src_bpp_shift >= 0
2947 - PF lsl DUMMY, PF_X, #src_bpp_shift
2948 - PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
2949 + PF lsl, DUMMY, PF_X, #src_bpp_shift
2950 + PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
2953 - PF lsl DUMMY, PF_X, #dst_bpp_shift
2954 - PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
2955 + PF lsl, DUMMY, PF_X, #dst_bpp_shift
2956 + PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
2958 .if mask_bpp_shift >= 0
2959 - PF lsl DUMMY, PF_X, #mask_bpp_shift
2960 - PF prfm PREFETCH_MODE, [PF_MASK, DUMMY]
2961 + PF lsl, DUMMY, PF_X, #mask_bpp_shift
2962 + PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
2965 - PF sub PF_X, PF_X, ORIG_W
2966 - PF subs PF_CTL, PF_CTL, #0x10
2968 + PF sub, PF_X, PF_X, ORIG_W
2969 + PF subs, PF_CTL, PF_CTL, #0x10
2973 .if src_bpp_shift >= 0
2974 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
2975 - PF ldrsb DUMMY, [PF_SRC, DUMMY]
2976 - PF add PF_SRC, PF_SRC, #1
2977 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
2978 + PF ldrsb, DUMMY, [PF_SRC, DUMMY]
2979 + PF add, PF_SRC, PF_SRC, #1
2982 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
2983 - PF ldrsb DUMMY, [PF_DST, DUMMY]
2984 - PF add PF_DST, PF_DST, #1
2985 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
2986 + PF ldrsb, DUMMY, [PF_DST, DUMMY]
2987 + PF add, PF_DST, PF_DST, #1
2989 .if mask_bpp_shift >= 0
2990 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
2991 - PF ldrsb DUMMY, [PF_MASK, DUMMY]
2992 - PF add PF_MASK, PF_MASK, #1
2993 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
2994 + PF ldrsb, DUMMY, [PF_MASK, DUMMY]
2995 + PF add, PF_MASK, PF_MASK, #1
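
  [Reduced sketch, not from the patch.] PF takes the mnemonic as its first argument
  and the operands as a vararg tail, emitting the instruction only for the ADVANCED
  prefetch type. gas also accepts space-separated macro arguments, so the old
  "PF add PF_X, ..." form assembled there; the comma added after the mnemonic is
  presumably for assemblers that require comma-separated macro arguments, such as
  clang's integrated assembler.

      .set PREFETCH_TYPE_ADVANCED, 2
      .set PREFETCH_TYPE_CURRENT,  PREFETCH_TYPE_ADVANCED

      .macro PF a, x:vararg
      .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
          \a \x
      .endif
      .endm

          PF add, x4, x4, #8            /* emits: add x4, x4, #8 */
          PF prfm, pldl1keep, [x1]      /* emits: prfm pldl1keep, [x1] */
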
3001 .macro cache_preload_simple
3002 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
3004 @@ -516,56 +516,56 @@ 72:
3005 process_pixblock_tail, \
3006 process_pixblock_tail_head
3011 .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
3012 .irp lowbit, 1, 2, 4, 8, 16
3014 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
3015 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
3016 - tst DST_R, #lowbit
3018 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
3019 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
3020 + tst DST_R, #\lowbit
3023 - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
3024 - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
3025 + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
3026 + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
3028 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
3029 + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
3031 - add DST_R, DST_R, #lowbit
3032 + add DST_R, DST_R, #\lowbit
3034 - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
3035 - sub W, W, #(lowbit * 8 / dst_w_bpp)
3036 + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
3037 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
3042 pixdeinterleave src_bpp, src_basereg
3043 pixdeinterleave mask_bpp, mask_basereg
3044 pixdeinterleave dst_r_bpp, dst_r_basereg
3046 - process_pixblock_head
3047 + \process_pixblock_head
3048 cache_preload 0, pixblock_size
3049 cache_preload_simple
3050 - process_pixblock_tail
3051 + \process_pixblock_tail
3053 pixinterleave dst_w_bpp, dst_w_basereg
3055 .irp lowbit, 1, 2, 4, 8, 16
3056 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
3057 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
3058 - tst DST_W, #lowbit
3059 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
3060 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
3061 + tst DST_W, #\lowbit
3064 .if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
3065 - sub W, W, #(lowbit * 8 / dst_w_bpp)
3066 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
3068 - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
3069 + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
3078 @@ -587,52 +587,52 @@ 52:
3080 process_pixblock_head, \
3081 process_pixblock_tail, \
3082 process_pixblock_tail_head
3083 tst W, #(pixblock_size - 1)
3085 .if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
3086 .irp chunk_size, 16, 8, 4, 2, 1
3087 -.if pixblock_size > chunk_size
3088 - tst W, #chunk_size
3089 +.if pixblock_size > \chunk_size
3090 + tst W, #\chunk_size
3092 - pixld_src chunk_size, src_bpp, src_basereg, SRC
3093 - pixld chunk_size, mask_bpp, mask_basereg, MASK
3094 -.if dst_aligned_flag != 0
3095 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3096 + pixld_src \chunk_size, src_bpp, src_basereg, SRC
3097 + pixld \chunk_size, mask_bpp, mask_basereg, MASK
3098 +.if \dst_aligned_flag != 0
3099 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3101 - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3102 + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
3104 -.if cache_preload_flag != 0
3105 - PF add PF_X, PF_X, #chunk_size
3106 +.if \cache_preload_flag != 0
3107 + PF add, PF_X, PF_X, #\chunk_size
3113 pixdeinterleave src_bpp, src_basereg
3114 pixdeinterleave mask_bpp, mask_basereg
3115 pixdeinterleave dst_r_bpp, dst_r_basereg
3117 - process_pixblock_head
3118 -.if cache_preload_flag != 0
3119 + \process_pixblock_head
3120 +.if \cache_preload_flag != 0
3121 cache_preload 0, pixblock_size
3122 cache_preload_simple
3124 - process_pixblock_tail
3125 + \process_pixblock_tail
3126 pixinterleave dst_w_bpp, dst_w_basereg
3127 .irp chunk_size, 16, 8, 4, 2, 1
3128 -.if pixblock_size > chunk_size
3129 - tst W, #chunk_size
3130 +.if pixblock_size > \chunk_size
3131 + tst W, #\chunk_size
3133 -.if dst_aligned_flag != 0
3134 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3135 +.if \dst_aligned_flag != 0
3136 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3138 - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3139 + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
3148 @@ -655,17 +655,17 @@ 52:
3149 .if (src_bpp != 24) && (src_bpp != 0)
3150 sub SRC, SRC, W, lsl #src_bpp_shift
3152 .if (mask_bpp != 24) && (mask_bpp != 0)
3153 sub MASK, MASK, W, lsl #mask_bpp_shift
3157 - bge start_of_loop_label
3158 + bge \start_of_loop_label
3162 * Registers are allocated in the following way by default:
3163 * v0, v1, v2, v3 - reserved for loading source pixel data
3164 * v4, v5, v6, v7 - reserved for loading destination pixel data
3165 * v24, v25, v26, v27 - reserved for loading mask pixel data
3166 * v28, v29, v30, v31 - final destination pixel data for writeback to memory
3167 @@ -682,17 +682,17 @@ 52:
3168 process_pixblock_head, \
3169 process_pixblock_tail, \
3170 process_pixblock_tail_head, \
3171 dst_w_basereg_ = 28, \
3172 dst_r_basereg_ = 4, \
3176 - pixman_asm_function fname
3177 + pixman_asm_function \fname
3178 stp x29, x30, [sp, -16]!
3180 sub sp, sp, 232 /* push all registers */
3182 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
3183 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
3184 stp x8, x9, [x29, -80]
3185 stp x10, x11, [x29, -96]
3186 @@ -707,38 +707,38 @@ 52:
3187 str x28, [x29, -232]
3190 * Select prefetch type for this function. If prefetch distance is
3191 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
3192 * has to be used instead of ADVANCED.
3194 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
3195 -.if prefetch_distance == 0
3196 +.if \prefetch_distance == 0
3197 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
3198 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
3199 - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
3200 + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
3201 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
3205 * Make some macro arguments globally visible and accessible
3208 - .set src_bpp, src_bpp_
3209 - .set mask_bpp, mask_bpp_
3210 - .set dst_w_bpp, dst_w_bpp_
3211 - .set pixblock_size, pixblock_size_
3212 - .set dst_w_basereg, dst_w_basereg_
3213 - .set dst_r_basereg, dst_r_basereg_
3214 - .set src_basereg, src_basereg_
3215 - .set mask_basereg, mask_basereg_
3216 + .set src_bpp, \src_bpp_
3217 + .set mask_bpp, \mask_bpp_
3218 + .set dst_w_bpp, \dst_w_bpp_
3219 + .set pixblock_size, \pixblock_size_
3220 + .set dst_w_basereg, \dst_w_basereg_
3221 + .set dst_r_basereg, \dst_r_basereg_
3222 + .set src_basereg, \src_basereg_
3223 + .set mask_basereg, \mask_basereg_
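
  [Sketch, not from the patch; hypothetical names.] The .set lines above copy the
  template's arguments into assemble-time symbols so that nested helpers (pixld,
  cache_preload, ...) can use the values without having them threaded through every
  argument list.

      .macro set_block_size size_
          .set block_size, \size_       /* visible to anything expanded later */
      .endm

      .macro advance_ptr
          add     x0, x0, #block_size   /* reads the symbol, not an argument */
      .endm

          set_block_size 8
          advance_ptr                   /* expands to: add x0, x0, #8 */
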
3225 .macro pixld_src x:vararg
3229 .macro fetch_src_pixblock
3230 pixld_src pixblock_size, src_bpp, \
3231 (src_basereg - pixblock_size * src_bpp / 64), SRC
3234 * Assign symbolic names to registers
3236 @@ -805,32 +805,32 @@ 52:
3237 .elseif dst_w_bpp == 16
3238 .set dst_bpp_shift, 1
3239 .elseif dst_w_bpp == 8
3240 .set dst_bpp_shift, 0
3242 .error "requested dst bpp (dst_w_bpp) is not supported"
3245 -.if (((flags) & FLAG_DST_READWRITE) != 0)
3246 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
3247 .set dst_r_bpp, dst_w_bpp
3251 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3252 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3253 .set DEINTERLEAVE_32BPP_ENABLED, 1
3255 .set DEINTERLEAVE_32BPP_ENABLED, 0
3258 -.if prefetch_distance < 0 || prefetch_distance > 15
3259 - .error "invalid prefetch distance (prefetch_distance)"
3260 +.if \prefetch_distance < 0 || \prefetch_distance > 15
3261 + .error "invalid prefetch distance (\prefetch_distance)"
3269 sub SRC_STRIDE, SRC_STRIDE, W
3270 sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
3273 sub MASK_STRIDE, MASK_STRIDE, W
3274 @@ -839,71 +839,71 @@ 52:
3276 sub DST_STRIDE, DST_STRIDE, W
3277 sub DST_STRIDE, DST_STRIDE, W, lsl #1
3281 * Setup advanced prefetcher initial state
3283 - PF mov PF_SRC, SRC
3284 - PF mov PF_DST, DST_R
3285 - PF mov PF_MASK, MASK
3286 - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
3287 - PF lsl DUMMY, H, #4
3288 - PF mov PF_CTL, DUMMY
3289 - PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
3290 + PF mov, PF_SRC, SRC
3291 + PF mov, PF_DST, DST_R
3292 + PF mov, PF_MASK, MASK
3293 + /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */
3294 + PF lsl, DUMMY, H, #4
3295 + PF mov, PF_CTL, DUMMY
3296 + PF add, PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
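
  [Worked example, not from the patch.] The low 4 bits of PF_CTL hold the remaining
  prefetch distance and the upper bits count scanlines, which is why cache_preload
  decrements it by #1 and by #0x10 respectively. Since prefetch_distance is at most
  15, (H << 4) + (prefetch_distance - 0x10) equals prefetch_distance | ((H - 1) << 4);
  e.g. for prefetch_distance = 10 and a 3-scanline image:

      .set pf_ctl_example, (3 << 4) + (10 - 0x10)   /* = 0x2a = 10 | (2 << 4) */
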
3303 cmp W, #(pixblock_size * 2)
3306 * This is the start of the pipelined loop, which if optimized for
3310 - ensure_destination_ptr_alignment process_pixblock_head, \
3311 - process_pixblock_tail, \
3312 - process_pixblock_tail_head
3313 + ensure_destination_ptr_alignment \process_pixblock_head, \
3314 + \process_pixblock_tail, \
3315 + \process_pixblock_tail_head
3317 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
3318 pixld_a pixblock_size, dst_r_bpp, \
3319 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3321 pixld pixblock_size, mask_bpp, \
3322 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3323 - PF add PF_X, PF_X, #pixblock_size
3324 - process_pixblock_head
3325 + PF add, PF_X, PF_X, #pixblock_size
3326 + \process_pixblock_head
3327 cache_preload 0, pixblock_size
3328 cache_preload_simple
3329 subs W, W, #(pixblock_size * 2)
3333 - process_pixblock_tail_head
3334 + \process_pixblock_tail_head
3335 cache_preload_simple
3336 subs W, W, #pixblock_size
3340 - process_pixblock_tail
3341 + \process_pixblock_tail
3342 pixst_a pixblock_size, dst_w_bpp, \
3343 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3345 /* Process the remaining trailing pixels in the scanline */
3346 process_trailing_pixels 1, 1, \
3347 - process_pixblock_head, \
3348 - process_pixblock_tail, \
3349 - process_pixblock_tail_head
3350 + \process_pixblock_head, \
3351 + \process_pixblock_tail, \
3352 + \process_pixblock_tail_head
3353 advance_to_next_scanline 0b
3358 /* pop all registers */
3360 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3361 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3362 ldp x8, x9, [x29, -80]
3363 ldp x10, x11, [x29, -96]
3364 ldp x12, x13, [x29, -112]
3365 @@ -920,48 +920,48 @@ 1000:
3368 * This is the start of the loop, designed to process images with small width
3369 * (less than pixblock_size * 2 pixels). In this case neither pipelining
3370 * nor prefetch are used.
3373 .if src_bpp_shift >= 0
3374 - PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
3375 - PF prfm PREFETCH_MODE, [SRC, DUMMY]
3376 + PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
3377 + PF prfm, PREFETCH_MODE, [SRC, DUMMY]
3380 - PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
3381 - PF prfm PREFETCH_MODE, [DST_R, DUMMY]
3382 + PF lsl, DUMMY, DST_STRIDE, #dst_bpp_shift
3383 + PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
3385 .if mask_bpp_shift >= 0
3386 - PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
3387 - PF prfm PREFETCH_MODE, [MASK, DUMMY]
3388 + PF lsl, DUMMY, MASK_STRIDE, #mask_bpp_shift
3389 + PF prfm, PREFETCH_MODE, [MASK, DUMMY]
3391 /* Process exactly pixblock_size pixels if needed */
3392 tst W, #pixblock_size
3394 pixld pixblock_size, dst_r_bpp, \
3395 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3397 pixld pixblock_size, mask_bpp, \
3398 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3399 - process_pixblock_head
3400 - process_pixblock_tail
3401 + \process_pixblock_head
3402 + \process_pixblock_tail
3403 pixst pixblock_size, dst_w_bpp, \
3404 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3406 /* Process the remaining trailing pixels in the scanline */
3407 process_trailing_pixels 0, 0, \
3408 - process_pixblock_head, \
3409 - process_pixblock_tail, \
3410 - process_pixblock_tail_head
3411 + \process_pixblock_head, \
3412 + \process_pixblock_tail, \
3413 + \process_pixblock_tail_head
3414 advance_to_next_scanline 800b
3418 /* pop all registers */
3420 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3421 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3422 ldp x8, x9, [x29, -80]
3423 ldp x10, x11, [x29, -96]
3424 ldp x12, x13, [x29, -112]
3425 ldp x14, x15, [x29, -128]
3426 @@ -990,17 +990,17 @@ 9:
3436 + pixman_end_asm_function
3440 * A simplified variant of function generation template for a single
3441 * scanline processing (for implementing pixman combine functions)
3443 .macro generate_composite_function_scanline use_nearest_scaling, \
3445 @@ -1014,50 +1014,50 @@ 9:
3446 process_pixblock_head, \
3447 process_pixblock_tail, \
3448 process_pixblock_tail_head, \
3449 dst_w_basereg_ = 28, \
3450 dst_r_basereg_ = 4, \
3454 - pixman_asm_function fname
3455 + pixman_asm_function \fname
3456 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
3459 * Make some macro arguments globally visible and accessible
3462 - .set src_bpp, src_bpp_
3463 - .set mask_bpp, mask_bpp_
3464 - .set dst_w_bpp, dst_w_bpp_
3465 - .set pixblock_size, pixblock_size_
3466 - .set dst_w_basereg, dst_w_basereg_
3467 - .set dst_r_basereg, dst_r_basereg_
3468 - .set src_basereg, src_basereg_
3469 - .set mask_basereg, mask_basereg_
3470 + .set src_bpp, \src_bpp_
3471 + .set mask_bpp, \mask_bpp_
3472 + .set dst_w_bpp, \dst_w_bpp_
3473 + .set pixblock_size, \pixblock_size_
3474 + .set dst_w_basereg, \dst_w_basereg_
3475 + .set dst_r_basereg, \dst_r_basereg_
3476 + .set src_basereg, \src_basereg_
3477 + .set mask_basereg, \mask_basereg_
3479 -.if use_nearest_scaling != 0
3480 +.if \use_nearest_scaling != 0
3482 * Assign symbolic names to registers for nearest scaling
3489 SRC_WIDTH_FIXED .req x5
3496 .macro pixld_src x:vararg
3506 stp x29, x30, [sp, -16]!
3507 @@ -1075,84 +1075,84 @@ 9:
3508 W .req x0 /* width (is updated during processing) */
3509 DST_W .req x1 /* destination buffer pointer for writes */
3510 SRC .req x2 /* source buffer pointer */
3511 MASK .req x3 /* mask pointer */
3512 DST_R .req x4 /* destination buffer pointer for reads */
3515 .macro pixld_src x:vararg
3522 stp x29, x30, [sp, -16]!
3526 st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3527 st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3530 -.if (((flags) & FLAG_DST_READWRITE) != 0)
3531 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
3532 .set dst_r_bpp, dst_w_bpp
3536 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3537 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
3538 .set DEINTERLEAVE_32BPP_ENABLED, 1
3540 .set DEINTERLEAVE_32BPP_ENABLED, 0
3543 .macro fetch_src_pixblock
3544 pixld_src pixblock_size, src_bpp, \
3545 (src_basereg - pixblock_size * src_bpp / 64), SRC
3552 cmp W, #pixblock_size
3555 - ensure_destination_ptr_alignment process_pixblock_head, \
3556 - process_pixblock_tail, \
3557 - process_pixblock_tail_head
3558 + ensure_destination_ptr_alignment \process_pixblock_head, \
3559 + \process_pixblock_tail, \
3560 + \process_pixblock_tail_head
3562 subs W, W, #pixblock_size
3565 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
3566 pixld_a pixblock_size, dst_r_bpp, \
3567 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3569 pixld pixblock_size, mask_bpp, \
3570 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3571 - process_pixblock_head
3572 + \process_pixblock_head
3573 subs W, W, #pixblock_size
3576 - process_pixblock_tail_head
3577 + \process_pixblock_tail_head
3578 subs W, W, #pixblock_size
3581 - process_pixblock_tail
3582 + \process_pixblock_tail
3583 pixst_a pixblock_size, dst_w_bpp, \
3584 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3586 /* Process the remaining trailing pixels in the scanline (dst aligned) */
3587 process_trailing_pixels 0, 1, \
3588 - process_pixblock_head, \
3589 - process_pixblock_tail, \
3590 - process_pixblock_tail_head
3591 + \process_pixblock_head, \
3592 + \process_pixblock_tail, \
3593 + \process_pixblock_tail_head
3596 -.if use_nearest_scaling != 0
3598 +.if \use_nearest_scaling != 0
3600 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3601 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3602 ldp x8, x9, [x29, -80]
3605 ldp x29, x30, [sp], 16
3607 @@ -1162,22 +1162,22 @@ 700:
3608 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3610 ldp x29, x30, [sp], 16
3614 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
3615 process_trailing_pixels 0, 0, \
3616 - process_pixblock_head, \
3617 - process_pixblock_tail, \
3618 - process_pixblock_tail_head
3619 + \process_pixblock_head, \
3620 + \process_pixblock_tail, \
3621 + \process_pixblock_tail_head
3624 -.if use_nearest_scaling != 0
3626 +.if \use_nearest_scaling != 0
3628 ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
3629 ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
3630 ldp x8, x9, [x29, -80]
3633 ldp x29, x30, [sp], 16
3635 @@ -1208,25 +1208,25 @@ 800:
3641 .purgem fetch_src_pixblock
3645 + pixman_end_asm_function
3648 .macro generate_composite_function_single_scanline x:vararg
3649 - generate_composite_function_scanline 0, x
3650 + generate_composite_function_scanline 0, \x
3653 .macro generate_composite_function_nearest_scanline x:vararg
3654 - generate_composite_function_scanline 1, x
3655 + generate_composite_function_scanline 1, \x
3658 /* Default prologue/epilogue, nothing special needs to be done */
3663 .macro default_cleanup
3664 @@ -1250,61 +1250,61 @@ 800:
3665 * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
3666 * into a planar a8r8g8b8 format (with a, r, g, b color components
3667 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
3669 * Warning: the conversion is destructive and the original
3670 * value (in) is lost.
3672 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
3673 - shrn &out_r&.8b, &in&.8h, #8
3674 - shrn &out_g&.8b, &in&.8h, #3
3675 - sli &in&.8h, &in&.8h, #5
3676 - movi &out_a&.8b, #255
3677 - sri &out_r&.8b, &out_r&.8b, #5
3678 - sri &out_g&.8b, &out_g&.8b, #6
3679 - shrn &out_b&.8b, &in&.8h, #2
3680 + shrn \()\out_r\().8b, \()\in\().8h, #8
3681 + shrn \()\out_g\().8b, \()\in\().8h, #3
3682 + sli \()\in\().8h, \()\in\().8h, #5
3683 + movi \()\out_a\().8b, #255
3684 + sri \()\out_r\().8b, \()\out_r\().8b, #5
3685 + sri \()\out_g\().8b, \()\out_g\().8b, #6
3686 + shrn \()\out_b\().8b, \()\in\().8h, #2
3689 .macro convert_0565_to_x888 in, out_r, out_g, out_b
3690 - shrn &out_r&.8b, &in&.8h, #8
3691 - shrn &out_g&.8b, &in&.8h, #3
3692 - sli &in&.8h, &in&.8h, #5
3693 - sri &out_r&.8b, &out_r&.8b, #5
3694 - sri &out_g&.8b, &out_g&.8b, #6
3695 - shrn &out_b&.8b, &in&.8h, #2
3696 + shrn \()\out_r\().8b, \()\in\().8h, #8
3697 + shrn \()\out_g\().8b, \()\in\().8h, #3
3698 + sli \()\in\().8h, \()\in\().8h, #5
3699 + sri \()\out_r\().8b, \()\out_r\().8b, #5
3700 + sri \()\out_g\().8b, \()\out_g\().8b, #6
3701 + shrn \()\out_b\().8b, \()\in\().8h, #2
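
  [Worked example, not from the patch.] Per pixel, the shrn/sli/sri sequence above
  widens each 5- or 6-bit channel by replicating its top bits into the freed low
  bits. Assemble-time arithmetic for one r5g6b5 value:

      .set pix0565, 0xf81f                          /* r = 0x1f, g = 0, b = 0x1f */
      .set r5, (pix0565 >> 11) & 0x1f
      .set g6, (pix0565 >> 5)  & 0x3f
      .set b5,  pix0565        & 0x1f
      .set r8, (r5 << 3) | (r5 >> 2)                /* sri out_r, out_r, #5 */
      .set g8, (g6 << 2) | (g6 >> 4)                /* sri out_g, out_g, #6 */
      .set b8, (b5 << 3) | (b5 >> 2)                /* sli #5 then shrn #2  */
      .set pix_x888, (r8 << 16) | (g8 << 8) | b8    /* = 0x00ff00ff here */
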
3705 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
3706 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
3707 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
3708 * registers (tmp1, tmp2)
3710 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
3711 - ushll &tmp1&.8h, &in_g&.8b, #7
3712 - shl &tmp1&.8h, &tmp1&.8h, #1
3713 - ushll &out&.8h, &in_r&.8b, #7
3714 - shl &out&.8h, &out&.8h, #1
3715 - ushll &tmp2&.8h, &in_b&.8b, #7
3716 - shl &tmp2&.8h, &tmp2&.8h, #1
3717 - sri &out&.8h, &tmp1&.8h, #5
3718 - sri &out&.8h, &tmp2&.8h, #11
3719 + ushll \()\tmp1\().8h, \()\in_g\().8b, #7
3720 + shl \()\tmp1\().8h, \()\tmp1\().8h, #1
3721 + ushll \()\out\().8h, \()\in_r\().8b, #7
3722 + shl \()\out\().8h, \()\out\().8h, #1
3723 + ushll \()\tmp2\().8h, \()\in_b\().8b, #7
3724 + shl \()\tmp2\().8h, \()\tmp2\().8h, #1
3725 + sri \()\out\().8h, \()\tmp1\().8h, #5
3726 + sri \()\out\().8h, \()\tmp2\().8h, #11
3730 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
3731 * returned in (out0, out1) registers pair. Requires one temporary
3732 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
3733 * value from 'in' is lost
3735 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
3736 - shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
3737 - shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
3738 - sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
3739 - sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
3740 - sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
3741 - ushr &out1&.4h, &in&.4h, #8 /* R is in place */
3742 - sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
3743 - zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
3744 - zip2 &out1&.4h, &out0&.4h, &out1&.4h
3745 - mov &out0&.d[0], &tmp&.d[0]
3746 + shl \()\out0\().4h, \()\in\().4h, #5 /* G top 6 bits */
3747 + shl \()\tmp\().4h, \()\in\().4h, #11 /* B top 5 bits */
3748 + sri \()\in\().4h, \()\in\().4h, #5 /* R is ready \in top bits */
3749 + sri \()\out0\().4h, \()\out0\().4h, #6 /* G is ready \in top bits */
3750 + sri \()\tmp\().4h, \()\tmp\().4h, #5 /* B is ready \in top bits */
3751 + ushr \()\out1\().4h, \()\in\().4h, #8 /* R is \in place */
3752 + sri \()\out0\().4h, \()\tmp\().4h, #8 /* G \() B is \in place */
3753 + zip1 \()\tmp\().4h, \()\out0\().4h, \()\out1\().4h /* everything is \in place */
3754 + zip2 \()\out1\().4h, \()\out0\().4h, \()\out1\().4h
3755 + mov \()\out0\().d[0], \()\tmp\().d[0]