1 https://gitlab.freedesktop.org/pixman/pixman/-/issues/74
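gfx/cairo/pixman-arm32-clang.patch: make the ARM32 NEON assembly in pixman build with clang's
integrated assembler (see the upstream issue above). GNU as lets .macro bodies refer to
parameters by bare name and concatenate them with '&'; clang requires the backslash-prefixed
form (\arg) and the \() separator. The patch also writes an explicit comma between the
arguments passed to the PF macro (e.g. "PF add, PF_X, ..."), maps the divided-syntax
conditional mnemonics ldrgeb/subges/subpls to spellings clang accepts under __clang__, and
closes functions with pixman_end_asm_function instead of the deprecated .endfunc.

As an illustration only (not part of the patch; load_pair and load are hypothetical macro
names), the macro-argument syntax change looks like this:

    .macro load_pair reg1, reg2
            vld1.32 {\reg1}, [TMP1]!    @ was: vld1.32 {reg1}, [TMP1]!
            vld1.32 {\reg2}, [TMP1]     @ was: vld1.32 {reg2}, [TMP1]
    .endm

    .macro load fmt
            load_\()\fmt d0, d1         @ was: load_&fmt d0, d1
    .endm
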
3 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
4 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
5 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
6 @@ -77,206 +77,206 @@
7 * format conversion, and interpolation as separate macros which can be used
8 * as the basic building blocks for constructing bilinear scanline functions.
9 */
11 .macro bilinear_load_8888 reg1, reg2, tmp
12 mov TMP1, X, asr #16
13 add X, X, UX
14 add TMP1, TOP, TMP1, asl #2
15 - vld1.32 {reg1}, [TMP1], STRIDE
16 - vld1.32 {reg2}, [TMP1]
17 + vld1.32 {\reg1}, [TMP1], STRIDE
18 + vld1.32 {\reg2}, [TMP1]
19 .endm
21 .macro bilinear_load_0565 reg1, reg2, tmp
22 mov TMP1, X, asr #16
23 add X, X, UX
24 add TMP1, TOP, TMP1, asl #1
25 - vld1.32 {reg2[0]}, [TMP1], STRIDE
26 - vld1.32 {reg2[1]}, [TMP1]
27 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
28 + vld1.32 {\reg2[0]}, [TMP1], STRIDE
29 + vld1.32 {\reg2[1]}, [TMP1]
30 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
31 .endm
33 .macro bilinear_load_and_vertical_interpolate_two_8888 \
34 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
36 - bilinear_load_8888 reg1, reg2, tmp1
37 - vmull.u8 acc1, reg1, d28
38 - vmlal.u8 acc1, reg2, d29
39 - bilinear_load_8888 reg3, reg4, tmp2
40 - vmull.u8 acc2, reg3, d28
41 - vmlal.u8 acc2, reg4, d29
42 + bilinear_load_8888 \reg1, \reg2, \tmp1
43 + vmull.u8 \acc1, \reg1, d28
44 + vmlal.u8 \acc1, \reg2, d29
45 + bilinear_load_8888 \reg3, \reg4, \tmp2
46 + vmull.u8 \acc2, \reg3, d28
47 + vmlal.u8 \acc2, \reg4, d29
48 .endm
50 .macro bilinear_load_and_vertical_interpolate_four_8888 \
51 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
52 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
54 bilinear_load_and_vertical_interpolate_two_8888 \
55 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
56 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
57 bilinear_load_and_vertical_interpolate_two_8888 \
58 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
59 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
60 .endm
62 .macro bilinear_load_and_vertical_interpolate_two_0565 \
63 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
65 mov TMP1, X, asr #16
66 add X, X, UX
67 add TMP1, TOP, TMP1, asl #1
68 mov TMP2, X, asr #16
69 add X, X, UX
70 add TMP2, TOP, TMP2, asl #1
71 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE
72 - vld1.32 {acc2hi[0]}, [TMP2], STRIDE
73 - vld1.32 {acc2lo[1]}, [TMP1]
74 - vld1.32 {acc2hi[1]}, [TMP2]
75 - convert_0565_to_x888 acc2, reg3, reg2, reg1
76 - vzip.u8 reg1, reg3
77 - vzip.u8 reg2, reg4
78 - vzip.u8 reg3, reg4
79 - vzip.u8 reg1, reg2
80 - vmull.u8 acc1, reg1, d28
81 - vmlal.u8 acc1, reg2, d29
82 - vmull.u8 acc2, reg3, d28
83 - vmlal.u8 acc2, reg4, d29
84 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
85 + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
86 + vld1.32 {\acc2lo[1]}, [TMP1]
87 + vld1.32 {\acc2hi[1]}, [TMP2]
88 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
89 + vzip.u8 \reg1, \reg3
90 + vzip.u8 \reg2, \reg4
91 + vzip.u8 \reg3, \reg4
92 + vzip.u8 \reg1, \reg2
93 + vmull.u8 \acc1, \reg1, d28
94 + vmlal.u8 \acc1, \reg2, d29
95 + vmull.u8 \acc2, \reg3, d28
96 + vmlal.u8 \acc2, \reg4, d29
97 .endm
99 .macro bilinear_load_and_vertical_interpolate_four_0565 \
100 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
101 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
103 mov TMP1, X, asr #16
104 add X, X, UX
105 add TMP1, TOP, TMP1, asl #1
106 mov TMP2, X, asr #16
107 add X, X, UX
108 add TMP2, TOP, TMP2, asl #1
109 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
110 - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
111 - vld1.32 {xacc2lo[1]}, [TMP1]
112 - vld1.32 {xacc2hi[1]}, [TMP2]
113 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
114 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
115 + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
116 + vld1.32 {\xacc2lo[1]}, [TMP1]
117 + vld1.32 {\xacc2hi[1]}, [TMP2]
118 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
119 mov TMP1, X, asr #16
120 add X, X, UX
121 add TMP1, TOP, TMP1, asl #1
122 mov TMP2, X, asr #16
123 add X, X, UX
124 add TMP2, TOP, TMP2, asl #1
125 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
126 - vzip.u8 xreg1, xreg3
127 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
128 - vzip.u8 xreg2, xreg4
129 - vld1.32 {yacc2lo[1]}, [TMP1]
130 - vzip.u8 xreg3, xreg4
131 - vld1.32 {yacc2hi[1]}, [TMP2]
132 - vzip.u8 xreg1, xreg2
133 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
134 - vmull.u8 xacc1, xreg1, d28
135 - vzip.u8 yreg1, yreg3
136 - vmlal.u8 xacc1, xreg2, d29
137 - vzip.u8 yreg2, yreg4
138 - vmull.u8 xacc2, xreg3, d28
139 - vzip.u8 yreg3, yreg4
140 - vmlal.u8 xacc2, xreg4, d29
141 - vzip.u8 yreg1, yreg2
142 - vmull.u8 yacc1, yreg1, d28
143 - vmlal.u8 yacc1, yreg2, d29
144 - vmull.u8 yacc2, yreg3, d28
145 - vmlal.u8 yacc2, yreg4, d29
146 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
147 + vzip.u8 \xreg1, \xreg3
148 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
149 + vzip.u8 \xreg2, \xreg4
150 + vld1.32 {\yacc2lo[1]}, [TMP1]
151 + vzip.u8 \xreg3, \xreg4
152 + vld1.32 {\yacc2hi[1]}, [TMP2]
153 + vzip.u8 \xreg1, \xreg2
154 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
155 + vmull.u8 \xacc1, \xreg1, d28
156 + vzip.u8 \yreg1, \yreg3
157 + vmlal.u8 \xacc1, \xreg2, d29
158 + vzip.u8 \yreg2, \yreg4
159 + vmull.u8 \xacc2, \xreg3, d28
160 + vzip.u8 \yreg3, \yreg4
161 + vmlal.u8 \xacc2, \xreg4, d29
162 + vzip.u8 \yreg1, \yreg2
163 + vmull.u8 \yacc1, \yreg1, d28
164 + vmlal.u8 \yacc1, \yreg2, d29
165 + vmull.u8 \yacc2, \yreg3, d28
166 + vmlal.u8 \yacc2, \yreg4, d29
167 .endm
169 .macro bilinear_store_8888 numpix, tmp1, tmp2
170 -.if numpix == 4
171 +.if \numpix == 4
172 vst1.32 {d0, d1}, [OUT]!
173 -.elseif numpix == 2
174 +.elseif \numpix == 2
175 vst1.32 {d0}, [OUT]!
176 -.elseif numpix == 1
177 +.elseif \numpix == 1
178 vst1.32 {d0[0]}, [OUT, :32]!
179 .else
180 .error bilinear_store_8888 numpix is unsupported
181 .endif
182 .endm
184 .macro bilinear_store_0565 numpix, tmp1, tmp2
185 vuzp.u8 d0, d1
186 vuzp.u8 d2, d3
187 vuzp.u8 d1, d3
188 vuzp.u8 d0, d2
189 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
190 -.if numpix == 4
191 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
192 +.if \numpix == 4
193 vst1.16 {d2}, [OUT]!
194 -.elseif numpix == 2
195 +.elseif \numpix == 2
196 vst1.32 {d2[0]}, [OUT]!
197 -.elseif numpix == 1
198 +.elseif \numpix == 1
199 vst1.16 {d2[0]}, [OUT]!
200 .else
201 .error bilinear_store_0565 numpix is unsupported
202 .endif
203 .endm
207 * Macros for loading mask pixels into register 'mask'.
208 * vdup must be done in somewhere else.
210 .macro bilinear_load_mask_x numpix, mask
211 .endm
213 .macro bilinear_load_mask_8 numpix, mask
214 -.if numpix == 4
215 - vld1.32 {mask[0]}, [MASK]!
216 -.elseif numpix == 2
217 - vld1.16 {mask[0]}, [MASK]!
218 -.elseif numpix == 1
219 - vld1.8 {mask[0]}, [MASK]!
220 +.if \numpix == 4
221 + vld1.32 {\mask[0]}, [MASK]!
222 +.elseif \numpix == 2
223 + vld1.16 {\mask[0]}, [MASK]!
224 +.elseif \numpix == 1
225 + vld1.8 {\mask[0]}, [MASK]!
226 .else
227 - .error bilinear_load_mask_8 numpix is unsupported
228 + .error bilinear_load_mask_8 \numpix is unsupported
229 .endif
230 pld [MASK, #prefetch_offset]
231 .endm
233 .macro bilinear_load_mask mask_fmt, numpix, mask
234 - bilinear_load_mask_&mask_fmt numpix, mask
235 + bilinear_load_mask_\()\mask_fmt \numpix, \mask
236 .endm
240 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
241 * Interleave should be done somewhere else.
243 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
244 .endm
246 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
247 .endm
249 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
250 -.if numpix == 4
251 - vld1.32 {dst0, dst1}, [OUT]
252 -.elseif numpix == 2
253 - vld1.32 {dst0}, [OUT]
254 -.elseif numpix == 1
255 - vld1.32 {dst0[0]}, [OUT]
256 +.if \numpix == 4
257 + vld1.32 {\dst0, \dst1}, [OUT]
258 +.elseif \numpix == 2
259 + vld1.32 {\dst0}, [OUT]
260 +.elseif \numpix == 1
261 + vld1.32 {\dst0[0]}, [OUT]
262 .else
263 - .error bilinear_load_dst_8888 numpix is unsupported
264 + .error bilinear_load_dst_8888 \numpix is unsupported
265 .endif
266 pld [OUT, #(prefetch_offset * 4)]
267 .endm
269 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
270 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
271 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
272 .endm
274 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
275 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
276 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
277 .endm
279 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
280 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
281 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
282 .endm
285 * Macros for duplicating partially loaded mask to fill entire register.
286 * We will apply mask to interleaved source pixels, that is
287 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
288 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
289 * So, we need to duplicate loaded mask into whole register.
290 @@ -285,79 +285,79 @@
291 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
292 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
293 * We can do some optimizations for this including last pixel cases.
295 .macro bilinear_duplicate_mask_x numpix, mask
296 .endm
298 .macro bilinear_duplicate_mask_8 numpix, mask
299 -.if numpix == 4
300 - vdup.32 mask, mask[0]
301 -.elseif numpix == 2
302 - vdup.16 mask, mask[0]
303 -.elseif numpix == 1
304 - vdup.8 mask, mask[0]
305 +.if \numpix == 4
306 + vdup.32 \mask, \mask[0]
307 +.elseif \numpix == 2
308 + vdup.16 \mask, \mask[0]
309 +.elseif \numpix == 1
310 + vdup.8 \mask, \mask[0]
311 .else
312 .error bilinear_duplicate_mask_8 is unsupported
313 .endif
314 .endm
316 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
317 - bilinear_duplicate_mask_&mask_fmt numpix, mask
318 + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
319 .endm
322 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
323 * Interleave should be done when maks is enabled or operator is 'over'.
325 .macro bilinear_interleave src0, src1, dst0, dst1
326 - vuzp.8 src0, src1
327 - vuzp.8 dst0, dst1
328 - vuzp.8 src0, src1
329 - vuzp.8 dst0, dst1
330 + vuzp.8 \src0, \src1
331 + vuzp.8 \dst0, \dst1
332 + vuzp.8 \src0, \src1
333 + vuzp.8 \dst0, \dst1
334 .endm
336 .macro bilinear_interleave_src_dst_x_src \
337 numpix, src0, src1, src01, dst0, dst1, dst01
338 .endm
340 .macro bilinear_interleave_src_dst_x_over \
341 numpix, src0, src1, src01, dst0, dst1, dst01
343 - bilinear_interleave src0, src1, dst0, dst1
344 + bilinear_interleave \src0, \src1, \dst0, \dst1
345 .endm
347 .macro bilinear_interleave_src_dst_x_add \
348 numpix, src0, src1, src01, dst0, dst1, dst01
349 .endm
351 .macro bilinear_interleave_src_dst_8_src \
352 numpix, src0, src1, src01, dst0, dst1, dst01
354 - bilinear_interleave src0, src1, dst0, dst1
355 + bilinear_interleave \src0, \src1, \dst0, \dst1
356 .endm
358 .macro bilinear_interleave_src_dst_8_over \
359 numpix, src0, src1, src01, dst0, dst1, dst01
361 - bilinear_interleave src0, src1, dst0, dst1
362 + bilinear_interleave \src0, \src1, \dst0, \dst1
363 .endm
365 .macro bilinear_interleave_src_dst_8_add \
366 numpix, src0, src1, src01, dst0, dst1, dst01
368 - bilinear_interleave src0, src1, dst0, dst1
369 + bilinear_interleave \src0, \src1, \dst0, \dst1
370 .endm
372 .macro bilinear_interleave_src_dst \
373 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
375 - bilinear_interleave_src_dst_&mask_fmt&_&op \
376 - numpix, src0, src1, src01, dst0, dst1, dst01
377 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
378 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
379 .endm
383 * Macros for applying masks to src pixels. (see combine_mask_u() function)
384 * src, dst should be in interleaved form.
385 * mask register should be in form (m0, m1, m2, m3).
387 @@ -365,217 +365,217 @@
388 numpix, src0, src1, src01, mask, \
389 tmp01, tmp23, tmp45, tmp67
390 .endm
392 .macro bilinear_apply_mask_to_src_8 \
393 numpix, src0, src1, src01, mask, \
394 tmp01, tmp23, tmp45, tmp67
396 - vmull.u8 tmp01, src0, mask
397 - vmull.u8 tmp23, src1, mask
398 + vmull.u8 \tmp01, \src0, \mask
399 + vmull.u8 \tmp23, \src1, \mask
400 /* bubbles */
401 - vrshr.u16 tmp45, tmp01, #8
402 - vrshr.u16 tmp67, tmp23, #8
403 + vrshr.u16 \tmp45, \tmp01, #8
404 + vrshr.u16 \tmp67, \tmp23, #8
405 /* bubbles */
406 - vraddhn.u16 src0, tmp45, tmp01
407 - vraddhn.u16 src1, tmp67, tmp23
408 + vraddhn.u16 \src0, \tmp45, \tmp01
409 + vraddhn.u16 \src1, \tmp67, \tmp23
410 .endm
412 .macro bilinear_apply_mask_to_src \
413 mask_fmt, numpix, src0, src1, src01, mask, \
414 tmp01, tmp23, tmp45, tmp67
416 - bilinear_apply_mask_to_src_&mask_fmt \
417 - numpix, src0, src1, src01, mask, \
418 - tmp01, tmp23, tmp45, tmp67
419 + bilinear_apply_mask_to_src_\()\mask_fmt \
420 + \numpix, \src0, \src1, \src01, \mask, \
421 + \tmp01, \tmp23, \tmp45, \tmp67
422 .endm
426 * Macros for combining src and destination pixels.
427 * Interleave or not is depending on operator 'op'.
429 .macro bilinear_combine_src \
430 numpix, src0, src1, src01, dst0, dst1, dst01, \
431 tmp01, tmp23, tmp45, tmp67, tmp8
432 .endm
434 .macro bilinear_combine_over \
435 numpix, src0, src1, src01, dst0, dst1, dst01, \
436 tmp01, tmp23, tmp45, tmp67, tmp8
438 - vdup.32 tmp8, src1[1]
439 + vdup.32 \tmp8, \src1[1]
440 /* bubbles */
441 - vmvn.8 tmp8, tmp8
442 + vmvn.8 \tmp8, \tmp8
443 /* bubbles */
444 - vmull.u8 tmp01, dst0, tmp8
445 + vmull.u8 \tmp01, \dst0, \tmp8
446 /* bubbles */
447 - vmull.u8 tmp23, dst1, tmp8
448 + vmull.u8 \tmp23, \dst1, \tmp8
449 /* bubbles */
450 - vrshr.u16 tmp45, tmp01, #8
451 - vrshr.u16 tmp67, tmp23, #8
452 + vrshr.u16 \tmp45, \tmp01, #8
453 + vrshr.u16 \tmp67, \tmp23, #8
454 /* bubbles */
455 - vraddhn.u16 dst0, tmp45, tmp01
456 - vraddhn.u16 dst1, tmp67, tmp23
457 + vraddhn.u16 \dst0, \tmp45, \tmp01
458 + vraddhn.u16 \dst1, \tmp67, \tmp23
459 /* bubbles */
460 - vqadd.u8 src01, dst01, src01
461 + vqadd.u8 \src01, \dst01, \src01
462 .endm
464 .macro bilinear_combine_add \
465 numpix, src0, src1, src01, dst0, dst1, dst01, \
466 tmp01, tmp23, tmp45, tmp67, tmp8
468 - vqadd.u8 src01, dst01, src01
469 + vqadd.u8 \src01, \dst01, \src01
470 .endm
472 .macro bilinear_combine \
473 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
474 tmp01, tmp23, tmp45, tmp67, tmp8
476 - bilinear_combine_&op \
477 - numpix, src0, src1, src01, dst0, dst1, dst01, \
478 - tmp01, tmp23, tmp45, tmp67, tmp8
479 + bilinear_combine_\()\op \
480 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
481 + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
482 .endm
485 * Macros for final deinterleaving of destination pixels if needed.
487 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
488 - vuzp.8 dst0, dst1
489 + vuzp.8 \dst0, \dst1
490 /* bubbles */
491 - vuzp.8 dst0, dst1
492 + vuzp.8 \dst0, \dst1
493 .endm
495 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
496 .endm
498 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
499 - bilinear_deinterleave numpix, dst0, dst1, dst01
500 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
501 .endm
503 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
504 .endm
506 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
507 - bilinear_deinterleave numpix, dst0, dst1, dst01
508 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
509 .endm
511 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
512 - bilinear_deinterleave numpix, dst0, dst1, dst01
513 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
514 .endm
516 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
517 - bilinear_deinterleave numpix, dst0, dst1, dst01
518 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
519 .endm
521 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
522 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
523 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
524 .endm
527 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
528 - bilinear_load_&src_fmt d0, d1, d2
529 - bilinear_load_mask mask_fmt, 1, d4
530 - bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
531 + bilinear_load_\()\src_fmt d0, d1, d2
532 + bilinear_load_mask \mask_fmt, 1, d4
533 + bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
534 vmull.u8 q1, d0, d28
535 vmlal.u8 q1, d1, d29
536 /* 5 cycles bubble */
537 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
538 vmlsl.u16 q0, d2, d30
539 vmlal.u16 q0, d3, d30
540 /* 5 cycles bubble */
541 - bilinear_duplicate_mask mask_fmt, 1, d4
542 + bilinear_duplicate_mask \mask_fmt, 1, d4
543 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
544 /* 3 cycles bubble */
545 vmovn.u16 d0, q0
546 /* 1 cycle bubble */
547 bilinear_interleave_src_dst \
548 - mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
549 + \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
550 bilinear_apply_mask_to_src \
551 - mask_fmt, 1, d0, d1, q0, d4, \
552 + \mask_fmt, 1, d0, d1, q0, d4, \
553 q3, q8, q10, q11
554 bilinear_combine \
555 - op, 1, d0, d1, q0, d18, d19, q9, \
556 + \op, 1, d0, d1, q0, d18, d19, q9, \
557 q3, q8, q10, q11, d5
558 - bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
559 - bilinear_store_&dst_fmt 1, q2, q3
560 + bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
561 + bilinear_store_\()\dst_fmt 1, q2, q3
562 .endm
564 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
565 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
566 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
567 q1, q11, d0, d1, d20, d21, d22, d23
568 - bilinear_load_mask mask_fmt, 2, d4
569 - bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
570 + bilinear_load_mask \mask_fmt, 2, d4
571 + bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
572 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
573 vmlsl.u16 q0, d2, d30
574 vmlal.u16 q0, d3, d30
575 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
576 vmlsl.u16 q10, d22, d31
577 vmlal.u16 q10, d23, d31
578 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
579 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
580 - bilinear_duplicate_mask mask_fmt, 2, d4
581 + bilinear_duplicate_mask \mask_fmt, 2, d4
582 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
583 vadd.u16 q12, q12, q13
584 vmovn.u16 d0, q0
585 bilinear_interleave_src_dst \
586 - mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
587 + \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
588 bilinear_apply_mask_to_src \
589 - mask_fmt, 2, d0, d1, q0, d4, \
590 + \mask_fmt, 2, d0, d1, q0, d4, \
591 q3, q8, q10, q11
592 bilinear_combine \
593 - op, 2, d0, d1, q0, d18, d19, q9, \
594 + \op, 2, d0, d1, q0, d18, d19, q9, \
595 q3, q8, q10, q11, d5
596 - bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
597 - bilinear_store_&dst_fmt 2, q2, q3
598 + bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
599 + bilinear_store_\()\dst_fmt 2, q2, q3
600 .endm
602 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
603 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
604 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
605 q1, q11, d0, d1, d20, d21, d22, d23 \
606 q3, q9, d4, d5, d16, d17, d18, d19
607 pld [TMP1, PF_OFFS]
608 sub TMP1, TMP1, STRIDE
609 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
610 vmlsl.u16 q0, d2, d30
611 vmlal.u16 q0, d3, d30
612 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
613 vmlsl.u16 q10, d22, d31
614 vmlal.u16 q10, d23, d31
615 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
616 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
617 vmlsl.u16 q2, d6, d30
618 vmlal.u16 q2, d7, d30
619 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
620 - bilinear_load_mask mask_fmt, 4, d22
621 - bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
622 + bilinear_load_mask \mask_fmt, 4, d22
623 + bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
624 pld [TMP1, PF_OFFS]
625 vmlsl.u16 q8, d18, d31
626 vmlal.u16 q8, d19, d31
627 vadd.u16 q12, q12, q13
628 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
629 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
630 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
631 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
632 - bilinear_duplicate_mask mask_fmt, 4, d22
633 + bilinear_duplicate_mask \mask_fmt, 4, d22
634 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
635 vmovn.u16 d0, q0
636 vmovn.u16 d1, q2
637 vadd.u16 q12, q12, q13
638 bilinear_interleave_src_dst \
639 - mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
640 + \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
641 bilinear_apply_mask_to_src \
642 - mask_fmt, 4, d0, d1, q0, d22, \
643 + \mask_fmt, 4, d0, d1, q0, d22, \
644 q3, q8, q9, q10
645 bilinear_combine \
646 - op, 4, d0, d1, q0, d2, d3, q1, \
647 + \op, 4, d0, d1, q0, d2, d3, q1, \
648 q3, q8, q9, q10, d23
649 - bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
650 - bilinear_store_&dst_fmt 4, q2, q3
651 + bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
652 + bilinear_store_\()\dst_fmt 4, q2, q3
653 .endm
655 .set BILINEAR_FLAG_USE_MASK, 1
656 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
659 * Main template macro for generating NEON optimized bilinear scanline functions.
661 @@ -605,24 +605,24 @@
662 bilinear_process_four_pixels, \
663 bilinear_process_pixblock_head, \
664 bilinear_process_pixblock_tail, \
665 bilinear_process_pixblock_tail_head, \
666 pixblock_size, \
667 prefetch_distance, \
668 flags
670 -pixman_asm_function fname
671 -.if pixblock_size == 8
672 -.elseif pixblock_size == 4
673 +pixman_asm_function \fname
674 +.if \pixblock_size == 8
675 +.elseif \pixblock_size == 4
676 .else
677 .error unsupported pixblock size
678 .endif
680 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
681 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
682 OUT .req r0
683 TOP .req r1
684 BOTTOM .req r2
685 WT .req r3
686 WB .req r4
687 X .req r5
688 UX .req r6
689 WIDTH .req ip
690 @@ -630,17 +630,17 @@ pixman_asm_function fname
691 TMP2 .req r4
692 PF_OFFS .req r7
693 TMP3 .req r8
694 TMP4 .req r9
695 STRIDE .req r2
697 mov ip, sp
698 push {r4, r5, r6, r7, r8, r9}
699 - mov PF_OFFS, #prefetch_distance
700 + mov PF_OFFS, #\prefetch_distance
701 ldmia ip, {WB, X, UX, WIDTH}
702 .else
703 OUT .req r0
704 MASK .req r1
705 TOP .req r2
706 BOTTOM .req r3
707 WT .req r4
708 WB .req r5
709 @@ -649,27 +649,27 @@ pixman_asm_function fname
710 WIDTH .req ip
711 TMP1 .req r4
712 TMP2 .req r5
713 PF_OFFS .req r8
714 TMP3 .req r9
715 TMP4 .req r10
716 STRIDE .req r3
718 - .set prefetch_offset, prefetch_distance
719 + .set prefetch_offset, \prefetch_distance
721 mov ip, sp
722 push {r4, r5, r6, r7, r8, r9, r10, ip}
723 - mov PF_OFFS, #prefetch_distance
724 + mov PF_OFFS, #\prefetch_distance
725 ldmia ip, {WT, WB, X, UX, WIDTH}
726 .endif
728 mul PF_OFFS, PF_OFFS, UX
730 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
731 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
732 vpush {d8-d15}
733 .endif
735 sub STRIDE, BOTTOM, TOP
736 .unreq BOTTOM
738 cmp WIDTH, #0
739 ble 3f
740 @@ -678,76 +678,76 @@ pixman_asm_function fname
741 vdup.u16 q13, UX
742 vdup.u8 d28, WT
743 vdup.u8 d29, WB
744 vadd.u16 d25, d25, d26
746 /* ensure good destination alignment */
747 cmp WIDTH, #1
748 blt 0f
749 - tst OUT, #(1 << dst_bpp_shift)
750 + tst OUT, #(1 << \dst_bpp_shift)
751 beq 0f
752 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
753 vadd.u16 q12, q12, q13
754 - bilinear_process_last_pixel
755 + \bilinear_process_last_pixel
756 sub WIDTH, WIDTH, #1
758 vadd.u16 q13, q13, q13
759 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
760 vadd.u16 q12, q12, q13
762 cmp WIDTH, #2
763 blt 0f
764 - tst OUT, #(1 << (dst_bpp_shift + 1))
765 + tst OUT, #(1 << (\dst_bpp_shift + 1))
766 beq 0f
767 - bilinear_process_two_pixels
768 + \bilinear_process_two_pixels
769 sub WIDTH, WIDTH, #2
771 -.if pixblock_size == 8
772 +.if \pixblock_size == 8
773 cmp WIDTH, #4
774 blt 0f
775 - tst OUT, #(1 << (dst_bpp_shift + 2))
776 + tst OUT, #(1 << (\dst_bpp_shift + 2))
777 beq 0f
778 - bilinear_process_four_pixels
779 + \bilinear_process_four_pixels
780 sub WIDTH, WIDTH, #4
782 .endif
783 - subs WIDTH, WIDTH, #pixblock_size
784 + subs WIDTH, WIDTH, #\pixblock_size
785 blt 1f
786 - mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
787 - bilinear_process_pixblock_head
788 - subs WIDTH, WIDTH, #pixblock_size
789 + mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
790 + \bilinear_process_pixblock_head
791 + subs WIDTH, WIDTH, #\pixblock_size
792 blt 5f
794 - bilinear_process_pixblock_tail_head
795 - subs WIDTH, WIDTH, #pixblock_size
796 + \bilinear_process_pixblock_tail_head
797 + subs WIDTH, WIDTH, #\pixblock_size
798 bge 0b
800 - bilinear_process_pixblock_tail
801 + \bilinear_process_pixblock_tail
803 -.if pixblock_size == 8
804 +.if \pixblock_size == 8
805 tst WIDTH, #4
806 beq 2f
807 - bilinear_process_four_pixels
808 + \bilinear_process_four_pixels
810 .endif
811 /* handle the remaining trailing pixels */
812 tst WIDTH, #2
813 beq 2f
814 - bilinear_process_two_pixels
815 + \bilinear_process_two_pixels
817 tst WIDTH, #1
818 beq 3f
819 - bilinear_process_last_pixel
820 + \bilinear_process_last_pixel
822 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
823 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
824 vpop {d8-d15}
825 .endif
827 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
828 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
829 pop {r4, r5, r6, r7, r8, r9}
830 .else
831 pop {r4, r5, r6, r7, r8, r9, r10, ip}
832 .endif
833 bx lr
835 .unreq OUT
836 .unreq TOP
837 @@ -757,21 +757,21 @@ 3:
838 .unreq UX
839 .unreq WIDTH
840 .unreq TMP1
841 .unreq TMP2
842 .unreq PF_OFFS
843 .unreq TMP3
844 .unreq TMP4
845 .unreq STRIDE
846 -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
847 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
848 .unreq MASK
849 .endif
851 -.endfunc
852 +pixman_end_asm_function
854 .endm
856 /* src_8888_8_8888 */
857 .macro bilinear_src_8888_8_8888_process_last_pixel
858 bilinear_interpolate_last_pixel 8888, 8, 8888, src
859 .endm
861 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
862 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
863 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
864 @@ -29,16 +29,22 @@
865 * (those which are exposing some new or interesting features) are
866 * extensively commented and can be used as examples.
868 * You may want to have a look at the comments for following functions:
869 * - pixman_composite_over_8888_0565_asm_neon
870 * - pixman_composite_over_n_8_0565_asm_neon
873 +#ifdef __clang__
874 +#define ldrgeb ldrbge
875 +#define subges subsge
876 +#define subpls subspl
877 +#endif
879 /* Prevent the stack from becoming executable for no reason... */
880 #if defined(__linux__) && defined(__ELF__)
881 .section .note.GNU-stack,"",%progbits
882 #endif
884 .text
885 .fpu neon
886 .arch armv7a
887 @@ -255,43 +261,43 @@
888 vqadd.u8 d16, d2, d20
889 vld1.16 {d4, d5}, [DST_R, :128]!
890 vqadd.u8 q9, q0, q11
891 vshrn.u16 d6, q2, #8
892 fetch_src_pixblock
893 vshrn.u16 d7, q2, #3
894 vsli.u16 q2, q2, #5
895 vshll.u8 q14, d16, #8
896 - PF add PF_X, PF_X, #8
897 + PF add, PF_X, PF_X, #8
898 vshll.u8 q8, d19, #8
899 - PF tst PF_CTL, #0xF
900 + PF tst, PF_CTL, #0xF
901 vsri.u8 d6, d6, #5
902 - PF addne PF_X, PF_X, #8
903 + PF addne, PF_X, PF_X, #8
904 vmvn.8 d3, d3
905 - PF subne PF_CTL, PF_CTL, #1
906 + PF subne, PF_CTL, PF_CTL, #1
907 vsri.u8 d7, d7, #6
908 vshrn.u16 d30, q2, #2
909 vmull.u8 q10, d3, d6
910 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
911 vmull.u8 q11, d3, d7
912 vmull.u8 q12, d3, d30
913 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
914 vsri.u16 q14, q8, #5
915 - PF cmp PF_X, ORIG_W
916 + PF cmp, PF_X, ORIG_W
917 vshll.u8 q9, d18, #8
918 vrshr.u16 q13, q10, #8
919 - PF subge PF_X, PF_X, ORIG_W
920 + PF subge, PF_X, PF_X, ORIG_W
921 vrshr.u16 q3, q11, #8
922 vrshr.u16 q15, q12, #8
923 - PF subges PF_CTL, PF_CTL, #0x10
924 + PF subges, PF_CTL, PF_CTL, #0x10
925 vsri.u16 q14, q9, #11
926 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
927 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
928 vraddhn.u16 d20, q10, q13
929 vraddhn.u16 d23, q11, q3
930 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
931 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
932 vraddhn.u16 d22, q12, q15
933 vst1.16 {d28, d29}, [DST_W, :128]!
934 .endm
936 #else
938 /* If we did not care much about the performance, we would just use this... */
939 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
940 @@ -429,30 +435,30 @@ generate_composite_function \
942 .macro pixman_composite_src_8888_0565_process_pixblock_tail
943 vsri.u16 q14, q8, #5
944 vsri.u16 q14, q9, #11
945 .endm
947 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
948 vsri.u16 q14, q8, #5
949 - PF add PF_X, PF_X, #8
950 - PF tst PF_CTL, #0xF
951 + PF add, PF_X, PF_X, #8
952 + PF tst, PF_CTL, #0xF
953 fetch_src_pixblock
954 - PF addne PF_X, PF_X, #8
955 - PF subne PF_CTL, PF_CTL, #1
956 + PF addne, PF_X, PF_X, #8
957 + PF subne, PF_CTL, PF_CTL, #1
958 vsri.u16 q14, q9, #11
959 - PF cmp PF_X, ORIG_W
960 + PF cmp, PF_X, ORIG_W
961 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
962 vshll.u8 q8, d1, #8
963 vst1.16 {d28, d29}, [DST_W, :128]!
964 - PF subge PF_X, PF_X, ORIG_W
965 - PF subges PF_CTL, PF_CTL, #0x10
966 + PF subge, PF_X, PF_X, ORIG_W
967 + PF subges, PF_CTL, PF_CTL, #0x10
968 vshll.u8 q14, d2, #8
969 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
970 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
971 vshll.u8 q9, d0, #8
972 .endm
974 generate_composite_function \
975 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
976 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
977 8, /* number of pixels, processed in a single block */ \
978 10, /* prefetch distance */ \
979 @@ -504,30 +510,30 @@ generate_composite_function \
980 vqadd.u8 q15, q1, q3
981 .endm
983 .macro pixman_composite_add_8_8_process_pixblock_tail
984 .endm
986 .macro pixman_composite_add_8_8_process_pixblock_tail_head
987 fetch_src_pixblock
988 - PF add PF_X, PF_X, #32
989 - PF tst PF_CTL, #0xF
990 + PF add, PF_X, PF_X, #32
991 + PF tst, PF_CTL, #0xF
992 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
993 - PF addne PF_X, PF_X, #32
994 - PF subne PF_CTL, PF_CTL, #1
995 + PF addne, PF_X, PF_X, #32
996 + PF subne, PF_CTL, PF_CTL, #1
997 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
998 - PF cmp PF_X, ORIG_W
999 + PF cmp, PF_X, ORIG_W
1000 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1001 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1002 - PF subge PF_X, PF_X, ORIG_W
1003 - PF subges PF_CTL, PF_CTL, #0x10
1004 + PF subge, PF_X, PF_X, ORIG_W
1005 + PF subges, PF_CTL, PF_CTL, #0x10
1006 vqadd.u8 q14, q0, q2
1007 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1008 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1009 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1010 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1011 vqadd.u8 q15, q1, q3
1012 .endm
1014 generate_composite_function \
1015 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
1016 FLAG_DST_READWRITE, \
1017 32, /* number of pixels, processed in a single block */ \
1018 10, /* prefetch distance */ \
1019 @@ -536,30 +542,30 @@ generate_composite_function \
1020 pixman_composite_add_8_8_process_pixblock_head, \
1021 pixman_composite_add_8_8_process_pixblock_tail, \
1022 pixman_composite_add_8_8_process_pixblock_tail_head
1024 /******************************************************************************/
1026 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
1027 fetch_src_pixblock
1028 - PF add PF_X, PF_X, #8
1029 - PF tst PF_CTL, #0xF
1030 + PF add, PF_X, PF_X, #8
1031 + PF tst, PF_CTL, #0xF
1032 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
1033 - PF addne PF_X, PF_X, #8
1034 - PF subne PF_CTL, PF_CTL, #1
1035 + PF addne, PF_X, PF_X, #8
1036 + PF subne, PF_CTL, PF_CTL, #1
1037 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
1038 - PF cmp PF_X, ORIG_W
1039 + PF cmp, PF_X, ORIG_W
1040 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1041 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1042 - PF subge PF_X, PF_X, ORIG_W
1043 - PF subges PF_CTL, PF_CTL, #0x10
1044 + PF subge, PF_X, PF_X, ORIG_W
1045 + PF subges, PF_CTL, PF_CTL, #0x10
1046 vqadd.u8 q14, q0, q2
1047 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1048 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1049 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1050 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1051 vqadd.u8 q15, q1, q3
1052 .endm
1054 generate_composite_function \
1055 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
1056 FLAG_DST_READWRITE, \
1057 8, /* number of pixels, processed in a single block */ \
1058 10, /* prefetch distance */ \
1059 @@ -599,40 +605,40 @@ generate_composite_function_single_scanl
1060 vraddhn.u16 d29, q15, q9
1061 vraddhn.u16 d30, q12, q10
1062 vraddhn.u16 d31, q13, q11
1063 .endm
1065 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1066 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1067 vrshr.u16 q14, q8, #8
1068 - PF add PF_X, PF_X, #8
1069 - PF tst PF_CTL, #0xF
1070 + PF add, PF_X, PF_X, #8
1071 + PF tst, PF_CTL, #0xF
1072 vrshr.u16 q15, q9, #8
1073 vrshr.u16 q12, q10, #8
1074 vrshr.u16 q13, q11, #8
1075 - PF addne PF_X, PF_X, #8
1076 - PF subne PF_CTL, PF_CTL, #1
1077 + PF addne, PF_X, PF_X, #8
1078 + PF subne, PF_CTL, PF_CTL, #1
1079 vraddhn.u16 d28, q14, q8
1080 vraddhn.u16 d29, q15, q9
1081 - PF cmp PF_X, ORIG_W
1082 + PF cmp, PF_X, ORIG_W
1083 vraddhn.u16 d30, q12, q10
1084 vraddhn.u16 d31, q13, q11
1085 fetch_src_pixblock
1086 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1087 vmvn.8 d22, d3
1088 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1089 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1090 - PF subge PF_X, PF_X, ORIG_W
1091 + PF subge, PF_X, PF_X, ORIG_W
1092 vmull.u8 q8, d22, d4
1093 - PF subges PF_CTL, PF_CTL, #0x10
1094 + PF subsge, PF_CTL, PF_CTL, #0x10
1095 vmull.u8 q9, d22, d5
1096 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1097 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1098 vmull.u8 q10, d22, d6
1099 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1100 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1101 vmull.u8 q11, d22, d7
1102 .endm
1104 generate_composite_function_single_scanline \
1105 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
1106 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1107 8, /* number of pixels, processed in a single block */ \
1108 default_init, \
1109 @@ -651,42 +657,42 @@ generate_composite_function_single_scanl
1110 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
1111 vqadd.u8 q14, q0, q14
1112 vqadd.u8 q15, q1, q15
1113 .endm
1115 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
1116 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1117 vrshr.u16 q14, q8, #8
1118 - PF add PF_X, PF_X, #8
1119 - PF tst PF_CTL, #0xF
1120 + PF add, PF_X, PF_X, #8
1121 + PF tst, PF_CTL, #0xF
1122 vrshr.u16 q15, q9, #8
1123 vrshr.u16 q12, q10, #8
1124 vrshr.u16 q13, q11, #8
1125 - PF addne PF_X, PF_X, #8
1126 - PF subne PF_CTL, PF_CTL, #1
1127 + PF addne, PF_X, PF_X, #8
1128 + PF subne, PF_CTL, PF_CTL, #1
1129 vraddhn.u16 d28, q14, q8
1130 vraddhn.u16 d29, q15, q9
1131 - PF cmp PF_X, ORIG_W
1132 + PF cmp, PF_X, ORIG_W
1133 vraddhn.u16 d30, q12, q10
1134 vraddhn.u16 d31, q13, q11
1135 vqadd.u8 q14, q0, q14
1136 vqadd.u8 q15, q1, q15
1137 fetch_src_pixblock
1138 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1139 vmvn.8 d22, d3
1140 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1141 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1142 - PF subge PF_X, PF_X, ORIG_W
1143 + PF subge, PF_X, PF_X, ORIG_W
1144 vmull.u8 q8, d22, d4
1145 - PF subges PF_CTL, PF_CTL, #0x10
1146 + PF subges, PF_CTL, PF_CTL, #0x10
1147 vmull.u8 q9, d22, d5
1148 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1149 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1150 vmull.u8 q10, d22, d6
1151 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1152 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1153 vmull.u8 q11, d22, d7
1154 .endm
1156 generate_composite_function \
1157 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
1158 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1159 8, /* number of pixels, processed in a single block */ \
1160 5, /* prefetch distance */ \
1161 @@ -737,30 +743,30 @@ generate_composite_function_single_scanl
1162 vrshr.u16 q2, q10, #8
1163 vrshr.u16 q3, q11, #8
1164 vraddhn.u16 d28, q14, q8
1165 vraddhn.u16 d29, q15, q9
1166 vraddhn.u16 d30, q2, q10
1167 vraddhn.u16 d31, q3, q11
1168 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1169 vqadd.u8 q14, q0, q14
1170 - PF add PF_X, PF_X, #8
1171 - PF tst PF_CTL, #0x0F
1172 - PF addne PF_X, PF_X, #8
1173 - PF subne PF_CTL, PF_CTL, #1
1174 + PF add, PF_X, PF_X, #8
1175 + PF tst, PF_CTL, #0x0F
1176 + PF addne, PF_X, PF_X, #8
1177 + PF subne, PF_CTL, PF_CTL, #1
1178 vqadd.u8 q15, q1, q15
1179 - PF cmp PF_X, ORIG_W
1180 + PF cmp, PF_X, ORIG_W
1181 vmull.u8 q8, d24, d4
1182 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1183 vmull.u8 q9, d24, d5
1184 - PF subge PF_X, PF_X, ORIG_W
1185 + PF subge, PF_X, PF_X, ORIG_W
1186 vmull.u8 q10, d24, d6
1187 - PF subges PF_CTL, PF_CTL, #0x10
1188 + PF subges, PF_CTL, PF_CTL, #0x10
1189 vmull.u8 q11, d24, d7
1190 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1191 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1192 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1193 .endm
1195 .macro pixman_composite_over_n_8888_init
1196 add DUMMY, sp, #ARGS_STACK_OFFSET
1197 vld1.32 {d3[0]}, [DUMMY]
1198 vdup.8 d0, d3[0]
1199 vdup.8 d1, d3[1]
1200 @@ -779,40 +785,40 @@ generate_composite_function \
1201 pixman_composite_over_8888_8888_process_pixblock_head, \
1202 pixman_composite_over_8888_8888_process_pixblock_tail, \
1203 pixman_composite_over_n_8888_process_pixblock_tail_head
1205 /******************************************************************************/
1207 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
1208 vrshr.u16 q14, q8, #8
1209 - PF add PF_X, PF_X, #8
1210 - PF tst PF_CTL, #0xF
1211 + PF add, PF_X, PF_X, #8
1212 + PF tst, PF_CTL, #0xF
1213 vrshr.u16 q15, q9, #8
1214 vrshr.u16 q12, q10, #8
1215 vrshr.u16 q13, q11, #8
1216 - PF addne PF_X, PF_X, #8
1217 - PF subne PF_CTL, PF_CTL, #1
1218 + PF addne, PF_X, PF_X, #8
1219 + PF subne, PF_CTL, PF_CTL, #1
1220 vraddhn.u16 d28, q14, q8
1221 vraddhn.u16 d29, q15, q9
1222 - PF cmp PF_X, ORIG_W
1223 + PF cmp, PF_X, ORIG_W
1224 vraddhn.u16 d30, q12, q10
1225 vraddhn.u16 d31, q13, q11
1226 vqadd.u8 q14, q0, q14
1227 vqadd.u8 q15, q1, q15
1228 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
1229 vmvn.8 d22, d3
1230 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1231 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1232 - PF subge PF_X, PF_X, ORIG_W
1233 + PF subge, PF_X, PF_X, ORIG_W
1234 vmull.u8 q8, d22, d4
1235 - PF subges PF_CTL, PF_CTL, #0x10
1236 + PF subges, PF_CTL, PF_CTL, #0x10
1237 vmull.u8 q9, d22, d5
1238 vmull.u8 q10, d22, d6
1239 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1240 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1241 vmull.u8 q11, d22, d7
1242 .endm
1244 .macro pixman_composite_over_reverse_n_8888_init
1245 add DUMMY, sp, #ARGS_STACK_OFFSET
1246 vld1.32 {d7[0]}, [DUMMY]
1247 vdup.8 d4, d7[0]
1248 vdup.8 d5, d7[1]
1249 @@ -1240,33 +1246,33 @@ generate_composite_function \
1250 vrshrn.u16 d28, q8, #8
1251 vrshrn.u16 d29, q9, #8
1252 vrshrn.u16 d30, q10, #8
1253 vrshrn.u16 d31, q11, #8
1254 .endm
1256 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1257 fetch_mask_pixblock
1258 - PF add PF_X, PF_X, #8
1259 + PF add, PF_X, PF_X, #8
1260 vrshrn.u16 d28, q8, #8
1261 - PF tst PF_CTL, #0x0F
1262 + PF tst, PF_CTL, #0x0F
1263 vrshrn.u16 d29, q9, #8
1264 - PF addne PF_X, PF_X, #8
1265 + PF addne, PF_X, PF_X, #8
1266 vrshrn.u16 d30, q10, #8
1267 - PF subne PF_CTL, PF_CTL, #1
1268 + PF subne, PF_CTL, PF_CTL, #1
1269 vrshrn.u16 d31, q11, #8
1270 - PF cmp PF_X, ORIG_W
1271 + PF cmp, PF_X, ORIG_W
1272 vmull.u8 q8, d24, d0
1273 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1274 vmull.u8 q9, d24, d1
1275 - PF subge PF_X, PF_X, ORIG_W
1276 + PF subge, PF_X, PF_X, ORIG_W
1277 vmull.u8 q10, d24, d2
1278 - PF subges PF_CTL, PF_CTL, #0x10
1279 + PF subges, PF_CTL, PF_CTL, #0x10
1280 vmull.u8 q11, d24, d3
1281 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1282 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1283 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1284 vrsra.u16 q8, q8, #8
1285 vrsra.u16 q9, q9, #8
1286 vrsra.u16 q10, q10, #8
1287 vrsra.u16 q11, q11, #8
1288 .endm
1290 .macro pixman_composite_src_n_8_8888_init
1291 @@ -1309,33 +1315,33 @@ generate_composite_function \
1292 vrshrn.u16 d28, q0, #8
1293 vrshrn.u16 d29, q1, #8
1294 vrshrn.u16 d30, q2, #8
1295 vrshrn.u16 d31, q3, #8
1296 .endm
1298 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1299 fetch_mask_pixblock
1300 - PF add PF_X, PF_X, #8
1301 + PF add, PF_X, PF_X, #8
1302 vrshrn.u16 d28, q0, #8
1303 - PF tst PF_CTL, #0x0F
1304 + PF tst, PF_CTL, #0x0F
1305 vrshrn.u16 d29, q1, #8
1306 - PF addne PF_X, PF_X, #8
1307 + PF addne, PF_X, PF_X, #8
1308 vrshrn.u16 d30, q2, #8
1309 - PF subne PF_CTL, PF_CTL, #1
1310 + PF subne, PF_CTL, PF_CTL, #1
1311 vrshrn.u16 d31, q3, #8
1312 - PF cmp PF_X, ORIG_W
1313 + PF cmp, PF_X, ORIG_W
1314 vmull.u8 q0, d24, d16
1315 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1316 vmull.u8 q1, d25, d16
1317 - PF subge PF_X, PF_X, ORIG_W
1318 + PF subge, PF_X, PF_X, ORIG_W
1319 vmull.u8 q2, d26, d16
1320 - PF subges PF_CTL, PF_CTL, #0x10
1321 + PF subges, PF_CTL, PF_CTL, #0x10
1322 vmull.u8 q3, d27, d16
1323 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1324 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1325 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1326 vrsra.u16 q0, q0, #8
1327 vrsra.u16 q1, q1, #8
1328 vrsra.u16 q2, q2, #8
1329 vrsra.u16 q3, q3, #8
1330 .endm
1332 .macro pixman_composite_src_n_8_8_init
1333 @@ -1403,37 +1409,37 @@ generate_composite_function \
1334 .endm
1336 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1337 vrshr.u16 q14, q8, #8
1338 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1339 vrshr.u16 q15, q9, #8
1340 fetch_mask_pixblock
1341 vrshr.u16 q6, q10, #8
1342 - PF add PF_X, PF_X, #8
1343 + PF add, PF_X, PF_X, #8
1344 vrshr.u16 q7, q11, #8
1345 - PF tst PF_CTL, #0x0F
1346 + PF tst, PF_CTL, #0x0F
1347 vraddhn.u16 d28, q14, q8
1348 - PF addne PF_X, PF_X, #8
1349 + PF addne, PF_X, PF_X, #8
1350 vraddhn.u16 d29, q15, q9
1351 - PF subne PF_CTL, PF_CTL, #1
1352 + PF subne, PF_CTL, PF_CTL, #1
1353 vraddhn.u16 d30, q6, q10
1354 - PF cmp PF_X, ORIG_W
1355 + PF cmp, PF_X, ORIG_W
1356 vraddhn.u16 d31, q7, q11
1357 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1358 vmull.u8 q6, d24, d8
1359 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1360 vmull.u8 q7, d24, d9
1361 - PF subge PF_X, PF_X, ORIG_W
1362 + PF subge, PF_X, PF_X, ORIG_W
1363 vmull.u8 q8, d24, d10
1364 - PF subges PF_CTL, PF_CTL, #0x10
1365 + PF subges, PF_CTL, PF_CTL, #0x10
1366 vmull.u8 q9, d24, d11
1367 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1368 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1369 vqadd.u8 q14, q0, q14
1370 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1371 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1372 vqadd.u8 q15, q1, q15
1373 vrshr.u16 q10, q6, #8
1374 vrshr.u16 q11, q7, #8
1375 vrshr.u16 q12, q8, #8
1376 vrshr.u16 q13, q9, #8
1377 vraddhn.u16 d0, q6, q10
1378 vraddhn.u16 d1, q7, q11
1379 vraddhn.u16 d2, q8, q12
1380 @@ -2420,31 +2426,31 @@ generate_composite_function \
1382 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
1383 vrshr.u16 q11, q8, #8
1384 vswp d3, d31
1385 vrshr.u16 q12, q9, #8
1386 vrshr.u16 q13, q10, #8
1387 fetch_src_pixblock
1388 vraddhn.u16 d30, q11, q8
1389 - PF add PF_X, PF_X, #8
1390 - PF tst PF_CTL, #0xF
1391 - PF addne PF_X, PF_X, #8
1392 - PF subne PF_CTL, PF_CTL, #1
1393 + PF add, PF_X, PF_X, #8
1394 + PF tst, PF_CTL, #0xF
1395 + PF addne, PF_X, PF_X, #8
1396 + PF subne, PF_CTL, PF_CTL, #1
1397 vraddhn.u16 d29, q12, q9
1398 vraddhn.u16 d28, q13, q10
1399 vmull.u8 q8, d3, d0
1400 vmull.u8 q9, d3, d1
1401 vmull.u8 q10, d3, d2
1402 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1403 - PF cmp PF_X, ORIG_W
1404 + PF cmp, PF_X, ORIG_W
1405 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1406 - PF subge PF_X, PF_X, ORIG_W
1407 - PF subges PF_CTL, PF_CTL, #0x10
1408 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1409 + PF subge, PF_X, PF_X, ORIG_W
1410 + PF subges, PF_CTL, PF_CTL, #0x10
1411 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1412 .endm
1414 generate_composite_function \
1415 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1416 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1417 8, /* number of pixels, processed in a single block */ \
1418 10, /* prefetch distance */ \
1419 default_init, \
1420 @@ -2477,31 +2483,31 @@ generate_composite_function \
1422 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
1423 vrshr.u16 q11, q8, #8
1424 vswp d3, d31
1425 vrshr.u16 q12, q9, #8
1426 vrshr.u16 q13, q10, #8
1427 fetch_src_pixblock
1428 vraddhn.u16 d28, q11, q8
1429 - PF add PF_X, PF_X, #8
1430 - PF tst PF_CTL, #0xF
1431 - PF addne PF_X, PF_X, #8
1432 - PF subne PF_CTL, PF_CTL, #1
1433 + PF add, PF_X, PF_X, #8
1434 + PF tst, PF_CTL, #0xF
1435 + PF addne, PF_X, PF_X, #8
1436 + PF subne, PF_CTL, PF_CTL, #1
1437 vraddhn.u16 d29, q12, q9
1438 vraddhn.u16 d30, q13, q10
1439 vmull.u8 q8, d3, d0
1440 vmull.u8 q9, d3, d1
1441 vmull.u8 q10, d3, d2
1442 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1443 - PF cmp PF_X, ORIG_W
1444 + PF cmp, PF_X, ORIG_W
1445 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1446 - PF subge PF_X, PF_X, ORIG_W
1447 - PF subges PF_CTL, PF_CTL, #0x10
1448 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1449 + PF subge, PF_X, PF_X, ORIG_W
1450 + PF subges, PF_CTL, PF_CTL, #0x10
1451 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1452 .endm
1454 generate_composite_function \
1455 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
1456 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1457 8, /* number of pixels, processed in a single block */ \
1458 10, /* prefetch distance */ \
1459 default_init, \
1460 @@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan
1461 * format conversion, and interpolation as separate macros which can be used
1462 * as the basic building blocks for constructing bilinear scanline functions.
1465 .macro bilinear_load_8888 reg1, reg2, tmp
1466 mov TMP1, X, asr #16
1467 add X, X, UX
1468 add TMP1, TOP, TMP1, asl #2
1469 - vld1.32 {reg1}, [TMP1], STRIDE
1470 - vld1.32 {reg2}, [TMP1]
1471 + vld1.32 {\reg1}, [TMP1], STRIDE
1472 + vld1.32 {\reg2}, [TMP1]
1473 .endm
1475 .macro bilinear_load_0565 reg1, reg2, tmp
1476 mov TMP1, X, asr #16
1477 add X, X, UX
1478 add TMP1, TOP, TMP1, asl #1
1479 - vld1.32 {reg2[0]}, [TMP1], STRIDE
1480 - vld1.32 {reg2[1]}, [TMP1]
1481 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
1482 + vld1.32 {\reg2[0]}, [TMP1], STRIDE
1483 + vld1.32 {\reg2[1]}, [TMP1]
1484 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
1485 .endm
1487 .macro bilinear_load_and_vertical_interpolate_two_8888 \
1488 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
1490 - bilinear_load_8888 reg1, reg2, tmp1
1491 - vmull.u8 acc1, reg1, d28
1492 - vmlal.u8 acc1, reg2, d29
1493 - bilinear_load_8888 reg3, reg4, tmp2
1494 - vmull.u8 acc2, reg3, d28
1495 - vmlal.u8 acc2, reg4, d29
1496 + bilinear_load_8888 \reg1, \reg2, \tmp1
1497 + vmull.u8 \acc1, \reg1, d28
1498 + vmlal.u8 \acc1, \reg2, d29
1499 + bilinear_load_8888 \reg3, \reg4, \tmp2
1500 + vmull.u8 \acc2, \reg3, d28
1501 + vmlal.u8 \acc2, \reg4, d29
1502 .endm
1504 .macro bilinear_load_and_vertical_interpolate_four_8888 \
1505 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1506 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1508 bilinear_load_and_vertical_interpolate_two_8888 \
1509 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
1510 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
1511 bilinear_load_and_vertical_interpolate_two_8888 \
1512 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1513 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
1514 .endm
1516 .macro bilinear_load_and_vertical_interpolate_two_0565 \
1517 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
1519 mov TMP1, X, asr #16
1520 add X, X, UX
1521 add TMP1, TOP, TMP1, asl #1
1522 mov TMP2, X, asr #16
1523 add X, X, UX
1524 add TMP2, TOP, TMP2, asl #1
1525 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE
1526 - vld1.32 {acc2hi[0]}, [TMP2], STRIDE
1527 - vld1.32 {acc2lo[1]}, [TMP1]
1528 - vld1.32 {acc2hi[1]}, [TMP2]
1529 - convert_0565_to_x888 acc2, reg3, reg2, reg1
1530 - vzip.u8 reg1, reg3
1531 - vzip.u8 reg2, reg4
1532 - vzip.u8 reg3, reg4
1533 - vzip.u8 reg1, reg2
1534 - vmull.u8 acc1, reg1, d28
1535 - vmlal.u8 acc1, reg2, d29
1536 - vmull.u8 acc2, reg3, d28
1537 - vmlal.u8 acc2, reg4, d29
1538 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
1539 + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
1540 + vld1.32 {\acc2lo[1]}, [TMP1]
1541 + vld1.32 {\acc2hi[1]}, [TMP2]
1542 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
1543 + vzip.u8 \reg1, \reg3
1544 + vzip.u8 \reg2, \reg4
1545 + vzip.u8 \reg3, \reg4
1546 + vzip.u8 \reg1, \reg2
1547 + vmull.u8 \acc1, \reg1, d28
1548 + vmlal.u8 \acc1, \reg2, d29
1549 + vmull.u8 \acc2, \reg3, d28
1550 + vmlal.u8 \acc2, \reg4, d29
1551 .endm
1553 .macro bilinear_load_and_vertical_interpolate_four_0565 \
1554 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1555 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1557 mov TMP1, X, asr #16
1558 add X, X, UX
1559 add TMP1, TOP, TMP1, asl #1
1560 mov TMP2, X, asr #16
1561 add X, X, UX
1562 add TMP2, TOP, TMP2, asl #1
1563 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
1564 - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
1565 - vld1.32 {xacc2lo[1]}, [TMP1]
1566 - vld1.32 {xacc2hi[1]}, [TMP2]
1567 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
1568 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
1569 + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
1570 + vld1.32 {\xacc2lo[1]}, [TMP1]
1571 + vld1.32 {\xacc2hi[1]}, [TMP2]
1572 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
1573 mov TMP1, X, asr #16
1574 add X, X, UX
1575 add TMP1, TOP, TMP1, asl #1
1576 mov TMP2, X, asr #16
1577 add X, X, UX
1578 add TMP2, TOP, TMP2, asl #1
1579 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
1580 - vzip.u8 xreg1, xreg3
1581 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
1582 - vzip.u8 xreg2, xreg4
1583 - vld1.32 {yacc2lo[1]}, [TMP1]
1584 - vzip.u8 xreg3, xreg4
1585 - vld1.32 {yacc2hi[1]}, [TMP2]
1586 - vzip.u8 xreg1, xreg2
1587 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
1588 - vmull.u8 xacc1, xreg1, d28
1589 - vzip.u8 yreg1, yreg3
1590 - vmlal.u8 xacc1, xreg2, d29
1591 - vzip.u8 yreg2, yreg4
1592 - vmull.u8 xacc2, xreg3, d28
1593 - vzip.u8 yreg3, yreg4
1594 - vmlal.u8 xacc2, xreg4, d29
1595 - vzip.u8 yreg1, yreg2
1596 - vmull.u8 yacc1, yreg1, d28
1597 - vmlal.u8 yacc1, yreg2, d29
1598 - vmull.u8 yacc2, yreg3, d28
1599 - vmlal.u8 yacc2, yreg4, d29
1600 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
1601 + vzip.u8 \xreg1, \xreg3
1602 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
1603 + vzip.u8 \xreg2, \xreg4
1604 + vld1.32 {\yacc2lo[1]}, [TMP1]
1605 + vzip.u8 \xreg3, \xreg4
1606 + vld1.32 {\yacc2hi[1]}, [TMP2]
1607 + vzip.u8 \xreg1, \xreg2
1608 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
1609 + vmull.u8 \xacc1, \xreg1, d28
1610 + vzip.u8 \yreg1, \yreg3
1611 + vmlal.u8 \xacc1, \xreg2, d29
1612 + vzip.u8 \yreg2, \yreg4
1613 + vmull.u8 \xacc2, \xreg3, d28
1614 + vzip.u8 \yreg3, \yreg4
1615 + vmlal.u8 \xacc2, \xreg4, d29
1616 + vzip.u8 \yreg1, \yreg2
1617 + vmull.u8 \yacc1, \yreg1, d28
1618 + vmlal.u8 \yacc1, \yreg2, d29
1619 + vmull.u8 \yacc2, \yreg3, d28
1620 + vmlal.u8 \yacc2, \yreg4, d29
1621 .endm
1623 .macro bilinear_store_8888 numpix, tmp1, tmp2
1624 -.if numpix == 4
1625 +.if \numpix == 4
1626 vst1.32 {d0, d1}, [OUT, :128]!
1627 -.elseif numpix == 2
1628 +.elseif \numpix == 2
1629 vst1.32 {d0}, [OUT, :64]!
1630 -.elseif numpix == 1
1631 +.elseif \numpix == 1
1632 vst1.32 {d0[0]}, [OUT, :32]!
1633 .else
1634 - .error bilinear_store_8888 numpix is unsupported
1635 + .error bilinear_store_8888 \numpix is unsupported
1636 .endif
1637 .endm
1639 .macro bilinear_store_0565 numpix, tmp1, tmp2
1640 vuzp.u8 d0, d1
1641 vuzp.u8 d2, d3
1642 vuzp.u8 d1, d3
1643 vuzp.u8 d0, d2
1644 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
1645 -.if numpix == 4
1646 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
1647 +.if \numpix == 4
1648 vst1.16 {d2}, [OUT, :64]!
1649 -.elseif numpix == 2
1650 +.elseif \numpix == 2
1651 vst1.32 {d2[0]}, [OUT, :32]!
1652 -.elseif numpix == 1
1653 +.elseif \numpix == 1
1654 vst1.16 {d2[0]}, [OUT, :16]!
1655 .else
1656 - .error bilinear_store_0565 numpix is unsupported
1657 + .error bilinear_store_0565 \numpix is unsupported
1658 .endif
1659 .endm
1661 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
1662 - bilinear_load_&src_fmt d0, d1, d2
1663 + bilinear_load_\()\src_fmt d0, d1, d2
1664 vmull.u8 q1, d0, d28
1665 vmlal.u8 q1, d1, d29
1666 /* 5 cycles bubble */
1667 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1668 vmlsl.u16 q0, d2, d30
1669 vmlal.u16 q0, d3, d30
1670 /* 5 cycles bubble */
1671 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1672 /* 3 cycles bubble */
1673 vmovn.u16 d0, q0
1674 /* 1 cycle bubble */
1675 - bilinear_store_&dst_fmt 1, q2, q3
1676 + bilinear_store_\()\dst_fmt 1, q2, q3
1677 .endm
1679 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
1680 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
1681 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
1682 q1, q11, d0, d1, d20, d21, d22, d23
1683 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1684 vmlsl.u16 q0, d2, d30
1685 vmlal.u16 q0, d3, d30
1686 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
1687 vmlsl.u16 q10, d22, d31
1688 vmlal.u16 q10, d23, d31
1689 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1690 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1691 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1692 vadd.u16 q12, q12, q13
1693 vmovn.u16 d0, q0
1694 - bilinear_store_&dst_fmt 2, q2, q3
1695 + bilinear_store_\()\dst_fmt 2, q2, q3
1696 .endm
1698 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
1699 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
1700 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
1701 q1, q11, d0, d1, d20, d21, d22, d23 \
1702 q3, q9, d4, d5, d16, d17, d18, d19
1703 pld [TMP1, PF_OFFS]
1704 sub TMP1, TMP1, STRIDE
1705 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1706 vmlsl.u16 q0, d2, d30
1707 vmlal.u16 q0, d3, d30
1708 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
1709 @@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan
1710 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1711 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1712 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
1713 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
1714 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1715 vmovn.u16 d0, q0
1716 vmovn.u16 d1, q2
1717 vadd.u16 q12, q12, q13
1718 - bilinear_store_&dst_fmt 4, q2, q3
1719 + bilinear_store_\()\dst_fmt 4, q2, q3
1720 .endm
1722 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1723 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1724 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
1725 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1726 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
1727 .else
1728 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1729 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1730 .endif
1731 .endm
1733 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1734 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1735 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
1736 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1737 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
1738 .endif
1739 .endm
1741 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1742 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1743 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
1744 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1745 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
1746 .else
1747 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1748 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1749 .endif
1750 .endm
1752 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
1753 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1754 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
1755 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1756 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
1757 .else
1758 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1759 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1760 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
1761 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1762 .endif
1763 .endm
1765 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1766 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1767 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
1768 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1769 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
1770 .else
1771 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1772 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1773 .endif
1774 .endm
1776 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
1777 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1778 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
1779 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1780 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
1781 .else
1782 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1783 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1784 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1785 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1786 .endif
1787 .endm
1789 .set BILINEAR_FLAG_UNROLL_4, 0
1790 .set BILINEAR_FLAG_UNROLL_8, 1
1791 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
1794 @@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan
1795 * prefetch_distance - prefetch in the source image by that many
1796 * pixels ahead
1799 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
1800 src_bpp_shift, dst_bpp_shift, \
1801 prefetch_distance, flags
1803 -pixman_asm_function fname
1804 +pixman_asm_function \fname
1805 OUT .req r0
1806 TOP .req r1
1807 BOTTOM .req r2
1808 WT .req r3
1809 WB .req r4
1810 X .req r5
1811 UX .req r6
1812 WIDTH .req ip
1813 @@ -3119,21 +3125,21 @@ pixman_asm_function fname
1814 TMP2 .req r4
1815 PF_OFFS .req r7
1816 TMP3 .req r8
1817 TMP4 .req r9
1818 STRIDE .req r2
1820 mov ip, sp
1821 push {r4, r5, r6, r7, r8, r9}
1822 - mov PF_OFFS, #prefetch_distance
1823 + mov PF_OFFS, #\prefetch_distance
1824 ldmia ip, {WB, X, UX, WIDTH}
1825 mul PF_OFFS, PF_OFFS, UX
1827 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1828 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1829 vpush {d8-d15}
1830 .endif
1832 sub STRIDE, BOTTOM, TOP
1833 .unreq BOTTOM
1835 cmp WIDTH, #0
1836 ble 3f
1837 @@ -3146,83 +3152,83 @@ pixman_asm_function fname
1839 /* ensure good destination alignment */
1840 cmp WIDTH, #1
1841 blt 0f
1842 tst OUT, #(1 << dst_bpp_shift)
1843 beq 0f
1844 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1845 vadd.u16 q12, q12, q13
1846 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
1847 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
1848 sub WIDTH, WIDTH, #1
1850 vadd.u16 q13, q13, q13
1851 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1852 vadd.u16 q12, q12, q13
1854 cmp WIDTH, #2
1855 blt 0f
1856 tst OUT, #(1 << (dst_bpp_shift + 1))
1857 beq 0f
1858 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
1859 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
1860 sub WIDTH, WIDTH, #2
1862 -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
1863 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
1864 /*********** 8 pixels per iteration *****************/
1865 cmp WIDTH, #4
1866 blt 0f
1867 tst OUT, #(1 << (dst_bpp_shift + 2))
1868 beq 0f
1869 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1870 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1871 sub WIDTH, WIDTH, #4
1873 subs WIDTH, WIDTH, #8
1874 blt 1f
1875 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
1876 - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
1877 + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
1878 subs WIDTH, WIDTH, #8
1879 blt 5f
1881 - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
1882 + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
1883 subs WIDTH, WIDTH, #8
1884 bge 0b
1886 - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1887 + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1889 tst WIDTH, #4
1890 beq 2f
1891 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1892 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1894 .else
1895 /*********** 4 pixels per iteration *****************/
1896 subs WIDTH, WIDTH, #4
1897 blt 1f
1898 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
1899 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1900 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
1901 subs WIDTH, WIDTH, #4
1902 blt 5f
1904 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1905 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1906 subs WIDTH, WIDTH, #4
1907 bge 0b
1909 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1910 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1912 /****************************************************/
1913 .endif
1914 /* handle the remaining trailing pixels */
1915 tst WIDTH, #2
1916 beq 2f
1917 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
1918 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
1920 tst WIDTH, #1
1921 beq 3f
1922 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
1923 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
1925 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1926 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1927 vpop {d8-d15}
1928 .endif
1929 pop {r4, r5, r6, r7, r8, r9}
1930 bx lr
1932 .unreq OUT
1933 .unreq TOP
1934 .unreq WT
1935 @@ -3231,17 +3237,17 @@ 3:
1936 .unreq UX
1937 .unreq WIDTH
1938 .unreq TMP1
1939 .unreq TMP2
1940 .unreq PF_OFFS
1941 .unreq TMP3
1942 .unreq TMP4
1943 .unreq STRIDE
1944 -.endfunc
1945 +pixman_end_asm_function
1947 .endm
1949 /*****************************************************************************/
1951 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
1953 .macro bilinear_interpolate_four_pixels_8888_8888_head
1954 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1955 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1956 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1957 @@ -69,303 +69,303 @@
1958 .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
1961 * Definitions of supplementary pixld/pixst macros (for partial load/store of
1962 * pixel data).
1965 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
1966 -.if abits > 0
1967 - op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
1968 +.if \abits > 0
1969 + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
1970 .else
1971 - op&.&elem_size {d&reg1}, [&mem_operand&]!
1972 + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
1973 .endif
1974 .endm
1976 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
1977 -.if abits > 0
1978 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
1979 +.if \abits > 0
1980 + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
1981 .else
1982 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
1983 + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
1984 .endif
1985 .endm
1987 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
1988 -.if abits > 0
1989 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
1990 +.if \abits > 0
1991 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
1992 .else
1993 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
1994 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
1995 .endif
1996 .endm
1998 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
1999 - op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
2000 + \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
2001 .endm
2003 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
2004 - op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
2005 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
2006 .endm
2008 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
2009 - op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
2010 + \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
2011 .endm
2013 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
2014 -.if numbytes == 32
2015 - pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
2016 - %(basereg+6), %(basereg+7), mem_operand, abits
2017 -.elseif numbytes == 16
2018 - pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
2019 -.elseif numbytes == 8
2020 - pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
2021 -.elseif numbytes == 4
2022 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
2023 - pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
2024 - .elseif elem_size == 16
2025 - pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
2026 - pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
2027 +.if \numbytes == 32
2028 + pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
2029 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2030 +.elseif \numbytes == 16
2031 + pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2032 +.elseif \numbytes == 8
2033 + pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
2034 +.elseif \numbytes == 4
2035 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
2036 + pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
2037 + .elseif \elem_size == 16
2038 + pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
2039 + pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
2040 .else
2041 - pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
2042 - pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
2043 - pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
2044 - pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
2045 + pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
2046 + pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
2047 + pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
2048 + pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
2049 .endif
2050 -.elseif numbytes == 2
2051 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
2052 - pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
2053 +.elseif \numbytes == 2
2054 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
2055 + pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
2056 .else
2057 - pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
2058 - pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
2059 + pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
2060 + pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
2061 .endif
2062 -.elseif numbytes == 1
2063 - pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
2064 +.elseif \numbytes == 1
2065 + pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
2066 .else
2067 - .error "unsupported size: numbytes"
2068 + .error "unsupported size: \numbytes"
2069 .endif
2070 .endm
2072 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
2073 -.if bpp > 0
2074 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2075 - pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
2076 - %(basereg+6), %(basereg+7), mem_operand, abits
2077 -.elseif (bpp == 24) && (numpix == 8)
2078 - pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2079 -.elseif (bpp == 24) && (numpix == 4)
2080 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2081 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2082 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2083 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2084 -.elseif (bpp == 24) && (numpix == 2)
2085 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2086 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2087 -.elseif (bpp == 24) && (numpix == 1)
2088 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2089 +.if \bpp > 0
2090 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2091 + pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
2092 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2093 +.elseif (\bpp == 24) && (\numpix == 8)
2094 + pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2095 +.elseif (\bpp == 24) && (\numpix == 4)
2096 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2097 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2098 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2099 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2100 +.elseif (\bpp == 24) && (\numpix == 2)
2101 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2102 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2103 +.elseif (\bpp == 24) && (\numpix == 1)
2104 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2105 .else
2106 - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
2107 + pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
2108 .endif
2109 .endif
2110 .endm
2112 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
2113 -.if bpp > 0
2114 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2115 - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
2116 - %(basereg+6), %(basereg+7), mem_operand, abits
2117 -.elseif (bpp == 24) && (numpix == 8)
2118 - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2119 -.elseif (bpp == 24) && (numpix == 4)
2120 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2121 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2122 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2123 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2124 -.elseif (bpp == 24) && (numpix == 2)
2125 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2126 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2127 -.elseif (bpp == 24) && (numpix == 1)
2128 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2129 +.if \bpp > 0
2130 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2131 + pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
2132 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2133 +.elseif (\bpp == 24) && (\numpix == 8)
2134 + pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2135 +.elseif (\bpp == 24) && (\numpix == 4)
2136 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2137 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2138 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2139 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2140 +.elseif (\bpp == 24) && (\numpix == 2)
2141 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2142 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2143 +.elseif (\bpp == 24) && (\numpix == 1)
2144 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2145 .else
2146 - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
2147 + pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
2148 .endif
2149 .endif
2150 .endm
2152 .macro pixld_a numpix, bpp, basereg, mem_operand
2153 -.if (bpp * numpix) <= 128
2154 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2155 +.if (\bpp * \numpix) <= 128
2156 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2157 .else
2158 - pixld numpix, bpp, basereg, mem_operand, 128
2159 + pixld \numpix, \bpp, \basereg, \mem_operand, 128
2160 .endif
2161 .endm
2163 .macro pixst_a numpix, bpp, basereg, mem_operand
2164 -.if (bpp * numpix) <= 128
2165 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2166 +.if (\bpp * \numpix) <= 128
2167 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2168 .else
2169 - pixst numpix, bpp, basereg, mem_operand, 128
2170 + pixst \numpix, \bpp, \basereg, \mem_operand, 128
2171 .endif
2172 .endm
2175 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
2176 * aliases to be defined)
2178 .macro pixld1_s elem_size, reg1, mem_operand
2179 -.if elem_size == 16
2180 +.if \elem_size == 16
2181 mov TMP1, VX, asr #16
2182 adds VX, VX, UNIT_X
2183 5: subpls VX, VX, SRC_WIDTH_FIXED
2184 bpl 5b
2185 - add TMP1, mem_operand, TMP1, asl #1
2186 + add TMP1, \mem_operand, TMP1, asl #1
2187 mov TMP2, VX, asr #16
2188 adds VX, VX, UNIT_X
2189 5: subpls VX, VX, SRC_WIDTH_FIXED
2190 bpl 5b
2191 - add TMP2, mem_operand, TMP2, asl #1
2192 - vld1.16 {d&reg1&[0]}, [TMP1, :16]
2193 + add TMP2, \mem_operand, TMP2, asl #1
2194 + vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
2195 mov TMP1, VX, asr #16
2196 adds VX, VX, UNIT_X
2197 5: subpls VX, VX, SRC_WIDTH_FIXED
2198 bpl 5b
2199 - add TMP1, mem_operand, TMP1, asl #1
2200 - vld1.16 {d&reg1&[1]}, [TMP2, :16]
2201 + add TMP1, \mem_operand, TMP1, asl #1
2202 + vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
2203 mov TMP2, VX, asr #16
2204 adds VX, VX, UNIT_X
2205 5: subpls VX, VX, SRC_WIDTH_FIXED
2206 bpl 5b
2207 - add TMP2, mem_operand, TMP2, asl #1
2208 - vld1.16 {d&reg1&[2]}, [TMP1, :16]
2209 - vld1.16 {d&reg1&[3]}, [TMP2, :16]
2210 -.elseif elem_size == 32
2211 + add TMP2, \mem_operand, TMP2, asl #1
2212 + vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
2213 + vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
2214 +.elseif \elem_size == 32
2215 mov TMP1, VX, asr #16
2216 adds VX, VX, UNIT_X
2217 5: subpls VX, VX, SRC_WIDTH_FIXED
2218 bpl 5b
2219 - add TMP1, mem_operand, TMP1, asl #2
2220 + add TMP1, \mem_operand, TMP1, asl #2
2221 mov TMP2, VX, asr #16
2222 adds VX, VX, UNIT_X
2223 5: subpls VX, VX, SRC_WIDTH_FIXED
2224 bpl 5b
2225 - add TMP2, mem_operand, TMP2, asl #2
2226 - vld1.32 {d&reg1&[0]}, [TMP1, :32]
2227 - vld1.32 {d&reg1&[1]}, [TMP2, :32]
2228 + add TMP2, \mem_operand, TMP2, asl #2
2229 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
2230 + vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
2231 .else
2232 .error "unsupported"
2233 .endif
2234 .endm
2236 .macro pixld2_s elem_size, reg1, reg2, mem_operand
2237 .if 0 /* elem_size == 32 */
2238 mov TMP1, VX, asr #16
2239 add VX, VX, UNIT_X, asl #1
2240 - add TMP1, mem_operand, TMP1, asl #2
2241 + add TMP1, \mem_operand, TMP1, asl #2
2242 mov TMP2, VX, asr #16
2243 sub VX, VX, UNIT_X
2244 - add TMP2, mem_operand, TMP2, asl #2
2245 - vld1.32 {d&reg1&[0]}, [TMP1, :32]
2246 + add TMP2, \mem_operand, TMP2, asl #2
2247 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
2248 mov TMP1, VX, asr #16
2249 add VX, VX, UNIT_X, asl #1
2250 - add TMP1, mem_operand, TMP1, asl #2
2251 - vld1.32 {d&reg2&[0]}, [TMP2, :32]
2252 + add TMP1, \mem_operand, TMP1, asl #2
2253 + vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
2254 mov TMP2, VX, asr #16
2255 add VX, VX, UNIT_X
2256 - add TMP2, mem_operand, TMP2, asl #2
2257 - vld1.32 {d&reg1&[1]}, [TMP1, :32]
2258 - vld1.32 {d&reg2&[1]}, [TMP2, :32]
2259 + add TMP2, \mem_operand, TMP2, asl #2
2260 + vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
2261 + vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
2262 .else
2263 - pixld1_s elem_size, reg1, mem_operand
2264 - pixld1_s elem_size, reg2, mem_operand
2265 + pixld1_s \elem_size, \reg1, \mem_operand
2266 + pixld1_s \elem_size, \reg2, \mem_operand
2267 .endif
2268 .endm
2270 .macro pixld0_s elem_size, reg1, idx, mem_operand
2271 -.if elem_size == 16
2272 +.if \elem_size == 16
2273 mov TMP1, VX, asr #16
2274 adds VX, VX, UNIT_X
2275 5: subpls VX, VX, SRC_WIDTH_FIXED
2276 bpl 5b
2277 - add TMP1, mem_operand, TMP1, asl #1
2278 - vld1.16 {d&reg1&[idx]}, [TMP1, :16]
2279 -.elseif elem_size == 32
2280 + add TMP1, \mem_operand, TMP1, asl #1
2281 + vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
2282 +.elseif \elem_size == 32
2283 mov TMP1, VX, asr #16
2284 adds VX, VX, UNIT_X
2285 5: subpls VX, VX, SRC_WIDTH_FIXED
2286 bpl 5b
2287 - add TMP1, mem_operand, TMP1, asl #2
2288 - vld1.32 {d&reg1&[idx]}, [TMP1, :32]
2289 + add TMP1, \mem_operand, TMP1, asl #2
2290 + vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
2291 .endif
2292 .endm
2294 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
2295 -.if numbytes == 32
2296 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
2297 - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
2298 - pixdeinterleave elem_size, %(basereg+4)
2299 -.elseif numbytes == 16
2300 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
2301 -.elseif numbytes == 8
2302 - pixld1_s elem_size, %(basereg+1), mem_operand
2303 -.elseif numbytes == 4
2304 - .if elem_size == 32
2305 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2306 - .elseif elem_size == 16
2307 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2308 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2309 +.if \numbytes == 32
2310 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
2311 + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
2312 + pixdeinterleave \elem_size, %(\basereg+4)
2313 +.elseif \numbytes == 16
2314 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
2315 +.elseif \numbytes == 8
2316 + pixld1_s \elem_size, %(\basereg+1), \mem_operand
2317 +.elseif \numbytes == 4
2318 + .if \elem_size == 32
2319 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2320 + .elseif \elem_size == 16
2321 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2322 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2323 .else
2324 - pixld0_s elem_size, %(basereg+0), 4, mem_operand
2325 - pixld0_s elem_size, %(basereg+0), 5, mem_operand
2326 - pixld0_s elem_size, %(basereg+0), 6, mem_operand
2327 - pixld0_s elem_size, %(basereg+0), 7, mem_operand
2328 + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
2329 + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
2330 + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
2331 + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
2332 .endif
2333 -.elseif numbytes == 2
2334 - .if elem_size == 16
2335 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2336 +.elseif \numbytes == 2
2337 + .if \elem_size == 16
2338 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2339 .else
2340 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2341 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2342 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2343 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2344 .endif
2345 -.elseif numbytes == 1
2346 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2347 +.elseif \numbytes == 1
2348 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2349 .else
2350 - .error "unsupported size: numbytes"
2351 + .error "unsupported size: \numbytes"
2352 .endif
2353 .endm
2355 .macro pixld_s numpix, bpp, basereg, mem_operand
2356 -.if bpp > 0
2357 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
2358 +.if \bpp > 0
2359 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
2360 .endif
2361 .endm
2363 .macro vuzp8 reg1, reg2
2364 - vuzp.8 d&reg1, d&reg2
2365 + vuzp.8 d\()\reg1, d\()\reg2
2366 .endm
2368 .macro vzip8 reg1, reg2
2369 - vzip.8 d&reg1, d&reg2
2370 + vzip.8 d\()\reg1, d\()\reg2
2371 .endm
2373 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2374 .macro pixdeinterleave bpp, basereg
2375 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2376 - vuzp8 %(basereg+0), %(basereg+1)
2377 - vuzp8 %(basereg+2), %(basereg+3)
2378 - vuzp8 %(basereg+1), %(basereg+3)
2379 - vuzp8 %(basereg+0), %(basereg+2)
2380 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2381 + vuzp8 %(\basereg+0), %(\basereg+1)
2382 + vuzp8 %(\basereg+2), %(\basereg+3)
2383 + vuzp8 %(\basereg+1), %(\basereg+3)
2384 + vuzp8 %(\basereg+0), %(\basereg+2)
2385 .endif
2386 .endm
2388 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2389 .macro pixinterleave bpp, basereg
2390 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2391 - vzip8 %(basereg+0), %(basereg+2)
2392 - vzip8 %(basereg+1), %(basereg+3)
2393 - vzip8 %(basereg+2), %(basereg+3)
2394 - vzip8 %(basereg+0), %(basereg+1)
2395 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2396 + vzip8 %(\basereg+0), %(\basereg+2)
2397 + vzip8 %(\basereg+1), %(\basereg+3)
2398 + vzip8 %(\basereg+2), %(\basereg+3)
2399 + vzip8 %(\basereg+0), %(\basereg+1)
2400 .endif
2401 .endm
2404 * This is a macro for implementing cache preload. The main idea is that
2405 * cache preload logic is mostly independent from the rest of pixels
2406 * processing code. It starts at the top left pixel and moves forward
2407 * across pixels and can jump across scanlines. Prefetch distance is
2408 @@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
2409 * for almost zero cost!
2411 * (*) The overhead of the prefetcher is visible when running some trivial
2412 * pixels processing like simple copy. Anyway, having prefetch is a must
2413 * when working with the graphics data.
2415 .macro PF a, x:vararg
2416 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
2417 - a x
2418 + \a \x
2419 .endif
2420 .endm
2422 .macro cache_preload std_increment, boost_increment
2423 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
2424 .if regs_shortage
2425 - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
2426 + PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
2427 .endif
2428 -.if std_increment != 0
2429 - PF add PF_X, PF_X, #std_increment
2430 +.if \std_increment != 0
2431 + PF add, PF_X, PF_X, #\std_increment
2432 .endif
2433 - PF tst PF_CTL, #0xF
2434 - PF addne PF_X, PF_X, #boost_increment
2435 - PF subne PF_CTL, PF_CTL, #1
2436 - PF cmp PF_X, ORIG_W
2437 + PF tst, PF_CTL, #0xF
2438 + PF addne, PF_X, PF_X, #\boost_increment
2439 + PF subne, PF_CTL, PF_CTL, #1
2440 + PF cmp, PF_X, ORIG_W
2441 .if src_bpp_shift >= 0
2442 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2443 .endif
2444 .if dst_r_bpp != 0
2445 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
2446 .endif
2447 .if mask_bpp_shift >= 0
2448 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
2449 .endif
2450 - PF subge PF_X, PF_X, ORIG_W
2451 - PF subges PF_CTL, PF_CTL, #0x10
2452 + PF subge, PF_X, PF_X, ORIG_W
2453 + PF subges, PF_CTL, PF_CTL, #0x10
2454 .if src_bpp_shift >= 0
2455 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2456 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2457 .endif
2458 .if dst_r_bpp != 0
2459 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2460 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2461 .endif
2462 .if mask_bpp_shift >= 0
2463 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2464 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2465 .endif
2466 .endif
2467 .endm
2469 .macro cache_preload_simple
2470 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
2471 .if src_bpp > 0
2472 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
2473 @@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
2474 .macro ensure_destination_ptr_alignment process_pixblock_head, \
2475 process_pixblock_tail, \
2476 process_pixblock_tail_head
2477 .if dst_w_bpp != 24
2478 tst DST_R, #0xF
2479 beq 2f
2481 .irp lowbit, 1, 2, 4, 8, 16
2482 +#ifndef __clang__
2483 local skip1
2484 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
2485 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
2486 - tst DST_R, #lowbit
2487 +#endif
2488 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
2489 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
2490 + tst DST_R, #\lowbit
2491 beq 1f
2492 .endif
2493 - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
2494 - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
2495 + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
2496 + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
2497 .if dst_r_bpp > 0
2498 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
2499 + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
2500 .else
2501 - add DST_R, DST_R, #lowbit
2502 + add DST_R, DST_R, #\lowbit
2503 .endif
2504 - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
2505 - sub W, W, #(lowbit * 8 / dst_w_bpp)
2506 + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
2507 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
2509 .endif
2510 .endr
2511 pixdeinterleave src_bpp, src_basereg
2512 pixdeinterleave mask_bpp, mask_basereg
2513 pixdeinterleave dst_r_bpp, dst_r_basereg
2515 - process_pixblock_head
2516 + \process_pixblock_head
2517 cache_preload 0, pixblock_size
2518 cache_preload_simple
2519 - process_pixblock_tail
2520 + \process_pixblock_tail
2522 pixinterleave dst_w_bpp, dst_w_basereg
2523 .irp lowbit, 1, 2, 4, 8, 16
2524 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
2525 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
2526 - tst DST_W, #lowbit
2527 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
2528 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
2529 + tst DST_W, #\lowbit
2530 beq 1f
2531 .endif
2532 - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
2533 + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
2535 .endif
2536 .endr
2537 .endif
2539 .endm
2542 @@ -525,51 +527,51 @@ 2:
2543 .macro process_trailing_pixels cache_preload_flag, \
2544 dst_aligned_flag, \
2545 process_pixblock_head, \
2546 process_pixblock_tail, \
2547 process_pixblock_tail_head
2548 tst W, #(pixblock_size - 1)
2549 beq 2f
2550 .irp chunk_size, 16, 8, 4, 2, 1
2551 -.if pixblock_size > chunk_size
2552 - tst W, #chunk_size
2553 +.if pixblock_size > \chunk_size
2554 + tst W, #\chunk_size
2555 beq 1f
2556 - pixld_src chunk_size, src_bpp, src_basereg, SRC
2557 - pixld chunk_size, mask_bpp, mask_basereg, MASK
2558 -.if dst_aligned_flag != 0
2559 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2560 + pixld_src \chunk_size, src_bpp, src_basereg, SRC
2561 + pixld \chunk_size, mask_bpp, mask_basereg, MASK
2562 +.if \dst_aligned_flag != 0
2563 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2564 .else
2565 - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2566 + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2567 .endif
2568 -.if cache_preload_flag != 0
2569 - PF add PF_X, PF_X, #chunk_size
2570 +.if \cache_preload_flag != 0
2571 + PF add, PF_X, PF_X, #\chunk_size
2572 .endif
2574 .endif
2575 .endr
2576 pixdeinterleave src_bpp, src_basereg
2577 pixdeinterleave mask_bpp, mask_basereg
2578 pixdeinterleave dst_r_bpp, dst_r_basereg
2580 - process_pixblock_head
2581 -.if cache_preload_flag != 0
2582 + \process_pixblock_head
2583 +.if \cache_preload_flag != 0
2584 cache_preload 0, pixblock_size
2585 cache_preload_simple
2586 .endif
2587 - process_pixblock_tail
2588 + \process_pixblock_tail
2589 pixinterleave dst_w_bpp, dst_w_basereg
2590 .irp chunk_size, 16, 8, 4, 2, 1
2591 -.if pixblock_size > chunk_size
2592 - tst W, #chunk_size
2593 +.if pixblock_size > \chunk_size
2594 + tst W, #\chunk_size
2595 beq 1f
2596 -.if dst_aligned_flag != 0
2597 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2598 +.if \dst_aligned_flag != 0
2599 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2600 .else
2601 - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2602 + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2603 .endif
2605 .endif
2606 .endr
2608 .endm
2611 @@ -599,17 +601,17 @@ 2:
2612 .if (mask_bpp != 24) && (mask_bpp != 0)
2613 sub MASK, MASK, W, lsl #mask_bpp_shift
2614 .endif
2615 subs H, H, #1
2616 mov DST_R, DST_W
2617 .if regs_shortage
2618 str H, [sp, #4] /* save updated height to stack */
2619 .endif
2620 - bge start_of_loop_label
2621 + bge \start_of_loop_label
2622 .endm
2625 * Registers are allocated in the following way by default:
2626 * d0, d1, d2, d3 - reserved for loading source pixel data
2627 * d4, d5, d6, d7 - reserved for loading destination pixel data
2628 * d24, d25, d26, d27 - reserved for loading mask pixel data
2629 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
2630 @@ -626,48 +628,48 @@ 2:
2631 process_pixblock_head, \
2632 process_pixblock_tail, \
2633 process_pixblock_tail_head, \
2634 dst_w_basereg_ = 28, \
2635 dst_r_basereg_ = 4, \
2636 src_basereg_ = 0, \
2637 mask_basereg_ = 24
2639 - pixman_asm_function fname
2640 + pixman_asm_function \fname
2642 push {r4-r12, lr} /* save all registers */
2645 * Select prefetch type for this function. If prefetch distance is
2646 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
2647 * has to be used instead of ADVANCED.
2649 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
2650 -.if prefetch_distance == 0
2651 +.if \prefetch_distance == 0
2652 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
2653 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
2654 - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
2655 + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
2656 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
2657 .endif
2660 * Make some macro arguments globally visible and accessible
2661 * from other macros
2663 - .set src_bpp, src_bpp_
2664 - .set mask_bpp, mask_bpp_
2665 - .set dst_w_bpp, dst_w_bpp_
2666 - .set pixblock_size, pixblock_size_
2667 - .set dst_w_basereg, dst_w_basereg_
2668 - .set dst_r_basereg, dst_r_basereg_
2669 - .set src_basereg, src_basereg_
2670 - .set mask_basereg, mask_basereg_
2671 + .set src_bpp, \src_bpp_
2672 + .set mask_bpp, \mask_bpp_
2673 + .set dst_w_bpp, \dst_w_bpp_
2674 + .set pixblock_size, \pixblock_size_
2675 + .set dst_w_basereg, \dst_w_basereg_
2676 + .set dst_r_basereg, \dst_r_basereg_
2677 + .set src_basereg, \src_basereg_
2678 + .set mask_basereg, \mask_basereg_
2680 .macro pixld_src x:vararg
2681 - pixld x
2682 + pixld \x
2683 .endm
2684 .macro fetch_src_pixblock
2685 pixld_src pixblock_size, src_bpp, \
2686 (src_basereg - pixblock_size * src_bpp / 64), SRC
2687 .endm
2689 * Assign symbolic names to registers
2691 @@ -750,38 +752,38 @@ 2:
2692 .elseif dst_w_bpp == 16
2693 .set dst_bpp_shift, 1
2694 .elseif dst_w_bpp == 8
2695 .set dst_bpp_shift, 0
2696 .else
2697 .error "requested dst bpp (dst_w_bpp) is not supported"
2698 .endif
2700 -.if (((flags) & FLAG_DST_READWRITE) != 0)
2701 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
2702 .set dst_r_bpp, dst_w_bpp
2703 .else
2704 .set dst_r_bpp, 0
2705 .endif
2706 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2707 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2708 .set DEINTERLEAVE_32BPP_ENABLED, 1
2709 .else
2710 .set DEINTERLEAVE_32BPP_ENABLED, 0
2711 .endif
2713 -.if prefetch_distance < 0 || prefetch_distance > 15
2714 - .error "invalid prefetch distance (prefetch_distance)"
2715 +.if \prefetch_distance < 0 || \prefetch_distance > 15
2716 + .error "invalid prefetch distance (\prefetch_distance)"
2717 .endif
2719 .if src_bpp > 0
2720 ldr SRC, [sp, #40]
2721 .endif
2722 .if mask_bpp > 0
2723 ldr MASK, [sp, #48]
2724 .endif
2725 - PF mov PF_X, #0
2726 + PF mov, PF_X, #0
2727 .if src_bpp > 0
2728 ldr SRC_STRIDE, [sp, #44]
2729 .endif
2730 .if mask_bpp > 0
2731 ldr MASK_STRIDE, [sp, #52]
2732 .endif
2733 mov DST_R, DST_W
2735 @@ -796,24 +798,24 @@ 2:
2736 .if dst_w_bpp == 24
2737 sub DST_STRIDE, DST_STRIDE, W
2738 sub DST_STRIDE, DST_STRIDE, W, lsl #1
2739 .endif
2742 * Setup advanced prefetcher initial state
2744 - PF mov PF_SRC, SRC
2745 - PF mov PF_DST, DST_R
2746 - PF mov PF_MASK, MASK
2747 + PF mov, PF_SRC, SRC
2748 + PF mov, PF_DST, DST_R
2749 + PF mov, PF_MASK, MASK
2750 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
2751 - PF mov PF_CTL, H, lsl #4
2752 - PF add PF_CTL, #(prefetch_distance - 0x10)
2753 + PF mov, PF_CTL, H, lsl #4
2754 + PF add, PF_CTL, #(\prefetch_distance - 0x10)
2756 - init
2757 + \init
2758 .if regs_shortage
2759 push {r0, r1}
2760 .endif
2761 subs H, H, #1
2762 .if regs_shortage
2763 str H, [sp, #4] /* save updated height to stack */
2764 .else
2765 mov ORIG_W, W
2766 @@ -821,84 +823,84 @@ 2:
2767 blt 9f
2768 cmp W, #(pixblock_size * 2)
2769 blt 8f
2771 * This is the start of the pipelined loop, which if optimized for
2772 * long scanlines
2775 - ensure_destination_ptr_alignment process_pixblock_head, \
2776 - process_pixblock_tail, \
2777 - process_pixblock_tail_head
2778 + ensure_destination_ptr_alignment \process_pixblock_head, \
2779 + \process_pixblock_tail, \
2780 + \process_pixblock_tail_head
2782 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
2783 pixld_a pixblock_size, dst_r_bpp, \
2784 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
2785 fetch_src_pixblock
2786 pixld pixblock_size, mask_bpp, \
2787 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
2788 - PF add PF_X, PF_X, #pixblock_size
2789 - process_pixblock_head
2790 + PF add, PF_X, PF_X, #pixblock_size
2791 + \process_pixblock_head
2792 cache_preload 0, pixblock_size
2793 cache_preload_simple
2794 subs W, W, #(pixblock_size * 2)
2795 blt 2f
2797 - process_pixblock_tail_head
2798 + \process_pixblock_tail_head
2799 cache_preload_simple
2800 subs W, W, #pixblock_size
2801 bge 1b
2803 - process_pixblock_tail
2804 + \process_pixblock_tail
2805 pixst_a pixblock_size, dst_w_bpp, \
2806 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
2808 /* Process the remaining trailing pixels in the scanline */
2809 process_trailing_pixels 1, 1, \
2810 - process_pixblock_head, \
2811 - process_pixblock_tail, \
2812 - process_pixblock_tail_head
2813 + \process_pixblock_head, \
2814 + \process_pixblock_tail, \
2815 + \process_pixblock_tail_head
2816 advance_to_next_scanline 0b
2818 .if regs_shortage
2819 pop {r0, r1}
2820 .endif
2821 - cleanup
2822 + \cleanup
2823 pop {r4-r12, pc} /* exit */
2825 * This is the start of the loop, designed to process images with small width
2826 * (less than pixblock_size * 2 pixels). In this case neither pipelining
2827 * nor prefetch are used.
2830 /* Process exactly pixblock_size pixels if needed */
2831 tst W, #pixblock_size
2832 beq 1f
2833 pixld pixblock_size, dst_r_bpp, \
2834 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
2835 fetch_src_pixblock
2836 pixld pixblock_size, mask_bpp, \
2837 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
2838 - process_pixblock_head
2839 - process_pixblock_tail
2840 + \process_pixblock_head
2841 + \process_pixblock_tail
2842 pixst pixblock_size, dst_w_bpp, \
2843 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
2845 /* Process the remaining trailing pixels in the scanline */
2846 process_trailing_pixels 0, 0, \
2847 - process_pixblock_head, \
2848 - process_pixblock_tail, \
2849 - process_pixblock_tail_head
2850 + \process_pixblock_head, \
2851 + \process_pixblock_tail, \
2852 + \process_pixblock_tail_head
2853 advance_to_next_scanline 8b
2855 .if regs_shortage
2856 pop {r0, r1}
2857 .endif
2858 - cleanup
2859 + \cleanup
2860 pop {r4-r12, pc} /* exit */
2862 .purgem fetch_src_pixblock
2863 .purgem pixld_src
2865 .unreq SRC
2866 .unreq MASK
2867 .unreq DST_R
2868 @@ -910,17 +912,17 @@ 9:
2869 .unreq DST_STRIDE
2870 .unreq MASK_STRIDE
2871 .unreq PF_CTL
2872 .unreq PF_X
2873 .unreq PF_SRC
2874 .unreq PF_DST
2875 .unreq PF_MASK
2876 .unreq DUMMY
2877 - .endfunc
2878 + pixman_end_asm_function
2879 .endm
2882 * A simplified variant of function generation template for a single
2883 * scanline processing (for implementing pixman combine functions)
2885 .macro generate_composite_function_scanline use_nearest_scaling, \
2886 fname, \
2887 @@ -934,49 +936,49 @@ 9:
2888 process_pixblock_head, \
2889 process_pixblock_tail, \
2890 process_pixblock_tail_head, \
2891 dst_w_basereg_ = 28, \
2892 dst_r_basereg_ = 4, \
2893 src_basereg_ = 0, \
2894 mask_basereg_ = 24
2896 - pixman_asm_function fname
2897 + pixman_asm_function \fname
2899 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
2901 * Make some macro arguments globally visible and accessible
2902 * from other macros
2904 - .set src_bpp, src_bpp_
2905 - .set mask_bpp, mask_bpp_
2906 - .set dst_w_bpp, dst_w_bpp_
2907 - .set pixblock_size, pixblock_size_
2908 - .set dst_w_basereg, dst_w_basereg_
2909 - .set dst_r_basereg, dst_r_basereg_
2910 - .set src_basereg, src_basereg_
2911 - .set mask_basereg, mask_basereg_
2912 + .set src_bpp, \src_bpp_
2913 + .set mask_bpp, \mask_bpp_
2914 + .set dst_w_bpp, \dst_w_bpp_
2915 + .set pixblock_size, \pixblock_size_
2916 + .set dst_w_basereg, \dst_w_basereg_
2917 + .set dst_r_basereg, \dst_r_basereg_
2918 + .set src_basereg, \src_basereg_
2919 + .set mask_basereg, \mask_basereg_
2921 -.if use_nearest_scaling != 0
2922 +.if \use_nearest_scaling != 0
2924 * Assign symbolic names to registers for nearest scaling
2926 W .req r0
2927 DST_W .req r1
2928 SRC .req r2
2929 VX .req r3
2930 UNIT_X .req ip
2931 MASK .req lr
2932 TMP1 .req r4
2933 TMP2 .req r5
2934 DST_R .req r6
2935 SRC_WIDTH_FIXED .req r7
2937 .macro pixld_src x:vararg
2938 - pixld_s x
2939 + pixld_s \x
2940 .endm
2942 ldr UNIT_X, [sp]
2943 push {r4-r8, lr}
2944 ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
2945 .if mask_bpp != 0
2946 ldr MASK, [sp, #(24 + 8)]
2947 .endif
2948 @@ -986,89 +988,89 @@ 9:
2950 W .req r0 /* width (is updated during processing) */
2951 DST_W .req r1 /* destination buffer pointer for writes */
2952 SRC .req r2 /* source buffer pointer */
2953 DST_R .req ip /* destination buffer pointer for reads */
2954 MASK .req r3 /* mask pointer */
2956 .macro pixld_src x:vararg
2957 - pixld x
2958 + pixld \x
2959 .endm
2960 .endif
2962 -.if (((flags) & FLAG_DST_READWRITE) != 0)
2963 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
2964 .set dst_r_bpp, dst_w_bpp
2965 .else
2966 .set dst_r_bpp, 0
2967 .endif
2968 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2969 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2970 .set DEINTERLEAVE_32BPP_ENABLED, 1
2971 .else
2972 .set DEINTERLEAVE_32BPP_ENABLED, 0
2973 .endif
2975 .macro fetch_src_pixblock
2976 pixld_src pixblock_size, src_bpp, \
2977 (src_basereg - pixblock_size * src_bpp / 64), SRC
2978 .endm
2980 - init
2981 + \init
2982 mov DST_R, DST_W
2984 cmp W, #pixblock_size
2985 blt 8f
2987 - ensure_destination_ptr_alignment process_pixblock_head, \
2988 - process_pixblock_tail, \
2989 - process_pixblock_tail_head
2990 + ensure_destination_ptr_alignment \process_pixblock_head, \
2991 + \process_pixblock_tail, \
2992 + \process_pixblock_tail_head
2994 subs W, W, #pixblock_size
2995 blt 7f
2997 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
2998 pixld_a pixblock_size, dst_r_bpp, \
2999 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3000 fetch_src_pixblock
3001 pixld pixblock_size, mask_bpp, \
3002 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3003 - process_pixblock_head
3004 + \process_pixblock_head
3005 subs W, W, #pixblock_size
3006 blt 2f
3008 - process_pixblock_tail_head
3009 + \process_pixblock_tail_head
3010 subs W, W, #pixblock_size
3011 bge 1b
3013 - process_pixblock_tail
3014 + \process_pixblock_tail
3015 pixst_a pixblock_size, dst_w_bpp, \
3016 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3018 /* Process the remaining trailing pixels in the scanline (dst aligned) */
3019 process_trailing_pixels 0, 1, \
3020 - process_pixblock_head, \
3021 - process_pixblock_tail, \
3022 - process_pixblock_tail_head
3023 + \process_pixblock_head, \
3024 + \process_pixblock_tail, \
3025 + \process_pixblock_tail_head
3027 - cleanup
3028 -.if use_nearest_scaling != 0
3029 + \cleanup
3030 +.if \use_nearest_scaling != 0
3031 pop {r4-r8, pc} /* exit */
3032 .else
3033 bx lr /* exit */
3034 .endif
3036 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
3037 process_trailing_pixels 0, 0, \
3038 - process_pixblock_head, \
3039 - process_pixblock_tail, \
3040 - process_pixblock_tail_head
3041 + \process_pixblock_head, \
3042 + \process_pixblock_tail, \
3043 + \process_pixblock_tail_head
3045 - cleanup
3046 + \cleanup
3048 -.if use_nearest_scaling != 0
3049 +.if \use_nearest_scaling != 0
3050 pop {r4-r8, pc} /* exit */
3052 .unreq DST_R
3053 .unreq SRC
3054 .unreq W
3055 .unreq VX
3056 .unreq UNIT_X
3057 .unreq TMP1
3058 @@ -1085,25 +1087,25 @@ 8:
3059 .unreq DST_R
3060 .unreq DST_W
3061 .unreq W
3062 .endif
3064 .purgem fetch_src_pixblock
3065 .purgem pixld_src
3067 - .endfunc
3068 + pixman_end_asm_function
3069 .endm
3071 .macro generate_composite_function_single_scanline x:vararg
3072 - generate_composite_function_scanline 0, x
3073 + generate_composite_function_scanline 0, \x
3074 .endm
3076 .macro generate_composite_function_nearest_scanline x:vararg
3077 - generate_composite_function_scanline 1, x
3078 + generate_composite_function_scanline 1, \x
3079 .endm
3081 /* Default prologue/epilogue, nothing special needs to be done */
3083 .macro default_init
3084 .endm
3086 .macro default_cleanup
3087 @@ -1129,56 +1131,56 @@ 8:
3088 * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
3089 * into a planar a8r8g8b8 format (with a, r, g, b color components
3090 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
3092 * Warning: the conversion is destructive and the original
3093 * value (in) is lost.
3095 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
3096 - vshrn.u16 out_r, in, #8
3097 - vshrn.u16 out_g, in, #3
3098 - vsli.u16 in, in, #5
3099 - vmov.u8 out_a, #255
3100 - vsri.u8 out_r, out_r, #5
3101 - vsri.u8 out_g, out_g, #6
3102 - vshrn.u16 out_b, in, #2
3103 + vshrn.u16 \out_r, \in, #8
3104 + vshrn.u16 \out_g, \in, #3
3105 + vsli.u16 \in, \in, #5
3106 + vmov.u8 \out_a, #255
3107 + vsri.u8 \out_r, \out_r, #5
3108 + vsri.u8 \out_g, \out_g, #6
3109 + vshrn.u16 \out_b, \in, #2
3110 .endm
3112 .macro convert_0565_to_x888 in, out_r, out_g, out_b
3113 - vshrn.u16 out_r, in, #8
3114 - vshrn.u16 out_g, in, #3
3115 - vsli.u16 in, in, #5
3116 - vsri.u8 out_r, out_r, #5
3117 - vsri.u8 out_g, out_g, #6
3118 - vshrn.u16 out_b, in, #2
3119 + vshrn.u16 \out_r, \in, #8
3120 + vshrn.u16 \out_g, \in, #3
3121 + vsli.u16 \in, \in, #5
3122 + vsri.u8 \out_r, \out_r, #5
3123 + vsri.u8 \out_g, \out_g, #6
3124 + vshrn.u16 \out_b, \in, #2
3125 .endm
3128 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
3129 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
3130 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
3131 * registers (tmp1, tmp2)
3133 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
3134 - vshll.u8 tmp1, in_g, #8
3135 - vshll.u8 out, in_r, #8
3136 - vshll.u8 tmp2, in_b, #8
3137 - vsri.u16 out, tmp1, #5
3138 - vsri.u16 out, tmp2, #11
3139 + vshll.u8 \tmp1, \in_g, #8
3140 + vshll.u8 \out, \in_r, #8
3141 + vshll.u8 \tmp2, \in_b, #8
3142 + vsri.u16 \out, \tmp1, #5
3143 + vsri.u16 \out, \tmp2, #11
3144 .endm
3147 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
3148 * returned in (out0, out1) registers pair. Requires one temporary
3149 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
3150 * value from 'in' is lost
3152 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
3153 - vshl.u16 out0, in, #5 /* G top 6 bits */
3154 - vshl.u16 tmp, in, #11 /* B top 5 bits */
3155 - vsri.u16 in, in, #5 /* R is ready in top bits */
3156 - vsri.u16 out0, out0, #6 /* G is ready in top bits */
3157 - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
3158 - vshr.u16 out1, in, #8 /* R is in place */
3159 - vsri.u16 out0, tmp, #8 /* G & B is in place */
3160 - vzip.u16 out0, out1 /* everything is in place */
3161 + vshl.u16 \out0, \in, #5 /* G top 6 bits */
3162 + vshl.u16 \tmp, \in, #11 /* B top 5 bits */
3163 + vsri.u16 \in, \in, #5 /* R is ready in top bits */
3164 + vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
3165 + vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
3166 + vshr.u16 \out1, \in, #8 /* R is in place */
3167 + vsri.u16 \out0, \tmp, #8 /* G & B is in place */
3168 + vzip.u16 \out0, \out1 /* everything is in place */
3169 .endm
3170 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3171 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3172 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3173 @@ -20,16 +20,20 @@
3174 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
3175 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
3176 * SOFTWARE.
3178 * Author: Jeff Muizelaar (jeff@infidigm.net)
3182 +#ifdef __clang__
3183 +#define subpls subspl
3184 +#endif
3186 /* Prevent the stack from becoming executable */
3187 #if defined(__linux__) && defined(__ELF__)
3188 .section .note.GNU-stack,"",%progbits
3189 #endif
3191 .text
3192 .arch armv6
3193 .object_arch armv4
3194 @@ -57,100 +61,105 @@
3195 * prefetch_braking_distance - stop prefetching when that many pixels are
3196 * remaining before the end of scanline
3199 .macro generate_nearest_scanline_func fname, bpp_shift, t, \
3200 prefetch_distance, \
3201 prefetch_braking_distance
3203 -pixman_asm_function fname
3204 +pixman_asm_function \fname
3205 W .req r0
3206 DST .req r1
3207 SRC .req r2
3208 VX .req r3
3209 UNIT_X .req ip
3210 TMP1 .req r4
3211 TMP2 .req r5
3212 VXMASK .req r6
3213 PF_OFFS .req r7
3214 SRC_WIDTH_FIXED .req r8
3216 ldr UNIT_X, [sp]
3217 push {r4, r5, r6, r7, r8, r10}
3218 - mvn VXMASK, #((1 << bpp_shift) - 1)
3219 + mvn VXMASK, #((1 << \bpp_shift) - 1)
3220 ldr SRC_WIDTH_FIXED, [sp, #28]
3222 /* define helper macro */
3223 .macro scale_2_pixels
3224 - ldr&t TMP1, [SRC, TMP1]
3225 - and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
3226 + ldr\()\t TMP1, [SRC, TMP1]
3227 + and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
3228 adds VX, VX, UNIT_X
3229 - str&t TMP1, [DST], #(1 << bpp_shift)
3230 + str\()\t TMP1, [DST], #(1 << \bpp_shift)
3231 9: subpls VX, VX, SRC_WIDTH_FIXED
3232 bpl 9b
3234 - ldr&t TMP2, [SRC, TMP2]
3235 - and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
3236 + ldr\()\t TMP2, [SRC, TMP2]
3237 + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
3238 adds VX, VX, UNIT_X
3239 - str&t TMP2, [DST], #(1 << bpp_shift)
3240 + str\()\t TMP2, [DST], #(1 << \bpp_shift)
3241 9: subpls VX, VX, SRC_WIDTH_FIXED
3242 bpl 9b
3243 .endm
3245 /* now do the scaling */
3246 - and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
3247 + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
3248 adds VX, VX, UNIT_X
3249 9: subpls VX, VX, SRC_WIDTH_FIXED
3250 bpl 9b
3251 - subs W, W, #(8 + prefetch_braking_distance)
3252 + subs W, W, #(8 + \prefetch_braking_distance)
3253 blt 2f
3254 /* calculate prefetch offset */
3255 - mov PF_OFFS, #prefetch_distance
3256 + mov PF_OFFS, #\prefetch_distance
3257 mla PF_OFFS, UNIT_X, PF_OFFS, VX
3258 1: /* main loop, process 8 pixels per iteration with prefetch */
3259 - pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
3260 + pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
3261 add PF_OFFS, UNIT_X, lsl #3
3262 scale_2_pixels
3263 scale_2_pixels
3264 scale_2_pixels
3265 scale_2_pixels
3266 subs W, W, #8
3267 bge 1b
3269 - subs W, W, #(4 - 8 - prefetch_braking_distance)
3270 + subs W, W, #(4 - 8 - \prefetch_braking_distance)
3271 blt 2f
3272 1: /* process the remaining pixels */
3273 scale_2_pixels
3274 scale_2_pixels
3275 subs W, W, #4
3276 bge 1b
3278 tst W, #2
3279 beq 2f
3280 scale_2_pixels
3282 tst W, #1
3283 - ldrne&t TMP1, [SRC, TMP1]
3284 - strne&t TMP1, [DST]
3285 +#ifdef __clang__
3286 + ldr\()\t\()ne TMP1, [SRC, TMP1]
3287 + str\()\t\()ne TMP1, [DST]
3288 +#else
3289 + ldrne\()\t TMP1, [SRC, TMP1]
3290 + strne\()\t TMP1, [DST]
3291 +#endif
3292 /* cleanup helper macro */
3293 .purgem scale_2_pixels
3294 .unreq DST
3295 .unreq SRC
3296 .unreq W
3297 .unreq VX
3298 .unreq UNIT_X
3299 .unreq TMP1
3300 .unreq TMP2
3301 .unreq VXMASK
3302 .unreq PF_OFFS
3303 .unreq SRC_WIDTH_FIXED
3304 /* return */
3305 pop {r4, r5, r6, r7, r8, r10}
3306 bx lr
3307 -.endfunc
3308 +pixman_end_asm_function
3309 .endm
3311 generate_nearest_scanline_func \
3312 pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
3314 generate_nearest_scanline_func \
3315 pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
3316 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3317 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3318 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3319 @@ -20,16 +20,21 @@
3320 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
3321 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
3322 * SOFTWARE.
3324 * Author: Ben Avison (bavison@riscosopen.org)
3328 +#ifdef __clang__
3329 +#define adceqs adcseq
3330 +#define ldmnedb ldmdbne
3331 +#endif
3333 /* Prevent the stack from becoming executable */
3334 #if defined(__linux__) && defined(__ELF__)
3335 .section .note.GNU-stack,"",%progbits
3336 #endif
3338 .text
3339 .arch armv6
3340 .object_arch armv4
3341 @@ -52,26 +57,26 @@
3342 * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
3345 .macro blit_init
3346 line_saved_regs STRIDE_D, STRIDE_S
3347 .endm
3349 .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3350 - pixld cond, numbytes, firstreg, SRC, unaligned_src
3351 + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
3352 .endm
3354 .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
3355 WK4 .req STRIDE_D
3356 WK5 .req STRIDE_S
3357 WK6 .req MASK
3358 WK7 .req STRIDE_M
3359 -110: pixld , 16, 0, SRC, unaligned_src
3360 - pixld , 16, 4, SRC, unaligned_src
3361 +110: pixld , 16, 0, SRC, \unaligned_src
3362 + pixld , 16, 4, SRC, \unaligned_src
3363 pld [SRC, SCRATCH]
3364 pixst , 16, 0, DST
3365 pixst , 16, 4, DST
3366 subs X, X, #32*8/src_bpp
3367 bhs 110b
3368 .unreq WK4
3369 .unreq WK5
3370 .unreq WK6
3371 @@ -137,17 +142,17 @@ generate_composite_function \
3372 mov STRIDE_M, SRC
3373 .endm
3375 .macro fill_process_tail cond, numbytes, firstreg
3376 WK4 .req SRC
3377 WK5 .req STRIDE_S
3378 WK6 .req MASK
3379 WK7 .req STRIDE_M
3380 - pixst cond, numbytes, 4, DST
3381 + pixst \cond, \numbytes, 4, DST
3382 .unreq WK4
3383 .unreq WK5
3384 .unreq WK6
3385 .unreq WK7
3386 .endm
3388 generate_composite_function \
3389 pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
3390 @@ -177,30 +182,30 @@ generate_composite_function \
3391 nop_macro, /* newline */ \
3392 nop_macro /* cleanup */ \
3393 nop_macro /* process head */ \
3394 fill_process_tail
3396 /******************************************************************************/
3398 .macro src_x888_8888_pixel, cond, reg
3399 - orr&cond WK&reg, WK&reg, #0xFF000000
3400 + orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
3401 .endm
3403 .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3404 - pixld cond, numbytes, firstreg, SRC, unaligned_src
3405 + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
3406 .endm
3408 .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
3409 - src_x888_8888_pixel cond, %(firstreg+0)
3410 - .if numbytes >= 8
3411 - src_x888_8888_pixel cond, %(firstreg+1)
3412 - .if numbytes == 16
3413 - src_x888_8888_pixel cond, %(firstreg+2)
3414 - src_x888_8888_pixel cond, %(firstreg+3)
3415 + src_x888_8888_pixel \cond, %(\firstreg+0)
3416 + .if \numbytes >= 8
3417 + src_x888_8888_pixel \cond, %(\firstreg+1)
3418 + .if \numbytes == 16
3419 + src_x888_8888_pixel \cond, %(\firstreg+2)
3420 + src_x888_8888_pixel \cond, %(\firstreg+3)
3421 .endif
3422 .endif
3423 .endm
3425 generate_composite_function \
3426 pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
3427 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
3428 3, /* prefetch distance */ \
3429 @@ -217,83 +222,83 @@ generate_composite_function \
3430 ldr MASK, =0x07E007E0
3431 mov STRIDE_M, #0xFF000000
3432 /* Set GE[3:0] to 1010 so SEL instructions do what we want */
3433 ldr SCRATCH, =0x80008000
3434 uadd8 SCRATCH, SCRATCH, SCRATCH
3435 .endm
3437 .macro src_0565_8888_2pixels, reg1, reg2
3438 - and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3439 - bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3440 - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3441 - mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
3442 - mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
3443 - bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
3444 - orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
3445 - orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
3446 - pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
3447 - sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
3448 - mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
3449 - pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
3450 - sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
3451 - orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
3452 - orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3453 + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3454 + bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3455 + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3456 + mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
3457 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
3458 + bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
3459 + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
3460 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
3461 + pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
3462 + sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
3463 + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
3464 + pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
3465 + sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
3466 + orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
3467 + orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3468 .endm
3470 /* This version doesn't need STRIDE_M, but is one instruction longer.
3471 It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
3472 - and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3473 - bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3474 - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3475 - mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
3476 - mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
3477 - bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
3478 - mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
3479 - mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3480 - orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
3481 - orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3482 - pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
3483 - pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3484 - sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
3485 - sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
3486 - orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3487 - orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3488 + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3489 + bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3490 + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3491 + mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
3492 + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
3493 + bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
3494 + mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
3495 + mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3496 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
3497 + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3498 + pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
3499 + pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3500 + sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
3501 + sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
3502 + orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3503 + orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3506 .macro src_0565_8888_1pixel, reg
3507 - bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
3508 - and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
3509 - mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3510 - mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
3511 - orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3512 - orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
3513 - pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3514 - sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
3515 - orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3516 + bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
3517 + and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
3518 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3519 + mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
3520 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3521 + orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
3522 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3523 + sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
3524 + orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3525 .endm
3527 .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3528 - .if numbytes == 16
3529 - pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
3530 - .elseif numbytes == 8
3531 - pixld , 4, firstreg, SRC, unaligned_src
3532 - .elseif numbytes == 4
3533 - pixld , 2, firstreg, SRC, unaligned_src
3534 + .if \numbytes == 16
3535 + pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
3536 + .elseif \numbytes == 8
3537 + pixld , 4, \firstreg, SRC, \unaligned_src
3538 + .elseif \numbytes == 4
3539 + pixld , 2, \firstreg, SRC, \unaligned_src
3540 .endif
3541 .endm
3543 .macro src_0565_8888_process_tail cond, numbytes, firstreg
3544 - .if numbytes == 16
3545 - src_0565_8888_2pixels firstreg, %(firstreg+1)
3546 - src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
3547 - .elseif numbytes == 8
3548 - src_0565_8888_2pixels firstreg, %(firstreg+1)
3549 + .if \numbytes == 16
3550 + src_0565_8888_2pixels \firstreg, %(\firstreg+1)
3551 + src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
3552 + .elseif \numbytes == 8
3553 + src_0565_8888_2pixels \firstreg, %(\firstreg+1)
3554 .else
3555 - src_0565_8888_1pixel firstreg
3556 + src_0565_8888_1pixel \firstreg
3557 .endif
3558 .endm
3560 generate_composite_function \
3561 pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
3562 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
3563 3, /* prefetch distance */ \
3564 src_0565_8888_init, \
3565 @@ -306,67 +311,67 @@ generate_composite_function \
3567 .macro src_x888_0565_init
3568 /* Hold loop invariant in MASK */
3569 ldr MASK, =0x001F001F
3570 line_saved_regs STRIDE_S, ORIG_W
3571 .endm
3573 .macro src_x888_0565_1pixel s, d
3574 - and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3575 - and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
3576 - orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3577 - orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3578 + and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3579 + and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
3580 + orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3581 + orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3582 /* Top 16 bits are discarded during the following STRH */
3583 .endm
3585 .macro src_x888_0565_2pixels slo, shi, d, tmp
3586 - and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
3587 - and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
3588 - and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3589 - orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
3590 - orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
3591 - and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
3592 - orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3593 - orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3594 - pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
3595 + and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
3596 + and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
3597 + and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3598 + orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
3599 + orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
3600 + and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
3601 + orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3602 + orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3603 + pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
3604 .endm
3606 .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3607 WK4 .req STRIDE_S
3608 WK5 .req STRIDE_M
3609 WK6 .req WK3
3610 WK7 .req ORIG_W
3611 - .if numbytes == 16
3612 + .if \numbytes == 16
3613 pixld , 16, 4, SRC, 0
3614 src_x888_0565_2pixels 4, 5, 0, 0
3615 pixld , 8, 4, SRC, 0
3616 src_x888_0565_2pixels 6, 7, 1, 1
3617 pixld , 8, 6, SRC, 0
3618 .else
3619 - pixld , numbytes*2, 4, SRC, 0
3620 + pixld , \numbytes*2, 4, SRC, 0
3621 .endif
3622 .endm
3624 .macro src_x888_0565_process_tail cond, numbytes, firstreg
3625 - .if numbytes == 16
3626 + .if \numbytes == 16
3627 src_x888_0565_2pixels 4, 5, 2, 2
3628 src_x888_0565_2pixels 6, 7, 3, 4
3629 - .elseif numbytes == 8
3630 + .elseif \numbytes == 8
3631 src_x888_0565_2pixels 4, 5, 1, 1
3632 src_x888_0565_2pixels 6, 7, 2, 2
3633 - .elseif numbytes == 4
3634 + .elseif \numbytes == 4
3635 src_x888_0565_2pixels 4, 5, 1, 1
3636 .else
3637 src_x888_0565_1pixel 4, 1
3638 .endif
3639 - .if numbytes == 16
3640 - pixst , numbytes, 0, DST
3641 + .if \numbytes == 16
3642 + pixst , \numbytes, 0, DST
3643 .else
3644 - pixst , numbytes, 1, DST
3645 + pixst , \numbytes, 1, DST
3646 .endif
3647 .unreq WK4
3648 .unreq WK5
3649 .unreq WK6
3650 .unreq WK7
3651 .endm
3653 generate_composite_function \
3654 @@ -377,47 +382,47 @@ generate_composite_function \
3655 nop_macro, /* newline */ \
3656 nop_macro, /* cleanup */ \
3657 src_x888_0565_process_head, \
3658 src_x888_0565_process_tail
3660 /******************************************************************************/
3662 .macro add_8_8_8pixels cond, dst1, dst2
3663 - uqadd8&cond WK&dst1, WK&dst1, MASK
3664 - uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
3665 + uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
3666 + uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
3667 .endm
3669 .macro add_8_8_4pixels cond, dst
3670 - uqadd8&cond WK&dst, WK&dst, MASK
3671 + uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
3672 .endm
3674 .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3675 WK4 .req MASK
3676 WK5 .req STRIDE_M
3677 - .if numbytes == 16
3678 - pixld cond, 8, 4, SRC, unaligned_src
3679 - pixld cond, 16, firstreg, DST, 0
3680 - add_8_8_8pixels cond, firstreg, %(firstreg+1)
3681 - pixld cond, 8, 4, SRC, unaligned_src
3682 + .if \numbytes == 16
3683 + pixld \cond, 8, 4, SRC, \unaligned_src
3684 + pixld \cond, 16, \firstreg, DST, 0
3685 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
3686 + pixld \cond, 8, 4, SRC, \unaligned_src
3687 .else
3688 - pixld cond, numbytes, 4, SRC, unaligned_src
3689 - pixld cond, numbytes, firstreg, DST, 0
3690 + pixld \cond, \numbytes, 4, SRC, \unaligned_src
3691 + pixld \cond, \numbytes, \firstreg, DST, 0
3692 .endif
3693 .unreq WK4
3694 .unreq WK5
3695 .endm
3697 .macro add_8_8_process_tail cond, numbytes, firstreg
3698 - .if numbytes == 16
3699 - add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
3700 - .elseif numbytes == 8
3701 - add_8_8_8pixels cond, firstreg, %(firstreg+1)
3702 + .if \numbytes == 16
3703 + add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
3704 + .elseif \numbytes == 8
3705 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
3706 .else
3707 - add_8_8_4pixels cond, firstreg
3708 + add_8_8_4pixels \cond, \firstreg
3709 .endif
3710 .endm
3712 generate_composite_function \
3713 pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
3714 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
3715 2, /* prefetch distance */ \
3716 nop_macro, /* init */ \
3717 @@ -436,82 +441,82 @@ generate_composite_function \
3718 line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
3719 .endm
3721 .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3722 WK4 .req STRIDE_D
3723 WK5 .req STRIDE_S
3724 WK6 .req STRIDE_M
3725 WK7 .req ORIG_W
3726 - pixld , numbytes, %(4+firstreg), SRC, unaligned_src
3727 - pixld , numbytes, firstreg, DST, 0
3728 + pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
3729 + pixld , \numbytes, \firstreg, DST, 0
3730 .unreq WK4
3731 .unreq WK5
3732 .unreq WK6
3733 .unreq WK7
3734 .endm
3736 .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
3737 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
3738 - teq WK&reg0, #0
3739 - .if numbytes > 4
3740 - teqeq WK&reg1, #0
3741 - .if numbytes > 8
3742 - teqeq WK&reg2, #0
3743 - teqeq WK&reg3, #0
3744 + teq WK\()\reg0, #0
3745 + .if \numbytes > 4
3746 + teqeq WK\()\reg1, #0
3747 + .if \numbytes > 8
3748 + teqeq WK\()\reg2, #0
3749 + teqeq WK\()\reg3, #0
3750 .endif
3751 .endif
3752 .endm
3754 .macro over_8888_8888_prepare next
3755 - mov WK&next, WK&next, lsr #24
3756 + mov WK\()\next, WK\()\next, lsr #24
3757 .endm
3759 .macro over_8888_8888_1pixel src, dst, offset, next
3760 /* src = destination component multiplier */
3761 - rsb WK&src, WK&src, #255
3762 + rsb WK\()\src, WK\()\src, #255
3763 /* Split even/odd bytes of dst into SCRATCH/dst */
3764 - uxtb16 SCRATCH, WK&dst
3765 - uxtb16 WK&dst, WK&dst, ror #8
3766 + uxtb16 SCRATCH, WK\()\dst
3767 + uxtb16 WK\()\dst, WK\()\dst, ror #8
3768 /* Multiply through, adding 0.5 to the upper byte of result for rounding */
3769 - mla SCRATCH, SCRATCH, WK&src, MASK
3770 - mla WK&dst, WK&dst, WK&src, MASK
3771 + mla SCRATCH, SCRATCH, WK\()\src, MASK
3772 + mla WK\()\dst, WK\()\dst, WK\()\src, MASK
3773 /* Where we would have had a stall between the result of the first MLA and the shifter input,
3774 * reload the complete source pixel */
3775 - ldr WK&src, [SRC, #offset]
3776 + ldr WK\()\src, [SRC, #\offset]
3777 /* Multiply by 257/256 to approximate 256/255 */
3778 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
3779 /* In this stall, start processing the next pixel */
3780 - .if offset < -4
3781 - mov WK&next, WK&next, lsr #24
3782 + .if \offset < -4
3783 + mov WK\()\next, WK\()\next, lsr #24
3784 .endif
3785 - uxtab16 WK&dst, WK&dst, WK&dst, ror #8
3786 + uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
3787 /* Recombine even/odd bytes of multiplied destination */
3788 mov SCRATCH, SCRATCH, ror #8
3789 - sel WK&dst, SCRATCH, WK&dst
3790 + sel WK\()\dst, SCRATCH, WK\()\dst
3791 /* Saturated add of source to multiplied destination */
3792 - uqadd8 WK&dst, WK&dst, WK&src
3793 + uqadd8 WK\()\dst, WK\()\dst, WK\()\src
3794 .endm
3796 .macro over_8888_8888_process_tail cond, numbytes, firstreg
3797 WK4 .req STRIDE_D
3798 WK5 .req STRIDE_S
3799 WK6 .req STRIDE_M
3800 WK7 .req ORIG_W
3801 - over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
3802 + over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
3803 beq 10f
3804 - over_8888_8888_prepare %(4+firstreg)
3805 - .set PROCESS_REG, firstreg
3806 - .set PROCESS_OFF, -numbytes
3807 - .rept numbytes / 4
3808 + over_8888_8888_prepare %(4+\firstreg)
3809 + .set PROCESS_REG, \firstreg
3810 + .set PROCESS_OFF, -\numbytes
3811 + .rept \numbytes / 4
3812 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
3813 .set PROCESS_REG, PROCESS_REG+1
3814 .set PROCESS_OFF, PROCESS_OFF+4
3815 .endr
3816 - pixst , numbytes, firstreg, DST
3817 + pixst , \numbytes, \firstreg, DST
3819 .unreq WK4
3820 .unreq WK5
3821 .unreq WK6
3822 .unreq WK7
3823 .endm
3825 generate_composite_function \
3826 @@ -531,26 +536,26 @@ generate_composite_function \
3827 * word Register containing 4 bytes
3828 * byte Register containing byte multiplier (bits 8-31 must be 0)
3829 * tmp Scratch register
3830 * half Register containing the constant 0x00800080
3831 * GE[3:0] bits must contain 0101
3833 .macro mul_8888_8 word, byte, tmp, half
3834 /* Split even/odd bytes of word apart */
3835 - uxtb16 tmp, word
3836 - uxtb16 word, word, ror #8
3837 + uxtb16 \tmp, \word
3838 + uxtb16 \word, \word, ror #8
3839 /* Multiply bytes together with rounding, then by 257/256 */
3840 - mla tmp, tmp, byte, half
3841 - mla word, word, byte, half /* 1 stall follows */
3842 - uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
3843 - uxtab16 word, word, word, ror #8
3844 + mla \tmp, \tmp, \byte, \half
3845 + mla \word, \word, \byte, \half /* 1 stall follows */
3846 + uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
3847 + uxtab16 \word, \word, \word, ror #8
3848 /* Recombine bytes */
3849 - mov tmp, tmp, ror #8
3850 - sel word, tmp, word
3851 + mov \tmp, \tmp, ror #8
3852 + sel \word, \tmp, \word
3853 .endm
3855 /******************************************************************************/
3857 .macro over_8888_n_8888_init
3858 /* Mask is constant */
3859 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
3860 /* Hold loop invariant in STRIDE_M */
3861 @@ -562,51 +567,51 @@ generate_composite_function \
3862 line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
3863 .endm
3865 .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3866 WK4 .req Y
3867 WK5 .req STRIDE_D
3868 WK6 .req STRIDE_S
3869 WK7 .req ORIG_W
3870 - pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
3871 - pixld , numbytes, firstreg, DST, 0
3872 + pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
3873 + pixld , \numbytes, \firstreg, DST, 0
3874 .unreq WK4
3875 .unreq WK5
3876 .unreq WK6
3877 .unreq WK7
3878 .endm
3880 .macro over_8888_n_8888_1pixel src, dst
3881 - mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
3882 - sub WK7, WK6, WK&src, lsr #24
3883 - mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
3884 - uqadd8 WK&dst, WK&dst, WK&src
3885 + mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
3886 + sub WK7, WK6, WK\()\src, lsr #24
3887 + mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
3888 + uqadd8 WK\()\dst, WK\()\dst, WK\()\src
3889 .endm
3891 .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
3892 WK4 .req Y
3893 WK5 .req STRIDE_D
3894 WK6 .req STRIDE_S
3895 WK7 .req ORIG_W
3896 - over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
3897 + over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
3898 beq 10f
3899 mov WK6, #255
3900 - .set PROCESS_REG, firstreg
3901 - .rept numbytes / 4
3902 - .if numbytes == 16 && PROCESS_REG == 2
3903 + .set PROCESS_REG, \firstreg
3904 + .rept \numbytes / 4
3905 + .if \numbytes == 16 && PROCESS_REG == 2
3906 /* We're using WK6 and WK7 as temporaries, so half way through
3907 * 4 pixels, reload the second two source pixels but this time
3908 * into WK4 and WK5 */
3909 ldmdb SRC, {WK4, WK5}
3910 .endif
3911 over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
3912 .set PROCESS_REG, PROCESS_REG+1
3913 .endr
3914 - pixst , numbytes, firstreg, DST
3915 + pixst , \numbytes, \firstreg, DST
3917 .unreq WK4
3918 .unreq WK5
3919 .unreq WK6
3920 .unreq WK7
3921 .endm
3923 generate_composite_function \
3924 @@ -637,47 +642,47 @@ generate_composite_function \
3925 ldr STRIDE_D, =0x00800080
3926 b 1f
3927 .ltorg
3929 .endm
3931 .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3932 WK4 .req STRIDE_M
3933 - pixld , numbytes/4, 4, MASK, unaligned_mask
3934 - pixld , numbytes, firstreg, DST, 0
3935 + pixld , \numbytes/4, 4, MASK, \unaligned_mask
3936 + pixld , \numbytes, \firstreg, DST, 0
3937 .unreq WK4
3938 .endm
3940 .macro over_n_8_8888_1pixel src, dst
3941 - uxtb Y, WK4, ror #src*8
3942 + uxtb Y, WK4, ror #\src*8
3943 /* Trailing part of multiplication of source */
3944 mla SCRATCH, STRIDE_S, Y, STRIDE_D
3945 mla Y, SRC, Y, STRIDE_D
3946 mov ORIG_W, #255
3947 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
3948 uxtab16 Y, Y, Y, ror #8
3949 mov SCRATCH, SCRATCH, ror #8
3950 sub ORIG_W, ORIG_W, Y, lsr #24
3951 sel Y, SCRATCH, Y
3952 /* Then multiply the destination */
3953 - mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
3954 - uqadd8 WK&dst, WK&dst, Y
3955 + mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
3956 + uqadd8 WK\()\dst, WK\()\dst, Y
3957 .endm
3959 .macro over_n_8_8888_process_tail cond, numbytes, firstreg
3960 WK4 .req STRIDE_M
3961 teq WK4, #0
3962 beq 10f
3963 - .set PROCESS_REG, firstreg
3964 - .rept numbytes / 4
3965 - over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
3966 + .set PROCESS_REG, \firstreg
3967 + .rept \numbytes / 4
3968 + over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
3969 .set PROCESS_REG, PROCESS_REG+1
3970 .endr
3971 - pixst , numbytes, firstreg, DST
3972 + pixst , \numbytes, \firstreg, DST
3974 .unreq WK4
3975 .endm
3977 generate_composite_function \
3978 pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
3979 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
3980 2, /* prefetch distance */ \
3981 @@ -700,64 +705,64 @@ generate_composite_function \
3982 line_saved_regs STRIDE_D, ORIG_W
3983 .endm
3985 .macro over_reverse_n_8888_newline
3986 mov STRIDE_D, #0xFF
3987 .endm
3989 .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3990 - pixld , numbytes, firstreg, DST, 0
3991 + pixld , \numbytes, \firstreg, DST, 0
3992 .endm
3994 .macro over_reverse_n_8888_1pixel d, is_only
3995 - teq WK&d, #0
3996 + teq WK\()\d, #0
3997 beq 8f /* replace with source */
3998 - bics ORIG_W, STRIDE_D, WK&d, lsr #24
3999 - .if is_only == 1
4000 + bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
4001 + .if \is_only == 1
4002 beq 49f /* skip store */
4003 .else
4004 beq 9f /* write same value back */
4005 .endif
4006 mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
4007 mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
4008 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
4009 uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
4010 mov SCRATCH, SCRATCH, ror #8
4011 sel ORIG_W, SCRATCH, ORIG_W
4012 - uqadd8 WK&d, WK&d, ORIG_W
4013 + uqadd8 WK\()\d, WK\()\d, ORIG_W
4014 b 9f
4015 -8: mov WK&d, SRC
4016 +8: mov WK\()\d, SRC
4018 .endm
4020 .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
4021 - .if numbytes == 4
4022 - over_reverse_n_8888_1pixel reg1, 1
4023 + .if \numbytes == 4
4024 + over_reverse_n_8888_1pixel \reg1, 1
4025 .else
4026 - and SCRATCH, WK&reg1, WK&reg2
4027 - .if numbytes == 16
4028 - and SCRATCH, SCRATCH, WK&reg3
4029 - and SCRATCH, SCRATCH, WK&reg4
4030 + and SCRATCH, WK\()\reg1, WK\()\reg2
4031 + .if \numbytes == 16
4032 + and SCRATCH, SCRATCH, WK\()\reg3
4033 + and SCRATCH, SCRATCH, WK\()\reg4
4034 .endif
4035 mvns SCRATCH, SCRATCH, asr #24
4036 beq 49f /* skip store if all opaque */
4037 - over_reverse_n_8888_1pixel reg1, 0
4038 - over_reverse_n_8888_1pixel reg2, 0
4039 - .if numbytes == 16
4040 - over_reverse_n_8888_1pixel reg3, 0
4041 - over_reverse_n_8888_1pixel reg4, 0
4042 + over_reverse_n_8888_1pixel \reg1, 0
4043 + over_reverse_n_8888_1pixel \reg2, 0
4044 + .if \numbytes == 16
4045 + over_reverse_n_8888_1pixel \reg3, 0
4046 + over_reverse_n_8888_1pixel \reg4, 0
4047 .endif
4048 .endif
4049 - pixst , numbytes, reg1, DST
4050 + pixst , \numbytes, \reg1, DST
4052 .endm
4054 .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
4055 - over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
4056 + over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
4057 .endm
4059 generate_composite_function \
4060 pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
4061 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
4062 3, /* prefetch distance */ \
4063 over_reverse_n_8888_init, \
4064 over_reverse_n_8888_newline, \
4065 @@ -789,30 +794,30 @@ generate_composite_function \
4066 .unreq TMP1
4067 .unreq TMP2
4068 .unreq TMP3
4069 .unreq WK4
4070 .endm
4072 .macro over_white_8888_8888_ca_combine m, d
4073 uxtb16 TMP1, TMP0 /* rb_notmask */
4074 - uxtb16 TMP2, d /* rb_dest; 1 stall follows */
4075 + uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
4076 smlatt TMP3, TMP2, TMP1, HALF /* red */
4077 smlabb TMP2, TMP2, TMP1, HALF /* blue */
4078 uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
4079 - uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
4080 - smlatt d, TMP1, TMP0, HALF /* alpha */
4081 + uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
4082 + smlatt \d, TMP1, TMP0, HALF /* alpha */
4083 smlabb TMP1, TMP1, TMP0, HALF /* green */
4084 pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
4085 - pkhbt TMP1, TMP1, d, lsl #16 /* ag */
4086 + pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
4087 uxtab16 TMP0, TMP0, TMP0, ror #8
4088 uxtab16 TMP1, TMP1, TMP1, ror #8
4089 mov TMP0, TMP0, ror #8
4090 - sel d, TMP0, TMP1
4091 - uqadd8 d, d, m /* d is a late result */
4092 + sel \d, TMP0, TMP1
4093 + uqadd8 \d, \d, \m /* d is a late result */
4094 .endm
4096 .macro over_white_8888_8888_ca_1pixel_head
4097 pixld , 4, 1, MASK, 0
4098 pixld , 4, 3, DST, 0
4099 .endm
4101 .macro over_white_8888_8888_ca_1pixel_tail
4102 @@ -848,29 +853,29 @@ 02: mvn TMP0, WK2
4103 movcs WK4, WK2
4104 b 04f
4105 03: over_white_8888_8888_ca_combine WK2, WK4
4106 04: pixst , 8, 3, DST
4108 .endm
4110 .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4111 - .if numbytes == 4
4112 + .if \numbytes == 4
4113 over_white_8888_8888_ca_1pixel_head
4114 .else
4115 - .if numbytes == 16
4116 + .if \numbytes == 16
4117 over_white_8888_8888_ca_2pixels_head
4118 over_white_8888_8888_ca_2pixels_tail
4119 .endif
4120 over_white_8888_8888_ca_2pixels_head
4121 .endif
4122 .endm
4124 .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
4125 - .if numbytes == 4
4126 + .if \numbytes == 4
4127 over_white_8888_8888_ca_1pixel_tail
4128 .else
4129 over_white_8888_8888_ca_2pixels_tail
4130 .endif
4131 .endm
4133 generate_composite_function \
4134 pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
4135 @@ -999,33 +1004,33 @@ 20: /* No simplifications possible -
4136 uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
4137 30: /* The destination buffer is already in the L1 cache, so
4138 * there's little point in amalgamating writes */
4139 pixst , 4, 0, DST
4141 .endm
4143 .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4144 - .rept (numbytes / 4) - 1
4145 + .rept (\numbytes / 4) - 1
4146 over_n_8888_8888_ca_1pixel_head
4147 over_n_8888_8888_ca_1pixel_tail
4148 .endr
4149 over_n_8888_8888_ca_1pixel_head
4150 .endm
4152 .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
4153 over_n_8888_8888_ca_1pixel_tail
4154 .endm
4156 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
4157 ldr ip, [sp]
4158 cmp ip, #-1
4159 beq pixman_composite_over_white_8888_8888_ca_asm_armv6
4160 /* else drop through... */
4161 - .endfunc
4162 + pixman_end_asm_function
4163 generate_composite_function \
4164 pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
4165 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
4166 2, /* prefetch distance */ \
4167 over_n_8888_8888_ca_init, \
4168 nop_macro, /* newline */ \
4169 over_n_8888_8888_ca_cleanup, \
4170 over_n_8888_8888_ca_process_head, \
4171 @@ -1040,94 +1045,94 @@ generate_composite_function \
4172 uadd8 SCRATCH, MASK, MASK
4173 /* Offset the source pointer: we only need the alpha bytes */
4174 add SRC, SRC, #3
4175 line_saved_regs ORIG_W
4176 .endm
4178 .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
4179 ldrb ORIG_W, [SRC], #4
4180 - .if numbytes >= 8
4181 - ldrb WK&reg1, [SRC], #4
4182 - .if numbytes == 16
4183 - ldrb WK&reg2, [SRC], #4
4184 - ldrb WK&reg3, [SRC], #4
4185 + .if \numbytes >= 8
4186 + ldrb WK\()\reg1, [SRC], #4
4187 + .if \numbytes == 16
4188 + ldrb WK\()\reg2, [SRC], #4
4189 + ldrb WK\()\reg3, [SRC], #4
4190 .endif
4191 .endif
4192 - add DST, DST, #numbytes
4193 + add DST, DST, #\numbytes
4194 .endm
4196 .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4197 - in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
4198 + in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
4199 .endm
4201 .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
4202 - .if is_only != 1
4203 - movs s, ORIG_W
4204 - .if offset != 0
4205 - ldrb ORIG_W, [SRC, #offset]
4206 + .if \is_only != 1
4207 + movs \s, ORIG_W
4208 + .if \offset != 0
4209 + ldrb ORIG_W, [SRC, #\offset]
4210 .endif
4211 beq 01f
4212 teq STRIDE_M, #0xFF
4213 beq 02f
4214 .endif
4215 - uxtb16 SCRATCH, d /* rb_dest */
4216 - uxtb16 d, d, ror #8 /* ag_dest */
4217 - mla SCRATCH, SCRATCH, s, MASK
4218 - mla d, d, s, MASK
4219 + uxtb16 SCRATCH, \d /* rb_dest */
4220 + uxtb16 \d, \d, ror #8 /* ag_dest */
4221 + mla SCRATCH, SCRATCH, \s, MASK
4222 + mla \d, \d, \s, MASK
4223 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
4224 - uxtab16 d, d, d, ror #8
4225 + uxtab16 \d, \d, \d, ror #8
4226 mov SCRATCH, SCRATCH, ror #8
4227 - sel d, SCRATCH, d
4228 + sel \d, SCRATCH, \d
4229 b 02f
4230 - .if offset == 0
4231 + .if \offset == 0
4232 48: /* Last mov d,#0 of the set - used as part of shortcut for
4233 * source values all 0 */
4234 .endif
4235 -01: mov d, #0
4236 +01: mov \d, #0
4238 .endm
4240 .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
4241 - .if numbytes == 4
4242 + .if \numbytes == 4
4243 teq ORIG_W, ORIG_W, asr #32
4244 - ldrne WK&reg1, [DST, #-4]
4245 - .elseif numbytes == 8
4246 - teq ORIG_W, WK&reg1
4247 + ldrne WK\()\reg1, [DST, #-4]
4248 + .elseif \numbytes == 8
4249 + teq ORIG_W, WK\()\reg1
4250 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
4251 - ldmnedb DST, {WK&reg1-WK&reg2}
4252 + ldmnedb DST, {WK\()\reg1-WK\()\reg2}
4253 .else
4254 - teq ORIG_W, WK&reg1
4255 - teqeq ORIG_W, WK&reg2
4256 - teqeq ORIG_W, WK&reg3
4257 + teq ORIG_W, WK\()\reg1
4258 + teqeq ORIG_W, WK\()\reg2
4259 + teqeq ORIG_W, WK\()\reg3
4260 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
4261 - ldmnedb DST, {WK&reg1-WK&reg4}
4262 + ldmnedb DST, {WK\()\reg1-WK\()\reg4}
4263 .endif
4264 cmnne DST, #0 /* clear C if NE */
4265 bcs 49f /* no writes to dest if source all -1 */
4266 beq 48f /* set dest to all 0 if source all 0 */
4267 - .if numbytes == 4
4268 - in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
4269 - str WK&reg1, [DST, #-4]
4270 - .elseif numbytes == 8
4271 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
4272 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
4273 - stmdb DST, {WK&reg1-WK&reg2}
4274 + .if \numbytes == 4
4275 + in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
4276 + str WK\()\reg1, [DST, #-4]
4277 + .elseif \numbytes == 8
4278 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
4279 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
4280 + stmdb DST, {WK\()\reg1-WK\()\reg2}
4281 .else
4282 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
4283 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
4284 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
4285 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
4286 - stmdb DST, {WK&reg1-WK&reg4}
4287 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
4288 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
4289 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
4290 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
4291 + stmdb DST, {WK\()\reg1-WK\()\reg4}
4292 .endif
4294 .endm
4296 .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
4297 - in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
4298 + in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
4299 .endm
4301 generate_composite_function \
4302 pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
4303 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
4304 2, /* prefetch distance */ \
4305 in_reverse_8888_8888_init, \
4306 nop_macro, /* newline */ \
4307 @@ -1144,31 +1149,31 @@ generate_composite_function \
4308 /* Hold multiplier for destination in STRIDE_M */
4309 mov STRIDE_M, #255
4310 sub STRIDE_M, STRIDE_M, SRC, lsr #24
4311 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
4312 uadd8 SCRATCH, MASK, MASK
4313 .endm
4315 .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4316 - pixld , numbytes, firstreg, DST, 0
4317 + pixld , \numbytes, \firstreg, DST, 0
4318 .endm
4320 .macro over_n_8888_1pixel dst
4321 - mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
4322 - uqadd8 WK&dst, WK&dst, SRC
4323 + mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
4324 + uqadd8 WK\()\dst, WK\()\dst, SRC
4325 .endm
4327 .macro over_n_8888_process_tail cond, numbytes, firstreg
4328 - .set PROCESS_REG, firstreg
4329 - .rept numbytes / 4
4330 + .set PROCESS_REG, \firstreg
4331 + .rept \numbytes / 4
4332 over_n_8888_1pixel %(PROCESS_REG)
4333 .set PROCESS_REG, PROCESS_REG+1
4334 .endr
4335 - pixst , numbytes, firstreg, DST
4336 + pixst , \numbytes, \firstreg, DST
4337 .endm
4339 generate_composite_function \
4340 pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
4341 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
4342 2, /* prefetch distance */ \
4343 over_n_8888_init, \
4344 nop_macro, /* newline */ \
4345 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4346 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4347 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4348 @@ -107,88 +107,120 @@
4349 .set PREFETCH_TYPE_NONE, 0
4350 .set PREFETCH_TYPE_STANDARD, 1
4353 * Definitions of macros for load/store of pixel data.
4356 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
4357 - .if numbytes == 16
4358 - .if unaligned == 1
4359 - op&r&cond WK&reg0, [base], #4
4360 - op&r&cond WK&reg1, [base], #4
4361 - op&r&cond WK&reg2, [base], #4
4362 - op&r&cond WK&reg3, [base], #4
4363 + .if \numbytes == 16
4364 + .if \unaligned == 1
4365 + \op\()r\()\cond WK\()\reg0, [\base], #4
4366 + \op\()r\()\cond WK\()\reg1, [\base], #4
4367 + \op\()r\()\cond WK\()\reg2, [\base], #4
4368 + \op\()r\()\cond WK\()\reg3, [\base], #4
4369 .else
4370 - op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
4371 +#ifdef __clang__
4372 + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4373 +#else
4374 + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4375 +#endif
4376 .endif
4377 - .elseif numbytes == 8
4378 - .if unaligned == 1
4379 - op&r&cond WK&reg0, [base], #4
4380 - op&r&cond WK&reg1, [base], #4
4381 + .elseif \numbytes == 8
4382 + .if \unaligned == 1
4383 + \op\()r\()\cond WK\()\reg0, [\base], #4
4384 + \op\()r\()\cond WK\()\reg1, [\base], #4
4385 .else
4386 - op&m&cond&ia base!, {WK&reg0,WK&reg1}
4387 +#ifdef __clang__
4388 + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
4389 +#else
4390 + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
4391 +#endif
4392 .endif
4393 - .elseif numbytes == 4
4394 - op&r&cond WK&reg0, [base], #4
4395 - .elseif numbytes == 2
4396 - op&r&cond&h WK&reg0, [base], #2
4397 - .elseif numbytes == 1
4398 - op&r&cond&b WK&reg0, [base], #1
4399 + .elseif \numbytes == 4
4400 + \op\()r\()\cond WK\()\reg0, [\base], #4
4401 + .elseif \numbytes == 2
4402 +#ifdef __clang__
4403 + \op\()rh\()\cond WK\()\reg0, [\base], #2
4404 +#else
4405 + \op\()r\()\cond\()h WK\()\reg0, [\base], #2
4406 +#endif
4407 + .elseif \numbytes == 1
4408 +#ifdef __clang__
4409 + \op\()rb\()\cond WK\()\reg0, [\base], #1
4410 +#else
4411 + \op\()r\()\cond\()b WK\()\reg0, [\base], #1
4412 +#endif
4413 .else
4414 - .error "unsupported size: numbytes"
4415 + .error "unsupported size: \numbytes"
4416 .endif
4417 .endm
4419 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
4420 - .if numbytes == 16
4421 - stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
4422 - .elseif numbytes == 8
4423 - stm&cond&db base, {WK&reg0,WK&reg1}
4424 - .elseif numbytes == 4
4425 - str&cond WK&reg0, [base, #-4]
4426 - .elseif numbytes == 2
4427 - str&cond&h WK&reg0, [base, #-2]
4428 - .elseif numbytes == 1
4429 - str&cond&b WK&reg0, [base, #-1]
4430 + .if \numbytes == 16
4431 +#ifdef __clang__
4432 + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4433 +#else
4434 + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4435 +#endif
4436 + .elseif \numbytes == 8
4437 +#ifdef __clang__
4438 + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
4439 +#else
4440 + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
4441 +#endif
4442 + .elseif \numbytes == 4
4443 + str\()\cond WK\()\reg0, [\base, #-4]
4444 + .elseif \numbytes == 2
4445 +#ifdef __clang__
4446 + strh\()\cond WK\()\reg0, [\base, #-2]
4447 +#else
4448 + str\()\cond\()h WK\()\reg0, [\base, #-2]
4449 +#endif
4450 + .elseif \numbytes == 1
4451 +#ifdef __clang__
4452 + strb\()\cond WK\()\reg0, [\base, #-1]
4453 +#else
4454 + str\()\cond\()b WK\()\reg0, [\base, #-1]
4455 +#endif
4456 .else
4457 - .error "unsupported size: numbytes"
4458 + .error "unsupported size: \numbytes"
4459 .endif
4460 .endm
4462 .macro pixld cond, numbytes, firstreg, base, unaligned
4463 - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
4464 + pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
4465 .endm
4467 .macro pixst cond, numbytes, firstreg, base
4468 .if (flags) & FLAG_DST_READWRITE
4469 - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
4470 + pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
4471 .else
4472 - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
4473 + pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
4474 .endif
4475 .endm
4477 .macro PF a, x:vararg
4478 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
4479 - a x
4480 + \a \x
4481 .endif
4482 .endm
4485 .macro preload_leading_step1 bpp, ptr, base
4486 /* If the destination is already 16-byte aligned, then we need to preload
4487 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
4488 * are no gaps when the inner loop starts.
4490 - .if bpp > 0
4491 - PF bic, ptr, base, #31
4492 + .if \bpp > 0
4493 + PF bic, \ptr, \base, #31
4494 .set OFFSET, 0
4495 .rept prefetch_distance+1
4496 - PF pld, [ptr, #OFFSET]
4497 + PF pld, [\ptr, #OFFSET]
4498 .set OFFSET, OFFSET+32
4499 .endr
4500 .endif
4501 .endm
4503 .macro preload_leading_step2 bpp, bpp_shift, ptr, base
4504 /* However, if the destination is not 16-byte aligned, we may need to
4505 * preload more cache lines than that. The question we need to ask is:
4506 @@ -196,81 +228,81 @@
4507 * by which the source pointer will be rounded down for preloading, and if
4508 * so, by how many cache lines? Effectively, we want to calculate
4509 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
4510 * inner_loop_offset = (src+leading_bytes)&31
4511 * extra_needed = leading_bytes - inner_loop_offset
4512 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
4513 * possible when there are 4 src bytes for every 1 dst byte).
4515 - .if bpp > 0
4516 - .ifc base,DST
4517 + .if \bpp > 0
4518 + .ifc \base,DST
4519 /* The test can be simplified further when preloading the destination */
4520 - PF tst, base, #16
4521 + PF tst, \base, #16
4522 PF beq, 61f
4523 .else
4524 - .if bpp/dst_w_bpp == 4
4525 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
4526 + .if \bpp/dst_w_bpp == 4
4527 + PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
4528 PF and, SCRATCH, SCRATCH, #31
4529 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
4530 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
4531 PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
4532 PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
4533 PF bcs, 61f
4534 PF bpl, 60f
4535 PF pld, [ptr, #32*(prefetch_distance+2)]
4536 .else
4537 - PF mov, SCRATCH, base, lsl #32-5
4538 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
4539 - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
4540 + PF mov, SCRATCH, \base, lsl #32-5
4541 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
4542 + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
4543 PF bls, 61f
4544 .endif
4545 .endif
4546 -60: PF pld, [ptr, #32*(prefetch_distance+1)]
4547 +60: PF pld, [\ptr, #32*(prefetch_distance+1)]
4549 .endif
4550 .endm
4552 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
4553 .macro preload_middle bpp, base, scratch_holds_offset
4554 - .if bpp > 0
4555 + .if \bpp > 0
4556 /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
4557 - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
4558 - .if scratch_holds_offset
4559 - PF pld, [base, SCRATCH]
4560 + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
4561 + .if \scratch_holds_offset
4562 + PF pld, [\base, SCRATCH]
4563 .else
4564 - PF bic, SCRATCH, base, #31
4565 + PF bic, SCRATCH, \base, #31
4566 PF pld, [SCRATCH, #32*prefetch_distance]
4567 .endif
4568 .endif
4569 .endif
4570 .endm
4572 .macro preload_trailing bpp, bpp_shift, base
4573 - .if bpp > 0
4574 - .if bpp*pix_per_block > 256
4575 + .if \bpp > 0
4576 + .if \bpp*pix_per_block > 256
4577 /* Calculations are more complex if more than one fetch per block */
4578 - PF and, WK1, base, #31
4579 - PF add, WK1, WK1, WK0, lsl #bpp_shift
4580 - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
4581 - PF bic, SCRATCH, base, #31
4582 + PF and, WK1, \base, #31
4583 + PF add, WK1, WK1, WK0, lsl #\bpp_shift
4584 + PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
4585 + PF bic, SCRATCH, \base, #31
4586 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
4587 PF add, SCRATCH, SCRATCH, #32
4588 PF subs, WK1, WK1, #32
4589 PF bhi, 80b
4590 .else
4591 /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
4592 - PF mov, SCRATCH, base, lsl #32-5
4593 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
4594 + PF mov, SCRATCH, \base, lsl #32-5
4595 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
4596 PF adceqs, SCRATCH, SCRATCH, #0
4597 /* The instruction above has two effects: ensures Z is only
4598 * set if C was clear (so Z indicates that both shifted quantities
4599 * were 0), and clears C if Z was set (so C indicates that the sum
4600 * of the shifted quantities was greater and not equal to 32) */
4601 PF beq, 82f
4602 - PF bic, SCRATCH, base, #31
4603 + PF bic, SCRATCH, \base, #31
4604 PF bcc, 81f
4605 PF pld, [SCRATCH, #32*(prefetch_distance+2)]
4606 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
4608 .endif
4609 .endif
4610 .endm
4612 @@ -283,97 +315,97 @@ 82:
4613 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
4614 * meaning there's no need for a loop.
4615 * "bpp" - number of bits per pixel in the channel (source, mask or
4616 * destination) that's being preloaded, or 0 if this channel is not used
4617 * for reading
4618 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
4619 * "base" - base address register of channel to preload (SRC, MASK or DST)
4621 - .if bpp > 0
4622 - .if narrow_case && (bpp <= dst_w_bpp)
4623 + .if \bpp > 0
4624 + .if \narrow_case && (\bpp <= dst_w_bpp)
4625 /* In these cases, each line for each channel is in either 1 or 2 cache lines */
4626 - PF bic, WK0, base, #31
4627 + PF bic, WK0, \base, #31
4628 PF pld, [WK0]
4629 - PF add, WK1, base, X, LSL #bpp_shift
4630 + PF add, WK1, \base, X, LSL #\bpp_shift
4631 PF sub, WK1, WK1, #1
4632 PF bic, WK1, WK1, #31
4633 PF cmp, WK1, WK0
4634 PF beq, 90f
4635 PF pld, [WK1]
4637 .else
4638 - PF bic, WK0, base, #31
4639 + PF bic, WK0, \base, #31
4640 PF pld, [WK0]
4641 - PF add, WK1, base, X, lsl #bpp_shift
4642 + PF add, WK1, \base, X, lsl #\bpp_shift
4643 PF sub, WK1, WK1, #1
4644 PF bic, WK1, WK1, #31
4645 PF cmp, WK1, WK0
4646 PF beq, 92f
4647 91: PF add, WK0, WK0, #32
4648 PF cmp, WK0, WK1
4649 PF pld, [WK0]
4650 PF bne, 91b
4652 .endif
4653 .endif
4654 .endm
4657 .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4658 - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
4659 - .if decrementx
4660 - sub&cond X, X, #8*numbytes/dst_w_bpp
4661 + \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
4662 + .if \decrementx
4663 + sub\()\cond X, X, #8*\numbytes/dst_w_bpp
4664 .endif
4665 - process_tail cond, numbytes, firstreg
4666 + \process_tail \cond, \numbytes, \firstreg
4667 .if !((flags) & FLAG_PROCESS_DOES_STORE)
4668 - pixst cond, numbytes, firstreg, DST
4669 + pixst \cond, \numbytes, \firstreg, DST
4670 .endif
4671 .endm
4673 .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4674 .if (flags) & FLAG_BRANCH_OVER
4675 - .ifc cond,mi
4676 + .ifc \cond,mi
4677 bpl 100f
4678 .endif
4679 - .ifc cond,cs
4680 + .ifc \cond,cs
4681 bcc 100f
4682 .endif
4683 - .ifc cond,ne
4684 + .ifc \cond,ne
4685 beq 100f
4686 .endif
4687 - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4688 + conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
4689 100:
4690 .else
4691 - conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4692 + conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
4693 .endif
4694 .endm
4696 .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
4697 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
4698 /* Can't interleave reads and writes */
4699 - test
4700 - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
4701 + \test
4702 + conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
4703 .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
4704 - test
4705 + \test
4706 .endif
4707 - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
4708 + conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
4709 .else
4710 /* Can interleave reads and writes for better scheduling */
4711 - test
4712 - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
4713 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
4714 - .if decrementx
4715 - sub&cond1 X, X, #8*numbytes1/dst_w_bpp
4716 - sub&cond2 X, X, #8*numbytes2/dst_w_bpp
4717 + \test
4718 + \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
4719 + \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
4720 + .if \decrementx
4721 + sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
4722 + sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
4723 .endif
4724 - process_tail cond1, numbytes1, firstreg1
4725 - process_tail cond2, numbytes2, firstreg2
4726 - pixst cond1, numbytes1, firstreg1, DST
4727 - pixst cond2, numbytes2, firstreg2, DST
4728 + \process_tail \cond1, \numbytes1, \firstreg1
4729 + \process_tail \cond2, \numbytes2, \firstreg2
4730 + pixst \cond1, \numbytes1, \firstreg1, DST
4731 + pixst \cond2, \numbytes2, \firstreg2, DST
4732 .endif
4733 .endm
4736 .macro test_bits_1_0_ptr
4737 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4738 movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
4739 .else
4740 @@ -395,22 +427,22 @@ 100:
4741 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4742 .set DECREMENT_X, 0
4743 sub X, X, WK0, lsr #dst_bpp_shift
4744 str X, [sp, #LINE_SAVED_REG_COUNT*4]
4745 mov X, WK0
4746 .endif
4747 /* Use unaligned loads in all cases for simplicity */
4748 .if dst_w_bpp == 8
4749 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
4750 + conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
4751 .elseif dst_w_bpp == 16
4752 test_bits_1_0_ptr
4753 - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
4754 + conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
4755 .endif
4756 - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
4757 + conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
4758 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4759 ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
4760 .endif
4761 .endm
4763 .macro test_bits_3_2_pix
4764 movs SCRATCH, X, lsl #dst_bpp_shift+32-3
4765 .endm
4766 @@ -419,169 +451,169 @@ 100:
4767 .if dst_w_bpp == 8
4768 movs SCRATCH, X, lsl #dst_bpp_shift+32-1
4769 .else
4770 movs SCRATCH, X, lsr #1
4771 .endif
4772 .endm
4774 .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
4775 - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
4776 + conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
4777 .if dst_w_bpp == 16
4778 test_bits_1_0_pix
4779 - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
4780 + conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
4781 .elseif dst_w_bpp == 8
4782 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
4783 + conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
4784 .endif
4785 .endm
4788 .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
4789 110:
4790 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
4791 .rept pix_per_block*dst_w_bpp/128
4792 - process_head , 16, 0, unaligned_src, unaligned_mask, 1
4793 + \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
4794 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4795 preload_middle src_bpp, SRC, 1
4796 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4797 preload_middle mask_bpp, MASK, 1
4798 .else
4799 preload_middle src_bpp, SRC, 0
4800 preload_middle mask_bpp, MASK, 0
4801 .endif
4802 .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
4803 /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
4804 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
4805 * preloads for, to achieve staggered prefetches for multiple channels, because there are
4806 * always two STMs per prefetch, so there is always an opposite STM on which to put the
4807 * preload. Note, no need to BIC the base register here */
4808 - PF pld, [DST, #32*prefetch_distance - dst_alignment]
4809 + PF pld, [DST, #32*prefetch_distance - \dst_alignment]
4810 .endif
4811 - process_tail , 16, 0
4812 + \process_tail , 16, 0
4813 .if !((flags) & FLAG_PROCESS_DOES_STORE)
4814 pixst , 16, 0, DST
4815 .endif
4816 .set SUBBLOCK, SUBBLOCK+1
4817 .endr
4818 subs X, X, #pix_per_block
4819 bhs 110b
4820 .endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_trailing dst_r_bpp, dst_bpp_shift, DST
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
- process_head , 16, 0, unaligned_src, unaligned_mask, 0
- process_tail , 16, 0
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
subs X, X, #128/dst_w_bpp
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
- beq exit_label
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ beq \exit_label
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
.if mask_bpp == 8 || mask_bpp == 16
tst MASK, #3
bne 141f
.endif
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 140f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
140:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
- b exit_label
+ b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
142:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm


.macro end_of_line restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
/* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
subs Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
.endif
add DST, DST, STRIDE_D
.if src_bpp > 0
add SRC, SRC, STRIDE_S
.endif
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
- .if restore_x
+ .if \restore_x
mov X, ORIG_W
.endif
- bhs loop_label
- .ifc "last_one",""
- .if vars_spilled
+ bhs \loop_label
+ .ifc "\last_one",""
+ .if \vars_spilled
b 197f
.else
b 198f
.endif
.else
- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
.endm


.macro generate_composite_function fname, \
src_bpp_, \
@@ -591,27 +623,27 @@ 142:
prefetch_distance_, \
init, \
newline, \
cleanup, \
process_head, \
process_tail, \
process_inner_loop

- pixman_asm_function fname
+ pixman_asm_function \fname

/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_

/*
* Select prefetch type for this function.
*/
.if prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.else
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
@@ -727,17 +759,17 @@ 142:
.endif

#ifdef DEBUG_PARAMS
add Y, Y, #1
stmia sp, {r0-r7,pc}
sub Y, Y, #1
#endif

- init
+ \init

.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
sub sp, sp, #4
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
.endif

@@ -768,47 +800,47 @@ 142:
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
- newline
+ \newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step1 dst_r_bpp, WK3, DST
.endif

ands WK0, DST, #15
beq 154f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */

preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif

- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail

154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
- .ifc "process_inner_loop",""
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .ifc "\process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif

157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif
@@ -820,80 +852,80 @@ 160: /* Medium case */
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
- newline
+ \newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
.endif

sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
ands WK0, DST, #15
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */

- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail

164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f

167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

.ltorg

170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if dst_w_bpp < 32
mov ORIG_W, X
.endif
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
- newline
+ \newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
.endif

.if dst_w_bpp == 8
tst DST, #3
beq 174f
172: subs X, X, #1
blo 177f
- process_head , 1, 0, 1, 1, 0
- process_tail , 1, 0
+ \process_head , 1, 0, 1, 1, 0
+ \process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
tst DST, #3
bne 172b
.elseif dst_w_bpp == 16
tst DST, #2
beq 174f
subs X, X, #1
blo 177f
- process_head , 2, 0, 1, 1, 0
- process_tail , 2, 0
+ \process_head , 2, 0, 1, 1, 0
+ \process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif

174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f

177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif

@@ -903,17 +935,17 @@ 197:
.endif
198:
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
add sp, sp, #4
.endif

- cleanup
+ \cleanup

#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
pop {r4-r11, pc} /* exit */

.ltorg
@@ -927,23 +959,23 @@ 199:
.unreq MASK
.unreq STRIDE_M
.unreq WK0
.unreq WK1
.unreq WK2
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
- .endfunc
+ pixman_end_asm_function
.endm

.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","STRIDE_D"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif