1 https://gitlab.freedesktop.org/pixman/pixman/-/issues/74
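gfx/cairo/pixman-arm32-clang.patch: make the ARM32 NEON assembly in pixman build with clang's
integrated assembler (see the upstream issue above). GNU as lets .macro bodies refer to
parameters by bare name and concatenate them with '&'; clang requires the backslash-prefixed
form (\arg) and the \() separator. The patch also writes an explicit comma between the
arguments passed to the PF macro (e.g. "PF add, PF_X, ..."), maps the divided-syntax
conditional mnemonics ldrgeb/subges/subpls to spellings clang accepts under __clang__, and
closes functions with pixman_end_asm_function instead of the deprecated .endfunc.

As an illustration only (not part of the patch; load_pair and load are hypothetical macro
names), the macro-argument syntax change looks like this:

    .macro load_pair reg1, reg2
            vld1.32 {\reg1}, [TMP1]!    @ was: vld1.32 {reg1}, [TMP1]!
            vld1.32 {\reg2}, [TMP1]     @ was: vld1.32 {reg2}, [TMP1]
    .endm

    .macro load fmt
            load_\()\fmt d0, d1         @ was: load_&fmt d0, d1
    .endm
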
3 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
4 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
5 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm-bilinear.S
6 @@ -77,206 +77,206 @@
7 * format conversion, and interpolation as separate macros which can be used
8 * as the basic building blocks for constructing bilinear scanline functions.
9 */
11 .macro bilinear_load_8888 reg1, reg2, tmp
12 mov TMP1, X, asr #16
13 add X, X, UX
14 add TMP1, TOP, TMP1, asl #2
15 - vld1.32 {reg1}, [TMP1], STRIDE
16 - vld1.32 {reg2}, [TMP1]
17 + vld1.32 {\reg1}, [TMP1], STRIDE
18 + vld1.32 {\reg2}, [TMP1]
19 .endm
21 .macro bilinear_load_0565 reg1, reg2, tmp
22 mov TMP1, X, asr #16
23 add X, X, UX
24 add TMP1, TOP, TMP1, asl #1
25 - vld1.32 {reg2[0]}, [TMP1], STRIDE
26 - vld1.32 {reg2[1]}, [TMP1]
27 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
28 + vld1.32 {\reg2[0]}, [TMP1], STRIDE
29 + vld1.32 {\reg2[1]}, [TMP1]
30 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
31 .endm
33 .macro bilinear_load_and_vertical_interpolate_two_8888 \
34 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
36 - bilinear_load_8888 reg1, reg2, tmp1
37 - vmull.u8 acc1, reg1, d28
38 - vmlal.u8 acc1, reg2, d29
39 - bilinear_load_8888 reg3, reg4, tmp2
40 - vmull.u8 acc2, reg3, d28
41 - vmlal.u8 acc2, reg4, d29
42 + bilinear_load_8888 \reg1, \reg2, \tmp1
43 + vmull.u8 \acc1, \reg1, d28
44 + vmlal.u8 \acc1, \reg2, d29
45 + bilinear_load_8888 \reg3, \reg4, \tmp2
46 + vmull.u8 \acc2, \reg3, d28
47 + vmlal.u8 \acc2, \reg4, d29
48 .endm
50 .macro bilinear_load_and_vertical_interpolate_four_8888 \
51 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
52 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
54 bilinear_load_and_vertical_interpolate_two_8888 \
55 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
56 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
57 bilinear_load_and_vertical_interpolate_two_8888 \
58 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
59 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
60 .endm
62 .macro bilinear_load_and_vertical_interpolate_two_0565 \
63 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
65 mov TMP1, X, asr #16
66 add X, X, UX
67 add TMP1, TOP, TMP1, asl #1
68 mov TMP2, X, asr #16
69 add X, X, UX
70 add TMP2, TOP, TMP2, asl #1
71 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE
72 - vld1.32 {acc2hi[0]}, [TMP2], STRIDE
73 - vld1.32 {acc2lo[1]}, [TMP1]
74 - vld1.32 {acc2hi[1]}, [TMP2]
75 - convert_0565_to_x888 acc2, reg3, reg2, reg1
76 - vzip.u8 reg1, reg3
77 - vzip.u8 reg2, reg4
78 - vzip.u8 reg3, reg4
79 - vzip.u8 reg1, reg2
80 - vmull.u8 acc1, reg1, d28
81 - vmlal.u8 acc1, reg2, d29
82 - vmull.u8 acc2, reg3, d28
83 - vmlal.u8 acc2, reg4, d29
84 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
85 + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
86 + vld1.32 {\acc2lo[1]}, [TMP1]
87 + vld1.32 {\acc2hi[1]}, [TMP2]
88 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
89 + vzip.u8 \reg1, \reg3
90 + vzip.u8 \reg2, \reg4
91 + vzip.u8 \reg3, \reg4
92 + vzip.u8 \reg1, \reg2
93 + vmull.u8 \acc1, \reg1, d28
94 + vmlal.u8 \acc1, \reg2, d29
95 + vmull.u8 \acc2, \reg3, d28
96 + vmlal.u8 \acc2, \reg4, d29
97 .endm
99 .macro bilinear_load_and_vertical_interpolate_four_0565 \
100 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
101 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
103 mov TMP1, X, asr #16
104 add X, X, UX
105 add TMP1, TOP, TMP1, asl #1
106 mov TMP2, X, asr #16
107 add X, X, UX
108 add TMP2, TOP, TMP2, asl #1
109 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
110 - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
111 - vld1.32 {xacc2lo[1]}, [TMP1]
112 - vld1.32 {xacc2hi[1]}, [TMP2]
113 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
114 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
115 + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
116 + vld1.32 {\xacc2lo[1]}, [TMP1]
117 + vld1.32 {\xacc2hi[1]}, [TMP2]
118 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
119 mov TMP1, X, asr #16
120 add X, X, UX
121 add TMP1, TOP, TMP1, asl #1
122 mov TMP2, X, asr #16
123 add X, X, UX
124 add TMP2, TOP, TMP2, asl #1
125 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
126 - vzip.u8 xreg1, xreg3
127 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
128 - vzip.u8 xreg2, xreg4
129 - vld1.32 {yacc2lo[1]}, [TMP1]
130 - vzip.u8 xreg3, xreg4
131 - vld1.32 {yacc2hi[1]}, [TMP2]
132 - vzip.u8 xreg1, xreg2
133 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
134 - vmull.u8 xacc1, xreg1, d28
135 - vzip.u8 yreg1, yreg3
136 - vmlal.u8 xacc1, xreg2, d29
137 - vzip.u8 yreg2, yreg4
138 - vmull.u8 xacc2, xreg3, d28
139 - vzip.u8 yreg3, yreg4
140 - vmlal.u8 xacc2, xreg4, d29
141 - vzip.u8 yreg1, yreg2
142 - vmull.u8 yacc1, yreg1, d28
143 - vmlal.u8 yacc1, yreg2, d29
144 - vmull.u8 yacc2, yreg3, d28
145 - vmlal.u8 yacc2, yreg4, d29
146 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
147 + vzip.u8 \xreg1, \xreg3
148 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
149 + vzip.u8 \xreg2, \xreg4
150 + vld1.32 {\yacc2lo[1]}, [TMP1]
151 + vzip.u8 \xreg3, \xreg4
152 + vld1.32 {\yacc2hi[1]}, [TMP2]
153 + vzip.u8 \xreg1, \xreg2
154 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
155 + vmull.u8 \xacc1, \xreg1, d28
156 + vzip.u8 \yreg1, \yreg3
157 + vmlal.u8 \xacc1, \xreg2, d29
158 + vzip.u8 \yreg2, \yreg4
159 + vmull.u8 \xacc2, \xreg3, d28
160 + vzip.u8 \yreg3, \yreg4
161 + vmlal.u8 \xacc2, \xreg4, d29
162 + vzip.u8 \yreg1, \yreg2
163 + vmull.u8 \yacc1, \yreg1, d28
164 + vmlal.u8 \yacc1, \yreg2, d29
165 + vmull.u8 \yacc2, \yreg3, d28
166 + vmlal.u8 \yacc2, \yreg4, d29
167 .endm
169 .macro bilinear_store_8888 numpix, tmp1, tmp2
170 -.if numpix == 4
171 +.if \numpix == 4
172 vst1.32 {d0, d1}, [OUT]!
173 -.elseif numpix == 2
174 +.elseif \numpix == 2
175 vst1.32 {d0}, [OUT]!
176 -.elseif numpix == 1
177 +.elseif \numpix == 1
178 vst1.32 {d0[0]}, [OUT, :32]!
179 .else
180 .error bilinear_store_8888 numpix is unsupported
181 .endif
182 .endm
184 .macro bilinear_store_0565 numpix, tmp1, tmp2
185 vuzp.u8 d0, d1
186 vuzp.u8 d2, d3
187 vuzp.u8 d1, d3
188 vuzp.u8 d0, d2
189 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
190 -.if numpix == 4
191 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
192 +.if \numpix == 4
193 vst1.16 {d2}, [OUT]!
194 -.elseif numpix == 2
195 +.elseif \numpix == 2
196 vst1.32 {d2[0]}, [OUT]!
197 -.elseif numpix == 1
198 +.elseif \numpix == 1
199 vst1.16 {d2[0]}, [OUT]!
200 .else
201 .error bilinear_store_0565 numpix is unsupported
202 .endif
203 .endm
207 * Macros for loading mask pixels into register 'mask'.
208 * vdup must be done in somewhere else.
210 .macro bilinear_load_mask_x numpix, mask
211 .endm
213 .macro bilinear_load_mask_8 numpix, mask
214 -.if numpix == 4
215 - vld1.32 {mask[0]}, [MASK]!
216 -.elseif numpix == 2
217 - vld1.16 {mask[0]}, [MASK]!
218 -.elseif numpix == 1
219 - vld1.8 {mask[0]}, [MASK]!
220 +.if \numpix == 4
221 + vld1.32 {\mask[0]}, [MASK]!
222 +.elseif \numpix == 2
223 + vld1.16 {\mask[0]}, [MASK]!
224 +.elseif \numpix == 1
225 + vld1.8 {\mask[0]}, [MASK]!
226 .else
227 - .error bilinear_load_mask_8 numpix is unsupported
228 + .error bilinear_load_mask_8 \numpix is unsupported
229 .endif
230 pld [MASK, #prefetch_offset]
231 .endm
233 .macro bilinear_load_mask mask_fmt, numpix, mask
234 - bilinear_load_mask_&mask_fmt numpix, mask
235 + bilinear_load_mask_\()\mask_fmt \numpix, \mask
236 .endm
240 * Macros for loading destination pixels into register 'dst0' and 'dst1'.
241 * Interleave should be done somewhere else.
243 .macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
244 .endm
246 .macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
247 .endm
249 .macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
250 -.if numpix == 4
251 - vld1.32 {dst0, dst1}, [OUT]
252 -.elseif numpix == 2
253 - vld1.32 {dst0}, [OUT]
254 -.elseif numpix == 1
255 - vld1.32 {dst0[0]}, [OUT]
256 +.if \numpix == 4
257 + vld1.32 {\dst0, \dst1}, [OUT]
258 +.elseif \numpix == 2
259 + vld1.32 {\dst0}, [OUT]
260 +.elseif \numpix == 1
261 + vld1.32 {\dst0[0]}, [OUT]
262 .else
263 - .error bilinear_load_dst_8888 numpix is unsupported
264 + .error bilinear_load_dst_8888 \numpix is unsupported
265 .endif
266 pld [OUT, #(prefetch_offset * 4)]
267 .endm
269 .macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
270 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
271 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
272 .endm
274 .macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
275 - bilinear_load_dst_8888 numpix, dst0, dst1, dst01
276 + bilinear_load_dst_8888 \numpix, \dst0, \dst1, \dst01
277 .endm
279 .macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
280 - bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
281 + bilinear_load_dst_\()\dst_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
282 .endm
285 * Macros for duplicating partially loaded mask to fill entire register.
286 * We will apply mask to interleaved source pixels, that is
287 * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
288 * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
289 * So, we need to duplicate loaded mask into whole register.
290 @@ -285,79 +285,79 @@
291 * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
292 * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
293 * We can do some optimizations for this including last pixel cases.
295 .macro bilinear_duplicate_mask_x numpix, mask
296 .endm
298 .macro bilinear_duplicate_mask_8 numpix, mask
299 -.if numpix == 4
300 - vdup.32 mask, mask[0]
301 -.elseif numpix == 2
302 - vdup.16 mask, mask[0]
303 -.elseif numpix == 1
304 - vdup.8 mask, mask[0]
305 +.if \numpix == 4
306 + vdup.32 \mask, \mask[0]
307 +.elseif \numpix == 2
308 + vdup.16 \mask, \mask[0]
309 +.elseif \numpix == 1
310 + vdup.8 \mask, \mask[0]
311 .else
312 .error bilinear_duplicate_mask_8 is unsupported
313 .endif
314 .endm
316 .macro bilinear_duplicate_mask mask_fmt, numpix, mask
317 - bilinear_duplicate_mask_&mask_fmt numpix, mask
318 + bilinear_duplicate_mask_\()\mask_fmt \numpix, \mask
319 .endm
322 * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
323 * Interleave should be done when maks is enabled or operator is 'over'.
325 .macro bilinear_interleave src0, src1, dst0, dst1
326 - vuzp.8 src0, src1
327 - vuzp.8 dst0, dst1
328 - vuzp.8 src0, src1
329 - vuzp.8 dst0, dst1
330 + vuzp.8 \src0, \src1
331 + vuzp.8 \dst0, \dst1
332 + vuzp.8 \src0, \src1
333 + vuzp.8 \dst0, \dst1
334 .endm
336 .macro bilinear_interleave_src_dst_x_src \
337 numpix, src0, src1, src01, dst0, dst1, dst01
338 .endm
340 .macro bilinear_interleave_src_dst_x_over \
341 numpix, src0, src1, src01, dst0, dst1, dst01
343 - bilinear_interleave src0, src1, dst0, dst1
344 + bilinear_interleave \src0, \src1, \dst0, \dst1
345 .endm
347 .macro bilinear_interleave_src_dst_x_add \
348 numpix, src0, src1, src01, dst0, dst1, dst01
349 .endm
351 .macro bilinear_interleave_src_dst_8_src \
352 numpix, src0, src1, src01, dst0, dst1, dst01
354 - bilinear_interleave src0, src1, dst0, dst1
355 + bilinear_interleave \src0, \src1, \dst0, \dst1
356 .endm
358 .macro bilinear_interleave_src_dst_8_over \
359 numpix, src0, src1, src01, dst0, dst1, dst01
361 - bilinear_interleave src0, src1, dst0, dst1
362 + bilinear_interleave \src0, \src1, \dst0, \dst1
363 .endm
365 .macro bilinear_interleave_src_dst_8_add \
366 numpix, src0, src1, src01, dst0, dst1, dst01
368 - bilinear_interleave src0, src1, dst0, dst1
369 + bilinear_interleave \src0, \src1, \dst0, \dst1
370 .endm
372 .macro bilinear_interleave_src_dst \
373 mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
375 - bilinear_interleave_src_dst_&mask_fmt&_&op \
376 - numpix, src0, src1, src01, dst0, dst1, dst01
377 + bilinear_interleave_src_dst_\()\mask_fmt\()_\()\op \
378 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01
379 .endm
383 * Macros for applying masks to src pixels. (see combine_mask_u() function)
384 * src, dst should be in interleaved form.
385 * mask register should be in form (m0, m1, m2, m3).
387 @@ -365,217 +365,217 @@
388 numpix, src0, src1, src01, mask, \
389 tmp01, tmp23, tmp45, tmp67
390 .endm
392 .macro bilinear_apply_mask_to_src_8 \
393 numpix, src0, src1, src01, mask, \
394 tmp01, tmp23, tmp45, tmp67
396 - vmull.u8 tmp01, src0, mask
397 - vmull.u8 tmp23, src1, mask
398 + vmull.u8 \tmp01, \src0, \mask
399 + vmull.u8 \tmp23, \src1, \mask
400 /* bubbles */
401 - vrshr.u16 tmp45, tmp01, #8
402 - vrshr.u16 tmp67, tmp23, #8
403 + vrshr.u16 \tmp45, \tmp01, #8
404 + vrshr.u16 \tmp67, \tmp23, #8
405 /* bubbles */
406 - vraddhn.u16 src0, tmp45, tmp01
407 - vraddhn.u16 src1, tmp67, tmp23
408 + vraddhn.u16 \src0, \tmp45, \tmp01
409 + vraddhn.u16 \src1, \tmp67, \tmp23
410 .endm
412 .macro bilinear_apply_mask_to_src \
413 mask_fmt, numpix, src0, src1, src01, mask, \
414 tmp01, tmp23, tmp45, tmp67
416 - bilinear_apply_mask_to_src_&mask_fmt \
417 - numpix, src0, src1, src01, mask, \
418 - tmp01, tmp23, tmp45, tmp67
419 + bilinear_apply_mask_to_src_\()\mask_fmt \
420 + \numpix, \src0, \src1, \src01, \mask, \
421 + \tmp01, \tmp23, \tmp45, \tmp67
422 .endm
426 * Macros for combining src and destination pixels.
427 * Interleave or not is depending on operator 'op'.
429 .macro bilinear_combine_src \
430 numpix, src0, src1, src01, dst0, dst1, dst01, \
431 tmp01, tmp23, tmp45, tmp67, tmp8
432 .endm
434 .macro bilinear_combine_over \
435 numpix, src0, src1, src01, dst0, dst1, dst01, \
436 tmp01, tmp23, tmp45, tmp67, tmp8
438 - vdup.32 tmp8, src1[1]
439 + vdup.32 \tmp8, \src1[1]
440 /* bubbles */
441 - vmvn.8 tmp8, tmp8
442 + vmvn.8 \tmp8, \tmp8
443 /* bubbles */
444 - vmull.u8 tmp01, dst0, tmp8
445 + vmull.u8 \tmp01, \dst0, \tmp8
446 /* bubbles */
447 - vmull.u8 tmp23, dst1, tmp8
448 + vmull.u8 \tmp23, \dst1, \tmp8
449 /* bubbles */
450 - vrshr.u16 tmp45, tmp01, #8
451 - vrshr.u16 tmp67, tmp23, #8
452 + vrshr.u16 \tmp45, \tmp01, #8
453 + vrshr.u16 \tmp67, \tmp23, #8
454 /* bubbles */
455 - vraddhn.u16 dst0, tmp45, tmp01
456 - vraddhn.u16 dst1, tmp67, tmp23
457 + vraddhn.u16 \dst0, \tmp45, \tmp01
458 + vraddhn.u16 \dst1, \tmp67, \tmp23
459 /* bubbles */
460 - vqadd.u8 src01, dst01, src01
461 + vqadd.u8 \src01, \dst01, \src01
462 .endm
464 .macro bilinear_combine_add \
465 numpix, src0, src1, src01, dst0, dst1, dst01, \
466 tmp01, tmp23, tmp45, tmp67, tmp8
468 - vqadd.u8 src01, dst01, src01
469 + vqadd.u8 \src01, \dst01, \src01
470 .endm
472 .macro bilinear_combine \
473 op, numpix, src0, src1, src01, dst0, dst1, dst01, \
474 tmp01, tmp23, tmp45, tmp67, tmp8
476 - bilinear_combine_&op \
477 - numpix, src0, src1, src01, dst0, dst1, dst01, \
478 - tmp01, tmp23, tmp45, tmp67, tmp8
479 + bilinear_combine_\()\op \
480 + \numpix, \src0, \src1, \src01, \dst0, \dst1, \dst01, \
481 + \tmp01, \tmp23, \tmp45, \tmp67, \tmp8
482 .endm
485 * Macros for final deinterleaving of destination pixels if needed.
487 .macro bilinear_deinterleave numpix, dst0, dst1, dst01
488 - vuzp.8 dst0, dst1
489 + vuzp.8 \dst0, \dst1
490 /* bubbles */
491 - vuzp.8 dst0, dst1
492 + vuzp.8 \dst0, \dst1
493 .endm
495 .macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
496 .endm
498 .macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
499 - bilinear_deinterleave numpix, dst0, dst1, dst01
500 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
501 .endm
503 .macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
504 .endm
506 .macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
507 - bilinear_deinterleave numpix, dst0, dst1, dst01
508 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
509 .endm
511 .macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
512 - bilinear_deinterleave numpix, dst0, dst1, dst01
513 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
514 .endm
516 .macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
517 - bilinear_deinterleave numpix, dst0, dst1, dst01
518 + bilinear_deinterleave \numpix, \dst0, \dst1, \dst01
519 .endm
521 .macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
522 - bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
523 + bilinear_deinterleave_dst_\()\mask_fmt\()_\()\op \numpix, \dst0, \dst1, \dst01
524 .endm
527 .macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
528 - bilinear_load_&src_fmt d0, d1, d2
529 - bilinear_load_mask mask_fmt, 1, d4
530 - bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
531 + bilinear_load_\()\src_fmt d0, d1, d2
532 + bilinear_load_mask \mask_fmt, 1, d4
533 + bilinear_load_dst \dst_fmt, \op, 1, d18, d19, q9
534 vmull.u8 q1, d0, d28
535 vmlal.u8 q1, d1, d29
536 /* 5 cycles bubble */
537 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
538 vmlsl.u16 q0, d2, d30
539 vmlal.u16 q0, d3, d30
540 /* 5 cycles bubble */
541 - bilinear_duplicate_mask mask_fmt, 1, d4
542 + bilinear_duplicate_mask \mask_fmt, 1, d4
543 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
544 /* 3 cycles bubble */
545 vmovn.u16 d0, q0
546 /* 1 cycle bubble */
547 bilinear_interleave_src_dst \
548 - mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
549 + \mask_fmt, \op, 1, d0, d1, q0, d18, d19, q9
550 bilinear_apply_mask_to_src \
551 - mask_fmt, 1, d0, d1, q0, d4, \
552 + \mask_fmt, 1, d0, d1, q0, d4, \
553 q3, q8, q10, q11
554 bilinear_combine \
555 - op, 1, d0, d1, q0, d18, d19, q9, \
556 + \op, 1, d0, d1, q0, d18, d19, q9, \
557 q3, q8, q10, q11, d5
558 - bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
559 - bilinear_store_&dst_fmt 1, q2, q3
560 + bilinear_deinterleave_dst \mask_fmt, \op, 1, d0, d1, q0
561 + bilinear_store_\()\dst_fmt 1, q2, q3
562 .endm
564 .macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
565 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
566 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
567 q1, q11, d0, d1, d20, d21, d22, d23
568 - bilinear_load_mask mask_fmt, 2, d4
569 - bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
570 + bilinear_load_mask \mask_fmt, 2, d4
571 + bilinear_load_dst \dst_fmt, \op, 2, d18, d19, q9
572 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
573 vmlsl.u16 q0, d2, d30
574 vmlal.u16 q0, d3, d30
575 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
576 vmlsl.u16 q10, d22, d31
577 vmlal.u16 q10, d23, d31
578 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
579 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
580 - bilinear_duplicate_mask mask_fmt, 2, d4
581 + bilinear_duplicate_mask \mask_fmt, 2, d4
582 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
583 vadd.u16 q12, q12, q13
584 vmovn.u16 d0, q0
585 bilinear_interleave_src_dst \
586 - mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
587 + \mask_fmt, \op, 2, d0, d1, q0, d18, d19, q9
588 bilinear_apply_mask_to_src \
589 - mask_fmt, 2, d0, d1, q0, d4, \
590 + \mask_fmt, 2, d0, d1, q0, d4, \
591 q3, q8, q10, q11
592 bilinear_combine \
593 - op, 2, d0, d1, q0, d18, d19, q9, \
594 + \op, 2, d0, d1, q0, d18, d19, q9, \
595 q3, q8, q10, q11, d5
596 - bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
597 - bilinear_store_&dst_fmt 2, q2, q3
598 + bilinear_deinterleave_dst \mask_fmt, \op, 2, d0, d1, q0
599 + bilinear_store_\()\dst_fmt 2, q2, q3
600 .endm
602 .macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
603 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
604 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
605 q1, q11, d0, d1, d20, d21, d22, d23 \
606 q3, q9, d4, d5, d16, d17, d18, d19
607 pld [TMP1, PF_OFFS]
608 sub TMP1, TMP1, STRIDE
609 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
610 vmlsl.u16 q0, d2, d30
611 vmlal.u16 q0, d3, d30
612 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
613 vmlsl.u16 q10, d22, d31
614 vmlal.u16 q10, d23, d31
615 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
616 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
617 vmlsl.u16 q2, d6, d30
618 vmlal.u16 q2, d7, d30
619 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
620 - bilinear_load_mask mask_fmt, 4, d22
621 - bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
622 + bilinear_load_mask \mask_fmt, 4, d22
623 + bilinear_load_dst \dst_fmt, \op, 4, d2, d3, q1
624 pld [TMP1, PF_OFFS]
625 vmlsl.u16 q8, d18, d31
626 vmlal.u16 q8, d19, d31
627 vadd.u16 q12, q12, q13
628 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
629 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
630 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
631 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
632 - bilinear_duplicate_mask mask_fmt, 4, d22
633 + bilinear_duplicate_mask \mask_fmt, 4, d22
634 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
635 vmovn.u16 d0, q0
636 vmovn.u16 d1, q2
637 vadd.u16 q12, q12, q13
638 bilinear_interleave_src_dst \
639 - mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
640 + \mask_fmt, \op, 4, d0, d1, q0, d2, d3, q1
641 bilinear_apply_mask_to_src \
642 - mask_fmt, 4, d0, d1, q0, d22, \
643 + \mask_fmt, 4, d0, d1, q0, d22, \
644 q3, q8, q9, q10
645 bilinear_combine \
646 - op, 4, d0, d1, q0, d2, d3, q1, \
647 + \op, 4, d0, d1, q0, d2, d3, q1, \
648 q3, q8, q9, q10, d23
649 - bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
650 - bilinear_store_&dst_fmt 4, q2, q3
651 + bilinear_deinterleave_dst \mask_fmt, \op, 4, d0, d1, q0
652 + bilinear_store_\()\dst_fmt 4, q2, q3
653 .endm
655 .set BILINEAR_FLAG_USE_MASK, 1
656 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
659 * Main template macro for generating NEON optimized bilinear scanline functions.
661 @@ -605,24 +605,24 @@
662 bilinear_process_four_pixels, \
663 bilinear_process_pixblock_head, \
664 bilinear_process_pixblock_tail, \
665 bilinear_process_pixblock_tail_head, \
666 pixblock_size, \
667 prefetch_distance, \
668 flags
670 -pixman_asm_function fname
671 -.if pixblock_size == 8
672 -.elseif pixblock_size == 4
673 +pixman_asm_function \fname
674 +.if \pixblock_size == 8
675 +.elseif \pixblock_size == 4
676 .else
677 .error unsupported pixblock size
678 .endif
680 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
681 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
682 OUT .req r0
683 TOP .req r1
684 BOTTOM .req r2
685 WT .req r3
686 WB .req r4
687 X .req r5
688 UX .req r6
689 WIDTH .req ip
690 @@ -630,17 +630,17 @@ pixman_asm_function fname
691 TMP2 .req r4
692 PF_OFFS .req r7
693 TMP3 .req r8
694 TMP4 .req r9
695 STRIDE .req r2
697 mov ip, sp
698 push {r4, r5, r6, r7, r8, r9}
699 - mov PF_OFFS, #prefetch_distance
700 + mov PF_OFFS, #\prefetch_distance
701 ldmia ip, {WB, X, UX, WIDTH}
702 .else
703 OUT .req r0
704 MASK .req r1
705 TOP .req r2
706 BOTTOM .req r3
707 WT .req r4
708 WB .req r5
709 @@ -649,27 +649,27 @@ pixman_asm_function fname
710 WIDTH .req ip
711 TMP1 .req r4
712 TMP2 .req r5
713 PF_OFFS .req r8
714 TMP3 .req r9
715 TMP4 .req r10
716 STRIDE .req r3
718 - .set prefetch_offset, prefetch_distance
719 + .set prefetch_offset, \prefetch_distance
721 mov ip, sp
722 push {r4, r5, r6, r7, r8, r9, r10, ip}
723 - mov PF_OFFS, #prefetch_distance
724 + mov PF_OFFS, #\prefetch_distance
725 ldmia ip, {WT, WB, X, UX, WIDTH}
726 .endif
728 mul PF_OFFS, PF_OFFS, UX
730 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
731 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
732 vpush {d8-d15}
733 .endif
735 sub STRIDE, BOTTOM, TOP
736 .unreq BOTTOM
738 cmp WIDTH, #0
739 ble 3f
740 @@ -678,76 +678,76 @@ pixman_asm_function fname
741 vdup.u16 q13, UX
742 vdup.u8 d28, WT
743 vdup.u8 d29, WB
744 vadd.u16 d25, d25, d26
746 /* ensure good destination alignment */
747 cmp WIDTH, #1
748 blt 0f
749 - tst OUT, #(1 << dst_bpp_shift)
750 + tst OUT, #(1 << \dst_bpp_shift)
751 beq 0f
752 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
753 vadd.u16 q12, q12, q13
754 - bilinear_process_last_pixel
755 + \bilinear_process_last_pixel
756 sub WIDTH, WIDTH, #1
758 vadd.u16 q13, q13, q13
759 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
760 vadd.u16 q12, q12, q13
762 cmp WIDTH, #2
763 blt 0f
764 - tst OUT, #(1 << (dst_bpp_shift + 1))
765 + tst OUT, #(1 << (\dst_bpp_shift + 1))
766 beq 0f
767 - bilinear_process_two_pixels
768 + \bilinear_process_two_pixels
769 sub WIDTH, WIDTH, #2
771 -.if pixblock_size == 8
772 +.if \pixblock_size == 8
773 cmp WIDTH, #4
774 blt 0f
775 - tst OUT, #(1 << (dst_bpp_shift + 2))
776 + tst OUT, #(1 << (\dst_bpp_shift + 2))
777 beq 0f
778 - bilinear_process_four_pixels
779 + \bilinear_process_four_pixels
780 sub WIDTH, WIDTH, #4
782 .endif
783 - subs WIDTH, WIDTH, #pixblock_size
784 + subs WIDTH, WIDTH, #\pixblock_size
785 blt 1f
786 - mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
787 - bilinear_process_pixblock_head
788 - subs WIDTH, WIDTH, #pixblock_size
789 + mov PF_OFFS, PF_OFFS, asr #(16 - \src_bpp_shift)
790 + \bilinear_process_pixblock_head
791 + subs WIDTH, WIDTH, #\pixblock_size
792 blt 5f
794 - bilinear_process_pixblock_tail_head
795 - subs WIDTH, WIDTH, #pixblock_size
796 + \bilinear_process_pixblock_tail_head
797 + subs WIDTH, WIDTH, #\pixblock_size
798 bge 0b
800 - bilinear_process_pixblock_tail
801 + \bilinear_process_pixblock_tail
803 -.if pixblock_size == 8
804 +.if \pixblock_size == 8
805 tst WIDTH, #4
806 beq 2f
807 - bilinear_process_four_pixels
808 + \bilinear_process_four_pixels
810 .endif
811 /* handle the remaining trailing pixels */
812 tst WIDTH, #2
813 beq 2f
814 - bilinear_process_two_pixels
815 + \bilinear_process_two_pixels
817 tst WIDTH, #1
818 beq 3f
819 - bilinear_process_last_pixel
820 + \bilinear_process_last_pixel
822 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
823 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
824 vpop {d8-d15}
825 .endif
827 -.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
828 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) == 0
829 pop {r4, r5, r6, r7, r8, r9}
830 .else
831 pop {r4, r5, r6, r7, r8, r9, r10, ip}
832 .endif
833 bx lr
835 .unreq OUT
836 .unreq TOP
837 @@ -757,21 +757,21 @@ 3:
838 .unreq UX
839 .unreq WIDTH
840 .unreq TMP1
841 .unreq TMP2
842 .unreq PF_OFFS
843 .unreq TMP3
844 .unreq TMP4
845 .unreq STRIDE
846 -.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
847 +.if ((\flags) & BILINEAR_FLAG_USE_MASK) != 0
848 .unreq MASK
849 .endif
851 -.endfunc
852 +pixman_end_asm_function
854 .endm
856 /* src_8888_8_8888 */
857 .macro bilinear_src_8888_8_8888_process_last_pixel
858 bilinear_interpolate_last_pixel 8888, 8, 8888, src
859 .endm
861 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
862 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
863 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.S
864 @@ -29,16 +29,22 @@
865 * (those which are exposing some new or interesting features) are
866 * extensively commented and can be used as examples.
868 * You may want to have a look at the comments for following functions:
869 * - pixman_composite_over_8888_0565_asm_neon
870 * - pixman_composite_over_n_8_0565_asm_neon
873 +#ifdef __clang__
874 +#define ldrgeb ldrbge
875 +#define subges subsge
876 +#define subpls subspl
877 +#endif
879 /* Prevent the stack from becoming executable for no reason... */
880 #if defined(__linux__) && defined(__ELF__)
881 .section .note.GNU-stack,"",%progbits
882 #endif
884 .text
885 .fpu neon
886 .arch armv7a
887 @@ -255,43 +261,43 @@
888 vqadd.u8 d16, d2, d20
889 vld1.16 {d4, d5}, [DST_R, :128]!
890 vqadd.u8 q9, q0, q11
891 vshrn.u16 d6, q2, #8
892 fetch_src_pixblock
893 vshrn.u16 d7, q2, #3
894 vsli.u16 q2, q2, #5
895 vshll.u8 q14, d16, #8
896 - PF add PF_X, PF_X, #8
897 + PF add, PF_X, PF_X, #8
898 vshll.u8 q8, d19, #8
899 - PF tst PF_CTL, #0xF
900 + PF tst, PF_CTL, #0xF
901 vsri.u8 d6, d6, #5
902 - PF addne PF_X, PF_X, #8
903 + PF addne, PF_X, PF_X, #8
904 vmvn.8 d3, d3
905 - PF subne PF_CTL, PF_CTL, #1
906 + PF subne, PF_CTL, PF_CTL, #1
907 vsri.u8 d7, d7, #6
908 vshrn.u16 d30, q2, #2
909 vmull.u8 q10, d3, d6
910 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
911 vmull.u8 q11, d3, d7
912 vmull.u8 q12, d3, d30
913 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
914 vsri.u16 q14, q8, #5
915 - PF cmp PF_X, ORIG_W
916 + PF cmp, PF_X, ORIG_W
917 vshll.u8 q9, d18, #8
918 vrshr.u16 q13, q10, #8
919 - PF subge PF_X, PF_X, ORIG_W
920 + PF subge, PF_X, PF_X, ORIG_W
921 vrshr.u16 q3, q11, #8
922 vrshr.u16 q15, q12, #8
923 - PF subges PF_CTL, PF_CTL, #0x10
924 + PF subges, PF_CTL, PF_CTL, #0x10
925 vsri.u16 q14, q9, #11
926 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
927 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
928 vraddhn.u16 d20, q10, q13
929 vraddhn.u16 d23, q11, q3
930 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
931 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
932 vraddhn.u16 d22, q12, q15
933 vst1.16 {d28, d29}, [DST_W, :128]!
934 .endm
936 #else
938 /* If we did not care much about the performance, we would just use this... */
939 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
940 @@ -429,30 +435,30 @@ generate_composite_function \
942 .macro pixman_composite_src_8888_0565_process_pixblock_tail
943 vsri.u16 q14, q8, #5
944 vsri.u16 q14, q9, #11
945 .endm
947 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
948 vsri.u16 q14, q8, #5
949 - PF add PF_X, PF_X, #8
950 - PF tst PF_CTL, #0xF
951 + PF add, PF_X, PF_X, #8
952 + PF tst, PF_CTL, #0xF
953 fetch_src_pixblock
954 - PF addne PF_X, PF_X, #8
955 - PF subne PF_CTL, PF_CTL, #1
956 + PF addne, PF_X, PF_X, #8
957 + PF subne, PF_CTL, PF_CTL, #1
958 vsri.u16 q14, q9, #11
959 - PF cmp PF_X, ORIG_W
960 + PF cmp, PF_X, ORIG_W
961 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
962 vshll.u8 q8, d1, #8
963 vst1.16 {d28, d29}, [DST_W, :128]!
964 - PF subge PF_X, PF_X, ORIG_W
965 - PF subges PF_CTL, PF_CTL, #0x10
966 + PF subge, PF_X, PF_X, ORIG_W
967 + PF subges, PF_CTL, PF_CTL, #0x10
968 vshll.u8 q14, d2, #8
969 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
970 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
971 vshll.u8 q9, d0, #8
972 .endm
974 generate_composite_function \
975 pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
976 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
977 8, /* number of pixels, processed in a single block */ \
978 10, /* prefetch distance */ \
979 @@ -504,30 +510,30 @@ generate_composite_function \
980 vqadd.u8 q15, q1, q3
981 .endm
983 .macro pixman_composite_add_8_8_process_pixblock_tail
984 .endm
986 .macro pixman_composite_add_8_8_process_pixblock_tail_head
987 fetch_src_pixblock
988 - PF add PF_X, PF_X, #32
989 - PF tst PF_CTL, #0xF
990 + PF add, PF_X, PF_X, #32
991 + PF tst, PF_CTL, #0xF
992 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
993 - PF addne PF_X, PF_X, #32
994 - PF subne PF_CTL, PF_CTL, #1
995 + PF addne, PF_X, PF_X, #32
996 + PF subne, PF_CTL, PF_CTL, #1
997 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
998 - PF cmp PF_X, ORIG_W
999 + PF cmp, PF_X, ORIG_W
1000 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1001 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1002 - PF subge PF_X, PF_X, ORIG_W
1003 - PF subges PF_CTL, PF_CTL, #0x10
1004 + PF subge, PF_X, PF_X, ORIG_W
1005 + PF subges, PF_CTL, PF_CTL, #0x10
1006 vqadd.u8 q14, q0, q2
1007 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1008 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1009 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1010 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1011 vqadd.u8 q15, q1, q3
1012 .endm
1014 generate_composite_function \
1015 pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
1016 FLAG_DST_READWRITE, \
1017 32, /* number of pixels, processed in a single block */ \
1018 10, /* prefetch distance */ \
1019 @@ -536,30 +542,30 @@ generate_composite_function \
1020 pixman_composite_add_8_8_process_pixblock_head, \
1021 pixman_composite_add_8_8_process_pixblock_tail, \
1022 pixman_composite_add_8_8_process_pixblock_tail_head
1024 /******************************************************************************/
1026 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
1027 fetch_src_pixblock
1028 - PF add PF_X, PF_X, #8
1029 - PF tst PF_CTL, #0xF
1030 + PF add, PF_X, PF_X, #8
1031 + PF tst, PF_CTL, #0xF
1032 vld1.32 {d4, d5, d6, d7}, [DST_R, :128]!
1033 - PF addne PF_X, PF_X, #8
1034 - PF subne PF_CTL, PF_CTL, #1
1035 + PF addne, PF_X, PF_X, #8
1036 + PF subne, PF_CTL, PF_CTL, #1
1037 vst1.32 {d28, d29, d30, d31}, [DST_W, :128]!
1038 - PF cmp PF_X, ORIG_W
1039 + PF cmp, PF_X, ORIG_W
1040 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1041 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1042 - PF subge PF_X, PF_X, ORIG_W
1043 - PF subges PF_CTL, PF_CTL, #0x10
1044 + PF subge, PF_X, PF_X, ORIG_W
1045 + PF subges, PF_CTL, PF_CTL, #0x10
1046 vqadd.u8 q14, q0, q2
1047 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1048 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1049 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1050 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1051 vqadd.u8 q15, q1, q3
1052 .endm
1054 generate_composite_function \
1055 pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
1056 FLAG_DST_READWRITE, \
1057 8, /* number of pixels, processed in a single block */ \
1058 10, /* prefetch distance */ \
1059 @@ -599,40 +605,40 @@ generate_composite_function_single_scanl
1060 vraddhn.u16 d29, q15, q9
1061 vraddhn.u16 d30, q12, q10
1062 vraddhn.u16 d31, q13, q11
1063 .endm
1065 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
1066 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1067 vrshr.u16 q14, q8, #8
1068 - PF add PF_X, PF_X, #8
1069 - PF tst PF_CTL, #0xF
1070 + PF add, PF_X, PF_X, #8
1071 + PF tst, PF_CTL, #0xF
1072 vrshr.u16 q15, q9, #8
1073 vrshr.u16 q12, q10, #8
1074 vrshr.u16 q13, q11, #8
1075 - PF addne PF_X, PF_X, #8
1076 - PF subne PF_CTL, PF_CTL, #1
1077 + PF addne, PF_X, PF_X, #8
1078 + PF subne, PF_CTL, PF_CTL, #1
1079 vraddhn.u16 d28, q14, q8
1080 vraddhn.u16 d29, q15, q9
1081 - PF cmp PF_X, ORIG_W
1082 + PF cmp, PF_X, ORIG_W
1083 vraddhn.u16 d30, q12, q10
1084 vraddhn.u16 d31, q13, q11
1085 fetch_src_pixblock
1086 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1087 vmvn.8 d22, d3
1088 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1089 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1090 - PF subge PF_X, PF_X, ORIG_W
1091 + PF subge, PF_X, PF_X, ORIG_W
1092 vmull.u8 q8, d22, d4
1093 - PF subges PF_CTL, PF_CTL, #0x10
1094 + PF subsge, PF_CTL, PF_CTL, #0x10
1095 vmull.u8 q9, d22, d5
1096 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1097 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1098 vmull.u8 q10, d22, d6
1099 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1100 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1101 vmull.u8 q11, d22, d7
1102 .endm
1104 generate_composite_function_single_scanline \
1105 pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
1106 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1107 8, /* number of pixels, processed in a single block */ \
1108 default_init, \
1109 @@ -651,42 +657,42 @@ generate_composite_function_single_scanl
1110 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
1111 vqadd.u8 q14, q0, q14
1112 vqadd.u8 q15, q1, q15
1113 .endm
1115 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
1116 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1117 vrshr.u16 q14, q8, #8
1118 - PF add PF_X, PF_X, #8
1119 - PF tst PF_CTL, #0xF
1120 + PF add, PF_X, PF_X, #8
1121 + PF tst, PF_CTL, #0xF
1122 vrshr.u16 q15, q9, #8
1123 vrshr.u16 q12, q10, #8
1124 vrshr.u16 q13, q11, #8
1125 - PF addne PF_X, PF_X, #8
1126 - PF subne PF_CTL, PF_CTL, #1
1127 + PF addne, PF_X, PF_X, #8
1128 + PF subne, PF_CTL, PF_CTL, #1
1129 vraddhn.u16 d28, q14, q8
1130 vraddhn.u16 d29, q15, q9
1131 - PF cmp PF_X, ORIG_W
1132 + PF cmp, PF_X, ORIG_W
1133 vraddhn.u16 d30, q12, q10
1134 vraddhn.u16 d31, q13, q11
1135 vqadd.u8 q14, q0, q14
1136 vqadd.u8 q15, q1, q15
1137 fetch_src_pixblock
1138 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1139 vmvn.8 d22, d3
1140 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1141 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1142 - PF subge PF_X, PF_X, ORIG_W
1143 + PF subge, PF_X, PF_X, ORIG_W
1144 vmull.u8 q8, d22, d4
1145 - PF subges PF_CTL, PF_CTL, #0x10
1146 + PF subges, PF_CTL, PF_CTL, #0x10
1147 vmull.u8 q9, d22, d5
1148 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1149 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1150 vmull.u8 q10, d22, d6
1151 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1152 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1153 vmull.u8 q11, d22, d7
1154 .endm
1156 generate_composite_function \
1157 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
1158 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1159 8, /* number of pixels, processed in a single block */ \
1160 5, /* prefetch distance */ \
1161 @@ -737,30 +743,30 @@ generate_composite_function_single_scanl
1162 vrshr.u16 q2, q10, #8
1163 vrshr.u16 q3, q11, #8
1164 vraddhn.u16 d28, q14, q8
1165 vraddhn.u16 d29, q15, q9
1166 vraddhn.u16 d30, q2, q10
1167 vraddhn.u16 d31, q3, q11
1168 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1169 vqadd.u8 q14, q0, q14
1170 - PF add PF_X, PF_X, #8
1171 - PF tst PF_CTL, #0x0F
1172 - PF addne PF_X, PF_X, #8
1173 - PF subne PF_CTL, PF_CTL, #1
1174 + PF add, PF_X, PF_X, #8
1175 + PF tst, PF_CTL, #0x0F
1176 + PF addne, PF_X, PF_X, #8
1177 + PF subne, PF_CTL, PF_CTL, #1
1178 vqadd.u8 q15, q1, q15
1179 - PF cmp PF_X, ORIG_W
1180 + PF cmp, PF_X, ORIG_W
1181 vmull.u8 q8, d24, d4
1182 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1183 vmull.u8 q9, d24, d5
1184 - PF subge PF_X, PF_X, ORIG_W
1185 + PF subge, PF_X, PF_X, ORIG_W
1186 vmull.u8 q10, d24, d6
1187 - PF subges PF_CTL, PF_CTL, #0x10
1188 + PF subges, PF_CTL, PF_CTL, #0x10
1189 vmull.u8 q11, d24, d7
1190 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1191 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1192 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1193 .endm
1195 .macro pixman_composite_over_n_8888_init
1196 add DUMMY, sp, #ARGS_STACK_OFFSET
1197 vld1.32 {d3[0]}, [DUMMY]
1198 vdup.8 d0, d3[0]
1199 vdup.8 d1, d3[1]
1200 @@ -779,40 +785,40 @@ generate_composite_function \
1201 pixman_composite_over_8888_8888_process_pixblock_head, \
1202 pixman_composite_over_8888_8888_process_pixblock_tail, \
1203 pixman_composite_over_n_8888_process_pixblock_tail_head
1205 /******************************************************************************/
1207 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
1208 vrshr.u16 q14, q8, #8
1209 - PF add PF_X, PF_X, #8
1210 - PF tst PF_CTL, #0xF
1211 + PF add, PF_X, PF_X, #8
1212 + PF tst, PF_CTL, #0xF
1213 vrshr.u16 q15, q9, #8
1214 vrshr.u16 q12, q10, #8
1215 vrshr.u16 q13, q11, #8
1216 - PF addne PF_X, PF_X, #8
1217 - PF subne PF_CTL, PF_CTL, #1
1218 + PF addne, PF_X, PF_X, #8
1219 + PF subne, PF_CTL, PF_CTL, #1
1220 vraddhn.u16 d28, q14, q8
1221 vraddhn.u16 d29, q15, q9
1222 - PF cmp PF_X, ORIG_W
1223 + PF cmp, PF_X, ORIG_W
1224 vraddhn.u16 d30, q12, q10
1225 vraddhn.u16 d31, q13, q11
1226 vqadd.u8 q14, q0, q14
1227 vqadd.u8 q15, q1, q15
1228 vld4.8 {d0, d1, d2, d3}, [DST_R, :128]!
1229 vmvn.8 d22, d3
1230 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1231 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1232 - PF subge PF_X, PF_X, ORIG_W
1233 + PF subge, PF_X, PF_X, ORIG_W
1234 vmull.u8 q8, d22, d4
1235 - PF subges PF_CTL, PF_CTL, #0x10
1236 + PF subges, PF_CTL, PF_CTL, #0x10
1237 vmull.u8 q9, d22, d5
1238 vmull.u8 q10, d22, d6
1239 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1240 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1241 vmull.u8 q11, d22, d7
1242 .endm
1244 .macro pixman_composite_over_reverse_n_8888_init
1245 add DUMMY, sp, #ARGS_STACK_OFFSET
1246 vld1.32 {d7[0]}, [DUMMY]
1247 vdup.8 d4, d7[0]
1248 vdup.8 d5, d7[1]
1249 @@ -1240,33 +1246,33 @@ generate_composite_function \
1250 vrshrn.u16 d28, q8, #8
1251 vrshrn.u16 d29, q9, #8
1252 vrshrn.u16 d30, q10, #8
1253 vrshrn.u16 d31, q11, #8
1254 .endm
1256 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1257 fetch_mask_pixblock
1258 - PF add PF_X, PF_X, #8
1259 + PF add, PF_X, PF_X, #8
1260 vrshrn.u16 d28, q8, #8
1261 - PF tst PF_CTL, #0x0F
1262 + PF tst, PF_CTL, #0x0F
1263 vrshrn.u16 d29, q9, #8
1264 - PF addne PF_X, PF_X, #8
1265 + PF addne, PF_X, PF_X, #8
1266 vrshrn.u16 d30, q10, #8
1267 - PF subne PF_CTL, PF_CTL, #1
1268 + PF subne, PF_CTL, PF_CTL, #1
1269 vrshrn.u16 d31, q11, #8
1270 - PF cmp PF_X, ORIG_W
1271 + PF cmp, PF_X, ORIG_W
1272 vmull.u8 q8, d24, d0
1273 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1274 vmull.u8 q9, d24, d1
1275 - PF subge PF_X, PF_X, ORIG_W
1276 + PF subge, PF_X, PF_X, ORIG_W
1277 vmull.u8 q10, d24, d2
1278 - PF subges PF_CTL, PF_CTL, #0x10
1279 + PF subges, PF_CTL, PF_CTL, #0x10
1280 vmull.u8 q11, d24, d3
1281 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1282 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1283 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1284 vrsra.u16 q8, q8, #8
1285 vrsra.u16 q9, q9, #8
1286 vrsra.u16 q10, q10, #8
1287 vrsra.u16 q11, q11, #8
1288 .endm
1290 .macro pixman_composite_src_n_8_8888_init
1291 @@ -1309,33 +1315,33 @@ generate_composite_function \
1292 vrshrn.u16 d28, q0, #8
1293 vrshrn.u16 d29, q1, #8
1294 vrshrn.u16 d30, q2, #8
1295 vrshrn.u16 d31, q3, #8
1296 .endm
1298 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1299 fetch_mask_pixblock
1300 - PF add PF_X, PF_X, #8
1301 + PF add, PF_X, PF_X, #8
1302 vrshrn.u16 d28, q0, #8
1303 - PF tst PF_CTL, #0x0F
1304 + PF tst, PF_CTL, #0x0F
1305 vrshrn.u16 d29, q1, #8
1306 - PF addne PF_X, PF_X, #8
1307 + PF addne, PF_X, PF_X, #8
1308 vrshrn.u16 d30, q2, #8
1309 - PF subne PF_CTL, PF_CTL, #1
1310 + PF subne, PF_CTL, PF_CTL, #1
1311 vrshrn.u16 d31, q3, #8
1312 - PF cmp PF_X, ORIG_W
1313 + PF cmp, PF_X, ORIG_W
1314 vmull.u8 q0, d24, d16
1315 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1316 vmull.u8 q1, d25, d16
1317 - PF subge PF_X, PF_X, ORIG_W
1318 + PF subge, PF_X, PF_X, ORIG_W
1319 vmull.u8 q2, d26, d16
1320 - PF subges PF_CTL, PF_CTL, #0x10
1321 + PF subges, PF_CTL, PF_CTL, #0x10
1322 vmull.u8 q3, d27, d16
1323 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1324 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1325 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1326 vrsra.u16 q0, q0, #8
1327 vrsra.u16 q1, q1, #8
1328 vrsra.u16 q2, q2, #8
1329 vrsra.u16 q3, q3, #8
1330 .endm
1332 .macro pixman_composite_src_n_8_8_init
1333 @@ -1403,37 +1409,37 @@ generate_composite_function \
1334 .endm
1336 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1337 vrshr.u16 q14, q8, #8
1338 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1339 vrshr.u16 q15, q9, #8
1340 fetch_mask_pixblock
1341 vrshr.u16 q6, q10, #8
1342 - PF add PF_X, PF_X, #8
1343 + PF add, PF_X, PF_X, #8
1344 vrshr.u16 q7, q11, #8
1345 - PF tst PF_CTL, #0x0F
1346 + PF tst, PF_CTL, #0x0F
1347 vraddhn.u16 d28, q14, q8
1348 - PF addne PF_X, PF_X, #8
1349 + PF addne, PF_X, PF_X, #8
1350 vraddhn.u16 d29, q15, q9
1351 - PF subne PF_CTL, PF_CTL, #1
1352 + PF subne, PF_CTL, PF_CTL, #1
1353 vraddhn.u16 d30, q6, q10
1354 - PF cmp PF_X, ORIG_W
1355 + PF cmp, PF_X, ORIG_W
1356 vraddhn.u16 d31, q7, q11
1357 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
1358 vmull.u8 q6, d24, d8
1359 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1360 vmull.u8 q7, d24, d9
1361 - PF subge PF_X, PF_X, ORIG_W
1362 + PF subge, PF_X, PF_X, ORIG_W
1363 vmull.u8 q8, d24, d10
1364 - PF subges PF_CTL, PF_CTL, #0x10
1365 + PF subges, PF_CTL, PF_CTL, #0x10
1366 vmull.u8 q9, d24, d11
1367 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1368 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
1369 vqadd.u8 q14, q0, q14
1370 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1371 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1372 vqadd.u8 q15, q1, q15
1373 vrshr.u16 q10, q6, #8
1374 vrshr.u16 q11, q7, #8
1375 vrshr.u16 q12, q8, #8
1376 vrshr.u16 q13, q9, #8
1377 vraddhn.u16 d0, q6, q10
1378 vraddhn.u16 d1, q7, q11
1379 vraddhn.u16 d2, q8, q12
1380 @@ -2420,31 +2426,31 @@ generate_composite_function \
1382 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
1383 vrshr.u16 q11, q8, #8
1384 vswp d3, d31
1385 vrshr.u16 q12, q9, #8
1386 vrshr.u16 q13, q10, #8
1387 fetch_src_pixblock
1388 vraddhn.u16 d30, q11, q8
1389 - PF add PF_X, PF_X, #8
1390 - PF tst PF_CTL, #0xF
1391 - PF addne PF_X, PF_X, #8
1392 - PF subne PF_CTL, PF_CTL, #1
1393 + PF add, PF_X, PF_X, #8
1394 + PF tst, PF_CTL, #0xF
1395 + PF addne, PF_X, PF_X, #8
1396 + PF subne, PF_CTL, PF_CTL, #1
1397 vraddhn.u16 d29, q12, q9
1398 vraddhn.u16 d28, q13, q10
1399 vmull.u8 q8, d3, d0
1400 vmull.u8 q9, d3, d1
1401 vmull.u8 q10, d3, d2
1402 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1403 - PF cmp PF_X, ORIG_W
1404 + PF cmp, PF_X, ORIG_W
1405 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1406 - PF subge PF_X, PF_X, ORIG_W
1407 - PF subges PF_CTL, PF_CTL, #0x10
1408 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1409 + PF subge, PF_X, PF_X, ORIG_W
1410 + PF subges, PF_CTL, PF_CTL, #0x10
1411 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1412 .endm
1414 generate_composite_function \
1415 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
1416 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1417 8, /* number of pixels, processed in a single block */ \
1418 10, /* prefetch distance */ \
1419 default_init, \
1420 @@ -2477,31 +2483,31 @@ generate_composite_function \
1422 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
1423 vrshr.u16 q11, q8, #8
1424 vswp d3, d31
1425 vrshr.u16 q12, q9, #8
1426 vrshr.u16 q13, q10, #8
1427 fetch_src_pixblock
1428 vraddhn.u16 d28, q11, q8
1429 - PF add PF_X, PF_X, #8
1430 - PF tst PF_CTL, #0xF
1431 - PF addne PF_X, PF_X, #8
1432 - PF subne PF_CTL, PF_CTL, #1
1433 + PF add, PF_X, PF_X, #8
1434 + PF tst, PF_CTL, #0xF
1435 + PF addne, PF_X, PF_X, #8
1436 + PF subne, PF_CTL, PF_CTL, #1
1437 vraddhn.u16 d29, q12, q9
1438 vraddhn.u16 d30, q13, q10
1439 vmull.u8 q8, d3, d0
1440 vmull.u8 q9, d3, d1
1441 vmull.u8 q10, d3, d2
1442 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1443 - PF cmp PF_X, ORIG_W
1444 + PF cmp, PF_X, ORIG_W
1445 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
1446 - PF subge PF_X, PF_X, ORIG_W
1447 - PF subges PF_CTL, PF_CTL, #0x10
1448 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1449 + PF subge, PF_X, PF_X, ORIG_W
1450 + PF subges, PF_CTL, PF_CTL, #0x10
1451 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
1452 .endm
1454 generate_composite_function \
1455 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
1456 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1457 8, /* number of pixels, processed in a single block */ \
1458 10, /* prefetch distance */ \
1459 default_init, \
1460 @@ -2836,182 +2842,182 @@ generate_composite_function_nearest_scan
1461 * format conversion, and interpolation as separate macros which can be used
1462 * as the basic building blocks for constructing bilinear scanline functions.
1465 .macro bilinear_load_8888 reg1, reg2, tmp
1466 mov TMP1, X, asr #16
1467 add X, X, UX
1468 add TMP1, TOP, TMP1, asl #2
1469 - vld1.32 {reg1}, [TMP1], STRIDE
1470 - vld1.32 {reg2}, [TMP1]
1471 + vld1.32 {\reg1}, [TMP1], STRIDE
1472 + vld1.32 {\reg2}, [TMP1]
1473 .endm
1475 .macro bilinear_load_0565 reg1, reg2, tmp
1476 mov TMP1, X, asr #16
1477 add X, X, UX
1478 add TMP1, TOP, TMP1, asl #1
1479 - vld1.32 {reg2[0]}, [TMP1], STRIDE
1480 - vld1.32 {reg2[1]}, [TMP1]
1481 - convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
1482 + vld1.32 {\reg2[0]}, [TMP1], STRIDE
1483 + vld1.32 {\reg2[1]}, [TMP1]
1484 + convert_four_0565_to_x888_packed \reg2, \reg1, \reg2, \tmp
1485 .endm
1487 .macro bilinear_load_and_vertical_interpolate_two_8888 \
1488 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
1490 - bilinear_load_8888 reg1, reg2, tmp1
1491 - vmull.u8 acc1, reg1, d28
1492 - vmlal.u8 acc1, reg2, d29
1493 - bilinear_load_8888 reg3, reg4, tmp2
1494 - vmull.u8 acc2, reg3, d28
1495 - vmlal.u8 acc2, reg4, d29
1496 + bilinear_load_8888 \reg1, \reg2, \tmp1
1497 + vmull.u8 \acc1, \reg1, d28
1498 + vmlal.u8 \acc1, \reg2, d29
1499 + bilinear_load_8888 \reg3, \reg4, \tmp2
1500 + vmull.u8 \acc2, \reg3, d28
1501 + vmlal.u8 \acc2, \reg4, d29
1502 .endm
1504 .macro bilinear_load_and_vertical_interpolate_four_8888 \
1505 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1506 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1508 bilinear_load_and_vertical_interpolate_two_8888 \
1509 - xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
1510 + \xacc1, \xacc2, \xreg1, \xreg2, \xreg3, \xreg4, \xacc2lo, \xacc2hi
1511 bilinear_load_and_vertical_interpolate_two_8888 \
1512 - yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1513 + \yacc1, \yacc2, \yreg1, \yreg2, \yreg3, \yreg4, \yacc2lo, \yacc2hi
1514 .endm
1516 .macro bilinear_load_and_vertical_interpolate_two_0565 \
1517 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
1519 mov TMP1, X, asr #16
1520 add X, X, UX
1521 add TMP1, TOP, TMP1, asl #1
1522 mov TMP2, X, asr #16
1523 add X, X, UX
1524 add TMP2, TOP, TMP2, asl #1
1525 - vld1.32 {acc2lo[0]}, [TMP1], STRIDE
1526 - vld1.32 {acc2hi[0]}, [TMP2], STRIDE
1527 - vld1.32 {acc2lo[1]}, [TMP1]
1528 - vld1.32 {acc2hi[1]}, [TMP2]
1529 - convert_0565_to_x888 acc2, reg3, reg2, reg1
1530 - vzip.u8 reg1, reg3
1531 - vzip.u8 reg2, reg4
1532 - vzip.u8 reg3, reg4
1533 - vzip.u8 reg1, reg2
1534 - vmull.u8 acc1, reg1, d28
1535 - vmlal.u8 acc1, reg2, d29
1536 - vmull.u8 acc2, reg3, d28
1537 - vmlal.u8 acc2, reg4, d29
1538 + vld1.32 {\acc2lo[0]}, [TMP1], STRIDE
1539 + vld1.32 {\acc2hi[0]}, [TMP2], STRIDE
1540 + vld1.32 {\acc2lo[1]}, [TMP1]
1541 + vld1.32 {\acc2hi[1]}, [TMP2]
1542 + convert_0565_to_x888 \acc2, \reg3, \reg2, \reg1
1543 + vzip.u8 \reg1, \reg3
1544 + vzip.u8 \reg2, \reg4
1545 + vzip.u8 \reg3, \reg4
1546 + vzip.u8 \reg1, \reg2
1547 + vmull.u8 \acc1, \reg1, d28
1548 + vmlal.u8 \acc1, \reg2, d29
1549 + vmull.u8 \acc2, \reg3, d28
1550 + vmlal.u8 \acc2, \reg4, d29
1551 .endm
1553 .macro bilinear_load_and_vertical_interpolate_four_0565 \
1554 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
1555 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
1557 mov TMP1, X, asr #16
1558 add X, X, UX
1559 add TMP1, TOP, TMP1, asl #1
1560 mov TMP2, X, asr #16
1561 add X, X, UX
1562 add TMP2, TOP, TMP2, asl #1
1563 - vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
1564 - vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
1565 - vld1.32 {xacc2lo[1]}, [TMP1]
1566 - vld1.32 {xacc2hi[1]}, [TMP2]
1567 - convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
1568 + vld1.32 {\xacc2lo[0]}, [TMP1], STRIDE
1569 + vld1.32 {\xacc2hi[0]}, [TMP2], STRIDE
1570 + vld1.32 {\xacc2lo[1]}, [TMP1]
1571 + vld1.32 {\xacc2hi[1]}, [TMP2]
1572 + convert_0565_to_x888 \xacc2, \xreg3, \xreg2, \xreg1
1573 mov TMP1, X, asr #16
1574 add X, X, UX
1575 add TMP1, TOP, TMP1, asl #1
1576 mov TMP2, X, asr #16
1577 add X, X, UX
1578 add TMP2, TOP, TMP2, asl #1
1579 - vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
1580 - vzip.u8 xreg1, xreg3
1581 - vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
1582 - vzip.u8 xreg2, xreg4
1583 - vld1.32 {yacc2lo[1]}, [TMP1]
1584 - vzip.u8 xreg3, xreg4
1585 - vld1.32 {yacc2hi[1]}, [TMP2]
1586 - vzip.u8 xreg1, xreg2
1587 - convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
1588 - vmull.u8 xacc1, xreg1, d28
1589 - vzip.u8 yreg1, yreg3
1590 - vmlal.u8 xacc1, xreg2, d29
1591 - vzip.u8 yreg2, yreg4
1592 - vmull.u8 xacc2, xreg3, d28
1593 - vzip.u8 yreg3, yreg4
1594 - vmlal.u8 xacc2, xreg4, d29
1595 - vzip.u8 yreg1, yreg2
1596 - vmull.u8 yacc1, yreg1, d28
1597 - vmlal.u8 yacc1, yreg2, d29
1598 - vmull.u8 yacc2, yreg3, d28
1599 - vmlal.u8 yacc2, yreg4, d29
1600 + vld1.32 {\yacc2lo[0]}, [TMP1], STRIDE
1601 + vzip.u8 \xreg1, \xreg3
1602 + vld1.32 {\yacc2hi[0]}, [TMP2], STRIDE
1603 + vzip.u8 \xreg2, \xreg4
1604 + vld1.32 {\yacc2lo[1]}, [TMP1]
1605 + vzip.u8 \xreg3, \xreg4
1606 + vld1.32 {\yacc2hi[1]}, [TMP2]
1607 + vzip.u8 \xreg1, \xreg2
1608 + convert_0565_to_x888 \yacc2, \yreg3, \yreg2, \yreg1
1609 + vmull.u8 \xacc1, \xreg1, d28
1610 + vzip.u8 \yreg1, \yreg3
1611 + vmlal.u8 \xacc1, \xreg2, d29
1612 + vzip.u8 \yreg2, \yreg4
1613 + vmull.u8 \xacc2, \xreg3, d28
1614 + vzip.u8 \yreg3, \yreg4
1615 + vmlal.u8 \xacc2, \xreg4, d29
1616 + vzip.u8 \yreg1, \yreg2
1617 + vmull.u8 \yacc1, \yreg1, d28
1618 + vmlal.u8 \yacc1, \yreg2, d29
1619 + vmull.u8 \yacc2, \yreg3, d28
1620 + vmlal.u8 \yacc2, \yreg4, d29
1621 .endm
1623 .macro bilinear_store_8888 numpix, tmp1, tmp2
1624 -.if numpix == 4
1625 +.if \numpix == 4
1626 vst1.32 {d0, d1}, [OUT, :128]!
1627 -.elseif numpix == 2
1628 +.elseif \numpix == 2
1629 vst1.32 {d0}, [OUT, :64]!
1630 -.elseif numpix == 1
1631 +.elseif \numpix == 1
1632 vst1.32 {d0[0]}, [OUT, :32]!
1633 .else
1634 - .error bilinear_store_8888 numpix is unsupported
1635 + .error bilinear_store_8888 \numpix is unsupported
1636 .endif
1637 .endm
1639 .macro bilinear_store_0565 numpix, tmp1, tmp2
1640 vuzp.u8 d0, d1
1641 vuzp.u8 d2, d3
1642 vuzp.u8 d1, d3
1643 vuzp.u8 d0, d2
1644 - convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
1645 -.if numpix == 4
1646 + convert_8888_to_0565 d2, d1, d0, q1, \tmp1, \tmp2
1647 +.if \numpix == 4
1648 vst1.16 {d2}, [OUT, :64]!
1649 -.elseif numpix == 2
1650 +.elseif \numpix == 2
1651 vst1.32 {d2[0]}, [OUT, :32]!
1652 -.elseif numpix == 1
1653 +.elseif \numpix == 1
1654 vst1.16 {d2[0]}, [OUT, :16]!
1655 .else
1656 - .error bilinear_store_0565 numpix is unsupported
1657 + .error bilinear_store_0565 \numpix is unsupported
1658 .endif
1659 .endm
1661 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
1662 - bilinear_load_&src_fmt d0, d1, d2
1663 + bilinear_load_\()\src_fmt d0, d1, d2
1664 vmull.u8 q1, d0, d28
1665 vmlal.u8 q1, d1, d29
1666 /* 5 cycles bubble */
1667 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1668 vmlsl.u16 q0, d2, d30
1669 vmlal.u16 q0, d3, d30
1670 /* 5 cycles bubble */
1671 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1672 /* 3 cycles bubble */
1673 vmovn.u16 d0, q0
1674 /* 1 cycle bubble */
1675 - bilinear_store_&dst_fmt 1, q2, q3
1676 + bilinear_store_\()\dst_fmt 1, q2, q3
1677 .endm
1679 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
1680 - bilinear_load_and_vertical_interpolate_two_&src_fmt \
1681 + bilinear_load_and_vertical_interpolate_two_\()\src_fmt \
1682 q1, q11, d0, d1, d20, d21, d22, d23
1683 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1684 vmlsl.u16 q0, d2, d30
1685 vmlal.u16 q0, d3, d30
1686 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
1687 vmlsl.u16 q10, d22, d31
1688 vmlal.u16 q10, d23, d31
1689 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1690 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1691 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1692 vadd.u16 q12, q12, q13
1693 vmovn.u16 d0, q0
1694 - bilinear_store_&dst_fmt 2, q2, q3
1695 + bilinear_store_\()\dst_fmt 2, q2, q3
1696 .endm
1698 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
1699 - bilinear_load_and_vertical_interpolate_four_&src_fmt \
1700 + bilinear_load_and_vertical_interpolate_four_\()\src_fmt \
1701 q1, q11, d0, d1, d20, d21, d22, d23 \
1702 q3, q9, d4, d5, d16, d17, d18, d19
1703 pld [TMP1, PF_OFFS]
1704 sub TMP1, TMP1, STRIDE
1705 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
1706 vmlsl.u16 q0, d2, d30
1707 vmlal.u16 q0, d3, d30
1708 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
1709 @@ -3029,64 +3035,64 @@ generate_composite_function_nearest_scan
1710 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
1711 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
1712 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
1713 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
1714 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1715 vmovn.u16 d0, q0
1716 vmovn.u16 d1, q2
1717 vadd.u16 q12, q12, q13
1718 - bilinear_store_&dst_fmt 4, q2, q3
1719 + bilinear_store_\()\dst_fmt 4, q2, q3
1720 .endm
1722 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1723 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1724 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
1725 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1726 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
1727 .else
1728 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1729 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1730 .endif
1731 .endm
1733 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1734 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1735 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
1736 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1737 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
1738 .endif
1739 .endm
1741 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1742 -.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
1743 - bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
1744 +.ifdef have_bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt
1745 + bilinear_interpolate_four_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
1746 .else
1747 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1748 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1749 .endif
1750 .endm
1752 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
1753 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1754 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
1755 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1756 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_head
1757 .else
1758 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1759 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1760 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
1761 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1762 .endif
1763 .endm
1765 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1766 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1767 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
1768 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1769 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail
1770 .else
1771 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1772 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1773 .endif
1774 .endm
1776 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
1777 -.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
1778 - bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
1779 +.ifdef have_bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt
1780 + bilinear_interpolate_eight_pixels_\()\src_fmt\()_\()\dst_fmt\()_tail_head
1781 .else
1782 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1783 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1784 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1785 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1786 .endif
1787 .endm
1789 .set BILINEAR_FLAG_UNROLL_4, 0
1790 .set BILINEAR_FLAG_UNROLL_8, 1
1791 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
1794 @@ -3101,17 +3107,17 @@ generate_composite_function_nearest_scan
1795 * prefetch_distance - prefetch in the source image by that many
1796 * pixels ahead
1799 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
1800 src_bpp_shift, dst_bpp_shift, \
1801 prefetch_distance, flags
1803 -pixman_asm_function fname
1804 +pixman_asm_function \fname
1805 OUT .req r0
1806 TOP .req r1
1807 BOTTOM .req r2
1808 WT .req r3
1809 WB .req r4
1810 X .req r5
1811 UX .req r6
1812 WIDTH .req ip
1813 @@ -3119,21 +3125,21 @@ pixman_asm_function fname
1814 TMP2 .req r4
1815 PF_OFFS .req r7
1816 TMP3 .req r8
1817 TMP4 .req r9
1818 STRIDE .req r2
1820 mov ip, sp
1821 push {r4, r5, r6, r7, r8, r9}
1822 - mov PF_OFFS, #prefetch_distance
1823 + mov PF_OFFS, #\prefetch_distance
1824 ldmia ip, {WB, X, UX, WIDTH}
1825 mul PF_OFFS, PF_OFFS, UX
1827 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1828 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1829 vpush {d8-d15}
1830 .endif
1832 sub STRIDE, BOTTOM, TOP
1833 .unreq BOTTOM
1835 cmp WIDTH, #0
1836 ble 3f
1837 @@ -3146,83 +3152,83 @@ pixman_asm_function fname
1839 /* ensure good destination alignment */
1840 cmp WIDTH, #1
1841 blt 0f
1842 tst OUT, #(1 << dst_bpp_shift)
1843 beq 0f
1844 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1845 vadd.u16 q12, q12, q13
1846 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
1847 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
1848 sub WIDTH, WIDTH, #1
1850 vadd.u16 q13, q13, q13
1851 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
1852 vadd.u16 q12, q12, q13
1854 cmp WIDTH, #2
1855 blt 0f
1856 tst OUT, #(1 << (dst_bpp_shift + 1))
1857 beq 0f
1858 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
1859 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
1860 sub WIDTH, WIDTH, #2
1862 -.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
1863 +.if ((\flags) & BILINEAR_FLAG_UNROLL_8) != 0
1864 /*********** 8 pixels per iteration *****************/
1865 cmp WIDTH, #4
1866 blt 0f
1867 tst OUT, #(1 << (dst_bpp_shift + 2))
1868 beq 0f
1869 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1870 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1871 sub WIDTH, WIDTH, #4
1873 subs WIDTH, WIDTH, #8
1874 blt 1f
1875 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
1876 - bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
1877 + bilinear_interpolate_eight_pixels_head \src_fmt, \dst_fmt
1878 subs WIDTH, WIDTH, #8
1879 blt 5f
1881 - bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
1882 + bilinear_interpolate_eight_pixels_tail_head \src_fmt, \dst_fmt
1883 subs WIDTH, WIDTH, #8
1884 bge 0b
1886 - bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
1887 + bilinear_interpolate_eight_pixels_tail \src_fmt, \dst_fmt
1889 tst WIDTH, #4
1890 beq 2f
1891 - bilinear_interpolate_four_pixels src_fmt, dst_fmt
1892 + bilinear_interpolate_four_pixels \src_fmt, \dst_fmt
1894 .else
1895 /*********** 4 pixels per iteration *****************/
1896 subs WIDTH, WIDTH, #4
1897 blt 1f
1898 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
1899 - bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
1900 + bilinear_interpolate_four_pixels_head \src_fmt, \dst_fmt
1901 subs WIDTH, WIDTH, #4
1902 blt 5f
1904 - bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
1905 + bilinear_interpolate_four_pixels_tail_head \src_fmt, \dst_fmt
1906 subs WIDTH, WIDTH, #4
1907 bge 0b
1909 - bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
1910 + bilinear_interpolate_four_pixels_tail \src_fmt, \dst_fmt
1912 /****************************************************/
1913 .endif
1914 /* handle the remaining trailing pixels */
1915 tst WIDTH, #2
1916 beq 2f
1917 - bilinear_interpolate_two_pixels src_fmt, dst_fmt
1918 + bilinear_interpolate_two_pixels \src_fmt, \dst_fmt
1920 tst WIDTH, #1
1921 beq 3f
1922 - bilinear_interpolate_last_pixel src_fmt, dst_fmt
1923 + bilinear_interpolate_last_pixel \src_fmt, \dst_fmt
1925 -.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1926 +.if ((\flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
1927 vpop {d8-d15}
1928 .endif
1929 pop {r4, r5, r6, r7, r8, r9}
1930 bx lr
1932 .unreq OUT
1933 .unreq TOP
1934 .unreq WT
1935 @@ -3231,17 +3237,17 @@ 3:
1936 .unreq UX
1937 .unreq WIDTH
1938 .unreq TMP1
1939 .unreq TMP2
1940 .unreq PF_OFFS
1941 .unreq TMP3
1942 .unreq TMP4
1943 .unreq STRIDE
1944 -.endfunc
1945 +pixman_end_asm_function
1947 .endm
1949 /*****************************************************************************/
1951 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
1953 .macro bilinear_interpolate_four_pixels_8888_8888_head
1954 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1955 --- a/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1956 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon-asm.h
1957 @@ -69,303 +69,303 @@
1958 .set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */
1961 * Definitions of supplementary pixld/pixst macros (for partial load/store of
1962 * pixel data).
1965 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
1966 -.if abits > 0
1967 - op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
1968 +.if \abits > 0
1969 + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
1970 .else
1971 - op&.&elem_size {d&reg1}, [&mem_operand&]!
1972 + \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
1973 .endif
1974 .endm
1976 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
1977 -.if abits > 0
1978 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
1979 +.if \abits > 0
1980 + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
1981 .else
1982 - op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
1983 + \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
1984 .endif
1985 .endm
1987 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
1988 -.if abits > 0
1989 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
1990 +.if \abits > 0
1991 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
1992 .else
1993 - op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
1994 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
1995 .endif
1996 .endm
1998 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
1999 - op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
2000 + \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
2001 .endm
2003 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
2004 - op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
2005 + \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
2006 .endm
2008 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
2009 - op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
2010 + \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
2011 .endm
2013 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
2014 -.if numbytes == 32
2015 - pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
2016 - %(basereg+6), %(basereg+7), mem_operand, abits
2017 -.elseif numbytes == 16
2018 - pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
2019 -.elseif numbytes == 8
2020 - pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
2021 -.elseif numbytes == 4
2022 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
2023 - pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
2024 - .elseif elem_size == 16
2025 - pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
2026 - pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
2027 +.if \numbytes == 32
2028 + pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
2029 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2030 +.elseif \numbytes == 16
2031 + pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
2032 +.elseif \numbytes == 8
2033 + pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
2034 +.elseif \numbytes == 4
2035 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
2036 + pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
2037 + .elseif \elem_size == 16
2038 + pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
2039 + pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
2040 .else
2041 - pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
2042 - pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
2043 - pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
2044 - pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
2045 + pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
2046 + pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
2047 + pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
2048 + pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
2049 .endif
2050 -.elseif numbytes == 2
2051 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
2052 - pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
2053 +.elseif \numbytes == 2
2054 + .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
2055 + pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
2056 .else
2057 - pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
2058 - pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
2059 + pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
2060 + pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
2061 .endif
2062 -.elseif numbytes == 1
2063 - pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
2064 +.elseif \numbytes == 1
2065 + pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
2066 .else
2067 - .error "unsupported size: numbytes"
2068 + .error "unsupported size: \numbytes"
2069 .endif
2070 .endm
2072 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
2073 -.if bpp > 0
2074 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2075 - pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
2076 - %(basereg+6), %(basereg+7), mem_operand, abits
2077 -.elseif (bpp == 24) && (numpix == 8)
2078 - pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2079 -.elseif (bpp == 24) && (numpix == 4)
2080 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2081 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2082 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2083 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2084 -.elseif (bpp == 24) && (numpix == 2)
2085 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2086 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2087 -.elseif (bpp == 24) && (numpix == 1)
2088 - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2089 +.if \bpp > 0
2090 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2091 + pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
2092 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2093 +.elseif (\bpp == 24) && (\numpix == 8)
2094 + pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2095 +.elseif (\bpp == 24) && (\numpix == 4)
2096 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2097 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2098 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2099 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2100 +.elseif (\bpp == 24) && (\numpix == 2)
2101 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2102 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2103 +.elseif (\bpp == 24) && (\numpix == 1)
2104 + pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2105 .else
2106 - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
2107 + pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
2108 .endif
2109 .endif
2110 .endm
2112 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
2113 -.if bpp > 0
2114 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2115 - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
2116 - %(basereg+6), %(basereg+7), mem_operand, abits
2117 -.elseif (bpp == 24) && (numpix == 8)
2118 - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
2119 -.elseif (bpp == 24) && (numpix == 4)
2120 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
2121 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
2122 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
2123 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
2124 -.elseif (bpp == 24) && (numpix == 2)
2125 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
2126 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
2127 -.elseif (bpp == 24) && (numpix == 1)
2128 - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
2129 +.if \bpp > 0
2130 +.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2131 + pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
2132 + %(\basereg+6), %(\basereg+7), \mem_operand, \abits
2133 +.elseif (\bpp == 24) && (\numpix == 8)
2134 + pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
2135 +.elseif (\bpp == 24) && (\numpix == 4)
2136 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
2137 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
2138 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
2139 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
2140 +.elseif (\bpp == 24) && (\numpix == 2)
2141 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
2142 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
2143 +.elseif (\bpp == 24) && (\numpix == 1)
2144 + pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
2145 .else
2146 - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
2147 + pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
2148 .endif
2149 .endif
2150 .endm
2152 .macro pixld_a numpix, bpp, basereg, mem_operand
2153 -.if (bpp * numpix) <= 128
2154 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2155 +.if (\bpp * \numpix) <= 128
2156 + pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2157 .else
2158 - pixld numpix, bpp, basereg, mem_operand, 128
2159 + pixld \numpix, \bpp, \basereg, \mem_operand, 128
2160 .endif
2161 .endm
2163 .macro pixst_a numpix, bpp, basereg, mem_operand
2164 -.if (bpp * numpix) <= 128
2165 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
2166 +.if (\bpp * \numpix) <= 128
2167 + pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
2168 .else
2169 - pixst numpix, bpp, basereg, mem_operand, 128
2170 + pixst \numpix, \bpp, \basereg, \mem_operand, 128
2171 .endif
2172 .endm
2175 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
2176 * aliases to be defined)
2178 .macro pixld1_s elem_size, reg1, mem_operand
2179 -.if elem_size == 16
2180 +.if \elem_size == 16
2181 mov TMP1, VX, asr #16
2182 adds VX, VX, UNIT_X
2183 5: subpls VX, VX, SRC_WIDTH_FIXED
2184 bpl 5b
2185 - add TMP1, mem_operand, TMP1, asl #1
2186 + add TMP1, \mem_operand, TMP1, asl #1
2187 mov TMP2, VX, asr #16
2188 adds VX, VX, UNIT_X
2189 5: subpls VX, VX, SRC_WIDTH_FIXED
2190 bpl 5b
2191 - add TMP2, mem_operand, TMP2, asl #1
2192 - vld1.16 {d&reg1&[0]}, [TMP1, :16]
2193 + add TMP2, \mem_operand, TMP2, asl #1
2194 + vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
2195 mov TMP1, VX, asr #16
2196 adds VX, VX, UNIT_X
2197 5: subpls VX, VX, SRC_WIDTH_FIXED
2198 bpl 5b
2199 - add TMP1, mem_operand, TMP1, asl #1
2200 - vld1.16 {d&reg1&[1]}, [TMP2, :16]
2201 + add TMP1, \mem_operand, TMP1, asl #1
2202 + vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
2203 mov TMP2, VX, asr #16
2204 adds VX, VX, UNIT_X
2205 5: subpls VX, VX, SRC_WIDTH_FIXED
2206 bpl 5b
2207 - add TMP2, mem_operand, TMP2, asl #1
2208 - vld1.16 {d&reg1&[2]}, [TMP1, :16]
2209 - vld1.16 {d&reg1&[3]}, [TMP2, :16]
2210 -.elseif elem_size == 32
2211 + add TMP2, \mem_operand, TMP2, asl #1
2212 + vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
2213 + vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
2214 +.elseif \elem_size == 32
2215 mov TMP1, VX, asr #16
2216 adds VX, VX, UNIT_X
2217 5: subpls VX, VX, SRC_WIDTH_FIXED
2218 bpl 5b
2219 - add TMP1, mem_operand, TMP1, asl #2
2220 + add TMP1, \mem_operand, TMP1, asl #2
2221 mov TMP2, VX, asr #16
2222 adds VX, VX, UNIT_X
2223 5: subpls VX, VX, SRC_WIDTH_FIXED
2224 bpl 5b
2225 - add TMP2, mem_operand, TMP2, asl #2
2226 - vld1.32 {d&reg1&[0]}, [TMP1, :32]
2227 - vld1.32 {d&reg1&[1]}, [TMP2, :32]
2228 + add TMP2, \mem_operand, TMP2, asl #2
2229 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
2230 + vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
2231 .else
2232 .error "unsupported"
2233 .endif
2234 .endm
2236 .macro pixld2_s elem_size, reg1, reg2, mem_operand
2237 .if 0 /* elem_size == 32 */
2238 mov TMP1, VX, asr #16
2239 add VX, VX, UNIT_X, asl #1
2240 - add TMP1, mem_operand, TMP1, asl #2
2241 + add TMP1, \mem_operand, TMP1, asl #2
2242 mov TMP2, VX, asr #16
2243 sub VX, VX, UNIT_X
2244 - add TMP2, mem_operand, TMP2, asl #2
2245 - vld1.32 {d&reg1&[0]}, [TMP1, :32]
2246 + add TMP2, \mem_operand, TMP2, asl #2
2247 + vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
2248 mov TMP1, VX, asr #16
2249 add VX, VX, UNIT_X, asl #1
2250 - add TMP1, mem_operand, TMP1, asl #2
2251 - vld1.32 {d&reg2&[0]}, [TMP2, :32]
2252 + add TMP1, \mem_operand, TMP1, asl #2
2253 + vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
2254 mov TMP2, VX, asr #16
2255 add VX, VX, UNIT_X
2256 - add TMP2, mem_operand, TMP2, asl #2
2257 - vld1.32 {d&reg1&[1]}, [TMP1, :32]
2258 - vld1.32 {d&reg2&[1]}, [TMP2, :32]
2259 + add TMP2, \mem_operand, TMP2, asl #2
2260 + vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
2261 + vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
2262 .else
2263 - pixld1_s elem_size, reg1, mem_operand
2264 - pixld1_s elem_size, reg2, mem_operand
2265 + pixld1_s \elem_size, \reg1, \mem_operand
2266 + pixld1_s \elem_size, \reg2, \mem_operand
2267 .endif
2268 .endm
2270 .macro pixld0_s elem_size, reg1, idx, mem_operand
2271 -.if elem_size == 16
2272 +.if \elem_size == 16
2273 mov TMP1, VX, asr #16
2274 adds VX, VX, UNIT_X
2275 5: subpls VX, VX, SRC_WIDTH_FIXED
2276 bpl 5b
2277 - add TMP1, mem_operand, TMP1, asl #1
2278 - vld1.16 {d&reg1&[idx]}, [TMP1, :16]
2279 -.elseif elem_size == 32
2280 + add TMP1, \mem_operand, TMP1, asl #1
2281 + vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
2282 +.elseif \elem_size == 32
2283 mov TMP1, VX, asr #16
2284 adds VX, VX, UNIT_X
2285 5: subpls VX, VX, SRC_WIDTH_FIXED
2286 bpl 5b
2287 - add TMP1, mem_operand, TMP1, asl #2
2288 - vld1.32 {d&reg1&[idx]}, [TMP1, :32]
2289 + add TMP1, \mem_operand, TMP1, asl #2
2290 + vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
2291 .endif
2292 .endm
2294 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
2295 -.if numbytes == 32
2296 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
2297 - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
2298 - pixdeinterleave elem_size, %(basereg+4)
2299 -.elseif numbytes == 16
2300 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
2301 -.elseif numbytes == 8
2302 - pixld1_s elem_size, %(basereg+1), mem_operand
2303 -.elseif numbytes == 4
2304 - .if elem_size == 32
2305 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2306 - .elseif elem_size == 16
2307 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2308 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2309 +.if \numbytes == 32
2310 + pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
2311 + pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
2312 + pixdeinterleave \elem_size, %(\basereg+4)
2313 +.elseif \numbytes == 16
2314 + pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
2315 +.elseif \numbytes == 8
2316 + pixld1_s \elem_size, %(\basereg+1), \mem_operand
2317 +.elseif \numbytes == 4
2318 + .if \elem_size == 32
2319 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2320 + .elseif \elem_size == 16
2321 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2322 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2323 .else
2324 - pixld0_s elem_size, %(basereg+0), 4, mem_operand
2325 - pixld0_s elem_size, %(basereg+0), 5, mem_operand
2326 - pixld0_s elem_size, %(basereg+0), 6, mem_operand
2327 - pixld0_s elem_size, %(basereg+0), 7, mem_operand
2328 + pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
2329 + pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
2330 + pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
2331 + pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
2332 .endif
2333 -.elseif numbytes == 2
2334 - .if elem_size == 16
2335 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2336 +.elseif \numbytes == 2
2337 + .if \elem_size == 16
2338 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2339 .else
2340 - pixld0_s elem_size, %(basereg+0), 2, mem_operand
2341 - pixld0_s elem_size, %(basereg+0), 3, mem_operand
2342 + pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
2343 + pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
2344 .endif
2345 -.elseif numbytes == 1
2346 - pixld0_s elem_size, %(basereg+0), 1, mem_operand
2347 +.elseif \numbytes == 1
2348 + pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
2349 .else
2350 - .error "unsupported size: numbytes"
2351 + .error "unsupported size: \numbytes"
2352 .endif
2353 .endm
2355 .macro pixld_s numpix, bpp, basereg, mem_operand
2356 -.if bpp > 0
2357 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
2358 +.if \bpp > 0
2359 + pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
2360 .endif
2361 .endm
2363 .macro vuzp8 reg1, reg2
2364 - vuzp.8 d&reg1, d&reg2
2365 + vuzp.8 d\()\reg1, d\()\reg2
2366 .endm
2368 .macro vzip8 reg1, reg2
2369 - vzip.8 d&reg1, d&reg2
2370 + vzip.8 d\()\reg1, d\()\reg2
2371 .endm
2373 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2374 .macro pixdeinterleave bpp, basereg
2375 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2376 - vuzp8 %(basereg+0), %(basereg+1)
2377 - vuzp8 %(basereg+2), %(basereg+3)
2378 - vuzp8 %(basereg+1), %(basereg+3)
2379 - vuzp8 %(basereg+0), %(basereg+2)
2380 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2381 + vuzp8 %(\basereg+0), %(\basereg+1)
2382 + vuzp8 %(\basereg+2), %(\basereg+3)
2383 + vuzp8 %(\basereg+1), %(\basereg+3)
2384 + vuzp8 %(\basereg+0), %(\basereg+2)
2385 .endif
2386 .endm
2388 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
2389 .macro pixinterleave bpp, basereg
2390 -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2391 - vzip8 %(basereg+0), %(basereg+2)
2392 - vzip8 %(basereg+1), %(basereg+3)
2393 - vzip8 %(basereg+2), %(basereg+3)
2394 - vzip8 %(basereg+0), %(basereg+1)
2395 +.if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
2396 + vzip8 %(\basereg+0), %(\basereg+2)
2397 + vzip8 %(\basereg+1), %(\basereg+3)
2398 + vzip8 %(\basereg+2), %(\basereg+3)
2399 + vzip8 %(\basereg+0), %(\basereg+1)
2400 .endif
2401 .endm
2404 * This is a macro for implementing cache preload. The main idea is that
2405 * cache preload logic is mostly independent from the rest of pixels
2406 * processing code. It starts at the top left pixel and moves forward
2407 * across pixels and can jump across scanlines. Prefetch distance is
2408 @@ -389,51 +389,51 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
2409 * for almost zero cost!
2411 * (*) The overhead of the prefetcher is visible when running some trivial
2412 * pixels processing like simple copy. Anyway, having prefetch is a must
2413 * when working with the graphics data.
2415 .macro PF a, x:vararg
2416 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
2417 - a x
2418 + \a \x
2419 .endif
2420 .endm
2422 .macro cache_preload std_increment, boost_increment
2423 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
2424 .if regs_shortage
2425 - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
2426 + PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
2427 .endif
2428 -.if std_increment != 0
2429 - PF add PF_X, PF_X, #std_increment
2430 +.if \std_increment != 0
2431 + PF add, PF_X, PF_X, #\std_increment
2432 .endif
2433 - PF tst PF_CTL, #0xF
2434 - PF addne PF_X, PF_X, #boost_increment
2435 - PF subne PF_CTL, PF_CTL, #1
2436 - PF cmp PF_X, ORIG_W
2437 + PF tst, PF_CTL, #0xF
2438 + PF addne, PF_X, PF_X, #\boost_increment
2439 + PF subne, PF_CTL, PF_CTL, #1
2440 + PF cmp, PF_X, ORIG_W
2441 .if src_bpp_shift >= 0
2442 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2443 .endif
2444 .if dst_r_bpp != 0
2445 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
2446 .endif
2447 .if mask_bpp_shift >= 0
2448 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
2449 .endif
2450 - PF subge PF_X, PF_X, ORIG_W
2451 - PF subges PF_CTL, PF_CTL, #0x10
2452 + PF subge, PF_X, PF_X, ORIG_W
2453 + PF subges, PF_CTL, PF_CTL, #0x10
2454 .if src_bpp_shift >= 0
2455 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2456 + PF ldrgeb, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
2457 .endif
2458 .if dst_r_bpp != 0
2459 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2460 + PF ldrgeb, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
2461 .endif
2462 .if mask_bpp_shift >= 0
2463 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2464 + PF ldrgeb, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
2465 .endif
2466 .endif
2467 .endm
2469 .macro cache_preload_simple
2470 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
2471 .if src_bpp > 0
2472 pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
2473 @@ -460,51 +460,53 @@ 5: subpls VX, VX, SRC_WIDTH_FIXED
2474 .macro ensure_destination_ptr_alignment process_pixblock_head, \
2475 process_pixblock_tail, \
2476 process_pixblock_tail_head
2477 .if dst_w_bpp != 24
2478 tst DST_R, #0xF
2479 beq 2f
2481 .irp lowbit, 1, 2, 4, 8, 16
2482 +#ifndef __clang__
2483 local skip1
2484 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
2485 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
2486 - tst DST_R, #lowbit
2487 +#endif
2488 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
2489 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
2490 + tst DST_R, #\lowbit
2491 beq 1f
2492 .endif
2493 - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
2494 - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
2495 + pixld_src (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
2496 + pixld (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
2497 .if dst_r_bpp > 0
2498 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
2499 + pixld_a (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
2500 .else
2501 - add DST_R, DST_R, #lowbit
2502 + add DST_R, DST_R, #\lowbit
2503 .endif
2504 - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
2505 - sub W, W, #(lowbit * 8 / dst_w_bpp)
2506 + PF add, PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
2507 + sub W, W, #(\lowbit * 8 / dst_w_bpp)
2509 .endif
2510 .endr
2511 pixdeinterleave src_bpp, src_basereg
2512 pixdeinterleave mask_bpp, mask_basereg
2513 pixdeinterleave dst_r_bpp, dst_r_basereg
2515 - process_pixblock_head
2516 + \process_pixblock_head
2517 cache_preload 0, pixblock_size
2518 cache_preload_simple
2519 - process_pixblock_tail
2520 + \process_pixblock_tail
2522 pixinterleave dst_w_bpp, dst_w_basereg
2523 .irp lowbit, 1, 2, 4, 8, 16
2524 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
2525 -.if lowbit < 16 /* we don't need more than 16-byte alignment */
2526 - tst DST_W, #lowbit
2527 +.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
2528 +.if \lowbit < 16 /* we don't need more than 16-byte alignment */
2529 + tst DST_W, #\lowbit
2530 beq 1f
2531 .endif
2532 - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
2533 + pixst_a (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
2535 .endif
2536 .endr
2537 .endif
2539 .endm
2542 @@ -525,51 +527,51 @@ 2:
2543 .macro process_trailing_pixels cache_preload_flag, \
2544 dst_aligned_flag, \
2545 process_pixblock_head, \
2546 process_pixblock_tail, \
2547 process_pixblock_tail_head
2548 tst W, #(pixblock_size - 1)
2549 beq 2f
2550 .irp chunk_size, 16, 8, 4, 2, 1
2551 -.if pixblock_size > chunk_size
2552 - tst W, #chunk_size
2553 +.if pixblock_size > \chunk_size
2554 + tst W, #\chunk_size
2555 beq 1f
2556 - pixld_src chunk_size, src_bpp, src_basereg, SRC
2557 - pixld chunk_size, mask_bpp, mask_basereg, MASK
2558 -.if dst_aligned_flag != 0
2559 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2560 + pixld_src \chunk_size, src_bpp, src_basereg, SRC
2561 + pixld \chunk_size, mask_bpp, mask_basereg, MASK
2562 +.if \dst_aligned_flag != 0
2563 + pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2564 .else
2565 - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2566 + pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
2567 .endif
2568 -.if cache_preload_flag != 0
2569 - PF add PF_X, PF_X, #chunk_size
2570 +.if \cache_preload_flag != 0
2571 + PF add, PF_X, PF_X, #\chunk_size
2572 .endif
2574 .endif
2575 .endr
2576 pixdeinterleave src_bpp, src_basereg
2577 pixdeinterleave mask_bpp, mask_basereg
2578 pixdeinterleave dst_r_bpp, dst_r_basereg
2580 - process_pixblock_head
2581 -.if cache_preload_flag != 0
2582 + \process_pixblock_head
2583 +.if \cache_preload_flag != 0
2584 cache_preload 0, pixblock_size
2585 cache_preload_simple
2586 .endif
2587 - process_pixblock_tail
2588 + \process_pixblock_tail
2589 pixinterleave dst_w_bpp, dst_w_basereg
2590 .irp chunk_size, 16, 8, 4, 2, 1
2591 -.if pixblock_size > chunk_size
2592 - tst W, #chunk_size
2593 +.if pixblock_size > \chunk_size
2594 + tst W, #\chunk_size
2595 beq 1f
2596 -.if dst_aligned_flag != 0
2597 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2598 +.if \dst_aligned_flag != 0
2599 + pixst_a \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2600 .else
2601 - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2602 + pixst \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
2603 .endif
2605 .endif
2606 .endr
2608 .endm
2611 @@ -599,17 +601,17 @@ 2:
2612 .if (mask_bpp != 24) && (mask_bpp != 0)
2613 sub MASK, MASK, W, lsl #mask_bpp_shift
2614 .endif
2615 subs H, H, #1
2616 mov DST_R, DST_W
2617 .if regs_shortage
2618 str H, [sp, #4] /* save updated height to stack */
2619 .endif
2620 - bge start_of_loop_label
2621 + bge \start_of_loop_label
2622 .endm
2625 * Registers are allocated in the following way by default:
2626 * d0, d1, d2, d3 - reserved for loading source pixel data
2627 * d4, d5, d6, d7 - reserved for loading destination pixel data
2628 * d24, d25, d26, d27 - reserved for loading mask pixel data
2629 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
2630 @@ -626,48 +628,48 @@ 2:
2631 process_pixblock_head, \
2632 process_pixblock_tail, \
2633 process_pixblock_tail_head, \
2634 dst_w_basereg_ = 28, \
2635 dst_r_basereg_ = 4, \
2636 src_basereg_ = 0, \
2637 mask_basereg_ = 24
2639 - pixman_asm_function fname
2640 + pixman_asm_function \fname
2642 push {r4-r12, lr} /* save all registers */
2645 * Select prefetch type for this function. If prefetch distance is
2646 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
2647 * has to be used instead of ADVANCED.
2649 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
2650 -.if prefetch_distance == 0
2651 +.if \prefetch_distance == 0
2652 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
2653 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
2654 - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
2655 + ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
2656 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
2657 .endif
2660 * Make some macro arguments globally visible and accessible
2661 * from other macros
2663 - .set src_bpp, src_bpp_
2664 - .set mask_bpp, mask_bpp_
2665 - .set dst_w_bpp, dst_w_bpp_
2666 - .set pixblock_size, pixblock_size_
2667 - .set dst_w_basereg, dst_w_basereg_
2668 - .set dst_r_basereg, dst_r_basereg_
2669 - .set src_basereg, src_basereg_
2670 - .set mask_basereg, mask_basereg_
2671 + .set src_bpp, \src_bpp_
2672 + .set mask_bpp, \mask_bpp_
2673 + .set dst_w_bpp, \dst_w_bpp_
2674 + .set pixblock_size, \pixblock_size_
2675 + .set dst_w_basereg, \dst_w_basereg_
2676 + .set dst_r_basereg, \dst_r_basereg_
2677 + .set src_basereg, \src_basereg_
2678 + .set mask_basereg, \mask_basereg_
2680 .macro pixld_src x:vararg
2681 - pixld x
2682 + pixld \x
2683 .endm
2684 .macro fetch_src_pixblock
2685 pixld_src pixblock_size, src_bpp, \
2686 (src_basereg - pixblock_size * src_bpp / 64), SRC
2687 .endm
2689 * Assign symbolic names to registers
2691 @@ -750,38 +752,38 @@ 2:
2692 .elseif dst_w_bpp == 16
2693 .set dst_bpp_shift, 1
2694 .elseif dst_w_bpp == 8
2695 .set dst_bpp_shift, 0
2696 .else
2697 .error "requested dst bpp (dst_w_bpp) is not supported"
2698 .endif
2700 -.if (((flags) & FLAG_DST_READWRITE) != 0)
2701 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
2702 .set dst_r_bpp, dst_w_bpp
2703 .else
2704 .set dst_r_bpp, 0
2705 .endif
2706 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2707 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2708 .set DEINTERLEAVE_32BPP_ENABLED, 1
2709 .else
2710 .set DEINTERLEAVE_32BPP_ENABLED, 0
2711 .endif
2713 -.if prefetch_distance < 0 || prefetch_distance > 15
2714 - .error "invalid prefetch distance (prefetch_distance)"
2715 +.if \prefetch_distance < 0 || \prefetch_distance > 15
2716 + .error "invalid prefetch distance (\prefetch_distance)"
2717 .endif
2719 .if src_bpp > 0
2720 ldr SRC, [sp, #40]
2721 .endif
2722 .if mask_bpp > 0
2723 ldr MASK, [sp, #48]
2724 .endif
2725 - PF mov PF_X, #0
2726 + PF mov, PF_X, #0
2727 .if src_bpp > 0
2728 ldr SRC_STRIDE, [sp, #44]
2729 .endif
2730 .if mask_bpp > 0
2731 ldr MASK_STRIDE, [sp, #52]
2732 .endif
2733 mov DST_R, DST_W
2735 @@ -796,24 +798,24 @@ 2:
2736 .if dst_w_bpp == 24
2737 sub DST_STRIDE, DST_STRIDE, W
2738 sub DST_STRIDE, DST_STRIDE, W, lsl #1
2739 .endif
2742 * Setup advanced prefetcher initial state
2744 - PF mov PF_SRC, SRC
2745 - PF mov PF_DST, DST_R
2746 - PF mov PF_MASK, MASK
2747 + PF mov, PF_SRC, SRC
2748 + PF mov, PF_DST, DST_R
2749 + PF mov, PF_MASK, MASK
2750 /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
2751 - PF mov PF_CTL, H, lsl #4
2752 - PF add PF_CTL, #(prefetch_distance - 0x10)
2753 + PF mov, PF_CTL, H, lsl #4
2754 + PF add, PF_CTL, #(\prefetch_distance - 0x10)
2756 - init
2757 + \init
2758 .if regs_shortage
2759 push {r0, r1}
2760 .endif
2761 subs H, H, #1
2762 .if regs_shortage
2763 str H, [sp, #4] /* save updated height to stack */
2764 .else
2765 mov ORIG_W, W
2766 @@ -821,84 +823,84 @@ 2:
2767 blt 9f
2768 cmp W, #(pixblock_size * 2)
2769 blt 8f
2771 * This is the start of the pipelined loop, which if optimized for
2772 * long scanlines
2775 - ensure_destination_ptr_alignment process_pixblock_head, \
2776 - process_pixblock_tail, \
2777 - process_pixblock_tail_head
2778 + ensure_destination_ptr_alignment \process_pixblock_head, \
2779 + \process_pixblock_tail, \
2780 + \process_pixblock_tail_head
2782 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
2783 pixld_a pixblock_size, dst_r_bpp, \
2784 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
2785 fetch_src_pixblock
2786 pixld pixblock_size, mask_bpp, \
2787 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
2788 - PF add PF_X, PF_X, #pixblock_size
2789 - process_pixblock_head
2790 + PF add, PF_X, PF_X, #pixblock_size
2791 + \process_pixblock_head
2792 cache_preload 0, pixblock_size
2793 cache_preload_simple
2794 subs W, W, #(pixblock_size * 2)
2795 blt 2f
2797 - process_pixblock_tail_head
2798 + \process_pixblock_tail_head
2799 cache_preload_simple
2800 subs W, W, #pixblock_size
2801 bge 1b
2803 - process_pixblock_tail
2804 + \process_pixblock_tail
2805 pixst_a pixblock_size, dst_w_bpp, \
2806 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
2808 /* Process the remaining trailing pixels in the scanline */
2809 process_trailing_pixels 1, 1, \
2810 - process_pixblock_head, \
2811 - process_pixblock_tail, \
2812 - process_pixblock_tail_head
2813 + \process_pixblock_head, \
2814 + \process_pixblock_tail, \
2815 + \process_pixblock_tail_head
2816 advance_to_next_scanline 0b
2818 .if regs_shortage
2819 pop {r0, r1}
2820 .endif
2821 - cleanup
2822 + \cleanup
2823 pop {r4-r12, pc} /* exit */
2825 * This is the start of the loop, designed to process images with small width
2826 * (less than pixblock_size * 2 pixels). In this case neither pipelining
2827 * nor prefetch are used.
2830 /* Process exactly pixblock_size pixels if needed */
2831 tst W, #pixblock_size
2832 beq 1f
2833 pixld pixblock_size, dst_r_bpp, \
2834 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
2835 fetch_src_pixblock
2836 pixld pixblock_size, mask_bpp, \
2837 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
2838 - process_pixblock_head
2839 - process_pixblock_tail
2840 + \process_pixblock_head
2841 + \process_pixblock_tail
2842 pixst pixblock_size, dst_w_bpp, \
2843 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
2845 /* Process the remaining trailing pixels in the scanline */
2846 process_trailing_pixels 0, 0, \
2847 - process_pixblock_head, \
2848 - process_pixblock_tail, \
2849 - process_pixblock_tail_head
2850 + \process_pixblock_head, \
2851 + \process_pixblock_tail, \
2852 + \process_pixblock_tail_head
2853 advance_to_next_scanline 8b
2855 .if regs_shortage
2856 pop {r0, r1}
2857 .endif
2858 - cleanup
2859 + \cleanup
2860 pop {r4-r12, pc} /* exit */
2862 .purgem fetch_src_pixblock
2863 .purgem pixld_src
2865 .unreq SRC
2866 .unreq MASK
2867 .unreq DST_R
2868 @@ -910,17 +912,17 @@ 9:
2869 .unreq DST_STRIDE
2870 .unreq MASK_STRIDE
2871 .unreq PF_CTL
2872 .unreq PF_X
2873 .unreq PF_SRC
2874 .unreq PF_DST
2875 .unreq PF_MASK
2876 .unreq DUMMY
2877 - .endfunc
2878 + pixman_end_asm_function
2879 .endm
2882 * A simplified variant of function generation template for a single
2883 * scanline processing (for implementing pixman combine functions)
2885 .macro generate_composite_function_scanline use_nearest_scaling, \
2886 fname, \
2887 @@ -934,49 +936,49 @@ 9:
2888 process_pixblock_head, \
2889 process_pixblock_tail, \
2890 process_pixblock_tail_head, \
2891 dst_w_basereg_ = 28, \
2892 dst_r_basereg_ = 4, \
2893 src_basereg_ = 0, \
2894 mask_basereg_ = 24
2896 - pixman_asm_function fname
2897 + pixman_asm_function \fname
2899 .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
2901 * Make some macro arguments globally visible and accessible
2902 * from other macros
2904 - .set src_bpp, src_bpp_
2905 - .set mask_bpp, mask_bpp_
2906 - .set dst_w_bpp, dst_w_bpp_
2907 - .set pixblock_size, pixblock_size_
2908 - .set dst_w_basereg, dst_w_basereg_
2909 - .set dst_r_basereg, dst_r_basereg_
2910 - .set src_basereg, src_basereg_
2911 - .set mask_basereg, mask_basereg_
2912 + .set src_bpp, \src_bpp_
2913 + .set mask_bpp, \mask_bpp_
2914 + .set dst_w_bpp, \dst_w_bpp_
2915 + .set pixblock_size, \pixblock_size_
2916 + .set dst_w_basereg, \dst_w_basereg_
2917 + .set dst_r_basereg, \dst_r_basereg_
2918 + .set src_basereg, \src_basereg_
2919 + .set mask_basereg, \mask_basereg_
2921 -.if use_nearest_scaling != 0
2922 +.if \use_nearest_scaling != 0
2924 * Assign symbolic names to registers for nearest scaling
2926 W .req r0
2927 DST_W .req r1
2928 SRC .req r2
2929 VX .req r3
2930 UNIT_X .req ip
2931 MASK .req lr
2932 TMP1 .req r4
2933 TMP2 .req r5
2934 DST_R .req r6
2935 SRC_WIDTH_FIXED .req r7
2937 .macro pixld_src x:vararg
2938 - pixld_s x
2939 + pixld_s \x
2940 .endm
2942 ldr UNIT_X, [sp]
2943 push {r4-r8, lr}
2944 ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)]
2945 .if mask_bpp != 0
2946 ldr MASK, [sp, #(24 + 8)]
2947 .endif
2948 @@ -986,89 +988,89 @@ 9:
2950 W .req r0 /* width (is updated during processing) */
2951 DST_W .req r1 /* destination buffer pointer for writes */
2952 SRC .req r2 /* source buffer pointer */
2953 DST_R .req ip /* destination buffer pointer for reads */
2954 MASK .req r3 /* mask pointer */
2956 .macro pixld_src x:vararg
2957 - pixld x
2958 + pixld \x
2959 .endm
2960 .endif
2962 -.if (((flags) & FLAG_DST_READWRITE) != 0)
2963 +.if (((\flags) & FLAG_DST_READWRITE) != 0)
2964 .set dst_r_bpp, dst_w_bpp
2965 .else
2966 .set dst_r_bpp, 0
2967 .endif
2968 -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2969 +.if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
2970 .set DEINTERLEAVE_32BPP_ENABLED, 1
2971 .else
2972 .set DEINTERLEAVE_32BPP_ENABLED, 0
2973 .endif
2975 .macro fetch_src_pixblock
2976 pixld_src pixblock_size, src_bpp, \
2977 (src_basereg - pixblock_size * src_bpp / 64), SRC
2978 .endm
2980 - init
2981 + \init
2982 mov DST_R, DST_W
2984 cmp W, #pixblock_size
2985 blt 8f
2987 - ensure_destination_ptr_alignment process_pixblock_head, \
2988 - process_pixblock_tail, \
2989 - process_pixblock_tail_head
2990 + ensure_destination_ptr_alignment \process_pixblock_head, \
2991 + \process_pixblock_tail, \
2992 + \process_pixblock_tail_head
2994 subs W, W, #pixblock_size
2995 blt 7f
2997 /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
2998 pixld_a pixblock_size, dst_r_bpp, \
2999 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
3000 fetch_src_pixblock
3001 pixld pixblock_size, mask_bpp, \
3002 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
3003 - process_pixblock_head
3004 + \process_pixblock_head
3005 subs W, W, #pixblock_size
3006 blt 2f
3008 - process_pixblock_tail_head
3009 + \process_pixblock_tail_head
3010 subs W, W, #pixblock_size
3011 bge 1b
3013 - process_pixblock_tail
3014 + \process_pixblock_tail
3015 pixst_a pixblock_size, dst_w_bpp, \
3016 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
3018 /* Process the remaining trailing pixels in the scanline (dst aligned) */
3019 process_trailing_pixels 0, 1, \
3020 - process_pixblock_head, \
3021 - process_pixblock_tail, \
3022 - process_pixblock_tail_head
3023 + \process_pixblock_head, \
3024 + \process_pixblock_tail, \
3025 + \process_pixblock_tail_head
3027 - cleanup
3028 -.if use_nearest_scaling != 0
3029 + \cleanup
3030 +.if \use_nearest_scaling != 0
3031 pop {r4-r8, pc} /* exit */
3032 .else
3033 bx lr /* exit */
3034 .endif
3036 /* Process the remaining trailing pixels in the scanline (dst unaligned) */
3037 process_trailing_pixels 0, 0, \
3038 - process_pixblock_head, \
3039 - process_pixblock_tail, \
3040 - process_pixblock_tail_head
3041 + \process_pixblock_head, \
3042 + \process_pixblock_tail, \
3043 + \process_pixblock_tail_head
3045 - cleanup
3046 + \cleanup
3048 -.if use_nearest_scaling != 0
3049 +.if \use_nearest_scaling != 0
3050 pop {r4-r8, pc} /* exit */
3052 .unreq DST_R
3053 .unreq SRC
3054 .unreq W
3055 .unreq VX
3056 .unreq UNIT_X
3057 .unreq TMP1
3058 @@ -1085,25 +1087,25 @@ 8:
3059 .unreq DST_R
3060 .unreq DST_W
3061 .unreq W
3062 .endif
3064 .purgem fetch_src_pixblock
3065 .purgem pixld_src
3067 - .endfunc
3068 + pixman_end_asm_function
3069 .endm
3071 .macro generate_composite_function_single_scanline x:vararg
3072 - generate_composite_function_scanline 0, x
3073 + generate_composite_function_scanline 0, \x
3074 .endm
3076 .macro generate_composite_function_nearest_scanline x:vararg
3077 - generate_composite_function_scanline 1, x
3078 + generate_composite_function_scanline 1, \x
3079 .endm
3081 /* Default prologue/epilogue, nothing special needs to be done */
3083 .macro default_init
3084 .endm
3086 .macro default_cleanup
3087 @@ -1129,56 +1131,56 @@ 8:
3088 * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
3089 * into a planar a8r8g8b8 format (with a, r, g, b color components
3090 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
3092 * Warning: the conversion is destructive and the original
3093 * value (in) is lost.
3095 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
3096 - vshrn.u16 out_r, in, #8
3097 - vshrn.u16 out_g, in, #3
3098 - vsli.u16 in, in, #5
3099 - vmov.u8 out_a, #255
3100 - vsri.u8 out_r, out_r, #5
3101 - vsri.u8 out_g, out_g, #6
3102 - vshrn.u16 out_b, in, #2
3103 + vshrn.u16 \out_r, \in, #8
3104 + vshrn.u16 \out_g, \in, #3
3105 + vsli.u16 \in, \in, #5
3106 + vmov.u8 \out_a, #255
3107 + vsri.u8 \out_r, \out_r, #5
3108 + vsri.u8 \out_g, \out_g, #6
3109 + vshrn.u16 \out_b, \in, #2
3110 .endm
3112 .macro convert_0565_to_x888 in, out_r, out_g, out_b
3113 - vshrn.u16 out_r, in, #8
3114 - vshrn.u16 out_g, in, #3
3115 - vsli.u16 in, in, #5
3116 - vsri.u8 out_r, out_r, #5
3117 - vsri.u8 out_g, out_g, #6
3118 - vshrn.u16 out_b, in, #2
3119 + vshrn.u16 \out_r, \in, #8
3120 + vshrn.u16 \out_g, \in, #3
3121 + vsli.u16 \in, \in, #5
3122 + vsri.u8 \out_r, \out_r, #5
3123 + vsri.u8 \out_g, \out_g, #6
3124 + vshrn.u16 \out_b, \in, #2
3125 .endm
3128 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
3129 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
3130 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
3131 * registers (tmp1, tmp2)
3133 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
3134 - vshll.u8 tmp1, in_g, #8
3135 - vshll.u8 out, in_r, #8
3136 - vshll.u8 tmp2, in_b, #8
3137 - vsri.u16 out, tmp1, #5
3138 - vsri.u16 out, tmp2, #11
3139 + vshll.u8 \tmp1, \in_g, #8
3140 + vshll.u8 \out, \in_r, #8
3141 + vshll.u8 \tmp2, \in_b, #8
3142 + vsri.u16 \out, \tmp1, #5
3143 + vsri.u16 \out, \tmp2, #11
3144 .endm
3147 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
3148 * returned in (out0, out1) registers pair. Requires one temporary
3149 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
3150 * value from 'in' is lost
3152 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
3153 - vshl.u16 out0, in, #5 /* G top 6 bits */
3154 - vshl.u16 tmp, in, #11 /* B top 5 bits */
3155 - vsri.u16 in, in, #5 /* R is ready in top bits */
3156 - vsri.u16 out0, out0, #6 /* G is ready in top bits */
3157 - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
3158 - vshr.u16 out1, in, #8 /* R is in place */
3159 - vsri.u16 out0, tmp, #8 /* G & B is in place */
3160 - vzip.u16 out0, out1 /* everything is in place */
3161 + vshl.u16 \out0, \in, #5 /* G top 6 bits */
3162 + vshl.u16 \tmp, \in, #11 /* B top 5 bits */
3163 + vsri.u16 \in, \in, #5 /* R is ready in top bits */
3164 + vsri.u16 \out0, \out0, #6 /* G is ready in top bits */
3165 + vsri.u16 \tmp, \tmp, #5 /* B is ready in top bits */
3166 + vshr.u16 \out1, \in, #8 /* R is in place */
3167 + vsri.u16 \out0, \tmp, #8 /* G & B is in place */
3168 + vzip.u16 \out0, \out1 /* everything is in place */
3169 .endm
3170 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3171 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3172 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm-scaled.S
3173 @@ -20,16 +20,20 @@
3174 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
3175 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
3176 * SOFTWARE.
3178 * Author: Jeff Muizelaar (jeff@infidigm.net)
3182 +#ifdef __clang__
3183 +#define subpls subspl
3184 +#endif
3186 /* Prevent the stack from becoming executable */
3187 #if defined(__linux__) && defined(__ELF__)
3188 .section .note.GNU-stack,"",%progbits
3189 #endif
3191 .text
3192 .arch armv6
3193 .object_arch armv4
3194 @@ -57,100 +61,105 @@
3195 * prefetch_braking_distance - stop prefetching when that many pixels are
3196 * remaining before the end of scanline
3199 .macro generate_nearest_scanline_func fname, bpp_shift, t, \
3200 prefetch_distance, \
3201 prefetch_braking_distance
3203 -pixman_asm_function fname
3204 +pixman_asm_function \fname
3205 W .req r0
3206 DST .req r1
3207 SRC .req r2
3208 VX .req r3
3209 UNIT_X .req ip
3210 TMP1 .req r4
3211 TMP2 .req r5
3212 VXMASK .req r6
3213 PF_OFFS .req r7
3214 SRC_WIDTH_FIXED .req r8
3216 ldr UNIT_X, [sp]
3217 push {r4, r5, r6, r7, r8, r10}
3218 - mvn VXMASK, #((1 << bpp_shift) - 1)
3219 + mvn VXMASK, #((1 << \bpp_shift) - 1)
3220 ldr SRC_WIDTH_FIXED, [sp, #28]
3222 /* define helper macro */
3223 .macro scale_2_pixels
3224 - ldr&t TMP1, [SRC, TMP1]
3225 - and TMP2, VXMASK, VX, asr #(16 - bpp_shift)
3226 + ldr\()\t TMP1, [SRC, TMP1]
3227 + and TMP2, VXMASK, VX, asr #(16 - \bpp_shift)
3228 adds VX, VX, UNIT_X
3229 - str&t TMP1, [DST], #(1 << bpp_shift)
3230 + str\()\t TMP1, [DST], #(1 << \bpp_shift)
3231 9: subpls VX, VX, SRC_WIDTH_FIXED
3232 bpl 9b
3234 - ldr&t TMP2, [SRC, TMP2]
3235 - and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
3236 + ldr\()\t TMP2, [SRC, TMP2]
3237 + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
3238 adds VX, VX, UNIT_X
3239 - str&t TMP2, [DST], #(1 << bpp_shift)
3240 + str\()\t TMP2, [DST], #(1 << \bpp_shift)
3241 9: subpls VX, VX, SRC_WIDTH_FIXED
3242 bpl 9b
3243 .endm
3245 /* now do the scaling */
3246 - and TMP1, VXMASK, VX, asr #(16 - bpp_shift)
3247 + and TMP1, VXMASK, VX, asr #(16 - \bpp_shift)
3248 adds VX, VX, UNIT_X
3249 9: subpls VX, VX, SRC_WIDTH_FIXED
3250 bpl 9b
3251 - subs W, W, #(8 + prefetch_braking_distance)
3252 + subs W, W, #(8 + \prefetch_braking_distance)
3253 blt 2f
3254 /* calculate prefetch offset */
3255 - mov PF_OFFS, #prefetch_distance
3256 + mov PF_OFFS, #\prefetch_distance
3257 mla PF_OFFS, UNIT_X, PF_OFFS, VX
3258 1: /* main loop, process 8 pixels per iteration with prefetch */
3259 - pld [SRC, PF_OFFS, asr #(16 - bpp_shift)]
3260 + pld [SRC, PF_OFFS, asr #(16 - \bpp_shift)]
3261 add PF_OFFS, UNIT_X, lsl #3
3262 scale_2_pixels
3263 scale_2_pixels
3264 scale_2_pixels
3265 scale_2_pixels
3266 subs W, W, #8
3267 bge 1b
3269 - subs W, W, #(4 - 8 - prefetch_braking_distance)
3270 + subs W, W, #(4 - 8 - \prefetch_braking_distance)
3271 blt 2f
3272 1: /* process the remaining pixels */
3273 scale_2_pixels
3274 scale_2_pixels
3275 subs W, W, #4
3276 bge 1b
3278 tst W, #2
3279 beq 2f
3280 scale_2_pixels
3282 tst W, #1
3283 - ldrne&t TMP1, [SRC, TMP1]
3284 - strne&t TMP1, [DST]
3285 +#ifdef __clang__
3286 + ldr\()\t\()ne TMP1, [SRC, TMP1]
3287 + str\()\t\()ne TMP1, [DST]
3288 +#else
3289 + ldrne\()\t TMP1, [SRC, TMP1]
3290 + strne\()\t TMP1, [DST]
3291 +#endif
3292 /* cleanup helper macro */
3293 .purgem scale_2_pixels
3294 .unreq DST
3295 .unreq SRC
3296 .unreq W
3297 .unreq VX
3298 .unreq UNIT_X
3299 .unreq TMP1
3300 .unreq TMP2
3301 .unreq VXMASK
3302 .unreq PF_OFFS
3303 .unreq SRC_WIDTH_FIXED
3304 /* return */
3305 pop {r4, r5, r6, r7, r8, r10}
3306 bx lr
3307 -.endfunc
3308 +pixman_end_asm_function
3309 .endm
3311 generate_nearest_scanline_func \
3312 pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
3314 generate_nearest_scanline_func \
3315 pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
3316 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3317 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3318 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.S
3319 @@ -20,16 +20,21 @@
3320 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
3321 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
3322 * SOFTWARE.
3324 * Author: Ben Avison (bavison@riscosopen.org)
3328 +#ifdef __clang__
3329 +#define adceqs adcseq
3330 +#define ldmnedb ldmdbne
3331 +#endif
3333 /* Prevent the stack from becoming executable */
3334 #if defined(__linux__) && defined(__ELF__)
3335 .section .note.GNU-stack,"",%progbits
3336 #endif
3338 .text
3339 .arch armv6
3340 .object_arch armv4
3341 @@ -52,26 +57,26 @@
3342 * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
3345 .macro blit_init
3346 line_saved_regs STRIDE_D, STRIDE_S
3347 .endm
3349 .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3350 - pixld cond, numbytes, firstreg, SRC, unaligned_src
3351 + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
3352 .endm
3354 .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
3355 WK4 .req STRIDE_D
3356 WK5 .req STRIDE_S
3357 WK6 .req MASK
3358 WK7 .req STRIDE_M
3359 -110: pixld , 16, 0, SRC, unaligned_src
3360 - pixld , 16, 4, SRC, unaligned_src
3361 +110: pixld , 16, 0, SRC, \unaligned_src
3362 + pixld , 16, 4, SRC, \unaligned_src
3363 pld [SRC, SCRATCH]
3364 pixst , 16, 0, DST
3365 pixst , 16, 4, DST
3366 subs X, X, #32*8/src_bpp
3367 bhs 110b
3368 .unreq WK4
3369 .unreq WK5
3370 .unreq WK6
3371 @@ -137,17 +142,17 @@ generate_composite_function \
3372 mov STRIDE_M, SRC
3373 .endm
3375 .macro fill_process_tail cond, numbytes, firstreg
3376 WK4 .req SRC
3377 WK5 .req STRIDE_S
3378 WK6 .req MASK
3379 WK7 .req STRIDE_M
3380 - pixst cond, numbytes, 4, DST
3381 + pixst \cond, \numbytes, 4, DST
3382 .unreq WK4
3383 .unreq WK5
3384 .unreq WK6
3385 .unreq WK7
3386 .endm
3388 generate_composite_function \
3389 pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
3390 @@ -177,30 +182,30 @@ generate_composite_function \
3391 nop_macro, /* newline */ \
3392 nop_macro /* cleanup */ \
3393 nop_macro /* process head */ \
3394 fill_process_tail
3396 /******************************************************************************/
3398 .macro src_x888_8888_pixel, cond, reg
3399 - orr&cond WK&reg, WK&reg, #0xFF000000
3400 + orr\()\cond WK\()\reg, WK\()\reg, #0xFF000000
3401 .endm
3403 .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3404 - pixld cond, numbytes, firstreg, SRC, unaligned_src
3405 + pixld \cond, \numbytes, \firstreg, SRC, \unaligned_src
3406 .endm
3408 .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
3409 - src_x888_8888_pixel cond, %(firstreg+0)
3410 - .if numbytes >= 8
3411 - src_x888_8888_pixel cond, %(firstreg+1)
3412 - .if numbytes == 16
3413 - src_x888_8888_pixel cond, %(firstreg+2)
3414 - src_x888_8888_pixel cond, %(firstreg+3)
3415 + src_x888_8888_pixel \cond, %(\firstreg+0)
3416 + .if \numbytes >= 8
3417 + src_x888_8888_pixel \cond, %(\firstreg+1)
3418 + .if \numbytes == 16
3419 + src_x888_8888_pixel \cond, %(\firstreg+2)
3420 + src_x888_8888_pixel \cond, %(\firstreg+3)
3421 .endif
3422 .endif
3423 .endm
3425 generate_composite_function \
3426 pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
3427 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
3428 3, /* prefetch distance */ \
3429 @@ -217,83 +222,83 @@ generate_composite_function \
3430 ldr MASK, =0x07E007E0
3431 mov STRIDE_M, #0xFF000000
3432 /* Set GE[3:0] to 1010 so SEL instructions do what we want */
3433 ldr SCRATCH, =0x80008000
3434 uadd8 SCRATCH, SCRATCH, SCRATCH
3435 .endm
3437 .macro src_0565_8888_2pixels, reg1, reg2
3438 - and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3439 - bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3440 - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3441 - mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
3442 - mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
3443 - bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
3444 - orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
3445 - orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
3446 - pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
3447 - sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
3448 - mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
3449 - pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
3450 - sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
3451 - orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
3452 - orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3453 + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3454 + bic WK\()\reg2, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3455 + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3456 + mov WK\()\reg1, WK\()\reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
3457 + mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
3458 + bic WK\()\reg2, WK\()\reg2, WK\()\reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
3459 + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
3460 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
3461 + pkhtb WK\()\reg1, WK\()\reg1, WK\()\reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
3462 + sel WK\()\reg1, WK\()\reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
3463 + mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
3464 + pkhtb WK\()\reg2, WK\()\reg2, WK\()\reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
3465 + sel WK\()\reg2, WK\()\reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
3466 + orr WK\()\reg1, STRIDE_M, WK\()\reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
3467 + orr WK\()\reg2, STRIDE_M, WK\()\reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3468 .endm
3470 /* This version doesn't need STRIDE_M, but is one instruction longer.
3471 It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
3472 - and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3473 - bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3474 - orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3475 - mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
3476 - mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
3477 - bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
3478 - mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
3479 - mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3480 - orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
3481 - orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3482 - pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
3483 - pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3484 - sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
3485 - sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
3486 - orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3487 - orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3488 + and SCRATCH, WK\()\reg1, MASK @ 00000GGGGGG0000000000gggggg00000
3489 + bic WK\()\reg1, WK\()\reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
3490 + orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
3491 + mov WK\()\reg2, WK\()\reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
3492 + mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
3493 + bic WK\()\reg1, WK\()\reg1, WK\()\reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
3494 + mov WK\()\reg2, WK\()\reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
3495 + mov WK\()\reg1, WK\()\reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3496 + orr WK\()\reg2, WK\()\reg2, WK\()\reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
3497 + orr WK\()\reg1, WK\()\reg1, WK\()\reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3498 + pkhbt WK\()\reg2, WK\()\reg2, WK\()\reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
3499 + pkhbt WK\()\reg1, WK\()\reg1, WK\()\reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3500 + sel WK\()\reg2, SCRATCH, WK\()\reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
3501 + sel WK\()\reg1, SCRATCH, WK\()\reg1 @ --------rrrrrrrrggggggggbbbbbbbb
3502 + orr WK\()\reg2, WK\()\reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
3503 + orr WK\()\reg1, WK\()\reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3506 .macro src_0565_8888_1pixel, reg
3507 - bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
3508 - and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
3509 - mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3510 - mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
3511 - orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3512 - orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
3513 - pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3514 - sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
3515 - orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3516 + bic SCRATCH, WK\()\reg, MASK @ 0000000000000000rrrrr000000bbbbb
3517 + and WK\()\reg, WK\()\reg, MASK @ 000000000000000000000gggggg00000
3518 + mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
3519 + mov WK\()\reg, WK\()\reg, lsl #5 @ 0000000000000000gggggg0000000000
3520 + orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
3521 + orr WK\()\reg, WK\()\reg, WK\()\reg, lsr #6 @ 000000000000000gggggggggggg00000
3522 + pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
3523 + sel WK\()\reg, WK\()\reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
3524 + orr WK\()\reg, WK\()\reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
3525 .endm
3527 .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3528 - .if numbytes == 16
3529 - pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
3530 - .elseif numbytes == 8
3531 - pixld , 4, firstreg, SRC, unaligned_src
3532 - .elseif numbytes == 4
3533 - pixld , 2, firstreg, SRC, unaligned_src
3534 + .if \numbytes == 16
3535 + pixldst ld,, 8, \firstreg, %(\firstreg+2),,, SRC, \unaligned_src
3536 + .elseif \numbytes == 8
3537 + pixld , 4, \firstreg, SRC, \unaligned_src
3538 + .elseif \numbytes == 4
3539 + pixld , 2, \firstreg, SRC, \unaligned_src
3540 .endif
3541 .endm
3543 .macro src_0565_8888_process_tail cond, numbytes, firstreg
3544 - .if numbytes == 16
3545 - src_0565_8888_2pixels firstreg, %(firstreg+1)
3546 - src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
3547 - .elseif numbytes == 8
3548 - src_0565_8888_2pixels firstreg, %(firstreg+1)
3549 + .if \numbytes == 16
3550 + src_0565_8888_2pixels \firstreg, %(\firstreg+1)
3551 + src_0565_8888_2pixels %(\firstreg+2), %(\firstreg+3)
3552 + .elseif \numbytes == 8
3553 + src_0565_8888_2pixels \firstreg, %(\firstreg+1)
3554 .else
3555 - src_0565_8888_1pixel firstreg
3556 + src_0565_8888_1pixel \firstreg
3557 .endif
3558 .endm
3560 generate_composite_function \
3561 pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
3562 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
3563 3, /* prefetch distance */ \
3564 src_0565_8888_init, \
3565 @@ -306,67 +311,67 @@ generate_composite_function \
3567 .macro src_x888_0565_init
3568 /* Hold loop invariant in MASK */
3569 ldr MASK, =0x001F001F
3570 line_saved_regs STRIDE_S, ORIG_W
3571 .endm
3573 .macro src_x888_0565_1pixel s, d
3574 - and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3575 - and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
3576 - orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3577 - orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3578 + and WK\()\d, MASK, WK\()\s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3579 + and STRIDE_S, WK\()\s, #0xFC00 @ 0000000000000000gggggg0000000000
3580 + orr WK\()\d, WK\()\d, WK\()\d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3581 + orr WK\()\d, WK\()\d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3582 /* Top 16 bits are discarded during the following STRH */
3583 .endm
3585 .macro src_x888_0565_2pixels slo, shi, d, tmp
3586 - and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
3587 - and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
3588 - and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3589 - orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
3590 - orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
3591 - and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
3592 - orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3593 - orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3594 - pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
3595 + and SCRATCH, WK\()\shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
3596 + and WK\()\tmp, MASK, WK\()\shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
3597 + and WK\()\shi, MASK, WK\()\slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
3598 + orr WK\()\tmp, WK\()\tmp, WK\()\tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
3599 + orr WK\()\tmp, WK\()\tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
3600 + and SCRATCH, WK\()\slo, #0xFC00 @ 0000000000000000gggggg0000000000
3601 + orr WK\()\shi, WK\()\shi, WK\()\shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
3602 + orr WK\()\shi, WK\()\shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
3603 + pkhbt WK\()\d, WK\()\shi, WK\()\tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
3604 .endm
3606 .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3607 WK4 .req STRIDE_S
3608 WK5 .req STRIDE_M
3609 WK6 .req WK3
3610 WK7 .req ORIG_W
3611 - .if numbytes == 16
3612 + .if \numbytes == 16
3613 pixld , 16, 4, SRC, 0
3614 src_x888_0565_2pixels 4, 5, 0, 0
3615 pixld , 8, 4, SRC, 0
3616 src_x888_0565_2pixels 6, 7, 1, 1
3617 pixld , 8, 6, SRC, 0
3618 .else
3619 - pixld , numbytes*2, 4, SRC, 0
3620 + pixld , \numbytes*2, 4, SRC, 0
3621 .endif
3622 .endm
3624 .macro src_x888_0565_process_tail cond, numbytes, firstreg
3625 - .if numbytes == 16
3626 + .if \numbytes == 16
3627 src_x888_0565_2pixels 4, 5, 2, 2
3628 src_x888_0565_2pixels 6, 7, 3, 4
3629 - .elseif numbytes == 8
3630 + .elseif \numbytes == 8
3631 src_x888_0565_2pixels 4, 5, 1, 1
3632 src_x888_0565_2pixels 6, 7, 2, 2
3633 - .elseif numbytes == 4
3634 + .elseif \numbytes == 4
3635 src_x888_0565_2pixels 4, 5, 1, 1
3636 .else
3637 src_x888_0565_1pixel 4, 1
3638 .endif
3639 - .if numbytes == 16
3640 - pixst , numbytes, 0, DST
3641 + .if \numbytes == 16
3642 + pixst , \numbytes, 0, DST
3643 .else
3644 - pixst , numbytes, 1, DST
3645 + pixst , \numbytes, 1, DST
3646 .endif
3647 .unreq WK4
3648 .unreq WK5
3649 .unreq WK6
3650 .unreq WK7
3651 .endm
3653 generate_composite_function \
3654 @@ -377,47 +382,47 @@ generate_composite_function \
3655 nop_macro, /* newline */ \
3656 nop_macro, /* cleanup */ \
3657 src_x888_0565_process_head, \
3658 src_x888_0565_process_tail
3660 /******************************************************************************/
3662 .macro add_8_8_8pixels cond, dst1, dst2
3663 - uqadd8&cond WK&dst1, WK&dst1, MASK
3664 - uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
3665 + uqadd8\()\cond WK\()\dst1, WK\()\dst1, MASK
3666 + uqadd8\()\cond WK\()\dst2, WK\()\dst2, STRIDE_M
3667 .endm
3669 .macro add_8_8_4pixels cond, dst
3670 - uqadd8&cond WK&dst, WK&dst, MASK
3671 + uqadd8\()\cond WK\()\dst, WK\()\dst, MASK
3672 .endm
3674 .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3675 WK4 .req MASK
3676 WK5 .req STRIDE_M
3677 - .if numbytes == 16
3678 - pixld cond, 8, 4, SRC, unaligned_src
3679 - pixld cond, 16, firstreg, DST, 0
3680 - add_8_8_8pixels cond, firstreg, %(firstreg+1)
3681 - pixld cond, 8, 4, SRC, unaligned_src
3682 + .if \numbytes == 16
3683 + pixld \cond, 8, 4, SRC, \unaligned_src
3684 + pixld \cond, 16, \firstreg, DST, 0
3685 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
3686 + pixld \cond, 8, 4, SRC, \unaligned_src
3687 .else
3688 - pixld cond, numbytes, 4, SRC, unaligned_src
3689 - pixld cond, numbytes, firstreg, DST, 0
3690 + pixld \cond, \numbytes, 4, SRC, \unaligned_src
3691 + pixld \cond, \numbytes, \firstreg, DST, 0
3692 .endif
3693 .unreq WK4
3694 .unreq WK5
3695 .endm
3697 .macro add_8_8_process_tail cond, numbytes, firstreg
3698 - .if numbytes == 16
3699 - add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
3700 - .elseif numbytes == 8
3701 - add_8_8_8pixels cond, firstreg, %(firstreg+1)
3702 + .if \numbytes == 16
3703 + add_8_8_8pixels \cond, %(\firstreg+2), %(\firstreg+3)
3704 + .elseif \numbytes == 8
3705 + add_8_8_8pixels \cond, \firstreg, %(\firstreg+1)
3706 .else
3707 - add_8_8_4pixels cond, firstreg
3708 + add_8_8_4pixels \cond, \firstreg
3709 .endif
3710 .endm
3712 generate_composite_function \
3713 pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
3714 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
3715 2, /* prefetch distance */ \
3716 nop_macro, /* init */ \
3717 @@ -436,82 +441,82 @@ generate_composite_function \
3718 line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
3719 .endm
3721 .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3722 WK4 .req STRIDE_D
3723 WK5 .req STRIDE_S
3724 WK6 .req STRIDE_M
3725 WK7 .req ORIG_W
3726 - pixld , numbytes, %(4+firstreg), SRC, unaligned_src
3727 - pixld , numbytes, firstreg, DST, 0
3728 + pixld , \numbytes, %(4+\firstreg), SRC, \unaligned_src
3729 + pixld , \numbytes, \firstreg, DST, 0
3730 .unreq WK4
3731 .unreq WK5
3732 .unreq WK6
3733 .unreq WK7
3734 .endm
3736 .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
3737 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
3738 - teq WK&reg0, #0
3739 - .if numbytes > 4
3740 - teqeq WK&reg1, #0
3741 - .if numbytes > 8
3742 - teqeq WK&reg2, #0
3743 - teqeq WK&reg3, #0
3744 + teq WK\()\reg0, #0
3745 + .if \numbytes > 4
3746 + teqeq WK\()\reg1, #0
3747 + .if \numbytes > 8
3748 + teqeq WK\()\reg2, #0
3749 + teqeq WK\()\reg3, #0
3750 .endif
3751 .endif
3752 .endm
3754 .macro over_8888_8888_prepare next
3755 - mov WK&next, WK&next, lsr #24
3756 + mov WK\()\next, WK\()\next, lsr #24
3757 .endm
3759 .macro over_8888_8888_1pixel src, dst, offset, next
3760 /* src = destination component multiplier */
3761 - rsb WK&src, WK&src, #255
3762 + rsb WK\()\src, WK\()\src, #255
3763 /* Split even/odd bytes of dst into SCRATCH/dst */
3764 - uxtb16 SCRATCH, WK&dst
3765 - uxtb16 WK&dst, WK&dst, ror #8
3766 + uxtb16 SCRATCH, WK\()\dst
3767 + uxtb16 WK\()\dst, WK\()\dst, ror #8
3768 /* Multiply through, adding 0.5 to the upper byte of result for rounding */
3769 - mla SCRATCH, SCRATCH, WK&src, MASK
3770 - mla WK&dst, WK&dst, WK&src, MASK
3771 + mla SCRATCH, SCRATCH, WK\()\src, MASK
3772 + mla WK\()\dst, WK\()\dst, WK\()\src, MASK
3773 /* Where we would have had a stall between the result of the first MLA and the shifter input,
3774 * reload the complete source pixel */
3775 - ldr WK&src, [SRC, #offset]
3776 + ldr WK\()\src, [SRC, #\offset]
3777 /* Multiply by 257/256 to approximate 256/255 */
3778 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
3779 /* In this stall, start processing the next pixel */
3780 - .if offset < -4
3781 - mov WK&next, WK&next, lsr #24
3782 + .if \offset < -4
3783 + mov WK\()\next, WK\()\next, lsr #24
3784 .endif
3785 - uxtab16 WK&dst, WK&dst, WK&dst, ror #8
3786 + uxtab16 WK\()\dst, WK\()\dst, WK\()\dst, ror #8
3787 /* Recombine even/odd bytes of multiplied destination */
3788 mov SCRATCH, SCRATCH, ror #8
3789 - sel WK&dst, SCRATCH, WK&dst
3790 + sel WK\()\dst, SCRATCH, WK\()\dst
3791 /* Saturated add of source to multiplied destination */
3792 - uqadd8 WK&dst, WK&dst, WK&src
3793 + uqadd8 WK\()\dst, WK\()\dst, WK\()\src
3794 .endm
3796 .macro over_8888_8888_process_tail cond, numbytes, firstreg
3797 WK4 .req STRIDE_D
3798 WK5 .req STRIDE_S
3799 WK6 .req STRIDE_M
3800 WK7 .req ORIG_W
3801 - over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
3802 + over_8888_8888_check_transparent \numbytes, %(4+\firstreg), %(5+\firstreg), %(6+\firstreg), %(7+\firstreg)
3803 beq 10f
3804 - over_8888_8888_prepare %(4+firstreg)
3805 - .set PROCESS_REG, firstreg
3806 - .set PROCESS_OFF, -numbytes
3807 - .rept numbytes / 4
3808 + over_8888_8888_prepare %(4+\firstreg)
3809 + .set PROCESS_REG, \firstreg
3810 + .set PROCESS_OFF, -\numbytes
3811 + .rept \numbytes / 4
3812 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
3813 .set PROCESS_REG, PROCESS_REG+1
3814 .set PROCESS_OFF, PROCESS_OFF+4
3815 .endr
3816 - pixst , numbytes, firstreg, DST
3817 + pixst , \numbytes, \firstreg, DST
3819 .unreq WK4
3820 .unreq WK5
3821 .unreq WK6
3822 .unreq WK7
3823 .endm
3825 generate_composite_function \
3826 @@ -531,26 +536,26 @@ generate_composite_function \
3827 * word Register containing 4 bytes
3828 * byte Register containing byte multiplier (bits 8-31 must be 0)
3829 * tmp Scratch register
3830 * half Register containing the constant 0x00800080
3831 * GE[3:0] bits must contain 0101
3833 .macro mul_8888_8 word, byte, tmp, half
3834 /* Split even/odd bytes of word apart */
3835 - uxtb16 tmp, word
3836 - uxtb16 word, word, ror #8
3837 + uxtb16 \tmp, \word
3838 + uxtb16 \word, \word, ror #8
3839 /* Multiply bytes together with rounding, then by 257/256 */
3840 - mla tmp, tmp, byte, half
3841 - mla word, word, byte, half /* 1 stall follows */
3842 - uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
3843 - uxtab16 word, word, word, ror #8
3844 + mla \tmp, \tmp, \byte, \half
3845 + mla \word, \word, \byte, \half /* 1 stall follows */
3846 + uxtab16 \tmp, \tmp, \tmp, ror #8 /* 1 stall follows */
3847 + uxtab16 \word, \word, \word, ror #8
3848 /* Recombine bytes */
3849 - mov tmp, tmp, ror #8
3850 - sel word, tmp, word
3851 + mov \tmp, \tmp, ror #8
3852 + sel \word, \tmp, \word
3853 .endm
3855 /******************************************************************************/
3857 .macro over_8888_n_8888_init
3858 /* Mask is constant */
3859 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
3860 /* Hold loop invariant in STRIDE_M */
3861 @@ -562,51 +567,51 @@ generate_composite_function \
3862 line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
3863 .endm
3865 .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3866 WK4 .req Y
3867 WK5 .req STRIDE_D
3868 WK6 .req STRIDE_S
3869 WK7 .req ORIG_W
3870 - pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
3871 - pixld , numbytes, firstreg, DST, 0
3872 + pixld , \numbytes, %(4+(\firstreg%2)), SRC, \unaligned_src
3873 + pixld , \numbytes, \firstreg, DST, 0
3874 .unreq WK4
3875 .unreq WK5
3876 .unreq WK6
3877 .unreq WK7
3878 .endm
3880 .macro over_8888_n_8888_1pixel src, dst
3881 - mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
3882 - sub WK7, WK6, WK&src, lsr #24
3883 - mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
3884 - uqadd8 WK&dst, WK&dst, WK&src
3885 + mul_8888_8 WK\()\src, MASK, SCRATCH, STRIDE_M
3886 + sub WK7, WK6, WK\()\src, lsr #24
3887 + mul_8888_8 WK\()\dst, WK7, SCRATCH, STRIDE_M
3888 + uqadd8 WK\()\dst, WK\()\dst, WK\()\src
3889 .endm
3891 .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
3892 WK4 .req Y
3893 WK5 .req STRIDE_D
3894 WK6 .req STRIDE_S
3895 WK7 .req ORIG_W
3896 - over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
3897 + over_8888_8888_check_transparent \numbytes, %(4+(\firstreg%2)), %(5+(\firstreg%2)), %(6+\firstreg), %(7+\firstreg)
3898 beq 10f
3899 mov WK6, #255
3900 - .set PROCESS_REG, firstreg
3901 - .rept numbytes / 4
3902 - .if numbytes == 16 && PROCESS_REG == 2
3903 + .set PROCESS_REG, \firstreg
3904 + .rept \numbytes / 4
3905 + .if \numbytes == 16 && PROCESS_REG == 2
3906 /* We're using WK6 and WK7 as temporaries, so half way through
3907 * 4 pixels, reload the second two source pixels but this time
3908 * into WK4 and WK5 */
3909 ldmdb SRC, {WK4, WK5}
3910 .endif
3911 over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
3912 .set PROCESS_REG, PROCESS_REG+1
3913 .endr
3914 - pixst , numbytes, firstreg, DST
3915 + pixst , \numbytes, \firstreg, DST
3917 .unreq WK4
3918 .unreq WK5
3919 .unreq WK6
3920 .unreq WK7
3921 .endm
3923 generate_composite_function \
3924 @@ -637,47 +642,47 @@ generate_composite_function \
3925 ldr STRIDE_D, =0x00800080
3926 b 1f
3927 .ltorg
3929 .endm
3931 .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3932 WK4 .req STRIDE_M
3933 - pixld , numbytes/4, 4, MASK, unaligned_mask
3934 - pixld , numbytes, firstreg, DST, 0
3935 + pixld , \numbytes/4, 4, MASK, \unaligned_mask
3936 + pixld , \numbytes, \firstreg, DST, 0
3937 .unreq WK4
3938 .endm
3940 .macro over_n_8_8888_1pixel src, dst
3941 - uxtb Y, WK4, ror #src*8
3942 + uxtb Y, WK4, ror #\src*8
3943 /* Trailing part of multiplication of source */
3944 mla SCRATCH, STRIDE_S, Y, STRIDE_D
3945 mla Y, SRC, Y, STRIDE_D
3946 mov ORIG_W, #255
3947 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
3948 uxtab16 Y, Y, Y, ror #8
3949 mov SCRATCH, SCRATCH, ror #8
3950 sub ORIG_W, ORIG_W, Y, lsr #24
3951 sel Y, SCRATCH, Y
3952 /* Then multiply the destination */
3953 - mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
3954 - uqadd8 WK&dst, WK&dst, Y
3955 + mul_8888_8 WK\()\dst, ORIG_W, SCRATCH, STRIDE_D
3956 + uqadd8 WK\()\dst, WK\()\dst, Y
3957 .endm
3959 .macro over_n_8_8888_process_tail cond, numbytes, firstreg
3960 WK4 .req STRIDE_M
3961 teq WK4, #0
3962 beq 10f
3963 - .set PROCESS_REG, firstreg
3964 - .rept numbytes / 4
3965 - over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
3966 + .set PROCESS_REG, \firstreg
3967 + .rept \numbytes / 4
3968 + over_n_8_8888_1pixel %(PROCESS_REG-\firstreg), %(PROCESS_REG)
3969 .set PROCESS_REG, PROCESS_REG+1
3970 .endr
3971 - pixst , numbytes, firstreg, DST
3972 + pixst , \numbytes, \firstreg, DST
3974 .unreq WK4
3975 .endm
3977 generate_composite_function \
3978 pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
3979 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
3980 2, /* prefetch distance */ \
3981 @@ -700,64 +705,64 @@ generate_composite_function \
3982 line_saved_regs STRIDE_D, ORIG_W
3983 .endm
3985 .macro over_reverse_n_8888_newline
3986 mov STRIDE_D, #0xFF
3987 .endm
3989 .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
3990 - pixld , numbytes, firstreg, DST, 0
3991 + pixld , \numbytes, \firstreg, DST, 0
3992 .endm
3994 .macro over_reverse_n_8888_1pixel d, is_only
3995 - teq WK&d, #0
3996 + teq WK\()\d, #0
3997 beq 8f /* replace with source */
3998 - bics ORIG_W, STRIDE_D, WK&d, lsr #24
3999 - .if is_only == 1
4000 + bics ORIG_W, STRIDE_D, WK\()\d, lsr #24
4001 + .if \is_only == 1
4002 beq 49f /* skip store */
4003 .else
4004 beq 9f /* write same value back */
4005 .endif
4006 mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
4007 mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
4008 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
4009 uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
4010 mov SCRATCH, SCRATCH, ror #8
4011 sel ORIG_W, SCRATCH, ORIG_W
4012 - uqadd8 WK&d, WK&d, ORIG_W
4013 + uqadd8 WK\()\d, WK\()\d, ORIG_W
4014 b 9f
4015 -8: mov WK&d, SRC
4016 +8: mov WK\()\d, SRC
4018 .endm
4020 .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
4021 - .if numbytes == 4
4022 - over_reverse_n_8888_1pixel reg1, 1
4023 + .if \numbytes == 4
4024 + over_reverse_n_8888_1pixel \reg1, 1
4025 .else
4026 - and SCRATCH, WK&reg1, WK&reg2
4027 - .if numbytes == 16
4028 - and SCRATCH, SCRATCH, WK&reg3
4029 - and SCRATCH, SCRATCH, WK&reg4
4030 + and SCRATCH, WK\()\reg1, WK\()\reg2
4031 + .if \numbytes == 16
4032 + and SCRATCH, SCRATCH, WK\()\reg3
4033 + and SCRATCH, SCRATCH, WK\()\reg4
4034 .endif
4035 mvns SCRATCH, SCRATCH, asr #24
4036 beq 49f /* skip store if all opaque */
4037 - over_reverse_n_8888_1pixel reg1, 0
4038 - over_reverse_n_8888_1pixel reg2, 0
4039 - .if numbytes == 16
4040 - over_reverse_n_8888_1pixel reg3, 0
4041 - over_reverse_n_8888_1pixel reg4, 0
4042 + over_reverse_n_8888_1pixel \reg1, 0
4043 + over_reverse_n_8888_1pixel \reg2, 0
4044 + .if \numbytes == 16
4045 + over_reverse_n_8888_1pixel \reg3, 0
4046 + over_reverse_n_8888_1pixel \reg4, 0
4047 .endif
4048 .endif
4049 - pixst , numbytes, reg1, DST
4050 + pixst , \numbytes, \reg1, DST
4052 .endm
4054 .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
4055 - over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
4056 + over_reverse_n_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
4057 .endm
4059 generate_composite_function \
4060 pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 \
4061 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
4062 3, /* prefetch distance */ \
4063 over_reverse_n_8888_init, \
4064 over_reverse_n_8888_newline, \
4065 @@ -789,30 +794,30 @@ generate_composite_function \
4066 .unreq TMP1
4067 .unreq TMP2
4068 .unreq TMP3
4069 .unreq WK4
4070 .endm
4072 .macro over_white_8888_8888_ca_combine m, d
4073 uxtb16 TMP1, TMP0 /* rb_notmask */
4074 - uxtb16 TMP2, d /* rb_dest; 1 stall follows */
4075 + uxtb16 TMP2, \d /* rb_dest; 1 stall follows */
4076 smlatt TMP3, TMP2, TMP1, HALF /* red */
4077 smlabb TMP2, TMP2, TMP1, HALF /* blue */
4078 uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
4079 - uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
4080 - smlatt d, TMP1, TMP0, HALF /* alpha */
4081 + uxtb16 TMP1, \d, ror #8 /* ag_dest; 1 stall follows */
4082 + smlatt \d, TMP1, TMP0, HALF /* alpha */
4083 smlabb TMP1, TMP1, TMP0, HALF /* green */
4084 pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
4085 - pkhbt TMP1, TMP1, d, lsl #16 /* ag */
4086 + pkhbt TMP1, TMP1, \d, lsl #16 /* ag */
4087 uxtab16 TMP0, TMP0, TMP0, ror #8
4088 uxtab16 TMP1, TMP1, TMP1, ror #8
4089 mov TMP0, TMP0, ror #8
4090 - sel d, TMP0, TMP1
4091 - uqadd8 d, d, m /* d is a late result */
4092 + sel \d, TMP0, TMP1
4093 + uqadd8 \d, \d, \m /* d is a late result */
4094 .endm
4096 .macro over_white_8888_8888_ca_1pixel_head
4097 pixld , 4, 1, MASK, 0
4098 pixld , 4, 3, DST, 0
4099 .endm
4101 .macro over_white_8888_8888_ca_1pixel_tail
4102 @@ -848,29 +853,29 @@ 02: mvn TMP0, WK2
4103 movcs WK4, WK2
4104 b 04f
4105 03: over_white_8888_8888_ca_combine WK2, WK4
4106 04: pixst , 8, 3, DST
4108 .endm
4110 .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4111 - .if numbytes == 4
4112 + .if \numbytes == 4
4113 over_white_8888_8888_ca_1pixel_head
4114 .else
4115 - .if numbytes == 16
4116 + .if \numbytes == 16
4117 over_white_8888_8888_ca_2pixels_head
4118 over_white_8888_8888_ca_2pixels_tail
4119 .endif
4120 over_white_8888_8888_ca_2pixels_head
4121 .endif
4122 .endm
4124 .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
4125 - .if numbytes == 4
4126 + .if \numbytes == 4
4127 over_white_8888_8888_ca_1pixel_tail
4128 .else
4129 over_white_8888_8888_ca_2pixels_tail
4130 .endif
4131 .endm
4133 generate_composite_function \
4134 pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32 \
4135 @@ -999,33 +1004,33 @@ 20: /* No simplifications possible -
4136 uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
4137 30: /* The destination buffer is already in the L1 cache, so
4138 * there's little point in amalgamating writes */
4139 pixst , 4, 0, DST
4141 .endm
4143 .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4144 - .rept (numbytes / 4) - 1
4145 + .rept (\numbytes / 4) - 1
4146 over_n_8888_8888_ca_1pixel_head
4147 over_n_8888_8888_ca_1pixel_tail
4148 .endr
4149 over_n_8888_8888_ca_1pixel_head
4150 .endm
4152 .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
4153 over_n_8888_8888_ca_1pixel_tail
4154 .endm
4156 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
4157 ldr ip, [sp]
4158 cmp ip, #-1
4159 beq pixman_composite_over_white_8888_8888_ca_asm_armv6
4160 /* else drop through... */
4161 - .endfunc
4162 + pixman_end_asm_function
4163 generate_composite_function \
4164 pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32 \
4165 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0 \
4166 2, /* prefetch distance */ \
4167 over_n_8888_8888_ca_init, \
4168 nop_macro, /* newline */ \
4169 over_n_8888_8888_ca_cleanup, \
4170 over_n_8888_8888_ca_process_head, \
4171 @@ -1040,94 +1045,94 @@ generate_composite_function \
4172 uadd8 SCRATCH, MASK, MASK
4173 /* Offset the source pointer: we only need the alpha bytes */
4174 add SRC, SRC, #3
4175 line_saved_regs ORIG_W
4176 .endm
4178 .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
4179 ldrb ORIG_W, [SRC], #4
4180 - .if numbytes >= 8
4181 - ldrb WK&reg1, [SRC], #4
4182 - .if numbytes == 16
4183 - ldrb WK&reg2, [SRC], #4
4184 - ldrb WK&reg3, [SRC], #4
4185 + .if \numbytes >= 8
4186 + ldrb WK\()\reg1, [SRC], #4
4187 + .if \numbytes == 16
4188 + ldrb WK\()\reg2, [SRC], #4
4189 + ldrb WK\()\reg3, [SRC], #4
4190 .endif
4191 .endif
4192 - add DST, DST, #numbytes
4193 + add DST, DST, #\numbytes
4194 .endm
4196 .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4197 - in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
4198 + in_reverse_8888_8888_head \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2)
4199 .endm
4201 .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
4202 - .if is_only != 1
4203 - movs s, ORIG_W
4204 - .if offset != 0
4205 - ldrb ORIG_W, [SRC, #offset]
4206 + .if \is_only != 1
4207 + movs \s, ORIG_W
4208 + .if \offset != 0
4209 + ldrb ORIG_W, [SRC, #\offset]
4210 .endif
4211 beq 01f
4212 teq STRIDE_M, #0xFF
4213 beq 02f
4214 .endif
4215 - uxtb16 SCRATCH, d /* rb_dest */
4216 - uxtb16 d, d, ror #8 /* ag_dest */
4217 - mla SCRATCH, SCRATCH, s, MASK
4218 - mla d, d, s, MASK
4219 + uxtb16 SCRATCH, \d /* rb_dest */
4220 + uxtb16 \d, \d, ror #8 /* ag_dest */
4221 + mla SCRATCH, SCRATCH, \s, MASK
4222 + mla \d, \d, \s, MASK
4223 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
4224 - uxtab16 d, d, d, ror #8
4225 + uxtab16 \d, \d, \d, ror #8
4226 mov SCRATCH, SCRATCH, ror #8
4227 - sel d, SCRATCH, d
4228 + sel \d, SCRATCH, \d
4229 b 02f
4230 - .if offset == 0
4231 + .if \offset == 0
4232 48: /* Last mov d,#0 of the set - used as part of shortcut for
4233 * source values all 0 */
4234 .endif
4235 -01: mov d, #0
4236 +01: mov \d, #0
4238 .endm
4240 .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
4241 - .if numbytes == 4
4242 + .if \numbytes == 4
4243 teq ORIG_W, ORIG_W, asr #32
4244 - ldrne WK&reg1, [DST, #-4]
4245 - .elseif numbytes == 8
4246 - teq ORIG_W, WK&reg1
4247 + ldrne WK\()\reg1, [DST, #-4]
4248 + .elseif \numbytes == 8
4249 + teq ORIG_W, WK\()\reg1
4250 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
4251 - ldmnedb DST, {WK&reg1-WK&reg2}
4252 + ldmnedb DST, {WK\()\reg1-WK\()\reg2}
4253 .else
4254 - teq ORIG_W, WK&reg1
4255 - teqeq ORIG_W, WK&reg2
4256 - teqeq ORIG_W, WK&reg3
4257 + teq ORIG_W, WK\()\reg1
4258 + teqeq ORIG_W, WK\()\reg2
4259 + teqeq ORIG_W, WK\()\reg3
4260 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
4261 - ldmnedb DST, {WK&reg1-WK&reg4}
4262 + ldmnedb DST, {WK\()\reg1-WK\()\reg4}
4263 .endif
4264 cmnne DST, #0 /* clear C if NE */
4265 bcs 49f /* no writes to dest if source all -1 */
4266 beq 48f /* set dest to all 0 if source all 0 */
4267 - .if numbytes == 4
4268 - in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
4269 - str WK&reg1, [DST, #-4]
4270 - .elseif numbytes == 8
4271 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
4272 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
4273 - stmdb DST, {WK&reg1-WK&reg2}
4274 + .if \numbytes == 4
4275 + in_reverse_8888_8888_1pixel ORIG_W, WK\()\reg1, 0, 1
4276 + str WK\()\reg1, [DST, #-4]
4277 + .elseif \numbytes == 8
4278 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -4, 0
4279 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, 0, 0
4280 + stmdb DST, {WK\()\reg1-WK\()\reg2}
4281 .else
4282 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
4283 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
4284 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
4285 - in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
4286 - stmdb DST, {WK&reg1-WK&reg4}
4287 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg1, -12, 0
4288 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg2, -8, 0
4289 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg3, -4, 0
4290 + in_reverse_8888_8888_1pixel STRIDE_M, WK\()\reg4, 0, 0
4291 + stmdb DST, {WK\()\reg1-WK\()\reg4}
4292 .endif
4294 .endm
4296 .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
4297 - in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
4298 + in_reverse_8888_8888_tail \numbytes, \firstreg, %(\firstreg+1), %(\firstreg+2), %(\firstreg+3)
4299 .endm
4301 generate_composite_function \
4302 pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32 \
4303 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST \
4304 2, /* prefetch distance */ \
4305 in_reverse_8888_8888_init, \
4306 nop_macro, /* newline */ \
4307 @@ -1144,31 +1149,31 @@ generate_composite_function \
4308 /* Hold multiplier for destination in STRIDE_M */
4309 mov STRIDE_M, #255
4310 sub STRIDE_M, STRIDE_M, SRC, lsr #24
4311 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
4312 uadd8 SCRATCH, MASK, MASK
4313 .endm
4315 .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
4316 - pixld , numbytes, firstreg, DST, 0
4317 + pixld , \numbytes, \firstreg, DST, 0
4318 .endm
4320 .macro over_n_8888_1pixel dst
4321 - mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
4322 - uqadd8 WK&dst, WK&dst, SRC
4323 + mul_8888_8 WK\()\dst, STRIDE_M, SCRATCH, MASK
4324 + uqadd8 WK\()\dst, WK\()\dst, SRC
4325 .endm
4327 .macro over_n_8888_process_tail cond, numbytes, firstreg
4328 - .set PROCESS_REG, firstreg
4329 - .rept numbytes / 4
4330 + .set PROCESS_REG, \firstreg
4331 + .rept \numbytes / 4
4332 over_n_8888_1pixel %(PROCESS_REG)
4333 .set PROCESS_REG, PROCESS_REG+1
4334 .endr
4335 - pixst , numbytes, firstreg, DST
4336 + pixst , \numbytes, \firstreg, DST
4337 .endm
4339 generate_composite_function \
4340 pixman_composite_over_n_8888_asm_armv6, 0, 0, 32 \
4341 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE \
4342 2, /* prefetch distance */ \
4343 over_n_8888_init, \
4344 nop_macro, /* newline */ \
4345 diff --git a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4346 --- a/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4347 +++ b/gfx/cairo/libpixman/src/pixman-arm-simd-asm.h
4348 @@ -107,88 +107,120 @@
4349 .set PREFETCH_TYPE_NONE, 0
4350 .set PREFETCH_TYPE_STANDARD, 1
4353 * Definitions of macros for load/store of pixel data.
4356 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
4357 - .if numbytes == 16
4358 - .if unaligned == 1
4359 - op&r&cond WK&reg0, [base], #4
4360 - op&r&cond WK&reg1, [base], #4
4361 - op&r&cond WK&reg2, [base], #4
4362 - op&r&cond WK&reg3, [base], #4
4363 + .if \numbytes == 16
4364 + .if \unaligned == 1
4365 + \op\()r\()\cond WK\()\reg0, [\base], #4
4366 + \op\()r\()\cond WK\()\reg1, [\base], #4
4367 + \op\()r\()\cond WK\()\reg2, [\base], #4
4368 + \op\()r\()\cond WK\()\reg3, [\base], #4
4369 .else
4370 - op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
4371 +#ifdef __clang__
4372 + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4373 +#else
4374 + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4375 +#endif
4376 .endif
4377 - .elseif numbytes == 8
4378 - .if unaligned == 1
4379 - op&r&cond WK&reg0, [base], #4
4380 - op&r&cond WK&reg1, [base], #4
4381 + .elseif \numbytes == 8
4382 + .if \unaligned == 1
4383 + \op\()r\()\cond WK\()\reg0, [\base], #4
4384 + \op\()r\()\cond WK\()\reg1, [\base], #4
4385 .else
4386 - op&m&cond&ia base!, {WK&reg0,WK&reg1}
4387 +#ifdef __clang__
4388 + \op\()mia\()\cond \base!, {WK\()\reg0,WK\()\reg1}
4389 +#else
4390 + \op\()m\()\cond\()ia \base!, {WK\()\reg0,WK\()\reg1}
4391 +#endif
4392 .endif
4393 - .elseif numbytes == 4
4394 - op&r&cond WK&reg0, [base], #4
4395 - .elseif numbytes == 2
4396 - op&r&cond&h WK&reg0, [base], #2
4397 - .elseif numbytes == 1
4398 - op&r&cond&b WK&reg0, [base], #1
4399 + .elseif \numbytes == 4
4400 + \op\()r\()\cond WK\()\reg0, [\base], #4
4401 + .elseif \numbytes == 2
4402 +#ifdef __clang__
4403 + \op\()rh\()\cond WK\()\reg0, [\base], #2
4404 +#else
4405 + \op\()r\()\cond\()h WK\()\reg0, [\base], #2
4406 +#endif
4407 + .elseif \numbytes == 1
4408 +#ifdef __clang__
4409 + \op\()rb\()\cond WK\()\reg0, [\base], #1
4410 +#else
4411 + \op\()r\()\cond\()b WK\()\reg0, [\base], #1
4412 +#endif
4413 .else
4414 - .error "unsupported size: numbytes"
4415 + .error "unsupported size: \numbytes"
4416 .endif
4417 .endm
4419 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
4420 - .if numbytes == 16
4421 - stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
4422 - .elseif numbytes == 8
4423 - stm&cond&db base, {WK&reg0,WK&reg1}
4424 - .elseif numbytes == 4
4425 - str&cond WK&reg0, [base, #-4]
4426 - .elseif numbytes == 2
4427 - str&cond&h WK&reg0, [base, #-2]
4428 - .elseif numbytes == 1
4429 - str&cond&b WK&reg0, [base, #-1]
4430 + .if \numbytes == 16
4431 +#ifdef __clang__
4432 + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4433 +#else
4434 + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
4435 +#endif
4436 + .elseif \numbytes == 8
4437 +#ifdef __clang__
4438 + stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
4439 +#else
4440 + stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1}
4441 +#endif
4442 + .elseif \numbytes == 4
4443 + str\()\cond WK\()\reg0, [\base, #-4]
4444 + .elseif \numbytes == 2
4445 +#ifdef __clang__
4446 + strh\()\cond WK\()\reg0, [\base, #-2]
4447 +#else
4448 + str\()\cond\()h WK\()\reg0, [\base, #-2]
4449 +#endif
4450 + .elseif \numbytes == 1
4451 +#ifdef __clang__
4452 + strb\()\cond WK\()\reg0, [\base, #-1]
4453 +#else
4454 + str\()\cond\()b WK\()\reg0, [\base, #-1]
4455 +#endif
4456 .else
4457 - .error "unsupported size: numbytes"
4458 + .error "unsupported size: \numbytes"
4459 .endif
4460 .endm
4462 .macro pixld cond, numbytes, firstreg, base, unaligned
4463 - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
4464 + pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
4465 .endm
4467 .macro pixst cond, numbytes, firstreg, base
4468 .if (flags) & FLAG_DST_READWRITE
4469 - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
4470 + pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
4471 .else
4472 - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
4473 + pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
4474 .endif
4475 .endm
4477 .macro PF a, x:vararg
4478 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
4479 - a x
4480 + \a \x
4481 .endif
4482 .endm
4485 .macro preload_leading_step1 bpp, ptr, base
4486 /* If the destination is already 16-byte aligned, then we need to preload
4487 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
4488 * are no gaps when the inner loop starts.
4490 - .if bpp > 0
4491 - PF bic, ptr, base, #31
4492 + .if \bpp > 0
4493 + PF bic, \ptr, \base, #31
4494 .set OFFSET, 0
4495 .rept prefetch_distance+1
4496 - PF pld, [ptr, #OFFSET]
4497 + PF pld, [\ptr, #OFFSET]
4498 .set OFFSET, OFFSET+32
4499 .endr
4500 .endif
4501 .endm
4503 .macro preload_leading_step2 bpp, bpp_shift, ptr, base
4504 /* However, if the destination is not 16-byte aligned, we may need to
4505 * preload more cache lines than that. The question we need to ask is:
4506 @@ -196,81 +228,81 @@
4507 * by which the source pointer will be rounded down for preloading, and if
4508 * so, by how many cache lines? Effectively, we want to calculate
4509 * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
4510 * inner_loop_offset = (src+leading_bytes)&31
4511 * extra_needed = leading_bytes - inner_loop_offset
4512 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
4513 * possible when there are 4 src bytes for every 1 dst byte).
4515 - .if bpp > 0
4516 - .ifc base,DST
4517 + .if \bpp > 0
4518 + .ifc \base,DST
4519 /* The test can be simplified further when preloading the destination */
4520 - PF tst, base, #16
4521 + PF tst, \base, #16
4522 PF beq, 61f
4523 .else
4524 - .if bpp/dst_w_bpp == 4
4525 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
4526 + .if \bpp/dst_w_bpp == 4
4527 + PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
4528 PF and, SCRATCH, SCRATCH, #31
4529 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
4530 + PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
4531 PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
4532 PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
4533 PF bcs, 61f
4534 PF bpl, 60f
4535 PF pld, [ptr, #32*(prefetch_distance+2)]
4536 .else
4537 - PF mov, SCRATCH, base, lsl #32-5
4538 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
4539 - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
4540 + PF mov, SCRATCH, \base, lsl #32-5
4541 + PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
4542 + PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
4543 PF bls, 61f
4544 .endif
4545 .endif
4546 -60: PF pld, [ptr, #32*(prefetch_distance+1)]
4547 +60: PF pld, [\ptr, #32*(prefetch_distance+1)]
4549 .endif
4550 .endm
4552 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
4553 .macro preload_middle bpp, base, scratch_holds_offset
4554 - .if bpp > 0
4555 + .if \bpp > 0
4556 /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
4557 - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
4558 - .if scratch_holds_offset
4559 - PF pld, [base, SCRATCH]
4560 + .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
4561 + .if \scratch_holds_offset
4562 + PF pld, [\base, SCRATCH]
4563 .else
4564 - PF bic, SCRATCH, base, #31
4565 + PF bic, SCRATCH, \base, #31
4566 PF pld, [SCRATCH, #32*prefetch_distance]
4567 .endif
4568 .endif
4569 .endif
4570 .endm
4572 .macro preload_trailing bpp, bpp_shift, base
4573 - .if bpp > 0
4574 - .if bpp*pix_per_block > 256
4575 + .if \bpp > 0
4576 + .if \bpp*pix_per_block > 256
4577 /* Calculations are more complex if more than one fetch per block */
4578 - PF and, WK1, base, #31
4579 - PF add, WK1, WK1, WK0, lsl #bpp_shift
4580 - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
4581 - PF bic, SCRATCH, base, #31
4582 + PF and, WK1, \base, #31
4583 + PF add, WK1, WK1, WK0, lsl #\bpp_shift
4584 + PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
4585 + PF bic, SCRATCH, \base, #31
4586 80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
4587 PF add, SCRATCH, SCRATCH, #32
4588 PF subs, WK1, WK1, #32
4589 PF bhi, 80b
4590 .else
4591 /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
4592 - PF mov, SCRATCH, base, lsl #32-5
4593 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
4594 + PF mov, SCRATCH, \base, lsl #32-5
4595 + PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
4596 PF adceqs, SCRATCH, SCRATCH, #0
4597 /* The instruction above has two effects: ensures Z is only
4598 * set if C was clear (so Z indicates that both shifted quantities
4599 * were 0), and clears C if Z was set (so C indicates that the sum
4600 * of the shifted quantities was greater and not equal to 32) */
4601 PF beq, 82f
4602 - PF bic, SCRATCH, base, #31
4603 + PF bic, SCRATCH, \base, #31
4604 PF bcc, 81f
4605 PF pld, [SCRATCH, #32*(prefetch_distance+2)]
4606 81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
4608 .endif
4609 .endif
4610 .endm
4612 @@ -283,97 +315,97 @@ 82:
4613 * pixels) they cannot possibly straddle more than 2 32-byte cachelines,
4614 * meaning there's no need for a loop.
4615 * "bpp" - number of bits per pixel in the channel (source, mask or
4616 * destination) that's being preloaded, or 0 if this channel is not used
4617 * for reading
4618 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
4619 * "base" - base address register of channel to preload (SRC, MASK or DST)
4621 - .if bpp > 0
4622 - .if narrow_case && (bpp <= dst_w_bpp)
4623 + .if \bpp > 0
4624 + .if \narrow_case && (\bpp <= dst_w_bpp)
4625 /* In these cases, each line for each channel is in either 1 or 2 cache lines */
4626 - PF bic, WK0, base, #31
4627 + PF bic, WK0, \base, #31
4628 PF pld, [WK0]
4629 - PF add, WK1, base, X, LSL #bpp_shift
4630 + PF add, WK1, \base, X, LSL #\bpp_shift
4631 PF sub, WK1, WK1, #1
4632 PF bic, WK1, WK1, #31
4633 PF cmp, WK1, WK0
4634 PF beq, 90f
4635 PF pld, [WK1]
4637 .else
4638 - PF bic, WK0, base, #31
4639 + PF bic, WK0, \base, #31
4640 PF pld, [WK0]
4641 - PF add, WK1, base, X, lsl #bpp_shift
4642 + PF add, WK1, \base, X, lsl #\bpp_shift
4643 PF sub, WK1, WK1, #1
4644 PF bic, WK1, WK1, #31
4645 PF cmp, WK1, WK0
4646 PF beq, 92f
4647 91: PF add, WK0, WK0, #32
4648 PF cmp, WK0, WK1
4649 PF pld, [WK0]
4650 PF bne, 91b
4652 .endif
4653 .endif
4654 .endm
4657 .macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4658 - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
4659 - .if decrementx
4660 - sub&cond X, X, #8*numbytes/dst_w_bpp
4661 + \process_head \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
4662 + .if \decrementx
4663 + sub\()\cond X, X, #8*\numbytes/dst_w_bpp
4664 .endif
4665 - process_tail cond, numbytes, firstreg
4666 + \process_tail \cond, \numbytes, \firstreg
4667 .if !((flags) & FLAG_PROCESS_DOES_STORE)
4668 - pixst cond, numbytes, firstreg, DST
4669 + pixst \cond, \numbytes, \firstreg, DST
4670 .endif
4671 .endm
4673 .macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4674 .if (flags) & FLAG_BRANCH_OVER
4675 - .ifc cond,mi
4676 + .ifc \cond,mi
4677 bpl 100f
4678 .endif
4679 - .ifc cond,cs
4680 + .ifc \cond,cs
4681 bcc 100f
4682 .endif
4683 - .ifc cond,ne
4684 + .ifc \cond,ne
4685 beq 100f
4686 .endif
4687 - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4688 + conditional_process1_helper , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
4689 100:
4690 .else
4691 - conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
4692 + conditional_process1_helper \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
4693 .endif
4694 .endm
4696 .macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
4697 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
4698 /* Can't interleave reads and writes */
4699 - test
4700 - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
4701 + \test
4702 + conditional_process1 \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
4703 .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
4704 - test
4705 + \test
4706 .endif
4707 - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
4708 + conditional_process1 \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
4709 .else
4710 /* Can interleave reads and writes for better scheduling */
4711 - test
4712 - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
4713 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
4714 - .if decrementx
4715 - sub&cond1 X, X, #8*numbytes1/dst_w_bpp
4716 - sub&cond2 X, X, #8*numbytes2/dst_w_bpp
4717 + \test
4718 + \process_head \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
4719 + \process_head \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
4720 + .if \decrementx
4721 + sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
4722 + sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
4723 .endif
4724 - process_tail cond1, numbytes1, firstreg1
4725 - process_tail cond2, numbytes2, firstreg2
4726 - pixst cond1, numbytes1, firstreg1, DST
4727 - pixst cond2, numbytes2, firstreg2, DST
4728 + \process_tail \cond1, \numbytes1, \firstreg1
4729 + \process_tail \cond2, \numbytes2, \firstreg2
4730 + pixst \cond1, \numbytes1, \firstreg1, DST
4731 + pixst \cond2, \numbytes2, \firstreg2, DST
4732 .endif
4733 .endm
4736 .macro test_bits_1_0_ptr
4737 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4738 movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */
4739 .else
4740 @@ -395,22 +427,22 @@ 100:
4741 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4742 .set DECREMENT_X, 0
4743 sub X, X, WK0, lsr #dst_bpp_shift
4744 str X, [sp, #LINE_SAVED_REG_COUNT*4]
4745 mov X, WK0
4746 .endif
4747 /* Use unaligned loads in all cases for simplicity */
4748 .if dst_w_bpp == 8
4749 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
4750 + conditional_process2 test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
4751 .elseif dst_w_bpp == 16
4752 test_bits_1_0_ptr
4753 - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
4754 + conditional_process1 cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
4755 .endif
4756 - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
4757 + conditional_process2 test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
4758 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
4759 ldr X, [sp, #LINE_SAVED_REG_COUNT*4]
4760 .endif
4761 .endm
4763 .macro test_bits_3_2_pix
4764 movs SCRATCH, X, lsl #dst_bpp_shift+32-3
4765 .endm
4766 @@ -419,169 +451,169 @@ 100:
4767 .if dst_w_bpp == 8
4768 movs SCRATCH, X, lsl #dst_bpp_shift+32-1
4769 .else
4770 movs SCRATCH, X, lsr #1
4771 .endif
4772 .endm
4774 .macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
4775 - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
4776 + conditional_process2 test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
4777 .if dst_w_bpp == 16
4778 test_bits_1_0_pix
4779 - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
4780 + conditional_process1 cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
4781 .elseif dst_w_bpp == 8
4782 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
4783 + conditional_process2 test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
4784 .endif
4785 .endm
4788 .macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
4789 110:
4790 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
4791 .rept pix_per_block*dst_w_bpp/128
4792 - process_head , 16, 0, unaligned_src, unaligned_mask, 1
4793 + \process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
4794 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4795 preload_middle src_bpp, SRC, 1
4796 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
4797 preload_middle mask_bpp, MASK, 1
4798 .else
4799 preload_middle src_bpp, SRC, 0
4800 preload_middle mask_bpp, MASK, 0
4801 .endif
4802 .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
4803 /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
4804 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
4805 * preloads for, to achieve staggered prefetches for multiple channels, because there are
4806 * always two STMs per prefetch, so there is always an opposite STM on which to put the
4807 * preload. Note, no need to BIC the base register here */
4808 - PF pld, [DST, #32*prefetch_distance - dst_alignment]
4809 + PF pld, [DST, #32*prefetch_distance - \dst_alignment]
4810 .endif
4811 - process_tail , 16, 0
4812 + \process_tail , 16, 0
4813 .if !((flags) & FLAG_PROCESS_DOES_STORE)
4814 pixst , 16, 0, DST
4815 .endif
4816 .set SUBBLOCK, SUBBLOCK+1
4817 .endr
4818 subs X, X, #pix_per_block
4819 bhs 110b
4820 .endm

.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
.if dst_r_bpp > 0
tst DST, #16
bne 111f
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
- process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
+ \process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_trailing dst_r_bpp, dst_bpp_shift, DST
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case */
- medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+ medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
- process_head , 16, 0, unaligned_src, unaligned_mask, 0
- process_tail , 16, 0
+ \process_head , 16, 0, \unaligned_src, \unaligned_mask, 0
+ \process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
subs X, X, #128/dst_w_bpp
bhs 120b
/* Trailing pixels */
tst X, #128/dst_w_bpp - 1
- beq exit_label
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ beq \exit_label
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
tst X, #16*8/dst_w_bpp
- conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+ conditional_process1 ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
/* Trailing pixels */
/* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
- trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask
+ trailing_15bytes \process_head, \process_tail, \unaligned_src, \unaligned_mask
.endm

.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label
/* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
.if mask_bpp == 8 || mask_bpp == 16
tst MASK, #3
bne 141f
.endif
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 140f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
140:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 0
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
.endif
.if mask_bpp == 8 || mask_bpp == 16
- b exit_label
+ b \exit_label
141:
.if src_bpp == 8 || src_bpp == 16
tst SRC, #3
bne 142f
.endif
- action process_head, process_tail, process_inner_loop, exit_label, 0, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
.if src_bpp == 8 || src_bpp == 16
- b exit_label
+ b \exit_label
142:
- action process_head, process_tail, process_inner_loop, exit_label, 1, 1
+ \action \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
.endif
.endif
.endm


.macro end_of_line restore_x, vars_spilled, loop_label, last_one
- .if vars_spilled
+ .if \vars_spilled
/* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
/* This is ldmia sp,{} */
.word 0xE89D0000 | LINE_SAVED_REGS
.endif
subs Y, Y, #1
- .if vars_spilled
+ .if \vars_spilled
.if (LINE_SAVED_REGS) & (1<<1)
str Y, [sp]
.endif
.endif
add DST, DST, STRIDE_D
.if src_bpp > 0
add SRC, SRC, STRIDE_S
.endif
.if mask_bpp > 0
add MASK, MASK, STRIDE_M
.endif
- .if restore_x
+ .if \restore_x
mov X, ORIG_W
.endif
- bhs loop_label
- .ifc "last_one",""
- .if vars_spilled
+ bhs \loop_label
+ .ifc "\last_one",""
+ .if \vars_spilled
b 197f
.else
b 198f
.endif
.else
- .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+ .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
b 198f
.endif
.endif
.endm


.macro generate_composite_function fname, \
src_bpp_, \
@@ -591,27 +623,27 @@ 142:
prefetch_distance_, \
init, \
newline, \
cleanup, \
process_head, \
process_tail, \
process_inner_loop

- pixman_asm_function fname
+ pixman_asm_function \fname

/*
* Make some macro arguments globally visible and accessible
* from other macros
*/
- .set src_bpp, src_bpp_
- .set mask_bpp, mask_bpp_
- .set dst_w_bpp, dst_w_bpp_
- .set flags, flags_
- .set prefetch_distance, prefetch_distance_
+ .set src_bpp, \src_bpp_
+ .set mask_bpp, \mask_bpp_
+ .set dst_w_bpp, \dst_w_bpp_
+ .set flags, \flags_
+ .set prefetch_distance, \prefetch_distance_

/*
* Select prefetch type for this function.
*/
.if prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.else
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
@@ -727,17 +759,17 @@ 142:
.endif

#ifdef DEBUG_PARAMS
add Y, Y, #1
stmia sp, {r0-r7,pc}
sub Y, Y, #1
#endif

- init
+ \init

.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
sub sp, sp, #4
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
.endif

@@ -768,47 +800,47 @@ 142:
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
151: /* New line */
- newline
+ \newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step1 dst_r_bpp, WK3, DST
.endif

ands WK0, DST, #15
beq 154f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */

preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif

- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail

154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
- .ifc "process_inner_loop",""
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .ifc "\process_inner_loop",""
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
- switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif

157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif
@@ -820,80 +852,80 @@ 160: /* Medium case */
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
- newline
+ \newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
.endif

sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
ands WK0, DST, #15
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */

- leading_15bytes process_head, process_tail
+ leading_15bytes \process_head, \process_tail

164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
- switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+ switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f

167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

.ltorg

170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if dst_w_bpp < 32
mov ORIG_W, X
.endif
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
- newline
+ \newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
.endif

.if dst_w_bpp == 8
tst DST, #3
beq 174f
172: subs X, X, #1
blo 177f
- process_head , 1, 0, 1, 1, 0
- process_tail , 1, 0
+ \process_head , 1, 0, 1, 1, 0
+ \process_tail , 1, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 1, 0, DST
.endif
tst DST, #3
bne 172b
.elseif dst_w_bpp == 16
tst DST, #2
beq 174f
subs X, X, #1
blo 177f
- process_head , 2, 0, 1, 1, 0
- process_tail , 2, 0
+ \process_head , 2, 0, 1, 1, 0
+ \process_tail , 2, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 2, 0, DST
.endif
.endif

174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
- switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+ switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f

177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif

@@ -903,17 +935,17 @@ 197:
.endif
198:
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
add sp, sp, #4
.endif

- cleanup
+ \cleanup

#ifdef DEBUG_PARAMS
add sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
pop {r4-r11, pc} /* exit */

.ltorg
@@ -927,23 +959,23 @@ 199:
.unreq MASK
.unreq STRIDE_M
.unreq WK0
.unreq WK1
.unreq WK2
.unreq WK3
.unreq SCRATCH
.unreq ORIG_W
- .endfunc
+ pixman_end_asm_function
.endm

.macro line_saved_regs x:vararg
.set LINE_SAVED_REGS, 0
.set LINE_SAVED_REG_COUNT, 0
- .irp SAVED_REG,x
+ .irp SAVED_REG,\x
.ifc "SAVED_REG","Y"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif
.ifc "SAVED_REG","STRIDE_D"
.set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
.set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
.endif