2 * Copyright © 2012 Raspberry Pi Foundation
3 * Copyright © 2012 RISC OS Open Ltd
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of the copyright holders not be used in
10 * advertising or publicity pertaining to distribution of the software without
11 * specific, written prior permission. The copyright holders make no
12 * representations about the suitability of this software for any purpose. It
13 * is provided "as is" without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Ben Avison (bavison@riscosopen.org)
28 /* Prevent the stack from becoming executable */
29 #if defined(__linux__) && defined(__ELF__)
30 .section .note.GNU-stack,"",%progbits
40 #include "pixman-arm-asm.h"
41 #include "pixman-arm-simd-asm.h"
43 /* A head macro should do all processing which results in an output of up to
44 * 16 bytes, as far as the final load instruction. The corresponding tail macro
45 * should complete the processing of the up-to-16 bytes. The calling macro will
46 * sometimes choose to insert a preload or a decrement of X between them.
47 * cond ARM condition code for code block
48 * numbytes Number of output bytes that should be generated this time
49 * firstreg First WK register in which to place output
50 * unaligned_src Whether to use non-wordaligned loads of source image
51 * unaligned_mask Whether to use non-wordaligned loads of mask image
52 * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
@ Plain blit (copy): no per-pixel arithmetic, so the head is just a load of
@ numbytes of source into the WK registers starting at firstreg; there is no
@ tail-side processing.
56 line_saved_regs STRIDE_D, STRIDE_S
59 .macro blit_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
60 pixld cond, numbytes, firstreg, SRC, unaligned_src
@ Specialised inner loop for long blit runs: streams 32 bytes per iteration
@ (two 16-byte loads into WK0-3 and WK4-7), then decrements the remaining
@ pixel count X by the number of pixels held in 32 bytes (32*8/src_bpp).
63 .macro blit_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
68 110: pixld , 16, 0, SRC, unaligned_src
69 pixld , 16, 4, SRC, unaligned_src
73 subs X, X, #32*8/src_bpp
@ Blit entry points. generate_composite_function arguments (per the inline
@ annotations): name, src_bpp, mask_bpp, dst_bpp, flags, prefetch distance,
@ then the init/newline/cleanup/process-head/process-tail macros.
@ 32bpp -> 32bpp copy.
81 generate_composite_function \
82 pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
83 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
84 4, /* prefetch distance */ \
86 nop_macro, /* newline */ \
87 nop_macro, /* cleanup */ \
89 nop_macro, /* process tail */ \
@ 16bpp -> 16bpp copy.
92 generate_composite_function \
93 pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
94 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
95 4, /* prefetch distance */ \
97 nop_macro, /* newline */ \
98 nop_macro, /* cleanup */ \
100 nop_macro, /* process tail */ \
@ 8bpp -> 8bpp copy.
103 generate_composite_function \
104 pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
105 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
106 3, /* prefetch distance */ \
108 nop_macro, /* newline */ \
109 nop_macro, /* cleanup */ \
111 nop_macro, /* process tail */ \
114 /******************************************************************************/
@ Solid-fill init macros: fetch the constant source colour from the stacked
@ arguments and replicate it to fill a 32-bit register.
116 .macro src_n_8888_init
117 ldr SRC, [sp, #ARGS_STACK_OFFSET]
@ 16bpp fill: load the 0565 halfword and duplicate it into both halves of SRC.
123 .macro src_n_0565_init
124 ldrh SRC, [sp, #ARGS_STACK_OFFSET]
125 orr SRC, SRC, lsl #16
@ 8bpp fill: load the byte, then widen; the lsl #16 step duplicates halfwords
@ (presumably preceded by a byte-duplication step not shown here — TODO confirm).
132 ldrb SRC, [sp, #ARGS_STACK_OFFSET]
134 orr SRC, SRC, lsl #16
@ Fill tail: the replicated colour is already in registers from 4 upward, so
@ just store numbytes of it to the destination.
140 .macro fill_process_tail cond, numbytes, firstreg
145 pixst cond, numbytes, 4, DST
@ Solid-fill entry points (source is a constant colour, no mask).
@ NOTE(review): restored the ',' argument separators before the line
@ continuations; sibling invocations (e.g. src_8888_8888 above) all have them.
152 generate_composite_function \
153 pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
154 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
155 0, /* prefetch distance doesn't apply */ \
157 nop_macro, /* newline */ \
158 nop_macro, /* cleanup */ \
159 nop_macro, /* process head */ \
162 generate_composite_function \
163 pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
164 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
165 0, /* prefetch distance doesn't apply */ \
167 nop_macro, /* newline */ \
168 nop_macro, /* cleanup */ \
169 nop_macro, /* process head */ \
172 generate_composite_function \
173 pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
174 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
175 0, /* prefetch distance doesn't apply */ \
177 nop_macro, /* newline */ \
178 nop_macro, /* cleanup */ \
179 nop_macro, /* process head */ \
182 /******************************************************************************/
@ x888 -> 8888 conversion: force the (unused) alpha byte of each pixel to 0xFF.
@ NOTE(review): restored "WK&reg" — the '&' concatenation had been mangled to
@ the '®' character by lenient HTML-entity decoding of "&reg" (compare the
@ intact "orr&cond" on the same line and "WK&dst"/"WK&src" elsewhere).
184 .macro src_x888_8888_pixel, cond, reg
185 orr&cond WK&reg, WK&reg, #0xFF000000
@ Head: plain load of numbytes of source pixels into WK registers.
188 .macro pixman_composite_src_x888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
189 pixld cond, numbytes, firstreg, SRC, unaligned_src
@ Tail: set the alpha byte on each loaded pixel in turn.
192 .macro pixman_composite_src_x888_8888_process_tail cond, numbytes, firstreg
193 src_x888_8888_pixel cond, %(firstreg+0)
195 src_x888_8888_pixel cond, %(firstreg+1)
197 src_x888_8888_pixel cond, %(firstreg+2)
198 src_x888_8888_pixel cond, %(firstreg+3)
@ x888 -> 8888 entry point.
203 generate_composite_function \
204 pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
205 FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
206 3, /* prefetch distance */ \
207 nop_macro, /* init */ \
208 nop_macro, /* newline */ \
209 nop_macro, /* cleanup */ \
210 pixman_composite_src_x888_8888_process_head, \
211 pixman_composite_src_x888_8888_process_tail
213 /******************************************************************************/
@ 0565 -> 8888 conversion setup. MASK holds the green-field extraction mask for
@ two packed 0565 pixels; STRIDE_M holds the opaque-alpha constant.
215 .macro src_0565_8888_init
216 /* Hold loop invariants in MASK and STRIDE_M */
217 ldr MASK, =0x07E007E0
218 mov STRIDE_M, #0xFF000000
219 /* Set GE[3:0] to 1010 so SEL instructions do what we want */
220 ldr SCRATCH, =0x80008000
221 uadd8 SCRATCH, SCRATCH, SCRATCH
@ Convert two packed 0565 pixels (in WK&reg1) to two 8888 pixels in
@ WK&reg1/WK&reg2, widening each 5/6-bit field to 8 bits by bit replication
@ and OR-ing in opaque alpha from STRIDE_M. Bit diagrams on each line show the
@ register contents after that instruction.
@ NOTE(review): restored "WK&reg1"/"WK&reg2" — '&reg' had been mangled to '®'
@ by HTML-entity decoding (the macro declares parameters reg1, reg2).
224 .macro src_0565_8888_2pixels, reg1, reg2
225 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
226 bic WK&reg2, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
227 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
228 mov WK&reg1, WK&reg2, lsl #16 @ rrrrr000000bbbbb0000000000000000
229 mov SCRATCH, SCRATCH, ror #19 @ GGGG0000ggggggggggg00000GGGGGGGG
230 bic WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
231 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ rrrrrrrrrr0bbbbbbbbbb00000000000
232 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ RRRRRRRRRR0BBBBBBBBBB00000000000
233 pkhtb WK&reg1, WK&reg1, WK&reg1, asr #5 @ rrrrrrrr--------bbbbbbbb--------
234 sel WK&reg1, WK&reg1, SCRATCH @ rrrrrrrrggggggggbbbbbbbb--------
235 mov SCRATCH, SCRATCH, ror #16 @ ggg00000GGGGGGGGGGGG0000gggggggg
236 pkhtb WK&reg2, WK&reg2, WK&reg2, asr #5 @ RRRRRRRR--------BBBBBBBB--------
237 sel WK&reg2, WK&reg2, SCRATCH @ RRRRRRRRGGGGGGGGBBBBBBBB--------
238 orr WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
239 orr WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
242 /* This version doesn't need STRIDE_M, but is one instruction longer.
243 It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
244 and SCRATCH, WK&reg1, MASK @ 00000GGGGGG0000000000gggggg00000
245 bic WK&reg1, WK&reg1, MASK @ RRRRR000000BBBBBrrrrr000000bbbbb
246 orr SCRATCH, SCRATCH, SCRATCH, lsr #6 @ 00000GGGGGGGGGGGG0000ggggggggggg
247 mov WK&reg2, WK&reg1, lsr #16 @ 0000000000000000RRRRR000000BBBBB
248 mov SCRATCH, SCRATCH, ror #27 @ GGGGGGGGGGGG0000ggggggggggg00000
249 bic WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
250 mov WK&reg2, WK&reg2, lsl #3 @ 0000000000000RRRRR000000BBBBB000
251 mov WK&reg1, WK&reg1, lsl #3 @ 0000000000000rrrrr000000bbbbb000
252 orr WK&reg2, WK&reg2, WK&reg2, lsr #5 @ 0000000000000RRRRRRRRRR0BBBBBBBB
253 orr WK&reg1, WK&reg1, WK&reg1, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
254 pkhbt WK&reg2, WK&reg2, WK&reg2, lsl #5 @ --------RRRRRRRR--------BBBBBBBB
255 pkhbt WK&reg1, WK&reg1, WK&reg1, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
256 sel WK&reg2, SCRATCH, WK&reg2 @ --------RRRRRRRRGGGGGGGGBBBBBBBB
257 sel WK&reg1, SCRATCH, WK&reg1 @ --------rrrrrrrrggggggggbbbbbbbb
258 orr WK&reg2, WK&reg2, #0xFF000000 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
259 orr WK&reg1, WK&reg1, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
@ Convert a single 0565 pixel (low halfword of WK&reg) to one 8888 pixel.
262 .macro src_0565_8888_1pixel, reg
263 bic SCRATCH, WK&reg, MASK @ 0000000000000000rrrrr000000bbbbb
264 and WK&reg, WK&reg, MASK @ 000000000000000000000gggggg00000
265 mov SCRATCH, SCRATCH, lsl #3 @ 0000000000000rrrrr000000bbbbb000
266 mov WK&reg, WK&reg, lsl #5 @ 0000000000000000gggggg0000000000
267 orr SCRATCH, SCRATCH, SCRATCH, lsr #5 @ 0000000000000rrrrrrrrrr0bbbbbbbb
268 orr WK&reg, WK&reg, WK&reg, lsr #6 @ 000000000000000gggggggggggg00000
269 pkhbt SCRATCH, SCRATCH, SCRATCH, lsl #5 @ --------rrrrrrrr--------bbbbbbbb
270 sel WK&reg, WK&reg, SCRATCH @ --------rrrrrrrrggggggggbbbbbbbb
271 orr WK&reg, WK&reg, #0xFF000000 @ 11111111rrrrrrrrggggggggbbbbbbbb
@ Head: source is 16bpp while output is 32bpp, so load numbytes/2 bytes of
@ source (8/4/2 bytes for 16/8/4 output bytes).
274 .macro src_0565_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
276 pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
277 .elseif numbytes == 8
278 pixld , 4, firstreg, SRC, unaligned_src
279 .elseif numbytes == 4
280 pixld , 2, firstreg, SRC, unaligned_src
@ Tail: widen the loaded 0565 pixels in place, two at a time where possible.
284 .macro src_0565_8888_process_tail cond, numbytes, firstreg
286 src_0565_8888_2pixels firstreg, %(firstreg+1)
287 src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
288 .elseif numbytes == 8
289 src_0565_8888_2pixels firstreg, %(firstreg+1)
291 src_0565_8888_1pixel firstreg
@ 0565 -> 8888 entry point.
295 generate_composite_function \
296 pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
297 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
298 3, /* prefetch distance */ \
299 src_0565_8888_init, \
300 nop_macro, /* newline */ \
301 nop_macro, /* cleanup */ \
302 src_0565_8888_process_head, \
303 src_0565_8888_process_tail
305 /******************************************************************************/
@ 8888/x888 -> 0565 conversion. MASK holds a pair of 5-bit field masks used to
@ extract red and blue from two pixels at once.
307 .macro src_x888_0565_init
308 /* Hold loop invariant in MASK */
309 ldr MASK, =0x001F001F
310 line_saved_regs STRIDE_S, ORIG_W
@ Pack one 8888 pixel (WK&s) down to 0565 in the low half of WK&d.
313 .macro src_x888_0565_1pixel s, d
314 and WK&d, MASK, WK&s, lsr #3 @ 00000000000rrrrr00000000000bbbbb
315 and STRIDE_S, WK&s, #0xFC00 @ 0000000000000000gggggg0000000000
316 orr WK&d, WK&d, WK&d, lsr #5 @ 00000000000-----rrrrr000000bbbbb
317 orr WK&d, WK&d, STRIDE_S, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
318 /* Top 16 bits are discarded during the following STRH */
@ Pack two 8888 pixels (WK&slo low pixel, WK&shi high pixel) into one word of
@ two 0565 pixels in WK&d; WK&tmp is a scratch register (clobbers WK&shi).
321 .macro src_x888_0565_2pixels slo, shi, d, tmp
322 and SCRATCH, WK&shi, #0xFC00 @ 0000000000000000GGGGGG0000000000
323 and WK&tmp, MASK, WK&shi, lsr #3 @ 00000000000RRRRR00000000000BBBBB
324 and WK&shi, MASK, WK&slo, lsr #3 @ 00000000000rrrrr00000000000bbbbb
325 orr WK&tmp, WK&tmp, WK&tmp, lsr #5 @ 00000000000-----RRRRR000000BBBBB
326 orr WK&tmp, WK&tmp, SCRATCH, lsr #5 @ 00000000000-----RRRRRGGGGGGBBBBB
327 and SCRATCH, WK&slo, #0xFC00 @ 0000000000000000gggggg0000000000
328 orr WK&shi, WK&shi, WK&shi, lsr #5 @ 00000000000-----rrrrr000000bbbbb
329 orr WK&shi, WK&shi, SCRATCH, lsr #5 @ 00000000000-----rrrrrggggggbbbbb
330 pkhbt WK&d, WK&shi, WK&tmp, lsl #16 @ RRRRRGGGGGGBBBBBrrrrrggggggbbbbb
@ Head: source is 32bpp while output is 16bpp, so load numbytes*2 bytes of
@ source into the upper WK registers (from 4) and start packing.
333 .macro src_x888_0565_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
339 pixld , 16, 4, SRC, 0
340 src_x888_0565_2pixels 4, 5, 0, 0
342 src_x888_0565_2pixels 6, 7, 1, 1
345 pixld , numbytes*2, 4, SRC, 0
@ Tail: finish packing the remaining pixels and store the results
@ (this process macro does its own stores — FLAG_PROCESS_DOES_STORE below).
349 .macro src_x888_0565_process_tail cond, numbytes, firstreg
351 src_x888_0565_2pixels 4, 5, 2, 2
352 src_x888_0565_2pixels 6, 7, 3, 4
353 .elseif numbytes == 8
354 src_x888_0565_2pixels 4, 5, 1, 1
355 src_x888_0565_2pixels 6, 7, 2, 2
356 .elseif numbytes == 4
357 src_x888_0565_2pixels 4, 5, 1, 1
359 src_x888_0565_1pixel 4, 1
362 pixst , numbytes, 0, DST
364 pixst , numbytes, 1, DST
@ x888 -> 0565 entry point.
372 generate_composite_function \
373 pixman_composite_src_x888_0565_asm_armv6, 32, 0, 16, \
374 FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
375 3, /* prefetch distance */ \
376 src_x888_0565_init, \
377 nop_macro, /* newline */ \
378 nop_macro, /* cleanup */ \
379 src_x888_0565_process_head, \
380 src_x888_0565_process_tail
382 /******************************************************************************/
@ ADD operator on 8bpp surfaces: saturating per-byte add of source to
@ destination. Here MASK/STRIDE_M serve as extra working registers holding
@ the loaded source bytes (presumably aliased to WK4/WK5 — TODO confirm
@ against the register definitions in pixman-arm-simd-asm.h).
384 .macro add_8_8_8pixels cond, dst1, dst2
385 uqadd8&cond WK&dst1, WK&dst1, MASK
386 uqadd8&cond WK&dst2, WK&dst2, STRIDE_M
389 .macro add_8_8_4pixels cond, dst
390 uqadd8&cond WK&dst, WK&dst, MASK
@ Head: interleave source and destination loads; for 16-byte runs the first
@ 8 source bytes are combined early so the register pair can be reloaded.
393 .macro add_8_8_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
397 pixld cond, 8, 4, SRC, unaligned_src
398 pixld cond, 16, firstreg, DST, 0
399 add_8_8_8pixels cond, firstreg, %(firstreg+1)
400 pixld cond, 8, 4, SRC, unaligned_src
402 pixld cond, numbytes, 4, SRC, unaligned_src
403 pixld cond, numbytes, firstreg, DST, 0
@ Tail: combine whatever the head left outstanding.
409 .macro add_8_8_process_tail cond, numbytes, firstreg
411 add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
412 .elseif numbytes == 8
413 add_8_8_8pixels cond, firstreg, %(firstreg+1)
415 add_8_8_4pixels cond, firstreg
@ add_8_8 entry point.
419 generate_composite_function \
420 pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
421 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
422 2, /* prefetch distance */ \
423 nop_macro, /* init */ \
424 nop_macro, /* newline */ \
425 nop_macro, /* cleanup */ \
426 add_8_8_process_head, \
429 /******************************************************************************/
@ OVER operator, 8888 on 8888. MASK holds 0x00800080 (the +0.5 rounding
@ constant for the byte multiplies); GE flags are set so SEL merges
@ even/odd byte lanes.
431 .macro over_8888_8888_init
432 /* Hold loop invariant in MASK */
433 ldr MASK, =0x00800080
434 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
435 uadd8 SCRATCH, MASK, MASK
436 line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
@ Head: source pixels go to WK4+, destination pixels to WK0+.
439 .macro over_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
444 pixld , numbytes, %(4+firstreg), SRC, unaligned_src
445 pixld , numbytes, firstreg, DST, 0
452 .macro over_8888_8888_check_transparent numbytes, reg0, reg1, reg2, reg3
453 /* Since these colours are premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
@ Extract the alpha byte of the next source pixel ahead of time.
464 .macro over_8888_8888_prepare next
465 mov WK&next, WK&next, lsr #24
@ Composite one pixel: dst = dst*(255-src_alpha)/255 + src, with the next
@ pixel's alpha extraction interleaved into the multiply stalls.
468 .macro over_8888_8888_1pixel src, dst, offset, next
469 /* src = destination component multiplier */
470 rsb WK&src, WK&src, #255
471 /* Split even/odd bytes of dst into SCRATCH/dst */
472 uxtb16 SCRATCH, WK&dst
473 uxtb16 WK&dst, WK&dst, ror #8
474 /* Multiply through, adding 0.5 to the upper byte of result for rounding */
475 mla SCRATCH, SCRATCH, WK&src, MASK
476 mla WK&dst, WK&dst, WK&src, MASK
477 /* Where we would have had a stall between the result of the first MLA and the shifter input,
478 * reload the complete source pixel */
479 ldr WK&src, [SRC, #offset]
480 /* Multiply by 257/256 to approximate 256/255 */
481 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
482 /* In this stall, start processing the next pixel */
484 mov WK&next, WK&next, lsr #24
486 uxtab16 WK&dst, WK&dst, WK&dst, ror #8
487 /* Recombine even/odd bytes of multiplied destination */
488 mov SCRATCH, SCRATCH, ror #8
489 sel WK&dst, SCRATCH, WK&dst
490 /* Saturated add of source to multiplied destination */
491 uqadd8 WK&dst, WK&dst, WK&src
@ Tail: skip fully-transparent runs, then composite pixel by pixel using
@ assembler symbols PROCESS_REG/PROCESS_OFF as the loop counters.
494 .macro over_8888_8888_process_tail cond, numbytes, firstreg
499 over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
501 over_8888_8888_prepare %(4+firstreg)
502 .set PROCESS_REG, firstreg
503 .set PROCESS_OFF, -numbytes
505 over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
506 .set PROCESS_REG, PROCESS_REG+1
507 .set PROCESS_OFF, PROCESS_OFF+4
509 pixst , numbytes, firstreg, DST
@ over_8888_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags —
@ sibling invocations (e.g. src_x888_0565 above) all have them.
517 generate_composite_function \
518 pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32, \
519 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
520 2, /* prefetch distance */ \
521 over_8888_8888_init, \
522 nop_macro, /* newline */ \
523 nop_macro, /* cleanup */ \
524 over_8888_8888_process_head, \
525 over_8888_8888_process_tail
527 /******************************************************************************/
529 /* Multiply each byte of a word by a byte.
530 * Useful when there aren't any obvious ways to fill the stalls with other instructions.
531 * word Register containing 4 bytes
532 * byte Register containing byte multiplier (bits 8-31 must be 0)
533 * tmp Scratch register
534 * half Register containing the constant 0x00800080
535 * GE[3:0] bits must contain 0101
537 .macro mul_8888_8 word, byte, tmp, half
538 /* Split even/odd bytes of word apart */
540 uxtb16 word, word, ror #8
541 /* Multiply bytes together with rounding, then by 257/256 */
542 mla tmp, tmp, byte, half
543 mla word, word, byte, half /* 1 stall follows */
544 uxtab16 tmp, tmp, tmp, ror #8 /* 1 stall follows */
545 uxtab16 word, word, word, ror #8
546 /* Recombine bytes */
551 /******************************************************************************/
@ OVER with a constant (solid) mask: only the mask's alpha byte is used,
@ held in MASK; STRIDE_M holds the 0x00800080 rounding constant.
553 .macro over_8888_n_8888_init
554 /* Mask is constant */
555 ldr MASK, [sp, #ARGS_STACK_OFFSET+8]
556 /* Hold loop invariant in STRIDE_M */
557 ldr STRIDE_M, =0x00800080
558 /* We only want the alpha bits of the constant mask */
559 mov MASK, MASK, lsr #24
560 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
561 uadd8 SCRATCH, STRIDE_M, STRIDE_M
562 line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
@ Head: only WK4/WK5 are free for source pixels (WK6/WK7 are temporaries),
@ hence the firstreg%2 addressing.
565 .macro over_8888_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
570 pixld , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
571 pixld , numbytes, firstreg, DST, 0
@ One pixel: src *= mask_alpha; dst = dst*(255-src_alpha) + src (saturating).
578 .macro over_8888_n_8888_1pixel src, dst
579 mul_8888_8 WK&src, MASK, SCRATCH, STRIDE_M
580 sub WK7, WK6, WK&src, lsr #24
581 mul_8888_8 WK&dst, WK7, SCRATCH, STRIDE_M
582 uqadd8 WK&dst, WK&dst, WK&src
585 .macro over_8888_n_8888_process_tail cond, numbytes, firstreg
590 over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
593 .set PROCESS_REG, firstreg
595 .if numbytes == 16 && PROCESS_REG == 2
596 /* We're using WK6 and WK7 as temporaries, so half way through
597 * 4 pixels, reload the second two source pixels but this time
598 * into WK4 and WK5 */
599 ldmdb SRC, {WK4, WK5}
601 over_8888_n_8888_1pixel %(4+(PROCESS_REG%2)), %(PROCESS_REG)
602 .set PROCESS_REG, PROCESS_REG+1
604 pixst , numbytes, firstreg, DST
@ over_8888_n_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
612 generate_composite_function \
613 pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32, \
614 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
615 2, /* prefetch distance */ \
616 over_8888_n_8888_init, \
617 nop_macro, /* newline */ \
618 nop_macro, /* cleanup */ \
619 over_8888_n_8888_process_head, \
620 over_8888_n_8888_process_tail
622 /******************************************************************************/
@ OVER with constant source and 8-bit mask. SRC holds the pre-split odd bytes
@ of the source colour; the 0x00800080 rounding constant is reloaded into
@ STRIDE_D at the start of each line (register pressure prevents keeping it).
624 .macro over_n_8_8888_init
625 /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
626 ldr SRC, [sp, #ARGS_STACK_OFFSET]
627 /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
628 ldr SCRATCH, =0x00800080
630 uxtb16 SRC, SRC, ror #8
631 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
632 uadd8 SCRATCH, SCRATCH, SCRATCH
633 line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
636 .macro over_n_8_8888_newline
637 ldr STRIDE_D, =0x00800080
@ Head: mask is 8bpp while destination is 32bpp, so load numbytes/4 mask
@ bytes (into WK4) and numbytes of destination.
643 .macro over_n_8_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
645 pixld , numbytes/4, 4, MASK, unaligned_mask
646 pixld , numbytes, firstreg, DST, 0
@ One pixel: extract mask byte 'src' from WK4, multiply the constant source
@ by it, then blend over the destination pixel in WK&dst.
650 .macro over_n_8_8888_1pixel src, dst
651 uxtb Y, WK4, ror #src*8
652 /* Trailing part of multiplication of source */
653 mla SCRATCH, STRIDE_S, Y, STRIDE_D
654 mla Y, SRC, Y, STRIDE_D
656 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
657 uxtab16 Y, Y, Y, ror #8
658 mov SCRATCH, SCRATCH, ror #8
659 sub ORIG_W, ORIG_W, Y, lsr #24
661 /* Then multiply the destination */
662 mul_8888_8 WK&dst, ORIG_W, SCRATCH, STRIDE_D
663 uqadd8 WK&dst, WK&dst, Y
666 .macro over_n_8_8888_process_tail cond, numbytes, firstreg
670 .set PROCESS_REG, firstreg
672 over_n_8_8888_1pixel %(PROCESS_REG-firstreg), %(PROCESS_REG)
673 .set PROCESS_REG, PROCESS_REG+1
675 pixst , numbytes, firstreg, DST
@ over_n_8_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
680 generate_composite_function \
681 pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32, \
682 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS, \
683 2, /* prefetch distance */ \
684 over_n_8_8888_init, \
685 over_n_8_8888_newline, \
686 nop_macro, /* cleanup */ \
687 over_n_8_8888_process_head, \
688 over_n_8_8888_process_tail
690 /******************************************************************************/
@ OVER-reverse with constant source: destination stays on top, the constant
@ source only shows through where the destination is not fully opaque.
@ STRIDE_M holds the source's odd (alpha/green) bytes pre-split.
692 .macro over_reverse_n_8888_init
693 ldr SRC, [sp, #ARGS_STACK_OFFSET]
694 ldr MASK, =0x00800080
695 /* Split source pixel into RB/AG parts */
697 uxtb16 STRIDE_M, SRC, ror #8
698 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
699 uadd8 SCRATCH, MASK, MASK
700 line_saved_regs STRIDE_D, ORIG_W
703 .macro over_reverse_n_8888_newline
@ Head: only the destination pixels are needed (source is constant).
707 .macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
708 pixld , numbytes, firstreg, DST, 0
@ One pixel: dst += src * (255 - dst_alpha), with shortcuts for
@ transparent destination (take source) and opaque destination (skip).
711 .macro over_reverse_n_8888_1pixel d, is_only
713 beq 8f /* replace with source */
714 bics ORIG_W, STRIDE_D, WK&d, lsr #24
716 beq 49f /* skip store */
718 beq 9f /* write same value back */
720 mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue */
721 mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green */
722 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
723 uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8
724 mov SCRATCH, SCRATCH, ror #8
725 sel ORIG_W, SCRATCH, ORIG_W
726 uqadd8 WK&d, WK&d, ORIG_W
@ Tail: AND the alpha bytes of all destination pixels together; if all are
@ 0xFF the whole group is opaque and the store is skipped, otherwise each
@ pixel is composited individually.
@ NOTE(review): restored "WK&reg1".."WK&reg4" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg4).
732 .macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4
734 over_reverse_n_8888_1pixel reg1, 1
736 and SCRATCH, WK&reg1, WK&reg2
738 and SCRATCH, SCRATCH, WK&reg3
739 and SCRATCH, SCRATCH, WK&reg4
741 mvns SCRATCH, SCRATCH, asr #24
742 beq 49f /* skip store if all opaque */
743 over_reverse_n_8888_1pixel reg1, 0
744 over_reverse_n_8888_1pixel reg2, 0
746 over_reverse_n_8888_1pixel reg3, 0
747 over_reverse_n_8888_1pixel reg4, 0
750 pixst , numbytes, reg1, DST
@ Thin wrapper mapping the generic process-tail interface onto the tail above.
754 .macro over_reverse_n_8888_process_tail cond, numbytes, firstreg
755 over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
@ over_reverse_n_8888 entry point.
@ NOTE(review): restored the ',' separator after "32", matching sibling
@ invocations.
758 generate_composite_function \
759 pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32, \
760 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
761 3, /* prefetch distance */ \
762 over_reverse_n_8888_init, \
763 over_reverse_n_8888_newline, \
764 nop_macro, /* cleanup */ \
765 over_reverse_n_8888_process_head, \
766 over_reverse_n_8888_process_tail
768 /******************************************************************************/
@ OVER with white source and component-alpha 8888 mask: each mask channel
@ independently blends white over the corresponding destination channel.
770 .macro over_white_8888_8888_ca_init
777 line_saved_regs STRIDE_D, STRIDE_M, ORIG_W
778 ldr SCRATCH, =0x800080
780 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
781 uadd8 SCRATCH, SCRATCH, SCRATCH
782 .set DST_PRELOAD_BIAS, 8
785 .macro over_white_8888_8888_ca_cleanup
786 .set DST_PRELOAD_BIAS, 0
@ Combine one destination pixel d with mask pixel m:
@ d = d*(255-m) per channel (via TMP0 = ~m, split into even/odd byte lanes,
@ 16x16 multiplies per channel) then saturating-add the mask (white*m = m).
795 .macro over_white_8888_8888_ca_combine m, d
796 uxtb16 TMP1, TMP0 /* rb_notmask */
797 uxtb16 TMP2, d /* rb_dest; 1 stall follows */
798 smlatt TMP3, TMP2, TMP1, HALF /* red */
799 smlabb TMP2, TMP2, TMP1, HALF /* blue */
800 uxtb16 TMP0, TMP0, ror #8 /* ag_notmask */
801 uxtb16 TMP1, d, ror #8 /* ag_dest; 1 stall follows */
802 smlatt d, TMP1, TMP0, HALF /* alpha */
803 smlabb TMP1, TMP1, TMP0, HALF /* green */
804 pkhbt TMP0, TMP2, TMP3, lsl #16 /* rb; 1 stall follows */
805 pkhbt TMP1, TMP1, d, lsl #16 /* ag */
806 uxtab16 TMP0, TMP0, TMP0, ror #8
807 uxtab16 TMP1, TMP1, TMP1, ror #8
808 mov TMP0, TMP0, ror #8
810 uqadd8 d, d, m /* d is a late result */
@ Single-pixel head: load one mask pixel into WK1.
813 .macro over_white_8888_8888_ca_1pixel_head
814 pixld , 4, 1, MASK, 0
@ Single-pixel tail: shortcut fully-transparent (teq detects all-0) and
@ fully-opaque mask values; otherwise run the full combine.
818 .macro over_white_8888_8888_ca_1pixel_tail
820 teq WK1, WK1, asr #32
825 01: over_white_8888_8888_ca_combine WK1, WK3
826 02: pixst , 4, 3, DST
830 .macro over_white_8888_8888_ca_2pixels_head
831 pixld , 8, 1, MASK, 0
834 .macro over_white_8888_8888_ca_2pixels_tail
837 teq WK1, WK1, asr #32
844 01: over_white_8888_8888_ca_combine WK1, WK3
846 teq WK2, WK2, asr #32
850 03: over_white_8888_8888_ca_combine WK2, WK4
851 04: pixst , 8, 3, DST
@ Dispatch by numbytes: pixels are handled one or two at a time because the
@ combine needs many temporaries.
855 .macro over_white_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
857 over_white_8888_8888_ca_1pixel_head
860 over_white_8888_8888_ca_2pixels_head
861 over_white_8888_8888_ca_2pixels_tail
863 over_white_8888_8888_ca_2pixels_head
867 .macro over_white_8888_8888_ca_process_tail cond, numbytes, firstreg
869 over_white_8888_8888_ca_1pixel_tail
871 over_white_8888_8888_ca_2pixels_tail
@ over_white_8888_8888_ca entry point (also the branch target of the
@ over_n_..._ca dispatcher below when the source is solid white).
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
875 generate_composite_function \
876 pixman_composite_over_white_8888_8888_ca_asm_armv6, 0, 32, 32, \
877 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
878 2, /* prefetch distance */ \
879 over_white_8888_8888_ca_init, \
880 nop_macro, /* newline */ \
881 over_white_8888_8888_ca_cleanup, \
882 over_white_8888_8888_ca_process_head, \
883 over_white_8888_8888_ca_process_tail
@ OVER with constant (non-white) source and component-alpha mask. Constants
@ that don't fit in registers (RB_FLDS, A_SRC, two HALF values) are spilled
@ to a locals area on the stack; ARGS_STACK_OFFSET is rebased accordingly.
886 .macro over_n_8888_8888_ca_init
887 /* Set up constants. RB_SRC and AG_SRC are in registers;
888 * RB_FLDS, A_SRC, and the two HALF values need to go on the
889 * stack (and the full SRC value is already there) */
890 ldr SCRATCH, [sp, #ARGS_STACK_OFFSET]
892 orr WK0, WK0, #0xFF /* RB_FLDS (0x00FF00FF) */
893 mov WK1, #0x80 /* HALF default value */
894 mov WK2, SCRATCH, lsr #24 /* A_SRC */
895 orr WK3, WK1, WK1, lsl #16 /* HALF alternate value (0x00800080) */
897 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+16
899 uxtb16 STRIDE_S, SCRATCH, ror #8
901 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
902 uadd8 SCRATCH, WK3, WK3
913 RB_FLDS .req r8 /* the reloaded constants have to be at consecutive registers starting at an even one */
921 line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
@ Cleanup: undo the stack rebasing done by init.
924 .macro over_n_8888_8888_ca_cleanup
926 .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-16
@ Head: load one component-alpha mask pixel into WK6.
946 .macro over_n_8888_8888_ca_1pixel_head
947 pixld , 4, 6, MASK, 0
@ Tail: three-way dispatch on the mask value — transparent (skip), fully
@ opaque (reduces to over_8888_8888 or a plain copy if the source is also
@ opaque), or the general per-channel case at label 20.
951 .macro over_n_8888_8888_ca_1pixel_tail
952 ldrd A_SRC, HALF, [sp, #LOCALS_STACK_OFFSET+8]
953 uxtb16 WK1, WK6 /* rb_mask (first step of hard case placed in what would otherwise be a stall) */
954 teq WK6, WK6, asr #32 /* Zc if transparent, ZC if opaque */
957 /* Mask is fully opaque (all channels) */
958 ldr WK6, [sp, #ARGS_STACK_OFFSET] /* get SRC back */
959 eors A_SRC, A_SRC, #0xFF
961 /* Source is also opaque - same as src_8888_8888 */
964 10: /* Same as over_8888_8888 */
965 mul_8888_8 WK0, A_SRC, WK5, HALF
968 20: /* No simplifications possible - do it the hard way */
969 uxtb16 WK2, WK6, ror #8 /* ag_mask */
970 mla WK3, WK1, A_SRC, HALF /* rb_mul; 2 cycles */
971 mla WK4, WK2, A_SRC, HALF /* ag_mul; 2 cycles */
972 ldrd RB_FLDS, HALF, [sp, #LOCALS_STACK_OFFSET]
973 uxtb16 WK5, WK0 /* rb_dest */
974 uxtab16 WK3, WK3, WK3, ror #8
975 uxtb16 WK6, WK0, ror #8 /* ag_dest */
976 uxtab16 WK4, WK4, WK4, ror #8
977 smlatt WK0, RB_SRC, WK1, HALF /* red1 */
978 smlabb WK1, RB_SRC, WK1, HALF /* blue1 */
979 bic WK3, RB_FLDS, WK3, lsr #8
980 bic WK4, RB_FLDS, WK4, lsr #8
981 pkhbt WK1, WK1, WK0, lsl #16 /* rb1 */
982 smlatt WK0, WK5, WK3, HALF /* red2 */
983 smlabb WK3, WK5, WK3, HALF /* blue2 */
984 uxtab16 WK1, WK1, WK1, ror #8
985 smlatt WK5, AG_SRC, WK2, HALF /* alpha1 */
986 pkhbt WK3, WK3, WK0, lsl #16 /* rb2 */
987 smlabb WK0, AG_SRC, WK2, HALF /* green1 */
988 smlatt WK2, WK6, WK4, HALF /* alpha2 */
989 smlabb WK4, WK6, WK4, HALF /* green2 */
990 pkhbt WK0, WK0, WK5, lsl #16 /* ag1 */
991 uxtab16 WK3, WK3, WK3, ror #8
992 pkhbt WK4, WK4, WK2, lsl #16 /* ag2 */
993 uxtab16 WK0, WK0, WK0, ror #8
994 uxtab16 WK4, WK4, WK4, ror #8
997 sel WK2, WK1, WK0 /* recombine source*mask */
998 sel WK1, WK3, WK4 /* recombine dest*(1-source_alpha*mask) */
999 uqadd8 WK0, WK1, WK2 /* followed by 1 stall */
1000 30: /* The destination buffer is already in the L1 cache, so
1001 * there's little point in amalgamating writes */
@ Pixels are processed strictly one at a time (the tail needs most registers).
1006 .macro over_n_8888_8888_ca_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1007 .rept (numbytes / 4) - 1
1008 over_n_8888_8888_ca_1pixel_head
1009 over_n_8888_8888_ca_1pixel_tail
1011 over_n_8888_8888_ca_1pixel_head
1014 .macro over_n_8888_8888_ca_process_tail cond, numbytes, firstreg
1015 over_n_8888_8888_ca_1pixel_tail
@ Public entry: branch to the specialised white-source routine when the
@ constant source is white, otherwise fall through to the helper below.
1018 pixman_asm_function pixman_composite_over_n_8888_8888_ca_asm_armv6
1021 beq pixman_composite_over_white_8888_8888_ca_asm_armv6
1022 /* else drop through... */
@ General (non-white source) component-alpha helper, entered by fall-through
@ from the dispatcher above.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1024 generate_composite_function \
1025 pixman_composite_over_n_8888_8888_ca_asm_armv6_helper, 0, 32, 32, \
1026 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_PROCESS_CORRUPTS_WK0, \
1027 2, /* prefetch distance */ \
1028 over_n_8888_8888_ca_init, \
1029 nop_macro, /* newline */ \
1030 over_n_8888_8888_ca_cleanup, \
1031 over_n_8888_8888_ca_process_head, \
1032 over_n_8888_8888_ca_process_tail
1034 /******************************************************************************/
@ IN-reverse, 8888 on 8888: destination is multiplied by the source's alpha.
@ Only the alpha byte of each source pixel is ever read (see the byte loads
@ in the head macro), hence the source-pointer offset mentioned below.
1036 .macro in_reverse_8888_8888_init
1037 /* Hold loop invariant in MASK */
1038 ldr MASK, =0x00800080
1039 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1040 uadd8 SCRATCH, MASK, MASK
1041 /* Offset the source pointer: we only need the alpha bytes */
1043 line_saved_regs ORIG_W
@ Head: load just the alpha byte of each source pixel (pointer strides by 4),
@ first into ORIG_W and then into the WK registers; advance DST past the
@ group so the tail can address it with negative offsets.
@ NOTE(review): restored "WK&reg1".."WK&reg3" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg3).
1046 .macro in_reverse_8888_8888_head numbytes, reg1, reg2, reg3
1047 ldrb ORIG_W, [SRC], #4
1049 ldrb WK&reg1, [SRC], #4
1051 ldrb WK&reg2, [SRC], #4
1052 ldrb WK&reg3, [SRC], #4
1055 add DST, DST, #numbytes
1058 .macro in_reverse_8888_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1059 in_reverse_8888_8888_head numbytes, firstreg, %(firstreg+1), %(firstreg+2)
@ One pixel: d = d * s / 255 per byte lane (s is a source alpha byte),
@ using the usual MLA + 257/256 correction + SEL lane-merge sequence.
1062 .macro in_reverse_8888_8888_1pixel s, d, offset, is_only
1066 ldrb ORIG_W, [SRC, #offset]
1072 uxtb16 SCRATCH, d /* rb_dest */
1073 uxtb16 d, d, ror #8 /* ag_dest */
1074 mla SCRATCH, SCRATCH, s, MASK
1076 uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
1077 uxtab16 d, d, d, ror #8
1078 mov SCRATCH, SCRATCH, ror #8
1082 48: /* Last mov d,#0 of the set - used as part of shortcut for
1083 * source values all 0 */
@ Tail: test whether all the source alpha bytes of the group are 0 or 255;
@ all-255 means the destination is unchanged (skip both load and store),
@ all-0 means the destination becomes 0. Only in the general case are the
@ destination pixels loaded (ldm with negative addressing, since DST was
@ pre-advanced in the head) and multiplied pixel by pixel.
@ NOTE(review): restored "WK&reg1".."WK&reg4" — '&reg' had been mangled to
@ '®' by HTML-entity decoding (the macro declares parameters reg1..reg4).
1089 .macro in_reverse_8888_8888_tail numbytes, reg1, reg2, reg3, reg4
1091 teq ORIG_W, ORIG_W, asr #32
1092 ldrne WK&reg1, [DST, #-4]
1093 .elseif numbytes == 8
1095 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
1096 ldmnedb DST, {WK&reg1-WK&reg2}
1099 teqeq ORIG_W, WK&reg2
1100 teqeq ORIG_W, WK&reg3
1101 teqeq ORIG_W, ORIG_W, asr #32 /* all 0 or all -1? */
1102 ldmnedb DST, {WK&reg1-WK&reg4}
1104 cmnne DST, #0 /* clear C if NE */
1105 bcs 49f /* no writes to dest if source all -1 */
1106 beq 48f /* set dest to all 0 if source all 0 */
1108 in_reverse_8888_8888_1pixel ORIG_W, WK&reg1, 0, 1
1109 str WK&reg1, [DST, #-4]
1110 .elseif numbytes == 8
1111 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -4, 0
1112 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, 0, 0
1113 stmdb DST, {WK&reg1-WK&reg2}
1115 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg1, -12, 0
1116 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg2, -8, 0
1117 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg3, -4, 0
1118 in_reverse_8888_8888_1pixel STRIDE_M, WK&reg4, 0, 0
1119 stmdb DST, {WK&reg1-WK&reg4}
@ Thin wrapper mapping the generic process-tail interface onto the tail above.
1124 .macro in_reverse_8888_8888_process_tail cond, numbytes, firstreg
1125 in_reverse_8888_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3)
@ in_reverse_8888_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1128 generate_composite_function \
1129 pixman_composite_in_reverse_8888_8888_asm_armv6, 32, 0, 32, \
1130 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH | FLAG_NO_PRELOAD_DST, \
1131 2, /* prefetch distance */ \
1132 in_reverse_8888_8888_init, \
1133 nop_macro, /* newline */ \
1134 nop_macro, /* cleanup */ \
1135 in_reverse_8888_8888_process_head, \
1136 in_reverse_8888_8888_process_tail
1138 /******************************************************************************/
@ OVER with constant source, no mask: dst = dst*(255-src_alpha)/255 + src.
@ The per-pixel destination multiplier (255 - src_alpha) is loop-invariant
@ and kept in STRIDE_M.
1140 .macro over_n_8888_init
1141 ldr SRC, [sp, #ARGS_STACK_OFFSET]
1142 /* Hold loop invariant in MASK */
1143 ldr MASK, =0x00800080
1144 /* Hold multiplier for destination in STRIDE_M */
1146 sub STRIDE_M, STRIDE_M, SRC, lsr #24
1147 /* Set GE[3:0] to 0101 so SEL instructions do what we want */
1148 uadd8 SCRATCH, MASK, MASK
@ Head: only the destination pixels need loading (source is constant).
1151 .macro over_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
1152 pixld , numbytes, firstreg, DST, 0
@ One pixel: scale the destination by the invariant multiplier, then
@ saturating-add the constant source.
1155 .macro over_n_8888_1pixel dst
1156 mul_8888_8 WK&dst, STRIDE_M, SCRATCH, MASK
1157 uqadd8 WK&dst, WK&dst, SRC
1160 .macro over_n_8888_process_tail cond, numbytes, firstreg
1161 .set PROCESS_REG, firstreg
1163 over_n_8888_1pixel %(PROCESS_REG)
1164 .set PROCESS_REG, PROCESS_REG+1
1166 pixst , numbytes, firstreg, DST
@ over_n_8888 entry point.
@ NOTE(review): restored the ',' separators after "32" and after the flags,
@ matching sibling invocations.
1169 generate_composite_function \
1170 pixman_composite_over_n_8888_asm_armv6, 0, 0, 32, \
1171 FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE, \
1172 2, /* prefetch distance */ \
1174 nop_macro, /* newline */ \
1175 nop_macro, /* cleanup */ \
1176 over_n_8888_process_head, \
1177 over_n_8888_process_tail
1179 /******************************************************************************/