source/libs/pixman/pixman-src/pixman/pixman-arm-simd-asm.h

   1 /*
   2  * Copyright © 2012 Raspberry Pi Foundation
   3  * Copyright © 2012 RISC OS Open Ltd
   4  *
   5  * Permission to use, copy, modify, distribute, and sell this software and its
   6  * documentation for any purpose is hereby granted without fee, provided that
   7  * the above copyright notice appear in all copies and that both that
   8  * copyright notice and this permission notice appear in supporting
   9  * documentation, and that the name of the copyright holders not be used in
  10  * advertising or publicity pertaining to distribution of the software without
  11  * specific, written prior permission.  The copyright holders make no
  12  * representations about the suitability of this software for any purpose.  It
  13  * is provided "as is" without express or implied warranty.
  14  *
  15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  22  * SOFTWARE.
  23  *
  24  * Author:  Ben Avison (bavison@riscosopen.org)
  25  *
  26  */
  27
  28 /*
  29  * Because the alignment of pixel data to cachelines, and even the number of
  30  * cachelines per row can vary from row to row, and because of the need to
  31  * preload each scanline once and only once, this prefetch strategy treats
  32  * each row of pixels independently. When a pixel row is long enough, there
  33  * are three distinct phases of prefetch:
  34  * * an inner loop section, where each time a cacheline of data is
  35  *    processed, another cacheline is preloaded (the exact distance ahead is
  36  *    determined empirically using profiling results from lowlevel-blt-bench)
  37  * * a leading section, where enough cachelines are preloaded to ensure no
  38  *    cachelines escape being preloaded when the inner loop starts
  39  * * a trailing section, where a limited number (0 or more) of cachelines
  40  *    are preloaded to deal with data (if any) that hangs off the end of the
  41  *    last iteration of the inner loop, plus any trailing bytes that were not
  42  *    enough to make up one whole iteration of the inner loop
  43  *
  44  * There are (in general) three distinct code paths, selected between
  45  * depending upon how long the pixel row is. If it is long enough that there
  46  * is at least one iteration of the inner loop (as described above) then
  47  * this is described as the "wide" case. If it is shorter than that, but
  48  * there are still enough bytes output that there is at least one 16-byte-
  49  * long, 16-byte-aligned write to the destination (the optimum type of
  50  * write), then this is the "medium" case. If it is not even this long, then
  51  * this is the "narrow" case, and there is no attempt to align writes to
  52  * 16-byte boundaries. In the "medium" and "narrow" cases, all the
  53  * cachelines containing data from the pixel row are prefetched up-front.
  54  */
  55
  56 /*
  57  * Determine whether we put the arguments on the stack for debugging.
  58  */
  59 #undef DEBUG_PARAMS
  60
  61 /*
  62  * Bit flags for 'generate_composite_function' macro which are used
  63  * to tune generated functions behavior.
  64  */
  65 .set FLAG_DST_WRITEONLY,         0
  66 .set FLAG_DST_READWRITE,         1
  67 .set FLAG_COND_EXEC,             0
  68 .set FLAG_BRANCH_OVER,           2
  69 .set FLAG_PROCESS_PRESERVES_PSR, 0
  70 .set FLAG_PROCESS_CORRUPTS_PSR,  4
  71 .set FLAG_PROCESS_DOESNT_STORE,  0
  72 .set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
  73 .set FLAG_NO_SPILL_LINE_VARS,        0
  74 .set FLAG_SPILL_LINE_VARS_WIDE,      16
  75 .set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
  76 .set FLAG_SPILL_LINE_VARS,           48
  77 .set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
  78 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
  79 .set FLAG_PROCESS_PRESERVES_WK0,     0
  80 .set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
  81 .set FLAG_PRELOAD_DST,               0
  82 .set FLAG_NO_PRELOAD_DST,            256
  83
  84 /*
  85  * Number of bytes by which to adjust preload offset of destination
  86  * buffer (allows preload instruction to be moved before the load(s))
  87  */
  88 .set DST_PRELOAD_BIAS, 0
  89
  90 /*
  91  * Offset into stack where mask and source pointer/stride can be accessed.
  92  */
  93 #ifdef DEBUG_PARAMS
  94 .set ARGS_STACK_OFFSET,        (9*4+9*4)
  95 #else
  96 .set ARGS_STACK_OFFSET,        (9*4)
  97 #endif
  98
  99 /*
 100  * Offset into stack where space allocated during init macro can be accessed.
 101  */
 102 .set LOCALS_STACK_OFFSET,     0
 103
 104 /*
 105  * Constants for selecting preferable prefetch type.
 106  */
 107 .set PREFETCH_TYPE_NONE,       0
 108 .set PREFETCH_TYPE_STANDARD,   1
 109
 110 /*
 111  * Definitions of macros for load/store of pixel data.
 112  */
 113
 114 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 115  .if numbytes == 16
 116   .if unaligned == 1
 117         op&r&cond    WK&reg0, [base], #4
 118         op&r&cond    WK&reg1, [base], #4
 119         op&r&cond    WK&reg2, [base], #4
 120         op&r&cond    WK&reg3, [base], #4
 121   .else
 122         op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 123   .endif
 124  .elseif numbytes == 8
 125   .if unaligned == 1
 126         op&r&cond    WK&reg0, [base], #4
 127         op&r&cond    WK&reg1, [base], #4
 128   .else
 129         op&m&cond&ia base!, {WK&reg0,WK&reg1}
 130   .endif
 131  .elseif numbytes == 4
 132         op&r&cond    WK&reg0, [base], #4
 133  .elseif numbytes == 2
 134         op&r&cond&h  WK&reg0, [base], #2
 135  .elseif numbytes == 1
 136         op&r&cond&b  WK&reg0, [base], #1
 137  .else
 138   .error "unsupported size: numbytes"
 139  .endif
 140 .endm
 141
 142 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 143  .if numbytes == 16
 144         stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 145  .elseif numbytes == 8
 146         stm&cond&db base, {WK&reg0,WK&reg1}
 147  .elseif numbytes == 4
 148         str&cond    WK&reg0, [base, #-4]
 149  .elseif numbytes == 2
 150         str&cond&h  WK&reg0, [base, #-2]
 151  .elseif numbytes == 1
 152         str&cond&b  WK&reg0, [base, #-1]
 153  .else
 154   .error "unsupported size: numbytes"
 155  .endif
 156 .endm
 157
 158 .macro pixld cond, numbytes, firstreg, base, unaligned
 159         pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
 160 .endm
 161
 162 .macro pixst cond, numbytes, firstreg, base
 163  .if (flags) & FLAG_DST_READWRITE
 164         pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 165  .else
 166         pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 167  .endif
 168 .endm
 169
 170 .macro PF a, x:vararg
 171  .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
 172         a x
 173  .endif
 174 .endm
 175
 176
 177 .macro preload_leading_step1  bpp, ptr, base
 178 /* If the destination is already 16-byte aligned, then we need to preload
 179  * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 180  * are no gaps when the inner loop starts.
 181  */
 182  .if bpp > 0
 183         PF  bic,    ptr, base, #31
 184   .set OFFSET, 0
 185   .rept prefetch_distance+1
 186         PF  pld,    [ptr, #OFFSET]
 187    .set OFFSET, OFFSET+32
 188   .endr
 189  .endif
 190 .endm
 191
 192 .macro preload_leading_step2  bpp, bpp_shift, ptr, base
 193 /* However, if the destination is not 16-byte aligned, we may need to
 194  * preload more cache lines than that. The question we need to ask is:
 195  * are the bytes corresponding to the leading pixels more than the amount
 196  * by which the source pointer will be rounded down for preloading, and if
 197  * so, by how many cache lines? Effectively, we want to calculate
 198  *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 199  *     inner_loop_offset = (src+leading_bytes)&31
 200  *     extra_needed = leading_bytes - inner_loop_offset
 201  * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 202  * possible when there are 4 src bytes for every 1 dst byte).
 203  */
 204  .if bpp > 0
 205   .ifc base,DST
 206         /* The test can be simplified further when preloading the destination */
 207         PF  tst,    base, #16
 208         PF  beq,    61f
 209   .else
 210    .if bpp/dst_w_bpp == 4
 211         PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
 212         PF  and,    SCRATCH, SCRATCH, #31
 213         PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
 214         PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
 215         PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
 216         PF  bcs,    61f
 217         PF  bpl,    60f
 218         PF  pld,    [ptr, #32*(prefetch_distance+2)]
 219    .else
 220         PF  mov,    SCRATCH, base, lsl #32-5
 221         PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
 222         PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
 223         PF  bls,    61f
 224    .endif
 225   .endif
 226 60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
 227 61:
 228  .endif
 229 .endm
 230
 231 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
 232 .macro preload_middle   bpp, base, scratch_holds_offset
 233  .if bpp > 0
 234         /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
 235   .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
 236    .if scratch_holds_offset
 237         PF  pld,    [base, SCRATCH]
 238    .else
 239         PF  bic,    SCRATCH, base, #31
 240         PF  pld,    [SCRATCH, #32*prefetch_distance]
 241    .endif
 242   .endif
 243  .endif
 244 .endm
 245
 246 .macro preload_trailing  bpp, bpp_shift, base
 247  .if bpp > 0
 248   .if bpp*pix_per_block > 256
 249         /* Calculations are more complex if more than one fetch per block */
 250         PF  and,    WK1, base, #31
 251         PF  add,    WK1, WK1, WK0, lsl #bpp_shift
 252         PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
 253         PF  bic,    SCRATCH, base, #31
 254 80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
 255         PF  add,    SCRATCH, SCRATCH, #32
 256         PF  subs,   WK1, WK1, #32
 257         PF  bhi,    80b
 258   .else
 259         /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
 260         PF  mov,    SCRATCH, base, lsl #32-5
 261         PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
 262         PF  adceqs, SCRATCH, SCRATCH, #0
 263         /* The instruction above has two effects: ensures Z is only
 264          * set if C was clear (so Z indicates that both shifted quantities
 265          * were 0), and clears C if Z was set (so C indicates that the sum
 266          * of the shifted quantities was greater and not equal to 32) */
 267         PF  beq,    82f
 268         PF  bic,    SCRATCH, base, #31
 269         PF  bcc,    81f
 270         PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
 271 81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
 272 82:
 273   .endif
 274  .endif
 275 .endm
 276
 277
 278 .macro preload_line    narrow_case, bpp, bpp_shift, base
 279 /* "narrow_case" - just means that the macro was invoked from the "narrow"
 280  *    code path rather than the "medium" one - because in the narrow case,
 281  *    the row of pixels is known to output no more than 30 bytes, then
 282  *    (assuming the source pixels are no wider than the the destination
 283  *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 284  *    meaning there's no need for a loop.
 285  * "bpp" - number of bits per pixel in the channel (source, mask or
 286  *    destination) that's being preloaded, or 0 if this channel is not used
 287  *    for reading
 288  * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 289  * "base" - base address register of channel to preload (SRC, MASK or DST)
 290  */
 291  .if bpp > 0
 292   .if narrow_case && (bpp <= dst_w_bpp)
 293         /* In these cases, each line for each channel is in either 1 or 2 cache lines */
 294         PF  bic,    WK0, base, #31
 295         PF  pld,    [WK0]
 296         PF  add,    WK1, base, X, LSL #bpp_shift
 297         PF  sub,    WK1, WK1, #1
 298         PF  bic,    WK1, WK1, #31
 299         PF  cmp,    WK1, WK0
 300         PF  beq,    90f
 301         PF  pld,    [WK1]
 302 90:
 303   .else
 304         PF  bic,    WK0, base, #31
 305         PF  pld,    [WK0]
 306         PF  add,    WK1, base, X, lsl #bpp_shift
 307         PF  sub,    WK1, WK1, #1
 308         PF  bic,    WK1, WK1, #31
 309         PF  cmp,    WK1, WK0
 310         PF  beq,    92f
 311 91:     PF  add,    WK0, WK0, #32
 312         PF  cmp,    WK0, WK1
 313         PF  pld,    [WK0]
 314         PF  bne,    91b
 315 92:
 316   .endif
 317  .endif
 318 .endm
 319
 320
 321 .macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 322         process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 323  .if decrementx
 324         sub&cond X, X, #8*numbytes/dst_w_bpp
 325  .endif
 326         process_tail  cond, numbytes, firstreg
 327  .if !((flags) & FLAG_PROCESS_DOES_STORE)
 328         pixst   cond, numbytes, firstreg, DST
 329  .endif
 330 .endm
 331
 332 .macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 333  .if (flags) & FLAG_BRANCH_OVER
 334   .ifc cond,mi
 335         bpl     100f
 336   .endif
 337   .ifc cond,cs
 338         bcc     100f
 339   .endif
 340   .ifc cond,ne
 341         beq     100f
 342   .endif
 343         conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 344 100:
 345  .else
 346         conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 347  .endif
 348 .endm
 349
 350 .macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 351  .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
 352         /* Can't interleave reads and writes */
 353         test
 354         conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
 355   .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
 356         test
 357   .endif
 358         conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 359  .else
 360         /* Can interleave reads and writes for better scheduling */
 361         test
 362         process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
 363         process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
 364   .if decrementx
 365         sub&cond1 X, X, #8*numbytes1/dst_w_bpp
 366         sub&cond2 X, X, #8*numbytes2/dst_w_bpp
 367   .endif
 368         process_tail  cond1, numbytes1, firstreg1
 369         process_tail  cond2, numbytes2, firstreg2
 370         pixst   cond1, numbytes1, firstreg1, DST
 371         pixst   cond2, numbytes2, firstreg2, DST
 372  .endif
 373 .endm
 374
 375
 376 .macro test_bits_1_0_ptr
 377  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 378         movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
 379  .else
 380         movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
 381  .endif
 382 .endm
 383
 384 .macro test_bits_3_2_ptr
 385  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 386         movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
 387  .else
 388         movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
 389  .endif
 390 .endm
 391
 392 .macro leading_15bytes  process_head, process_tail
 393         /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
 394  .set DECREMENT_X, 1
 395  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 396   .set DECREMENT_X, 0
 397         sub     X, X, WK0, lsr #dst_bpp_shift
 398         str     X, [sp, #LINE_SAVED_REG_COUNT*4]
 399         mov     X, WK0
 400  .endif
 401         /* Use unaligned loads in all cases for simplicity */
 402  .if dst_w_bpp == 8
 403         conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
 404  .elseif dst_w_bpp == 16
 405         test_bits_1_0_ptr
 406         conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X
 407  .endif
 408         conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
 409  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 410         ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
 411  .endif
 412 .endm
 413
 414 .macro test_bits_3_2_pix
 415         movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
 416 .endm
 417
 418 .macro test_bits_1_0_pix
 419  .if dst_w_bpp == 8
 420         movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 421  .else
 422         movs    SCRATCH, X, lsr #1
 423  .endif
 424 .endm
 425
 426 .macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
 427         conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 428  .if dst_w_bpp == 16
 429         test_bits_1_0_pix
 430         conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 431  .elseif dst_w_bpp == 8
 432         conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 433  .endif
 434 .endm
 435
 436
 437 .macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
 438 110:
 439  .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 440  .rept pix_per_block*dst_w_bpp/128
 441         process_head  , 16, 0, unaligned_src, unaligned_mask, 1
 442   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
 443         preload_middle  src_bpp, SRC, 1
 444   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
 445         preload_middle  mask_bpp, MASK, 1
 446   .else
 447         preload_middle  src_bpp, SRC, 0
 448         preload_middle  mask_bpp, MASK, 0
 449   .endif
 450   .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
 451         /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
 452          * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
 453          * preloads for, to achieve staggered prefetches for multiple channels, because there are
 454          * always two STMs per prefetch, so there is always an opposite STM on which to put the
 455          * preload. Note, no need to BIC the base register here */
 456         PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
 457   .endif
 458         process_tail  , 16, 0
 459   .if !((flags) & FLAG_PROCESS_DOES_STORE)
 460         pixst   , 16, 0, DST
 461   .endif
 462   .set SUBBLOCK, SUBBLOCK+1
 463  .endr
 464         subs    X, X, #pix_per_block
 465         bhs     110b
 466 .endm
 467
 468 .macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
 469         /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 470  .if dst_r_bpp > 0
 471         tst     DST, #16
 472         bne     111f
 473         process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS
 474         b       112f
 475 111:
 476  .endif
 477         process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS
 478 112:
 479         /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 480  .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
 481         PF  and,    WK0, X, #pix_per_block-1
 482  .endif
 483         preload_trailing  src_bpp, src_bpp_shift, SRC
 484         preload_trailing  mask_bpp, mask_bpp_shift, MASK
 485  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
 486         preload_trailing  dst_r_bpp, dst_bpp_shift, DST
 487  .endif
 488         add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
 489         /* The remainder of the line is handled identically to the medium case */
 490         medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
 491 .endm
 492
 493 .macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
 494 120:
 495         process_head  , 16, 0, unaligned_src, unaligned_mask, 0
 496         process_tail  , 16, 0
 497  .if !((flags) & FLAG_PROCESS_DOES_STORE)
 498         pixst   , 16, 0, DST
 499  .endif
 500         subs    X, X, #128/dst_w_bpp
 501         bhs     120b
 502         /* Trailing pixels */
 503         tst     X, #128/dst_w_bpp - 1
 504         beq     exit_label
 505         trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
 506 .endm
 507
 508 .macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
 509         tst     X, #16*8/dst_w_bpp
 510         conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
 511         /* Trailing pixels */
 512         /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
 513         trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
 514 .endm
 515
 516 .macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 517  /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 518  .if mask_bpp == 8 || mask_bpp == 16
 519         tst     MASK, #3
 520         bne     141f
 521  .endif
 522   .if src_bpp == 8 || src_bpp == 16
 523         tst     SRC, #3
 524         bne     140f
 525   .endif
 526         action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
 527   .if src_bpp == 8 || src_bpp == 16
 528         b       exit_label
 529 140:
 530         action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
 531   .endif
 532  .if mask_bpp == 8 || mask_bpp == 16
 533         b       exit_label
 534 141:
 535   .if src_bpp == 8 || src_bpp == 16
 536         tst     SRC, #3
 537         bne     142f
 538   .endif
 539         action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
 540   .if src_bpp == 8 || src_bpp == 16
 541         b       exit_label
 542 142:
 543         action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
 544   .endif
 545  .endif
 546 .endm
 547
 548
 549 .macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 550  .if vars_spilled
 551         /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
 552         /* This is ldmia sp,{} */
 553         .word   0xE89D0000 | LINE_SAVED_REGS
 554  .endif
 555         subs    Y, Y, #1
 556  .if vars_spilled
 557   .if (LINE_SAVED_REGS) & (1<<1)
 558         str     Y, [sp]
 559   .endif
 560  .endif
 561         add     DST, DST, STRIDE_D
 562  .if src_bpp > 0
 563         add     SRC, SRC, STRIDE_S
 564  .endif
 565  .if mask_bpp > 0
 566         add     MASK, MASK, STRIDE_M
 567  .endif
 568  .if restore_x
 569         mov     X, ORIG_W
 570  .endif
 571         bhs     loop_label
 572  .ifc "last_one",""
 573   .if vars_spilled
 574         b       197f
 575   .else
 576         b       198f
 577   .endif
 578  .else
 579   .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
 580         b       198f
 581   .endif
 582  .endif
 583 .endm
 584
 585
 586 .macro generate_composite_function fname, \
 587                                    src_bpp_, \
 588                                    mask_bpp_, \
 589                                    dst_w_bpp_, \
 590                                    flags_, \
 591                                    prefetch_distance_, \
 592                                    init, \
 593                                    newline, \
 594                                    cleanup, \
 595                                    process_head, \
 596                                    process_tail, \
 597                                    process_inner_loop
 598
 599     pixman_asm_function fname
 600
 601 /*
 602  * Make some macro arguments globally visible and accessible
 603  * from other macros
 604  */
 605  .set src_bpp, src_bpp_
 606  .set mask_bpp, mask_bpp_
 607  .set dst_w_bpp, dst_w_bpp_
 608  .set flags, flags_
 609  .set prefetch_distance, prefetch_distance_
 610
 611 /*
 612  * Select prefetch type for this function.
 613  */
 614  .if prefetch_distance == 0
 615   .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 616  .else
 617   .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 618  .endif
 619
 620  .if src_bpp == 32
 621   .set src_bpp_shift, 2
 622  .elseif src_bpp == 24
 623   .set src_bpp_shift, 0
 624  .elseif src_bpp == 16
 625   .set src_bpp_shift, 1
 626  .elseif src_bpp == 8
 627   .set src_bpp_shift, 0
 628  .elseif src_bpp == 0
 629   .set src_bpp_shift, -1
 630  .else
 631   .error "requested src bpp (src_bpp) is not supported"
 632  .endif
 633
 634  .if mask_bpp == 32
 635   .set mask_bpp_shift, 2
 636  .elseif mask_bpp == 24
 637   .set mask_bpp_shift, 0
 638  .elseif mask_bpp == 8
 639   .set mask_bpp_shift, 0
 640  .elseif mask_bpp == 0
 641   .set mask_bpp_shift, -1
 642  .else
 643   .error "requested mask bpp (mask_bpp) is not supported"
 644  .endif
 645
 646  .if dst_w_bpp == 32
 647   .set dst_bpp_shift, 2
 648  .elseif dst_w_bpp == 24
 649   .set dst_bpp_shift, 0
 650  .elseif dst_w_bpp == 16
 651   .set dst_bpp_shift, 1
 652  .elseif dst_w_bpp == 8
 653   .set dst_bpp_shift, 0
 654  .else
 655   .error "requested dst bpp (dst_w_bpp) is not supported"
 656  .endif
 657
 658  .if (((flags) & FLAG_DST_READWRITE) != 0)
 659   .set dst_r_bpp, dst_w_bpp
 660  .else
 661   .set dst_r_bpp, 0
 662  .endif
 663
 664  .set pix_per_block, 16*8/dst_w_bpp
 665  .if src_bpp != 0
 666   .if 32*8/src_bpp > pix_per_block
 667    .set pix_per_block, 32*8/src_bpp
 668   .endif
 669  .endif
 670  .if mask_bpp != 0
 671   .if 32*8/mask_bpp > pix_per_block
 672    .set pix_per_block, 32*8/mask_bpp
 673   .endif
 674  .endif
 675  .if dst_r_bpp != 0
 676   .if 32*8/dst_r_bpp > pix_per_block
 677    .set pix_per_block, 32*8/dst_r_bpp
 678   .endif
 679  .endif
 680
 681 /* The standard entry conditions set up by pixman-arm-common.h are:
 682  * r0 = width (pixels)
 683  * r1 = height (rows)
 684  * r2 = pointer to top-left pixel of destination
 685  * r3 = destination stride (pixels)
 686  * [sp] = source pixel value, or pointer to top-left pixel of source
 687  * [sp,#4] = 0 or source stride (pixels)
 688  * The following arguments are unused for non-mask operations
 689  * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 690  * [sp,#12] = 0 or mask stride (pixels)
 691  */
 692
 693 /*
 694  * Assign symbolic names to registers
 695  */
 696     X           .req    r0  /* pixels to go on this line */
 697     Y           .req    r1  /* lines to go */
 698     DST         .req    r2  /* destination pixel pointer */
 699     STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
 700     SRC         .req    r4  /* source pixel pointer */
 701     STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
 702     MASK        .req    r6  /* mask pixel pointer (if applicable) */
 703     STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
 704     WK0         .req    r8  /* pixel data registers */
 705     WK1         .req    r9
 706     WK2         .req    r10
 707     WK3         .req    r11
 708     SCRATCH     .req    r12
 709     ORIG_W      .req    r14 /* width (pixels) */
 710
 711         push    {r4-r11, lr}        /* save all registers */
 712
 713         subs    Y, Y, #1
 714         blo     199f
 715
 716 #ifdef DEBUG_PARAMS
 717         sub     sp, sp, #9*4
 718 #endif
 719
 720  .if src_bpp > 0
 721         ldr     SRC, [sp, #ARGS_STACK_OFFSET]
 722         ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 723  .endif
 724  .if mask_bpp > 0
 725         ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
 726         ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 727  .endif
 728
 729 #ifdef DEBUG_PARAMS
 730         add     Y, Y, #1
 731         stmia   sp, {r0-r7,pc}
 732         sub     Y, Y, #1
 733 #endif
 734
 735         init
 736
 737  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 738         /* Reserve a word in which to store X during leading pixels */
 739         sub     sp, sp, #4
 740   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
 741   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
 742  .endif
 743
 744         lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
 745         sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 746  .if src_bpp > 0
 747         lsl     STRIDE_S, #src_bpp_shift
 748         sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 749  .endif
 750  .if mask_bpp > 0
 751         lsl     STRIDE_M, #mask_bpp_shift
 752         sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 753  .endif
 754
 755         /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
 756         cmp     X, #2*16*8/dst_w_bpp - 1
 757         blo     170f
 758  .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
 759         /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
 760         cmp     X, #(prefetch_distance+3)*pix_per_block - 1
 761         blo     160f
 762
 763         /* Wide case */
 764         /* Adjust X so that the decrement instruction can also test for
 765          * inner loop termination. We want it to stop when there are
 766          * (prefetch_distance+1) complete blocks to go. */
 767         sub     X, X, #(prefetch_distance+2)*pix_per_block
 768         mov     ORIG_W, X
 769   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
 770         /* This is stmdb sp!,{} */
 771         .word   0xE92D0000 | LINE_SAVED_REGS
 772    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 773    .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 774   .endif
 775 151:    /* New line */
 776         newline
 777         preload_leading_step1  src_bpp, WK1, SRC
 778         preload_leading_step1  mask_bpp, WK2, MASK
 779   .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
 780         preload_leading_step1  dst_r_bpp, WK3, DST
 781   .endif
 782
 783         ands    WK0, DST, #15
 784         beq     154f
 785         rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
 786
 787         preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
 788         preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
 789   .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
 790         preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
 791   .endif
 792
 793         leading_15bytes  process_head, process_tail
 794
 795 154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 796   .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
 797         and     SCRATCH, SRC, #31
 798         rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 799   .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
 800         and     SCRATCH, MASK, #31
 801         rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 802   .endif
 803   .ifc "process_inner_loop",""
 804         switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 805   .else
 806         switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 807   .endif
 808
 809 157:    /* Check for another line */
 810         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 811   .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
 812    .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 813    .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 814   .endif
 815  .endif
 816
 817  .ltorg
 818
 819 160:    /* Medium case */
 820         mov     ORIG_W, X
 821  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
 822         /* This is stmdb sp!,{} */
 823         .word   0xE92D0000 | LINE_SAVED_REGS
 824   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 825   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
 826  .endif
 827 161:    /* New line */
 828         newline
 829         preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
 830         preload_line 0, mask_bpp, mask_bpp_shift, MASK
 831  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
 832         preload_line 0, dst_r_bpp, dst_bpp_shift, DST
 833  .endif
 834
 835         sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
 836         ands    WK0, DST, #15
 837         beq     164f
 838         rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
 839
 840         leading_15bytes  process_head, process_tail
 841
 842 164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
 843         switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
 844
 845 167:    /* Check for another line */
 846         end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
 847
 848  .ltorg
 849
 850 170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 851  .if dst_w_bpp < 32
 852         mov     ORIG_W, X
 853  .endif
 854  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
 855         /* This is stmdb sp!,{} */
 856         .word   0xE92D0000 | LINE_SAVED_REGS
 857  .endif
 858 171:    /* New line */
 859         newline
 860         preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
 861         preload_line 1, mask_bpp, mask_bpp_shift, MASK
 862  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
 863         preload_line 1, dst_r_bpp, dst_bpp_shift, DST
 864  .endif
 865
 866  .if dst_w_bpp == 8
 867         tst     DST, #3
 868         beq     174f
 869 172:    subs    X, X, #1
 870         blo     177f
 871         process_head  , 1, 0, 1, 1, 0
 872         process_tail  , 1, 0
 873   .if !((flags) & FLAG_PROCESS_DOES_STORE)
 874         pixst   , 1, 0, DST
 875   .endif
 876         tst     DST, #3
 877         bne     172b
 878  .elseif dst_w_bpp == 16
 879         tst     DST, #2
 880         beq     174f
 881         subs    X, X, #1
 882         blo     177f
 883         process_head  , 2, 0, 1, 1, 0
 884         process_tail  , 2, 0
 885   .if !((flags) & FLAG_PROCESS_DOES_STORE)
 886         pixst   , 2, 0, DST
 887   .endif
 888  .endif
 889
 890 174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
 891         switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
 892
 893 177:    /* Check for another line */
 894         end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
 895  .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
 896   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 897   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
 898  .endif
 899
 900 197:
 901  .if (flags) & FLAG_SPILL_LINE_VARS
 902         add     sp, sp, #LINE_SAVED_REG_COUNT*4
 903  .endif
 904 198:
 905  .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
 906   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
 907   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
 908         add     sp, sp, #4
 909  .endif
 910
 911         cleanup
 912
 913 #ifdef DEBUG_PARAMS
 914         add     sp, sp, #9*4 /* junk the debug copy of arguments */
 915 #endif
 916 199:
 917         pop     {r4-r11, pc}  /* exit */
 918
 919  .ltorg
 920
 921     .unreq  X
 922     .unreq  Y
 923     .unreq  DST
 924     .unreq  STRIDE_D
 925     .unreq  SRC
 926     .unreq  STRIDE_S
 927     .unreq  MASK
 928     .unreq  STRIDE_M
 929     .unreq  WK0
 930     .unreq  WK1
 931     .unreq  WK2
 932     .unreq  WK3
 933     .unreq  SCRATCH
 934     .unreq  ORIG_W
 935     .endfunc
 936 .endm
 937
 938 .macro line_saved_regs  x:vararg
 939  .set LINE_SAVED_REGS, 0
 940  .set LINE_SAVED_REG_COUNT, 0
 941  .irp SAVED_REG,x
 942   .ifc "SAVED_REG","Y"
 943    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
 944    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
 945   .endif
 946   .ifc "SAVED_REG","STRIDE_D"
 947    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
 948    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
 949   .endif
 950   .ifc "SAVED_REG","STRIDE_S"
 951    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
 952    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
 953   .endif
 954   .ifc "SAVED_REG","STRIDE_M"
 955    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
 956    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
 957   .endif
 958   .ifc "SAVED_REG","ORIG_W"
 959    .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
 960    .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
 961   .endif
 962  .endr
 963 .endm
 964
 965 .macro nop_macro x:vararg
 966 .endm