source/libs/pixman/pixman-src/pixman/pixman-arm-simd-asm-scaled.S

   1 /*
   2  * Copyright © 2008 Mozilla Corporation
   3  * Copyright © 2010 Nokia Corporation
   4  *
   5  * Permission to use, copy, modify, distribute, and sell this software and its
   6  * documentation for any purpose is hereby granted without fee, provided that
   7  * the above copyright notice appear in all copies and that both that
   8  * copyright notice and this permission notice appear in supporting
   9  * documentation, and that the name of Mozilla Corporation not be used in
  10  * advertising or publicity pertaining to distribution of the software without
  11  * specific, written prior permission.  Mozilla Corporation makes no
  12  * representations about the suitability of this software for any purpose.  It
  13  * is provided "as is" without express or implied warranty.
  14  *
  15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
  16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
  18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
  20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  22  * SOFTWARE.
  23  *
  24  * Author:  Jeff Muizelaar (jeff@infidigm.net)
  25  *
  26  */
  27
  28 /* Prevent the stack from becoming executable */
  29 #if defined(__linux__) && defined(__ELF__)
  30 .section .note.GNU-stack,"",%progbits
  31 #endif
  32
   33         .text                   /* emit the following code into the text section */
   34         .arch armv6             /* let the assembler accept instructions up to ARMv6 */
   35         .object_arch armv4      /* but record armv4 as the architecture in the object attributes */
   36         .arm                    /* assemble ARM (not Thumb) instruction encodings */
   37         .altmacro               /* alternate macro mode: macro bodies below name their arguments without a backslash */
   38         .p2align 2              /* align to 2^2 = 4 bytes */
  39
  40 #include "pixman-arm-asm.h"
  41
  42 /*
  43  * Note: This code is only using armv5te instructions (not even armv6),
  44  *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
  45  *       be split into a few variants, tuned for each microarchitecture.
  46  *
  47  * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
  48  * have efficient write combining), it needs to be changed to use 16-byte
  49  * aligned writes using STM instruction.
  50  *
  51  * Nearest scanline scaler macro template uses the following arguments:
  52  *  fname                     - name of the function to generate
  53  *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
  54  *  t                         - type suffix for LDR/STR instructions
  55  *  prefetch_distance         - prefetch in the source image by that many
  56  *                              pixels ahead
  57  *  prefetch_braking_distance - stop prefetching when that many pixels are
  58  *                              remaining before the end of scanline
  59  */
  60
   61 .macro generate_nearest_scanline_func fname, bpp_shift, t,      \
   62                                       prefetch_distance,        \
   63                                       prefetch_braking_distance
   64 /* Emits function fname: writes W pixels of (1 << bpp_shift) bytes to DST, reading each from SRC at the position VX, which advances by UNIT_X per pixel. */
   65 pixman_asm_function fname       /* macro from pixman-arm-asm.h (not visible here); presumably opens the function closed by .endfunc below */
   66         W               .req    r0      /* pixels still to be written; counted down */
   67         DST             .req    r1      /* output pointer, post-incremented by one pixel per store */
   68         SRC             .req    r2      /* source base address; pixels are read at [SRC + byte offset] */
   69         VX              .req    r3      /* source position with 16 fractional bits (see the asr #(16 - bpp_shift) uses) */
   70         UNIT_X          .req    ip      /* per-pixel increment of VX; loaded from the stack below */
   71         TMP1            .req    r4      /* TMP1 and TMP2 alternately hold the next byte offset and the pixel just loaded */
   72         TMP2            .req    r5
   73         VXMASK          .req    r6      /* ~(bytes per pixel - 1): clears the sub-pixel bits of a byte offset */
   74         PF_OFFS         .req    r7      /* VX-style position used only to form the pld prefetch address */
   75         SRC_WIDTH_FIXED .req    r8      /* amount subtracted from VX while VX is non-negative; loaded from the stack below */
   76
   77         ldr     UNIT_X, [sp]            /* first stack word at entry (read before the push moves sp) */
   78         push    {r4, r5, r6, r7, r8, r10}       /* 6 registers = 24 bytes; r10 is not used below - presumably saved only to keep sp 8-byte aligned (confirm) */
   79         mvn     VXMASK, #((1 << bpp_shift) - 1) /* VXMASK = ~(bytes per pixel - 1) */
   80         ldr     SRC_WIDTH_FIXED, [sp, #28]      /* 24 bytes pushed + 4: the second stack word at entry */
   81
   82         /* define helper macro: stores two pixels; on entry TMP1 holds the byte offset of the first pixel, on exit TMP1 holds the byte offset of the next one */
   83         .macro  scale_2_pixels
   84                 ldr&t   TMP1, [SRC, TMP1]       /* load the pixel whose byte offset was computed earlier */
   85                 and     TMP2, VXMASK, VX, asr #(16 - bpp_shift) /* byte offset of the following pixel: (VX asr 16) << bpp_shift */
   86                 adds    VX, VX, UNIT_X          /* advance VX; the flags set here survive the str and drive the wrap loop at 9: */
   87                 str&t   TMP1, [DST], #(1 << bpp_shift)  /* store the pixel and step DST by one pixel */
   88 9:              subpls  VX, VX, SRC_WIDTH_FIXED /* executed only while VX >= 0: VX -= SRC_WIDTH_FIXED, flags updated */
   89                 bpl     9b                      /* repeat until VX is negative */
   90
   91                 ldr&t   TMP2, [SRC, TMP2]       /* second pixel: same sequence with TMP1 and TMP2 swapped */
   92                 and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
   93                 adds    VX, VX, UNIT_X
   94                 str&t   TMP2, [DST], #(1 << bpp_shift)
   95 9:              subpls  VX, VX, SRC_WIDTH_FIXED
   96                 bpl     9b
   97         .endm
   98
   99         /* now do the scaling: first compute the byte offset of pixel 0 from the incoming VX, then advance and wrap VX */
  100         and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
  101         adds    VX, VX, UNIT_X
  102 9:      subpls  VX, VX, SRC_WIDTH_FIXED
  103         bpl     9b
  104         subs    W, W, #(8 + prefetch_braking_distance) /* bias W so the loop below stops once fewer than 8 + braking-distance pixels remain */
  105         blt     2f                      /* too few pixels for the prefetching loop */
  106         /* calculate prefetch offset: PF_OFFS = VX + UNIT_X * prefetch_distance */
  107         mov     PF_OFFS, #prefetch_distance
  108         mla     PF_OFFS, UNIT_X, PF_OFFS, VX
  109 1:      /* main loop, process 8 pixels per iteration with prefetch */
  110         pld     [SRC, PF_OFFS, asr #(16 - bpp_shift)]   /* prefetch hint; PF_OFFS is neither masked with VXMASK nor reduced by SRC_WIDTH_FIXED */
  111         add     PF_OFFS, UNIT_X, lsl #3 /* PF_OFFS += 8 * UNIT_X, matching the 8 pixels handled per iteration */
  112         scale_2_pixels
  113         scale_2_pixels
  114         scale_2_pixels
  115         scale_2_pixels
  116         subs    W, W, #8
  117         bge     1b
  118 2:
  119         subs    W, W, #(4 - 8 - prefetch_braking_distance)     /* remove the bias applied above and subtract 4: W = pixels left - 4 */
  120         blt     2f                      /* fewer than 4 pixels left */
  121 1:      /* process the remaining pixels, 4 per iteration, without prefetch */
  122         scale_2_pixels
  123         scale_2_pixels
  124         subs    W, W, #4
  125         bge     1b
  126 2:
  127         tst     W, #2                   /* W is (pixels left - 4) here, so its bits 0-1 equal those of the pixels left */
  128         beq     2f
  129         scale_2_pixels
  130 2:
  131         tst     W, #1                   /* one odd pixel left? */
  132         ldrne&t TMP1, [SRC, TMP1]       /* TMP1 already holds its byte offset */
  133         strne&t TMP1, [DST]             /* last store: DST is not advanced */
  134         /* cleanup helper macro (the .purgem/.unreq directives below emit no instructions) */
  135         .purgem scale_2_pixels
  136         .unreq  DST
  137         .unreq  SRC
  138         .unreq  W
  139         .unreq  VX
  140         .unreq  UNIT_X
  141         .unreq  TMP1
  142         .unreq  TMP2
  143         .unreq  VXMASK
  144         .unreq  PF_OFFS
  145         .unreq  SRC_WIDTH_FIXED
  146         /* return */
  147         pop     {r4, r5, r6, r7, r8, r10}       /* restore the registers saved by the push at the top */
  148         bx      lr
  149 .endfunc
  150 .endm
 151
  152 generate_nearest_scanline_func \
  153     pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32 /* bpp_shift 1 = 2-byte pixels, halfword ldrh/strh, prefetch 80 pixels ahead, braking distance 32 */
  154
  155 generate_nearest_scanline_func \
  156     pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32 /* bpp_shift 2 = 4-byte pixels, empty suffix = word ldr/str, prefetch 48 pixels ahead, braking distance 32 */