modules/arm_neon/i420_rgb.S

   1  @*****************************************************************************
   2  @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
   3  @*****************************************************************************
   4  @ Copyright (C) 2011 Sébastien Toque
   5  @                    Rémi Denis-Courmont
   6  @
   7  @ This program is free software; you can redistribute it and/or modify it
   8  @ under the terms of the GNU Lesser General Public License as published by
   9  @ the Free Software Foundation; either version 2.1 of the License, or
  10  @ (at your option) any later version.
  11  @
  12  @ This program is distributed in the hope that it will be useful,
  13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  @ GNU Lesser General Public License for more details.
  16  @
  17  @ You should have received a copy of the GNU Lesser General Public License
  18  @ along with this program; if not, write to the Free Software Foundation,
  19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  20  @****************************************************************************/
  21
  22 #include "asm.S"
  23
  24         .syntax unified
  25 #if HAVE_AS_FPU_DIRECTIVE
  26         .fpu    neon
  27 #endif
  28         .text
  29
  30 /* ARM core registers; r0-r3 are live on entry, the rest are loaded or computed */
  31 #define O1      r0      /* output write pointer, upper row of the row pair */
  32 #define O2      r1      /* output write pointer, lower row (O1 + OPITCH) */
  33 #define WIDTH   r2      /* pixels per row, rounded up to a multiple of 16 */
  34 #define HEIGHT  r3      /* rows still to convert, decremented by 2 per row pair */
  35 #define Y1      r4      /* luma read pointer, upper row */
  36 #define Y2      r5      /* luma read pointer, lower row (Y1 + YPITCH) */
  37 #define U       r6      /* U read pointer, 8 bytes consumed per 16 pixels */
  38 #define V       r7      /* V read pointer, 8 bytes consumed per 16 pixels */
  39 #define YPITCH  r8      /* luma pitch in bytes */
  40 #define OPAD    r10     /* OPITCH - 4 * WIDTH; also used as scratch during setup */
  41 #define YPAD    r11     /* YPITCH - WIDTH */
  42 #define COUNT   ip      /* pixels left in the current row pair */
  43 #define OPITCH  lr      /* output pitch in bytes */
  44
  45 /* NEON registers; architecturally Qn overlays D(2n) and D(2n+1) */
  46 #define coefY   D0      /* luma weight, 74 in every byte lane */
  47 #define coefRV  D1      /* V weight added for red, 115 */
  48 #define coefGU  D2      /* U weight subtracted for green, 14 */
  49 #define coefGV  D3      /* V weight subtracted for green, 34 */
  50 #define coefBU  D4      /* U weight added for blue, 135 */
  51 #define Rc      Q3      /* red bias, -15872 in every 16-bit lane */
  52 #define Gc      Q4      /* green bias, 4992 in every 16-bit lane */
  53 #define Bc      Q5      /* blue bias, -18432 in every 16-bit lane */
  54
  55 #define u       D24     /* 8 U bytes; same register as red1 */
  56 #define v       D25     /* 8 V bytes; same register as green1 */
  57 #define y1      D18     /* even-indexed luma bytes (vld2 de-interleaves) */
  58 #define y2      D19     /* odd-indexed luma bytes */
  59
  60 #define chro_r  Q6      /* Rc + v * coefRV */
  61 #define chro_g  Q7      /* Gc - (u * coefGU + v * coefGV) */
  62 #define chro_b  Q8      /* Bc + u * coefBU */
  63 #define lumi1   Q15     /* y1 * coefY; overlays blue2 (D30) and alpha2 (D31) */
  64 #define lumi2   Q10     /* y2 * coefY; same register as green16_1 */
  65 #define red16_1         Q9      /* 16-bit sums for the y1 pixels; overlays y1 and y2 */
  66 #define green16_1       Q10     /* same register as lumi2 */
  67 #define blue16_1        Q11
  68 #define red16_2         Q12     /* 16-bit sums for the y2 pixels; overlays u and v */
  69 #define green16_2       Q13     /* overlays blue1 (D26) and alpha1 (D27) */
  70 #define blue16_2        Q14     /* overlays red2 (D28) and green2 (D29) */
  71
  72 #define red1    D24     /* D24-D27: register group of the first vst4 */
  73 #define green1  D25
  74 #define blue1   D26
  75 #define alpha1  D27     /* high half of Q13 */
  76 #define red2    D28     /* D28-D31: register group of the second vst4 */
  77 #define green2  D29
  78 #define blue2   D30
  79 #define alpha2  D31     /* high half of Q15 */
  80
  81 /*
  82  * Bias terms for the three colour channels, one signed 16-bit value each.
  83  * They live in .text next to the function so that adr can reach them.
  84  */
  85 coefficients:
  86     .short  -15872          /* broadcast into Rc */
  87     .short    4992          /* broadcast into Gc */
  88     .short  -18432          /* broadcast into Bc */
  89
  90 /*
  91  * i420_rgb_neon: planar YUV 4:2:0 to packed 32-bit pixels.
  92  *
  93  * In:    r0 -> { output pointer, output pitch in bytes }
  94  *        r1 -> { Y pointer, U pointer, V pointer, Y pitch in bytes }
  95  *        r2  = width in pixels (rounded up to a multiple of 16 below)
  96  *        r3  = height in rows (rows are converted two at a time)
  97  * Out:   every pixel is stored as the four bytes R, G, B, 255.
  98  * Saved: r4-r8, r10, r11, lr and q4-q7 (pushed on entry, popped on exit).
  99  * Used:  r0-r3, ip, flags, q0-q3, q8-q15.
 100  *
 101  * The alignment qualifiers on the loads and stores require 16-byte aligned
 102  * Y and output row pointers and 8-byte aligned U/V pointers.  U and V are
 103  * advanced by half of the Y advance, i.e. the chroma pitch is taken to be
 104  * half the luma pitch.
 105  */
 106         .align 2
 107 function i420_rgb_neon
 108         push            {r4-r8,r10-r11,lr}
 109         vpush           {q4-q7}
 110
 111         /* load arguments */
 112         ldmia           r0,     {O1, OPITCH}
 113         ldmia           r1,     {Y1, U, V, YPITCH}
 114
 115         /* round the width up to a multiple of 16 */
 116         ands            OPAD,   WIDTH,  #15
 117         sub             WIDTH,  WIDTH,  OPAD
 118         it              ne
 119         addne           WIDTH,  WIDTH,  #16
 120
 121         /* init constants (scale value by 64) */
 122         vmov.u8         coefY, #74
 123         vmov.u8         coefRV, #115
 124         vmov.u8         coefGU, #14
 125         vmov.u8         coefGV, #34
 126         vmov.u8         coefBU, #135
 127         adr             OPAD, coefficients      /* OPAD is only scratch here */
 128         vld1.s16        {d6[], d7[]}, [OPAD]!   /* Rc, all lanes */
 129         vld1.s16        {d8[], d9[]}, [OPAD]!   /* Gc, all lanes */
 130         vld1.s16        {d10[], d11[]}, [OPAD]! /* Bc, all lanes */
 131         /*
 132          * No alpha setup here: alpha1 and alpha2 share their registers with
 133          * per-pixel temporaries, so they are reloaded before every store.
 134          */
 135
 136         /* init padding; the cmp provides the flags tested at loop_row */
 137         cmp             HEIGHT, #0
 138         sub             OPAD,   OPITCH, WIDTH,  lsl #2
 139         sub             YPAD,   YPITCH, WIDTH
 140
 141 loop_row:
 142         /* rows remain: load COUNT and set flags again so WIDTH == 0 exits too */
 143         it              gt
 144         movsgt          COUNT,  WIDTH
 145         add             O2,     O1,     OPITCH
 146         add             Y2,     Y1,     YPITCH
 147         /* exit if all rows have been processed */
 148         itt             le
 149         vpople          {q4-q7}
 150         pople           {r4-r8,r10-r11,pc}
 151
 152 loop_col:
 153
 154         /* Common U & V: 8 samples each, shared by both rows */
 155
 156         vld1.u8         {u}, [U,:64]!
 157         vld1.u8         {v}, [V,:64]!
 158
 159         /* Y Top Row: y1 gets the even pixels, y2 the odd ones */
 160         vld2.u8         {y1,y2}, [Y1,:128]!
 161
 162         vmull.u8        Q14, v, coefRV
 163         vmull.u8        Q11, u, coefGU
 164         vmull.u8        Q13, u, coefBU
 165         vmlal.u8        Q11, v, coefGV
 166
 167         vmull.u8        lumi2, y2, coefY
 168         vmull.u8        lumi1, y1, coefY
 169         vadd.s16        chro_r, Rc, Q14
 170         vadd.s16        chro_b, Bc, Q13
 171         vsub.s16        chro_g, Gc, Q11
 172
 173         pld             [U]
 174         pld             [V]
 175
 176         /* chrominance + luminance (lumi2 users first: green16_1 reuses Q10) */
 177         vqadd.s16       red16_2, lumi2, chro_r
 178         vqadd.s16       blue16_2, lumi2, chro_b
 179         vqadd.s16       green16_2, lumi2, chro_g
 180         vqadd.s16       red16_1, lumi1, chro_r
 181         vqadd.s16       green16_1, lumi1, chro_g
 182         vqadd.s16       blue16_1, lumi1, chro_b
 183
 184         /* clamp (divide by 64) */
 185         vqrshrun.s16    blue2, blue16_2, #6
 186         vqrshrun.s16    red2, red16_2, #6
 187         vqrshrun.s16    green2, green16_2, #6
 188         vqrshrun.s16    red1, red16_1, #6
 189         vqrshrun.s16    green1, green16_1, #6
 190         vqrshrun.s16    blue1, blue16_1, #6
 191
 192         pld             [Y1]
 193
 194         /* Y Bottom Row */
 195         vld2.u8         {y1,y2}, [Y2,:128]!
 196
 197         /*
 198          * alpha1 (D27) and alpha2 (D31) are the upper halves of Q13 and Q15,
 199          * both used as 16-bit temporaries above, so reload both of them.
 200          */
 201         vmov.u8         alpha1, #255
 202         vmov.u8         alpha2, #255
 203         /* re-interleave even/odd pixels: x1 = pixels 0-7, x2 = pixels 8-15 */
 204         vzip.u8         red1, red2
 205         vzip.u8         green1, green2
 206         vzip.u8         blue1, blue2
 207
 208         vmull.u8        lumi2, y2, coefY
 209         vst4.u8         {red1,green1,blue1,alpha1}, [O1,:128]!
 210         vst4.u8         {red2,green2,blue2,alpha2}, [O1,:128]!
 211
 212         /* chrominance + luminance (lumi1 overwrites D30/D31 after the stores) */
 213         vmull.u8        lumi1, y1, coefY
 214         vqadd.s16       red16_2, lumi2, chro_r
 215         vqadd.s16       green16_2, lumi2, chro_g
 216         vqadd.s16       blue16_2, lumi2, chro_b
 217         vqadd.s16       red16_1, lumi1, chro_r
 218         vqadd.s16       green16_1, lumi1, chro_g
 219         vqadd.s16       blue16_1, lumi1, chro_b
 220
 221         /* clamp (divide by 64) */
 222         vqrshrun.s16    blue2, blue16_2, #6
 223         vqrshrun.s16    red2, red16_2, #6
 224         vqrshrun.s16    green2, green16_2, #6
 225         vqrshrun.s16    red1, red16_1, #6
 226         vqrshrun.s16    green1, green16_1, #6
 227         vqrshrun.s16    blue1, blue16_1, #6
 228
 229         pld             [Y2]
 230
 231         /* Q13 and Q15 were reused again: reload both alpha registers */
 232         vmov.u8         alpha1, #255
 233         vmov.u8         alpha2, #255
 234         vzip.u8         red1, red2
 235         vzip.u8         green1, green2
 236         vzip.u8         blue1, blue2
 237
 238         vst4.u8         {red1,green1,blue1,alpha1}, [O2,:128]!
 239         vst4.u8         {red2,green2,blue2,alpha2}, [O2,:128]!
 240
 241         /* next columns (x16) */
 242         subs            COUNT,  COUNT,  #16
 243         bgt             loop_col
 244
 245         /* next rows (x2); the adds leave the flags from subs for loop_row */
 246         subs            HEIGHT, #2
 247         add             O1,     O2,     OPAD
 248         add             Y1,     Y2,     YPAD
 249         add             U,      U,      YPAD,   lsr #1
 250         add             V,      V,      YPAD,   lsr #1
 251         b               loop_row