modules/arm_neon/i420_rv16.S

   1  @*****************************************************************************
   2  @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
   3  @*****************************************************************************
   4  @ Copyright (C) 2011 Sébastien Toque
   5  @                    Rémi Denis-Courmont
   6  @
   7  @ This program is free software; you can redistribute it and/or modify it
   8  @ under the terms of the GNU Lesser General Public License as published by
   9  @ the Free Software Foundation; either version 2.1 of the License, or
  10  @ (at your option) any later version.
  11  @
  12  @ This program is distributed in the hope that it will be useful,
  13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  @ GNU Lesser General Public License for more details.
  16  @
  17  @ You should have received a copy of the GNU Lesser General Public License
  18  @ along with this program; if not, write to the Free Software Foundation,
  19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  20  @****************************************************************************/
  21
  22 #include "asm.S"
  23
   24         .syntax unified                 /* unified ARM/Thumb (UAL) mnemonics are used throughout this file */
   25 #if HAVE_AS_FPU_DIRECTIVE               /* presumably defined by the build when the assembler accepts .fpu -- confirm */
   26         .fpu    neon                    /* allow NEON (v*) instructions */
   27 #endif
   28         .text                           /* the constant table and the routine below are both emitted into .text */
  29
   30 /* ARM core register aliases used by i420_rv16_neon.
       * r2 and r3 are used as they arrive; the other values are loaded or
       * computed inside the routine. */
   31 #define O1      r0      /* output write pointer, top row of the current row pair */
   32 #define O2      r1      /* output write pointer, bottom row (O1 + OPITCH at row start) */
   33 #define WIDTH   r2      /* pixels per row, rounded up to a multiple of 16 by the routine */
   34 #define HEIGHT  r3      /* rows still to process; decremented by 2 per row pair */
   35 #define Y1      r4      /* luma read pointer, top row */
   36 #define Y2      r5      /* luma read pointer, bottom row (Y1 + YPITCH at row start) */
   37 #define U       r6      /* U read pointer, shared by both rows of the pair */
   38 #define V       r7      /* V read pointer, shared by both rows of the pair */
   39 #define YPITCH  r8      /* byte distance between consecutive luma rows */
   40 #define OPAD    r10     /* OPITCH - 2*WIDTH; also reused as a scratch register during setup */
   41 #define YPAD    r11     /* YPITCH - WIDTH */
   42 #define COUNT   ip      /* pixels left in the current row pair; decremented by 16 per iteration */
   43 #define OPITCH  lr      /* byte distance between consecutive output rows */
  44
   45 /* NEON register aliases.
       *
       * Each Q register is the pair of D registers D(2n), D(2n+1), and several
       * names below deliberately map onto the same storage.  The instruction
       * order in i420_rv16_neon relies on every value being consumed before
       * the register that holds it is reused under another name.
       *
       * Suffix 1 / 2 (and y1 / y2) = even / odd pixels of a 16-pixel group,
       * as split by the de-interleaving vld2 load of the luma bytes. */
   46 #define coefY   D0      /* 8-bit multiplier applied to Y (set to 74) */
   47 #define coefRV  D1      /* 8-bit multiplier, V contribution to red (115) */
   48 #define coefGU  D2      /* 8-bit multiplier, U contribution to green (14) */
   49 #define coefGV  D3      /* 8-bit multiplier, V contribution to green (34) */
   50 #define coefBU  D4      /* 8-bit multiplier, U contribution to blue (135) */
   51 #define Rc      Q3      /* red constant term, first entry of the coefficients table in every lane */
   52 #define Gc      Q4      /* green constant term, second table entry (Q4 is callee-saved) */
   53 #define Bc      Q5      /* blue constant term, third table entry (Q5 is callee-saved) */
   54
   55 #define u       D24     /* 8 U samples; same register as out1l */
   56 #define v       D25     /* 8 V samples; same register as red1 / out1h */
   57 #define y1      D18     /* 8 even-numbered luma samples; low half of Q9 (red16_1) */
   58 #define y2      D19     /* 8 odd-numbered luma samples; high half of Q9 (red16_1) */
   59
   60 #define chro_r  Q6      /* Rc + v*coefRV, kept for both rows (Q6 is callee-saved) */
   61 #define chro_g  Q7      /* Gc - (u*coefGU + v*coefGV), kept for both rows (Q7 is callee-saved) */
   62 #define chro_b  Q8      /* Bc + u*coefBU, kept for both rows */
   63 #define lumi1   Q15     /* y1*coefY; occupies D30/D31, later reused as green2/blue2 */
   64 #define lumi2   Q10     /* y2*coefY; same register as green16_1 */
   65 #define red16_1         Q9      /* 16-bit red, even pixels; overwrites y1/y2 */
   66 #define green16_1       Q10     /* 16-bit green, even pixels; overwrites lumi2 */
   67 #define blue16_1        Q11     /* 16-bit blue, even pixels */
   68 #define red16_2         Q12     /* 16-bit red, odd pixels; occupies D24/D25 */
   69 #define green16_2       Q13     /* 16-bit green, odd pixels; occupies D26/D27 */
   70 #define blue16_2        Q14     /* 16-bit blue, odd pixels; occupies D28/D29 */
   71
   72 #define red1    D25     /* 8-bit red, even pixels; high half of Q12 */
   73 #define green1  D26     /* 8-bit green, even pixels; low half of Q13 */
   74 #define blue1   D27     /* 8-bit blue, even pixels; high half of Q13 */
   75 #define red2    D29     /* 8-bit red, odd pixels; high half of Q14 */
   76 #define green2  D30     /* 8-bit green, odd pixels; low half of Q15 */
   77 #define blue2   D31     /* 8-bit blue, odd pixels; high half of Q15 */
   78
   79 #define out1l   D24     /* low output bytes, even pixels (pixels 0-7 after vzip) */
   80 #define out1h   D25     /* high output bytes; same register as red1, red is already in place */
   81 #define out2l   D28     /* low output bytes, odd pixels (pixels 8-15 after vzip) */
   82 #define out2h   D29     /* high output bytes; same register as red2, red is already in place */
  83
   84 coefficients:           /* three signed 16-bit constant terms, read in this order by i420_rv16_neon */
   85     .short  -15872      /* Rc (red):   -248 * 64 */
   86     .short    4992      /* Gc (green):   78 * 64 */
   87     .short  -18432      /* Bc (blue):  -288 * 64 */
      /* NOTE(review): these presumably absorb the luma/chroma level offsets of
       * the conversion, pre-scaled by 64 like the multipliers -- confirm against
       * the reference C conversion. */
  88
  89         .align 2
  90 function i420_rv16_neon
  91         push            {r4-r8,r10-r11,lr}
  92         vpush           {q4-q7}
  93
  94         /* load arguments */
  95         ldmia           r0,     {O1, OPITCH}
  96         ldmia           r1,     {Y1, U, V, YPITCH}
  97
  98         /* round the width to be a multiple of 16 */
  99         ands            OPAD, WIDTH, #15
 100         sub                     WIDTH, WIDTH, OPAD
 101         it              ne
 102         addne           WIDTH, WIDTH, #16
 103
 104         /* init constants (scale value by 64) */
 105         vmov.u8         coefY, #74
 106         vmov.u8         coefRV, #115
 107         vmov.u8         coefGU, #14
 108         vmov.u8         coefGV, #34
 109         vmov.u8         coefBU, #135
 110         adr                     OPAD, coefficients
 111         vld1.s16        {d6[], d7[]}, [OPAD]!
 112         vld1.s16        {d8[], d9[]}, [OPAD]!
 113         vld1.s16        {d10[], d11[]}, [OPAD]!
 114
 115         /* init padding */
 116         cmp                     HEIGHT, #0
 117         sub                     OPAD,   OPITCH, WIDTH, lsl #1
 118         sub                     YPAD,   YPITCH, WIDTH
 119
 120 loop_row:
 121         it      gt
 122         movsgt  COUNT,  WIDTH
 123         add             O2,     O1,     OPITCH
 124         add             Y2,     Y1,     YPITCH
 125         /* exit if all rows have been processed */
 126         itt     le
 127         vpople  {q4-q7}
 128         pople   {r4-r8,r10-r11,pc}
 129
 130 loop_col:
 131
 132         /* Common U & V */
 133
 134         vld1.u8 {u}, [U,:64]!
 135         vld1.u8 {v}, [V,:64]!
 136
 137         /* Y Top Row */
 138         vld2.u8 {y1,y2}, [Y1,:128]!
 139
 140         vmull.u8        Q14, v, coefRV
 141         vmull.u8        Q11, u, coefGU
 142         vmull.u8        Q13, u, coefBU
 143         vmlal.u8        Q11, v, coefGV
 144
 145         vmull.u8        lumi2, y2, coefY
 146         vmull.u8        lumi1, y1, coefY
 147         vadd.s16        chro_r, Rc, Q14
 148         vadd.s16        chro_b, Bc, Q13
 149         vsub.s16        chro_g, Gc, Q11
 150
 151         pld     [U]
 152         pld     [V]
 153
 154         /* chrominance + luminance */
 155         vqadd.s16       red16_2, lumi2, chro_r
 156         vqadd.s16       green16_2, lumi2, chro_g
 157         vqadd.s16       blue16_2, lumi2, chro_b
 158         vqadd.s16       red16_1, lumi1, chro_r
 159         vqadd.s16       green16_1, lumi1, chro_g
 160         vqadd.s16       blue16_1, lumi1, chro_b
 161
 162         /* clamp (divide by 64) */
 163         vqrshrun.s16    green2, green16_2, #6
 164         vqrshrun.s16    blue2, blue16_2, #6
 165         vqrshrun.s16    red2, red16_2, #6
 166         vqrshrun.s16    green1, green16_1, #6
 167         vqrshrun.s16    red1, red16_1, #6
 168         vqrshrun.s16    blue1, blue16_1, #6
 169
 170         pld     [Y1]
 171
 172         /* pack into RGB565 */
 173         vshl.u8 out2l, green2, #3 // low 2a
 174         vsri.u8 out2h, green2, #5 // high 2
 175         vshl.u8 out1l, green1, #3 // low 1a
 176         vsri.u8 out1h, green1, #5 // high 1
 177         vsri.u8 out2l, blue2, #3 // low 2b
 178         vsri.u8 out1l, blue1, #3 // low 1b
 179
 180         /* Y Bottom Row */
 181         vld2.u8 {y1,y2}, [Y2,:128]!
 182
 183         /* Top Row output */
 184         vzip.u8 out1h, out2h
 185         vmull.u8        lumi2, y2, coefY
 186         vzip.u8 out1l, out2l
 187         vmull.u8        lumi1, y1, coefY
 188         vst2.u8 {out1l, out1h}, [O1,:128]!
 189         vst2.u8 {out2l, out2h}, [O1,:128]!
 190
 191         /* chrominance + luminance */
 192         vqadd.s16       green16_2, lumi2, chro_g
 193         vqadd.s16       red16_2, lumi2, chro_r
 194         vqadd.s16       blue16_2, lumi2, chro_b
 195         vqadd.s16       red16_1, lumi1, chro_r
 196         vqadd.s16       green16_1, lumi1, chro_g
 197         vqadd.s16       blue16_1, lumi1, chro_b
 198
 199         /* clamp (divide by 64) */
 200         vqrshrun.s16    green2, green16_2, #6
 201         vqrshrun.s16    blue2, blue16_2, #6
 202         vqrshrun.s16    red2, red16_2, #6
 203         vqrshrun.s16    green1, green16_1, #6
 204         vqrshrun.s16    red1, red16_1, #6
 205         vqrshrun.s16    blue1, blue16_1, #6
 206
 207         pld     [Y1]
 208
 209         /* pack into RGB565 */
 210         vshl.u8 out2l, green2, #3 // low 2a
 211         vsri.u8 out2h, green2, #5 // high 2
 212         vshl.u8 out1l, green1, #3 // low 1a
 213         vsri.u8 out1h, green1, #5 // high 1
 214         vsri.u8 out2l, blue2, #3 // low 2b
 215         vsri.u8 out1l, blue1, #3 // low 1b
 216
 217         vzip.u8 out1h, out2h
 218         vzip.u8 out1l, out2l
 219         vst2.u8 {out1l, out1h}, [O2,:128]!
 220         vst2.u8 {out2l, out2h}, [O2,:128]!
 221
 222         /* next columns (x16) */
 223         subs    COUNT,  COUNT,  #16
 224         bgt             loop_col
 225
 226         /* next rows (x2) */
 227         subs    HEIGHT, #2
 228         add             O1,     O2,     OPAD
 229         add             Y1,     Y2,     YPAD
 230         add             U,      U,      YPAD,   lsr #1
 231         add             V,      V,      YPAD,   lsr #1
 232         b               loop_row