@ modules/arm_neon/i420_rv16.S

@*****************************************************************************
@ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
@*****************************************************************************
@ Copyright (C) 2011 Sébastien Toque
@                    Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify it
@ under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/
  21
  22         .syntax unified
  23         .fpu neon
  24         .text
  25
  26 /* ARM */
  27 #define O1      r0
  28 #define O2      r1
  29 #define WIDTH   r2
  30 #define HEIGHT  r3
  31 #define Y1      r4
  32 #define Y2      r5
  33 #define U       r6
  34 #define V       r7
  35 #define YPITCH  r8
  36 #define OPAD    r10
  37 #define YPAD    r11
  38 #define COUNT   ip
  39 #define OPITCH  lr
  40
  41 /* NEON */
  42 #define coefY   D0
  43 #define coefRV  D1
  44 #define coefGU  D2
  45 #define coefGV  D3
  46 #define coefBU  D4
  47 #define Rc      Q3
  48 #define Gc      Q4
  49 #define Bc      Q5
  50
  51 #define u       D24
  52 #define v       D25
  53 #define y1      D18
  54 #define y2      D19
  55
  56 #define chro_r  Q6
  57 #define chro_g  Q7
  58 #define chro_b  Q8
  59 #define lumi1   Q15
  60 #define lumi2   Q10
  61 #define red16_1         Q9
  62 #define green16_1       Q10
  63 #define blue16_1        Q11
  64 #define red16_2         Q12
  65 #define green16_2       Q13
  66 #define blue16_2        Q14
  67
  68 #define red1    D25
  69 #define green1  D26
  70 #define blue1   D27
  71 #define red2    D29
  72 #define green2  D30
  73 #define blue2   D31
  74
  75 #define out1l   D24
  76 #define out1h   D25
  77 #define out2l   D28
  78 #define out2h   D29
  79
  80 coefficients:
  81     .short  -15872
  82     .short    4992
  83     .short  -18432
  84
  85         .align 2
  86         .global i420_rv16_neon
  87         .type   i420_rv16_neon, %function
  88 i420_rv16_neon:
  89         push            {r4-r8,r10-r11,lr}
  90         vpush           {q4-q7}
  91
  92         /* load arguments */
  93         ldmia           r0,     {O1, OPITCH}
  94         ldmia           r1,     {Y1, U, V, YPITCH}
  95
  96         /* round the width to be a multiple of 16 */
  97         ands            OPAD, WIDTH, #15
  98         sub                     WIDTH, WIDTH, OPAD
  99         addne           WIDTH, WIDTH, #16
 100
 101         /* init constants (scale value by 64) */
 102         vmov.u8         coefY, #74
 103         vmov.u8         coefRV, #115
 104         vmov.u8         coefGU, #14
 105         vmov.u8         coefGV, #34
 106         vmov.u8         coefBU, #135
 107         adr                     OPAD, coefficients
 108         vld1.s16        {d6[], d7[]}, [OPAD]!
 109         vld1.s16        {d8[], d9[]}, [OPAD]!
 110         vld1.s16        {d10[], d11[]}, [OPAD]!
 111
 112         /* init padding */
 113         cmp                     HEIGHT, #0
 114         sub                     OPAD,   OPITCH, WIDTH, lsl #1
 115         sub                     YPAD,   YPITCH, WIDTH
 116
 117 loop_row:
 118         movsgt  COUNT,  WIDTH
 119         add             O2,     O1,     OPITCH
 120         add             Y2,     Y1,     YPITCH
 121         /* exit if all rows have been processed */
 122         vpople  {q4-q7}
 123         pople   {r4-r8,r10-r11,pc}
 124
 125 loop_col:
 126
 127         /* Common U & V */
 128
 129         vld1.u8 {u}, [U,:64]!
 130         vld1.u8 {v}, [V,:64]!
 131
 132         /* Y Top Row */
 133         vld2.u8 {y1,y2}, [Y1,:128]!
 134
 135         vmull.u8        Q14, v, coefRV
 136         vmull.u8        Q11, u, coefGU
 137         vmull.u8        Q13, u, coefBU
 138         vmlal.u8        Q11, v, coefGV
 139
 140         vmull.u8        lumi2, y2, coefY
 141         vmull.u8        lumi1, y1, coefY
 142         vadd.s16        chro_r, Rc, Q14
 143         vadd.s16        chro_b, Bc, Q13
 144         vsub.s16        chro_g, Gc, Q11
 145
 146         pld     [U]
 147         pld     [V]
 148
 149         /* chrominance + luminance */
 150         vqadd.s16       red16_2, lumi2, chro_r
 151         vqadd.s16       green16_2, lumi2, chro_g
 152         vqadd.s16       blue16_2, lumi2, chro_b
 153         vqadd.s16       red16_1, lumi1, chro_r
 154         vqadd.s16       green16_1, lumi1, chro_g
 155         vqadd.s16       blue16_1, lumi1, chro_b
 156
 157         /* clamp (divide by 64) */
 158         vqrshrun.s16    green2, green16_2, #6
 159         vqrshrun.s16    blue2, blue16_2, #6
 160         vqrshrun.s16    red2, red16_2, #6
 161         vqrshrun.s16    green1, green16_1, #6
 162         vqrshrun.s16    red1, red16_1, #6
 163         vqrshrun.s16    blue1, blue16_1, #6
 164
 165         pld     [Y1]
 166
 167         /* pack into RGB565 */
 168         vshl.u8 out2l, green2, #3 // low 2a
 169         vsri.u8 out2h, green2, #5 // high 2
 170         vshl.u8 out1l, green1, #3 // low 1a
 171         vsri.u8 out1h, green1, #5 // high 1
 172         vsri.u8 out2l, blue2, #3 // low 2b
 173         vsri.u8 out1l, blue1, #3 // low 1b
 174
 175         /* Y Bottom Row */
 176         vld2.u8 {y1,y2}, [Y2,:128]!
 177
 178         /* Top Row output */
 179         vzip.u8 out1h, out2h
 180         vmull.u8        lumi2, y2, coefY
 181         vzip.u8 out1l, out2l
 182         vmull.u8        lumi1, y1, coefY
 183         vst2.u8 {out1l, out1h}, [O1,:128]!
 184         vst2.u8 {out2l, out2h}, [O1,:128]!
 185
 186         /* chrominance + luminance */
 187         vqadd.s16       green16_2, lumi2, chro_g
 188         vqadd.s16       red16_2, lumi2, chro_r
 189         vqadd.s16       blue16_2, lumi2, chro_b
 190         vqadd.s16       red16_1, lumi1, chro_r
 191         vqadd.s16       green16_1, lumi1, chro_g
 192         vqadd.s16       blue16_1, lumi1, chro_b
 193
 194         /* clamp (divide by 64) */
 195         vqrshrun.s16    green2, green16_2, #6
 196         vqrshrun.s16    blue2, blue16_2, #6
 197         vqrshrun.s16    red2, red16_2, #6
 198         vqrshrun.s16    green1, green16_1, #6
 199         vqrshrun.s16    red1, red16_1, #6
 200         vqrshrun.s16    blue1, blue16_1, #6
 201
 202         pld     [Y1]
 203
 204         /* pack into RGB565 */
 205         vshl.u8 out2l, green2, #3 // low 2a
 206         vsri.u8 out2h, green2, #5 // high 2
 207         vshl.u8 out1l, green1, #3 // low 1a
 208         vsri.u8 out1h, green1, #5 // high 1
 209         vsri.u8 out2l, blue2, #3 // low 2b
 210         vsri.u8 out1l, blue1, #3 // low 1b
 211
 212         vzip.u8 out1h, out2h
 213         vzip.u8 out1l, out2l
 214         vst2.u8 {out1l, out1h}, [O2,:128]!
 215         vst2.u8 {out2l, out2h}, [O2,:128]!
 216
 217         /* next columns (x16) */
 218         subs    COUNT,  COUNT,  #16
 219         bgt             loop_col
 220
 221         /* next rows (x2) */
 222         subs    HEIGHT, #2
 223         add             O1,     O2,     OPAD
 224         add             Y1,     Y2,     YPAD
 225         add             U,      U,      YPAD,   lsr #1
 226         add             V,      V,      YPAD,   lsr #1
 227         b               loop_row