modules/arm_neon/i420_rgb.S

   1  @*****************************************************************************
   2  @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
   3  @*****************************************************************************
   4  @ Copyright (C) 2011 Sébastien Toque
   5  @                    Rémi Denis-Courmont
   6  @
   7  @ This program is free software; you can redistribute it and/or modify it
   8  @ under the terms of the GNU Lesser General Public License as published by
   9  @ the Free Software Foundation; either version 2.1 of the License, or
  10  @ (at your option) any later version.
  11  @
  12  @ This program is distributed in the hope that it will be useful,
  13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  @ GNU Lesser General Public License for more details.
  16  @
  17  @ You should have received a copy of the GNU Lesser General Public License
  18  @ along with this program; if not, write to the Free Software Foundation,
  19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  20  @****************************************************************************/
  21
  22         .syntax unified
  23         .fpu neon
  24         .text
  25
  26 /* ARM */
  27 #define O1      r0
  28 #define O2      r1
  29 #define WIDTH   r2
  30 #define HEIGHT  r3
  31 #define Y1      r4
  32 #define Y2      r5
  33 #define U       r6
  34 #define V       r7
  35 #define YPITCH  r8
  36 #define OPAD    r10
  37 #define YPAD    r11
  38 #define COUNT   ip
  39 #define OPITCH  lr
  40
  41 /* NEON */
  42 #define coefY   D0
  43 #define coefRV  D1
  44 #define coefGU  D2
  45 #define coefGV  D3
  46 #define coefBU  D4
  47 #define Rc      Q3
  48 #define Gc      Q4
  49 #define Bc      Q5
  50
  51 #define u       D24
  52 #define v       D25
  53 #define y1      D18
  54 #define y2      D19
  55
  56 #define chro_r  Q6
  57 #define chro_g  Q7
  58 #define chro_b  Q8
  59 #define lumi1   Q15
  60 #define lumi2   Q10
  61 #define red16_1         Q9
  62 #define green16_1       Q10
  63 #define blue16_1        Q11
  64 #define red16_2         Q12
  65 #define green16_2       Q13
  66 #define blue16_2        Q14
  67
  68 #define red1    D24
  69 #define green1  D25
  70 #define blue1   D26
  71 #define alpha1  D27
  72 #define red2    D28
  73 #define green2  D29
  74 #define blue2   D30
  75 #define alpha2  D31
  76
  77 coefficients:
  78     .short  -15872
  79     .short    4992
  80     .short  -18432
  81
  82         .align 2
  83         .global i420_rgb_neon
  84         .type   i420_rgb_neon, %function
  85 i420_rgb_neon:
  86         push            {r4-r8,r10-r11,lr}
  87         vpush           {q4-q7}
  88
  89         /* load arguments */
  90         ldmia           r0,     {O1, OPITCH}
  91         ldmia           r1,     {Y1, U, V, YPITCH}
  92
  93         /* round the width to be a multiple of 16 */
  94         ands            OPAD, WIDTH, #15
  95         sub                     WIDTH, WIDTH, OPAD
  96         addne           WIDTH, WIDTH, #16
  97
  98         /* init constants (scale value by 64) */
  99         vmov.u8         coefY, #74
 100         vmov.u8         coefRV, #115
 101         vmov.u8         coefGU, #14
 102         vmov.u8         coefGV, #34
 103         vmov.u8         coefBU, #135
 104         adr                     OPAD, coefficients
 105         vld1.s16        {d6[], d7[]}, [OPAD]!
 106         vld1.s16        {d8[], d9[]}, [OPAD]!
 107         vld1.s16        {d10[], d11[]}, [OPAD]!
 108         vmov.u8         alpha1, #255
 109
 110         /* init padding */
 111         cmp                     HEIGHT, #0
 112         sub                     OPAD,   OPITCH, WIDTH, lsl #2
 113         sub                     YPAD,   YPITCH, WIDTH
 114
 115 loop_row:
 116         movsgt  COUNT,  WIDTH
 117         add             O2,     O1,     OPITCH
 118         add             Y2,     Y1,     YPITCH
 119         /* exit if all rows have been processed */
 120         vpople  {q4-q7}
 121         pople   {r4-r8,r10-r11,pc}
 122
 123 loop_col:
 124
 125         /* Common U & V */
 126
 127         vld1.u8 {u}, [U,:64]!
 128         vld1.u8 {v}, [V,:64]!
 129
 130         /* Y Top Row */
 131         vld2.u8 {y1,y2}, [Y1,:128]!
 132
 133         vmull.u8        Q14, v, coefRV
 134         vmull.u8        Q11, u, coefGU
 135         vmull.u8        Q13, u, coefBU
 136         vmlal.u8        Q11, v, coefGV
 137
 138         vmull.u8        lumi2, y2, coefY
 139         vmull.u8        lumi1, y1, coefY
 140         vadd.s16        chro_r, Rc, Q14
 141         vadd.s16        chro_b, Bc, Q13
 142         vsub.s16        chro_g, Gc, Q11
 143
 144         pld     [U]
 145         pld     [V]
 146
 147         /* chrominance + luminance */
 148         vqadd.s16       red16_2, lumi2, chro_r
 149         vqadd.s16       blue16_2, lumi2, chro_b
 150         vqadd.s16       green16_2, lumi2, chro_g
 151         vqadd.s16       red16_1, lumi1, chro_r
 152         vqadd.s16       green16_1, lumi1, chro_g
 153         vqadd.s16       blue16_1, lumi1, chro_b
 154
 155         /* clamp (divide by 64) */
 156         vqrshrun.s16    blue2, blue16_2, #6
 157         vqrshrun.s16    red2, red16_2, #6
 158         vqrshrun.s16    green2, green16_2, #6
 159         vqrshrun.s16    red1, red16_1, #6
 160         vqrshrun.s16    green1, green16_1, #6
 161         vqrshrun.s16    blue1, blue16_1, #6
 162
 163         pld     [Y1]
 164
 165         /* Y Bottom Row */
 166         vld2.u8 {y1,y2}, [Y2,:128]!
 167
 168         vmov.u8 alpha1, #255
 169         vzip.u8 red1, red2
 170         vzip.u8 green1, green2
 171         vzip.u8 blue1, blue2
 172
 173         vmull.u8        lumi2, y2, coefY
 174         vst4.u8         {red1,green1,blue1,alpha1}, [O1,:128]!
 175         vst4.u8         {red2,green2,blue2,alpha2}, [O1,:128]!
 176
 177         /* chrominance + luminance */
 178         vmull.u8        lumi1, y1, coefY
 179         vqadd.s16       red16_2, lumi2, chro_r
 180         vqadd.s16       green16_2, lumi2, chro_g
 181         vqadd.s16       blue16_2, lumi2, chro_b
 182         vqadd.s16       red16_1, lumi1, chro_r
 183         vqadd.s16       green16_1, lumi1, chro_g
 184         vqadd.s16       blue16_1, lumi1, chro_b
 185
 186         /* clamp (divide by 64) */
 187         vqrshrun.s16    blue2, blue16_2, #6
 188         vqrshrun.s16    red2, red16_2, #6
 189         vqrshrun.s16    green2, green16_2, #6
 190         vqrshrun.s16    red1, red16_1, #6
 191         vqrshrun.s16    green1, green16_1, #6
 192         vqrshrun.s16    blue1, blue16_1, #6
 193
 194         pld     [Y2]
 195
 196         vmov.u8 alpha2, #255
 197         vzip.u8 red1, red2
 198         vzip.u8 green1, green2
 199         vzip.u8 blue1, blue2
 200
 201         vst4.u8         {red1,green1,blue1,alpha1}, [O2,:128]!
 202         vst4.u8         {red2,green2,blue2,alpha2}, [O2,:128]!
 203
 204         /* next columns (x16) */
 205         subs    COUNT,  COUNT,  #16
 206         bgt             loop_col
 207
 208         /* next rows (x2) */
 209         subs    HEIGHT, #2
 210         add             O1,     O2,     OPAD
 211         add             Y1,     Y2,     YPAD
 212         add             U,      U,      YPAD,   lsr #1
 213         add             V,      V,      YPAD,   lsr #1
 214         b               loop_row