modules/arm_neon/nv12_rgb.S

   1  @*****************************************************************************
   2  @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
   3  @*****************************************************************************
   4  @ Copyright (C) 2011 Sébastien Toque
   5  @                    Rémi Denis-Courmont
   6  @
   7  @ This program is free software; you can redistribute it and/or modify it
   8  @ under the terms of the GNU Lesser General Public License as published by
   9  @ the Free Software Foundation; either version 2.1 of the License, or
  10  @ (at your option) any later version.
  11  @
  12  @ This program is distributed in the hope that it will be useful,
  13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15  @ GNU Lesser General Public License for more details.
  16  @
  17  @ You should have received a copy of the GNU Lesser General Public License
  18  @ along with this program; if not, write to the Free Software Foundation,
  19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  20  @****************************************************************************/
  21
  22 #include "asm.S"
  23
  24         .syntax unified
             /* Unified syntax is what allows the conditional instructions
              * further down to be written with explicit it/itt prefixes. */
  25 #if HAVE_AS_FPU_DIRECTIVE
             /* NOTE(review): HAVE_AS_FPU_DIRECTIVE is not defined in this file;
              * presumably it comes from asm.S or the build configuration and
              * says whether the assembler accepts the .fpu directive - confirm. */
  26         .fpu    neon
  27 #endif
             /* Everything below, including the coefficients table, is emitted
              * into the code section. */
  28         .text
  29
  30 /* ARM */
     /*
      * Core-register roles inside nv12_rgb_neon:
      *   O1, O2          output write pointers for the first and second row
      *                   of the row pair being converted (O2 = O1 + OPITCH)
      *   WIDTH, HEIGHT   r2 and r3 are used as they are on entry; they are
      *                   never loaded from memory
      *   Y1, Y2          luma read pointers for the first and second row of
      *                   the pair (Y2 = Y1 + YPITCH)
      *   U               read pointer for the interleaved chroma bytes
      *   V               loaded from the input descriptor but not referenced
      *                   afterwards
      *   YPITCH, OPITCH  byte strides of the input and output rows
      *   OPAD, YPAD      bytes to skip after the last column of a row;
      *                   OPAD is also used as scratch during setup
      *   COUNT           columns left in the current row pair
      * OPITCH lives in lr, which the routine pushes on entry.
      * r9 is not used by the routine.
      */
  31 #define O1      r0
  32 #define O2      r1
  33 #define WIDTH   r2
  34 #define HEIGHT  r3
  35 #define Y1      r4
  36 #define Y2      r5
  37 #define U       r6
  38 #define V       r7
  39 #define YPITCH  r8
  40 #define OPAD    r10
  41 #define YPAD    r11
  42 #define COUNT   ip
  43 #define OPITCH  lr
  44
  45 /* NEON */
     /*
      * Loop-invariant constants.  coefY..coefBU are 8-bit multipliers set
      * with vmov; Rc, Gc and Bc are 16-bit values broadcast from the
      * coefficients table.  Gc and Bc (Q4, Q5) fall inside the q4-q7 range
      * that the routine saves with vpush.
      */
  46 #define coefY   D0
  47 #define coefRV  D1
  48 #define coefGU  D2
  49 #define coefGV  D3
  50 #define coefBU  D4
  51 #define Rc      Q3
  52 #define Gc      Q4
  53 #define Bc      Q5
  54
     /*
      * Raw input bytes.  These share physical registers with the packed
      * results further down: u/v with red1/green1 (D24/D25) and y1/y2 with
      * red2/green2 (D28/D29).  The routine reads each input before the
      * result that overlaps it is written.
      */
  55 #define u       D24
  56 #define v       D25
  57 #define y1      D28
  58 #define y2      D29
  59
     /*
      * 16-bit intermediates.  chro_r and chro_g (Q6, Q7) are also inside the
      * saved q4-q7 range.  lumi is Q15, i.e. D30/D31, so every write to lumi
      * overwrites blue2 and alpha2.
      */
  60 #define chro_r  Q6
  61 #define chro_g  Q7
  62 #define chro_b  Q8
  63 #define red             Q9
  64 #define green   Q10
  65 #define blue    Q11
  66 #define lumi    Q15
  67
     /*
      * Packed 8-bit results.  D24-D27 and D28-D31 are each four consecutive
      * D registers, the layout used by the two vst4.u8 stores per row.
      */
  68 #define red1    D24
  69 #define green1  D25
  70 #define blue1   D26
  71 #define alpha1  D27
  72 #define red2    D28
  73 #define green2  D29
  74 #define blue2   D30
  75 #define alpha2  D31
  76
  77 coefficients:
     /*
      * Three signed 16-bit constants.  nv12_rgb_neon takes the address of
      * this table PC-relative with adr and reads the entries in order with
      * post-incrementing vld1.s16 loads: the first is broadcast into Rc, the
      * second into Gc, the third into Bc.  Order and element size must
      * therefore stay as they are.
      *
      * Numerically, with the 8-bit multipliers set in the routine:
      *   -15872 = -(16*74) - 128*115      + 32
      *     4992 = -(16*74) + 128*(14+34)  + 32
      *   -18432 = -(16*74) - 128*135      + 32
      * NOTE(review): this reads as the luma offset of 16 and the chroma
      * offset of 128 folded into one constant per channel at the 1/64
      * scale, plus 32 - confirm the intent of the +32 term.
      */
  78     .short  -15872
  79     .short    4992
  80     .short  -18432
  81
  82         .align 2
     /*
      * nv12_rgb_neon
      *
      * Converts two-plane YUV input (a luma plane plus interleaved chroma
      * bytes read through the U pointer) into pixels of four bytes stored in
      * memory order red, green, blue, alpha, with alpha fixed at 255.
      *
      * Registers on entry, as the code below consumes them:
      *   r0  address of two consecutive 32-bit words:
      *       output pointer, output pitch
      *   r1  address of four consecutive 32-bit words:
      *       Y pointer, U pointer, V pointer, Y pitch
      *   r2  WIDTH in pixels
      *   r3  HEIGHT in rows
      * No result value is set up.  r0-r3 and ip are modified; r4-r8, r10,
      * r11 and q4-q7 are saved on entry and restored on exit.
      *
      * Work is done in tiles of 16 pixels by 2 rows:
      *   - WIDTH is rounded up to a multiple of 16, so up to 15 pixels past
      *     the requested width are read and written on every row;
      *   - rows are handled in pairs, so an odd HEIGHT still converts a full
      *     pair on the last pass.
      * Every vector load and store carries a :128 qualifier, i.e. the Y, U
      * and output addresses are expected to be 16-byte aligned at every
      * access.
      * The chroma pointer moves on by YPITCH per row pair, so the chroma
      * data is walked with the luma pitch.
      *
      * NOTE(review): "function" is not defined in this file; presumably it
      * is a macro from asm.S that declares the symbol - confirm there.
      */
  83 function nv12_rgb_neon
             /* lr is saved because it doubles as OPITCH; the matching pop
              * loads the saved value straight into pc to return. */
  84         push            {r4-r8,r10-r11,lr}
  85         vpush           {q4-q7}
  86
  87         /* load arguments */
             /* O1 is r0 itself: this load replaces the descriptor address in
              * r0 with the output pointer read from it. */
  88         ldmia           r0,     {O1, OPITCH}
             /* V is the third word of the input descriptor; it is loaded
              * here but not referenced again in this routine. */
  89         ldmia           r1,     {Y1, U, V, YPITCH}
  90
  91         /* round the width to be a multiple of 16 */
             /* OPAD is only scratch here (WIDTH mod 16).  The sub has no S
              * suffix, so the flags written by ands are still the ones
              * tested by addne. */
  92         ands            OPAD, WIDTH, #15
  93         sub                     WIDTH, WIDTH, OPAD
  94         it              ne
  95         addne           WIDTH, WIDTH, #16
  96
  97         /* init constants (scale value by 64) */
             /* Five 8-bit multipliers, each replicated across a D register. */
  98         vmov.u8         coefY, #74
  99         vmov.u8         coefRV, #115
 100         vmov.u8         coefGU, #14
 101         vmov.u8         coefGV, #34
 102         vmov.u8         coefBU, #135
             /* Rc (d6/d7), Gc (d8/d9), Bc (d10/d11): each load replicates one
              * 16-bit table entry to all lanes and post-increments OPAD
              * (scratch again) to the next entry. */
 103         adr                     OPAD, coefficients
 104         vld1.s16        {d6[], d7[]}, [OPAD]!
 105         vld1.s16        {d8[], d9[]}, [OPAD]!
 106         vld1.s16        {d10[], d11[]}, [OPAD]!
             /* alpha1 (D27) is not written anywhere inside the loops, so it
              * is set once here. */
 107         vmov.u8         alpha1, #255
 108
 109         /* init padding */
             /* cmp produces the flags tested on the first pass through
              * loop_row; the two sub instructions below leave them alone.
              * OPAD = output bytes to skip after a row (4 bytes per pixel),
              * YPAD = input bytes to skip after a row. */
 110         cmp                     HEIGHT, #0
 111         sub                     OPAD,   OPITCH, WIDTH, lsl #2
 112         sub                     YPAD,   YPITCH, WIDTH
 113
 114 loop_row:
             /* Flags on arrival come from cmp HEIGHT, #0 (first pass) or from
              * subs HEIGHT, #2 (later passes).  While rows remain, movs
              * reloads the column counter and rewrites N and Z from WIDTH,
              * so the le exit below is taken either when no rows remain or
              * when WIDTH is zero or negative. */
 115         it      gt
 116         movsgt  COUNT,  WIDTH
             /* Pointers for the second row of the pair; add without S keeps
              * the flags for the itt below. */
 117         add             O2,     O1,     OPITCH
 118         add             Y2,     Y1,     YPITCH
 119         /* exit if all rows have been processed */
 120         itt     le
 121         vpople  {q4-q7}
 122         pople   {r4-r8,r10-r11,pc}
 123
 124 loop_col:
 125
 126         /* Common U & V */
 127
             /* De-interleaving load of 16 bytes: u receives the 8 bytes at
              * even offsets, v the 8 bytes at odd offsets.  The chroma terms
              * derived from them are reused for both rows of the pair, and
              * lane i is applied to lane i of both y1 and y2. */
 128         vld2.u8 {u,v}, [U,:128]!
 129
             /* Unsigned 8-bit by 8-bit products, widened to 16 bits. */
 130         vmull.u8        chro_r, v, coefRV
 131         vmull.u8        chro_g, u, coefGU
 132         vmlal.u8        chro_g, v, coefGV
 133         vmull.u8        chro_b, u, coefBU
 134
             /* chro_r = Rc + v*coefRV
              * chro_g = Gc - (u*coefGU + v*coefGV)
              * chro_b = Bc + u*coefBU
              * These are modular, not saturating: u*135 can reach 34425,
              * above the signed 16-bit maximum, and adding Bc (-18432)
              * wraps the lane back into signed range. */
 135         vadd.s16        chro_r, Rc, chro_r
 136         vsub.s16        chro_g, Gc, chro_g
 137         vadd.s16        chro_b, Bc, chro_b
 138
             /* U already points at the chroma bytes of the next step. */
 139         pld     [U]
 140
 141         /* Y Top Row */
             /* De-interleaving load of 16 luma bytes: y1 = pixels 0,2,..,14
              * and y2 = pixels 1,3,..,15 of this 16-pixel step. */
 142         vld2.u8 {y1,y2}, [Y1,:128]!
 143
 144         /* y1 : chrominance + luminance, then clamp (divide by 64) */
             /* vqadd saturates the signed 16-bit sums; vqrshrun then shifts
              * right by 6 with rounding and saturates to unsigned 8 bits.
              * Writing red1/green1 here overwrites u/v, which are no longer
              * needed once chro_r/chro_g/chro_b exist. */
 145         vmull.u8        lumi, y1, coefY
 146         vqadd.s16       red, lumi, chro_r
 147         vqadd.s16       green, lumi, chro_g
 148         vqadd.s16       blue, lumi, chro_b
 149         vqrshrun.s16    red1, red, #6
 150         vqrshrun.s16    green1, green, #6
 151         vqrshrun.s16    blue1, blue, #6
 152
 153         /* y2 : chrominance + luminance, then clamp (divide by 64) */
             /* y2 (D29) is consumed by the vmull before red2/green2, which
              * share D28/D29 with y1/y2, are written. */
 154         vmull.u8        lumi, y2, coefY
 155         vqadd.s16       red, lumi, chro_r
 156         vqadd.s16       green, lumi, chro_g
 157         vqadd.s16       blue, lumi, chro_b
 158         vqrshrun.s16    red2, red, #6
 159         vqrshrun.s16    green2, green, #6
 160         vqrshrun.s16    blue2, blue, #6
 161
 162         pld     [Y1]
 163
             /* alpha2 is D31, the upper half of lumi, so it has to be set
              * again after every use of lumi. */
 164         vmov.u8 alpha2, #255
             /* Re-interleave the even and odd results: afterwards
              * red1/green1/blue1 hold pixels 0-7 and red2/green2/blue2 hold
              * pixels 8-15, in order. */
 165         vzip.u8 red1, red2
 166         vzip.u8 green1, green2
 167         vzip.u8 blue1, blue2
 168
             /* Interleaving stores: 8 pixels of R,G,B,A bytes (32 bytes) per
              * instruction, 64 bytes per step. */
 169         vst4.u8         {red1,green1,blue1,alpha1}, [O1,:128]!
 170         vst4.u8         {red2,green2,blue2,alpha2}, [O1,:128]!
 171
 172         /* Y Bottom Row */
             /* Same sequence as the top row, reading through Y2, writing
              * through O2 and reusing the chroma terms computed above. */
 173         vld2.u8 {y1,y2}, [Y2,:128]!
 174
 175         /* y1 : chrominance + luminance, then clamp (divide by 64) */
 176         vmull.u8        lumi, y1, coefY
 177         vqadd.s16       red, lumi, chro_r
 178         vqadd.s16       green, lumi, chro_g
 179         vqadd.s16       blue, lumi, chro_b
 180         vqrshrun.s16    red1, red, #6
 181         vqrshrun.s16    green1, green, #6
 182         vqrshrun.s16    blue1, blue, #6
 183
 184         /* y2 : chrominance + luminance, then clamp (divide by 64) */
 185         vmull.u8        lumi, y2, coefY
 186         vqadd.s16       red, lumi, chro_r
 187         vqadd.s16       green, lumi, chro_g
 188         vqadd.s16       blue, lumi, chro_b
 189         vqrshrun.s16    red2, red, #6
 190         vqrshrun.s16    green2, green, #6
 191         vqrshrun.s16    blue2, blue, #6
 192
 193         pld     [Y2]
 194
             /* lumi was written again above, so alpha2 needs resetting. */
 195         vmov.u8 alpha2, #255
 196         vzip.u8 red1, red2
 197         vzip.u8 green1, green2
 198         vzip.u8 blue1, blue2
 199
 200         vst4.u8         {red1,green1,blue1,alpha1}, [O2,:128]!
 201         vst4.u8         {red2,green2,blue2,alpha2}, [O2,:128]!
 202
 203         /* next columns (x16) */
 204         subs    COUNT,  COUNT,  #16
 205         bgt             loop_col
 206
 207         /* next rows (x2) */
             /* subs produces the flags consumed at loop_row; the adds below
              * have no S suffix and keep them.  O2 and Y2 sit at the end of
              * the second row, so adding the padding lands on the start of
              * the row after it.  U has advanced by WIDTH bytes, so adding
              * YPAD moves it on by YPITCH in total. */
 208         subs    HEIGHT, #2
 209         add             O1,     O2,     OPAD
 210         add             Y1,     Y2,     YPAD
 211         add             U,      U,      YPAD
 212         b               loop_row