1 @*****************************************************************************
2 @ simple_channel_mixer.S : ARM NEON channel mixer
3 @*****************************************************************************
4 @ Copyright (C) 2012 David Geldreich <david.geldreich@free.fr>
7 @ This program is free software; you can redistribute it and/or modify it
8 @ under the terms of the GNU Lesser General Public License as published by
9 @ the Free Software Foundation; either version 2.1 of the License, or
10 @ (at your option) any later version.
12 @ This program is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 @ GNU Lesser General Public License for more details.
17 @ You should have received a copy of the GNU Lesser General Public License
18 @ along with this program; if not, write to the Free Software Foundation,
19 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20 @****************************************************************************/
24 #if HAVE_AS_FPU_DIRECTIVE
41 function convert_7_x_to_2_0_neon_asm
@ Downmix step for 7.x input to 2 output channels. Per the inline comments
@ the two results accumulated in d4 are:
@   0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
@   0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
@ NOTE(review): this listing has gaps. The load of the coefficients into
@ q0, the instruction that sets the flags tested by "addne", whatever puts
@ src[6] into both lanes of d7, the store of d4 and the loop/return are not
@ visible here; confirm them against the complete file.
47 vld1.32 {q2},[SRC]! @ load 0,1,2,3 into q2 (d4:d5); SRC += 16 bytes
48 vmul.f32 q2,q2,q0 @ 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
49 vld1.32 {d6},[SRC]! @ load 4,5; SRC += 8 bytes
50 vmul.f32 d6,d6,d1 @ 0.25*src[4] 0.25*src[5] (d1 is the upper half of q0)
51 vadd.f32 d4,d4,d5 @ 0.5*src[0] + 0.25*src[2]
52 @ 0.5*src[1] + 0.25*src[3]
53 vadd.f32 d4,d4,d6 @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
54 @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
55 flds s14,[SRC] @ load 6 into s14 (lane 0 of d7); SRC is not advanced
60 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[6] and the lfe sample
61 vadd.f32 d4,d4,d7 @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
62 @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
@ (the second line assumes lane 1 of d7 also holds src[6] -- TODO confirm)
75 function convert_5_x_to_2_0_neon_asm
@ Downmix step for 5.x input to 2 output channels. Per the inline comments
@ the two results accumulated in d2 are:
@   0.5*src[0] + 0.33*src[2] + src[4]
@   0.5*src[1] + 0.33*src[3] + src[4]
@ NOTE(review): this listing has gaps. The setup of COEFF, the instruction
@ that sets the flags tested by "addne", whatever puts src[4] into both
@ lanes of d4, the store of d2 and the loop/return are not visible here;
@ confirm them against the complete file.
79 vld1.32 {q0},[COEFF] @ load constants (four 32-bit values into q0)
81 vld1.32 {q1},[SRC]! @ load 0,1,2,3 into q1 (d2:d3); SRC += 16 bytes
82 flds s8,[SRC] @ load 4 into s8 (lane 0 of d4); SRC is not advanced
87 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[4] and the lfe sample
88 vmul.f32 q1,q1,q0 @ 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
89 vadd.f32 d2,d2,d3 @ 0.5*src[0] + 0.33*src[2]
90 @ 0.5*src[1] + 0.33*src[3]
91 vadd.f32 d2,d2,d4 @ 0.5*src[0] + 0.33*src[2] + src[4]
92 @ 0.5*src[1] + 0.33*src[3] + src[4]
@ (the second line assumes lane 1 of d4 also holds src[4] -- TODO confirm)
103 function convert_4_0_to_2_0_neon_asm
@ Downmix step for 4.0 input to 2 output channels. Following the inline
@ comments, d2 ends up holding:
@   0.5*src[0] + src[2] + src[3]
@   0.5*src[1] + src[2] + src[3]
@ NOTE(review): this listing has gaps. The load that fills d2/d3 with
@ src[0..3], the store of d2 and the loop/return are not visible here;
@ confirm them against the complete file.
106 adr COEFF, coeff_4to2 @ COEFF = address of the coeff_4to2 table (defined outside this excerpt)
107 vld1.32 {d0},[COEFF] @ load constants (two 32-bit values into d0)
110 vmul.f32 d2,d2,d0 @ 0.5*src[0] 0.5*src[1]
111 vdup.32 d4,d3[0] @ dup src[2]; must read d3 before the next line overwrites it
112 vdup.32 d3,d3[1] @ dup src[3] into both lanes of d3
113 vadd.f32 d2,d2,d3 @ +src[3]
114 vadd.f32 d2,d2,d4 @ +src[2]
125 function convert_3_x_to_2_0_neon_asm
@ Downmix step for 3.x input to 2 output channels. Per the inline comments
@ the two results accumulated in d1 are:
@   0.5*src[0] + src[2]
@   0.5*src[1] + src[2]
@ NOTE(review): this listing has gaps. The instruction that sets the flags
@ tested by "addne", whatever puts src[2] into both lanes of d2, the store
@ of d1 and the loop/return are not visible here; confirm them against the
@ complete file.
128 adr COEFF, coeff_3to2 @ COEFF = address of the coeff_3to2 table (defined outside this excerpt)
129 vld1.32 {d0},[COEFF] @ load constants (two 32-bit values into d0)
131 vld1.32 {d1},[SRC]! @ load 0,1; SRC += 8 bytes
132 flds s4,[SRC] @ load 2 into s4 (lane 0 of d2); SRC is not advanced
137 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[2] and the lfe sample
138 vmul.f32 d1,d1,d0 @ 0.5*src[0] 0.5*src[1]
139 vadd.f32 d1,d1,d2 @ 0.5*src[0] + src[2]
140 @ 0.5*src[1] + src[2]
@ (the second line assumes lane 1 of d2 also holds src[2] -- TODO confirm)
153 function convert_7_x_to_1_0_neon_asm
@ Downmix step for 7.x input; the function name indicates a single output
@ channel. The visible lines only scale src[0..5] and fetch src[6].
@ NOTE(review): this listing has gaps. The load of the coefficients into
@ q0, the instruction that sets the flags tested by "addne", the summation
@ of the scaled values into one sample, the store and the loop/return are
@ not visible here; confirm them against the complete file.
156 adr COEFF, coeff_7to1 @ COEFF = address of the coeff_7to1 table (defined outside this excerpt)
159 vld1.32 {q1},[SRC]! @ load 0,1,2,3 into q1 (d2:d3); SRC += 16 bytes
160 vmul.f32 q1,q1,q0 @ 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
161 vld1.32 {d4},[SRC]! @ load 4,5; SRC += 8 bytes
162 vmul.f32 d4,d4,d1 @ 0.125*src[4] 0.125*src[5] (d1 is the upper half of q0)
165 flds s10,[SRC] @ load 6 into s10 (lane 0 of d5); SRC is not advanced
169 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[6] and the lfe sample
185 function convert_5_x_to_1_0_neon_asm
@ Downmix step for 5.x input; the function name indicates a single output
@ channel. The visible lines only scale src[0..3] and fetch src[4].
@ NOTE(review): this listing has gaps. The load of the coefficients into
@ q0, the instruction that sets the flags tested by "addne", the summation
@ of the scaled values into one sample, the store and the loop/return are
@ not visible here; confirm them against the complete file.
188 adr COEFF, coeff_5to1 @ COEFF = address of the coeff_5to1 table (defined outside this excerpt)
191 vld1.32 {q1},[SRC]! @ load 0,1,2,3 into q1 (d2:d3); SRC += 16 bytes
192 vmul.f32 q1,q1,q0 @ 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
194 flds s10,[SRC] @ load 4 into s10 (lane 0 of d5); SRC is not advanced
198 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[4] and the lfe sample
214 function convert_7_x_to_4_0_neon_asm
@ Downmix step for 7.x input to 4 output channels. From the visible
@ instructions q2 ends up holding:
@   src[6] + 0.5*src[0] + src[2]/6
@   src[6] + 0.5*src[1] + src[3]/6
@   src[4] + src[2]/6
@   src[5] + src[3]/6
@ NOTE(review): this listing has gaps. The load of the coefficients into
@ q0, the instruction that sets the flags tested by "addne", the store of
@ q2 and the loop/return are not visible here, and the result above assumes
@ no hidden instruction changes d3 or q2 in between; confirm against the
@ complete file.
217 adr COEFF, coeff_7to4 @ COEFF = address of the coeff_7to4 table (defined outside this excerpt)
220 vld1.32 {q1},[SRC]! @ load 0,1,2,3 into q1 (d2:d3); SRC += 16 bytes
221 vmul.f32 q1,q1,q0 @ 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
222 vld1.32 {d5},[SRC]! @ load 4,5 into the upper half of q2; SRC += 8 bytes
223 flds s14,[SRC] @ load 6 into s14 (lane 0 of d7); SRC is not advanced
224 vadd.f32 d2,d2,d3 @ 0.5*src[0] + src[2]/6
225 @ 0.5*src[1] + src[3]/6
226 vdup.32 d4,d7[0] @ so q2 : src[6] src[6] src[4] src[5]
227 vadd.f32 q2,q2,q1 @ src[6] + 0.5*src[0] + src[2]/6
228 @ src[6] + 0.5*src[1] + src[3]/6
@ upper half (d5 + d3): src[4] + src[2]/6, src[5] + src[3]/6
234 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[6] and the lfe sample
245 function convert_5_x_to_4_0_neon_asm
@ Downmix step for 5.x input; the function name indicates 4 output
@ channels. Per the inline comments the visible lines form, in d2:
@   0.5*src[0] + src[4]
@   0.5*src[1] + src[4]
@ d3 (src[2], src[3] as loaded) is not modified by any visible line.
@ NOTE(review): this listing has gaps and ends before the function does.
@ The load of the coefficients into d0, the instruction that sets the
@ flags tested by "addne", whatever puts src[4] into both lanes of d4, the
@ store and the loop/return are not visible here; confirm them against the
@ complete file.
248 adr COEFF, coeff_5to4 @ COEFF = address of the coeff_5to4 table (defined outside this excerpt)
251 vld1.32 {q1},[SRC]! @ load 0,1,2,3 into q1 (d2:d3); SRC += 16 bytes
252 vmul.f32 d2,d2,d0 @ 0.5*src[0] 0.5*src[1]
253 flds s8,[SRC] @ load 4 into s8 (lane 0 of d4); SRC is not advanced
255 vadd.f32 d2,d2,d4 @ 0.5*src[0] + src[4]
256 @ 0.5*src[1] + src[4]
@ (the second line assumes lane 1 of d4 also holds src[4] -- TODO confirm)
262 addne SRC,SRC,#8 @ skip the lfe channel: on NE, step 8 bytes over src[4] and the lfe sample