thread: use vlc_tick_from_timespec()
[vlc.git] / modules / arm_neon / simple_channel_mixer.S
blobcf9b1b531ef3ade2fd1e55e263e7df99eea08ba0
1  @*****************************************************************************
2  @ simple_channel_mixer.S : ARM NEON channel mixer
3  @*****************************************************************************
4  @ Copyright (C) 2012 David Geldreich <david.geldreich@free.fr>
5  @                    Sébastien Toque
6  @
7  @ This program is free software; you can redistribute it and/or modify it
8  @ under the terms of the GNU Lesser General Public License as published by
9  @ the Free Software Foundation; either version 2.1 of the License, or
10  @ (at your option) any later version.
11  @
12  @ This program is distributed in the hope that it will be useful,
13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  @ GNU Lesser General Public License for more details.
16  @
17  @ You should have received a copy of the GNU Lesser General Public License
18  @ along with this program; if not, write to the Free Software Foundation,
19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20  @****************************************************************************/
22 #include "asm.S"
24 #if HAVE_AS_FPU_DIRECTIVE
25         .fpu    neon
26 #endif
27         .text
28         .align 2
@ Register aliases shared by every routine in this file.
@ Comments are kept on their own lines: text placed after a macro body
@ would be pasted into every use of the macro by the preprocessor.
@   DST   : output pointer, written and advanced by each routine
@   SRC   : input pointer, read and advanced by each routine
@   NUM   : loop counter, decremented once per iteration
@   LFE   : compared with zero to choose a 4 or 8 byte advance of SRC
@   COEFF : receives the address of the coefficient table (via adr);
@           r4 is pushed on entry and popped on exit by every routine
@ NOTE(review): r0-r3 are presumably the first four C arguments under the
@ ARM procedure call standard - confirm against the C prototypes.
30 #define DST             r0
31 #define SRC             r1
32 #define NUM             r2
33 #define LFE             r3
34 #define COEFF   r4
@ convert_7_x_to_2_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ two floats to DST:
@   out[0] = 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
@   out[1] = 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
@ SRC advances by 7 floats when LFE is zero and by 8 floats otherwise
@ (the extra float is never read). DST advances by 2 floats.
@ Uses q0, q2 and q3 (d0-d1, d4-d7); r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded as a whole into q0.
36 coeff_7to2:
37         .float 0.5
38         .float 0.5
39         .float 0.25
40         .float 0.25
41 function convert_7_x_to_2_0_neon_asm
42         push {r4,lr}
44         adr COEFF, coeff_7to2
45         vld1.32 {q0},[COEFF]                          @ q0 = 0.5 0.5 0.25 0.25 (d1 = 0.25 0.25)
46 0:                                                @ loop head (numeric local label)
47         vld1.32 {q2},[SRC]!                           @ load 0,1,2,3
48         vmul.f32 q2,q2,q0                             @ 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
49         vld1.32 {d6},[SRC]!                           @ load 4,5
50         vmul.f32 d6,d6,d1                             @ 0.25*src[4] 0.25*src[5]
51         vadd.f32 d4,d4,d5                             @ 0.5*src[0] + 0.25*src[2]
52                                                   @ 0.5*src[1] + 0.25*src[3]
53         vadd.f32 d4,d4,d6                             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
54                                                   @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
55         flds s14,[SRC]                                @ load 6 (s14 is the low lane of d7)
56         vdup.32 d7,d7[0]                              @ broadcast src[6] to both lanes of d7
57         teq LFE,#0
58         ite eq
59         addeq SRC,SRC,#4                              @ LFE == 0: step over src[6] only
60         addne SRC,SRC,#8                              @ skip the lfe channel
61         vadd.f32 d4,d4,d7                             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
62                                                   @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
63         vst1.32 d4, [DST]!                            @ store out[0],out[1] and advance DST
64         subs NUM,NUM,#1
65         bne 0b                                        @ repeat until NUM reaches zero
67         pop {r4,pc}
@ convert_5_x_to_2_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ two floats to DST:
@   out[0] = 0.5*src[0] + 0.33*src[2] + src[4]
@   out[1] = 0.5*src[1] + 0.33*src[3] + src[4]
@ SRC advances by 5 floats when LFE is zero and by 6 floats otherwise
@ (the extra float is never read). DST advances by 2 floats.
@ Uses q0, q1 and d4; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded as a whole into q0.
70 coeff_5to2:
71         .float 0.5
72         .float 0.5
73         .float 0.33
74         .float 0.33
75 function convert_5_x_to_2_0_neon_asm
76         push {r4,lr}
78         adr COEFF, coeff_5to2
79         vld1.32 {q0},[COEFF]                          @ load constants
80 0:                                                @ loop head (numeric local label)
81         vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
82         flds s8,[SRC]                                 @ load 4 (s8 is the low lane of d4)
83         vdup.32 d4,d4[0]                              @ broadcast src[4] to both lanes of d4
84         teq LFE,#0
85         ite eq
86         addeq SRC,SRC,#4                              @ LFE == 0: step over src[4] only
87         addne SRC,SRC,#8                              @ skip the lfe channel
88         vmul.f32 q1,q1,q0                             @ 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
89         vadd.f32 d2,d2,d3                             @ 0.5*src[0] + 0.33*src[2]
90                                                   @ 0.5*src[1] + 0.33*src[3]
91         vadd.f32 d2,d2,d4                             @ 0.5*src[0] + 0.33*src[2] + src[4]
92                                                   @ 0.5*src[1] + 0.33*src[3] + src[4]
93         vst1.32 d2,[DST]!                             @ store out[0],out[1] and advance DST
94         subs NUM,NUM,#1
95         bne 0b                                        @ repeat until NUM reaches zero
97         pop {r4,pc}
@ convert_4_0_to_2_0_neon_asm
@ Per iteration reads four 32-bit floats from SRC and writes two to DST:
@   out[0] = 0.5*src[0] + src[3] + src[2]
@   out[1] = 0.5*src[1] + src[3] + src[2]
@ SRC advances by 4 floats and DST by 2 floats. LFE is not referenced.
@ Uses d0, q1 and d4; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded into d0.
100 coeff_4to2:
101         .float 0.5
102         .float 0.5
103 function convert_4_0_to_2_0_neon_asm
104         push {r4,lr}
106         adr COEFF, coeff_4to2
107         vld1.32 {d0},[COEFF]                          @ load constants
108 0:                                                @ loop head (numeric local label)
109         vld1.32 {q1},[SRC]!                           @ load 0,1 into d2 and 2,3 into d3
110         vmul.f32 d2,d2,d0                             @ 0.5*src[0] 0.5*src[1]
111         vdup.32 d4,d3[0]                              @ dup src[2] (must precede the overwrite of d3)
112         vdup.32 d3,d3[1]                              @ dup src[3]
113         vadd.f32 d2,d2,d3                             @ +src[3]
114         vadd.f32 d2,d2,d4                             @ +src[2]
115         vst1.32 d2,[DST]!                             @ store out[0],out[1] and advance DST
116         subs NUM,NUM,#1
117         bne 0b                                        @ repeat until NUM reaches zero
119         pop {r4,pc}
@ convert_3_x_to_2_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ two floats to DST:
@   out[0] = 0.5*src[0] + src[2]
@   out[1] = 0.5*src[1] + src[2]
@ SRC advances by 3 floats when LFE is zero and by 4 floats otherwise
@ (the extra float is never read). DST advances by 2 floats.
@ Uses d0, d1 and d2; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded into d0.
122 coeff_3to2:
123         .float 0.5
124         .float 0.5
125 function convert_3_x_to_2_0_neon_asm
126         push {r4,lr}
128         adr COEFF, coeff_3to2
129         vld1.32 {d0},[COEFF]                          @ load constants
130 0:                                                @ loop head (numeric local label)
131         vld1.32 {d1},[SRC]!                           @ load 0,1
132         flds s4,[SRC]                                 @ load 2 (s4 is the low lane of d2)
133         vdup.32 d2,d2[0]                              @ broadcast src[2] to both lanes of d2
134         teq LFE,#0
135         ite eq
136         addeq SRC,SRC,#4                              @ LFE == 0: step over src[2] only
137         addne SRC,SRC,#8                              @ skip the lfe channel
138         vmul.f32 d1,d1,d0                             @ 0.5*src[0] 0.5*src[1]
139         vadd.f32 d1,d1,d2                             @ 0.5*src[0] + src[2]
140                                                   @ 0.5*src[1] + src[2]
141         vst1.32 d1,[DST]!                             @ store out[0],out[1] and advance DST
142         subs NUM,NUM,#1
143         bne 0b                                        @ repeat until NUM reaches zero
145         pop {r4,pc}
@ convert_7_x_to_1_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ one float to DST:
@   out = 0.25*src[0] + 0.25*src[1]
@       + 0.125*src[2] + 0.125*src[3] + 0.125*src[4] + 0.125*src[5]
@       + src[6]
@ SRC advances by 7 floats when LFE is zero and by 8 floats otherwise
@ (the extra float is never read). DST advances by 1 float.
@ Uses q0, q1, d4 and s10; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded as a whole into q0.
148 coeff_7to1:
149         .float 0.25
150         .float 0.25
151         .float 0.125
152         .float 0.125
153 function convert_7_x_to_1_0_neon_asm
154         push {r4,lr}
156         adr COEFF, coeff_7to1
157         vld1.32 {q0},[COEFF]                          @ q0 = 0.25 0.25 0.125 0.125 (d1 = 0.125 0.125)
158 0:                                                @ loop head (numeric local label)
159         vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
160         vmul.f32 q1,q1,q0                             @ 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
161         vld1.32 {d4},[SRC]!                           @ load 4,5
162         vmul.f32 d4,d4,d1                             @ 0.125*src[4] 0.125*src[5]
163         vadd.f32 d2,d2,d3                             @ 0.25*src[0] + 0.125*src[2]
                                                  @ 0.25*src[1] + 0.125*src[3]
164         vadd.f32 d2,d2,d4                             @ ... + 0.125*src[4]
                                                  @ ... + 0.125*src[5]
165         flds s10,[SRC]                                @ load 6
166         teq LFE,#0
167         ite eq
168         addeq SRC,SRC,#4                              @ LFE == 0: step over src[6] only
169         addne SRC,SRC,#8                              @ skip the lfe channel
170         vadd.f32 s4,s4,s5                             @ fold the two lanes of d2 (s4,s5) into s4
171         vadd.f32 s4,s4,s10                            @ + src[6]
172         fsts s4,[DST]                                 @ store the single output sample
173         add DST,DST,#4
174         subs NUM,NUM,#1
175         bne 0b                                        @ repeat until NUM reaches zero
177         pop {r4,pc}
@ convert_5_x_to_1_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ one float to DST:
@   out = 0.25*src[0] + 0.25*src[1]
@       + 0.16666667*src[2] + 0.16666667*src[3]
@       + src[4]
@ SRC advances by 5 floats when LFE is zero and by 6 floats otherwise
@ (the extra float is never read). DST advances by 1 float.
@ Uses q0, q1 and s10; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded as a whole into q0.
180 coeff_5to1:
181         .float 0.25
182         .float 0.25
183         .float 0.16666667
184         .float 0.16666667
185 function convert_5_x_to_1_0_neon_asm
186         push {r4,lr}
188         adr COEFF, coeff_5to1
189         vld1.32 {q0},[COEFF]                          @ load constants
190 0:                                                @ loop head (numeric local label)
191         vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
192         vmul.f32 q1,q1,q0                             @ 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
193         vadd.f32 d2,d2,d3                             @ 0.25*src[0] + src[2]/6
                                                  @ 0.25*src[1] + src[3]/6
194         flds s10,[SRC]                                @ load 4
195         teq LFE,#0
196         ite eq
197         addeq SRC,SRC,#4                              @ LFE == 0: step over src[4] only
198         addne SRC,SRC,#8                              @ skip the lfe channel
199         vadd.f32 s4,s4,s5                             @ fold the two lanes of d2 (s4,s5) into s4
200         vadd.f32 s4,s4,s10                            @ + src[4]
201         fsts s4,[DST]                                 @ store the single output sample
202         add DST,DST,#4
203         subs NUM,NUM,#1
204         bne 0b                                        @ repeat until NUM reaches zero
206         pop {r4,pc}
@ convert_7_x_to_4_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ four floats to DST:
@   out[0] = src[6] + 0.5*src[0] + src[2]/6
@   out[1] = src[6] + 0.5*src[1] + src[3]/6
@   out[2] = src[4] + src[2]/6
@   out[3] = src[5] + src[3]/6
@ (the 1/6 factor is the table constant 0.16666667)
@ SRC advances by 7 floats when LFE is zero and by 8 floats otherwise
@ (the extra float is never read). DST advances by 4 floats.
@ Uses q0, q1, q2 and s14; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded as a whole into q0.
209 coeff_7to4:
210         .float 0.5
211         .float 0.5
212         .float 0.16666667
213         .float 0.16666667
214 function convert_7_x_to_4_0_neon_asm
215         push {r4,lr}
217         adr COEFF, coeff_7to4
218         vld1.32 {q0},[COEFF]                          @ load constants
219 0:                                                @ loop head (numeric local label)
220         vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
221         vmul.f32 q1,q1,q0                             @ 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
222         vld1.32 {d5},[SRC]!                           @ load 4,5 into the upper half of q2
223         flds s14,[SRC]                                @ load 6 (s14 is the low lane of d7)
224         vadd.f32 d2,d2,d3                             @ 0.5*src[0] + src[2]/6
225                                                   @ 0.5*src[1] + src[3]/6
                                                  @ d3 is left holding src[2]/6 src[3]/6
226         vdup.32 d4,d7[0]                              @ so q2 : src[6] src[6] src[4] src[5]
227         vadd.f32 q2,q2,q1                             @ src[6] + 0.5*src[0] + src[2]/6
228                                                   @ src[6] + 0.5*src[1] + src[3]/6
229                                                   @ src[4] + src[2]/6
230                                                   @ src[5] + src[3]/6
231         teq LFE,#0
232         ite eq
233         addeq SRC,SRC,#4                              @ LFE == 0: step over src[6] only
234         addne SRC,SRC,#8                              @ skip the lfe channel
235         vst1.32 {q2}, [DST]!                          @ store out[0..3] and advance DST
236         subs NUM,NUM,#1
237         bne 0b                                        @ repeat until NUM reaches zero
239         pop {r4,pc}
@ convert_5_x_to_4_0_neon_asm
@ Per iteration reads one input frame of 32-bit floats from SRC and writes
@ four floats to DST:
@   out[0] = 0.5*src[0] + src[4]
@   out[1] = 0.5*src[1] + src[4]
@   out[2] = src[2]
@   out[3] = src[3]
@ SRC advances by 5 floats when LFE is zero and by 6 floats otherwise
@ (the extra float is never read). DST advances by 4 floats.
@ Uses d0, q1 and d4; r4 is saved and restored.
@ NOTE(review): NUM is only tested after it has been decremented, so a
@ value of 0 on entry is not treated as "nothing to do" - confirm that
@ callers never pass 0.
@ Coefficient table, loaded into d0.
242 coeff_5to4:
243         .float 0.5
244         .float 0.5
245 function convert_5_x_to_4_0_neon_asm
246         push {r4,lr}
248         adr COEFF, coeff_5to4
249         vld1.32 {d0},[COEFF]                          @ load constants
250 0:                                                @ loop head (numeric local label)
251         vld1.32 {q1},[SRC]!                           @ load 0,1,2,3
252         vmul.f32 d2,d2,d0                             @ 0.5*src[0] 0.5*src[1]
253         flds s8,[SRC]                                 @ load 4 (s8 is the low lane of d4)
254         vdup.32 d4,d4[0]                              @ broadcast src[4] to both lanes of d4
255         vadd.f32 d2,d2,d4                             @ 0.5*src[0] + src[4]
256                                                   @ 0.5*src[1] + src[4]
257                                                   @ src[2]
258                                                   @ src[3]
259         teq LFE,#0
260         ite eq
261         addeq SRC,SRC,#4                              @ LFE == 0: step over src[4] only
262         addne SRC,SRC,#8                              @ skip the lfe channel
263         vst1.32 {q1}, [DST]!                          @ store out[0..3] and advance DST
264         subs NUM,NUM,#1
265         bne 0b                                        @ repeat until NUM reaches zero
267         pop {r4,pc}