display: store the actually requested zoom
[vlc.git] / modules / arm_neon / simple_channel_mixer.S
blob a94ae8539fc1b39e48c4359b7fdf85ded6a7453c
1  @*****************************************************************************
2  @ simple_channel_mixer.S : ARM NEON channel mixer
3  @*****************************************************************************
4  @ Copyright (C) 2012 David Geldreich <david.geldreich@free.fr>
5  @                    Sébastien Toque
6  @
7  @ This program is free software; you can redistribute it and/or modify it
8  @ under the terms of the GNU Lesser General Public License as published by
9  @ the Free Software Foundation; either version 2.1 of the License, or
10  @ (at your option) any later version.
11  @
12  @ This program is distributed in the hope that it will be useful,
13  @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14  @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  @ GNU Lesser General Public License for more details.
16  @
17  @ You should have received a copy of the GNU Lesser General Public License
18  @ along with this program; if not, write to the Free Software Foundation,
19  @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
20  @****************************************************************************/
22         .fpu neon
23         .text
24         .align 2
26 #define DST             r0
27 #define SRC             r1
28 #define NUM             r2
29 #define LFE             r3
30 #define COEFF   r4
@-----------------------------------------------------------------------------
@ convert_7_x_to_2_0_neon_asm
@   Mixes frames of 7 floats (plus one optional extra float that is skipped)
@   down to frames of 2 floats:
@       dst[0] = 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
@       dst[1] = 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, q0, q2, q3, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_7to2:
        .float  0.5
        .float  0.5
        .float  0.25
        .float  0.25
        .global convert_7_x_to_2_0_neon_asm
        .type   convert_7_x_to_2_0_neon_asm, %function
convert_7_x_to_2_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_7to2
        vld1.32 {q0}, [COEFF]           @ q0 = 0.5 0.5 0.25 0.25
0:                                      @ one frame per pass
        vld1.32 {q2}, [SRC]!            @ load 0,1,2,3
        vmul.f32 q2, q2, q0             @ 0.5*src[0] 0.5*src[1] 0.25*src[2] 0.25*src[3]
        vld1.32 {d6}, [SRC]!            @ load 4,5
        vmul.f32 d6, d6, d1             @ 0.25*src[4] 0.25*src[5]
        vadd.f32 d4, d4, d5             @ 0.5*src[0] + 0.25*src[2]
                                        @ 0.5*src[1] + 0.25*src[3]
        vadd.f32 d4, d4, d6             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4]
                                        @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5]
        flds    s14, [SRC]              @ load 6 (s14 is the low lane of d7)
        vdup.32 d7, d7[0]               @ src[6] in both lanes
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[6]
        addne   SRC, SRC, #8            @ step over src[6] and the lfe channel
        vadd.f32 d4, d4, d7             @ 0.5*src[0] + 0.25*src[2] + 0.25*src[4] + src[6]
                                        @ 0.5*src[1] + 0.25*src[3] + 0.25*src[5] + src[6]
        vst1.32 {d4}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_7_x_to_2_0_neon_asm, . - convert_7_x_to_2_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_5_x_to_2_0_neon_asm
@   Mixes frames of 5 floats (plus one optional extra float that is skipped)
@   down to frames of 2 floats:
@       dst[0] = 0.5*src[0] + 0.33*src[2] + src[4]
@       dst[1] = 0.5*src[1] + 0.33*src[3] + src[4]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, q0, q1, d4, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_5to2:
        .float  0.5
        .float  0.5
        .float  0.33
        .float  0.33
        .global convert_5_x_to_2_0_neon_asm
        .type   convert_5_x_to_2_0_neon_asm, %function
convert_5_x_to_2_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_5to2
        vld1.32 {q0}, [COEFF]           @ q0 = 0.5 0.5 0.33 0.33
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        flds    s8, [SRC]               @ load 4 (s8 is the low lane of d4)
        vdup.32 d4, d4[0]               @ src[4] in both lanes
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[4]
        addne   SRC, SRC, #8            @ step over src[4] and the lfe channel
        vmul.f32 q1, q1, q0             @ 0.5*src[0] 0.5*src[1] 0.33*src[2] 0.33*src[3]
        vadd.f32 d2, d2, d3             @ 0.5*src[0] + 0.33*src[2]
                                        @ 0.5*src[1] + 0.33*src[3]
        vadd.f32 d2, d2, d4             @ 0.5*src[0] + 0.33*src[2] + src[4]
                                        @ 0.5*src[1] + 0.33*src[3] + src[4]
        vst1.32 {d2}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_5_x_to_2_0_neon_asm, . - convert_5_x_to_2_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_4_0_to_2_0_neon_asm
@   Mixes frames of 4 floats down to frames of 2 floats:
@       dst[0] = 0.5*src[0] + src[3] + src[2]
@       dst[1] = 0.5*src[1] + src[3] + src[2]
@   In:    r0 = dst, r1 = src, r2 = num (frame count); r3 is not read
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, d0, q1, d4, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_4to2:
        .float  0.5
        .float  0.5
        .global convert_4_0_to_2_0_neon_asm
        .type   convert_4_0_to_2_0_neon_asm, %function
convert_4_0_to_2_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_4to2
        vld1.32 {d0}, [COEFF]           @ d0 = 0.5 0.5
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        vmul.f32 d2, d2, d0             @ 0.5*src[0] 0.5*src[1]
        vdup.32 d4, d3[0]               @ dup src[2]
        vdup.32 d3, d3[1]               @ dup src[3] (d3 is overwritten last on purpose)
        vadd.f32 d2, d2, d3             @ +src[3]
        vadd.f32 d2, d2, d4             @ +src[2]
        vst1.32 {d2}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_4_0_to_2_0_neon_asm, . - convert_4_0_to_2_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_3_x_to_2_0_neon_asm
@   Mixes frames of 3 floats (plus one optional extra float that is skipped)
@   down to frames of 2 floats:
@       dst[0] = 0.5*src[0] + src[2]
@       dst[1] = 0.5*src[1] + src[2]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, d0, d1, d2, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_3to2:
        .float  0.5
        .float  0.5
        .global convert_3_x_to_2_0_neon_asm
        .type   convert_3_x_to_2_0_neon_asm, %function
convert_3_x_to_2_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_3to2
        vld1.32 {d0}, [COEFF]           @ d0 = 0.5 0.5
0:                                      @ one frame per pass
        vld1.32 {d1}, [SRC]!            @ load 0,1
        flds    s4, [SRC]               @ load 2 (s4 is the low lane of d2)
        vdup.32 d2, d2[0]               @ src[2] in both lanes
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[2]
        addne   SRC, SRC, #8            @ step over src[2] and the lfe channel
        vmul.f32 d1, d1, d0             @ 0.5*src[0] 0.5*src[1]
        vadd.f32 d1, d1, d2             @ 0.5*src[0] + src[2]
                                        @ 0.5*src[1] + src[2]
        vst1.32 {d1}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_3_x_to_2_0_neon_asm, . - convert_3_x_to_2_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_7_x_to_1_0_neon_asm
@   Mixes frames of 7 floats (plus one optional extra float that is skipped)
@   down to a single float per frame:
@       dst[0] = 0.25*(src[0] + src[1])
@              + 0.125*(src[2] + src[3] + src[4] + src[5]) + src[6]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, q0, q1, q2, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_7to1:
        .float  0.25
        .float  0.25
        .float  0.125
        .float  0.125
        .global convert_7_x_to_1_0_neon_asm
        .type   convert_7_x_to_1_0_neon_asm, %function
convert_7_x_to_1_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_7to1
        vld1.32 {q0}, [COEFF]           @ q0 = 0.25 0.25 0.125 0.125
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        vmul.f32 q1, q1, q0             @ 0.25*src[0] 0.25*src[1] 0.125*src[2] 0.125*src[3]
        vld1.32 {d4}, [SRC]!            @ load 4,5
        vmul.f32 d4, d4, d1             @ 0.125*src[4] 0.125*src[5]
        vadd.f32 d2, d2, d3             @ 0.25*src[0] + 0.125*src[2]
                                        @ 0.25*src[1] + 0.125*src[3]
        vadd.f32 d2, d2, d4             @ ... + 0.125*src[4]
                                        @ ... + 0.125*src[5]
        flds    s10, [SRC]              @ load 6
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[6]
        addne   SRC, SRC, #8            @ step over src[6] and the lfe channel
        vadd.f32 s4, s4, s5             @ fold the two lanes of d2 into s4
        vadd.f32 s4, s4, s10            @ + src[6]
        fsts    s4, [DST]
        add     DST, DST, #4            @ one float written per frame
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_7_x_to_1_0_neon_asm, . - convert_7_x_to_1_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_5_x_to_1_0_neon_asm
@   Mixes frames of 5 floats (plus one optional extra float that is skipped)
@   down to a single float per frame:
@       dst[0] = 0.25*(src[0] + src[1]) + (src[2] + src[3])/6 + src[4]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, q0, q1, s10, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_5to1:
        .float  0.25
        .float  0.25
        .float  0.16666667
        .float  0.16666667
        .global convert_5_x_to_1_0_neon_asm
        .type   convert_5_x_to_1_0_neon_asm, %function
convert_5_x_to_1_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_5to1
        vld1.32 {q0}, [COEFF]           @ q0 = 0.25 0.25 1/6 1/6
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        vmul.f32 q1, q1, q0             @ 0.25*src[0] 0.25*src[1] src[2]/6 src[3]/6
        vadd.f32 d2, d2, d3             @ 0.25*src[0] + src[2]/6
                                        @ 0.25*src[1] + src[3]/6
        flds    s10, [SRC]              @ load 4
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[4]
        addne   SRC, SRC, #8            @ step over src[4] and the lfe channel
        vadd.f32 s4, s4, s5             @ fold the two lanes of d2 into s4
        vadd.f32 s4, s4, s10            @ + src[4]
        fsts    s4, [DST]
        add     DST, DST, #4            @ one float written per frame
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_5_x_to_1_0_neon_asm, . - convert_5_x_to_1_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_7_x_to_4_0_neon_asm
@   Mixes frames of 7 floats (plus one optional extra float that is skipped)
@   down to frames of 4 floats:
@       dst[0] = src[6] + 0.5*src[0] + src[2]/6
@       dst[1] = src[6] + 0.5*src[1] + src[3]/6
@       dst[2] = src[4] + src[2]/6
@       dst[3] = src[5] + src[3]/6
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, q0, q1, q2, d7, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_7to4:
        .float  0.5
        .float  0.5
        .float  0.16666667
        .float  0.16666667
        .global convert_7_x_to_4_0_neon_asm
        .type   convert_7_x_to_4_0_neon_asm, %function
convert_7_x_to_4_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_7to4
        vld1.32 {q0}, [COEFF]           @ q0 = 0.5 0.5 1/6 1/6
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        vmul.f32 q1, q1, q0             @ 0.5*src[0] 0.5*src[1] src[2]/6 src[3]/6
        vld1.32 {d5}, [SRC]!            @ load 4,5 into the high half of q2
        flds    s14, [SRC]              @ load 6 (s14 is the low lane of d7)
        vadd.f32 d2, d2, d3             @ 0.5*src[0] + src[2]/6
                                        @ 0.5*src[1] + src[3]/6
        vdup.32 d4, d7[0]               @ so q2 : src[6] src[6] src[4] src[5]
        vadd.f32 q2, q2, q1             @ src[6] + 0.5*src[0] + src[2]/6
                                        @ src[6] + 0.5*src[1] + src[3]/6
                                        @ src[4] + src[2]/6
                                        @ src[5] + src[3]/6
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[6]
        addne   SRC, SRC, #8            @ step over src[6] and the lfe channel
        vst1.32 {q2}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_7_x_to_4_0_neon_asm, . - convert_7_x_to_4_0_neon_asm
@-----------------------------------------------------------------------------
@ convert_5_x_to_4_0_neon_asm
@   Mixes frames of 5 floats (plus one optional extra float that is skipped)
@   down to frames of 4 floats:
@       dst[0] = 0.5*src[0] + src[4]
@       dst[1] = 0.5*src[1] + src[4]
@       dst[2] = src[2]
@       dst[3] = src[3]
@   In:    r0 = dst, r1 = src, r2 = num (frame count),
@          r3 = lfe flag (non-zero: skip one extra float per input frame)
@   Exit:  r0 and r1 point just past the last frame written / read
@   Clobb: r0-r2, d0, q1, d4, flags (r4 is saved and restored)
@-----------------------------------------------------------------------------
        .align  2                       @ keep the table word aligned for adr
coeff_5to4:
        .float  0.5
        .float  0.5
        .global convert_5_x_to_4_0_neon_asm
        .type   convert_5_x_to_4_0_neon_asm, %function
convert_5_x_to_4_0_neon_asm:
        push    {r4, lr}
        cmp     NUM, #0
        beq     1f                      @ num == 0: the loop tests its counter only
                                        @ after a full pass, so it must not be entered

        adr     COEFF, coeff_5to4
        vld1.32 {d0}, [COEFF]           @ d0 = 0.5 0.5
0:                                      @ one frame per pass
        vld1.32 {q1}, [SRC]!            @ load 0,1,2,3
        vmul.f32 d2, d2, d0             @ 0.5*src[0] 0.5*src[1]
        flds    s8, [SRC]               @ load 4 (s8 is the low lane of d4)
        vdup.32 d4, d4[0]               @ src[4] in both lanes
        vadd.f32 d2, d2, d4             @ 0.5*src[0] + src[4]
                                        @ 0.5*src[1] + src[4]
                                        @ src[2]  (d3 passes through untouched)
                                        @ src[3]
        teq     LFE, #0
        ite     eq
        addeq   SRC, SRC, #4            @ step over src[4]
        addne   SRC, SRC, #8            @ step over src[4] and the lfe channel
        vst1.32 {q1}, [DST]!
        subs    NUM, NUM, #1
        bne     0b
1:
        pop     {r4, pc}
        .size   convert_5_x_to_4_0_neon_asm, . - convert_5_x_to_4_0_neon_asm