/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
19 /* this code assume that stride % 16 == 0 */
20 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst
, uint8_t * src
, int stride
, int h
, int x
, int y
) {
21 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num
, 1);
22 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num
, 1);
23 signed int ABCD
[4] __attribute__((aligned(16)));
25 ABCD
[0] = ((8 - x
) * (8 - y
));
26 ABCD
[1] = ((x
) * (8 - y
));
27 ABCD
[2] = ((8 - x
) * (y
));
28 ABCD
[3] = ((x
) * (y
));
29 const vector
signed int vABCD
= vec_ld(0, ABCD
);
30 const vector
signed short vA
= vec_splat((vector
signed short)vABCD
, 1);
31 const vector
signed short vB
= vec_splat((vector
signed short)vABCD
, 3);
32 const vector
signed short vC
= vec_splat((vector
signed short)vABCD
, 5);
33 const vector
signed short vD
= vec_splat((vector
signed short)vABCD
, 7);
34 const vector
signed int vzero
= vec_splat_s32(0);
35 const vector
signed short v32ss
= (const vector
signed short)AVV(32);
36 const vector
unsigned short v6us
= vec_splat_u16(6);
38 vector
unsigned char fperm
;
40 if (((unsigned long)dst
) % 16 == 0) {
41 fperm
= (vector
unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
42 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
44 fperm
= (vector
unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
45 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
48 register int loadSecond
= (((unsigned long)src
) % 16) <= 7 ? 0 : 1;
49 register int reallyBadAlign
= (((unsigned long)src
) % 16) == 15 ? 1 : 0;
51 vector
unsigned char vsrcAuc
;
52 vector
unsigned char vsrcBuc
;
53 vector
unsigned char vsrcperm0
;
54 vector
unsigned char vsrcperm1
;
55 vsrcAuc
= vec_ld(0, src
);
57 vsrcBuc
= vec_ld(16, src
);
58 vsrcperm0
= vec_lvsl(0, src
);
59 vsrcperm1
= vec_lvsl(1, src
);
61 vector
unsigned char vsrc0uc
;
62 vector
unsigned char vsrc1uc
;
63 vsrc0uc
= vec_perm(vsrcAuc
, vsrcBuc
, vsrcperm0
);
67 vsrc1uc
= vec_perm(vsrcAuc
, vsrcBuc
, vsrcperm1
);
69 vector
signed short vsrc0ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc0uc
);
70 vector
signed short vsrc1ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc1uc
);
72 if (!loadSecond
) {// -> !reallyBadAlign
73 for (i
= 0 ; i
< h
; i
++) {
74 vector
unsigned char vsrcCuc
;
75 vsrcCuc
= vec_ld(stride
+ 0, src
);
77 vector
unsigned char vsrc2uc
;
78 vector
unsigned char vsrc3uc
;
79 vsrc2uc
= vec_perm(vsrcCuc
, vsrcCuc
, vsrcperm0
);
80 vsrc3uc
= vec_perm(vsrcCuc
, vsrcCuc
, vsrcperm1
);
82 vector
signed short vsrc2ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc2uc
);
83 vector
signed short vsrc3ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc3uc
);
85 vector
signed short psum
;
87 psum
= vec_mladd(vA
, vsrc0ssH
, vec_splat_s16(0));
88 psum
= vec_mladd(vB
, vsrc1ssH
, psum
);
89 psum
= vec_mladd(vC
, vsrc2ssH
, psum
);
90 psum
= vec_mladd(vD
, vsrc3ssH
, psum
);
91 psum
= vec_add(v32ss
, psum
);
92 psum
= vec_sra(psum
, v6us
);
94 vector
unsigned char vdst
= vec_ld(0, dst
);
95 vector
unsigned char ppsum
= (vector
unsigned char)vec_packsu(psum
, psum
);
97 vector
unsigned char vfdst
= vec_perm(vdst
, ppsum
, fperm
);
98 vector
unsigned char fsum
;
100 OP_U8_ALTIVEC(fsum
, vfdst
, vdst
);
102 vec_st(fsum
, 0, dst
);
111 for (i
= 0 ; i
< h
; i
++) {
112 vector
unsigned char vsrcCuc
;
113 vector
unsigned char vsrcDuc
;
114 vsrcCuc
= vec_ld(stride
+ 0, src
);
115 vsrcDuc
= vec_ld(stride
+ 16, src
);
117 vector
unsigned char vsrc2uc
;
118 vector
unsigned char vsrc3uc
;
119 vsrc2uc
= vec_perm(vsrcCuc
, vsrcDuc
, vsrcperm0
);
123 vsrc3uc
= vec_perm(vsrcCuc
, vsrcDuc
, vsrcperm1
);
125 vector
signed short vsrc2ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc2uc
);
126 vector
signed short vsrc3ssH
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, (vector
unsigned char)vsrc3uc
);
128 vector
signed short psum
;
130 psum
= vec_mladd(vA
, vsrc0ssH
, vec_splat_s16(0));
131 psum
= vec_mladd(vB
, vsrc1ssH
, psum
);
132 psum
= vec_mladd(vC
, vsrc2ssH
, psum
);
133 psum
= vec_mladd(vD
, vsrc3ssH
, psum
);
134 psum
= vec_add(v32ss
, psum
);
135 psum
= vec_sr(psum
, v6us
);
137 vector
unsigned char vdst
= vec_ld(0, dst
);
138 vector
unsigned char ppsum
= (vector
unsigned char)vec_pack(psum
, psum
);
140 vector
unsigned char vfdst
= vec_perm(vdst
, ppsum
, fperm
);
141 vector
unsigned char fsum
;
143 OP_U8_ALTIVEC(fsum
, vfdst
, vdst
);
145 vec_st(fsum
, 0, dst
);
154 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num
, 1);
157 /* this code assume stride % 16 == 0 */
158 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst
, uint8_t * src
, int dstStride
, int srcStride
) {
159 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num
, 1);
160 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num
, 1);
163 const vector
signed int vzero
= vec_splat_s32(0);
164 const vector
unsigned char permM2
= vec_lvsl(-2, src
);
165 const vector
unsigned char permM1
= vec_lvsl(-1, src
);
166 const vector
unsigned char permP0
= vec_lvsl(+0, src
);
167 const vector
unsigned char permP1
= vec_lvsl(+1, src
);
168 const vector
unsigned char permP2
= vec_lvsl(+2, src
);
169 const vector
unsigned char permP3
= vec_lvsl(+3, src
);
170 const vector
signed short v20ss
= (const vector
signed short)AVV(20);
171 const vector
unsigned short v5us
= vec_splat_u16(5);
172 const vector
signed short v5ss
= vec_splat_s16(5);
173 const vector
signed short v16ss
= (const vector
signed short)AVV(16);
174 const vector
unsigned char dstperm
= vec_lvsr(0, dst
);
175 const vector
unsigned char neg1
= (const vector
unsigned char)vec_splat_s8(-1);
176 const vector
unsigned char dstmask
= vec_perm((const vector
unsigned char)vzero
, neg1
, dstperm
);
178 register int align
= ((((unsigned long)src
) - 2) % 16);
180 for (i
= 0 ; i
< 16 ; i
++) {
181 vector
unsigned char srcM2
, srcM1
, srcP0
, srcP1
, srcP2
, srcP3
;
182 vector
unsigned char srcR1
= vec_ld(-2, src
);
183 vector
unsigned char srcR2
= vec_ld(14, src
);
187 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
188 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
189 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
190 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
191 srcP2
= vec_perm(srcR1
, srcR2
, permP2
);
192 srcP3
= vec_perm(srcR1
, srcR2
, permP3
);
195 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
196 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
197 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
198 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
199 srcP2
= vec_perm(srcR1
, srcR2
, permP2
);
203 vector
unsigned char srcR3
= vec_ld(30, src
);
204 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
205 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
206 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
207 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
209 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
212 vector
unsigned char srcR3
= vec_ld(30, src
);
213 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
214 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
215 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
217 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
218 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
221 vector
unsigned char srcR3
= vec_ld(30, src
);
222 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
223 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
225 srcP1
= vec_perm(srcR2
, srcR3
, permP1
);
226 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
227 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
230 vector
unsigned char srcR3
= vec_ld(30, src
);
231 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
233 srcP0
= vec_perm(srcR2
, srcR3
, permP0
);
234 srcP1
= vec_perm(srcR2
, srcR3
, permP1
);
235 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
236 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
240 const vector
signed short srcP0A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP0
);
241 const vector
signed short srcP0B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP0
);
242 const vector
signed short srcP1A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP1
);
243 const vector
signed short srcP1B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP1
);
245 const vector
signed short srcP2A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP2
);
246 const vector
signed short srcP2B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP2
);
247 const vector
signed short srcP3A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP3
);
248 const vector
signed short srcP3B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP3
);
250 const vector
signed short srcM1A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM1
);
251 const vector
signed short srcM1B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM1
);
252 const vector
signed short srcM2A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM2
);
253 const vector
signed short srcM2B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM2
);
255 const vector
signed short sum1A
= vec_adds(srcP0A
, srcP1A
);
256 const vector
signed short sum1B
= vec_adds(srcP0B
, srcP1B
);
257 const vector
signed short sum2A
= vec_adds(srcM1A
, srcP2A
);
258 const vector
signed short sum2B
= vec_adds(srcM1B
, srcP2B
);
259 const vector
signed short sum3A
= vec_adds(srcM2A
, srcP3A
);
260 const vector
signed short sum3B
= vec_adds(srcM2B
, srcP3B
);
262 const vector
signed short pp1A
= vec_mladd(sum1A
, v20ss
, v16ss
);
263 const vector
signed short pp1B
= vec_mladd(sum1B
, v20ss
, v16ss
);
265 const vector
signed short pp2A
= vec_mladd(sum2A
, v5ss
, (vector
signed short)vzero
);
266 const vector
signed short pp2B
= vec_mladd(sum2B
, v5ss
, (vector
signed short)vzero
);
268 const vector
signed short pp3A
= vec_add(sum3A
, pp1A
);
269 const vector
signed short pp3B
= vec_add(sum3B
, pp1B
);
271 const vector
signed short psumA
= vec_sub(pp3A
, pp2A
);
272 const vector
signed short psumB
= vec_sub(pp3B
, pp2B
);
274 const vector
signed short sumA
= vec_sra(psumA
, v5us
);
275 const vector
signed short sumB
= vec_sra(psumB
, v5us
);
277 const vector
unsigned char sum
= vec_packsu(sumA
, sumB
);
279 const vector
unsigned char dst1
= vec_ld(0, dst
);
280 const vector
unsigned char dst2
= vec_ld(16, dst
);
281 const vector
unsigned char vdst
= vec_perm(dst1
, dst2
, vec_lvsl(0, dst
));
283 vector
unsigned char fsum
;
284 OP_U8_ALTIVEC(fsum
, sum
, vdst
);
286 const vector
unsigned char rsum
= vec_perm(fsum
, fsum
, dstperm
);
287 const vector
unsigned char fdst1
= vec_sel(dst1
, rsum
, dstmask
);
288 const vector
unsigned char fdst2
= vec_sel(rsum
, dst2
, dstmask
);
290 vec_st(fdst1
, 0, dst
);
291 vec_st(fdst2
, 16, dst
);
296 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num
, 1);
299 /* this code assume stride % 16 == 0 */
300 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst
, uint8_t * src
, int dstStride
, int srcStride
) {
301 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num
, 1);
302 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num
, 1);
306 const vector
signed int vzero
= vec_splat_s32(0);
307 const vector
unsigned char perm
= vec_lvsl(0, src
);
308 const vector
signed short v20ss
= (const vector
signed short)AVV(20);
309 const vector
unsigned short v5us
= vec_splat_u16(5);
310 const vector
signed short v5ss
= vec_splat_s16(5);
311 const vector
signed short v16ss
= (const vector
signed short)AVV(16);
312 const vector
unsigned char dstperm
= vec_lvsr(0, dst
);
313 const vector
unsigned char neg1
= (const vector
unsigned char)vec_splat_s8(-1);
314 const vector
unsigned char dstmask
= vec_perm((const vector
unsigned char)vzero
, neg1
, dstperm
);
316 uint8_t *srcbis
= src
- (srcStride
* 2);
318 const vector
unsigned char srcM2a
= vec_ld(0, srcbis
);
319 const vector
unsigned char srcM2b
= vec_ld(16, srcbis
);
320 const vector
unsigned char srcM2
= vec_perm(srcM2a
, srcM2b
, perm
);
322 const vector
unsigned char srcM1a
= vec_ld(0, srcbis
);
323 const vector
unsigned char srcM1b
= vec_ld(16, srcbis
);
324 const vector
unsigned char srcM1
= vec_perm(srcM1a
, srcM1b
, perm
);
326 const vector
unsigned char srcP0a
= vec_ld(0, srcbis
);
327 const vector
unsigned char srcP0b
= vec_ld(16, srcbis
);
328 const vector
unsigned char srcP0
= vec_perm(srcP0a
, srcP0b
, perm
);
330 const vector
unsigned char srcP1a
= vec_ld(0, srcbis
);
331 const vector
unsigned char srcP1b
= vec_ld(16, srcbis
);
332 const vector
unsigned char srcP1
= vec_perm(srcP1a
, srcP1b
, perm
);
334 const vector
unsigned char srcP2a
= vec_ld(0, srcbis
);
335 const vector
unsigned char srcP2b
= vec_ld(16, srcbis
);
336 const vector
unsigned char srcP2
= vec_perm(srcP2a
, srcP2b
, perm
);
339 vector
signed short srcM2ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM2
);
340 vector
signed short srcM2ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM2
);
341 vector
signed short srcM1ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM1
);
342 vector
signed short srcM1ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM1
);
343 vector
signed short srcP0ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP0
);
344 vector
signed short srcP0ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP0
);
345 vector
signed short srcP1ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP1
);
346 vector
signed short srcP1ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP1
);
347 vector
signed short srcP2ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP2
);
348 vector
signed short srcP2ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP2
);
350 for (i
= 0 ; i
< 16 ; i
++) {
351 const vector
unsigned char srcP3a
= vec_ld(0, srcbis
);
352 const vector
unsigned char srcP3b
= vec_ld(16, srcbis
);
353 const vector
unsigned char srcP3
= vec_perm(srcP3a
, srcP3b
, perm
);
354 const vector
signed short srcP3ssA
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP3
);
355 const vector
signed short srcP3ssB
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP3
);
358 const vector
signed short sum1A
= vec_adds(srcP0ssA
, srcP1ssA
);
359 const vector
signed short sum1B
= vec_adds(srcP0ssB
, srcP1ssB
);
360 const vector
signed short sum2A
= vec_adds(srcM1ssA
, srcP2ssA
);
361 const vector
signed short sum2B
= vec_adds(srcM1ssB
, srcP2ssB
);
362 const vector
signed short sum3A
= vec_adds(srcM2ssA
, srcP3ssA
);
363 const vector
signed short sum3B
= vec_adds(srcM2ssB
, srcP3ssB
);
376 const vector
signed short pp1A
= vec_mladd(sum1A
, v20ss
, v16ss
);
377 const vector
signed short pp1B
= vec_mladd(sum1B
, v20ss
, v16ss
);
379 const vector
signed short pp2A
= vec_mladd(sum2A
, v5ss
, (vector
signed short)vzero
);
380 const vector
signed short pp2B
= vec_mladd(sum2B
, v5ss
, (vector
signed short)vzero
);
382 const vector
signed short pp3A
= vec_add(sum3A
, pp1A
);
383 const vector
signed short pp3B
= vec_add(sum3B
, pp1B
);
385 const vector
signed short psumA
= vec_sub(pp3A
, pp2A
);
386 const vector
signed short psumB
= vec_sub(pp3B
, pp2B
);
388 const vector
signed short sumA
= vec_sra(psumA
, v5us
);
389 const vector
signed short sumB
= vec_sra(psumB
, v5us
);
391 const vector
unsigned char sum
= vec_packsu(sumA
, sumB
);
393 const vector
unsigned char dst1
= vec_ld(0, dst
);
394 const vector
unsigned char dst2
= vec_ld(16, dst
);
395 const vector
unsigned char vdst
= vec_perm(dst1
, dst2
, vec_lvsl(0, dst
));
397 vector
unsigned char fsum
;
398 OP_U8_ALTIVEC(fsum
, sum
, vdst
);
400 const vector
unsigned char rsum
= vec_perm(fsum
, fsum
, dstperm
);
401 const vector
unsigned char fdst1
= vec_sel(dst1
, rsum
, dstmask
);
402 const vector
unsigned char fdst2
= vec_sel(rsum
, dst2
, dstmask
);
404 vec_st(fdst1
, 0, dst
);
405 vec_st(fdst2
, 16, dst
);
409 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num
, 1);
412 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
413 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst
, int16_t * tmp
, uint8_t * src
, int dstStride
, int tmpStride
, int srcStride
) {
414 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num
, 1);
415 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num
, 1);
417 const vector
signed int vzero
= vec_splat_s32(0);
418 const vector
unsigned char permM2
= vec_lvsl(-2, src
);
419 const vector
unsigned char permM1
= vec_lvsl(-1, src
);
420 const vector
unsigned char permP0
= vec_lvsl(+0, src
);
421 const vector
unsigned char permP1
= vec_lvsl(+1, src
);
422 const vector
unsigned char permP2
= vec_lvsl(+2, src
);
423 const vector
unsigned char permP3
= vec_lvsl(+3, src
);
424 const vector
signed short v20ss
= (const vector
signed short)AVV(20);
425 const vector
unsigned int v10ui
= vec_splat_u32(10);
426 const vector
signed short v5ss
= vec_splat_s16(5);
427 const vector
signed short v1ss
= vec_splat_s16(1);
428 const vector
signed int v512si
= (const vector
signed int)AVV(512);
429 const vector
unsigned int v16ui
= (const vector
unsigned int)AVV(16);
431 register int align
= ((((unsigned long)src
) - 2) % 16);
433 src
-= (2 * srcStride
);
435 for (i
= 0 ; i
< 21 ; i
++) {
436 vector
unsigned char srcM2
, srcM1
, srcP0
, srcP1
, srcP2
, srcP3
;
437 vector
unsigned char srcR1
= vec_ld(-2, src
);
438 vector
unsigned char srcR2
= vec_ld(14, src
);
442 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
443 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
444 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
445 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
446 srcP2
= vec_perm(srcR1
, srcR2
, permP2
);
447 srcP3
= vec_perm(srcR1
, srcR2
, permP3
);
450 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
451 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
452 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
453 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
454 srcP2
= vec_perm(srcR1
, srcR2
, permP2
);
458 vector
unsigned char srcR3
= vec_ld(30, src
);
459 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
460 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
461 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
462 srcP1
= vec_perm(srcR1
, srcR2
, permP1
);
464 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
467 vector
unsigned char srcR3
= vec_ld(30, src
);
468 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
469 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
470 srcP0
= vec_perm(srcR1
, srcR2
, permP0
);
472 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
473 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
476 vector
unsigned char srcR3
= vec_ld(30, src
);
477 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
478 srcM1
= vec_perm(srcR1
, srcR2
, permM1
);
480 srcP1
= vec_perm(srcR2
, srcR3
, permP1
);
481 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
482 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
485 vector
unsigned char srcR3
= vec_ld(30, src
);
486 srcM2
= vec_perm(srcR1
, srcR2
, permM2
);
488 srcP0
= vec_perm(srcR2
, srcR3
, permP0
);
489 srcP1
= vec_perm(srcR2
, srcR3
, permP1
);
490 srcP2
= vec_perm(srcR2
, srcR3
, permP2
);
491 srcP3
= vec_perm(srcR2
, srcR3
, permP3
);
495 const vector
signed short srcP0A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP0
);
496 const vector
signed short srcP0B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP0
);
497 const vector
signed short srcP1A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP1
);
498 const vector
signed short srcP1B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP1
);
500 const vector
signed short srcP2A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP2
);
501 const vector
signed short srcP2B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP2
);
502 const vector
signed short srcP3A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcP3
);
503 const vector
signed short srcP3B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcP3
);
505 const vector
signed short srcM1A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM1
);
506 const vector
signed short srcM1B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM1
);
507 const vector
signed short srcM2A
= (vector
signed short)vec_mergeh((vector
unsigned char)vzero
, srcM2
);
508 const vector
signed short srcM2B
= (vector
signed short)vec_mergel((vector
unsigned char)vzero
, srcM2
);
510 const vector
signed short sum1A
= vec_adds(srcP0A
, srcP1A
);
511 const vector
signed short sum1B
= vec_adds(srcP0B
, srcP1B
);
512 const vector
signed short sum2A
= vec_adds(srcM1A
, srcP2A
);
513 const vector
signed short sum2B
= vec_adds(srcM1B
, srcP2B
);
514 const vector
signed short sum3A
= vec_adds(srcM2A
, srcP3A
);
515 const vector
signed short sum3B
= vec_adds(srcM2B
, srcP3B
);
517 const vector
signed short pp1A
= vec_mladd(sum1A
, v20ss
, sum3A
);
518 const vector
signed short pp1B
= vec_mladd(sum1B
, v20ss
, sum3B
);
520 const vector
signed short pp2A
= vec_mladd(sum2A
, v5ss
, (vector
signed short)vzero
);
521 const vector
signed short pp2B
= vec_mladd(sum2B
, v5ss
, (vector
signed short)vzero
);
523 const vector
signed short psumA
= vec_sub(pp1A
, pp2A
);
524 const vector
signed short psumB
= vec_sub(pp1B
, pp2B
);
526 vec_st(psumA
, 0, tmp
);
527 vec_st(psumB
, 16, tmp
);
530 tmp
+= tmpStride
; /* int16_t*, and stride is 16, so it's OK here */
533 const vector
unsigned char dstperm
= vec_lvsr(0, dst
);
534 const vector
unsigned char neg1
= (const vector
unsigned char)vec_splat_s8(-1);
535 const vector
unsigned char dstmask
= vec_perm((const vector
unsigned char)vzero
, neg1
, dstperm
);
536 const vector
unsigned char mperm
= (const vector
unsigned char)
537 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
538 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
540 int16_t *tmpbis
= tmp
- (tmpStride
* 21);
542 vector
signed short tmpM2ssA
= vec_ld(0, tmpbis
);
543 vector
signed short tmpM2ssB
= vec_ld(16, tmpbis
);
545 vector
signed short tmpM1ssA
= vec_ld(0, tmpbis
);
546 vector
signed short tmpM1ssB
= vec_ld(16, tmpbis
);
548 vector
signed short tmpP0ssA
= vec_ld(0, tmpbis
);
549 vector
signed short tmpP0ssB
= vec_ld(16, tmpbis
);
551 vector
signed short tmpP1ssA
= vec_ld(0, tmpbis
);
552 vector
signed short tmpP1ssB
= vec_ld(16, tmpbis
);
554 vector
signed short tmpP2ssA
= vec_ld(0, tmpbis
);
555 vector
signed short tmpP2ssB
= vec_ld(16, tmpbis
);
558 for (i
= 0 ; i
< 16 ; i
++) {
559 const vector
signed short tmpP3ssA
= vec_ld(0, tmpbis
);
560 const vector
signed short tmpP3ssB
= vec_ld(16, tmpbis
);
563 const vector
signed short sum1A
= vec_adds(tmpP0ssA
, tmpP1ssA
);
564 const vector
signed short sum1B
= vec_adds(tmpP0ssB
, tmpP1ssB
);
565 const vector
signed short sum2A
= vec_adds(tmpM1ssA
, tmpP2ssA
);
566 const vector
signed short sum2B
= vec_adds(tmpM1ssB
, tmpP2ssB
);
567 const vector
signed short sum3A
= vec_adds(tmpM2ssA
, tmpP3ssA
);
568 const vector
signed short sum3B
= vec_adds(tmpM2ssB
, tmpP3ssB
);
581 const vector
signed int pp1Ae
= vec_mule(sum1A
, v20ss
);
582 const vector
signed int pp1Ao
= vec_mulo(sum1A
, v20ss
);
583 const vector
signed int pp1Be
= vec_mule(sum1B
, v20ss
);
584 const vector
signed int pp1Bo
= vec_mulo(sum1B
, v20ss
);
586 const vector
signed int pp2Ae
= vec_mule(sum2A
, v5ss
);
587 const vector
signed int pp2Ao
= vec_mulo(sum2A
, v5ss
);
588 const vector
signed int pp2Be
= vec_mule(sum2B
, v5ss
);
589 const vector
signed int pp2Bo
= vec_mulo(sum2B
, v5ss
);
591 const vector
signed int pp3Ae
= vec_sra((vector
signed int)sum3A
, v16ui
);
592 const vector
signed int pp3Ao
= vec_mulo(sum3A
, v1ss
);
593 const vector
signed int pp3Be
= vec_sra((vector
signed int)sum3B
, v16ui
);
594 const vector
signed int pp3Bo
= vec_mulo(sum3B
, v1ss
);
596 const vector
signed int pp1cAe
= vec_add(pp1Ae
, v512si
);
597 const vector
signed int pp1cAo
= vec_add(pp1Ao
, v512si
);
598 const vector
signed int pp1cBe
= vec_add(pp1Be
, v512si
);
599 const vector
signed int pp1cBo
= vec_add(pp1Bo
, v512si
);
601 const vector
signed int pp32Ae
= vec_sub(pp3Ae
, pp2Ae
);
602 const vector
signed int pp32Ao
= vec_sub(pp3Ao
, pp2Ao
);
603 const vector
signed int pp32Be
= vec_sub(pp3Be
, pp2Be
);
604 const vector
signed int pp32Bo
= vec_sub(pp3Bo
, pp2Bo
);
606 const vector
signed int sumAe
= vec_add(pp1cAe
, pp32Ae
);
607 const vector
signed int sumAo
= vec_add(pp1cAo
, pp32Ao
);
608 const vector
signed int sumBe
= vec_add(pp1cBe
, pp32Be
);
609 const vector
signed int sumBo
= vec_add(pp1cBo
, pp32Bo
);
611 const vector
signed int ssumAe
= vec_sra(sumAe
, v10ui
);
612 const vector
signed int ssumAo
= vec_sra(sumAo
, v10ui
);
613 const vector
signed int ssumBe
= vec_sra(sumBe
, v10ui
);
614 const vector
signed int ssumBo
= vec_sra(sumBo
, v10ui
);
616 const vector
signed short ssume
= vec_packs(ssumAe
, ssumBe
);
617 const vector
signed short ssumo
= vec_packs(ssumAo
, ssumBo
);
619 const vector
unsigned char sumv
= vec_packsu(ssume
, ssumo
);
620 const vector
unsigned char sum
= vec_perm(sumv
, sumv
, mperm
);
622 const vector
unsigned char dst1
= vec_ld(0, dst
);
623 const vector
unsigned char dst2
= vec_ld(16, dst
);
624 const vector
unsigned char vdst
= vec_perm(dst1
, dst2
, vec_lvsl(0, dst
));
626 vector
unsigned char fsum
;
627 OP_U8_ALTIVEC(fsum
, sum
, vdst
);
629 const vector
unsigned char rsum
= vec_perm(fsum
, fsum
, dstperm
);
630 const vector
unsigned char fdst1
= vec_sel(dst1
, rsum
, dstmask
);
631 const vector
unsigned char fdst2
= vec_sel(rsum
, dst2
, dstmask
);
633 vec_st(fdst1
, 0, dst
);
634 vec_st(fdst2
, 16, dst
);
638 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num
, 1);