libavcodec/ppc/h264_template_altivec.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* abort if ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert((((unsigned long)ptr)&0x0000000F) == 0);
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
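
/* The two macros below implement the H.264 chroma sample interpolation
 *     dst = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * where A..D are the four neighbouring source pixels and x, y are the
 * fractional positions (0..7).  The weights are precomputed in the ABCD
 * array and splatted into vA..vD inside the function below. */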
/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
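
/* Degenerate case: when x == 0 or y == 0, two of the four weights are zero
 * and the interpolation collapses to a two-tap filter.  vE, computed in the
 * function below as vB + vC (one of which is zero), holds the combined
 * second weight. */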
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
        vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8_t fperm;
    const vec_s32_t vABCD = vec_ld(0, ABCD);
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16_t v6us = vec_splat_u16(6);
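    /* vec_ld ignores the low four address bits, so an unaligned src has to be
     * assembled from aligned loads with vec_lvsl/vec_perm.  Eight source
     * pixels (plus one for the horizontal interpolation) are needed per row:
     * if src % 16 <= 7 they all come from a single 16-byte load, otherwise a
     * second load is required.  When src % 16 == 15, vec_lvsl(1, src) wraps
     * around to zero and vec_perm would select from the first vector, so that
     * case uses the second load directly (reallyBadAlign). */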
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8_t vsrc0uc, vsrc1uc;
    vec_s16_t vsrc0ssH, vsrc1ssH;
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
    vec_u8_t vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
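
    /* Only 8 of the 16 bytes stored per row are new data: fperm merges the
     * packed result with the existing destination vector so the neighbouring
     * 8 bytes are written back unchanged.  If dst is 16-byte aligned the new
     * bytes go in the low half of the stored vector; otherwise (with
     * stride % 16 == 0 the block in practice starts 8 bytes into an aligned
     * 16-byte line) they go in the high half. */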
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13,
                           0x14, 0x15, 0x16, 0x17,
                           0x08, 0x09, 0x0A, 0x0B,
                           0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03,
                           0x04, 0x05, 0x06, 0x07,
                           0x18, 0x19, 0x1A, 0x1B,
                           0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
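
    /* ABCD[3] == x*y selects between the full four-tap bilinear path and the
     * two-tap CHROMA_MC8_ALTIVEC_CORE_SIMPLE path (x == 0 or y == 0), which
     * reads either the next row (x == 0) or the next column (y == 0). */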
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8_t vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16_t vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
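
/* PREFIX_h264_qpel16_h_lowpass_altivec applies the H.264 luma half-pel
 * horizontal filter to a 16x16 block:
 *     dst[x] = clip_uint8((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                          - 5*src[x+2] + src[x+3] + 16) >> 5)
 * Each row is widened to 16 bits and processed as two halves of eight
 * elements (the "A" and "B" vectors below); the final saturating pack
 * performs the clip to 0..255. */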
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
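    /* vec_splat_s16() only accepts a 5-bit signed immediate (-16..15), so the
     * constants 20 and 16 above are synthesised as (5 << 2) and (1 << 4). */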

    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8_t sum, vdst, fsum;
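
    /* The "A" vectors hold the zero-extended pixels 0-7 of a row, the "B"
     * vectors pixels 8-15. */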

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);
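
        /* Build the six shifted source vectors src[-2]..src[+3].  For most
         * alignments the two 16-byte loads above cover all of them; when
         * (src - 2) % 16 >= 12 the last vectors spill into a third load at
         * offset 30, and whenever a vector starts exactly on a 16-byte
         * boundary the vec_lvsl permute wraps to zero, so that vector is
         * taken directly from the raw load instead of vec_perm. */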
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
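
/* PREFIX_h264_qpel16_v_lowpass_altivec applies the same 6-tap filter
 * vertically.  Six consecutive rows (y-2 .. y+3), already widened to 16 bits,
 * are kept in registers and shifted down each iteration, so every source row
 * is loaded and unpacked only once. */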
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t perm = vec_lvsl(0, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8_t srcM2a = vec_ld(0, srcbis);
    const vec_u8_t srcM2b = vec_ld(16, srcbis);
    const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcM1b = vec_ld(16, srcbis);
    const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP0b = vec_ld(16, srcbis);
    const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP1b = vec_ld(16, srcbis);
    const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP2b = vec_ld(16, srcbis);
    const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
    vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
    vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

    vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
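
/* PREFIX_h264_qpel16_hv_lowpass_altivec computes the centre half-pel
 * position in two passes: the first pass runs the horizontal 6-tap filter
 * over 21 rows (16 output rows plus 5 extra for the vertical support) and
 * stores the unrounded 16-bit intermediates in tmp; the second pass runs the
 * vertical 6-tap filter over those intermediates in 32-bit precision, rounds
 * with +512, shifts right by 10 and saturates back to 8 bits. */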
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32_t v10ui = vec_splat_u32(10);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v1ss = vec_splat_s16(1);
    const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8_t mperm = (const vec_u8_t)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8_t fsum, sumv, sum, vdst;
    vec_s16_t ssume, ssumo;
504 src -= (2 * srcStride);
505 for (i = 0 ; i < 21 ; i ++) {
506 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
507 vec_u8_t srcR1 = vec_ld(-2, src);
508 vec_u8_t srcR2 = vec_ld(14, src);
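
        /* Same unaligned-load handling as in the horizontal lowpass above:
         * vectors that start on a 16-byte boundary come straight from the raw
         * loads, and alignments >= 12 need a third load at offset 30. */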
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
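
    /* Second pass: vertical 6-tap filter over the 16-bit intermediates.
     * The products no longer fit in 16 bits, so even- and odd-indexed
     * elements are widened to 32 bits and processed as separate streams with
     * vec_mule/vec_mulo.  sum3 only needs widening: its even elements are
     * extracted with an arithmetic shift right by 16, its odd elements by
     * multiplying with 1.  After rounding (+512) and shifting (>> 10) the
     * even/odd streams are packed and re-interleaved with mperm. */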
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}