pass ARCH down to uClibc
[buildroot.git] / package / mplayer / mplayer-1.0rc1-atmel.3.patch
blob800f43e8ebbc0af134b1d350b67e78a84030eb10
1 cfg-common.h | 4 +
2 cfg-mencoder.h | 4 +
3 cfg-mplayer.h | 4 +
4 configure | 13 +-
5 libaf/af_format.c | 7 +
6 libavcodec/Makefile | 7 +
7 libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++
8 libavcodec/avr32/fdct.S | 541 ++++++++
9 libavcodec/avr32/h264idct.S | 451 +++++++
10 libavcodec/avr32/idct.S | 829 ++++++++++++
11 libavcodec/avr32/mc.S | 434 ++++++
12 libavcodec/avr32/pico.h | 260 ++++
13 libavcodec/bitstream.h | 77 +-
14 libavcodec/dsputil.c | 3 +
15 libavcodec/h264.c | 15 +
16 libavutil/common.h | 16 +
17 libavutil/internal.h | 9 +
18 libfaad2/common.h | 2 +-
19 libmpcodecs/ad_libmad.c | 5 +
20 libswscale/pico-avr32.h | 137 ++
21 libswscale/swscale_internal.h | 2 +-
22 libswscale/yuv2rgb.c | 14 +
23 libswscale/yuv2rgb_avr32.c | 416 ++++++
24 libvo/vo_fbdev2.c | 101 ++-
25 version.sh | 2 +-
26 25 files changed, 6011 insertions(+), 20 deletions(-)
27 create mode 100644 libavcodec/avr32/dsputil_avr32.c
28 create mode 100644 libavcodec/avr32/fdct.S
29 create mode 100644 libavcodec/avr32/h264idct.S
30 create mode 100644 libavcodec/avr32/idct.S
31 create mode 100644 libavcodec/avr32/mc.S
32 create mode 100644 libavcodec/avr32/pico.h
33 create mode 100644 libswscale/pico-avr32.h
34 create mode 100644 libswscale/yuv2rgb_avr32.c
36 diff --git a/cfg-common.h b/cfg-common.h
37 index 780df38..7d878a8 100644
38 --- a/cfg-common.h
39 +++ b/cfg-common.h
40 @@ -235,6 +235,10 @@
41 {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL},
42 {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL},
44 +#ifdef ARCH_AVR32
45 + {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL},
46 + {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL},
47 +#endif
48 // draw by slices or whole frame (useful with libmpeg2/libavcodec)
49 {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL},
50 {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL},
51 diff --git a/cfg-mencoder.h b/cfg-mencoder.h
52 index 411b748..addf791 100644
53 --- a/cfg-mencoder.h
54 +++ b/cfg-mencoder.h
55 @@ -5,6 +5,10 @@
57 #include "cfg-common.h"
59 +#ifdef ARCH_AVR32
60 +extern int avr32_use_pico;
61 +#endif
63 #ifdef USE_FAKE_MONO
64 extern int fakemono; // defined in dec_audio.c
65 #endif
66 diff --git a/cfg-mplayer.h b/cfg-mplayer.h
67 index 62b6eac..31499c2 100644
68 --- a/cfg-mplayer.h
69 +++ b/cfg-mplayer.h
70 @@ -4,6 +4,10 @@
72 #include "cfg-common.h"
74 +#ifdef ARCH_AVR32
75 +extern int avr32_use_pico;
76 +#endif
78 extern int noconsolecontrols;
80 #if defined(HAVE_FBDEV)||defined(HAVE_VESA)
81 diff --git a/configure b/configure
82 index 29002c8..56c6fe4 100755
83 --- a/configure
84 +++ b/configure
85 @@ -1203,6 +1203,15 @@ EOF
86 _optimizing="$proc"
89 + avr32)
90 + _def_arch='#define ARCH_AVR32'
91 + _target_arch='TARGET_ARCH_AVR32 = yes'
92 + iproc='avr32'
93 + proc=''
94 + _march=''
95 + _mcpu=''
96 + _optimizing=''
97 + ;;
98 arm|armv4l|armv5tel)
99 _def_arch='#define ARCH_ARMV4L 1'
100 _target_arch='TARGET_ARCH_ARMV4L = yes'
101 @@ -1533,7 +1542,7 @@ echores $_named_asm_args
102 # Checking for CFLAGS
103 _stripbinaries=yes
104 if test "$_profile" != "" || test "$_debug" != "" ; then
105 - CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile"
106 + CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile"
107 if test "$_cc_major" -ge "3" ; then
108 CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'`
110 @@ -3794,7 +3803,7 @@ fi
113 echocheck "X11 headers presence"
114 - for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do
115 + for I in `echo $_inc_extra | sed s/-I//g`; do
116 if test -f "$I/X11/Xlib.h" ; then
117 _inc_x11="-I$I"
118 _x11_headers="yes"
119 diff --git a/libaf/af_format.c b/libaf/af_format.c
120 index e5b7cc9..5d7ea6d 100644
121 --- a/libaf/af_format.c
122 +++ b/libaf/af_format.c
123 @@ -20,7 +20,14 @@
124 // Integer to float conversion through lrintf()
125 #ifdef HAVE_LRINTF
126 #include <math.h>
128 +#ifdef ARCH_AVR32
129 +#define lrintf(x) rint(x)
130 +#define llrint(x) (long long)rint(x)
131 +#else
132 long int lrintf(float);
133 +#endif
135 #else
136 #define lrintf(x) ((int)(x))
137 #endif
138 diff --git a/libavcodec/Makefile b/libavcodec/Makefile
139 index 17b6c45..8e1dc96 100644
140 --- a/libavcodec/Makefile
141 +++ b/libavcodec/Makefile
142 @@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \
144 sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc
146 +# avr32 specific stuff
147 +ifeq ($(TARGET_ARCH_AVR32),yes)
148 +ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o
149 +OBJS += avr32/dsputil_avr32.o
150 +endif
152 # sun mediaLib specific stuff
153 OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \
155 @@ -419,6 +425,7 @@ tests: apiexample $(TESTS)
156 clean::
157 rm -f \
158 i386/*.o i386/*~ \
159 + avr32/*.o avr32/*~ \
160 armv4l/*.o armv4l/*~ \
161 mlib/*.o mlib/*~ \
162 alpha/*.o alpha/*~ \
163 diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c
164 new file mode 100644
165 index 0000000..200284d
166 --- /dev/null
167 +++ b/libavcodec/avr32/dsputil_avr32.c
168 @@ -0,0 +1,2678 @@
170 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
172 + * Redistribution and use in source and binary forms, with or without
173 + * modification, are permitted provided that the following conditions
174 + * are met:
176 + * 1. Redistributions of source code must retain the above copyright
177 + * notice, this list of conditions and the following disclaimer.
179 + * 2. Redistributions in binary form must reproduce the above
180 + * copyright notice, this list of conditions and the following
181 + * disclaimer in the documentation and/or other materials provided
182 + * with the distribution.
184 + * 3. The name of ATMEL may not be used to endorse or promote products
185 + * derived from this software without specific prior written
186 + * permission.
188 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
189 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
190 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
191 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
192 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
193 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
194 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
195 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
196 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
197 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
198 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
199 + * DAMAGE.
200 + */
202 +#include "../dsputil.h"
203 +#include "pico.h"
205 +int avr32_use_pico = 1;
207 +//#define CHECK_DSP_FUNCS_AGAINST_C
209 +#ifdef CHECK_DSP_FUNCS_AGAINST_C
210 +#define DSP_FUNC_NAME(name) test_ ## name
211 +#else
212 +#define DSP_FUNC_NAME(name) name
213 +#endif
215 +union doubleword {
216 + int64_t doubleword;
217 + struct {
218 + int32_t top;
219 + int32_t bottom;
220 + } words;
223 +#undef LD16
224 +#undef LD32
225 +#undef LD64
227 +#define LD16(a) (*((uint16_t*)(a)))
228 +#define LD32(a) (*((uint32_t*)(a)))
229 +#define LD64(a) (*((uint64_t*)(a)))
230 +#define LD64_UNALIGNED(a) \
231 + ({ union doubleword __tmp__; \
232 + __tmp__.words.top = LD32(a); \
233 + __tmp__.words.bottom = LD32(a + 4); \
234 + __tmp__.doubleword; })
236 +#undef ST32
237 +#undef ST16
239 +#define ST16(a, b) *((uint16_t*)(a)) = (b)
240 +#define ST32(a, b) *((uint32_t*)(a)) = (b)
242 +#undef rnd_avg32
243 +#define rnd_avg32(a, b) \
244 + ({ uint32_t __tmp__;\
245 + asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\
246 + __tmp__;})
248 +void idct_avr32(DCTELEM *data);
249 +void fdct_avr32(DCTELEM *data);
251 +void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data);
252 +void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data);
254 +void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
255 +void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride);
257 +#define extern_dspfunc(PFX, NUM) \
258 + void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
259 + void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
260 + void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
261 + void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
263 +extern_dspfunc(put, 8);
264 +extern_dspfunc(put_no_rnd, 8);
265 +extern_dspfunc(avg, 8);
266 +extern_dspfunc(avg_no_rnd, 8);
267 +#undef extern_dspfunc
269 +#ifdef CHECK_DSP_FUNCS_AGAINST_C
270 +#define extern_dspfunc(PFX, NUM) \
271 + void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
272 + void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
273 + void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \
274 + void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h )
276 +extern_dspfunc(put, 4);
277 +extern_dspfunc(put_no_rnd, 4);
278 +extern_dspfunc(put, 8);
279 +extern_dspfunc(put_no_rnd, 8);
280 +extern_dspfunc(put, 16);
281 +extern_dspfunc(put_no_rnd, 16);
282 +extern_dspfunc(avg, 8);
283 +extern_dspfunc(avg_no_rnd, 8);
284 +extern_dspfunc(avg, 16);
285 +extern_dspfunc(avg_no_rnd, 16);
288 +#undef extern_dspfunc
289 +#define extern_dspfunc(PFX, NUM) \
290 +void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \
291 +void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \
292 +void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \
293 +void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \
294 +void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \
295 +void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \
296 +void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \
297 +void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \
298 +void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \
299 +void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \
300 +void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \
301 +void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \
302 +void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \
303 +void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \
304 +void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \
305 +void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \
307 +extern_dspfunc(put_h264_qpel, 16);
308 +extern_dspfunc(put_h264_qpel, 8);
309 +extern_dspfunc(put_h264_qpel, 4);
310 +extern_dspfunc(avg_h264_qpel, 16);
311 +extern_dspfunc(avg_h264_qpel, 8);
312 +extern_dspfunc(avg_h264_qpel, 4);
314 +#undef extern_dspfunc
316 +void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
317 +void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
318 +void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
320 +void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
321 +void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
322 +void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y);
325 +void dump_block8(uint8_t *block, int line_size, int h);
326 +void dump_block4(uint8_t *block, int line_size, int h);
327 +void dump_block(uint8_t *block, int line_size, int h, int w);
329 +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
330 + int h, char *name, int max_dev);
331 +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
332 + int h, char *name, int max_dev);
333 +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
334 + int h, int width, char *name, int max_dev);
336 +#define PIXOP2( OPNAME, OP ) \
337 +void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
338 + int i;\
339 + for(i=0; i<h; i++){\
340 + OP(*((uint32_t*)(block )), LD32(pixels ));\
341 + pixels+=line_size;\
342 + block +=line_size;\
343 + }\
345 +void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
346 + int src_stride1, int src_stride2, int h){\
347 + int i;\
348 + for(i=0; i<h; i++){\
349 + uint32_t a,b;\
350 + a= LD32(&src1[i*src_stride1 ]);\
351 + b= LD32(&src2[i*src_stride2 ]);\
352 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
353 + a= LD32(&src1[i*src_stride1+4]);\
354 + b= LD32(&src2[i*src_stride2+4]);\
355 + OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
356 + }\
359 +void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
360 + int src_stride1, int src_stride2, int h){\
361 + int i;\
362 + for(i=0; i<h; i++){\
363 + uint32_t a,b;\
364 + a= LD32(&src1[i*src_stride1 ]);\
365 + b= LD32(&src2[i*src_stride2 ]);\
366 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
367 + }\
370 +void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
371 + int src_stride1, int src_stride2, int h){\
372 + OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
373 + OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
376 +#else
377 +#define PIXOP2( OPNAME, OP ) \
378 +static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
379 + int i;\
380 + for(i=0; i<h; i++){\
381 + OP(*((uint32_t*)(block )), LD32(pixels ));\
382 + pixels+=line_size;\
383 + block +=line_size;\
384 + }\
386 +static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
387 + int i;\
388 + for(i=0; i<h; i++){\
389 + OP(*((uint32_t*)(block )), LD32(pixels ));\
390 + OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
391 + pixels+=line_size;\
392 + block +=line_size;\
393 + }\
395 +static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
396 + int i;\
397 + for(i=0; i<h; i++){\
398 + OP(*((uint32_t*)(block )), LD32(pixels ));\
399 + OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
400 + OP(*((uint32_t*)(block+8)), LD32(pixels+8));\
401 + OP(*((uint32_t*)(block+12)), LD32(pixels+12));\
402 + pixels+=line_size;\
403 + block +=line_size;\
404 + }\
406 +static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
407 + int src_stride1, int src_stride2, int h){\
408 + int i;\
409 + for(i=0; i<h; i++){\
410 + uint32_t a,b;\
411 + a= LD32(&src1[i*src_stride1 ]);\
412 + b= LD32(&src2[i*src_stride2 ]);\
413 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
414 + a= LD32(&src1[i*src_stride1+4]);\
415 + b= LD32(&src2[i*src_stride2+4]);\
416 + OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
417 + }\
420 +static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
421 + int src_stride1, int src_stride2, int h){\
422 + int i;\
423 + for(i=0; i<h; i++){\
424 + uint32_t a,b;\
425 + a= LD32(&src1[i*src_stride1 ]);\
426 + b= LD32(&src2[i*src_stride2 ]);\
427 + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
428 + }\
431 +static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
432 + int src_stride1, int src_stride2, int h){\
433 + OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
434 + OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
437 +#endif
439 +#define op_avg(a, b) a = rnd_avg32(a, b)
440 +#define op_put(a, b) a = b
442 +PIXOP2(avg, op_avg)
443 +PIXOP2(put, op_put)
444 +#undef op_avg
445 +#undef op_put
449 +static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
451 + int i;
452 + for(i=0; i<h; i++)
454 + ST32(dst , LD32(src ));
455 + dst+=dstStride;
456 + src+=srcStride;
460 +static void clear_blocks_avr32(DCTELEM *blocks)
462 + int n = 12;
463 + uint64_t tmp1, tmp2;
464 + blocks += 6*64;
465 + asm volatile ( "mov\t%1, 0\n"
466 + "mov\t%m1, 0\n"
467 + "mov\t%2, 0\n"
468 + "mov\t%m2, 0\n"
469 + "0:\n"
470 + "stm\t--%3, %1, %m1, %2, %m2\n"
471 + "stm\t--%3, %1, %m1, %2, %m2\n"
472 + "stm\t--%3, %1, %m1, %2, %m2\n"
473 + "stm\t--%3, %1, %m1, %2, %m2\n"
474 + "sub\t%0, 1\n"
475 + "brne\t0b\n"
476 + : "+r"(n), "=&r"(tmp1), "=&r"(tmp2),
477 + "+r"(blocks));
481 +static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
483 + int i;
484 + for(i=0; i<h; i++)
486 + ST32(dst , LD32(src ));
487 + ST32(dst+4 , LD32(src+4 ));
488 + dst+=dstStride;
489 + src+=srcStride;
493 +static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
495 + int i;
496 + for(i=0; i<h; i++)
498 + ST32(dst , LD32(src ));
499 + ST32(dst+4 , LD32(src+4 ));
500 + ST32(dst+8 , LD32(src+8 ));
501 + ST32(dst+12, LD32(src+12));
502 + dst+=dstStride;
503 + src+=srcStride;
508 +static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
509 + const int A=(8-x)*(8-y);
510 + const int B=( x)*(8-y);
511 + const int C=(8-x)*( y);
512 + const int D=( x)*( y);
513 + int i;
515 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
516 + PICO_PUT_W(PICO_COEFF0_B, 32);
517 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
518 + PICO_PUT_W(PICO_COEFF1_B, 0);
519 + PICO_PUT_W(PICO_COEFF2_A, 0);
520 + PICO_PUT_W(PICO_COEFF2_B, 0);
521 + PICO_PUT_W(PICO_CONFIG,
522 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
523 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
524 + | PICO_COEFF_FRAC_BITS(6)
525 + | PICO_OFFSET_FRAC_BITS(6));
527 + for(i=0; i<h; i++)
530 + int src0 = LD32(src);
531 + int src1 = LD32(src + stride);
533 + PICO_MVRC_W(PICO_INPIX0, src0);
534 + PICO_MVRC_W(PICO_INPIX1, src1);
535 + PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
536 + PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
537 + src += stride;
538 + ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0));
539 + dst += stride;
544 +static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
545 + const int A=(8-x)*(8-y);
546 + const int B=( x)*(8-y);
547 + const int C=(8-x)*( y);
548 + const int D=( x)*( y);
549 + int i;
551 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
552 + PICO_PUT_W(PICO_COEFF0_B, 32);
553 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
554 + PICO_PUT_W(PICO_COEFF1_B, 0);
555 + PICO_PUT_W(PICO_COEFF2_A, 0);
556 + PICO_PUT_W(PICO_COEFF2_B, 0);
557 + PICO_PUT_W(PICO_CONFIG,
558 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
559 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
560 + | PICO_COEFF_FRAC_BITS(6)
561 + | PICO_OFFSET_FRAC_BITS(6));
563 + for(i=0; i<h; i++)
565 + /*
566 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
567 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
568 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
569 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
570 + dst+= stride;
571 + src+= stride;
572 + */
574 + int src0 = LD32(src);
575 + int src1 = (((int)src[4] << 24) | (int)src[stride]);
576 + int src2 = LD32(src + stride + 1);
578 + PICO_MVRC_W(PICO_INPIX0, src0);
579 + PICO_MVRC_W(PICO_INPIX1, src1);
580 + PICO_MVRC_W(PICO_INPIX2, src2);
581 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
582 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
583 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
584 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
585 + src += stride;
586 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
588 + dst += stride;
592 +static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
593 + const int A=(8-x)*(8-y);
594 + const int B=( x)*(8-y);
595 + const int C=(8-x)*( y);
596 + const int D=( x)*( y);
597 + int i;
599 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
600 + PICO_PUT_W(PICO_COEFF0_B, 32);
601 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
602 + PICO_PUT_W(PICO_COEFF1_B, 0);
603 + PICO_PUT_W(PICO_COEFF2_A, 0);
604 + PICO_PUT_W(PICO_COEFF2_B, 0);
605 + PICO_PUT_W(PICO_CONFIG,
606 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
607 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
608 + | PICO_COEFF_FRAC_BITS(6)
609 + | PICO_OFFSET_FRAC_BITS(6));
611 + for(i=0; i<h; i++)
613 + /*
614 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
615 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
616 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
617 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
618 + OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
619 + OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
620 + OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
621 + OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
622 + dst+= stride;
623 + src+= stride;
624 + */
625 + int src0 = LD32(src);
626 + int src1 = (((int)src[4] << 24) | (int)src[stride]);
627 + int src2 = LD32(src + stride + 1);
629 + PICO_MVRC_W(PICO_INPIX0, src0);
630 + PICO_MVRC_W(PICO_INPIX1, src1);
631 + PICO_MVRC_W(PICO_INPIX2, src2);
632 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
633 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
634 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
635 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
636 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
638 + src0 = LD32(src + 4);
639 + src1 = (src[8] << 24) | src[stride + 4];
640 + src2 = LD32(src + stride + 5);
642 + PICO_MVRC_W(PICO_INPIX0, src0);
643 + PICO_MVRC_W(PICO_INPIX1, src1);
644 + PICO_MVRC_W(PICO_INPIX2, src2);
645 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
646 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
647 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
648 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
649 + src += stride;
650 + ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0));
652 + dst += stride;
657 +static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
658 + const int A=(8-x)*(8-y);
659 + const int B=( x)*(8-y);
660 + const int C=(8-x)*( y);
661 + const int D=( x)*( y);
662 + int i;
664 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
665 + PICO_PUT_W(PICO_COEFF0_B, 32);
666 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
667 + PICO_PUT_W(PICO_COEFF1_B, 0);
668 + PICO_PUT_W(PICO_COEFF2_A, 0);
669 + PICO_PUT_W(PICO_COEFF2_B, 0);
670 + PICO_PUT_W(PICO_CONFIG,
671 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
672 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
673 + | PICO_COEFF_FRAC_BITS(6)
674 + | PICO_OFFSET_FRAC_BITS(6));
676 + for(i=0; i<h; i++)
678 + int src0 = LD32(src);
679 + int src1 = LD32(src + stride);
681 + PICO_MVRC_W(PICO_INPIX0, src0);
682 + PICO_MVRC_W(PICO_INPIX1, src1);
683 + PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0);
684 + PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0);
685 + src += stride;
686 + ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0)));
687 + dst += stride;
692 +static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
693 + const int A=(8-x)*(8-y);
694 + const int B=( x)*(8-y);
695 + const int C=(8-x)*( y);
696 + const int D=( x)*( y);
697 + int i;
699 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
700 + PICO_PUT_W(PICO_COEFF0_B, 32);
701 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
702 + PICO_PUT_W(PICO_COEFF1_B, 0);
703 + PICO_PUT_W(PICO_COEFF2_A, 0);
704 + PICO_PUT_W(PICO_COEFF2_B, 0);
705 + PICO_PUT_W(PICO_CONFIG,
706 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
707 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
708 + | PICO_COEFF_FRAC_BITS(6)
709 + | PICO_OFFSET_FRAC_BITS(6));
711 + for(i=0; i<h; i++)
713 + /*
714 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
715 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
716 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
717 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
718 + dst+= stride;
719 + src+= stride;
720 + */
722 + int src0 = *((int *)src);
723 + int src1 = (int)((src[4] << 24) | src[stride]);
724 + int src2 = *((int *)(src + stride + 1));
726 + PICO_MVRC_W(PICO_INPIX0, src0);
727 + PICO_MVRC_W(PICO_INPIX1, src1);
728 + PICO_MVRC_W(PICO_INPIX2, src2);
729 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
730 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
731 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
732 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
733 + src += stride;
734 + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
735 + dst += stride;
739 +static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
740 + const int A=(8-x)*(8-y);
741 + const int B=( x)*(8-y);
742 + const int C=(8-x)*( y);
743 + const int D=( x)*( y);
744 + int i;
746 + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF));
747 + PICO_PUT_W(PICO_COEFF0_B, 32);
748 + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF));
749 + PICO_PUT_W(PICO_COEFF1_B, 0);
750 + PICO_PUT_W(PICO_COEFF2_A, 0);
751 + PICO_PUT_W(PICO_COEFF2_B, 0);
752 + PICO_PUT_W(PICO_CONFIG,
753 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
754 + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE)
755 + | PICO_COEFF_FRAC_BITS(6)
756 + | PICO_OFFSET_FRAC_BITS(6));
758 + for(i=0; i<h; i++)
760 + /*
761 + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));
762 + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));
763 + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));
764 + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));
765 + OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));
766 + OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));
767 + OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));
768 + OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));
769 + dst+= stride;
770 + src+= stride;
771 + */
772 + int src0 = *((int *)src);
773 + int src1 = (int)((src[4] << 24) | src[stride]);
774 + int src2 = *((int *)(src + stride + 1));
776 + PICO_MVRC_W(PICO_INPIX0, src0);
777 + PICO_MVRC_W(PICO_INPIX1, src1);
778 + PICO_MVRC_W(PICO_INPIX2, src2);
779 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
780 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
781 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
782 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
783 + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
785 + src0 = *((int *)(src + 4));
786 + src1 = (int)((src[8] << 24) | src[stride + 4]);
787 + src2 = *((int *)(src + stride + 5));
789 + PICO_MVRC_W(PICO_INPIX0, src0);
790 + PICO_MVRC_W(PICO_INPIX1, src1);
791 + PICO_MVRC_W(PICO_INPIX2, src2);
792 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0);
793 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0);
794 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0);
795 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0);
796 + src += stride;
797 + ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0)));
798 + dst += stride;
802 +static struct pico_config_t h264_qpel4_h_lowpass_config = {
803 + .input_mode = PICO_HOR_FILTER_MODE,
804 + .output_mode = PICO_PLANAR_MODE,
805 + .coeff_frac_bits = 5,
806 + .offset_frac_bits = 5,
807 + .coeff0_0 = 1,
808 + .coeff0_1 = -5,
809 + .coeff0_2 = 20,
810 + .coeff0_3 = 16,
811 + .coeff1_0 = 20,
812 + .coeff1_1 = -5,
813 + .coeff1_2 = 1,
814 + .coeff1_3 = 0,
815 + .coeff2_0 = 0,
816 + .coeff2_1 = 0,
817 + .coeff2_2 = 0,
818 + .coeff2_3 = 0
823 +static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
824 + const int h=4;
825 + int i;
827 + set_pico_config(&h264_qpel4_h_lowpass_config);
829 + for(i=0; i<h; i++){
831 + /*
832 + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
833 + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
834 + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
835 + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
836 + dst+=dstStride;\
837 + src+=srcStride;\ */
838 + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
839 + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
840 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
841 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
842 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
843 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
844 + src += srcStride;
845 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
846 + dst += dstStride;
850 +static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
851 + const int h=4;
852 + int i;
854 + set_pico_config(&h264_qpel4_h_lowpass_config);
856 + for(i=0; i<h; i++){
858 + /*
859 + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
860 + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
861 + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
862 + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
863 + dst+=dstStride;\
864 + src+=srcStride;\ */
866 + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2));
867 + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2));
868 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
869 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
870 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
871 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
872 + src += srcStride;
873 + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0)));
874 + dst += dstStride;
878 +static struct pico_config_t h264_qpel4_v_lowpass_config1 = {
879 + .input_mode = PICO_VERT_FILTER_MODE,
880 + .output_mode = PICO_PACKED_MODE,
881 + .coeff_frac_bits = 5,
882 + .offset_frac_bits = 5,
883 + .coeff0_0 = 1,
884 + .coeff0_1 = -5,
885 + .coeff0_2 = 20,
886 + .coeff0_3 = 16,
887 + .coeff1_0 = 1,
888 + .coeff1_1 = -5,
889 + .coeff1_2 = 20,
890 + .coeff1_3 = 16,
891 + .coeff2_0 = 1,
892 + .coeff2_1 = -5,
893 + .coeff2_2 = 20,
894 + .coeff2_3 = 16
899 +static struct pico_config_t h264_qpel4_v_lowpass_config2 = {
900 + .input_mode = PICO_VERT_FILTER_MODE,
901 + .output_mode = PICO_PLANAR_MODE,
902 + .coeff_frac_bits = 5,
903 + .offset_frac_bits = 5,
904 + .coeff0_0 = 1,
905 + .coeff0_1 = -5,
906 + .coeff0_2 = 20,
907 + .coeff0_3 = 16,
908 + .coeff1_0 = 20,
909 + .coeff1_1 = -5,
910 + .coeff1_2 = 1,
911 + .coeff1_3 = 0,
912 + .coeff2_0 = 0,
913 + .coeff2_1 = 0,
914 + .coeff2_2 = 0,
915 + .coeff2_3 = 0
918 +static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
920 + /*
921 + const int w=4;
922 + uint8_t *cm = cropTbl + MAX_NEG_CROP;
923 + int i;
924 + for(i=0; i<w; i++)
926 + const int srcB= src[-2*srcStride];\
927 + const int srcA= src[-1*srcStride];\
928 + const int src0= src[0 *srcStride];\
929 + const int src1= src[1 *srcStride];\
930 + const int src2= src[2 *srcStride];\
931 + const int src3= src[3 *srcStride];\
932 + const int src4= src[4 *srcStride];\
933 + const int src5= src[5 *srcStride];\
934 + const int src6= src[6 *srcStride];\
935 + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
936 + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
937 + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
938 + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
939 + dst++;\
940 + src++;\
941 + */
943 + set_pico_config(&h264_qpel4_v_lowpass_config1);
946 + int srcB= LD32(src - 2*srcStride);
947 + int srcA= LD32(src - 1*srcStride);
948 + int src0= LD32(src + 0 *srcStride);
949 + int src1= LD32(src + 1 *srcStride);
950 + int src2= LD32(src + 2 *srcStride);
951 + int src3= LD32(src + 3 *srcStride);
952 + int src4= LD32(src + 4 *srcStride);
953 + int src5= LD32(src + 5 *srcStride);
954 + int src6= LD32(src + 6 *srcStride);
956 + /* First compute the leftmost three colums */
957 + PICO_MVRC_W(PICO_INPIX0, srcB);
958 + PICO_MVRC_W(PICO_INPIX1, srcA);
959 + PICO_MVRC_W(PICO_INPIX2, src0);
960 + PICO_OP(0, 0, 0, 3, 6);
961 + PICO_MVRC_W(PICO_INPIX2, src1);
962 + PICO_MVRC_W(PICO_INPIX1, src2);
963 + PICO_MVRC_W(PICO_INPIX0, src3);
964 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
965 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
966 + dst += dstStride;
967 + PICO_MVRC_W(PICO_INPIX0, srcA);
968 + PICO_MVRC_W(PICO_INPIX1, src0);
969 + PICO_MVRC_W(PICO_INPIX2, src1);
970 + PICO_OP(0, 0, 0, 3, 6);
971 + PICO_MVRC_W(PICO_INPIX2, src2);
972 + PICO_MVRC_W(PICO_INPIX1, src3);
973 + PICO_MVRC_W(PICO_INPIX0, src4);
974 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
975 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
976 + dst += dstStride;
977 + PICO_MVRC_W(PICO_INPIX0, src0);
978 + PICO_MVRC_W(PICO_INPIX1, src1);
979 + PICO_MVRC_W(PICO_INPIX2, src2);
980 + PICO_OP(0, 0, 0, 3, 6);
981 + PICO_MVRC_W(PICO_INPIX2, src3);
982 + PICO_MVRC_W(PICO_INPIX1, src4);
983 + PICO_MVRC_W(PICO_INPIX0, src5);
984 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
985 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
986 + dst += dstStride;
987 + PICO_MVRC_W(PICO_INPIX0, src1);
988 + PICO_MVRC_W(PICO_INPIX1, src2);
989 + PICO_MVRC_W(PICO_INPIX2, src3);
990 + PICO_OP(0, 0, 0, 3, 6);
991 + PICO_MVRC_W(PICO_INPIX2, src4);
992 + PICO_MVRC_W(PICO_INPIX1, src5);
993 + PICO_MVRC_W(PICO_INPIX0, src6);
994 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
995 + ST32(dst, PICO_GET_W(PICO_OUTPIX0));
996 + /* Now compute the last column */
998 + union wordbytes {
999 + int word;
1000 + struct {
1001 + unsigned int t:8;
1002 + unsigned int u:8;
1003 + unsigned int l:8;
1004 + unsigned int b:8;
1005 + } bytes; } tmp1, tmp2, tmp3;
1008 + tmp1.bytes.t = srcB;
1009 + tmp1.bytes.u = src1;
1010 + tmp1.bytes.l = src4;
1012 + tmp2.bytes.t = srcA;
1013 + tmp2.bytes.u = src2;
1014 + tmp2.bytes.l = src5;
1016 + tmp3.bytes.t = src0;
1017 + tmp3.bytes.u = src3;
1018 + tmp3.bytes.l = src6;
1020 + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
1021 + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
1022 + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
1023 + set_pico_config(&h264_qpel4_v_lowpass_config2);
1026 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
1027 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
1028 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
1029 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
1031 + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
1032 + dst[3] = (char)(tmp1.bytes.b);
1033 + dst[3 - dstStride] = (char)(tmp1.bytes.l);
1034 + dst[3 - 2*dstStride] = (char)(tmp1.bytes.u);
1035 + dst[3 - 3*dstStride] = (char)(tmp1.bytes.t);
1038 + /*}
1041 + }*/
1044 +static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1046 + /*
1047 + const int w=4;
1048 + uint8_t *cm = cropTbl + MAX_NEG_CROP;
1049 + int i;
1050 + for(i=0; i<w; i++)
1052 + const int srcB= src[-2*srcStride];\
1053 + const int srcA= src[-1*srcStride];\
1054 + const int src0= src[0 *srcStride];\
1055 + const int src1= src[1 *srcStride];\
1056 + const int src2= src[2 *srcStride];\
1057 + const int src3= src[3 *srcStride];\
1058 + const int src4= src[4 *srcStride];\
1059 + const int src5= src[5 *srcStride];\
1060 + const int src6= src[6 *srcStride];\
1061 + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1062 + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1063 + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1064 + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1065 + dst++;\
1066 + src++;\
1067 + */
1068 + uint8_t tmp_block[4*4];
1070 + set_pico_config(&h264_qpel4_v_lowpass_config1);
1073 + int srcB= LD32(src - 2*srcStride);
1074 + int srcA= LD32(src - 1*srcStride);
1075 + int src0= LD32(src + 0 *srcStride);
1076 + int src1= LD32(src + 1 *srcStride);
1077 + int src2= LD32(src + 2 *srcStride);
1078 + int src3= LD32(src + 3 *srcStride);
1079 + int src4= LD32(src + 4 *srcStride);
1080 + int src5= LD32(src + 5 *srcStride);
1081 + int src6= LD32(src + 6 *srcStride);
1083 + /* First compute the leftmost three colums */
1084 + PICO_MVRC_W(PICO_INPIX0, srcB);
1085 + PICO_MVRC_W(PICO_INPIX1, srcA);
1086 + PICO_MVRC_W(PICO_INPIX2, src0);
1087 + PICO_OP(0, 0, 0, 3, 6);
1088 + PICO_MVRC_W(PICO_INPIX2, src1);
1089 + PICO_MVRC_W(PICO_INPIX1, src2);
1090 + PICO_MVRC_W(PICO_INPIX0, src3);
1091 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
1092 + ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0));
1093 + PICO_MVRC_W(PICO_INPIX0, srcA);
1094 + PICO_MVRC_W(PICO_INPIX1, src0);
1095 + PICO_MVRC_W(PICO_INPIX2, src1);
1096 + PICO_OP(0, 0, 0, 3, 6);
1097 + PICO_MVRC_W(PICO_INPIX2, src2);
1098 + PICO_MVRC_W(PICO_INPIX1, src3);
1099 + PICO_MVRC_W(PICO_INPIX0, src4);
1100 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
1101 + ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0));
1102 + PICO_MVRC_W(PICO_INPIX0, src0);
1103 + PICO_MVRC_W(PICO_INPIX1, src1);
1104 + PICO_MVRC_W(PICO_INPIX2, src2);
1105 + PICO_OP(0, 0, 0, 3, 6);
1106 + PICO_MVRC_W(PICO_INPIX2, src3);
1107 + PICO_MVRC_W(PICO_INPIX1, src4);
1108 + PICO_MVRC_W(PICO_INPIX0, src5);
1109 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
1110 + ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0));
1111 + PICO_MVRC_W(PICO_INPIX0, src1);
1112 + PICO_MVRC_W(PICO_INPIX1, src2);
1113 + PICO_MVRC_W(PICO_INPIX2, src3);
1114 + PICO_OP(0, 0, 0, 3, 6);
1115 + PICO_MVRC_W(PICO_INPIX2, src4);
1116 + PICO_MVRC_W(PICO_INPIX1, src5);
1117 + PICO_MVRC_W(PICO_INPIX0, src6);
1118 + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6);
1119 + ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0));
1120 + /* Now compute the last column */
1122 + union wordbytes {
1123 + int word;
1124 + struct {
1125 + unsigned int t:8;
1126 + unsigned int u:8;
1127 + unsigned int l:8;
1128 + unsigned int b:8;
1129 + } bytes; } tmp1, tmp2, tmp3;
1132 + tmp1.bytes.t = srcB;
1133 + tmp1.bytes.u = src1;
1134 + tmp1.bytes.l = src4;
1136 + tmp2.bytes.t = srcA;
1137 + tmp2.bytes.u = src2;
1138 + tmp2.bytes.l = src5;
1140 + tmp3.bytes.t = src0;
1141 + tmp3.bytes.u = src3;
1142 + tmp3.bytes.l = src6;
1144 + PICO_MVRC_W(PICO_INPIX0, tmp1.word);
1145 + PICO_MVRC_W(PICO_INPIX1, tmp2.word);
1146 + PICO_MVRC_W(PICO_INPIX2, tmp3.word);
1147 + set_pico_config(&h264_qpel4_v_lowpass_config2);
1150 + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6);
1151 + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7);
1152 + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8);
1153 + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9);
1155 + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0);
1156 + tmp_block[3 + 3*4] = (char)(tmp1.bytes.b);
1157 + tmp_block[3 + 2*4] = (char)(tmp1.bytes.l);
1158 + tmp_block[3 + 1*4] = (char)(tmp1.bytes.u);
1159 + tmp_block[3] = (char)(tmp1.bytes.t);
1161 + /* Compute the average */
1162 + srcB= LD32(dst);
1163 + srcA= LD32(dst + dstStride);
1164 + src0= LD32(dst + dstStride*2);
1165 + src1= LD32(dst + dstStride*3);
1167 + src2= LD32(tmp_block);
1168 + src3= LD32(tmp_block + 4);
1169 + src4= LD32(tmp_block + 8);
1170 + src5= LD32(tmp_block + 12);
1172 + ST32(dst, rnd_avg32(srcB, src2));
1173 + ST32(dst + dstStride, rnd_avg32(srcA, src3));
1174 + ST32(dst + 2*dstStride, rnd_avg32(src0, src4));
1175 + ST32(dst + 3*dstStride, rnd_avg32(src1, src5));
1179 +static struct pico_config_t h264_qpel4_hv_lowpass_config = {
1180 + .input_mode = PICO_HOR_FILTER_MODE,
1181 + .output_mode = PICO_PACKED_MODE,
1182 + .coeff_frac_bits = 10,
1183 + .offset_frac_bits = 10,
1184 + .coeff0_0 = 1,
1185 + .coeff0_1 = -5,
1186 + .coeff0_2 = 20,
1187 + .coeff0_3 = 512,
1188 + .coeff1_0 = -5,
1189 + .coeff1_1 = 25,
1190 + .coeff1_2 = -100,
1191 + .coeff1_3 = 0,
1192 + .coeff2_0 = 20,
1193 + .coeff2_1 = -100,
1194 + .coeff2_2 = 400,
1195 + .coeff2_3 = 0
1198 +static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1200 + int32_t tmp_block[48];
1201 + int32_t *tmp = tmp_block;
1202 + int i;
1204 + set_pico_config(&h264_qpel4_hv_lowpass_config);
1206 + src -= 2;
1207 + for ( i = 0; i < 2; i++ ){
1208 + int srcB= LD32(src - 2*srcStride);
1209 + int srcA= LD32(src - 1*srcStride);
1210 + int src0= LD32(src + 0 *srcStride);
1211 + int src1= LD32(src + 1 *srcStride);
1212 + int src2= LD32(src + 2 *srcStride);
1213 + int src3= LD32(src + 3 *srcStride);
1214 + int src4= LD32(src + 4 *srcStride);
1215 + int src5= LD32(src + 5 *srcStride);
1216 + int src6= LD32(src + 6 *srcStride);
1218 + PICO_MVRC_W(PICO_INPIX0, srcB);
1219 + PICO_MVRC_W(PICO_INPIX1, srcA);
1220 + PICO_MVRC_W(PICO_INPIX2, src0);
1221 + PICO_OP(0, 0, 0, 4, 8);
1222 + PICO_MVRC_W(PICO_INPIX2, src1);
1223 + PICO_MVRC_W(PICO_INPIX1, src2);
1224 + PICO_MVRC_W(PICO_INPIX0, src3);
1225 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1226 + PICO_STCM_W(tmp,
1227 + PICO_REGVECT_VMU0_OUT,
1228 + PICO_REGVECT_VMU1_OUT,
1229 + PICO_REGVECT_VMU2_OUT);
1230 + tmp += 3;
1232 + PICO_OP(0, 0, 1, 5, 9);
1233 + PICO_MVRC_W(PICO_INPIX0, srcB);
1234 + PICO_MVRC_W(PICO_INPIX1, srcA);
1235 + PICO_MVRC_W(PICO_INPIX2, src0);
1236 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1237 + PICO_STCM_W(tmp,
1238 + PICO_REGVECT_VMU0_OUT,
1239 + PICO_REGVECT_VMU1_OUT,
1240 + PICO_REGVECT_VMU2_OUT);
1241 + tmp += 3;
1243 + PICO_MVRC_W(PICO_INPIX0, src1);
1244 + PICO_OP(0, 0, 4, 8, 0);
1245 + PICO_MVRC_W(PICO_INPIX2, src2);
1246 + PICO_MVRC_W(PICO_INPIX1, src3);
1247 + PICO_MVRC_W(PICO_INPIX0, src4);
1248 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1249 + PICO_STCM_W(tmp,
1250 + PICO_REGVECT_VMU0_OUT,
1251 + PICO_REGVECT_VMU1_OUT,
1252 + PICO_REGVECT_VMU2_OUT);
1253 + tmp += 3;
1255 + PICO_OP(0, 0, 1, 5, 9);
1256 + PICO_MVRC_W(PICO_INPIX0, srcA);
1257 + PICO_MVRC_W(PICO_INPIX1, src0);
1258 + PICO_MVRC_W(PICO_INPIX2, src1);
1259 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1260 + PICO_STCM_W(tmp,
1261 + PICO_REGVECT_VMU0_OUT,
1262 + PICO_REGVECT_VMU1_OUT,
1263 + PICO_REGVECT_VMU2_OUT);
1264 + tmp += 3;
1266 + PICO_MVRC_W(PICO_INPIX0, src2);
1267 + PICO_OP(0, 0, 4, 8, 0);
1268 + PICO_MVRC_W(PICO_INPIX2, src3);
1269 + PICO_MVRC_W(PICO_INPIX1, src4);
1270 + PICO_MVRC_W(PICO_INPIX0, src5);
1271 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1272 + PICO_STCM_W(tmp,
1273 + PICO_REGVECT_VMU0_OUT,
1274 + PICO_REGVECT_VMU1_OUT,
1275 + PICO_REGVECT_VMU2_OUT);
1276 + tmp += 3;
1278 + PICO_OP(0, 0, 1, 5, 9);
1279 + PICO_MVRC_W(PICO_INPIX0, src0);
1280 + PICO_MVRC_W(PICO_INPIX1, src1);
1281 + PICO_MVRC_W(PICO_INPIX2, src2);
1282 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1283 + PICO_STCM_W(tmp,
1284 + PICO_REGVECT_VMU0_OUT,
1285 + PICO_REGVECT_VMU1_OUT,
1286 + PICO_REGVECT_VMU2_OUT);
1287 + tmp += 3;
1289 + PICO_MVRC_W(PICO_INPIX0, src3);
1290 + PICO_OP(0, 0, 4, 8, 0);
1291 + PICO_MVRC_W(PICO_INPIX2, src4);
1292 + PICO_MVRC_W(PICO_INPIX1, src5);
1293 + PICO_MVRC_W(PICO_INPIX0, src6);
1294 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1295 + PICO_STCM_W(tmp,
1296 + PICO_REGVECT_VMU0_OUT,
1297 + PICO_REGVECT_VMU1_OUT,
1298 + PICO_REGVECT_VMU2_OUT);
1299 + tmp += 3;
1301 + PICO_OP(0, 0, 1, 5, 9);
1302 + PICO_MVRC_W(PICO_INPIX0, src1);
1303 + PICO_MVRC_W(PICO_INPIX1, src2);
1304 + PICO_MVRC_W(PICO_INPIX2, src3);
1305 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1306 + PICO_STCM_W(tmp,
1307 + PICO_REGVECT_VMU0_OUT,
1308 + PICO_REGVECT_VMU1_OUT,
1309 + PICO_REGVECT_VMU2_OUT);
1310 + tmp += 3;
1311 + src += 2;
1314 + src -= 1;
1315 + tmp -= 48;
1318 + PICO_PUT_W(PICO_CONFIG,
1319 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
1320 + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
1321 + | PICO_COEFF_FRAC_BITS(10)
1322 + | PICO_OFFSET_FRAC_BITS(10));
1324 + for ( i = 0; i < 2; i++ ){
1325 + int srcB= LD32(src - 2*srcStride);
1326 + int srcA= LD32(src - 1*srcStride);
1327 + int src0= LD32(src + 0 *srcStride);
1328 + int src1= LD32(src + 1 *srcStride);
1329 + int src2= LD32(src + 2 *srcStride);
1330 + int src3= LD32(src + 3 *srcStride);
1331 + int src4= LD32(src + 4 *srcStride);
1332 + int src5= LD32(src + 5 *srcStride);
1333 + int src6= LD32(src + 6 *srcStride);
1336 + PICO_LDCM_W_INC(tmp,
1337 + PICO_REGVECT_VMU0_OUT,
1338 + PICO_REGVECT_VMU1_OUT,
1339 + PICO_REGVECT_VMU2_OUT);
1340 + PICO_MVRC_W(PICO_INPIX0, srcB);
1341 + PICO_MVRC_W(PICO_INPIX1, srcA);
1342 + PICO_MVRC_W(PICO_INPIX2, src0);
1343 + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
1344 + PICO_MVRC_W(PICO_INPIX2, src1);
1345 + PICO_MVRC_W(PICO_INPIX1, src2);
1346 + PICO_MVRC_W(PICO_INPIX0, src3);
1347 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
1349 + PICO_LDCM_W_INC(tmp,
1350 + PICO_REGVECT_VMU0_OUT,
1351 + PICO_REGVECT_VMU1_OUT,
1352 + PICO_REGVECT_VMU2_OUT);
1353 + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
1354 + PICO_MVRC_W(PICO_INPIX0, srcB);
1355 + PICO_MVRC_W(PICO_INPIX1, srcA);
1356 + PICO_MVRC_W(PICO_INPIX2, src0);
1357 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
1359 + PICO_LDCM_W_INC(tmp,
1360 + PICO_REGVECT_VMU0_OUT,
1361 + PICO_REGVECT_VMU1_OUT,
1362 + PICO_REGVECT_VMU2_OUT);
1363 + PICO_MVRC_W(PICO_INPIX0, srcA);
1364 + PICO_MVRC_W(PICO_INPIX1, src0);
1365 + PICO_MVRC_W(PICO_INPIX2, src1);
1366 + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
1367 + PICO_MVRC_W(PICO_INPIX2, src2);
1368 + PICO_MVRC_W(PICO_INPIX1, src3);
1369 + PICO_MVRC_W(PICO_INPIX0, src4);
1370 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
1372 + PICO_LDCM_W_INC(tmp,
1373 + PICO_REGVECT_VMU0_OUT,
1374 + PICO_REGVECT_VMU1_OUT,
1375 + PICO_REGVECT_VMU2_OUT);
1376 + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
1377 + PICO_MVRC_W(PICO_INPIX0, srcA);
1378 + PICO_MVRC_W(PICO_INPIX1, src0);
1379 + PICO_MVRC_W(PICO_INPIX2, src1);
1380 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
1382 + ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
1383 + ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
1386 + PICO_LDCM_W_INC(tmp,
1387 + PICO_REGVECT_VMU0_OUT,
1388 + PICO_REGVECT_VMU1_OUT,
1389 + PICO_REGVECT_VMU2_OUT);
1390 + PICO_MVRC_W(PICO_INPIX0, src0);
1391 + PICO_MVRC_W(PICO_INPIX1, src1);
1392 + PICO_MVRC_W(PICO_INPIX2, src2);
1393 + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
1394 + PICO_MVRC_W(PICO_INPIX2, src3);
1395 + PICO_MVRC_W(PICO_INPIX1, src4);
1396 + PICO_MVRC_W(PICO_INPIX0, src5);
1397 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
1399 + PICO_LDCM_W_INC(tmp,
1400 + PICO_REGVECT_VMU0_OUT,
1401 + PICO_REGVECT_VMU1_OUT,
1402 + PICO_REGVECT_VMU2_OUT);
1403 + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
1404 + PICO_MVRC_W(PICO_INPIX0, src0);
1405 + PICO_MVRC_W(PICO_INPIX1, src1);
1406 + PICO_MVRC_W(PICO_INPIX2, src2);
1407 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
1409 + PICO_LDCM_W_INC(tmp,
1410 + PICO_REGVECT_VMU0_OUT,
1411 + PICO_REGVECT_VMU1_OUT,
1412 + PICO_REGVECT_VMU2_OUT);
1413 + PICO_MVRC_W(PICO_INPIX0, src1);
1414 + PICO_MVRC_W(PICO_INPIX1, src2);
1415 + PICO_MVRC_W(PICO_INPIX2, src3);
1416 + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
1417 + PICO_MVRC_W(PICO_INPIX2, src4);
1418 + PICO_MVRC_W(PICO_INPIX1, src5);
1419 + PICO_MVRC_W(PICO_INPIX0, src6);
1420 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
1422 + PICO_LDCM_W_INC(tmp,
1423 + PICO_REGVECT_VMU0_OUT,
1424 + PICO_REGVECT_VMU1_OUT,
1425 + PICO_REGVECT_VMU2_OUT);
1426 + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
1427 + PICO_MVRC_W(PICO_INPIX0, src1);
1428 + PICO_MVRC_W(PICO_INPIX1, src2);
1429 + PICO_MVRC_W(PICO_INPIX2, src3);
1430 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
1432 + ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16));
1433 + ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0));
1435 + dst += 2;
1436 + src += 2;
1443 +static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1445 + int32_t tmp_block[48];
1446 + int32_t *tmp = tmp_block;
1447 + int i;
1449 + set_pico_config(&h264_qpel4_hv_lowpass_config);
1451 + src -= 2;
1452 + for ( i = 0; i < 2; i++ ){
1453 + int srcB= LD32(src - 2*srcStride);
1454 + int srcA= LD32(src - 1*srcStride);
1455 + int src0= LD32(src + 0 *srcStride);
1456 + int src1= LD32(src + 1 *srcStride);
1457 + int src2= LD32(src + 2 *srcStride);
1458 + int src3= LD32(src + 3 *srcStride);
1459 + int src4= LD32(src + 4 *srcStride);
1460 + int src5= LD32(src + 5 *srcStride);
1461 + int src6= LD32(src + 6 *srcStride);
1463 + PICO_MVRC_W(PICO_INPIX0, srcB);
1464 + PICO_MVRC_W(PICO_INPIX1, srcA);
1465 + PICO_MVRC_W(PICO_INPIX2, src0);
1466 + PICO_OP(0, 0, 0, 4, 8);
1467 + PICO_MVRC_W(PICO_INPIX2, src1);
1468 + PICO_MVRC_W(PICO_INPIX1, src2);
1469 + PICO_MVRC_W(PICO_INPIX0, src3);
1470 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1471 + PICO_STCM_W(tmp,
1472 + PICO_REGVECT_VMU0_OUT,
1473 + PICO_REGVECT_VMU1_OUT,
1474 + PICO_REGVECT_VMU2_OUT);
1475 + tmp += 3;
1477 + PICO_OP(0, 0, 1, 5, 9);
1478 + PICO_MVRC_W(PICO_INPIX0, srcB);
1479 + PICO_MVRC_W(PICO_INPIX1, srcA);
1480 + PICO_MVRC_W(PICO_INPIX2, src0);
1481 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1482 + PICO_STCM_W(tmp,
1483 + PICO_REGVECT_VMU0_OUT,
1484 + PICO_REGVECT_VMU1_OUT,
1485 + PICO_REGVECT_VMU2_OUT);
1486 + tmp += 3;
1488 + PICO_MVRC_W(PICO_INPIX0, src1);
1489 + PICO_OP(0, 0, 4, 8, 0);
1490 + PICO_MVRC_W(PICO_INPIX2, src2);
1491 + PICO_MVRC_W(PICO_INPIX1, src3);
1492 + PICO_MVRC_W(PICO_INPIX0, src4);
1493 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1494 + PICO_STCM_W(tmp,
1495 + PICO_REGVECT_VMU0_OUT,
1496 + PICO_REGVECT_VMU1_OUT,
1497 + PICO_REGVECT_VMU2_OUT);
1498 + tmp += 3;
1500 + PICO_OP(0, 0, 1, 5, 9);
1501 + PICO_MVRC_W(PICO_INPIX0, srcA);
1502 + PICO_MVRC_W(PICO_INPIX1, src0);
1503 + PICO_MVRC_W(PICO_INPIX2, src1);
1504 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1505 + PICO_STCM_W(tmp,
1506 + PICO_REGVECT_VMU0_OUT,
1507 + PICO_REGVECT_VMU1_OUT,
1508 + PICO_REGVECT_VMU2_OUT);
1509 + tmp += 3;
1511 + PICO_MVRC_W(PICO_INPIX0, src2);
1512 + PICO_OP(0, 0, 4, 8, 0);
1513 + PICO_MVRC_W(PICO_INPIX2, src3);
1514 + PICO_MVRC_W(PICO_INPIX1, src4);
1515 + PICO_MVRC_W(PICO_INPIX0, src5);
1516 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1517 + PICO_STCM_W(tmp,
1518 + PICO_REGVECT_VMU0_OUT,
1519 + PICO_REGVECT_VMU1_OUT,
1520 + PICO_REGVECT_VMU2_OUT);
1521 + tmp += 3;
1523 + PICO_OP(0, 0, 1, 5, 9);
1524 + PICO_MVRC_W(PICO_INPIX0, src0);
1525 + PICO_MVRC_W(PICO_INPIX1, src1);
1526 + PICO_MVRC_W(PICO_INPIX2, src2);
1527 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1528 + PICO_STCM_W(tmp,
1529 + PICO_REGVECT_VMU0_OUT,
1530 + PICO_REGVECT_VMU1_OUT,
1531 + PICO_REGVECT_VMU2_OUT);
1532 + tmp += 3;
1534 + PICO_MVRC_W(PICO_INPIX0, src3);
1535 + PICO_OP(0, 0, 4, 8, 0);
1536 + PICO_MVRC_W(PICO_INPIX2, src4);
1537 + PICO_MVRC_W(PICO_INPIX1, src5);
1538 + PICO_MVRC_W(PICO_INPIX0, src6);
1539 + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8);
1540 + PICO_STCM_W(tmp,
1541 + PICO_REGVECT_VMU0_OUT,
1542 + PICO_REGVECT_VMU1_OUT,
1543 + PICO_REGVECT_VMU2_OUT);
1544 + tmp += 3;
1546 + PICO_OP(0, 0, 1, 5, 9);
1547 + PICO_MVRC_W(PICO_INPIX0, src1);
1548 + PICO_MVRC_W(PICO_INPIX1, src2);
1549 + PICO_MVRC_W(PICO_INPIX2, src3);
1550 + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9);
1551 + PICO_STCM_W(tmp,
1552 + PICO_REGVECT_VMU0_OUT,
1553 + PICO_REGVECT_VMU1_OUT,
1554 + PICO_REGVECT_VMU2_OUT);
1555 + tmp += 3;
1556 + src += 2;
1559 + src -= 1;
1560 + tmp -= 48;
1563 + PICO_PUT_W(PICO_CONFIG,
1564 + PICO_OUTPUT_MODE(PICO_PLANAR_MODE)
1565 + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE)
1566 + | PICO_COEFF_FRAC_BITS(10)
1567 + | PICO_OFFSET_FRAC_BITS(10));
1569 + for ( i = 0; i < 2; i++ ){
1570 + int srcB= LD32(src - 2*srcStride);
1571 + int srcA= LD32(src - 1*srcStride);
1572 + int src0= LD32(src + 0 *srcStride);
1573 + int src1= LD32(src + 1 *srcStride);
1574 + int src2= LD32(src + 2 *srcStride);
1575 + int src3= LD32(src + 3 *srcStride);
1576 + int src4= LD32(src + 4 *srcStride);
1577 + int src5= LD32(src + 5 *srcStride);
1578 + int src6= LD32(src + 6 *srcStride);
1580 + PICO_LDCM_W_INC(tmp,
1581 + PICO_REGVECT_VMU0_OUT,
1582 + PICO_REGVECT_VMU1_OUT,
1583 + PICO_REGVECT_VMU2_OUT);
1584 + PICO_MVRC_W(PICO_INPIX0, srcB);
1585 + PICO_MVRC_W(PICO_INPIX1, srcA);
1586 + PICO_MVRC_W(PICO_INPIX2, src0);
1587 + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
1588 + PICO_MVRC_W(PICO_INPIX2, src1);
1589 + PICO_MVRC_W(PICO_INPIX1, src2);
1590 + PICO_MVRC_W(PICO_INPIX0, src3);
1591 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0);
1593 + PICO_LDCM_W_INC(tmp,
1594 + PICO_REGVECT_VMU0_OUT,
1595 + PICO_REGVECT_VMU1_OUT,
1596 + PICO_REGVECT_VMU2_OUT);
1597 + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
1598 + PICO_MVRC_W(PICO_INPIX0, srcB);
1599 + PICO_MVRC_W(PICO_INPIX1, srcA);
1600 + PICO_MVRC_W(PICO_INPIX2, src0);
1601 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3);
1603 + PICO_LDCM_W_INC(tmp,
1604 + PICO_REGVECT_VMU0_OUT,
1605 + PICO_REGVECT_VMU1_OUT,
1606 + PICO_REGVECT_VMU2_OUT);
1607 + PICO_MVRC_W(PICO_INPIX0, srcA);
1608 + PICO_MVRC_W(PICO_INPIX1, src0);
1609 + PICO_MVRC_W(PICO_INPIX2, src1);
1610 + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
1611 + PICO_MVRC_W(PICO_INPIX2, src2);
1612 + PICO_MVRC_W(PICO_INPIX1, src3);
1613 + PICO_MVRC_W(PICO_INPIX0, src4);
1614 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0);
1616 + PICO_LDCM_W_INC(tmp,
1617 + PICO_REGVECT_VMU0_OUT,
1618 + PICO_REGVECT_VMU1_OUT,
1619 + PICO_REGVECT_VMU2_OUT);
1620 + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
1621 + PICO_MVRC_W(PICO_INPIX0, srcA);
1622 + PICO_MVRC_W(PICO_INPIX1, src0);
1623 + PICO_MVRC_W(PICO_INPIX2, src1);
1624 + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3);
1626 + ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
1627 + ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0)));
1630 + PICO_LDCM_W_INC(tmp,
1631 + PICO_REGVECT_VMU0_OUT,
1632 + PICO_REGVECT_VMU1_OUT,
1633 + PICO_REGVECT_VMU2_OUT);
1634 + PICO_MVRC_W(PICO_INPIX0, src0);
1635 + PICO_MVRC_W(PICO_INPIX1, src1);
1636 + PICO_MVRC_W(PICO_INPIX2, src2);
1637 + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0);
1638 + PICO_MVRC_W(PICO_INPIX2, src3);
1639 + PICO_MVRC_W(PICO_INPIX1, src4);
1640 + PICO_MVRC_W(PICO_INPIX0, src5);
1641 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0);
1643 + PICO_LDCM_W_INC(tmp,
1644 + PICO_REGVECT_VMU0_OUT,
1645 + PICO_REGVECT_VMU1_OUT,
1646 + PICO_REGVECT_VMU2_OUT);
1647 + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3);
1648 + PICO_MVRC_W(PICO_INPIX0, src0);
1649 + PICO_MVRC_W(PICO_INPIX1, src1);
1650 + PICO_MVRC_W(PICO_INPIX2, src2);
1651 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3);
1653 + PICO_LDCM_W_INC(tmp,
1654 + PICO_REGVECT_VMU0_OUT,
1655 + PICO_REGVECT_VMU1_OUT,
1656 + PICO_REGVECT_VMU2_OUT);
1657 + PICO_MVRC_W(PICO_INPIX0, src1);
1658 + PICO_MVRC_W(PICO_INPIX1, src2);
1659 + PICO_MVRC_W(PICO_INPIX2, src3);
1660 + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0);
1661 + PICO_MVRC_W(PICO_INPIX2, src4);
1662 + PICO_MVRC_W(PICO_INPIX1, src5);
1663 + PICO_MVRC_W(PICO_INPIX0, src6);
1664 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0);
1666 + PICO_LDCM_W_INC(tmp,
1667 + PICO_REGVECT_VMU0_OUT,
1668 + PICO_REGVECT_VMU1_OUT,
1669 + PICO_REGVECT_VMU2_OUT);
1670 + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3);
1671 + PICO_MVRC_W(PICO_INPIX0, src1);
1672 + PICO_MVRC_W(PICO_INPIX1, src2);
1673 + PICO_MVRC_W(PICO_INPIX2, src3);
1674 + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3);
1676 + ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16));
1677 + ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0)));
1679 + dst += 2;
1680 + src += 2;
1685 +static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1686 + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
1687 + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1688 + src += 4*srcStride;
1689 + dst += 4*dstStride;
1690 + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
1691 + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1694 +static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1695 + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
1696 + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1697 + src += 4*srcStride;
1698 + dst += 4*dstStride;
1699 + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride);
1700 + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1703 +static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1704 + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
1705 + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1706 + src += 4*srcStride;
1707 + dst += 4*dstStride;
1708 + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
1709 + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1712 +static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1713 + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
1714 + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1715 + src += 4*srcStride;
1716 + dst += 4*dstStride;
1717 + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride);
1718 + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1721 +static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1722 + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
1723 + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1724 + src += 4*srcStride;
1725 + dst += 4*dstStride;
1726 + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
1727 + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1730 +static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1731 + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
1732 + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1733 + src += 4*srcStride;
1734 + dst += 4*dstStride;
1735 + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride);
1736 + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride);
1739 +static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1740 + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
1741 + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1742 + src += 8*srcStride;
1743 + dst += 8*dstStride;
1744 + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
1745 + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1748 +static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1749 + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
1750 + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1751 + src += 8*srcStride;
1752 + dst += 8*dstStride;
1753 + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride);
1754 + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1757 +static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1758 + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
1759 + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1760 + src += 8*srcStride;
1761 + dst += 8*dstStride;
1762 + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
1763 + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1766 +static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1767 + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
1768 + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1769 + src += 8*srcStride;
1770 + dst += 8*dstStride;
1771 + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride);
1772 + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1775 +static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1776 + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
1777 + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1778 + src += 8*srcStride;
1779 + dst += 8*dstStride;
1780 + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
1781 + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1784 +static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
1785 + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
1786 + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1787 + src += 8*srcStride;
1788 + dst += 8*dstStride;
1789 + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride);
1790 + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride);
1794 +#define H264_MC(OPNAME, SIZE) \
1795 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\
1796 + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
1799 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\
1800 + uint8_t half[SIZE*SIZE];\
1801 + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
1802 + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
1805 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\
1806 + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\
1809 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\
1810 + uint8_t half[SIZE*SIZE];\
1811 + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\
1812 + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
1815 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\
1816 + uint8_t full[SIZE*(SIZE+5)];\
1817 + uint8_t * const full_mid= full + SIZE*2;\
1818 + uint8_t half[SIZE*SIZE];\
1819 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1820 + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
1821 + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
1824 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\
1825 + uint8_t full[SIZE*(SIZE+5)];\
1826 + uint8_t * const full_mid= full + SIZE*2;\
1827 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1828 + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\
1831 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\
1832 + uint8_t full[SIZE*(SIZE+5)];\
1833 + uint8_t * const full_mid= full + SIZE*2;\
1834 + uint8_t half[SIZE*SIZE];\
1835 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1836 + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\
1837 + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
1840 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\
1841 + uint8_t full[SIZE*(SIZE+5)];\
1842 + uint8_t * const full_mid= full + SIZE*2;\
1843 + uint8_t halfH[SIZE*SIZE];\
1844 + uint8_t halfV[SIZE*SIZE];\
1845 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
1846 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1847 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1848 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1851 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\
1852 + uint8_t full[SIZE*(SIZE+5)];\
1853 + uint8_t * const full_mid= full + SIZE*2;\
1854 + uint8_t halfH[SIZE*SIZE];\
1855 + uint8_t halfV[SIZE*SIZE];\
1856 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
1857 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1858 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1859 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1862 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\
1863 + uint8_t full[SIZE*(SIZE+5)];\
1864 + uint8_t * const full_mid= full + SIZE*2;\
1865 + uint8_t halfH[SIZE*SIZE];\
1866 + uint8_t halfV[SIZE*SIZE];\
1867 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
1868 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1869 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1870 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1873 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\
1874 + uint8_t full[SIZE*(SIZE+5)];\
1875 + uint8_t * const full_mid= full + SIZE*2;\
1876 + uint8_t halfH[SIZE*SIZE];\
1877 + uint8_t halfV[SIZE*SIZE];\
1878 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
1879 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1880 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1881 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1884 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\
1885 + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\
1888 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\
1889 + uint8_t halfH[SIZE*SIZE];\
1890 + uint8_t halfHV[SIZE*SIZE];\
1891 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\
1892 + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
1893 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1896 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\
1897 + uint8_t halfH[SIZE*SIZE];\
1898 + uint8_t halfHV[SIZE*SIZE];\
1899 + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\
1900 + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
1901 + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1904 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\
1905 + uint8_t full[SIZE*(SIZE+5)];\
1906 + uint8_t * const full_mid= full + SIZE*2;\
1907 + uint8_t halfV[SIZE*SIZE];\
1908 + uint8_t halfHV[SIZE*SIZE];\
1909 + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1910 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1911 + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
1912 + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
1915 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\
1916 + uint8_t full[SIZE*(SIZE+5)];\
1917 + uint8_t * const full_mid= full + SIZE*2;\
1918 + uint8_t halfV[SIZE*SIZE];\
1919 + uint8_t halfHV[SIZE*SIZE];\
1920 + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1921 + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\
1922 + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\
1923 + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
1926 +H264_MC(put_, 4)
1927 +H264_MC(put_, 8)
1928 +H264_MC(put_, 16)
1929 +H264_MC(avg_, 4)
1930 +H264_MC(avg_, 8)
1931 +H264_MC(avg_, 16)
1935 +#define dspfunc16(PFX) \
1936 + void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
1937 + PFX ## _pixels8_avr32(dst, pixels, line_size, h);\
1938 + PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\
1939 + }\
1940 + void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
1941 + PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\
1942 + PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\
1943 + }\
1944 + void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
1945 + PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\
1946 + PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\
1947 + }\
1948 + void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \
1949 + PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\
1950 + PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\
1951 + }\
1954 +dspfunc16(put)
1955 +dspfunc16(put_no_rnd)
1956 +dspfunc16(avg)
1957 +dspfunc16(avg_no_rnd)
1958 +#undef dspfunc16
1960 +static int pix_sum_avr32(uint8_t * pix, int line_size)
1962 + int s, i;
1964 + s = 0;
1965 + for (i = 0; i < 16; i++) {
1966 + int tmp1,tmp2,tmp3,tmp4,tmp5;
1967 + __asm__ volatile ( "ld.w\t%0, %6[0]\n\t"
1968 + "ld.w\t%1, %6[4]\n\t"
1969 + "ld.w\t%2, %6[8]\n\t"
1970 + "ld.w\t%3, %6[12]\n\t"
1971 + "punpckub.h\t%4, %0:t\n\t"
1972 + "padd.h\t%5, %5, %4\n\t"
1973 + "punpckub.h\t%4, %0:b\n\t"
1974 + "padd.h\t%5, %5, %4\n\t"
1975 + "punpckub.h\t%4, %1:t\n\t"
1976 + "padd.h\t%5, %5, %4\n\t"
1977 + "punpckub.h\t%4, %1:b\n\t"
1978 + "padd.h\t%5, %5, %4\n\t"
1979 + "punpckub.h\t%4, %2:t\n\t"
1980 + "padd.h\t%5, %5, %4\n\t"
1981 + "punpckub.h\t%4, %2:b\n\t"
1982 + "padd.h\t%5, %5, %4\n\t"
1983 + "punpckub.h\t%4, %3:t\n\t"
1984 + "padd.h\t%5, %5, %4\n\t"
1985 + "punpckub.h\t%4, %3:b\n\t"
1986 + "padd.h\t%5, %5, %4\n\t"
1987 + : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"=&r"(s)
1988 + : "r"(pix));
1989 + pix += line_size;
1991 + __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "=&r" (s) );
1993 + return s;
1997 +//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
1998 +//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
1999 +//#define H264_WEIGHT(W,H) \
2000 +//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2001 +// int attribute_unused x, y; \
2002 +// offset <<= log2_denom; \
2003 +// if(log2_denom) offset += 1<<(log2_denom-1); \
2004 +// for(y=0; y<H; y++, block += stride){ \
2005 +// uint32_t tmp0, tmp1;
2006 +// if(W==2) { \
2007 +// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \
2008 +// "ld.ub\t%[tmp1], %[block][1]\n" \
2009 +// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \
2010 +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
2011 +// "asr\t%[tmp0], %[log2_denom]\n" \
2012 +// "asr\t%[tmp1], %[log2_denom]\n" \
2013 +// "satu\t%[tmp0] >> 0, 8\n" \
2014 +// "satu\t%[tmp1] >> 0, 8\n" \
2015 +// "st.b\t%[block][0], %[tmp0]\n" \
2016 +// "st.b\t%[block][1], %[tmp1]\n" \
2017 +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
2018 +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
2019 +// } else if ( W==4 ) { \
2020 +// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \
2021 +// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \
2022 +// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \
2023 +// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \
2024 +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \
2025 +// "asr\t%[tmp0], %[log2_denom]\n" \
2026 +// "asr\t%[tmp1], %[log2_denom]\n" \
2027 +// "satu\t%[tmp0] >> 0, 8\n" \
2028 +// "satu\t%[tmp1] >> 0, 8\n" \
2029 +// "st.b\t%[block][0], %[tmp0]\n" \
2030 +// "st.b\t%[block][1], %[tmp1]\n" \
2031 +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \
2032 +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \
2036 +// if(W==4) continue; \
2037 +// op_scale1(4); \
2038 +// op_scale1(5); \
2039 +// op_scale1(6); \
2040 +// op_scale1(7); \
2041 +// if(W==8) continue; \
2042 +// op_scale1(8); \
2043 +// op_scale1(9); \
2044 +// op_scale1(10); \
2045 +// op_scale1(11); \
2046 +// op_scale1(12); \
2047 +// op_scale1(13); \
2048 +// op_scale1(14); \
2049 +// op_scale1(15); \
2050 +// } \
2051 +//} \
2052 +//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
2053 +// int attribute_unused x, y; \
2054 +// int offset = (offsets + offsetd + 1) >> 1; \
2055 +// offset = ((offset << 1) + 1) << log2_denom; \
2056 +// for(y=0; y<H; y++, dst += stride, src += stride){ \
2057 +// op_scale2(0); \
2058 +// op_scale2(1); \
2059 +// if(W==2) continue; \
2060 +// op_scale2(2); \
2061 +// op_scale2(3); \
2062 +// if(W==4) continue; \
2063 +// op_scale2(4); \
2064 +// op_scale2(5); \
2065 +// op_scale2(6); \
2066 +// op_scale2(7); \
2067 +// if(W==8) continue; \
2068 +// op_scale2(8); \
2069 +// op_scale2(9); \
2070 +// op_scale2(10); \
2071 +// op_scale2(11); \
2072 +// op_scale2(12); \
2073 +// op_scale2(13); \
2074 +// op_scale2(14); \
2075 +// op_scale2(15); \
2076 +// } \
2077 +//}
2081 +/* Returns zero in each byte where the absolute difference between <a> and <b>
2082 + is not less than <compare> */
2083 +#define PABS_DIFF_LESS_THAN( a, b, compare) \
2084 + ({ uint32_t __tmp__, __tmp2__, __mask__; \
2085 + asm ( \
2086 + /* Check ABS( a - b ) < compare */ \
2087 + "psubs.ub\t%[tmp], %[opa], %[opb]\n" \
2088 + "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \
2089 + "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \
2090 + /* This produces 0 for all bytes where the comparison is not true */ \
2091 + "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \
2092 + : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \
2093 + : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \
2094 + __mask__; })
2096 +/*
2097 + Set all bytes containing zero in <value> to 255 and the rest to zero.
2099 + Add with saturation 254 to all bytes making all bytes different from
2100 + zero become 255. Then add one without saturation to make all bytes
2101 + originally containing zero 255 and the rest 0. */
2102 +#define SET_ALL_BITS_IN_ZERO_BYTES(value) \
2103 + ({ uint32_t __tmp__; \
2104 + asm ( \
2105 + "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \
2106 + "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \
2107 + : [tmp] "=r"(__tmp__) \
2108 + : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \
2109 + __tmp__; })
2111 +#define PACKW_SH(upper, lower) \
2112 + ({ uint32_t __tmp__; \
2113 + asm ( \
2114 + "packw.sh\t%[tmp], %[u], %[l]\n" \
2115 + : [tmp] "=r"(__tmp__) \
2116 + : [u] "r"(upper), [l] "r"(lower) ); \
2117 + __tmp__; })
2119 +#define PACKSH_UB(upper, lower) \
2120 + ({ uint32_t __tmp__; \
2121 + asm ( \
2122 + "packsh.sb\t%[tmp], %[u], %[l]\n" \
2123 + : [tmp] "=r"(__tmp__) \
2124 + : [u] "r"(upper), [l] "r"(lower) ); \
2125 + __tmp__; })
2127 +static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2129 + int i;
2131 + if ( alpha == 0 )
2132 + return;
2134 + alpha = PACKW_SH(alpha, alpha);
2135 + alpha = PACKSH_UB(alpha, alpha);
2136 + beta = PACKW_SH(beta, beta);
2137 + beta = PACKSH_UB(beta, beta);
2139 + for( i = 0; i < 4; i++ ) {
2140 + uint32_t p0, p1, p2, q0, q1, q2;
2141 + uint32_t mask, mask2;
2142 + uint32_t tmp, tmp2, tmp3, tmp4;
2144 + if( tc0[i] < 0 ) {
2145 + pix += 4;
2146 + continue;
2149 +/* for( d = 0; d < 4; d++ ) {
2150 + const int p0 = pix[-1*stride];
2151 + const int p1 = pix[-2*stride];
2152 + const int p2 = pix[-3*stride];
2153 + const int q0 = pix[0];
2154 + const int q1 = pix[1*stride];
2155 + const int q2 = pix[2*stride];
2157 + if( ABS( p0 - q0 ) < alpha &&
2158 + ABS( p1 - p0 ) < beta &&
2159 + ABS( q1 - q0 ) < beta ) { */
2161 + p0 = LD32(pix - stride);
2162 + p1 = LD32(pix - 2*stride);
2163 + q0 = LD32(pix);
2164 + q1 = LD32(pix + stride);
2166 + /* Check which of the columns should be filtered, if any. */
2167 + mask = PABS_DIFF_LESS_THAN(p0, q0, alpha);
2168 + mask |= PABS_DIFF_LESS_THAN(p1, p0, beta);
2169 + mask |= PABS_DIFF_LESS_THAN(q1, q0, beta);
2171 + if ( !mask )
2172 + continue;
2174 + mask = SET_ALL_BITS_IN_ZERO_BYTES(mask);
2177 + int tc = PACKW_SH(tc0[i], tc0[i]);
2178 + int tc0_p = tc;
2179 + int tc0_m = PACKW_SH(-tc0[i], -tc0[i]);
2181 + /*
2182 + int i_delta;
2183 + if( ABS( p2 - p0 ) < beta ) {
2184 + pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2185 + tc++;
2186 + }*/
2188 + p2 = LD32(pix - 3*stride);
2189 + mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask;
2191 + if ( mask2 ){
2192 + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
2193 + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
2194 + "paddh.ub\t%[tmp], %[tmp], %[p2]\n"
2195 + "punpckub.h\t%[tmp2], %[tmp]:t\n"
2196 + "punpckub.h\t%[tmp], %[tmp]:b\n"
2197 + "punpckub.h\t%[tmp3], %[p1]:t\n"
2198 + "punpckub.h\t%[tmp4], %[p1]:b\n"
2199 + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
2200 + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
2201 + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
2202 + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
2203 + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
2204 + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
2205 + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
2206 + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
2207 + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
2208 + "andn\t%[tmp], %[mask2]\n"
2209 + "and\t%[tmp2], %[q1], %[mask2]\n"
2210 + "or\t%[tmp], %[tmp2]\n"
2211 + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
2212 + [tmp4]"=&r"(tmp4)
2213 + : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p),
2214 + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
2215 + ST32(pix - 2*stride, tmp);
2216 + tc += 0x00010001;
2220 + q2 = LD32(pix + 2*stride);
2222 + /*
2223 + if( ABS( q2 - q0 ) < beta ) {
2224 + pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2225 + tc++;
2227 + */
2228 + mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask;
2230 + if ( mask2 ){
2231 + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2);
2232 + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n"
2233 + "paddh.ub\t%[tmp], %[tmp], %[q2]\n"
2234 + "punpckub.h\t%[tmp2], %[tmp]:t\n"
2235 + "punpckub.h\t%[tmp], %[tmp]:b\n"
2236 + "punpckub.h\t%[tmp3], %[q1]:t\n"
2237 + "punpckub.h\t%[tmp4], %[q1]:b\n"
2238 + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n"
2239 + "psub.h\t%[tmp], %[tmp], %[tmp4]\n"
2240 + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n"
2241 + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n"
2242 + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n"
2243 + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n"
2244 + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n"
2245 + "padd.h\t%[tmp], %[tmp], %[tmp4]\n"
2246 + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n"
2247 + "andn\t%[tmp], %[mask2]\n"
2248 + "and\t%[tmp2], %[q1], %[mask2]\n"
2249 + "or\t%[tmp], %[tmp2]\n"
2250 + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
2251 + [tmp4]"=&r"(tmp4)
2252 + : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p),
2253 + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2));
2254 + ST32(pix + stride, tmp);
2255 + tc += 0x00010001;
2258 + uint32_t old_p0 = p0;
2259 + uint32_t old_q0 = q0;
2261 + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2262 + pix[-stride] = clip_uint8( p0 + i_delta );
2263 + pix[0] = clip_uint8( q0 - i_delta ); */
2265 + asm (
2266 + /* Check if the two upper pixels should be filtered */
2267 + "lsr\t%[tmp], %[inv_mask], 16\n"
2268 + "breq\t0f\n"
2270 + "punpckub.h\t%[tmp], %[p1]:t\n"
2271 + "punpckub.h\t%[tmp2], %[q1]:t\n"
2273 + /* p1 - q1 */
2274 + "psub.h\t%[tmp], %[tmp], %[tmp2]\n"
2276 + "punpckub.h\t%[tmp3], %[q0]:t\n"
2277 + "punpckub.h\t%[tmp4], %[p0]:t\n"
2279 + /* q0 - p0 */
2280 + "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n"
2282 + /* (q0 - p0) << 2 */
2283 + "plsl.h\t%[tmp2], %[tmp2], 2\n"
2285 + /* ((q0 - p0) << 2) + (p1 - q1) */
2286 + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
2288 + "mov\t%[tmp], 0x00040004\n"
2289 + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
2290 + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n"
2292 + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
2293 + "pasr.h\t%[tmp2], %[tmp2], 3\n"
2295 + "mov\t%[tmp], 0\n"
2296 + "psub.h\t%[tmp], %[tmp], %[tc]\n"
2298 + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
2299 + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
2300 + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
2303 + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
2304 + "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n"
2307 + /* pix[0] = clip_uint8( q0 - i_delta ); */
2308 + "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n"
2310 + /* Check if the two lower pixels should be filtered */
2311 + "lsl\t%[tmp2], %[inv_mask], 16\n"
2312 + "breq\t1f\n"
2314 + "0:\n"
2315 + "punpckub.h\t%[p1], %[p1]:b\n"
2316 + "punpckub.h\t%[q1], %[q1]:b\n"
2318 + /* p1 - q1 */
2319 + "psub.h\t%[p1], %[p1], %[q1]\n"
2321 + "punpckub.h\t%[q0], %[q0]:b\n"
2322 + "punpckub.h\t%[p0], %[p0]:b\n"
2324 + /* q0 - p0 */
2325 + "psub.h\t%[tmp2], %[q0], %[p0]\n"
2327 + /* (q0 - p0) << 2 */
2328 + "plsl.h\t%[tmp2], %[tmp2], 2\n"
2330 + /* ((q0 - p0) << 2) + (p1 - q1) */
2331 + "padd.h\t%[tmp2], %[tmp2], %[p1]\n"
2333 + "mov\t%[q1], 0x00040004\n"
2334 + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/
2335 + "padd.h\t%[tmp2], %[tmp2], %[q1]\n"
2337 + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/
2338 + "pasr.h\t%[tmp2], %[tmp2], 3\n"
2340 + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */
2341 + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n"
2342 + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n"
2344 + /* pix[-stride] = clip_uint8( p0 + i_delta ); */
2345 + "padd.h\t%[p0], %[p0], %[tmp2]\n"
2347 + /* pix[0] = clip_uint8( q0 - i_delta ); */
2348 + "psub.h\t%[q0], %[q0], %[tmp2]\n"
2350 + "1:\n"
2351 + "packsh.ub\t%[p0], %[tmp4], %[p0]\n"
2352 + "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n"
2354 + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),
2355 + [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1)
2356 + : [tc]"r"(tc), [inv_mask]"r"(~mask));
2358 + ST32(pix - stride, (mask & old_p0) | (p0 & ~mask));
2359 + ST32(pix, (mask & old_q0) | (q0 & ~mask));
2362 + pix += 1;
2368 +#ifdef CHECK_DSP_FUNCS_AGAINST_C
2370 +void dump_block8(uint8_t *block, int line_size, int h){
2371 + int i, j;
2373 + for ( i = 0; i < h ; i++ ){
2374 + av_log(NULL, AV_LOG_ERROR, "\t");
2375 + for ( j = 0; j < 8 ; j++ ){
2376 + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
2378 + av_log(NULL, AV_LOG_ERROR, "\n");
2382 +void dump_block4(uint8_t *block, int line_size, int h){
2383 + int i, j;
2385 + for ( i = 0; i < h ; i++ ){
2386 + av_log(NULL, AV_LOG_ERROR, "\t");
2387 + for ( j = 0; j < 4 ; j++ ){
2388 + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
2390 + av_log(NULL, AV_LOG_ERROR, "\n");
2394 +void dump_block(uint8_t *block, int line_size, int h, int w){
2395 + int i, j;
2397 + for ( i = 0; i < h ; i++ ){
2398 + av_log(NULL, AV_LOG_ERROR, "\t");
2399 + for ( j = 0; j < w ; j++ ){
2400 + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]);
2402 + av_log(NULL, AV_LOG_ERROR, "\n");
2406 +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
2407 + int h, char *name, int max_dev){
2408 + int i,j;
2409 + for ( i = 0; i < 8 ; i++ ){
2410 + for ( j = 0; j < h ; j++ ){
2411 + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
2412 + diff = diff < 0 ? -diff : diff;
2413 + if ( diff > max_dev ){
2414 + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
2415 + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
2416 + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
2417 + dump_block8(test, line_size_test, h);
2418 + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
2419 + dump_block8(correct, line_size_correct, h);
2420 + exit(1);
2423 + }
2426 +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
2427 + int h, char *name, int max_dev){
2428 + int i,j;
2429 + for ( i = 0; i < 4 ; i++ ){
2430 + for ( j = 0; j < h ; j++ ){
2431 + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
2432 + diff = diff < 0 ? -diff : diff;
2433 + if ( diff > max_dev ){
2434 + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
2435 + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
2436 + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
2437 + dump_block8(test, line_size_test, h);
2438 + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
2439 + dump_block4(correct, line_size_correct, h);
2440 + exit(1);
2443 + }
2446 +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct,
2447 + int h, int width, char *name, int max_dev){
2448 + int i,j;
2449 + for ( i = 0; i < width ; i++ ){
2450 + for ( j = 0; j < h ; j++ ){
2451 + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j];
2452 + diff = diff < 0 ? -diff : diff;
2453 + if ( diff > max_dev ){
2454 + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n",
2455 + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]);
2456 + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name);
2457 + dump_block(test, line_size_test, h, width);
2458 + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n");
2459 + dump_block(correct, line_size_correct, h, width);
2460 + exit(1);
2463 + }
2466 +void dump_dct_block(DCTELEM *block){
2467 + int i, j;
2469 + for ( i = 0; i < 8 ; i++ ){
2470 + av_log(NULL, AV_LOG_ERROR, "\t");
2471 + for ( j = 0; j < 8 ; j++ ){
2472 + av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]);
2474 + av_log(NULL, AV_LOG_ERROR, "\n");
2478 +void test_idct_avr32(DCTELEM *block){
2479 + DCTELEM testBlock[64];
2480 + int i, j;
2482 + /* Copy transposed block to testBlock */
2483 + for ( i = 0; i < 8 ; i++ ){
2484 + for ( j = 0; j < 8 ; j++ ){
2485 + testBlock[i + 8*j] = block[j + i*8];
2489 + idct_avr32(block);
2490 + simple_idct(&testBlock);
2492 + for ( i = 0; i < 64 ; i++ ){
2493 + if ( block[i] != testBlock[i] ){
2494 + av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n");
2495 + dump_dct_block(block);
2496 + av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n");
2497 + dump_dct_block(testBlock);
2498 + exit(1);
2503 +void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){
2504 + uint8_t testBlock[64];
2505 + DCTELEM blockCopy[64];
2506 + int i, j;
2508 + /* Copy transposed block to blockCopy */
2509 + for ( i = 0; i < 8 ; i++ ){
2510 + for ( j = 0; j < 8 ; j++ ){
2511 + blockCopy[i + 8*j] = block[j + i*8];
2515 + idct_put_avr32(dest, line_size, block);
2516 + simple_idct_put(&testBlock, 8, blockCopy);
2518 + check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1);
2522 +void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){
2523 + uint8_t testBlock[64];
2524 + DCTELEM blockCopy[64];
2525 + int i, j;
2527 + /* Copy dest to testBlock */
2528 + for ( i = 0; i < 8 ; i++ ){
2529 + for ( j = 0; j < 8 ; j++ ){
2530 + testBlock[i + 8*j] = dest[i + j*line_size];
2534 + /* Copy transposed block to blockCopy */
2535 + for ( i = 0; i < 8 ; i++ ){
2536 + for ( j = 0; j < 8 ; j++ ){
2537 + blockCopy[i + 8*j] = block[j + i*8];
2541 + idct_add_avr32(dest, line_size, block);
2542 + simple_idct_add(&testBlock, 8, blockCopy);
2544 + check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1);
2547 +void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
2548 + uint8_t testBlock[16];
2549 + DCTELEM blockCopy[16];
2550 + int i, j;
2552 + /* Copy dest to testBlock */
2553 + for ( i = 0; i < 4 ; i++ ){
2554 + for ( j = 0; j < 4 ; j++ ){
2555 + testBlock[i + 4*j] = dest[i + j*stride];
2559 + /* Copy transposed block to blockCopy */
2560 + for ( i = 0; i < 16 ; i++ ){
2561 + blockCopy[i] = block[i];
2564 + ff_h264_idct_add_c(dest, block, stride);
2566 + h264_idct_add_avr32(testBlock, blockCopy, 4);
2568 + check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0);
2571 +void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){
2572 + uint8_t testBlock[8*8];
2573 + DCTELEM blockCopy[8*8];
2574 + int i, j;
2576 + /* Copy dest to testBlock */
2577 + for ( i = 0; i < 8 ; i++ ){
2578 + for ( j = 0; j < 8 ; j++ ){
2579 + testBlock[i + 8*j] = dest[i + j*stride];
2583 + /* Copy source block to blockCopy */
2584 + for ( i = 0; i < 8*8 ; i++ ){
2585 + blockCopy[i] = block[i];
2588 + ff_h264_idct8_add_c(dest, block, stride);
2589 + h264_idct8_add_avr32(testBlock, blockCopy, 8);
2591 + check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0);
2594 +void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block,
2595 + const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){
2596 + uint8_t *testBlock, *testBlock2;
2597 + int i, j;
2598 + int input_v_size = h + in_v_size;
2599 + int input_h_size = 8 + in_h_size;
2601 + testBlock = alloca(input_h_size*input_v_size);
2602 + testBlock2 = alloca(input_h_size*input_v_size);
2604 + for ( i = 0; i < input_h_size ; i++ ){
2605 + for ( j = 0; j < input_v_size ; j++ ){
2606 + testBlock[i + input_h_size*j] = pixels[i + j*line_size];
2610 + test(block, pixels, line_size, h);
2611 + correct(testBlock2, testBlock, input_h_size, h);
2613 + check_block8(block, testBlock2, line_size, input_h_size, h, name, 0);
2617 +void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst,
2618 + uint8_t *src, int stride, int h, int w, int x, int y, char *name){
2619 + uint8_t *testBlock, *testBlock2;
2620 + int i, j;
2621 + int input_v_size = h + 1;
2622 + int input_h_size = ((w + 1) + 3) & ~3;
2624 + testBlock = alloca(input_h_size*input_v_size);
2625 + testBlock2 = alloca(input_h_size*input_v_size);
2627 + for ( i = 0; i < w + 1 ; i++ ){
2628 + for ( j = 0; j < h + 1 ; j++ ){
2629 + testBlock[i + input_h_size*j] = src[i + j*stride];
2633 + for ( i = 0; i < w ; i++ ){
2634 + for ( j = 0; j < h ; j++ ){
2635 + testBlock2[i + input_h_size*j] = dst[i + j*stride];
2639 + test(dst, src, stride, h, x, y);
2640 + correct(testBlock2, testBlock, input_h_size, h, x, y);
2642 + check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0);
2646 +void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst,
2647 + uint8_t *src, int stride, int size, char *name){
2648 + uint8_t *testBlock, *testBlock2;
2649 + int i, j;
2650 + int test_stride = size + 8;
2652 + testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4;
2653 + testBlock2 = alloca(test_stride*size);
2655 + for ( i = -4; i < size+4 ; i++ ){
2656 + for ( j = -4; j < size+4 ; j++ ){
2657 + testBlock[i + test_stride*j] = src[i + j*stride];
2661 + for ( i = 0; i < size ; i++ ){
2662 + for ( j = 0; j < size ; j++ ){
2663 + testBlock2[i + test_stride*j] = dst[i + j*stride];
2667 + correct(dst, src, stride);
2668 + test(testBlock2, testBlock, test_stride);
2670 + check_block(testBlock2, dst, test_stride, stride, size, size, name, 0);
2675 +#define test_pixels_funcs(PFX, NUM ) \
2676 +void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
2677 + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \
2678 + block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \
2679 +void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
2680 + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \
2681 + block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \
2682 +void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
2683 + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \
2684 + block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \
2685 +void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \
2686 + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \
2687 + block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); }
2689 +test_pixels_funcs(put, 8);
2690 +test_pixels_funcs(put_no_rnd, 8);
2691 +test_pixels_funcs(put, 16);
2692 +test_pixels_funcs(put_no_rnd, 16);
2694 +test_pixels_funcs(avg, 8);
2695 +test_pixels_funcs(avg_no_rnd, 8);
2696 +test_pixels_funcs(avg, 16);
2697 +test_pixels_funcs(avg_no_rnd, 16);
2699 +#define test_h264_chroma_mc_funcs(PFX, NUM ) \
2700 +void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \
2701 + test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \
2702 + dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \
2704 +test_h264_chroma_mc_funcs(put, 2);
2705 +test_h264_chroma_mc_funcs(put, 4);
2706 +test_h264_chroma_mc_funcs(put, 8);
2707 +test_h264_chroma_mc_funcs(avg, 2);
2708 +test_h264_chroma_mc_funcs(avg, 4);
2709 +test_h264_chroma_mc_funcs(avg, 8);
2711 +#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \
2712 +void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \
2713 + test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \
2714 + dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); }
2716 +#define test_qpel_mc_funcs(PFX, NUM) \
2717 + test_qpel_mc_funcs_type(PFX, NUM, mc00);\
2718 + test_qpel_mc_funcs_type(PFX, NUM, mc10);\
2719 + test_qpel_mc_funcs_type(PFX, NUM, mc20);\
2720 + test_qpel_mc_funcs_type(PFX, NUM, mc30);\
2721 + test_qpel_mc_funcs_type(PFX, NUM, mc01);\
2722 + test_qpel_mc_funcs_type(PFX, NUM, mc11);\
2723 + test_qpel_mc_funcs_type(PFX, NUM, mc21);\
2724 + test_qpel_mc_funcs_type(PFX, NUM, mc31);\
2725 + test_qpel_mc_funcs_type(PFX, NUM, mc02);\
2726 + test_qpel_mc_funcs_type(PFX, NUM, mc12);\
2727 + test_qpel_mc_funcs_type(PFX, NUM, mc22);\
2728 + test_qpel_mc_funcs_type(PFX, NUM, mc32);\
2729 + test_qpel_mc_funcs_type(PFX, NUM, mc03);\
2730 + test_qpel_mc_funcs_type(PFX, NUM, mc13);\
2731 + test_qpel_mc_funcs_type(PFX, NUM, mc23);\
2732 + test_qpel_mc_funcs_type(PFX, NUM, mc33)
2734 +test_qpel_mc_funcs(put_h264_qpel, 4);
2735 +test_qpel_mc_funcs(put_h264_qpel, 8);
2736 +test_qpel_mc_funcs(put_h264_qpel, 16);
2737 +test_qpel_mc_funcs(avg_h264_qpel, 4);
2738 +test_qpel_mc_funcs(avg_h264_qpel, 8);
2739 +test_qpel_mc_funcs(avg_h264_qpel, 16);
2742 +#define dspfunc(PFX, IDX, NUM) \
2743 + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
2744 + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
2745 + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
2746 + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
2747 + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
2748 + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
2749 + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
2750 + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
2751 + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
2752 + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
2753 + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
2754 + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
2755 + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
2756 + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
2757 + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
2758 + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
2760 +#endif
2762 +void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx)
2765 + /* H264 */
2767 + if ( 0 /*avr32_use_pico*/ ){
2768 + c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico);
2769 + c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico);
2770 + c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico);
2772 + c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico);
2773 + c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico);
2774 + c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico);
2777 +#define dspfunc(PFX, IDX, NUM) \
2778 + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \
2779 + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \
2780 + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \
2781 + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \
2782 + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \
2783 + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \
2784 + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \
2785 + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \
2786 + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \
2787 + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \
2788 + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \
2789 + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \
2790 + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \
2791 + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \
2792 + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \
2793 + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico )
2795 + if ( avr32_use_pico ){
2796 + dspfunc(put_h264_qpel, 0, 16);
2797 + dspfunc(put_h264_qpel, 1, 8);
2798 + dspfunc(put_h264_qpel, 2, 4);
2799 + dspfunc(avg_h264_qpel, 0, 16);
2800 + dspfunc(avg_h264_qpel, 1, 8);
2801 + dspfunc(avg_h264_qpel, 2, 4);
2804 + c->idct_put= DSP_FUNC_NAME(idct_put_avr32);
2805 + c->idct_add= DSP_FUNC_NAME(idct_add_avr32);
2806 + c->idct = DSP_FUNC_NAME(idct_avr32);
2807 + c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32);
2808 + c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32);
2810 + /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/
2812 + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2814 + c->fdct = fdct_avr32;
2816 + c->clear_blocks = clear_blocks_avr32;
2818 +#undef dspfunc
2819 +#define dspfunc(PFX, IDX, NUM) \
2820 + c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \
2821 + c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \
2822 + c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \
2823 + c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32)
2825 + dspfunc(put, 0, 16);
2826 + dspfunc(put_no_rnd, 0, 16);
2827 + dspfunc(put, 1, 8);
2828 + dspfunc(put_no_rnd, 1, 8);
2830 + dspfunc(avg, 1, 8);
2831 + dspfunc(avg_no_rnd, 1, 8);
2832 + dspfunc(avg, 0, 16);
2833 + dspfunc(avg_no_rnd, 0, 16);
2834 +#undef dspfunc
2840 +#if 0
2841 +int main(int argc, char *argv[]){
2845 +#endif
2847 diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S
2848 new file mode 100644
2849 index 0000000..be45b86
2850 --- /dev/null
2851 +++ b/libavcodec/avr32/fdct.S
2852 @@ -0,0 +1,541 @@
2854 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
2856 + * Redistribution and use in source and binary forms, with or without
2857 + * modification, are permitted provided that the following conditions
2858 + * are met:
2860 + * 1. Redistributions of source code must retain the above copyright
2861 + * notice, this list of conditions and the following disclaimer.
2863 + * 2. Redistributions in binary form must reproduce the above
2864 + * copyright notice, this list of conditions and the following
2865 + * disclaimer in the documentation and/or other materials provided
2866 + * with the distribution.
2868 + * 3. The name of ATMEL may not be used to endorse or promote products
2869 + * derived from this software without specific prior written
2870 + * permission.
2872 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
2873 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
2874 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2875 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
2876 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
2877 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
2878 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
2879 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
2880 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
2881 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2882 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
2883 + * DAMAGE.
2884 + */
2886 +//**********************************************************
2887 +//* 2-D fDCT, Based on: *
2888 +//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical *
2889 +//* Fast 1-D DCT Algorithms with 11 Multiplications", *
2890 +//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal *
2891 +//* Processing 1989 (ICASSP '89), pp. 988-991. *
2892 +//* *
2893 +//* Fixed point implementation optimized for the AVR-II *
2894 +//* instruction set. If a table is used for the *
2895 +//* coeffisients we can load two and two of them from *
2896 +//* This will give a reduction of
2897 +//* *
2898 +//* *
2899 +//**********************************************************
2902 +/* This routine is a slow-but-accurate integer implementation of the
2903 + * forward DCT (Discrete Cosine Transform). Taken from the IJG software
2905 + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
2906 + * on each column. Direct algorithms are also available, but they are
2907 + * much more complex and seem not to be any faster when reduced to code.
2909 + * This implementation is based on an algorithm described in
2910 + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
2911 + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
2912 + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
2913 + * The primary algorithm described there uses 11 multiplies and 29 adds.
2914 + * We use their alternate method with 12 multiplies and 32 adds.
2915 + * The advantage of this method is that no data path contains more than one
2916 + * multiplication; this allows a very simple and accurate implementation in
2917 + * scaled fixed-point arithmetic, with a minimal number of shifts.
2919 + * The poop on this scaling stuff is as follows:
2921 + * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
2922 + * larger than the true DCT outputs. The final outputs are therefore
2923 + * a factor of N larger than desired; since N=8 this can be cured by
2924 + * a simple right shift at the end of the algorithm. The advantage of
2925 + * this arrangement is that we save two multiplications per 1-D DCT,
2926 + * because the y0 and y4 outputs need not be divided by sqrt(N).
2927 + * In the IJG code, this factor of 8 is removed by the quantization step
2928 + * (in jcdctmgr.c), here it is removed.
2930 + * We have to do addition and subtraction of the integer inputs, which
2931 + * is no problem, and multiplication by fractional constants, which is
2932 + * a problem to do in integer arithmetic. We multiply all the constants
2933 + * by CONST_SCALE and convert them to integer constants (thus retaining
2934 + * CONST_BITS bits of precision in the constants). After doing a
2935 + * multiplication we have to divide the product by CONST_SCALE, with proper
2936 + * rounding, to produce the correct output. This division can be done
2937 + * cheaply as a right shift of CONST_BITS bits. We postpone shifting
2938 + * as long as possible so that partial sums can be added together with
2939 + * full fractional precision.
2941 + * The outputs of the first pass are scaled up by PASS1_BITS bits so that
2942 + * they are represented to better-than-integral precision. These outputs
2943 + * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word
2944 + * with the recommended scaling. (For 12-bit sample data, the intermediate
2945 + * array is INT32 anyway.)
2947 + * To avoid overflow of the 32-bit intermediate results in pass 2, we must
2948 + * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis
2949 + * shows that the values given below are the most effective.
2951 + * We can gain a little more speed, with a further compromise in accuracy,
2952 + * by omitting the addition in a descaling shift. This yields an incorrectly
2953 + * rounded result half the time...
2954 + */
2956 + .global fdct_avr32
2960 +#define CONST_BITS 13
2961 +#define PASS1_BITS 2
2963 +#define FIX_0_298631336 2446 /* FIX(0.298631336) */
2964 +#define FIX_0_390180644 3196 /* FIX(0.390180644) */
2965 +#define FIX_0_541196100 4433 /* FIX(0.541196100) */
2966 +#define FIX_0_765366865 6270 /* FIX(0.765366865) */
2967 +#define FIX_0_899976223 7373 /* FIX(0.899976223) */
2968 +#define FIX_1_175875602 9633 /* FIX(1.175875602) */
2969 +#define FIX_1_501321110 12299 /* FIX(1.501321110) */
2970 +#define FIX_1_847759065 15137 /* FIX(1.847759065) */
2971 +#define FIX_1_961570560 16069 /* FIX(1.961570560) */
2972 +#define FIX_2_053119869 16819 /* FIX(2.053119869) */
2973 +#define FIX_2_562915447 20995 /* FIX(2.562915447) */
2974 +#define FIX_3_072711026 25172 /* FIX(3.072711026) */
2978 + * Perform an integer forward DCT on one block of samples.
2979 + */
2981 +//void
2982 +//fdct_int32(short *const block)
2983 +//{
2984 +// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2985 +// int tmp10, tmp11, tmp12, tmp13;
2986 +// int z1, z2, z3, z4, z5;
2987 +// short *blkptr;
2988 +// int *dataptr;
2989 +// int data[64];
2990 +// int i;
2992 +// /* Pass 1: process rows. */
2993 +// /* Note results are scaled up by sqrt(8) compared to a true DCT; */
2994 +// /* furthermore, we scale the results by 2**PASS1_BITS. */
2996 +// dataptr = data;
2997 +// blkptr = block;
2999 + .text
3000 +fdct_avr32:
3001 + pushm r0-r3, r4-r7, lr
3002 +#define loop_ctr r0
3003 +#define blkptr r12
3004 +#define x0 r1
3005 +#define x1 r2
3006 +#define x2 r3
3007 +#define x3 r4
3008 +#define x4 r5
3009 +#define x5 r6
3010 +#define x6 r7
3011 +#define x7 r8
3012 +#define tmp0 r5
3013 +#define tmp7 r2
3014 +#define tmp1 r3
3015 +#define tmp6 r4
3016 +#define tmp2 r9
3017 +#define tmp5 r8
3018 +#define tmp3 r7
3019 +#define tmp4 r6
3022 + mov loop_ctr, 8
3023 +// for (i = 0; i < 8; i++) {
3024 +ROW_LOOP:
3026 + ldm blkptr, r1, r2, r3, r4
3028 +// tmp2 = blkptr[2] + blkptr[5];
3029 +// tmp3 = blkptr[3] + blkptr[4];
3030 + paddx.h r5, r3, r2
3031 +// tmp5 = blkptr[2] - blkptr[5];
3032 +// tmp4 = blkptr[3] - blkptr[4];
3033 + psubx.h r6, r3, r2
3034 +// tmp0 = blkptr[0] + blkptr[7];
3035 +// tmp1 = blkptr[1] + blkptr[6];
3036 + paddx.h r2, r4, r1
3037 +// tmp7 = blkptr[0] - blkptr[7];
3038 +// tmp6 = blkptr[1] - blkptr[6];
3039 + psubx.h r3, r4, r1
3041 +// /* Even part per LL&M figure 1 --- note that published figure is faulty;
3042 +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3043 +// */
3045 +#define tmp10 r1
3046 +#define tmp13 r5
3047 +#define tmp11 r7
3048 +#define tmp12 r3
3049 +#define z1 r9
3051 +// tmp10 = tmp0 + tmp3;
3052 +// tmp13 = tmp0 - tmp3;
3053 + paddsub.h r1, r2:t, r5:b
3054 +// tmp11 = tmp1 + tmp2;
3055 +// tmp12 = tmp1 - tmp2;
3056 + paddsub.h r4, r2:b, r5:t
3059 +// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS;
3060 +// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS;
3061 + paddsub.h r7, r1:t, r4:t
3062 + ld.w r10, pc[const_table - .]
3063 + plsl.h r7, r7, PASS1_BITS
3065 +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
3066 + addhh.w r8, r4:b, r1:b
3067 + mulhh.w r8, r8:b, r10:t
3069 +// dataptr[2] =
3070 +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS);
3071 +// dataptr[6] =
3072 +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS);
3073 + mulhh.w r9, r1:b, r10:b
3074 + ld.w r10, pc[const_table - . + 4]
3075 + add r1, r8, r9
3076 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
3078 + mulhh.w r9, r4:b, r10:t
3079 + add r4, r8, r9
3080 + satrnds r4 >> (CONST_BITS - PASS1_BITS), 31
3083 +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3084 +// * cK represents cos(K*pi/16).
3085 +// * i0..i3 in the paper are tmp4..tmp7 here.
3086 +// */
3088 +#define z2 r5
3089 +#define z3 r6
3090 +#define z4 r7
3091 +#define z5 r8
3093 +// z4 = tmp5 + tmp7;
3094 +// z3 = tmp4 + tmp6;
3095 + padd.h r2, r6, r3
3096 +// z2 = tmp5 + tmp6;
3097 +// z1 = tmp4 + tmp7;
3098 + paddx.h r5, r6, r3
3100 + lddpc r9, pc[const_table - . + 8]
3101 +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
3102 + addhh.w r8, r2:t, r2:b
3103 + mulhh.w r8, r8:b, r10:b
3104 + lddpc r10, pc[const_table - . + 12]
3107 +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
3108 + mulhh.w r11, r6:b, r9:t
3110 +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
3111 + mulhh.w r6, r6:t, r9:b
3113 +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
3114 + lddpc r9, pc[const_table - . + 20]
3115 + mulhh.w lr, r3:b, r10:t
3117 +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
3118 + mulhh.w r3, r3:t, r10:b
3120 +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
3121 + mulhh.w r10, r2:b, r9:t
3123 +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
3124 + mulhh.w r2, r2:t, r9:b
3125 + lddpc r9, pc[const_table - . + 16]
3126 +// z3 += z5;
3127 +// z4 += z5;
3128 + add r10, r8
3129 + add r2, r8
3131 +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
3132 + mulhh.w r8, r5:b, r9:t
3134 +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
3135 + mulhh.w r5, r5:t, r9:b
3137 +// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
3138 + add r11, r8
3139 + add r11, r10
3140 + satrnds r11 >> (CONST_BITS - PASS1_BITS), 31
3142 +// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
3143 + add r6, r5
3145 + sthh.w blkptr[6*2], r4:b, r11:b
3146 + add r6, r2
3147 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
3149 +// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
3150 + add lr, r5
3151 + sthh.w blkptr[4*2], r7:b, r6:b
3152 + add lr, r10
3153 + satrnds lr >> (CONST_BITS - PASS1_BITS), 31
3155 +// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
3156 + add r3, r8
3157 + sthh.w blkptr[2*2], r1:b, lr:b
3158 + add r3, r2
3159 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
3163 +// dataptr += 8; /* advance pointer to next row */
3164 +// blkptr += 8;
3165 + sthh.w blkptr[0], r7:t, r3:b
3166 + sub blkptr, -16
3167 + sub loop_ctr, 1
3168 + brne ROW_LOOP
3170 +// }
3172 + /* Pass 2: process columns.
3173 + * We remove the PASS1_BITS scaling, but leave the results scaled up
3174 + * by an overall factor of 8.
3175 + */
3177 +// dataptr = data;
3178 + sub blkptr, 128
3180 + mov loop_ctr, 4
3181 +// for (i = 0; i < 8; i++) {
3182 +COLOUMN_LOOP:
3183 + ld.w r1, blkptr[0]
3184 + ld.w r2, blkptr[1*8*2]
3185 + ld.w r3, blkptr[2*8*2]
3186 + ld.w r4, blkptr[3*8*2]
3187 + ld.w r5, blkptr[4*8*2]
3188 + ld.w r6, blkptr[5*8*2]
3189 + ld.w r7, blkptr[6*8*2]
3190 + ld.w r8, blkptr[7*8*2]
3192 +// tmp0 = blkptr[0] + blkptr[7*8];
3193 + padds.sh r9, r1, r8
3194 +// tmp7 = blkptr[0] - blkptr[7*8];
3195 + psubs.sh r1, r1, r8
3196 +// tmp1 = blkptr[1*8] + blkptr[6*8];
3197 + padds.sh r8, r2, r7
3198 +// tmp6 = blkptr[1*8] - blkptr[6*8];
3199 + psubs.sh r2, r2, r7
3200 +// tmp2 = blkptr[2*8] + blkptr[5*8];
3201 + padds.sh r7, r3, r6
3202 +// tmp5 = blkptr[2*8] - blkptr[5*8];
3203 + psubs.sh r3, r3, r6
3204 +// tmp3 = blkptr[3*8] + blkptr[4*8];
3205 + padds.sh r6, r4, r5
3206 +// tmp4 = blkptr[3*8] - blkptr[4*8];
3207 + psubs.sh r4, r4, r5
3209 +// /* even part per ll&m figure 1 --- note that published figure is faulty;
3210 +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
3211 +// */
3213 +// tmp10 = tmp0 + tmp3;
3214 + padds.sh r5, r9, r6
3215 +// tmp13 = tmp0 - tmp3;
3216 + psubs.sh r9, r9, r6
3217 +// tmp11 = tmp1 + tmp2;
3218 + padds.sh r6, r8, r7
3219 +// tmp12 = tmp1 - tmp2;
3220 + psubs.sh r8, r8, r7
3222 +// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS);
3223 +// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS);
3224 +//Might get an overflow here
3225 + padds.sh r7, r5, r6
3226 + psubs.sh r5, r5, r6
3228 + //Rounding
3229 + mov lr, (1 << (PASS1_BITS + 2))
3230 + orh lr, hi(1 << (16 + PASS1_BITS + 2))
3231 + padds.sh r7, r7, lr
3232 + padds.sh r5, r5, lr
3234 + pasr.h r7, r7, PASS1_BITS + 3
3235 + pasr.h r5, r5, PASS1_BITS + 3
3236 + st.w r12[0], r7
3237 + st.w r12[4*8*2], r5
3239 + lddpc r10, const_table2
3242 +// z1 = (tmp12 + tmp13) * FIX_0_541196100;
3243 + padds.sh r5, r8, r9
3244 + mulhh.w r6, r5:t, r10:t
3245 + mulhh.w r7, r5:b, r10:t
3247 +// dataptr[16] =
3248 +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS);
3249 + lddpc r11, const_table2 + 4
3250 + mulhh.w lr, r9:t, r10:b
3251 + mulhh.w r9, r9:b, r10:b
3252 + add lr, r6
3253 + add r9, r7
3254 + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
3255 + satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31
3256 + sthh.w r12[2*8*2], lr:b, r9:b
3258 +// dataptr[48] =
3259 +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS);
3260 + mulhh.w lr, r8:t, r11:t
3261 + mulhh.w r8, r8:b, r11:t
3262 + add lr, r6
3263 + add r8, r7
3264 + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31
3265 + satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31
3266 + sthh.w r12[6*8*2], lr:b, r8:b
3268 +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3269 +// * cK represents cos(K*pi/16).
3270 +// * i0..i3 in the paper are tmp4..tmp7 here.
3271 +// */
3273 +// z2 = tmp5 + tmp6;
3274 +// z3 = tmp4 + tmp6;
3275 +// z4 = tmp5 + tmp7;
3276 + padds.sh r5, r3, r2
3277 + padds.sh r6, r4, r2
3278 + padds.sh r7, r3, r1
3280 +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */
3281 + padds.sh r8, r6, r7
3282 + mulhh.w r9, r8:t, r11:b
3283 + mulhh.w r8, r8:b, r11:b
3285 +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */
3286 +// z3 += z5;
3287 + lddpc r11, const_table2 + 8
3288 + mulhh.w r10, r6:t, r11:t
3289 + mulhh.w r6, r6:b, r11:t
3290 + add r10, r9
3291 + add r6, r8
3293 +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */
3294 +// z4 += z5;
3295 + mulhh.w lr, r7:t, r11:b
3296 + mulhh.w r7, r7:b, r11:b
3297 + lddpc r11, const_table2 + 12
3298 + st.w --sp,r0
3299 + add lr, r9
3300 + add r7, r8
3302 +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */
3303 + mulhh.w r0, r2:t, r11:t
3304 + machh.w r0, r5:t, r11:b
3305 + mulhh.w r2, r2:b, r11:t
3306 + machh.w r2, r5:b, r11:b
3308 +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */
3309 +// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS);
3310 + add r0, r10
3311 + lddpc r11, const_table2 + 16
3312 + add r2, r6
3313 + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
3314 + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
3315 + sthh.w r12[3*8*2], r0:b, r2:b
3316 +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */
3317 + mulhh.w r0, r3:t, r11:t
3318 + machh.w r0, r5:t, r11:b
3319 + mulhh.w r2, r3:b, r11:t
3320 + machh.w r2, r5:b, r11:b
3321 + add r0, lr
3322 + lddpc r11, const_table2 + 20
3323 + add r2, r7
3325 +// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS);
3326 + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31
3327 + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31
3328 + sthh.w r12[5*8*2], r0:b, r2:b
3331 +// z1 = tmp4 + tmp7;
3332 + padds.sh r2, r4, r1
3334 +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */
3335 + mulhh.w r3, r4:t, r11:t
3336 + machh.w r3, r2:t, r11:b
3337 + mulhh.w r4, r4:b, r11:t
3338 + machh.w r4, r2:b, r11:b
3339 + add r3, r10
3340 + lddpc r11, const_table2 + 24
3341 + add r4, r6
3343 +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */
3344 +// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS);
3345 + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
3346 + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
3347 + sthh.w r12[7*8*2], r3:b, r4:b
3350 +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */
3351 + mulhh.w r3, r1:t, r11:t
3352 + machh.w r3, r2:t, r11:b
3353 + mulhh.w r4, r1:b, r11:t
3354 + machh.w r4, r2:b, r11:b
3355 + add r3, lr
3356 + add r4, r7
3358 +// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS);
3359 + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31
3360 + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31
3361 + sthh.w r12[1*8*2], r3:b, r4:b
3362 + ld.w r0, sp++
3364 +// dataptr++; /* advance pointer to next column */
3365 + sub blkptr, -4
3366 + sub loop_ctr, 1
3367 + brne COLOUMN_LOOP
3369 +// }
3371 + popm r0-r3, r4-r7, pc
3373 +// /* descale */
3374 +// for (i = 0; i < 64; i++)
3375 +// block[i] = (short int) DESCALE(data[i], 3);
3378 +//}
3381 + .align 2
3382 +const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
3383 + .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110
3384 + .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644
3386 +const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602
3387 + .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447
3388 + .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223
3389 + .short FIX_1_501321110, -FIX_0_899976223
3394 diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S
3395 new file mode 100644
3396 index 0000000..4b23e2d
3397 --- /dev/null
3398 +++ b/libavcodec/avr32/h264idct.S
3399 @@ -0,0 +1,451 @@
3401 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
3403 + * Redistribution and use in source and binary forms, with or without
3404 + * modification, are permitted provided that the following conditions
3405 + * are met:
3407 + * 1. Redistributions of source code must retain the above copyright
3408 + * notice, this list of conditions and the following disclaimer.
3410 + * 2. Redistributions in binary form must reproduce the above
3411 + * copyright notice, this list of conditions and the following
3412 + * disclaimer in the documentation and/or other materials provided
3413 + * with the distribution.
3415 + * 3. The name of ATMEL may not be used to endorse or promote products
3416 + * derived from this software without specific prior written
3417 + * permission.
3419 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
3420 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3421 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3422 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
3423 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
3424 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3425 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3426 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3427 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3428 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
3429 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
3430 + * DAMAGE.
3431 + */
3433 + .global h264_idct_add_avr32
3435 + /* Macro for performing the 1-D transform on one row line.
3437 + The register 'w01' should contain the first two pixels,
3438 + and the register 'w23' should contain the last two pixels
3439 + in the line. The resulting line is placed in p01 and p23
3440 + so that { w01, w23 } = { x0, x1, x3, x2 }.
3441 + 'tmp' and 'tmp2' should be scratchpad registers. */
3442 + .macro transform_row w01, w23, tmp, tmp2
3443 + add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */
3444 + sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */
3445 + bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */
3446 + pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */
3447 + paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */
3448 + padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */
3449 + psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */
3450 + .endm
3452 + /* Macro for performing the 1-D transform on two columns.
3454 + The registers w0, w1, w2, w3 should each contain two
3455 + packed samples from the two colomns to transform.
3456 + tmp and tmp2 are scratchpad registers.
3458 + The resulting transformed columns are placed in the
3459 + same positions as the input columns.
3460 + */
3461 + .macro transform_2columns w0, w1, w2, w3, tmp, tmp2
3462 + padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */
3463 + psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */
3464 + pasr.h \w2, \w1, 1 /* w2 = w1/2 */
3465 + pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */
3466 + psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */
3467 + padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */
3468 + padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */
3469 + psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */
3470 + padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */
3471 + psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */
3472 + /* Scale down result. */
3473 + pasr.h \w0, \w0, 6
3474 + pasr.h \w1, \w1, 6
3475 + pasr.h \w2, \w2, 6
3476 + pasr.h \w3, \w3, 6
3477 + .endm
3479 +/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/
3481 +h264_idct_add_avr32:
3483 + stm --sp,r0-r3,r4-r7, lr
3485 + /* Setup rounding factor. */
3486 + mov r0, (1 << 5)
3487 + lsl r0, 16
3489 + /* Load block */
3490 + ldm r11,r2-r9
3491 + /* r9 = { w00, w01 },
3492 + r8 = { w02, w03 },
3493 + r7 = { w10, w11 },
3494 + r6 = { w12, w13 },
3495 + r5 = { w20, w21 },
3496 + r4 = { w22, w23 },
3497 + r3 = { w30, w31 },
3498 + r2 = { w32, w33 } */
3501 + /* Add the rounding factor to w00. */
3502 + add r9, r0
3504 + /* Transform rows */
3505 + transform_row r9, r8, r0, r1
3506 + transform_row r7, r6, r0, r1
3507 + transform_row r5, r4, r0, r1
3508 + transform_row r3, r2, r0, r1
3510 + /* Transform columns */
3511 + transform_2columns r9, r7, r5, r3, r0, r1
3512 + transform_2columns r8, r6, r4, r2, r0, r1
3514 + /* Load predicted pixels.*/
3515 + ld.w lr, r12[0]
3516 + ld.w r11, r12[r10]
3518 + /* Unpack to halwords. */
3519 + punpckub.h r0, lr:t
3520 + punpckub.h r1, lr:b
3522 + /* Add with transformed row. */
3523 + padd.h r0, r0, r9
3524 + paddx.h r1, r1, r8
3525 + /* Pack and saturate back to 8-bit pixels. */
3526 + packsh.ub r0, r0, r1
3528 + /* Unpack to halwords. */
3529 + punpckub.h lr, r11:t
3530 + punpckub.h r11, r11:b
3532 + /* Add with transformed row. */
3533 + padd.h lr, lr, r7
3534 + paddx.h r11, r11, r6
3535 + /* Pack and saturate back to 8-bit pixels. */
3536 + packsh.ub r1, lr, r11
3538 + /* Store back to frame. */
3539 + st.w r12[0], r0
3540 + st.w r12[r10], r1
3542 + add r12, r12, r10 << 1
3544 + /* Load predicted pixels.*/
3545 + ld.w lr, r12[0]
3546 + ld.w r11, r12[r10]
3548 + /* Unpack to halwords. */
3549 + punpckub.h r0, lr:t
3550 + punpckub.h r1, lr:b
3552 + /* Add with transformed row. */
3553 + padd.h r0, r0, r5
3554 + paddx.h r1, r1, r4
3555 + /* Pack and saturate back to 8-bit pixels. */
3556 + packsh.ub r0, r0, r1
3558 + /* Unpack to halwords. */
3559 + punpckub.h lr, r11:t
3560 + punpckub.h r11, r11:b
3562 + /* Add with transformed row. */
3563 + padd.h lr, lr, r3
3564 + paddx.h r11, r11, r2
3565 + /* Pack and saturate back to 8-bit pixels. */
3566 + packsh.ub r1, lr, r11
3568 + /* Store back to frame. */
3569 + st.w r12[0], r0
3570 + st.w r12[r10], r1
3572 + ldm sp++,r0-r3,r4-r7, pc
3575 + .global h264_idct8_add_avr32
3576 +//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
3578 +h264_idct8_add_avr32:
3579 + stm --sp,r0-r3,r4-r7, lr
3581 + /* Push dst and stride on stack */
3582 + stm --sp,r10,r12
3584 +// int i;
3585 +// DCTELEM (*src)[8] = (DCTELEM(*)[8])block;
3586 +// uint8_t *cm = cropTbl + MAX_NEG_CROP;
3588 +// block[0] += 32;
3591 +// for( i = 0; i < 8; i++ )
3592 +// {
3593 + mov lr, 4
3595 + ld.w r7, r11[0*(8*2)]
3596 + ld.w r6, r11[1*(8*2)]
3597 + ld.w r5, r11[2*(8*2)]
3598 + ld.w r4, r11[3*(8*2)]
3599 + ld.w r3, r11[4*(8*2)]
3600 + ld.w r2, r11[5*(8*2)]
3601 + ld.w r1, r11[6*(8*2)]
3602 + ld.w r0, r11[7*(8*2)]
3604 +/*
3606 + const int a0 = src[0][i] + src[4][i];
3607 + const int a2 = src[0][i] - src[4][i];
3608 + const int a4 = (src[2][i]>>1) - src[6][i];
3609 + const int a6 = (src[6][i]>>1) + src[2][i];
3611 + padd.h r8, r7, r3 /* r8 = a0 */
3612 + psub.h r7, r7, r3 /* r7 = a2 */
3613 + pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */
3614 + pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */
3615 + psub.h r3, r3, r1 /* r3 = a4 */
3616 + padd.h r9, r9, r5 /* r9 = a6 */
3619 + const int b0 = a0 + a6;
3620 + const int b2 = a2 + a4;
3621 + const int b4 = a2 - a4;
3622 + const int b6 = a0 - a6;
3624 + padd.h r1, r8, r9 /* r1 = b0 */
3625 + psub.h r8, r8, r9 /* r8 = b6 */
3626 + padd.h r5, r7, r3 /* r5 = b2 */
3627 + psub.h r7, r7, r3 /* r7 = b4 */
3629 +/*
3630 + const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1);
3631 + const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1);
3632 + const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1);
3633 + const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1);
3635 + pasr.h r3, r0, 1
3636 + padd.h r3, r3, r0
3637 + psub.h r3, r2, r3
3638 + psub.h r3, r3, r4 /* r3 = a1 */
3640 + pasr.h r9, r4, 1
3641 + padd.h r9, r9, r4
3642 + psub.h r9, r0, r9
3643 + padd.h r9, r6, r9 /* r9 = a3 */
3645 + pasr.h r10, r2, 1
3646 + padd.h r10, r10, r2
3647 + padd.h r10, r10, r0
3648 + psub.h r10, r10, r6 /* r10 = a5 */
3650 + pasr.h r0, r6, 1
3651 + padd.h r0, r0, r6
3652 + padd.h r0, r0, r2
3653 + padd.h r0, r0, r4 /* r0 = a7 */
3655 + const int b1 = (a7>>2) + a1;
3656 + const int b3 = a3 + (a5>>2);
3657 + const int b5 = (a3>>2) - a5;
3658 + const int b7 = a7 - (a1>>2);
3660 + pasr.h r2, r0, 2
3661 + padd.h r2, r2, r3 /* r2 = b1 */
3662 + pasr.h r3, r3, 2
3663 + psub.h r3, r0, r3 /* r3 = b7 */
3665 + pasr.h r0, r10, 2
3666 + padd.h r0, r0, r9 /* r0 = b3 */
3667 + pasr.h r9, r9, 2
3668 + psub.h r9, r9, r10 /* r9 = b5 */
3671 +/*
3672 + src[0][i] = b0 + b7;
3673 + src[7][i] = b0 - b7;
3674 + src[1][i] = b2 + b5;
3675 + src[6][i] = b2 - b5;
3676 + src[2][i] = b4 + b3;
3677 + src[5][i] = b4 - b3;
3678 + src[3][i] = b6 + b1;
3679 + src[4][i] = b6 - b1; */
3681 + padd.h r4, r1, r3
3682 + psub.h r1, r1, r3
3683 + st.w r11[0*(8*2)], r4
3684 + st.w r11[7*(8*2)], r1
3686 + padd.h r3, r5, r9
3687 + psub.h r5, r5, r9
3688 + st.w r11[1*(8*2)], r3
3689 + st.w r11[6*(8*2)], r5
3691 + padd.h r9, r7, r0
3692 + psub.h r7, r7, r0
3693 + st.w r11[2*(8*2)], r9
3694 + st.w r11[5*(8*2)], r7
3696 + padd.h r0, r8, r2
3697 + psub.h r8, r8, r2
3698 + st.w r11[3*(8*2)], r0
3699 + st.w r11[4*(8*2)], r8
3701 + sub r11, -4
3702 + sub lr, 1
3703 + brne 0b
3705 +// }
3707 + lddsp r12, sp[0] /* r12 = dst */
3708 + sub r11, 4*4
3709 + ldm r11++, r4-r7
3710 + mov lr, 8
3711 + /* Push dst and stride on stack */
3713 +1:
3714 +// for( i = 0; i < 8; i++ )
3715 +// {
3717 + /* r7 = {src[i][0], src[i][1]}
3718 + r6 = {src[i][2], src[i][3]}
3719 + r5 = {src[i][4], src[i][5]}
3720 + r4 = {src[i][6], src[i][7]} */
3723 + const int a0 = src[i][0] + src[i][4];
3724 + const int a2 = src[i][0] - src[i][4];
3725 + const int a4 = (src[i][2]>>1) - src[i][6];
3726 + const int a6 = (src[i][6]>>1) + src[i][2];
3728 + pasr.h r8, r6, 1
3729 + pasr.h r9, r4, 1
3730 + addhh.w r0, r7:t, r5:t /* r0 = a0 */
3731 + subhh.w r1, r7:t, r5:t /* r1 = a2 */
3732 + subhh.w r2, r8:t, r4:t /* r2 = a4 */
3733 + addhh.w r3, r9:t, r6:t /* r3 = a6 */
3736 + const int b0 = a0 + a6;
3737 + const int b2 = a2 + a4;
3738 + const int b4 = a2 - a4;
3739 + const int b6 = a0 - a6;
3741 + add r10, r0, r3 /* r10 = b0 */
3742 + sub r0, r3 /* r0 = b6 */
3743 + add r3, r1, r2 /* r3 = b2 */
3744 + sub r1, r2 /* r1 = b4 */
3748 + const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1);
3749 + const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1);
3750 + const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1);
3751 + const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */
3752 + addhh.w r8, r8:b, r6:b
3753 + addhh.w r2, r4:b, r7:b
3754 + sub r2, r8 /* r2 = a3 */
3756 + addhh.w r9, r9:b, r4:b
3757 + subhh.w r8, r5:b, r6:b
3758 + sub r8, r9 /* r8 = a1 */
3760 + pasr.h r9, r7, 1
3761 + addhh.w r9, r9:b, r7:b
3762 + addhh.w r6, r5:b, r6:b
3763 + add r6, r9 /* r6 = a7 */
3765 + pasr.h r9, r5, 1
3766 + addhh.w r9, r9:b, r5:b
3767 + subhh.w r5, r4:b, r7:b
3768 + add r5, r9 /* r5 = a5 */
3770 +/* const int b1 = (a7>>2) + a1;
3771 + const int b3 = (a5>>2) + a3;
3772 + const int b5 = (a3>>2) - a5;
3773 + const int b7 = -(a1>>2) + a7 ; */
3774 + asr r4, r6, 2
3775 + add r4, r8 /* r4 = b1 */
3776 + asr r8, 2
3777 + rsub r8, r6 /* r8 = b7 */
3779 + asr r6, r5, 2
3780 + add r6, r2 /* r6 = b3 */
3781 + asr r2, 2
3782 + sub r2, r5 /* r2 = b5 */
3785 + dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ];
3786 + dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ];
3787 + dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ];
3788 + dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ];
3789 + dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ];
3790 + dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ];
3791 + dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ];
3792 + dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ];
3794 + add r5, r10, r8
3795 + satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */
3796 + sub r10, r8
3797 + satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */
3798 + add r8, r3, r2
3799 + satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */
3800 + sub r3, r2
3801 + satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */
3803 + add r2, r1, r6
3804 + satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */
3805 + sub r1, r6
3806 + satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */
3808 + add r6, r0, r4
3809 + satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */
3810 + sub r0, r4
3811 + satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */
3813 + ld.w r4, r12[0]
3815 + packw.sh r8, r5, r8
3816 + packw.sh r7, r2, r6
3817 + ld.w r9, r12[4]
3818 + packw.sh r6, r0, r1
3819 + packw.sh r5, r3, r10
3821 + punpckub.h r10, r4:t
3822 + punpckub.h r4, r4:b
3823 + punpckub.h r3, r9:t
3824 + punpckub.h r9, r9:b
3826 + padd.h r8, r8, r10
3827 + padd.h r7, r7, r4
3828 + padd.h r6, r6, r3
3829 + padd.h r5, r5, r9
3831 + lddsp r10, sp[4] /* r10 = stride */
3832 + packsh.ub r0, r8, r7
3833 + packsh.ub r1, r6, r5
3835 + st.w r12[0], r0
3836 + st.w r12[4], r1
3838 + ldm r11++, r4-r7
3839 + add r12, r10 /* dst += stride */
3841 + sub lr, 1
3842 + brne 1b
3844 + sub sp, -8
3845 + ldm sp++,r0-r3,r4-r7, pc
3849 +// }
3850 +//}
3851 diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S
3852 new file mode 100644
3853 index 0000000..e7551ec
3854 --- /dev/null
3855 +++ b/libavcodec/avr32/idct.S
3856 @@ -0,0 +1,829 @@
3858 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
3860 + * Redistribution and use in source and binary forms, with or without
3861 + * modification, are permitted provided that the following conditions
3862 + * are met:
3864 + * 1. Redistributions of source code must retain the above copyright
3865 + * notice, this list of conditions and the following disclaimer.
3867 + * 2. Redistributions in binary form must reproduce the above
3868 + * copyright notice, this list of conditions and the following
3869 + * disclaimer in the documentation and/or other materials provided
3870 + * with the distribution.
3872 + * 3. The name of ATMEL may not be used to endorse or promote products
3873 + * derived from this software without specific prior written
3874 + * permission.
3876 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
3877 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
3878 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3879 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
3880 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
3881 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3882 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3883 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
3884 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
3885 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
3886 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
3887 + * DAMAGE.
3888 + */
3890 + .global idct_add_avr32
3891 + .global idct_put_avr32
3892 + .global idct_avr32
3895 +#define CONST_BITS 13
3896 +#define PASS1_BITS 2
3898 +#define ONE ((INT32) 1)
3900 +#define CONST_SCALE (ONE << CONST_BITS)
3902 +#define LINE_SIZE 32
3904 +#define FIX_0_298631336 (2446) /* FIX(0.298631336) */
3905 +#define FIX_0_390180644 (3196) /* FIX(0.390180644) */
3906 +#define FIX_0_541196100 (4433) /* FIX(0.541196100) */
3907 +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */
3908 +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */
3909 +#define FIX_1_175875602 (9633) /* FIX(1.175875602) */
3910 +#define FIX_1_501321110 (12299)/* FIX(1.501321110) */
3911 +#define FIX_1_847759065 (15137)/* FIX(1.847759065) */
3912 +#define FIX_1_961570560 (16069)/* FIX(1.961570560) */
3913 +#define FIX_2_053119869 (16819)/* FIX(2.053119869) */
3914 +#define FIX_2_562915447 (20995)/* FIX(2.562915447) */
3915 +#define FIX_3_072711026 (25172)/* FIX(3.072711026) */
3918 +#define loop_cnt r11
3920 + .text
3922 +idct_add_avr32:
3923 + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
3925 + // Give room for some variables on the stack
3926 + sub sp, 8
3927 + stdsp SP[0], r12 // rfp
3928 + stdsp SP[4], r11 // iinc
3930 + mov loop_cnt, 8 //Initialize loop counter
3932 +FOR_ROW:
3934 + ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block
3935 + mov r6, 0
3936 +#ifdef USE_PREFETCH
3937 + pref r10[LINE_SIZE] //Prefetch next line
3938 +#endif
3939 + or r4, r2, r3 << 16
3940 + or r4, r1 //Check if all DCT-coeffisients except the DC is zero
3941 + or r4, r0
3942 + brne AC_ROW //If there are non-zero AC coeffisients perform row-transform
3944 + paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5
3945 + plsl.h r5, r5, PASS1_BITS
3946 + mov r4, r5
3947 + st.d r10++, r4
3948 + st.d r10++, r4
3950 + sub loop_cnt, 1 //Decrement loop counter
3951 + brne FOR_ROW //Perform loop one more time if loop_cnt is not zero
3953 + bral COLOUMN_TRANSFORM //Perform coloumn transform after row transform is computed
3956 +AC_ROW:
3959 + ld.w r12, pc[coef_table - .]
3960 + ld.w r9, pc[coef_table - . + 4]
3962 + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
3963 + mulhh.w r5, r4:t, r12:t
3964 + mulhh.w r6, r0:t, r12:b
3965 + ld.w r12, pc[coef_table - . + 8]
3966 + mulhh.w r7, r2:t, r9:t
3967 + add r6, r5 // tmp2
3968 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
3969 + add r7, r5 // tmp3
3970 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
3972 + paddsub.h r5, r3:t, r1:t
3973 + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
3975 + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
3976 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
3979 + addhh.w lr, r3:b, r1:b // lr = z4
3980 + addhh.w r5, r4:b, lr:b
3981 + mulhh.w r5, r5:b, r9:b // r5 = z5
3983 + ld.w r9, pc[coef_table - . + 12]
3984 + mulhh.w r4, r4:b, r12:t // r4 = z3
3985 + mulhh.w lr, lr:b, r12:b // lr = z4
3987 + add r4, r5
3988 + add lr, r5
3990 + addhh.w r5, r2:b, r1:b // r5 = z2
3991 + addhh.w r8, r3:b, r0:b // r8 = z1
3994 + mulhh.w r0, r0:b, r9:t // r0 = tmp0
3995 + ld.w r12, pc[coef_table - . + 16]
3996 + mulhh.w r1, r1:b, r9:b // r1 = tmp1
3997 + ld.w r9, pc[coef_table - . + 20]
3998 + mulhh.w r2, r2:b, r12:t // r2 = tmp2
3999 + mulhh.w r3, r3:b, r12:b // r3 = tmp3
4000 + mulhh.w r8, r8:b, r9:t // r8 = z1
4001 + mulhh.w r5, r5:b, r9:b // r5 = z2
4004 + add r0, r8
4005 + add r0, r4
4006 + add r1, r5
4007 + add r1, lr
4008 + add r2, r5
4009 + add r2, r4
4010 + add r3, r8
4011 + add r3, lr
4013 + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
4014 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
4015 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
4016 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
4018 + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
4019 + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
4020 + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
4021 + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
4023 + sthh.w r10[0], r4:t, r5:t
4024 + sthh.w r10[4], r3:t, r2:t
4025 + sthh.w r10[8], r2:b, r3:b
4026 + sthh.w r10[12], r5:b, r4:b
4030 + sub r10, -16
4031 + sub loop_cnt, 1
4032 + brne FOR_ROW, e
4034 +COLOUMN_TRANSFORM:
4036 + sub r10, 128 //Set pointer to start of DCT block
4039 + mov loop_cnt, 8
4040 +FOR_COLOUMN:
4041 + ldins.h r3:t,r10[0] // r3:t = dataptr[0]
4042 + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
4043 + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
4044 + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
4045 + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
4046 + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
4047 + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
4048 + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
4050 + or r4, r1, r3 << 16
4051 + or r4, r2
4052 + or r4, r0
4053 + brne AC_COLOUMN //If there are non-zero AC coeffisients perform row-transform
4055 + lddsp r12, SP[0] // rfp
4056 + lddsp r9, SP[4] // iinc
4057 + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9
4058 + ld.d r0, r12[0]
4059 + sub r10, -2 // Increment the dataptr
4060 + bfins r3, r3, 16, 16
4061 + punpckub.h r2, r1:t
4062 + padd.h r2, r2, r3
4063 + punpckub.h r1, r1:b
4064 + padd.h r1, r1, r3
4065 + packsh.ub r1, r2, r1
4066 + punpckub.h r2, r0:t
4067 + padd.h r2, r2, r3
4068 + punpckub.h r0, r0:b
4069 + padd.h r0, r0, r3
4070 + packsh.ub r0, r2, r0
4071 + st.d r12[0], r0
4072 + add r12, r9 // increment rfp
4073 + stdsp SP[0], r12
4075 + sub loop_cnt, 1//Decrement loop counter
4076 + brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero
4078 + sub sp, -8
4079 + popm r0-r3, r4-r7, pc//Pop back registers and PC
4081 +AC_COLOUMN:
4083 + ld.w r12, pc[coef_table - .]
4084 + ld.w r9, pc[coef_table - . + 4]
4086 + addhh.w r4, r2:t, r2:b
4087 + mulhh.w r4, r4:b, r12:t // r4 = z1
4088 + mulhh.w r5, r2:b, r12:b
4089 + ld.w r12, pc[coef_table - . + 8]
4090 + mulhh.w r6, r2:t, r9:t
4091 + add r5, r4 // r5 = tmp2
4092 + add r6, r4 // r6 = tmp3
4094 + addhh.w r7, r3:t, r3:b
4095 + subhh.w r8, r3:t, r3:b
4097 + lsl r7, CONST_BITS
4098 + lsl r8, CONST_BITS
4100 + add r2, r7, r6 // r2 = tmp10
4101 + sub r3, r7, r6 // r3 = tmp13
4102 + add r4, r8, r5 // r4 = tmp11
4103 + sub r5, r8, r5 // r5 = tmp12
4105 + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
4106 + addhh.w r7, r6:t, r6:b
4107 + mulhh.w r7, r7:b, r9:b // r7 = z5
4109 + ld.w r9, pc[coef_table - . + 12]
4110 + mulhh.w r8, r6:b, r12:t // r8 = z3
4111 + mulhh.w r6, r6:t, r12:b // r6 = z4
4113 + add r8, r7
4114 + add r6, r7
4116 + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
4118 + mulhh.w r12, r0:b, r9:t // r12 = tmp0
4119 + mulhh.w r0, r0:t, r9:b // r0 = tmp1
4120 + ld.w r9, pc[coef_table - . + 16]
4121 + add r12, r8
4122 + add r0, r6
4124 + ld.w lr, pc[coef_table - . + 20]
4125 + machh.w r8, r1:b, r9:t // r8 = tmp2
4126 + machh.w r6, r1:t, r9:b // r6 = tmp3
4127 + mulhh.w r9, r7:b, lr:t // r9 = z1
4128 + mulhh.w r7, r7:t, lr:b // r7 = z2
4131 + add r12, r9
4132 + add r0, r7
4133 + add r8, r7
4134 + add r6, r9
4136 + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
4137 + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
4138 + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
4139 + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
4140 + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
4141 + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
4142 + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
4143 + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
4145 + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
4146 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
4147 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
4148 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
4149 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
4150 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
4151 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
4152 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
4154 + packw.sh r1, r1, r6
4155 + packw.sh r8, r8, r0
4156 + packw.sh r3, r3, r5
4157 + packw.sh r4, r4, r2
4159 + lddsp r12, SP[0] // rfp
4160 + lddsp r9, SP[4] // iinc
4161 + ld.d r6, r12[0]
4162 + sub r10, -2 // Increment the dataptr
4163 + punpckub.h r0, r7:t
4164 + padd.h r1, r1, r0
4165 + punpckub.h r0, r7:b
4166 + padd.h r8, r8, r0
4167 + packsh.ub r7, r1, r8
4168 + punpckub.h r0, r6:t
4169 + padd.h r3, r3, r0
4170 + punpckub.h r0, r6:b
4171 + padd.h r4, r4, r0
4172 + packsh.ub r6, r3, r4
4173 + st.d r12[0], r6
4174 + add r12, r9 // increment rfp
4175 + stdsp SP[0], r12
4177 + sub loop_cnt, 1 //Decrement loop counter
4178 + brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero
4180 + sub sp, -8
4181 + popm r0-r3, r4-r7, pc //Pop back registers and PC
4185 +//Coeffisient Table:
4186 + .align 2
4187 +coef_table:
4188 + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
4189 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
4190 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
4193 +idct_put_avr32:
4194 + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
4196 + //; Give room for some variables on the stack
4197 + sub sp, 8
4198 + stdsp SP[0], r12 // rfp
4199 + stdsp SP[4], r11 // iinc
4201 + mov loop_cnt, 8 //Initialize loop counter
4205 + ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block
4206 + mov r6, 0
4207 +#ifdef USE_PREFETCH
4208 + pref r10[LINE_SIZE] //Prefetch next line
4209 +#endif
4210 + or r4, r2, r3 << 16
4211 + or r4, r1 //Check if all DCT-coeffisients except the DC is zero
4212 + or r4, r0
4213 + brne 1f //If there are non-zero AC coeffisients perform row-transform
4215 + paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5
4216 + plsl.h r5, r5, PASS1_BITS
4217 + mov r4, r5
4218 + st.d r10++, r4
4219 + st.d r10++, r4
4221 + sub loop_cnt, 1 //Decrement loop counter
4222 + brne 0b //Perform loop one more time if loop_cnt is not zero
4224 + bral 2f //Perform coloumn transform after row transform is computed
4226 +1:
4228 + ld.w r12, pc[coef_table_copy - .]
4229 + ld.w r9, pc[coef_table_copy - . + 4]
4231 + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
4232 + mulhh.w r5, r4:t, r12:t
4233 + mulhh.w r6, r0:t, r12:b
4234 + ld.w r12, pc[coef_table_copy - . + 8]
4235 + mulhh.w r7, r2:t, r9:t
4236 + add r6, r5 // tmp2
4237 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
4238 + add r7, r5 // tmp3
4239 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
4241 + paddsub.h r5, r3:t, r1:t
4242 + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
4244 + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
4245 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
4249 + addhh.w lr, r3:b, r1:b // lr = z4
4250 + addhh.w r5, r4:b, lr:b
4251 + mulhh.w r5, r5:b, r9:b // r5 = z5
4253 + ld.w r9, pc[coef_table_copy - . + 12]
4254 + mulhh.w r4, r4:b, r12:t // r4 = z3
4255 + mulhh.w lr, lr:b, r12:b // lr = z4
4257 + add r4, r5
4258 + add lr, r5
4260 + addhh.w r5, r2:b, r1:b // r5 = z2
4261 + addhh.w r8, r3:b, r0:b // r8 = z1
4264 + mulhh.w r0, r0:b, r9:t // r0 = tmp0
4265 + ld.w r12, pc[coef_table_copy - . + 16]
4266 + mulhh.w r1, r1:b, r9:b // r1 = tmp1
4267 + ld.w r9, pc[coef_table_copy - . + 20]
4268 + mulhh.w r2, r2:b, r12:t // r2 = tmp2
4269 + mulhh.w r3, r3:b, r12:b // r3 = tmp3
4270 + mulhh.w r8, r8:b, r9:t // r8 = z1
4271 + mulhh.w r5, r5:b, r9:b // r5 = z2
4274 + add r0, r8
4275 + add r0, r4
4276 + add r1, r5
4277 + add r1, lr
4278 + add r2, r5
4279 + add r2, r4
4280 + add r3, r8
4281 + add r3, lr
4283 + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
4284 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
4285 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
4286 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
4288 + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
4289 + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
4290 + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
4291 + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
4293 + sthh.w r10[0], r4:t, r5:t
4294 + sthh.w r10[4], r3:t, r2:t
4295 + sthh.w r10[8], r2:b, r3:b
4296 + sthh.w r10[12], r5:b, r4:b
4300 + sub r10, -16
4301 + sub loop_cnt, 1
4302 + brne 0b
4306 + sub r10, 128 //Set pointer to start of DCT block
4308 + mov loop_cnt, 8
4310 +0:
4311 + ldins.h r3:t,r10[0] // r3:t = dataptr[0]
4312 + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1]
4313 + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2]
4314 + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5]
4315 + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4]
4316 + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3]
4317 + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6]
4318 + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7]
4320 + or r4, r1, r3 << 16
4321 + or r4, r2
4322 + or r4, r0
4323 + brne 1f //If there are non-zero AC coeffisients perform row-transform
4325 + lddsp r12, SP[0] // rfp
4326 + lddsp r9, SP[4] // iinc
4327 + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
4328 + packw.sh r3, r3, r3
4329 + packsh.ub r3, r3, r3
4330 + mov r2, r3
4331 + st.d r12[0], r2
4332 + add r12, r9 // increment rfp
4333 + sub r10, -2 // Increment the dataptr
4334 + stdsp SP[0], r12
4336 + sub loop_cnt, 1//Decrement loop counter
4337 + brne 0b //Perform loop one more time if loop_cnt is not zero
4339 + sub sp, -8
4340 + popm r0-r3, r4-r7, pc//Pop back registers and PC
4344 + ld.w r12, pc[coef_table_copy - .]
4345 + ld.w r9, pc[coef_table_copy - . + 4]
4347 + addhh.w r4, r2:t, r2:b
4348 + mulhh.w r4, r4:b, r12:t // r4 = z1
4349 + mulhh.w r5, r2:b, r12:b
4350 + ld.w r12, pc[coef_table_copy - . + 8]
4351 + mulhh.w r6, r2:t, r9:t
4352 + add r5, r4 // r5 = tmp2
4353 + add r6, r4 // r6 = tmp3
4355 + addhh.w r7, r3:t, r3:b
4356 + subhh.w r8, r3:t, r3:b
4358 + lsl r7, CONST_BITS
4359 + lsl r8, CONST_BITS
4361 + add r2, r7, r6 // r2 = tmp10
4362 + sub r3, r7, r6 // r3 = tmp13
4363 + add r4, r8, r5 // r4 = tmp11
4364 + sub r5, r8, r5 // r5 = tmp12
4367 + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
4368 + addhh.w r7, r6:t, r6:b
4369 + mulhh.w r7, r7:b, r9:b // r7 = z5
4371 + ld.w r9, pc[coef_table_copy - . + 12]
4372 + mulhh.w r8, r6:b, r12:t // r8 = z3
4373 + mulhh.w r6, r6:t, r12:b // r6 = z4
4375 + add r8, r7
4376 + add r6, r7
4378 + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
4380 + mulhh.w r12, r0:b, r9:t // r12 = tmp0
4381 + mulhh.w r0, r0:t, r9:b // r0 = tmp1
4382 + ld.w r9, pc[coef_table_copy - . + 16]
4383 + add r12, r8
4384 + add r0, r6
4386 + ld.w lr, pc[coef_table_copy - . + 20]
4387 + machh.w r8, r1:b, r9:t // r8 = tmp2
4388 + machh.w r6, r1:t, r9:b // r6 = tmp3
4389 + mulhh.w r9, r7:b, lr:t // r9 = z1
4390 + mulhh.w r7, r7:t, lr:b // r7 = z2
4393 + add r12, r9
4394 + add r0, r7
4395 + add r8, r7
4396 + add r6, r9
4398 + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
4399 + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
4400 + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
4401 + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
4402 + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
4403 + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
4404 + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3]
4405 + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4]
4407 + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
4408 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
4409 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
4410 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
4411 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
4412 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
4413 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
4414 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
4416 + packw.sh r1, r1, r6
4417 + packw.sh r8, r8, r0
4418 + packw.sh r3, r3, r5
4419 + packw.sh r4, r4, r2
4421 + packsh.ub r1, r1, r8
4422 + packsh.ub r0, r3, r4
4423 + lddsp r12, SP[0] // rfp
4424 + lddsp r9, SP[4] // iinc
4425 + st.d r12[0], r0
4426 + sub r10, -2 // Increment the dataptr
4427 + add r12, r9 // increment rfp
4428 + stdsp SP[0], r12
4430 + sub loop_cnt, 1 //Decrement loop counter
4431 + brne 0b //Perform loop one more time if loop_cnt is not zero
4433 + sub sp, -8
4434 + popm r0-r3, r4-r7, pc //Pop back registers and PC
4438 + .align 2
4439 +coef_table_copy:
4440 + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
4441 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
4442 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
4445 +idct_avr32:
4446 + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables
4448 + //; Give room for a temporary block on the stack
4449 + sub sp, 8*8*2
4451 + mov loop_cnt, 8 //Initialize loop counter
4455 + ldm r12++, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block
4456 + mov r6, 0
4457 +#ifdef USE_PREFETCH
4458 + pref r12[LINE_SIZE] //Prefetch next line
4459 +#endif
4460 + or r4, r2, r3 << 16
4461 + or r4, r1 //Check if all DCT-coeffisients except the DC is zero
4462 + or r4, r0
4463 + brne 1f //If there are non-zero AC coeffisients perform row-transform
4465 + paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5
4466 + plsl.h r5, r5, PASS1_BITS
4467 + mov r4, r5
4468 + st.d sp++, r4
4469 + st.d sp++, r4
4471 + sub loop_cnt, 1 //Decrement loop counter
4472 + brne 0b //Perform loop one more time if loop_cnt is not zero
4474 + bral 2f //Perform coloumn transform after row transform is computed
4476 +1:
4478 + ld.w r10, pc[coef_table_idct - .]
4479 + ld.w r9, pc[coef_table_idct - . + 4]
4481 + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7]
4482 + mulhh.w r5, r4:t, r10:t
4483 + mulhh.w r6, r0:t, r10:b
4484 + ld.w r10, pc[coef_table_idct - . + 8]
4485 + mulhh.w r7, r2:t, r9:t
4486 + add r6, r5 // tmp2
4487 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31
4488 + add r7, r5 // tmp3
4489 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31
4491 + paddsub.h r5, r3:t, r1:t
4492 + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1
4494 + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13
4495 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12
4499 + addhh.w lr, r3:b, r1:b // lr = z4
4500 + addhh.w r5, r4:b, lr:b
4501 + mulhh.w r5, r5:b, r9:b // r5 = z5
4503 + ld.w r9, pc[coef_table_idct - . + 12]
4504 + mulhh.w r4, r4:b, r10:t // r4 = z3
4505 + mulhh.w lr, lr:b, r10:b // lr = z4
4507 + add r4, r5
4508 + add lr, r5
4510 + addhh.w r5, r2:b, r1:b // r5 = z2
4511 + addhh.w r8, r3:b, r0:b // r8 = z1
4514 + mulhh.w r0, r0:b, r9:t // r0 = tmp0
4515 + ld.w r10, pc[coef_table_idct - . + 16]
4516 + mulhh.w r1, r1:b, r9:b // r1 = tmp1
4517 + ld.w r9, pc[coef_table_idct - . + 20]
4518 + mulhh.w r2, r2:b, r10:t // r2 = tmp2
4519 + mulhh.w r3, r3:b, r10:b // r3 = tmp3
4520 + mulhh.w r8, r8:b, r9:t // r8 = z1
4521 + mulhh.w r5, r5:b, r9:b // r5 = z2
4524 + add r0, r8
4525 + add r0, r4
4526 + add r1, r5
4527 + add r1, lr
4528 + add r2, r5
4529 + add r2, r4
4530 + add r3, r8
4531 + add r3, lr
4533 + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31
4534 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31
4535 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31
4536 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31
4538 + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6]
4539 + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7]
4540 + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5]
4541 + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4]
4543 + sthh.w sp[0], r4:t, r5:t
4544 + sthh.w sp[4], r3:t, r2:t
4545 + sthh.w sp[8], r2:b, r3:b
4546 + sthh.w sp[12], r5:b, r4:b
4550 + sub sp, -16
4551 + sub loop_cnt, 1
4552 + brne 0b
4556 + sub sp, 8*8*2 //Set pointer to start of DCT block
4557 + sub r12, 8*8*2 //Set pointer to start of DCT block
4559 + mov loop_cnt, 8
4561 +0:
4562 + ldins.h r3:t,sp[0] // r3:t = dataptr[0]
4563 + ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1]
4564 + ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2]
4565 + ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5]
4566 + ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4]
4567 + ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3]
4568 + ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6]
4569 + ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7]
4571 + or r4, r1, r3 << 16
4572 + or r4, r2
4573 + or r4, r0
4574 + brne 1f //If there are non-zero AC coeffisients perform row-transform
4576 + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31
4577 + packw.sh r3, r3, r3
4578 + mov r2, r3
4579 + st.d r12++, r2
4580 + st.d r12++, r2
4581 + sub sp, -2 // Increment the dataptr
4583 + sub loop_cnt, 1//Decrement loop counter
4584 + brne 0b //Perform loop one more time if loop_cnt is not zero
4586 + sub sp, -(8*8*2 - 8)
4587 + popm r0-r3, r4-r7, pc//Pop back registers and PC
4591 + ld.w r10, pc[coef_table_idct - .]
4592 + ld.w r9, pc[coef_table_idct - . + 4]
4594 + addhh.w r4, r2:t, r2:b
4595 + mulhh.w r4, r4:b, r10:t // r4 = z1
4596 + mulhh.w r5, r2:b, r10:b
4597 + ld.w r10, pc[coef_table_idct - . + 8]
4598 + mulhh.w r6, r2:t, r9:t
4599 + add r5, r4 // r5 = tmp2
4600 + add r6, r4 // r6 = tmp3
4602 + addhh.w r7, r3:t, r3:b
4603 + subhh.w r8, r3:t, r3:b
4605 + lsl r7, CONST_BITS
4606 + lsl r8, CONST_BITS
4608 + add r2, r7, r6 // r2 = tmp10
4609 + sub r3, r7, r6 // r3 = tmp13
4610 + add r4, r8, r5 // r4 = tmp11
4611 + sub r5, r8, r5 // r5 = tmp12
4614 + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3
4615 + addhh.w r7, r6:t, r6:b
4616 + mulhh.w r7, r7:b, r9:b // r7 = z5
4618 + ld.w r9, pc[coef_table_idct - . + 12]
4619 + mulhh.w r8, r6:b, r10:t // r8 = z3
4620 + mulhh.w r6, r6:t, r10:b // r6 = z4
4622 + add r8, r7
4623 + add r6, r7
4625 + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1
4627 + mulhh.w r10, r0:b, r9:t // r10 = tmp0
4628 + mulhh.w r0, r0:t, r9:b // r0 = tmp1
4629 + ld.w r9, pc[coef_table_idct - . + 16]
4630 + add r10, r8
4631 + add r0, r6
4633 + ld.w lr, pc[coef_table_idct - . + 20]
4634 + machh.w r8, r1:b, r9:t // r8 = tmp2
4635 + machh.w r6, r1:t, r9:b // r6 = tmp3
4636 + mulhh.w r9, r7:b, lr:t // r9 = z1
4637 + mulhh.w r7, r7:t, lr:b // r7 = z2
4640 + add r10, r9
4641 + add r0, r7
4642 + add r8, r7
4643 + add r6, r9
4645 + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0]
4646 + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7]
4647 + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1]
4648 + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6]
4649 + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2]
4650 + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5]
4651 + add r0, r3, r10 // r0 = dataptr[DCTSIZE*3]
4652 + sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4]
4654 + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9
4655 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9
4656 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9
4657 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9
4658 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9
4659 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9
4660 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9
4661 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9
4663 + packw.sh r7, r1, r6
4664 + packw.sh r6, r8, r0
4665 + packw.sh r5, r3, r5
4666 + packw.sh r4, r4, r2
4668 + stm r12, r4-r7
4669 + sub sp, -2 // Increment the dataptr
4670 + sub r12, -16
4672 + sub loop_cnt, 1 //Decrement loop counter
4673 + brne 0b //Perform loop one more time if loop_cnt is not zero
4675 + sub sp, -(8*8*2 - 8)
4676 + popm r0-r3, r4-r7, pc //Pop back registers and PC
4680 + .align 2
4681 +coef_table_idct:
4682 + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602
4683 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869
4684 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447
4686 diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S
4687 new file mode 100644
4688 index 0000000..07a002d
4689 --- /dev/null
4690 +++ b/libavcodec/avr32/mc.S
4691 @@ -0,0 +1,434 @@
4693 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
4695 + * Redistribution and use in source and binary forms, with or without
4696 + * modification, are permitted provided that the following conditions
4697 + * are met:
4699 + * 1. Redistributions of source code must retain the above copyright
4700 + * notice, this list of conditions and the following disclaimer.
4702 + * 2. Redistributions in binary form must reproduce the above
4703 + * copyright notice, this list of conditions and the following
4704 + * disclaimer in the documentation and/or other materials provided
4705 + * with the distribution.
4707 + * 3. The name of ATMEL may not be used to endorse or promote products
4708 + * derived from this software without specific prior written
4709 + * permission.
4711 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
4712 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
4713 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4714 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
4715 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
4716 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
4717 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
4718 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
4719 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
4720 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
4721 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
4722 + * DAMAGE.
4723 + */
4726 + /* Macro for masking the lowest bit of each byte in a
4727 + packed word */
4728 + .macro packedmask1 reg, round
4729 + .if \round
4730 + and \reg, \reg, r8 >> 1
4731 + .else
4732 + and \reg, r8
4733 + .endif
4734 + .endm
4736 + /* Macro for 8 pixel wide horizontal and vertical interpolation functions */
4737 + .macro pixels8_hv round, put
4740 + pushm r0-r7, lr
4742 + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
4744 + /* Rounding immediate */
4745 + .if \round
4746 + mov r8, lo(0x02020202)
4747 + orh r8, hi(0x02020202)
4748 + .else
4749 + mov r8, lo(0x01010101)
4750 + orh r8, hi(0x01010101)
4751 + .endif
4752 + mov r7, 2
4754 + /* Pixel naming convention :
4756 + |-----------------------------------------------------|
4757 + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 |
4758 + |----d00---d01---d02---d03---d04---d05---d06---d07----|
4759 + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 |
4760 + |-----------------------------------------------------|
4761 + */
4762 +1:
4763 + ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 }
4764 + ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 }
4765 + mov lr, r9
4766 + eor r2, r0, r1
4767 + packedmask1 r2, \round
4768 + add r2, r8
4770 + paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4772 + add r11, r10 // pixels += line_size
4773 + ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 }
4774 + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
4776 + eor r5, r1, r3
4777 + packedmask1 r5, \round
4778 + add r2, r5
4780 + paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2}
4781 + eor r6, r0, r1
4782 + packedmask1 r6, \round
4783 + add r2, r2, r6 << 1
4785 + ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 }
4786 + add r11, r10 // pixels += line_size
4787 + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
4789 + paddh.ub r0, r0, r1
4790 + plsr.b r2, r2, 2
4791 + padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 }
4793 + /* Next row */
4794 + .if \put
4795 + eor r2, r3, r4
4796 + packedmask1 r2, \round
4797 + add r2, r8
4798 + .else
4799 + ld.w r6, r12[0]
4800 + eor r2, r3, r4
4801 + packedmask1 r2, \round
4802 + add r2, r8
4803 + pavg.ub r0, r0, r6
4804 + .endif
4805 + st.w r12[0], r0 // Put data into the block
4807 + add r5, r2
4808 + paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4810 + eor r6, r0, r1
4811 + packedmask1 r6, \round
4812 + add r5, r5, r6 << 1
4814 + .if \put
4815 + paddh.ub r1, r0, r1
4816 + plsr.b r5, r5, 2
4817 + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
4818 + .else
4819 + ld.w r3, r12[r10]
4820 + paddh.ub r1, r0, r1
4821 + plsr.b r5, r5, 2
4822 + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 }
4823 + pavg.ub r1, r1, r3
4824 + .endif
4826 + st.w r12[r10], r1 // Put data into the block
4829 + ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 }
4830 + add r11, r10 // pixels += line_size
4831 + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 }
4832 + add r12, r12, r10 << 1 // block += 2*line_size
4833 + sub lr, 2
4834 + brne 0b
4836 + mul r0, r10, r9 // r0 = line_size * h
4837 + rsub r0, r0, 4 // r0 = 4 - (line_size * h)
4838 + add r11, r0
4839 + sub r11, r10 // pixels += 4 - (line_size * (h+1))
4840 + add r12, r0 // pixels += 4 - (line_size * (h))
4841 + sub r7, 1
4842 + brne 1b
4844 + popm r0-r7, pc
4845 + .endm
4848 + /* Macro for 8 pixel wide vertical interpolation functions */
4850 + .macro pixels8_v round, put
4851 + pushm r4-r7,lr
4852 + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
4854 + /*
4855 + Pixel Naming Convention :
4856 + |-----------------------------------------------|
4857 + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 |
4858 + |-d00---d01---d02---d03---d04---d05---d06---d07-|
4859 + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 |
4860 + |-----------------------------------------------|
4861 + */
4862 + ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 }
4863 + ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4
4864 + ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 }
4865 + ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 }
4866 + sub r10, 4 // stride -= 4
4867 + add r11, r11, r10 << 1 // src += 2*stride
4868 + sub r11, -4 // src += 4
4870 +0:
4871 + .if \round
4872 + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
4873 + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
4874 + .else
4875 + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
4876 + paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
4877 + .endif
4879 + .if \put
4880 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
4881 + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
4882 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
4883 + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
4884 + .else
4885 + ld.w lr, r12[0]
4886 + ld.w r7, r12[4]
4887 + pavg.ub r5, r5, lr
4888 + pavg.ub r4, r4, r7
4889 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
4890 + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4
4891 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
4892 + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 }
4893 + .endif
4894 + add r11, r10 // src += stride
4895 +#ifdef USE_PREFETCH
4896 + pref r11[0]
4897 +#endif
4898 + add r12, r10 // dst += stride
4900 + .if \round
4901 + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
4902 + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
4903 + .else
4904 + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2}
4905 + paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2}
4906 + .endif
4907 + .if \put
4908 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
4909 + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
4910 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
4911 + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
4912 + .else
4913 + ld.w r8, r12[0]
4914 + ld.w r6, r12[4]
4915 + pavg.ub r5, r5, r8
4916 + pavg.ub r4, r4, r6
4917 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 }
4918 + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4
4919 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 }
4920 + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 }
4921 + .endif
4923 + add r11, r10 // src += stride
4924 +#ifdef USE_PREFETCH
4925 + pref r11[0]
4926 +#endif
4927 + add r12, r10 // dst += stride
4928 + sub r9, 2
4929 + brne 0b
4931 + popm r4-r7,pc
4932 + .endm
4934 + /* Macro for 8 pixel wide horizontal interpolation functions */
4936 + .macro pixels8_h round, put
4937 + pushm r4-r7, lr
4939 + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
4940 + /*
4941 + Pixel Naming Convention:
4942 + |--------------------------------------------------------------------|
4943 + | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08|
4944 + |------|-------|-------|-------|-------|-------|-------|-------|-----|
4945 + | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18|
4946 + |--------------------------------------------------------------------|
4947 + */
4949 + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
4950 + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
4951 + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
4952 + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
4953 + add r11, r10 // src += stride
4955 +0:
4956 + .if \round
4957 + pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4958 + pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
4959 + .else
4960 + paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4961 + paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
4962 + .endif
4963 + .if \put
4964 + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
4965 + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
4966 + .else
4967 + ld.w r8, r12[0]
4968 + ld.w r6, r12[4]
4969 + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 }
4970 + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 }
4971 + pavg.ub lr, lr, r8
4972 + pavg.ub r7, r7, r6
4973 + .endif
4974 + st.w r12[0], lr // dst = { d00, d01, d02, d03 }
4975 + st.w r12[4], r7 // dst = { d04, d05, d06, d07 }
4976 + ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 }
4977 + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
4978 + add r11, r10 // src += stride
4979 +#ifdef USE_PREFETCH
4980 + pref r11[0]
4981 +#endif
4982 + add r12, r10 // dst += stride
4984 + .if \round
4985 + pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4986 + pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
4987 + .else
4988 + paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2}
4989 + paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2}
4990 + .endif
4991 + .if \put
4992 + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
4993 + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
4994 + .else
4995 + ld.w r7, r12[0]
4996 + ld.w r6, r12[4]
4997 + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 }
4998 + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 }
4999 + pavg.ub r5, r5, r7
5000 + pavg.ub r4, r4, r6
5001 + .endif
5002 + st.w r12[0], r5 // dst = { d00, d01, d02, d03 }
5003 + st.w r12[4], r4 // dst = { d04, d05, d06, d07 }
5004 + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 }
5005 + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 }
5006 + add r11, r10 // src += stride
5007 +#ifdef USE_PREFETCH
5008 + pref r11[0]
5009 +#endif
5010 + add r12, r10 // dst += stride
5011 + sub r9, 2
5012 + brne 0b
5014 + popm r4-r7, pc
5015 + .endm
5017 + /* Macro for 8 pixel wide copy functions */
5018 + .macro pixels8 put
5019 + stm --sp, r3-r7,lr
5020 + /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */
5021 + mov lr, r9
5022 + sub r3, r10, 2 // stride2 = stride - 2
5023 +0:
5024 + .if \put
5025 + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
5026 + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
5027 + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
5028 + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
5029 + .else
5030 + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 }
5031 + ld.d r4, r12[0]
5032 + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4
5033 + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 }
5034 + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 }
5035 + pavg.ub r6, r6, r4
5036 + pavg.ub r7, r7, r5
5037 + ld.d r4, r12[r10]
5038 + .endif
5039 + st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 }
5040 + add r11, r11, r3 << 1 // src += stride2 * 2
5041 + .ifeq \put
5042 + pavg.ub r8, r8, r4
5043 + pavg.ub r9, r9, r5
5044 + .endif
5045 + st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 }
5046 + add r12, r12, r10 << 1 // dst += 2*stride
5047 + sub lr, 2
5048 + brne 0b
5049 + ldm sp++, r3-r7,pc
5051 + .endm
5053 + .global put_no_rnd_pixels8_hv_avr32
5054 + .text
5055 +put_no_rnd_pixels8_hv_avr32:
5056 + pixels8_hv 0, 1
5058 + .global put_pixels8_hv_avr32
5059 + .text
5060 +put_pixels8_hv_avr32:
5061 + pixels8_hv 1, 1
5063 + .global avg_no_rnd_pixels8_hv_avr32
5064 + .text
5065 +avg_no_rnd_pixels8_hv_avr32:
5066 + pixels8_hv 0, 0
5068 + .global avg_pixels8_hv_avr32
5069 + .text
5070 +avg_pixels8_hv_avr32:
5071 + pixels8_hv 1, 0
5073 + .global put_no_rnd_pixels8_v_avr32
5074 + .text
5075 +put_no_rnd_pixels8_v_avr32:
5076 + pixels8_v 0, 1
5078 + .global put_pixels8_v_avr32
5079 + .text
5080 +put_pixels8_v_avr32:
5081 + pixels8_v 1, 1
5083 + .global avg_no_rnd_pixels8_v_avr32
5084 + .text
5085 +avg_no_rnd_pixels8_v_avr32:
5086 + pixels8_v 0, 0
5088 + .global avg_pixels8_v_avr32
5089 + .text
5090 +avg_pixels8_v_avr32:
5091 + pixels8_v 1, 0
5093 + .global put_no_rnd_pixels8_h_avr32
5094 + .text
5095 +put_no_rnd_pixels8_h_avr32:
5096 + pixels8_h 0, 1
5098 + .global put_pixels8_h_avr32
5099 + .text
5100 +put_pixels8_h_avr32:
5101 + pixels8_h 1, 1
5103 + .global avg_no_rnd_pixels8_h_avr32
5104 + .text
5105 +avg_no_rnd_pixels8_h_avr32:
5106 + pixels8_h 0, 0
5108 + .global avg_pixels8_h_avr32
5109 + .text
5110 +avg_pixels8_h_avr32:
5111 + pixels8_h 1, 0
5113 + .global put_pixels8_avr32
5114 + .global put_no_rnd_pixels8_avr32
5115 + .text
5116 +put_pixels8_avr32:
5117 +put_no_rnd_pixels8_avr32:
5118 + pixels8 1
5120 + .global avg_no_rnd_pixels8_avr32
5121 + .global avg_pixels8_avr32
5122 + .text
5123 +avg_pixels8_avr32:
5124 +avg_no_rnd_pixels8_avr32:
5125 + pixels8 0
5126 diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h
5127 new file mode 100644
5128 index 0000000..32201ba
5129 --- /dev/null
5130 +++ b/libavcodec/avr32/pico.h
5131 @@ -0,0 +1,260 @@
5133 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
5135 + * Redistribution and use in source and binary forms, with or without
5136 + * modification, are permitted provided that the following conditions
5137 + * are met:
5139 + * 1. Redistributions of source code must retain the above copyright
5140 + * notice, this list of conditions and the following disclaimer.
5142 + * 2. Redistributions in binary form must reproduce the above
5143 + * copyright notice, this list of conditions and the following
5144 + * disclaimer in the documentation and/or other materials provided
5145 + * with the distribution.
5147 + * 3. The name of ATMEL may not be used to endorse or promote products
5148 + * derived from this software without specific prior written
5149 + * permission.
5151 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
5152 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
5153 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5154 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
5155 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
5156 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
5157 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
5158 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
5159 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
5160 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
5161 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
5162 + * DAMAGE.
5163 + */
5164 +#ifndef __PICO_H__
5165 +#define __PICO_H__
5169 +/* Coprocessor Number */
5170 +#define PICO_CPNO 1
5172 +/* Pixel Coprocessor Register file */
5173 +#define PICO_REGVECT_INPIX2 cr0
5174 +#define PICO_REGVECT_INPIX1 cr1
5175 +#define PICO_REGVECT_INPIX0 cr2
5176 +#define PICO_REGVECT_OUTPIX2 cr3
5177 +#define PICO_REGVECT_OUTPIX1 cr4
5178 +#define PICO_REGVECT_OUTPIX0 cr5
5179 +#define PICO_REGVECT_COEFF0_A cr6
5180 +#define PICO_REGVECT_COEFF0_B cr7
5181 +#define PICO_REGVECT_COEFF1_A cr8
5182 +#define PICO_REGVECT_COEFF1_B cr9
5183 +#define PICO_REGVECT_COEFF2_A cr10
5184 +#define PICO_REGVECT_COEFF2_B cr11
5185 +#define PICO_REGVECT_VMU0_OUT cr12
5186 +#define PICO_REGVECT_VMU1_OUT cr13
5187 +#define PICO_REGVECT_VMU2_OUT cr14
5188 +#define PICO_REGVECT_CONFIG cr15
5190 +#define PICO_INPIX2 0
5191 +#define PICO_INPIX1 1
5192 +#define PICO_INPIX0 2
5193 +#define PICO_OUTPIX2 3
5194 +#define PICO_OUTPIX1 4
5195 +#define PICO_OUTPIX0 5
5196 +#define PICO_COEFF0_A 6
5197 +#define PICO_COEFF0_B 7
5198 +#define PICO_COEFF1_A 8
5199 +#define PICO_COEFF1_B 9
5200 +#define PICO_COEFF2_A 10
5201 +#define PICO_COEFF2_B 11
5202 +#define PICO_VMU0_OUT 12
5203 +#define PICO_VMU1_OUT 13
5204 +#define PICO_VMU2_OUT 14
5205 +#define PICO_CONFIG 15
5207 +/* Config Register */
5208 +#define PICO_COEFF_FRAC_BITS_OFFSET 0
5209 +#define PICO_COEFF_FRAC_BITS_SIZE 4
5210 +#define PICO_OFFSET_FRAC_BITS_OFFSET 4
5211 +#define PICO_OFFSET_FRAC_BITS_SIZE 4
5212 +#define PICO_INPUT_MODE_OFFSET 8
5213 +#define PICO_INPUT_MODE_SIZE 2
5214 +#define PICO_OUTPUT_MODE_OFFSET 10
5215 +#define PICO_OUTPUT_MODE_SIZE 1
5217 +struct pico_config_t {
5218 + unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE;
5219 + unsigned int output_mode : PICO_OUTPUT_MODE_SIZE;
5220 + unsigned int input_mode : PICO_INPUT_MODE_SIZE;
5221 + unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE;
5222 + unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE;
5223 + int vmu2_out;
5224 + int vmu1_out;
5225 + int vmu0_out;
5226 + short coeff2_2;
5227 + short coeff2_3;
5228 + short coeff2_0;
5229 + short coeff2_1;
5230 + short coeff1_2;
5231 + short coeff1_3;
5232 + short coeff1_0;
5233 + short coeff1_1;
5234 + short coeff0_2;
5235 + short coeff0_3;
5236 + short coeff0_0;
5237 + short coeff0_1;
5241 +#define PICO_COEFF_FRAC_BITS(x) (x << PICO_COEFF_FRAC_BITS_OFFSET)
5242 +#define PICO_OFFSET_FRAC_BITS(x) (x << PICO_OFFSET_FRAC_BITS_OFFSET)
5243 +#define PICO_INPUT_MODE(x) (x << PICO_INPUT_MODE_OFFSET)
5244 +#define PICO_OUTPUT_MODE(x) (x << PICO_OUTPUT_MODE_OFFSET)
5246 +#define GET_PICO_COEFF_FRAC_BITS(x) ((x >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1))
5247 +#define GET_PICO_OFFSET_FRAC_BITS(x) ((x >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1))
5248 +#define GET_PICO_INPUT_MODE(x) ((x >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1))
5249 +#define GET_PICO_OUTPUT_MODE(x) ((x >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1))
5251 +enum pico_input_mode { PICO_TRANSFORMATION_MODE,
5252 + PICO_HOR_FILTER_MODE,
5253 + PICO_VERT_FILTER_MODE };
5255 +enum pico_output_mode { PICO_PACKED_MODE,
5256 + PICO_PLANAR_MODE };
5258 +/* Bits in coefficients */
5259 +#define PICO_COEFF_BITS 12
5261 +/* Operation bits */
5262 +#define PICO_MATRIX (0)
5263 +#define PICO_USE_ACC (1 << 2)
5264 +#define PICO_SINGLE_VECTOR (1 << 3)
5267 +#define __str(x...) #x
5268 +#define __xstr(x...) __str(x)
5270 +#define PICO_PUT_W(pico_reg, x) \
5271 + __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
5272 +#define PICO_GET_W(pico_reg) \
5273 + __builtin_mvcr_w(PICO_CPNO, pico_reg)
5275 +#define PICO_MVCR_W(x, pico_reg) \
5276 + asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
5278 +#define PICO_MVRC_W(pico_reg, x) \
5279 + asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
5281 +#define PICO_PUT_D(pico_reg, x) \
5282 + __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
5283 +#define PICO_GET_D(pico_reg) \
5284 + __builtin_mvcr_d(PICO_CPNO, pico_reg)
5286 +#define PICO_MVCR_D(x, pico_reg) \
5287 + asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x));
5288 +#define PICO_MVRC_D(pico_reg, x) \
5289 + asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x));
5291 +#define PICO_STCM_W(ptr, pico_regs...) \
5292 + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5293 +#define PICO_STCM_D(ptr, pico_regs...) \
5294 + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5296 +#define PICO_STCM_W_DEC(ptr, pico_regs...) \
5297 + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
5298 +#define PICO_STCM_D_DEC(ptr, pico_regs...) \
5299 + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
5301 +#define PICO_LDCM_W(ptr, pico_regs...) \
5302 + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5303 +#define PICO_LDCM_D(ptr, pico_regs...) \
5304 + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5306 +#define PICO_LDCM_W_INC(ptr, pico_regs...) \
5307 + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
5308 +#define PICO_LDCM_D_INC(ptr, pico_regs...) \
5309 + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
5311 +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
5312 + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
5314 +static inline void set_pico_config(struct pico_config_t *config){
5315 + PICO_LDCM_D(config,
5316 + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
5317 + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
5318 + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
5319 + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
5320 + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
5323 +static inline void get_pico_config(struct pico_config_t *config){
5324 + PICO_STCM_D(config,
5325 + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
5326 + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
5327 + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B,
5328 + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT,
5329 + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG);
5332 +static inline void dump_pico_config(){
5333 + struct pico_config_t pico_config;
5334 + char *input_mode, *output_mode;
5335 + get_pico_config(&pico_config);
5338 + av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n");
5339 + av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits);
5340 + av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits);
5342 + switch ( pico_config.input_mode ){
5343 + case PICO_TRANSFORMATION_MODE:
5344 + input_mode = "Transformation Mode";
5345 + break;
5346 + case PICO_HOR_FILTER_MODE:
5347 + input_mode = "Horisontal Filter Mode";
5348 + break;
5349 + case PICO_VERT_FILTER_MODE:
5350 + input_mode = "Vertical Filter Mode";
5351 + break;
5352 + default:
5353 + input_mode = "Unknown Mode!!";
5354 + break;
5356 + av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode);
5358 + switch ( pico_config.output_mode ){
5359 + case PICO_PLANAR_MODE:
5360 + output_mode = "Planar Mode";
5361 + break;
5362 + case PICO_PACKED_MODE:
5363 + output_mode = "Packed Mode";
5364 + break;
5365 + default:
5366 + output_mode = "Unknown Mode!!";
5367 + break;
5370 + av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode);
5372 + av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits));
5373 + av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits));
5374 + av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits));
5375 + av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits));
5377 + av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits));
5378 + av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits));
5379 + av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits));
5380 + av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits));
5382 + av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits));
5383 + av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits));
5384 + av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits));
5385 + av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits));
5390 +#endif
5392 diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h
5393 index 26b4f8d..1f8fabf 100644
5394 --- a/libavcodec/bitstream.h
5395 +++ b/libavcodec/bitstream.h
5396 @@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM {
5397 #endif
5399 /* used to avoid missaligned exceptions on some archs (alpha, ...) */
5400 -#if defined(ARCH_X86) || defined(ARCH_X86_64)
5401 +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32)
5402 # define unaligned16(a) (*(const uint16_t*)(a))
5403 # define unaligned32(a) (*(const uint32_t*)(a))
5404 # define unaligned64(a) (*(const uint64_t*)(a))
5405 @@ -813,6 +813,44 @@ void free_vlc(VLC *vlc);
5406 * if the vlc code is invalid and max_depth>1 than the number of bits removed
5407 * is undefined
5410 +#if defined(ARCH_AVR32)
5411 +#define GET_VLC(code, name, gb, table, bits, max_depth)\
5413 + int n, index, nb_bits;\
5414 + union { VLC_TYPE vlc[2];\
5415 + uint32_t u32; } table_elem;\
5417 + index= SHOW_UBITS(name, gb, bits);\
5418 + table_elem.u32 = unaligned32(&table[index]); \
5419 + code = table_elem.vlc[0];\
5420 + n = table_elem.vlc[1];\
5422 + if(max_depth > 1 && n < 0 ){\
5423 + LAST_SKIP_BITS(name, gb, bits)\
5424 + UPDATE_CACHE(name, gb)\
5426 + nb_bits = -n;\
5428 + index= SHOW_UBITS(name, gb, nb_bits) + code;\
5429 + table_elem.u32 = unaligned32(&table[index]); \
5430 + code = table_elem.vlc[0];\
5431 + n = table_elem.vlc[1];\
5432 + if(max_depth > 2 && n < 0){\
5433 + LAST_SKIP_BITS(name, gb, nb_bits)\
5434 + UPDATE_CACHE(name, gb)\
5436 + nb_bits = -n;\
5438 + index= SHOW_UBITS(name, gb, nb_bits) + code;\
5439 + code = table[index][0];\
5440 + n = table[index][1];\
5441 + }\
5442 + }\
5443 + SKIP_BITS(name, gb, n)\
5446 +#else
5447 #define GET_VLC(code, name, gb, table, bits, max_depth)\
5449 int n, index, nb_bits;\
5450 @@ -821,7 +859,7 @@ void free_vlc(VLC *vlc);
5451 code = table[index][0];\
5452 n = table[index][1];\
5454 - if(max_depth > 1 && n < 0){\
5455 + if(max_depth > 1 && n < 0 ){\
5456 LAST_SKIP_BITS(name, gb, bits)\
5457 UPDATE_CACHE(name, gb)\
5459 @@ -843,7 +881,38 @@ void free_vlc(VLC *vlc);
5461 SKIP_BITS(name, gb, n)\
5463 +#endif
5465 +#if defined(ARCH_AVR32)
5466 +#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
5468 + int n, index, nb_bits;\
5469 + union { RL_VLC_ELEM vlc;\
5470 + uint32_t u32; } table_elem;\
5472 + index= SHOW_UBITS(name, gb, bits);\
5473 + table_elem.u32 = unaligned32(&table[index]); \
5474 + level = table_elem.vlc.level;\
5475 + n = table_elem.vlc.len;\
5477 + if(max_depth > 1 && n < 0 ){\
5478 + SKIP_BITS(name, gb, bits)\
5479 + if(need_update){\
5480 + UPDATE_CACHE(name, gb)\
5481 + }\
5483 + nb_bits = -n;\
5485 + index= SHOW_UBITS(name, gb, nb_bits) + level;\
5486 + table_elem.u32 = unaligned32(&table[index]); \
5487 + level = table_elem.vlc.level;\
5488 + n = table_elem.vlc.len;\
5489 + }\
5490 + run= table_elem.vlc.run;\
5491 + SKIP_BITS(name, gb, n)\
5494 +#else
5495 #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\
5497 int n, index, nb_bits;\
5498 @@ -852,7 +921,7 @@ void free_vlc(VLC *vlc);
5499 level = table[index].level;\
5500 n = table[index].len;\
5502 - if(max_depth > 1 && n < 0){\
5503 + if(max_depth > 1 && n < 0 ){\
5504 SKIP_BITS(name, gb, bits)\
5505 if(need_update){\
5506 UPDATE_CACHE(name, gb)\
5507 @@ -867,7 +936,7 @@ void free_vlc(VLC *vlc);
5508 run= table[index].run;\
5509 SKIP_BITS(name, gb, n)\
5512 +#endif
5515 * parses a vlc code, faster then get_vlc()
5516 diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
5517 index 56c42b9..8fc10c6 100644
5518 --- a/libavcodec/dsputil.c
5519 +++ b/libavcodec/dsputil.c
5520 @@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
5521 #ifdef ARCH_BFIN
5522 dsputil_init_bfin(c,avctx);
5523 #endif
5524 +#ifdef ARCH_AVR32
5525 + dsputil_init_avr32(c,avctx);
5526 +#endif
5528 for(i=0; i<64; i++){
5529 if(!c->put_2tap_qpel_pixels_tab[0][i])
5530 diff --git a/libavcodec/h264.c b/libavcodec/h264.c
5531 index 865e80a..8f7c3f1 100644
5532 --- a/libavcodec/h264.c
5533 +++ b/libavcodec/h264.c
5534 @@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){
5536 static void init_dequant8_coeff_table(H264Context *h){
5537 int i,q,x;
5538 +#ifdef ARCH_AVR32
5539 + const int transpose = 0;
5540 +#else
5541 const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
5542 +#endif
5544 h->dequant8_coeff[0] = h->dequant8_buffer[0];
5545 h->dequant8_coeff[1] = h->dequant8_buffer[1];
5547 @@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){
5549 static void init_dequant4_coeff_table(H264Context *h){
5550 int i,j,q,x;
5551 + // Yes this is ugly as hell....
5552 +#ifdef ARCH_AVR32
5553 + const int transpose = 0;
5554 +#else
5555 const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
5556 +#endif
5558 for(i=0; i<6; i++ ){
5559 h->dequant4_coeff[i] = h->dequant4_buffer[i];
5560 for(j=0; j<i; j++){
5561 @@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){
5562 if (MPV_common_init(s) < 0)
5563 return -1;
5565 +#ifdef ARCH_AVR32
5566 + if ( 1 ){
5567 +#else
5568 if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
5569 +#endif
5570 memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
5571 memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t));
5572 }else{
5573 diff --git a/libavutil/common.h b/libavutil/common.h
5574 index 3ae5971..7e52b90 100644
5575 --- a/libavutil/common.h
5576 +++ b/libavutil/common.h
5577 @@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c)
5578 * @param amax maximum value of the clip range
5579 * @return cliped value
5581 +#if defined(ARCH_AVR32)
5582 +#define clip(a, amin, amax) \
5583 + ({ int __tmp__; \
5584 + asm ("min\t%0, %1, %2\n" \
5585 + "max\t%0, %0, %3\n" \
5586 + : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \
5587 + __tmp__; })
5588 +#else
5589 static inline int clip(int a, int amin, int amax)
5591 if (a < amin) return amin;
5592 else if (a > amax) return amax;
5593 else return a;
5595 +#endif
5598 * clip a signed integer value into the 0-255 range
5599 * @param a value to clip
5600 * @return cliped value
5602 +#if defined(ARCH_AVR32)
5603 +#define clip_uint8(a) \
5604 + ({ int __tmp__ = a; \
5605 + asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \
5606 + __tmp__; })
5607 +#else
5608 static inline uint8_t clip_uint8(int a)
5610 if (a&(~255)) return (-a)>>31;
5611 else return a;
5613 +#endif
5615 /* math */
5616 int64_t ff_gcd(int64_t a, int64_t b);
5617 diff --git a/libavutil/internal.h b/libavutil/internal.h
5618 index 285d304..a8b0718 100644
5619 --- a/libavutil/internal.h
5620 +++ b/libavutil/internal.h
5621 @@ -210,6 +210,15 @@ if((y)<(x)){\
5625 +/* XXX: Hack for uclibc which declares lrintf but does not implement it... */
5626 +#ifdef ARCH_AVR32
5627 +#undef HAVE_LRINTF
5628 +#define HAVE_LRINTF 1
5629 +#define lrintf(x) rint(x)
5630 +#define llrint(x) (long long)rint(x)
5631 +#endif
5634 #ifndef HAVE_LRINTF
5635 /* XXX: add ISOC specific test to avoid specific BSD testing. */
5636 /* better than nothing implementation. */
5637 diff --git a/libfaad2/common.h b/libfaad2/common.h
5638 index f809042..6c5fb21 100644
5639 --- a/libfaad2/common.h
5640 +++ b/libfaad2/common.h
5641 @@ -67,7 +67,7 @@ extern "C" {
5642 /* Use if target platform has address generators with autoincrement */
5643 //#define PREFER_POINTERS
5645 -#if defined(_WIN32_WCE) || defined(__arm__)
5646 +#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__)
5647 #define FIXED_POINT
5648 #endif
5650 diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c
5651 index 076359a..51b77fe 100644
5652 --- a/libmpcodecs/ad_libmad.c
5653 +++ b/libmpcodecs/ad_libmad.c
5654 @@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){
5655 sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2;
5656 sh->samplerate=this->frame.header.samplerate;
5657 sh->i_bps=this->frame.header.bitrate/8;
5658 +#ifdef WORDS_BIGENDIAN
5659 + sh->sample_format = AF_FORMAT_S16_BE;
5660 +#else
5661 + sh->sample_format = AF_FORMAT_S16_LE;
5662 +#endif
5663 sh->samplesize=2;
5665 return 1;
5666 diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h
5667 new file mode 100644
5668 index 0000000..7ac6200
5669 --- /dev/null
5670 +++ b/libswscale/pico-avr32.h
5671 @@ -0,0 +1,137 @@
5673 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
5675 + * Redistribution and use in source and binary forms, with or without
5676 + * modification, are permitted provided that the following conditions
5677 + * are met:
5679 + * 1. Redistributions of source code must retain the above copyright
5680 + * notice, this list of conditions and the following disclaimer.
5682 + * 2. Redistributions in binary form must reproduce the above
5683 + * copyright notice, this list of conditions and the following
5684 + * disclaimer in the documentation and/or other materials provided
5685 + * with the distribution.
5687 + * 3. The name of ATMEL may not be used to endorse or promote products
5688 + * derived from this software without specific prior written
5689 + * permission.
5691 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
5692 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
5693 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5694 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
5695 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
5696 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
5697 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
5698 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
5699 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
5700 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
5701 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
5702 + * DAMAGE.
5703 + */
5704 +#ifndef __PICO_H__
5705 +#define __PICO_H__
5707 +/* Coprocessor Number */
5708 +#define PICO_CPNO 1
5710 +/* Pixel Coprocessor Register file */
5711 +#define PICO_REGVECT_INPIX2 cr0
5712 +#define PICO_REGVECT_INPIX1 cr1
5713 +#define PICO_REGVECT_INPIX0 cr2
5714 +#define PICO_REGVECT_OUTPIX2 cr3
5715 +#define PICO_REGVECT_OUTPIX1 cr4
5716 +#define PICO_REGVECT_OUTPIX0 cr5
5717 +#define PICO_REGVECT_COEFF0_A cr6
5718 +#define PICO_REGVECT_COEFF0_B cr7
5719 +#define PICO_REGVECT_COEFF1_A cr8
5720 +#define PICO_REGVECT_COEFF1_B cr9
5721 +#define PICO_REGVECT_COEFF2_A cr10
5722 +#define PICO_REGVECT_COEFF2_B cr11
5723 +#define PICO_REGVECT_VMU0_OUT cr12
5724 +#define PICO_REGVECT_VMU1_OUT cr13
5725 +#define PICO_REGVECT_VMU2_OUT cr14
5726 +#define PICO_REGVECT_CONFIG cr15
5728 +#define PICO_INPIX2 0
5729 +#define PICO_INPIX1 1
5730 +#define PICO_INPIX0 2
5731 +#define PICO_OUTPIX2 3
5732 +#define PICO_OUTPIX1 4
5733 +#define PICO_OUTPIX0 5
5734 +#define PICO_COEFF0_A 6
5735 +#define PICO_COEFF0_B 7
5736 +#define PICO_COEFF1_A 8
5737 +#define PICO_COEFF1_B 9
5738 +#define PICO_COEFF2_A 10
5739 +#define PICO_COEFF2_B 11
5740 +#define PICO_VMU0_OUT 12
5741 +#define PICO_VMU1_OUT 13
5742 +#define PICO_VMU2_OUT 14
5743 +#define PICO_CONFIG 15
5745 +/* Config Register */
5746 +#define PICO_COEFF_FRAC_BITS 0
5747 +#define PICO_COEFF_FRAC_BITS_WIDTH 4
5748 +#define PICO_OFFSET_FRAC_BITS 4
5749 +#define PICO_OFFSET_FRAC_BITS_WIDTH 4
5750 +#define PICO_INPUT_MODE 8
5751 +#define PICO_INPUT_MODE_WIDTH 2
5752 +#define PICO_OUTPUT_MODE 10
5754 +#define PICO_TRANSFORMATION_MODE 0
5755 +#define PICO_HOR_FILTER_MODE 1
5756 +#define PICO_VERT_FILTER_MODE 2
5758 +#define PICO_PLANAR_MODE 1
5759 +#define PICO_PACKED_MODE 0
5761 +/* Bits in coefficients */
5762 +#define PICO_COEFF_BITS 12
5764 +/* Operation bits */
5765 +#define PICO_USE_ACC (1 << 2)
5766 +#define PICO_SINGLE_VECTOR (1 << 3)
5769 +#define __str(x...) #x
5770 +#define __xstr(x...) __str(x)
5772 +#define PICO_PUT_W(pico_reg, x) \
5773 + __builtin_mvrc_w(PICO_CPNO, pico_reg, x);
5774 +#define PICO_GET_W(pico_reg) \
5775 + __builtin_mvcr_w(PICO_CPNO, pico_reg)
5777 +#define PICO_PUT_D(pico_reg, x) \
5778 + __builtin_mvrc_d(PICO_CPNO, pico_reg, x);
5779 +#define PICO_GET_D(pico_reg) \
5780 + __builtin_mvcr_d(PICO_CPNO, pico_reg)
5783 +#define PICO_STCM_W(ptr, pico_regs...) \
5784 + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5785 +#define PICO_STCM_D(ptr, pico_regs...) \
5786 + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5788 +#define PICO_STCM_W_DEC(ptr, pico_regs...) \
5789 + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
5790 +#define PICO_STCM_D_DEC(ptr, pico_regs...) \
5791 + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr));
5793 +#define PICO_LDCM_W(ptr, pico_regs...) \
5794 + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5795 +#define PICO_LDCM_D(ptr, pico_regs...) \
5796 + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr));
5798 +#define PICO_LDCM_W_INC(ptr, pico_regs...) \
5799 + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
5800 +#define PICO_LDCM_D_INC(ptr, pico_regs...) \
5801 + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr));
5803 +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \
5804 + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr);
5807 +#endif
5809 diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
5810 index ecd28f5..3221d0c 100644
5811 --- a/libswscale/swscale_internal.h
5812 +++ b/libswscale/swscale_internal.h
5813 @@ -173,7 +173,7 @@ typedef struct SwsContext{
5814 SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
5815 int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
5817 -char *sws_format_name(int format);
5818 +char *sws_format_name(enum PixelFormat format);
5820 //FIXME replace this with something faster
5821 #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \
5822 diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
5823 index 71759bc..fa83985 100644
5824 --- a/libswscale/yuv2rgb.c
5825 +++ b/libswscale/yuv2rgb.c
5826 @@ -44,6 +44,10 @@
5827 #include "yuv2rgb_mlib.c"
5828 #endif
5830 +#ifdef ARCH_AVR32
5831 +#include "yuv2rgb_avr32.c"
5832 +#endif
5834 #define DITHER1XBPP // only for mmx
5836 const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={
5837 @@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
5838 if(t) return t;
5840 #endif
5841 +#ifdef ARCH_AVR32
5843 + SwsFunc t= yuv2rgb_init_avr32(c);
5844 + if(t) return t;
5846 +#endif
5847 #ifdef HAVE_ALTIVEC
5848 if (c->flags & SWS_CPU_CAPS_ALTIVEC)
5850 @@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
5851 //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
5852 oy -= 256*brightness;
5854 +#ifdef ARCH_AVR32
5855 + yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation);
5856 +#endif
5858 for (i = 0; i < 1024; i++) {
5859 int j;
5861 diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c
5862 new file mode 100644
5863 index 0000000..4a8341e
5864 --- /dev/null
5865 +++ b/libswscale/yuv2rgb_avr32.c
5866 @@ -0,0 +1,416 @@
5868 + * Copyright (c) 2007 Atmel Corporation. All rights reserved.
5870 + * Redistribution and use in source and binary forms, with or without
5871 + * modification, are permitted provided that the following conditions
5872 + * are met:
5874 + * 1. Redistributions of source code must retain the above copyright
5875 + * notice, this list of conditions and the following disclaimer.
5877 + * 2. Redistributions in binary form must reproduce the above
5878 + * copyright notice, this list of conditions and the following
5879 + * disclaimer in the documentation and/or other materials provided
5880 + * with the distribution.
5882 + * 3. The name of ATMEL may not be used to endorse or promote products
5883 + * derived from this software without specific prior written
5884 + * permission.
5886 + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR
5887 + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
5888 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5889 + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL
5890 + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
5891 + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
5892 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
5893 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
5894 + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
5895 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
5896 + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
5897 + * DAMAGE.
5898 + */
5899 +#include "pico-avr32.h"
5902 +#define RGB(uv_part) \
5903 + __asm__ volatile ( \
5904 + "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \
5905 + "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \
5906 + "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \
5907 + "add\t%1, %0\n\t" /* g += tmp */\
5908 + "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \
5909 + : "=&r" (r), "=&r" (g), "=&r" (b) \
5910 + : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \
5911 + "r" (&c->table_rV[0]), "r" (V), "r" (U));
5914 +#undef YUV2RGB1
5915 +#define YUV2RGB1(dst, src, y, idx) \
5916 + { int tmp2; __asm__ volatile ( \
5917 + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5918 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
5919 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
5920 + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
5921 + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
5922 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
5923 + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5924 + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
5925 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
5926 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
5927 + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
5928 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
5929 + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
5930 + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
5931 + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
5932 + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
5934 +#undef YUV2RGB2
5935 +#define YUV2RGB2(dst, src, y, idx) \
5936 + { int tmp2; __asm__ volatile ( \
5937 + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5938 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
5939 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
5940 + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \
5941 + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
5942 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
5943 + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5944 + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \
5945 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
5946 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
5947 + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \
5948 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
5949 + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
5950 + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \
5951 + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
5952 + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
5955 +#undef YUV2BGR1
5956 +#define YUV2BGR1(dst, src, y, idx) \
5957 + { int tmp2; __asm__ volatile ( \
5958 + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5959 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
5960 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
5961 + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
5962 + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
5963 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
5964 + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5965 + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
5966 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
5967 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
5968 + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
5969 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
5970 + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
5971 + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
5972 + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
5973 + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
5975 +#undef YUV2BGR2
5976 +#define YUV2BGR2(dst, src, y, idx) \
5977 + { int tmp2; __asm__ volatile ( \
5978 + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5979 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \
5980 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \
5981 + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \
5982 + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \
5983 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \
5984 + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \
5985 + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \
5986 + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \
5987 + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \
5988 + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \
5989 + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \
5990 + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \
5991 + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \
5992 + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \
5993 + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); }
5997 +int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
5998 + int srcSliceH, uint8_t* dst[], int dstStride[]){
5999 + int y;
6001 + if(c->srcFormat == PIX_FMT_YUV422P){
6002 + srcStride[1] *= 2;
6003 + srcStride[2] *= 2;
6007 + for(y=0; y<srcSliceH; y+=2){
6008 + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
6009 + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
6010 + uint32_t *r, *g, *b;
6011 + uint8_t *py_1= src[0] + y*srcStride[0];
6012 + uint8_t *py_2= py_1 + srcStride[0];
6013 + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
6014 + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
6015 + unsigned int h_size= c->dstW>>3;
6016 + while (h_size--) {
6017 + uint32_t U, V, Y1, Y2, tmp;
6018 + U = ((uint32_t*)pu)[0];
6019 + V = ((uint32_t*)pv)[0];
6021 + RGB("t")
6022 + YUV2BGR1(dst_1, py_1, Y1, 0)
6023 + YUV2BGR1(dst_2, py_2, Y2, 0)
6025 + RGB("u")
6026 + YUV2BGR2(dst_1, py_1, Y1, 1)
6027 + YUV2BGR2(dst_2, py_2, Y2, 1)
6029 + RGB("l")
6030 + YUV2BGR1(dst_1, py_1, Y1, 2)
6031 + YUV2BGR1(dst_2, py_2, Y2, 2)
6033 + RGB("b")
6034 + YUV2BGR2(dst_1, py_1, Y1, 3)
6035 + YUV2BGR2(dst_2, py_2, Y2, 3)
6039 + pu += 4;
6040 + pv += 4;
6041 + py_1 += 8;
6042 + py_2 += 8;
6043 + dst_1 += 24;
6044 + dst_2 += 24;
6047 + return srcSliceH;
6052 +static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
6053 + int srcSliceH, uint8_t* dst[], int dstStride[]){
6054 + int y;
6056 + if(c->srcFormat == PIX_FMT_YUV422P){
6057 + srcStride[1] *= 2;
6058 + srcStride[2] *= 2;
6060 + for(y=0; y<srcSliceH; y+=2){
6061 + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
6062 + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
6063 + uint8_t *r, *g, *b;
6064 + uint8_t *py_1= src[0] + y*srcStride[0];
6065 + uint8_t *py_2= py_1 + srcStride[0];
6066 + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
6067 + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
6068 + unsigned int h_size= c->dstW>>3;
6069 + while (h_size--) {
6070 + uint32_t U, V, Y1, Y2, tmp;
6071 + U = ((uint32_t*)pu)[0];
6072 + V = ((uint32_t*)pv)[0];
6074 + RGB("t")
6075 + YUV2RGB1(dst_1, py_1, Y1, 0)
6076 + YUV2RGB1(dst_2, py_2, Y2, 0)
6078 + RGB("u")
6079 + YUV2RGB2(dst_1, py_1, Y1, 1)
6080 + YUV2RGB2(dst_2, py_2, Y2, 1)
6082 + RGB("l")
6083 + YUV2RGB1(dst_1, py_1, Y1, 2)
6084 + YUV2RGB1(dst_2, py_2, Y2, 2)
6086 + RGB("b")
6087 + YUV2RGB2(dst_1, py_1, Y1, 3)
6088 + YUV2RGB2(dst_2, py_2, Y2, 3)
6090 + pu += 4;
6091 + pv += 4;
6092 + py_1 += 8;
6093 + py_2 += 8;
6094 + dst_1 += 24;
6095 + dst_2 += 24;
6098 + return srcSliceH;
6101 +#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits)
6102 +#define COEFF_FRAC_BITS 9
6103 +#define OFFSET_FRAC_BITS 2
6105 +/* Coefficients used in the pico */
6106 +static struct {
6107 + short coeff2_2;
6108 + short coeff2_3;
6109 + short coeff2_0;
6110 + short coeff2_1;
6111 + short coeff1_2;
6112 + short coeff1_3;
6113 + short coeff1_0;
6114 + short coeff1_1;
6115 + short coeff0_2;
6116 + short coeff0_3;
6117 + short coeff0_0;
6118 + short coeff0_1;
6119 +} pico_coeff;
6122 +static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
6123 + int srcSliceH, uint8_t* dst[], int dstStride[]){
6124 + int y;
6125 + static int first_time = 1;
6127 + /* Initialize pico */
6128 + PICO_LDCM_D(&pico_coeff,
6129 + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B,
6130 + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B,
6131 + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B);
6133 + PICO_PUT_W(PICO_CONFIG,
6134 + (PICO_PACKED_MODE << PICO_OUTPUT_MODE
6135 + | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE
6136 + | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS
6137 + | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS));
6140 + if(c->srcFormat == PIX_FMT_YUV422P){
6141 + srcStride[1] *= 2;
6142 + srcStride[2] *= 2;
6145 + for(y=0; y<srcSliceH; y+=2){
6146 + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]);
6147 + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);
6148 + uint8_t *r, *g, *b;
6149 + uint8_t *py_1= src[0] + y*srcStride[0];
6150 + uint8_t *py_2= py_1 + srcStride[0];
6151 + uint8_t *pu= src[1] + (y>>1)*srcStride[1];
6152 + uint8_t *pv= src[2] + (y>>1)*srcStride[2];
6153 + unsigned int h_size= c->dstW>>3;
6154 + int *py_1_int = (int *)py_1;
6155 + int *py_2_int = (int *)py_2;
6156 + int *pu_int = (int *)pu;
6157 + int *pv_int = (int *)pv;
6158 + while (h_size--) {
6159 + PICO_PUT_W(PICO_INPIX0, *py_1_int++);
6160 + PICO_PUT_W(PICO_INPIX1, *pu_int++);
6161 + PICO_PUT_W(PICO_INPIX2, *pv_int++);
6162 + PICO_OP(0, 0, 0, 4, 8);
6163 + PICO_OP(0, 1, 1, 4, 8);
6164 + PICO_OP(0, 2, 2, 5, 9);
6165 + PICO_OP(0, 3, 3, 5, 9);
6166 + PICO_PUT_W(PICO_INPIX0, *py_1_int++);
6167 + PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
6168 + PICO_OP(0, 0, 0, 6, 10);
6169 + PICO_OP(0, 1, 1, 6, 10);
6170 + PICO_OP(0, 2, 2, 7, 11);
6171 + PICO_OP(0, 3, 3, 7, 11);
6172 + PICO_PUT_W(PICO_INPIX0, *py_2_int++);
6173 + PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
6175 + PICO_OP(0, 0, 0, 4, 8);
6176 + PICO_OP(0, 1, 1, 4, 8);
6177 + PICO_OP(0, 2, 2, 5, 9);
6178 + PICO_OP(0, 3, 3, 5, 9);
6179 + PICO_PUT_W(PICO_INPIX0, *py_2_int++);
6180 + PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
6181 + PICO_OP(0, 0, 0, 6, 10);
6182 + PICO_OP(0, 1, 1, 6, 10);
6183 + PICO_OP(0, 2, 2, 7, 11);
6184 + PICO_OP(0, 3, 3, 7, 11);
6185 + PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0);
6187 + dst_1 += 24;
6188 + dst_2 += 24;
6191 + return srcSliceH;
6194 +extern int avr32_use_pico;
6196 +SwsFunc yuv2rgb_init_avr32 (SwsContext *c){
6197 + switch(c->dstFormat){
6198 + case PIX_FMT_BGR24:
6200 + if ( avr32_use_pico ){
6201 + MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n");
6202 + return yuv2bgr24_avr32_pico;
6203 + } else {
6204 + MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n");
6205 + return yuv2bgr24_avr32;
6208 + break;
6209 + case PIX_FMT_RGB24:
6210 + {
6211 + if ( avr32_use_pico ){
6212 + MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n");
6213 + return yuv2bgr24_avr32_pico;
6214 + } else {
6215 + MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n");
6216 + return yuv2rgb24_avr32;
6217 + }
6220 + return NULL;
6224 +int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){
6225 + const int isRgb = (c->dstFormat == PIX_FMT_RGB24);
6227 + int64_t crv = inv_table[0];
6228 + int64_t cbu = inv_table[1];
6229 + int64_t cgu = -inv_table[2];
6230 + int64_t cgv = -inv_table[3];
6231 + int64_t cy = 1<<16;
6232 + int64_t oy = 0;
6234 + if(!fullRange){
6235 + cy= (cy*255) / 219;
6236 + oy= 16<<16;
6239 + cy = (cy *contrast )>>16;
6240 + crv= (crv*contrast * saturation)>>32;
6241 + cbu= (cbu*contrast * saturation)>>32;
6242 + cgu= (cgu*contrast * saturation)>>32;
6243 + cgv= (cgv*contrast * saturation)>>32;
6245 + oy -= 256*brightness;
6247 + pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */
6248 + pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */
6249 + pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */
6250 + pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS)
6251 + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */
6253 + if ( isRgb ){
6254 + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
6255 + pico_coeff.coeff0_1 = 0; /* R <- U */
6256 + pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
6257 + pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
6258 + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
6260 + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
6261 + pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
6262 + pico_coeff.coeff2_2 = 0; /* B <- V */
6263 + pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
6264 + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */
6265 + } else {
6266 + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */
6267 + pico_coeff.coeff2_1 = 0; /* R <- U */
6268 + pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */
6269 + pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS)
6270 + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */
6272 + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */
6273 + pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */
6274 + pico_coeff.coeff0_2 = 0; /* B <- V */
6275 + pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS)
6276 + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */
6282 +#undef RGB
6283 diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c
6284 index 053c193..7017770 100644
6285 --- a/libvo/vo_fbdev2.c
6286 +++ b/libvo/vo_fbdev2.c
6287 @@ -22,6 +22,9 @@
6288 #include "sub.h"
6289 #include "mp_msg.h"
6291 +/* Draw directly to framebuffer */
6292 +#define USE_CONVERT2FB
6294 static vo_info_t info = {
6295 "Framebuffer Device",
6296 "fbdev2",
6297 @@ -178,6 +181,15 @@ static int fb_preinit(int reset)
6299 fb_orig_vinfo = fb_vinfo;
6301 + /* Reset panning offset */
6302 + fb_vinfo.yoffset = 0;
6303 + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
6304 + mp_msg(MSGT_VO, MSGL_ERR,
6305 + "[fbdev2] FBIOPAN_DISPLAY failed: %s\n",
6306 + strerror(errno));
6307 + return 0;
6310 fb_bpp = fb_vinfo.bits_per_pixel;
6312 /* 16 and 15 bpp is reported as 16 bpp */
6313 @@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width,
6314 mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno));
6315 return 1;
6317 +#else
6318 + if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2)
6319 + && fb_vinfo.yoffset == 0)
6320 + center += fb_line_len * fb_vinfo.yres;
6321 #endif
6322 if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres);
6324 @@ -299,14 +315,22 @@ static int query_format(uint32_t format)
6326 // open the device, etc.
6327 if (fb_preinit(0)) return 0;
6328 - if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) {
6329 + if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) {
6330 int fb_target_bpp = format & 0xff;
6331 set_bpp(&fb_vinfo, fb_target_bpp);
6332 fb_vinfo.xres_virtual = fb_vinfo.xres;
6333 - fb_vinfo.yres_virtual = fb_vinfo.yres;
6334 + fb_vinfo.yres_virtual = fb_vinfo.yres * 2;
6335 if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
6336 - mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno));
6337 - return 0;
6338 + mp_msg(MSGT_VO, MSGL_WARN,
6339 + "[fbdev2] Can't double virtual y resolution: %s\n",
6340 + strerror(errno));
6341 + fb_vinfo.yres_virtual = fb_vinfo.yres;
6342 + if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
6343 + mp_msg(MSGT_VO, MSGL_ERR,
6344 + "[fbdev2] Can't put VSCREENINFO: %s\n",
6345 + strerror(errno));
6346 + return -1;
6349 fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
6350 fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length +
6351 @@ -367,16 +391,67 @@ static void check_events(void)
6353 static void flip_page(void)
6355 -#ifndef USE_CONVERT2FB
6356 int i, out_offset = 0, in_offset = 0;
6358 - for (i = 0; i < in_height; i++) {
6359 - memcpy(center + out_offset, next_frame + in_offset,
6360 - in_width * fb_pixel_size);
6361 - out_offset += fb_line_len;
6362 - in_offset += in_width * fb_pixel_size;
6364 +#ifndef USE_CONVERT2FB
6365 + if (1) {
6366 +#else
6367 + if (fb_vinfo.yres_virtual == fb_vinfo.yres) {
6368 #endif
6369 + for (i = 0; i < in_height; i++) {
6370 + memcpy(center + out_offset, next_frame + in_offset,
6371 + in_width * fb_pixel_size);
6372 + out_offset += fb_line_len;
6373 + in_offset += in_width * fb_pixel_size;
6375 + } else {
6376 + if (fb_vinfo.yoffset == 0) {
6377 + fb_vinfo.yoffset += fb_vinfo.yres;
6378 + center -= fb_line_len * fb_vinfo.yres;
6379 + } else {
6380 + fb_vinfo.yoffset = 0;
6381 + center += fb_line_len * fb_vinfo.yres;
6384 + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) {
6385 + mp_msg(MSGT_VO, MSGL_ERR,
6386 + "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n",
6387 + strerror(errno));
6392 +static uint32_t get_image(mp_image_t *mpi)
6394 + if(mpi->flags&MP_IMGFLAG_READABLE)
6395 + return VO_FALSE; // slow video ram
6396 + if(mpi->type==MP_IMGTYPE_STATIC)
6397 + return VO_FALSE; // it is not static
6399 + if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) {
6400 + // we're lucky or codec accepts stride => ok, let's go!
6402 + //YUY2 and RGB formats
6403 + mpi->planes[0] = center;
6404 + mpi->width = in_width;
6405 + mpi->stride[0] = fb_line_len;
6407 + // center image
6409 + mpi->flags |= MP_IMGFLAG_DIRECT;
6411 + return VO_TRUE;
6414 + return VO_FALSE;
6417 +static uint32_t put_image(mp_image_t *mpi)
6419 + // already out?
6420 + if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK)))
6421 + return VO_TRUE;
6422 + return VO_FALSE;
6425 static void uninit(void)
6426 @@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...)
6427 switch (request) {
6428 case VOCTRL_QUERY_FORMAT:
6429 return query_format(*((uint32_t*)data));
6430 + case VOCTRL_GET_IMAGE:
6431 + return get_image(data);
6432 + case VOCTRL_DRAW_IMAGE:
6433 + return put_image(data);
6435 return VO_NOTIMPL;
6437 diff --git a/version.sh b/version.sh
6438 index 44b5c5d..cf22a68 100755
6439 --- a/version.sh
6440 +++ b/version.sh
6441 @@ -1,2 +1,2 @@
6442 #!/bin/sh
6443 -echo "#define VERSION \"1.0rc1-$1\"" > version.h
6444 +echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h