Move/add COSTABLE/SINTABLE macros to dsputil to add extern definitions
[FFMpeg-mirror/lagarith.git] / libavcodec / bfin / pixels_bfin.S
blob69b493b647fd80fb864e0af5497371babea7136f
1 /*
2  * Blackfin Pixel Operations
3  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 #include "config_bfin.h"
/*
 * put_pixels_clamped: clamp 16-bit values to 0..255 and store them as bytes.
 *
 * In:  R0 = block (16-bit source values), R1 = dest, R2 = line_size.
 * I0 is the read pointer, I1 the write pointer. R7:4 are pushed on entry
 * and popped before RTS.
 *
 * The hardware loop runs 8 times. Each pass clamps and packs two groups of
 * four values and stores them with [I1++] (4 bytes) and [I1++M1]
 * (M1 = line_size - 4), so the write pointer advances by line_size per pass.
 * The loop is software pipelined: the loads for the next group are issued
 * next to the packing of the current one, so on the final pass two words
 * beyond the 32 that were consumed are loaded and never used.
 */
23 DEFUN(put_pixels_clamped,mL1,
24         (DCTELEM *block, uint8_t *dest, int line_size)):
25     [--SP] = (R7:4);
    // R4 = lower clamp bound, R5 = upper clamp bound (0x00ff in both halves)
26     R4 = 0;
27     R5.l = 0x00ff;
28     R5.h = 0x00ff;
29     I0 = R0;         // block (read pointer)
30     I1 = R1;         // dest (write pointer)
31     R2 += -4;        // R2 = line_size - 4
32     M1 = R2;
33     P0 = 8;
    // Prime the pipeline: first four values and the first MAX.
34     R0 = [I0++];
35     R1 = [I0++];
36     R2 = MAX(R0, R4) (V);
37     LSETUP (ppc$0,ppc$1) LC0=P0;
    // First 4 bytes of the row: finish clamping, pack, store with [I1++].
38 ppc$0: R2 = MIN(R2, R5) (V);
39        R3 = MAX(R1, R4) (V);
40        R3 = MIN(R3, R5) (V)      || R0 = [I0++];
41        R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
42        R2 = MAX(R0, R4) (V)      || [I1++] = R6;
    // Second 4 bytes of the row: same sequence, store with [I1++M1].
43        R2 = MIN(R2, R5) (V);
44        R3 = MAX(R1, R4) (V);
45        R3 = MIN(R3, R5) (V)      || R0 = [I0++];
46        R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
47 ppc$1: R2 = Max(R0, R4) (V)      || [I1++M1] = R6;
49     (R7:4) = [SP++];
50     RTS;
51 DEFUN_END(put_pixels_clamped)
/*
 * add_pixels_clamped: add the 16-bit values in `block` to the bytes already
 * at `dest` and write the result back to `dest`.
 *
 * In:  R0 = block, R1 = dest, R2 = line_size.
 * I3 walks block, I1 reads dest and I2 writes dest. M0 = line_size - 4
 * moves both dest pointers to the next row. R7:4 are pushed/popped.
 *
 * Every output word is built from one BYTEOP3P (LO) result in R6 and one
 * BYTEOP3P (HI, R) result in R7, merged by the saturating add
 * R6 = R6 + R7 (S). The loop (LC1, 8 passes) stores two words per pass:
 * [I2++] and [I2++M0], i.e. 8 bytes and a step of line_size per row.
 *
 * NOTE(review): the R2 << 8 / R3 >> 8 shifts and the half-word shuffling of
 * R0/R1 through I3 appear to line the operands up with BYTEOP3P's LO/HI
 * byte lanes -- confirm against the Blackfin ISA reference before changing
 * the schedule.
 */
53 DEFUN(add_pixels_clamped,mL1,
54         (DCTELEM *block, uint8_t *dest, int line_size)):
55     [-- SP] = (R7:4);
56     R4 = 0;
    // NOTE(review): I0 is zeroed but never used for addressing here;
    // presumably BYTEOP3P consults it for byte alignment -- confirm.
57     I0 = 0;
58     R2 += -4;        // R2 = line_size - 4
59     M0 = R2;
60     I1 = R1;         // dest (read pointer)
61     I3 = R0;         // block
62     I2 = R1;         // dest (write pointer)
63     P0 = 8;
64     M3 = 2;
    // Prologue: gather the first operands and compute the first LO result.
65     R0 = [I3++]  || R2 = [I1];
66     R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
67     R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
68     R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];
70     LSETUP(apc$2,apc$3) LC1 = P0;
    // First word of the row: HI result, merge, store with [I2++].
71 apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
72        R2 = R2 << 8                      || R0.H = W[I3--];
73        R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
74        R6 = R6 + R7 (S)                  || R1.H = W[I3];
75        R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;
    // Second word of the row: same sequence, store with [I2++M0].
76        R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
77        R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
78        R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
79        R6 = R6 + R7 (S)                  || R1.H = W[I3++];
80 apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];
82     (R7:4) = [SP++];
83     RTS;
84 DEFUN_END(add_pixels_clamped)
/*
88   motion compensation
89   primitives
91      * Halfpel motion compensation with rounding (a+b+1)>>1.
92      * This is an array[4][4] of motion compensation functions for 4
93      * horizontal blocksizes (8,16) and the 4 halfpel positions.
94      * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
95      * @param block destination where the result is stored
96      * @param pixels source
97      * @param line_size number of bytes in a horizontal line of block
98      * @param h height
*/
/*
 * put_pixels8uc: for each of h rows, combine 8 bytes of s0 with 8 bytes of
 * s1 using BYTEOP1P (byte-wise average; the plain form is presumably the
 * rounding one, the (T) form used by the _nornd variants truncating) and
 * store the 8 result bytes to block.
 *
 * In:  R0 = block (dest), R1 = s0, R2 = s1,
 *      [sp+12] = dest_size, [sp+16] = line_size, [sp+20] = h.
 * The stack arguments are read before R7:6 are pushed, so the offsets are
 * relative to the entry SP.
 * I0/I1 read the sources (row step via M0 = line_size - 8), I3 writes dest
 * (row step via M3 = dest_size - 4).
 *
 * DISALGNEXCPT is paired with the 32-bit loads so that the source pointers
 * need not be word aligned. The loop is software pipelined: the first word
 * of the next row is loaded at the end of each pass.
 */
102 DEFUN(put_pixels8uc,mL1,
103         (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
104                  int dest_size, int line_size, int h)):
105         i3=r0;        // dest
106         i0=r1;        // src0
107         i1=r2;        // src1
108         r0=[sp+12];   // dest_size
109         r2=[sp+16];   // line_size
110         p0=[sp+20];   // h (loop count)
111         [--sp] = (r7:6);
112         r0+=-4;
113         m3=r0;
114         r2+=-8;
115         m0=r2;
116         LSETUP(pp8$0,pp8$1) LC0=P0;
        // Priming load (executed once; the loop body starts at pp8$0).
117         DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];
119 pp8$0:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        // Bytes 0..3; the ++M0 loads fetch the third word and move to the next row.
120         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0]|| R2  =[I1++M0];
        // Bytes 4..7 from the reversed pairs; R0 already belongs to the next row.
121         R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]  || [I3++] = R6 ;
122 pp8$1:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;
124         (r7:6) = [sp++];
125         RTS;
126 DEFUN_END(put_pixels8uc)
/*
 * put_pixels16uc: 16-byte-wide version of put_pixels8uc. For each of h
 * rows, 16 bytes of s0 and s1 are combined with BYTEOP1P and the 16 result
 * bytes are stored to block.
 *
 * In:  R0 = block (dest), R1 = s0, R2 = s1,
 *      [fp+20] = dest_size, [fp+24] = line_size, [fp+28] = h
 *      (offsets are relative to FP after LINK 0).
 * I0/I1 read the sources (row step via M0 = line_size - 16), I3 writes dest
 * (three [I3++] stores plus [I3++M3] with M3 = dest_size - 12 per row).
 * R7:6 are pushed/popped.
 */
128 DEFUN(put_pixels16uc,mL1,
129         (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
130                  int dest_size, int line_size, int h)):
131         link 0;
132         [--sp] = (r7:6);
133         i3=r0;        // dest
134         i0=r1;        // src0
135         i1=r2;        // src1
136         r0=[fp+20];   // dest_size
137         r2=[fp+24];   // line_size
138         p0=[fp+28];   // h (loop count)
141         r0+=-12;
142         m3=r0;        // m3 = dest_size - 12
143         r2+=-16;
144         m0=r2;
146         LSETUP(pp16$0,pp16$1) LC0=P0;
         // Priming load (executed once; the loop body starts at pp16$0).
147          DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];
149 pp16$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
         // Bytes 0..7 of the row.
150          R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
151          R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
152          [I3++] = R6;
         // Bytes 8..15; the ++M0 loads also move the sources to the next row.
153          R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
154          R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
155          [I3++] = R6;
156 pp16$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;
158         (r7:6) = [sp++];
159         unlink;
160         RTS;
161 DEFUN_END(put_pixels16uc)
/*
 * put_pixels8uc_nornd: same data flow as put_pixels8uc, but BYTEOP1P is
 * issued with the (T) option (truncating instead of rounding, per the ISA)
 * and a single line_size is used for both the sources and the destination.
 *
 * In:  R0 = block (dest), R1 = s0, R2 = s1,
 *      [sp+12] = line_size, [sp+16] = h (read before R7:6 are pushed).
 * M3 = line_size - 4 steps dest, M0 = line_size - 8 steps the sources.
 */
168 DEFUN(put_pixels8uc_nornd,mL1,
169         (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
170                  int line_size, int h)):
171         i3=r0;        // dest
172         i0=r1;        // src0
173         i1=r2;        // src1
174         r2=[sp+12];   // line_size
175         p0=[sp+16];   // h (loop count)
176         [--sp] = (r7:6);
177         r2+=-4;
178         m3=r2;
179         r2+=-4;
180         m0=r2;
181         LSETUP(pp8$2,pp8$3) LC0=P0;
        // Priming load (executed once; the loop body starts at pp8$2).
182         DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];
184 pp8$2:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        // Bytes 0..3, then bytes 4..7 from the reversed register pairs.
185         R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0]|| R2  =[I1++M0];
186         R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]  || [I3++] = R6 ;
187 pp8$3:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;
189         (r7:6) = [sp++];
190         RTS;
191 DEFUN_END(put_pixels8uc_nornd)
/*
 * put_pixels16uc_nornd: 16-byte-wide version of put_pixels8uc_nornd
 * (BYTEOP1P with the (T) option, one line_size for sources and dest).
 *
 * In:  R0 = block (dest), R1 = s0, R2 = s1,
 *      [sp+12] = line_size, [sp+16] = h (read before R7:6 are pushed).
 * M3 = line_size - 12 steps dest after the fourth store of a row,
 * M0 = line_size - 16 steps the sources.
 */
193 DEFUN(put_pixels16uc_nornd,mL1,
194         (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
195                  int line_size, int h)):
196         i3=r0;        // dest
197         i0=r1;        // src0
198         i1=r2;        // src1
199         r2=[sp+12];   // line_size
200         p0=[sp+16];   // h (loop count)
202         [--sp] = (r7:6);
203         r2+=-12;
204         m3=r2;        // m3 = line_size - 12
205         r2+=-4;
206         m0=r2;
208         LSETUP(pp16$2,pp16$3) LC0=P0;
        // Priming load (executed once; the loop body starts at pp16$2).
209         DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];
211 pp16$2:
212         DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        // Bytes 0..7 of the row.
213         R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
214         R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
215         [I3++] = R6;
        // Bytes 8..15; the ++M0 loads also move the sources to the next row.
217         R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
218         R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
219         [I3++] = R6;
220 pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;
222         (r7:6) = [sp++];
224         RTS;
225 DEFUN_END(put_pixels16uc_nornd)
/*
 * z_put_pixels16_xy2: 16-pixel-wide interpolation from two adjacent source
 * rows (s0 and s0 + line_size) using BYTEOP2P with rounding.
 *
 * In:  R0 = block (dest), R1 = s0, R2 = dest_size,
 *      [fp+20] = line_size, [fp+24] = h.
 * I0 reads the current source row, I1 the row line_size bytes below it and
 * I3 addresses dest. M0 = line_size - 16 and M2 = dest_size - 12 step the
 * pointers to the next row. R7:4 are pushed/popped.
 *
 * Two passes of h rows are made; the start pointers are parked in B0/B1/B3
 * in between:
 *   pass 1 (RNDL): results are stored straight to dest;
 *   pass 2 (RNDH): both sources are advanced by one byte (M1 = 1) and the
 *                  results are merged into the words written by pass 1 with
 *                  the vector add +|+.
 * NOTE(review): presumably RNDL/RNDH place their two results in the low/high
 * byte of each half-word, so the passes fill alternating pixels -- confirm
 * against the BYTEOP2P description in the ISA reference.
 */
227 DEFUN(z_put_pixels16_xy2,mL1,
228         (uint8_t *block, const uint8_t *s0,
229                  int dest_size, int line_size, int h)):
230         link 0;
231         [--sp] = (r7:4);
232         i3=r0;        // dest
233         i0=r1;        // src0--> pixels
234         i1=r1;        // src1--> pixels + line_size (added below)
235         r2+=-12;
236         m2=r2;        // m2 = dest_size - 12
237         r2=[fp+20];
238         m3=r2;        // line_size
239         p0=[fp+24];   // h
240         r2+=-16;
241         i1+=m3;       /* src1 + line_size */
242         m0=r2;        /* line_size - 16 */
        // Remember the start pointers for the second pass.
244         B0 = I0;
245         B1 = I1;
246         B3 = I3;
        // Pass 1: priming load, then h rows of RNDL results stored to dest.
248         DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
250         LSETUP(LS$16E,LE$16E) LC0=P0;
251 LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
252         R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
253         R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
254         DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
255         R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
256         R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
257 LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
        // Pass 2: restart from the saved pointers, sources shifted by one byte.
259         M1 = 1;
260         I3 = B3;
261         I1 = B1;
262         I0 = B0;
264         I0 += M1;
265         I1 += M1;
267         DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
268         LSETUP(LS$16O,LE$16O) LC0=P0;
269 LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
270         R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
        // R6/R7 = the two dest words from pass 1; I3 steps back so the merged
        // words overwrite them in place.
271         R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
272         R4 = R4 +|+ R6                       || R7 = [I3--];
273         R5 = R5 +|+ R7                       || [I3++] = R4;
274         DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
275         R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
276         R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
277         R4 = R4 +|+ R6                       || R7 = [I3--];
278         R5 = R5 +|+ R7                       || [I3++] = R4;
279 LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
281         (r7:4) = [sp++];
282         unlink;
283         rts;
284 DEFUN_END(z_put_pixels16_xy2)
/*
 * put_pixels16_xy2_nornd: same two-pass scheme as z_put_pixels16_xy2, but
 * BYTEOP2P is issued with the TL/TH options (truncating, per the ISA) and
 * one line_size serves both the source and the destination.
 *
 * In:  R0 = block (dest), R1 = s0, R2 = line_size, [fp+20] = h.
 * I0 reads the current source row, I1 the row line_size bytes below it and
 * I3 addresses dest. M0 = line_size - 16 steps the sources and
 * M2 = line_size - 12 steps dest to the next row. R7:4 are pushed/popped.
 *
 * Pass 1 (TL) stores its results directly; pass 2 (TH) runs with both
 * sources advanced by one byte and adds (+|+) its results into the words
 * written by pass 1.
 */
286 DEFUN(put_pixels16_xy2_nornd,mL1,
287         (uint8_t *block, const uint8_t *s0,
288                  int line_size, int h)):
289         link 0;
290         [--sp] = (r7:4);
291         i3=r0;        // dest
292         i0=r1;        // src0--> pixels
293         i1=r1;        // src1--> pixels + line_size (added below)
294         m3=r2;
295         r2+=-12;
296         m2=r2;
297         r2+=-4;
298         i1+=m3;       /* src1 + line_size */
299         m0=r2;        /* line_size - 16 */
300         p0=[fp+20];   // h
        // Remember the start pointers for the second pass.
302         B0=I0;
303         B1=I1;
304         B3=I3;
        // Pass 1: priming load, then h rows of TL results stored to dest.
306         DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
308         LSETUP(LS$16ET,LE$16ET) LC0=P0;
309 LS$16ET:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
310         R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++] || R2  =[I1++];
311         R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R1 = [I0++] || [I3++] = R4 ;
312         DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
313         R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0]|| R2  = [I1++M0];
314         R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++] || [I3++] = R4 ;
315 LE$16ET:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
        // Pass 2: restart from the saved pointers, sources shifted by one byte.
317         M1 = 1;
318         I3=B3;
319         I1=B1;
320         I0=B0;
322         I0 += M1;
323         I1 += M1;
325         DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
326         LSETUP(LS$16OT,LE$16OT) LC0=P0;
327 LS$16OT:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
328         R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++] || R2  =[I1++];
        // R6/R7 = the two dest words from pass 1; merged words overwrite them.
329         R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R1 = [I0++] || R6  =[I3++];
330         R4 = R4 +|+ R6                                    || R7 = [I3--];
331         R5 = R5 +|+ R7                                    || [I3++] = R4;
332         DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
333         R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0]|| R2  = [I1++M0];
334         R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++] || R6 = [I3++];
335         R4 = R4 +|+ R6                                    || R7 = [I3--];
336         R5 = R5 +|+ R7                                    || [I3++] = R4;
337 LE$16OT:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;
339         (r7:4) = [sp++];
340         unlink;
341         rts;
342 DEFUN_END(put_pixels16_xy2_nornd)
/*
 * z_put_pixels8_xy2: 8-pixel-wide version of z_put_pixels16_xy2
 * (BYTEOP2P with the rounding options RNDL/RNDH).
 *
 * In:  R0 = block (dest), R1 = s0, R2 = dest_size,
 *      [fp+20] = line_size, [fp+24] = h.
 * I0 reads the current source row, I1 the row line_size bytes below it and
 * I3 addresses dest. M0 = line_size - 8 steps the sources and
 * M2 = dest_size - 4 steps dest to the next row. R7:4 are pushed/popped.
 *
 * Pass 1 (RNDL) stores its results directly; pass 2 (RNDH) restarts from
 * the pointers saved in B0/B1/B3 with both sources advanced by one byte and
 * adds (+|+) its results into the words written by pass 1.
 */
344 DEFUN(z_put_pixels8_xy2,mL1,
345         (uint8_t *block, const uint8_t *s0,
346                  int dest_size, int line_size, int h)):
347         link 0;
348         [--sp] = (r7:4);
349         i3=r0;        // dest
350         i0=r1;        // src0--> pixels
351         i1=r1;        // src1--> pixels + line_size (added below)
352         r2+=-4;
353         m2=r2;        // m2 = dest_size - 4
354         r2=[fp+20];
355         m3=r2;        // line_size
356         p0=[fp+24];   // h
357         r2+=-8;
358         i1+=m3;       /* src1 + line_size */
359         m0=r2;        /* line_size - 8 */
        // Remember the start pointers for the second pass.
361         b0 = I0;
362         b1 = I1;
363         b3 = I3;
        // Pass 1: priming load (outside the loop body), then h rows.
365         LSETUP(LS$8E,LE$8E) LC0=P0;
366         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
367 LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
368         R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
369         R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
370 LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;
        // Pass 2: restart from the saved pointers, sources shifted by one byte.
372         M1 = 1;
373         I3 = b3;
374         I1 = b1;
375         I0 = b0;
377         I0 += M1;
378         I1 += M1;
380         LSETUP(LS$8O,LE$8O) LC0=P0;
381         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
382 LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
383         R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
        // R6/R7 = the two dest words from pass 1; merged words overwrite them.
384         R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
385         R4 = R4 +|+ R6                                      || R7 = [I3--];
386         R5 = R5 +|+ R7                                      || [I3++] = R4;
387 LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;
389         (r7:4) = [sp++];
390         unlink;
391         rts;
392 DEFUN_END(z_put_pixels8_xy2)
/*
 * put_pixels8_xy2_nornd: 8-pixel-wide interpolation from two adjacent source
 * rows (s0 and s0 + line_size) using BYTEOP2P with the TL/TH options
 * (truncating, per the ISA).
 *
 * In:  R0 = block (dest), R1 = s0, R2 = line_size, [fp+20] = h.
 * I0 reads the current source row, I1 the row line_size bytes below it and
 * I3 addresses dest. M0 = line_size - 8 steps the sources and
 * M2 = line_size - 4 steps dest to the next row. R7:4 are pushed/popped.
 *
 * Pass 1 (TL) stores its results directly; pass 2 (TH) restarts from the
 * pointers saved in B0/B1/B3 with both sources advanced by one byte and
 * adds (+|+) its results into the words written by pass 1.
 */
394 DEFUN(put_pixels8_xy2_nornd,mL1,
395         (uint8_t *block, const uint8_t *s0, int line_size, int h)):
396         link 0;
397         [--sp] = (r7:4);
398         i3=r0;        // dest
399         i0=r1;        // src0--> pixels
400         i1=r1;        // src1--> pixels + line_size (added below)
401         m3=r2;
402         r2+=-4;
403         m2=r2;
404         r2+=-4;
405         i1+=m3;       /* src1 + line_size */
406         m0=r2;        /* line_size - 8 */
407         p0=[fp+20];   // h
        // Remember the start pointers for the second pass.
410         b0 = I0;
411         b1 = I1;
412         b3 = I3;
        // Pass 1: priming load (outside the loop body), then h rows.
414         LSETUP(LS$8ET,LE$8ET) LC0=P0;
415         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
417 LS$8ET: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
418         R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0] || R2 = [I1++M0];
419         R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++]   || [I3++] = R4 ;
420 LE$8ET: DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;
        // Pass 2: restart from the saved pointers, sources shifted by one byte.
422         M1 = 1;
423         I3 = b3;
424         I1 = b1;
425         I0 = b0;
427         I0 += M1;
428         I1 += M1;
430         LSETUP(LS$8OT,LE$8OT) LC0=P0;
431         DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];
433 LS$8OT: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
434         R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0] || R2 = [I1++M0];
        // R6/R7 = the two dest words from pass 1; merged words overwrite them.
435         R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++]   || R6 = [I3++];
436         R4 = R4 +|+ R6                                      || R7 = [I3--];
437         R5 = R5 +|+ R7                                      || [I3++] = R4;
438 LE$8OT: DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;
440         (r7:4) = [sp++];
441         unlink;
442         rts;
443 DEFUN_END(put_pixels8_xy2_nornd)
/*
 * diff_pixels: store the byte-wise differences of two 8x8 pixel blocks as
 * 16-bit values.
 *
 * In:  R0 = block (dest), R1 = s1, R2 = s2, [fp+20] = stride.
 * I0/I1 read s1/s2 (row step via M0 = stride - 8), I3 writes block
 * contiguously. The loop runs 8 times; each pass issues BYTEOP16M twice
 * (plain, then with the register pairs reversed) to cover the 8 bytes of a
 * row and stores the four result words R4, R5, R6, R7 in that order.
 * R7:4 are pushed/popped.
 */
444 DEFUN(diff_pixels,mL1,
445        (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
446         link 0;
447         [--sp] = (r7:4);
448         p0=8;
449         i3=r0;        // block
450         i0=r1;        // s1
451         i1=r2;        // s2
452         r2=[fp+20];   // stride
453         r2+=-8;
454         m0=r2;
457         LSETUP(.LS0,.LE0) LC0=P0;
        // Priming load (executed once; the loop body starts at .LS0).
458         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
460 .LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
461         (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
462         (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
463         DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
464         [i3++]=r6;
465 .LE0:  [i3++]=r7;
467         (r7:4) = [sp++];
468         unlink;
469         rts;
470 DEFUN_END(diff_pixels)
/*
C equivalent of the pix_sum routine below:
473     for (i = 0; i < 16; i++) {
474         for (j = 0; j < 16; j++) {
475           sum += pix[j];
476         }
477         pix += line_size;
478     }
*/
/*
 * pix_sum: sum of all bytes of a 16x16 block.
 *
 * In:  R0 = p, R1 = stride.   Out: R0 = sum (upper half-word cleared).
 * Two rows are handled per pass: I0 reads row n and I1 = I0 + stride reads
 * row n + 1, so the loop runs 8 times and both pointers step by 2 * stride
 * (16 bytes via the [..++] loads plus M0 = 2 * stride - 16).
 * BYTEOP16P produces 16-bit sums from the two rows; they are accumulated as
 * two 16-bit lanes in R6 with +|+, and the lanes are added at the end.
 * R7:4 are pushed/popped.
 */
480 DEFUN(pix_sum,mL1,
481         (uint8_t *p, int stride)):
482         link 0;
483         [--sp] = (r7:4);
484         p0=8;
485         i0=r0;        // even rows
486         i1=r0;
487         m1=r1;
488         r1=r1+r1;
489         r1+=-16;       // 2 * stride - 16
490         m0=r1;
491         i1+=m1;       // odd rows: p + stride
493         r6=0;
495         LSETUP(LS$PS,LE$PS) LC0=P0;
        // Priming load (executed once; the loop body starts at LS$PS).
496         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
498 LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
499         (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
500         r6=r6+|+r5;
501         r6=r6+|+r4;
502         (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
503         r6=r6+|+r5;
504         r6=r6+|+r4;
        // The ++m0 loads also move both pointers on to the next pair of rows.
505         (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
506         r6=r6+|+r5;
507         r6=r6+|+r4;
508         (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
509         r6=r6+|+r5;
510 LE$PS:  r6=r6+|+r4;
        // Fold the two 16-bit lanes into the 16-bit result.
511         r0.l=r6.l+r6.h;
512         r0.h=0;
514         (r7:4) = [sp++];
515         unlink;
516         rts;
517 DEFUN_END(pix_sum)
/*
 * get_pixels: expand an 8x8 block of bytes into 16-bit values.
 *
 * In:  R0 = block (dest), R1 = pixels, R2 = line_size.
 * I0 reads the source (row step via M0 = line_size - 8) and I3 writes block
 * contiguously. The loop runs 8 times; each pass unpacks the 8 bytes of one
 * row with two BYTEUNPACKs (plain, then with the pair reversed) and stores
 * the four result words R6, R7, R4, R5 in that order. The first two words
 * of the next row are loaded before the pass ends. R7:4 are pushed/popped.
 */
520 DEFUN(get_pixels,mL1,
521         (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
522         [--sp] = (r7:4);
523         i3=r0;        // dest
524         i0=r1;        // src0
525         p0=8;
526         r2+=-8;
527         m0=r2;
528         LSETUP(gp8$0,gp8$1) LC0=P0;
        // Priming loads (executed once; the loop body starts at gp8$0).
530         DISALGNEXCPT                   || R0 = [I0++];
531         DISALGNEXCPT                   || R1 = [I0++];
        // The ++M0 load fetches the third word and moves I0 to the next row.
533 gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
534         (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
535         DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
536         [I3++]=R4;
        // Terminate the instruction with ';' like every other one in the file.
537 gp8$1:  [I3++]=R5;
540         (r7:4) = [sp++];
541         RTS;
542 DEFUN_END(get_pixels)
545 /* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
546 /* 91 cycles */
/*
 * z_sad16x16: sum of absolute differences over a 16-byte-wide block.
 *
 * In:  R0 = blk1, R1 = blk2, R2 = dsz (row stride of blk1),
 *      [sp+20] = line_size (row stride of blk2), [sp+24] = h.
 *      SP equals FP right after LINK 0, so these are the same slots other
 *      routines in this file reach through [fp+20] / [fp+24].
 * Out: R0 = A0.L + A0.H + A1.L + A1.H after h rows of SAA.
 * I0 reads blk1 (row step via M0 = dsz - 16), I1 reads blk2 (row step via
 * M1 = line_size - 16). Only R0-R3 and P2 are used, so nothing is pushed.
 */
547 DEFUN(z_sad16x16,mL1,
548         (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
549         link 0;
550         I0 = R0;
551         I1 = R1;
553         A1 = A0 = 0;
554         R0 = [sp+20]; // rwidth
555         P2 = [sp+24]; // height
556         R3 = 16;
557         R0 = R0 - R3;
558         R3 = R2 - R3;
559         M1 = R0;
560         M0 = R3;
        // Priming load, then four SAAs (16 bytes) per row.
562         DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
563         LSETUP (s$16, e$16) LC0=P2;
564 s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
565         SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
566         SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
567         SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
568 e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];
        // Fold the four 16-bit partial sums into R0.
570         R3=A1.L+A1.H,  R2=A0.L+A0.H ;
571         R0 = R2 + R3 ;
572         unlink;
573         RTS;
574 DEFUN_END(z_sad16x16)
576 /* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
577 /* 36 cycles */
/*
 * z_sad8x8: sum of absolute differences over an 8-byte-wide block.
 *
 * In:  R0 = blk1, R1 = blk2, R2 = dsz (row stride of blk1),
 *      [sp+12] = line_size (row stride of blk2), [sp+16] = h.
 * Out: R0 = A0.L + A0.H + A1.L + A1.H after h rows of SAA.
 * I0 reads blk1 (row step via M0 = dsz - 8), I1 reads blk2 (row step via
 * M1 = line_size - 8). No frame is set up and no registers are pushed.
 */
578 DEFUN(z_sad8x8,mL1,
579         (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
580         I0 = R0;
581         I1 = R1;
583         A1 = A0 = 0;
584         r0 = [sp+12]; // rwidth
585         P2 = [sp+16]; //height
586         R3 = 8;
587         R0 = R0 - R3;
588         R3 = R2 - R3;
589         M0 = R3;
590         M1 = R0;
592         LSETUP (s$8, e$8) LC0=P2;
        // Priming loads of the first two words (the loop body starts at s$8).
593         DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];
594         DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
        // Two SAAs (8 bytes) per row; the last two loads belong to the next row.
595 s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
596         SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
597 e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
        // Fold the four 16-bit partial sums into R0.
599         R3=A1.L+A1.H,  R2=A0.L+A0.H ;
600         R0 = R2 + R3 ;
601         RTS;
602 DEFUN_END(z_sad8x8)
/*
 * pix_norm1: sum of the squares of all bytes of a 16x16 block.
 *
 * In:  R0 = pix, R1 = line_size.   Out: R0 = A0 + A1 (saturating add).
 * I0 reads the block; one row of 16 bytes is handled per pass and the loop
 * (LC1) runs 16 times, with M0 = line_size - 16 moving I0 to the next row.
 * Each BYTEUNPACK yields four 16-bit values in (R5,R4); their squares are
 * accumulated into A0 (low halves) and A1 (high halves).
 * R7:4 and P5:3 are pushed/popped.
 */
604 DEFUN(pix_norm1,mL1,
605         (uint8_t * pix, int line_size)):
606         [--SP]=(R7:4,P5:3);
608         // Fetch the input arguments.
609         P1 = R0;  // pix
610         P0 = R1;  // line_size
611         P5 = 16;  // loop ctr.
612         P0 -= P5;
613         M0 = P0;  // M0 = line_size-16;
614         // Now for the real work.
615         A1 = A0 = 0;
616         lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
617         I0 = P1;
        // Priming load of the first word of the first row.
618         DISALGNEXCPT || r0 = [i0++];
620 _pix_norm1_blkfn_loopStart:
621         // unpack and square-accumulate pix[0..15] of the current row
622         DISALGNEXCPT || r1 = [i0++];
624         (r5, r4) = byteunpack r1:0 || r0 = [i0++];
625         a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
626         a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
627         (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
628         a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
629         a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        // The ++M0 load fetches the fifth word and moves I0 to the next row.
630         (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
631         a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
632         a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
633         (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
634         a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
635 _pix_norm1_blkfn_loopEnd:
636         a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
639 // Clean up at the end:
640         R2 = A0, R3 = A1;
641         R0 = R2 + R3 (S);
643         (R7:4,P5:3)=[SP++];
645         RTS;
646 DEFUN_END(pix_norm1)
/*
 * sse4: sum of squared differences over a 4-byte-wide block.
 *
 * In:  R0 = v (unused), R1 = pix1, R2 = pix2,
 *      [fp+20] = line_size, [fp+24] = h.
 * Out: R0 = A0 + A1.
 * I0/I1 read pix1/pix2. Per row each pointer advances by 4 (first word)
 * plus M0 = line_size - 4 (second word, needed when the row is not word
 * aligned), i.e. by line_size. BYTEOP16M produces four 16-bit differences in
 * (R7,R6); their squares are accumulated into A0/A1. R7:6 are pushed/popped.
 *
 * The first word of every row (R0/R2) has to be reloaded inside the loop:
 * it is fetched in parallel with the BYTEOP16M of the current row, the same
 * scheme sse8 and sse16 use. Without that reload only the first row would
 * be compared against fresh data.
 */
648 DEFUN(sse4,mL1,
649         (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
650         link 0;
651         [--sp] = (r7:6);
652         p0=[fp+24];   // h
653         i0=r1;        // pix1
654         i1=r2;        // pix2
655         r2=[fp+20];   // line_size
656         r2+=-4;
657         m0=r2;
659         a0=a1=0;
660         LSETUP(.S40,.E40) LC0=P0;
        // Priming load of the first word of the first row.
661         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
663 .S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
        // Subtract the current row; R0/R2 are refreshed for the next row.
664         (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
665         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
666 .E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
667         a0 += a1;
668         r0 = a0;
670         (r7:6) = [sp++];
671         unlink;
672         rts;
673 DEFUN_END(sse4)
/*
 * sse8: sum of squared differences over an 8-byte-wide block.
 *
 * In:  R0 = v (unused), R1 = pix1, R2 = pix2,
 *      [fp+20] = line_size, [fp+24] = h.
 * Out: R0 = A0 + A1.
 * I0/I1 read pix1/pix2 (row step via M0 = line_size - 8). Each row takes two
 * BYTEOP16Ms (plain, then with the register pairs reversed); every result
 * pair (R7,R6) holds four 16-bit differences whose squares are accumulated
 * into A0/A1. R7:6 are pushed/popped.
 */
675 DEFUN(sse8,mL1,
676         (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
677         link 0;
678         [--sp] = (r7:6);
679         p0=[fp+24];   // h
680         i0=r1;        // pix1
681         i1=r2;        // pix2
682         r2=[fp+20];   // line_size
683         r2+=-8;
684         m0=r2;
686         a0=a1=0;
687         LSETUP(.S80,.E80) LC0=P0;
        // Priming load (executed once; the loop body starts at .S80).
688         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
690 .S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        // The ++M0 loads fetch the third word and move on to the next row.
691         (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
692         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
693         a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        // Second half of the row; R0/R2 are refreshed for the next row.
694         (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
695         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
696 .E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
697         a0 += a1;
698         r0 = a0;
700         (r7:6) = [sp++];
701         unlink;
702         rts;
703 DEFUN_END(sse8)
/*
 * sse16: sum of squared differences over a 16-byte-wide block.
 *
 * In:  R0 = v (unused), R1 = pix1, R2 = pix2,
 *      [fp+20] = line_size, [fp+24] = h.
 * Out: R0 = A0 + A1.
 * I0/I1 read pix1/pix2 (row step via M0 = line_size - 16). Each row takes
 * four BYTEOP16Ms, alternating the plain and the (R) form; every result pair
 * (R7,R6) holds four 16-bit differences whose squares are accumulated into
 * A0/A1. R7:6 are pushed/popped.
 */
705 DEFUN(sse16,mL1,
706         (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
707         link 0;
708         [--sp] = (r7:6);
709         p0=[fp+24];   // h
710         i0=r1;        // pix1
711         i1=r2;        // pix2
712         r2=[fp+20];   // line_size
713         r2+=-16;
714         m0=r2;
716         a0=a1=0;
        // Priming load of the first word of the first row.
717         DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
718         LSETUP(.S160,.E160) LC0=P0;
720 .S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
721         (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
722         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
723         a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
724         (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
725         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
726         a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        // The ++M0 loads fetch the fifth word and move on to the next row.
727         (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
728         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
729         a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        // Last quarter of the row; R0/R2 are refreshed for the next row.
730         (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
731         a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
732 .E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
733         a0 += a1;
734         r0 = a0;
736         (r7:6) = [sp++];
737         unlink;
738         rts;
739 DEFUN_END(sse16)