r864: Merge 2.1:
[cinelerra_cv/ct.git] / mpeg2enc / predcomp_mmxe.s
bloba2d04130ea6dd749e3e338b4976dcbf25bfa1350
1 ;;;
2 ;;; predcomp_00_mmxe.s:
3 ;;;
4 ;;; Extended MMX prediction composition
5 ;;; routines handling the four different interpolation cases...
6 ;;;
7 ;;; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
9 ;;;
10 ;;; This program is free software; you can redistribute it and/or
11 ;;; modify it under the terms of the GNU General Public License
12 ;;; as published by the Free Software Foundation; either version 2
13 ;;; of the License, or (at your option) any later version.
14 ;;;
15 ;;; This program is distributed in the hope that it will be useful,
16 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;;; GNU General Public License for more details.
19 ;;;
20 ;;; You should have received a copy of the GNU General Public License
21 ;;; along with this program; if not, write to the Free Software
22 ;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 ;;; 02111-1307, USA.
24 ;;;
25 ;;;
26 ;;;
;;; The no interpolation case...
;;;
;;; void predcomp_<ix><iy>_mmxe(char *src,char *dst,int lx, int w, int h, int mask);
;;;
;;;   ix - Interpolation in x    iy - Interpolation in y
;;;
;;; predcomp_00_mmxe: for each of h rows, the first w (8 or 16) bytes of
;;; the destination row are replaced by
;;;     mask == 0 :  src
;;;     mask != 0 :  pavgb(src, dst), i.e. (src + dst + 1) >> 1 per byte
;;; Both pointers advance by lx bytes per row.
;;;
;;; Stack arguments (relative to ebp after the prologue):
;;;     [ebp+8] src  [ebp+12] dst  [ebp+16] lx
;;;     [ebp+20] w   [ebp+24] h    [ebp+28] mask
;;;
;;; Register roles inside the loop:
;;;     eax = pdst          ebx = psrc
;;;     ecx = rows left     edx = lx
;;;     edi = w (8 or 16)
;;;     mm1 = all ones when the source must be selected  (mask == 0)
;;;     mm0 = all ones when the destination is selected  (mask != 0)
;;;
;;; Every general-purpose register written here is pushed on entry and
;;; popped on exit; the MMX state is released with emms before returning.

global predcomp_00_mmxe

        align 32
predcomp_00_mmxe:
        push    ebp                     ; conventional frame so args sit at [ebp+8...]
        mov     ebp, esp

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi

        mov     eax, [ebp+12]           ; eax = pdst
        mov     ebx, [ebp+8]            ; ebx = psrc
        mov     ecx, [ebp+24]           ; ecx = h
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = w

        ;; Expand the flag into two complementary 64-bit select masks
        movd    mm0, [ebp+28]
        punpckldq mm0, mm0              ; flag duplicated into both dwords
        pxor    mm2, mm2
        pcmpeqd mm0, mm2                ; all ones iff flag == 0
        movq    mm1, mm0                ; mm1 = select-source mask
        pcmpeqd mm0, mm2                ; mm0 = complement = select-destination mask

        jmp     .row                    ; hop over the alignment padding
        align 32
.row:
        ;; left 8 bytes of the row
        movq    mm4, [ebx]              ; mm4 = prediction (plain source bytes)
        movq    mm3, mm4
        pand    mm3, mm1                ; source copy, kept only when not averaging
        movq    mm2, [eax]
        pand    mm2, mm0                ; destination, kept only when averaging
        por     mm2, mm3                ; mm2 = partner operand for the average
        pavgb   mm4, mm2                ; avg(src, src) == src, or avg(src, dst)
        movq    [eax], mm4

        cmp     edi, 8
        je      .advance                ; 8-wide block: nothing more in this row

        ;; right 8 bytes of the row
        movq    mm4, [ebx+8]
        movq    mm3, mm4
        pand    mm3, mm1
        movq    mm2, [eax+8]
        pand    mm2, mm0
        por     mm2, mm3
        pavgb   mm4, mm2
        movq    [eax+8], mm4

.advance:
        add     eax, edx                ; both pointers step one row (lx bytes)
        add     ebx, edx

        dec     ecx                     ; one row fewer to go
        jnz     .row

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret
;;; The x-axis interpolation case...
;;;
;;; void predcomp_10_mmxe(char *src,char *dst,int lx, int w, int h, int mask);
;;;
;;; For each of h rows the prediction for byte i is the rounded average
;;; of two horizontally adjacent source bytes:
;;;     p[i] = pavgb(src[i], src[i+1]) = (src[i] + src[i+1] + 1) >> 1
;;; and the first w (8 or 16) destination bytes of the row become
;;;     mask == 0 :  p
;;;     mask != 0 :  pavgb(p, dst)
;;; Note that one byte beyond the w bytes of each source row is read.
;;;
;;; Stack arguments (relative to ebp after the prologue):
;;;     [ebp+8] src  [ebp+12] dst  [ebp+16] lx
;;;     [ebp+20] w   [ebp+24] h    [ebp+28] mask
;;;
;;; Register roles inside the loop:
;;;     eax = pdst          ebx = psrc
;;;     ecx = rows left     edx = lx
;;;     edi = w (8 or 16)
;;;     mm1 = all ones when the prediction must be selected (mask == 0)
;;;     mm0 = all ones when the destination is selected     (mask != 0)

global predcomp_10_mmxe

        align 32
predcomp_10_mmxe:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi

        mov     ebx, [ebp+8]            ; ebx = psrc
        mov     eax, [ebp+12]           ; eax = pdst
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = w
        mov     ecx, [ebp+24]           ; ecx = h

        ;; Extend addflag into two complementary bit-masks
        movd    mm0, [ebp+28]
        pxor    mm2, mm2
        punpckldq mm0, mm0              ; flag duplicated into both dwords
        pcmpeqd mm0, mm2                ; all ones iff flag == 0
        movq    mm1, mm0                ; mm1 = select-prediction mask
        pcmpeqd mm0, mm2                ; mm0 = complement = select-destination mask

        jmp     .row                    ; hop over the alignment padding
        align 32
.row:
        ;; left 8 bytes of the row
        movq    mm4, [ebx]
        pavgb   mm4, [ebx+1]            ; mm4 = horizontal half-pel prediction
        movq    mm2, [eax]
        pand    mm2, mm0                ; destination, kept only when averaging
        movq    mm3, mm4
        pand    mm3, mm1                ; prediction copy, kept only when not averaging
        por     mm2, mm3
        pavgb   mm4, mm2                ; avg(p, p) == p, or avg(p, dst)
        movq    [eax], mm4

        cmp     edi, 8
        jz      .advance                ; 8-wide block: nothing more in this row

        ;; right 8 bytes of the row
        movq    mm4, [ebx+8]
        pavgb   mm4, [ebx+9]
        movq    mm2, [eax+8]
        pand    mm2, mm0
        movq    mm3, mm4
        pand    mm3, mm1
        por     mm2, mm3
        pavgb   mm4, mm2
        movq    [eax+8], mm4

.advance:
        add     eax, edx                ; both pointers step one row (lx bytes)
        add     ebx, edx

        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret                             ; without this, control fell through into
                                        ; the routine assembled next in the file
;;; The x-axis and y-axis interpolation case...
;;;
;;; void predcomp_11_mmxe(char *src,char *dst,int lx, int w, int h, int mask);
;;;
;;; For each of h rows the prediction for byte i is the rounded mean of
;;; a 2x2 source neighbourhood:
;;;     p[i] = (src[i] + src[i+1] + src[i+lx] + src[i+lx+1] + 2) >> 2
;;; computed in 16-bit lanes and repacked to bytes.  The first w (8 or 16)
;;; destination bytes of the row then become
;;;     mask == 0 :  p
;;;     mask != 0 :  pavgb(p, dst), i.e. (p + dst + 1) >> 1
;;; Note that the source row below the last one, plus one extra byte per
;;; row, is read.
;;;
;;; Stack arguments (relative to ebp after the prologue):
;;;     [ebp+8] src  [ebp+12] dst  [ebp+16] lx
;;;     [ebp+20] w   [ebp+24] h    [ebp+28] mask
;;;
;;; Register roles inside the loop:
;;;     eax = pdst          ebx = psrc
;;;     ecx = rows left     edx = lx
;;;     edi = w (8 or 16)
;;;     mm0 = all ones when the destination is selected     (mask != 0)
;;;     mm1 = all ones when the prediction must be selected (mask == 0)
;;;     mm2 = [0,0,0,0]W    (zero, for byte -> word unpacking)
;;;     mm3 = [2,2,2,2]W    (rounding term)
;;;     mm4/mm6 = low/high word sums,  mm5/mm7 = scratch

global predcomp_11_mmxe

        align 32
predcomp_11_mmxe:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi

        mov     eax, 0x00020002
        movd    mm3, eax
        punpckldq mm3, mm3              ; mm3 = [2,2,2,2]W
        mov     ebx, [ebp+8]            ; ebx = psrc
        mov     eax, [ebp+12]           ; eax = pdst
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = w
        mov     ecx, [ebp+24]           ; ecx = h

        ;; Extend addflag into two complementary bit-masks
        movd    mm0, [ebp+28]
        pxor    mm2, mm2                ; stays zero for the whole loop
        punpckldq mm0, mm0              ; flag duplicated into both dwords
        pcmpeqd mm0, mm2                ; all ones iff flag == 0
        movq    mm1, mm0                ; mm1 = select-prediction mask
        pcmpeqd mm0, mm2                ; mm0 = complement = select-destination mask

        jmp     .row                    ; hop over the alignment padding
        align 32
.row:
        ;; ---- left 8 bytes ----
        movq    mm4, [ebx]              ; upper row, column i
        movq    mm6, mm4
        punpcklbw mm4, mm2              ; mm4 = low 4 bytes as words
        punpckhbw mm6, mm2              ; mm6 = high 4 bytes as words

        movq    mm5, [ebx+1]            ; upper row, column i+1
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        add     ebx, edx                ; step to the lower row

        movq    mm5, [ebx]              ; lower row, column i
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        movq    mm5, [ebx+1]            ; lower row, column i+1
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        ;; Round and repack: (sum + 2) >> 2, sums are at most 4*255 + 2
        paddw   mm4, mm3
        paddw   mm6, mm3
        psrlw   mm4, 2
        psrlw   mm6, 2
        packuswb mm4, mm6               ; mm4 = 8 prediction bytes

        movq    mm7, [eax]
        pand    mm7, mm0                ; destination, kept only when averaging
        movq    mm6, mm4
        pand    mm6, mm1                ; prediction copy, kept only when not averaging
        por     mm7, mm6
        pavgb   mm4, mm7                ; avg(p, p) == p, or avg(p, dst)
        movq    [eax], mm4

        cmp     edi, 8
        jz      .advance                ; 8-wide: ebx already points at the next row

        ;; ---- right 8 bytes ----
        sub     ebx, edx                ; back to the upper row

        movq    mm4, [ebx+8]            ; upper row, column i
        movq    mm6, mm4
        punpcklbw mm4, mm2
        punpckhbw mm6, mm2

        movq    mm5, [ebx+9]            ; upper row, column i+1
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        add     ebx, edx                ; step to the lower row (= next upper row)

        movq    mm5, [ebx+8]            ; lower row, column i
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        movq    mm5, [ebx+9]            ; lower row, column i+1
        movq    mm7, mm5
        punpcklbw mm5, mm2
        paddw   mm4, mm5
        punpckhbw mm7, mm2
        paddw   mm6, mm7

        ;; Round and repack, using the same logical shift as the left half
        paddw   mm4, mm3
        paddw   mm6, mm3
        psrlw   mm4, 2
        psrlw   mm6, 2
        packuswb mm4, mm6

        movq    mm7, [eax+8]
        pand    mm7, mm0
        movq    mm6, mm4
        pand    mm6, mm1
        por     mm7, mm6
        pavgb   mm4, mm7
        movq    [eax+8], mm4

.advance:
        add     eax, edx                ; destination steps one row; ebx already did

        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret                             ; without this, control fell through into
                                        ; the routine assembled next in the file
;;; The y-axis interpolation case...
;;;
;;; void predcomp_01_mmxe(char *src,char *dst,int lx, int w, int h, int mask);
;;;
;;; For each of h rows the prediction for byte i is the rounded average
;;; of two vertically adjacent source bytes:
;;;     p[i] = pavgb(src[i], src[i+lx]) = (src[i] + src[i+lx] + 1) >> 1
;;; and the first w (8 or 16) destination bytes of the row become
;;;     mask == 0 :  p
;;;     mask != 0 :  pavgb(p, dst)
;;; Note that the source row below the last one is read.
;;;
;;; Stack arguments (relative to ebp after the prologue):
;;;     [ebp+8] src  [ebp+12] dst  [ebp+16] lx
;;;     [ebp+20] w   [ebp+24] h    [ebp+28] mask
;;;
;;; Register roles inside the loop:
;;;     eax = pdst          ebx = psrc
;;;     ecx = rows left     edx = lx
;;;     edi = w (8 or 16)
;;;     mm1 = all ones when the prediction must be selected (mask == 0)
;;;     mm0 = all ones when the destination is selected     (mask != 0)

global predcomp_01_mmxe

        align 32
predcomp_01_mmxe:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi

        mov     ebx, [ebp+8]            ; ebx = psrc
        mov     eax, [ebp+12]           ; eax = pdst
        mov     edx, [ebp+16]           ; edx = lx
        mov     edi, [ebp+20]           ; edi = w
        mov     ecx, [ebp+24]           ; ecx = h

        ;; Extend addflag into two complementary bit-masks
        movd    mm0, [ebp+28]
        pxor    mm2, mm2
        punpckldq mm0, mm0              ; flag duplicated into both dwords
        pcmpeqd mm0, mm2                ; all ones iff flag == 0
        movq    mm1, mm0                ; mm1 = select-prediction mask
        pcmpeqd mm0, mm2                ; mm0 = complement = select-destination mask

        jmp     .row                    ; hop over the alignment padding
        align 32
.row:
        ;; left 8 bytes of the row
        movq    mm4, [ebx]              ; upper row
        add     ebx, edx                ; step to the lower row
        pavgb   mm4, [ebx]              ; mm4 = vertical half-pel prediction

        movq    mm2, [eax]
        pand    mm2, mm0                ; destination, kept only when averaging
        movq    mm3, mm4
        pand    mm3, mm1                ; prediction copy, kept only when not averaging
        por     mm2, mm3
        pavgb   mm4, mm2                ; avg(p, p) == p, or avg(p, dst)
        movq    [eax], mm4

        cmp     edi, 8
        jz      .advance                ; 8-wide: ebx already points at the next row

        ;; right 8 bytes of the row
        sub     ebx, edx                ; back to the upper row
        movq    mm4, [ebx+8]
        add     ebx, edx                ; step to the lower row (= next upper row)
        pavgb   mm4, [ebx+8]

        movq    mm2, [eax+8]
        pand    mm2, mm0
        movq    mm3, mm4
        pand    mm3, mm1
        por     mm2, mm3
        pavgb   mm4, mm2
        movq    [eax+8], mm4

.advance:
        add     eax, edx                ; destination steps one row; ebx already did

        sub     ecx, 1                  ; one row fewer to go
        jnz     .row

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret                             ; return explicitly instead of running
                                        ; off the end of the routine