; Gitweb page header (not part of the assembly source):
;   Merge "Dynamic resize for real-time: reference scaling."
;   [aom.git] / vp9 / common / x86 / vp9_high_intrapred_sse2.asm
;   blob b12d29c0ad8a57ea898c763b408d3ea177ba4086
;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding terms added before the final right shift of the DC predictors.
; pw_4 and pw_8 are eight 16-bit words each (consumed with paddw).
; pw_16 and pw_32 are four 32-bit dwords each, despite the pw_ prefix
; (consumed with paddd by the 16x16 and 32x32 DC predictors).
pw_4:   dw  4,  4,  4,  4,  4,  4,  4,  4
pw_8:   dw  8,  8,  8,  8,  8,  8,  8,  8
pw_16:  dd 16, 16, 16, 16
pw_32:  dd 32, 32, 32, 32

SECTION .text
;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4 -- args: dst, stride, above, left
;
; Sums four 16-bit words from above[] and four from left[], adds 4, shifts
; right by 3 and writes the resulting word to every position of a 4x4 block.
; Row addresses are formed as dst + n*stride*2 (stride is scaled by 2 in
; every address; presumably it is counted in 16-bit elements -- confirm
; against the caller).
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    movq        m0, [aboveq]                ; above[0..3]
    movq        m2, [leftq]                 ; left[0..3]
    DEFINE_ARGS dst, stride, ones           ; above's register becomes scratch
    mov         onesd, 1
    movd        m3, onesd
    pshufw      m3, m3, 0x0                 ; m3 = 1 in every word
    pxor        m1, m1                      ; m1 = 0
    paddw       m0, m2                      ; above[i] + left[i]
    pmaddwd     m0, m3                      ; pairwise add -> 2 dwords
    packssdw    m0, m1                      ; back to words, upper half zero
    pmaddwd     m0, m3                      ; pairwise add -> total in dword 0
    paddw       m0, [GLOBAL(pw_4)]          ; rounding term
    psraw       m0, 3
    pshufw      m0, m0, 0x0                 ; broadcast word 0

    ; rows 0 and 1
    movq        [dstq], m0
    movq        [dstq+strideq*2], m0
    ; rows 2 and 3
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq*2], m0

    RESTORE_GOT
    RET
;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8 -- args: dst, stride, above, left
;
; Sums eight 16-bit words from above[] and eight from left[], adds 8, shifts
; right (logically) by 4 and writes the resulting word to every position of
; an 8x8 block. above, left and every destination row are accessed with
; aligned 16-byte moves (mova).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..7]
    mova        m2, [leftq]                 ; left[0..7]
    pxor        m1, m1                      ; m1 = 0
    DEFINE_ARGS dst, stride, stride3, ones  ; above/left registers become scratch
    lea         stride3q, [strideq*3]
    mov         onesd, 0x00010001
    movd        m3, onesd
    pshufd      m3, m3, 0x0                 ; m3 = 1 in every word
    paddw       m0, m2                      ; above[i] + left[i]
    pmaddwd     m0, m3                      ; pairwise add -> 4 dwords
    packssdw    m0, m1
    pmaddwd     m0, m3                      ; pairwise add -> 2 dwords
    packssdw    m0, m1
    pmaddwd     m0, m3                      ; pairwise add -> total in dword 0
    paddw       m0, [GLOBAL(pw_8)]          ; rounding term
    psrlw       m0, 4
    pshuflw     m0, m0, 0x0                 ; broadcast word 0 across low qword
    punpcklqdq  m0, m0                      ; ... and across the whole register

    ; rows 0..3
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    ; rows 4..7
    lea         dstq, [dstq+strideq*8]
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0

    RESTORE_GOT
    RET
;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16 -- args: dst, stride, above, left
;
; Sums sixteen 16-bit words from above[] and sixteen from left[], adds 16,
; shifts right by 5 and writes the resulting word to every position of a
; 16x16 block. The store loop runs 4 times and writes 4 rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..7]
    mova        m3, [aboveq+16]             ; above[8..15]
    mova        m2, [leftq]                 ; left[0..7]
    mova        m4, [leftq+16]              ; left[8..15]
    pxor        m1, m1                      ; m1 = 0
    DEFINE_ARGS dst, stride, stride3, cnt   ; above/left registers become scratch

    ; Per-lane word sums of the four input vectors.
    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4
    ; Horizontal reduction: 8 words -> 4 words -> 4 dwords -> 2 -> 1.
    movhlps     m2, m0
    paddw       m0, m2
    punpcklwd   m0, m1                      ; zero-extend low 4 words to dwords
    movhlps     m2, m0
    paddd       m0, m2
    punpckldq   m0, m1                      ; dword0, 0, dword1, 0
    movhlps     m2, m0
    paddd       m0, m2                      ; total in dword 0
    paddd       m0, [GLOBAL(pw_16)]         ; rounding term (dword constant)
    psrad       m0, 5
    pshuflw     m0, m0, 0x0                 ; broadcast word 0 across low qword
    punpcklqdq  m0, m0                      ; ... and across the whole register

    lea         stride3q, [strideq*3]
    mov         cntd, 4                     ; 4 passes x 4 rows
.loop:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m0
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m0
    lea         dstq, [dstq+strideq*8]      ; advance 4 rows
    dec         cntd
    jnz         .loop

    RESTORE_GOT
    REP_RET
%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32 -- args: dst, stride, above, left
;
; Sums thirty-two 16-bit words from above[] and thirty-two from left[], adds
; 32, shifts right by 6 and writes the resulting word to every position of a
; 32x32 block. Assembled only when ARCH_X86_64 is set (uses m8).
; The store loop runs 8 times and writes 4 rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    mova        m0, [aboveq]                ; above[0..7]
    mova        m2, [aboveq+16]             ; above[8..15]
    mova        m3, [aboveq+32]             ; above[16..23]
    mova        m4, [aboveq+48]             ; above[24..31]
    mova        m5, [leftq]                 ; left[0..7]
    mova        m6, [leftq+16]              ; left[8..15]
    mova        m7, [leftq+32]              ; left[16..23]
    mova        m8, [leftq+48]              ; left[24..31]
    pxor        m1, m1                      ; m1 = 0
    DEFINE_ARGS dst, stride, stride3, cnt   ; above/left registers become scratch

    ; Per-lane word sums of the eight input vectors.
    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4
    paddw       m0, m5
    paddw       m0, m6
    paddw       m0, m7
    paddw       m0, m8
    ; Horizontal reduction: 8 words -> 4 words -> 4 dwords -> 2 -> 1.
    movhlps     m2, m0
    paddw       m0, m2
    punpcklwd   m0, m1                      ; zero-extend low 4 words to dwords
    movhlps     m2, m0
    paddd       m0, m2
    punpckldq   m0, m1                      ; dword0, 0, dword1, 0
    movhlps     m2, m0
    paddd       m0, m2                      ; total in dword 0
    paddd       m0, [GLOBAL(pw_32)]         ; rounding term (dword constant)
    psrad       m0, 6
    pshuflw     m0, m0, 0x0                 ; broadcast word 0 across low qword
    punpcklqdq  m0, m0                      ; ... and across the whole register

    lea         stride3q, [strideq*3]
    mov         cntd, 8                     ; 8 passes x 4 rows
.loop:
    ; row 0 of this pass
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+32], m0
    mova        [dstq+48], m0
    ; row 1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+strideq*2+32], m0
    mova        [dstq+strideq*2+48], m0
    ; row 2
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m0
    mova        [dstq+strideq*4+32], m0
    mova        [dstq+strideq*4+48], m0
    ; row 3
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m0
    mova        [dstq+stride3q*2+32], m0
    mova        [dstq+stride3q*2+48], m0
    lea         dstq, [dstq+strideq*8]      ; advance 4 rows
    dec         cntd
    jnz         .loop

    RESTORE_GOT
    REP_RET
%endif
;------------------------------------------------------------------------------
; highbd_v_predictor_4x4 -- args: dst, stride, above
;
; Copies the four 16-bit words at above[0..3] into each of the 4 rows of the
; destination block. Row n is addressed as dst + n*stride*2.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]                ; above[0..3]
    ; rows 0 and 1
    movq        [dstq], m0
    movq        [dstq+strideq*2], m0
    ; rows 2 and 3
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0
    movq        [dstq+strideq*2], m0
    RET                                     ; without this, execution would fall
                                            ; through into the next function
;------------------------------------------------------------------------------
; highbd_v_predictor_8x8 -- args: dst, stride, above
;
; Copies the eight 16-bit words at above[0..7] into each of the 8 rows of the
; destination block. above and every destination row are accessed with
; aligned 16-byte moves (mova).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
    mova        m0, [aboveq]                ; above[0..7]
    DEFINE_ARGS dst, stride, stride3        ; above's register becomes scratch
    lea         stride3q, [strideq*3]
    ; rows 0..3
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    ; rows 4..7
    lea         dstq, [dstq+strideq*8]
    mova        [dstq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*4], m0
    mova        [dstq+stride3q*2], m0
    RET                                     ; without this, execution would fall
                                            ; through into the next function
;------------------------------------------------------------------------------
; highbd_v_predictor_16x16 -- args: dst, stride, above
;
; Copies the sixteen 16-bit words at above[0..15] into each of the 16 rows of
; the destination block. The loop runs 4 times and writes 4 rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; above[0..7]
    mova        m1, [aboveq+16]             ; above[8..15]
    DEFINE_ARGS dst, stride, stride3, cnt   ; above's register becomes scratch
    mov         cntd, 4                     ; 4 passes x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    ; row 0 of this pass
    mova        [dstq], m0
    mova        [dstq+16], m1
    ; row 1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    ; row 2
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m1
    ; row 3
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m1
    lea         dstq, [dstq+strideq*8]      ; advance 4 rows
    dec         cntd
    jnz         .loop
    REP_RET
;------------------------------------------------------------------------------
; highbd_v_predictor_32x32 -- args: dst, stride, above
;
; Copies the thirty-two 16-bit words at above[0..31] into each of the 32 rows
; of the destination block. The loop runs 8 times and writes 4 rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
    mova        m0, [aboveq]                ; above[0..7]
    mova        m1, [aboveq+16]             ; above[8..15]
    mova        m2, [aboveq+32]             ; above[16..23]
    mova        m3, [aboveq+48]             ; above[24..31]
    DEFINE_ARGS dst, stride, stride3, cnt   ; above's register becomes scratch
    mov         cntd, 8                     ; 8 passes x 4 rows
    lea         stride3q, [strideq*3]
.loop:
    ; row 0 of this pass
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+32], m2
    mova        [dstq+48], m3
    ; row 1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+strideq*2+32], m2
    mova        [dstq+strideq*2+48], m3
    ; row 2
    mova        [dstq+strideq*4], m0
    mova        [dstq+strideq*4+16], m1
    mova        [dstq+strideq*4+32], m2
    mova        [dstq+strideq*4+48], m3
    ; row 3
    mova        [dstq+stride3q*2], m0
    mova        [dstq+stride3q*2+16], m1
    mova        [dstq+stride3q*2+32], m2
    mova        [dstq+stride3q*2+48], m3
    lea         dstq, [dstq+strideq*8]      ; advance 4 rows
    dec         cntd
    jnz         .loop
    REP_RET
;------------------------------------------------------------------------------
; highbd_tm_predictor_4x4 -- args: dst, stride, above, left, bps
;
; For each row r and column c of a 4x4 block of 16-bit words, stores
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; The loop runs twice and produces two rows per pass.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
    movd        m1, [aboveq-2]              ; word 0 = above[-1]
    movq        m0, [aboveq]                ; above[0..3]
    pshufw      m1, m1, 0x0                 ; above[-1] in every word
    ; Get the values to compute the maximum value at this bit depth
    mov         oned, 1
    movd        m3, oned
    movd        m4, bpsd                    ; shift count = bps
    pshufw      m3, m3, 0x0                 ; 1 in every word
    DEFINE_ARGS dst, stride, line, left     ; above's register becomes the counter
    mov         lineq, -2                   ; counts up to 0: 2 passes
    mova        m2, m3
    psllw       m3, m4                      ; 1 << bps
    add         leftq, 8                    ; point past left[3]; indexed by line*4
    psubw       m3, m2                      ; max possible value
    pxor        m4, m4                      ; min possible value
    psubw       m0, m1                      ; above[c] - above[-1]
.loop:
    ; One 4-byte load fetches both left values of this row pair. The previous
    ; pair of 8-byte loads (the second at +2) read beyond left[3].
    movd        m1, [leftq+lineq*4]         ; words 0,1 = left[2k], left[2k+1]
    pshufw      m2, m1, 0x55                ; left[2k+1] in every word
    pshufw      m1, m1, 0x0                 ; left[2k]   in every word
    paddw       m1, m0
    paddw       m2, m0
    ; Clamp to the bit-depth
    pminsw      m1, m3
    pminsw      m2, m3
    pmaxsw      m1, m4
    pmaxsw      m2, m4
    ; Store the values
    movq        [dstq], m1
    movq        [dstq+strideq*2], m2
    lea         dstq, [dstq+strideq*4]      ; advance 2 rows
    inc         lineq
    jnz         .loop
    REP_RET
;------------------------------------------------------------------------------
; highbd_tm_predictor_8x8 -- args: dst, stride, above, left, bps
;
; For each row r and column c of an 8x8 block of 16-bit words, stores
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; The loop runs 4 times and produces two rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
    movd        m1, [aboveq-2]              ; word 0 = above[-1]
    mova        m0, [aboveq]                ; above[0..7]
    pshuflw     m1, m1, 0x0
    ; Get the values to compute the maximum value at this bit depth
    mov         oned, 1
    pxor        m3, m3
    pxor        m4, m4
    pinsrw      m3, oned, 0
    pinsrw      m4, bpsd, 0                 ; shift count = low 16 bits of bps
    pshuflw     m3, m3, 0x0
    DEFINE_ARGS dst, stride, line, left     ; above's register becomes the counter
    punpcklqdq  m3, m3                      ; 1 in every word
    mov         lineq, -4                   ; counts up to 0: 4 passes
    mova        m2, m3
    punpcklqdq  m1, m1                      ; above[-1] in every word
    psllw       m3, m4                      ; 1 << bps
    add         leftq, 16                   ; point past left[7]; indexed by line*4
    psubw       m3, m2                      ; max possible value
    pxor        m4, m4                      ; min possible value
    psubw       m0, m1                      ; above[c] - above[-1]
.loop:
    ; One 4-byte load fetches both left values of this row pair. The previous
    ; second load at +2 read one word beyond left[7] on the final pass.
    movd        m1, [leftq+lineq*4]         ; words 0,1 = left[2k], left[2k+1]
    pshuflw     m2, m1, 0x55                ; left[2k+1] across the low qword
    pshuflw     m1, m1, 0x0                 ; left[2k]   across the low qword
    punpcklqdq  m1, m1
    punpcklqdq  m2, m2
    paddw       m1, m0
    paddw       m2, m0
    ; Clamp to the bit-depth
    pminsw      m1, m3
    pminsw      m2, m3
    pmaxsw      m1, m4
    pmaxsw      m2, m4
    ; Store the values
    mova        [dstq], m1
    mova        [dstq+strideq*2], m2
    lea         dstq, [dstq+strideq*4]      ; advance 2 rows
    inc         lineq
    jnz         .loop
    REP_RET
%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_tm_predictor_16x16 -- args: dst, stride, above, left, bps
;
; For each row r and column c of a 16x16 block of 16-bit words, stores
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Assembled only when ARCH_X86_64 is set (uses m8).
; The loop runs 8 times and produces two rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
    movd        m2, [aboveq-2]              ; word 0 = above[-1]
    mova        m0, [aboveq]                ; above[0..7]
    mova        m1, [aboveq+16]             ; above[8..15]
    pshuflw     m2, m2, 0x0
    ; Get the values to compute the maximum value at this bit depth
    mov         oned, 1
    pxor        m7, m7
    pxor        m8, m8
    pinsrw      m7, oned, 0
    pinsrw      m8, bpsd, 0                 ; shift count = low 16 bits of bps
    pshuflw     m7, m7, 0x0
    DEFINE_ARGS dst, stride, line, left     ; above's register becomes the counter
    punpcklqdq  m7, m7                      ; 1 in every word
    mov         lineq, -8                   ; counts up to 0: 8 passes
    mova        m5, m7
    punpcklqdq  m2, m2                      ; above[-1] in every word
    psllw       m7, m8                      ; 1 << bps
    add         leftq, 32                   ; point past left[15]; indexed by line*4
    psubw       m7, m5                      ; max possible value
    pxor        m8, m8                      ; min possible value
    psubw       m0, m2                      ; above[0..7]  - above[-1]
    psubw       m1, m2                      ; above[8..15] - above[-1]
.loop:
    ; One 4-byte load fetches both left values of this row pair. The previous
    ; second load at +2 read one word beyond left[15] on the final pass.
    movd        m2, [leftq+lineq*4]         ; words 0,1 = left[2k], left[2k+1]
    pshuflw     m3, m2, 0x55                ; left[2k+1] across the low qword
    pshuflw     m2, m2, 0x0                 ; left[2k]   across the low qword
    punpcklqdq  m2, m2
    punpcklqdq  m3, m3
    paddw       m4, m2, m0                  ; row 2k,   columns 0..7
    paddw       m5, m3, m0                  ; row 2k+1, columns 0..7
    paddw       m2, m1                      ; row 2k,   columns 8..15
    paddw       m3, m1                      ; row 2k+1, columns 8..15
    ; Clamp to the bit-depth
    pminsw      m4, m7
    pminsw      m5, m7
    pminsw      m2, m7
    pminsw      m3, m7
    pmaxsw      m4, m8
    pmaxsw      m5, m8
    pmaxsw      m2, m8
    pmaxsw      m3, m8
    ; Store the values
    mova        [dstq], m4
    mova        [dstq+strideq*2], m5
    mova        [dstq+16], m2
    mova        [dstq+strideq*2+16], m3
    lea         dstq, [dstq+strideq*4]      ; advance 2 rows
    inc         lineq
    jnz         .loop
    REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_32x32 -- args: dst, stride, above, left, bps
;
; For each row r and column c of a 32x32 block of 16-bit words, stores
;   clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Assembled only when ARCH_X86_64 is set (uses m8..m11).
; The loop runs 16 times and produces two rows per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
    movd        m0, [aboveq-2]              ; word 0 = above[-1]
    mova        m1, [aboveq]                ; above[0..7]
    mova        m2, [aboveq+16]             ; above[8..15]
    mova        m3, [aboveq+32]             ; above[16..23]
    mova        m4, [aboveq+48]             ; above[24..31]
    pshuflw     m0, m0, 0x0
    ; Get the values to compute the maximum value at this bit depth
    mov         oned, 1
    pxor        m10, m10
    pxor        m11, m11
    pinsrw      m10, oned, 0
    pinsrw      m11, bpsd, 0                ; shift count = low 16 bits of bps
    pshuflw     m10, m10, 0x0
    DEFINE_ARGS dst, stride, line, left     ; above's register becomes the counter
    punpcklqdq  m10, m10                    ; 1 in every word
    mov         lineq, -16                  ; counts up to 0: 16 passes
    mova        m5, m10
    punpcklqdq  m0, m0                      ; above[-1] in every word
    psllw       m10, m11                    ; 1 << bps
    add         leftq, 64                   ; point past left[31]; indexed by line*4
    psubw       m10, m5                     ; max possible value
    pxor        m11, m11                    ; min possible value
    psubw       m1, m0                      ; above[c] - above[-1], columns 0..7
    psubw       m2, m0                      ; columns 8..15
    psubw       m3, m0                      ; columns 16..23
    psubw       m4, m0                      ; columns 24..31
.loop:
    ; One 4-byte load fetches both left values of this row pair. The previous
    ; second load at +2 read one word beyond left[31] on the final pass.
    movd        m5, [leftq+lineq*4]         ; words 0,1 = left[2k], left[2k+1]
    pshuflw     m6, m5, 0x55                ; left[2k+1] across the low qword
    pshuflw     m5, m5, 0x0                 ; left[2k]   across the low qword
    punpcklqdq  m5, m5
    punpcklqdq  m6, m6
    ; Row 2k
    paddw       m7, m5, m1
    paddw       m8, m5, m2
    paddw       m9, m5, m3
    paddw       m5, m4
    ; Clamp these values to the bit-depth
    pminsw      m7, m10
    pminsw      m8, m10
    pminsw      m9, m10
    pminsw      m5, m10
    pmaxsw      m7, m11
    pmaxsw      m8, m11
    pmaxsw      m9, m11
    pmaxsw      m5, m11
    ; Store these values
    mova        [dstq], m7
    mova        [dstq+16], m8
    mova        [dstq+32], m9
    mova        [dstq+48], m5
    ; Row 2k+1
    paddw       m7, m6, m1
    paddw       m8, m6, m2
    paddw       m9, m6, m3
    paddw       m6, m4
    ; Clamp these values to the bit-depth
    pminsw      m7, m10
    pminsw      m8, m10
    pminsw      m9, m10
    pminsw      m6, m10
    pmaxsw      m7, m11
    pmaxsw      m8, m11
    pmaxsw      m9, m11
    pmaxsw      m6, m11
    ; Store these values
    mova        [dstq+strideq*2], m7
    mova        [dstq+strideq*2+16], m8
    mova        [dstq+strideq*2+32], m9
    mova        [dstq+strideq*2+48], m6
    lea         dstq, [dstq+strideq*4]      ; advance 2 rows
    inc         lineq
    jnz         .loop
    REP_RET
%endif