Merge "Code clean of highbd_tm_predictor_4x4"
[aom.git] / vpx_dsp / x86 / highbd_intrapred_sse2.asm
blob233958a52335fce113795cfd9c0cef0c660f0489
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
; Rounding constants added to the DC sums before the final shift.
; pw_4 and pw_8 are eight 16-bit words each (used with paddw).
; pw_16 and pw_32 keep the pw_ prefix but hold four 32-bit dwords each
; (they are added with paddd by the 16x16 and 32x32 DC predictors).
pw_4:  dw 4, 4, 4, 4, 4, 4, 4, 4
pw_8:  dw 8, 8, 8, 8, 8, 8, 8, 8
pw_16: dd 16, 16, 16, 16
pw_32: dd 32, 32, 32, 32

SECTION .text
;-----------------------------------------------------------------------
; highbd_dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block of 16-bit samples with
;   (above[0..3] + left[0..3] + 4) >> 3
; Row addresses are formed as dst + stride*2, i.e. stride is scaled by the
; 2-byte sample size. goffset is a scratch register used by GET_GOT.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [leftq]            ; l0 l1 l2 l3
  movq                  m1, [aboveq]           ; a0 a1 a2 a3
  paddw                 m0, m1                 ; s0 s1 s2 s3 (per-lane sums)
  pshuflw               m1, m0, 0xe            ; s2 s3 .. ..
  paddw                 m0, m1                 ; word0 = s0+s2, word1 = s1+s3
  pshuflw               m1, m0, 0x1            ; bring word1 down to word0
  paddw                 m0, m1                 ; word0 = sum of all 8 samples
  paddw                 m0, [GLOBAL(pw_4)]     ; rounding
  psraw                 m0, 3                  ; divide by 8
  pshuflw               m0, m0, 0x0            ; broadcast DC to 4 words

  ; rows 0 and 1
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  ; rows 2 and 3
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0

  RESTORE_GOT
  RET
;-----------------------------------------------------------------------
; highbd_dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block of 16-bit samples with
;   (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4
; above and left are read with aligned 16-byte accesses; dst rows are
; written with aligned 16-byte stores.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [leftq]
  paddw                 m0, [aboveq]           ; eight per-lane sums
  ; above/left are no longer needed: reuse their registers.
  DEFINE_ARGS dst, stride, stride3, one
  mov                 oned, 0x00010001
  movd                  m3, oned
  pshufd                m3, m3, 0x0            ; m3 = eight words of 1
  pxor                  m1, m1                 ; zero, used as pack filler
  ; Horizontal add: each pmaddwd-by-ones halves the number of terms.
  pmaddwd               m0, m3                 ; 8 words  -> 4 dwords
  packssdw              m0, m1                 ; 4 dwords -> 4 words
  pmaddwd               m0, m3                 ; 4 words  -> 2 dwords
  packssdw              m0, m1                 ; 2 dwords -> 2 words
  pmaddwd               m0, m3                 ; dword0 = total
  lea             stride3q, [strideq*3]
  paddw                 m0, [GLOBAL(pw_8)]     ; rounding
  psrlw                 m0, 4                  ; divide by 16
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                 ; broadcast DC to 8 words

  ; rows 0-3
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  ; rows 4-7
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0

  RESTORE_GOT
  RET
;-----------------------------------------------------------------------
; highbd_dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block of 16-bit samples with
;   (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5
; above/left are read and dst rows written with aligned 16-byte accesses.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  paddw                 m0, [aboveq+16]
  paddw                 m2, [leftq+16]
  paddw                 m0, m2                 ; eight lanes, 4 samples each
  DEFINE_ARGS dst, stride, stride3, cnt
  pxor                  m1, m1                 ; zero for widening unpacks
  mov                 cntd, 4                  ; 4 iterations x 4 rows
  lea             stride3q, [strideq*3]
  ; Fold the upper half onto the lower half, widening as the sum grows.
  movhlps               m2, m0
  paddw                 m0, m2                 ; 4 word sums
  punpcklwd             m0, m1                 ; -> 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2                 ; 2 dword sums
  punpckldq             m0, m1                 ; d0 0 d1 0
  movhlps               m2, m0
  paddd                 m0, m2                 ; dword0 = total
  paddd                 m0, [GLOBAL(pw_16)]    ; rounding (dword constant)
  psrad                 m0, 5                  ; divide by 32
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                 ; broadcast DC to 8 words
.rows:
  ; left 8 samples of four rows
  mova   [dstq              ], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+stride3q*2   ], m0
  ; right 8 samples of the same four rows
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .rows

  RESTORE_GOT
  REP_RET
%if ARCH_X86_64
;-----------------------------------------------------------------------
; highbd_dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block of 16-bit samples with
;   (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6
; Only assembled when ARCH_X86_64 is set. above/left are read and dst rows
; written with aligned 16-byte accesses.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  paddw                 m0, [aboveq+16]
  paddw                 m2, [leftq+16]
  paddw                 m0, [aboveq+32]
  paddw                 m2, [leftq+32]
  paddw                 m0, [aboveq+48]
  paddw                 m2, [leftq+48]
  paddw                 m0, m2                 ; eight lanes, 8 samples each
  DEFINE_ARGS dst, stride, stride3, cnt
  pxor                  m1, m1                 ; zero for widening unpacks
  mov                 cntd, 8                  ; 8 iterations x 4 rows
  lea             stride3q, [strideq*3]
  ; Fold the upper half onto the lower half, widening as the sum grows.
  movhlps               m2, m0
  paddw                 m0, m2                 ; 4 word sums
  punpcklwd             m0, m1                 ; -> 4 dwords
  movhlps               m2, m0
  paddd                 m0, m2                 ; 2 dword sums
  punpckldq             m0, m1                 ; d0 0 d1 0
  movhlps               m2, m0
  paddd                 m0, m2                 ; dword0 = total
  paddd                 m0, [GLOBAL(pw_32)]    ; rounding (dword constant)
  psrad                 m0, 6                  ; divide by 64
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                 ; broadcast DC to 8 words
.rows:
  ; row 0 of this group
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq           +32], m0
  mova   [dstq           +48], m0
  ; row 1
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*2 +32], m0
  mova   [dstq+strideq*2 +48], m0
  ; row 2
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+strideq*4 +32], m0
  mova   [dstq+strideq*4 +48], m0
  ; row 3
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  mova   [dstq+stride3q*2+32], m0
  mova   [dstq+stride3q*2+48], m0
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .rows

  RESTORE_GOT
  REP_RET
%endif
;-----------------------------------------------------------------------
; highbd_v_predictor_4x4(dst, stride, above)
; Vertical prediction: copies the four 16-bit samples at above[0..3] into
; each of the four rows of dst. Row addresses are dst + stride*2, i.e.
; stride is scaled by the 2-byte sample size.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]           ; a0 a1 a2 a3
  ; rows 0 and 1
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  ; rows 2 and 3
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  ; Return explicitly; without this the function would run on into the
  ; code that follows it.
  RET
;-----------------------------------------------------------------------
; highbd_v_predictor_8x8(dst, stride, above)
; Vertical prediction: copies the eight 16-bit samples at above[0..7] into
; each of the eight rows of dst. above is read and dst rows are written
; with aligned 16-byte accesses.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  ; above is no longer needed: reuse its register for 3*stride.
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  ; rows 0-3
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  ; rows 4-7
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  ; Return explicitly; without this the function would run on into the
  ; code that follows it.
  RET
;-----------------------------------------------------------------------
; highbd_v_predictor_16x16(dst, stride, above)
; Vertical prediction: copies the sixteen 16-bit samples at above[0..15]
; into each of the sixteen rows of dst, four rows per loop iteration.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]           ; samples 0-7
  mova                  m1, [aboveq+16]        ; samples 8-15
  DEFINE_ARGS dst, stride, stride3, cnt
  mov                 cntd, 4                  ; 4 iterations x 4 rows
  lea             stride3q, [strideq*3]
.rows:
  ; left half of four rows
  mova   [dstq              ], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+stride3q*2   ], m0
  ; right half of the same four rows
  mova   [dstq           +16], m1
  mova   [dstq+strideq*2 +16], m1
  mova   [dstq+strideq*4 +16], m1
  mova   [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .rows
  REP_RET
;-----------------------------------------------------------------------
; highbd_v_predictor_32x32(dst, stride, above)
; Vertical prediction: copies the thirty-two 16-bit samples at
; above[0..31] into each of the 32 rows of dst, four rows per iteration.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]           ; samples  0-7
  mova                  m1, [aboveq+16]        ; samples  8-15
  mova                  m2, [aboveq+32]        ; samples 16-23
  mova                  m3, [aboveq+48]        ; samples 24-31
  DEFINE_ARGS dst, stride, stride3, cnt
  mov                 cntd, 8                  ; 8 iterations x 4 rows
  lea             stride3q, [strideq*3]
.rows:
  ; column 0-7 of four rows
  mova   [dstq              ], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+stride3q*2   ], m0
  ; column 8-15
  mova   [dstq           +16], m1
  mova   [dstq+strideq*2 +16], m1
  mova   [dstq+strideq*4 +16], m1
  mova   [dstq+stride3q*2+16], m1
  ; column 16-23
  mova   [dstq           +32], m2
  mova   [dstq+strideq*2 +32], m2
  mova   [dstq+strideq*4 +32], m2
  mova   [dstq+stride3q*2+32], m2
  ; column 24-31
  mova   [dstq           +48], m3
  mova   [dstq+strideq*2 +48], m3
  mova   [dstq+strideq*4 +48], m3
  mova   [dstq+stride3q*2+48], m3
  lea                 dstq, [dstq+strideq*8]
  dec                 cntd
  jnz .rows
  REP_RET
;-----------------------------------------------------------------------
; highbd_tm_predictor_4x4(dst, stride, above, left, bps)
; TM prediction on a 4x4 block of 16-bit samples:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are computed per register (row r in the low half, row r+1 in
; the high half).
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
  movd                  m1, [aboveq-2]         ; word0 = above[-1] (top-left)
  movq                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
  ; Get the values to compute the maximum value at this bit depth
  pcmpeqw               m3, m3         ; all ones
  movd                  m4, bpsd       ; shift count
  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
  psllw                 m3, m4         ; 0xffff << bps
  pcmpeqw               m2, m2
  pxor                  m4, m4         ; min possible value
  pxor                  m3, m2         ; max possible value: (1 << bps) - 1
  ; Only four left samples are consumed (pshuflw reads the low four words
  ; and movlhps replaces the high half), so an 8-byte load is sufficient
  ; and does not require 16-byte alignment of left.
  movq                  m1, [leftq]
  pshuflw               m2, m1, 0x0
  pshuflw               m5, m1, 0x55
  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m2
  movhpd  [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]
  pshuflw               m2, m1, 0xaa
  pshuflw               m5, m1, 0xff
  movlhps               m2, m5         ; l3 l3 l3 l3 l4 l4 l4 l4
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m2, m3
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m2
  movhpd  [dstq+strideq*2], m2
  ; Return explicitly; without this the function would run on into the
  ; code that follows it.
  RET
;-----------------------------------------------------------------------
; highbd_tm_predictor_8x8(dst, stride, above, left, bps)
; TM prediction on an 8x8 block of 16-bit samples:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration. "one" is a scratch register.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2]         ; word0 = above[-1] (top-left)
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bpsd, 0            ; shift count
  pshuflw               m3, m3, 0x0
  ; above, bps and one are consumed: remap registers for the loop.
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m3, m3                 ; eight words of 1
  mov                lineq, -4                 ; 4 iterations, counts up to 0
  mova                  m2, m3
  punpcklqdq            m1, m1                 ; eight words of top-left
  psllw                 m3, m4                 ; 1 << bps
  add                leftq, 16                 ; point past left[7]
  psubw                 m3, m2 ; max possible value
  pxor                  m4, m4 ; min possible value
  psubw                 m0, m1                 ; above[c] - top-left
.loop:
  ; One 4-byte load fetches both left samples of this row pair. Taking
  ; the second sample from word 1 avoids a second load at +2, which on
  ; the last iteration would read 2 bytes past left[7].
  movd                  m1, [leftq+lineq*4]
  pshuflw               m2, m1, 0x55           ; second row's left sample
  pshuflw               m1, m1, 0x0            ; first row's left sample
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  mova      [dstq          ], m1
  mova      [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
343 %if ARCH_X86_64
;-----------------------------------------------------------------------
; highbd_tm_predictor_16x16(dst, stride, above, left, bps)
; TM prediction on a 16x16 block of 16-bit samples:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration. "one" is a scratch register.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
  movd                  m2, [aboveq-2]         ; word0 = above[-1] (top-left)
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  pshuflw               m2, m2, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m7, m7
  pxor                  m8, m8
  pinsrw                m7, oned, 0
  pinsrw                m8, bpsd, 0            ; shift count
  pshuflw               m7, m7, 0x0
  ; above, bps and one are consumed: remap registers for the loop.
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m7, m7                 ; eight words of 1
  mov                lineq, -8                 ; 8 iterations, counts up to 0
  mova                  m5, m7
  punpcklqdq            m2, m2                 ; eight words of top-left
  psllw                 m7, m8                 ; 1 << bps
  add                leftq, 32                 ; point past left[15]
  psubw                 m7, m5 ; max possible value
  pxor                  m8, m8 ; min possible value
  psubw                 m0, m2                 ; above[0..7]  - top-left
  psubw                 m1, m2                 ; above[8..15] - top-left
.loop:
  ; One 4-byte load fetches both left samples of this row pair. Taking
  ; the second sample from word 1 avoids a second load at +2, which on
  ; the last iteration would read 2 bytes past left[15].
  movd                  m2, [leftq+lineq*4]
  pshuflw               m3, m2, 0x55           ; second row's left sample
  pshuflw               m2, m2, 0x0            ; first row's left sample
  punpcklqdq            m2, m2
  punpcklqdq            m3, m3
  paddw                 m4, m2, m0
  paddw                 m5, m3, m0
  paddw                 m2, m1
  paddw                 m3, m1
  ;Clamp to the bit-depth
  pminsw                m4, m7
  pminsw                m5, m7
  pminsw                m2, m7
  pminsw                m3, m7
  pmaxsw                m4, m8
  pmaxsw                m5, m8
  pmaxsw                m2, m8
  pmaxsw                m3, m8
  ;Store the values
  mova   [dstq             ], m4
  mova   [dstq+strideq*2   ], m5
  mova   [dstq          +16], m2
  mova   [dstq+strideq*2+16], m3
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
;-----------------------------------------------------------------------
; highbd_tm_predictor_32x32(dst, stride, above, left, bps)
; TM prediction on a 32x32 block of 16-bit samples:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration. "one" is a scratch register.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
  movd                  m0, [aboveq-2]         ; word0 = above[-1] (top-left)
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  pshuflw               m0, m0, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                 m10, m10
  pxor                 m11, m11
  pinsrw               m10, oned, 0
  pinsrw               m11, bpsd, 0            ; shift count
  pshuflw              m10, m10, 0x0
  ; above, bps and one are consumed: remap registers for the loop.
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq           m10, m10                ; eight words of 1
  mov                lineq, -16                ; 16 iterations, counts up to 0
  mova                  m5, m10
  punpcklqdq            m0, m0                 ; eight words of top-left
  psllw                m10, m11                ; 1 << bps
  add                leftq, 64                 ; point past left[31]
  psubw                m10, m5 ; max possible value
  pxor                 m11, m11 ; min possible value
  psubw                 m1, m0                 ; above[c] - top-left, 4 x 8
  psubw                 m2, m0
  psubw                 m3, m0
  psubw                 m4, m0
.loop:
  ; One 4-byte load fetches both left samples of this row pair. Taking
  ; the second sample from word 1 avoids a second load at +2, which on
  ; the last iteration would read 2 bytes past left[31].
  movd                  m5, [leftq+lineq*4]
  pshuflw               m6, m5, 0x55           ; second row's left sample
  pshuflw               m5, m5, 0x0            ; first row's left sample
  punpcklqdq            m5, m5
  punpcklqdq            m6, m6
  paddw                 m7, m5, m1
  paddw                 m8, m5, m2
  paddw                 m9, m5, m3
  paddw                 m5, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m5, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m5, m11
  ;Store these values
  mova   [dstq           ], m7
  mova   [dstq        +16], m8
  mova   [dstq        +32], m9
  mova   [dstq        +48], m5
  paddw                 m7, m6, m1
  paddw                 m8, m6, m2
  paddw                 m9, m6, m3
  paddw                 m6, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m6, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m6, m11
  ;Store these values
  mova   [dstq+strideq*2   ], m7
  mova   [dstq+strideq*2+16], m8
  mova   [dstq+strideq*2+32], m9
  mova   [dstq+strideq*2+48], m6
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
473 %endif