2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ; /****************************************************************************
17 ; * This implementation makes use of 16-bit fixed-point versions of two multiply constants:
19 ; * 1. sqrt(2) * cos (pi/8)
20 ; * 2. sqrt(2) * sin (pi/8)
21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
22 ; * fixed point precision as the second one, we use a trick of
23 ; * x * a = x + x*(a-1)
25 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
27 ; * For the second constant, the 16-bit fixed-point value is 35468, which
28 ; * exceeds the signed 16-bit maximum of 32767, so in a signed 16-bit multiply it becomes a negative number:
30 ; * (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
32 ; **************************************************************************/
35 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
36 ;int pitch, unsigned char *dest,int stride)
; vp8_short_idct4x4llm_mmx -- MMX routine; per the prototype comment it takes
; (input, pred, pitch, dest, stride).
; NOTE(review): this listing is fragmentary -- the leading numbers on each
; statement jump (e.g. 47 -> 62), so instructions between the visible ones
; are not shown. The comments below describe only what is visible.
37 global sym
(vp8_short_idct4x4llm_mmx
) PRIVATE
38 sym
(vp8_short_idct4x4llm_mmx
):
; Macro from the included x86_abi_support.asm, invoked with the argument
; count 5; presumably makes the arguments reachable through arg(n) --
; confirm against the macro definition.
41 SHADOW_ARGS_TO_STACK
5
; rax = input pointer (argument 0).
47 mov rax
, arg
(0) ;input
; rax is reused here: it now holds the 32-bit pitch (argument 2),
; sign-extended to 64 bits.
62 movsxd rax
, dword ptr arg
(2) ;pitch
; rdi = 32-bit stride (argument 4), sign-extended to 64 bits.
64 movsxd rdi
, dword ptr arg
(4) ;stride
; ---- first butterfly pass (visible part) ----
; psubw/paddw are lane-wise 16-bit subtract/add. The loads that fill
; mm0..mm3 and some intermediate steps are not visible in this listing.
67 psubw mm0
, mm2
; b1= 0-2
71 paddw mm2
, mm0
; a1 =0+2
; pmulhw keeps the high 16 bits of each signed 16x16 product.
; x_s1sqr2 and x_c1sqr2less1 are constants defined outside this view; going
; by their names and the file header they presumably hold sqrt(2)*sin(pi/8)
; and sqrt(2)*cos(pi/8)-1 in 16-bit fixed point -- confirm.
; The paddw after each multiply adds a register back in, which appears to
; match the "+ x" correction terms described in the file header.
73 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
74 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
77 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
79 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
85 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
88 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; ---- transpose (visible part) ----
; Copies are taken first because the punpck* instructions overwrite their
; destination. Words are interleaved (punpck?wd), then dwords (punpck?dq);
; the existing layout comments list the resulting 16-bit lanes, high to low.
102 movq mm1
, mm2
; 03 02 01 00
103 movq mm3
, mm4
; 23 22 21 20
105 punpcklwd mm1
, mm0
; 11 01 10 00
106 punpckhwd mm2
, mm0
; 13 03 12 02
108 punpcklwd mm3
, mm6
; 31 21 30 20
109 punpckhwd mm4
, mm6
; 33 23 32 22
111 movq mm0
, mm1
; 11 01 10 00
112 movq mm5
, mm2
; 13 03 12 02
114 punpckldq mm0
, mm3
; 30 20 10 00
115 punpckhdq mm1
, mm3
; 31 21 11 01
117 punpckldq mm2
, mm4
; 32 22 12 02
118 punpckhdq mm5
, mm4
; 33 23 13 03
; Keep a second copy of the last transposed register in mm3.
120 movq mm3
, mm5
; 33 23 13 03
; ---- second butterfly pass ----
; Same visible instruction sequence as the first pass, now applied to the
; transposed data.
122 psubw mm0
, mm2
; b1= 0-2
126 paddw mm2
, mm0
; a1 =0+2
128 pmulhw mm5
, [GLOBAL(x_s1sqr2
)];
129 paddw mm5
, mm1
; ip1 * sin(pi/8) * sqrt(2)
132 pmulhw mm7
, [GLOBAL(x_c1sqr2less1
)];
134 paddw mm7
, mm3
; ip3 * cos(pi/8) * sqrt(2)
140 pmulhw mm5
, [GLOBAL(x_c1sqr2less1
)]
143 pmulhw mm3
, [GLOBAL(x_s1sqr2
)]
; Add the `fours` constant (defined outside this view) to mm0 and mm2.
; Presumably a rounding bias ahead of a right shift that is not visible
; in this listing -- confirm.
147 paddw mm0
, [GLOBAL(fours
)]
149 paddw mm2
, [GLOBAL(fours
)]
; ---- second transpose: same visible sequence as the first ----
166 movq mm1
, mm2
; 03 02 01 00
167 movq mm3
, mm4
; 23 22 21 20
169 punpcklwd mm1
, mm0
; 11 01 10 00
170 punpckhwd mm2
, mm0
; 13 03 12 02
172 punpcklwd mm3
, mm6
; 31 21 30 20
173 punpckhwd mm4
, mm6
; 33 23 32 22
175 movq mm0
, mm1
; 11 01 10 00
176 movq mm5
, mm2
; 13 03 12 02
178 punpckldq mm0
, mm3
; 30 20 10 00
179 punpckhdq mm1
, mm3
; 31 21 11 01
181 punpckldq mm2
, mm4
; 32 22 12 02
182 punpckhdq mm5
, mm4
; 33 23 13 03
; Load 32 bits from [rsi + 2*rax] into mm4. rax was last seen holding pitch;
; the setup of rsi is not visible -- presumably the pred pointer, confirm.
198 movd mm4
, [rsi
+2*rax
]
; Store the low 32 bits of mm2 to [rdx + 2*rdi]. rdi holds stride; the setup
; of rdx is not visible -- presumably the dest pointer, confirm.
202 movd
[rdx
+rdi
*2], mm2
; Same load/store pattern again, this time storing mm5. Any pointer updates
; between the two pairs are not visible in this listing.
207 movd mm4
, [rsi
+2*rax
]
211 movd
[rdx
+rdi
*2], mm5
221 ;void vp8_dc_only_idct_add_mmx(
223 ;unsigned char *pred_ptr,
225 ;unsigned char *dst_ptr,
227 global sym
(vp8_dc_only_idct_add_mmx
) PRIVATE
228 sym
(vp8_dc_only_idct_add_mmx
):
231 SHADOW_ARGS_TO_STACK
5
235 movd mm5
, arg
(0) ;input_dc
236 mov rax
, arg
(1) ;pred_ptr
237 movsxd rdx
, dword ptr arg
(2) ;pred_stride
241 paddw mm5
, [GLOBAL(fours
)]
242 lea rcx
, [rdx
+ rdx
*2]
252 movd mm3
, [rax
+2*rdx
]
255 mov rax
, arg
(3) ;d -- destination
256 movsxd rdx
, dword ptr arg
(4) ;dst_stride
260 packuswb mm1
, mm0
; pack and unpack to saturate
261 lea rcx
, [rdx
+ rdx
*2]
265 packuswb mm2
, mm0
; pack and unpack to saturate
269 packuswb mm3
, mm0
; pack and unpack to saturate
273 packuswb mm4
, mm0
; pack and unpack to saturate
277 movd
[rax
+2*rdx
], mm3