Merge "VP9: Eliminate up_available and left_available"
[aom.git] / vp8 / common / x86 / idctllm_mmx.asm
blob96fa2c60d0ff468555241213da42a215f5ece679
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; /****************************************************************************
; * Notes:
; *
; * This implementation uses 16-bit fixed-point versions of two multiplication
; * constants:
; *   1. sqrt(2) * cos(pi/8)
; *   2. sqrt(2) * sin(pi/8)
; * Because the first constant is greater than 1, it cannot be stored with the
; * same 16-bit fixed-point precision as the second one, so we use the identity
; *   x * a = x + x * (a - 1)
; * which gives
; *   x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * For the second constant, the 16-bit fixed-point value is 35468, which is
; * larger than 32767, so a signed 16-bit multiply sees it as the negative
; * number 35468 - 65536. Adding x back after the multiply compensates:
; *   (x * (unsigned)35468) >> 16  =  ((x * (signed)35468) >> 16) + x
; *
; **************************************************************************/
;-----------------------------------------------------------------------------
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
;
; 4x4 inverse transform of the 16 words at `input`; the result is added to the
; 4x4 byte block at `pred` and written to `dest` with unsigned-byte saturation.
;
;   arg(0) input   16 words, loaded as four 8-byte rows
;   arg(1) pred    predictor bytes, rows `pitch` bytes apart
;   arg(2) pitch   32-bit value, sign-extended
;   arg(3) dest    output bytes, rows `stride` bytes apart
;   arg(4) stride  32-bit value, sign-extended
;
; Flow: butterfly pass -> 4x4 word transpose -> butterfly pass with rounding
;       ((x + 4) >> 3) -> transpose back -> add predictor, pack, store.
;
; Writes rax, rdx, rsi, rdi and mm0-mm7; rsi and rdi are saved and restored
; here. No emms is executed in this routine.
; sym(), arg(), GLOBAL() and the prolog/epilog macros are provided by
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,            arg(0)              ;input
    mov         rsi,            arg(1)              ;pred

    ; One row of four words per register: mm0..mm3 = rows 0..3 (ip0..ip3).
    movq        mm0,            [rax   ]
    movq        mm1,            [rax+ 8]
    movq        mm2,            [rax+16]
    movq        mm3,            [rax+24]

%if 0
    ; Disabled: would zero the coefficient block after it has been loaded.
    pxor        mm7,            mm7
    movq        [rax],          mm7
    movq        [rax+8],        mm7
    movq        [rax+16],       mm7
    movq        [rax+24],       mm7
%endif
    ; rax is reused from here on: it now holds the predictor pitch.
    movsxd      rax,            dword ptr arg(2)    ;pitch
    mov         rdx,            arg(3)              ;dest
    movsxd      rdi,            dword ptr arg(4)    ;stride

    ; ---- pass 1: combine the four row registers lane by lane ---------------
    psubw       mm0,            mm2             ; b1 = ip0 - ip2
    paddw       mm2,            mm2             ; 2 * ip2

    movq        mm5,            mm1
    paddw       mm2,            mm0             ; a1 = ip0 + ip2

    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,            mm3
    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

    movq        mm5,            mm1
    movq        mm4,            mm3

    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,            mm5             ; d1 = ip1*cos + ip3*sin
    movq        mm6,            mm2             ; a1

    movq        mm4,            mm0             ; b1
    paddw       mm2,            mm3             ; a1 + d1 -> row 0

    paddw       mm4,            mm7             ; b1 + c1 -> row 2 (see transpose)
    psubw       mm0,            mm7             ; b1 - c1 -> row 1 (see transpose)

    psubw       mm6,            mm3             ; a1 - d1 -> row 3

    ; ---- transpose: rows are mm2, mm0, mm4, mm6 ----------------------------
    ; Lane comments list the four words high -> low as <row><column>.
    movq        mm1,            mm2             ; 03 02 01 00
    movq        mm3,            mm4             ; 23 22 21 20

    punpcklwd   mm1,            mm0             ; 11 01 10 00
    punpckhwd   mm2,            mm0             ; 13 03 12 02

    punpcklwd   mm3,            mm6             ; 31 21 30 20
    punpckhwd   mm4,            mm6             ; 33 23 32 22

    movq        mm0,            mm1             ; 11 01 10 00
    movq        mm5,            mm2             ; 13 03 12 02

    punpckldq   mm0,            mm3             ; 30 20 10 00
    punpckhdq   mm1,            mm3             ; 31 21 11 01

    punpckldq   mm2,            mm4             ; 32 22 12 02
    punpckhdq   mm5,            mm4             ; 33 23 13 03

    movq        mm3,            mm5             ; 33 23 13 03

    ; ---- pass 2: same butterflies on the transposed data -------------------
    psubw       mm0,            mm2             ; b1 = ip0 - ip2
    paddw       mm2,            mm2             ; 2 * ip2

    movq        mm5,            mm1
    paddw       mm2,            mm0             ; a1 = ip0 + ip2

    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
    paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

    movq        mm7,            mm3
    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
    psubw       mm7,            mm5             ; c1 = ip3*cos - ip1*sin

    movq        mm5,            mm1
    movq        mm4,            mm3

    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

    paddw       mm3,            mm5             ; d1 = ip1*cos + ip3*sin

    ; Bias a1 and b1 by 4 so each of the four results below is rounded
    ; exactly once by the arithmetic shift right of 3.
    paddw       mm0,            [GLOBAL(fours)]
    paddw       mm2,            [GLOBAL(fours)]
    movq        mm6,            mm2             ; a1

    movq        mm4,            mm0             ; b1
    paddw       mm2,            mm3             ; a1 + d1
    paddw       mm4,            mm7             ; b1 + c1
    psubw       mm0,            mm7             ; b1 - c1
    psubw       mm6,            mm3             ; a1 - d1

    psraw       mm2,            3
    psraw       mm0,            3
    psraw       mm4,            3
    psraw       mm6,            3

    ; ---- transpose back: mm0, mm1, mm2, mm5 end up as output rows 0..3 -----
    movq        mm1,            mm2             ; 03 02 01 00
    movq        mm3,            mm4             ; 23 22 21 20

    punpcklwd   mm1,            mm0             ; 11 01 10 00
    punpckhwd   mm2,            mm0             ; 13 03 12 02

    punpcklwd   mm3,            mm6             ; 31 21 30 20
    punpckhwd   mm4,            mm6             ; 33 23 32 22

    movq        mm0,            mm1             ; 11 01 10 00
    movq        mm5,            mm2             ; 13 03 12 02

    punpckldq   mm0,            mm3             ; 30 20 10 00
    punpckhdq   mm1,            mm3             ; 31 21 11 01

    punpckldq   mm2,            mm4             ; 32 22 12 02
    punpckhdq   mm5,            mm4             ; 33 23 13 03

    ; ---- reconstruct: dest row = saturate_u8(pred row + residual row) ------
    pxor        mm7,            mm7             ; zero, for byte<->word conversion

    ; row 0
    movd        mm4,            [rsi]           ; four predictor bytes
    punpcklbw   mm4,            mm7             ; widen to words
    paddsw      mm0,            mm4             ; signed saturating add
    packuswb    mm0,            mm7             ; clamp to 0..255
    movd        [rdx],          mm0

    ; row 1
    movd        mm4,            [rsi+rax]
    punpcklbw   mm4,            mm7
    paddsw      mm1,            mm4
    packuswb    mm1,            mm7
    movd        [rdx+rdi],      mm1

    ; row 2
    movd        mm4,            [rsi+2*rax]
    punpcklbw   mm4,            mm7
    paddsw      mm2,            mm4
    packuswb    mm2,            mm7
    movd        [rdx+rdi*2],    mm2

    ; Advance both pointers one row so that base + 2*step addresses row 3.
    add         rdx,            rdi
    add         rsi,            rax

    ; row 3
    movd        mm4,            [rsi+2*rax]
    punpcklbw   mm4,            mm7
    paddsw      mm5,            mm4
    packuswb    mm5,            mm7
    movd        [rdx+rdi*2],    mm5

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                         ; without this, execution runs on
                                                ; into whatever follows this routine

;-----------------------------------------------------------------------------
;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
;
; Adds the single rounded value (input_dc + 4) >> 3 to every byte of the 4x4
; block at pred_ptr and writes the result to dst_ptr with unsigned-byte
; saturation.
;
;   arg(0) input_dc     only the low 16 bits of the argument slot are used
;   arg(1) pred_ptr     predictor bytes, rows `pred_stride` bytes apart
;   arg(2) pred_stride  32-bit value, sign-extended
;   arg(3) dst_ptr      output bytes, rows `stride` bytes apart
;   arg(4) stride       32-bit value, sign-extended
;
; Writes rax, rcx, rdx and mm0-mm5. No emms is executed in this routine.
; sym(), arg(), GLOBAL() and the prolog/epilog macros are provided by
; vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    movd        mm5,            arg(0)              ;input_dc
    mov         rax,            arg(1)              ;pred_ptr
    movsxd      rdx,            dword ptr arg(2)    ;pred_stride

    pxor        mm0,            mm0                 ; zero, for byte<->word conversion

    paddw       mm5,            [GLOBAL(fours)]     ; + 4 (rounding bias)
    lea         rcx,            [rdx + rdx*2]       ; rcx = 3 * pred_stride

    psraw       mm5,            3                   ; low word = (dc + 4) >> 3

    ; Replicate the low word into all four word lanes.
    punpcklwd   mm5,            mm5
    punpckldq   mm5,            mm5

    ; Fetch all four predictor rows before rax/rdx/rcx are reused for dst.
    movd        mm1,            [rax]
    movd        mm2,            [rax+rdx]
    movd        mm3,            [rax+2*rdx]
    movd        mm4,            [rax+rcx]

    mov         rax,            arg(3)              ;d -- destination
    movsxd      rdx,            dword ptr arg(4)    ;dst_stride

    ; Per row: widen bytes to words, saturating-add the dc value, then pack
    ; back to bytes with unsigned saturation.
    punpcklbw   mm1,            mm0
    paddsw      mm1,            mm5
    packuswb    mm1,            mm0                 ; clamp to 0..255
    lea         rcx,            [rdx + rdx*2]       ; rcx = 3 * dst_stride

    punpcklbw   mm2,            mm0
    paddsw      mm2,            mm5
    packuswb    mm2,            mm0                 ; clamp to 0..255

    punpcklbw   mm3,            mm0
    paddsw      mm3,            mm5
    packuswb    mm3,            mm0                 ; clamp to 0..255

    punpcklbw   mm4,            mm0
    paddsw      mm4,            mm5
    packuswb    mm4,            mm0                 ; clamp to 0..255

    movd        [rax],          mm1
    movd        [rax+rdx],      mm2
    movd        [rax+2*rdx],    mm3
    movd        [rax+rcx],      mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                             ; without this, execution runs on
                                                    ; into the data that follows

; Read-only constants shared by the routines in this file. Each is one 16-bit
; value repeated four times so that it fills a 64-bit MMX operand.
SECTION_RODATA
align 16
x_s1sqr2:
    ; sqrt(2) * sin(pi/8) scaled by 65536 (35468). As a signed word this is
    ; negative, so every pmulhw that uses it is followed by adding the
    ; multiplicand back.
    times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
    ; (sqrt(2) * cos(pi/8) - 1) scaled by 65536; users add the multiplicand
    ; back to obtain the full product.
    times 4 dw 0x4E7B
align 16
fours:
    ; Rounding bias applied before the arithmetic shift right by 3.
    times 4 dw 0x0004