vp8/common/x86/idctllm_mmx.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11
  12 %include "vpx_ports/x86_abi_support.asm"
  13
  14 ; /****************************************************************************
  15 ; * Notes:
  16 ; *
  17 ; * This implementation makes use of 16 bit fixed point versions of two
  18 ; * multiply constants:
  19 ; *        1.   sqrt(2) * cos (pi/8)
  20 ; *        2.   sqrt(2) * sin (pi/8)
  21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
  22 ; * fixed point precision as the second one, we use a trick of
  23 ; *        x * a = x + x*(a-1)
  24 ; * so
  25 ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
  26 ; *
  27 ; * For the second constant, the 16 bit version is 35468, which is bigger
  28 ; * than 32768, so in a signed 16 bit multiply it is treated as a negative
  29 ; * number (35468 - 65536). Adding x back after the multiply corrects this:
  30 ; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
  31 ; *
  32 ; **************************************************************************/
  33
  34
  35 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
  36 ;int pitch, unsigned char *dest,int stride)
     ;
     ; 4x4 inverse transform of the 16 coefficients at input; the result is
     ; added to the 4x4 block of bytes at pred and stored as bytes at dest.
     ;   input  - 16 16-bit values, read as four 8-byte rows (32 bytes)
     ;   pred   - prediction bytes, 4 per row, rows are pitch bytes apart
     ;   dest   - output bytes, 4 per row, rows are stride bytes apart
     ; The transform is two identical butterfly passes, each followed by a
     ; 4x4 word transpose. The second pass adds 4 and shifts right by 3.
     ; The sum with the prediction is saturated to 0..255 before the store.
     ; Overwrites rax, rdx and mm0-mm7; rsi and rdi are saved and restored.
     ; No emms is issued here; the caller is presumably expected to clear
     ; the MMX state.
  37 global sym(vp8_short_idct4x4llm_mmx) PRIVATE
  38 sym(vp8_short_idct4x4llm_mmx):
  39     push        rbp
  40     mov         rbp, rsp
  41     SHADOW_ARGS_TO_STACK 5                      ; macro from x86_abi_support.asm; 5 matches the five parameters above
  42     GET_GOT     rbx                             ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
  43     push        rsi                             ; rsi and rdi are overwritten below (pred, stride)
  44     push        rdi
  45     ; end prolog
  46
  47     mov         rax,    arg(0)              ;input
  48     mov         rsi,    arg(1)              ;pred
  49
  50     movq        mm0,    [rax   ]            ; row 0: input[0..3]
  51     movq        mm1,    [rax+ 8]            ; row 1: input[4..7]
  52     movq        mm2,    [rax+16]            ; row 2: input[8..11]
  53     movq        mm3,    [rax+24]            ; row 3: input[12..15]
  54     ; the %if 0 block below is disabled code that would zero the 32 input bytes
  55 %if 0
  56     pxor        mm7,    mm7
  57     movq        [rax],   mm7
  58     movq        [rax+8], mm7
  59     movq        [rax+16],mm7
  60     movq        [rax+24],mm7
  61 %endif
  62     movsxd      rax,    dword ptr arg(2)    ;pitch, sign-extended (rax no longer holds input)
  63     mov         rdx,    arg(3)              ;dest
  64     movsxd      rdi,    dword ptr arg(4)    ;stride, sign-extended
  65
  66     ; pass 1: mm0..mm3 are rows 0..3; every word lane is processed independently
  67     psubw       mm0,            mm2             ; b1= 0-2
  68     paddw       mm2,            mm2             ; 2 * row 2
  69
  70     movq        mm5,            mm1
  71     paddw       mm2,            mm0             ; a1 =0+2  (2*row2 + (row0 - row2))
  72
  73     pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; high 16 bits of the signed product
  74     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)  (x added back, see Notes at top of file)
  75
  76     movq        mm7,            mm3             ;
  77     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; ip3 * (sqrt(2)*cos(pi/8) - 1)
  78
  79     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
  80     psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
  81
  82     movq        mm5,            mm1
  83     movq        mm4,            mm3             ; keep ip3; mm3 is overwritten by the multiply below
  84
  85     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
  86     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
  87
  88     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
  89     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
  90
  91     paddw       mm3,            mm5             ; d1
  92     movq        mm6,            mm2             ; a1
  93
  94     movq        mm4,            mm0             ; b1
  95     paddw       mm2,            mm3             ;0  = a1 + d1
  96
  97     paddw       mm4,            mm7             ; b1 + c1; the transpose below consumes this as vector 2
  98     psubw       mm0,            mm7             ; b1 - c1; the transpose below consumes this as vector 1
  99
 100     psubw       mm6,            mm3             ;3  = a1 - d1
 101     ; 4x4 word transpose; comments list words high to low as <source vector><lane>
 102     movq        mm1,            mm2             ; 03 02 01 00
 103     movq        mm3,            mm4             ; 23 22 21 20
 104
 105     punpcklwd   mm1,            mm0             ; 11 01 10 00
 106     punpckhwd   mm2,            mm0             ; 13 03 12 02
 107
 108     punpcklwd   mm3,            mm6             ; 31 21 30 20
 109     punpckhwd   mm4,            mm6             ; 33 23 32 22
 110
 111     movq        mm0,            mm1             ; 11 01 10 00
 112     movq        mm5,            mm2             ; 13 03 12 02
 113
 114     punpckldq   mm0,            mm3             ; 30 20 10 00
 115     punpckhdq   mm1,            mm3             ; 31 21 11 01
 116
 117     punpckldq   mm2,            mm4             ; 32 22 12 02
 118     punpckhdq   mm5,            mm4             ; 33 23 13 03
 119
 120     movq        mm3,            mm5             ; 33 23 13 03
 121     ; pass 2: same butterfly on the transposed data, now in mm0..mm3
 122     psubw       mm0,            mm2             ; b1= 0-2
 123     paddw       mm2,            mm2             ; 2 * vector 2
 124
 125     movq        mm5,            mm1
 126     paddw       mm2,            mm0             ; a1 =0+2
 127
 128     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
 129     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
 130
 131     movq        mm7,            mm3             ;
 132     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
 133
 134     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
 135     psubw       mm7,            mm5             ; c1 = ip3*cos(pi/8)*sqrt(2) - ip1*sin(pi/8)*sqrt(2)
 136
 137     movq        mm5,            mm1
 138     movq        mm4,            mm3             ; keep ip3; mm3 is overwritten by the multiply below
 139
 140     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
 141     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
 142
 143     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
 144     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
 145
 146     paddw       mm3,            mm5             ; d1
 147     paddw       mm0,            [GLOBAL(fours)] ; b1 += 4, so both b1 +/- c1 outputs carry the rounding term
 148
 149     paddw       mm2,            [GLOBAL(fours)] ; a1 += 4, so both a1 +/- d1 outputs carry the rounding term
 150     movq        mm6,            mm2             ; a1
 151
 152     movq        mm4,            mm0             ; b1
 153     paddw       mm2,            mm3             ;0  = a1 + d1
 154
 155     paddw       mm4,            mm7             ; b1 + c1; the transpose below consumes this as vector 2
 156     psubw       mm0,            mm7             ; b1 - c1; the transpose below consumes this as vector 1
 157
 158     psubw       mm6,            mm3             ;3  = a1 - d1
 159     psraw       mm2,            3               ; arithmetic >> 3 on each of the four outputs
 160
 161     psraw       mm0,            3
 162     psraw       mm4,            3
 163
 164     psraw       mm6,            3
 165     ; second 4x4 word transpose; afterwards mm0, mm1, mm2, mm5 are output rows 0..3
 166     movq        mm1,            mm2             ; 03 02 01 00
 167     movq        mm3,            mm4             ; 23 22 21 20
 168
 169     punpcklwd   mm1,            mm0             ; 11 01 10 00
 170     punpckhwd   mm2,            mm0             ; 13 03 12 02
 171
 172     punpcklwd   mm3,            mm6             ; 31 21 30 20
 173     punpckhwd   mm4,            mm6             ; 33 23 32 22
 174
 175     movq        mm0,            mm1             ; 11 01 10 00
 176     movq        mm5,            mm2             ; 13 03 12 02
 177
 178     punpckldq   mm0,            mm3             ; 30 20 10 00
 179     punpckhdq   mm1,            mm3             ; 31 21 11 01
 180
 181     punpckldq   mm2,            mm4             ; 32 22 12 02
 182     punpckhdq   mm5,            mm4             ; 33 23 13 03
 183
 184     pxor        mm7,            mm7             ; mm7 = 0, used to widen bytes and as the upper half when packing
 185
 186     movd        mm4,            [rsi]           ; 4 pred bytes, row 0
 187     punpcklbw   mm4,            mm7             ; zero-extend the bytes to words
 188     paddsw      mm0,            mm4             ; residual + pred, signed saturating add
 189     packuswb    mm0,            mm7             ; saturate to 0..255 and pack to bytes
 190     movd        [rdx],          mm0             ; store 4 bytes, dest row 0
 191
 192     movd        mm4,            [rsi+rax]       ; pred row 1
 193     punpcklbw   mm4,            mm7
 194     paddsw      mm1,            mm4
 195     packuswb    mm1,            mm7
 196     movd        [rdx+rdi],      mm1             ; dest row 1
 197
 198     movd        mm4,            [rsi+2*rax]     ; pred row 2
 199     punpcklbw   mm4,            mm7
 200     paddsw      mm2,            mm4
 201     packuswb    mm2,            mm7
 202     movd        [rdx+rdi*2],    mm2             ; dest row 2
 203
 204     add         rdx,            rdi             ; advance both pointers one row so that
 205     add         rsi,            rax             ; [base + 2*step] below addresses row 3
 206
 207     movd        mm4,            [rsi+2*rax]     ; pred row 3
 208     punpcklbw   mm4,            mm7
 209     paddsw      mm5,            mm4
 210     packuswb    mm5,            mm7
 211     movd        [rdx+rdi*2],    mm5             ; dest row 3
 212
 213     ; begin epilog
 214     pop rdi
 215     pop rsi
 216     RESTORE_GOT
 217     UNSHADOW_ARGS
 218     pop         rbp
 219     ret
 220
 221 ;void vp8_dc_only_idct_add_mmx(
 222 ;short input_dc,
 223 ;unsigned char *pred_ptr,
 224 ;int pred_stride,
 225 ;unsigned char *dst_ptr,
 226 ;int stride)
     ;
     ; DC-only case: computes (input_dc + 4) >> 3 once and adds it to each
     ; byte of the 4x4 block at pred_ptr (rows are pred_stride bytes apart).
     ; Each sum is saturated to 0..255 and written to the 4x4 block at
     ; dst_ptr (rows are stride bytes apart).
     ; All four prediction rows are loaded before any destination row is
     ; stored.
     ; Overwrites rax, rcx, rdx and mm0-mm5. No emms is issued here; the
     ; caller is presumably expected to clear the MMX state.
 227 global sym(vp8_dc_only_idct_add_mmx) PRIVATE
 228 sym(vp8_dc_only_idct_add_mmx):
 229     push        rbp
 230     mov         rbp, rsp
 231     SHADOW_ARGS_TO_STACK 5                      ; macro from x86_abi_support.asm; 5 matches the five parameters above
 232     GET_GOT     rbx                             ; macro from x86_abi_support.asm; paired with RESTORE_GOT in the epilog
 233     ; end prolog
 234
 235         movd        mm5,            arg(0) ;input_dc (32-bit load; only the low word survives the broadcast below)
 236         mov         rax,            arg(1) ;pred_ptr
 237         movsxd      rdx,            dword ptr arg(2) ;pred_stride, sign-extended
 238
 239         pxor        mm0,            mm0              ; mm0 = 0, used to widen bytes and as the upper half when packing
 240
 241         paddw       mm5,            [GLOBAL(fours)]  ; +4: rounding term for the >> 3 below
 242         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride
 243
 244         psraw       mm5,            3                ; word 0 = (input_dc + 4) >> 3
 245
 246         punpcklwd   mm5,            mm5              ; words, low to high: w0 w0 w1 w1
 247
 248         punpckldq   mm5,            mm5              ; w0 replicated into all four words
 249
 250         movd        mm1,            [rax]            ; pred row 0 (4 bytes)
 251         movd        mm2,            [rax+rdx]        ; pred row 1
 252         movd        mm3,            [rax+2*rdx]      ; pred row 2
 253         movd        mm4,            [rax+rcx]        ; pred row 3
 254
 255         mov         rax,            arg(3) ;dst_ptr (rax no longer holds pred_ptr)
 256         movsxd      rdx,            dword ptr arg(4) ;stride, sign-extended (rdx no longer holds pred_stride)
 257
 258         punpcklbw   mm1,            mm0              ; zero-extend the row 0 bytes to words
 259         paddsw      mm1,            mm5              ; pred + dc, signed saturating add
 260         packuswb    mm1,            mm0              ; pack and unpack to saturate
 261         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * stride
 262
 263         punpcklbw   mm2,            mm0              ; row 1
 264         paddsw      mm2,            mm5
 265         packuswb    mm2,            mm0              ; pack and unpack to saturate
 266
 267         punpcklbw   mm3,            mm0              ; row 2
 268         paddsw      mm3,            mm5
 269         packuswb    mm3,            mm0              ; pack and unpack to saturate
 270
 271         punpcklbw   mm4,            mm0              ; row 3
 272         paddsw      mm4,            mm5
 273         packuswb    mm4,            mm0              ; pack and unpack to saturate
 274
 275         movd        [rax],          mm1              ; dst row 0 (4 bytes)
 276         movd        [rax+rdx],      mm2              ; dst row 1
 277         movd        [rax+2*rdx],    mm3              ; dst row 2
 278         movd        [rax+rcx],      mm4              ; dst row 3
 279
 280     ; begin epilog
 281     RESTORE_GOT
 282     UNSHADOW_ARGS
 283     pop         rbp
 284     ret
 285
 286 SECTION_RODATA
     ; Constants used as pmulhw/paddw memory operands by the two routines in
     ; this file. Each is one 16-bit value repeated four times (8 bytes).
 287 align 16
 288 x_s1sqr2:
 289     times 4 dw 0x8A8C                           ; 35468 ~= sqrt(2)*sin(pi/8) * 65536; negative when read as a signed word
 290 align 16
 291 x_c1sqr2less1:
 292     times 4 dw 0x4E7B                           ; 20091 ~= (sqrt(2)*cos(pi/8) - 1) * 65536
 293 align 16
 294 fours:
 295     times 4 dw 0x0004                           ; added before the arithmetic shift right by 3 in both routines