2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 #ifdef COMPILE_TEMPLATE_SSE
/* Mnemonic used wherever the template needs an unaligned full-register move. */
26 #define MOVQU "movdqu"
/*
 * LOAD(mem,dst): move from mem into dst using MOV, then interleave the low
 * bytes of dst with the bytes of register 7 (punpcklbw). When register 7 is
 * zero this widens each loaded byte to a 16-bit word.
 * NOTE(review): MOV and MM are defined outside the lines shown here.
 */
28 #define LOAD(mem,dst) \
29 MOV" "mem", "dst" \n\t"\
30 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the whole register right by one / two bytes. */
31 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
32 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/*
 * PSHUF(src,dst): copies the SECOND argument into the FIRST and then shifts
 * the first right by two bytes. Despite the parameter names, the first
 * argument is the register that gets written; the second is left unchanged.
 */
33 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
34 "psrldq $2, "src" \n\t"
/*
 * Second set of definitions for the same helper macros, built from quadword
 * shifts and pshufw instead of the byte-wise psrldq forms.
 * NOTE(review): presumably the alternative branch of the
 * COMPILE_TEMPLATE_SSE conditional - confirm against the surrounding
 * preprocessor directives.
 *
 * LOAD(mem,dst): move from mem into dst using MOV, then interleave the low
 * bytes of dst with register 7 (widens bytes to words when register 7 is 0).
 */
42 #define LOAD(mem,dst) \
43 MOV" "mem", "dst" \n\t"\
44 "punpcklbw "MM"7, "dst" \n\t"
/* PSRL1 / PSRL2: shift the quadword right by 8 / 16 bits (one / two bytes). */
45 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
46 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/*
 * PSHUF(src,dst): writes into the FIRST argument a word shuffle of the
 * SECOND argument with selector 9 (result words = source words 1, 2, 0, 0),
 * so the two low result words equal the source shifted right by two bytes.
 * Despite the parameter names, the first argument is the one written.
 */
47 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/*
 * PABS(tmp,dst): replace each signed 16-bit word of dst with its absolute
 * value.
 *
 * SSSE3 variant: a single pabsw; tmp is not used.
 */
50 #ifdef COMPILE_TEMPLATE_SSSE3
51 #define PABS(tmp,dst) \
52 "pabsw "dst", "dst" \n\t"
/*
 * Variant without pabsw: tmp is cleared, set to 0 - dst, and dst becomes the
 * signed word-wise maximum of dst and tmp. tmp is clobbered.
 */
54 #define PABS(tmp,dst) \
55 "pxor "tmp", "tmp" \n\t"\
56 "psubw "dst", "tmp" \n\t"\
57 "pmaxsw "tmp", "dst" \n\t"
/*
 * CHECK(pj,mj): evaluate one candidate interpolation direction.
 *
 * pj and mj are stringified and used as byte displacements on the
 * (cur,mrefs) and (cur,prefs) addresses respectively.
 *
 * Register usage, as visible in the sequence below:
 *   - register 2 and 3 receive the two unaligned loads;
 *   - register 5 ends up holding the per-pixel average of the two rows as
 *     16-bit words. pavgb rounds up, so the low bit of (a ^ b), masked with
 *     the pb_1 operand, is subtracted to obtain the truncating average
 *     before the bytes are widened;
 *   - the byte-wise absolute difference is formed from two saturating
 *     subtractions combined with pmaxub, then copies shifted by one and two
 *     bytes are widened and summed so that register 2 holds the score
 *     (sum of three neighbouring absolute differences) as 16-bit words;
 *   - register 4 is scratch.
 * Register 7 is used as the high half for every punpcklbw, so it must be
 * zero for the widening to be a zero-extension.
 */
61 #define CHECK(pj,mj) \
62 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
63 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
64 MOVQ" "MM"2, "MM"4 \n\t"\
65 MOVQ" "MM"2, "MM"5 \n\t"\
66 "pxor "MM"3, "MM"4 \n\t"\
67 "pavgb "MM"3, "MM"5 \n\t"\
68 "pand %[pb_1], "MM"4 \n\t"\
69 "psubusb "MM"4, "MM"5 \n\t"\
71 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
72 MOVQ" "MM"2, "MM"4 \n\t"\
73 "psubusb "MM"3, "MM"2 \n\t"\
74 "psubusb "MM"4, "MM"3 \n\t"\
75 "pmaxub "MM"3, "MM"2 \n\t"\
76 MOVQ" "MM"2, "MM"3 \n\t"\
77 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
78 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
79 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
80 "punpcklbw "MM"7, "MM"2 \n\t"\
81 "punpcklbw "MM"7, "MM"3 \n\t"\
82 "punpcklbw "MM"7, "MM"4 \n\t"\
83 "paddw "MM"3, "MM"2 \n\t"\
84 "paddw "MM"4, "MM"2 \n\t" /* score */
87 MOVQ" "MM"0, "MM"3 \n\t"\
88 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
89 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
90 MOVQ" "MM"3, "MM"6 \n\t"\
91 "pand "MM"3, "MM"5 \n\t"\
92 "pandn "MM"1, "MM"3 \n\t"\
93 "por "MM"5, "MM"3 \n\t"\
94 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: conditional update of the best score (register 0) and the spatial
 * prediction (register 1) from the candidate score in register 2 and the
 * candidate average in register 5.
 *
 * Before comparing, register 6 is turned into a penalty: pw_1 is added and
 * the result is shifted left by 14, then added to the score with signed
 * saturation. If register 6 holds an all-ones word (-1) the penalty is 0; if
 * it holds 0 the penalty is 1<<14.
 * NOTE(review): register 6 is expected to hold the comparison mask saved by
 * the preceding check step - confirm against the macro that runs before
 * this one.
 *
 * The remainder selects per word: where best score > penalised score,
 * register 0 takes the lower score and register 1 takes register 5;
 * elsewhere register 1 keeps its previous value. Registers 3, 5 and 6 are
 * clobbered.
 */
96 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
97 hurts both quality and speed, but matches the C version. */\
98 "paddw %[pw_1], "MM"6 \n\t"\
99 "psllw $14, "MM"6 \n\t"\
100 "paddsw "MM"6, "MM"2 \n\t"\
101 MOVQ" "MM"0, "MM"3 \n\t"\
102 "pcmpgtw "MM"2, "MM"3 \n\t"\
103 "pminsw "MM"2, "MM"0 \n\t"\
104 "pand "MM"3, "MM"5 \n\t"\
105 "pandn "MM"1, "MM"3 \n\t"\
106 "por "MM"5, "MM"3 \n\t"\
107 MOVQ" "MM"3, "MM"1 \n\t"
109 VLC_TARGET
static void RENAME(yadif_filter_line
)(uint8_t *dst
,
110 uint8_t *prev
, uint8_t *cur
, uint8_t *next
,
111 int w
, int prefs
, int mrefs
, int parity
, int mode
)
114 uint8_t *tmpA
= (uint8_t*)(((uint64_t)(tmp
+15)) & ~15);
118 for(x=0; x<w; x+=STEP){\
120 "pxor "MM"7, "MM"7 \n\t"\
121 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
122 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
123 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
124 LOAD("(%["next2"])", MM"3") /* next2[x] */\
125 MOVQ" "MM"3, "MM"4 \n\t"\
126 "paddw "MM"2, "MM"3 \n\t"\
127 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
128 MOVQ" "MM"0, (%[tmpA]) \n\t" /* c */\
129 MOVQ" "MM"3, 16(%[tmpA]) \n\t" /* d */\
130 MOVQ" "MM"1, 32(%[tmpA]) \n\t" /* e */\
131 "psubw "MM"4, "MM"2 \n\t"\
132 PABS( MM"4", MM"2") /* temporal_diff0 */\
133 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
134 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
135 "psubw "MM"0, "MM"3 \n\t"\
136 "psubw "MM"1, "MM"4 \n\t"\
139 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
140 "psrlw $1, "MM"2 \n\t"\
141 "psrlw $1, "MM"3 \n\t"\
142 "pmaxsw "MM"3, "MM"2 \n\t"\
143 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
144 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
145 "psubw "MM"0, "MM"3 \n\t"\
146 "psubw "MM"1, "MM"4 \n\t"\
149 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
150 "psrlw $1, "MM"3 \n\t"\
151 "pmaxsw "MM"3, "MM"2 \n\t"\
152 MOVQ" "MM"2, 48(%[tmpA]) \n\t" /* diff */\
154 "paddw "MM"0, "MM"1 \n\t"\
155 "paddw "MM"0, "MM"0 \n\t"\
156 "psubw "MM"1, "MM"0 \n\t"\
157 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
158 PABS( MM"2", MM"0") /* ABS(c-e) */\
160 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
161 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
162 MOVQ" "MM"2, "MM"4 \n\t"\
163 "psubusb "MM"3, "MM"2 \n\t"\
164 "psubusb "MM"4, "MM"3 \n\t"\
165 "pmaxub "MM"3, "MM"2 \n\t"\
166 PSHUF(MM"3", MM"2") \
167 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
168 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
169 "paddw "MM"2, "MM"0 \n\t"\
170 "paddw "MM"3, "MM"0 \n\t"\
171 "psubw %[pw_1], "MM"0 \n\t" /* spatial_score */\
182 /* if(p->mode<2) ... */\
183 MOVQ" 48(%[tmpA]), "MM"6 \n\t" /* diff */\
184 "cmpl $2, %[mode] \n\t"\
186 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
187 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
188 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
189 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
190 "paddw "MM"4, "MM"2 \n\t"\
191 "paddw "MM"5, "MM"3 \n\t"\
192 "psrlw $1, "MM"2 \n\t" /* b */\
193 "psrlw $1, "MM"3 \n\t" /* f */\
194 MOVQ" (%[tmpA]), "MM"4 \n\t" /* c */\
195 MOVQ" 16(%[tmpA]), "MM"5 \n\t" /* d */\
196 MOVQ" 32(%[tmpA]), "MM"7 \n\t" /* e */\
197 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
198 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
199 MOVQ" "MM"5, "MM"0 \n\t"\
200 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
201 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
202 MOVQ" "MM"2, "MM"4 \n\t"\
203 "pminsw "MM"3, "MM"2 \n\t"\
204 "pmaxsw "MM"4, "MM"3 \n\t"\
205 "pmaxsw "MM"5, "MM"2 \n\t"\
206 "pminsw "MM"5, "MM"3 \n\t"\
207 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
208 "pminsw "MM"0, "MM"3 \n\t" /* min */\
209 "pxor "MM"4, "MM"4 \n\t"\
210 "pmaxsw "MM"3, "MM"6 \n\t"\
211 "psubw "MM"2, "MM"4 \n\t" /* -max */\
212 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
215 MOVQ" 16(%[tmpA]), "MM"2 \n\t" /* d */\
216 MOVQ" "MM"2, "MM"3 \n\t"\
217 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
218 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
219 "pmaxsw "MM"2, "MM"1 \n\t"\
220 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
221 "packuswb "MM"1, "MM"1 \n\t"\
228 [prefs]"r"((x86_reg)prefs),\
229 [mrefs]"r"((x86_reg)mrefs),\
233 :REGMM"0",REGMM"1",REGMM"2",REGMM"3",REGMM"4",REGMM"5",REGMM"6",REGMM"7"\
235 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\