2 * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 /* For some reason clang doesn't like that %%rip macro */
/*
 * MANGLE(a) produces the inline-asm operand text used to reference the
 * constant `a` from the asm templates below:
 *   - x86_64, non-Apple: the stringified symbol followed by "(%%rip)";
 *   - otherwise: a named asm operand reference of the form "%[a]".
 * MANGLEVARIABLES supplies the matching entries for the asm operand list
 * ("[mode] "g"(mode)," in the first variant, "[pw_1] "m"(pw_1)," and more
 * in the second).
 * NOTE(review): the directive separating the two variants and the rest of
 * the second MANGLEVARIABLES continuation are not visible here.
 */
23 #if defined(__x86_64__) && !defined(__APPLE__)
24 #define MANGLE(a) "" #a "(%%rip)"
25 #define MANGLEVARIABLES [mode] "g"(mode),
27 #define MANGLE(a) "" "%["#a"]"
28 #define MANGLEVARIABLES [pw_1] "m"(pw_1),\
/*
 * Per-instruction-set building blocks for the asm templates.
 *
 * First group (under COMPILE_TEMPLATE_SSE):
 *   MOVQU       - "movdqu" mnemonic used for unaligned loads.
 *   LOAD(mem,dst) - moves `mem` into `dst` with MOV, then interleaves the low
 *                 bytes of `dst` with register 7 (punpcklbw).
 *   PSRL1/PSRL2 - byte-wise right shift of `reg` by 1 / 2 bytes (psrldq).
 *   PSHUF(src,dst) - copies `dst` into `src` (AT&T operand order), then
 *                 shifts `src` right by 2 bytes; i.e. the FIRST argument
 *                 receives the result.
 * NOTE(review): MOV, MOVQ and MM are used here but defined on lines not
 * visible in this view.
 */
33 #ifdef COMPILE_TEMPLATE_SSE
38 #define MOVQU "movdqu"
40 #define LOAD(mem,dst) \
41 MOV" "mem", "dst" \n\t"\
42 "punpcklbw "MM"7, "dst" \n\t"
43 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
44 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
45 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
46 "psrldq $2, "src" \n\t"
/*
 * Second group: same macro names implemented with 64-bit shift/shuffle
 * instructions. psrlq shifts by bits, so $8 / $16 correspond to 1 / 2 bytes.
 * PSHUF uses "pshufw $9" (immediate 0b00001001: result words are source
 * words 1, 2, 0, 0), again writing the result into the FIRST argument.
 * NOTE(review): presumably this is the non-SSE (#else) branch; the directive
 * itself is not visible here.
 */
54 #define LOAD(mem,dst) \
55 MOV" "mem", "dst" \n\t"\
56 "punpcklbw "MM"7, "dst" \n\t"
57 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
58 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
59 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
/*
 * PABS(tmp,dst): replace each signed 16-bit word of `dst` with its absolute
 * value.
 *   - With COMPILE_TEMPLATE_SSSE3 a single "pabsw" is used and `tmp` is
 *     left untouched.
 *   - The other variant computes tmp = 0 - dst (pxor + psubw) and then
 *     dst = max(dst, tmp) with pmaxsw, so `tmp` is clobbered.
 * NOTE(review): the directive separating the two variants is not visible
 * here.
 */
62 #ifdef COMPILE_TEMPLATE_SSSE3
63 #define PABS(tmp,dst) \
64 "pabsw "dst", "dst" \n\t"
66 #define PABS(tmp,dst) \
67 "pxor "tmp", "tmp" \n\t"\
68 "psubw "dst", "tmp" \n\t"\
69 "pmaxsw "tmp", "dst" \n\t"
/*
 * CHECK(pj,mj): evaluate one candidate edge direction.
 *
 * Loads (unaligned) the row at byte offset `pj` from cur+mrefs into register
 * 2 and the row at byte offset `mj` from cur+prefs into register 3, then:
 *   - register 5: per-byte average of the two rows. pavgb rounds up, so the
 *     low bit of (a XOR b), masked with pb_1, is subtracted to get the
 *     truncating average; the result is then widened to words.
 *   - register 2: "score" = sum of the absolute byte differences of the two
 *     rows at three neighbouring positions (the loaded data and the same
 *     data shifted by one and by two bytes), widened to words.
 * The absolute difference is built from two saturating subtractions
 * combined with pmaxub. Registers 3 and 4 are used as scratch; register 7
 * is expected to hold zero for the punpcklbw widening.
 */
73 #define CHECK(pj,mj) \
74 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1+j] */\
75 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1-j] */\
76 MOVQ" "MM"2, "MM"4 \n\t"\
77 MOVQ" "MM"2, "MM"5 \n\t"\
78 "pxor "MM"3, "MM"4 \n\t"\
79 "pavgb "MM"3, "MM"5 \n\t"\
80 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
81 "psubusb "MM"4, "MM"5 \n\t"\
83 "punpcklbw "MM"7, "MM"5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
84 MOVQ" "MM"2, "MM"4 \n\t"\
85 "psubusb "MM"3, "MM"2 \n\t"\
86 "psubusb "MM"4, "MM"3 \n\t"\
87 "pmaxub "MM"3, "MM"2 \n\t"\
88 MOVQ" "MM"2, "MM"3 \n\t"\
89 MOVQ" "MM"2, "MM"4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
90 PSRL1(MM"3") /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
91 PSRL2(MM"4") /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
92 "punpcklbw "MM"7, "MM"2 \n\t"\
93 "punpcklbw "MM"7, "MM"3 \n\t"\
94 "punpcklbw "MM"7, "MM"4 \n\t"\
95 "paddw "MM"3, "MM"2 \n\t"\
96 "paddw "MM"4, "MM"2 \n\t" /* score */
/*
 * Selection step: per word, where the current spatial_score (register 0) is
 * greater than the new score (register 2), take the new score and replace
 * spatial_pred (register 1) with the averaged candidate (register 5).
 * The comparison mask is also saved in register 6.
 * NOTE(review): the previous macro line has no continuation, so these lines
 * appear to belong to a separate macro whose #define line is not visible
 * in this view.
 */
99 MOVQ" "MM"0, "MM"3 \n\t"\
100 "pcmpgtw "MM"2, "MM"3 \n\t" /* if(score < spatial_score) */\
101 "pminsw "MM"2, "MM"0 \n\t" /* spatial_score= score; */\
102 MOVQ" "MM"3, "MM"6 \n\t"\
103 "pand "MM"3, "MM"5 \n\t"\
104 "pandn "MM"1, "MM"3 \n\t"\
105 "por "MM"5, "MM"3 \n\t"\
106 MOVQ" "MM"3, "MM"1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
/*
 * CHECK2: second-stage selection that depends on the outcome of the
 * previous one.
 *
 * Register 6 is expected to hold the comparison mask saved by the preceding
 * selection step (all-ones where it succeeded, zero where it failed).
 * Adding pw_1 and shifting left by 14 turns that into 0 / 0x4000 per word
 * (assuming pw_1 holds 1 in every word - TODO confirm against its
 * definition), which is added with signed saturation to the score in
 * register 2. The score is thereby penalised wherever the previous
 * direction did not win. The remaining instructions repeat the
 * compare / min / blend into registers 0 (spatial_score) and 1
 * (spatial_pred), without saving a new mask.
 */
108 #define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
109 hurts both quality and speed, but matches the C version. */\
110 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
111 "psllw $14, "MM"6 \n\t"\
112 "paddsw "MM"6, "MM"2 \n\t"\
113 MOVQ" "MM"0, "MM"3 \n\t"\
114 "pcmpgtw "MM"2, "MM"3 \n\t"\
115 "pminsw "MM"2, "MM"0 \n\t"\
116 "pand "MM"3, "MM"5 \n\t"\
117 "pandn "MM"1, "MM"3 \n\t"\
118 "por "MM"5, "MM"3 \n\t"\
119 MOVQ" "MM"3, "MM"1 \n\t"
121 #if defined(__MINGW32__) && defined(_WIN32) && !defined(_WIN64)
122 __attribute__((__force_align_arg_pointer__
))
124 VLC_TARGET
static void RENAME(yadif_filter_line
)(uint8_t *dst
,
125 uint8_t *prev
, uint8_t *cur
, uint8_t *next
,
126 int w
, int prefs
, int mrefs
, int parity
, int mode
)
129 uint8_t *tmp
= (uint8_t*)(((uintptr_t)(tmpU
+15)) & ~15);
133 for(x=0; x<w; x+=STEP){\
135 "pxor "MM"7, "MM"7 \n\t"\
136 LOAD("(%[cur],%[mrefs])", MM"0") /* c = cur[x-refs] */\
137 LOAD("(%[cur],%[prefs])", MM"1") /* e = cur[x+refs] */\
138 LOAD("(%["prev2"])", MM"2") /* prev2[x] */\
139 LOAD("(%["next2"])", MM"3") /* next2[x] */\
140 MOVQ" "MM"3, "MM"4 \n\t"\
141 "paddw "MM"2, "MM"3 \n\t"\
142 "psraw $1, "MM"3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
143 MOVQ" "MM"0, (%[tmp]) \n\t" /* c */\
144 MOVQ" "MM"3, 16(%[tmp]) \n\t" /* d */\
145 MOVQ" "MM"1, 32(%[tmp]) \n\t" /* e */\
146 "psubw "MM"4, "MM"2 \n\t"\
147 PABS( MM"4", MM"2") /* temporal_diff0 */\
148 LOAD("(%[prev],%[mrefs])", MM"3") /* prev[x-refs] */\
149 LOAD("(%[prev],%[prefs])", MM"4") /* prev[x+refs] */\
150 "psubw "MM"0, "MM"3 \n\t"\
151 "psubw "MM"1, "MM"4 \n\t"\
154 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff1 */\
155 "psrlw $1, "MM"2 \n\t"\
156 "psrlw $1, "MM"3 \n\t"\
157 "pmaxsw "MM"3, "MM"2 \n\t"\
158 LOAD("(%[next],%[mrefs])", MM"3") /* next[x-refs] */\
159 LOAD("(%[next],%[prefs])", MM"4") /* next[x+refs] */\
160 "psubw "MM"0, "MM"3 \n\t"\
161 "psubw "MM"1, "MM"4 \n\t"\
164 "paddw "MM"4, "MM"3 \n\t" /* temporal_diff2 */\
165 "psrlw $1, "MM"3 \n\t"\
166 "pmaxsw "MM"3, "MM"2 \n\t"\
167 MOVQ" "MM"2, 48(%[tmp]) \n\t" /* diff */\
169 "paddw "MM"0, "MM"1 \n\t"\
170 "paddw "MM"0, "MM"0 \n\t"\
171 "psubw "MM"1, "MM"0 \n\t"\
172 "psrlw $1, "MM"1 \n\t" /* spatial_pred */\
173 PABS( MM"2", MM"0") /* ABS(c-e) */\
175 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" /* cur[x-refs-1] */\
176 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" /* cur[x+refs-1] */\
177 MOVQ" "MM"2, "MM"4 \n\t"\
178 "psubusb "MM"3, "MM"2 \n\t"\
179 "psubusb "MM"4, "MM"3 \n\t"\
180 "pmaxub "MM"3, "MM"2 \n\t"\
181 PSHUF(MM"3", MM"2") \
182 "punpcklbw "MM"7, "MM"2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
183 "punpcklbw "MM"7, "MM"3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
184 "paddw "MM"2, "MM"0 \n\t"\
185 "paddw "MM"3, "MM"0 \n\t"\
186 "psubw "MANGLE(pw_1)", "MM"0 \n\t" /* spatial_score */\
197 /* if(p->mode<2) ... */\
198 MOVQ" 48(%[tmp]), "MM"6 \n\t" /* diff */\
199 "cmpl $2, %[mode] \n\t"\
201 LOAD("(%["prev2"],%[mrefs],2)", MM"2") /* prev2[x-2*refs] */\
202 LOAD("(%["next2"],%[mrefs],2)", MM"4") /* next2[x-2*refs] */\
203 LOAD("(%["prev2"],%[prefs],2)", MM"3") /* prev2[x+2*refs] */\
204 LOAD("(%["next2"],%[prefs],2)", MM"5") /* next2[x+2*refs] */\
205 "paddw "MM"4, "MM"2 \n\t"\
206 "paddw "MM"5, "MM"3 \n\t"\
207 "psrlw $1, "MM"2 \n\t" /* b */\
208 "psrlw $1, "MM"3 \n\t" /* f */\
209 MOVQ" (%[tmp]), "MM"4 \n\t" /* c */\
210 MOVQ" 16(%[tmp]), "MM"5 \n\t" /* d */\
211 MOVQ" 32(%[tmp]), "MM"7 \n\t" /* e */\
212 "psubw "MM"4, "MM"2 \n\t" /* b-c */\
213 "psubw "MM"7, "MM"3 \n\t" /* f-e */\
214 MOVQ" "MM"5, "MM"0 \n\t"\
215 "psubw "MM"4, "MM"5 \n\t" /* d-c */\
216 "psubw "MM"7, "MM"0 \n\t" /* d-e */\
217 MOVQ" "MM"2, "MM"4 \n\t"\
218 "pminsw "MM"3, "MM"2 \n\t"\
219 "pmaxsw "MM"4, "MM"3 \n\t"\
220 "pmaxsw "MM"5, "MM"2 \n\t"\
221 "pminsw "MM"5, "MM"3 \n\t"\
222 "pmaxsw "MM"0, "MM"2 \n\t" /* max */\
223 "pminsw "MM"0, "MM"3 \n\t" /* min */\
224 "pxor "MM"4, "MM"4 \n\t"\
225 "pmaxsw "MM"3, "MM"6 \n\t"\
226 "psubw "MM"2, "MM"4 \n\t" /* -max */\
227 "pmaxsw "MM"4, "MM"6 \n\t" /* diff= MAX3(diff, min, -max); */\
230 MOVQ" 16(%[tmp]), "MM"2 \n\t" /* d */\
231 MOVQ" "MM"2, "MM"3 \n\t"\
232 "psubw "MM"6, "MM"2 \n\t" /* d-diff */\
233 "paddw "MM"6, "MM"3 \n\t" /* d+diff */\
234 "pmaxsw "MM"2, "MM"1 \n\t"\
235 "pminsw "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
236 "packuswb "MM"1, "MM"1 \n\t"\
241 [prefs]"r"((x86_reg)prefs),\
242 [mrefs]"r"((x86_reg)mrefs),\
246 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\