Minor fix on MemSubpic.[PART 1]
[xy_vsfilter.git] / src / subpic / xy_intrinsics.h
#ifndef __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__
#define __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__

#ifdef LINUX
#include <pmmintrin.h>
#else
#include <intrin.h>
#endif

#include <WTypes.h>
//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              0
//              avg(...)
//              0
//              ...
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);
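// A scalar reference sketch of the macro above (assumption: added for
// illustration, in the spirit of the *_c test helpers below; not part of the
// original header). Each 16-bit lane ends up with the rounded average of the
// four source bytes in its low byte and zero in its high byte.
static void average_4_pix_c(BYTE* a, const BYTE* b)
{
    for (int i=0;i<8;i++)
    {
        int lo = (a[2*i]   + b[2*i]   + 1)/2; // _mm_avg_epu8 rounds upward
        int hi = (a[2*i+1] + b[2*i+1] + 1)/2;
        a[2*i]   = (lo + hi + 1)/2;
        a[2*i+1] = 0;
    }
}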
//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              avg(...)
//              avg(...)
//              ...
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
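// Scalar sketch (assumption: illustrative addition): the same four-pixel
// average as AVERAGE_4_PIX_INTRINSICS, but the result is replicated into both
// bytes of each 16-bit lane instead of leaving the high byte zero.
static void average_4_pix_2_c(BYTE* a, const BYTE* b)
{
    for (int i=0;i<8;i++)
    {
        int lo = (a[2*i]   + b[2*i]   + 1)/2;
        int hi = (a[2*i+1] + b[2*i+1] + 1)/2;
        a[2*i] = a[2*i+1] = (lo + hi + 1)/2;
    }
}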
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              0
//              avg(u8[1], u8[2], u8[3])
//              0
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_3(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_srli_epi16(m128_1, 8);\
}
static void average_4_pix_intrinsics_3_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i] = u0;
        m128i_1.m128i_u8[2*i+1] = 0;
    }
}
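// A minimal self-check sketch (assumption: added example; relies on MSVC's
// m128i_u8 union members, like the reference above). It runs the SSE2 macro
// and the scalar reference on the same input and compares the results.
static bool test_average_4_pix_intrinsics_3()
{
    __m128i v1, last1;
    for (int i=0;i<16;i++)
    {
        v1.m128i_u8[i] = (BYTE)(i*37+11);
        last1.m128i_u8[i] = 0;
    }
    last1.m128i_u8[1] = 77; // U_last
    __m128i v2 = v1, last2 = last1;
    AVERAGE_4_PIX_INTRINSICS_3(v1, last1);
    average_4_pix_intrinsics_3_c(v2, last2);
    for (int i=0;i<16;i++)
    {
        if (v1.m128i_u8[i] != v2.m128i_u8[i])
            return false;
    }
    return true;
}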
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = 0
//              avg(U_last, u8[0], u8[1])
//              0
//              avg(u8[1], u8[2], u8[3])
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_4(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
}
static void average_4_pix_intrinsics_4_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = 0;
    }
}
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              avg(U_last, u8[0], u8[1])
//              avg(u8[1], u8[2], u8[3])
//              avg(u8[1], u8[2], u8[3])
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_5(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_2 = _mm_srli_epi16(m128_1, 8);\
    m128_1 = _mm_or_si128(m128_1, m128_2);\
}
static void average_4_pix_intrinsics_5_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = u0;
    }
}
static void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        dst[0] = (dst[0] + tmp1 + 1)/2;
        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        dst[1] = (dst[1] + tmp1 + 1)/2;
    }
}
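// Usage sketch (assumption: hypothetical caller, not part of the original
// header): subsample a full U/V plane pair into one interleaved UV plane,
// consuming two source rows per destination row. w is the width in source
// samples and h the source height; pitches as in the helper above.
static void subsample_and_interlace_plane_c(BYTE* dst, int dst_pitch,
    const BYTE* u, const BYTE* v, int w, int h, int pitch)
{
    for (int y=0; y<h; y+=2, dst+=dst_pitch, u+=2*pitch, v+=2*pitch)
    {
        subsample_and_interlace_2_line_c(dst, u, v, w, pitch);
    }
}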
static __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
        u_1 = _mm_packus_epi16(u_1, u_1);
        v_1 = _mm_packus_epi16(v_1, v_1);
        u_1 = _mm_unpacklo_epi8(u_1, v_1);

        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
}
static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128);//0xff where alpha==0xff, else 0

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128 = _mm_unpackhi_epi8(alpha128, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
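// Scalar reference sketch (assumption: added for illustration). Per byte the
// SSE2 code computes dst = ((dst*alpha + r)>>8) + sub with saturation, where
// r is 0xff when alpha==0xff and 0 otherwise; (dst*0xff + 0xff)>>8 == dst,
// so a 0xff alpha (fully transparent subtitle pixel) leaves dst unchanged.
static void pix_alpha_blend_yv12_luma_c_sketch(byte* dst, const byte* alpha, const byte* sub)
{
    for (int i=0;i<16;i++)
    {
        int r = (alpha[i]==0xff) ? 0xff : 0;
        int t = ((dst[i]*alpha[i] + r)>>8) + sub[i];
        dst[i] = t>0xff ? 0xff : (byte)t; // _mm_adds_epu8 saturates
    }
}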
/***
 * output not exactly identical to pix_alpha_blend_yv12_chroma
 **/
static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
    __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

    __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );

    AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones, alpha128_1);

    dst128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i dst128_2 = _mm_and_si128(dst128, ones);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, dst128_2);

    dst128 = _mm_srli_epi16(dst128, 8);

    AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);

    dst128 = _mm_adds_epi16(dst128, sub128_1);
    dst128 = _mm_packus_epi16(dst128, dst128);

    _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
}
static __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    //important! pointers must be 16-byte aligned for _mm_load_si128
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    __m128i alpha_ff;
#ifdef _DEBUG
    alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2;
#ifdef _DEBUG
    ones2 = _mm_setzero_si128();//disable warning C4700
#endif
    ones2 = _mm_cmpeq_epi32(ones2,ones2);

    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}
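// Scalar sketch of the _mm_mulhi_epu16 trick above (assumption: added for
// illustration). P010 samples are 16-bit, so dst*alpha no longer fits a
// 16x16->16 _mm_mullo_epi16; mulhi keeps the high 16 bits of the product
// instead. With multiplier (alpha<<8)|(alpha==0xff ? 0xff : 0), a 0xff alpha
// gives (dst*0xffff)>>16 == dst-1 for dst>0, and the conditional +1 (the
// "ones" mask above) restores dst exactly.
static WORD mix_one_y_p010_c_sketch(WORD dst, BYTE alpha, BYTE src)
{
    unsigned mul = ((unsigned)alpha<<8) | (alpha==0xff ? 0xffu : 0u);
    unsigned t = ((unsigned)dst*mul)>>16;
    if (dst!=0 && alpha==0xff)
        t += 1;
    t += ((unsigned)src<<8);
    return t>0xffff ? (WORD)0xffff : (WORD)t; // _mm_adds_epu16 saturates
}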
//for test only
static void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<16;i++)
    {
        if (src_alpha[i]!=0xff)
        {
            dst_word[i] = ((dst_word[i]*src_alpha[i])>>8) + (src[i]<<8);
        }
    }
}
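// Note (added commentary): this reference skips alpha==0xff entirely, while
// the SSE2 path still adds src<<8 there, and the 16-bit sum here is not
// saturated; outputs can therefore differ where alpha==0xff with nonzero src,
// or where the sum overflows 16 bits.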
static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    unsigned int ia = (alpha[0]+alpha[1]+
                       alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
    if(ia!=0xff)
    {
        *dst = (((*dst)*ia)>>8) + ((src[0] + src[1] +
                                    src[src_pitch] + src[1+src_pitch])>>2);
    }
}
static __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    //important! pointers must be 16-byte aligned for _mm_load_si128
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );

    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);

    __m128i alpha_ff;
#ifdef _DEBUG
    alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2;
#ifdef _DEBUG
    ones2 = _mm_setzero_si128();//disable warning C4700
#endif
    ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}
static void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xFF )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}
static __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128_1);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
//for test only
static void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xFF )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}
/******
 * hleft_vmid:
 * chroma placement (x = Y, o = U/V):
 * x x x x ...
 * o   o   ...
 * x x x x ...
 * o   o   ...
 * x x x x ...
 ******/
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end = u + w;
    BYTE last_u = (u[last_src_id]+u[last_src_id+pitch]+1)/2;
    BYTE last_v = (v[last_src_id]+v[last_src_id+pitch]+1)/2;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        last_u = (tmp1+last_u+1)/2;
        dst[0] = (dst[0] + last_u + 1)/2;
        last_u = tmp1;

        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        last_v = (tmp1+last_v+1)/2;
        dst[1] = (last_v + dst[1] + 1)/2;
        last_v = tmp1;
    }
}
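// Usage sketch (assumption: hypothetical caller, added for illustration):
// downsample a U/V plane pair into one interleaved, chroma-siting-aware UV
// plane, consuming two source rows per destination row.
static void hleft_vmid_subsample_and_interlace_plane_c(BYTE* dst, int dst_pitch,
    const BYTE* u, const BYTE* v, int w, int h, int pitch)
{
    for (int y=0; y<h; y+=2, dst+=dst_pitch, u+=2*pitch, v+=2*pitch)
    {
        hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w, pitch);
    }
}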
// @w : w must be a multiple of 16!
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end_mod16 = u + (w&~15);

    //(a+b+1)<<7 places avg(a,b) in byte 1 of the register, the U_last
    //position that AVERAGE_4_PIX_INTRINSICS_3/_4 expect
    __m128i u_last = _mm_cvtsi32_si128( (u[last_src_id]+u[pitch+last_src_id]+1)<<7 );
    __m128i v_last = _mm_cvtsi32_si128( (v[last_src_id]+v[pitch+last_src_id]+1)<<7 );
    for (;u<end_mod16;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        u_1 = _mm_avg_epu8(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS_3(u_1, u_last);
        v_1 = _mm_avg_epu8(v_1, v_2);
        AVERAGE_4_PIX_INTRINSICS_4(v_1, v_last);
        u_1 = _mm_or_si128(u_1, v_1);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
    //The following would handle the w%16 tail, but fails if dst==u because
    //the loop above has already overwritten the source in place:
    //hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w&15, pitch, w>15?-1:0);
}
static __forceinline void hleft_vmid_mix_uv_yv12_c(byte* dst, int w, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[last_src_id+src_pitch]+1)/2;
    int last_sub = (src[last_src_id]+src[last_src_id+src_pitch]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src += 2, am += 2, dst++)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp1 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp1 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp1;

        if(ia!=0xff)
        {
            tmp1 = (src[0]+src[0+src_pitch]+1)/2;
            int tmp2 = (src[1]+src[1+src_pitch]+1)/2;
            last_sub = (last_sub+tmp2+1)/2;
            tmp1 = (tmp1+last_sub+1)/2;
            last_sub = tmp2;

            *dst = (((*dst)*ia)>>8) + tmp1;
        }
        else
        {
            last_sub = (src[1]+src[1+src_pitch]+1)/2;
        }
    }
}
static __forceinline void hleft_vmid_mix_uv_yv12_sse2(byte* dst, int w, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    __m128i last_src = _mm_cvtsi32_si128( (src[last_src_id]+src[src_pitch+last_src_id]+1)<<7 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + (w&~15);
    for(; src < end_mod16; src += 16, am += 16, dst+=8)
    {
        __m128i zero = _mm_setzero_si128();

        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        alpha128_1 = _mm_avg_epu8(alpha128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(alpha128_1, last_alpha);

        __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

        __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
        sub128_1 = _mm_avg_epu8(sub128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(sub128_1, last_src);

        __m128i ones;
#ifdef _DEBUG
        ones = _mm_setzero_si128();//disable warning C4700
#endif
        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones, alpha128_1);

        dst128 = _mm_unpacklo_epi8(dst128, zero);
        __m128i dst128_2 = _mm_and_si128(dst128, ones);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, dst128_2);

        dst128 = _mm_srli_epi16(dst128, 8);

        dst128 = _mm_adds_epi16(dst128, sub128_1);
        dst128 = _mm_packus_epi16(dst128, dst128);

        _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
    }
    hleft_vmid_mix_uv_yv12_c(dst, w&15, src, am, src_pitch, w>15?-1:0);
}
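// Note (added commentary): the scalar call above handles the w&15 tail.
// last_src_id = -1 seeds last_alpha/last_sub from the pixel pair just left of
// the tail, i.e. data the SSE2 loop already consumed, so the "hleft"
// horizontal average is seamless across the seam; when w<16 there is nothing
// to the left and 0 replicates the first pair instead.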
static __forceinline void hleft_vmid_mix_uv_p010_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for(; src < end; src+=2, am+=2, dst_word+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp2;

        if( ia!=0xFF )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
#ifdef XY_UNIT_TEST
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//branchless if(tmp>0xffff) tmp = 0xffff;
#endif
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
#ifdef XY_UNIT_TEST
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//branchless if(tmp>0xffff) tmp = 0xffff;
#endif
            dst_word[1] = tmp;
        }
    }
}
static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + (w&~15);
    for(; src < end_mod16; src+=16, am+=16, dst+=32)
    {
        //important! pointers must be 16-byte aligned for _mm_load_si128
        __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );

        __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

        alpha = _mm_avg_epu8(alpha, alpha2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha, last_alpha);

        __m128i alpha_ff;
#ifdef _DEBUG
        alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
        alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

        alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

        __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
        //so we do it another way
        //first, (alpha<<8)+0xff
        __m128i ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);

        __m128i ones2;
#ifdef _DEBUG
        ones2 = _mm_setzero_si128();//disable warning C4700
#endif
        ones2 = _mm_cmpeq_epi32(ones2,ones2);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

        lo = _mm_setzero_si128();
        lo = _mm_unpacklo_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

        dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst+16) );

        lo = _mm_unpackhi_epi8(alpha_ff, alpha);

        ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);

        lo = _mm_setzero_si128();
        lo = _mm_unpackhi_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y );
    }
    hleft_vmid_mix_uv_p010_c(dst, w&15, src, am, src_pitch, w>15?-1:0);
}
static __forceinline void hleft_vmid_mix_uv_nv12_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src+=2, am+=2, dst+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp2;
        if( ia!=0xFF )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}
static __forceinline void hleft_vmid_mix_uv_nv12_sse2(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + (w&~15);
    for(; src < end_mod16; src+=16, am+=16, dst+=16)
    {
        __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

        alpha128_1 = _mm_avg_epu8(alpha128_1, alpha128_2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha128_1, last_alpha);

        __m128i zero = _mm_setzero_si128();

        __m128i ones;
#ifdef _DEBUG
        ones = _mm_setzero_si128();//disable warning C4700
#endif
        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones,alpha128_1);

        __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
        alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

        __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

        dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
        dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
        dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

        dst128 = _mm_unpackhi_epi8(dst128, zero);
        alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

        ones2 = _mm_unpackhi_epi8(ones, zero);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, ones2);
        dst128 = _mm_srli_epi16(dst128, 8);
        dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

        dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
    }
    hleft_vmid_mix_uv_nv12_c(dst, w&15, src, am, src_pitch, w>15?-1:0);
}
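// Usage sketch (assumption: hypothetical caller, not part of this header):
// blend a sub-picture onto the interleaved UV plane of an NV12 frame. The
// alpha plane is sampled at luma resolution (the callee averages am[0],
// am[1], am[am_pitch], am[1+am_pitch]), so it advances two rows per UV row;
// sub is assumed already at chroma resolution and interleaved like dst, and
// all buffers 16-aligned as the SSE2 loop requires. w and h are in luma
// pixels.
static void mix_uv_plane_nv12_sketch(BYTE* dst_uv, int dst_pitch,
    const BYTE* sub, int sub_pitch, const BYTE* am, int am_pitch, int w, int h)
{
    for (int y=0; y<h; y+=2, dst_uv+=dst_pitch, sub+=sub_pitch, am+=2*am_pitch)
    {
        hleft_vmid_mix_uv_nv12_sse2(dst_uv, w, sub, am, am_pitch);
    }
}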
#endif // __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__