1 #ifndef __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__
2 #define __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__
//in : m128_1, m128_2 = 16 u8 pixels each (two rows of the same 16 columns)
//out: each 16-bit lane i of m128_1 holds (in its low byte, high byte 0)
//       avg( m128_1.u8[2i], m128_1.u8[2i+1], m128_2.u8[2i], m128_2.u8[2i+1] )
//     i.e. a rounding 2x2 box average computed with pavgb.
//Both arguments are clobbered. Wrapped in do{}while(0) so the macro is a
//single statement and safe after an un-braced if/else.
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    do { \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_2 = _mm_slli_epi16(m128_1, 8); \
        m128_1 = _mm_srli_epi16(m128_1, 8); \
        m128_2 = _mm_srli_epi16(m128_2, 8); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    } while(0)
//in : m128_1, m128_2 = 16 u8 pixels each
//out: both bytes of each 16-bit lane i of m128_1 hold
//       avg( m128_1.u8[2i], m128_1.u8[2i+1], m128_2.u8[2i], m128_2.u8[2i+1] )
//     (same 2x2 average as AVERAGE_4_PIX_INTRINSICS but duplicated into the
//      high byte instead of leaving it zero). Both arguments are clobbered.
//The local m128_3 needs its own scope, so the body is a do{}while(0) block
//(the brace lines were lost in the pasted source).
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
    do { \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_2 = _mm_slli_epi16(m128_1, 8); \
        __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
        m128_2 = _mm_or_si128(m128_2, m128_3); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    } while(0)
//in : m128_1 = 16 u8 pixels a0..a15,
//     m128_last = carry from the previous chunk: byte 1 holds the running
//     left-neighbor average (byte 0 only carries a rounding bit), rest 0
//out: 16-bit lane i of m128_1 = avg( a[2i], avg(a[2i-1], a[2i+1]) )
//     in the low byte, high byte 0 (a[-1] comes from m128_last);
//     m128_last = a14 a15 0 0 ... 0, ready for the next 16 pixels
//Local m128_2 needs a scope of its own, hence do{}while(0) (the brace
//lines were lost in the pasted source).
#define AVERAGE_4_PIX_INTRINSICS_3(m128_1, m128_last) \
    do { \
        __m128i m128_2 = _mm_slli_si128(m128_1, 2); \
        m128_2 = _mm_or_si128(m128_2, m128_last); \
        m128_2 = _mm_avg_epu8(m128_2, m128_1); \
        m128_last = _mm_srli_si128(m128_1, 14); \
        m128_1 = _mm_slli_epi16(m128_1, 8); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_1 = _mm_srli_epi16(m128_1, 8); \
    } while(0)
56 static void average_4_pix_intrinsics_3_c(__m128i
& m128i_1
, __m128i
& m128i_last
)
58 int last
=m128i_last
.m128i_u8
[1];
59 m128i_last
.m128i_u8
[0] = m128i_1
.m128i_u8
[14];
60 m128i_last
.m128i_u8
[1] = m128i_1
.m128i_u8
[15];
61 for (int i
=2;i
<16;i
++)
63 m128i_last
.m128i_u8
[i
] = 0;
67 int u0
= m128i_1
.m128i_u8
[2*i
];
68 int u1
= m128i_1
.m128i_u8
[2*i
+1];
69 last
= (last
+ u1
+ 1)/2;
70 u0
= (last
+ u0
+ 1)/2;
72 m128i_1
.m128i_u8
[2*i
] = u0
;
73 m128i_1
.m128i_u8
[2*i
+1] = 0;
//in : m128_1 = 16 u8 pixels a0..a15,
//     m128_last = carry from the previous chunk (avg in byte 1), rest 0
//out: 16-bit lane i of m128_1 = avg( a[2i], avg(a[2i-1], a[2i+1]) )
//     in the HIGH byte, low byte 0 — the mirror of AVERAGE_4_PIX_INTRINSICS_3,
//     so a U result (low bytes) and a V result (high bytes) can be OR-combined
//     into interleaved UV; m128_last = a14 a15 0 ... 0 afterwards.
//do{}while(0) restores the scope for m128_2 lost in the pasted source.
#define AVERAGE_4_PIX_INTRINSICS_4(m128_1, m128_last) \
    do { \
        __m128i m128_2 = _mm_slli_si128(m128_1, 2); \
        m128_2 = _mm_or_si128(m128_2, m128_last); \
        m128_2 = _mm_avg_epu8(m128_2, m128_1); \
        m128_last = _mm_srli_si128(m128_1, 14); \
        m128_2 = _mm_srli_epi16(m128_2, 8); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_1 = _mm_slli_epi16(m128_1, 8); \
    } while(0)
95 static void average_4_pix_intrinsics_4_c(__m128i
& m128i_1
, __m128i
& m128i_last
)
97 int last
=m128i_last
.m128i_u8
[1];
98 m128i_last
.m128i_u8
[0] = m128i_1
.m128i_u8
[14];
99 m128i_last
.m128i_u8
[1] = m128i_1
.m128i_u8
[15];
100 for (int i
=2;i
<16;i
++)
102 m128i_last
.m128i_u8
[i
] = 0;
104 for (int i
=0;i
<8;i
++)
106 int u0
= m128i_1
.m128i_u8
[2*i
];
107 int u1
= m128i_1
.m128i_u8
[2*i
+1];
108 last
= (last
+ u1
+ 1)/2;
109 u0
= (last
+ u0
+ 1)/2;
111 m128i_1
.m128i_u8
[2*i
+1] = u0
;
112 m128i_1
.m128i_u8
[2*i
] = 0;
//in : m128_1 = 16 u8 pixels a0..a15,
//     m128_last = carry from the previous chunk (avg in byte 1), rest 0
//out: BOTH bytes of 16-bit lane i of m128_1 = avg( a[2i], avg(a[2i-1], a[2i+1]) )
//     — like _3/_4 but with the result duplicated, which suits interleaved
//     NV12/P010 alpha where each UV pair shares one alpha value;
//     m128_last = a14 a15 0 ... 0 afterwards.
//do{}while(0) restores the scope for m128_2 lost in the pasted source.
#define AVERAGE_4_PIX_INTRINSICS_5(m128_1, m128_last) \
    do { \
        __m128i m128_2 = _mm_slli_si128(m128_1, 2); \
        m128_2 = _mm_or_si128(m128_2, m128_last); \
        m128_2 = _mm_avg_epu8(m128_2, m128_1); \
        m128_last = _mm_srli_si128(m128_1, 14); \
        m128_2 = _mm_srli_epi16(m128_2, 8); \
        m128_1 = _mm_avg_epu8(m128_1, m128_2); \
        m128_1 = _mm_slli_epi16(m128_1, 8); \
        m128_2 = _mm_srli_epi16(m128_1, 8); \
        m128_1 = _mm_or_si128(m128_1, m128_2); \
    } while(0)
135 static void average_4_pix_intrinsics_5_c(__m128i
& m128i_1
, __m128i
& m128i_last
)
137 int last
=m128i_last
.m128i_u8
[1];
138 m128i_last
.m128i_u8
[0] = m128i_1
.m128i_u8
[14];
139 m128i_last
.m128i_u8
[1] = m128i_1
.m128i_u8
[15];
140 for (int i
=2;i
<16;i
++)
142 m128i_last
.m128i_u8
[i
] = 0;
144 for (int i
=0;i
<8;i
++)
146 int u0
= m128i_1
.m128i_u8
[2*i
];
147 int u1
= m128i_1
.m128i_u8
[2*i
+1];
148 last
= (last
+ u1
+ 1)/2;
149 u0
= (last
+ u0
+ 1)/2;
151 m128i_1
.m128i_u8
[2*i
+1] = u0
;
152 m128i_1
.m128i_u8
[2*i
] = u0
;
156 static void subsample_and_interlace_2_line_c(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
)
158 const BYTE
* end
= u
+ w
;
159 for (;u
<end
;dst
+=2,u
+=2,v
+=2)
161 dst
[0] = (u
[0] + u
[0+pitch
] + 1)/2;
162 int tmp1
= (u
[1] + u
[1+pitch
] + 1)/2;
163 dst
[0] = (dst
[0] + tmp1
+ 1)/2;
164 dst
[1] = (v
[0] + v
[0+pitch
] + 1)/2;
165 tmp1
= (v
[1] + v
[1+pitch
] + 1)/2;
166 dst
[1] = (dst
[1] + tmp1
+ 1)/2;
170 static __forceinline
void subsample_and_interlace_2_line_sse2(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
)
172 const BYTE
* end
= u
+ w
;
173 for (;u
<end
;dst
+=16,u
+=16,v
+=16)
175 __m128i u_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
) );
176 __m128i u_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
+pitch
) );
177 __m128i v_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
) );
178 __m128i v_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
+pitch
) );
179 AVERAGE_4_PIX_INTRINSICS(u_1
, u_2
);
180 AVERAGE_4_PIX_INTRINSICS(v_1
, v_2
);
181 u_1
= _mm_packus_epi16(u_1
, u_1
);
182 v_1
= _mm_packus_epi16(v_1
, v_1
);
183 u_1
= _mm_unpacklo_epi8(u_1
, v_1
);
185 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), u_1
);
189 static __forceinline
void pix_alpha_blend_yv12_luma_sse2(byte
* dst
, const byte
* alpha
, const byte
* sub
)
191 __m128i dst128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
192 __m128i alpha128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
) );
193 __m128i sub128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(sub
) );
194 __m128i zero
= _mm_setzero_si128();
198 ones
= _mm_setzero_si128();//disable warning C4700
200 ones
= _mm_cmpeq_epi32(ones
,ones
);
201 ones
= _mm_cmpeq_epi8(ones
,alpha128
);
203 __m128i dst_lo128
= _mm_unpacklo_epi8(dst128
, zero
);
204 __m128i alpha_lo128
= _mm_unpacklo_epi8(alpha128
, zero
);
206 __m128i ones2
= _mm_unpacklo_epi8(ones
, zero
);
208 dst_lo128
= _mm_mullo_epi16(dst_lo128
, alpha_lo128
);
209 dst_lo128
= _mm_adds_epu16(dst_lo128
, ones2
);
210 dst_lo128
= _mm_srli_epi16(dst_lo128
, 8);
212 dst128
= _mm_unpackhi_epi8(dst128
, zero
);
213 alpha128
= _mm_unpackhi_epi8(alpha128
, zero
);
215 ones2
= _mm_unpackhi_epi8(ones
, zero
);
217 dst128
= _mm_mullo_epi16(dst128
, alpha128
);
218 dst128
= _mm_adds_epu16(dst128
, ones2
);
219 dst128
= _mm_srli_epi16(dst128
, 8);
220 dst_lo128
= _mm_packus_epi16(dst_lo128
, dst128
);
222 dst_lo128
= _mm_adds_epu8(dst_lo128
, sub128
);
223 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_lo128
);
227 * output not exactly identical to pix_alpha_blend_yv12_chroma
229 static __forceinline
void pix_alpha_blend_yv12_chroma_sse2(byte
* dst
, const byte
* src
, const byte
* alpha
, int src_pitch
)
231 __m128i zero
= _mm_setzero_si128();
232 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
) );
233 __m128i alpha128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(alpha
+src_pitch
) );
234 __m128i dst128
= _mm_loadl_epi64( reinterpret_cast<const __m128i
*>(dst
) );
236 __m128i sub128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
237 __m128i sub128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
+src_pitch
) );
239 AVERAGE_4_PIX_INTRINSICS(alpha128_1
, alpha128_2
);
243 ones
= _mm_setzero_si128();//disable warning C4700
245 ones
= _mm_cmpeq_epi32(ones
,ones
);
246 ones
= _mm_cmpeq_epi8(ones
, alpha128_1
);
248 dst128
= _mm_unpacklo_epi8(dst128
, zero
);
249 __m128i dst128_2
= _mm_and_si128(dst128
, ones
);
251 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
252 dst128
= _mm_adds_epu16(dst128
, dst128_2
);
254 dst128
= _mm_srli_epi16(dst128
, 8);
256 AVERAGE_4_PIX_INTRINSICS(sub128_1
, sub128_2
);
258 dst128
= _mm_adds_epi16(dst128
, sub128_1
);
259 dst128
= _mm_packus_epi16(dst128
, dst128
);
261 _mm_storel_epi64( reinterpret_cast<__m128i
*>(dst
), dst128
);
264 static __forceinline
void mix_16_y_p010_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
)
267 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
268 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
269 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
273 alpha_ff
= _mm_setzero_si128();//disable warning C4700
275 alpha_ff
= _mm_cmpeq_epi32(alpha_ff
,alpha_ff
);
277 alpha_ff
= _mm_cmpeq_epi8(alpha_ff
, alpha
);
279 __m128i lo
= _mm_unpacklo_epi8(alpha_ff
, alpha
);//(alpha<<8)+0x100 will overflow
280 //so we do it another way
281 //first, (alpha<<8)+0xff
282 __m128i ones
= _mm_setzero_si128();
283 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
287 ones2
= _mm_setzero_si128();//disable warning C4700
289 ones2
= _mm_cmpeq_epi32(ones2
,ones2
);
291 ones
= _mm_xor_si128(ones
, ones2
);
292 ones
= _mm_srli_epi16(ones
, 15);
293 ones
= _mm_and_si128(ones
, lo
);
295 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
296 dst_y
= _mm_adds_epu16(dst_y
, ones
);//then add one if necessary
298 lo
= _mm_setzero_si128();
299 lo
= _mm_unpacklo_epi8(lo
, src_y
);
300 dst_y
= _mm_adds_epu16(dst_y
, lo
);
301 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
304 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
306 lo
= _mm_unpackhi_epi8(alpha_ff
, alpha
);
308 ones
= _mm_setzero_si128();
309 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
310 ones
= _mm_xor_si128(ones
, ones2
);
311 ones
= _mm_srli_epi16(ones
, 15);
312 ones
= _mm_and_si128(ones
, lo
);
314 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
315 dst_y
= _mm_adds_epu16(dst_y
, ones
);
317 lo
= _mm_setzero_si128();
318 lo
= _mm_unpackhi_epi8(lo
, src_y
);
319 dst_y
= _mm_adds_epu16(dst_y
, lo
);
320 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
324 static void mix_16_y_p010_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
)
326 WORD
* dst_word
= reinterpret_cast<WORD
*>(dst
);
327 for (int i
=0;i
<16;i
++)
329 if (src_alpha
[i
]!=0xff)
331 dst_word
[i
] = ((dst_word
[i
] *src_alpha
[i
])>>8) + (src
[i
]<<8);
336 static __forceinline
void pix_alpha_blend_yv12_chroma(byte
* dst
, const byte
* src
, const byte
* alpha
, int src_pitch
)
338 unsigned int ia
= (alpha
[0]+alpha
[1]+
339 alpha
[0+src_pitch
]+alpha
[1+src_pitch
])>>2;
342 *dst
= (((*dst
)*ia
)>>8) + ((src
[0] +src
[1]+
343 src
[src_pitch
]+src
[1+src_pitch
] )>>2);
346 static __forceinline
void mix_16_uv_p010_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
349 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
350 __m128i alpha2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
+pitch
) );
352 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
353 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
355 AVERAGE_4_PIX_INTRINSICS_2(alpha
, alpha2
);
359 alpha_ff
= _mm_setzero_si128();//disable warning C4700
361 alpha_ff
= _mm_cmpeq_epi32(alpha_ff
,alpha_ff
);
363 alpha_ff
= _mm_cmpeq_epi8(alpha_ff
, alpha
);
365 __m128i lo
= _mm_unpacklo_epi8(alpha_ff
, alpha
);//(alpha<<8)+0x100 will overflow
366 //so we do it another way
367 //first, (alpha<<8)+0xff
368 __m128i ones
= _mm_setzero_si128();
369 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
373 ones2
= _mm_setzero_si128();//disable warning C4700
375 ones2
= _mm_cmpeq_epi32(ones2
,ones2
);
376 ones
= _mm_xor_si128(ones
, ones2
);
377 ones
= _mm_srli_epi16(ones
, 15);
378 ones
= _mm_and_si128(ones
, lo
);
380 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
381 dst_y
= _mm_adds_epu16(dst_y
, ones
);//then add one if necessary
383 lo
= _mm_setzero_si128();
384 lo
= _mm_unpacklo_epi8(lo
, src_y
);
385 dst_y
= _mm_adds_epu16(dst_y
, lo
);
386 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
389 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
391 lo
= _mm_unpackhi_epi8(alpha_ff
, alpha
);
393 ones
= _mm_setzero_si128();
394 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
395 ones
= _mm_xor_si128(ones
, ones2
);
396 ones
= _mm_srli_epi16(ones
, 15);
397 ones
= _mm_and_si128(ones
, lo
);
399 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
400 dst_y
= _mm_adds_epu16(dst_y
, ones
);
402 lo
= _mm_setzero_si128();
403 lo
= _mm_unpackhi_epi8(lo
, src_y
);
404 dst_y
= _mm_adds_epu16(dst_y
, lo
);
405 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
408 static void mix_16_uv_p010_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
410 WORD
* dst_word
= reinterpret_cast<WORD
*>(dst
);
411 for (int i
=0;i
<8;i
++, src_alpha
+=2, src
+=2, dst_word
+=2)
414 (src_alpha
[0]+src_alpha
[0+pitch
]+1)/2+
415 (src_alpha
[1]+src_alpha
[1+pitch
]+1)/2+1)/2;
418 int tmp
= (((dst_word
[0])*ia
)>>8) + (src
[0]<<8);
419 if(tmp
>0xffff) tmp
= 0xffff;
421 tmp
= (((dst_word
[1])*ia
)>>8) + (src
[1]<<8);
422 if(tmp
>0xffff) tmp
= 0xffff;
428 static __forceinline
void mix_16_uv_nvxx_sse2(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
430 __m128i dst128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
431 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
) );
432 __m128i alpha128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src_alpha
+pitch
) );
433 __m128i sub128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
435 AVERAGE_4_PIX_INTRINSICS_2(alpha128_1
, alpha128_2
);
436 __m128i zero
= _mm_setzero_si128();
440 ones
= _mm_setzero_si128();//disable warning C4700
442 ones
= _mm_cmpeq_epi32(ones
,ones
);
443 ones
= _mm_cmpeq_epi8(ones
,alpha128_1
);
445 __m128i dst_lo128
= _mm_unpacklo_epi8(dst128
, zero
);
446 alpha128_2
= _mm_unpacklo_epi8(alpha128_1
, zero
);
448 __m128i ones2
= _mm_unpacklo_epi8(ones
, zero
);
450 dst_lo128
= _mm_mullo_epi16(dst_lo128
, alpha128_2
);
451 dst_lo128
= _mm_adds_epu16(dst_lo128
, ones2
);
452 dst_lo128
= _mm_srli_epi16(dst_lo128
, 8);
454 dst128
= _mm_unpackhi_epi8(dst128
, zero
);
455 alpha128_1
= _mm_unpackhi_epi8(alpha128_1
, zero
);
457 ones2
= _mm_unpackhi_epi8(ones
, zero
);
459 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
460 dst128
= _mm_adds_epu16(dst128
, ones2
);
461 dst128
= _mm_srli_epi16(dst128
, 8);
462 dst_lo128
= _mm_packus_epi16(dst_lo128
, dst128
);
464 dst_lo128
= _mm_adds_epu8(dst_lo128
, sub128
);
465 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_lo128
);
469 static void mix_16_uv_nvxx_c(BYTE
* dst
, const BYTE
* src
, const BYTE
* src_alpha
, int pitch
)
471 for (int i
=0;i
<8;i
++, src_alpha
+=2, src
+=2, dst
+=2)
474 (src_alpha
[0]+src_alpha
[0+pitch
]+1)/2+
475 (src_alpha
[1]+src_alpha
[1+pitch
]+1)/2+1)/2;
478 dst
[0] = (((dst
[0])*ia
)>>8) + src
[0];
479 dst
[1] = (((dst
[1])*ia
)>>8) + src
[1];
486 * chroma placement(x=Y, o=U,V):
493 static __forceinline
void hleft_vmid_subsample_and_interlace_2_line_c(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
, int last_src_id
=0)
495 const BYTE
* end
= u
+ w
;
496 BYTE last_u
= (u
[last_src_id
]+u
[last_src_id
+pitch
]+1)/2;
497 BYTE last_v
= (v
[last_src_id
]+v
[last_src_id
+pitch
]+1)/2;
498 for (;u
<end
;dst
+=2,u
+=2,v
+=2)
500 dst
[0] = (u
[0] + u
[0+pitch
] + 1)/2;
501 int tmp1
= (u
[1] + u
[1+pitch
] + 1)/2;
502 last_u
= (tmp1
+last_u
+1)/2;
503 dst
[0] = (dst
[0] + last_u
+ 1)/2;
506 dst
[1] = (v
[0] + v
[0+pitch
] + 1)/2;
507 tmp1
= (v
[1] + v
[1+pitch
] + 1)/2;
508 last_v
= (tmp1
+last_v
+1)/2;
509 dst
[1] = (last_v
+ dst
[1] + 1)/2;
514 // @w : w % 16 must == 0!
515 static __forceinline
void hleft_vmid_subsample_and_interlace_2_line_sse2(BYTE
* dst
, const BYTE
* u
, const BYTE
* v
, int w
, int pitch
, int last_src_id
=0)
517 const BYTE
* end_mod16
= u
+ (w
&~15);
519 __m128i u_last
= _mm_cvtsi32_si128( (u
[last_src_id
]+u
[pitch
+last_src_id
]+1)<<7 );
520 __m128i v_last
= _mm_cvtsi32_si128( (v
[last_src_id
]+v
[pitch
+last_src_id
]+1)<<7 );
521 for (;u
<end_mod16
;dst
+=16,u
+=16,v
+=16)
523 __m128i u_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
) );
524 __m128i u_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(u
+pitch
) );
525 __m128i v_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
) );
526 __m128i v_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(v
+pitch
) );
527 u_1
= _mm_avg_epu8(u_1
, u_2
);
528 AVERAGE_4_PIX_INTRINSICS_3(u_1
, u_last
);
529 v_1
= _mm_avg_epu8(v_1
, v_2
);
530 AVERAGE_4_PIX_INTRINSICS_4(v_1
, v_last
);
531 u_1
= _mm_or_si128(u_1
, v_1
);
532 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), u_1
);
534 //The following fails if dst==u
535 //hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w&15, pitch, w>15?-1:0);
538 static __forceinline
void hleft_vmid_mix_uv_yv12_c(byte
* dst
, int w
, const byte
* src
, const byte
* am
, int src_pitch
, int last_src_id
=0)
540 int last_alpha
= (am
[last_src_id
]+am
[last_src_id
+src_pitch
]+1)/2;
541 int last_sub
= (src
[last_src_id
]+src
[last_src_id
+src_pitch
]+1)/2;
542 const BYTE
* end
= src
+ w
;
543 for(; src
< end
; src
+= 2, am
+= 2, dst
++)
545 int ia
= (am
[0]+am
[0+src_pitch
]+1)/2;
546 int tmp1
= (am
[1]+am
[1+src_pitch
]+1)/2;
547 last_alpha
= (last_alpha
+ tmp1
+ 1)/2;
548 ia
= (ia
+ last_alpha
+ 1)/2;
553 tmp1
= (src
[0]+src
[0+src_pitch
]+1)/2;
554 int tmp2
= (src
[1]+src
[1+src_pitch
]+1)/2;
555 last_sub
= (last_sub
+tmp2
+1)/2;
556 tmp1
= (tmp1
+last_sub
+1)/2;
559 *dst
= (((*dst
)*ia
)>>8) + tmp1
;
563 last_sub
= (src
[1]+src
[1+src_pitch
]+1)/2;
568 static __forceinline
void hleft_vmid_mix_uv_yv12_sse2(byte
* dst
, int w
, const byte
* src
, const byte
* am
, int src_pitch
, int last_src_id
=0)
570 __m128i last_src
= _mm_cvtsi32_si128( (src
[last_src_id
]+src
[src_pitch
+last_src_id
]+1)<<7 );
571 __m128i last_alpha
= _mm_cvtsi32_si128( (am
[last_src_id
]+am
[src_pitch
+last_src_id
]+1)<<7 );
572 const BYTE
* end_mod16
= src
+ (w
&~15);
573 for(; src
< end_mod16
; src
+= 16, am
+= 16, dst
+=8)
575 __m128i zero
= _mm_setzero_si128();
577 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
) );
578 __m128i tmp
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
+src_pitch
) );
579 alpha128_1
= _mm_avg_epu8(alpha128_1
, tmp
);
580 AVERAGE_4_PIX_INTRINSICS_3(alpha128_1
, last_alpha
);
582 __m128i dst128
= _mm_loadl_epi64( reinterpret_cast<const __m128i
*>(dst
) );
584 __m128i sub128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
585 tmp
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
+src_pitch
) );
586 sub128_1
= _mm_avg_epu8(sub128_1
, tmp
);
587 AVERAGE_4_PIX_INTRINSICS_3(sub128_1
, last_src
);
591 ones
= _mm_setzero_si128();//disable warning C4700
593 ones
= _mm_cmpeq_epi32(ones
,ones
);
594 ones
= _mm_cmpeq_epi8(ones
, alpha128_1
);
596 dst128
= _mm_unpacklo_epi8(dst128
, zero
);
597 __m128i dst128_2
= _mm_and_si128(dst128
, ones
);
599 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
600 dst128
= _mm_adds_epu16(dst128
, dst128_2
);
602 dst128
= _mm_srli_epi16(dst128
, 8);
604 dst128
= _mm_adds_epi16(dst128
, sub128_1
);
605 dst128
= _mm_packus_epi16(dst128
, dst128
);
607 _mm_storel_epi64( reinterpret_cast<__m128i
*>(dst
), dst128
);
609 hleft_vmid_mix_uv_yv12_c(dst
, w
&15, src
, am
, src_pitch
, w
>15?-1:0);
612 static __forceinline
void hleft_vmid_mix_uv_p010_c(BYTE
* dst
, int w
, const BYTE
* src
, const BYTE
* am
, int src_pitch
, int last_src_id
=0)
614 int last_alpha
= (am
[last_src_id
]+am
[src_pitch
+last_src_id
]+1)/2;
615 const BYTE
* end
= src
+ w
;
616 WORD
* dst_word
= reinterpret_cast<WORD
*>(dst
);
617 for(; src
< end
; src
+=2, am
+=2, dst_word
+=2)
619 int ia
= (am
[0]+am
[0+src_pitch
]+1)/2;
620 int tmp2
= (am
[1]+am
[1+src_pitch
]+1)/2;
621 last_alpha
= (last_alpha
+ tmp2
+ 1)/2;
622 ia
= (ia
+ last_alpha
+ 1)/2;
627 int tmp
= (((dst_word
[0])*ia
)>>8) + (src
[0]<<8);
629 tmp
^= (tmp
^0xffff)&((0xffff-tmp
)>>31);//if(tmp>0xffff) tmp = 0xffff;
632 tmp
= (((dst_word
[1])*ia
)>>8) + (src
[1]<<8);
634 tmp
^= (tmp
^0xffff)&((0xffff-tmp
)>>31);//if(tmp>0xffff) tmp = 0xffff;
642 static __forceinline
void hleft_vmid_mix_uv_p010_sse2(BYTE
* dst
, int w
, const BYTE
* src
, const BYTE
* am
, int src_pitch
, int last_src_id
=0)
644 __m128i last_alpha
= _mm_cvtsi32_si128( (am
[last_src_id
]+am
[src_pitch
+last_src_id
]+1)<<7 );
645 const BYTE
* end_mod16
= src
+ (w
&~15);
646 for(; src
< end_mod16
; src
+=16, am
+=16, dst
+=32)
649 __m128i alpha
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
) );
650 __m128i alpha2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
+src_pitch
) );
652 __m128i src_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
653 __m128i dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
655 alpha
= _mm_avg_epu8(alpha
, alpha2
);
656 AVERAGE_4_PIX_INTRINSICS_5(alpha
, last_alpha
);
660 alpha_ff
= _mm_setzero_si128();//disable warning C4700
662 alpha_ff
= _mm_cmpeq_epi32(alpha_ff
,alpha_ff
);
664 alpha_ff
= _mm_cmpeq_epi8(alpha_ff
, alpha
);
666 __m128i lo
= _mm_unpacklo_epi8(alpha_ff
, alpha
);//(alpha<<8)+0x100 will overflow
667 //so we do it another way
668 //first, (alpha<<8)+0xff
669 __m128i ones
= _mm_setzero_si128();
670 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
674 ones2
= _mm_setzero_si128();//disable warning C4700
676 ones2
= _mm_cmpeq_epi32(ones2
,ones2
);
677 ones
= _mm_xor_si128(ones
, ones2
);
678 ones
= _mm_srli_epi16(ones
, 15);
679 ones
= _mm_and_si128(ones
, lo
);
681 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
682 dst_y
= _mm_adds_epu16(dst_y
, ones
);//then add one if necessary
684 lo
= _mm_setzero_si128();
685 lo
= _mm_unpacklo_epi8(lo
, src_y
);
686 dst_y
= _mm_adds_epu16(dst_y
, lo
);
687 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_y
);
689 dst_y
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
+16) );
691 lo
= _mm_unpackhi_epi8(alpha_ff
, alpha
);
693 ones
= _mm_setzero_si128();
694 ones
= _mm_cmpeq_epi16(dst_y
, ones
);
695 ones
= _mm_xor_si128(ones
, ones2
);
696 ones
= _mm_srli_epi16(ones
, 15);
697 ones
= _mm_and_si128(ones
, lo
);
699 dst_y
= _mm_mulhi_epu16(dst_y
, lo
);
700 dst_y
= _mm_adds_epu16(dst_y
, ones
);
702 lo
= _mm_setzero_si128();
703 lo
= _mm_unpackhi_epi8(lo
, src_y
);
704 dst_y
= _mm_adds_epu16(dst_y
, lo
);
705 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
+16), dst_y
);
707 hleft_vmid_mix_uv_p010_c(dst
, w
&15, src
, am
, src_pitch
, w
>15?-1:0);
710 static __forceinline
void hleft_vmid_mix_uv_nv12_c(BYTE
* dst
, int w
, const BYTE
* src
, const BYTE
* am
, int src_pitch
, int last_src_id
=0)
712 int last_alpha
= (am
[last_src_id
]+am
[src_pitch
+last_src_id
]+1)/2;
713 const BYTE
* end
= src
+ w
;
714 for(; src
< end
; src
+=2, am
+=2, dst
+=2)
716 int ia
= (am
[0]+am
[0+src_pitch
]+1)/2;
717 int tmp2
= (am
[1]+am
[1+src_pitch
]+1)/2;
718 last_alpha
= (last_alpha
+ tmp2
+ 1)/2;
719 ia
= (ia
+ last_alpha
+ 1)/2;
723 dst
[0] = (((dst
[0])*ia
)>>8) + src
[0];
724 dst
[1] = (((dst
[1])*ia
)>>8) + src
[1];
729 static __forceinline
void hleft_vmid_mix_uv_nv12_sse2(BYTE
* dst
, int w
, const BYTE
* src
, const BYTE
* am
, int src_pitch
, int last_src_id
=0)
731 __m128i last_alpha
= _mm_cvtsi32_si128( (am
[last_src_id
]+am
[src_pitch
+last_src_id
]+1)<<7 );
732 const BYTE
* end_mod16
= src
+ (w
&~15);
733 for(; src
< end_mod16
; src
+=16, am
+=16, dst
+=16)
735 __m128i dst128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(dst
) );
736 __m128i alpha128_1
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
) );
737 __m128i alpha128_2
= _mm_load_si128( reinterpret_cast<const __m128i
*>(am
+src_pitch
) );
738 __m128i sub128
= _mm_load_si128( reinterpret_cast<const __m128i
*>(src
) );
740 alpha128_1
= _mm_avg_epu8(alpha128_1
, alpha128_2
);
741 AVERAGE_4_PIX_INTRINSICS_5(alpha128_1
, last_alpha
);
743 __m128i zero
= _mm_setzero_si128();
747 ones
= _mm_setzero_si128();//disable warning C4700
749 ones
= _mm_cmpeq_epi32(ones
,ones
);
750 ones
= _mm_cmpeq_epi8(ones
,alpha128_1
);
752 __m128i dst_lo128
= _mm_unpacklo_epi8(dst128
, zero
);
753 alpha128_2
= _mm_unpacklo_epi8(alpha128_1
, zero
);
755 __m128i ones2
= _mm_unpacklo_epi8(ones
, zero
);
757 dst_lo128
= _mm_mullo_epi16(dst_lo128
, alpha128_2
);
758 dst_lo128
= _mm_adds_epu16(dst_lo128
, ones2
);
759 dst_lo128
= _mm_srli_epi16(dst_lo128
, 8);
761 dst128
= _mm_unpackhi_epi8(dst128
, zero
);
762 alpha128_1
= _mm_unpackhi_epi8(alpha128_1
, zero
);
764 ones2
= _mm_unpackhi_epi8(ones
, zero
);
766 dst128
= _mm_mullo_epi16(dst128
, alpha128_1
);
767 dst128
= _mm_adds_epu16(dst128
, ones2
);
768 dst128
= _mm_srli_epi16(dst128
, 8);
769 dst_lo128
= _mm_packus_epi16(dst_lo128
, dst128
);
771 dst_lo128
= _mm_adds_epu8(dst_lo128
, sub128
);
772 _mm_store_si128( reinterpret_cast<__m128i
*>(dst
), dst_lo128
);
774 hleft_vmid_mix_uv_nv12_c(dst
, w
&15, src
, am
, src_pitch
, w
>15?-1:0);
777 #endif // __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__