Add CPU capability check. [PART 1]
[xy_vsfilter.git] / src / subpic / SimpleSubpicImpl.cpp
blob610c06c5514de9642e5df8778b800046513bb65a
1 #include "stdafx.h"
2 #include "SimpleSubpicImpl.h"
3 #include "ISimpleSubPic.h"
4 #include "xy_intrinsics.h"
5 #include "../subtitles/xy_malloc.h"
6 #include "MemSubPic.h"
8 //////////////////////////////////////////////////////////////////////////
9 //
10 // SimpleSubpic
13 SimpleSubpic::SimpleSubpic( IXySubRenderFrame*sub_render_frame, int alpha_blt_dst_type )
14 : CUnknown(NAME("SimpleSubpic"), NULL)
15 , m_sub_render_frame(sub_render_frame)
16 , m_alpha_blt_dst_type(alpha_blt_dst_type)
18 ConvertColorSpace();
21 SimpleSubpic::~SimpleSubpic()
23 for(unsigned i=0;i<m_buffers.GetCount();i++)
24 xy_free(m_buffers.GetAt(i));
27 STDMETHODIMP SimpleSubpic::NonDelegatingQueryInterface( REFIID riid, void** ppv )
29 return
30 QI(ISimpleSubPic)
31 __super::NonDelegatingQueryInterface(riid, ppv);
34 STDMETHODIMP SimpleSubpic::AlphaBlt( SubPicDesc* target )
36 ASSERT(target!=NULL);
37 HRESULT hr = S_FALSE;
38 int count = m_bitmap.GetCount();
39 for(int i=0;i<count;i++)
41 switch(target->type)
43 case MSP_NV12:
44 case MSP_NV21:
45 hr = AlphaBltAnv12_Nv12(target, m_bitmap.GetAt(i));
46 break;
47 case MSP_P010:
48 case MSP_P016:
49 hr = AlphaBltAnv12_P010(target, m_bitmap.GetAt(i));
50 break;
51 default:
52 hr = AlphaBlt(target, m_bitmap.GetAt(i));
53 break;
56 if (FAILED(hr))
58 return hr;
62 return hr;
65 HRESULT SimpleSubpic::AlphaBltAnv12_P010( SubPicDesc* target, const Bitmap& src )
67 //fix me: check colorspace and log error
68 SubPicDesc dst = *target; // copy, because we might modify it
70 CRect rd(src.pos, src.size);
71 if(dst.h < 0)
73 dst.h = -dst.h;
74 rd.bottom = dst.h - rd.bottom;
75 rd.top = dst.h - rd.top;
78 int w = src.size.cx, h = src.size.cy;
79 bool bottom_down = rd.top > rd.bottom;
81 BYTE* d = NULL;
82 BYTE* dUV = NULL;
83 if(!bottom_down)
85 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left*2;
86 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left*2;
88 else
90 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left*2;
91 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left*2;
92 dst.pitch = -dst.pitch;
94 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
96 enum PLANS{A=0,Y,UV};
97 const BYTE* sa = reinterpret_cast<const BYTE*>(src.extra.plans[A]);
98 const BYTE* sy = reinterpret_cast<const BYTE*>(src.extra.plans[Y]);
99 const BYTE* s_uv = reinterpret_cast<const BYTE*>(src.extra.plans[UV]);
100 return CMemSubPic::AlphaBltAnv12_P010(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
103 HRESULT SimpleSubpic::AlphaBltAnv12_Nv12( SubPicDesc* target, const Bitmap& src )
105 //fix me: check colorspace and log error
106 SubPicDesc dst = *target; // copy, because we might modify it
108 CRect rd(src.pos, src.size);
109 if(dst.h < 0)
111 dst.h = -dst.h;
112 rd.bottom = dst.h - rd.bottom;
113 rd.top = dst.h - rd.top;
116 int w = src.size.cx, h = src.size.cy;
117 bool bottom_down = rd.top > rd.bottom;
119 BYTE* d = NULL;
120 BYTE* dUV = NULL;
121 if (!bottom_down)
123 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + rd.left;
124 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*rd.top/2 + rd.left;
126 else
128 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + rd.left;
129 dUV = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*dst.h + dst.pitch*(rd.top/2-1) + rd.left;
130 dst.pitch = -dst.pitch;
132 ASSERT(dst.pitchUV==0 || dst.pitchUV==abs(dst.pitch));
134 enum PLANS{A=0,Y,UV};
135 const BYTE* sa = reinterpret_cast<const BYTE*>(src.extra.plans[A]);
136 const BYTE* sy = reinterpret_cast<const BYTE*>(src.extra.plans[Y]);
137 const BYTE* s_uv = reinterpret_cast<const BYTE*>(src.extra.plans[UV]);
138 return CMemSubPic::AlphaBltAnv12_Nv12(sa, sy, s_uv, src.pitch, d, dUV, dst.pitch, w, h);
// Blends a single subtitle bitmap `src` into the surface described by `target`,
// using a per-format routine selected by the target type.
//   target - destination surface descriptor (copied locally; the caller's copy is untouched)
//   src    - source bitmap: packed 4-byte pixels in src.pixels, or separate planes in
//            src.extra.plans[] for the YV12/IYUV path
// Returns S_OK after blending, or E_NOTIMPL for a target type that is not handled.
// NOTE(review): in every packed path the 4th source byte (s2[3]) multiplies the
// DESTINATION pixel, i.e. it behaves as an inverted alpha (0xff = leave destination as is).
141 HRESULT SimpleSubpic::AlphaBlt( SubPicDesc* target, const Bitmap& src )
143 SubPicDesc dst = *target; // copy, because we might modify it
145 CRect rd(src.pos, src.size);
// A negative height marks a vertically flipped target: mirror the rectangle,
// which leaves rd.top > rd.bottom.
146 if(dst.h < 0)
148 dst.h = -dst.h;
149 rd.bottom = dst.h - rd.bottom;
150 rd.top = dst.h - rd.top;
153 int w = src.size.cx, h = src.size.cy;
154 const BYTE* s = reinterpret_cast<const BYTE*>(src.pixels);
155 BYTE* d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*rd.top + ((rd.left*dst.bpp)>>3);
// Flipped target: start one row above rd.top and negate the pitch so that
// "d += dst.pitch" walks towards lower addresses. Types not listed here are rejected.
157 if(rd.top > rd.bottom)
159 if(dst.type == MSP_RGB32 || dst.type == MSP_RGB24
160 || dst.type == MSP_RGB16 || dst.type == MSP_RGB15
161 || dst.type == MSP_YUY2 || dst.type == MSP_AYUV)
163 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + (rd.left*dst.bpp>>3);
165 else if(dst.type == MSP_YV12 || dst.type == MSP_IYUV)
// Planar luma is addressed with 8 bits per pixel regardless of dst.bpp.
167 d = reinterpret_cast<BYTE*>(dst.bits) + dst.pitch*(rd.top-1) + (rd.left*8>>3);
169 else
171 return E_NOTIMPL;
173 dst.pitch = -dst.pitch;
175 DbgLog((LOG_TRACE, 5, TEXT("w=%d h=%d"), w, h));
176 switch(dst.type)
// MSP_RGBA: the destination pixel is overwritten (not mixed) whenever s2[3] < 0xff.
// Each colour channel is divided by (0x100 - s2[3]) and the output alpha byte is
// 0xff - s2[3]. NOTE(review): this looks like un-premultiplying the source - confirm.
178 case MSP_RGBA:
179 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
181 const BYTE* s2 = s;
182 const BYTE* s2end = s2 + w*4;
183 DWORD* d2 = reinterpret_cast<DWORD*>(d);
184 for(; s2 < s2end; s2 += 4, d2++)
186 if(s2[3] < 0xff)
188 DWORD bd =0x00000100 -( (DWORD) s2[3]);
189 DWORD B = ((*((DWORD*)s2)&0x000000ff)<<8)/bd;
190 DWORD V = ((*((DWORD*)s2)&0x0000ff00)/bd)<<8;
191 DWORD R = (((*((DWORD*)s2)&0x00ff0000)>>8)/bd)<<16;
192 *d2 = B | V | R
193 | (0xff000000-(*((DWORD*)s2)&0xff000000))&0xff000000;
197 break;
// 32-bit targets: channels 0 and 2 are processed together under mask 0x00ff00ff,
// channel 1 under mask 0x0000ff00.
// The _WIN64 branch additionally scales the source by ia = 256 - s2[3];
// the other branch adds the source colour unscaled.
198 case MSP_RGB32:
199 case MSP_AYUV: //ToDo: fix me MSP_VUYA indeed?
200 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
202 const BYTE* s2 = s;
203 const BYTE* s2end = s2 + w*4;
204 DWORD* d2 = reinterpret_cast<DWORD*>(d);
205 for(; s2 < s2end; s2 += 4, d2++)
207 #ifdef _WIN64
208 DWORD ia = 256-s2[3];
209 if(s2[3] < 0xff) {
210 *d2 = ((((*d2&0x00ff00ff)*s2[3])>>8) + (((*((DWORD*)s2)&0x00ff00ff)*ia)>>8)&0x00ff00ff)
211 | ((((*d2&0x0000ff00)*s2[3])>>8) + (((*((DWORD*)s2)&0x0000ff00)*ia)>>8)&0x0000ff00);
213 #else
214 if(s2[3] < 0xff)
216 *d2 = (((((*d2&0x00ff00ff)*s2[3])>>8) + (*((DWORD*)s2)&0x00ff00ff))&0x00ff00ff)
217 | (((((*d2&0x0000ff00)*s2[3])>>8) + (*((DWORD*)s2)&0x0000ff00))&0x0000ff00);
219 #endif
222 break;
// 24-bit target: same blend, one byte at a time; source advances 4 bytes, destination 3.
223 case MSP_RGB24:
224 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
226 const BYTE* s2 = s;
227 const BYTE* s2end = s2 + w*4;
228 BYTE* d2 = d;
229 for(; s2 < s2end; s2 += 4, d2 += 3)
231 if(s2[3] < 0xff)
233 d2[0] = ((d2[0]*s2[3])>>8) + s2[0];
234 d2[1] = ((d2[1]*s2[3])>>8) + s2[1];
235 d2[2] = ((d2[2]*s2[3])>>8) + s2[2];
239 break;
// 16-bit 5-6-5 target: the threshold 0x1f and the >>5 treat s2[3] as a 5-bit value,
// and the low 16 bits of each source DWORD are used as an already packed 565 pixel
// (ConvertColorSpace() produces this layout for MSP_RGB16).
240 case MSP_RGB16:
241 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
243 const BYTE* s2 = s;
244 const BYTE* s2end = s2 + w*4;
245 WORD* d2 = reinterpret_cast<WORD*>(d);
246 for(; s2 < s2end; s2 += 4, d2++)
248 if(s2[3] < 0x1f)
250 *d2 = (WORD)((((((*d2&0xf81f)*s2[3])>>5) + (*(DWORD*)s2&0xf81f))&0xf81f)
251 | (((((*d2&0x07e0)*s2[3])>>5) + (*(DWORD*)s2&0x07e0))&0x07e0));
252 /* *d2 = (WORD)((((((*d2&0xf800)*s2[3])>>8) + (*(DWORD*)s2&0xf800))&0xf800)
253 | (((((*d2&0x07e0)*s2[3])>>8) + (*(DWORD*)s2&0x07e0))&0x07e0)
254 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
259 break;
// 16-bit 5-5-5 target: as above with the 555 channel masks.
260 case MSP_RGB15:
261 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
263 const BYTE* s2 = s;
264 const BYTE* s2end = s2 + w*4;
265 WORD* d2 = reinterpret_cast<WORD*>(d);
266 for(; s2 < s2end; s2 += 4, d2++)
268 if(s2[3] < 0x1f)
270 *d2 = (WORD)((((((*d2&0x7c1f)*s2[3])>>5) + (*(DWORD*)s2&0x7c1f))&0x7c1f)
271 | (((((*d2&0x03e0)*s2[3])>>5) + (*(DWORD*)s2&0x03e0))&0x03e0));
272 /* *d2 = (WORD)((((((*d2&0x7c00)*s2[3])>>8) + (*(DWORD*)s2&0x7c00))&0x7c00)
273 | (((((*d2&0x03e0)*s2[3])>>8) + (*(DWORD*)s2&0x03e0))&0x03e0)
274 | (((((*d2&0x001f)*s2[3])>>8) + (*(DWORD*)s2&0x001f))&0x001f));
279 break;
// YUY2 target: two source pixels (8 bytes) map to one destination DWORD.
// ia is a weighted average (1:2:1) of the previous, current and next weight byte and
// is used for the two chroma bytes; the two luma bytes use their own s2[3] / s2[7].
// NOTE(review): s2[4..7] are read even when w is odd - presumably rows are padded; verify.
280 case MSP_YUY2:
281 for(int j = 0; j < h; j++, s += src.pitch, d += dst.pitch)
283 unsigned int ia, c;
284 const BYTE* s2 = s;
285 const BYTE* s2end = s2 + w*4;
286 DWORD* d2 = reinterpret_cast<DWORD*>(d);
287 int last_a = w > 0 ? s2[3] : 0;
288 for(; s2 < s2end; s2 += 8, d2++)
290 ia = (last_a + 2*s2[3] + s2[7])>>2;
291 last_a = s2[7];
292 if(ia < 0xff)
294 //int y1 = (BYTE)(((((*d2&0xff))*s2[3])>>8) + s2[1]); // + y1;
295 //int u = (BYTE)((((((*d2>>8)&0xff))*ia)>>8) + s2[0]); // + u;
296 //int y2 = (BYTE)((((((*d2>>16)&0xff))*s2[7])>>8) + s2[5]); // + y2;
297 //int v = (BYTE)((((((*d2>>24)&0xff))*ia)>>8) + s2[4]); // + v;
298 //*d2 = (v<<24)|(y2<<16)|(u<<8)|y1;
// Pack the four per-byte weights and the four source bytes in destination byte order,
// then let the MMX sequence below blend all four bytes at once. The weights are halved
// before the 16-bit multiply and the product shifted by 7 instead of 8 (see the
// overflow remark on the psraw line).
// NOTE(review): the asm block uses 32-bit registers (edi) and MSVC inline-asm syntax.
300 ia = (ia<<24)|(s2[7]<<16)|(ia<<8)|s2[3];
301 c = (s2[4]<<24)|(s2[5]<<16)|(s2[0]<<8)|s2[1]; // (v<<24)|(y2<<16)|(u<<8)|y1;
302 __asm
304 mov edi, d2
305 pxor mm0, mm0
306 movd mm2, c
307 punpcklbw mm2, mm0
308 movd mm3, [edi]
309 punpcklbw mm3, mm0
310 movd mm4, ia
311 punpcklbw mm4, mm0
312 psraw mm4, 1 //or else, overflow because psraw shift in sign bit
313 pmullw mm3, mm4
314 psraw mm3, 7
315 paddsw mm3, mm2
316 packuswb mm3, mm3
317 movd [edi], mm3
// Leave MMX state after the MMX instructions above.
322 __asm emms;
323 break;
// Planar 4:2:0 targets: luma goes through d (computed above); the chroma plane pointers
// and pitch are derived from the luma layout when the descriptor does not supply them.
324 case MSP_YV12:
325 case MSP_IYUV:
327 //dst.pitch = abs(dst.pitch);
328 int h2 = h/2;
329 if(!dst.pitchUV)
331 dst.pitchUV = abs(dst.pitch)/2;
333 if(!dst.bitsU || !dst.bitsV)
// Default layout: the two chroma planes follow the luma plane; YV12 swaps their order.
335 dst.bitsU = reinterpret_cast<BYTE*>(dst.bits) + abs(dst.pitch)*dst.h;
336 dst.bitsV = dst.bitsU + dst.pitchUV*dst.h/2;
337 if(dst.type == MSP_YV12)
339 BYTE* p = dst.bitsU;
340 dst.bitsU = dst.bitsV;
341 dst.bitsV = p;
344 BYTE* dd[2];
345 dd[0] = dst.bitsU + dst.pitchUV*rd.top/2 + rd.left/2;
346 dd[1] = dst.bitsV + dst.pitchUV*rd.top/2 + rd.left/2;
// Flipped target: same start-row / negative-pitch adjustment as for luma.
347 if(rd.top > rd.bottom)
349 dd[0] = dst.bitsU + dst.pitchUV*(rd.top/2-1) + rd.left/2;
350 dd[1] = dst.bitsV + dst.pitchUV*(rd.top/2-1) + rd.left/2;
351 dst.pitchUV = -dst.pitchUV;
354 enum PLANS{A=0,Y,U,V};
355 const BYTE* sa = reinterpret_cast<const BYTE*>(src.extra.plans[A]);
356 const BYTE* sy = reinterpret_cast<const BYTE*>(src.extra.plans[Y]);
357 const BYTE* su = reinterpret_cast<const BYTE*>(src.extra.plans[U]);
358 const BYTE* sv = reinterpret_cast<const BYTE*>(src.extra.plans[V]);
359 CMemSubPic::AlphaBltYv12Luma( d, dst.pitch, w, h, sy, sa, src.pitch );
360 CMemSubPic::AlphaBltYv12Chroma( dd[0], dst.pitchUV, w, h2, su, sa, src.pitch);
361 CMemSubPic::AlphaBltYv12Chroma( dd[1], dst.pitchUV, w, h2, sv, sa, src.pitch);
363 __asm emms;
365 break;
366 default:
367 return E_NOTIMPL;
368 break;
371 //emms costs about 40 cpu cycles
372 //__asm emms;
373 return S_OK;
376 HRESULT SimpleSubpic::ConvertColorSpace()
378 int count = 0;
379 HRESULT hr = m_sub_render_frame->GetBitmapCount(&count);
380 if (FAILED(hr) || count==0)
382 return hr;
384 int xy_color_space = 0;
385 hr = m_sub_render_frame->GetXyColorSpace(&xy_color_space);
386 if (FAILED(hr))
388 return hr;
390 m_bitmap.SetCount(count);
391 m_buffers.SetCount(count);
392 for (int i=0;i<count;i++)
394 m_buffers.GetAt(i) = NULL;//safe
396 Bitmap &bitmap = m_bitmap.GetAt(i);
397 hr = m_sub_render_frame->GetBitmap(i, &bitmap.id, &bitmap.pos, &bitmap.size, &bitmap.pixels, &bitmap.pitch);
398 if (FAILED(hr))
400 return hr;
402 if (xy_color_space==XY_CS_AYUV_PLANAR)
404 hr = m_sub_render_frame->GetBitmapExtra(i, &bitmap.extra);
405 if (FAILED(hr))
407 return hr;
411 int w = bitmap.size.cx, h = bitmap.size.cy;
412 if (w<=0 || h<=0)
414 continue;
417 const BYTE* top = reinterpret_cast<const BYTE*>(bitmap.pixels);
418 const BYTE* bottom = top + bitmap.pitch*h;
419 if(m_alpha_blt_dst_type == MSP_RGB16)
421 ASSERT(xy_color_space==XY_CS_ARGB);
423 BYTE* dst = reinterpret_cast<BYTE*>(xy_malloc(bitmap.pitch*h, (bitmap.pos.x*4)&15));
424 m_buffers.GetAt(i) = dst;
425 bitmap.pixels = dst;
426 for(; top < bottom ; top += bitmap.pitch, dst += bitmap.pitch)
428 const DWORD* s = reinterpret_cast<const DWORD*>(top);
429 const DWORD* e = s + w;
430 DWORD* dst2 = reinterpret_cast<DWORD*>(dst);
431 for(; s < e; s++, dst2++)
433 *dst2 = ((*s>>3)&0x1f000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
434 // *s = (*s&0xff000000)|((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x001f);
438 else if(m_alpha_blt_dst_type == MSP_RGB15)
440 ASSERT(xy_color_space==XY_CS_ARGB);
442 BYTE* dst = reinterpret_cast<BYTE*>(xy_malloc(bitmap.pitch*h, (bitmap.pos.x*4)&15));
443 m_buffers.GetAt(i) = dst;
444 bitmap.pixels = dst;
445 for(; top < bottom; top += bitmap.pitch, dst += bitmap.pitch)
447 const DWORD* s = reinterpret_cast<const DWORD*>(top);
448 const DWORD* e = s + w;
449 DWORD* dst2 = reinterpret_cast<DWORD*>(dst);
450 for(; s < e; s++, dst2++)
452 *dst2 = ((*s>>3)&0x1f000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
453 // *s = (*s&0xff000000)|((*s>>9)&0x7c00)|((*s>>6)&0x03e0)|((*s>>3)&0x001f);
457 else if(m_alpha_blt_dst_type == MSP_YUY2)
459 ASSERT(xy_color_space==XY_CS_AUYV);
460 XY_DO_ONCE( xy_logger::write_file("G:\\b1_ul", top, bitmap.pitch*(h-1)) );
462 BYTE* dst = reinterpret_cast<BYTE*>(xy_malloc(bitmap.pitch*h, (bitmap.pos.x*4)&15));
463 m_buffers.GetAt(i) = dst;
464 memcpy(dst, bitmap.pixels, bitmap.pitch*h);
465 bitmap.pixels = dst;
466 for(BYTE* tempTop=dst; tempTop < dst+bitmap.pitch*h ; tempTop += bitmap.pitch)
468 BYTE* s = tempTop;
469 BYTE* e = s + w*4;
470 BYTE last_v = s[0], last_u=s[2];
471 for(; s < e; s+=8) // AUYV AUYV -> AxYU AxYV
473 BYTE tmp = s[4];
474 s[4] = (last_v + 2*s[0] + s[4] + 2)>>2;
475 last_v = tmp;
477 s[0] = (last_u + 2*s[2] + s[6] + 2)>>2;
478 last_u = s[6];
481 XY_DO_ONCE( xy_logger::write_file("G:\\a1_ul", dst, bitmap.pitch*(h-1)) );
483 else if(m_alpha_blt_dst_type == MSP_YV12 || m_alpha_blt_dst_type == MSP_IYUV )
485 ASSERT(xy_color_space==XY_CS_AYUV_PLANAR);
486 //nothing to do
488 else if ( m_alpha_blt_dst_type == MSP_P010 || m_alpha_blt_dst_type == MSP_P016
489 || m_alpha_blt_dst_type == MSP_NV12 )
491 ASSERT(xy_color_space==XY_CS_AYUV_PLANAR);
492 SubsampleAndInterlace(i, &bitmap, true);
494 else if( m_alpha_blt_dst_type == MSP_NV21 )
496 ASSERT(xy_color_space==XY_CS_AYUV_PLANAR);
497 SubsampleAndInterlace(i, &bitmap, false);
500 return S_OK;
503 void SimpleSubpic::SubsampleAndInterlace( int index, Bitmap*bitmap, bool u_first )
505 ASSERT(bitmap!=NULL);
506 //fix me: check alignment and log error
507 int w = bitmap->size.cx, h = bitmap->size.cy;
508 ASSERT(h%2==0);
509 const BYTE* u_start = reinterpret_cast<const BYTE*>(bitmap->extra.plans[2]);
510 const BYTE* v_start = reinterpret_cast<const BYTE*>(bitmap->extra.plans[3]);
512 BYTE* dst = reinterpret_cast<BYTE*>(xy_malloc(bitmap->pitch*h/2, bitmap->pos.x&15));
513 m_buffers.GetAt(index) = dst;
514 bitmap->extra.plans[2] = dst;
516 if(!u_first)
518 const BYTE* tmp = v_start;
519 v_start = u_start;
520 u_start = tmp;
523 //Todo: fix me.
524 //Walkarround for alignment
525 if ( ((bitmap->pitch | (int)u_start | (int)v_start)&15) == 0 && (g_cpuid.m_flags & CCpuID::sse2) )
527 for (int i=0;i<h;i+=2)
529 int w16 = w&~15;
530 hleft_vmid_subsample_and_interlace_2_line_sse2(dst, u_start, v_start, w16, bitmap->pitch);
531 ASSERT(w>0);
532 hleft_vmid_subsample_and_interlace_2_line_c(dst+w16, u_start+w16, v_start+w16, w&15, bitmap->pitch, -1);
533 u_start += 2*bitmap->pitch;
534 v_start += 2*bitmap->pitch;
535 dst += bitmap->pitch;
538 else
540 for (int i=0;i<h;i+=2)
542 hleft_vmid_subsample_and_interlace_2_line_c(dst, u_start, v_start, w, bitmap->pitch);
543 u_start += 2*bitmap->pitch;
544 v_start += 2*bitmap->pitch;
545 dst += bitmap->pitch;