d3d11: handle VLC_CODEC_D3D11_OPAQUE_10B upload/download
[vlc.git] / modules / video_chroma / copy.c
blob861f6543be04c5e9ca81a50db15119ddf42cf6d6
1 /*****************************************************************************
2 * copy.c: Fast YV12/NV12 copy
3 *****************************************************************************
4 * Copyright (C) 2010 Laurent Aimar
5 * $Id$
7 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
8 * Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
29 #ifdef COPY_TEST
30 # undef NDEBUG
31 #endif
33 #include <vlc_common.h>
34 #include <vlc_picture.h>
35 #include <vlc_cpu.h>
36 #include <assert.h>
38 #include "copy.h"
39 static void CopyPlane(uint8_t *dst, size_t dst_pitch,
40 const uint8_t *src, size_t src_pitch,
41 unsigned height, int bitshift);
/* Debug-build argument checks used by the public copy entry points.
 * They expand in the caller's scope and therefore rely on the caller's
 * `dst`, `src`, `src_pitch` and `height` identifiers. */
#define ASSERT_PLANE(i) \
    assert(src[i]); \
    assert(src_pitch[i])

/* Destination picture, two source planes and a non-zero height. */
#define ASSERT_2PLANES \
    assert(dst); \
    ASSERT_PLANE(0); \
    ASSERT_PLANE(1); \
    assert(height)

/* Same as ASSERT_2PLANES plus the third source plane. */
#define ASSERT_3PLANES \
    ASSERT_2PLANES; \
    ASSERT_PLANE(2)
55 int CopyInitCache(copy_cache_t *cache, unsigned width)
57 #ifdef CAN_COMPILE_SSE2
58 cache->size = __MAX((width + 0x3f) & ~ 0x3f, 16384);
59 cache->buffer = aligned_alloc(64, cache->size);
60 if (!cache->buffer)
61 return VLC_EGENERIC;
62 #else
63 (void) cache; (void) width;
64 #endif
65 return VLC_SUCCESS;
68 void CopyCleanCache(copy_cache_t *cache)
70 #ifdef CAN_COMPILE_SSE2
71 aligned_free(cache->buffer);
72 cache->buffer = NULL;
73 cache->size = 0;
74 #else
75 (void) cache;
76 #endif
79 #ifdef CAN_COMPILE_SSE2
/* Copy 16/64 bytes from srcp to dstp, loading data with the SSE>=2
 * instruction named by `load` and storing it with the SSE>=2 instruction
 * named by `store` (both are mnemonic string literals chosen by the caller).
 *
 * The *_S variants take an extra `shiftstr` string that is pasted between the
 * loads and the stores; COPY16_SHIFTR/L and COPY64_SHIFTR/L build such strings
 * from psrlw/psllw applied to xmm1 (16-byte form) or xmm1..xmm4 (64-byte
 * form) with the immediate `x`. COPY16/COPY64 are the no-shift forms.
 */
84 #define COPY16_SHIFTR(x) \
85 "psrlw "x", %%xmm1\n"
86 #define COPY16_SHIFTL(x) \
87 "psllw "x", %%xmm1\n"
89 #define COPY16_S(dstp, srcp, load, store, shiftstr) \
90 asm volatile ( \
91 load " 0(%[src]), %%xmm1\n" \
92 shiftstr \
93 store " %%xmm1, 0(%[dst])\n" \
94 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")
96 #define COPY16(dstp, srcp, load, store) COPY16_S(dstp, srcp, load, store, "")
98 #define COPY64_SHIFTR(x) \
99 "psrlw "x", %%xmm1\n" \
100 "psrlw "x", %%xmm2\n" \
101 "psrlw "x", %%xmm3\n" \
102 "psrlw "x", %%xmm4\n"
103 #define COPY64_SHIFTL(x) \
104 "psllw "x", %%xmm1\n" \
105 "psllw "x", %%xmm2\n" \
106 "psllw "x", %%xmm3\n" \
107 "psllw "x", %%xmm4\n"
109 #define COPY64_S(dstp, srcp, load, store, shiftstr) \
110 asm volatile ( \
111 load " 0(%[src]), %%xmm1\n" \
112 load " 16(%[src]), %%xmm2\n" \
113 load " 32(%[src]), %%xmm3\n" \
114 load " 48(%[src]), %%xmm4\n" \
115 shiftstr \
116 store " %%xmm1, 0(%[dst])\n" \
117 store " %%xmm2, 16(%[dst])\n" \
118 store " %%xmm3, 32(%[dst])\n" \
119 store " %%xmm4, 48(%[dst])\n" \
120 : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
122 #define COPY64(dstp, srcp, load, store) \
123 COPY64_S(dstp, srcp, load, store, "")
/* When COPY_TEST_NOOPTIM is defined, every CPU feature check used in this
 * file is replaced by a constant 0, so the code paths guarded by those checks
 * are never taken. */
125 #ifdef COPY_TEST_NOOPTIM
126 # undef vlc_CPU_SSE4_1
127 # define vlc_CPU_SSE4_1() (0)
128 # undef vlc_CPU_SSE3
129 # define vlc_CPU_SSE3() (0)
130 # undef vlc_CPU_SSSE3
131 # define vlc_CPU_SSSE3() (0)
132 # undef vlc_CPU_SSE2
133 # define vlc_CPU_SSE2() (0)
134 #endif
136 /* Optimized copy from "Uncacheable Speculative Write Combining" memory
137 * as used by some video surface.
138 * XXX It is really efficient only when SSE4.1 is available.
140 VLC_SSE
141 static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
142 const uint8_t *src, size_t src_pitch,
143 unsigned width, unsigned height, int bitshift)
145 assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
147 asm volatile ("mfence");
149 #define SSE_USWC_COPY(shiftstr16, shiftstr64) \
150 for (unsigned y = 0; y < height; y++) { \
151 const unsigned unaligned = (-(uintptr_t)src) & 0x0f; \
152 unsigned x = unaligned; \
153 if (vlc_CPU_SSE4_1()) { \
154 if (!unaligned) { \
155 for (; x+63 < width; x += 64) \
156 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqa", shiftstr64); \
157 } else { \
158 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
159 for (; x+63 < width; x += 64) \
160 COPY64_S(&dst[x], &src[x], "movntdqa", "movdqu", shiftstr64); \
162 } else { \
163 if (!unaligned) { \
164 for (; x+63 < width; x += 64) \
165 COPY64_S(&dst[x], &src[x], "movdqa", "movdqa", shiftstr64); \
166 } else { \
167 COPY16_S(dst, src, "movdqu", "movdqa", shiftstr16); \
168 for (; x+63 < width; x += 64) \
169 COPY64_S(&dst[x], &src[x], "movdqa", "movdqu", shiftstr64); \
172 /* The following should not happen since buffers are generally well aligned */ \
173 if (x < width) \
174 CopyPlane(&dst[x], dst_pitch - x, &src[x], src_pitch - x, 1, bitshift); \
175 src += src_pitch; \
176 dst += dst_pitch; \
179 switch (bitshift)
181 case 0:
182 SSE_USWC_COPY("", "")
183 break;
184 case -6:
185 SSE_USWC_COPY(COPY16_SHIFTL("$6"), COPY64_SHIFTL("$6"))
186 break;
187 case 6:
188 SSE_USWC_COPY(COPY16_SHIFTR("$6"), COPY64_SHIFTR("$6"))
189 break;
190 case 2:
191 SSE_USWC_COPY(COPY16_SHIFTR("$2"), COPY64_SHIFTR("$2"))
192 break;
193 case -2:
194 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
195 break;
196 case 4:
197 SSE_USWC_COPY(COPY16_SHIFTR("$4"), COPY64_SHIFTR("$4"))
198 break;
199 case -4:
200 SSE_USWC_COPY(COPY16_SHIFTL("$2"), COPY64_SHIFTL("$2"))
201 break;
202 default:
203 vlc_assert_unreachable();
205 #undef SSE_USWC_COPY
207 asm volatile ("mfence");
210 VLC_SSE
211 static void Copy2d(uint8_t *dst, size_t dst_pitch,
212 const uint8_t *src, size_t src_pitch,
213 unsigned width, unsigned height)
215 assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
217 for (unsigned y = 0; y < height; y++) {
218 unsigned x = 0;
220 bool unaligned = ((intptr_t)dst & 0x0f) != 0;
221 if (!unaligned) {
222 for (; x+63 < width; x += 64)
223 COPY64(&dst[x], &src[x], "movdqa", "movntdq");
224 } else {
225 for (; x+63 < width; x += 64)
226 COPY64(&dst[x], &src[x], "movdqa", "movdqu");
229 for (; x < width; x++)
230 dst[x] = src[x];
232 src += src_pitch;
233 dst += dst_pitch;
/* Interleaves two planes (srcu, srcv) into one (dst), row by row.
 *
 * srcu/srcv and their pitches must be 16-byte aligned (asserted).
 * `width` is the number of bytes taken from each source row; each asm
 * iteration consumes 32 bytes from both sources and stores 64 bytes at
 * dst + 2*x. pixel_size selects the interleave granularity through the
 * shuffle table: single bytes (shuffle_8) or byte pairs (shuffle_16).
 *
 * With SSSE3 compiled in and available at run time, pshufb performs the
 * interleave. The fallback uses punpcklbw, i.e. a byte interleave only, hence
 * its assert(pixel_size == 1).
 * Bytes beyond the last full 32-byte group are handled by the scalar loops at
 * the end of each row. */
237 VLC_SSE
238 static void
239 SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
240 uint8_t *srcu, size_t srcu_pitch,
241 uint8_t *srcv, size_t srcv_pitch,
242 unsigned int width, unsigned int height, uint8_t pixel_size)
244 assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
245 !((intptr_t)srcv & 0xf) && !(srcv_pitch & 0x0f));
247 static const uint8_t shuffle_8[] = { 0, 8,
248 1, 9,
249 2, 10,
250 3, 11,
251 4, 12,
252 5, 13,
253 6, 14,
254 7, 15 };
255 static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
256 2, 3, 10, 11,
257 4, 5, 12, 13,
258 6, 7, 14, 15 };
259 const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
261 for (unsigned int y = 0; y < height; ++y)
263 unsigned int x;
265 #define LOAD2X32 \
266 "movhpd 0x00(%[src2]), %%xmm0\n" \
267 "movlpd 0x00(%[src1]), %%xmm0\n" \
269 "movhpd 0x08(%[src2]), %%xmm1\n" \
270 "movlpd 0x08(%[src1]), %%xmm1\n" \
272 "movhpd 0x10(%[src2]), %%xmm2\n" \
273 "movlpd 0x10(%[src1]), %%xmm2\n" \
275 "movhpd 0x18(%[src2]), %%xmm3\n" \
276 "movlpd 0x18(%[src1]), %%xmm3\n"
278 #define STORE64 \
279 "movdqu %%xmm0, 0x00(%[dst])\n" \
280 "movdqu %%xmm1, 0x10(%[dst])\n" \
281 "movdqu %%xmm2, 0x20(%[dst])\n" \
282 "movdqu %%xmm3, 0x30(%[dst])\n"
284 #ifdef CAN_COMPILE_SSSE3
285 if (vlc_CPU_SSSE3())
286 for (x = 0; x < (width & ~31); x += 32)
287 asm volatile
289 "movdqu (%[shuffle]), %%xmm7\n"
290 LOAD2X32
291 "pshufb %%xmm7, %%xmm0\n"
292 "pshufb %%xmm7, %%xmm1\n"
293 "pshufb %%xmm7, %%xmm2\n"
294 "pshufb %%xmm7, %%xmm3\n"
295 STORE64
296 : : [dst]"r"(dst+2*x),
297 [src1]"r"(srcu+x), [src2]"r"(srcv+x),
298 [shuffle]"r"(shuffle)
299 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
301 else
302 #endif
305 assert(pixel_size == 1);
306 for (x = 0; x < (width & ~31); x += 32)
307 asm volatile
309 LOAD2X32
310 "movhlps %%xmm0, %%xmm4\n"
311 "punpcklbw %%xmm4, %%xmm0\n"
313 "movhlps %%xmm1, %%xmm4\n"
314 "punpcklbw %%xmm4, %%xmm1\n"
316 "movhlps %%xmm2, %%xmm4\n"
317 "punpcklbw %%xmm4, %%xmm2\n"
319 "movhlps %%xmm3, %%xmm4\n"
320 "punpcklbw %%xmm4, %%xmm3\n"
321 STORE64
322 : : [dst]"r"(dst+2*x),
323 [src1]"r"(srcu+x), [src2]"r"(srcv+x)
324 : "memory",
325 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm7"
328 #undef LOAD2X32
329 #undef STORE64
/* Scalar tail: one byte per sample, or two bytes per sample */
331 if (pixel_size == 1)
333 for (; x < width; x++) {
334 dst[2*x+0] = srcu[x];
335 dst[2*x+1] = srcv[x];
338 else
340 for (; x < width; x+= 2) {
341 dst[2*x+0] = srcu[x];
342 dst[2*x+1] = srcu[x + 1];
343 dst[2*x+2] = srcv[x];
344 dst[2*x+3] = srcv[x + 1];
347 srcu += srcu_pitch;
348 srcv += srcv_pitch;
349 dst += dst_pitch;
/* De-interleaves one plane (src) into two (dstu, dstv), row by row.
 *
 * src and src_pitch must be 16-byte aligned and pixel_size must be 1 or 2
 * (both asserted). `width` is the number of bytes written to each destination
 * row; each asm iteration reads 64 source bytes at src + 2*x and writes
 * 32 bytes to each destination.
 *
 * With SSSE3 compiled in and available at run time, pshufb gathers the
 * samples using shuffle_8 (single bytes) or shuffle_16 (byte pairs).
 * The fallback separates bytes with a 0x00ff word mask, shifts and packuswb;
 * it only handles single-byte samples, hence its assert(pixel_size == 1).
 * Note that in the fallback the asm operand named dst2 is bound to dstu and
 * dst1 to dstv.
 * Bytes beyond the last full group are handled by scalar loops. */
353 VLC_SSE
354 static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
355 uint8_t *dstv, size_t dstv_pitch,
356 const uint8_t *src, size_t src_pitch,
357 unsigned width, unsigned height, uint8_t pixel_size)
359 assert(pixel_size == 1 || pixel_size == 2);
360 assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
362 #define LOAD64 \
363 "movdqa 0(%[src]), %%xmm0\n" \
364 "movdqa 16(%[src]), %%xmm1\n" \
365 "movdqa 32(%[src]), %%xmm2\n" \
366 "movdqa 48(%[src]), %%xmm3\n"
368 #define STORE2X32 \
369 "movq %%xmm0, 0(%[dst1])\n" \
370 "movq %%xmm1, 8(%[dst1])\n" \
371 "movhpd %%xmm0, 0(%[dst2])\n" \
372 "movhpd %%xmm1, 8(%[dst2])\n" \
373 "movq %%xmm2, 16(%[dst1])\n" \
374 "movq %%xmm3, 24(%[dst1])\n" \
375 "movhpd %%xmm2, 16(%[dst2])\n" \
376 "movhpd %%xmm3, 24(%[dst2])\n"
378 #ifdef CAN_COMPILE_SSSE3
379 if (vlc_CPU_SSSE3())
381 static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
382 1, 3, 5, 7, 9, 11, 13, 15 };
383 static const uint8_t shuffle_16[] = { 0, 1, 4, 5, 8, 9, 12, 13,
384 2, 3, 6, 7, 10, 11, 14, 15 };
385 const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
386 for (unsigned y = 0; y < height; y++) {
387 unsigned x = 0;
388 for (; x < (width & ~31); x += 32) {
389 asm volatile (
390 "movdqu (%[shuffle]), %%xmm7\n"
391 LOAD64
392 "pshufb %%xmm7, %%xmm0\n"
393 "pshufb %%xmm7, %%xmm1\n"
394 "pshufb %%xmm7, %%xmm2\n"
395 "pshufb %%xmm7, %%xmm3\n"
396 STORE2X32
397 : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
/* Scalar tail: one byte per sample, or two bytes per sample */
399 if (pixel_size == 1)
401 for (; x < width; x++) {
402 dstu[x] = src[2*x+0];
403 dstv[x] = src[2*x+1];
406 else
408 for (; x < width; x+= 2) {
409 dstu[x] = src[2*x+0];
410 dstu[x+1] = src[2*x+1];
411 dstv[x] = src[2*x+2];
412 dstv[x+1] = src[2*x+3];
415 src += src_pitch;
416 dstu += dstu_pitch;
417 dstv += dstv_pitch;
419 } else
420 #endif
422 assert(pixel_size == 1);
423 static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
424 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
426 for (unsigned y = 0; y < height; y++)
428 unsigned x = 0;
429 for (; x < (width & ~31); x += 32) {
430 asm volatile (
431 "movdqu (%[mask]), %%xmm7\n"
432 LOAD64
433 "movdqa %%xmm0, %%xmm4\n"
434 "movdqa %%xmm1, %%xmm5\n"
435 "movdqa %%xmm2, %%xmm6\n"
436 "psrlw $8, %%xmm0\n"
437 "psrlw $8, %%xmm1\n"
438 "pand %%xmm7, %%xmm4\n"
439 "pand %%xmm7, %%xmm5\n"
440 "pand %%xmm7, %%xmm6\n"
441 "packuswb %%xmm4, %%xmm0\n"
442 "packuswb %%xmm5, %%xmm1\n"
443 "pand %%xmm3, %%xmm7\n"
444 "psrlw $8, %%xmm2\n"
445 "psrlw $8, %%xmm3\n"
446 "packuswb %%xmm6, %%xmm2\n"
447 "packuswb %%xmm7, %%xmm3\n"
448 STORE2X32
449 : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
451 for (; x < width; x++) {
452 dstu[x] = src[2*x+0];
453 dstv[x] = src[2*x+1];
455 src += src_pitch;
456 dstu += dstu_pitch;
457 dstv += dstv_pitch;
460 #undef STORE2X32
461 #undef LOAD64
/* Copies `height` rows of src_pitch bytes from src to dst, going through the
 * intermediate cache in blocks of as many rows as fit in it.
 * Without SSE4.1, an unshifted copy between equal pitches is a plain memcpy.
 * bitshift is forwarded to CopyFromUswc(). */
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned height, int bitshift)
{
    const unsigned w16 = (src_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    /* If SSE4.1: CopyFromUswc is faster than memcpy */
    if (!vlc_CPU_SSE4_1() && bitshift == 0 && src_pitch == dst_pitch) {
        memcpy(dst, src, src_pitch * height);
        return;
    }

    unsigned remaining = height;
    while (remaining > 0) {
        const unsigned hblock = remaining < hstep ? remaining : hstep;

        /* Source -> cache */
        CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock, bitshift);

        /* Cache -> destination */
        Copy2d(dst, dst_pitch, cache, w16, src_pitch, hblock);

        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
        remaining -= hblock;
    }
}
/* Interleaves the U and V source planes into dst, going through the cache.
 * The cache holds a block of U rows followed by the matching block of V rows,
 * so a block is limited to cache_size / (2 * w16) rows.
 * Both source pitches must be equal (asserted). */
static void
SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
                     const uint8_t *srcu, size_t srcu_pitch,
                     const uint8_t *srcv, size_t srcv_pitch,
                     uint8_t *cache, size_t cache_size,
                     unsigned int height, uint8_t pixel_size, int bitshift)
{
    assert(srcu_pitch == srcv_pitch);
    unsigned int const w16 = (srcu_pitch+15) & ~15;
    unsigned int const hstep = (cache_size) / (2*w16);
    assert(hstep > 0);

    unsigned int remaining = height;
    while (remaining > 0)
    {
        unsigned int const hblock = remaining < hstep ? remaining : hstep;
        uint8_t *const cache_u = cache;
        uint8_t *const cache_v = cache + w16 * hblock;

        /* Sources -> cache */
        CopyFromUswc(cache_u, w16, srcu, srcu_pitch, srcu_pitch, hblock, bitshift);
        CopyFromUswc(cache_v, w16, srcv, srcv_pitch, srcv_pitch, hblock, bitshift);

        /* Cache -> destination */
        SSE_InterleaveUV(dst, dst_pitch, cache_u, w16, cache_v, w16,
                         srcu_pitch, hblock, pixel_size);

        srcu += hblock * srcu_pitch;
        srcv += hblock * srcv_pitch;
        dst  += hblock * dst_pitch;
        remaining -= hblock;
    }
}
/* Splits the interleaved src plane into dstu and dstv, going through the
 * cache in blocks of as many rows as fit in it. Each destination row
 * receives src_pitch / 2 bytes. bitshift is forwarded to CopyFromUswc(). */
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned height, uint8_t pixel_size, int bitshift)
{
    const unsigned w16 = (src_pitch+15) & ~15;
    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);

    unsigned remaining = height;
    while (remaining > 0) {
        const unsigned hblock = remaining < hstep ? remaining : hstep;

        /* Source -> cache */
        CopyFromUswc(cache, w16, src, src_pitch, src_pitch, hblock, bitshift);

        /* Cache -> destinations */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, w16, src_pitch / 2, hblock, pixel_size);

        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
        remaining -= hblock;
    }
}
552 static void SSE_Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
553 const size_t src_pitch[static 3], unsigned height,
554 const copy_cache_t *cache)
556 for (unsigned n = 0; n < 3; n++) {
557 const unsigned d = n > 0 ? 2 : 1;
558 SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
559 src[n], src_pitch[n],
560 cache->buffer, cache->size,
561 (height+d-1)/d, 0);
563 asm volatile ("emms");
567 static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
568 const size_t src_pitch[static 2], unsigned height,
569 const copy_cache_t *cache)
571 SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
572 cache->buffer, cache->size, height, 0);
573 SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
574 cache->buffer, cache->size, height / 2, 0);
575 asm volatile ("emms");
578 static void
579 SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
580 const size_t src_pitch[static 2], unsigned int height,
581 uint8_t pixel_size, int bitshift, const copy_cache_t *cache)
583 SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
584 src[0], src_pitch[0], cache->buffer, cache->size, height, bitshift);
586 SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
587 dest->p[2].p_pixels, dest->p[2].i_pitch,
588 src[1], src_pitch[1], cache->buffer, cache->size,
589 height / 2, pixel_size, bitshift);
590 asm volatile ("emms");
593 static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
594 const size_t src_pitch[static 3],
595 unsigned height, uint8_t pixel_size,
596 int bitshift, const copy_cache_t *cache)
598 SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
599 cache->buffer, cache->size, height, bitshift);
600 SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
601 src[U_PLANE], src_pitch[U_PLANE],
602 src[V_PLANE], src_pitch[V_PLANE],
603 cache->buffer, cache->size, height / 2, pixel_size, bitshift);
604 asm volatile ("emms");
606 #undef COPY64
607 #endif /* CAN_COMPILE_SSE2 */
/* Plain C plane copy.
 *
 * Copies `height` rows of src_pitch bytes from src to dst.
 * With bitshift == 0 the bytes are copied unchanged (in a single memcpy when
 * both pitches are equal). Otherwise each row is treated as src_pitch / 2
 * 16-bit words which are shifted right by bitshift (bitshift > 0) or left by
 * -bitshift (bitshift < 0); the shift count is masked to 4 bits. */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height, int bitshift)
{
    if (bitshift == 0)
    {
        if (src_pitch == dst_pitch) {
            memcpy(dst, src, src_pitch * height);
            return;
        }
        for (unsigned row = 0; row < height; row++) {
            memcpy(dst, src, src_pitch);
            src += src_pitch;
            dst += dst_pitch;
        }
        return;
    }

    const size_t words = src_pitch / 2;
    for (unsigned row = 0; row < height; row++)
    {
        const uint16_t *in = (const uint16_t *) src;
        uint16_t *out = (uint16_t *) dst;

        if (bitshift > 0) {
            for (size_t i = 0; i < words; i++)
                out[i] = in[i] >> (bitshift & 0xf);
        } else {
            for (size_t i = 0; i < words; i++)
                out[i] = in[i] << ((-bitshift) & 0xf);
        }
        src += src_pitch;
        dst += dst_pitch;
    }
}
640 void CopyPacked(picture_t *dst, const uint8_t *src, const size_t src_pitch,
641 unsigned height, const copy_cache_t *cache)
643 assert(dst);
644 assert(src); assert(src_pitch);
645 assert(height);
647 if (vlc_CPU_SSE4_1())
648 SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
649 cache->buffer, cache->size, height, 0);
650 else
651 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src, src_pitch,
652 height, 0);
655 void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
656 const size_t src_pitch[static 2], unsigned height,
657 const copy_cache_t *cache)
659 ASSERT_2PLANES;
660 #ifdef CAN_COMPILE_SSE2
661 if (vlc_CPU_SSE2())
662 return SSE_Copy420_SP_to_SP(dst, src, src_pitch, height, cache);
663 #else
664 (void) cache;
665 #endif
667 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
668 src[0], src_pitch[0], height, 0);
669 CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
670 src[1], src_pitch[1], height/2, 0);
/* De-interleaving loops shared by the C split helpers.
 * They expand in the caller's scope and use (and advance) its `src`, `dstu`
 * and `dstv` pointers, and read `src_pitch`, `dstu_pitch`, `dstv_pitch` and
 * `height`. `type` is the sample type and `pitch_den` the number of source
 * bytes per output sample pair. */
#define SPLIT_PLANES(type, pitch_den) do { \
    for (unsigned sp_row = 0; sp_row < height; sp_row++) { \
        for (unsigned sp_i = 0; sp_i < src_pitch / pitch_den; sp_i++) { \
            ((type *) dstu)[sp_i] = ((const type *) src)[2*sp_i+0]; \
            ((type *) dstv)[sp_i] = ((const type *) src)[2*sp_i+1]; \
        } \
        src  += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

/* Same as SPLIT_PLANES, shifting every sample right by `bitshift`. */
#define SPLIT_PLANES_SHIFTR(type, pitch_den, bitshift) do { \
    for (unsigned sp_row = 0; sp_row < height; sp_row++) { \
        for (unsigned sp_i = 0; sp_i < src_pitch / pitch_den; sp_i++) { \
            ((type *) dstu)[sp_i] = (((const type *) src)[2*sp_i+0]) >> (bitshift); \
            ((type *) dstv)[sp_i] = (((const type *) src)[2*sp_i+1]) >> (bitshift); \
        } \
        src  += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)

/* Same as SPLIT_PLANES, shifting every sample left by `bitshift`. */
#define SPLIT_PLANES_SHIFTL(type, pitch_den, bitshift) do { \
    for (unsigned sp_row = 0; sp_row < height; sp_row++) { \
        for (unsigned sp_i = 0; sp_i < src_pitch / pitch_den; sp_i++) { \
            ((type *) dstu)[sp_i] = (((const type *) src)[2*sp_i+0]) << (bitshift); \
            ((type *) dstv)[sp_i] = (((const type *) src)[2*sp_i+1]) << (bitshift); \
        } \
        src  += src_pitch; \
        dstu += dstu_pitch; \
        dstv += dstv_pitch; \
    } \
} while(0)
/* Splits a byte-interleaved plane: even source bytes go to dstu, odd source
 * bytes to dstv. Each source row of src_pitch bytes yields src_pitch / 2
 * bytes per destination row. */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch, unsigned height)
{
    const size_t pairs = src_pitch / 2;

    for (unsigned row = 0; row < height; row++) {
        for (size_t i = 0; i < pairs; i++) {
            dstu[i] = src[2 * i];
            dstv[i] = src[2 * i + 1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
/* Splits a plane of interleaved 16-bit samples: even samples go to dstu, odd
 * samples to dstv. Each source row of src_pitch bytes yields src_pitch / 4
 * samples per destination row.
 * Samples are shifted right by bitshift when it is positive, left by
 * -bitshift when it is negative (shift count masked to 4 bits), and copied
 * unchanged when it is 0. */
static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
                          uint8_t *dstv, size_t dstv_pitch,
                          const uint8_t *src, size_t src_pitch, unsigned height,
                          int bitshift)
{
    const unsigned shift_right = bitshift > 0 ? (unsigned)(bitshift & 0xf) : 0;
    const unsigned shift_left  = bitshift < 0 ? (unsigned)((-bitshift) & 0xf) : 0;
    const size_t pairs = src_pitch / 4;

    for (unsigned row = 0; row < height; row++) {
        const uint16_t *in = (const uint16_t *) src;
        uint16_t *out_u = (uint16_t *) dstu;
        uint16_t *out_v = (uint16_t *) dstv;

        /* At most one of the two shift counts is non-zero */
        for (size_t i = 0; i < pairs; i++) {
            out_u[i] = (in[2 * i]     >> shift_right) << shift_left;
            out_v[i] = (in[2 * i + 1] >> shift_right) << shift_left;
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}
729 void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
730 const size_t src_pitch[static 2], unsigned height,
731 const copy_cache_t *cache)
733 ASSERT_2PLANES;
734 #ifdef CAN_COMPILE_SSE2
735 if (vlc_CPU_SSE2())
736 return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 1, 0, cache);
737 #else
738 VLC_UNUSED(cache);
739 #endif
741 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
742 src[0], src_pitch[0], height, 0);
743 SplitPlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
744 dst->p[2].p_pixels, dst->p[2].i_pitch,
745 src[1], src_pitch[1], height/2);
748 void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
749 const size_t src_pitch[static 2], unsigned height,
750 int bitshift, const copy_cache_t *cache)
752 ASSERT_2PLANES;
753 assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
755 #ifdef CAN_COMPILE_SSE3
756 if (vlc_CPU_SSSE3())
757 return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, 2, bitshift, cache);
758 #else
759 VLC_UNUSED(cache);
760 #endif
762 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
763 src[0], src_pitch[0], height, bitshift);
764 SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
765 dst->p[2].p_pixels, dst->p[2].i_pitch,
766 src[1], src_pitch[1], height/2, bitshift);
/* Interleaving loops shared by the C planar-to-semiplanar converters.
 * They expand in the caller's scope and use (and advance) its `dstUV`, `srcU`
 * and `srcV` pointers, and read `copy_lines`, `copy_pitch` and the three
 * `i_extra_pitch_*` values that are added to the pointers after each row. */
#define INTERLEAVE_UV() do { \
    for ( unsigned int uv_row = 0; uv_row < copy_lines; uv_row++ ) { \
        for ( unsigned int uv_col = 0; uv_col < copy_pitch; uv_col++ ) { \
            *dstUV++ = *srcU++; \
            *dstUV++ = *srcV++; \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)

/* Same as INTERLEAVE_UV, shifting every sample right by `bitshift`. */
#define INTERLEAVE_UV_SHIFTR(bitshift) do { \
    for ( unsigned int uv_row = 0; uv_row < copy_lines; uv_row++ ) { \
        for ( unsigned int uv_col = 0; uv_col < copy_pitch; uv_col++ ) { \
            *dstUV++ = (*srcU++) >> (bitshift); \
            *dstUV++ = (*srcV++) >> (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)

/* Same as INTERLEAVE_UV, shifting every sample left by `bitshift`. */
#define INTERLEAVE_UV_SHIFTL(bitshift) do { \
    for ( unsigned int uv_row = 0; uv_row < copy_lines; uv_row++ ) { \
        for ( unsigned int uv_col = 0; uv_col < copy_pitch; uv_col++ ) { \
            *dstUV++ = (*srcU++) << (bitshift); \
            *dstUV++ = (*srcV++) << (bitshift); \
        } \
        dstUV += i_extra_pitch_uv; \
        srcU  += i_extra_pitch_u; \
        srcV  += i_extra_pitch_v; \
    } \
}while(0)
805 void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
806 const size_t src_pitch[static 3], unsigned height,
807 const copy_cache_t *cache)
809 ASSERT_3PLANES;
810 #ifdef CAN_COMPILE_SSE2
811 if (vlc_CPU_SSE2())
812 return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 1, 0, cache);
813 #else
814 (void) cache;
815 #endif
817 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
818 src[0], src_pitch[0], height, 0);
820 const unsigned copy_lines = height / 2;
821 const unsigned copy_pitch = src_pitch[1];
823 const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
824 const int i_extra_pitch_u = src_pitch[U_PLANE] - copy_pitch;
825 const int i_extra_pitch_v = src_pitch[V_PLANE] - copy_pitch;
827 uint8_t *dstUV = dst->p[1].p_pixels;
828 const uint8_t *srcU = src[U_PLANE];
829 const uint8_t *srcV = src[V_PLANE];
830 INTERLEAVE_UV();
833 void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
834 const size_t src_pitch[static 3], unsigned height,
835 int bitshift, const copy_cache_t *cache)
837 ASSERT_3PLANES;
838 assert(bitshift >= -6 && bitshift <= 6 && (bitshift % 2 == 0));
839 #ifdef CAN_COMPILE_SSE2
840 if (vlc_CPU_SSSE3())
841 return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, 2, bitshift, cache);
842 #else
843 (void) cache;
844 #endif
846 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
847 src[0], src_pitch[0], height, bitshift);
849 const unsigned copy_lines = height / 2;
850 const unsigned copy_pitch = src_pitch[1] / 2;
852 const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
853 const int i_extra_pitch_u = src_pitch[U_PLANE] / 2 - copy_pitch;
854 const int i_extra_pitch_v = src_pitch[V_PLANE] / 2 - copy_pitch;
856 uint16_t *dstUV = (void*) dst->p[1].p_pixels;
857 const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
858 const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
860 if (bitshift == 0)
861 INTERLEAVE_UV();
862 else if (bitshift > 0)
863 INTERLEAVE_UV_SHIFTR(bitshift & 0xf);
864 else
865 INTERLEAVE_UV_SHIFTL((-bitshift) & 0xf);
868 void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
869 const size_t src_pitch[static 3],
870 unsigned height, const copy_cache_t *cache)
872 (void) cache;
874 const int i_extra_pitch_dst_y = (dst->p[0].i_pitch - src_pitch[0]) / 2;
875 const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
876 uint16_t *dstY = (uint16_t *) dst->p[0].p_pixels;
877 const uint16_t *srcY = (const uint16_t *) src[Y_PLANE];
878 for (unsigned y = 0; y < height; y++) {
879 for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
880 *dstY++ = *srcY++ << 6;
882 dstY += i_extra_pitch_dst_y;
883 srcY += i_extra_pitch_src_y;
886 const unsigned copy_lines = height / 2;
887 const unsigned copy_pitch = src_pitch[1] / 2;
889 const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
890 const int i_extra_pitch_u = src_pitch[U_PLANE] / 2 - copy_pitch;
891 const int i_extra_pitch_v = src_pitch[V_PLANE] / 2 - copy_pitch;
893 uint16_t *dstUV = (uint16_t *) dst->p[1].p_pixels;
894 const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
895 const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
896 for ( unsigned int line = 0; line < copy_lines; line++ )
898 for ( unsigned int col = 0; col < copy_pitch; col++ )
900 *dstUV++ = *srcU++ << 6;
901 *dstUV++ = *srcV++ << 6;
903 dstUV += i_extra_pitch_uv;
904 srcU += i_extra_pitch_u;
905 srcV += i_extra_pitch_v;
909 void Copy420_P_to_P(picture_t *dst, const uint8_t *src[static 3],
910 const size_t src_pitch[static 3], unsigned height,
911 const copy_cache_t *cache)
913 ASSERT_3PLANES;
914 #ifdef CAN_COMPILE_SSE2
915 if (vlc_CPU_SSE2())
916 return SSE_Copy420_P_to_P(dst, src, src_pitch, height, cache);
917 #else
918 (void) cache;
919 #endif
921 CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
922 src[0], src_pitch[0], height, 0);
923 CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
924 src[1], src_pitch[1], height / 2, 0);
925 CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
926 src[2], src_pitch[2], height / 2, 0);
929 int picture_UpdatePlanes(picture_t *picture, uint8_t *data, unsigned pitch)
931 /* fill in buffer info in first plane */
932 picture->p->p_pixels = data;
933 picture->p->i_pitch = pitch;
934 picture->p->i_lines = picture->format.i_height;
935 assert(picture->p->i_visible_pitch <= picture->p->i_pitch);
936 assert(picture->p->i_visible_lines <= picture->p->i_lines);
938 /* Fill chroma planes for biplanar YUV */
939 if (picture->format.i_chroma == VLC_CODEC_NV12 ||
940 picture->format.i_chroma == VLC_CODEC_NV21 ||
941 picture->format.i_chroma == VLC_CODEC_P010) {
943 for (int n = 1; n < picture->i_planes; n++) {
944 const plane_t *o = &picture->p[n-1];
945 plane_t *p = &picture->p[n];
947 p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
948 p->i_pitch = pitch;
949 p->i_lines = picture->format.i_height;
950 assert(p->i_visible_pitch <= p->i_pitch);
951 assert(p->i_visible_lines <= p->i_lines);
953 /* The dx/d3d buffer is always allocated as NV12 */
954 if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_NV12)) {
955 /* TODO : Swap NV21 UV planes to match NV12 */
956 return VLC_EGENERIC;
960 /* Fill chroma planes for planar YUV */
961 else
962 if (picture->format.i_chroma == VLC_CODEC_I420 ||
963 picture->format.i_chroma == VLC_CODEC_J420 ||
964 picture->format.i_chroma == VLC_CODEC_YV12) {
966 for (int n = 1; n < picture->i_planes; n++) {
967 const plane_t *o = &picture->p[n-1];
968 plane_t *p = &picture->p[n];
970 p->p_pixels = o->p_pixels + o->i_lines * o->i_pitch;
971 p->i_pitch = pitch / 2;
972 p->i_lines = picture->format.i_height / 2;
974 /* The dx/d3d buffer is always allocated as YV12 */
975 if (vlc_fourcc_AreUVPlanesSwapped(picture->format.i_chroma, VLC_CODEC_YV12))
976 picture_SwapUV( picture );
978 return VLC_SUCCESS;
981 #ifdef COPY_TEST
983 #include <vlc_picture.h>
/* One destination format of a test conversion: the destination chroma, the
 * bit shift handed to the converter, and the converter itself. main() calls
 * `conv` when bitshift is 0 and `conv16` (which takes the shift) otherwise. */
985 struct test_dst
987 vlc_fourcc_t chroma;
988 int bitshift;
989 union
991 void (*conv)(picture_t *, const uint8_t *[], const size_t [], unsigned,
992 const copy_cache_t *);
993 void (*conv16)(picture_t *, const uint8_t *[], const size_t [], unsigned, int,
994 const copy_cache_t *);
/* A source chroma together with the destinations to convert it to.
 * main() walks `dsts` until it meets an entry whose chroma is 0. */
998 struct test_conv
1000 vlc_fourcc_t src_chroma;
1001 struct test_dst dsts[3];
/* Conversions exercised by the test: 8-bit NV12 and I420 sources to both
 * I420 and NV12, and the 16-bit P010 <-> I420_10L pair with a shift of 6
 * (right for P010 -> I420_10L, left for the opposite direction). */
1004 static const struct test_conv convs[] = {
1005 { .src_chroma = VLC_CODEC_NV12,
1006 .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_SP_to_P },
1007 { VLC_CODEC_NV12, 0, .conv = Copy420_SP_to_SP } },
1009 { .src_chroma = VLC_CODEC_I420,
1010 .dsts = { { VLC_CODEC_I420, 0, .conv = Copy420_P_to_P },
1011 { VLC_CODEC_NV12, 0, .conv = Copy420_P_to_SP } },
1013 { .src_chroma = VLC_CODEC_P010,
1014 .dsts = { { VLC_CODEC_I420_10L, 6, .conv16 = Copy420_16_SP_to_P } },
1016 { .src_chroma = VLC_CODEC_I420_10L,
1017 .dsts = { { VLC_CODEC_P010, -6, .conv16 = Copy420_16_P_to_SP } },
1020 #define NB_CONVS ARRAY_SIZE(convs)
/* Picture dimensions for one test run: full size and visible size. */
1022 struct test_size
1024 int i_width;
1025 int i_height;
1026 int i_visible_width;
1027 int i_visible_height;
/* Sizes exercised by the test, including odd dimensions and cases where the
 * visible area is smaller than the full picture. */
1029 static const struct test_size sizes[] = {
1030 { 1, 1, 1, 1 },
1031 { 3, 3, 3, 3 },
1032 { 65, 39, 65, 39 },
1033 { 560, 369, 540, 350 },
1034 { 1274, 721, 1200, 720 },
1035 { 1920, 1088, 1920, 1080 },
1036 { 3840, 2160, 3840, 2160 },
1037 #if 0 /* too long */
1038 { 8192, 8192, 8192, 8192 },
1039 #endif
1041 #define NB_SIZES ARRAY_SIZE(sizes)
/* Fills (init == true) or verifies (init == false) the visible area of every
 * plane of pic with fixed per-plane colors.
 *
 * For 2-plane pictures, plane 1 is processed as interleaved UV pairs: each
 * element is one combined value holding both chroma colors. All other planes
 * use colors_P[plane index].
 * With 16-bit samples the colors are masked to dsc->pixel_bits, and shifted
 * left by 6 for VLC_CODEC_P010; only P010 and I420_10L are accepted there.
 * On a mismatch the offending position and values are printed to stderr and
 * the test aborts through assert(). */
1043 static void piccheck(picture_t *pic, const vlc_chroma_description_t *dsc,
1044 bool init)
1046 #define ASSERT_COLOR(good) do { \
1047 fprintf(stderr, "error: pixel doesn't match @ plane: %d: %d x %d: 0x%X vs 0x%X\n", i, x, y, *(--p), good); \
1048 assert(!"error: pixel doesn't match"); \
1049 } while(0)
1051 #define PICCHECK(type_u, type_uv, colors_P, color_UV, pitch_den) do { \
1052 for (int i = 0; i < pic->i_planes; ++i) \
1054 const struct plane_t *plane = &pic->p[i]; \
1055 for (int y = 0; y < plane->i_visible_lines; ++y) \
1057 if (pic->i_planes == 2 && i == 1) \
1059 type_uv *p = (type_uv *)&plane->p_pixels[y * plane->i_pitch]; \
1060 for (int x = 0; x < plane->i_visible_pitch / 2 / pitch_den; ++x) \
1061 if (init) \
1062 *(p++) = color_UV; \
1063 else if (*(p++) != color_UV) \
1064 ASSERT_COLOR(color_UV); \
1066 else \
1068 type_u *p = (type_u *) &plane->p_pixels[y * plane->i_pitch]; \
1069 for (int x = 0; x < plane->i_visible_pitch / pitch_den; ++x) \
1070 if (init) \
1071 *(p++) = colors_P[i]; \
1072 else if (*(p++) != colors_P[i]) \
1073 ASSERT_COLOR(colors_P[i]); \
1077 } while (0)
1079 assert(pic->i_planes == 2 || pic->i_planes == 3);
1080 assert(dsc->pixel_size == 1 || dsc->pixel_size == 2);
1082 if (dsc->pixel_size == 1)
1084 const uint8_t colors_8_P[3] = { 0x42, 0xF1, 0x36 };
1085 const uint16_t color_8_UV = ntoh16(0xF136);
1086 PICCHECK(uint8_t, uint16_t, colors_8_P, color_8_UV, 1);
1088 else
1090 const unsigned mask = (1 << dsc->pixel_bits) - 1;
1091 uint16_t colors_16_P[3] = { 0x1042 &mask, 0xF114 &mask, 0x3645 &mask};
1093 switch (pic->format.i_chroma)
1095 case VLC_CODEC_P010:
1096 for (size_t i = 0; i < 3; ++i)
1097 colors_16_P[i] <<= 6;
1098 break;
1099 case VLC_CODEC_I420_10L:
1100 break;
1101 default:
1102 vlc_assert_unreachable();
/* Combined UV element: V color in the high 16 bits, U color in the low ones */
1105 uint32_t color_16_UV = (colors_16_P[2] << 16) | colors_16_P[1];
1107 PICCHECK(uint16_t, uint32_t, colors_16_P, color_16_UV, 2);
1111 static void pic_rsc_destroy(picture_t *pic)
1113 for (unsigned i = 0; i < 3; i++)
1114 free(pic->p[i].p_pixels);
1115 free(pic);
1118 static picture_t *pic_new_unaligned(const video_format_t *fmt)
1120 /* Allocate a no-aligned picture in order to ease buffer overflow detection
1121 * from the source picture */
1122 const vlc_chroma_description_t *dsc = vlc_fourcc_GetChromaDescription(fmt->i_chroma);
1123 assert(dsc);
1124 picture_resource_t rsc = { .pf_destroy = pic_rsc_destroy };
1125 for (unsigned i = 0; i < dsc->plane_count; i++)
1127 rsc.p[i].i_lines = ((fmt->i_visible_height + 1) & ~ 1) * dsc->p[i].h.num / dsc->p[i].h.den;
1128 rsc.p[i].i_pitch = ((fmt->i_visible_width + 1) & ~ 1) * dsc->pixel_size * dsc->p[i].w.num / dsc->p[i].w.den;
1129 rsc.p[i].p_pixels = malloc(rsc.p[i].i_lines * rsc.p[i].i_pitch);
1130 assert(rsc.p[i].p_pixels);
1132 return picture_NewFromResource(fmt, &rsc);
/* Test driver: for every conversion in `convs` and every size in `sizes`,
 * builds a source picture filled with the reference colors, runs each
 * destination converter and checks the resulting picture with piccheck().
 * Returns 77 when SSE2 is not available (unless COPY_TEST_NOOPTIM is
 * defined), 0 otherwise; failed checks abort through assert(). */
1135 int main(void)
/* Ask for SIGALRM after 10 seconds so that a stuck run does not hang */
1137 alarm(10);
1139 #ifndef COPY_TEST_NOOPTIM
1140 if (!vlc_CPU_SSE2())
1142 fprintf(stderr, "WARNING: could not test SSE\n");
1143 return 77;
1145 #endif
1147 for (size_t i = 0; i < NB_CONVS; ++i)
1149 const struct test_conv *conv = &convs[i];
1151 for (size_t j = 0; j < NB_SIZES; ++j)
1153 const struct test_size *size = &sizes[j];
1155 const vlc_chroma_description_t *src_dsc =
1156 vlc_fourcc_GetChromaDescription(conv->src_chroma);
1157 assert(src_dsc);
1159 video_format_t fmt;
1160 video_format_Init(&fmt, 0);
1161 video_format_Setup(&fmt, conv->src_chroma,
1162 size->i_width, size->i_height,
1163 size->i_visible_width, size->i_visible_height,
1164 1, 1);
/* Source picture, filled with the reference colors */
1165 picture_t *src = pic_new_unaligned(&fmt);
1166 assert(src);
1167 piccheck(src, src_dsc, true);
1169 copy_cache_t cache;
1170 int ret = CopyInitCache(&cache, src->format.i_width
1171 * src_dsc->pixel_size);
1172 assert(ret == VLC_SUCCESS);
/* The destination list ends at the first entry with a zero chroma */
1174 for (size_t f = 0; conv->dsts[f].chroma != 0; ++f)
1176 const struct test_dst *test_dst= &conv->dsts[f];
1178 const vlc_chroma_description_t *dst_dsc =
1179 vlc_fourcc_GetChromaDescription(test_dst->chroma);
1180 assert(dst_dsc);
1181 fmt.i_chroma = test_dst->chroma;
1182 picture_t *dst = picture_NewFromFormat(&fmt);
1183 assert(dst);
1185 const uint8_t * src_planes[3] = { src->p[Y_PLANE].p_pixels,
1186 src->p[U_PLANE].p_pixels,
1187 src->p[V_PLANE].p_pixels };
1188 const size_t src_pitches[3] = { src->p[Y_PLANE].i_pitch,
1189 src->p[U_PLANE].i_pitch,
1190 src->p[V_PLANE].i_pitch };
1192 fprintf(stderr, "testing: %u x %u (vis: %u x %u) %4.4s -> %4.4s\n",
1193 size->i_width, size->i_height,
1194 size->i_visible_width, size->i_visible_height,
1195 (const char *) &src->format.i_chroma,
1196 (const char *) &dst->format.i_chroma);
/* A zero shift selects the plain converter, anything else the 16-bit one */
1197 if (test_dst->bitshift == 0)
1198 test_dst->conv(dst, src_planes, src_pitches,
1199 src->format.i_visible_height, &cache);
1200 else
1201 test_dst->conv16(dst, src_planes, src_pitches,
1202 src->format.i_visible_height, test_dst->bitshift,
1203 &cache);
1204 piccheck(dst, dst_dsc, false);
1205 picture_Release(dst);
1207 picture_Release(src);
1208 CopyCleanCache(&cache);
1211 return 0;
1214 #endif