1 /*****************************************************************************
2 * startcode_helper.h: Startcodes helpers
3 *****************************************************************************
4 * Copyright (C) 2016 VideoLAN Authors
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU Lesser General Public License as published by
8 * the Free Software Foundation; either version 2.1 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this program; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
19 *****************************************************************************/
20 #ifndef VLC_STARTCODE_HELPER_H_
21 #define VLC_STARTCODE_HELPER_H_
25 #if !defined(CAN_COMPILE_SSE2) && defined(HAVE_SSE2_INTRINSICS)
26 #include <emmintrin.h>
29 /* Efficiently searches for an Annex B start code (0x00 0x00 0x01)
30 * using a trick that is 4 times faster than a single-byte lookup. */
/* TRY_MATCH(p, a): compares bytes of `p` inside the window that starts at
 * index `a` (offsets a+0 through a+5 are read) against the byte values of a
 * start code: each test pairs a 0x00 byte with a 0x01 byte at a higher offset.
 * Both arguments are pasted unparenthesized into the index expressions, so
 * only plain identifiers or integer constants should be passed. */
32 #define TRY_MATCH(p,a) {\
34 if (p[a+0] == 0 && p[a+2] == 1)\
36 if (p[a+2] == 0 && p[a+3] == 1)\
40 if (p[a+2] == 0 && p[a+4] == 1)\
42 if (p[a+4] == 0 && p[a+5] == 1)\
47 #if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
/* SSE2 search for the 0x00 0x00 0x01 byte pattern between p and end.
 *
 * The range is processed in three phases:
 *   1. byte by byte until p reaches a 16-byte boundary,
 *   2. 16 bytes at a time with aligned loads, building a bitmask of the
 *      bytes that are zero (inline assembly when CAN_COMPILE_SSE2 is
 *      defined, intrinsics otherwise),
 *   3. byte by byte over the remaining tail.
 *
 * p:   first byte to examine.
 * end: upper bound of the search; it is reduced by 3 internally so that
 *      reading p[2] stays below the caller's end.
 * NOTE(review): the value returned on a hit and on a miss should be
 * confirmed against the callers. */
49 __attribute__ ((__target__ ("sse2")))
50 static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p
, const uint8_t *end
)
52 /* First align to 16 */
53 /* Skipping this step and doing unaligned loads isn't faster */
/* Next 16-byte boundary strictly after p (p + 16 when p is already aligned). */
54 const uint8_t *alignedend
= p
+ 16 - ((intptr_t)p
& 15);
/* Scalar head: stop at the boundary or at the reduced end, whichever
 * comes first. */
55 for (end
-= 3; p
< alignedend
&& p
< end
; p
++) {
56 if (p
[0] == 0 && p
[1] == 0 && p
[2] == 1)
/* Round the (already reduced) end down to a 16-byte boundary: this is the
 * upper limit for the aligned 16-byte loads below. */
63 alignedend
= end
- ((intptr_t) end
& 15);
66 #ifdef CAN_COMPILE_SSE2
/* Inline-assembly path: clear xmm1 so it can serve as the all-zero
 * comparand. */
68 "pxor %%xmm1, %%xmm1\n"
/* Intrinsics path: all-zero vector used as the comparand. */
72 __m128i zeros
= _mm_set1_epi8( 0x00 );
/* Aligned middle part, 16 bytes per iteration. */
74 for( ; p
< alignedend
; p
+= 16)
77 #ifdef CAN_COMPILE_SSE2
/* Aligned 16-byte load, byte-wise equality test against the zeroed xmm1,
 * then the per-byte results are packed into the integer `match`. */
79 "movdqa 0(%[v]), %%xmm0\n"
80 "pcmpeqb %%xmm1, %%xmm0\n"
81 "pmovmskb %%xmm0, %[match]\n"
/* Same three steps expressed with intrinsics. */
87 __m128i v
= _mm_load_si128((__m128i
*)p
);
88 __m128i res
= _mm_cmpeq_epi8( zeros
, v
);
89 match
= _mm_movemask_epi8( res
); /* mask will be in reversed match order */
/* Scalar tail: whatever remains before the reduced end. */
102 for (; p
< end
; p
++) {
103 if (p
[0] == 0 && p
[1] == 0 && p
[2] == 1)
112 /* This code is adapted from libav's ff_avc_find_startcode_internal
113 * and I believe the trick originated from
114 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
/* Search for the 0x00 0x00 0x01 byte pattern between p and end.
 *
 * Builds with SSE2 support hand the work to startcode_FindAnnexB_SSE2.
 * The scalar path works in three phases: byte by byte up to a 4-byte
 * boundary, then one 32-bit word at a time (only words containing a zero
 * byte are inspected further), then byte by byte over the tail.
 *
 * p:   first byte to examine.
 * end: upper bound of the search; adjusted internally (see below).
 * NOTE(review): the value returned on a hit and on a miss should be
 * confirmed against the callers. */
116 static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p
, const uint8_t *end
)
118 #if defined(CAN_COMPILE_SSE2) || defined(HAVE_SSE2_INTRINSICS)
/* SSE2 support compiled in: delegate to the vectorized variant. */
120 return startcode_FindAnnexB_SSE2(p
, end
);
/* Next 4-byte boundary strictly after p (p + 4 when p is already aligned). */
122 const uint8_t *a
= p
+ 4 - ((intptr_t)p
& 3);
/* Scalar head. end is lowered by 3 so that reading p[2] stays below the
 * caller's end. */
124 for (end
-= 3; p
< a
&& p
< end
; p
++) {
125 if (p
[0] == 0 && p
[1] == 0 && p
[2] == 1)
/* Word loop. end is lowered by 3 a second time here, i.e. it now sits
 * 6 bytes below the caller's end. */
129 for (end
-= 3; p
< end
; p
+= 4) {
130 uint32_t x
= *(const uint32_t*)p
;
/* "Zero in word" test: the expression is non-zero when at least one of
 * the four bytes of x is 0x00, so words without any zero byte are
 * rejected with a single comparison. */
131 if ((x
- 0x01010101) & (~x
) & 0x80808080)
133 /* matching DW isn't faster */
/* Scalar tail. end += 3 undoes one of the two reductions, leaving end
 * 3 bytes below the caller's end, as in the head loop. */
138 for (end
+= 3; p
< end
; p
++) {
139 if (p
[0] == 0 && p
[1] == 0 && p
[2] == 1)
146 /* Variant of startcode_FindAnnexB that also reports a start code prefix
 * which is not followed by any data.
 *
 * p:   first byte to examine.
 * end: upper bound of the search.
 * A buffer of exactly three bytes holding 0x00 0x00 0x01 is recognised
 * directly; the final else branch forwards to startcode_FindAnnexB. */
147 static inline const uint8_t * startcode_FindAnyAnnexB( const uint8_t *p
, const uint8_t *end
)
/* Number of bytes between p and end. */
149 size_t i_size
= end
- p
;
/* Exactly three bytes forming a bare 0x00 0x00 0x01 prefix. */
156 else if ( i_size
== 3 && p
[0] == 0 && p
[1] == 0 && p
[2] == 1 )
/* All remaining cases go through the generic search. */
160 else return startcode_FindAnnexB( p
, end
);