1 /*****************************************************************************
2 * cpu.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Jason Garrett-Glaser <darkshikari@gmail.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #if defined(HAVE_PTHREAD) && defined(SYS_LINUX)
30 #include <kernel/OS.h>
32 #if defined(SYS_MACOSX) || defined(SYS_FREEBSD)
33 #include <sys/types.h>
34 #include <sys/sysctl.h>
40 const x264_cpu_name_t x264_cpu_names
[] = {
41 {"Altivec", X264_CPU_ALTIVEC
},
42 // {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
43 {"MMX2", X264_CPU_MMX
|X264_CPU_MMXEXT
},
44 {"MMXEXT", X264_CPU_MMX
|X264_CPU_MMXEXT
},
45 // {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
46 {"SSE2Slow",X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE2_IS_SLOW
},
47 {"SSE2", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
},
48 {"SSE2Fast",X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE2_IS_FAST
},
49 {"SSE3", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE3
},
50 {"SSSE3", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE3
|X264_CPU_SSSE3
},
51 {"PHADD", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE3
|X264_CPU_SSSE3
|X264_CPU_PHADD_IS_FAST
},
52 {"SSE4.1", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE3
|X264_CPU_SSSE3
|X264_CPU_SSE4
},
53 {"SSE4.2", X264_CPU_MMX
|X264_CPU_MMXEXT
|X264_CPU_SSE
|X264_CPU_SSE2
|X264_CPU_SSE3
|X264_CPU_SSSE3
|X264_CPU_SSE4
|X264_CPU_SSE42
},
54 {"Cache32", X264_CPU_CACHELINE_32
},
55 {"Cache64", X264_CPU_CACHELINE_64
},
56 {"SSEMisalign", X264_CPU_SSE_MISALIGN
},
57 {"Slow_mod4_stack", X264_CPU_STACK_MOD4
},
/*
 * CPUID helpers. Only the prototypes live in this file; the definitions
 * are provided elsewhere (presumably the x86 assembly sources -- confirm).
 */

/* x264_cpu_detect() checks this for a non-zero result before it issues
 * any x264_cpu_cpuid() query. */
extern int x264_cpu_cpuid_test( void );

/* Query CPUID leaf `op`; the resulting register values are handed back
 * through the eax/ebx/ecx/edx pointers.  Callers in this file discard
 * the return value. */
extern uint32_t x264_cpu_cpuid( uint32_t op,
                                uint32_t *eax,
                                uint32_t *ebx,
                                uint32_t *ecx,
                                uint32_t *edx );
66 uint32_t x264_cpu_detect( void )
69 uint32_t eax
, ebx
, ecx
, edx
;
70 uint32_t vendor
[4] = {0};
75 if( !x264_cpu_cpuid_test() )
79 x264_cpu_cpuid( 0, &eax
, vendor
+0, vendor
+2, vendor
+1 );
83 x264_cpu_cpuid( 1, &eax
, &ebx
, &ecx
, &edx
);
89 cpu
|= X264_CPU_MMXEXT
|X264_CPU_SSE
;
95 cpu
|= X264_CPU_SSSE3
;
99 cpu
|= X264_CPU_SSE42
;
101 if( cpu
& X264_CPU_SSSE3
)
102 cpu
|= X264_CPU_SSE2_IS_FAST
;
103 if( cpu
& X264_CPU_SSE4
)
104 cpu
|= X264_CPU_PHADD_IS_FAST
;
106 x264_cpu_cpuid( 0x80000000, &eax
, &ebx
, &ecx
, &edx
);
107 max_extended_cap
= eax
;
109 if( !strcmp((char*)vendor
, "AuthenticAMD") && max_extended_cap
>= 0x80000001 )
111 x264_cpu_cpuid( 0x80000001, &eax
, &ebx
, &ecx
, &edx
);
113 cpu
|= X264_CPU_MMXEXT
;
114 if( cpu
& X264_CPU_SSE2
)
116 if( ecx
&0x00000040 ) /* SSE4a */
118 cpu
|= X264_CPU_SSE2_IS_FAST
;
119 cpu
|= X264_CPU_SSE_MISALIGN
;
120 x264_cpu_mask_misalign_sse();
123 cpu
|= X264_CPU_SSE2_IS_SLOW
;
127 if( !strcmp((char*)vendor
, "GenuineIntel") )
129 int family
, model
, stepping
;
130 x264_cpu_cpuid( 1, &eax
, &ebx
, &ecx
, &edx
);
131 family
= ((eax
>>8)&0xf) + ((eax
>>20)&0xff);
132 model
= ((eax
>>4)&0xf) + ((eax
>>12)&0xf0);
134 /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
135 * theoretically support sse2, but it's significantly slower than mmx for
136 * almost all of x264's functions, so let's just pretend they don't. */
137 if( family
==6 && (model
==9 || model
==13 || model
==14) )
139 cpu
&= ~(X264_CPU_SSE2
|X264_CPU_SSE3
);
140 assert(!(cpu
&(X264_CPU_SSSE3
|X264_CPU_SSE4
)));
144 if( (!strcmp((char*)vendor
, "GenuineIntel") || !strcmp((char*)vendor
, "CyrixInstead")) && !(cpu
&X264_CPU_SSE42
))
146 /* cacheline size is specified in 3 places, any of which may be missing */
147 x264_cpu_cpuid( 1, &eax
, &ebx
, &ecx
, &edx
);
148 cache
= (ebx
&0xff00)>>5; // CLFLUSH line size: EBX[15:8] counts 8-byte units, hence >>5 rather than >>8
149 if( !cache
&& max_extended_cap
>= 0x80000006 )
151 x264_cpu_cpuid( 0x80000006, &eax
, &ebx
, &ecx
, &edx
);
152 cache
= ecx
&0xff; // cacheline size
156 // Cache and TLB Information
157 static const char cache32_ids
[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
158 static const char cache64_ids
[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
162 x264_cpu_cpuid( 2, buf
+0, buf
+1, buf
+2, buf
+3 );
169 if( strchr( cache32_ids
, buf
[j
]&0xff ) )
171 if( strchr( cache64_ids
, buf
[j
]&0xff ) )
175 } while( ++i
< max
);
179 cpu
|= X264_CPU_CACHELINE_32
;
180 else if( cache
== 64 )
181 cpu
|= X264_CPU_CACHELINE_64
;
183 fprintf( stderr
, "x264 [warning]: unable to determine cacheline size\n" );
186 #ifdef BROKEN_STACK_ALIGNMENT
187 cpu
|= X264_CPU_STACK_MOD4
;
193 #elif defined( ARCH_PPC )
196 #include <sys/sysctl.h>
197 uint32_t x264_cpu_detect( void )
201 int selectors
[2] = { CTL_HW
, HW_VECTORUNIT
};
203 size_t length
= sizeof( has_altivec
);
204 int error
= sysctl( selectors
, 2, &has_altivec
, &length
, NULL
, 0 );
206 if( error
== 0 && has_altivec
!= 0 )
208 cpu
|= X264_CPU_ALTIVEC
;
214 #elif defined( SYS_LINUX )
217 static sigjmp_buf jmpbuf
;
218 static volatile sig_atomic_t canjump
= 0;
220 static void sigill_handler( int sig
)
224 signal( sig
, SIG_DFL
);
229 siglongjmp( jmpbuf
, 1 );
232 uint32_t x264_cpu_detect( void )
234 static void (* oldsig
)( int );
236 oldsig
= signal( SIGILL
, sigill_handler
);
237 if( sigsetjmp( jmpbuf
, 1 ) )
239 signal( SIGILL
, oldsig
);
244 asm volatile( "mtspr 256, %0\n\t"
250 signal( SIGILL
, oldsig
);
252 return X264_CPU_ALTIVEC
;
258 uint32_t x264_cpu_detect( void )
266 void x264_emms( void )
272 int x264_cpu_num_processors( void )
274 #if !defined(HAVE_PTHREAD)
277 #elif defined(_WIN32)
278 return pthread_num_processors_np();
280 #elif defined(SYS_LINUX)
284 memset( &p_aff
, 0, sizeof(p_aff
) );
285 sched_getaffinity( 0, sizeof(p_aff
), &p_aff
);
286 for( np
= 0, bit
= 0; bit
< sizeof(p_aff
); bit
++ )
287 np
+= (((uint8_t *)&p_aff
)[bit
/ 8] >> (bit
% 8)) & 1;
290 #elif defined(SYS_BEOS)
292 get_system_info( &info
);
293 return info
.cpu_count
;
295 #elif defined(SYS_MACOSX) || defined(SYS_FREEBSD)
297 size_t length
= sizeof( numberOfCPUs
);
298 if( sysctlbyname("hw.ncpu", &numberOfCPUs
, &length
, NULL
, 0) )