From 0435403c9d0c17c5de09b9a3e7e0d9b0002d422e Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 22 Sep 2007 05:54:03 +0000 Subject: [PATCH] * sysdeps/x86_64/cacheinfo.c (__x86_64_data_cache_size_half): Renamed from __x86_64_core_cache_size_half. (init_cacheinfo): Compute shared cache size for AMD processors with shared L3 correctly. * sysdeps/x86_64/memcpy.S: Adjust for __x86_64_data_cache_size_half name change. Patch in large parts by Evandro Menezes. --- ChangeLog | 10 ++ sysdeps/x86_64/cacheinfo.c | 76 +++++++--- sysdeps/x86_64/memcpy.S | 338 ++++++++++++++++++++++----------------------- 3 files changed, 237 insertions(+), 187 deletions(-) diff --git a/ChangeLog b/ChangeLog index 38453902d9..d0dcf629f9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2007-09-21 Ulrich Drepper + + * sysdeps/x86_64/cacheinfo.c (__x86_64_data_cache_size_half): Renamed + from __x86_64_core_cache_size_half. + (init_cacheinfo): Compute shared cache size for AMD processors with + shared L3 correctly. + * sysdeps/x86_64/memcpy.S: Adjust for __x86_64_data_cache_size_half + name change. + Patch in large parts by Evandro Menezes. + 2007-09-19 Ulrich Drepper * elf/dl-lookup.c (add_dependency): Handle failing memory diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index 793dc2d357..5b92bd5849 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -398,13 +398,13 @@ __cache_sysconf (int name) } -/* Half the core cache size for use in memory and string routines, typically - L1 size. */ -long int __x86_64_core_cache_size_half attribute_hidden = 32 * 1024 / 2; +/* Half the data cache size for use in memory and string routines, typically + L1 size. */ +long int __x86_64_data_cache_size_half attribute_hidden = 32 * 1024 / 2; /* Shared cache size for use in memory and string routines, typically - L2 or L3 size. */ + L2 or L3 size. */ long int __x86_64_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2; -/* PREFETCHW support flag for use in memory and string routines. */ +/* PREFETCHW support flag for use in memory and string routines. */ int __x86_64_prefetchw attribute_hidden; @@ -419,7 +419,7 @@ init_cacheinfo (void) unsigned int edx; int max_cpuid; int max_cpuid_ex; - long int core = -1; + long int data = -1; long int shared = -1; unsigned int level; unsigned int threads = 0; @@ -431,26 +431,26 @@ init_cacheinfo (void) /* This spells out "GenuineIntel". */ if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) { - core = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); + data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid); - /* Try L3 first. */ + /* Try L3 first. */ level = 3; shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid); if (shared <= 0) { - /* Try L2 otherwise. */ + /* Try L2 otherwise. */ level = 2; shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid); } /* Figure out the number of logical threads that share the - highest cache level. */ + highest cache level. */ if (max_cpuid >= 4) { int i = 0; - /* Query until desired cache level is enumerated. */ + /* Query until desired cache level is enumerated. */ do { asm volatile ("cpuid" @@ -463,7 +463,7 @@ init_cacheinfo (void) } else { - /* Assume that all logical threads share the highest cache level. */ + /* Assume that all logical threads share the highest cache level. */ asm volatile ("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1)); @@ -472,33 +472,73 @@ init_cacheinfo (void) } /* Cap usage of highest cache level to the number of supported - threads. */ + threads. */ if (shared > 0 && threads > 0) shared /= threads; } /* This spells out "AuthenticAMD". */ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) { - core = handle_amd (_SC_LEVEL1_DCACHE_SIZE); - shared = handle_amd (_SC_LEVEL2_CACHE_SIZE); + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE); + long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); + /* Get maximum extended function. */ asm volatile ("cpuid" : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0x80000000)); + if (shared <= 0) + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; + else + { + /* Figure out the number of logical threads that share L3. */ + if (max_cpuid_ex >= 0x80000008) + { + /* Get width of APIC ID. */ + asm volatile ("cpuid" + : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), + "=d" (edx) + : "0" (0x80000008)); + threads = 1 << ((ecx >> 12) & 0x0f); + } + + if (threads == 0) + { + /* If APIC ID width is not available, use logical + processor count. */ + asm volatile ("cpuid" + : "=a" (max_cpuid_ex), "=b" (ebx), "=c" (ecx), + "=d" (edx) + : "0" (0x00000001)); + + if ((edx & (1 << 28)) != 0) + threads = (ebx >> 16) & 0xff; + } + + /* Cap usage of highest cache level to the number of + supported threads. */ + if (threads > 0) + shared /= threads; + + /* Account for exclusive L2 and L3 caches. */ + shared += core; + } + if (max_cpuid_ex >= 0x80000001) { asm volatile ("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0x80000001)); - /* PREFETCHW || 3DNow! */ + /* PREFETCHW || 3DNow! */ if ((ecx & 0x100) || (edx & 0x80000000)) __x86_64_prefetchw = -1; } } - if (core > 0) - __x86_64_core_cache_size_half = core / 2; + if (data > 0) + __x86_64_data_cache_size_half = data / 2; if (shared > 0) __x86_64_shared_cache_size_half = shared / 2; diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S index 231329864f..b25646b8c5 100644 --- a/sysdeps/x86_64/memcpy.S +++ b/sysdeps/x86_64/memcpy.S @@ -114,15 +114,15 @@ L(1d): /* 16-byte loop */ .p2align 4 L(1loop): - movq (%rsi), %rcx - movq 8 (%rsi), %r8 - movq %rcx, (%rdi) - movq %r8, 8 (%rdi) + movq (%rsi), %rcx + movq 8(%rsi), %r8 + movq %rcx, (%rdi) + movq %r8, 8(%rdi) subl $16, %edx - leaq 16 (%rsi), %rsi - leaq 16 (%rdi), %rdi + leaq 16(%rsi), %rsi + leaq 16(%rdi), %rdi jnz L(1loop) @@ -140,19 +140,19 @@ L(exit): /* exit */ L(1after): #ifndef USE_AS_MEMPCPY - movq %rax, RETVAL (%rsp) /* save return value */ + movq %rax, RETVAL(%rsp) /* save return value */ #endif /* Align to the natural word size. */ L(aligntry): - movl %esi, %ecx /* align by destination */ + movl %esi, %ecx /* align by source */ andl $7, %ecx jz L(alignafter) /* already aligned */ L(align): /* align */ - leaq -8 (%rcx, %rdx), %rdx /* calculate remaining bytes */ + leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */ subl $8, %ecx .p2align 4 @@ -163,8 +163,8 @@ L(alignloop): /* 1-byte alignment loop */ incl %ecx - leaq 1 (%rsi), %rsi - leaq 1 (%rdi), %rdi + leaq 1(%rsi), %rsi + leaq 1(%rdi), %rdi jnz L(alignloop) @@ -172,7 +172,7 @@ L(alignloop): /* 1-byte alignment loop */ L(alignafter): -/* Loop to handle mid-sized blocks. */ +/* Handle mid-sized blocks. */ L(32try): /* up to 1KB */ cmpq $1024, %rdx @@ -188,15 +188,15 @@ L(32): /* 32-byte loop */ L(32loop): decl %ecx - movq (%rsi), %rax - movq 8 (%rsi), %r8 - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 - movq %rax, (%rdi) - movq %r8, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) leaq 32(%rsi), %rsi leaq 32(%rdi), %rdi @@ -205,18 +205,18 @@ L(32loop): decl %ecx - movq (%rsi), %rax - movq 8 (%rsi), %r8 - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 - movq %rax, (%rdi) - movq %r8, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) + movq %rax, (%rdi) + movq %r8, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) - leaq 32 (%rsi), %rsi - leaq 32 (%rdi), %rdi + leaq 32(%rsi), %rsi + leaq 32(%rdi), %rdi jnz L(32loop) @@ -229,9 +229,9 @@ L(32skip): movq %rdi, %rax #else - movq RETVAL (%rsp), %rax + movq RETVAL(%rsp), %rax jnz L(1) - + rep #endif retq /* exit */ @@ -245,11 +245,11 @@ L(32after): larger blocks are excluded when building for RTLD. */ -/* Handle large blocks smaller than 1/2 L1. */ +/* Handle blocks smaller than 1/2 L1. */ L(fasttry): /* first 1/2 L1 */ #ifndef NOT_IN_libc /* only up to this algorithm outside of libc.so */ - movq __x86_64_core_cache_size_half (%rip), %r11 + movq __x86_64_data_cache_size_half(%rip), %r11 cmpq %rdx, %r11 /* calculate the smaller of */ cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */ #endif @@ -282,7 +282,7 @@ L(fastskip): movq %rdi, %rax #else - movq RETVAL (%rsp), %rax + movq RETVAL(%rsp), %rax jnz L(1) rep @@ -308,16 +308,16 @@ L(pre): /* 64-byte with prefetching */ shrq $6, %rcx jz L(preskip) - movq %r14, SAVE0 (%rsp) + movq %r14, SAVE0(%rsp) cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1 (%rsp) + movq %r13, SAVE1(%rsp) cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2 (%rsp) + movq %r12, SAVE2(%rsp) cfi_rel_offset (%r12, SAVE2) - movq %rbx, SAVE3 (%rsp) + movq %rbx, SAVE3(%rsp) cfi_rel_offset (%rbx, SAVE3) - cmpl $0, __x86_64_prefetchw (%rip) + cmpl $0, __x86_64_prefetchw(%rip) jz L(preloop) /* check if PREFETCHW OK */ .p2align 4 @@ -339,45 +339,45 @@ L(prewloop): /* cache-line in state M */ prefetcht0 0 + 896 (%rsi) prefetcht0 64 + 896 (%rsi) - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) - movq %r11, 32 (%rdi) - movq %r12, 40 (%rdi) - movq %r13, 48 (%rdi) - movq %r14, 56 (%rdi) + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi jz L(prebail) decq %rcx - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) - movq %r11, 32 (%rdi) - movq %r12, 40 (%rdi) - movq %r13, 48 (%rdi) - movq %r14, 56 (%rdi) - - prefetchw 896 - 64 (%rdi) - prefetchw 896 - 0 (%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + prefetchw 896 - 64(%rdi) + prefetchw 896 - 0(%rdi) + + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi jnz L(prewloop) jmp L(prebail) @@ -389,26 +389,26 @@ L(prewloop): /* cache-line in state M */ L(preloop): /* cache-line in state E */ decq %rcx - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - prefetcht0 896 + 0 (%rsi) - prefetcht0 896 + 64 (%rsi) - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) - movq %r11, 32 (%rdi) - movq %r12, 40 (%rdi) - movq %r13, 48 (%rdi) - movq %r14, 56 (%rdi) + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + prefetcht0 896 + 0(%rsi) + prefetcht0 896 + 64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) leaq 64 (%rsi), %rsi leaq 64 (%rdi), %rdi @@ -417,40 +417,40 @@ L(preloop): /* cache-line in state E */ decq %rcx - movq (%rsi), %rax - movq 8 (%rsi), %rbx - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - prefetcht0 896 - 64 (%rdi) - prefetcht0 896 - 0 (%rdi) - - movq %rax, (%rdi) - movq %rbx, 8 (%rdi) - movq %r9, 16 (%rdi) - movq %r10, 24 (%rdi) - movq %r11, 32 (%rdi) - movq %r12, 40 (%rdi) - movq %r13, 48 (%rdi) - movq %r14, 56 (%rdi) - - leaq 64 (%rsi), %rsi - leaq 64 (%rdi), %rdi + movq (%rsi), %rax + movq 8(%rsi), %rbx + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + prefetcht0 896 - 64(%rdi) + prefetcht0 896 - 0(%rdi) + + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %r9, 16(%rdi) + movq %r10, 24(%rdi) + movq %r11, 32(%rdi) + movq %r12, 40(%rdi) + movq %r13, 48(%rdi) + movq %r14, 56(%rdi) + + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi jnz L(preloop) L(prebail): - movq SAVE3 (%rsp), %rbx + movq SAVE3(%rsp), %rbx cfi_restore (%rbx) - movq SAVE2 (%rsp), %r12 + movq SAVE2(%rsp), %r12 cfi_restore (%r12) - movq SAVE1 (%rsp), %r13 + movq SAVE1(%rsp), %r13 cfi_restore (%r13) - movq SAVE0 (%rsp), %r14 + movq SAVE0(%rsp), %r14 cfi_restore (%r14) /* .p2align 4 */ @@ -466,7 +466,7 @@ L(preskip): movq %rdi, %rax #else - movq RETVAL (%rsp), %rax + movq RETVAL(%rsp), %rax jnz L(1) rep @@ -477,7 +477,7 @@ L(preskip): L(preafter): -/* Loop to handle huge blocks. */ +/* Handle huge blocks. */ L(NTtry): @@ -486,69 +486,69 @@ L(NT): /* non-temporal 128-byte */ shrq $7, %rcx jz L(NTskip) - movq %r14, SAVE0 (%rsp) + movq %r14, SAVE0(%rsp) cfi_rel_offset (%r14, SAVE0) - movq %r13, SAVE1 (%rsp) + movq %r13, SAVE1(%rsp) cfi_rel_offset (%r13, SAVE1) - movq %r12, SAVE2 (%rsp) + movq %r12, SAVE2(%rsp) cfi_rel_offset (%r12, SAVE2) .p2align 4 L(NTloop): - prefetchnta 768 (%rsi) - prefetchnta 832 (%rsi) + prefetchnta 768(%rsi) + prefetchnta 832(%rsi) decq %rcx - movq (%rsi), %rax - movq 8 (%rsi), %r8 - movq 16 (%rsi), %r9 - movq 24 (%rsi), %r10 - movq 32 (%rsi), %r11 - movq 40 (%rsi), %r12 - movq 48 (%rsi), %r13 - movq 56 (%rsi), %r14 - - movntiq %rax, (%rdi) - movntiq %r8, 8 (%rdi) - movntiq %r9, 16 (%rdi) - movntiq %r10, 24 (%rdi) - movntiq %r11, 32 (%rdi) - movntiq %r12, 40 (%rdi) - movntiq %r13, 48 (%rdi) - movntiq %r14, 56 (%rdi) - - movq 64 (%rsi), %rax - movq 72 (%rsi), %r8 - movq 80 (%rsi), %r9 - movq 88 (%rsi), %r10 - movq 96 (%rsi), %r11 - movq 104 (%rsi), %r12 - movq 112 (%rsi), %r13 - movq 120 (%rsi), %r14 - - movntiq %rax, 64 (%rdi) - movntiq %r8, 72 (%rdi) - movntiq %r9, 80 (%rdi) - movntiq %r10, 88 (%rdi) - movntiq %r11, 96 (%rdi) - movntiq %r12, 104 (%rdi) - movntiq %r13, 112 (%rdi) - movntiq %r14, 120 (%rdi) - - leaq 128 (%rsi), %rsi - leaq 128 (%rdi), %rdi + movq (%rsi), %rax + movq 8(%rsi), %r8 + movq 16(%rsi), %r9 + movq 24(%rsi), %r10 + movq 32(%rsi), %r11 + movq 40(%rsi), %r12 + movq 48(%rsi), %r13 + movq 56(%rsi), %r14 + + movntiq %rax, (%rdi) + movntiq %r8, 8(%rdi) + movntiq %r9, 16(%rdi) + movntiq %r10, 24(%rdi) + movntiq %r11, 32(%rdi) + movntiq %r12, 40(%rdi) + movntiq %r13, 48(%rdi) + movntiq %r14, 56(%rdi) + + movq 64(%rsi), %rax + movq 72(%rsi), %r8 + movq 80(%rsi), %r9 + movq 88(%rsi), %r10 + movq 96(%rsi), %r11 + movq 104(%rsi), %r12 + movq 112(%rsi), %r13 + movq 120(%rsi), %r14 + + movntiq %rax, 64(%rdi) + movntiq %r8, 72(%rdi) + movntiq %r9, 80(%rdi) + movntiq %r10, 88(%rdi) + movntiq %r11, 96(%rdi) + movntiq %r12, 104(%rdi) + movntiq %r13, 112(%rdi) + movntiq %r14, 120(%rdi) + + leaq 128(%rsi), %rsi + leaq 128(%rdi), %rdi jnz L(NTloop) sfence /* serialize memory stores */ - movq SAVE2 (%rsp), %r12 + movq SAVE2(%rsp), %r12 cfi_restore (%r12) - movq SAVE1 (%rsp), %r13 + movq SAVE1(%rsp), %r13 cfi_restore (%r13) - movq SAVE0 (%rsp), %r14 + movq SAVE0(%rsp), %r14 cfi_restore (%r14) L(NTskip): @@ -558,7 +558,7 @@ L(NTskip): movq %rdi, %rax #else - movq RETVAL (%rsp), %rax + movq RETVAL(%rsp), %rax jnz L(1) rep -- 2.11.4.GIT