From 2194737e77256a847ed4fca7652e4dcb8d3f9c1e Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 25 Aug 2015 08:51:09 -0700
Subject: [PATCH] Replace %xmm[8-12] with %xmm[0-4]

Since ld.so preserves vector registers now, we can use %xmm[0-4] to
avoid the REX prefix.

	* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
---
 ChangeLog               |  4 +++
 sysdeps/x86_64/strlen.S | 94 ++++++++++++++++++++++-----------------------
 2 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cdecbea2de..a442ee1e12 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2015-08-25  H.J. Lu  <hjl.tools@gmail.com>
 
+	* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
+
+2015-08-25  H.J. Lu  <hjl.tools@gmail.com>
+
 	* sysdeps/x86_64/rtld-memcmp.c: Removed.
 	* sysdeps/x86_64/rtld-memset.S: Likewise.
 	* sysdeps/x86_64/rtld-strchr.S: Likewise.
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d23e..07253330cf 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -20,7 +20,7 @@
 
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
-	%xmm11 - zero
+	%xmm3 - zero
 	%rdi   - s
 	%r10  (s+n) & (~(64-1))
 	%r11  s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
+	pcmpeqb	(%rax), %xmm0;	\
+	pcmpeqb	16(%rax), %xmm1;	\
+	pcmpeqb	32(%rax), %xmm2;	\
+	pcmpeqb	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
 	salq	$16, %rdx;	\
 	salq	$16, %rcx;	\
 	orq	%rsi, %rdx;	\
@@ -63,10 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	movq	%rdi, %rax
 	movq	%rdi, %rcx
 	andq	$4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
 	FIND_ZERO
 #else
 	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
@@ -114,12 +114,12 @@ L(n_nonzero):
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb	16(%rax), %xmm9
-	pcmpeqb	32(%rax), %xmm10
-	pcmpeqb	48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
 	salq	$16, %rdx
 	salq	$16, %rcx
 	orq	%r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
 	orq	%rcx, %rdx
 #endif
 
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
 	   zero them.  */
 
 	PROLOG(loop)
@@ -149,9 +149,9 @@ L(strnlen_ret):
 #endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
 	.p2align 4
 L(loop):
@@ -160,12 +160,12 @@ L(loop):
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
 	jmp	L(loop)
@@ -174,7 +174,7 @@ L(loop):
 L(exit_end):
 	cmp	%rax, %r11
 	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 L(first):
@@ -186,7 +186,7 @@ L(first):
 
 	.p2align 4
 L(exit):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
 
 	.p2align 4
 L(loop):
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	64(%rax), %xmm0
+	pminub	80(%rax), %xmm0
+	pminub	96(%rax), %xmm0
+	pminub	112(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
 
 	subq	$-128, %rax
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
 	jmp	L(loop)
@@ -225,7 +225,7 @@ L(loop):
 L(exit64):
 	addq	$64, %rax
L(exit0):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
-- 
2.11.4.GIT
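
Note on the encoding win (an illustration, not part of the patch): in the
legacy SSE encodings, %xmm8-%xmm15 are reachable only through a REX prefix
byte, while %xmm0-%xmm7 need none, so switching registers drops one byte per
instruction wherever no other extended register (%r8-%r15, as in
"pmovmskb %xmm2, %r8d") still forces a REX. A minimal sketch in GAS syntax;
the byte sequences are the standard encodings of these instructions, shown
only to make the size difference concrete:

	pxor	%xmm8, %xmm8	/* 66 45 0f ef c0 - 5 bytes, REX.RB needed */
	pxor	%xmm0, %xmm0	/* 66 0f ef c0    - 4 bytes, no REX        */
	pcmpeqb	(%rax), %xmm8	/* 66 44 0f 74 00 - 5 bytes, REX.R needed  */
	pcmpeqb	(%rax), %xmm0	/* 66 0f 74 00    - 4 bytes, no REX        */

As for why the shorter registers only became usable now: %xmm0-%xmm7 carry
floating-point and vector function arguments in the x86-64 ABI, so before
ld.so saved and restored vector registers around lazy binding, code reachable
from the dynamic linker had to leave them untouched; hence, presumably, the
original choice of %xmm8-%xmm12 here. With ld.so preserving them (the
"ld.so preserves vector registers now" rationale above), %xmm0-%xmm4 are safe
and the REX bytes can go.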