From 5313581cb52fd5d3d2cf222ddb6f8f86f090974f Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 22 Oct 2017 08:11:15 -0700
Subject: [PATCH] i386: Replace assembly versions of e_powf with generic
 e_powf.c

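This patch replaces the i386 assembly implementation of powf
(sysdeps/i386/fpu/e_powf.S) with the generic C implementation in
sysdeps/ieee754/flt-32/e_powf.c.  For i686 multiarch, an SSE2 variant
(e_powf-sse2.c, compiled with -msse2 -mfpmath=sse) is added and
selected via IFUNC.  For workload-spec2017.wrf, on Nehalem, it
improves performance by: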

                           Before            After     Improvement
reciprocal-throughput      230.855          78.3358       194%
latency                    231.685          94.1259       146%

On Skylake, it improves performance by:

                           Before            After     Improvement
reciprocal-throughput      239.858          47.4713       405%
latency                    247.57           93.8798       163%

On IvyBridge with --disable-multi-arch, it improves performance by:

                           Before            After     Improvement
reciprocal-throughput      269.078          63.3758       324%
latency                    271.473          102.091       165%

	* sysdeps/i386/fpu/e_powf.S: Removed.
	* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
	* sysdeps/i386/fpu/w_powf.c: Likewise.
	* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
	* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
	* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
	Add e_powf-sse2.
	(CFLAGS-e_powf-sse2.c): New.
	* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
	* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
---
 ChangeLog                                      |  13 +
 sysdeps/i386/fpu/e_powf.S                      | 392 -------------------------
 sysdeps/i386/fpu/e_powf_log2_data.c            |   1 -
 sysdeps/i386/fpu/libm-test-ulps                |   6 +
 sysdeps/i386/fpu/w_powf.c                      |   1 -
 sysdeps/i386/i686/fpu/multiarch/Makefile       |   3 +-
 sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c  |   3 +
 sysdeps/i386/i686/fpu/multiarch/e_powf.c       |  43 +++
 sysdeps/i386/i686/fpu/multiarch/libm-test-ulps |  18 +-
 9 files changed, 79 insertions(+), 401 deletions(-)
 delete mode 100644 sysdeps/i386/fpu/e_powf.S
 delete mode 100644 sysdeps/i386/fpu/e_powf_log2_data.c
 delete mode 100644 sysdeps/i386/fpu/w_powf.c
 create mode 100644 sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
 create mode 100644 sysdeps/i386/i686/fpu/multiarch/e_powf.c

diff --git a/ChangeLog b/ChangeLog
index 78910c50dc..5d45da1d8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2017-10-22  H.J. Lu  <hongjiu.lu@intel.com>
 
+	* sysdeps/i386/fpu/e_powf.S: Removed.
+	* sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
+	* sysdeps/i386/fpu/w_powf.c: Likewise.
+	* sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
+	* sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
+	* sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
+	Add e_powf-sse2.
+	(CFLAGS-e_powf-sse2.c): New.
+	* sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
+	* sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
+
+2017-10-22  H.J. Lu  <hongjiu.lu@intel.com>
+
 	* sysdeps/i386/fpu/e_log2f.S: Removed.
 	* sysdeps/i386/fpu/e_log2f_data.c: Likewise.
 	* sysdeps/i386/fpu/w_log2f.c: Likewise.
diff --git a/sysdeps/i386/fpu/e_powf.S b/sysdeps/i386/fpu/e_powf.S
deleted file mode 100644
index 467ef2380b..0000000000
--- a/sysdeps/i386/fpu/e_powf.S
+++ /dev/null
@@ -1,392 +0,0 @@
-/* ix87 specific implementation of pow function.
-   Copyright (C) 1996-2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <machine/asm.h>
-#include <i386-math-asm.h>
-
-	.section .rodata.cst8,"aM",@progbits,8
-
-	.p2align 3
-	.type one,@object
-one:	.double 1.0
-	ASM_SIZE_DIRECTIVE(one)
-	.type limit,@object
-limit:	.double 0.29
-	ASM_SIZE_DIRECTIVE(limit)
-	.type p31,@object
-p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
-	ASM_SIZE_DIRECTIVE(p31)
-
-	.section .rodata.cst16,"aM",@progbits,16
-
-	.p2align 3
-	.type infinity,@object
-inf_zero:
-infinity:
-	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
-	ASM_SIZE_DIRECTIVE(infinity)
-	.type zero,@object
-zero:	.double 0.0
-	ASM_SIZE_DIRECTIVE(zero)
-	.type minf_mzero,@object
-minf_mzero:
-minfinity:
-	.byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
-mzero:
-	.byte 0, 0, 0, 0, 0, 0, 0, 0x80
-	ASM_SIZE_DIRECTIVE(minf_mzero)
-DEFINE_FLT_MIN
-
-#ifdef PIC
-# define MO(op) op##@GOTOFF(%ecx)
-# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
-#else
-# define MO(op) op
-# define MOX(op,x,f) op(,x,f)
-#endif
-
-	.text
-ENTRY(__ieee754_powf)
-	flds	8(%esp)	// y
-	fxam
-
-#ifdef	PIC
-	LOAD_PIC_REG (cx)
-#endif
-
-	fnstsw
-	movb	%ah, %dl
-	andb	$0x45, %ah
-	cmpb	$0x40, %ah	// is y == 0 ?
-	je	11f
-
-	cmpb	$0x05, %ah	// is y == ±inf ?
-	je	12f
-
-	cmpb	$0x01, %ah	// is y == NaN ?
-	je	30f
-
-	flds	4(%esp)		// x : y
-
-	subl	$4, %esp
-	cfi_adjust_cfa_offset (4)
-
-	fxam
-	fnstsw
-	movb	%ah, %dh
-	andb	$0x45, %ah
-	cmpb	$0x40, %ah
-	je	20f		// x is ±0
-
-	cmpb	$0x05, %ah
-	je	15f		// x is ±inf
-
-	cmpb	$0x01, %ah
-	je	33f		// x is NaN
-
-	fxch			// y : x
-
-	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
-	fld	%st		// y : y : x
-	fabs			// |y| : y : x
-	fcompl	MO(p31)		// y : x
-	fnstsw
-	sahf
-	jnc	2f
-
-	/* First see whether `y' is a natural number.  In this case we
-	   can use a more precise algorithm.  */
-	fld	%st		// y : y : x
-	fistpl	(%esp)		// y : x
-	fildl	(%esp)		// int(y) : y : x
-	fucomp	%st(1)		// y : x
-	fnstsw
-	sahf
-	jne	3f
-
-	/* OK, we have an integer value for y.  */
-	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	orl	$0, %edx
-	fstp	%st(0)		// x
-	jns	4f		// y >= 0, jump
-	fdivrl	MO(one)		// 1/x		(now referred to as x)
-	negl	%edx
-4:	fldl	MO(one)		// 1 : x
-	fxch
-
-	/* If y is even, take the absolute value of x.  Otherwise,
-	   ensure all intermediate values that might overflow have the
-	   sign of x.  */
-	testb	$1, %dl
-	jnz	6f
-	fabs
-
-6:	shrl	$1, %edx
-	jnc	5f
-	fxch
-	fabs
-	fmul	%st(1)		// x : ST*x
-	fxch
-5:	fld	%st		// x : x : ST*x
-	fabs			// |x| : x : ST*x
-	fmulp			// |x|*x : ST*x
-	testl	%edx, %edx
-	jnz	6b
-	fstp	%st(0)		// ST*x
-	FLT_NARROW_EVAL_UFLOW_NONNAN
-	ret
-
-	/* y is ±NAN */
-30:	flds	4(%esp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
-	fucomp	%st(1)		// x : y
-	fnstsw
-	sahf
-	je	31f
-	fxch			// y : x
-31:	fstp	%st(1)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-2:	/* y is a large integer (so even).  */
-	fxch			// x : y
-	fabs			// |x| : y
-	fxch			// y : x
-	.align ALIGNARG(4)
-3:	/* y is a real number.  */
-	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
-	fldl	MO(limit)	// 0.29 : 1.0 : x : y
-	fld	%st(2)		// x : 0.29 : 1.0 : x : y
-	fsub	%st(2)		// x-1 : 0.29 : 1.0 : x : y
-	fabs			// |x-1| : 0.29 : 1.0 : x : y
-	fucompp			// 1.0 : x : y
-	fnstsw
-	fxch			// x : 1.0 : y
-	sahf
-	ja	7f
-	fsub	%st(1)		// x-1 : 1.0 : y
-	fyl2xp1			// log2(x) : y
-	jmp	8f
-
-7:	fyl2x			// log2(x) : y
-8:	fmul	%st(1)		// y*log2(x) : y
-	fst	%st(1)		// y*log2(x) : y*log2(x)
-	frndint			// int(y*log2(x)) : y*log2(x)
-	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
-	fxch			// fract(y*log2(x)) : int(y*log2(x))
-	f2xm1			// 2^fract(y*log2(x))-1 : int(y*log2(x))
-	faddl	MO(one)		// 2^fract(y*log2(x)) : int(y*log2(x))
-	fscale			// 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
-32:	addl	$4, %esp
-	cfi_adjust_cfa_offset (-4)
-	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
-	FLT_NARROW_EVAL_UFLOW_NONNAN
-	ret
-
-	/* x is NaN.  */
-	cfi_adjust_cfa_offset (4)
-33:	addl	$4, %esp
-	cfi_adjust_cfa_offset (-4)
-	fstp	%st(1)
-	ret
-
-	// pow(x,±0) = 1
-	.align ALIGNARG(4)
-11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
-	ret
-
-	// y == ±inf
-	.align ALIGNARG(4)
-12:	fstp	%st(0)		// pop y
-	fldl	MO(one)		// 1
-	flds	4(%esp)		// x : 1
-	fabs			// abs(x) : 1
-	fucompp			// < 1, == 1, or > 1
-	fnstsw
-	andb	$0x45, %ah
-	cmpb	$0x45, %ah
-	je	13f		// jump if x is NaN
-
-	cmpb	$0x40, %ah
-	je	14f		// jump if |x| == 1
-
-	shlb	$1, %ah
-	xorb	%ah, %dl
-	andl	$2, %edx
-	fldl	MOX(inf_zero, %edx, 4)
-	ret
-
-	.align ALIGNARG(4)
-14:	fldl	MO(one)
-	ret
-
-	.align ALIGNARG(4)
-13:	flds	4(%esp)		// load x == NaN
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-	// x is ±inf
-15:	fstp	%st(0)		// y
-	testb	$2, %dh
-	jz	16f		// jump if x == +inf
-
-	// fistpl raises invalid exception for |y| >= 1L<<31, so test
-	// that (in which case y is certainly even) before testing
-	// whether y is odd.
-	fld	%st		// y : y
-	fabs			// |y| : y
-	fcompl	MO(p31)		// y
-	fnstsw
-	sahf
-	jnc	16f
-
-	// We must find out whether y is an odd integer.
-	fld	%st		// y : y
-	fistpl	(%esp)		// y
-	fildl	(%esp)		// int(y) : y
-	fucompp			// <empty>
-	fnstsw
-	sahf
-	jne	17f
-
-	// OK, the value is an integer.
-	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	testb	$1, %dl
-	jz	18f		// jump if not odd
-	// It's an odd integer.
-	shrl	$31, %edx
-	fldl	MOX(minf_mzero, %edx, 8)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-16:	fcompl	MO(zero)
-	addl	$4, %esp
-	cfi_adjust_cfa_offset (-4)
-	fnstsw
-	shrl	$5, %eax
-	andl	$8, %eax
-	fldl	MOX(inf_zero, %eax, 1)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-17:	shll	$30, %edx	// sign bit for y in right position
-	addl	$4, %esp
-	cfi_adjust_cfa_offset (-4)
-18:	shrl	$31, %edx
-	fldl	MOX(inf_zero, %edx, 8)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-	// x is ±0
-20:	fstp	%st(0)		// y
-	testb	$2, %dl
-	jz	21f		// y > 0
-
-	// x is ±0 and y is < 0.  We must find out whether y is an odd integer.
-	testb	$2, %dh
-	jz	25f
-
-	// fistpl raises invalid exception for |y| >= 1L<<31, so test
-	// that (in which case y is certainly even) before testing
-	// whether y is odd.
-	fld	%st		// y : y
-	fabs			// |y| : y
-	fcompl	MO(p31)		// y
-	fnstsw
-	sahf
-	jnc	25f
-
-	fld	%st		// y : y
-	fistpl	(%esp)		// y
-	fildl	(%esp)		// int(y) : y
-	fucompp			// <empty>
-	fnstsw
-	sahf
-	jne	26f
-
-	// OK, the value is an integer.
-	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	testb	$1, %dl
-	jz	27f		// jump if not odd
-	// It's an odd integer.
-	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
-	fdivl	MO(zero)
-	fchs
-	ret
-
-	cfi_adjust_cfa_offset (4)
-25:	fstp	%st(0)
-26:	addl	$4, %esp
-	cfi_adjust_cfa_offset (-4)
-27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
-	fdivl	MO(zero)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-	.align ALIGNARG(4)
-	// x is ±0 and y is > 0.  We must find out whether y is an odd integer.
-21:	testb	$2, %dh
-	jz	22f
-
-	// fistpl raises invalid exception for |y| >= 1L<<31, so test
-	// that (in which case y is certainly even) before testing
-	// whether y is odd.
-	fcoml	MO(p31)		// y
-	fnstsw
-	sahf
-	jnc	22f
-
-	fld	%st		// y : y
-	fistpl	(%esp)		// y
-	fildl	(%esp)		// int(y) : y
-	fucompp			// <empty>
-	fnstsw
-	sahf
-	jne	23f
-
-	// OK, the value is an integer.
-	popl	%edx
-	cfi_adjust_cfa_offset (-4)
-	testb	$1, %dl
-	jz	24f		// jump if not odd
-	// It's an odd integer.
-	fldl	MO(mzero)
-	ret
-
-	cfi_adjust_cfa_offset (4)
-22:	fstp	%st(0)
-23:	addl	$4, %esp	// Don't use pop.
-	cfi_adjust_cfa_offset (-4)
-24:	fldl	MO(zero)
-	ret
-
-END(__ieee754_powf)
-strong_alias (__ieee754_powf, __powf_finite)
diff --git a/sysdeps/i386/fpu/e_powf_log2_data.c b/sysdeps/i386/fpu/e_powf_log2_data.c
deleted file mode 100644
index 1cc8931700..0000000000
--- a/sysdeps/i386/fpu/e_powf_log2_data.c
+++ /dev/null
@@ -1 +0,0 @@
-/* Not needed.  */
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 64cac565f2..3ab3fd8d2c 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -2370,24 +2370,30 @@ ldouble: 1
 
 Function: "pow_downward":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 
 Function: "pow_towardzero":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 
 Function: "pow_upward":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
diff --git a/sysdeps/i386/fpu/w_powf.c b/sysdeps/i386/fpu/w_powf.c
deleted file mode 100644
index d133216f5b..0000000000
--- a/sysdeps/i386/fpu/w_powf.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/../math/w_powf.c>
diff --git a/sysdeps/i386/i686/fpu/multiarch/Makefile b/sysdeps/i386/i686/fpu/multiarch/Makefile
index eee3b8b1fd..c0fa9761d3 100644
--- a/sysdeps/i386/i686/fpu/multiarch/Makefile
+++ b/sysdeps/i386/i686/fpu/multiarch/Makefile
@@ -1,9 +1,10 @@
 ifeq ($(subdir),math)
 libm-sysdep_routines += e_exp2f-sse2 e_expf-sse2 e_logf-sse2 e_log2f-sse2 \
-			s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
+			e_powf-sse2 s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
 
 CFLAGS-e_exp2f-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_expf-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_log2f-sse2.c = -msse2 -mfpmath=sse
 CFLAGS-e_logf-sse2.c = -msse2 -mfpmath=sse
+CFLAGS-e_powf-sse2.c = -msse2 -mfpmath=sse
 endif
diff --git a/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c b/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
new file mode 100644
index 0000000000..c56f6ee89f
--- /dev/null
+++ b/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
@@ -0,0 +1,3 @@
+#define __powf __powf_sse2
+
+#include <sysdeps/ieee754/flt-32/e_powf.c>
diff --git a/sysdeps/i386/i686/fpu/multiarch/e_powf.c b/sysdeps/i386/i686/fpu/multiarch/e_powf.c
new file mode 100644
index 0000000000..4dc4c87326
--- /dev/null
+++ b/sysdeps/i386/i686/fpu/multiarch/e_powf.c
@@ -0,0 +1,43 @@
+/* Multiple versions of powf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define powf __redirect_powf
+#define __DECL_SIMD___redirect_powf
+#include <math.h>
+#undef powf
+
+#define SYMBOL_NAME powf
+#include "ifunc-sse2.h"
+
+libc_ifunc_redirected (__redirect_powf, __powf, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (__powf_ia32, __GI___powf, __redirect_powf)
+  __attribute__ ((visibility ("hidden")));
+
+# include <shlib-compat.h>
+versioned_symbol (libm, __powf, powf, GLIBC_2_27);
+#else
+weak_alias (__powf, powf)
+#endif
+
+strong_alias (__powf, __ieee754_powf)
+strong_alias (__powf, __powf_finite)
+
+#define __powf __powf_ia32
+#include <sysdeps/ieee754/flt-32/e_powf.c>
diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index b5d74df580..26d90ec636 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -2370,24 +2370,30 @@ ldouble: 1
 
 Function: "pow_downward":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 
 Function: "pow_towardzero":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
 
 Function: "pow_upward":
 double: 1
+float: 1
 float128: 2
 idouble: 1
+ifloat: 1
 ifloat128: 2
 ildouble: 4
 ldouble: 4
@@ -2577,30 +2583,30 @@ ldouble: 5
 
 Function: "tgamma_downward":
 double: 3
-float: 4
+float: 5
 float128: 5
 idouble: 3
-ifloat: 4
+ifloat: 5
 ifloat128: 5
 ildouble: 5
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 4
-float: 4
+float: 5
 float128: 5
 idouble: 4
-ifloat: 4
+ifloat: 5
 ifloat128: 5
 ildouble: 5
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 4
-float: 4
+float: 6
 float128: 4
 idouble: 4
-ifloat: 4
+ifloat: 6
 ifloat128: 4
 ildouble: 5
 ldouble: 5
-- 
2.11.4.GIT