From dd728ef392942807dbf572465dc73d44febe7d9f Mon Sep 17 00:00:00 2001 From: hubicka Date: Thu, 5 Oct 2017 14:56:32 +0000 Subject: [PATCH] * i386.c (ix86_size_cost, i386_cost, i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost, geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost, btver1_cost, btver2_cost, pentium4_cost, nocona_cost): Set reassociation width to 1. (bdver1_cost, bdver2_cost, bdver3_cost, bdver4_cost): Set reassociation width to 2 for fp operations and 1 otherwise. (znver1_cost): Set scalar reassoc width to 4 and vector to 3 and 6 for int and fp. (atom_cost): Set reassociation width to 2. (slm_cost, generic_cost): Set fp reassociation width to 2 and 1 otherwise. (intel_cost): Set fp reassociation width to 4 and 1 otherwise. (core_cost): Set fp reassociation width to 4 and vector to 2. (ix86_reassociation_width): Rewrite using cost table; special case plus/minus on Zen; honor X86_TUNE_SSE_SPLIT_REGS and TARGET_AVX128_OPTIMAL. * i386.h (processor_costs): Add reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp. (TARGET_VECTOR_PARALLEL_EXECUTION, TARGET_REASSOC_INT_TO_PARALLEL, TARGET_REASSOC_FP_TO_PARALLEL): Remove. * x86-tune.def (X86_TUNE_REASSOC_INT_TO_PARALLEL): Remove. (X86_TUNE_REASSOC_FP_TO_PARALLEL): Remove. (X86_TUNE_VECTOR_PARALLEL_EXECUTION): Remove. 
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@253448 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 25 ++++++++++++ gcc/config/i386/i386.c | 92 +++++++++++++++++++++++++++++++------------- gcc/config/i386/i386.h | 13 ++++--- gcc/config/i386/x86-tune.def | 15 -------- 4 files changed, 97 insertions(+), 48 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8b735d7f5a5..ef4a47cf0ad 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,28 @@ +2017-10-05 Jan Hubicka + + * i386.c (ix86_size_cost, i386_cost, i486_cost, pentium_cost, + lakemont_cost, pentiumpro_cost, geode_cost, k6_cost, + athlon_cost, k8_cost, amdfam10_cost, btver1_cost, btver2_cost, + pentium4_cost, nocona_cost): Set reassociation width to 1. + (bdver1_cost, bdver2_cost, bdver3_cost, bdver4_cost): Set reassociation + width to 2 for fp operations and 1 otherwise. + (znver1_cost): Set scalar reassoc width to 4 and vector to 3 and 6 + for int and fp. + (atom_cost): Set reassociation width to 2. + (slm_cost, generic_cost): Set fp reassociation width to 2 and 1 otherwise. + (intel_cost): Set fp reassociation width to 4 and 1 otherwise. + (core_cost): Set fp reassociation width to 4 and vector to 2. + (ix86_reassociation_width): Rewrite using cost table; special case + plus/minus on Zen; honor X86_TUNE_SSE_SPLIT_REGS + and TARGET_AVX128_OPTIMAL. + * i386.h (processor_costs): Add + reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp. + (TARGET_VECTOR_PARALLEL_EXECUTION, TARGET_REASSOC_INT_TO_PARALLEL, + TARGET_REASSOC_FP_TO_PARALLEL): Remove. + * x86-tune.def (X86_TUNE_REASSOC_INT_TO_PARALLEL): Remove. + (X86_TUNE_REASSOC_FP_TO_PARALLEL): Remove. + (X86_TUNE_VECTOR_PARALLEL_EXECUTION): Remove. + 2017-10-05 Nathan Sidwell * doc/invoke.texi (Wparentheses): Document C++ MVP behaviour. 
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 519336e0ed1..b7c125c20ec 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -177,6 +177,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of FABS instruction. */ COSTS_N_BYTES (2), /* cost of FCHS instruction. */ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ ix86_size_memcpy, ix86_size_memset, 1, /* scalar_stmt_cost. */ @@ -253,6 +254,7 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (22), /* cost of FABS instruction. */ COSTS_N_INSNS (24), /* cost of FCHS instruction. */ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ i386_memcpy, i386_memset, 1, /* scalar_stmt_cost. */ @@ -330,6 +332,7 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ i486_memcpy, i486_memset, 1, /* scalar_stmt_cost. */ @@ -405,6 +408,7 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium_memcpy, pentium_memset, 1, /* scalar_stmt_cost. */ @@ -473,6 +477,7 @@ struct processor_costs lakemont_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium_memcpy, pentium_memset, 1, /* scalar_stmt_cost. */ @@ -556,6 +561,7 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentiumpro_memcpy, pentiumpro_memset, 1, /* scalar_stmt_cost. */ @@ -631,6 +637,7 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ geode_memcpy, geode_memset, 1, /* scalar_stmt_cost. */ @@ -708,6 +715,7 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ k6_memcpy, k6_memset, 1, /* scalar_stmt_cost. */ @@ -785,6 +793,7 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ athlon_memcpy, athlon_memset, 1, /* scalar_stmt_cost. */ @@ -871,7 +880,7 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ k8_memcpy, k8_memset, 4, /* scalar_stmt_cost. */ @@ -965,7 +974,7 @@ struct processor_costs amdfam10_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ amdfam10_memcpy, amdfam10_memset, 4, /* scalar_stmt_cost. */ @@ -1060,7 +1069,7 @@ const struct processor_costs bdver1_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. 
*/ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ bdver1_memcpy, bdver1_memset, 6, /* scalar_stmt_cost. */ @@ -1156,7 +1165,7 @@ const struct processor_costs bdver2_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ bdver2_memcpy, bdver2_memset, 6, /* scalar_stmt_cost. */ @@ -1243,7 +1252,7 @@ struct processor_costs bdver3_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ bdver3_memcpy, bdver3_memset, 6, /* scalar_stmt_cost. */ @@ -1329,7 +1338,7 @@ struct processor_costs bdver4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ bdver4_memcpy, bdver4_memset, 6, /* scalar_stmt_cost. */ @@ -1419,7 +1428,15 @@ struct processor_costs znver1_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ - + /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles + and it can execute 2 integer additions and 2 multiplications thus + reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest + that 4 works better than 6 probably due to register pressure. + + Integer vector operations are taken by FP unit and execute 3 vector + plus/minus operations per cycle but only one multiply. This is adjusted + in ix86_reassociation_width. */ + 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ znver1_memcpy, znver1_memset, 6, /* scalar_stmt_cost. 
*/ @@ -1508,7 +1525,7 @@ const struct processor_costs btver1_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ - + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ btver1_memcpy, btver1_memset, 4, /* scalar_stmt_cost. */ @@ -1594,6 +1611,7 @@ const struct processor_costs btver2_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ btver2_memcpy, btver2_memset, 4, /* scalar_stmt_cost. */ @@ -1670,6 +1688,7 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ pentium4_memcpy, pentium4_memset, 1, /* scalar_stmt_cost. */ @@ -1749,6 +1768,7 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ nocona_memcpy, nocona_memset, 1, /* scalar_stmt_cost. */ @@ -1826,6 +1846,7 @@ struct processor_costs atom_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ atom_memcpy, atom_memset, 1, /* scalar_stmt_cost. */ @@ -1903,6 +1924,7 @@ struct processor_costs slm_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ slm_memcpy, slm_memset, 1, /* scalar_stmt_cost. 
*/ @@ -1980,6 +2002,7 @@ struct processor_costs intel_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ intel_memcpy, intel_memset, 1, /* scalar_stmt_cost. */ @@ -2067,6 +2090,7 @@ struct processor_costs generic_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ generic_memcpy, generic_memset, 1, /* scalar_stmt_cost. */ @@ -2153,6 +2177,7 @@ struct processor_costs core_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ core_memcpy, core_memset, 1, /* scalar_stmt_cost. */ @@ -51830,34 +51855,47 @@ has_dispatch (rtx_insn *insn, int action) /* Implementation of reassociation_width target hook used by reassoc phase to identify parallelism level in reassociated tree. Statements tree_code is passed in OPC. Arguments type - is passed in MODE. - - Currently parallel reassociation is enabled for Atom - processors only and we set reassociation width to be 2 - because Atom may issue up to 2 instructions per cycle. - - Return value should be fixed if parallel reassociation is - enabled for other processors. */ + is passed in MODE. */ static int -ix86_reassociation_width (unsigned int, machine_mode mode) +ix86_reassociation_width (unsigned int op, machine_mode mode) { + int width = 1; /* Vector part. 
*/ if (VECTOR_MODE_P (mode)) { - if (TARGET_VECTOR_PARALLEL_EXECUTION) - return 2; - else + int div = 1; + if (INTEGRAL_MODE_P (mode)) + width = ix86_cost->reassoc_vec_int; + else if (FLOAT_MODE_P (mode)) + width = ix86_cost->reassoc_vec_fp; + + if (width == 1) + return 1; + + /* Integer vector instructions execute in FP unit + and can execute 3 additions and one multiplication per cycle. */ + if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode) + && op != PLUS_EXPR && op != MINUS_EXPR) return 1; - } + /* Account for targets that split wide vectors into multiple parts. */ + if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128) + div = GET_MODE_BITSIZE (mode) / 128; + else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64) + div = GET_MODE_BITSIZE (mode) / 64; + width = (width + div - 1) / div; + } /* Scalar part. */ - if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL) - return 2; - else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL) - return ((TARGET_64BIT && ix86_tune == PROCESSOR_HASWELL)? 4 : 2); - else - return 1; + else if (INTEGRAL_MODE_P (mode)) + width = ix86_cost->reassoc_int; + else if (FLOAT_MODE_P (mode)) + width = ix86_cost->reassoc_fp; + + /* Avoid using too many registers in 32bit mode. */ + if (!TARGET_64BIT && width > 2) + width = 2; + return width; } /* ??? No autovectorization into MMX or 3DNOW until we can reliably diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 899ba8b3706..ef88d89cae2 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -257,6 +257,13 @@ struct processor_costs { const int fsqrt; /* cost of FSQRT instruction. */ /* Specify what algorithm to use for stringops on unknown size. */ + const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp; + /* Specify reassociation width for integer, + fp, vector integer and vector fp + operations. Generally should correspond + to number of instructions executed in + parallel. See also + ix86_reassociation_width. 
*/ struct stringop_algs *memcpy, *memset; const int scalar_stmt_cost; /* Cost of any scalar operation, excluding load and store. */ @@ -466,8 +473,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] #define TARGET_SLOW_PSHUFB \ ix86_tune_features[X86_TUNE_SLOW_PSHUFB] -#define TARGET_VECTOR_PARALLEL_EXECUTION \ - ix86_tune_features[X86_TUNE_VECTOR_PARALLEL_EXECUTION] #define TARGET_AVOID_4BYTE_PREFIXES \ ix86_tune_features[X86_TUNE_AVOID_4BYTE_PREFIXES] #define TARGET_FUSE_CMP_AND_BRANCH_32 \ @@ -488,10 +493,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL] #define TARGET_AVX128_OPTIMAL \ ix86_tune_features[X86_TUNE_AVX128_OPTIMAL] -#define TARGET_REASSOC_INT_TO_PARALLEL \ - ix86_tune_features[X86_TUNE_REASSOC_INT_TO_PARALLEL] -#define TARGET_REASSOC_FP_TO_PARALLEL \ - ix86_tune_features[X86_TUNE_REASSOC_FP_TO_PARALLEL] #define TARGET_GENERAL_REGS_SSE_SPILL \ ix86_tune_features[X86_TUNE_GENERAL_REGS_SSE_SPILL] #define TARGET_AVOID_MEM_OPND_FOR_CMOVE \ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 82c853bd939..63f69b4b503 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -117,16 +117,6 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", m_SANDYBRIDGE | m_HASWELL) -/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations - during reassociation of integer computation. */ -DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel", - m_BONNELL) - -/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations - during reassociation of fp computation. 
*/ -DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel", - m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL | m_KNM |m_INTEL | m_BDVER1 - | m_BDVER2 | m_ZNVER1 | m_GENERIC) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */ @@ -391,11 +381,6 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL) -/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to - execute 2 or more vector instructions in parallel. */ -DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", - m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) - /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", m_SILVERMONT | m_INTEL) -- 2.11.4.GIT