From 0c2279d7cbb4a2cd0ff94dc279fe0975959b612c Mon Sep 17 00:00:00 2001
From: hubicka
Date: Sat, 19 Oct 2013 12:11:14 +0000
Subject: [PATCH] 	* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable
 accumulation for cold functions.

	* x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
	(X86_TUNE_PUSH_MEMORY): Likewise.
	(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
	X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
	(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
	* i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
	x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
	Remove.
	(ix86_option_override_internal): Update to use tune features instead
	of variables.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@203855 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog                | 15 +++++++++++++++
 gcc/config/i386/i386.c       | 23 +++++------------------
 gcc/config/i386/i386.h       | 19 ++++++++++++++++---
 gcc/config/i386/x86-tune.def | 39 ++++++++++++++++++++++++++++++++++-----
 4 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index bdb9e01c601..086b5b21787 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-10-18  Jan Hubicka
+
+	* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
+	for cold functions.
+	* x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
+	(X86_TUNE_PUSH_MEMORY): Likewise.
+	(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
+	X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
+	(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
+	* i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
+	x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
+	Remove.
+	(ix86_option_override_internal): Update to use tune features instead
+	of variables.
+
 2013-10-18  Cong Hou
 
 	PR tree-optimization/58508
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b8c3c1d7128..91e65105a5c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1897,18 +1897,6 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
   ~m_386,
 };
 
-static const unsigned int x86_accumulate_outgoing_args
-  = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
-  = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
-  = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
-  = m_COREI7 | m_BDVER | m_GENERIC;
-
 /* In case the average insn count for single function invocation is
    lower than this constant, emit fast (but longer) prologue and
    epilogue code.  */
@@ -2925,7 +2913,7 @@ ix86_option_override_internal (bool main_args_p,
 				    struct gcc_options *opts_set)
 {
   int i;
-  unsigned int ix86_arch_mask, ix86_tune_mask;
+  unsigned int ix86_arch_mask;
   const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
   const char *prefix;
   const char *suffix;
@@ -3693,7 +3681,7 @@ ix86_option_override_internal (bool main_args_p,
 
   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
      since the insns won't need emulation.  */
-  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+  if (ix86_tune_features[X86_TUNE_ALWAYS_FANCY_MATH_387])
     opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
 
   /* Likewise, if the target doesn't have a 387, or we've specified
@@ -3835,8 +3823,7 @@ ix86_option_override_internal (bool main_args_p,
       gcc_unreachable ();
     }
 
-  ix86_tune_mask = 1u << ix86_tune;
-  if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+  if (ix86_tune_features[X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
       && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
       && !opts->x_optimize_size)
     opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3976,10 +3963,10 @@ ix86_option_override_internal (bool main_args_p,
   if (flag_expensive_optimizations
       && !(opts_set->x_target_flags & MASK_VZEROUPPER))
     opts->x_target_flags |= MASK_VZEROUPPER;
-  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
   /* Enable 128-bit AVX instruction generation
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 10f7ff096c3..63e49032748 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1544,13 +1544,26 @@ enum reg_class
    will be computed and placed into the variable `crtl->outgoing_args_size'.
    No space will be pushed onto the stack for each call; instead, the
    function prologue should increase the stack frame size by this amount.
+
+   In 32bit mode enabling argument accumulation results in about 5% code size
+   growth because move instructions are less compact than push instructions.
+   In 64bit mode the difference is less drastic but visible.
+
+   FIXME: Unlike earlier implementations, the size of unwind info seems to
+   actually grow with accumulation.  Is that because the unwind info for
+   accumulated arguments is unnecessarily bloated?
 
    64-bit MS ABI seem to require 16 byte alignment everywhere except for
-   function prologue and apilogue.  This is not possible without
-   ACCUMULATE_OUTGOING_ARGS.  */
+   function prologue and epilogue.  This is not possible without
+   ACCUMULATE_OUTGOING_ARGS.
+
+   If stack probes are required, the space used for large function
+   arguments on the stack must also be probed, so enable
+   -maccumulate-outgoing-args so this happens in the prologue.  */
 
 #define ACCUMULATE_OUTGOING_ARGS \
-  (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+  ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+   || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
 
 /* If defined, a C expression whose value is nonzero when we want to use PUSH
    instructions to pass outgoing arguments.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 34484a28749..42eee33cbe4 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Exception along with this
 program; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
-   negatively, so enabling for Generic64 seems like good code size
-   tradeoff.  We can't enable it for 32bit generic because it does not
-   work well with PPro base chips.  */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
	  m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
-   Some chips, like 486 and Pentium have problems with these sequences.  */
+   Some chips, like the 486 and Pentium, work faster with separate load
+   and push instructions.  */
 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
	  m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
	  m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
 
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+	  ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+	  ~(m_COREI7 | m_BDVER | m_GENERIC))
+
 /* Use packed single precision instructions where posisble.  I.e. movups instead
    of movupd.  */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
	  m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increases code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is a considerable win
+   for targets without a stack engine, where consecutive push operations
+   cannot execute in parallel.
+
+   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes a large
+   regression on mgrid due to an IRA limitation leading to unnecessary
+   use of the frame pointer in 32bit mode.  */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+	  m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos, etc.
+   Should be enabled for all targets that always have a coprocessor.  */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+	  ~(m_386 | m_486))
-- 
2.11.4.GIT
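
As an illustration of the tradeoff X86_TUNE_ACCUMULATE_OUTGOING_ARGS describes,
here is a minimal sketch contrasting the two calling sequences.  The file and
function names are hypothetical, and the assembly in the comments is typical of
32-bit output rather than the verbatim code of any particular compiler revision:

/* demo.c -- a sketch of what -maccumulate-outgoing-args changes.
   Compile both ways and compare the generated assembly:
     gcc -m32 -O2 -S demo.c
     gcc -m32 -O2 -maccumulate-outgoing-args -S demo.c  */

extern int consume (int a, int b, int c);

int
produce (int x)
{
  /* Without accumulation, each call site pushes its own arguments:

	 pushl $3
	 pushl $2
	 pushl %eax
	 call  consume
	 addl  $12, %esp

     Every push updates %esp, so on chips without a stack engine the
     pushes serialize on the stack pointer.

     With accumulation, the prologue reserves the outgoing-argument area
     once (subl $N, %esp) and each call site stores with movl instead:

	 movl  $3, 8(%esp)
	 movl  $2, 4(%esp)
	 movl  %eax, (%esp)
	 call  consume

     The movl forms encode larger (hence the ~5% 32bit code-size growth
     noted in the i386.h comment) but are independent of each other.  */
  return consume (x, 2, 3) + consume (x, 5, 6);
}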