From 0c2279d7cbb4a2cd0ff94dc279fe0975959b612c Mon Sep 17 00:00:00 2001
From: hubicka
Date: Sat, 19 Oct 2013 12:11:14 +0000
Subject: [PATCH] 	* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable
 accumulation for cold functions.

	* x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
	(X86_TUNE_PUSH_MEMORY): Likewise.
	(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
	X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
	(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
	* i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
	x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
	Remove.
	(ix86_option_override_internal): Update to use tune features instead
	of variables.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@203855 138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog                | 15 +++++++++++++++
 gcc/config/i386/i386.c       | 23 +++++------------------
 gcc/config/i386/i386.h       | 19 ++++++++++++++++---
 gcc/config/i386/x86-tune.def | 39 ++++++++++++++++++++++++++++++++++-----
 4 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index bdb9e01c601..086b5b21787 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-10-18  Jan Hubicka
+
+	* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
+	for cold functions.
+	* x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
+	(X86_TUNE_PUSH_MEMORY): Likewise.
+	(X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
+	X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
+	(X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
+	* i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
+	x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
+	Remove.
+	(ix86_option_override_internal): Update to use tune features instead
+	of variables.
+
 2013-10-18  Cong Hou
 
 	PR tree-optimization/58508
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index b8c3c1d7128..91e65105a5c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1897,18 +1897,6 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
   ~m_386,
 };
 
-static const unsigned int x86_accumulate_outgoing_args
-  = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
-  = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
-  = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
-  = m_COREI7 | m_BDVER | m_GENERIC;
-
 /* In case the average insn count for single function invocation is
    lower than this constant, emit fast (but longer) prologue and
    epilogue code.  */
@@ -2925,7 +2913,7 @@ ix86_option_override_internal (bool main_args_p,
 				    struct gcc_options *opts_set)
 {
   int i;
-  unsigned int ix86_arch_mask, ix86_tune_mask;
+  unsigned int ix86_arch_mask;
   const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
   const char *prefix;
   const char *suffix;
@@ -3693,7 +3681,7 @@ ix86_option_override_internal (bool main_args_p,
 
   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
      since the insns won't need emulation.  */
-  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+  if (ix86_tune_features[X86_TUNE_ALWAYS_FANCY_MATH_387])
     opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
 
   /* Likewise, if the target doesn't have a 387, or we've specified
@@ -3835,8 +3823,7 @@ ix86_option_override_internal (bool main_args_p,
       gcc_unreachable ();
     }
 
-  ix86_tune_mask = 1u << ix86_tune;
-  if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+  if (ix86_tune_features[X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
       && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
       && !opts->x_optimize_size)
     opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3976,10 +3963,10 @@ ix86_option_override_internal (bool main_args_p,
   if (flag_expensive_optimizations
       && !(opts_set->x_target_flags & MASK_VZEROUPPER))
     opts->x_target_flags |= MASK_VZEROUPPER;
-  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+  if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
   /* Enable 128-bit AVX instruction generation
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 10f7ff096c3..63e49032748 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1544,13 +1544,26 @@ enum reg_class
    will be computed and placed into the variable `crtl->outgoing_args_size'.
    No space will be pushed onto the stack for each call; instead, the
    function prologue should increase the stack frame size by this amount.
+
+   In 32bit mode enabling argument accumulation results in about 5% code size
+   growth because move instructions are less compact than push instructions.
+   In 64bit mode the difference is less drastic but visible.
+
+   FIXME: Unlike earlier implementations, the size of unwind info seems to
+   actually grow with accumulation.  Is that because the unwind info for
+   accumulated arguments is unnecessarily bloated?
 
    64-bit MS ABI seem to require 16 byte alignment everywhere except for
-   function prologue and apilogue.  This is not possible without
-   ACCUMULATE_OUTGOING_ARGS.  */
+   function prologue and epilogue.  This is not possible without
+   ACCUMULATE_OUTGOING_ARGS.
+
+   If stack probes are required, the space used for large function
+   arguments on the stack must also be probed, so enable
+   -maccumulate-outgoing-args so this happens in the prologue.  */
 
 #define ACCUMULATE_OUTGOING_ARGS \
-  (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+  ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+   || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
 
 /* If defined, a C expression whose value is nonzero when we want to use PUSH
    instructions to pass outgoing arguments.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 34484a28749..42eee33cbe4 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Exception along with this
 program; see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
-   negatively, so enabling for Generic64 seems like good code size
-   tradeoff.  We can't enable it for 32bit generic because it does not
-   work well with PPro base chips.  */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
	  m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
-   Some chips, like 486 and Pentium have problems with these sequences.  */
+   Some chips, like the 486 and Pentium, work faster with separate load
+   and push instructions.  */
 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
	  m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
	  m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
 
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
+	  ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
+   split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+	  ~(m_COREI7 | m_BDVER | m_GENERIC))
+
 /* Use packed single precision instructions where posisble.  I.e. movups instead
    of movupd.  */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
	  m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increases code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is a considerable win
+   for targets without a stack engine, where consecutive push operations
+   cannot execute in parallel.
+
+   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes a large
+   regression on mgrid due to an IRA limitation leading to unnecessary
+   use of the frame pointer in 32bit mode.  */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
+	  m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos, etc.
+   Should be enabled for all targets that always have a coprocessor.  */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
+	  ~(m_386 | m_486))
-- 
2.11.4.GIT
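
As an illustration of the tradeoff X86_TUNE_ACCUMULATE_OUTGOING_ARGS describes,
here is a minimal sketch contrasting the two calling sequences.  The file and
function names are hypothetical, and the assembly in the comments is typical of
32-bit output rather than the verbatim code of any particular compiler revision:

/* demo.c -- a sketch of what -maccumulate-outgoing-args changes.
   Compile both ways and compare the generated assembly:
     gcc -m32 -O2 -S demo.c
     gcc -m32 -O2 -maccumulate-outgoing-args -S demo.c  */

extern int consume (int a, int b, int c);

int
produce (int x)
{
  /* Without accumulation, each call site pushes its own arguments:

	 pushl $3
	 pushl $2
	 pushl %eax
	 call  consume
	 addl  $12, %esp

     Every push updates %esp, so on chips without a stack engine the
     pushes serialize on the stack pointer.

     With accumulation, the prologue reserves the outgoing-argument area
     once (subl $N, %esp) and each call site stores with movl instead:

	 movl  $3, 8(%esp)
	 movl  $2, 4(%esp)
	 movl  %eax, (%esp)
	 call  consume

     The movl forms encode larger (hence the ~5% 32bit code-size growth
     noted in the i386.h comment) but are independent of each other.  */
  return consume (x, 2, 3) + consume (x, 5, 6);
}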