/* Definitions of x86 tunable features.
   Copyright (C) 2013 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
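
/* Each DEF_TUNE (ENUM, NAME, SELECTOR) entry below defines one tunable:
   ENUM names the feature, NAME is the string accepted by -mtune-ctrl=,
   and SELECTOR is the mask of tuning targets for which the feature is on
   by default.  A sketch of the usual accessor pattern (as in i386.h):

       #define TARGET_USE_LEAVE \
               ix86_tune_features[X86_TUNE_USE_LEAVE]

   so backend code simply tests TARGET_USE_LEAVE.  */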
/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
   negatively, so enabling it for Generic64 seems like a good code size
   tradeoff.  We can't enable it for 32bit generic because it does not
   work well with PPro base chips.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
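
/* For illustration: with use_leave enabled, a 32-bit frame-pointer
   epilogue can be emitted as
       leave
       ret
   instead of the equivalent
       movl    %ebp, %esp
       popl    %ebp
       ret
   (a sketch; actual epilogues depend on the frame layout).  */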
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen",
          m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6
          | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
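
/* For illustration: with movx enabled, a byte load is emitted as
       movzbl  (%rdi), %eax
   rather than
       movb    (%rdi), %al
   so the write covers the whole register and no merge with the stale
   upper bits of %eax is needed (a sketch of typical output).  */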
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
   register stalls on the Generic32 compilation setting as well.  However,
   in the current implementation partial register stalls are not eliminated
   very well: they can be introduced via subregs synthesized by combine,
   and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
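
/* For illustration, the stall this flag works around on the P6 family
   (a sketch):
       movb    $1, %al         # writes only the low byte of %eax
       addl    %eax, %ecx      # reads the full register; stalls until
                               # the partial write retires  */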
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2 | m_GENERIC)
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
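
/* For illustration, a move that hits the LCP stall on Core2/Corei7:
       movw    $0x1234, (%rax)
   The 0x66 operand-size prefix plus the 16-bit immediate changes the
   instruction length the predecoder assumes, costing several cycles
   (a sketch; see the vendor optimization manuals for details).  */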
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM
            | m_SLM | m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6))
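
/* For illustration: where use_cltd is set, sign-extending %eax into
   %edx:%eax before an idivl uses
       cltd
   instead of the equivalent pair
       movl    %eax, %edx
       sarl    $31, %edx
   (a sketch).  */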
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
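
/* For illustration (a sketch): swapping the two low bytes of %eax as
       xchgb   %ah, %al
   rather than
       rolw    $8, %ax
   which avoids the 16-bit rotate that Pentium4 handles poorly.  */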
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM
          | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
/* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
   register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
   might be considered for Generic32 if our scheme for avoiding partial
   stalls were more effective.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
/* X86_TUNE_SINGLE_POP: Enable if a single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
/* X86_TUNE_DOUBLE_POP: Enable if a double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
/* X86_TUNE_SINGLE_PUSH: Enable if a single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_K6_GEODE)
/* X86_TUNE_DOUBLE_PUSH: Enable if a double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM
            | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
   conflict here between PPro/Pentium4 based chips that treat 128bit
   SSE registers as single units and K8 based chips that divide SSE
   registers into two 64bit halves.  This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE
   units, but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over
   a 20% SPECfp regression, while enabling it on K8 brings roughly a 2.4%
   regression that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
          | m_BDVER | m_GENERIC)
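
/* For illustration: with this flag a scalar conversion is emitted as
       pxor     %xmm0, %xmm0
       cvtsi2ss %eax, %xmm0
   so the whole 128-bit register is written and renaming can proceed,
   at the cost of one extra uop on chips with 64bit SSE units (a sketch
   of the usual dependency-breaking sequence).  */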
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM | m_GENERIC)
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
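
/* For illustration: where sse_unaligned_load_optimal is set, an unaligned
   V4SF load is a single
       movups  (%rdi), %xmm0
   instead of the split form used where movups is slow:
       movlps  (%rdi), %xmm0
       movhps  8(%rdi), %xmm0
   (a sketch; the store flag makes the analogous choice for unaligned
   stores).  */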
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
          "sse_packed_single_insn_optimal", m_BDVER)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format,
   leaving the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL)
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL)
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_ATHLON_K8 | m_AMDFAM10)
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_ATHLON_K8 | m_GENERIC)
/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions, which are slow on K8.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", m_K8)
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
   HImode and SImode multiply, but 386 and 486 do HImode multiply
   faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and a memory
   operand is vector path on AMD machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is vector path on AMD
   machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVE_M1_VIA_OR: On Pentium, it is faster to load -1 via OR
   than via a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
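
/* For illustration (a sketch):
       orl     $-1, %eax       # 3 bytes: 83 c8 ff
   versus
       movl    $-1, %eax       # 5 bytes: b8 ff ff ff ff
   The OR form is smaller and, on Pentium, no slower; it does read the
   old register value, so it is avoided where that dependency hurts.  */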
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but is one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
   with a subsequent conditional jump instruction into a single
   compare-and-branch uop.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
          m_BDVER | m_CORE_ALL)
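
/* For illustration, a typical fusion candidate (a sketch):
       cmpl    %esi, %edi
       jne     .L3
   On cores with macro-fusion the pair issues and retires as a single
   compare-and-branch uop.  */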
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
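
/* For illustration (a sketch): on Atom an addition can be emitted as
       leal    (%eax,%edx), %ecx
   which executes on the AGU, rather than as a mov/add pair on the ALU;
   the backend heuristics weigh the latency of each LEA form.  */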
/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
   instructions.  */
DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM)
/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL,
          "software_prefetching_beneficial", m_K6_GEODE | m_AMD_MULTIPLE)
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
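
/* For illustration: with avx128_optimal the vectorizer emits
       vaddps  %xmm1, %xmm0, %xmm0
   rather than the 256-bit
       vaddps  %ymm1, %ymm0, %ymm0
   since these cores split 256-bit operations into two 128-bit halves
   internally (a sketch).  */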
/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
   during reassociation of integer computation.  */
DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
          m_ATOM)
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
   during reassociation of fp computation.  */
DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
          m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC)
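
/* For illustration, the shape of the reassociation this enables, as a
   C-level sketch (valid only under relaxed FP-math rules):

       double sum = ((a + b) + c) + d;    // serial: three dependent adds
       double sum = (a + b) + (c + d);    // parallel: two independent adds

   The second form halves the critical path on pipelined FP adders.  */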
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_ATOM | m_SLM)
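
/* For illustration: instead of a conditional move with a memory operand
       cmovge  (%rax), %ecx
   the value is loaded first (a sketch):
       movl    (%rax), %edx
       cmovge  %edx, %ecx
   which decodes better on Atom and Silvermont.  */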
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split a memory operand
   of an fp convert into a separate load before converting into the
   destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
          "split_mem_opnd_for_fp_converts", m_SLM)
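
/* For illustration (a sketch): on Silvermont
       cvtss2sd  (%rax), %xmm0
   is split into
       movss     (%rax), %xmm1
       cvtss2sd  %xmm1, %xmm0
   so the load and the convert issue as separate, simpler uops (register
   choice is up to the allocator).  */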