x86: Re-enable partial_reg_dependency and movx for Haswell
gcc/config/i386/x86-tune.def
/* Definitions of x86 tunable features.
   Copyright (C) 2013-2018 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.c)
      - introducing ix86_XXX_cost in i386.c
      - Stringop generation table can be built based on the test_stringop
        script (once the rest of the tuning is complete)
    - designing a scheduler model in
      - XXXX.md file
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance; how an individual
      flag is wired up is sketched right below this comment.  */
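/* As a minimal sketch of that wiring (the real definitions live in i386.c
   and i386.h): each DEF_TUNE entry below contributes one entry of
   ix86_tune_features[] and is tested in target code through a TARGET_*
   macro, e.g.
     #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
   while the string name is what -mtune-ctrl= toggles for experiments,
   e.g. -mtune-ctrl=^sse_typeless_stores turns a single flag off.  */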
/*****************************************************************************/
/* Scheduling flags. */
/*****************************************************************************/
/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_GENERIC)
/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8-bit
   value over movb.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_SKYLAKE_AVX512 | m_GENERIC)
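/* For illustration, loading an 8-bit value under this flag prefers
     movzbl  (%rsi), %eax   # writes the whole register; renames freely
   over the partial write into the low byte
     movb    (%rsi), %al    # merges with the old %rax, adding a dependency
   (hypothetical operands, shown only to make the preference concrete).  */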
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER1 | m_GENERIC)
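/* One well-known shape of the underlying hardware problem (a sketch of the
   issue only, not necessarily the exact sequence controlled by this flag):
     cvtsi2ss  %edi, %xmm0  # writes only the low element; depends on the
                            # previous contents of %xmm0
   versus clearing the destination first so the write covers the full register
     pxor      %xmm0, %xmm0
     cvtsi2ss  %edi, %xmm0  # renaming can now proceed freely
   (hypothetical operands).  */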
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format, leaving
   the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous instruction
   setting full flags.
   The flag does not affect generation of INC and DEC; that is controlled
   by X86_TUNE_USE_INCDEC.  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)
/* X86_TUNE_MOVX: Enable zero extension of integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_SKYLAKE_AVX512
          | m_HASWELL | m_GENERIC)
/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32 bit TARGET.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse ALU with a subsequent conditional
   jump instruction when the ALU instruction produces the CC flags consumed by
   the conditional jump instruction.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
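/* For illustration, the fusion candidates are adjacent pairs such as
     cmpl  %esi, %edi
     jne   .L3              # compare + jcc issuing as one fused uop
   and, for the ALU variant,
     subl  $1, %edi
     jne   .L4              # simple ALU op + jcc fusing on newer Intel cores
   (hypothetical operands and labels, shown only to make the pairing
   concrete).  */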
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */
/*****************************************************************************/
/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where multiple push operations
   cannot execute in parallel.  */
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8)
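/* For illustration, a 32-bit call sequence such as
     pushl  %ebx
     pushl  %eax
     call   foo
     addl   $8, %esp
   becomes stores into space reserved once in the prologue, roughly
     subl   $8, %esp        # once per function, in the prologue
     ...
     movl   %eax, (%esp)
     movl   %ebx, 4(%esp)
     call   foo
   (a sketch with hypothetical operands; foo stands for any callee).  */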
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)
/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)
/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)
/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
          | m_K6_GEODE)
/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)
/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
/*****************************************************************************/
/* Branch predictor tuning */
/*****************************************************************************/
/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
   of conditional jump or directly preceded by other jump instruction.
   This is important for K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2-byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
/*****************************************************************************/
/* Integer instruction selection tuning */
/*****************************************************************************/
/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
   Core2 and Nehalem have a 7 cycle stall on partial flag register updates.
   Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
   uop is output only when the values really need to be merged, which is not
   done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_GENERIC))
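/* For illustration, the relevant difference is
     incl   %eax            # leaves CF untouched -> partial flags update
     addl   $1, %eax        # writes all flags -> no flags merge needed
   which is why inc/dec are avoided on the cores listed above.  */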
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_GENERIC))
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
          | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
          | m_KNM)
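/* For illustration, on the Atom-class cores above a compound address
   computation such as
     leal   4(%edx,%eax,4), %ecx
   may instead be built from plain ALU operations, roughly
     movl   %eax, %ecx
     sall   $2, %ecx
     addl   %edx, %ecx
     addl   $4, %ecx
   (a sketch of the intent only; hypothetical operands).  */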
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
          | m_KNM | m_INTEL)
/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
          | m_BTVER | m_ZNVER1 | m_GOLDMONT | m_GOLDMONT_PLUS | m_GENERIC)
/* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
            | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
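/* For illustration, before a signed 32-bit division the sign extension of
   %eax into %edx:%eax can be done either as
     cltd                   # the one-byte form this flag enables
   or as
     movl   %eax, %edx
     sarl   $31, %edx       # the replacement used where cltd is slow
   (hypothetical surrounding code; cqto is the analogous 64-bit form).  */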
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
          | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_GENERIC)
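/* For illustration, testing a variable bit can be either
     btl    %ecx, %eax      # CF = bit %ecx of %eax, what this flag enables
     jc     .L5
   or a shift-based replacement, roughly
     movl   %eax, %edx
     shrl   %cl, %edx
     testl  $1, %edx
     jne    .L5
   (hypothetical operands and label).  */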
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
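/* For illustration, popcnt/lzcnt/tzcnt on the cores above carry a false
   dependency on their destination register, so the output is cleared first:
     xorl    %eax, %eax     # breaks the dependency on the old %eax value
     popcnt  %edi, %eax
   instead of the bare
     popcnt  %edi, %eax     # would wait for whatever last wrote %eax
   (hypothetical operands).  */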
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_GENERIC)
/*****************************************************************************/
/* 387 instruction selection tuning */
/*****************************************************************************/
/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
            | m_GOLDMONT | m_GOLDMONT_PLUS | m_GENERIC))
/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_GENERIC)
/*****************************************************************************/
/* SSE instruction selection tuning */
/*****************************************************************************/
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
   of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_SKYLAKE_AVX512 | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_BDVER | m_ZNVER1 | m_GENERIC)
/* Use packed single precision instructions where possible, i.e. movups instead
   of movupd.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER1)
/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER1
          | m_GENERIC)
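/* For illustration, the zeroing idioms in question are
     pxor   %xmm0, %xmm0    # integer-domain zero, preferred under this flag
     xorps  %xmm0, %xmm0    # FP-domain alternative avoided under this flag
   with the same architectural result; the difference is only in how the
   chips above handle the two forms.  */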
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))
/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)
/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
   fp converts to destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_INTEL)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_USE_GATHER: Use gather instructions.  */
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
          ~(m_ZNVER1 | m_GENERIC))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chain.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1)
/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/
/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))
/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC))
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_SKYLAKE_AVX512)
/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs */
/*****************************************************************************/
/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double a value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))
/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
   longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
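/* For illustration, the two encodings of a left shift by one are
     sall   %eax            # 2 bytes: the short "shift by 1" form (D1 /4)
     sall   $1, %eax        # 3 bytes: the immediate form (C1 /4 ib)
   and this flag selects the shorter one.  */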
/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))
/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   converting "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))
/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
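/* For illustration, the two ways of materializing -1 are
     orl    $-1, %eax       # 3 bytes (83 /1 ib), reads the old %eax
     movl   $-1, %eax       # 5 bytes (B8 id), no input dependency
   where the OR form is what this flag selects on the listed targets.  */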
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)
/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.
   In the current implementation the partial register stalls are not eliminated
   very well - they can be introduced via subregs synthesized by combine
   and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)
/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)
/*****************************************************************************/
/* This never worked well before. */
/*****************************************************************************/
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  It also increases the code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U)
/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U)
/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)
/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
   before a transfer of control flow out of the function.  */
DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)