gcc/config/i386/x86-tune.def

   1 /* Definitions of x86 tunable features.
   2    Copyright (C) 2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License and
  17 a copy of the GCC Runtime Library Exception along with this program;
  18 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Tuning for a given CPU XXXX consists of:
  22     - adding new CPU into:
  23         - adding PROCESSOR_XXX to processor_type (in i386.h)
  24         - possibly adding XXX into CPU attribute in i386.md
  25         - adding XXX to processor_alias_table (in i386.c)
  26     - introducing ix86_XXX_cost in i386.c
  27         - Stringop generation table can be built based on test_stringop
  28         - script (once rest of tuning is complete)
  29     - designing a scheduler model in
  30         - XXXX.md file
  31         - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
  32         - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
  33           and ix86_sched_init_global if those tricks are needed.
  34     - Tuning the flags below. Those are split into sections and each
  35       section is very roughly ordered by importance.  */
  36
  37 /*****************************************************************************/
  38 /* Scheduling flags.                                                         */
  39 /*****************************************************************************/
  40
  41 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
  42 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
  43           m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
  44           | m_AMD_MULTIPLE | m_GENERIC)
  45
  46 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
  47    on modern chips.  Prefer stores affecting whole integer register
  48    over partial stores.  For example prefer MOVZBL or MOVQ to load 8bit
  49    value over movb.  */
  50 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
  51           m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
  52           | m_GENERIC)
  53
  54 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
  55    destinations to be 128bit to allow register renaming on 128bit SSE units,
  56    but usually results in one extra microop on 64bit SSE units.
  57    Experimental results show that disabling this option on P4 brings over 20%
  58    SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
  59    that can be partly masked by careful scheduling of moves.  */
  60 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
  61           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
  62           | m_BDVER | m_GENERIC)
  63
  64 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
  65    are resolved on SSE register parts instead of whole registers, so we may
  66    maintain just lower part of scalar values in proper format leaving the
  67    upper part undefined.  */
  68 DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
  69
  70 /* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
  71    set by instructions affecting just some flags (in particular shifts).
  72    This is because Core2 resolves dependencies on whole flags register
  73    and such sequences introduce false dependency on previous instruction
  74    setting full flags.
  75
  76    This flag does not affect generation of INC and DEC that is controlled
  77    by X86_TUNE_USE_INCDEC.
  78
  79    This flag may be dropped from generic once core2-corei5 machines are
  80    rare enough.  */
  81 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
  82           m_CORE2 | m_GENERIC)
  83
  84 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
  85    partial dependencies.  */
  86 DEF_TUNE (X86_TUNE_MOVX, "movx",
  87           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE
  88           | m_AMD_MULTIPLE  | m_GENERIC)
  89
  90 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
  91    full sized loads.  */
  92 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
  93           m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
  94
  95 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
  96    with a subsequent conditional jump instruction into a single
  97    compare-and-branch uop.
  98    FIXME: revisit for generic.  */
  99 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER | m_CORE_ALL)
 100
 101 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
 102    during reassociation of integer computation.  */
 103 DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
 104           m_ATOM)
 105
 106 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
 107    during reassociation of fp computation.  */
 108 DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
 109           m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC)
 110
 111 /*****************************************************************************/
 112 /* Function prologue, epilogue and function calling sequences.               */
 113 /*****************************************************************************/
 114
 115 /* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
 116    arguments in prologue/epilogue instead of separately for each call
 117    by push/pop instructions.
 118    This increases code size by about 5% in 32bit mode, less so in 64bit mode
 119    because parameters are passed in registers.  It is considerable
 120    win for targets without stack engine that prevents multiple push operations
 121    to happen in parallel.
 122
 123    FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
 124    Bobcat and Generic.  This is because disabling it causes large
 125    regression on mgrid due to IRA limitation leading to unnecessary
 126    use of the frame pointer in 32bit mode.  */
 127 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
 128           m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
 129
 130 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
 131    considered on critical path.  */
 132 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
 133           m_PPRO | m_ATHLON_K8)
 134
 135 /* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
 136    considered on critical path.  */
 137 DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
 138           m_PPRO | m_ATHLON_K8)
 139
 140 /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 141 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
 142           m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
 143
 144 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
 145    Some chips, like 486 and Pentium, work faster with separate load
 146    and push instructions.  */
 147 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
 148           m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
 149           | m_GENERIC)
 150
 151 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
 152    over esp subtraction.  */
 153 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
 154           | m_K6_GEODE)
 155
 156 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
 157    over esp subtraction.  */
 158 DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
 159
 160 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
 161    over esp addition.  */
 162 DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
 163
 164 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
 165    over esp addition.  */
 166 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
 167
 168 /*****************************************************************************/
 169 /* Branch predictor tuning                                                   */
 170 /*****************************************************************************/
 171
 172 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
 173    instructions long.  */
 174 DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
 175
 176 /* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
 177    of conditional jump or directly preceded by other jump instruction.
 178    This is important for AMD K8-AMDFAM10 because the branch prediction
 179    architecture expects at most one jump per 2 byte window.  Failing to
 180    pad returns leads to misaligned return stack.  */
 181 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
 182           m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
 183
 184 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
 185    than 4 branch instructions in the 16 byte window.  */
 186 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
 187           m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_ATHLON_K8 | m_AMDFAM10)
 188
 189 /*****************************************************************************/
 190 /* Integer instruction selection tuning                                      */
 191 /*****************************************************************************/
 192
 193 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
 194    at -O3.  For the moment, the prefetching seems badly tuned for Intel
 195    chips.  */
 196 DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
 197           m_K6_GEODE | m_AMD_MULTIPLE)
 198
 199 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 200    on 16-bit immediate moves into memory on Core2 and Corei7.  */
 201 DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
 202
 203 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
 204    as "add mem, reg".  */
 205 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
 206
 207 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.   */
 208 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
 209           ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
 210
 211 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
 212    for DFmode copies.  */
 213 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
 214           ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM
 215           | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
 216
 217 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
 218    will impact LEA instruction selection.  */
 219 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
 220
 221 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
 222    vector path on AMD machines.
 223    FIXME: Do we need to enable this for core?  */
 224 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
 225           m_K8 | m_AMDFAM10)
 226
 227 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
 228    machines.
 229    FIXME: Do we need to enable this for core?  */
 230 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
 231           m_K8 | m_AMDFAM10)
 232
 233 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
 234    a conditional move.  */
 235 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
 236           m_ATOM | m_SLM)
 237
 238 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
 239    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
 240 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
 241
 242 /* TARGET_MISALIGNED_MOVE_STRING_PROLOGUES: Enable generation of compact
 243    prologues and epilogues by issuing misaligned moves.  This requires the
 244    target to handle misaligned moves and partial memory stalls reasonably
 245    well.
 246    FIXME: This actually may be a win on more targets than listed here.  */
 247 DEF_TUNE (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES,
 248           "misaligned_move_string_prologues",
 249           m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
 250
 251 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 252 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
 253           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
 254           | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
 255
 256 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
 257 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6))
 258
 259 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 260 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 261           m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
 262
 263 /*****************************************************************************/
 264 /* 387 instruction selection tuning                                          */
 265 /*****************************************************************************/
 266
 267 /* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
 268    integer operand.
 269    FIXME: Why is this disabled for modern chips?  */
 270 DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
 271           m_386 | m_486 | m_K6_GEODE)
 272
 273 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
 274    integer operand.  */
 275 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
 276           ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM
 277             | m_SLM | m_AMD_MULTIPLE | m_GENERIC))
 278
 279 /* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
 280 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
 281
 282 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 283 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
 284           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
 285           | m_ATHLON_K8 | m_GENERIC)
 286
 287 /*****************************************************************************/
 288 /* SSE instruction selection tuning                                          */
 289 /*****************************************************************************/
 290
 291 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
 292    instructions.  */
 293 DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM)
 294
 295 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
 296    regs instead of memory.  */
 297 DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
 298           m_CORE_ALL)
 299
 300 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
 301    of a sequence loading registers by parts.  */
 302 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 303           m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM | m_GENERIC)
 304
 305 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
 306    of a sequence storing registers by parts.  */
 307 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
 308           m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
 309
 310 /* Use packed single precision instructions where possible.  I.e. movups instead
 311    of movupd.  */
 312 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
 313           m_BDVER)
 314
 315 /* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
 316 DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
 317           m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
 318
 319 /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
 320    xorps/xorpd and other variants.  */
 321 DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
 322           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC)
 323
 324 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
 325    to SSE registers.  If disabled, the moves will be done by storing
 326    the value to memory and reloading.  */
 327 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
 328           ~(m_AMD_MULTIPLE | m_GENERIC))
 329
 330 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
 331    to integer registers.  If disabled, the moves will be done by storing
 332    the value to memory and reloading.  */
 333 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
 334           ~m_ATHLON_K8)
 335
 336 /* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
 337    to use both SSE and integer registers at the same time.
 338    FIXME: revisit importance of this for generic.  */
 339 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
 340           ~(m_AMDFAM10 | m_BDVER))
 341
 342 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
 343    fp converts to destination register.  */
 344 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
 345           m_SLM)
 346
 347 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
 348    from FP to FP.  This form of instructions avoids partial write to the
 349    destination.  */
 350 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
 351           m_AMDFAM10)
 352
 353 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
 354    from integer to FP.  */
 355 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 356
 357 /*****************************************************************************/
 358 /* AVX instruction selection tuning (some SSE flags affect AVX, too)         */
 359 /*****************************************************************************/
 360
 361 /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
 362    split.  */
 363 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
 364           ~(m_COREI7 | m_GENERIC))
 365
 366 /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
 367    split.  */
 368 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
 369           ~(m_COREI7 | m_BDVER | m_GENERIC))
 370
 371 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
 372    the auto-vectorizer.  */
 373 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
 374
 375 /*****************************************************************************/
 376 /* Historical relics: tuning flags that help specific old CPU designs        */
 377 /*****************************************************************************/
 378
 379 /* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
 380    an integer register.  */
 381 DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
 382
 383 /* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
 384    such as fsqrt, fprem, fsin, fcos, fsincos etc.
 385    Should be enabled for all targets that always have a coprocessor.  */
 386 DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
 387           ~(m_386 | m_486))
 388
 389 /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
 390    inline strlen.  This affects only -minline-all-stringops mode. By
 391    default we always dispatch to a library since our internal strlen
 392    is bad.  */
 393 DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
 394
 395 /* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
 396    longer "sal $1, reg".  */
 397 DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
 398
 399 /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
 400    of movzbl/movzwl.  */
 401 DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",  m_486 | m_PENT)
 402
 403 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
 404    and SImode multiply, but 386 and 486 do HImode multiply faster.  */
 405 DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
 406           ~(m_386 | m_486))
 407
 408 /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
 409    into 16bit/8bit when resulting sequence is shorter.  For example
 410    for "and $-65536, reg" to 16bit store of 0.  */
 411 DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
 412
 413 /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
 414    such as "add $1, mem".  */
 415 DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
 416
 417 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
 418    than a MOV.  */
 419 DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
 420
 421 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
 422    but one byte longer.  */
 423 DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
 424
 425 /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
 426    use of partial registers by renaming.  This improved performance of 16bit
 427    code where upper halves of registers are not used.  It also leads to
 428    a penalty whenever a 16bit store is followed by 32bit use.  This flag
 429    disables production of such sequences in common cases.
 430    See also X86_TUNE_HIMODE_MATH.
 431
 432    In current implementation the partial register stalls are not eliminated
 433    very well - they can be introduced via subregs synthesized by combine
 434    and can happen in caller/callee saving sequences.  */
 435 DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
 436
 437 /* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
 438    corresponding 32bit arithmetic.  */
 439 DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
 440           ~m_PPRO)
 441
 442 /* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
 443    partial register stalls on PentiumPro targets.  */
 444 DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
 445
 446 /* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
 447    On PPro this flag is meant to avoid partial register stalls.  */
 448 DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
 449
 450 /* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
 451    directly to memory.  */
 452 DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
 453
 454 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
 455 DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
 456
 457 /* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
 458    integer register.  */
 459 DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
 460
 461 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
 462    operand that cannot be represented using a modRM byte.  The XOR
 463    replacement is long decoded, so this split helps here as well.  */
 464 DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
 465
 466 /* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
 467    forms of instructions on K8 targets.  */
 468 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 469           m_K8)
 470
 471 /*****************************************************************************/
 472 /* This never worked well before.                                            */
 473 /*****************************************************************************/
 474
 475 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
 476    on simulation results.  But after P4 was made, no performance benefit
 477    was observed with branch hints.  It also increases the code size.
 478    As a result, icc never generates branch hints.  */
 479 DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
 480
 481 /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
 482 DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
 483
 484 /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
 485    arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
 486    is usually used for RISC targets.  */
 487 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)