/* Definitions of x86 tunable features.
   Copyright (C) 2013 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
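
/* Each DEF_TUNE (ENUM, NAME, SELECTOR) entry below defines one tunable:
   ENUM names the feature, NAME is the string accepted by -mtune-ctrl=,
   and SELECTOR is the mask of tuning targets for which the feature is on
   by default.  A sketch of the usual accessor pattern (as in i386.h):

       #define TARGET_USE_LEAVE \
               ix86_tune_features[X86_TUNE_USE_LEAVE]

   so backend code simply tests TARGET_USE_LEAVE.  */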
/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
   negatively, so enabling it for Generic64 seems like a good code size
   tradeoff.  We can't enable it for 32bit generic because it does not
   work well with PPro base chips.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
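
/* For illustration: with use_leave enabled, a 32-bit frame-pointer
   epilogue can be emitted as
       leave
       ret
   instead of the equivalent
       movl    %ebp, %esp
       popl    %ebp
       ret
   (a sketch; actual epilogues depend on the frame layout).  */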
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT)
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen",
          m_486 | m_PENT | m_PPRO | m_ATOM | m_SLM | m_CORE_ALL | m_K6
          | m_AMD_MULTIPLE | m_GENERIC)
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0)
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
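
/* For illustration: with movx enabled, a byte load is emitted as
       movzbl  (%rdi), %eax
   rather than
       movb    (%rdi), %al
   so the write covers the whole register and no merge with the stale
   upper bits of %eax is needed (a sketch of typical output).  */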
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
   register stalls on the Generic32 compilation setting as well.  However,
   in the current implementation partial register stalls are not eliminated
   very well: they can be introduced via subregs synthesized by combine,
   and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
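
/* For illustration, the stall this flag works around on the P6 family
   (a sketch):
       movb    $1, %al         # writes only the low byte of %eax
       addl    %eax, %ecx      # reads the full register; stalls until
                               # the partial write retires  */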
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2 | m_GENERIC)
/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
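
/* For illustration, a move that hits the LCP stall on Core2/Corei7:
       movw    $0x1234, (%rax)
   The 0x66 operand-size prefix plus the 16-bit immediate changes the
   instruction length the predecoder assumes, costing several cycles
   (a sketch; see the vendor optimization manuals for details).  */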
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM
            | m_SLM | m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", ~(m_PENT | m_ATOM | m_SLM | m_K6))
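
/* For illustration: where use_cltd is set, sign-extending %eax into
   %edx:%eax before an idivl uses
       cltd
   instead of the equivalent pair
       movl    %eax, %edx
       sarl    $31, %edx
   (a sketch).  */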
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
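
/* For illustration (a sketch): swapping the two low bytes of %eax as
       xchgb   %ah, %al
   rather than
       rolw    $8, %ax
   which avoids the 16-bit rotate that Pentium4 handles poorly.  */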
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT)
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO))
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_SLM
          | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT))
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
/* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
   register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
   might be considered for Generic32 if our scheme for avoiding partial
   stalls were more effective.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
/* X86_TUNE_SINGLE_POP: Enable if a single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO)
/* X86_TUNE_DOUBLE_POP: Enable if a double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT)
/* X86_TUNE_SINGLE_PUSH: Enable if a single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_K6_GEODE)
/* X86_TUNE_DOUBLE_PUSH: Enable if a double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE)
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM
            | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
   conflict here between PPro/Pentium4 based chips that treat 128bit
   SSE registers as single units and K8 based chips that divide SSE
   registers into two 64bit halves.  This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE
   units, but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over
   a 20% SPECfp regression, while enabling it on K8 brings roughly a 2.4%
   regression that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
          | m_BDVER | m_GENERIC)
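
/* For illustration: with this flag a scalar conversion is emitted as
       pxor     %xmm0, %xmm0
       cvtsi2ss %eax, %xmm0
   so the whole 128-bit register is written and renaming can proceed,
   at the cost of one extra uop on chips with 64bit SSE units (a sketch
   of the usual dependency-breaking sequence).  */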
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM | m_GENERIC)
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
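
/* For illustration: where sse_unaligned_load_optimal is set, an unaligned
   V4SF load is a single
       movups  (%rdi), %xmm0
   instead of the split form used where movups is slow:
       movlps  (%rdi), %xmm0
       movhps  8(%rdi), %xmm0
   (a sketch; the store flag makes the analogous choice for unaligned
   stores).  */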
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
          "sse_packed_single_insn_optimal", m_BDVER)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where types and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format,
   leaving the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL)
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL)
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE
          | m_GENERIC)
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_AMD_MULTIPLE | m_GENERIC))
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_ATHLON_K8 | m_AMDFAM10)
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_GENERIC))
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_ATOM)
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_K6_GEODE
          | m_ATHLON_K8 | m_GENERIC)
/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions, which are slow on K8.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", m_K8)
/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
   HImode and SImode multiply, but 386 and 486 do HImode multiply
   faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and a memory
   operand is vector path on AMD machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is vector path on AMD
   machines.  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC)
/* X86_TUNE_MOVE_M1_VIA_OR: On Pentium, it is faster to load -1 via OR
   than via a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT)
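
/* For illustration (a sketch):
       orl     $-1, %eax       # 3 bytes: 83 c8 ff
   versus
       movl    $-1, %eax       # 5 bytes: b8 ff ff ff ff
   The OR form is smaller and, on Pentium, no slower; it does read the
   old register value, so it is avoided where that dependency hurts.  */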
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but is one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT)
/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
   with a subsequent conditional jump instruction into a single
   compare-and-branch uop.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
          m_BDVER | m_CORE_ALL)
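
/* For illustration, a typical fusion candidate (a sketch):
       cmpl    %esi, %edi
       jne     .L3
   On cores with macro-fusion the pair issues and retires as a single
   compare-and-branch uop.  */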
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
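
/* For illustration (a sketch): on Atom an addition can be emitted as
       leal    (%eax,%edx), %ecx
   which executes on the AGU, rather than as a mov/add pair on the ALU;
   the backend heuristics weigh the latency of each LEA form.  */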
/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
   instructions.  */
DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_ATOM)
/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL,
          "software_prefetching_beneficial", m_K6_GEODE | m_AMD_MULTIPLE)
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2)
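
/* For illustration: with avx128_optimal the vectorizer emits
       vaddps  %xmm1, %xmm0, %xmm0
   rather than the 256-bit
       vaddps  %ymm1, %ymm0, %ymm0
   since these cores split 256-bit operations into two 128-bit halves
   internally (a sketch).  */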
/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
   during reassociation of integer computation.  */
DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel",
          m_ATOM)
/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
   during reassociation of fp computation.  */
DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel",
          m_ATOM | m_SLM | m_HASWELL | m_BDVER1 | m_BDVER2 | m_GENERIC)
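
/* For illustration, the shape of the reassociation this enables, as a
   C-level sketch (valid only under relaxed FP-math rules):

       double sum = ((a + b) + c) + d;    // serial: three dependent adds
       double sum = (a + b) + (c + d);    // parallel: two independent adds

   The second form halves the critical path on pipelined FP adders.  */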
/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_ATOM | m_SLM)
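
/* For illustration: instead of a conditional move with a memory operand
       cmovge  (%rax), %ecx
   the value is loaded first (a sketch):
       movl    (%rax), %edx
       cmovge  %edx, %ecx
   which decodes better on Atom and Silvermont.  */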
/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split a memory operand
   of an fp convert into a separate load before converting into the
   destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS,
          "split_mem_opnd_for_fp_converts", m_SLM)
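
/* For illustration (a sketch): on Silvermont
       cvtss2sd  (%rax), %xmm0
   is split into
       movss     (%rax), %xmm1
       cvtss2sd  %xmm1, %xmm0
   so the load and the convert issue as separate, simpler uops (register
   choice is up to the allocator).  */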