1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
  20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
  43    First of all, experiments showed that the
44 transformation is not always useful if there are only two divisions
  45    by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
  53    contain one. (If divisions don't trap, we can instead insert them
  54    elsewhere, in blocks that are common dominators
  55    of those that have the division.)
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has less than N divisions *post-dominating* it.
  61    The algorithm constructs a subset of the dominator tree, holding the
  62    blocks containing the divisions and their common dominators,
  63    and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "input.h"
93 #include "alias.h"
94 #include "symtab.h"
95 #include "tree.h"
96 #include "fold-const.h"
97 #include "predict.h"
98 #include "hard-reg-set.h"
99 #include "function.h"
100 #include "dominance.h"
101 #include "cfg.h"
102 #include "basic-block.h"
103 #include "tree-ssa-alias.h"
104 #include "internal-fn.h"
105 #include "gimple-fold.h"
106 #include "gimple-expr.h"
107 #include "is-a.h"
108 #include "gimple.h"
109 #include "gimple-iterator.h"
110 #include "gimplify.h"
111 #include "gimplify-me.h"
112 #include "stor-layout.h"
113 #include "gimple-ssa.h"
114 #include "tree-cfg.h"
115 #include "tree-phinodes.h"
116 #include "ssa-iterators.h"
117 #include "stringpool.h"
118 #include "tree-ssanames.h"
119 #include "rtl.h"
120 #include "insn-config.h"
121 #include "expmed.h"
122 #include "dojump.h"
123 #include "explow.h"
124 #include "calls.h"
125 #include "emit-rtl.h"
126 #include "varasm.h"
127 #include "stmt.h"
128 #include "expr.h"
129 #include "tree-dfa.h"
130 #include "tree-ssa.h"
131 #include "tree-pass.h"
132 #include "alloc-pool.h"
133 #include "target.h"
134 #include "gimple-pretty-print.h"
135 #include "builtins.h"
136 #include "params.h"
138 /* FIXME: RTL headers have to be included here for optabs. */
139 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
140 #include "expr.h" /* Because optabs.h wants sepops. */
141 #include "insn-codes.h"
142 #include "optabs.h"
144 /* This structure represents one basic block that either computes a
 145    division, or is a common dominator for basic blocks that compute a
146 division. */
147 struct occurrence {
148 /* The basic block represented by this structure. */
149 basic_block bb;
151 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
152 inserted in BB. */
153 tree recip_def;
155 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
156 was inserted in BB. */
157 gimple recip_def_stmt;
159 /* Pointer to a list of "struct occurrence"s for blocks dominated
160 by BB. */
161 struct occurrence *children;
 163   /* Pointer to the next "struct occurrence" in the list of blocks
164 sharing a common dominator. */
165 struct occurrence *next;
167 /* The number of divisions that are in BB before compute_merit. The
168 number of divisions that are in BB or post-dominate it after
169 compute_merit. */
170 int num_divisions;
172 /* True if the basic block has a division, false if it is a common
173 dominator for basic blocks that do. If it is false and trapping
174 math is active, BB is not a candidate for inserting a reciprocal. */
175 bool bb_has_division;
178 static struct
180 /* Number of 1.0/X ops inserted. */
181 int rdivs_inserted;
183 /* Number of 1.0/FUNC ops inserted. */
184 int rfuncs_inserted;
185 } reciprocal_stats;
187 static struct
189 /* Number of cexpi calls inserted. */
190 int inserted;
191 } sincos_stats;
193 static struct
195 /* Number of hand-written 16-bit nop / bswaps found. */
196 int found_16bit;
198 /* Number of hand-written 32-bit nop / bswaps found. */
199 int found_32bit;
201 /* Number of hand-written 64-bit nop / bswaps found. */
202 int found_64bit;
203 } nop_stats, bswap_stats;
205 static struct
207 /* Number of widening multiplication ops inserted. */
208 int widen_mults_inserted;
210 /* Number of integer multiply-and-accumulate ops inserted. */
211 int maccs_inserted;
213 /* Number of fp fused multiply-add ops inserted. */
214 int fmas_inserted;
215 } widen_mul_stats;
217 /* The instance of "struct occurrence" representing the highest
218 interesting block in the dominator tree. */
219 static struct occurrence *occ_head;
221 /* Allocation pool for getting instances of "struct occurrence". */
222 static pool_allocator<occurrence> *occ_pool;
 226 /* Allocate and return a new struct occurrence for basic block BB,
227 whose children list is headed by CHILDREN. */
228 static struct occurrence *
229 occ_new (basic_block bb, struct occurrence *children)
231 struct occurrence *occ;
233 bb->aux = occ = occ_pool->allocate ();
234 memset (occ, 0, sizeof (struct occurrence));
236 occ->bb = bb;
237 occ->children = children;
238 return occ;
242 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
243 list of "struct occurrence"s, one per basic block, having IDOM as
244 their common dominator.
246 We try to insert NEW_OCC as deep as possible in the tree, and we also
247 insert any other block that is a common dominator for BB and one
248 block already in the tree. */
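/* For example, if divisions by the same SSA name appear in blocks B1 and
   B2 whose nearest common dominator is D, the subset ends up with an
   occurrence for D whose children list holds the occurrences for B1 and
   B2 (a sketch of the structure built here, not a statement about the
   exact insertion order).  */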
250 static void
251 insert_bb (struct occurrence *new_occ, basic_block idom,
252 struct occurrence **p_head)
254 struct occurrence *occ, **p_occ;
256 for (p_occ = p_head; (occ = *p_occ) != NULL; )
258 basic_block bb = new_occ->bb, occ_bb = occ->bb;
259 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
260 if (dom == bb)
262 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
263 from its list. */
264 *p_occ = occ->next;
265 occ->next = new_occ->children;
266 new_occ->children = occ;
268 /* Try the next block (it may as well be dominated by BB). */
271 else if (dom == occ_bb)
273 /* OCC_BB dominates BB. Tail recurse to look deeper. */
274 insert_bb (new_occ, dom, &occ->children);
275 return;
278 else if (dom != idom)
280 gcc_assert (!dom->aux);
282 /* There is a dominator between IDOM and BB, add it and make
283 two children out of NEW_OCC and OCC. First, remove OCC from
284 its list. */
285 *p_occ = occ->next;
286 new_occ->next = occ;
287 occ->next = NULL;
289 /* None of the previous blocks has DOM as a dominator: if we tail
290 recursed, we would reexamine them uselessly. Just switch BB with
291 DOM, and go on looking for blocks dominated by DOM. */
292 new_occ = occ_new (dom, new_occ);
295 else
297 /* Nothing special, go on with the next element. */
298 p_occ = &occ->next;
302 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
303 new_occ->next = *p_head;
304 *p_head = new_occ;
307 /* Register that we found a division in BB. */
309 static inline void
310 register_division_in (basic_block bb)
312 struct occurrence *occ;
314 occ = (struct occurrence *) bb->aux;
315 if (!occ)
317 occ = occ_new (bb, NULL);
318 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
321 occ->bb_has_division = true;
322 occ->num_divisions++;
326 /* Compute the number of divisions that postdominate each block in OCC and
327 its children. */
329 static void
330 compute_merit (struct occurrence *occ)
332 struct occurrence *occ_child;
333 basic_block dom = occ->bb;
335 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
337 basic_block bb;
338 if (occ_child->children)
339 compute_merit (occ_child);
341 if (flag_exceptions)
342 bb = single_noncomplex_succ (dom);
343 else
344 bb = dom;
346 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
347 occ->num_divisions += occ_child->num_divisions;
352 /* Return whether USE_STMT is a floating-point division by DEF. */
353 static inline bool
354 is_division_by (gimple use_stmt, tree def)
356 return is_gimple_assign (use_stmt)
357 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
358 && gimple_assign_rhs2 (use_stmt) == def
359 /* Do not recognize x / x as valid division, as we are getting
 360             confused later by replacing all immediate uses of x in such
361 a stmt. */
362 && gimple_assign_rhs1 (use_stmt) != def;
365 /* Walk the subset of the dominator tree rooted at OCC, setting the
366 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
367 the given basic block. The field may be left NULL, of course,
368 if it is not possible or profitable to do the optimization.
 370    DEF_GSI is an iterator pointing at the statement defining DEF.
371 If RECIP_DEF is set, a dominator already has a computation that can
372 be used. */
374 static void
375 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
376 tree def, tree recip_def, int threshold)
378 tree type;
379 gassign *new_stmt;
380 gimple_stmt_iterator gsi;
381 struct occurrence *occ_child;
383 if (!recip_def
384 && (occ->bb_has_division || !flag_trapping_math)
385 && occ->num_divisions >= threshold)
387 /* Make a variable with the replacement and substitute it. */
388 type = TREE_TYPE (def);
389 recip_def = create_tmp_reg (type, "reciptmp");
390 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
391 build_one_cst (type), def);
393 if (occ->bb_has_division)
395 /* Case 1: insert before an existing division. */
396 gsi = gsi_after_labels (occ->bb);
397 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
398 gsi_next (&gsi);
400 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
402 else if (def_gsi && occ->bb == def_gsi->bb)
404 /* Case 2: insert right after the definition. Note that this will
405 never happen if the definition statement can throw, because in
406 that case the sole successor of the statement's basic block will
407 dominate all the uses as well. */
408 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
410 else
412 /* Case 3: insert in a basic block not containing defs/uses. */
413 gsi = gsi_after_labels (occ->bb);
414 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
417 reciprocal_stats.rdivs_inserted++;
419 occ->recip_def_stmt = new_stmt;
422 occ->recip_def = recip_def;
423 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
424 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
428 /* Replace the division at USE_P with a multiplication by the reciprocal, if
429 possible. */
431 static inline void
432 replace_reciprocal (use_operand_p use_p)
434 gimple use_stmt = USE_STMT (use_p);
435 basic_block bb = gimple_bb (use_stmt);
436 struct occurrence *occ = (struct occurrence *) bb->aux;
438 if (optimize_bb_for_speed_p (bb)
439 && occ->recip_def && use_stmt != occ->recip_def_stmt)
441 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
442 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
443 SET_USE (use_p, occ->recip_def);
444 fold_stmt_inplace (&gsi);
445 update_stmt (use_stmt);
450 /* Free OCC and return one more "struct occurrence" to be freed. */
452 static struct occurrence *
453 free_bb (struct occurrence *occ)
455 struct occurrence *child, *next;
457 /* First get the two pointers hanging off OCC. */
458 next = occ->next;
459 child = occ->children;
460 occ->bb->aux = NULL;
461 occ_pool->remove (occ);
463 /* Now ensure that we don't recurse unless it is necessary. */
464 if (!child)
465 return next;
466 else
468 while (next)
469 next = free_bb (next);
471 return child;
476 /* Look for floating-point divisions among DEF's uses, and try to
477 replace them by multiplications with the reciprocal. Add
478 as many statements computing the reciprocal as needed.
480 DEF must be a GIMPLE register of a floating-point type. */
482 static void
483 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
485 use_operand_p use_p;
486 imm_use_iterator use_iter;
487 struct occurrence *occ;
488 int count = 0, threshold;
490 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
492 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
494 gimple use_stmt = USE_STMT (use_p);
495 if (is_division_by (use_stmt, def))
497 register_division_in (gimple_bb (use_stmt));
498 count++;
502 /* Do the expensive part only if we can hope to optimize something. */
503 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
504 if (count >= threshold)
506 gimple use_stmt;
507 for (occ = occ_head; occ; occ = occ->next)
509 compute_merit (occ);
510 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
513 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
515 if (is_division_by (use_stmt, def))
517 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
518 replace_reciprocal (use_p);
523 for (occ = occ_head; occ; )
524 occ = free_bb (occ);
526 occ_head = NULL;
529 /* Go through all the floating-point SSA_NAMEs, and call
530 execute_cse_reciprocals_1 on each of them. */
531 namespace {
533 const pass_data pass_data_cse_reciprocals =
535 GIMPLE_PASS, /* type */
536 "recip", /* name */
537 OPTGROUP_NONE, /* optinfo_flags */
538 TV_NONE, /* tv_id */
539 PROP_ssa, /* properties_required */
540 0, /* properties_provided */
541 0, /* properties_destroyed */
542 0, /* todo_flags_start */
543 TODO_update_ssa, /* todo_flags_finish */
546 class pass_cse_reciprocals : public gimple_opt_pass
548 public:
549 pass_cse_reciprocals (gcc::context *ctxt)
550 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
553 /* opt_pass methods: */
554 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
555 virtual unsigned int execute (function *);
557 }; // class pass_cse_reciprocals
559 unsigned int
560 pass_cse_reciprocals::execute (function *fun)
562 basic_block bb;
563 tree arg;
565 occ_pool = new pool_allocator<occurrence>
566 ("dominators for recip", n_basic_blocks_for_fn (fun) / 3 + 1);
568 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
569 calculate_dominance_info (CDI_DOMINATORS);
570 calculate_dominance_info (CDI_POST_DOMINATORS);
572 #ifdef ENABLE_CHECKING
573 FOR_EACH_BB_FN (bb, fun)
574 gcc_assert (!bb->aux);
575 #endif
577 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
578 if (FLOAT_TYPE_P (TREE_TYPE (arg))
579 && is_gimple_reg (arg))
581 tree name = ssa_default_def (fun, arg);
582 if (name)
583 execute_cse_reciprocals_1 (NULL, name);
586 FOR_EACH_BB_FN (bb, fun)
588 tree def;
590 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
591 gsi_next (&gsi))
593 gphi *phi = gsi.phi ();
594 def = PHI_RESULT (phi);
595 if (! virtual_operand_p (def)
596 && FLOAT_TYPE_P (TREE_TYPE (def)))
597 execute_cse_reciprocals_1 (NULL, def);
600 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
601 gsi_next (&gsi))
603 gimple stmt = gsi_stmt (gsi);
605 if (gimple_has_lhs (stmt)
606 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
607 && FLOAT_TYPE_P (TREE_TYPE (def))
608 && TREE_CODE (def) == SSA_NAME)
609 execute_cse_reciprocals_1 (&gsi, def);
612 if (optimize_bb_for_size_p (bb))
613 continue;
615 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
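      /* For example, on a target whose targetm.builtin_reciprocal hook
	 returns a reciprocal square-root builtin for sqrt, a / sqrt (b)
	 is rewritten here as a * rsqrt (b) (illustrative name; the actual
	 builtin is target-specific).  */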
616 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
617 gsi_next (&gsi))
619 gimple stmt = gsi_stmt (gsi);
620 tree fndecl;
622 if (is_gimple_assign (stmt)
623 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
625 tree arg1 = gimple_assign_rhs2 (stmt);
626 gimple stmt1;
628 if (TREE_CODE (arg1) != SSA_NAME)
629 continue;
631 stmt1 = SSA_NAME_DEF_STMT (arg1);
633 if (is_gimple_call (stmt1)
634 && gimple_call_lhs (stmt1)
635 && (fndecl = gimple_call_fndecl (stmt1))
636 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
637 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
639 enum built_in_function code;
640 bool md_code, fail;
641 imm_use_iterator ui;
642 use_operand_p use_p;
644 code = DECL_FUNCTION_CODE (fndecl);
645 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
647 fndecl = targetm.builtin_reciprocal (code, md_code, false);
648 if (!fndecl)
649 continue;
651 /* Check that all uses of the SSA name are divisions,
652 otherwise replacing the defining statement will do
653 the wrong thing. */
654 fail = false;
655 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
657 gimple stmt2 = USE_STMT (use_p);
658 if (is_gimple_debug (stmt2))
659 continue;
660 if (!is_gimple_assign (stmt2)
661 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
662 || gimple_assign_rhs1 (stmt2) == arg1
663 || gimple_assign_rhs2 (stmt2) != arg1)
665 fail = true;
666 break;
669 if (fail)
670 continue;
672 gimple_replace_ssa_lhs (stmt1, arg1);
673 gimple_call_set_fndecl (stmt1, fndecl);
674 update_stmt (stmt1);
675 reciprocal_stats.rfuncs_inserted++;
677 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
679 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
680 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
681 fold_stmt_inplace (&gsi);
682 update_stmt (stmt);
689 statistics_counter_event (fun, "reciprocal divs inserted",
690 reciprocal_stats.rdivs_inserted);
691 statistics_counter_event (fun, "reciprocal functions inserted",
692 reciprocal_stats.rfuncs_inserted);
694 free_dominance_info (CDI_DOMINATORS);
695 free_dominance_info (CDI_POST_DOMINATORS);
696 delete occ_pool;
697 return 0;
700 } // anon namespace
702 gimple_opt_pass *
703 make_pass_cse_reciprocals (gcc::context *ctxt)
705 return new pass_cse_reciprocals (ctxt);
 708 /* Records an occurrence at statement USE_STMT in the vector of statements
 709    STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB
 710    is not yet initialized. Returns true if the occurrence was pushed on
711 the vector. Adjusts *TOP_BB to be the basic block dominating all
712 statements in the vector. */
714 static bool
715 maybe_record_sincos (vec<gimple> *stmts,
716 basic_block *top_bb, gimple use_stmt)
718 basic_block use_bb = gimple_bb (use_stmt);
719 if (*top_bb
720 && (*top_bb == use_bb
721 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
722 stmts->safe_push (use_stmt);
723 else if (!*top_bb
724 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
726 stmts->safe_push (use_stmt);
727 *top_bb = use_bb;
729 else
730 return false;
732 return true;
735 /* Look for sin, cos and cexpi calls with the same argument NAME and
736 create a single call to cexpi CSEing the result in this case.
737 We first walk over all immediate uses of the argument collecting
738 statements that we can CSE in a vector and in a second pass replace
739 the statement rhs with a REALPART or IMAGPART expression on the
740 result of the cexpi call we insert before the use statement that
741 dominates all other candidates. */
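/* As a sketch, a block containing

     s = sin (x);
     c = cos (x);

   is rewritten so that a single

     t = cexpi (x);

   is inserted and the original calls become copies:

     s = IMAGPART_EXPR <t>;
     c = REALPART_EXPR <t>;  */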
743 static bool
744 execute_cse_sincos_1 (tree name)
746 gimple_stmt_iterator gsi;
747 imm_use_iterator use_iter;
748 tree fndecl, res, type;
749 gimple def_stmt, use_stmt, stmt;
750 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
751 auto_vec<gimple> stmts;
752 basic_block top_bb = NULL;
753 int i;
754 bool cfg_changed = false;
756 type = TREE_TYPE (name);
757 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
759 if (gimple_code (use_stmt) != GIMPLE_CALL
760 || !gimple_call_lhs (use_stmt)
761 || !(fndecl = gimple_call_fndecl (use_stmt))
762 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
763 continue;
765 switch (DECL_FUNCTION_CODE (fndecl))
767 CASE_FLT_FN (BUILT_IN_COS):
768 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
769 break;
771 CASE_FLT_FN (BUILT_IN_SIN):
772 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
773 break;
775 CASE_FLT_FN (BUILT_IN_CEXPI):
776 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
777 break;
779 default:;
783 if (seen_cos + seen_sin + seen_cexpi <= 1)
784 return false;
786 /* Simply insert cexpi at the beginning of top_bb but not earlier than
787 the name def statement. */
788 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
789 if (!fndecl)
790 return false;
791 stmt = gimple_build_call (fndecl, 1, name);
792 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
793 gimple_call_set_lhs (stmt, res);
795 def_stmt = SSA_NAME_DEF_STMT (name);
796 if (!SSA_NAME_IS_DEFAULT_DEF (name)
797 && gimple_code (def_stmt) != GIMPLE_PHI
798 && gimple_bb (def_stmt) == top_bb)
800 gsi = gsi_for_stmt (def_stmt);
801 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
803 else
805 gsi = gsi_after_labels (top_bb);
806 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
808 sincos_stats.inserted++;
810 /* And adjust the recorded old call sites. */
811 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
813 tree rhs = NULL;
814 fndecl = gimple_call_fndecl (use_stmt);
816 switch (DECL_FUNCTION_CODE (fndecl))
818 CASE_FLT_FN (BUILT_IN_COS):
819 rhs = fold_build1 (REALPART_EXPR, type, res);
820 break;
822 CASE_FLT_FN (BUILT_IN_SIN):
823 rhs = fold_build1 (IMAGPART_EXPR, type, res);
824 break;
826 CASE_FLT_FN (BUILT_IN_CEXPI):
827 rhs = res;
828 break;
830 default:;
831 gcc_unreachable ();
834 /* Replace call with a copy. */
835 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
837 gsi = gsi_for_stmt (use_stmt);
838 gsi_replace (&gsi, stmt, true);
839 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
840 cfg_changed = true;
843 return cfg_changed;
846 /* To evaluate powi(x,n), the floating point value x raised to the
847 constant integer exponent n, we use a hybrid algorithm that
848 combines the "window method" with look-up tables. For an
849 introduction to exponentiation algorithms and "addition chains",
850 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
851 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
852 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
853 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
855 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
856 multiplications to inline before calling the system library's pow
857 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
858 so this default never requires calling pow, powf or powl. */
860 #ifndef POWI_MAX_MULTS
861 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
862 #endif
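/* Assuming a 64-bit HOST_WIDE_INT, the default above evaluates to 126
   multiplications, which by the 2*bits(n)-2 bound quoted above suffices
   for every representable exponent.  */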
864 /* The size of the "optimal power tree" lookup table. All
865 exponents less than this value are simply looked up in the
866 powi_table below. This threshold is also used to size the
867 cache of pseudo registers that hold intermediate results. */
868 #define POWI_TABLE_SIZE 256
 870 /* The size, in bits, of the window used in the "window method"
871 exponentiation algorithm. This is equivalent to a radix of
872 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
873 #define POWI_WINDOW_SIZE 3
875 /* The following table is an efficient representation of an
876 "optimal power tree". For each value, i, the corresponding
 877    value, j, in the table states that an optimal evaluation
878 sequence for calculating pow(x,i) can be found by evaluating
879 pow(x,j)*pow(x,i-j). An optimal power tree for the first
880 100 integers is given in Knuth's "Seminumerical algorithms". */
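/* For example, powi_table[7] is 4, i.e. an optimal way to compute
   pow (x, 7) is pow (x, 4) * pow (x, 3); the same table then splits
   pow (x, 4) and pow (x, 3) further, down to x itself.  */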
882 static const unsigned char powi_table[POWI_TABLE_SIZE] =
884 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
885 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
886 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
887 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
888 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
889 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
890 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
891 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
892 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
893 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
894 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
895 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
896 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
897 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
898 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
899 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
900 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
901 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
902 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
903 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
904 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
905 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
906 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
907 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
908 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
909 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
910 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
911 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
912 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
913 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
914 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
915 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
919 /* Return the number of multiplications required to calculate
920 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
921 subroutine of powi_cost. CACHE is an array indicating
922 which exponents have already been calculated. */
924 static int
925 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
927 /* If we've already calculated this exponent, then this evaluation
928 doesn't require any additional multiplications. */
929 if (cache[n])
930 return 0;
932 cache[n] = true;
933 return powi_lookup_cost (n - powi_table[n], cache)
934 + powi_lookup_cost (powi_table[n], cache) + 1;
937 /* Return the number of multiplications required to calculate
938 powi(x,n) for an arbitrary x, given the exponent N. This
939 function needs to be kept in sync with powi_as_mults below. */
941 static int
942 powi_cost (HOST_WIDE_INT n)
944 bool cache[POWI_TABLE_SIZE];
945 unsigned HOST_WIDE_INT digit;
946 unsigned HOST_WIDE_INT val;
947 int result;
949 if (n == 0)
950 return 0;
952 /* Ignore the reciprocal when calculating the cost. */
953 val = (n < 0) ? -n : n;
955 /* Initialize the exponent cache. */
956 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
957 cache[1] = true;
959 result = 0;
961 while (val >= POWI_TABLE_SIZE)
963 if (val & 1)
965 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
966 result += powi_lookup_cost (digit, cache)
967 + POWI_WINDOW_SIZE + 1;
968 val >>= POWI_WINDOW_SIZE;
970 else
972 val >>= 1;
973 result++;
977 return result + powi_lookup_cost (val, cache);
980 /* Recursive subroutine of powi_as_mults. This function takes the
981 array, CACHE, of already calculated exponents and an exponent N and
982 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
984 static tree
985 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
986 HOST_WIDE_INT n, tree *cache)
988 tree op0, op1, ssa_target;
989 unsigned HOST_WIDE_INT digit;
990 gassign *mult_stmt;
992 if (n < POWI_TABLE_SIZE && cache[n])
993 return cache[n];
995 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
997 if (n < POWI_TABLE_SIZE)
999 cache[n] = ssa_target;
1000 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1001 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1003 else if (n & 1)
1005 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1006 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1007 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1009 else
1011 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1012 op1 = op0;
1015 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1016 gimple_set_location (mult_stmt, loc);
1017 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1019 return ssa_target;
1022 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1023 This function needs to be kept in sync with powi_cost above. */
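/* Worked example (a sketch; the SSA temporary names are illustrative):
   powi (x, 5) expands into three multiplications, since
   powi_table[5] == 3:

     powmult_1 = x * x;                   x**2
     powmult_2 = x * powmult_1;           x**3
     powmult_3 = powmult_1 * powmult_2;   x**5

   which matches powi_cost (5) == 3.  */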
1025 static tree
1026 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1027 tree arg0, HOST_WIDE_INT n)
1029 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1030 gassign *div_stmt;
1031 tree target;
1033 if (n == 0)
1034 return build_real (type, dconst1);
1036 memset (cache, 0, sizeof (cache));
1037 cache[1] = arg0;
1039 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1040 if (n >= 0)
1041 return result;
1043 /* If the original exponent was negative, reciprocate the result. */
1044 target = make_temp_ssa_name (type, NULL, "powmult");
1045 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1046 build_real (type, dconst1), result);
1047 gimple_set_location (div_stmt, loc);
1048 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1050 return target;
1053 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1054 location info LOC. If the arguments are appropriate, create an
1055 equivalent sequence of statements prior to GSI using an optimal
1056    number of multiplications, and return an expression holding the
1057 result. */
1059 static tree
1060 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1061 tree arg0, HOST_WIDE_INT n)
1063 /* Avoid largest negative number. */
1064 if (n != -n
1065 && ((n >= -1 && n <= 2)
1066 || (optimize_function_for_speed_p (cfun)
1067 && powi_cost (n) <= POWI_MAX_MULTS)))
1068 return powi_as_mults (gsi, loc, arg0, n);
1070 return NULL_TREE;
1073 /* Build a gimple call statement that calls FN with argument ARG.
1074 Set the lhs of the call statement to a fresh SSA name. Insert the
1075 statement prior to GSI's current position, and return the fresh
1076 SSA name. */
1078 static tree
1079 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1080 tree fn, tree arg)
1082 gcall *call_stmt;
1083 tree ssa_target;
1085 call_stmt = gimple_build_call (fn, 1, arg);
1086 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1087 gimple_set_lhs (call_stmt, ssa_target);
1088 gimple_set_location (call_stmt, loc);
1089 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1091 return ssa_target;
1094 /* Build a gimple binary operation with the given CODE and arguments
1095    ARG0, ARG1, assigning the result to a new SSA name whose identifier is
1096    based on NAME. Insert the statement prior to GSI's current position, and
1097    return the fresh SSA name. */
1099 static tree
1100 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1101 const char *name, enum tree_code code,
1102 tree arg0, tree arg1)
1104 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1105 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1106 gimple_set_location (stmt, loc);
1107 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1108 return result;
1111 /* Build a gimple reference operation with the given CODE and argument
1112 ARG, assigning the result to a new SSA name of TYPE with NAME.
1113 Insert the statement prior to GSI's current position, and return
1114 the fresh SSA name. */
1116 static inline tree
1117 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1118 const char *name, enum tree_code code, tree arg0)
1120 tree result = make_temp_ssa_name (type, NULL, name);
1121 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1122 gimple_set_location (stmt, loc);
1123 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1124 return result;
1127 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1128 prior to GSI's current position, and return the fresh SSA name. */
1130 static tree
1131 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1132 tree type, tree val)
1134 tree result = make_ssa_name (type);
1135 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1136 gimple_set_location (stmt, loc);
1137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1138 return result;
1141 struct pow_synth_sqrt_info
1143 bool *factors;
1144 unsigned int deepest;
1145 unsigned int num_mults;
1148 /* Return true iff the real value C can be represented as a
1149 sum of powers of 0.5 up to N. That is:
1150 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1151 Record in INFO the various parameters of the synthesis algorithm such
1152 as the factors a[i], the maximum 0.5 power and the number of
1153 multiplications that will be required. */
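/* For example, 0.625 == 0.5 + 0.125 == 0.5**1 + 0.5**3, so with C == 0.625
   and N >= 3 this returns true with factors == {1, 0, 1}, deepest == 3 and
   num_mults == 1 (one multiplication joining sqrt (x) with
   sqrt (sqrt (sqrt (x)))).  */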
1155 bool
1156 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1157 struct pow_synth_sqrt_info *info)
1159 REAL_VALUE_TYPE factor = dconsthalf;
1160 REAL_VALUE_TYPE remainder = c;
1162 info->deepest = 0;
1163 info->num_mults = 0;
1164 memset (info->factors, 0, n * sizeof (bool));
1166 for (unsigned i = 0; i < n; i++)
1168 REAL_VALUE_TYPE res;
1170 /* If something inexact happened bail out now. */
1171 if (REAL_ARITHMETIC (res, MINUS_EXPR, remainder, factor))
1172 return false;
1174 /* We have hit zero. The number is representable as a sum
1175 of powers of 0.5. */
1176 if (REAL_VALUES_EQUAL (res, dconst0))
1178 info->factors[i] = true;
1179 info->deepest = i + 1;
1180 return true;
1182 else if (!REAL_VALUE_NEGATIVE (res))
1184 remainder = res;
1185 info->factors[i] = true;
1186 info->num_mults++;
1188 else
1189 info->factors[i] = false;
1191 REAL_ARITHMETIC (factor, MULT_EXPR, factor, dconsthalf);
1193 return false;
1196 /* Return the tree corresponding to FN being applied
1197 to ARG N times at GSI and LOC.
1198 Look up previous results from CACHE if need be.
1199 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1201 static tree
1202 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1203 tree fn, location_t loc, tree *cache)
1205 tree res = cache[n];
1206 if (!res)
1208 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1209 res = build_and_insert_call (gsi, loc, fn, prev);
1210 cache[n] = res;
1213 return res;
1216 /* Print to STREAM the repeated application of function FNAME to ARG
1217 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1218 "foo (foo (x))". */
1220 static void
1221 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1222 unsigned int n)
1224 if (n == 0)
1225 fprintf (stream, "%s", arg);
1226 else
1228 fprintf (stream, "%s (", fname);
1229 print_nested_fn (stream, fname, arg, n - 1);
1230 fprintf (stream, ")");
1234 /* Print to STREAM the fractional sequence of sqrt chains
1235 applied to ARG, described by INFO. Used for the dump file. */
1237 static void
1238 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1239 struct pow_synth_sqrt_info *info)
1241 for (unsigned int i = 0; i < info->deepest; i++)
1243 bool is_set = info->factors[i];
1244 if (is_set)
1246 print_nested_fn (stream, "sqrt", arg, i + 1);
1247 if (i != info->deepest - 1)
1248 fprintf (stream, " * ");
1253 /* Print to STREAM a representation of raising ARG to an integer
1254 power N. Used for the dump file. */
1256 static void
1257 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1259 if (n > 1)
1260 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1261 else if (n == 1)
1262 fprintf (stream, "%s", arg);
1265 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1266 square roots. Place at GSI and LOC. Limit the maximum depth
1267 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1268 result of the expanded sequence or NULL_TREE if the expansion failed.
1270 This routine assumes that ARG1 is a real number with a fractional part
1271 (the integer exponent case will have been handled earlier in
1272 gimple_expand_builtin_pow).
1274 For ARG1 > 0.0:
1275 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1276 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1277 FRAC_PART == ARG1 - WHOLE_PART:
1278 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1279 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1280 if it can be expressed as such, that is if FRAC_PART satisfies:
1281 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1282 where integer a[i] is either 0 or 1.
1284 Example:
1285 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1286 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1288 For ARG1 < 0.0 there are two approaches:
1289 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1290 is calculated as above.
1292 Example:
1293 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1294 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1296 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1297 FRAC_PART := ARG1 - WHOLE_PART
1298 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1299 Example:
1300 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1301 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1303 For ARG1 < 0.0 we choose between (A) and (B) depending on
1304 how many multiplications we'd have to do.
1305 So, for the example in (B): POW (x, -5.875), if we were to
1306 follow algorithm (A) we would produce:
1307 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1308 which contains more multiplications than approach (B).
1310 Hopefully, this approach will eliminate potentially expensive POW library
1311 calls when unsafe floating point math is enabled and allow the compiler to
1312 further optimise the multiplies, square roots and divides produced by this
1313 function. */
1315 static tree
1316 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1317 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1319 tree type = TREE_TYPE (arg0);
1320 machine_mode mode = TYPE_MODE (type);
1321 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1322 bool one_over = true;
1324 if (!sqrtfn)
1325 return NULL_TREE;
1327 if (TREE_CODE (arg1) != REAL_CST)
1328 return NULL_TREE;
1330 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1332 gcc_assert (max_depth > 0);
1333 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1335 struct pow_synth_sqrt_info synth_info;
1336 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1337 synth_info.deepest = 0;
1338 synth_info.num_mults = 0;
1340 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1341 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1343 /* The whole and fractional parts of exp. */
1344 REAL_VALUE_TYPE whole_part;
1345 REAL_VALUE_TYPE frac_part;
1347 real_floor (&whole_part, mode, &exp);
1348 REAL_ARITHMETIC (frac_part, MINUS_EXPR, exp, whole_part);
1351 REAL_VALUE_TYPE ceil_whole = dconst0;
1352 REAL_VALUE_TYPE ceil_fract = dconst0;
1354 if (neg_exp)
1356 real_ceil (&ceil_whole, mode, &exp);
1357 REAL_ARITHMETIC (ceil_fract, MINUS_EXPR, ceil_whole, exp);
1360 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1361 return NULL_TREE;
1363 /* Check whether it's more profitable to not use 1.0 / ... */
1364 if (neg_exp)
1366 struct pow_synth_sqrt_info alt_synth_info;
1367 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1368 alt_synth_info.deepest = 0;
1369 alt_synth_info.num_mults = 0;
1371 if (representable_as_half_series_p (ceil_fract, max_depth,
1372 &alt_synth_info)
1373 && alt_synth_info.deepest <= synth_info.deepest
1374 && alt_synth_info.num_mults < synth_info.num_mults)
1376 whole_part = ceil_whole;
1377 frac_part = ceil_fract;
1378 synth_info.deepest = alt_synth_info.deepest;
1379 synth_info.num_mults = alt_synth_info.num_mults;
1380 memcpy (synth_info.factors, alt_synth_info.factors,
1381 (max_depth + 1) * sizeof (bool));
1382 one_over = false;
1386 HOST_WIDE_INT n = real_to_integer (&whole_part);
1387 REAL_VALUE_TYPE cint;
1388 real_from_integer (&cint, VOIDmode, n, SIGNED);
1390 if (!real_identical (&whole_part, &cint))
1391 return NULL_TREE;
1393 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1394 return NULL_TREE;
1396 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1398 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1400 /* Calculate the integer part of the exponent. */
1401 if (n > 1)
1403 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1404 if (!integer_res)
1405 return NULL_TREE;
1408 if (dump_file)
1410 char string[64];
1412 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1413 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1415 if (neg_exp)
1417 if (one_over)
1419 fprintf (dump_file, "1.0 / (");
1420 dump_integer_part (dump_file, "x", n);
1421 if (n > 0)
1422 fprintf (dump_file, " * ");
1423 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1424 fprintf (dump_file, ")");
1426 else
1428 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1429 fprintf (dump_file, " / (");
1430 dump_integer_part (dump_file, "x", n);
1431 fprintf (dump_file, ")");
1434 else
1436 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1437 if (n > 0)
1438 fprintf (dump_file, " * ");
1439 dump_integer_part (dump_file, "x", n);
1442 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1446 tree fract_res = NULL_TREE;
1447 cache[0] = arg0;
1449 /* Calculate the fractional part of the exponent. */
1450 for (unsigned i = 0; i < synth_info.deepest; i++)
1452 if (synth_info.factors[i])
1454 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1456 if (!fract_res)
1457 fract_res = sqrt_chain;
1459 else
1460 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1461 fract_res, sqrt_chain);
1465 tree res = NULL_TREE;
1467 if (neg_exp)
1469 if (one_over)
1471 if (n > 0)
1472 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1473 fract_res, integer_res);
1474 else
1475 res = fract_res;
1477 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1478 build_real (type, dconst1), res);
1480 else
1482 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1483 fract_res, integer_res);
1486 else
1487 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1488 fract_res, integer_res);
1489 return res;
1492 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1493 with location info LOC. If possible, create an equivalent and
1494 less expensive sequence of statements prior to GSI, and return an
1495    expression holding the result. */
1497 static tree
1498 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1499 tree arg0, tree arg1)
1501 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1502 REAL_VALUE_TYPE c2, dconst3;
1503 HOST_WIDE_INT n;
1504 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1505 machine_mode mode;
1506 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1507 bool hw_sqrt_exists, c_is_int, c2_is_int;
1509 dconst1_4 = dconst1;
1510 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1512 /* If the exponent isn't a constant, there's nothing of interest
1513 to be done. */
1514 if (TREE_CODE (arg1) != REAL_CST)
1515 return NULL_TREE;
1517 /* If the exponent is equivalent to an integer, expand to an optimal
1518 multiplication sequence when profitable. */
1519 c = TREE_REAL_CST (arg1);
1520 n = real_to_integer (&c);
1521 real_from_integer (&cint, VOIDmode, n, SIGNED);
1522 c_is_int = real_identical (&c, &cint);
1524 if (c_is_int
1525 && ((n >= -1 && n <= 2)
1526 || (flag_unsafe_math_optimizations
1527 && speed_p
1528 && powi_cost (n) <= POWI_MAX_MULTS)))
1529 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1531 /* Attempt various optimizations using sqrt and cbrt. */
1532 type = TREE_TYPE (arg0);
1533 mode = TYPE_MODE (type);
1534 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1536 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1537 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1538 sqrt(-0) = -0. */
1539 if (sqrtfn
1540 && REAL_VALUES_EQUAL (c, dconsthalf)
1541 && !HONOR_SIGNED_ZEROS (mode))
1542 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1544 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1546 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1547 optimizations since 1./3. is not exactly representable. If x
1548 is negative and finite, the correct value of pow(x,1./3.) is
1549 a NaN with the "invalid" exception raised, because the value
1550 of 1./3. actually has an even denominator. The correct value
1551 of cbrt(x) is a negative real value. */
1552 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1553 dconst1_3 = real_value_truncate (mode, dconst_third ());
1555 if (flag_unsafe_math_optimizations
1556 && cbrtfn
1557 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1558 && REAL_VALUES_EQUAL (c, dconst1_3))
1559 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1561 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1562 if we don't have a hardware sqrt insn. */
1563 dconst1_6 = dconst1_3;
1564 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1566 if (flag_unsafe_math_optimizations
1567 && sqrtfn
1568 && cbrtfn
1569 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1570 && speed_p
1571 && hw_sqrt_exists
1572 && REAL_VALUES_EQUAL (c, dconst1_6))
1574 /* sqrt(x) */
1575 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1577 /* cbrt(sqrt(x)) */
1578 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1582 /* Attempt to expand the POW as a product of square root chains.
1583      Expand the 0.25 case even when optimising for size. */
1584 if (flag_unsafe_math_optimizations
1585 && sqrtfn
1586 && hw_sqrt_exists
1587 && (speed_p || REAL_VALUES_EQUAL (c, dconst1_4))
1588 && !HONOR_SIGNED_ZEROS (mode))
1590 unsigned int max_depth = speed_p
1591 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1592 : 2;
1594 tree expand_with_sqrts
1595 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1597 if (expand_with_sqrts)
1598 return expand_with_sqrts;
1601 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1602 n = real_to_integer (&c2);
1603 real_from_integer (&cint, VOIDmode, n, SIGNED);
1604 c2_is_int = real_identical (&c2, &cint);
1606 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1608 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1609 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1611 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1612 different from pow(x, 1./3.) due to rounding and behavior with
1613 negative x, we need to constrain this transformation to unsafe
1614 math and positive x or finite math. */
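  /* For instance, pow (x, 5./3.) has n == 5 and becomes
     powi (x, 1) * powi (cbrt (x), 2), i.e. x * cbrt (x) * cbrt (x).  */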
1615 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1616 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1617 real_round (&c2, mode, &c2);
1618 n = real_to_integer (&c2);
1619 real_from_integer (&cint, VOIDmode, n, SIGNED);
1620 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1621 real_convert (&c2, mode, &c2);
1623 if (flag_unsafe_math_optimizations
1624 && cbrtfn
1625 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1626 && real_identical (&c2, &c)
1627 && !c2_is_int
1628 && optimize_function_for_speed_p (cfun)
1629 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1631 tree powi_x_ndiv3 = NULL_TREE;
1633 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1634 possible or profitable, give up. Skip the degenerate case when
1635 abs(n) < 3, where the result is always 1. */
1636 if (absu_hwi (n) >= 3)
1638 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1639 abs_hwi (n / 3));
1640 if (!powi_x_ndiv3)
1641 return NULL_TREE;
1644 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1645 as that creates an unnecessary variable. Instead, just produce
1646 either cbrt(x) or cbrt(x) * cbrt(x). */
1647 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1649 if (absu_hwi (n) % 3 == 1)
1650 powi_cbrt_x = cbrt_x;
1651 else
1652 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1653 cbrt_x, cbrt_x);
1655 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1656 if (absu_hwi (n) < 3)
1657 result = powi_cbrt_x;
1658 else
1659 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1660 powi_x_ndiv3, powi_cbrt_x);
1662 /* If n is negative, reciprocate the result. */
1663 if (n < 0)
1664 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1665 build_real (type, dconst1), result);
1667 return result;
1670 /* No optimizations succeeded. */
1671 return NULL_TREE;
1674 /* ARG is the argument to a cabs builtin call in GSI with location info
1675 LOC. Create a sequence of statements prior to GSI that calculates
1676 sqrt(R*R + I*I), where R and I are the real and imaginary components
1677 of ARG, respectively. Return an expression holding the result. */
1679 static tree
1680 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1682 tree real_part, imag_part, addend1, addend2, sum, result;
1683 tree type = TREE_TYPE (TREE_TYPE (arg));
1684 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1685 machine_mode mode = TYPE_MODE (type);
1687 if (!flag_unsafe_math_optimizations
1688 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1689 || !sqrtfn
1690 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1691 return NULL_TREE;
1693 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1694 REALPART_EXPR, arg);
1695 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1696 real_part, real_part);
1697 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1698 IMAGPART_EXPR, arg);
1699 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1700 imag_part, imag_part);
1701 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1702 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1704 return result;
1707 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1708 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1709 an optimal number of multiplies, when n is a constant. */
1711 namespace {
1713 const pass_data pass_data_cse_sincos =
1715 GIMPLE_PASS, /* type */
1716 "sincos", /* name */
1717 OPTGROUP_NONE, /* optinfo_flags */
1718 TV_NONE, /* tv_id */
1719 PROP_ssa, /* properties_required */
1720 0, /* properties_provided */
1721 0, /* properties_destroyed */
1722 0, /* todo_flags_start */
1723 TODO_update_ssa, /* todo_flags_finish */
1726 class pass_cse_sincos : public gimple_opt_pass
1728 public:
1729 pass_cse_sincos (gcc::context *ctxt)
1730 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1733 /* opt_pass methods: */
1734 virtual bool gate (function *)
1736 /* We no longer require either sincos or cexp, since powi expansion
1737 piggybacks on this pass. */
1738 return optimize;
1741 virtual unsigned int execute (function *);
1743 }; // class pass_cse_sincos
1745 unsigned int
1746 pass_cse_sincos::execute (function *fun)
1748 basic_block bb;
1749 bool cfg_changed = false;
1751 calculate_dominance_info (CDI_DOMINATORS);
1752 memset (&sincos_stats, 0, sizeof (sincos_stats));
1754 FOR_EACH_BB_FN (bb, fun)
1756 gimple_stmt_iterator gsi;
1757 bool cleanup_eh = false;
1759 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1761 gimple stmt = gsi_stmt (gsi);
1762 tree fndecl;
1764          /* Only the last stmt in a bb could throw; no need to call
1765 gimple_purge_dead_eh_edges if we change something in the middle
1766 of a basic block. */
1767 cleanup_eh = false;
1769 if (is_gimple_call (stmt)
1770 && gimple_call_lhs (stmt)
1771 && (fndecl = gimple_call_fndecl (stmt))
1772 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1774 tree arg, arg0, arg1, result;
1775 HOST_WIDE_INT n;
1776 location_t loc;
1778 switch (DECL_FUNCTION_CODE (fndecl))
1780 CASE_FLT_FN (BUILT_IN_COS):
1781 CASE_FLT_FN (BUILT_IN_SIN):
1782 CASE_FLT_FN (BUILT_IN_CEXPI):
1783 /* Make sure we have either sincos or cexp. */
1784 if (!targetm.libc_has_function (function_c99_math_complex)
1785 && !targetm.libc_has_function (function_sincos))
1786 break;
1788 arg = gimple_call_arg (stmt, 0);
1789 if (TREE_CODE (arg) == SSA_NAME)
1790 cfg_changed |= execute_cse_sincos_1 (arg);
1791 break;
1793 CASE_FLT_FN (BUILT_IN_POW):
1794 arg0 = gimple_call_arg (stmt, 0);
1795 arg1 = gimple_call_arg (stmt, 1);
1797 loc = gimple_location (stmt);
1798 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1800 if (result)
1802 tree lhs = gimple_get_lhs (stmt);
1803 gassign *new_stmt = gimple_build_assign (lhs, result);
1804 gimple_set_location (new_stmt, loc);
1805 unlink_stmt_vdef (stmt);
1806 gsi_replace (&gsi, new_stmt, true);
1807 cleanup_eh = true;
1808 if (gimple_vdef (stmt))
1809 release_ssa_name (gimple_vdef (stmt));
1811 break;
1813 CASE_FLT_FN (BUILT_IN_POWI):
1814 arg0 = gimple_call_arg (stmt, 0);
1815 arg1 = gimple_call_arg (stmt, 1);
1816 loc = gimple_location (stmt);
1818 if (real_minus_onep (arg0))
1820 tree t0, t1, cond, one, minus_one;
1821 gassign *stmt;
1823 t0 = TREE_TYPE (arg0);
1824 t1 = TREE_TYPE (arg1);
1825 one = build_real (t0, dconst1);
1826 minus_one = build_real (t0, dconstm1);
1828 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1829 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1830 arg1, build_int_cst (t1, 1));
1831 gimple_set_location (stmt, loc);
1832 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1834 result = make_temp_ssa_name (t0, NULL, "powi");
1835 stmt = gimple_build_assign (result, COND_EXPR, cond,
1836 minus_one, one);
1837 gimple_set_location (stmt, loc);
1838 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1840 else
1842 if (!tree_fits_shwi_p (arg1))
1843 break;
1845 n = tree_to_shwi (arg1);
1846 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1849 if (result)
1851 tree lhs = gimple_get_lhs (stmt);
1852 gassign *new_stmt = gimple_build_assign (lhs, result);
1853 gimple_set_location (new_stmt, loc);
1854 unlink_stmt_vdef (stmt);
1855 gsi_replace (&gsi, new_stmt, true);
1856 cleanup_eh = true;
1857 if (gimple_vdef (stmt))
1858 release_ssa_name (gimple_vdef (stmt));
1860 break;
1862 CASE_FLT_FN (BUILT_IN_CABS):
1863 arg0 = gimple_call_arg (stmt, 0);
1864 loc = gimple_location (stmt);
1865 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1867 if (result)
1869 tree lhs = gimple_get_lhs (stmt);
1870 gassign *new_stmt = gimple_build_assign (lhs, result);
1871 gimple_set_location (new_stmt, loc);
1872 unlink_stmt_vdef (stmt);
1873 gsi_replace (&gsi, new_stmt, true);
1874 cleanup_eh = true;
1875 if (gimple_vdef (stmt))
1876 release_ssa_name (gimple_vdef (stmt));
1878 break;
1880 default:;
1884 if (cleanup_eh)
1885 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1888 statistics_counter_event (fun, "sincos statements inserted",
1889 sincos_stats.inserted);
1891 free_dominance_info (CDI_DOMINATORS);
1892 return cfg_changed ? TODO_cleanup_cfg : 0;
1895 } // anon namespace
1897 gimple_opt_pass *
1898 make_pass_cse_sincos (gcc::context *ctxt)
1900 return new pass_cse_sincos (ctxt);
1903 /* A symbolic number is used to detect byte permutation and selection
1904 patterns. Therefore the field N contains an artificial number
1905 consisting of octet sized markers:
1907 0 - target byte has the value 0
1908 FF - target byte has an unknown value (e.g. due to sign extension)
1909 1..size - marker value is the target byte index plus one.
1911 To detect permutations on memory sources (arrays and structures), a symbolic
1912 number is also associated with a base address (the array or structure the load
1913 is made from), an offset from the base address and a range which gives the
1914 difference between the highest and lowest accessed memory location needed to
1915 make up such a symbolic number. The range is thus different from size, which
1916 reflects the size of the type of the current expression. Note that for a non
1917 memory source, range holds the same value as size.
1919 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1920 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1921 still have a size of 2 but this time a range of 1. */
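/* As a hedged illustration (assuming BITS_PER_MARKER == 8 as defined below):
   for a plain 32-bit load the markers read 1, 2, 3, 4 from lowest to highest
   order byte, i.e. N is 0x04030201, while a full byte swap of that load gives
   0x01020304.  These match the CMPNOP and CMPXCHG values defined below once
   reduced to a 4-byte size.  */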
1923 struct symbolic_number {
1924 uint64_t n;
1925 tree type;
1926 tree base_addr;
1927 tree offset;
1928 HOST_WIDE_INT bytepos;
1929 tree alias_set;
1930 tree vuse;
1931 unsigned HOST_WIDE_INT range;
1934 #define BITS_PER_MARKER 8
1935 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1936 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1937 #define HEAD_MARKER(n, size) \
1938 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1940 /* The number which the find_bswap_or_nop_1 result should match in
1941 order to have a nop. The number is masked according to the size of
1942 the symbolic number before using it. */
1943 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1944 (uint64_t)0x08070605 << 32 | 0x04030201)
1946 /* The number which the find_bswap_or_nop_1 result should match in
1947 order to have a byte swap. The number is shifted to the right
1948 according to the size of the symbolic number before using it. */
1949 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1950 (uint64_t)0x01020304 << 32 | 0x05060708)
1952 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1953 number N. Return false if the requested operation is not permitted
1954 on a symbolic number. */
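/* For instance (an illustrative sketch only): with a 4-byte symbolic number
   0x04030201, an LSHIFT_EXPR by 8 bits yields 0x03020100, i.e. the lowest
   target byte becomes known to be zero, mirroring what "x << 8" does to the
   bytes of x.  */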
1956 static inline bool
1957 do_shift_rotate (enum tree_code code,
1958 struct symbolic_number *n,
1959 int count)
1961 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1962 unsigned head_marker;
1964 if (count % BITS_PER_UNIT != 0)
1965 return false;
1966 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1968 /* Zero out the extra bits of N in order to avoid them being shifted
1969 into the significant bits. */
1970 if (size < 64 / BITS_PER_MARKER)
1971 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1973 switch (code)
1975 case LSHIFT_EXPR:
1976 n->n <<= count;
1977 break;
1978 case RSHIFT_EXPR:
1979 head_marker = HEAD_MARKER (n->n, size);
1980 n->n >>= count;
1981 /* Arithmetic shift of signed type: result is dependent on the value. */
1982 if (!TYPE_UNSIGNED (n->type) && head_marker)
1983 for (i = 0; i < count / BITS_PER_MARKER; i++)
1984 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1985 << ((size - 1 - i) * BITS_PER_MARKER);
1986 break;
1987 case LROTATE_EXPR:
1988 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1989 break;
1990 case RROTATE_EXPR:
1991 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1992 break;
1993 default:
1994 return false;
1996 /* Zero unused bits for size. */
1997 if (size < 64 / BITS_PER_MARKER)
1998 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1999 return true;
2002 /* Perform sanity checking for the symbolic number N and the gimple
2003 statement STMT. */
2005 static inline bool
2006 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
2008 tree lhs_type;
2010 lhs_type = gimple_expr_type (stmt);
2012 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2013 return false;
2015 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2016 return false;
2018 return true;
2021 /* Initialize the symbolic number N for the bswap pass from the base element
2022 SRC manipulated by the bitwise OR expression. */
2024 static bool
2025 init_symbolic_number (struct symbolic_number *n, tree src)
2027 int size;
2029 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2031 /* Set up the symbolic number N by setting each byte marker to a value between
2032 1 and the byte size of rhs1. The highest order byte is set to that size and
2033 the lowest order byte to 1. */
2034 n->type = TREE_TYPE (src);
2035 size = TYPE_PRECISION (n->type);
2036 if (size % BITS_PER_UNIT != 0)
2037 return false;
2038 size /= BITS_PER_UNIT;
2039 if (size > 64 / BITS_PER_MARKER)
2040 return false;
2041 n->range = size;
2042 n->n = CMPNOP;
2044 if (size < 64 / BITS_PER_MARKER)
2045 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2047 return true;
2050 /* Check whether STMT might be a byte swap or a nop from a memory source and
2051 return the answer. If so, REF is that memory source; the base of the memory
2052 area accessed and the offset of the access from that base are recorded in N. */
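/* For example (illustrative only): for a load from a[3], where A is an array
   of char, BASE_ADDR records A, N->bytepos is 3 and N->range is the size in
   bytes of the loaded type.  */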
2054 bool
2055 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
2057 /* A leaf node is an array or component ref. Memorize its base and
2058 offset from the base to compare to other such leaf nodes. */
2059 HOST_WIDE_INT bitsize, bitpos;
2060 machine_mode mode;
2061 int unsignedp, reversep, volatilep;
2062 tree offset, base_addr;
2064 /* Not prepared to handle PDP endian. */
2065 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2066 return false;
2068 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2069 return false;
2071 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2072 &unsignedp, &reversep, &volatilep, false);
2074 if (TREE_CODE (base_addr) == MEM_REF)
2076 offset_int bit_offset = 0;
2077 tree off = TREE_OPERAND (base_addr, 1);
2079 if (!integer_zerop (off))
2081 offset_int boff, coff = mem_ref_offset (base_addr);
2082 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2083 bit_offset += boff;
2086 base_addr = TREE_OPERAND (base_addr, 0);
2088 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2089 if (wi::neg_p (bit_offset))
2091 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2092 offset_int tem = bit_offset.and_not (mask);
2093 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2094 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2095 bit_offset -= tem;
2096 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2097 if (offset)
2098 offset = size_binop (PLUS_EXPR, offset,
2099 wide_int_to_tree (sizetype, tem));
2100 else
2101 offset = wide_int_to_tree (sizetype, tem);
2104 bitpos += bit_offset.to_shwi ();
2107 if (bitpos % BITS_PER_UNIT)
2108 return false;
2109 if (bitsize % BITS_PER_UNIT)
2110 return false;
2111 if (reversep)
2112 return false;
2114 if (!init_symbolic_number (n, ref))
2115 return false;
2116 n->base_addr = base_addr;
2117 n->offset = offset;
2118 n->bytepos = bitpos / BITS_PER_UNIT;
2119 n->alias_set = reference_alias_ptr_type (ref);
2120 n->vuse = gimple_vuse (stmt);
2121 return true;
2124 /* Compute the symbolic number N representing the result of a bitwise OR on the
2125 two symbolic numbers N1 and N2 whose source statements are respectively
2126 SOURCE_STMT1 and SOURCE_STMT2. */
2128 static gimple
2129 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
2130 gimple source_stmt2, struct symbolic_number *n2,
2131 struct symbolic_number *n)
2133 int i, size;
2134 uint64_t mask;
2135 gimple source_stmt;
2136 struct symbolic_number *n_start;
2138 /* Sources are different, cancel bswap if they are not memory locations with
2139 the same base (array, structure, ...). */
2140 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2142 int64_t inc;
2143 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2144 struct symbolic_number *toinc_n_ptr, *n_end;
2146 if (!n1->base_addr || !n2->base_addr
2147 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2148 return NULL;
2150 if (!n1->offset != !n2->offset
2151 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2152 return NULL;
2154 if (n1->bytepos < n2->bytepos)
2156 n_start = n1;
2157 start_sub = n2->bytepos - n1->bytepos;
2158 source_stmt = source_stmt1;
2160 else
2162 n_start = n2;
2163 start_sub = n1->bytepos - n2->bytepos;
2164 source_stmt = source_stmt2;
2167 /* Find the highest address at which a load is performed and
2168 compute related info. */
2169 end1 = n1->bytepos + (n1->range - 1);
2170 end2 = n2->bytepos + (n2->range - 1);
2171 if (end1 < end2)
2173 end = end2;
2174 end_sub = end2 - end1;
2176 else
2178 end = end1;
2179 end_sub = end1 - end2;
2181 n_end = (end2 > end1) ? n2 : n1;
2183 /* Find the symbolic number whose lsb is the most significant. */
2184 if (BYTES_BIG_ENDIAN)
2185 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2186 else
2187 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2189 n->range = end - n_start->bytepos + 1;
2191 /* Check that the range of memory covered can be represented by
2192 a symbolic number. */
2193 if (n->range > 64 / BITS_PER_MARKER)
2194 return NULL;
2196 /* Reinterpret the byte marks in the symbolic number holding the value of
2197 bigger weight, according to target endianness. */
2198 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2199 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2200 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2202 unsigned marker
2203 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2204 if (marker && marker != MARKER_BYTE_UNKNOWN)
2205 toinc_n_ptr->n += inc;
2208 else
2210 n->range = n1->range;
2211 n_start = n1;
2212 source_stmt = source_stmt1;
2215 if (!n1->alias_set
2216 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2217 n->alias_set = n1->alias_set;
2218 else
2219 n->alias_set = ptr_type_node;
2220 n->vuse = n_start->vuse;
2221 n->base_addr = n_start->base_addr;
2222 n->offset = n_start->offset;
2223 n->bytepos = n_start->bytepos;
2224 n->type = n_start->type;
2225 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2227 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2229 uint64_t masked1, masked2;
2231 masked1 = n1->n & mask;
2232 masked2 = n2->n & mask;
2233 if (masked1 && masked2 && masked1 != masked2)
2234 return NULL;
2236 n->n = n1->n | n2->n;
2238 return source_stmt;
2241 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2242 the operation given by the rhs of STMT on the result. If the operation
2243 could successfully be executed the function returns a gimple stmt whose
2244 rhs's first tree is the expression of the source operand; otherwise it
2245 returns NULL. */
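/* A hedged example of the kind of expression tree this recursion walks
   (hypothetical source, a 16-bit byte swap of X):

     ((x & 0xff) << 8) | ((x >> 8) & 0xff)

   Each BIT_AND_EXPR, shift and conversion updates the byte markers in N,
   while the BIT_IOR_EXPR at the root merges the two symbolic numbers via
   perform_symbolic_merge.  */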
2247 static gimple
2248 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
2250 enum tree_code code;
2251 tree rhs1, rhs2 = NULL;
2252 gimple rhs1_stmt, rhs2_stmt, source_stmt1;
2253 enum gimple_rhs_class rhs_class;
2255 if (!limit || !is_gimple_assign (stmt))
2256 return NULL;
2258 rhs1 = gimple_assign_rhs1 (stmt);
2260 if (find_bswap_or_nop_load (stmt, rhs1, n))
2261 return stmt;
2263 if (TREE_CODE (rhs1) != SSA_NAME)
2264 return NULL;
2266 code = gimple_assign_rhs_code (stmt);
2267 rhs_class = gimple_assign_rhs_class (stmt);
2268 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2270 if (rhs_class == GIMPLE_BINARY_RHS)
2271 rhs2 = gimple_assign_rhs2 (stmt);
2273 /* Handle unary rhs and binary rhs with integer constants as second
2274 operand. */
2276 if (rhs_class == GIMPLE_UNARY_RHS
2277 || (rhs_class == GIMPLE_BINARY_RHS
2278 && TREE_CODE (rhs2) == INTEGER_CST))
2280 if (code != BIT_AND_EXPR
2281 && code != LSHIFT_EXPR
2282 && code != RSHIFT_EXPR
2283 && code != LROTATE_EXPR
2284 && code != RROTATE_EXPR
2285 && !CONVERT_EXPR_CODE_P (code))
2286 return NULL;
2288 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2290 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2291 we have to initialize the symbolic number. */
2292 if (!source_stmt1)
2294 if (gimple_assign_load_p (stmt)
2295 || !init_symbolic_number (n, rhs1))
2296 return NULL;
2297 source_stmt1 = stmt;
2300 switch (code)
2302 case BIT_AND_EXPR:
2304 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2305 uint64_t val = int_cst_value (rhs2), mask = 0;
2306 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2308 /* Only constants masking full bytes are allowed. */
2309 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2310 if ((val & tmp) != 0 && (val & tmp) != tmp)
2311 return NULL;
2312 else if (val & tmp)
2313 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2315 n->n &= mask;
2317 break;
2318 case LSHIFT_EXPR:
2319 case RSHIFT_EXPR:
2320 case LROTATE_EXPR:
2321 case RROTATE_EXPR:
2322 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2323 return NULL;
2324 break;
2325 CASE_CONVERT:
2327 int i, type_size, old_type_size;
2328 tree type;
2330 type = gimple_expr_type (stmt);
2331 type_size = TYPE_PRECISION (type);
2332 if (type_size % BITS_PER_UNIT != 0)
2333 return NULL;
2334 type_size /= BITS_PER_UNIT;
2335 if (type_size > 64 / BITS_PER_MARKER)
2336 return NULL;
2338 /* Sign extension: result is dependent on the value. */
2339 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2340 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2341 && HEAD_MARKER (n->n, old_type_size))
2342 for (i = 0; i < type_size - old_type_size; i++)
2343 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2344 << ((type_size - 1 - i) * BITS_PER_MARKER);
2346 if (type_size < 64 / BITS_PER_MARKER)
2348 /* If STMT casts to a smaller type mask out the bits not
2349 belonging to the target type. */
2350 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2352 n->type = type;
2353 if (!n->base_addr)
2354 n->range = type_size;
2356 break;
2357 default:
2358 return NULL;
2360 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2363 /* Handle binary rhs. */
2365 if (rhs_class == GIMPLE_BINARY_RHS)
2367 struct symbolic_number n1, n2;
2368 gimple source_stmt, source_stmt2;
2370 if (code != BIT_IOR_EXPR)
2371 return NULL;
2373 if (TREE_CODE (rhs2) != SSA_NAME)
2374 return NULL;
2376 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2378 switch (code)
2380 case BIT_IOR_EXPR:
2381 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2383 if (!source_stmt1)
2384 return NULL;
2386 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2388 if (!source_stmt2)
2389 return NULL;
2391 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2392 return NULL;
2394 if (!n1.vuse != !n2.vuse
2395 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2396 return NULL;
2398 source_stmt
2399 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2401 if (!source_stmt)
2402 return NULL;
2404 if (!verify_symbolic_number_p (n, stmt))
2405 return NULL;
2407 break;
2408 default:
2409 return NULL;
2411 return source_stmt;
2413 return NULL;
2416 /* Check whether STMT completes a bswap implementation or a read in a given
2417 endianness consisting of ORs, SHIFTs and ANDs and set *BSWAP
2418 accordingly. It also sets N to represent the kind of operations
2419 performed: size of the resulting expression and whether it works on
2420 a memory source, and if so its alias set and vuse. Finally, the
2421 function returns a stmt whose rhs's first tree is the source
2422 expression. */
2424 static gimple
2425 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2427 /* The number which the find_bswap_or_nop_1 result should match in order
2428 to have a full byte swap. The number is shifted to the right
2429 according to the size of the symbolic number before using it. */
2430 uint64_t cmpxchg = CMPXCHG;
2431 uint64_t cmpnop = CMPNOP;
2433 gimple source_stmt;
2434 int limit;
2436 /* The last parameter determines the search depth limit. It usually
2437 correlates directly to the number n of bytes to be touched. We
2438 increase that number by log2(n) + 1 here in order to also
2439 cover signed -> unsigned conversions of the src operand as can be seen
2440 in libgcc, and for the initial shift/and operation of the src operand. */
2441 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2442 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2443 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2445 if (!source_stmt)
2446 return NULL;
2448 /* Find real size of result (highest non-zero byte). */
2449 if (n->base_addr)
2451 int rsize;
2452 uint64_t tmpn;
2454 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2455 n->range = rsize;
2458 /* Zero out the extra bits of N and CMP*. */
2459 if (n->range < (int) sizeof (int64_t))
2461 uint64_t mask;
2463 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2464 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2465 cmpnop &= mask;
2468 /* A complete byte swap should make the symbolic number start with
2469 the largest digit in the highest order byte. An unchanged symbolic
2470 number indicates a read with the same endianness as the target architecture. */
2471 if (n->n == cmpnop)
2472 *bswap = false;
2473 else if (n->n == cmpxchg)
2474 *bswap = true;
2475 else
2476 return NULL;
2478 /* Useless bit manipulation performed by code. */
2479 if (!n->base_addr && n->n == cmpnop)
2480 return NULL;
2482 n->range *= BITS_PER_UNIT;
2483 return source_stmt;
2486 namespace {
2488 const pass_data pass_data_optimize_bswap =
2490 GIMPLE_PASS, /* type */
2491 "bswap", /* name */
2492 OPTGROUP_NONE, /* optinfo_flags */
2493 TV_NONE, /* tv_id */
2494 PROP_ssa, /* properties_required */
2495 0, /* properties_provided */
2496 0, /* properties_destroyed */
2497 0, /* todo_flags_start */
2498 0, /* todo_flags_finish */
2501 class pass_optimize_bswap : public gimple_opt_pass
2503 public:
2504 pass_optimize_bswap (gcc::context *ctxt)
2505 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2508 /* opt_pass methods: */
2509 virtual bool gate (function *)
2511 return flag_expensive_optimizations && optimize;
2514 virtual unsigned int execute (function *);
2516 }; // class pass_optimize_bswap
2518 /* Perform the bswap optimization: replace the expression computed in the rhs
2519 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2520 Which of these alternatives replaces the rhs is given by N->base_addr (non
2521 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2522 load to perform are also given in N while the builtin bswap to invoke is
2523 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of
2524 the load statements used to construct the rhs in CUR_STMT and N->range gives
2525 the size of the rhs expression for maintaining some statistics.
2527 Note that if the replacement involves a load, CUR_STMT is moved just after
2528 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2529 changing basic block. */
2531 static bool
2532 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2533 tree load_type, struct symbolic_number *n, bool bswap)
2535 gimple_stmt_iterator gsi;
2536 tree src, tmp, tgt;
2537 gimple bswap_stmt;
2539 gsi = gsi_for_stmt (cur_stmt);
2540 src = gimple_assign_rhs1 (src_stmt);
2541 tgt = gimple_assign_lhs (cur_stmt);
2543 /* Need to load the value from memory first. */
2544 if (n->base_addr)
2546 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2547 tree addr_expr, addr_tmp, val_expr, val_tmp;
2548 tree load_offset_ptr, aligned_load_type;
2549 gimple addr_stmt, load_stmt;
2550 unsigned align;
2551 HOST_WIDE_INT load_offset = 0;
2553 align = get_object_alignment (src);
2554 /* If the new access is smaller than the original one, we need
2555 to perform a big-endian adjustment. */
2556 if (BYTES_BIG_ENDIAN)
2558 HOST_WIDE_INT bitsize, bitpos;
2559 machine_mode mode;
2560 int unsignedp, reversep, volatilep;
2561 tree offset;
2563 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2564 &unsignedp, &reversep, &volatilep, false);
2565 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2567 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2568 unsigned HOST_WIDE_INT l
2569 = (load_offset * BITS_PER_UNIT) & (align - 1);
2570 if (l)
2571 align = l & -l;
2575 if (bswap
2576 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2577 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2578 return false;
2580 /* Move cur_stmt just before one of the loads of the original
2581 expression to ensure it has the same VUSE. See PR61517 for what
2582 could go wrong. */
2583 gsi_move_before (&gsi, &gsi_ins);
2584 gsi = gsi_for_stmt (cur_stmt);
2586 /* Compute address to load from and cast according to the size
2587 of the load. */
2588 addr_expr = build_fold_addr_expr (unshare_expr (src));
2589 if (is_gimple_mem_ref_addr (addr_expr))
2590 addr_tmp = addr_expr;
2591 else
2593 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2594 "load_src");
2595 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2596 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2599 /* Perform the load. */
2600 aligned_load_type = load_type;
2601 if (align < TYPE_ALIGN (load_type))
2602 aligned_load_type = build_aligned_type (load_type, align);
2603 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2604 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2605 load_offset_ptr);
2607 if (!bswap)
2609 if (n->range == 16)
2610 nop_stats.found_16bit++;
2611 else if (n->range == 32)
2612 nop_stats.found_32bit++;
2613 else
2615 gcc_assert (n->range == 64);
2616 nop_stats.found_64bit++;
2619 /* Convert the result of load if necessary. */
2620 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2622 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2623 "load_dst");
2624 load_stmt = gimple_build_assign (val_tmp, val_expr);
2625 gimple_set_vuse (load_stmt, n->vuse);
2626 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2627 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2629 else
2631 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2632 gimple_set_vuse (cur_stmt, n->vuse);
2634 update_stmt (cur_stmt);
2636 if (dump_file)
2638 fprintf (dump_file,
2639 "%d bit load in target endianness found at: ",
2640 (int) n->range);
2641 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2643 return true;
2645 else
2647 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2648 load_stmt = gimple_build_assign (val_tmp, val_expr);
2649 gimple_set_vuse (load_stmt, n->vuse);
2650 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2652 src = val_tmp;
2655 if (n->range == 16)
2656 bswap_stats.found_16bit++;
2657 else if (n->range == 32)
2658 bswap_stats.found_32bit++;
2659 else
2661 gcc_assert (n->range == 64);
2662 bswap_stats.found_64bit++;
2665 tmp = src;
2667 /* Convert the src expression if necessary. */
2668 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2670 gimple convert_stmt;
2672 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2673 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2674 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2677 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2678 values are handled this way, as a rotation of a 2N-bit value by N bits is
2679 generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2680 which gives 0x03040102 while a bswap of that value is 0x04030201. */
2681 if (bswap && n->range == 16)
2683 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2684 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2685 bswap_stmt = gimple_build_assign (NULL, src);
2687 else
2688 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2690 tmp = tgt;
2692 /* Convert the result if necessary. */
2693 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2695 gimple convert_stmt;
2697 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2698 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2699 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2702 gimple_set_lhs (bswap_stmt, tmp);
2704 if (dump_file)
2706 fprintf (dump_file, "%d bit bswap implementation found at: ",
2707 (int) n->range);
2708 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2711 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2712 gsi_remove (&gsi, true);
2713 return true;
2716 /* Find manual byte swap implementations as well as loads in a given
2717 endianness. Byte swaps are turned into a bswap builtin invocation
2718 while endian loads are converted to a bswap builtin invocation or a
2719 simple load according to the target endianness. */
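/* As a hedged illustration of the source patterns matched here (the helper
   below is hypothetical, not part of GCC):

     uint32_t
     read_be32 (const unsigned char *p)
     {
       return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
	      | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
     }

   On a big-endian target this is recognized as a plain 32-bit load; on a
   little-endian target it becomes a load followed by a call to
   __builtin_bswap32, provided BUILT_IN_BSWAP32 and the bswap optab are
   available.  */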
2721 unsigned int
2722 pass_optimize_bswap::execute (function *fun)
2724 basic_block bb;
2725 bool bswap32_p, bswap64_p;
2726 bool changed = false;
2727 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2729 if (BITS_PER_UNIT != 8)
2730 return 0;
2732 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2733 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2734 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2735 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2736 || (bswap32_p && word_mode == SImode)));
2738 /* Determine the argument type of the builtins. The code later on
2739 assumes that the return and argument type are the same. */
2740 if (bswap32_p)
2742 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2743 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2746 if (bswap64_p)
2748 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2749 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2752 memset (&nop_stats, 0, sizeof (nop_stats));
2753 memset (&bswap_stats, 0, sizeof (bswap_stats));
2755 FOR_EACH_BB_FN (bb, fun)
2757 gimple_stmt_iterator gsi;
2759 /* We do a reverse scan for bswap patterns to make sure we get the
2760 widest match. As bswap pattern matching doesn't handle previously
2761 inserted smaller bswap replacements as sub-patterns, the wider
2762 variant wouldn't be detected. */
2763 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2765 gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2766 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2767 enum tree_code code;
2768 struct symbolic_number n;
2769 bool bswap;
2771 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2772 might be moved to a different basic block by bswap_replace and gsi
2773 must not point to it if that's the case. Moving the gsi_prev
2774 here makes sure that gsi points to the statement previous to
2775 cur_stmt while still making sure that all statements are
2776 considered in this basic block. */
2777 gsi_prev (&gsi);
2779 if (!is_gimple_assign (cur_stmt))
2780 continue;
2782 code = gimple_assign_rhs_code (cur_stmt);
2783 switch (code)
2785 case LROTATE_EXPR:
2786 case RROTATE_EXPR:
2787 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2788 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2789 % BITS_PER_UNIT)
2790 continue;
2791 /* Fall through. */
2792 case BIT_IOR_EXPR:
2793 break;
2794 default:
2795 continue;
2798 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2800 if (!src_stmt)
2801 continue;
2803 switch (n.range)
2805 case 16:
2806 /* Already in canonical form, nothing to do. */
2807 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2808 continue;
2809 load_type = bswap_type = uint16_type_node;
2810 break;
2811 case 32:
2812 load_type = uint32_type_node;
2813 if (bswap32_p)
2815 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2816 bswap_type = bswap32_type;
2818 break;
2819 case 64:
2820 load_type = uint64_type_node;
2821 if (bswap64_p)
2823 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2824 bswap_type = bswap64_type;
2826 break;
2827 default:
2828 continue;
2831 if (bswap && !fndecl && n.range != 16)
2832 continue;
2834 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2835 &n, bswap))
2836 changed = true;
2840 statistics_counter_event (fun, "16-bit nop implementations found",
2841 nop_stats.found_16bit);
2842 statistics_counter_event (fun, "32-bit nop implementations found",
2843 nop_stats.found_32bit);
2844 statistics_counter_event (fun, "64-bit nop implementations found",
2845 nop_stats.found_64bit);
2846 statistics_counter_event (fun, "16-bit bswap implementations found",
2847 bswap_stats.found_16bit);
2848 statistics_counter_event (fun, "32-bit bswap implementations found",
2849 bswap_stats.found_32bit);
2850 statistics_counter_event (fun, "64-bit bswap implementations found",
2851 bswap_stats.found_64bit);
2853 return (changed ? TODO_update_ssa : 0);
2856 } // anon namespace
2858 gimple_opt_pass *
2859 make_pass_optimize_bswap (gcc::context *ctxt)
2861 return new pass_optimize_bswap (ctxt);
2864 /* Return true if stmt is a type conversion operation that can be stripped
2865 when used in a widening multiply operation. */
2866 static bool
2867 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2869 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2871 if (TREE_CODE (result_type) == INTEGER_TYPE)
2873 tree op_type;
2874 tree inner_op_type;
2876 if (!CONVERT_EXPR_CODE_P (rhs_code))
2877 return false;
2879 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2881 /* If the type of OP has the same precision as the result, then
2882 we can strip this conversion. The multiply operation will be
2883 selected to create the correct extension as a by-product. */
2884 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2885 return true;
2887 /* We can also strip a conversion if it preserves the signed-ness of
2888 the operation and doesn't narrow the range. */
2889 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2891 /* If the inner-most type is unsigned, then we can strip any
2892 intermediate widening operation. If it's signed, then the
2893 intermediate widening operation must also be signed. */
2894 if ((TYPE_UNSIGNED (inner_op_type)
2895 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2896 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2897 return true;
2899 return false;
2902 return rhs_code == FIXED_CONVERT_EXPR;
2905 /* Return true if RHS is a suitable operand for a widening multiplication,
2906 assuming a target type of TYPE.
2907 There are two cases:
2909 - RHS makes some value at least twice as wide. Store that value
2910 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2912 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2913 and store NULL in *TYPE_OUT. */
2915 static bool
2916 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2917 tree *new_rhs_out)
2919 gimple stmt;
2920 tree type1, rhs1;
2922 if (TREE_CODE (rhs) == SSA_NAME)
2924 stmt = SSA_NAME_DEF_STMT (rhs);
2925 if (is_gimple_assign (stmt))
2927 if (! widening_mult_conversion_strippable_p (type, stmt))
2928 rhs1 = rhs;
2929 else
2931 rhs1 = gimple_assign_rhs1 (stmt);
2933 if (TREE_CODE (rhs1) == INTEGER_CST)
2935 *new_rhs_out = rhs1;
2936 *type_out = NULL;
2937 return true;
2941 else
2942 rhs1 = rhs;
2944 type1 = TREE_TYPE (rhs1);
2946 if (TREE_CODE (type1) != TREE_CODE (type)
2947 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2948 return false;
2950 *new_rhs_out = rhs1;
2951 *type_out = type1;
2952 return true;
2955 if (TREE_CODE (rhs) == INTEGER_CST)
2957 *new_rhs_out = rhs;
2958 *type_out = NULL;
2959 return true;
2962 return false;
2965 /* Return true if STMT performs a widening multiplication, assuming the
2966 output type is TYPE. If so, store the unwidened types of the operands
2967 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2968 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2969 and *TYPE2_OUT would give the operands of the multiplication. */
2971 static bool
2972 is_widening_mult_p (gimple stmt,
2973 tree *type1_out, tree *rhs1_out,
2974 tree *type2_out, tree *rhs2_out)
2976 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2978 if (TREE_CODE (type) != INTEGER_TYPE
2979 && TREE_CODE (type) != FIXED_POINT_TYPE)
2980 return false;
2982 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2983 rhs1_out))
2984 return false;
2986 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2987 rhs2_out))
2988 return false;
2990 if (*type1_out == NULL)
2992 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2993 return false;
2994 *type1_out = *type2_out;
2997 if (*type2_out == NULL)
2999 if (!int_fits_type_p (*rhs2_out, *type1_out))
3000 return false;
3001 *type2_out = *type1_out;
3004 /* Ensure that the larger of the two operands comes first. */
3005 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3007 std::swap (*type1_out, *type2_out);
3008 std::swap (*rhs1_out, *rhs2_out);
3011 return true;
3014 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3015 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3016 value is true iff we converted the statement. */
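/* A hedged source-level sketch of what qualifies (hypothetical code, not part
   of GCC):

     long long
     widen (int a, int b)
     {
       return (long long) a * b;
     }

   Both operands are extended from a type at most half the width of the
   result, so is_widening_mult_p succeeds and the MULT_EXPR can be rewritten
   as a WIDEN_MULT_EXPR when a suitable widening multiply optab exists for
   the mode pair.  */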
3018 static bool
3019 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
3021 tree lhs, rhs1, rhs2, type, type1, type2;
3022 enum insn_code handler;
3023 machine_mode to_mode, from_mode, actual_mode;
3024 optab op;
3025 int actual_precision;
3026 location_t loc = gimple_location (stmt);
3027 bool from_unsigned1, from_unsigned2;
3029 lhs = gimple_assign_lhs (stmt);
3030 type = TREE_TYPE (lhs);
3031 if (TREE_CODE (type) != INTEGER_TYPE)
3032 return false;
3034 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3035 return false;
3037 to_mode = TYPE_MODE (type);
3038 from_mode = TYPE_MODE (type1);
3039 from_unsigned1 = TYPE_UNSIGNED (type1);
3040 from_unsigned2 = TYPE_UNSIGNED (type2);
3042 if (from_unsigned1 && from_unsigned2)
3043 op = umul_widen_optab;
3044 else if (!from_unsigned1 && !from_unsigned2)
3045 op = smul_widen_optab;
3046 else
3047 op = usmul_widen_optab;
3049 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3050 0, &actual_mode);
3052 if (handler == CODE_FOR_nothing)
3054 if (op != smul_widen_optab)
3056 /* We can use a signed multiply with unsigned types as long as
3057 there is a wider mode to use, or it is the smaller of the two
3058 types that is unsigned. Note that type1 >= type2, always. */
3059 if ((TYPE_UNSIGNED (type1)
3060 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3061 || (TYPE_UNSIGNED (type2)
3062 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3064 from_mode = GET_MODE_WIDER_MODE (from_mode);
3065 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3066 return false;
3069 op = smul_widen_optab;
3070 handler = find_widening_optab_handler_and_mode (op, to_mode,
3071 from_mode, 0,
3072 &actual_mode);
3074 if (handler == CODE_FOR_nothing)
3075 return false;
3077 from_unsigned1 = from_unsigned2 = false;
3079 else
3080 return false;
3083 /* Ensure that the inputs to the handler are in the correct precision
3084 for the opcode. This will be the full mode size. */
3085 actual_precision = GET_MODE_PRECISION (actual_mode);
3086 if (2 * actual_precision > TYPE_PRECISION (type))
3087 return false;
3088 if (actual_precision != TYPE_PRECISION (type1)
3089 || from_unsigned1 != TYPE_UNSIGNED (type1))
3090 rhs1 = build_and_insert_cast (gsi, loc,
3091 build_nonstandard_integer_type
3092 (actual_precision, from_unsigned1), rhs1);
3093 if (actual_precision != TYPE_PRECISION (type2)
3094 || from_unsigned2 != TYPE_UNSIGNED (type2))
3095 rhs2 = build_and_insert_cast (gsi, loc,
3096 build_nonstandard_integer_type
3097 (actual_precision, from_unsigned2), rhs2);
3099 /* Handle constants. */
3100 if (TREE_CODE (rhs1) == INTEGER_CST)
3101 rhs1 = fold_convert (type1, rhs1);
3102 if (TREE_CODE (rhs2) == INTEGER_CST)
3103 rhs2 = fold_convert (type2, rhs2);
3105 gimple_assign_set_rhs1 (stmt, rhs1);
3106 gimple_assign_set_rhs2 (stmt, rhs2);
3107 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3108 update_stmt (stmt);
3109 widen_mul_stats.widen_mults_inserted++;
3110 return true;
3113 /* Process a single gimple statement STMT, which is found at the
3114 iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3115 rhs (given by CODE), and try to convert it into a
3116 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3117 is true iff we converted the statement. */
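/* A hedged sketch of a pattern this can handle (hypothetical code, not part
   of GCC):

     long long
     macc (long long acc, int a, int b)
     {
       return acc + (long long) a * b;
     }

   The inner multiply is a widening multiply with a single use, so the
   PLUS_EXPR can become a WIDEN_MULT_PLUS_EXPR when the target provides the
   corresponding multiply-and-accumulate optab.  */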
3119 static bool
3120 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
3121 enum tree_code code)
3123 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
3124 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
3125 tree type, type1, type2, optype;
3126 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3127 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3128 optab this_optab;
3129 enum tree_code wmult_code;
3130 enum insn_code handler;
3131 machine_mode to_mode, from_mode, actual_mode;
3132 location_t loc = gimple_location (stmt);
3133 int actual_precision;
3134 bool from_unsigned1, from_unsigned2;
3136 lhs = gimple_assign_lhs (stmt);
3137 type = TREE_TYPE (lhs);
3138 if (TREE_CODE (type) != INTEGER_TYPE
3139 && TREE_CODE (type) != FIXED_POINT_TYPE)
3140 return false;
3142 if (code == MINUS_EXPR)
3143 wmult_code = WIDEN_MULT_MINUS_EXPR;
3144 else
3145 wmult_code = WIDEN_MULT_PLUS_EXPR;
3147 rhs1 = gimple_assign_rhs1 (stmt);
3148 rhs2 = gimple_assign_rhs2 (stmt);
3150 if (TREE_CODE (rhs1) == SSA_NAME)
3152 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3153 if (is_gimple_assign (rhs1_stmt))
3154 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3157 if (TREE_CODE (rhs2) == SSA_NAME)
3159 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3160 if (is_gimple_assign (rhs2_stmt))
3161 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3164 /* Allow for one conversion statement between the multiply
3165 and addition/subtraction statement. If there is more than
3166 one conversion then we assume they would invalidate this
3167 transformation. If that's not the case then they should have
3168 been folded before now. */
3169 if (CONVERT_EXPR_CODE_P (rhs1_code))
3171 conv1_stmt = rhs1_stmt;
3172 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3173 if (TREE_CODE (rhs1) == SSA_NAME)
3175 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3176 if (is_gimple_assign (rhs1_stmt))
3177 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3179 else
3180 return false;
3182 if (CONVERT_EXPR_CODE_P (rhs2_code))
3184 conv2_stmt = rhs2_stmt;
3185 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3186 if (TREE_CODE (rhs2) == SSA_NAME)
3188 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3189 if (is_gimple_assign (rhs2_stmt))
3190 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3192 else
3193 return false;
3196 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3197 is_widening_mult_p, but we still need the rhs values it returns.
3199 It might also appear that it would be sufficient to use the existing
3200 operands of the widening multiply, but that would limit the choice of
3201 multiply-and-accumulate instructions.
3203 If the widened-multiplication result has more than one use, it is
3204 probably wiser not to do the conversion. */
3205 if (code == PLUS_EXPR
3206 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3208 if (!has_single_use (rhs1)
3209 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3210 &type2, &mult_rhs2))
3211 return false;
3212 add_rhs = rhs2;
3213 conv_stmt = conv1_stmt;
3215 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3217 if (!has_single_use (rhs2)
3218 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3219 &type2, &mult_rhs2))
3220 return false;
3221 add_rhs = rhs1;
3222 conv_stmt = conv2_stmt;
3224 else
3225 return false;
3227 to_mode = TYPE_MODE (type);
3228 from_mode = TYPE_MODE (type1);
3229 from_unsigned1 = TYPE_UNSIGNED (type1);
3230 from_unsigned2 = TYPE_UNSIGNED (type2);
3231 optype = type1;
3233 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3234 if (from_unsigned1 != from_unsigned2)
3236 if (!INTEGRAL_TYPE_P (type))
3237 return false;
3238 /* We can use a signed multiply with unsigned types as long as
3239 there is a wider mode to use, or it is the smaller of the two
3240 types that is unsigned. Note that type1 >= type2, always. */
3241 if ((from_unsigned1
3242 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3243 || (from_unsigned2
3244 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3246 from_mode = GET_MODE_WIDER_MODE (from_mode);
3247 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3248 return false;
3251 from_unsigned1 = from_unsigned2 = false;
3252 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3253 false);
3256 /* If there was a conversion between the multiply and addition
3257 then we need to make sure it fits a multiply-and-accumulate.
3258 There should be a single mode change which does not change the
3259 value. */
3260 if (conv_stmt)
3262 /* We use the original, unmodified data types for this. */
3263 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3264 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3265 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3266 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3268 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3270 /* Conversion is a truncate. */
3271 if (TYPE_PRECISION (to_type) < data_size)
3272 return false;
3274 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3276 /* Conversion is an extend. Check it's the right sort. */
3277 if (TYPE_UNSIGNED (from_type) != is_unsigned
3278 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3279 return false;
3281 /* else convert is a no-op for our purposes. */
3284 /* Verify that the machine can perform a widening multiply
3285 accumulate in this mode/signedness combination, otherwise
3286 this transformation is likely to pessimize code. */
3287 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3288 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3289 from_mode, 0, &actual_mode);
3291 if (handler == CODE_FOR_nothing)
3292 return false;
3294 /* Ensure that the inputs to the handler are in the correct precision
3295 for the opcode. This will be the full mode size. */
3296 actual_precision = GET_MODE_PRECISION (actual_mode);
3297 if (actual_precision != TYPE_PRECISION (type1)
3298 || from_unsigned1 != TYPE_UNSIGNED (type1))
3299 mult_rhs1 = build_and_insert_cast (gsi, loc,
3300 build_nonstandard_integer_type
3301 (actual_precision, from_unsigned1),
3302 mult_rhs1);
3303 if (actual_precision != TYPE_PRECISION (type2)
3304 || from_unsigned2 != TYPE_UNSIGNED (type2))
3305 mult_rhs2 = build_and_insert_cast (gsi, loc,
3306 build_nonstandard_integer_type
3307 (actual_precision, from_unsigned2),
3308 mult_rhs2);
3310 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3311 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3313 /* Handle constants. */
3314 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3315 mult_rhs1 = fold_convert (type1, mult_rhs1);
3316 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3317 mult_rhs2 = fold_convert (type2, mult_rhs2);
3319 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3320 add_rhs);
3321 update_stmt (gsi_stmt (*gsi));
3322 widen_mul_stats.maccs_inserted++;
3323 return true;
3326 /* Combine the multiplication at MUL_STMT with operands OP1 and OP2
3327 with uses in additions and subtractions to form fused multiply-add
3328 operations. Returns true if successful and MUL_STMT should be removed. */
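/* A hedged illustration (hypothetical source, not part of GCC): when FP
   contraction is enabled,

     double
     mad (double a, double b, double c)
     {
       return a * b + c;
     }

   has its MULT_EXPR and PLUS_EXPR fused into a single FMA_EXPR, provided
   fma_optab is available for the mode and the multiplication result has no
   other uses.  */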
3330 static bool
3331 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3333 tree mul_result = gimple_get_lhs (mul_stmt);
3334 tree type = TREE_TYPE (mul_result);
3335 gimple use_stmt, neguse_stmt;
3336 gassign *fma_stmt;
3337 use_operand_p use_p;
3338 imm_use_iterator imm_iter;
3340 if (FLOAT_TYPE_P (type)
3341 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3342 return false;
3344 /* We don't want to do bitfield reduction ops. */
3345 if (INTEGRAL_TYPE_P (type)
3346 && (TYPE_PRECISION (type)
3347 != GET_MODE_PRECISION (TYPE_MODE (type))))
3348 return false;
3350 /* If the target doesn't support it, don't generate it. We assume that
3351 if fma isn't available then fms, fnma or fnms are not either. */
3352 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3353 return false;
3355 /* If the multiplication has zero uses, it is kept around probably because
3356 of -fnon-call-exceptions. Don't optimize it away in that case,
3357 that is DCE's job. */
3358 if (has_zero_uses (mul_result))
3359 return false;
3361 /* Make sure that the multiplication statement becomes dead after
3362 the transformation, that is, that all uses are transformed to FMAs.
3363 This means we assume that an FMA operation has the same cost
3364 as an addition. */
3365 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3367 enum tree_code use_code;
3368 tree result = mul_result;
3369 bool negate_p = false;
3371 use_stmt = USE_STMT (use_p);
3373 if (is_gimple_debug (use_stmt))
3374 continue;
3376 /* For now restrict these operations to single basic blocks. In theory
3377 we would want to support sinking the multiplication in
3378 m = a*b;
3379 if ()
3380 ma = m + c;
3381 else
3382 d = m;
3383 to form a fma in the then block and sink the multiplication to the
3384 else block. */
3385 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3386 return false;
3388 if (!is_gimple_assign (use_stmt))
3389 return false;
3391 use_code = gimple_assign_rhs_code (use_stmt);
3393 /* A negate on the multiplication leads to FNMA. */
3394 if (use_code == NEGATE_EXPR)
3396 ssa_op_iter iter;
3397 use_operand_p usep;
3399 result = gimple_assign_lhs (use_stmt);
3401 /* Make sure the negate statement becomes dead with this
3402 single transformation. */
3403 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3404 &use_p, &neguse_stmt))
3405 return false;
3407 /* Make sure the multiplication isn't also used on that stmt. */
3408 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3409 if (USE_FROM_PTR (usep) == mul_result)
3410 return false;
3412 /* Re-validate. */
3413 use_stmt = neguse_stmt;
3414 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3415 return false;
3416 if (!is_gimple_assign (use_stmt))
3417 return false;
3419 use_code = gimple_assign_rhs_code (use_stmt);
3420 negate_p = true;
3423 switch (use_code)
3425 case MINUS_EXPR:
3426 if (gimple_assign_rhs2 (use_stmt) == result)
3427 negate_p = !negate_p;
3428 break;
3429 case PLUS_EXPR:
3430 break;
3431 default:
3432 /* FMA can only be formed from PLUS and MINUS. */
3433 return false;
3436 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3437 by a MULT_EXPR that we'll visit later, we might be able to
3438 get a more profitable match with fnma.
3439 OTOH, if we don't, a negate / fma pair likely has lower latency
3440 than a mult / subtract pair. */
3441 if (use_code == MINUS_EXPR && !negate_p
3442 && gimple_assign_rhs1 (use_stmt) == result
3443 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3444 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3446 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3448 if (TREE_CODE (rhs2) == SSA_NAME)
3450 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3451 if (has_single_use (rhs2)
3452 && is_gimple_assign (stmt2)
3453 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3454 return false;
3458 /* We can't handle a * b + a * b. */
3459 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3460 return false;
3462 /* While it is possible to validate whether or not the exact form
3463 that we've recognized is available in the backend, the assumption
3464 is that the transformation is never a loss. For instance, suppose
3465 the target only has the plain FMA pattern available. Consider
3466 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3467 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3468 still have 3 operations, but in the FMA form the two NEGs are
3469 independent and could be run in parallel. */
3472 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3474 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3475 enum tree_code use_code;
3476 tree addop, mulop1 = op1, result = mul_result;
3477 bool negate_p = false;
3479 if (is_gimple_debug (use_stmt))
3480 continue;
3482 use_code = gimple_assign_rhs_code (use_stmt);
3483 if (use_code == NEGATE_EXPR)
3485 result = gimple_assign_lhs (use_stmt);
3486 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3487 gsi_remove (&gsi, true);
3488 release_defs (use_stmt);
3490 use_stmt = neguse_stmt;
3491 gsi = gsi_for_stmt (use_stmt);
3492 use_code = gimple_assign_rhs_code (use_stmt);
3493 negate_p = true;
3496 if (gimple_assign_rhs1 (use_stmt) == result)
3498 addop = gimple_assign_rhs2 (use_stmt);
3499 /* a * b - c -> a * b + (-c) */
3500 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3501 addop = force_gimple_operand_gsi (&gsi,
3502 build1 (NEGATE_EXPR,
3503 type, addop),
3504 true, NULL_TREE, true,
3505 GSI_SAME_STMT);
3507 else
3509 addop = gimple_assign_rhs1 (use_stmt);
3510 /* a - b * c -> (-b) * c + a */
3511 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3512 negate_p = !negate_p;
3515 if (negate_p)
3516 mulop1 = force_gimple_operand_gsi (&gsi,
3517 build1 (NEGATE_EXPR,
3518 type, mulop1),
3519 true, NULL_TREE, true,
3520 GSI_SAME_STMT);
3522 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3523 FMA_EXPR, mulop1, op2, addop);
3524 gsi_replace (&gsi, fma_stmt, true);
3525 widen_mul_stats.fmas_inserted++;
3528 return true;
3531 /* Find integer multiplications where the operands are extended from
3532 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3533 where appropriate. */
3535 namespace {
3537 const pass_data pass_data_optimize_widening_mul =
3539 GIMPLE_PASS, /* type */
3540 "widening_mul", /* name */
3541 OPTGROUP_NONE, /* optinfo_flags */
3542 TV_NONE, /* tv_id */
3543 PROP_ssa, /* properties_required */
3544 0, /* properties_provided */
3545 0, /* properties_destroyed */
3546 0, /* todo_flags_start */
3547 TODO_update_ssa, /* todo_flags_finish */
3550 class pass_optimize_widening_mul : public gimple_opt_pass
3552 public:
3553 pass_optimize_widening_mul (gcc::context *ctxt)
3554 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3557 /* opt_pass methods: */
3558 virtual bool gate (function *)
3560 return flag_expensive_optimizations && optimize;
3563 virtual unsigned int execute (function *);
3565 }; // class pass_optimize_widening_mul
3567 unsigned int
3568 pass_optimize_widening_mul::execute (function *fun)
3570 basic_block bb;
3571 bool cfg_changed = false;
3573 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3575 FOR_EACH_BB_FN (bb, fun)
3577 gimple_stmt_iterator gsi;
3579 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3581 gimple stmt = gsi_stmt (gsi);
3582 enum tree_code code;
3584 if (is_gimple_assign (stmt))
3586 code = gimple_assign_rhs_code (stmt);
3587 switch (code)
3589 case MULT_EXPR:
3590 if (!convert_mult_to_widen (stmt, &gsi)
3591 && convert_mult_to_fma (stmt,
3592 gimple_assign_rhs1 (stmt),
3593 gimple_assign_rhs2 (stmt)))
3595 gsi_remove (&gsi, true);
3596 release_defs (stmt);
3597 continue;
3599 break;
3601 case PLUS_EXPR:
3602 case MINUS_EXPR:
3603 convert_plusminus_to_widen (&gsi, stmt, code);
3604 break;
3606 default:;
3609 else if (is_gimple_call (stmt)
3610 && gimple_call_lhs (stmt))
3612 tree fndecl = gimple_call_fndecl (stmt);
3613 if (fndecl
3614 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3616 switch (DECL_FUNCTION_CODE (fndecl))
3618 case BUILT_IN_POWF:
3619 case BUILT_IN_POW:
3620 case BUILT_IN_POWL:
3621 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3622 && REAL_VALUES_EQUAL
3623 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3624 dconst2)
3625 && convert_mult_to_fma (stmt,
3626 gimple_call_arg (stmt, 0),
3627 gimple_call_arg (stmt, 0)))
3629 unlink_stmt_vdef (stmt);
3630 if (gsi_remove (&gsi, true)
3631 && gimple_purge_dead_eh_edges (bb))
3632 cfg_changed = true;
3633 release_defs (stmt);
3634 continue;
3636 break;
3638 default:;
3642 gsi_next (&gsi);
3646 statistics_counter_event (fun, "widening multiplications inserted",
3647 widen_mul_stats.widen_mults_inserted);
3648 statistics_counter_event (fun, "widening maccs inserted",
3649 widen_mul_stats.maccs_inserted);
3650 statistics_counter_event (fun, "fused multiply-adds inserted",
3651 widen_mul_stats.fmas_inserted);
3653 return cfg_changed ? TODO_cleanup_cfg : 0;
3656 } // anon namespace
3658 gimple_opt_pass *
3659 make_pass_optimize_widening_mul (gcc::context *ctxt)
3661 return new pass_optimize_widening_mul (ctxt);