2015-06-11 Paul Thomas <pault@gcc.gnu.org>
[official-gcc.git] / gcc / tree-ssa-math-opts.c
blob d46839cc5e3aa5a2ca47465d8b86a0603b7abc1c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2015 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* One of the mini-passes in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, experiments have shown that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, which will be in blocks that are common dominators
55 of those that have the division.)
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
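   For example, in

     if (cond)
       a = b / d;

   the division does not post-dominate the block containing the branch, so
   we do not insert 1.0 / d there: if COND were false, the reciprocal would
   be computed but never used.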
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and their common dominators,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "input.h"
93 #include "alias.h"
94 #include "symtab.h"
95 #include "tree.h"
96 #include "fold-const.h"
97 #include "predict.h"
98 #include "hard-reg-set.h"
99 #include "function.h"
100 #include "dominance.h"
101 #include "cfg.h"
102 #include "basic-block.h"
103 #include "tree-ssa-alias.h"
104 #include "internal-fn.h"
105 #include "gimple-fold.h"
106 #include "gimple-expr.h"
107 #include "is-a.h"
108 #include "gimple.h"
109 #include "gimple-iterator.h"
110 #include "gimplify.h"
111 #include "gimplify-me.h"
112 #include "stor-layout.h"
113 #include "gimple-ssa.h"
114 #include "tree-cfg.h"
115 #include "tree-phinodes.h"
116 #include "ssa-iterators.h"
117 #include "stringpool.h"
118 #include "tree-ssanames.h"
119 #include "rtl.h"
120 #include "insn-config.h"
121 #include "expmed.h"
122 #include "dojump.h"
123 #include "explow.h"
124 #include "calls.h"
125 #include "emit-rtl.h"
126 #include "varasm.h"
127 #include "stmt.h"
128 #include "expr.h"
129 #include "tree-dfa.h"
130 #include "tree-ssa.h"
131 #include "tree-pass.h"
132 #include "alloc-pool.h"
133 #include "target.h"
134 #include "gimple-pretty-print.h"
135 #include "builtins.h"
136 #include "params.h"
138 /* FIXME: RTL headers have to be included here for optabs. */
139 #include "rtl.h" /* Because optabs.h wants enum rtx_code. */
140 #include "expr.h" /* Because optabs.h wants sepops. */
141 #include "insn-codes.h"
142 #include "optabs.h"
144 /* This structure represents one basic block that either computes a
145 division, or is a common dominator for basic blocks that compute a
146 division. */
147 struct occurrence {
148 /* The basic block represented by this structure. */
149 basic_block bb;
151 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
152 inserted in BB. */
153 tree recip_def;
155 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
156 was inserted in BB. */
157 gimple recip_def_stmt;
159 /* Pointer to a list of "struct occurrence"s for blocks dominated
160 by BB. */
161 struct occurrence *children;
163 /* Pointer to the next "struct occurrence" in the list of blocks
164 sharing a common dominator. */
165 struct occurrence *next;
167 /* The number of divisions that are in BB before compute_merit. The
168 number of divisions that are in BB or post-dominate it after
169 compute_merit. */
170 int num_divisions;
172 /* True if the basic block has a division, false if it is a common
173 dominator for basic blocks that do. If it is false and trapping
174 math is active, BB is not a candidate for inserting a reciprocal. */
175 bool bb_has_division;
178 static struct
180 /* Number of 1.0/X ops inserted. */
181 int rdivs_inserted;
183 /* Number of 1.0/FUNC ops inserted. */
184 int rfuncs_inserted;
185 } reciprocal_stats;
187 static struct
189 /* Number of cexpi calls inserted. */
190 int inserted;
191 } sincos_stats;
193 static struct
195 /* Number of hand-written 16-bit nop / bswaps found. */
196 int found_16bit;
198 /* Number of hand-written 32-bit nop / bswaps found. */
199 int found_32bit;
201 /* Number of hand-written 64-bit nop / bswaps found. */
202 int found_64bit;
203 } nop_stats, bswap_stats;
205 static struct
207 /* Number of widening multiplication ops inserted. */
208 int widen_mults_inserted;
210 /* Number of integer multiply-and-accumulate ops inserted. */
211 int maccs_inserted;
213 /* Number of fp fused multiply-add ops inserted. */
214 int fmas_inserted;
215 } widen_mul_stats;
217 /* The instance of "struct occurrence" representing the highest
218 interesting block in the dominator tree. */
219 static struct occurrence *occ_head;
221 /* Allocation pool for getting instances of "struct occurrence". */
222 static pool_allocator<occurrence> *occ_pool;
226 /* Allocate and return a new struct occurrence for basic block BB,
227 whose children list is headed by CHILDREN. */
228 static struct occurrence *
229 occ_new (basic_block bb, struct occurrence *children)
231 struct occurrence *occ;
233 bb->aux = occ = occ_pool->allocate ();
234 memset (occ, 0, sizeof (struct occurrence));
236 occ->bb = bb;
237 occ->children = children;
238 return occ;
242 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
243 list of "struct occurrence"s, one per basic block, having IDOM as
244 their common dominator.
246 We try to insert NEW_OCC as deep as possible in the tree, and we also
247 insert any other block that is a common dominator for BB and one
248 block already in the tree. */
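/* For example, if the tree below IDOM already contains a node for block B1,
   and NEW_OCC represents a block B2 such that B1 and B2 have a nearest
   common dominator DOM strictly below IDOM, a new node for DOM is created
   with the nodes for B1 and B2 as its two children.  */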
250 static void
251 insert_bb (struct occurrence *new_occ, basic_block idom,
252 struct occurrence **p_head)
254 struct occurrence *occ, **p_occ;
256 for (p_occ = p_head; (occ = *p_occ) != NULL; )
258 basic_block bb = new_occ->bb, occ_bb = occ->bb;
259 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
260 if (dom == bb)
262 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
263 from its list. */
264 *p_occ = occ->next;
265 occ->next = new_occ->children;
266 new_occ->children = occ;
268 /* Try the next block (it may as well be dominated by BB). */
271 else if (dom == occ_bb)
273 /* OCC_BB dominates BB. Tail recurse to look deeper. */
274 insert_bb (new_occ, dom, &occ->children);
275 return;
278 else if (dom != idom)
280 gcc_assert (!dom->aux);
282 /* There is a dominator between IDOM and BB, add it and make
283 two children out of NEW_OCC and OCC. First, remove OCC from
284 its list. */
285 *p_occ = occ->next;
286 new_occ->next = occ;
287 occ->next = NULL;
289 /* None of the previous blocks has DOM as a dominator: if we tail
290 recursed, we would reexamine them uselessly. Just switch BB with
291 DOM, and go on looking for blocks dominated by DOM. */
292 new_occ = occ_new (dom, new_occ);
295 else
297 /* Nothing special, go on with the next element. */
298 p_occ = &occ->next;
302 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
303 new_occ->next = *p_head;
304 *p_head = new_occ;
307 /* Register that we found a division in BB. */
309 static inline void
310 register_division_in (basic_block bb)
312 struct occurrence *occ;
314 occ = (struct occurrence *) bb->aux;
315 if (!occ)
317 occ = occ_new (bb, NULL);
318 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
321 occ->bb_has_division = true;
322 occ->num_divisions++;
326 /* Compute the number of divisions that postdominate each block in OCC and
327 its children. */
329 static void
330 compute_merit (struct occurrence *occ)
332 struct occurrence *occ_child;
333 basic_block dom = occ->bb;
335 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
337 basic_block bb;
338 if (occ_child->children)
339 compute_merit (occ_child);
341 if (flag_exceptions)
342 bb = single_noncomplex_succ (dom);
343 else
344 bb = dom;
346 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
347 occ->num_divisions += occ_child->num_divisions;
352 /* Return whether USE_STMT is a floating-point division by DEF. */
353 static inline bool
354 is_division_by (gimple use_stmt, tree def)
356 return is_gimple_assign (use_stmt)
357 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
358 && gimple_assign_rhs2 (use_stmt) == def
359 /* Do not recognize x / x as a valid division, as we would get
360 confused later by replacing all immediate uses of x in such
361 a stmt. */
362 && gimple_assign_rhs1 (use_stmt) != def;
365 /* Walk the subset of the dominator tree rooted at OCC, setting the
366 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
367 the given basic block. The field may be left NULL, of course,
368 if it is not possible or profitable to do the optimization.
370 DEF_GSI is an iterator pointing at the statement defining DEF.
371 If RECIP_DEF is set, a dominator already has a computation that can
372 be used. */
374 static void
375 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
376 tree def, tree recip_def, int threshold)
378 tree type;
379 gassign *new_stmt;
380 gimple_stmt_iterator gsi;
381 struct occurrence *occ_child;
383 if (!recip_def
384 && (occ->bb_has_division || !flag_trapping_math)
385 && occ->num_divisions >= threshold)
387 /* Make a variable with the replacement and substitute it. */
388 type = TREE_TYPE (def);
389 recip_def = create_tmp_reg (type, "reciptmp");
390 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
391 build_one_cst (type), def);
393 if (occ->bb_has_division)
395 /* Case 1: insert before an existing division. */
396 gsi = gsi_after_labels (occ->bb);
397 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
398 gsi_next (&gsi);
400 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
402 else if (def_gsi && occ->bb == def_gsi->bb)
404 /* Case 2: insert right after the definition. Note that this will
405 never happen if the definition statement can throw, because in
406 that case the sole successor of the statement's basic block will
407 dominate all the uses as well. */
408 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
410 else
412 /* Case 3: insert in a basic block not containing defs/uses. */
413 gsi = gsi_after_labels (occ->bb);
414 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
417 reciprocal_stats.rdivs_inserted++;
419 occ->recip_def_stmt = new_stmt;
422 occ->recip_def = recip_def;
423 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
424 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
428 /* Replace the division at USE_P with a multiplication by the reciprocal, if
429 possible. */
431 static inline void
432 replace_reciprocal (use_operand_p use_p)
434 gimple use_stmt = USE_STMT (use_p);
435 basic_block bb = gimple_bb (use_stmt);
436 struct occurrence *occ = (struct occurrence *) bb->aux;
438 if (optimize_bb_for_speed_p (bb)
439 && occ->recip_def && use_stmt != occ->recip_def_stmt)
441 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
442 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
443 SET_USE (use_p, occ->recip_def);
444 fold_stmt_inplace (&gsi);
445 update_stmt (use_stmt);
450 /* Free OCC and return one more "struct occurrence" to be freed. */
452 static struct occurrence *
453 free_bb (struct occurrence *occ)
455 struct occurrence *child, *next;
457 /* First get the two pointers hanging off OCC. */
458 next = occ->next;
459 child = occ->children;
460 occ->bb->aux = NULL;
461 occ_pool->remove (occ);
463 /* Now ensure that we don't recurse unless it is necessary. */
464 if (!child)
465 return next;
466 else
468 while (next)
469 next = free_bb (next);
471 return child;
476 /* Look for floating-point divisions among DEF's uses, and try to
477 replace them by multiplications with the reciprocal. Add
478 as many statements computing the reciprocal as needed.
480 DEF must be a GIMPLE register of a floating-point type. */
482 static void
483 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
485 use_operand_p use_p;
486 imm_use_iterator use_iter;
487 struct occurrence *occ;
488 int count = 0, threshold;
490 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
492 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
494 gimple use_stmt = USE_STMT (use_p);
495 if (is_division_by (use_stmt, def))
497 register_division_in (gimple_bb (use_stmt));
498 count++;
502 /* Do the expensive part only if we can hope to optimize something. */
503 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
504 if (count >= threshold)
506 gimple use_stmt;
507 for (occ = occ_head; occ; occ = occ->next)
509 compute_merit (occ);
510 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
513 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
515 if (is_division_by (use_stmt, def))
517 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
518 replace_reciprocal (use_p);
523 for (occ = occ_head; occ; )
524 occ = free_bb (occ);
526 occ_head = NULL;
529 /* Go through all the floating-point SSA_NAMEs, and call
530 execute_cse_reciprocals_1 on each of them. */
531 namespace {
533 const pass_data pass_data_cse_reciprocals =
535 GIMPLE_PASS, /* type */
536 "recip", /* name */
537 OPTGROUP_NONE, /* optinfo_flags */
538 TV_NONE, /* tv_id */
539 PROP_ssa, /* properties_required */
540 0, /* properties_provided */
541 0, /* properties_destroyed */
542 0, /* todo_flags_start */
543 TODO_update_ssa, /* todo_flags_finish */
546 class pass_cse_reciprocals : public gimple_opt_pass
548 public:
549 pass_cse_reciprocals (gcc::context *ctxt)
550 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
553 /* opt_pass methods: */
554 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
555 virtual unsigned int execute (function *);
557 }; // class pass_cse_reciprocals
559 unsigned int
560 pass_cse_reciprocals::execute (function *fun)
562 basic_block bb;
563 tree arg;
565 occ_pool = new pool_allocator<occurrence>
566 ("dominators for recip", n_basic_blocks_for_fn (fun) / 3 + 1);
568 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
569 calculate_dominance_info (CDI_DOMINATORS);
570 calculate_dominance_info (CDI_POST_DOMINATORS);
572 #ifdef ENABLE_CHECKING
573 FOR_EACH_BB_FN (bb, fun)
574 gcc_assert (!bb->aux);
575 #endif
577 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
578 if (FLOAT_TYPE_P (TREE_TYPE (arg))
579 && is_gimple_reg (arg))
581 tree name = ssa_default_def (fun, arg);
582 if (name)
583 execute_cse_reciprocals_1 (NULL, name);
586 FOR_EACH_BB_FN (bb, fun)
588 tree def;
590 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
591 gsi_next (&gsi))
593 gphi *phi = gsi.phi ();
594 def = PHI_RESULT (phi);
595 if (! virtual_operand_p (def)
596 && FLOAT_TYPE_P (TREE_TYPE (def)))
597 execute_cse_reciprocals_1 (NULL, def);
600 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
601 gsi_next (&gsi))
603 gimple stmt = gsi_stmt (gsi);
605 if (gimple_has_lhs (stmt)
606 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
607 && FLOAT_TYPE_P (TREE_TYPE (def))
608 && TREE_CODE (def) == SSA_NAME)
609 execute_cse_reciprocals_1 (&gsi, def);
612 if (optimize_bb_for_size_p (bb))
613 continue;
615 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
616 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
617 gsi_next (&gsi))
619 gimple stmt = gsi_stmt (gsi);
620 tree fndecl;
622 if (is_gimple_assign (stmt)
623 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
625 tree arg1 = gimple_assign_rhs2 (stmt);
626 gimple stmt1;
628 if (TREE_CODE (arg1) != SSA_NAME)
629 continue;
631 stmt1 = SSA_NAME_DEF_STMT (arg1);
633 if (is_gimple_call (stmt1)
634 && gimple_call_lhs (stmt1)
635 && (fndecl = gimple_call_fndecl (stmt1))
636 && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
637 || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
639 enum built_in_function code;
640 bool md_code, fail;
641 imm_use_iterator ui;
642 use_operand_p use_p;
644 code = DECL_FUNCTION_CODE (fndecl);
645 md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
647 fndecl = targetm.builtin_reciprocal (code, md_code, false);
648 if (!fndecl)
649 continue;
651 /* Check that all uses of the SSA name are divisions,
652 otherwise replacing the defining statement will do
653 the wrong thing. */
654 fail = false;
655 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
657 gimple stmt2 = USE_STMT (use_p);
658 if (is_gimple_debug (stmt2))
659 continue;
660 if (!is_gimple_assign (stmt2)
661 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
662 || gimple_assign_rhs1 (stmt2) == arg1
663 || gimple_assign_rhs2 (stmt2) != arg1)
665 fail = true;
666 break;
669 if (fail)
670 continue;
672 gimple_replace_ssa_lhs (stmt1, arg1);
673 gimple_call_set_fndecl (stmt1, fndecl);
674 update_stmt (stmt1);
675 reciprocal_stats.rfuncs_inserted++;
677 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
679 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
680 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
681 fold_stmt_inplace (&gsi);
682 update_stmt (stmt);
689 statistics_counter_event (fun, "reciprocal divs inserted",
690 reciprocal_stats.rdivs_inserted);
691 statistics_counter_event (fun, "reciprocal functions inserted",
692 reciprocal_stats.rfuncs_inserted);
694 free_dominance_info (CDI_DOMINATORS);
695 free_dominance_info (CDI_POST_DOMINATORS);
696 delete occ_pool;
697 return 0;
700 } // anon namespace
702 gimple_opt_pass *
703 make_pass_cse_reciprocals (gcc::context *ctxt)
705 return new pass_cse_reciprocals (ctxt);
708 /* Records an occurrence at statement USE_STMT in the vector of trees
709 STMTS if it is dominated by *TOP_BB or dominates it, or if *TOP_BB
710 is not yet initialized. Returns true if the occurrence was pushed on
711 the vector. Adjusts *TOP_BB to be the basic block dominating all
712 statements in the vector. */
714 static bool
715 maybe_record_sincos (vec<gimple> *stmts,
716 basic_block *top_bb, gimple use_stmt)
718 basic_block use_bb = gimple_bb (use_stmt);
719 if (*top_bb
720 && (*top_bb == use_bb
721 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
722 stmts->safe_push (use_stmt);
723 else if (!*top_bb
724 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
726 stmts->safe_push (use_stmt);
727 *top_bb = use_bb;
729 else
730 return false;
732 return true;
735 /* Look for sin, cos and cexpi calls with the same argument NAME and
736 create a single call to cexpi CSEing the result in this case.
737 We first walk over all immediate uses of the argument collecting
738 statements that we can CSE in a vector and in a second pass replace
739 the statement rhs with a REALPART or IMAGPART expression on the
740 result of the cexpi call we insert before the use statement that
741 dominates all other candidates. */
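/* A sketch of the intended transformation (the temporary name is only
   illustrative):

     s = sin (x);                   t = cexpi (x);
     c = cos (x);          -->      s = IMAGPART_EXPR <t>;
                                    c = REALPART_EXPR <t>;

   assuming a cexpi implementation is available for the type of x.  */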
743 static bool
744 execute_cse_sincos_1 (tree name)
746 gimple_stmt_iterator gsi;
747 imm_use_iterator use_iter;
748 tree fndecl, res, type;
749 gimple def_stmt, use_stmt, stmt;
750 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
751 auto_vec<gimple> stmts;
752 basic_block top_bb = NULL;
753 int i;
754 bool cfg_changed = false;
756 type = TREE_TYPE (name);
757 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
759 if (gimple_code (use_stmt) != GIMPLE_CALL
760 || !gimple_call_lhs (use_stmt)
761 || !(fndecl = gimple_call_fndecl (use_stmt))
762 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
763 continue;
765 switch (DECL_FUNCTION_CODE (fndecl))
767 CASE_FLT_FN (BUILT_IN_COS):
768 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
769 break;
771 CASE_FLT_FN (BUILT_IN_SIN):
772 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
773 break;
775 CASE_FLT_FN (BUILT_IN_CEXPI):
776 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
777 break;
779 default:;
783 if (seen_cos + seen_sin + seen_cexpi <= 1)
784 return false;
786 /* Simply insert cexpi at the beginning of top_bb but not earlier than
787 the name def statement. */
788 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
789 if (!fndecl)
790 return false;
791 stmt = gimple_build_call (fndecl, 1, name);
792 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
793 gimple_call_set_lhs (stmt, res);
795 def_stmt = SSA_NAME_DEF_STMT (name);
796 if (!SSA_NAME_IS_DEFAULT_DEF (name)
797 && gimple_code (def_stmt) != GIMPLE_PHI
798 && gimple_bb (def_stmt) == top_bb)
800 gsi = gsi_for_stmt (def_stmt);
801 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
803 else
805 gsi = gsi_after_labels (top_bb);
806 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
808 sincos_stats.inserted++;
810 /* And adjust the recorded old call sites. */
811 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
813 tree rhs = NULL;
814 fndecl = gimple_call_fndecl (use_stmt);
816 switch (DECL_FUNCTION_CODE (fndecl))
818 CASE_FLT_FN (BUILT_IN_COS):
819 rhs = fold_build1 (REALPART_EXPR, type, res);
820 break;
822 CASE_FLT_FN (BUILT_IN_SIN):
823 rhs = fold_build1 (IMAGPART_EXPR, type, res);
824 break;
826 CASE_FLT_FN (BUILT_IN_CEXPI):
827 rhs = res;
828 break;
830 default:;
831 gcc_unreachable ();
834 /* Replace call with a copy. */
835 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
837 gsi = gsi_for_stmt (use_stmt);
838 gsi_replace (&gsi, stmt, true);
839 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
840 cfg_changed = true;
843 return cfg_changed;
846 /* To evaluate powi(x,n), the floating point value x raised to the
847 constant integer exponent n, we use a hybrid algorithm that
848 combines the "window method" with look-up tables. For an
849 introduction to exponentiation algorithms and "addition chains",
850 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
851 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
852 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
853 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
855 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
856 multiplications to inline before calling the system library's pow
857 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
858 so this default never requires calling pow, powf or powl. */
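/* (The bound follows from square-and-multiply: at most bits(n) - 1 squarings
   plus at most bits(n) - 1 extra multiplies for the set bits of n.)  */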
860 #ifndef POWI_MAX_MULTS
861 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
862 #endif
864 /* The size of the "optimal power tree" lookup table. All
865 exponents less than this value are simply looked up in the
866 powi_table below. This threshold is also used to size the
867 cache of pseudo registers that hold intermediate results. */
868 #define POWI_TABLE_SIZE 256
870 /* The size, in bits, of the window used in the "window method"
871 exponentiation algorithm. This is equivalent to a radix of
872 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
873 #define POWI_WINDOW_SIZE 3
875 /* The following table is an efficient representation of an
876 "optimal power tree". For each value, i, the corresponding
877 value, j, in the table states that an optimal evaluation
878 sequence for calculating pow(x,i) can be found by evaluating
879 pow(x,j)*pow(x,i-j). An optimal power tree for the first
880 100 integers is given in Knuth's "Seminumerical Algorithms". */
882 static const unsigned char powi_table[POWI_TABLE_SIZE] =
884 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
885 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
886 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
887 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
888 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
889 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
890 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
891 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
892 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
893 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
894 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
895 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
896 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
897 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
898 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
899 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
900 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
901 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
902 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
903 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
904 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
905 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
906 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
907 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
908 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
909 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
910 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
911 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
912 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
913 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
914 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
915 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
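/* For example, powi_table[15] is 9, so x**15 is evaluated as x**9 * x**6;
   expanding recursively (x**9 = x**6 * x**3, x**6 = x**3 * x**3,
   x**3 = x**2 * x, x**2 = x * x) needs only five multiplications.  */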
919 /* Return the number of multiplications required to calculate
920 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
921 subroutine of powi_cost. CACHE is an array indicating
922 which exponents have already been calculated. */
924 static int
925 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
927 /* If we've already calculated this exponent, then this evaluation
928 doesn't require any additional multiplications. */
929 if (cache[n])
930 return 0;
932 cache[n] = true;
933 return powi_lookup_cost (n - powi_table[n], cache)
934 + powi_lookup_cost (powi_table[n], cache) + 1;
937 /* Return the number of multiplications required to calculate
938 powi(x,n) for an arbitrary x, given the exponent N. This
939 function needs to be kept in sync with powi_as_mults below. */
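/* For instance, powi_cost (257) is 9: the loop below consumes the low
   POWI_WINDOW_SIZE bits of the exponent (digit 1) at a cost of
   powi_lookup_cost (1) + POWI_WINDOW_SIZE + 1 == 4, leaving val == 32,
   and powi_lookup_cost (32) accounts for the remaining 5 squarings.  */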
941 static int
942 powi_cost (HOST_WIDE_INT n)
944 bool cache[POWI_TABLE_SIZE];
945 unsigned HOST_WIDE_INT digit;
946 unsigned HOST_WIDE_INT val;
947 int result;
949 if (n == 0)
950 return 0;
952 /* Ignore the reciprocal when calculating the cost. */
953 val = (n < 0) ? -n : n;
955 /* Initialize the exponent cache. */
956 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
957 cache[1] = true;
959 result = 0;
961 while (val >= POWI_TABLE_SIZE)
963 if (val & 1)
965 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
966 result += powi_lookup_cost (digit, cache)
967 + POWI_WINDOW_SIZE + 1;
968 val >>= POWI_WINDOW_SIZE;
970 else
972 val >>= 1;
973 result++;
977 return result + powi_lookup_cost (val, cache);
980 /* Recursive subroutine of powi_as_mults. This function takes the
981 array, CACHE, of already calculated exponents and an exponent N and
982 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
984 static tree
985 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
986 HOST_WIDE_INT n, tree *cache)
988 tree op0, op1, ssa_target;
989 unsigned HOST_WIDE_INT digit;
990 gassign *mult_stmt;
992 if (n < POWI_TABLE_SIZE && cache[n])
993 return cache[n];
995 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
997 if (n < POWI_TABLE_SIZE)
999 cache[n] = ssa_target;
1000 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1001 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1003 else if (n & 1)
1005 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1006 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1007 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1009 else
1011 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1012 op1 = op0;
1015 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1016 gimple_set_location (mult_stmt, loc);
1017 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1019 return ssa_target;
1022 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1023 This function needs to be kept in sync with powi_cost above. */
1025 static tree
1026 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1027 tree arg0, HOST_WIDE_INT n)
1029 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1030 gassign *div_stmt;
1031 tree target;
1033 if (n == 0)
1034 return build_real (type, dconst1);
1036 memset (cache, 0, sizeof (cache));
1037 cache[1] = arg0;
1039 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1040 if (n >= 0)
1041 return result;
1043 /* If the original exponent was negative, reciprocate the result. */
1044 target = make_temp_ssa_name (type, NULL, "powmult");
1045 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1046 build_real (type, dconst1), result);
1047 gimple_set_location (div_stmt, loc);
1048 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1050 return target;
1053 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1054 location info LOC. If the arguments are appropriate, create an
1055 equivalent sequence of statements prior to GSI using an optimal
1056 number of multiplications, and return an expression holding the
1057 result. */
1059 static tree
1060 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1061 tree arg0, HOST_WIDE_INT n)
1063 /* Avoid largest negative number. */
1064 if (n != -n
1065 && ((n >= -1 && n <= 2)
1066 || (optimize_function_for_speed_p (cfun)
1067 && powi_cost (n) <= POWI_MAX_MULTS)))
1068 return powi_as_mults (gsi, loc, arg0, n);
1070 return NULL_TREE;
1073 /* Build a gimple call statement that calls FN with argument ARG.
1074 Set the lhs of the call statement to a fresh SSA name. Insert the
1075 statement prior to GSI's current position, and return the fresh
1076 SSA name. */
1078 static tree
1079 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1080 tree fn, tree arg)
1082 gcall *call_stmt;
1083 tree ssa_target;
1085 call_stmt = gimple_build_call (fn, 1, arg);
1086 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1087 gimple_set_lhs (call_stmt, ssa_target);
1088 gimple_set_location (call_stmt, loc);
1089 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1091 return ssa_target;
1094 /* Build a gimple binary operation with the given CODE and arguments
1095 ARG0, ARG1, assigning the result to a new SSA name called NAME.
1096 Insert the statement prior to GSI's current position, and
1097 return the fresh SSA name. */
1099 static tree
1100 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1101 const char *name, enum tree_code code,
1102 tree arg0, tree arg1)
1104 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1105 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1106 gimple_set_location (stmt, loc);
1107 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1108 return result;
1111 /* Build a gimple reference operation with the given CODE and argument
1112 ARG, assigning the result to a new SSA name of TYPE with NAME.
1113 Insert the statement prior to GSI's current position, and return
1114 the fresh SSA name. */
1116 static inline tree
1117 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1118 const char *name, enum tree_code code, tree arg0)
1120 tree result = make_temp_ssa_name (type, NULL, name);
1121 gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1122 gimple_set_location (stmt, loc);
1123 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1124 return result;
1127 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1128 prior to GSI's current position, and return the fresh SSA name. */
1130 static tree
1131 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1132 tree type, tree val)
1134 tree result = make_ssa_name (type);
1135 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1136 gimple_set_location (stmt, loc);
1137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1138 return result;
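/* Bookkeeping for synthesizing pow (x, c) as chains of square roots:
   FACTORS holds the a[i] flags of the 0.5-power series, DEEPEST the length
   of the longest sqrt chain used and NUM_MULTS the number of multiplications
   needed to combine the chains (see representable_as_half_series_p).  */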
1141 struct pow_synth_sqrt_info
1143 bool *factors;
1144 unsigned int deepest;
1145 unsigned int num_mults;
1148 /* Return true iff the real value C can be represented as a
1149 sum of powers of 0.5 up to N. That is:
1150 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1151 Record in INFO the various parameters of the synthesis algorithm such
1152 as the factors a[i], the maximum 0.5 power and the number of
1153 multiplications that will be required. */
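/* For example, C == 0.625 == 0.5**1 + 0.5**3 is representable, with
   factors {1, 0, 1}, deepest == 3 and num_mults == 1, corresponding to
   sqrt (x) * sqrt (sqrt (sqrt (x))).  */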
1155 bool
1156 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1157 struct pow_synth_sqrt_info *info)
1159 REAL_VALUE_TYPE factor = dconsthalf;
1160 REAL_VALUE_TYPE remainder = c;
1162 info->deepest = 0;
1163 info->num_mults = 0;
1164 memset (info->factors, 0, n * sizeof (bool));
1166 for (unsigned i = 0; i < n; i++)
1168 REAL_VALUE_TYPE res;
1170 /* If something inexact happened bail out now. */
1171 if (REAL_ARITHMETIC (res, MINUS_EXPR, remainder, factor))
1172 return false;
1174 /* We have hit zero. The number is representable as a sum
1175 of powers of 0.5. */
1176 if (REAL_VALUES_EQUAL (res, dconst0))
1178 info->factors[i] = true;
1179 info->deepest = i + 1;
1180 return true;
1182 else if (!REAL_VALUE_NEGATIVE (res))
1184 remainder = res;
1185 info->factors[i] = true;
1186 info->num_mults++;
1188 else
1189 info->factors[i] = false;
1191 REAL_ARITHMETIC (factor, MULT_EXPR, factor, dconsthalf);
1193 return false;
1196 /* Return the tree corresponding to FN being applied
1197 to ARG N times at GSI and LOC.
1198 Look up previous results from CACHE if need be.
1199 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1201 static tree
1202 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1203 tree fn, location_t loc, tree *cache)
1205 tree res = cache[n];
1206 if (!res)
1208 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1209 res = build_and_insert_call (gsi, loc, fn, prev);
1210 cache[n] = res;
1213 return res;
1216 /* Print to STREAM the repeated application of function FNAME to ARG
1217 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1218 "foo (foo (x))". */
1220 static void
1221 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1222 unsigned int n)
1224 if (n == 0)
1225 fprintf (stream, "%s", arg);
1226 else
1228 fprintf (stream, "%s (", fname);
1229 print_nested_fn (stream, fname, arg, n - 1);
1230 fprintf (stream, ")");
1234 /* Print to STREAM the fractional sequence of sqrt chains
1235 applied to ARG, described by INFO. Used for the dump file. */
1237 static void
1238 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1239 struct pow_synth_sqrt_info *info)
1241 for (unsigned int i = 0; i < info->deepest; i++)
1243 bool is_set = info->factors[i];
1244 if (is_set)
1246 print_nested_fn (stream, "sqrt", arg, i + 1);
1247 if (i != info->deepest - 1)
1248 fprintf (stream, " * ");
1253 /* Print to STREAM a representation of raising ARG to an integer
1254 power N. Used for the dump file. */
1256 static void
1257 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1259 if (n > 1)
1260 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1261 else if (n == 1)
1262 fprintf (stream, "%s", arg);
1265 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1266 square roots. Place at GSI and LOC. Limit the maximum depth
1267 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1268 result of the expanded sequence or NULL_TREE if the expansion failed.
1270 This routine assumes that ARG1 is a real number with a fractional part
1271 (the integer exponent case will have been handled earlier in
1272 gimple_expand_builtin_pow).
1274 For ARG1 > 0.0:
1275 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1276 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1277 FRAC_PART == ARG1 - WHOLE_PART:
1278 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1279 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1280 if it can be expressed as such, that is if FRAC_PART satisfies:
1281 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1282 where integer a[i] is either 0 or 1.
1284 Example:
1285 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1286 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1288 For ARG1 < 0.0 there are two approaches:
1289 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1290 is calculated as above.
1292 Example:
1293 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1294 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1296 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1297 FRAC_PART := ARG1 - WHOLE_PART
1298 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1299 Example:
1300 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1301 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1303 For ARG1 < 0.0 we choose between (A) and (B) depending on
1304 how many multiplications we'd have to do.
1305 So, for the example in (B): POW (x, -5.875), if we were to
1306 follow algorithm (A) we would produce:
1307 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1308 which contains more multiplications than approach (B).
1310 Hopefully, this approach will eliminate potentially expensive POW library
1311 calls when unsafe floating point math is enabled and allow the compiler to
1312 further optimise the multiplies, square roots and divides produced by this
1313 function. */
1315 static tree
1316 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1317 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1319 tree type = TREE_TYPE (arg0);
1320 machine_mode mode = TYPE_MODE (type);
1321 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1322 bool one_over = true;
1324 if (!sqrtfn)
1325 return NULL_TREE;
1327 if (TREE_CODE (arg1) != REAL_CST)
1328 return NULL_TREE;
1330 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1332 gcc_assert (max_depth > 0);
1333 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1335 struct pow_synth_sqrt_info synth_info;
1336 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1337 synth_info.deepest = 0;
1338 synth_info.num_mults = 0;
1340 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1341 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1343 /* The whole and fractional parts of exp. */
1344 REAL_VALUE_TYPE whole_part;
1345 REAL_VALUE_TYPE frac_part;
1347 real_floor (&whole_part, mode, &exp);
1348 REAL_ARITHMETIC (frac_part, MINUS_EXPR, exp, whole_part);
1351 REAL_VALUE_TYPE ceil_whole = dconst0;
1352 REAL_VALUE_TYPE ceil_fract = dconst0;
1354 if (neg_exp)
1356 real_ceil (&ceil_whole, mode, &exp);
1357 REAL_ARITHMETIC (ceil_fract, MINUS_EXPR, ceil_whole, exp);
1360 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1361 return NULL_TREE;
1363 /* Check whether it's more profitable to not use 1.0 / ... */
1364 if (neg_exp)
1366 struct pow_synth_sqrt_info alt_synth_info;
1367 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1368 alt_synth_info.deepest = 0;
1369 alt_synth_info.num_mults = 0;
1371 if (representable_as_half_series_p (ceil_fract, max_depth,
1372 &alt_synth_info)
1373 && alt_synth_info.deepest <= synth_info.deepest
1374 && alt_synth_info.num_mults < synth_info.num_mults)
1376 whole_part = ceil_whole;
1377 frac_part = ceil_fract;
1378 synth_info.deepest = alt_synth_info.deepest;
1379 synth_info.num_mults = alt_synth_info.num_mults;
1380 memcpy (synth_info.factors, alt_synth_info.factors,
1381 (max_depth + 1) * sizeof (bool));
1382 one_over = false;
1386 HOST_WIDE_INT n = real_to_integer (&whole_part);
1387 REAL_VALUE_TYPE cint;
1388 real_from_integer (&cint, VOIDmode, n, SIGNED);
1390 if (!real_identical (&whole_part, &cint))
1391 return NULL_TREE;
1393 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1394 return NULL_TREE;
1396 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1398 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1400 /* Calculate the integer part of the exponent. */
1401 if (n > 1)
1403 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1404 if (!integer_res)
1405 return NULL_TREE;
1408 if (dump_file)
1410 char string[64];
1412 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1413 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1415 if (neg_exp)
1417 if (one_over)
1419 fprintf (dump_file, "1.0 / (");
1420 dump_integer_part (dump_file, "x", n);
1421 if (n > 0)
1422 fprintf (dump_file, " * ");
1423 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1424 fprintf (dump_file, ")");
1426 else
1428 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1429 fprintf (dump_file, " / (");
1430 dump_integer_part (dump_file, "x", n);
1431 fprintf (dump_file, ")");
1434 else
1436 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1437 if (n > 0)
1438 fprintf (dump_file, " * ");
1439 dump_integer_part (dump_file, "x", n);
1442 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1446 tree fract_res = NULL_TREE;
1447 cache[0] = arg0;
1449 /* Calculate the fractional part of the exponent. */
1450 for (unsigned i = 0; i < synth_info.deepest; i++)
1452 if (synth_info.factors[i])
1454 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1456 if (!fract_res)
1457 fract_res = sqrt_chain;
1459 else
1460 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1461 fract_res, sqrt_chain);
1465 tree res = NULL_TREE;
1467 if (neg_exp)
1469 if (one_over)
1471 if (n > 0)
1472 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1473 fract_res, integer_res);
1474 else
1475 res = fract_res;
1477 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1478 build_real (type, dconst1), res);
1480 else
1482 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1483 fract_res, integer_res);
1486 else
1487 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1488 fract_res, integer_res);
1489 return res;
1492 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1493 with location info LOC. If possible, create an equivalent and
1494 less expensive sequence of statements prior to GSI, and return an
1495 expression holding the result. */
1497 static tree
1498 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1499 tree arg0, tree arg1)
1501 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1502 REAL_VALUE_TYPE c2, dconst3;
1503 HOST_WIDE_INT n;
1504 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1505 machine_mode mode;
1506 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1507 bool hw_sqrt_exists, c_is_int, c2_is_int;
1509 dconst1_4 = dconst1;
1510 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1512 /* If the exponent isn't a constant, there's nothing of interest
1513 to be done. */
1514 if (TREE_CODE (arg1) != REAL_CST)
1515 return NULL_TREE;
1517 /* If the exponent is equivalent to an integer, expand to an optimal
1518 multiplication sequence when profitable. */
1519 c = TREE_REAL_CST (arg1);
1520 n = real_to_integer (&c);
1521 real_from_integer (&cint, VOIDmode, n, SIGNED);
1522 c_is_int = real_identical (&c, &cint);
1524 if (c_is_int
1525 && ((n >= -1 && n <= 2)
1526 || (flag_unsafe_math_optimizations
1527 && speed_p
1528 && powi_cost (n) <= POWI_MAX_MULTS)))
1529 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1531 /* Attempt various optimizations using sqrt and cbrt. */
1532 type = TREE_TYPE (arg0);
1533 mode = TYPE_MODE (type);
1534 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1536 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1537 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1538 sqrt(-0) = -0. */
1539 if (sqrtfn
1540 && REAL_VALUES_EQUAL (c, dconsthalf)
1541 && !HONOR_SIGNED_ZEROS (mode))
1542 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1544 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1546 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1547 optimizations since 1./3. is not exactly representable. If x
1548 is negative and finite, the correct value of pow(x,1./3.) is
1549 a NaN with the "invalid" exception raised, because the value
1550 of 1./3. actually has an even denominator. The correct value
1551 of cbrt(x) is a negative real value. */
1552 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1553 dconst1_3 = real_value_truncate (mode, dconst_third ());
1555 if (flag_unsafe_math_optimizations
1556 && cbrtfn
1557 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1558 && REAL_VALUES_EQUAL (c, dconst1_3))
1559 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1561 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1562 if we don't have a hardware sqrt insn. */
1563 dconst1_6 = dconst1_3;
1564 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1566 if (flag_unsafe_math_optimizations
1567 && sqrtfn
1568 && cbrtfn
1569 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1570 && speed_p
1571 && hw_sqrt_exists
1572 && REAL_VALUES_EQUAL (c, dconst1_6))
1574 /* sqrt(x) */
1575 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1577 /* cbrt(sqrt(x)) */
1578 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1582 /* Attempt to expand the POW as a product of square root chains.
1583 Expand the 0.25 case even when optimising for size. */
1584 if (flag_unsafe_math_optimizations
1585 && sqrtfn
1586 && hw_sqrt_exists
1587 && (speed_p || REAL_VALUES_EQUAL (c, dconst1_4))
1588 && !HONOR_SIGNED_ZEROS (mode))
1590 unsigned int max_depth = speed_p
1591 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1592 : 2;
1594 tree expand_with_sqrts
1595 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1597 if (expand_with_sqrts)
1598 return expand_with_sqrts;
1601 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1602 n = real_to_integer (&c2);
1603 real_from_integer (&cint, VOIDmode, n, SIGNED);
1604 c2_is_int = real_identical (&c2, &cint);
1606 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1608 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1609 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1611 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1612 different from pow(x, 1./3.) due to rounding and behavior with
1613 negative x, we need to constrain this transformation to unsafe
1614 math and positive x or finite math. */
1615 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1616 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1617 real_round (&c2, mode, &c2);
1618 n = real_to_integer (&c2);
1619 real_from_integer (&cint, VOIDmode, n, SIGNED);
1620 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1621 real_convert (&c2, mode, &c2);
1623 if (flag_unsafe_math_optimizations
1624 && cbrtfn
1625 && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1626 && real_identical (&c2, &c)
1627 && !c2_is_int
1628 && optimize_function_for_speed_p (cfun)
1629 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1631 tree powi_x_ndiv3 = NULL_TREE;
1633 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1634 possible or profitable, give up. Skip the degenerate case when
1635 abs(n) < 3, where the result is always 1. */
1636 if (absu_hwi (n) >= 3)
1638 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1639 abs_hwi (n / 3));
1640 if (!powi_x_ndiv3)
1641 return NULL_TREE;
1644 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1645 as that creates an unnecessary variable. Instead, just produce
1646 either cbrt(x) or cbrt(x) * cbrt(x). */
1647 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1649 if (absu_hwi (n) % 3 == 1)
1650 powi_cbrt_x = cbrt_x;
1651 else
1652 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1653 cbrt_x, cbrt_x);
1655 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1656 if (absu_hwi (n) < 3)
1657 result = powi_cbrt_x;
1658 else
1659 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1660 powi_x_ndiv3, powi_cbrt_x);
1662 /* If n is negative, reciprocate the result. */
1663 if (n < 0)
1664 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1665 build_real (type, dconst1), result);
1667 return result;
1670 /* No optimizations succeeded. */
1671 return NULL_TREE;
1674 /* ARG is the argument to a cabs builtin call in GSI with location info
1675 LOC. Create a sequence of statements prior to GSI that calculates
1676 sqrt(R*R + I*I), where R and I are the real and imaginary components
1677 of ARG, respectively. Return an expression holding the result. */
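/* A sketch of the emitted sequence (the temporary names are only
   illustrative):

     cabs_r = REALPART_EXPR <arg>;
     cabs_rr = cabs_r * cabs_r;
     cabs_i = IMAGPART_EXPR <arg>;
     cabs_ii = cabs_i * cabs_i;
     cabs_sum = cabs_rr + cabs_ii;
     result = sqrt (cabs_sum);  */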
1679 static tree
1680 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1682 tree real_part, imag_part, addend1, addend2, sum, result;
1683 tree type = TREE_TYPE (TREE_TYPE (arg));
1684 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1685 machine_mode mode = TYPE_MODE (type);
1687 if (!flag_unsafe_math_optimizations
1688 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1689 || !sqrtfn
1690 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1691 return NULL_TREE;
1693 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1694 REALPART_EXPR, arg);
1695 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1696 real_part, real_part);
1697 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1698 IMAGPART_EXPR, arg);
1699 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1700 imag_part, imag_part);
1701 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1702 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1704 return result;
1707 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1708 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1709 an optimal number of multiplies, when n is a constant. */
1711 namespace {
1713 const pass_data pass_data_cse_sincos =
1715 GIMPLE_PASS, /* type */
1716 "sincos", /* name */
1717 OPTGROUP_NONE, /* optinfo_flags */
1718 TV_NONE, /* tv_id */
1719 PROP_ssa, /* properties_required */
1720 0, /* properties_provided */
1721 0, /* properties_destroyed */
1722 0, /* todo_flags_start */
1723 TODO_update_ssa, /* todo_flags_finish */
1726 class pass_cse_sincos : public gimple_opt_pass
1728 public:
1729 pass_cse_sincos (gcc::context *ctxt)
1730 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1733 /* opt_pass methods: */
1734 virtual bool gate (function *)
1736 /* We no longer require either sincos or cexp, since powi expansion
1737 piggybacks on this pass. */
1738 return optimize;
1741 virtual unsigned int execute (function *);
1743 }; // class pass_cse_sincos
1745 unsigned int
1746 pass_cse_sincos::execute (function *fun)
1748 basic_block bb;
1749 bool cfg_changed = false;
1751 calculate_dominance_info (CDI_DOMINATORS);
1752 memset (&sincos_stats, 0, sizeof (sincos_stats));
1754 FOR_EACH_BB_FN (bb, fun)
1756 gimple_stmt_iterator gsi;
1757 bool cleanup_eh = false;
1759 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1761 gimple stmt = gsi_stmt (gsi);
1762 tree fndecl;
1764 /* Only the last stmt in a bb could throw; no need to call
1765 gimple_purge_dead_eh_edges if we change something in the middle
1766 of a basic block. */
1767 cleanup_eh = false;
1769 if (is_gimple_call (stmt)
1770 && gimple_call_lhs (stmt)
1771 && (fndecl = gimple_call_fndecl (stmt))
1772 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1774 tree arg, arg0, arg1, result;
1775 HOST_WIDE_INT n;
1776 location_t loc;
1778 switch (DECL_FUNCTION_CODE (fndecl))
1780 CASE_FLT_FN (BUILT_IN_COS):
1781 CASE_FLT_FN (BUILT_IN_SIN):
1782 CASE_FLT_FN (BUILT_IN_CEXPI):
1783 /* Make sure we have either sincos or cexp. */
1784 if (!targetm.libc_has_function (function_c99_math_complex)
1785 && !targetm.libc_has_function (function_sincos))
1786 break;
1788 arg = gimple_call_arg (stmt, 0);
1789 if (TREE_CODE (arg) == SSA_NAME)
1790 cfg_changed |= execute_cse_sincos_1 (arg);
1791 break;
1793 CASE_FLT_FN (BUILT_IN_POW):
1794 arg0 = gimple_call_arg (stmt, 0);
1795 arg1 = gimple_call_arg (stmt, 1);
1797 loc = gimple_location (stmt);
1798 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1800 if (result)
1802 tree lhs = gimple_get_lhs (stmt);
1803 gassign *new_stmt = gimple_build_assign (lhs, result);
1804 gimple_set_location (new_stmt, loc);
1805 unlink_stmt_vdef (stmt);
1806 gsi_replace (&gsi, new_stmt, true);
1807 cleanup_eh = true;
1808 if (gimple_vdef (stmt))
1809 release_ssa_name (gimple_vdef (stmt));
1811 break;
1813 CASE_FLT_FN (BUILT_IN_POWI):
1814 arg0 = gimple_call_arg (stmt, 0);
1815 arg1 = gimple_call_arg (stmt, 1);
1816 loc = gimple_location (stmt);
1818 if (real_minus_onep (arg0))
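/* powi (-1, n) is 1.0 for even n and -1.0 for odd n, so select
   between the two constants on the low bit of the exponent instead
   of expanding a multiplication chain. */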
1820 tree t0, t1, cond, one, minus_one;
1821 gassign *stmt;
1823 t0 = TREE_TYPE (arg0);
1824 t1 = TREE_TYPE (arg1);
1825 one = build_real (t0, dconst1);
1826 minus_one = build_real (t0, dconstm1);
1828 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1829 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1830 arg1, build_int_cst (t1, 1));
1831 gimple_set_location (stmt, loc);
1832 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1834 result = make_temp_ssa_name (t0, NULL, "powi");
1835 stmt = gimple_build_assign (result, COND_EXPR, cond,
1836 minus_one, one);
1837 gimple_set_location (stmt, loc);
1838 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1840 else
1842 if (!tree_fits_shwi_p (arg1))
1843 break;
1845 n = tree_to_shwi (arg1);
1846 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1849 if (result)
1851 tree lhs = gimple_get_lhs (stmt);
1852 gassign *new_stmt = gimple_build_assign (lhs, result);
1853 gimple_set_location (new_stmt, loc);
1854 unlink_stmt_vdef (stmt);
1855 gsi_replace (&gsi, new_stmt, true);
1856 cleanup_eh = true;
1857 if (gimple_vdef (stmt))
1858 release_ssa_name (gimple_vdef (stmt));
1860 break;
1862 CASE_FLT_FN (BUILT_IN_CABS):
1863 arg0 = gimple_call_arg (stmt, 0);
1864 loc = gimple_location (stmt);
1865 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1867 if (result)
1869 tree lhs = gimple_get_lhs (stmt);
1870 gassign *new_stmt = gimple_build_assign (lhs, result);
1871 gimple_set_location (new_stmt, loc);
1872 unlink_stmt_vdef (stmt);
1873 gsi_replace (&gsi, new_stmt, true);
1874 cleanup_eh = true;
1875 if (gimple_vdef (stmt))
1876 release_ssa_name (gimple_vdef (stmt));
1878 break;
1880 default:;
1884 if (cleanup_eh)
1885 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1888 statistics_counter_event (fun, "sincos statements inserted",
1889 sincos_stats.inserted);
1891 free_dominance_info (CDI_DOMINATORS);
1892 return cfg_changed ? TODO_cleanup_cfg : 0;
1895 } // anon namespace
1897 gimple_opt_pass *
1898 make_pass_cse_sincos (gcc::context *ctxt)
1900 return new pass_cse_sincos (ctxt);
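/* Illustrative sketch (editorial, not part of the original sources): at the
   C level the pass above rewrites, roughly,

     s = sinf (x);
     c = cosf (x);

   into a single call computing both values (via cexpi/sincos), and expands

     r = __builtin_powi (-1.0, n);

   into the branchless form built by the code above:

     powi_cond = n & 1;
     r = powi_cond ? -1.0 : 1.0;

   The exact temporaries shown here are only for illustration.  */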
1903 /* A symbolic number is used to detect byte permutation and selection
1904 patterns. Therefore the field N contains an artificial number
1905 consisting of octet sized markers:
1907 0 - target byte has the value 0
1908 FF - target byte has an unknown value (eg. due to sign extension)
1909 1..size - marker value is the target byte index minus one.
1911 To detect permutations on memory sources (arrays and structures), a symbolic
1912 number is also associated with a base address (the array or structure the load is
1913 made from), an offset from the base address and a range which gives the
1914 difference between the highest and lowest accessed memory location to make
1915 such a symbolic number. The range is thus different from size which reflects
1916 the size of the type of the current expression. Note that for a non-memory source,
1917 range holds the same value as size.
1919 For instance, for an array char a[], (short) a[0] | (short) a[3] would have
1920 a size of 2 but a range of 4 while (short) a[0] | ((short) a[0] << 1) would
1921 still have a size of 2 but this time a range of 1. */
1923 struct symbolic_number {
1924 uint64_t n;
1925 tree type;
1926 tree base_addr;
1927 tree offset;
1928 HOST_WIDE_INT bytepos;
1929 tree alias_set;
1930 tree vuse;
1931 unsigned HOST_WIDE_INT range;
1934 #define BITS_PER_MARKER 8
1935 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1936 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1937 #define HEAD_MARKER(n, size) \
1938 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1940 /* The number which the find_bswap_or_nop_1 result should match in
1941 order to have a nop. The number is masked according to the size of
1942 the symbolic number before using it. */
1943 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1944 (uint64_t)0x08070605 << 32 | 0x04030201)
1946 /* The number which the find_bswap_or_nop_1 result should match in
1947 order to have a byte swap. The number is masked according to the
1948 size of the symbolic number before using it. */
1949 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1950 (uint64_t)0x01020304 << 32 | 0x05060708)
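/* For illustration (editorial sketch): with BITS_PER_MARKER == 8, a 4-byte
   symbolic number that reads memory "as is" matches the masked CMPNOP
   pattern

     n->n == 0x04030201

   whereas a full 4-byte swap matches the shifted CMPXCHG pattern

     n->n == 0x01020304.  */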
1952 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1953 number N. Return false if the requested operation is not permitted
1954 on a symbolic number. */
1956 static inline bool
1957 do_shift_rotate (enum tree_code code,
1958 struct symbolic_number *n,
1959 int count)
1961 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1962 unsigned head_marker;
1964 if (count % BITS_PER_UNIT != 0)
1965 return false;
1966 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1968 /* Zero out the extra bits of N in order to avoid them being shifted
1969 into the significant bits. */
1970 if (size < 64 / BITS_PER_MARKER)
1971 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1973 switch (code)
1975 case LSHIFT_EXPR:
1976 n->n <<= count;
1977 break;
1978 case RSHIFT_EXPR:
1979 head_marker = HEAD_MARKER (n->n, size);
1980 n->n >>= count;
1981 /* Arithmetic shift of signed type: result is dependent on the value. */
1982 if (!TYPE_UNSIGNED (n->type) && head_marker)
1983 for (i = 0; i < count / BITS_PER_MARKER; i++)
1984 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1985 << ((size - 1 - i) * BITS_PER_MARKER);
1986 break;
1987 case LROTATE_EXPR:
1988 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1989 break;
1990 case RROTATE_EXPR:
1991 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1992 break;
1993 default:
1994 return false;
1996 /* Zero unused bits for size. */
1997 if (size < 64 / BITS_PER_MARKER)
1998 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1999 return true;
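/* Example (editorial): for a 32-bit unsigned symbolic number with
   n->n == 0x04030201, an LSHIFT_EXPR by 8 bits becomes a shift by one
   marker, giving

     n->n == 0x03020100

   i.e. the low byte is now known to be zero and the remaining markers have
   moved up by one byte.  */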
2002 /* Perform sanity checking for the symbolic number N and the gimple
2003 statement STMT. */
2005 static inline bool
2006 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
2008 tree lhs_type;
2010 lhs_type = gimple_expr_type (stmt);
2012 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2013 return false;
2015 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2016 return false;
2018 return true;
2021 /* Initialize the symbolic number N for the bswap pass from the base element
2022 SRC manipulated by the bitwise OR expression. */
2024 static bool
2025 init_symbolic_number (struct symbolic_number *n, tree src)
2027 int size;
2029 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2031 /* Set up the symbolic number N by setting each byte to a value between 1 and
2032 the byte size of rhs1. The highest order byte is set to n->size and the
2033 lowest order byte to 1. */
2034 n->type = TREE_TYPE (src);
2035 size = TYPE_PRECISION (n->type);
2036 if (size % BITS_PER_UNIT != 0)
2037 return false;
2038 size /= BITS_PER_UNIT;
2039 if (size > 64 / BITS_PER_MARKER)
2040 return false;
2041 n->range = size;
2042 n->n = CMPNOP;
2044 if (size < 64 / BITS_PER_MARKER)
2045 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2047 return true;
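/* Example (editorial): for a 16-bit source, init_symbolic_number sets
   n->type to that 16-bit type, n->range to 2 and n->n to the masked CMPNOP
   value 0x0201, i.e. marker 1 in the least significant byte and marker 2 in
   the most significant one.  */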
2050 /* Check if STMT might be a byte swap or a nop from a memory source and return
2051 the answer. If so, REF is that memory source and the base of the memory area
2052 accessed and the offset of the access from that base are recorded in N. */
2054 bool
2055 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
2057 /* Leaf node is an array or component ref. Memorize its base and
2058 offset from base to compare to other such leaf node. */
2059 HOST_WIDE_INT bitsize, bitpos;
2060 machine_mode mode;
2061 int unsignedp, volatilep;
2062 tree offset, base_addr;
2064 /* Not prepared to handle PDP endian. */
2065 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2066 return false;
2068 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2069 return false;
2071 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2072 &unsignedp, &volatilep, false);
2074 if (TREE_CODE (base_addr) == MEM_REF)
2076 offset_int bit_offset = 0;
2077 tree off = TREE_OPERAND (base_addr, 1);
2079 if (!integer_zerop (off))
2081 offset_int boff, coff = mem_ref_offset (base_addr);
2082 boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
2083 bit_offset += boff;
2086 base_addr = TREE_OPERAND (base_addr, 0);
2088 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2089 if (wi::neg_p (bit_offset))
2091 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2092 offset_int tem = bit_offset.and_not (mask);
2093 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2094 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2095 bit_offset -= tem;
2096 tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
2097 if (offset)
2098 offset = size_binop (PLUS_EXPR, offset,
2099 wide_int_to_tree (sizetype, tem));
2100 else
2101 offset = wide_int_to_tree (sizetype, tem);
2104 bitpos += bit_offset.to_shwi ();
2107 if (bitpos % BITS_PER_UNIT)
2108 return false;
2109 if (bitsize % BITS_PER_UNIT)
2110 return false;
2112 if (!init_symbolic_number (n, ref))
2113 return false;
2114 n->base_addr = base_addr;
2115 n->offset = offset;
2116 n->bytepos = bitpos / BITS_PER_UNIT;
2117 n->alias_set = reference_alias_ptr_type (ref);
2118 n->vuse = gimple_vuse (stmt);
2119 return true;
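/* Example (editorial): for a load of a[2] with "unsigned char a[4]", the
   function records base_addr = a, bytepos = 2, a one-byte range and the
   single-byte symbolic number n->n == 0x01, so later merges can tell which
   memory byte this load contributes.  */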
2122 /* Compute the symbolic number N representing the result of a bitwise OR on the
2123 two symbolic numbers N1 and N2 whose source statements are respectively
2124 SOURCE_STMT1 and SOURCE_STMT2. */
2126 static gimple
2127 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
2128 gimple source_stmt2, struct symbolic_number *n2,
2129 struct symbolic_number *n)
2131 int i, size;
2132 uint64_t mask;
2133 gimple source_stmt;
2134 struct symbolic_number *n_start;
2136 /* Sources are different, cancel bswap if they are not memory locations with
2137 the same base (array, structure, ...). */
2138 if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
2140 int64_t inc;
2141 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2142 struct symbolic_number *toinc_n_ptr, *n_end;
2144 if (!n1->base_addr || !n2->base_addr
2145 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2146 return NULL;
2148 if (!n1->offset != !n2->offset
2149 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2150 return NULL;
2152 if (n1->bytepos < n2->bytepos)
2154 n_start = n1;
2155 start_sub = n2->bytepos - n1->bytepos;
2156 source_stmt = source_stmt1;
2158 else
2160 n_start = n2;
2161 start_sub = n1->bytepos - n2->bytepos;
2162 source_stmt = source_stmt2;
2165 /* Find the highest address at which a load is performed and
2166 compute related info. */
2167 end1 = n1->bytepos + (n1->range - 1);
2168 end2 = n2->bytepos + (n2->range - 1);
2169 if (end1 < end2)
2171 end = end2;
2172 end_sub = end2 - end1;
2174 else
2176 end = end1;
2177 end_sub = end1 - end2;
2179 n_end = (end2 > end1) ? n2 : n1;
2181 /* Find symbolic number whose lsb is the most significant. */
2182 if (BYTES_BIG_ENDIAN)
2183 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2184 else
2185 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2187 n->range = end - n_start->bytepos + 1;
2189 /* Check that the range of memory covered can be represented by
2190 a symbolic number. */
2191 if (n->range > 64 / BITS_PER_MARKER)
2192 return NULL;
2194 /* Reinterpret byte marks in symbolic number holding the value of
2195 bigger weight according to target endianness. */
2196 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2197 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2198 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2200 unsigned marker
2201 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2202 if (marker && marker != MARKER_BYTE_UNKNOWN)
2203 toinc_n_ptr->n += inc;
2206 else
2208 n->range = n1->range;
2209 n_start = n1;
2210 source_stmt = source_stmt1;
2213 if (!n1->alias_set
2214 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2215 n->alias_set = n1->alias_set;
2216 else
2217 n->alias_set = ptr_type_node;
2218 n->vuse = n_start->vuse;
2219 n->base_addr = n_start->base_addr;
2220 n->offset = n_start->offset;
2221 n->bytepos = n_start->bytepos;
2222 n->type = n_start->type;
2223 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2225 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2227 uint64_t masked1, masked2;
2229 masked1 = n1->n & mask;
2230 masked2 = n2->n & mask;
2231 if (masked1 && masked2 && masked1 != masked2)
2232 return NULL;
2234 n->n = n1->n | n2->n;
2236 return source_stmt;
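/* Example (editorial): merging the symbolic numbers of the two one-byte
   loads in

     ((uint16_t) a[0] << 8) | (uint16_t) a[1]

   yields, on a little-endian target, n->range == 2 and n->n == 0x0102,
   which find_bswap_or_nop later recognizes as a 16-bit byte swap of the
   memory word at a.  */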
2239 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2240 the operation given by the rhs of STMT on the result. If the operation
2241 could be executed successfully, the function returns a gimple stmt whose
2242 rhs's first tree is the expression of the source operand, and NULL
2243 otherwise. */
2245 static gimple
2246 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
2248 enum tree_code code;
2249 tree rhs1, rhs2 = NULL;
2250 gimple rhs1_stmt, rhs2_stmt, source_stmt1;
2251 enum gimple_rhs_class rhs_class;
2253 if (!limit || !is_gimple_assign (stmt))
2254 return NULL;
2256 rhs1 = gimple_assign_rhs1 (stmt);
2258 if (find_bswap_or_nop_load (stmt, rhs1, n))
2259 return stmt;
2261 if (TREE_CODE (rhs1) != SSA_NAME)
2262 return NULL;
2264 code = gimple_assign_rhs_code (stmt);
2265 rhs_class = gimple_assign_rhs_class (stmt);
2266 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2268 if (rhs_class == GIMPLE_BINARY_RHS)
2269 rhs2 = gimple_assign_rhs2 (stmt);
2271 /* Handle unary rhs and binary rhs with integer constants as second
2272 operand. */
2274 if (rhs_class == GIMPLE_UNARY_RHS
2275 || (rhs_class == GIMPLE_BINARY_RHS
2276 && TREE_CODE (rhs2) == INTEGER_CST))
2278 if (code != BIT_AND_EXPR
2279 && code != LSHIFT_EXPR
2280 && code != RSHIFT_EXPR
2281 && code != LROTATE_EXPR
2282 && code != RROTATE_EXPR
2283 && !CONVERT_EXPR_CODE_P (code))
2284 return NULL;
2286 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2288 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2289 we have to initialize the symbolic number. */
2290 if (!source_stmt1)
2292 if (gimple_assign_load_p (stmt)
2293 || !init_symbolic_number (n, rhs1))
2294 return NULL;
2295 source_stmt1 = stmt;
2298 switch (code)
2300 case BIT_AND_EXPR:
2302 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2303 uint64_t val = int_cst_value (rhs2), mask = 0;
2304 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2306 /* Only constants masking full bytes are allowed. */
2307 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2308 if ((val & tmp) != 0 && (val & tmp) != tmp)
2309 return NULL;
2310 else if (val & tmp)
2311 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2313 n->n &= mask;
2315 break;
2316 case LSHIFT_EXPR:
2317 case RSHIFT_EXPR:
2318 case LROTATE_EXPR:
2319 case RROTATE_EXPR:
2320 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2321 return NULL;
2322 break;
2323 CASE_CONVERT:
2325 int i, type_size, old_type_size;
2326 tree type;
2328 type = gimple_expr_type (stmt);
2329 type_size = TYPE_PRECISION (type);
2330 if (type_size % BITS_PER_UNIT != 0)
2331 return NULL;
2332 type_size /= BITS_PER_UNIT;
2333 if (type_size > 64 / BITS_PER_MARKER)
2334 return NULL;
2336 /* Sign extension: result is dependent on the value. */
2337 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2338 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2339 && HEAD_MARKER (n->n, old_type_size))
2340 for (i = 0; i < type_size - old_type_size; i++)
2341 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2342 << ((type_size - 1 - i) * BITS_PER_MARKER);
2344 if (type_size < 64 / BITS_PER_MARKER)
2346 /* If STMT casts to a smaller type mask out the bits not
2347 belonging to the target type. */
2348 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2350 n->type = type;
2351 if (!n->base_addr)
2352 n->range = type_size;
2354 break;
2355 default:
2356 return NULL;
2358 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2361 /* Handle binary rhs. */
2363 if (rhs_class == GIMPLE_BINARY_RHS)
2365 struct symbolic_number n1, n2;
2366 gimple source_stmt, source_stmt2;
2368 if (code != BIT_IOR_EXPR)
2369 return NULL;
2371 if (TREE_CODE (rhs2) != SSA_NAME)
2372 return NULL;
2374 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2376 switch (code)
2378 case BIT_IOR_EXPR:
2379 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2381 if (!source_stmt1)
2382 return NULL;
2384 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2386 if (!source_stmt2)
2387 return NULL;
2389 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2390 return NULL;
2392 if (!n1.vuse != !n2.vuse
2393 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2394 return NULL;
2396 source_stmt
2397 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2399 if (!source_stmt)
2400 return NULL;
2402 if (!verify_symbolic_number_p (n, stmt))
2403 return NULL;
2405 break;
2406 default:
2407 return NULL;
2409 return source_stmt;
2411 return NULL;
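/* Example (editorial): the walker above can follow a classic open-coded
   byte swap such as

     uint32_t y = (x << 24)
                  | ((x & 0x0000ff00) << 8)
                  | ((x >> 8) & 0x0000ff00)
                  | (x >> 24);

   because every step is an OR, a shift/rotate, a full-byte AND mask or a
   conversion operating on the same source value x.  */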
2414 /* Check if STMT completes a bswap implementation or a read in a given
2415 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2416 accordingly. It also sets N to represent the kind of operations
2417 performed: size of the resulting expression and whether it works on
2418 a memory source, and if so its alias set and vuse. Finally, the
2419 function returns a stmt whose rhs's first tree is the source
2420 expression. */
2422 static gimple
2423 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2425 /* The number which the find_bswap_or_nop_1 result should match in order
2426 to have a full byte swap. The number is shifted to the right
2427 according to the size of the symbolic number before using it. */
2428 uint64_t cmpxchg = CMPXCHG;
2429 uint64_t cmpnop = CMPNOP;
2431 gimple source_stmt;
2432 int limit;
2434 /* The last parameter determines the search depth limit. It usually
2435 correlates directly to the number n of bytes to be touched. We
2436 increase that number by log2(n) + 1 here in order to also
2437 cover signed -> unsigned conversions of the src operand as can be seen
2438 in libgcc, and an initial shift/and operation of the src operand. */
2439 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2440 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2441 source_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2443 if (!source_stmt)
2444 return NULL;
2446 /* Find real size of result (highest non-zero byte). */
2447 if (n->base_addr)
2449 int rsize;
2450 uint64_t tmpn;
2452 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2453 n->range = rsize;
2456 /* Zero out the extra bits of N and CMP*. */
2457 if (n->range < (int) sizeof (int64_t))
2459 uint64_t mask;
2461 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2462 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2463 cmpnop &= mask;
2466 /* A complete byte swap should make the symbolic number start with
2467 the largest digit in the highest order byte. An unchanged symbolic
2468 number indicates a read with the same endianness as the target architecture. */
2469 if (n->n == cmpnop)
2470 *bswap = false;
2471 else if (n->n == cmpxchg)
2472 *bswap = true;
2473 else
2474 return NULL;
2476 /* Useless bit manipulation performed by code. */
2477 if (!n->base_addr && n->n == cmpnop)
2478 return NULL;
2480 n->range *= BITS_PER_UNIT;
2481 return source_stmt;
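/* Example (editorial): for a 2-byte result the comparison values above are
   reduced to cmpnop == 0x0201 and cmpxchg == 0x0102, so a merged symbolic
   number equal to 0x0102 is reported as a byte swap, and n->range becomes
   16 (bits) on return.  */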
2484 namespace {
2486 const pass_data pass_data_optimize_bswap =
2488 GIMPLE_PASS, /* type */
2489 "bswap", /* name */
2490 OPTGROUP_NONE, /* optinfo_flags */
2491 TV_NONE, /* tv_id */
2492 PROP_ssa, /* properties_required */
2493 0, /* properties_provided */
2494 0, /* properties_destroyed */
2495 0, /* todo_flags_start */
2496 0, /* todo_flags_finish */
2499 class pass_optimize_bswap : public gimple_opt_pass
2501 public:
2502 pass_optimize_bswap (gcc::context *ctxt)
2503 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2506 /* opt_pass methods: */
2507 virtual bool gate (function *)
2509 return flag_expensive_optimizations && optimize;
2512 virtual unsigned int execute (function *);
2514 }; // class pass_optimize_bswap
2516 /* Perform the bswap optimization: replace the expression computed in the rhs
2517 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2518 Which of these alternatives replaces the rhs is given by N->base_addr (non
2519 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2520 load to perform are also given in N, while the builtin bswap to invoke is
2521 given in FNDECL. Finally, if a load is involved, SRC_STMT refers to one of the
2522 load statements involved to construct the rhs in CUR_STMT and N->range gives
2523 the size of the rhs expression for maintaining some statistics.
2525 Note that if the replacement involves a load, CUR_STMT is moved just after
2526 SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2527 changing basic block. */
2529 static bool
2530 bswap_replace (gimple cur_stmt, gimple src_stmt, tree fndecl, tree bswap_type,
2531 tree load_type, struct symbolic_number *n, bool bswap)
2533 gimple_stmt_iterator gsi;
2534 tree src, tmp, tgt;
2535 gimple bswap_stmt;
2537 gsi = gsi_for_stmt (cur_stmt);
2538 src = gimple_assign_rhs1 (src_stmt);
2539 tgt = gimple_assign_lhs (cur_stmt);
2541 /* Need to load the value from memory first. */
2542 if (n->base_addr)
2544 gimple_stmt_iterator gsi_ins = gsi_for_stmt (src_stmt);
2545 tree addr_expr, addr_tmp, val_expr, val_tmp;
2546 tree load_offset_ptr, aligned_load_type;
2547 gimple addr_stmt, load_stmt;
2548 unsigned align;
2549 HOST_WIDE_INT load_offset = 0;
2551 align = get_object_alignment (src);
2552 /* If the new access is smaller than the original one, we need
2553 to perform big endian adjustment. */
2554 if (BYTES_BIG_ENDIAN)
2556 HOST_WIDE_INT bitsize, bitpos;
2557 machine_mode mode;
2558 int unsignedp, volatilep;
2559 tree offset;
2561 get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2562 &unsignedp, &volatilep, false);
2563 if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2565 load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2566 unsigned HOST_WIDE_INT l
2567 = (load_offset * BITS_PER_UNIT) & (align - 1);
2568 if (l)
2569 align = l & -l;
2573 if (bswap
2574 && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2575 && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2576 return false;
2578 /* Move cur_stmt just before one of the loads of the original
2579 to ensure it has the same VUSE. See PR61517 for what could
2580 go wrong. */
2581 gsi_move_before (&gsi, &gsi_ins);
2582 gsi = gsi_for_stmt (cur_stmt);
2584 /* Compute address to load from and cast according to the size
2585 of the load. */
2586 addr_expr = build_fold_addr_expr (unshare_expr (src));
2587 if (is_gimple_mem_ref_addr (addr_expr))
2588 addr_tmp = addr_expr;
2589 else
2591 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2592 "load_src");
2593 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2594 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2597 /* Perform the load. */
2598 aligned_load_type = load_type;
2599 if (align < TYPE_ALIGN (load_type))
2600 aligned_load_type = build_aligned_type (load_type, align);
2601 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2602 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2603 load_offset_ptr);
2605 if (!bswap)
2607 if (n->range == 16)
2608 nop_stats.found_16bit++;
2609 else if (n->range == 32)
2610 nop_stats.found_32bit++;
2611 else
2613 gcc_assert (n->range == 64);
2614 nop_stats.found_64bit++;
2617 /* Convert the result of the load if necessary. */
2618 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2620 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2621 "load_dst");
2622 load_stmt = gimple_build_assign (val_tmp, val_expr);
2623 gimple_set_vuse (load_stmt, n->vuse);
2624 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2625 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2627 else
2629 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2630 gimple_set_vuse (cur_stmt, n->vuse);
2632 update_stmt (cur_stmt);
2634 if (dump_file)
2636 fprintf (dump_file,
2637 "%d bit load in target endianness found at: ",
2638 (int) n->range);
2639 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2641 return true;
2643 else
2645 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2646 load_stmt = gimple_build_assign (val_tmp, val_expr);
2647 gimple_set_vuse (load_stmt, n->vuse);
2648 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2650 src = val_tmp;
2653 if (n->range == 16)
2654 bswap_stats.found_16bit++;
2655 else if (n->range == 32)
2656 bswap_stats.found_32bit++;
2657 else
2659 gcc_assert (n->range == 64);
2660 bswap_stats.found_64bit++;
2663 tmp = src;
2665 /* Convert the src expression if necessary. */
2666 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2668 gimple convert_stmt;
2670 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2671 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2672 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2675 /* The canonical form for a 16-bit bswap is a rotate expression. Only 16-bit
2676 values are handled this way, as a rotation of a 2N-bit value by N bits is
2677 generally not equivalent to a bswap. Consider for instance 0x01020304 r>> 16,
2678 which gives 0x03040102, while a bswap of that value is 0x04030201. */
2679 if (bswap && n->range == 16)
2681 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2682 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2683 bswap_stmt = gimple_build_assign (NULL, src);
2685 else
2686 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2688 tmp = tgt;
2690 /* Convert the result if necessary. */
2691 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2693 gimple convert_stmt;
2695 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2696 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2697 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2700 gimple_set_lhs (bswap_stmt, tmp);
2702 if (dump_file)
2704 fprintf (dump_file, "%d bit bswap implementation found at: ",
2705 (int) n->range);
2706 print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2709 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2710 gsi_remove (&gsi, true);
2711 return true;
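/* Example (editorial sketch): after the detection above, a statement
   computing the 32-bit open-coded swap of a loaded word is rewritten,
   roughly, as

     load_dst = MEM[(uint32_t *) &a];
     result = __builtin_bswap32 (load_dst);

   while a detected 16-bit swap is emitted as the canonical rotate

     result = x r<< 8;

   (exact temporaries and types depend on the input).  */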
2714 /* Find manual byte swap implementations as well as loads in a given
2715 endianness. Byte swaps are turned into a bswap builtin invocation,
2716 while endian loads are converted to a bswap builtin invocation or a
2717 simple load according to the target endianness. */
2719 unsigned int
2720 pass_optimize_bswap::execute (function *fun)
2722 basic_block bb;
2723 bool bswap32_p, bswap64_p;
2724 bool changed = false;
2725 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2727 if (BITS_PER_UNIT != 8)
2728 return 0;
2730 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2731 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2732 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2733 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2734 || (bswap32_p && word_mode == SImode)));
2736 /* Determine the argument type of the builtins. The code later on
2737 assumes that the return and argument type are the same. */
2738 if (bswap32_p)
2740 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2741 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2744 if (bswap64_p)
2746 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2747 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2750 memset (&nop_stats, 0, sizeof (nop_stats));
2751 memset (&bswap_stats, 0, sizeof (bswap_stats));
2753 FOR_EACH_BB_FN (bb, fun)
2755 gimple_stmt_iterator gsi;
2757 /* We do a reverse scan for bswap patterns to make sure we get the
2758 widest match. As bswap pattern matching doesn't handle previously
2759 inserted smaller bswap replacements as sub-patterns, the wider
2760 variant wouldn't be detected. */
2761 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2763 gimple src_stmt, cur_stmt = gsi_stmt (gsi);
2764 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2765 enum tree_code code;
2766 struct symbolic_number n;
2767 bool bswap;
2769 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2770 might be moved to a different basic block by bswap_replace and gsi
2771 must not point to it if that's the case. Doing the gsi_prev
2772 here makes sure that gsi points to the statement previous to
2773 cur_stmt while still making sure that all statements in this
2774 basic block are considered. */
2775 gsi_prev (&gsi);
2777 if (!is_gimple_assign (cur_stmt))
2778 continue;
2780 code = gimple_assign_rhs_code (cur_stmt);
2781 switch (code)
2783 case LROTATE_EXPR:
2784 case RROTATE_EXPR:
2785 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2786 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2787 % BITS_PER_UNIT)
2788 continue;
2789 /* Fall through. */
2790 case BIT_IOR_EXPR:
2791 break;
2792 default:
2793 continue;
2796 src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2798 if (!src_stmt)
2799 continue;
2801 switch (n.range)
2803 case 16:
2804 /* Already in canonical form, nothing to do. */
2805 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2806 continue;
2807 load_type = bswap_type = uint16_type_node;
2808 break;
2809 case 32:
2810 load_type = uint32_type_node;
2811 if (bswap32_p)
2813 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2814 bswap_type = bswap32_type;
2816 break;
2817 case 64:
2818 load_type = uint64_type_node;
2819 if (bswap64_p)
2821 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2822 bswap_type = bswap64_type;
2824 break;
2825 default:
2826 continue;
2829 if (bswap && !fndecl && n.range != 16)
2830 continue;
2832 if (bswap_replace (cur_stmt, src_stmt, fndecl, bswap_type, load_type,
2833 &n, bswap))
2834 changed = true;
2838 statistics_counter_event (fun, "16-bit nop implementations found",
2839 nop_stats.found_16bit);
2840 statistics_counter_event (fun, "32-bit nop implementations found",
2841 nop_stats.found_32bit);
2842 statistics_counter_event (fun, "64-bit nop implementations found",
2843 nop_stats.found_64bit);
2844 statistics_counter_event (fun, "16-bit bswap implementations found",
2845 bswap_stats.found_16bit);
2846 statistics_counter_event (fun, "32-bit bswap implementations found",
2847 bswap_stats.found_32bit);
2848 statistics_counter_event (fun, "64-bit bswap implementations found",
2849 bswap_stats.found_64bit);
2851 return (changed ? TODO_update_ssa : 0);
2854 } // anon namespace
2856 gimple_opt_pass *
2857 make_pass_optimize_bswap (gcc::context *ctxt)
2859 return new pass_optimize_bswap (ctxt);
2862 /* Return true if stmt is a type conversion operation that can be stripped
2863 when used in a widening multiply operation. */
2864 static bool
2865 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2867 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2869 if (TREE_CODE (result_type) == INTEGER_TYPE)
2871 tree op_type;
2872 tree inner_op_type;
2874 if (!CONVERT_EXPR_CODE_P (rhs_code))
2875 return false;
2877 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2879 /* If the type of OP has the same precision as the result, then
2880 we can strip this conversion. The multiply operation will be
2881 selected to create the correct extension as a by-product. */
2882 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2883 return true;
2885 /* We can also strip a conversion if it preserves the signed-ness of
2886 the operation and doesn't narrow the range. */
2887 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2889 /* If the inner-most type is unsigned, then we can strip any
2890 intermediate widening operation. If it's signed, then the
2891 intermediate widening operation must also be signed. */
2892 if ((TYPE_UNSIGNED (inner_op_type)
2893 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2894 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2895 return true;
2897 return false;
2900 return rhs_code == FIXED_CONVERT_EXPR;
2903 /* Return true if RHS is a suitable operand for a widening multiplication,
2904 assuming a target type of TYPE.
2905 There are two cases:
2907 - RHS makes some value at least twice as wide. Store that value
2908 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2910 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
2911 and set *TYPE_OUT to NULL. */
2913 static bool
2914 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2915 tree *new_rhs_out)
2917 gimple stmt;
2918 tree type1, rhs1;
2920 if (TREE_CODE (rhs) == SSA_NAME)
2922 stmt = SSA_NAME_DEF_STMT (rhs);
2923 if (is_gimple_assign (stmt))
2925 if (! widening_mult_conversion_strippable_p (type, stmt))
2926 rhs1 = rhs;
2927 else
2929 rhs1 = gimple_assign_rhs1 (stmt);
2931 if (TREE_CODE (rhs1) == INTEGER_CST)
2933 *new_rhs_out = rhs1;
2934 *type_out = NULL;
2935 return true;
2939 else
2940 rhs1 = rhs;
2942 type1 = TREE_TYPE (rhs1);
2944 if (TREE_CODE (type1) != TREE_CODE (type)
2945 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2946 return false;
2948 *new_rhs_out = rhs1;
2949 *type_out = type1;
2950 return true;
2953 if (TREE_CODE (rhs) == INTEGER_CST)
2955 *new_rhs_out = rhs;
2956 *type_out = NULL;
2957 return true;
2960 return false;
2963 /* Return true if STMT performs a widening multiplication, assuming the
2964 output type is TYPE. If so, store the unwidened types of the operands
2965 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
2966 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2967 and *TYPE2_OUT would give the operands of the multiplication. */
2969 static bool
2970 is_widening_mult_p (gimple stmt,
2971 tree *type1_out, tree *rhs1_out,
2972 tree *type2_out, tree *rhs2_out)
2974 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2976 if (TREE_CODE (type) != INTEGER_TYPE
2977 && TREE_CODE (type) != FIXED_POINT_TYPE)
2978 return false;
2980 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2981 rhs1_out))
2982 return false;
2984 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2985 rhs2_out))
2986 return false;
2988 if (*type1_out == NULL)
2990 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2991 return false;
2992 *type1_out = *type2_out;
2995 if (*type2_out == NULL)
2997 if (!int_fits_type_p (*rhs2_out, *type1_out))
2998 return false;
2999 *type2_out = *type1_out;
3002 /* Ensure that the larger of the two operands comes first. */
3003 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3005 std::swap (*type1_out, *type2_out);
3006 std::swap (*rhs1_out, *rhs2_out);
3009 return true;
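/* Example (editorial): for

     long long f (int a, int b) { return (long long) a * (long long) b; }

   both conversions are strippable, so is_widening_mult_p returns true with
   *type1_out == *type2_out == int and the unwidened operands in *rhs1_out
   and *rhs2_out. A constant operand, as in (long long) a * 1000, is also
   accepted as long as the constant fits in the other operand's type.  */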
3012 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3013 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3014 value is true iff we converted the statement. */
3016 static bool
3017 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
3019 tree lhs, rhs1, rhs2, type, type1, type2;
3020 enum insn_code handler;
3021 machine_mode to_mode, from_mode, actual_mode;
3022 optab op;
3023 int actual_precision;
3024 location_t loc = gimple_location (stmt);
3025 bool from_unsigned1, from_unsigned2;
3027 lhs = gimple_assign_lhs (stmt);
3028 type = TREE_TYPE (lhs);
3029 if (TREE_CODE (type) != INTEGER_TYPE)
3030 return false;
3032 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3033 return false;
3035 to_mode = TYPE_MODE (type);
3036 from_mode = TYPE_MODE (type1);
3037 from_unsigned1 = TYPE_UNSIGNED (type1);
3038 from_unsigned2 = TYPE_UNSIGNED (type2);
3040 if (from_unsigned1 && from_unsigned2)
3041 op = umul_widen_optab;
3042 else if (!from_unsigned1 && !from_unsigned2)
3043 op = smul_widen_optab;
3044 else
3045 op = usmul_widen_optab;
3047 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3048 0, &actual_mode);
3050 if (handler == CODE_FOR_nothing)
3052 if (op != smul_widen_optab)
3054 /* We can use a signed multiply with unsigned types as long as
3055 there is a wider mode to use, or it is the smaller of the two
3056 types that is unsigned. Note that type1 >= type2, always. */
3057 if ((TYPE_UNSIGNED (type1)
3058 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3059 || (TYPE_UNSIGNED (type2)
3060 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3062 from_mode = GET_MODE_WIDER_MODE (from_mode);
3063 if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3064 return false;
3067 op = smul_widen_optab;
3068 handler = find_widening_optab_handler_and_mode (op, to_mode,
3069 from_mode, 0,
3070 &actual_mode);
3072 if (handler == CODE_FOR_nothing)
3073 return false;
3075 from_unsigned1 = from_unsigned2 = false;
3077 else
3078 return false;
3081 /* Ensure that the inputs to the handler are in the correct precision
3082 for the opcode. This will be the full mode size. */
3083 actual_precision = GET_MODE_PRECISION (actual_mode);
3084 if (2 * actual_precision > TYPE_PRECISION (type))
3085 return false;
3086 if (actual_precision != TYPE_PRECISION (type1)
3087 || from_unsigned1 != TYPE_UNSIGNED (type1))
3088 rhs1 = build_and_insert_cast (gsi, loc,
3089 build_nonstandard_integer_type
3090 (actual_precision, from_unsigned1), rhs1);
3091 if (actual_precision != TYPE_PRECISION (type2)
3092 || from_unsigned2 != TYPE_UNSIGNED (type2))
3093 rhs2 = build_and_insert_cast (gsi, loc,
3094 build_nonstandard_integer_type
3095 (actual_precision, from_unsigned2), rhs2);
3097 /* Handle constants. */
3098 if (TREE_CODE (rhs1) == INTEGER_CST)
3099 rhs1 = fold_convert (type1, rhs1);
3100 if (TREE_CODE (rhs2) == INTEGER_CST)
3101 rhs2 = fold_convert (type2, rhs2);
3103 gimple_assign_set_rhs1 (stmt, rhs1);
3104 gimple_assign_set_rhs2 (stmt, rhs2);
3105 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3106 update_stmt (stmt);
3107 widen_mul_stats.widen_mults_inserted++;
3108 return true;
3111 /* Process a single gimple statement STMT, which is found at the
3112 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
3113 rhs (given by CODE), and try to convert it into a
3114 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3115 is true iff we converted the statement. */
3117 static bool
3118 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
3119 enum tree_code code)
3121 gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
3122 gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
3123 tree type, type1, type2, optype;
3124 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3125 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3126 optab this_optab;
3127 enum tree_code wmult_code;
3128 enum insn_code handler;
3129 machine_mode to_mode, from_mode, actual_mode;
3130 location_t loc = gimple_location (stmt);
3131 int actual_precision;
3132 bool from_unsigned1, from_unsigned2;
3134 lhs = gimple_assign_lhs (stmt);
3135 type = TREE_TYPE (lhs);
3136 if (TREE_CODE (type) != INTEGER_TYPE
3137 && TREE_CODE (type) != FIXED_POINT_TYPE)
3138 return false;
3140 if (code == MINUS_EXPR)
3141 wmult_code = WIDEN_MULT_MINUS_EXPR;
3142 else
3143 wmult_code = WIDEN_MULT_PLUS_EXPR;
3145 rhs1 = gimple_assign_rhs1 (stmt);
3146 rhs2 = gimple_assign_rhs2 (stmt);
3148 if (TREE_CODE (rhs1) == SSA_NAME)
3150 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3151 if (is_gimple_assign (rhs1_stmt))
3152 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3155 if (TREE_CODE (rhs2) == SSA_NAME)
3157 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3158 if (is_gimple_assign (rhs2_stmt))
3159 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3162 /* Allow for one conversion statement between the multiply
3163 and addition/subtraction statement. If there is more than
3164 one conversion then we assume they would invalidate this
3165 transformation. If that's not the case then they should have
3166 been folded before now. */
3167 if (CONVERT_EXPR_CODE_P (rhs1_code))
3169 conv1_stmt = rhs1_stmt;
3170 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3171 if (TREE_CODE (rhs1) == SSA_NAME)
3173 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3174 if (is_gimple_assign (rhs1_stmt))
3175 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3177 else
3178 return false;
3180 if (CONVERT_EXPR_CODE_P (rhs2_code))
3182 conv2_stmt = rhs2_stmt;
3183 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3184 if (TREE_CODE (rhs2) == SSA_NAME)
3186 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3187 if (is_gimple_assign (rhs2_stmt))
3188 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3190 else
3191 return false;
3194 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3195 is_widening_mult_p, but we still need the rhs values it returns.
3197 It might also appear that it would be sufficient to use the existing
3198 operands of the widening multiply, but that would limit the choice of
3199 multiply-and-accumulate instructions.
3201 If the widened-multiplication result has more than one use, it is
3202 probably wiser not to do the conversion. */
3203 if (code == PLUS_EXPR
3204 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3206 if (!has_single_use (rhs1)
3207 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3208 &type2, &mult_rhs2))
3209 return false;
3210 add_rhs = rhs2;
3211 conv_stmt = conv1_stmt;
3213 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3215 if (!has_single_use (rhs2)
3216 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3217 &type2, &mult_rhs2))
3218 return false;
3219 add_rhs = rhs1;
3220 conv_stmt = conv2_stmt;
3222 else
3223 return false;
3225 to_mode = TYPE_MODE (type);
3226 from_mode = TYPE_MODE (type1);
3227 from_unsigned1 = TYPE_UNSIGNED (type1);
3228 from_unsigned2 = TYPE_UNSIGNED (type2);
3229 optype = type1;
3231 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3232 if (from_unsigned1 != from_unsigned2)
3234 if (!INTEGRAL_TYPE_P (type))
3235 return false;
3236 /* We can use a signed multiply with unsigned types as long as
3237 there is a wider mode to use, or it is the smaller of the two
3238 types that is unsigned. Note that type1 >= type2, always. */
3239 if ((from_unsigned1
3240 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3241 || (from_unsigned2
3242 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3244 from_mode = GET_MODE_WIDER_MODE (from_mode);
3245 if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3246 return false;
3249 from_unsigned1 = from_unsigned2 = false;
3250 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3251 false);
3254 /* If there was a conversion between the multiply and addition
3255 then we need to make sure it fits a multiply-and-accumulate.
3256 There should be a single mode change which does not change the
3257 value. */
3258 if (conv_stmt)
3260 /* We use the original, unmodified data types for this. */
3261 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3262 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3263 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3264 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3266 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3268 /* Conversion is a truncate. */
3269 if (TYPE_PRECISION (to_type) < data_size)
3270 return false;
3272 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3274 /* Conversion is an extend. Check it's the right sort. */
3275 if (TYPE_UNSIGNED (from_type) != is_unsigned
3276 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3277 return false;
3279 /* else convert is a no-op for our purposes. */
3282 /* Verify that the machine can perform a widening multiply
3283 accumulate in this mode/signedness combination, otherwise
3284 this transformation is likely to pessimize code. */
3285 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3286 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3287 from_mode, 0, &actual_mode);
3289 if (handler == CODE_FOR_nothing)
3290 return false;
3292 /* Ensure that the inputs to the handler are in the correct precision
3293 for the opcode. This will be the full mode size. */
3294 actual_precision = GET_MODE_PRECISION (actual_mode);
3295 if (actual_precision != TYPE_PRECISION (type1)
3296 || from_unsigned1 != TYPE_UNSIGNED (type1))
3297 mult_rhs1 = build_and_insert_cast (gsi, loc,
3298 build_nonstandard_integer_type
3299 (actual_precision, from_unsigned1),
3300 mult_rhs1);
3301 if (actual_precision != TYPE_PRECISION (type2)
3302 || from_unsigned2 != TYPE_UNSIGNED (type2))
3303 mult_rhs2 = build_and_insert_cast (gsi, loc,
3304 build_nonstandard_integer_type
3305 (actual_precision, from_unsigned2),
3306 mult_rhs2);
3308 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3309 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3311 /* Handle constants. */
3312 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3313 mult_rhs1 = fold_convert (type1, mult_rhs1);
3314 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3315 mult_rhs2 = fold_convert (type2, mult_rhs2);
3317 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3318 add_rhs);
3319 update_stmt (gsi_stmt (*gsi));
3320 widen_mul_stats.maccs_inserted++;
3321 return true;
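/* Example (editorial): on a target providing a widening multiply-accumulate
   pattern, the sequence

     long long t = (long long) a * (long long) b;   (a and b are int)
     acc = acc + t;

   is rewritten so that the addition becomes a WIDEN_MULT_PLUS_EXPR taking
   the unwidened int operands directly; a MINUS_EXPR similarly becomes a
   WIDEN_MULT_MINUS_EXPR.  */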
3324 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3325 with uses in additions and subtractions to form fused multiply-add
3326 operations. Returns true if successful and MUL_STMT should be removed. */
3328 static bool
3329 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3331 tree mul_result = gimple_get_lhs (mul_stmt);
3332 tree type = TREE_TYPE (mul_result);
3333 gimple use_stmt, neguse_stmt;
3334 gassign *fma_stmt;
3335 use_operand_p use_p;
3336 imm_use_iterator imm_iter;
3338 if (FLOAT_TYPE_P (type)
3339 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3340 return false;
3342 /* We don't want to do bitfield reduction ops. */
3343 if (INTEGRAL_TYPE_P (type)
3344 && (TYPE_PRECISION (type)
3345 != GET_MODE_PRECISION (TYPE_MODE (type))))
3346 return false;
3348 /* If the target doesn't support it, don't generate it. We assume that
3349 if fma isn't available then fms, fnma or fnms are not either. */
3350 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3351 return false;
3353 /* If the multiplication has zero uses, it is kept around probably because
3354 of -fnon-call-exceptions. Don't optimize it away in that case,
3355 that is DCE's job. */
3356 if (has_zero_uses (mul_result))
3357 return false;
3359 /* Make sure that the multiplication statement becomes dead after
3360 the transformation, i.e. that all uses are transformed to FMAs.
3361 This means we assume that an FMA operation has the same cost
3362 as an addition. */
3363 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3365 enum tree_code use_code;
3366 tree result = mul_result;
3367 bool negate_p = false;
3369 use_stmt = USE_STMT (use_p);
3371 if (is_gimple_debug (use_stmt))
3372 continue;
3374 /* For now restrict this operation to single basic blocks. In theory
3375 we would want to support sinking the multiplication in
3376 m = a*b;
3377 if ()
3378 ma = m + c;
3379 else
3380 d = m;
3381 to form a fma in the then block and sink the multiplication to the
3382 else block. */
3383 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3384 return false;
3386 if (!is_gimple_assign (use_stmt))
3387 return false;
3389 use_code = gimple_assign_rhs_code (use_stmt);
3391 /* A negate on the multiplication leads to FNMA. */
3392 if (use_code == NEGATE_EXPR)
3394 ssa_op_iter iter;
3395 use_operand_p usep;
3397 result = gimple_assign_lhs (use_stmt);
3399 /* Make sure the negate statement becomes dead with this
3400 single transformation. */
3401 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3402 &use_p, &neguse_stmt))
3403 return false;
3405 /* Make sure the multiplication isn't also used on that stmt. */
3406 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3407 if (USE_FROM_PTR (usep) == mul_result)
3408 return false;
3410 /* Re-validate. */
3411 use_stmt = neguse_stmt;
3412 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3413 return false;
3414 if (!is_gimple_assign (use_stmt))
3415 return false;
3417 use_code = gimple_assign_rhs_code (use_stmt);
3418 negate_p = true;
3421 switch (use_code)
3423 case MINUS_EXPR:
3424 if (gimple_assign_rhs2 (use_stmt) == result)
3425 negate_p = !negate_p;
3426 break;
3427 case PLUS_EXPR:
3428 break;
3429 default:
3430 /* FMA can only be formed from PLUS and MINUS. */
3431 return false;
3434 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3435 by a MULT_EXPR that we'll visit later, we might be able to
3436 get a more profitable match with fnma.
3437 OTOH, if we don't, a negate / fma pair has likely lower latency
3438 than a mult / subtract pair. */
3439 if (use_code == MINUS_EXPR && !negate_p
3440 && gimple_assign_rhs1 (use_stmt) == result
3441 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3442 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3444 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3446 if (TREE_CODE (rhs2) == SSA_NAME)
3448 gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3449 if (has_single_use (rhs2)
3450 && is_gimple_assign (stmt2)
3451 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3452 return false;
3456 /* We can't handle a * b + a * b. */
3457 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3458 return false;
3460 /* While it is possible to validate whether or not the exact form
3461 that we've recognized is available in the backend, the assumption
3462 is that the transformation is never a loss. For instance, suppose
3463 the target only has the plain FMA pattern available. Consider
3464 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3465 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3466 still have 3 operations, but in the FMA form the two NEGs are
3467 independent and could be run in parallel. */
3470 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3472 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3473 enum tree_code use_code;
3474 tree addop, mulop1 = op1, result = mul_result;
3475 bool negate_p = false;
3477 if (is_gimple_debug (use_stmt))
3478 continue;
3480 use_code = gimple_assign_rhs_code (use_stmt);
3481 if (use_code == NEGATE_EXPR)
3483 result = gimple_assign_lhs (use_stmt);
3484 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3485 gsi_remove (&gsi, true);
3486 release_defs (use_stmt);
3488 use_stmt = neguse_stmt;
3489 gsi = gsi_for_stmt (use_stmt);
3490 use_code = gimple_assign_rhs_code (use_stmt);
3491 negate_p = true;
3494 if (gimple_assign_rhs1 (use_stmt) == result)
3496 addop = gimple_assign_rhs2 (use_stmt);
3497 /* a * b - c -> a * b + (-c) */
3498 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3499 addop = force_gimple_operand_gsi (&gsi,
3500 build1 (NEGATE_EXPR,
3501 type, addop),
3502 true, NULL_TREE, true,
3503 GSI_SAME_STMT);
3505 else
3507 addop = gimple_assign_rhs1 (use_stmt);
3508 /* a - b * c -> (-b) * c + a */
3509 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3510 negate_p = !negate_p;
3513 if (negate_p)
3514 mulop1 = force_gimple_operand_gsi (&gsi,
3515 build1 (NEGATE_EXPR,
3516 type, mulop1),
3517 true, NULL_TREE, true,
3518 GSI_SAME_STMT);
3520 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3521 FMA_EXPR, mulop1, op2, addop);
3522 gsi_replace (&gsi, fma_stmt, true);
3523 widen_mul_stats.fmas_inserted++;
3526 return true;
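/* Example (editorial): when FP contraction is enabled and the target has an
   fma pattern, the pair

     double t = a * b;
     double r = t + c;

   is fused into a single FMA_EXPR, i.e. fma (a, b, c); subtractions and
   negated uses lead to the variants built above, which insert a NEGATE_EXPR
   on either the addend or the first multiplicand.  */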
3529 /* Find integer multiplications where the operands are extended from
3530 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3531 where appropriate. */
3533 namespace {
3535 const pass_data pass_data_optimize_widening_mul =
3537 GIMPLE_PASS, /* type */
3538 "widening_mul", /* name */
3539 OPTGROUP_NONE, /* optinfo_flags */
3540 TV_NONE, /* tv_id */
3541 PROP_ssa, /* properties_required */
3542 0, /* properties_provided */
3543 0, /* properties_destroyed */
3544 0, /* todo_flags_start */
3545 TODO_update_ssa, /* todo_flags_finish */
3548 class pass_optimize_widening_mul : public gimple_opt_pass
3550 public:
3551 pass_optimize_widening_mul (gcc::context *ctxt)
3552 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3555 /* opt_pass methods: */
3556 virtual bool gate (function *)
3558 return flag_expensive_optimizations && optimize;
3561 virtual unsigned int execute (function *);
3563 }; // class pass_optimize_widening_mul
3565 unsigned int
3566 pass_optimize_widening_mul::execute (function *fun)
3568 basic_block bb;
3569 bool cfg_changed = false;
3571 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3573 FOR_EACH_BB_FN (bb, fun)
3575 gimple_stmt_iterator gsi;
3577 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3579 gimple stmt = gsi_stmt (gsi);
3580 enum tree_code code;
3582 if (is_gimple_assign (stmt))
3584 code = gimple_assign_rhs_code (stmt);
3585 switch (code)
3587 case MULT_EXPR:
3588 if (!convert_mult_to_widen (stmt, &gsi)
3589 && convert_mult_to_fma (stmt,
3590 gimple_assign_rhs1 (stmt),
3591 gimple_assign_rhs2 (stmt)))
3593 gsi_remove (&gsi, true);
3594 release_defs (stmt);
3595 continue;
3597 break;
3599 case PLUS_EXPR:
3600 case MINUS_EXPR:
3601 convert_plusminus_to_widen (&gsi, stmt, code);
3602 break;
3604 default:;
3607 else if (is_gimple_call (stmt)
3608 && gimple_call_lhs (stmt))
3610 tree fndecl = gimple_call_fndecl (stmt);
3611 if (fndecl
3612 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
3614 switch (DECL_FUNCTION_CODE (fndecl))
3616 case BUILT_IN_POWF:
3617 case BUILT_IN_POW:
3618 case BUILT_IN_POWL:
3619 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3620 && REAL_VALUES_EQUAL
3621 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3622 dconst2)
3623 && convert_mult_to_fma (stmt,
3624 gimple_call_arg (stmt, 0),
3625 gimple_call_arg (stmt, 0)))
3627 unlink_stmt_vdef (stmt);
3628 if (gsi_remove (&gsi, true)
3629 && gimple_purge_dead_eh_edges (bb))
3630 cfg_changed = true;
3631 release_defs (stmt);
3632 continue;
3634 break;
3636 default:;
3640 gsi_next (&gsi);
3644 statistics_counter_event (fun, "widening multiplications inserted",
3645 widen_mul_stats.widen_mults_inserted);
3646 statistics_counter_event (fun, "widening maccs inserted",
3647 widen_mul_stats.maccs_inserted);
3648 statistics_counter_event (fun, "fused multiply-adds inserted",
3649 widen_mul_stats.fmas_inserted);
3651 return cfg_changed ? TODO_cleanup_cfg : 0;
3654 } // anon namespace
3656 gimple_opt_pass *
3657 make_pass_optimize_widening_mul (gcc::context *ctxt)
3659 return new pass_optimize_widening_mul (ctxt);