gcc/tree-ssa-math-opts.c
1 /* Global, SSA-based optimizations using mathematical identities.
2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 /* The first mini-pass in this file tries to CSE reciprocal
21 operations. These are common in sequences such as this one:
23 modulus = sqrt(x*x + y*y + z*z);
24 x = x / modulus;
25 y = y / modulus;
26 z = z / modulus;
28 that can be optimized to
30 modulus = sqrt(x*x + y*y + z*z);
31 rmodulus = 1.0 / modulus;
32 x = x * rmodulus;
33 y = y * rmodulus;
34 z = z * rmodulus;
36 We do this for loop invariant divisors, and with this pass whenever
37 we notice that a division has the same divisor multiple times.
39 Of course, like in PRE, we don't insert a division if a dominator
40 already has one. However, this cannot be done as an extension of
41 PRE for several reasons.
43 First of all, experiments found that the
44 transformation is not always useful if there are only two divisions
45 by the same divisor. This is probably because modern processors
46 can pipeline the divisions; on older, in-order processors it should
47 still be effective to optimize two divisions by the same number.
48 We make this a param, and it shall be called N in the remainder of
49 this comment.
51 Second, if trapping math is active, we have less freedom on where
52 to insert divisions: we can only do so in basic blocks that already
53 contain one. (If divisions don't trap, we can instead insert
54 divisions elsewhere, in blocks that are common dominators
55 of those that have the division.)
57 We really don't want to compute the reciprocal unless a division will
58 be found. To do this, we won't insert the division in a basic block
59 that has fewer than N divisions *post-dominating* it.
61 The algorithm constructs a subset of the dominator tree, holding the
62 blocks containing the divisions and the common dominators to them,
63 and walks it twice. The first walk is in post-order, and it annotates
64 each block with the number of divisions that post-dominate it: this
65 gives information on where divisions can be inserted profitably.
66 The second walk is in pre-order, and it inserts divisions as explained
67 above, and replaces divisions by multiplications.
69 In the best case, the cost of the pass is O(n_statements). In the
70 worst-case, the cost is due to creating the dominator tree subset,
71 with a cost of O(n_basic_blocks ^ 2); however this can only happen
72 for n_statements / n_basic_blocks statements. So, the amortized cost
73 of creating the dominator tree subset is O(n_basic_blocks) and the
74 worst-case cost of the pass is O(n_statements * n_basic_blocks).
76 More practically, the cost will be small because there are few
77 divisions, and they tend to be in the same basic block, so insert_bb
78 is called very few times.
80 If we did this using domwalk.c, an efficient implementation would have
81 to work on all the variables in a single pass, because we could not
82 work on just a subset of the dominator tree, as we do now, and the
83 cost would also be something like O(n_statements * n_basic_blocks).
84 The data structures would be more complex in order to work on all the
85 variables in a single pass. */
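/* Illustrative sketch (not part of the original sources): with
   -freciprocal-math and a target threshold of 2 from
   targetm.min_divisions_for_recip_mul, a straight-line sequence such as

     t1 = a / d;
     ...
     t2 = b / d;

   (the block of the second division post-dominating that of the first)
   is rewritten as

     reciptmp = 1.0 / d;
     t1 = a * reciptmp;
     ...
     t2 = b * reciptmp;

   with the reciprocal inserted immediately before the first division
   (see "Case 1" in insert_reciprocals below).  */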
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "backend.h"
91 #include "target.h"
92 #include "rtl.h"
93 #include "tree.h"
94 #include "gimple.h"
95 #include "predict.h"
96 #include "alloc-pool.h"
97 #include "tree-pass.h"
98 #include "ssa.h"
99 #include "optabs-tree.h"
100 #include "gimple-pretty-print.h"
101 #include "alias.h"
102 #include "fold-const.h"
103 #include "gimple-fold.h"
104 #include "gimple-iterator.h"
105 #include "gimplify.h"
106 #include "gimplify-me.h"
107 #include "stor-layout.h"
108 #include "tree-cfg.h"
109 #include "tree-dfa.h"
110 #include "tree-ssa.h"
111 #include "builtins.h"
112 #include "params.h"
113 #include "internal-fn.h"
114 #include "case-cfn-macros.h"
115 #include "optabs-libfuncs.h"
116 #include "tree-eh.h"
117 #include "targhooks.h"
119 /* This structure represents one basic block that either computes a
120 division, or is a common dominator for basic blocks that compute a
121 division. */
122 struct occurrence {
123 /* The basic block represented by this structure. */
124 basic_block bb;
126 /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127 inserted in BB. */
128 tree recip_def;
130 /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131 was inserted in BB. */
132 gimple *recip_def_stmt;
134 /* Pointer to a list of "struct occurrence"s for blocks dominated
135 by BB. */
136 struct occurrence *children;
138 /* Pointer to the next "struct occurrence" in the list of blocks
139 sharing a common dominator. */
140 struct occurrence *next;
142 /* The number of divisions that are in BB before compute_merit. The
143 number of divisions that are in BB or post-dominate it after
144 compute_merit. */
145 int num_divisions;
147 /* True if the basic block has a division, false if it is a common
148 dominator for basic blocks that do. If it is false and trapping
149 math is active, BB is not a candidate for inserting a reciprocal. */
150 bool bb_has_division;
153 static struct
155 /* Number of 1.0/X ops inserted. */
156 int rdivs_inserted;
158 /* Number of 1.0/FUNC ops inserted. */
159 int rfuncs_inserted;
160 } reciprocal_stats;
162 static struct
164 /* Number of cexpi calls inserted. */
165 int inserted;
166 } sincos_stats;
168 static struct
170 /* Number of hand-written 16-bit nop / bswaps found. */
171 int found_16bit;
173 /* Number of hand-written 32-bit nop / bswaps found. */
174 int found_32bit;
176 /* Number of hand-written 64-bit nop / bswaps found. */
177 int found_64bit;
178 } nop_stats, bswap_stats;
180 static struct
182 /* Number of widening multiplication ops inserted. */
183 int widen_mults_inserted;
185 /* Number of integer multiply-and-accumulate ops inserted. */
186 int maccs_inserted;
188 /* Number of fp fused multiply-add ops inserted. */
189 int fmas_inserted;
191 /* Number of divmod calls inserted. */
192 int divmod_calls_inserted;
193 } widen_mul_stats;
195 /* The instance of "struct occurrence" representing the highest
196 interesting block in the dominator tree. */
197 static struct occurrence *occ_head;
199 /* Allocation pool for getting instances of "struct occurrence". */
200 static object_allocator<occurrence> *occ_pool;
204 /* Allocate and return a new struct occurrence for basic block BB,
205 whose children list is headed by CHILDREN. */
206 static struct occurrence *
207 occ_new (basic_block bb, struct occurrence *children)
209 struct occurrence *occ;
211 bb->aux = occ = occ_pool->allocate ();
212 memset (occ, 0, sizeof (struct occurrence));
214 occ->bb = bb;
215 occ->children = children;
216 return occ;
220 /* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
221 list of "struct occurrence"s, one per basic block, having IDOM as
222 their common dominator.
224 We try to insert NEW_OCC as deep as possible in the tree, and we also
225 insert any other block that is a common dominator for BB and one
226 block already in the tree. */
228 static void
229 insert_bb (struct occurrence *new_occ, basic_block idom,
230 struct occurrence **p_head)
232 struct occurrence *occ, **p_occ;
234 for (p_occ = p_head; (occ = *p_occ) != NULL; )
236 basic_block bb = new_occ->bb, occ_bb = occ->bb;
237 basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238 if (dom == bb)
240 /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
241 from its list. */
242 *p_occ = occ->next;
243 occ->next = new_occ->children;
244 new_occ->children = occ;
246 /* Try the next block (it may also be dominated by BB). */
249 else if (dom == occ_bb)
251 /* OCC_BB dominates BB. Tail recurse to look deeper. */
252 insert_bb (new_occ, dom, &occ->children);
253 return;
256 else if (dom != idom)
258 gcc_assert (!dom->aux);
260 /* There is a dominator between IDOM and BB, add it and make
261 two children out of NEW_OCC and OCC. First, remove OCC from
262 its list. */
263 *p_occ = occ->next;
264 new_occ->next = occ;
265 occ->next = NULL;
267 /* None of the previous blocks has DOM as a dominator: if we tail
268 recursed, we would reexamine them uselessly. Just switch BB with
269 DOM, and go on looking for blocks dominated by DOM. */
270 new_occ = occ_new (dom, new_occ);
273 else
275 /* Nothing special, go on with the next element. */
276 p_occ = &occ->next;
280 /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */
281 new_occ->next = *p_head;
282 *p_head = new_occ;
285 /* Register that we found a division in BB. */
287 static inline void
288 register_division_in (basic_block bb)
290 struct occurrence *occ;
292 occ = (struct occurrence *) bb->aux;
293 if (!occ)
295 occ = occ_new (bb, NULL);
296 insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
299 occ->bb_has_division = true;
300 occ->num_divisions++;
304 /* Compute the number of divisions that postdominate each block in OCC and
305 its children. */
307 static void
308 compute_merit (struct occurrence *occ)
310 struct occurrence *occ_child;
311 basic_block dom = occ->bb;
313 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
315 basic_block bb;
316 if (occ_child->children)
317 compute_merit (occ_child);
319 if (flag_exceptions)
320 bb = single_noncomplex_succ (dom);
321 else
322 bb = dom;
324 if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325 occ->num_divisions += occ_child->num_divisions;
330 /* Return whether USE_STMT is a floating-point division by DEF. */
331 static inline bool
332 is_division_by (gimple *use_stmt, tree def)
334 return is_gimple_assign (use_stmt)
335 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336 && gimple_assign_rhs2 (use_stmt) == def
337 /* Do not recognize x / x as valid division, as we are getting
338 confused later by replacing all immediate uses of x in such
339 a stmt. */
340 && gimple_assign_rhs1 (use_stmt) != def;
343 /* Walk the subset of the dominator tree rooted at OCC, setting the
344 RECIP_DEF field to a definition of 1.0 / DEF that can be used in
345 the given basic block. The field may be left NULL, of course,
346 if it is not possible or profitable to do the optimization.
348 DEF_GSI is an iterator pointing at the statement defining DEF.
349 If RECIP_DEF is set, a dominator already has a computation that can
350 be used. */
352 static void
353 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
354 tree def, tree recip_def, int threshold)
356 tree type;
357 gassign *new_stmt;
358 gimple_stmt_iterator gsi;
359 struct occurrence *occ_child;
361 if (!recip_def
362 && (occ->bb_has_division || !flag_trapping_math)
363 && occ->num_divisions >= threshold)
365 /* Make a variable with the replacement and substitute it. */
366 type = TREE_TYPE (def);
367 recip_def = create_tmp_reg (type, "reciptmp");
368 new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
369 build_one_cst (type), def);
371 if (occ->bb_has_division)
373 /* Case 1: insert before an existing division. */
374 gsi = gsi_after_labels (occ->bb);
375 while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
376 gsi_next (&gsi);
378 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
380 else if (def_gsi && occ->bb == def_gsi->bb)
382 /* Case 2: insert right after the definition. Note that this will
383 never happen if the definition statement can throw, because in
384 that case the sole successor of the statement's basic block will
385 dominate all the uses as well. */
386 gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
388 else
390 /* Case 3: insert in a basic block not containing defs/uses. */
391 gsi = gsi_after_labels (occ->bb);
392 gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
395 reciprocal_stats.rdivs_inserted++;
397 occ->recip_def_stmt = new_stmt;
400 occ->recip_def = recip_def;
401 for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
402 insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
406 /* Replace the division at USE_P with a multiplication by the reciprocal, if
407 possible. */
409 static inline void
410 replace_reciprocal (use_operand_p use_p)
412 gimple *use_stmt = USE_STMT (use_p);
413 basic_block bb = gimple_bb (use_stmt);
414 struct occurrence *occ = (struct occurrence *) bb->aux;
416 if (optimize_bb_for_speed_p (bb)
417 && occ->recip_def && use_stmt != occ->recip_def_stmt)
419 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
420 gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
421 SET_USE (use_p, occ->recip_def);
422 fold_stmt_inplace (&gsi);
423 update_stmt (use_stmt);
428 /* Free OCC and return one more "struct occurrence" to be freed. */
430 static struct occurrence *
431 free_bb (struct occurrence *occ)
433 struct occurrence *child, *next;
435 /* First get the two pointers hanging off OCC. */
436 next = occ->next;
437 child = occ->children;
438 occ->bb->aux = NULL;
439 occ_pool->remove (occ);
441 /* Now ensure that we don't recurse unless it is necessary. */
442 if (!child)
443 return next;
444 else
446 while (next)
447 next = free_bb (next);
449 return child;
454 /* Look for floating-point divisions among DEF's uses, and try to
455 replace them by multiplications with the reciprocal. Add
456 as many statements computing the reciprocal as needed.
458 DEF must be a GIMPLE register of a floating-point type. */
460 static void
461 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
463 use_operand_p use_p;
464 imm_use_iterator use_iter;
465 struct occurrence *occ;
466 int count = 0, threshold;
468 gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
470 FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
472 gimple *use_stmt = USE_STMT (use_p);
473 if (is_division_by (use_stmt, def))
475 register_division_in (gimple_bb (use_stmt));
476 count++;
480 /* Do the expensive part only if we can hope to optimize something. */
481 threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
482 if (count >= threshold)
484 gimple *use_stmt;
485 for (occ = occ_head; occ; occ = occ->next)
487 compute_merit (occ);
488 insert_reciprocals (def_gsi, occ, def, NULL, threshold);
491 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
493 if (is_division_by (use_stmt, def))
495 FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
496 replace_reciprocal (use_p);
501 for (occ = occ_head; occ; )
502 occ = free_bb (occ);
504 occ_head = NULL;
507 /* Return an internal function that implements the reciprocal of CALL,
508 or IFN_LAST if there is no such function that the target supports. */
510 internal_fn
511 internal_fn_reciprocal (gcall *call)
513 internal_fn ifn;
515 switch (gimple_call_combined_fn (call))
517 CASE_CFN_SQRT:
518 CASE_CFN_SQRT_FN:
519 ifn = IFN_RSQRT;
520 break;
522 default:
523 return IFN_LAST;
526 tree_pair types = direct_internal_fn_types (ifn, call);
527 if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
528 return IFN_LAST;
530 return ifn;
533 /* Go through all the floating-point SSA_NAMEs, and call
534 execute_cse_reciprocals_1 on each of them. */
535 namespace {
537 const pass_data pass_data_cse_reciprocals =
539 GIMPLE_PASS, /* type */
540 "recip", /* name */
541 OPTGROUP_NONE, /* optinfo_flags */
542 TV_NONE, /* tv_id */
543 PROP_ssa, /* properties_required */
544 0, /* properties_provided */
545 0, /* properties_destroyed */
546 0, /* todo_flags_start */
547 TODO_update_ssa, /* todo_flags_finish */
550 class pass_cse_reciprocals : public gimple_opt_pass
552 public:
553 pass_cse_reciprocals (gcc::context *ctxt)
554 : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
557 /* opt_pass methods: */
558 virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
559 virtual unsigned int execute (function *);
561 }; // class pass_cse_reciprocals
563 unsigned int
564 pass_cse_reciprocals::execute (function *fun)
566 basic_block bb;
567 tree arg;
569 occ_pool = new object_allocator<occurrence> ("dominators for recip");
571 memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
572 calculate_dominance_info (CDI_DOMINATORS);
573 calculate_dominance_info (CDI_POST_DOMINATORS);
575 if (flag_checking)
576 FOR_EACH_BB_FN (bb, fun)
577 gcc_assert (!bb->aux);
579 for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
580 if (FLOAT_TYPE_P (TREE_TYPE (arg))
581 && is_gimple_reg (arg))
583 tree name = ssa_default_def (fun, arg);
584 if (name)
585 execute_cse_reciprocals_1 (NULL, name);
588 FOR_EACH_BB_FN (bb, fun)
590 tree def;
592 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
593 gsi_next (&gsi))
595 gphi *phi = gsi.phi ();
596 def = PHI_RESULT (phi);
597 if (! virtual_operand_p (def)
598 && FLOAT_TYPE_P (TREE_TYPE (def)))
599 execute_cse_reciprocals_1 (NULL, def);
602 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
603 gsi_next (&gsi))
605 gimple *stmt = gsi_stmt (gsi);
607 if (gimple_has_lhs (stmt)
608 && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
609 && FLOAT_TYPE_P (TREE_TYPE (def))
610 && TREE_CODE (def) == SSA_NAME)
611 execute_cse_reciprocals_1 (&gsi, def);
614 if (optimize_bb_for_size_p (bb))
615 continue;
617 /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
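/* For illustration (an assumed example, not from the original sources):
   for y = a / sqrt (x), when the target supports IFN_RSQRT or provides a
   reciprocal builtin via targetm.builtin_reciprocal, the call is replaced
   so that effectively t = rsqrt (x); y = a * t; -- provided every use of
   the original sqrt result is such a division.  */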
618 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
619 gsi_next (&gsi))
621 gimple *stmt = gsi_stmt (gsi);
623 if (is_gimple_assign (stmt)
624 && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
626 tree arg1 = gimple_assign_rhs2 (stmt);
627 gimple *stmt1;
629 if (TREE_CODE (arg1) != SSA_NAME)
630 continue;
632 stmt1 = SSA_NAME_DEF_STMT (arg1);
634 if (is_gimple_call (stmt1)
635 && gimple_call_lhs (stmt1))
637 bool fail;
638 imm_use_iterator ui;
639 use_operand_p use_p;
640 tree fndecl = NULL_TREE;
642 gcall *call = as_a <gcall *> (stmt1);
643 internal_fn ifn = internal_fn_reciprocal (call);
644 if (ifn == IFN_LAST)
646 fndecl = gimple_call_fndecl (call);
647 if (!fndecl
648 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
649 continue;
650 fndecl = targetm.builtin_reciprocal (fndecl);
651 if (!fndecl)
652 continue;
655 /* Check that all uses of the SSA name are divisions,
656 otherwise replacing the defining statement will do
657 the wrong thing. */
658 fail = false;
659 FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
661 gimple *stmt2 = USE_STMT (use_p);
662 if (is_gimple_debug (stmt2))
663 continue;
664 if (!is_gimple_assign (stmt2)
665 || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
666 || gimple_assign_rhs1 (stmt2) == arg1
667 || gimple_assign_rhs2 (stmt2) != arg1)
669 fail = true;
670 break;
673 if (fail)
674 continue;
676 gimple_replace_ssa_lhs (call, arg1);
677 if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
679 auto_vec<tree, 4> args;
680 for (unsigned int i = 0;
681 i < gimple_call_num_args (call); i++)
682 args.safe_push (gimple_call_arg (call, i));
683 gcall *stmt2;
684 if (ifn == IFN_LAST)
685 stmt2 = gimple_build_call_vec (fndecl, args);
686 else
687 stmt2 = gimple_build_call_internal_vec (ifn, args);
688 gimple_call_set_lhs (stmt2, arg1);
689 if (gimple_vdef (call))
691 gimple_set_vdef (stmt2, gimple_vdef (call));
692 SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
694 gimple_call_set_nothrow (stmt2,
695 gimple_call_nothrow_p (call));
696 gimple_set_vuse (stmt2, gimple_vuse (call));
697 gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
698 gsi_replace (&gsi2, stmt2, true);
700 else
702 if (ifn == IFN_LAST)
703 gimple_call_set_fndecl (call, fndecl);
704 else
705 gimple_call_set_internal_fn (call, ifn);
706 update_stmt (call);
708 reciprocal_stats.rfuncs_inserted++;
710 FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
712 gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
713 gimple_assign_set_rhs_code (stmt, MULT_EXPR);
714 fold_stmt_inplace (&gsi);
715 update_stmt (stmt);
722 statistics_counter_event (fun, "reciprocal divs inserted",
723 reciprocal_stats.rdivs_inserted);
724 statistics_counter_event (fun, "reciprocal functions inserted",
725 reciprocal_stats.rfuncs_inserted);
727 free_dominance_info (CDI_DOMINATORS);
728 free_dominance_info (CDI_POST_DOMINATORS);
729 delete occ_pool;
730 return 0;
733 } // anon namespace
735 gimple_opt_pass *
736 make_pass_cse_reciprocals (gcc::context *ctxt)
738 return new pass_cse_reciprocals (ctxt);
741 /* Records an occurrence at statement USE_STMT in the vector of statements
742 STMTS if it is dominated by *TOP_BB, dominates it, or *TOP_BB is not
743 yet initialized. Returns true if the occurrence was pushed on
744 the vector. Adjusts *TOP_BB to be the basic block dominating all
745 statements in the vector. */
747 static bool
748 maybe_record_sincos (vec<gimple *> *stmts,
749 basic_block *top_bb, gimple *use_stmt)
751 basic_block use_bb = gimple_bb (use_stmt);
752 if (*top_bb
753 && (*top_bb == use_bb
754 || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
755 stmts->safe_push (use_stmt);
756 else if (!*top_bb
757 || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
759 stmts->safe_push (use_stmt);
760 *top_bb = use_bb;
762 else
763 return false;
765 return true;
768 /* Look for sin, cos and cexpi calls with the same argument NAME and
769 create a single call to cexpi CSEing the result in this case.
770 We first walk over all immediate uses of the argument collecting
771 statements that we can CSE in a vector and in a second pass replace
772 the statement rhs with a REALPART or IMAGPART expression on the
773 result of the cexpi call we insert before the use statement that
774 dominates all other candidates. */
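/* A minimal sketch of the rewrite (assuming a cexpi implementation is
   available for the type):

     s = sin (x);
     c = cos (x);

   becomes

     sincostmp = cexpi (x);
     s = IMAGPART_EXPR <sincostmp>;
     c = REALPART_EXPR <sincostmp>;

   with the cexpi call placed in TOP_BB, no earlier than the definition
   of x.  */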
776 static bool
777 execute_cse_sincos_1 (tree name)
779 gimple_stmt_iterator gsi;
780 imm_use_iterator use_iter;
781 tree fndecl, res, type;
782 gimple *def_stmt, *use_stmt, *stmt;
783 int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
784 auto_vec<gimple *> stmts;
785 basic_block top_bb = NULL;
786 int i;
787 bool cfg_changed = false;
789 type = TREE_TYPE (name);
790 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
792 if (gimple_code (use_stmt) != GIMPLE_CALL
793 || !gimple_call_lhs (use_stmt))
794 continue;
796 switch (gimple_call_combined_fn (use_stmt))
798 CASE_CFN_COS:
799 seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
800 break;
802 CASE_CFN_SIN:
803 seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
804 break;
806 CASE_CFN_CEXPI:
807 seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
808 break;
810 default:;
814 if (seen_cos + seen_sin + seen_cexpi <= 1)
815 return false;
817 /* Simply insert cexpi at the beginning of top_bb but not earlier than
818 the name def statement. */
819 fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
820 if (!fndecl)
821 return false;
822 stmt = gimple_build_call (fndecl, 1, name);
823 res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
824 gimple_call_set_lhs (stmt, res);
826 def_stmt = SSA_NAME_DEF_STMT (name);
827 if (!SSA_NAME_IS_DEFAULT_DEF (name)
828 && gimple_code (def_stmt) != GIMPLE_PHI
829 && gimple_bb (def_stmt) == top_bb)
831 gsi = gsi_for_stmt (def_stmt);
832 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
834 else
836 gsi = gsi_after_labels (top_bb);
837 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
839 sincos_stats.inserted++;
841 /* And adjust the recorded old call sites. */
842 for (i = 0; stmts.iterate (i, &use_stmt); ++i)
844 tree rhs = NULL;
846 switch (gimple_call_combined_fn (use_stmt))
848 CASE_CFN_COS:
849 rhs = fold_build1 (REALPART_EXPR, type, res);
850 break;
852 CASE_CFN_SIN:
853 rhs = fold_build1 (IMAGPART_EXPR, type, res);
854 break;
856 CASE_CFN_CEXPI:
857 rhs = res;
858 break;
860 default:;
861 gcc_unreachable ();
864 /* Replace call with a copy. */
865 stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
867 gsi = gsi_for_stmt (use_stmt);
868 gsi_replace (&gsi, stmt, true);
869 if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
870 cfg_changed = true;
873 return cfg_changed;
876 /* To evaluate powi(x,n), the floating point value x raised to the
877 constant integer exponent n, we use a hybrid algorithm that
878 combines the "window method" with look-up tables. For an
879 introduction to exponentiation algorithms and "addition chains",
880 see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
881 "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
882 3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
883 Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998. */
885 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
886 multiplications to inline before calling the system library's pow
887 function. powi(x,n) requires at worst 2*bits(n)-2 multiplications,
888 so this default never requires calling pow, powf or powl. */
890 #ifndef POWI_MAX_MULTS
891 #define POWI_MAX_MULTS (2*HOST_BITS_PER_WIDE_INT-2)
892 #endif
894 /* The size of the "optimal power tree" lookup table. All
895 exponents less than this value are simply looked up in the
896 powi_table below. This threshold is also used to size the
897 cache of pseudo registers that hold intermediate results. */
898 #define POWI_TABLE_SIZE 256
900 /* The size, in bits of the window, used in the "window method"
901 exponentiation algorithm. This is equivalent to a radix of
902 (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method". */
903 #define POWI_WINDOW_SIZE 3
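/* Sketch of the identity behind the window method (illustration only):
   for an exponent VAL too large for the table and with its low bit set,

     x**VAL == x**(VAL & 7) * (x**(VAL >> 3)) ** 8

   so the low POWI_WINDOW_SIZE bits are handled by a table lookup and the
   rest by repeated squaring; an even VAL is simply reduced to squaring
   x**(VAL >> 1).  */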
905 /* The following table is an efficient representation of an
906 "optimal power tree". For each value, i, the corresponding
907 value, j, in the table states that an optimal evaluation
908 sequence for calculating pow(x,i) can be found by evaluating
909 pow(x,j)*pow(x,i-j). An optimal power tree for the first
910 100 integers is given in Knuth's "Seminumerical algorithms". */
912 static const unsigned char powi_table[POWI_TABLE_SIZE] =
914 0, 1, 1, 2, 2, 3, 3, 4, /* 0 - 7 */
915 4, 6, 5, 6, 6, 10, 7, 9, /* 8 - 15 */
916 8, 16, 9, 16, 10, 12, 11, 13, /* 16 - 23 */
917 12, 17, 13, 18, 14, 24, 15, 26, /* 24 - 31 */
918 16, 17, 17, 19, 18, 33, 19, 26, /* 32 - 39 */
919 20, 25, 21, 40, 22, 27, 23, 44, /* 40 - 47 */
920 24, 32, 25, 34, 26, 29, 27, 44, /* 48 - 55 */
921 28, 31, 29, 34, 30, 60, 31, 36, /* 56 - 63 */
922 32, 64, 33, 34, 34, 46, 35, 37, /* 64 - 71 */
923 36, 65, 37, 50, 38, 48, 39, 69, /* 72 - 79 */
924 40, 49, 41, 43, 42, 51, 43, 58, /* 80 - 87 */
925 44, 64, 45, 47, 46, 59, 47, 76, /* 88 - 95 */
926 48, 65, 49, 66, 50, 67, 51, 66, /* 96 - 103 */
927 52, 70, 53, 74, 54, 104, 55, 74, /* 104 - 111 */
928 56, 64, 57, 69, 58, 78, 59, 68, /* 112 - 119 */
929 60, 61, 61, 80, 62, 75, 63, 68, /* 120 - 127 */
930 64, 65, 65, 128, 66, 129, 67, 90, /* 128 - 135 */
931 68, 73, 69, 131, 70, 94, 71, 88, /* 136 - 143 */
932 72, 128, 73, 98, 74, 132, 75, 121, /* 144 - 151 */
933 76, 102, 77, 124, 78, 132, 79, 106, /* 152 - 159 */
934 80, 97, 81, 160, 82, 99, 83, 134, /* 160 - 167 */
935 84, 86, 85, 95, 86, 160, 87, 100, /* 168 - 175 */
936 88, 113, 89, 98, 90, 107, 91, 122, /* 176 - 183 */
937 92, 111, 93, 102, 94, 126, 95, 150, /* 184 - 191 */
938 96, 128, 97, 130, 98, 133, 99, 195, /* 192 - 199 */
939 100, 128, 101, 123, 102, 164, 103, 138, /* 200 - 207 */
940 104, 145, 105, 146, 106, 109, 107, 149, /* 208 - 215 */
941 108, 200, 109, 146, 110, 170, 111, 157, /* 216 - 223 */
942 112, 128, 113, 130, 114, 182, 115, 132, /* 224 - 231 */
943 116, 200, 117, 132, 118, 158, 119, 206, /* 232 - 239 */
944 120, 240, 121, 162, 122, 147, 123, 152, /* 240 - 247 */
945 124, 166, 125, 214, 126, 138, 127, 153, /* 248 - 255 */
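/* Worked illustration (not in the original sources): powi_table[13] == 10,
   powi_table[10] == 5, powi_table[5] == 3 and powi_table[3] == 2, giving
   the addition chain

     x**2  = x * x
     x**3  = x**2 * x
     x**5  = x**3 * x**2
     x**10 = x**5 * x**5
     x**13 = x**10 * x**3

   i.e. five multiplications, which matches powi_cost (13).  */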
949 /* Return the number of multiplications required to calculate
950 powi(x,n) where n is less than POWI_TABLE_SIZE. This is a
951 subroutine of powi_cost. CACHE is an array indicating
952 which exponents have already been calculated. */
954 static int
955 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
957 /* If we've already calculated this exponent, then this evaluation
958 doesn't require any additional multiplications. */
959 if (cache[n])
960 return 0;
962 cache[n] = true;
963 return powi_lookup_cost (n - powi_table[n], cache)
964 + powi_lookup_cost (powi_table[n], cache) + 1;
967 /* Return the number of multiplications required to calculate
968 powi(x,n) for an arbitrary x, given the exponent N. This
969 function needs to be kept in sync with powi_as_mults below. */
971 static int
972 powi_cost (HOST_WIDE_INT n)
974 bool cache[POWI_TABLE_SIZE];
975 unsigned HOST_WIDE_INT digit;
976 unsigned HOST_WIDE_INT val;
977 int result;
979 if (n == 0)
980 return 0;
982 /* Ignore the reciprocal when calculating the cost. */
983 val = (n < 0) ? -n : n;
985 /* Initialize the exponent cache. */
986 memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
987 cache[1] = true;
989 result = 0;
991 while (val >= POWI_TABLE_SIZE)
993 if (val & 1)
995 digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
996 result += powi_lookup_cost (digit, cache)
997 + POWI_WINDOW_SIZE + 1;
998 val >>= POWI_WINDOW_SIZE;
1000 else
1002 val >>= 1;
1003 result++;
1007 return result + powi_lookup_cost (val, cache);
1010 /* Recursive subroutine of powi_as_mults. This function takes the
1011 array, CACHE, of already calculated exponents and an exponent N and
1012 returns a tree that corresponds to CACHE[1]**N, with type TYPE. */
1014 static tree
1015 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1016 HOST_WIDE_INT n, tree *cache)
1018 tree op0, op1, ssa_target;
1019 unsigned HOST_WIDE_INT digit;
1020 gassign *mult_stmt;
1022 if (n < POWI_TABLE_SIZE && cache[n])
1023 return cache[n];
1025 ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1027 if (n < POWI_TABLE_SIZE)
1029 cache[n] = ssa_target;
1030 op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1031 op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1033 else if (n & 1)
1035 digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1036 op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1037 op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1039 else
1041 op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1042 op1 = op0;
1045 mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1046 gimple_set_location (mult_stmt, loc);
1047 gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1049 return ssa_target;
1052 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1053 This function needs to be kept in sync with powi_cost above. */
1055 static tree
1056 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1057 tree arg0, HOST_WIDE_INT n)
1059 tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1060 gassign *div_stmt;
1061 tree target;
1063 if (n == 0)
1064 return build_real (type, dconst1);
1066 memset (cache, 0, sizeof (cache));
1067 cache[1] = arg0;
1069 result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1070 if (n >= 0)
1071 return result;
1073 /* If the original exponent was negative, reciprocate the result. */
1074 target = make_temp_ssa_name (type, NULL, "powmult");
1075 div_stmt = gimple_build_assign (target, RDIV_EXPR,
1076 build_real (type, dconst1), result);
1077 gimple_set_location (div_stmt, loc);
1078 gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1080 return target;
1083 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1084 location info LOC. If the arguments are appropriate, create an
1085 equivalent sequence of statements prior to GSI using an optimal
1086 number of multiplications, and return an expression holding the
1087 result. */
1089 static tree
1090 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1091 tree arg0, HOST_WIDE_INT n)
1093 /* Avoid largest negative number. */
1094 if (n != -n
1095 && ((n >= -1 && n <= 2)
1096 || (optimize_function_for_speed_p (cfun)
1097 && powi_cost (n) <= POWI_MAX_MULTS)))
1098 return powi_as_mults (gsi, loc, arg0, n);
1100 return NULL_TREE;
1103 /* Build a gimple call statement that calls FN with argument ARG.
1104 Set the lhs of the call statement to a fresh SSA name. Insert the
1105 statement prior to GSI's current position, and return the fresh
1106 SSA name. */
1108 static tree
1109 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1110 tree fn, tree arg)
1112 gcall *call_stmt;
1113 tree ssa_target;
1115 call_stmt = gimple_build_call (fn, 1, arg);
1116 ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1117 gimple_set_lhs (call_stmt, ssa_target);
1118 gimple_set_location (call_stmt, loc);
1119 gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1121 return ssa_target;
1124 /* Build a gimple binary operation with the given CODE and arguments
1125 ARG0, ARG1, assigning the result to a new SSA name for variable
1126 TARGET. Insert the statement prior to GSI's current position, and
1127 return the fresh SSA name. */
1129 static tree
1130 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1131 const char *name, enum tree_code code,
1132 tree arg0, tree arg1)
1134 tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1135 gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1136 gimple_set_location (stmt, loc);
1137 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1138 return result;
1141 /* Build a gimple reference operation with the given CODE and argument
1142 ARG, assigning the result to a new SSA name of TYPE with NAME.
1143 Insert the statement prior to GSI's current position, and return
1144 the fresh SSA name. */
1146 static inline tree
1147 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1148 const char *name, enum tree_code code, tree arg0)
1150 tree result = make_temp_ssa_name (type, NULL, name);
1151 gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1152 gimple_set_location (stmt, loc);
1153 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1154 return result;
1157 /* Build a gimple assignment to cast VAL to TYPE. Insert the statement
1158 prior to GSI's current position, and return the fresh SSA name. */
1160 static tree
1161 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1162 tree type, tree val)
1164 tree result = make_ssa_name (type);
1165 gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1166 gimple_set_location (stmt, loc);
1167 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1168 return result;
1171 struct pow_synth_sqrt_info
1173 bool *factors;
1174 unsigned int deepest;
1175 unsigned int num_mults;
1178 /* Return true iff the real value C can be represented as a
1179 sum of powers of 0.5 up to N. That is:
1180 C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1181 Record in INFO the various parameters of the synthesis algorithm such
1182 as the factors a[i], the maximum 0.5 power and the number of
1183 multiplications that will be required. */
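/* Worked illustration (not in the original sources): for C == 0.625 and
   N >= 3,

     0.625 == 0.5**1 + 0.5**3

   so factors[0] and factors[2] are set, DEEPEST becomes 3 and NUM_MULTS
   is 1, corresponding to sqrt (x) * sqrt (sqrt (sqrt (x))).  */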
1185 bool
1186 representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1187 struct pow_synth_sqrt_info *info)
1189 REAL_VALUE_TYPE factor = dconsthalf;
1190 REAL_VALUE_TYPE remainder = c;
1192 info->deepest = 0;
1193 info->num_mults = 0;
1194 memset (info->factors, 0, n * sizeof (bool));
1196 for (unsigned i = 0; i < n; i++)
1198 REAL_VALUE_TYPE res;
1200 /* If something inexact happened bail out now. */
1201 if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1202 return false;
1204 /* We have hit zero. The number is representable as a sum
1205 of powers of 0.5. */
1206 if (real_equal (&res, &dconst0))
1208 info->factors[i] = true;
1209 info->deepest = i + 1;
1210 return true;
1212 else if (!REAL_VALUE_NEGATIVE (res))
1214 remainder = res;
1215 info->factors[i] = true;
1216 info->num_mults++;
1218 else
1219 info->factors[i] = false;
1221 real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1223 return false;
1226 /* Return the tree corresponding to FN being applied
1227 to ARG N times at GSI and LOC.
1228 Look up previous results from CACHE if need be.
1229 cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times. */
1231 static tree
1232 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1233 tree fn, location_t loc, tree *cache)
1235 tree res = cache[n];
1236 if (!res)
1238 tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1239 res = build_and_insert_call (gsi, loc, fn, prev);
1240 cache[n] = res;
1243 return res;
1246 /* Print to STREAM the repeated application of function FNAME to ARG
1247 N times. So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1248 "foo (foo (x))". */
1250 static void
1251 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1252 unsigned int n)
1254 if (n == 0)
1255 fprintf (stream, "%s", arg);
1256 else
1258 fprintf (stream, "%s (", fname);
1259 print_nested_fn (stream, fname, arg, n - 1);
1260 fprintf (stream, ")");
1264 /* Print to STREAM the fractional sequence of sqrt chains
1265 applied to ARG, described by INFO. Used for the dump file. */
1267 static void
1268 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1269 struct pow_synth_sqrt_info *info)
1271 for (unsigned int i = 0; i < info->deepest; i++)
1273 bool is_set = info->factors[i];
1274 if (is_set)
1276 print_nested_fn (stream, "sqrt", arg, i + 1);
1277 if (i != info->deepest - 1)
1278 fprintf (stream, " * ");
1283 /* Print to STREAM a representation of raising ARG to an integer
1284 power N. Used for the dump file. */
1286 static void
1287 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1289 if (n > 1)
1290 fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1291 else if (n == 1)
1292 fprintf (stream, "%s", arg);
1295 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1296 square roots. Place at GSI and LOC. Limit the maximum depth
1297 of the sqrt chains to MAX_DEPTH. Return the tree holding the
1298 result of the expanded sequence or NULL_TREE if the expansion failed.
1300 This routine assumes that ARG1 is a real number with a fractional part
1301 (the integer exponent case will have been handled earlier in
1302 gimple_expand_builtin_pow).
1304 For ARG1 > 0.0:
1305 * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1306 FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1307 FRAC_PART == ARG1 - WHOLE_PART:
1308 Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1309 POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1310 if it can be expressed as such, that is if FRAC_PART satisfies:
1311 FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1312 where integer a[i] is either 0 or 1.
1314 Example:
1315 POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1316 --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1318 For ARG1 < 0.0 there are two approaches:
1319 * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1320 is calculated as above.
1322 Example:
1323 POW (x, -5.625) == 1.0 / POW (x, 5.625)
1324 --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1326 * (B) : WHOLE_PART := - ceil (abs (ARG1))
1327 FRAC_PART := ARG1 - WHOLE_PART
1328 and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1329 Example:
1330 POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1331 --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1333 For ARG1 < 0.0 we choose between (A) and (B) depending on
1334 how many multiplications we'd have to do.
1335 So, for the example in (B): POW (x, -5.875), if we were to
1336 follow algorithm (A) we would produce:
1337 1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1338 which contains more multiplications than approach (B).
1340 Hopefully, this approach will eliminate potentially expensive POW library
1341 calls when unsafe floating point math is enabled and allow the compiler to
1342 further optimise the multiplies, square roots and divides produced by this
1343 function. */
1345 static tree
1346 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1347 tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1349 tree type = TREE_TYPE (arg0);
1350 machine_mode mode = TYPE_MODE (type);
1351 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1352 bool one_over = true;
1354 if (!sqrtfn)
1355 return NULL_TREE;
1357 if (TREE_CODE (arg1) != REAL_CST)
1358 return NULL_TREE;
1360 REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1362 gcc_assert (max_depth > 0);
1363 tree *cache = XALLOCAVEC (tree, max_depth + 1);
1365 struct pow_synth_sqrt_info synth_info;
1366 synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1367 synth_info.deepest = 0;
1368 synth_info.num_mults = 0;
1370 bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1371 REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1373 /* The whole and fractional parts of exp. */
1374 REAL_VALUE_TYPE whole_part;
1375 REAL_VALUE_TYPE frac_part;
1377 real_floor (&whole_part, mode, &exp);
1378 real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1381 REAL_VALUE_TYPE ceil_whole = dconst0;
1382 REAL_VALUE_TYPE ceil_fract = dconst0;
1384 if (neg_exp)
1386 real_ceil (&ceil_whole, mode, &exp);
1387 real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1390 if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1391 return NULL_TREE;
1393 /* Check whether it's more profitable to not use 1.0 / ... */
1394 if (neg_exp)
1396 struct pow_synth_sqrt_info alt_synth_info;
1397 alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1398 alt_synth_info.deepest = 0;
1399 alt_synth_info.num_mults = 0;
1401 if (representable_as_half_series_p (ceil_fract, max_depth,
1402 &alt_synth_info)
1403 && alt_synth_info.deepest <= synth_info.deepest
1404 && alt_synth_info.num_mults < synth_info.num_mults)
1406 whole_part = ceil_whole;
1407 frac_part = ceil_fract;
1408 synth_info.deepest = alt_synth_info.deepest;
1409 synth_info.num_mults = alt_synth_info.num_mults;
1410 memcpy (synth_info.factors, alt_synth_info.factors,
1411 (max_depth + 1) * sizeof (bool));
1412 one_over = false;
1416 HOST_WIDE_INT n = real_to_integer (&whole_part);
1417 REAL_VALUE_TYPE cint;
1418 real_from_integer (&cint, VOIDmode, n, SIGNED);
1420 if (!real_identical (&whole_part, &cint))
1421 return NULL_TREE;
1423 if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1424 return NULL_TREE;
1426 memset (cache, 0, (max_depth + 1) * sizeof (tree));
1428 tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1430 /* Calculate the integer part of the exponent. */
1431 if (n > 1)
1433 integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1434 if (!integer_res)
1435 return NULL_TREE;
1438 if (dump_file)
1440 char string[64];
1442 real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1443 fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1445 if (neg_exp)
1447 if (one_over)
1449 fprintf (dump_file, "1.0 / (");
1450 dump_integer_part (dump_file, "x", n);
1451 if (n > 0)
1452 fprintf (dump_file, " * ");
1453 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1454 fprintf (dump_file, ")");
1456 else
1458 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1459 fprintf (dump_file, " / (");
1460 dump_integer_part (dump_file, "x", n);
1461 fprintf (dump_file, ")");
1464 else
1466 dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1467 if (n > 0)
1468 fprintf (dump_file, " * ");
1469 dump_integer_part (dump_file, "x", n);
1472 fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1476 tree fract_res = NULL_TREE;
1477 cache[0] = arg0;
1479 /* Calculate the fractional part of the exponent. */
1480 for (unsigned i = 0; i < synth_info.deepest; i++)
1482 if (synth_info.factors[i])
1484 tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1486 if (!fract_res)
1487 fract_res = sqrt_chain;
1489 else
1490 fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1491 fract_res, sqrt_chain);
1495 tree res = NULL_TREE;
1497 if (neg_exp)
1499 if (one_over)
1501 if (n > 0)
1502 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1503 fract_res, integer_res);
1504 else
1505 res = fract_res;
1507 res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1508 build_real (type, dconst1), res);
1510 else
1512 res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1513 fract_res, integer_res);
1516 else
1517 res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1518 fract_res, integer_res);
1519 return res;
1522 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1523 with location info LOC. If possible, create an equivalent and
1524 less expensive sequence of statements prior to GSI, and return an
1525 expression holding the result. */
1527 static tree
1528 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1529 tree arg0, tree arg1)
1531 REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1532 REAL_VALUE_TYPE c2, dconst3;
1533 HOST_WIDE_INT n;
1534 tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1535 machine_mode mode;
1536 bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1537 bool hw_sqrt_exists, c_is_int, c2_is_int;
1539 dconst1_4 = dconst1;
1540 SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1542 /* If the exponent isn't a constant, there's nothing of interest
1543 to be done. */
1544 if (TREE_CODE (arg1) != REAL_CST)
1545 return NULL_TREE;
1547 /* Don't perform the operation if flag_signaling_nans is on
1548 and the operand is a signaling NaN. */
1549 if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1550 && ((TREE_CODE (arg0) == REAL_CST
1551 && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1552 || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1553 return NULL_TREE;
1555 /* If the exponent is equivalent to an integer, expand to an optimal
1556 multiplication sequence when profitable. */
1557 c = TREE_REAL_CST (arg1);
1558 n = real_to_integer (&c);
1559 real_from_integer (&cint, VOIDmode, n, SIGNED);
1560 c_is_int = real_identical (&c, &cint);
1562 if (c_is_int
1563 && ((n >= -1 && n <= 2)
1564 || (flag_unsafe_math_optimizations
1565 && speed_p
1566 && powi_cost (n) <= POWI_MAX_MULTS)))
1567 return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1569 /* Attempt various optimizations using sqrt and cbrt. */
1570 type = TREE_TYPE (arg0);
1571 mode = TYPE_MODE (type);
1572 sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1574 /* Optimize pow(x,0.5) = sqrt(x). This replacement is always safe
1575 unless signed zeros must be maintained. pow(-0,0.5) = +0, while
1576 sqrt(-0) = -0. */
1577 if (sqrtfn
1578 && real_equal (&c, &dconsthalf)
1579 && !HONOR_SIGNED_ZEROS (mode))
1580 return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1582 hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1584 /* Optimize pow(x,1./3.) = cbrt(x). This requires unsafe math
1585 optimizations since 1./3. is not exactly representable. If x
1586 is negative and finite, the correct value of pow(x,1./3.) is
1587 a NaN with the "invalid" exception raised, because the value
1588 of 1./3. actually has an even denominator. The correct value
1589 of cbrt(x) is a negative real value. */
1590 cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1591 dconst1_3 = real_value_truncate (mode, dconst_third ());
1593 if (flag_unsafe_math_optimizations
1594 && cbrtfn
1595 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1596 && real_equal (&c, &dconst1_3))
1597 return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1599 /* Optimize pow(x,1./6.) = cbrt(sqrt(x)). Don't do this optimization
1600 if we don't have a hardware sqrt insn. */
1601 dconst1_6 = dconst1_3;
1602 SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1604 if (flag_unsafe_math_optimizations
1605 && sqrtfn
1606 && cbrtfn
1607 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1608 && speed_p
1609 && hw_sqrt_exists
1610 && real_equal (&c, &dconst1_6))
1612 /* sqrt(x) */
1613 sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1615 /* cbrt(sqrt(x)) */
1616 return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1620 /* Attempt to expand the POW as a product of square root chains.
1621 Expand the 0.25 case even when optimising for size. */
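/* E.g. pow (x, 0.25) becomes sqrt (sqrt (x)) (illustration only); with a
   hardware sqrt this is cheap enough to be worthwhile even when
   optimizing for size.  */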
1622 if (flag_unsafe_math_optimizations
1623 && sqrtfn
1624 && hw_sqrt_exists
1625 && (speed_p || real_equal (&c, &dconst1_4))
1626 && !HONOR_SIGNED_ZEROS (mode))
1628 unsigned int max_depth = speed_p
1629 ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1630 : 2;
1632 tree expand_with_sqrts
1633 = expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1635 if (expand_with_sqrts)
1636 return expand_with_sqrts;
1639 real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1640 n = real_to_integer (&c2);
1641 real_from_integer (&cint, VOIDmode, n, SIGNED);
1642 c2_is_int = real_identical (&c2, &cint);
1644 /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1646 powi(x, n/3) * powi(cbrt(x), n%3), n > 0;
1647 1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)), n < 0.
1649 Do not calculate the first factor when n/3 = 0. As cbrt(x) is
1650 different from pow(x, 1./3.) due to rounding and behavior with
1651 negative x, we need to constrain this transformation to unsafe
1652 math and positive x or finite math. */
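/* Illustrative example (not in the original sources): for
   pow (x, 5.0/3.0), n becomes 5 and the call is synthesized as
   powi (x, 5/3) * powi (cbrt (x), 5%3), i.e. x * cbrt (x) * cbrt (x).  */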
1653 real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1654 real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1655 real_round (&c2, mode, &c2);
1656 n = real_to_integer (&c2);
1657 real_from_integer (&cint, VOIDmode, n, SIGNED);
1658 real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1659 real_convert (&c2, mode, &c2);
1661 if (flag_unsafe_math_optimizations
1662 && cbrtfn
1663 && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1664 && real_identical (&c2, &c)
1665 && !c2_is_int
1666 && optimize_function_for_speed_p (cfun)
1667 && powi_cost (n / 3) <= POWI_MAX_MULTS)
1669 tree powi_x_ndiv3 = NULL_TREE;
1671 /* Attempt to fold powi(arg0, abs(n/3)) into multiplies. If not
1672 possible or profitable, give up. Skip the degenerate case when
1673 abs(n) < 3, where the result is always 1. */
1674 if (absu_hwi (n) >= 3)
1676 powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1677 abs_hwi (n / 3));
1678 if (!powi_x_ndiv3)
1679 return NULL_TREE;
1682 /* Calculate powi(cbrt(x), n%3). Don't use gimple_expand_builtin_powi
1683 as that creates an unnecessary variable. Instead, just produce
1684 either cbrt(x) or cbrt(x) * cbrt(x). */
1685 cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1687 if (absu_hwi (n) % 3 == 1)
1688 powi_cbrt_x = cbrt_x;
1689 else
1690 powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1691 cbrt_x, cbrt_x);
1693 /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1. */
1694 if (absu_hwi (n) < 3)
1695 result = powi_cbrt_x;
1696 else
1697 result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1698 powi_x_ndiv3, powi_cbrt_x);
1700 /* If n is negative, reciprocate the result. */
1701 if (n < 0)
1702 result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1703 build_real (type, dconst1), result);
1705 return result;
1708 /* No optimizations succeeded. */
1709 return NULL_TREE;
1712 /* ARG is the argument to a cabs builtin call in GSI with location info
1713 LOC. Create a sequence of statements prior to GSI that calculates
1714 sqrt(R*R + I*I), where R and I are the real and imaginary components
1715 of ARG, respectively. Return an expression holding the result. */
1717 static tree
1718 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1720 tree real_part, imag_part, addend1, addend2, sum, result;
1721 tree type = TREE_TYPE (TREE_TYPE (arg));
1722 tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1723 machine_mode mode = TYPE_MODE (type);
1725 if (!flag_unsafe_math_optimizations
1726 || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1727 || !sqrtfn
1728 || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1729 return NULL_TREE;
1731 real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1732 REALPART_EXPR, arg);
1733 addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1734 real_part, real_part);
1735 imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1736 IMAGPART_EXPR, arg);
1737 addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1738 imag_part, imag_part);
1739 sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1740 result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1742 return result;
1745 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1746 on the SSA_NAME argument of each of them. Also expand powi(x,n) into
1747 an optimal number of multiplies, when n is a constant. */
1749 namespace {
1751 const pass_data pass_data_cse_sincos =
1753 GIMPLE_PASS, /* type */
1754 "sincos", /* name */
1755 OPTGROUP_NONE, /* optinfo_flags */
1756 TV_NONE, /* tv_id */
1757 PROP_ssa, /* properties_required */
1758 PROP_gimple_opt_math, /* properties_provided */
1759 0, /* properties_destroyed */
1760 0, /* todo_flags_start */
1761 TODO_update_ssa, /* todo_flags_finish */
1764 class pass_cse_sincos : public gimple_opt_pass
1766 public:
1767 pass_cse_sincos (gcc::context *ctxt)
1768 : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1771 /* opt_pass methods: */
1772 virtual bool gate (function *)
1774 /* We no longer require either sincos or cexp, since powi expansion
1775 piggybacks on this pass. */
1776 return optimize;
1779 virtual unsigned int execute (function *);
1781 }; // class pass_cse_sincos
1783 unsigned int
1784 pass_cse_sincos::execute (function *fun)
1786 basic_block bb;
1787 bool cfg_changed = false;
1789 calculate_dominance_info (CDI_DOMINATORS);
1790 memset (&sincos_stats, 0, sizeof (sincos_stats));
1792 FOR_EACH_BB_FN (bb, fun)
1794 gimple_stmt_iterator gsi;
1795 bool cleanup_eh = false;
1797 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1799 gimple *stmt = gsi_stmt (gsi);
1801 /* Only the last stmt in a bb could throw; there is no need to call
1802 gimple_purge_dead_eh_edges if we change something in the middle
1803 of a basic block. */
1804 cleanup_eh = false;
1806 if (is_gimple_call (stmt)
1807 && gimple_call_lhs (stmt))
1809 tree arg, arg0, arg1, result;
1810 HOST_WIDE_INT n;
1811 location_t loc;
1813 switch (gimple_call_combined_fn (stmt))
1815 CASE_CFN_COS:
1816 CASE_CFN_SIN:
1817 CASE_CFN_CEXPI:
1818 /* Make sure we have either sincos or cexp. */
1819 if (!targetm.libc_has_function (function_c99_math_complex)
1820 && !targetm.libc_has_function (function_sincos))
1821 break;
1823 arg = gimple_call_arg (stmt, 0);
1824 if (TREE_CODE (arg) == SSA_NAME)
1825 cfg_changed |= execute_cse_sincos_1 (arg);
1826 break;
1828 CASE_CFN_POW:
1829 arg0 = gimple_call_arg (stmt, 0);
1830 arg1 = gimple_call_arg (stmt, 1);
1832 loc = gimple_location (stmt);
1833 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1835 if (result)
1837 tree lhs = gimple_get_lhs (stmt);
1838 gassign *new_stmt = gimple_build_assign (lhs, result);
1839 gimple_set_location (new_stmt, loc);
1840 unlink_stmt_vdef (stmt);
1841 gsi_replace (&gsi, new_stmt, true);
1842 cleanup_eh = true;
1843 if (gimple_vdef (stmt))
1844 release_ssa_name (gimple_vdef (stmt));
1846 break;
1848 CASE_CFN_POWI:
1849 arg0 = gimple_call_arg (stmt, 0);
1850 arg1 = gimple_call_arg (stmt, 1);
1851 loc = gimple_location (stmt);
1853 if (real_minus_onep (arg0))
1855 tree t0, t1, cond, one, minus_one;
1856 gassign *stmt;
1858 t0 = TREE_TYPE (arg0);
1859 t1 = TREE_TYPE (arg1);
1860 one = build_real (t0, dconst1);
1861 minus_one = build_real (t0, dconstm1);
1863 cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1864 stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1865 arg1, build_int_cst (t1, 1));
1866 gimple_set_location (stmt, loc);
1867 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1869 result = make_temp_ssa_name (t0, NULL, "powi");
1870 stmt = gimple_build_assign (result, COND_EXPR, cond,
1871 minus_one, one);
1872 gimple_set_location (stmt, loc);
1873 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1875 else
1877 if (!tree_fits_shwi_p (arg1))
1878 break;
1880 n = tree_to_shwi (arg1);
1881 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1884 if (result)
1886 tree lhs = gimple_get_lhs (stmt);
1887 gassign *new_stmt = gimple_build_assign (lhs, result);
1888 gimple_set_location (new_stmt, loc);
1889 unlink_stmt_vdef (stmt);
1890 gsi_replace (&gsi, new_stmt, true);
1891 cleanup_eh = true;
1892 if (gimple_vdef (stmt))
1893 release_ssa_name (gimple_vdef (stmt));
1895 break;
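/* Illustrative sketch of the real_minus_onep special case above (the
   "powi_cond"/"powi" temporaries are the ones created there, SSA
   numbering invented): for a call __builtin_powi (-1.0, n) the emitted
   GIMPLE is roughly

     powi_cond_1 = n & 1;
     powi_2 = powi_cond_1 ? -1.0 : 1.0;

   i.e. the result only depends on the parity of N, so no loop or
   libcall is needed.  */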
1897 CASE_CFN_CABS:
1898 arg0 = gimple_call_arg (stmt, 0);
1899 loc = gimple_location (stmt);
1900 result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1902 if (result)
1904 tree lhs = gimple_get_lhs (stmt);
1905 gassign *new_stmt = gimple_build_assign (lhs, result);
1906 gimple_set_location (new_stmt, loc);
1907 unlink_stmt_vdef (stmt);
1908 gsi_replace (&gsi, new_stmt, true);
1909 cleanup_eh = true;
1910 if (gimple_vdef (stmt))
1911 release_ssa_name (gimple_vdef (stmt));
1913 break;
1915 default:;
1919 if (cleanup_eh)
1920 cfg_changed |= gimple_purge_dead_eh_edges (bb);
1923 statistics_counter_event (fun, "sincos statements inserted",
1924 sincos_stats.inserted);
1926 return cfg_changed ? TODO_cleanup_cfg : 0;
1929 } // anon namespace
1931 gimple_opt_pass *
1932 make_pass_cse_sincos (gcc::context *ctxt)
1934 return new pass_cse_sincos (ctxt);
1937 /* A symbolic number structure is used to detect byte permutation and selection
1938 patterns of a source. To achieve that, its field N contains an artificial
1939 number consisting of BITS_PER_MARKER sized markers tracking where each
1940 byte comes from in the source:
1942 0 - target byte has the value 0
1943 FF - target byte has an unknown value (eg. due to sign extension)
1944 1..size - marker value is the byte index in the source (0 for lsb).
1946 To detect permutations on memory sources (arrays and structures), a symbolic
1947 number is also associated:
1948 - a base address BASE_ADDR and an OFFSET giving the address of the source;
1949 - a range which gives the difference between the highest and lowest accessed
1950 memory location to make such a symbolic number;
1951 - the address SRC of the source element of lowest address as a convenience
1952 to easily get BASE_ADDR + offset + lowest bytepos;
1953 - the number N_OPS of expressions bitwise-ORed together, to represent
1954 the approximate cost of the computation.
1956 Note 1: the range is different from size as size reflects the size of the
1957 type of the current expression. For instance, for an array char a[],
1958 (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1959 (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1960 time a range of 1.
1962 Note 2: for non-memory sources, range holds the same value as size.
1964 Note 3: SRC points to the SSA_NAME in case of non-memory source. */
1966 struct symbolic_number {
1967 uint64_t n;
1968 tree type;
1969 tree base_addr;
1970 tree offset;
1971 HOST_WIDE_INT bytepos;
1972 tree src;
1973 tree alias_set;
1974 tree vuse;
1975 unsigned HOST_WIDE_INT range;
1976 int n_ops;
1979 #define BITS_PER_MARKER 8
1980 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1981 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1982 #define HEAD_MARKER(n, size) \
1983 ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1985 /* The number which the find_bswap_or_nop_1 result should match in
1986 order to have a nop. The number is masked according to the size of
1987 the symbolic number before using it. */
1988 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1989 (uint64_t)0x08070605 << 32 | 0x04030201)
1991 /* The number which the find_bswap_or_nop_1 result should match in
1992 order to have a byte swap. The number is masked according to the
1993 size of the symbolic number before using it. */
1994 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1995 (uint64_t)0x01020304 << 32 | 0x05060708)
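/* Illustrative example (little-endian target assumed, not part of the
   original comments): for a 32-bit unsigned value X, a fresh symbolic
   number starts as 0x04030201 (CMPNOP masked to 4 bytes): the marker in
   target byte position i is i + 1, identifying the source byte it comes
   from.  A manual byte swap such as

     ((X >> 24) & 0x000000ff) | ((X >>  8) & 0x0000ff00)
     | ((X <<  8) & 0x00ff0000) | ((X << 24) & 0xff000000)

   shuffles those markers into 0x01020304, which equals CMPXCHG masked to
   4 bytes, so find_bswap_or_nop below classifies the whole expression as
   a byte swap.  */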
1997 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1998 number N. Return false if the requested operation is not permitted
1999 on a symbolic number. */
2001 static inline bool
2002 do_shift_rotate (enum tree_code code,
2003 struct symbolic_number *n,
2004 int count)
2006 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2007 unsigned head_marker;
2009 if (count % BITS_PER_UNIT != 0)
2010 return false;
2011 count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2013 /* Zero out the extra bits of N in order to avoid them being shifted
2014 into the significant bits. */
2015 if (size < 64 / BITS_PER_MARKER)
2016 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2018 switch (code)
2020 case LSHIFT_EXPR:
2021 n->n <<= count;
2022 break;
2023 case RSHIFT_EXPR:
2024 head_marker = HEAD_MARKER (n->n, size);
2025 n->n >>= count;
2026 /* Arithmetic shift of signed type: result is dependent on the value. */
2027 if (!TYPE_UNSIGNED (n->type) && head_marker)
2028 for (i = 0; i < count / BITS_PER_MARKER; i++)
2029 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2030 << ((size - 1 - i) * BITS_PER_MARKER);
2031 break;
2032 case LROTATE_EXPR:
2033 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2034 break;
2035 case RROTATE_EXPR:
2036 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2037 break;
2038 default:
2039 return false;
2041 /* Zero unused bits for size. */
2042 if (size < 64 / BITS_PER_MARKER)
2043 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2044 return true;
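/* Illustrative note on do_shift_rotate above (assuming a 4-byte unsigned
   source with markers as described for struct symbolic_number): starting
   from n->n == 0x04030201,

     X >> 8  gives  0x00040302   (low byte discarded, top byte now 0)
     X << 8  gives  0x03020100   (low byte known to be zero)

   while for a signed source whose top byte marker is set, an arithmetic
   right shift marks the vacated high bytes as MARKER_BYTE_UNKNOWN
   instead of 0.  */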
2047 /* Perform sanity checking for the symbolic number N and the gimple
2048 statement STMT. */
2050 static inline bool
2051 verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2053 tree lhs_type;
2055 lhs_type = gimple_expr_type (stmt);
2057 if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2058 return false;
2060 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2061 return false;
2063 return true;
2066 /* Initialize the symbolic number N for the bswap pass from the base element
2067 SRC manipulated by the bitwise OR expression. */
2069 static bool
2070 init_symbolic_number (struct symbolic_number *n, tree src)
2072 int size;
2074 if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2075 return false;
2077 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2078 n->src = src;
2080 /* Set up the symbolic number N by setting each byte to a value between 1 and
2081 the byte size of rhs1. The highest order byte is set to that size and the
2082 lowest order byte to 1. */
2083 n->type = TREE_TYPE (src);
2084 size = TYPE_PRECISION (n->type);
2085 if (size % BITS_PER_UNIT != 0)
2086 return false;
2087 size /= BITS_PER_UNIT;
2088 if (size > 64 / BITS_PER_MARKER)
2089 return false;
2090 n->range = size;
2091 n->n = CMPNOP;
2092 n->n_ops = 1;
2094 if (size < 64 / BITS_PER_MARKER)
2095 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2097 return true;
2100 /* Check if STMT might be a byte swap or a nop from a memory source and return
2101 the answer. If so, REF is that memory source and the base of the memory area
2102 accessed and the offset of the access from that base are recorded in N. */
2104 bool
2105 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2107 /* Leaf node is an array or component ref. Memorize its base and
2108 offset from base to compare to other such leaf nodes. */
2109 HOST_WIDE_INT bitsize, bitpos;
2110 machine_mode mode;
2111 int unsignedp, reversep, volatilep;
2112 tree offset, base_addr;
2114 /* Not prepared to handle PDP endian. */
2115 if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2116 return false;
2118 if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2119 return false;
2121 base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2122 &unsignedp, &reversep, &volatilep);
2124 if (TREE_CODE (base_addr) == MEM_REF)
2126 offset_int bit_offset = 0;
2127 tree off = TREE_OPERAND (base_addr, 1);
2129 if (!integer_zerop (off))
2131 offset_int boff, coff = mem_ref_offset (base_addr);
2132 boff = coff << LOG2_BITS_PER_UNIT;
2133 bit_offset += boff;
2136 base_addr = TREE_OPERAND (base_addr, 0);
2138 /* Avoid returning a negative bitpos as this may wreak havoc later. */
2139 if (wi::neg_p (bit_offset))
2141 offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2142 offset_int tem = wi::bit_and_not (bit_offset, mask);
2143 /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2144 Subtract it from BIT_OFFSET and add it (scaled) to OFFSET. */
2145 bit_offset -= tem;
2146 tem >>= LOG2_BITS_PER_UNIT;
2147 if (offset)
2148 offset = size_binop (PLUS_EXPR, offset,
2149 wide_int_to_tree (sizetype, tem));
2150 else
2151 offset = wide_int_to_tree (sizetype, tem);
2154 bitpos += bit_offset.to_shwi ();
2157 if (bitpos % BITS_PER_UNIT)
2158 return false;
2159 if (bitsize % BITS_PER_UNIT)
2160 return false;
2161 if (reversep)
2162 return false;
2164 if (!init_symbolic_number (n, ref))
2165 return false;
2166 n->base_addr = base_addr;
2167 n->offset = offset;
2168 n->bytepos = bitpos / BITS_PER_UNIT;
2169 n->alias_set = reference_alias_ptr_type (ref);
2170 n->vuse = gimple_vuse (stmt);
2171 return true;
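/* Illustrative example for find_bswap_or_nop_load above (hypothetical
   source): for a load from a[3], where A is a global "unsigned char
   a[8]", the recorded fields would be roughly base_addr == a,
   offset == NULL_TREE, bytepos == 3 and src == a[3], so that two such
   loads can later be recognized as accessing adjacent bytes of the same
   object.  */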
2174 /* Compute the symbolic number N representing the result of a bitwise OR on
2175 two symbolic numbers N1 and N2 whose source statements are respectively
2176 SOURCE_STMT1 and SOURCE_STMT2. */
2178 static gimple *
2179 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2180 gimple *source_stmt2, struct symbolic_number *n2,
2181 struct symbolic_number *n)
2183 int i, size;
2184 uint64_t mask;
2185 gimple *source_stmt;
2186 struct symbolic_number *n_start;
2188 tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2189 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2190 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2191 rhs1 = TREE_OPERAND (rhs1, 0);
2192 tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2193 if (TREE_CODE (rhs2) == BIT_FIELD_REF
2194 && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2195 rhs2 = TREE_OPERAND (rhs2, 0);
2197 /* Sources are different; cancel the bswap if they are not memory locations
2198 with the same base (array, structure, ...). */
2199 if (rhs1 != rhs2)
2201 uint64_t inc;
2202 HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2203 struct symbolic_number *toinc_n_ptr, *n_end;
2204 basic_block bb1, bb2;
2206 if (!n1->base_addr || !n2->base_addr
2207 || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2208 return NULL;
2210 if (!n1->offset != !n2->offset
2211 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2212 return NULL;
2214 if (n1->bytepos < n2->bytepos)
2216 n_start = n1;
2217 start_sub = n2->bytepos - n1->bytepos;
2219 else
2221 n_start = n2;
2222 start_sub = n1->bytepos - n2->bytepos;
2225 bb1 = gimple_bb (source_stmt1);
2226 bb2 = gimple_bb (source_stmt2);
2227 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2228 source_stmt = source_stmt1;
2229 else
2230 source_stmt = source_stmt2;
2232 /* Find the highest address at which a load is performed and
2233 compute related info. */
2234 end1 = n1->bytepos + (n1->range - 1);
2235 end2 = n2->bytepos + (n2->range - 1);
2236 if (end1 < end2)
2238 end = end2;
2239 end_sub = end2 - end1;
2241 else
2243 end = end1;
2244 end_sub = end1 - end2;
2246 n_end = (end2 > end1) ? n2 : n1;
2248 /* Find symbolic number whose lsb is the most significant. */
2249 if (BYTES_BIG_ENDIAN)
2250 toinc_n_ptr = (n_end == n1) ? n2 : n1;
2251 else
2252 toinc_n_ptr = (n_start == n1) ? n2 : n1;
2254 n->range = end - n_start->bytepos + 1;
2256 /* Check that the range of memory covered can be represented by
2257 a symbolic number. */
2258 if (n->range > 64 / BITS_PER_MARKER)
2259 return NULL;
2261 /* Reinterpret byte marks in symbolic number holding the value of
2262 bigger weight according to target endianness. */
2263 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2264 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2265 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2267 unsigned marker
2268 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2269 if (marker && marker != MARKER_BYTE_UNKNOWN)
2270 toinc_n_ptr->n += inc;
2273 else
2275 n->range = n1->range;
2276 n_start = n1;
2277 source_stmt = source_stmt1;
2280 if (!n1->alias_set
2281 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2282 n->alias_set = n1->alias_set;
2283 else
2284 n->alias_set = ptr_type_node;
2285 n->vuse = n_start->vuse;
2286 n->base_addr = n_start->base_addr;
2287 n->offset = n_start->offset;
2288 n->src = n_start->src;
2289 n->bytepos = n_start->bytepos;
2290 n->type = n_start->type;
2291 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2293 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2295 uint64_t masked1, masked2;
2297 masked1 = n1->n & mask;
2298 masked2 = n2->n & mask;
2299 if (masked1 && masked2 && masked1 != masked2)
2300 return NULL;
2302 n->n = n1->n | n2->n;
2303 n->n_ops = n1->n_ops + n2->n_ops;
2305 return source_stmt;
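/* Illustrative walk-through of perform_symbolic_merge above (assuming a
   little-endian target and "unsigned char a[2]"): when merging the two
   1-byte loads in

     x = a[0] | (a[1] << 8);

   the lower-address number (for a[0]) keeps marker 0x01 at byte 0, while
   the higher-address number (for a[1], already shifted left by 8) has
   marker 0x01 at byte 1 and is incremented by the byte distance 1,
   giving 0x02 there; ORing the two yields 0x0201 over a 2-byte range,
   which matches CMPNOP, i.e. a plain 16-bit load in target
   endianness.  */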
2308 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2309 the operation given by the rhs of STMT on the result. If the operation
2310 could successfully be executed, the function returns a gimple stmt whose
2311 rhs's first tree is the expression of the source operand, and NULL
2312 otherwise. */
2314 static gimple *
2315 find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2317 enum tree_code code;
2318 tree rhs1, rhs2 = NULL;
2319 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2320 enum gimple_rhs_class rhs_class;
2322 if (!limit || !is_gimple_assign (stmt))
2323 return NULL;
2325 rhs1 = gimple_assign_rhs1 (stmt);
2327 if (find_bswap_or_nop_load (stmt, rhs1, n))
2328 return stmt;
2330 /* Handle BIT_FIELD_REF. */
2331 if (TREE_CODE (rhs1) == BIT_FIELD_REF
2332 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2334 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2335 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2336 if (bitpos % BITS_PER_UNIT == 0
2337 && bitsize % BITS_PER_UNIT == 0
2338 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2340 /* Handle big-endian bit numbering in BIT_FIELD_REF. */
2341 if (BYTES_BIG_ENDIAN)
2342 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2344 /* Shift. */
2345 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2346 return NULL;
2348 /* Mask. */
2349 uint64_t mask = 0;
2350 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2351 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2352 i++, tmp <<= BITS_PER_UNIT)
2353 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2354 n->n &= mask;
2356 /* Convert. */
2357 n->type = TREE_TYPE (rhs1);
2358 if (!n->base_addr)
2359 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2361 return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2364 return NULL;
2367 if (TREE_CODE (rhs1) != SSA_NAME)
2368 return NULL;
2370 code = gimple_assign_rhs_code (stmt);
2371 rhs_class = gimple_assign_rhs_class (stmt);
2372 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2374 if (rhs_class == GIMPLE_BINARY_RHS)
2375 rhs2 = gimple_assign_rhs2 (stmt);
2377 /* Handle unary rhs and binary rhs with integer constants as second
2378 operand. */
2380 if (rhs_class == GIMPLE_UNARY_RHS
2381 || (rhs_class == GIMPLE_BINARY_RHS
2382 && TREE_CODE (rhs2) == INTEGER_CST))
2384 if (code != BIT_AND_EXPR
2385 && code != LSHIFT_EXPR
2386 && code != RSHIFT_EXPR
2387 && code != LROTATE_EXPR
2388 && code != RROTATE_EXPR
2389 && !CONVERT_EXPR_CODE_P (code))
2390 return NULL;
2392 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2394 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2395 we have to initialize the symbolic number. */
2396 if (!source_stmt1)
2398 if (gimple_assign_load_p (stmt)
2399 || !init_symbolic_number (n, rhs1))
2400 return NULL;
2401 source_stmt1 = stmt;
2404 switch (code)
2406 case BIT_AND_EXPR:
2408 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2409 uint64_t val = int_cst_value (rhs2), mask = 0;
2410 uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2412 /* Only constants masking full bytes are allowed. */
2413 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2414 if ((val & tmp) != 0 && (val & tmp) != tmp)
2415 return NULL;
2416 else if (val & tmp)
2417 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2419 n->n &= mask;
2421 break;
2422 case LSHIFT_EXPR:
2423 case RSHIFT_EXPR:
2424 case LROTATE_EXPR:
2425 case RROTATE_EXPR:
2426 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2427 return NULL;
2428 break;
2429 CASE_CONVERT:
2431 int i, type_size, old_type_size;
2432 tree type;
2434 type = gimple_expr_type (stmt);
2435 type_size = TYPE_PRECISION (type);
2436 if (type_size % BITS_PER_UNIT != 0)
2437 return NULL;
2438 type_size /= BITS_PER_UNIT;
2439 if (type_size > 64 / BITS_PER_MARKER)
2440 return NULL;
2442 /* Sign extension: result is dependent on the value. */
2443 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2444 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2445 && HEAD_MARKER (n->n, old_type_size))
2446 for (i = 0; i < type_size - old_type_size; i++)
2447 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2448 << ((type_size - 1 - i) * BITS_PER_MARKER);
2450 if (type_size < 64 / BITS_PER_MARKER)
2452 /* If STMT casts to a smaller type mask out the bits not
2453 belonging to the target type. */
2454 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2456 n->type = type;
2457 if (!n->base_addr)
2458 n->range = type_size;
2460 break;
2461 default:
2462 return NULL;
2464 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2467 /* Handle binary rhs. */
2469 if (rhs_class == GIMPLE_BINARY_RHS)
2471 struct symbolic_number n1, n2;
2472 gimple *source_stmt, *source_stmt2;
2474 if (code != BIT_IOR_EXPR)
2475 return NULL;
2477 if (TREE_CODE (rhs2) != SSA_NAME)
2478 return NULL;
2480 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2482 switch (code)
2484 case BIT_IOR_EXPR:
2485 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2487 if (!source_stmt1)
2488 return NULL;
2490 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2492 if (!source_stmt2)
2493 return NULL;
2495 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2496 return NULL;
2498 if (!n1.vuse != !n2.vuse
2499 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2500 return NULL;
2502 source_stmt
2503 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2505 if (!source_stmt)
2506 return NULL;
2508 if (!verify_symbolic_number_p (n, stmt))
2509 return NULL;
2511 break;
2512 default:
2513 return NULL;
2515 return source_stmt;
2517 return NULL;
2520 /* Check if STMT completes a bswap implementation or a read in a given
2521 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2522 accordingly. It also sets N to represent the kind of operations
2523 performed: size of the resulting expression and whether it works on
2524 a memory source, and if so its alias set and vuse. Finally, the
2525 function returns a stmt whose rhs's first tree is the source
2526 expression. */
2528 static gimple *
2529 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2531 unsigned rsize;
2532 uint64_t tmpn, mask;
2533 /* The number which the find_bswap_or_nop_1 result should match in order
2534 to have a full byte swap. The number is shifted to the right
2535 according to the size of the symbolic number before using it. */
2536 uint64_t cmpxchg = CMPXCHG;
2537 uint64_t cmpnop = CMPNOP;
2539 gimple *ins_stmt;
2540 int limit;
2542 /* The last parameter determines the depth search limit. It usually
2543 correlates directly to the number n of bytes to be touched. We
2544 increase that number by log2(n) + 1 here in order to also
2545 cover signed -> unsigned conversions of the src operand as can be seen
2546 in libgcc, and for initial shift/and operation of the src operand. */
2547 limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2548 limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2549 ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2551 if (!ins_stmt)
2552 return NULL;
2554 /* Find real size of result (highest non-zero byte). */
2555 if (n->base_addr)
2556 for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2557 else
2558 rsize = n->range;
2560 /* Zero out the bits corresponding to untouched bytes in original gimple
2561 expression. */
2562 if (n->range < (int) sizeof (int64_t))
2564 mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2565 cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2566 cmpnop &= mask;
2569 /* Zero out the bits corresponding to unused bytes in the result of the
2570 gimple expression. */
2571 if (rsize < n->range)
2573 if (BYTES_BIG_ENDIAN)
2575 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2576 cmpxchg &= mask;
2577 cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2579 else
2581 mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2582 cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2583 cmpnop &= mask;
2585 n->range = rsize;
2588 /* A complete byte swap should make the symbolic number start with
2589 the largest digit in the highest order byte. An unchanged symbolic
2590 number indicates a read with the same endianness as the target architecture. */
2591 if (n->n == cmpnop)
2592 *bswap = false;
2593 else if (n->n == cmpxchg)
2594 *bswap = true;
2595 else
2596 return NULL;
2598 /* Useless bit manipulation performed by code. */
2599 if (!n->base_addr && n->n == cmpnop && n->n_ops == 1)
2600 return NULL;
2602 n->range *= BITS_PER_UNIT;
2603 return ins_stmt;
2606 namespace {
2608 const pass_data pass_data_optimize_bswap =
2610 GIMPLE_PASS, /* type */
2611 "bswap", /* name */
2612 OPTGROUP_NONE, /* optinfo_flags */
2613 TV_NONE, /* tv_id */
2614 PROP_ssa, /* properties_required */
2615 0, /* properties_provided */
2616 0, /* properties_destroyed */
2617 0, /* todo_flags_start */
2618 0, /* todo_flags_finish */
2621 class pass_optimize_bswap : public gimple_opt_pass
2623 public:
2624 pass_optimize_bswap (gcc::context *ctxt)
2625 : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2628 /* opt_pass methods: */
2629 virtual bool gate (function *)
2631 return flag_expensive_optimizations && optimize;
2634 virtual unsigned int execute (function *);
2636 }; // class pass_optimize_bswap
2638 /* Perform the bswap optimization: replace the expression computed in the rhs
2639 of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2640 Which of these alternatives replaces the rhs is given by N->base_addr (non
2641 null if a load is needed) and BSWAP. The type, VUSE and alias set of the
2642 load to perform are also given in N while the builtin bswap to invoke is
2643 given in FNDECL. Finally, if a load is involved, INS_STMT refers to one of
2644 the load statements involved to construct the rhs in CUR_STMT and N->range
2645 gives the size of the rhs expression for maintaining some statistics.
2647 Note that if the replacement involves a load, CUR_STMT is moved just after
2648 INS_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2649 changing basic block. */
2651 static bool
2652 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2653 tree bswap_type, tree load_type, struct symbolic_number *n,
2654 bool bswap)
2656 gimple_stmt_iterator gsi;
2657 tree src, tmp, tgt;
2658 gimple *bswap_stmt;
2660 gsi = gsi_for_stmt (cur_stmt);
2661 src = n->src;
2662 tgt = gimple_assign_lhs (cur_stmt);
2664 /* Need to load the value from memory first. */
2665 if (n->base_addr)
2667 gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2668 tree addr_expr, addr_tmp, val_expr, val_tmp;
2669 tree load_offset_ptr, aligned_load_type;
2670 gimple *addr_stmt, *load_stmt;
2671 unsigned align;
2672 HOST_WIDE_INT load_offset = 0;
2673 basic_block ins_bb, cur_bb;
2675 ins_bb = gimple_bb (ins_stmt);
2676 cur_bb = gimple_bb (cur_stmt);
2677 if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2678 return false;
2680 align = get_object_alignment (src);
2682 /* Move cur_stmt just before one of the original loads
2683 to ensure it has the same VUSE. See PR61517 for what could
2684 go wrong. */
2685 if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2686 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2687 gsi_move_before (&gsi, &gsi_ins);
2688 gsi = gsi_for_stmt (cur_stmt);
2690 /* Compute address to load from and cast according to the size
2691 of the load. */
2692 addr_expr = build_fold_addr_expr (unshare_expr (src));
2693 if (is_gimple_mem_ref_addr (addr_expr))
2694 addr_tmp = addr_expr;
2695 else
2697 addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2698 "load_src");
2699 addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2700 gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2703 /* Perform the load. */
2704 aligned_load_type = load_type;
2705 if (align < TYPE_ALIGN (load_type))
2706 aligned_load_type = build_aligned_type (load_type, align);
2707 load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2708 val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2709 load_offset_ptr);
2711 if (!bswap)
2713 if (n->range == 16)
2714 nop_stats.found_16bit++;
2715 else if (n->range == 32)
2716 nop_stats.found_32bit++;
2717 else
2719 gcc_assert (n->range == 64);
2720 nop_stats.found_64bit++;
2723 /* Convert the result of load if necessary. */
2724 if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2726 val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2727 "load_dst");
2728 load_stmt = gimple_build_assign (val_tmp, val_expr);
2729 gimple_set_vuse (load_stmt, n->vuse);
2730 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2731 gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2733 else
2735 gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2736 gimple_set_vuse (cur_stmt, n->vuse);
2738 update_stmt (cur_stmt);
2740 if (dump_file)
2742 fprintf (dump_file,
2743 "%d bit load in target endianness found at: ",
2744 (int) n->range);
2745 print_gimple_stmt (dump_file, cur_stmt, 0);
2747 return true;
2749 else
2751 val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2752 load_stmt = gimple_build_assign (val_tmp, val_expr);
2753 gimple_set_vuse (load_stmt, n->vuse);
2754 gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2756 src = val_tmp;
2758 else if (!bswap)
2760 gimple *g;
2761 if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src)))
2763 if (!is_gimple_val (src))
2764 return false;
2765 g = gimple_build_assign (tgt, NOP_EXPR, src);
2767 else
2768 g = gimple_build_assign (tgt, src);
2769 if (n->range == 16)
2770 nop_stats.found_16bit++;
2771 else if (n->range == 32)
2772 nop_stats.found_32bit++;
2773 else
2775 gcc_assert (n->range == 64);
2776 nop_stats.found_64bit++;
2778 if (dump_file)
2780 fprintf (dump_file,
2781 "%d bit reshuffle in target endianness found at: ",
2782 (int) n->range);
2783 print_gimple_stmt (dump_file, cur_stmt, 0);
2785 gsi_replace (&gsi, g, true);
2786 return true;
2788 else if (TREE_CODE (src) == BIT_FIELD_REF)
2789 src = TREE_OPERAND (src, 0);
2791 if (n->range == 16)
2792 bswap_stats.found_16bit++;
2793 else if (n->range == 32)
2794 bswap_stats.found_32bit++;
2795 else
2797 gcc_assert (n->range == 64);
2798 bswap_stats.found_64bit++;
2801 tmp = src;
2803 /* Convert the src expression if necessary. */
2804 if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2806 gimple *convert_stmt;
2808 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2809 convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2810 gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2813 /* Canonical form for 16 bit bswap is a rotate expression. Only 16bit values
2814 are considered, as rotation of 2N bit values by N bits is generally not
2815 equivalent to a bswap. Consider for instance 0x01020304 r>> 16 which
2816 gives 0x03040102 while a bswap for that value is 0x04030201. */
2817 if (bswap && n->range == 16)
2819 tree count = build_int_cst (NULL, BITS_PER_UNIT);
2820 src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2821 bswap_stmt = gimple_build_assign (NULL, src);
2823 else
2824 bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2826 tmp = tgt;
2828 /* Convert the result if necessary. */
2829 if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2831 gimple *convert_stmt;
2833 tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2834 convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2835 gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2838 gimple_set_lhs (bswap_stmt, tmp);
2840 if (dump_file)
2842 fprintf (dump_file, "%d bit bswap implementation found at: ",
2843 (int) n->range);
2844 print_gimple_stmt (dump_file, cur_stmt, 0);
2847 gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2848 gsi_remove (&gsi, true);
2849 return true;
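/* Illustrative before/after for bswap_replace above (hypothetical names,
   sketch only): once the pass proves that the rhs of CUR_STMT computes a
   32-bit byte swap of some value x, e.g.

     y = ((x >> 24) & 0xff) | ((x >> 8) & 0xff00)
         | ((x << 8) & 0xff0000) | ((x << 24) & 0xff000000);

   the statement computing y is replaced by a single call

     y = __builtin_bswap32 (x);

   A 16-bit swap is instead emitted as an 8-bit rotate (the canonical
   form noted above), and a detected native-endian read of a memory
   source becomes a single MEM_REF load.  */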
2852 /* Find manual byte swap implementations as well as loads in a given
2853 endianness. Byte swaps are turned into a bswap builtin invocation,
2854 while endian loads are converted to a bswap builtin invocation or a
2855 simple load according to the target endianness. */
2857 unsigned int
2858 pass_optimize_bswap::execute (function *fun)
2860 basic_block bb;
2861 bool bswap32_p, bswap64_p;
2862 bool changed = false;
2863 tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2865 if (BITS_PER_UNIT != 8)
2866 return 0;
2868 bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2869 && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2870 bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2871 && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2872 || (bswap32_p && word_mode == SImode)));
2874 /* Determine the argument type of the builtins. The code later on
2875 assumes that the return and argument type are the same. */
2876 if (bswap32_p)
2878 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2879 bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2882 if (bswap64_p)
2884 tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2885 bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2888 memset (&nop_stats, 0, sizeof (nop_stats));
2889 memset (&bswap_stats, 0, sizeof (bswap_stats));
2890 calculate_dominance_info (CDI_DOMINATORS);
2892 FOR_EACH_BB_FN (bb, fun)
2894 gimple_stmt_iterator gsi;
2896 /* We do a reverse scan for bswap patterns to make sure we get the
2897 widest match. As bswap pattern matching doesn't handle previously
2898 inserted smaller bswap replacements as sub-patterns, the wider
2899 variant wouldn't be detected. */
2900 for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2902 gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2903 tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2904 enum tree_code code;
2905 struct symbolic_number n;
2906 bool bswap;
2908 /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2909 might be moved to a different basic block by bswap_replace and gsi
2910 must not point to it if that's the case. Moving the gsi_prev
2911 there makes sure that gsi points to the statement previous to
2912 cur_stmt while still making sure that all statements are
2913 considered in this basic block. */
2914 gsi_prev (&gsi);
2916 if (!is_gimple_assign (cur_stmt))
2917 continue;
2919 code = gimple_assign_rhs_code (cur_stmt);
2920 switch (code)
2922 case LROTATE_EXPR:
2923 case RROTATE_EXPR:
2924 if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2925 || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2926 % BITS_PER_UNIT)
2927 continue;
2928 /* Fall through. */
2929 case BIT_IOR_EXPR:
2930 break;
2931 default:
2932 continue;
2935 ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2937 if (!ins_stmt)
2938 continue;
2940 switch (n.range)
2942 case 16:
2943 /* Already in canonical form, nothing to do. */
2944 if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2945 continue;
2946 load_type = bswap_type = uint16_type_node;
2947 break;
2948 case 32:
2949 load_type = uint32_type_node;
2950 if (bswap32_p)
2952 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2953 bswap_type = bswap32_type;
2955 break;
2956 case 64:
2957 load_type = uint64_type_node;
2958 if (bswap64_p)
2960 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2961 bswap_type = bswap64_type;
2963 break;
2964 default:
2965 continue;
2968 if (bswap && !fndecl && n.range != 16)
2969 continue;
2971 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2972 &n, bswap))
2973 changed = true;
2977 statistics_counter_event (fun, "16-bit nop implementations found",
2978 nop_stats.found_16bit);
2979 statistics_counter_event (fun, "32-bit nop implementations found",
2980 nop_stats.found_32bit);
2981 statistics_counter_event (fun, "64-bit nop implementations found",
2982 nop_stats.found_64bit);
2983 statistics_counter_event (fun, "16-bit bswap implementations found",
2984 bswap_stats.found_16bit);
2985 statistics_counter_event (fun, "32-bit bswap implementations found",
2986 bswap_stats.found_32bit);
2987 statistics_counter_event (fun, "64-bit bswap implementations found",
2988 bswap_stats.found_64bit);
2990 return (changed ? TODO_update_ssa : 0);
2993 } // anon namespace
2995 gimple_opt_pass *
2996 make_pass_optimize_bswap (gcc::context *ctxt)
2998 return new pass_optimize_bswap (ctxt);
3001 /* Return true if STMT is a type conversion operation that can be stripped
3002 when used in a widening multiply operation. */
3003 static bool
3004 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
3006 enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
3008 if (TREE_CODE (result_type) == INTEGER_TYPE)
3010 tree op_type;
3011 tree inner_op_type;
3013 if (!CONVERT_EXPR_CODE_P (rhs_code))
3014 return false;
3016 op_type = TREE_TYPE (gimple_assign_lhs (stmt));
3018 /* If the type of OP has the same precision as the result, then
3019 we can strip this conversion. The multiply operation will be
3020 selected to create the correct extension as a by-product. */
3021 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
3022 return true;
3024 /* We can also strip a conversion if it preserves the signed-ness of
3025 the operation and doesn't narrow the range. */
3026 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
3028 /* If the inner-most type is unsigned, then we can strip any
3029 intermediate widening operation. If it's signed, then the
3030 intermediate widening operation must also be signed. */
3031 if ((TYPE_UNSIGNED (inner_op_type)
3032 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
3033 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
3034 return true;
3036 return false;
3039 return rhs_code == FIXED_CONVERT_EXPR;
3042 /* Return true if RHS is a suitable operand for a widening multiplication,
3043 assuming a target type of TYPE.
3044 There are two cases:
3046 - RHS makes some value at least twice as wide. Store that value
3047 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3049 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so,
3050 but leave *TYPE_OUT untouched. */
3052 static bool
3053 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3054 tree *new_rhs_out)
3056 gimple *stmt;
3057 tree type1, rhs1;
3059 if (TREE_CODE (rhs) == SSA_NAME)
3061 stmt = SSA_NAME_DEF_STMT (rhs);
3062 if (is_gimple_assign (stmt))
3064 if (! widening_mult_conversion_strippable_p (type, stmt))
3065 rhs1 = rhs;
3066 else
3068 rhs1 = gimple_assign_rhs1 (stmt);
3070 if (TREE_CODE (rhs1) == INTEGER_CST)
3072 *new_rhs_out = rhs1;
3073 *type_out = NULL;
3074 return true;
3078 else
3079 rhs1 = rhs;
3081 type1 = TREE_TYPE (rhs1);
3083 if (TREE_CODE (type1) != TREE_CODE (type)
3084 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3085 return false;
3087 *new_rhs_out = rhs1;
3088 *type_out = type1;
3089 return true;
3092 if (TREE_CODE (rhs) == INTEGER_CST)
3094 *new_rhs_out = rhs;
3095 *type_out = NULL;
3096 return true;
3099 return false;
3102 /* Return true if STMT performs a widening multiplication, assuming the
3103 output type is TYPE. If so, store the unwidened types of the operands
3104 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and
3105 *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3106 and *TYPE2_OUT would give the operands of the multiplication. */
3108 static bool
3109 is_widening_mult_p (gimple *stmt,
3110 tree *type1_out, tree *rhs1_out,
3111 tree *type2_out, tree *rhs2_out)
3113 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3115 if (TREE_CODE (type) != INTEGER_TYPE
3116 && TREE_CODE (type) != FIXED_POINT_TYPE)
3117 return false;
3119 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3120 rhs1_out))
3121 return false;
3123 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3124 rhs2_out))
3125 return false;
3127 if (*type1_out == NULL)
3129 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3130 return false;
3131 *type1_out = *type2_out;
3134 if (*type2_out == NULL)
3136 if (!int_fits_type_p (*rhs2_out, *type1_out))
3137 return false;
3138 *type2_out = *type1_out;
3141 /* Ensure that the larger of the two operands comes first. */
3142 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3144 std::swap (*type1_out, *type2_out);
3145 std::swap (*rhs1_out, *rhs2_out);
3148 return true;
3151 /* Check to see if the CALL statement is an invocation of copysign
3152 with 1. being the first argument. */
3153 static bool
3154 is_copysign_call_with_1 (gimple *call)
3156 gcall *c = dyn_cast <gcall *> (call);
3157 if (! c)
3158 return false;
3160 enum combined_fn code = gimple_call_combined_fn (c);
3162 if (code == CFN_LAST)
3163 return false;
3165 if (builtin_fn_p (code))
3167 switch (as_builtin_fn (code))
3169 CASE_FLT_FN (BUILT_IN_COPYSIGN):
3170 CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
3171 return real_onep (gimple_call_arg (c, 0));
3172 default:
3173 return false;
3177 if (internal_fn_p (code))
3179 switch (as_internal_fn (code))
3181 case IFN_COPYSIGN:
3182 return real_onep (gimple_call_arg (c, 0));
3183 default:
3184 return false;
3188 return false;
3191 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
3192 This only happens when the xorsign optab is defined; if the
3193 pattern is not an xorsign pattern or if expansion fails, FALSE is
3194 returned, otherwise TRUE is returned. */
3195 static bool
3196 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
3198 tree treeop0, treeop1, lhs, type;
3199 location_t loc = gimple_location (stmt);
3200 lhs = gimple_assign_lhs (stmt);
3201 treeop0 = gimple_assign_rhs1 (stmt);
3202 treeop1 = gimple_assign_rhs2 (stmt);
3203 type = TREE_TYPE (lhs);
3204 machine_mode mode = TYPE_MODE (type);
3206 if (HONOR_SNANS (type))
3207 return false;
3209 if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
3211 gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
3212 if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
3214 call0 = SSA_NAME_DEF_STMT (treeop1);
3215 if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
3216 return false;
3218 treeop1 = treeop0;
3220 if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
3221 return false;
3223 gcall *c = as_a<gcall*> (call0);
3224 treeop0 = gimple_call_arg (c, 1);
3226 gcall *call_stmt
3227 = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
3228 gimple_set_lhs (call_stmt, lhs);
3229 gimple_set_location (call_stmt, loc);
3230 gsi_replace (gsi, call_stmt, true);
3231 return true;
3234 return false;
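/* Illustrative example for convert_expand_mult_copysign above
   (hypothetical source): when the target provides the xorsign optab,

     z = x * __builtin_copysign (1.0, y);

   is replaced by an internal-function call

     z = .XORSIGN (x, y);

   which can typically be expanded as a sign-bit XOR instead of a
   multiply.  */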
3237 /* Process a single gimple statement STMT, which has a MULT_EXPR as
3238 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return
3239 value is true iff we converted the statement. */
3241 static bool
3242 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3244 tree lhs, rhs1, rhs2, type, type1, type2;
3245 enum insn_code handler;
3246 machine_mode to_mode, from_mode, actual_mode;
3247 optab op;
3248 int actual_precision;
3249 location_t loc = gimple_location (stmt);
3250 bool from_unsigned1, from_unsigned2;
3252 lhs = gimple_assign_lhs (stmt);
3253 type = TREE_TYPE (lhs);
3254 if (TREE_CODE (type) != INTEGER_TYPE)
3255 return false;
3257 if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3258 return false;
3260 to_mode = SCALAR_INT_TYPE_MODE (type);
3261 from_mode = SCALAR_INT_TYPE_MODE (type1);
3262 from_unsigned1 = TYPE_UNSIGNED (type1);
3263 from_unsigned2 = TYPE_UNSIGNED (type2);
3265 if (from_unsigned1 && from_unsigned2)
3266 op = umul_widen_optab;
3267 else if (!from_unsigned1 && !from_unsigned2)
3268 op = smul_widen_optab;
3269 else
3270 op = usmul_widen_optab;
3272 handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3273 0, &actual_mode);
3275 if (handler == CODE_FOR_nothing)
3277 if (op != smul_widen_optab)
3279 /* We can use a signed multiply with unsigned types as long as
3280 there is a wider mode to use, or it is the smaller of the two
3281 types that is unsigned. Note that type1 >= type2, always. */
3282 if ((TYPE_UNSIGNED (type1)
3283 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3284 || (TYPE_UNSIGNED (type2)
3285 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3287 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3288 || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3289 return false;
3292 op = smul_widen_optab;
3293 handler = find_widening_optab_handler_and_mode (op, to_mode,
3294 from_mode, 0,
3295 &actual_mode);
3297 if (handler == CODE_FOR_nothing)
3298 return false;
3300 from_unsigned1 = from_unsigned2 = false;
3302 else
3303 return false;
3306 /* Ensure that the inputs to the handler are in the correct precision
3307 for the opcode. This will be the full mode size. */
3308 actual_precision = GET_MODE_PRECISION (actual_mode);
3309 if (2 * actual_precision > TYPE_PRECISION (type))
3310 return false;
3311 if (actual_precision != TYPE_PRECISION (type1)
3312 || from_unsigned1 != TYPE_UNSIGNED (type1))
3313 rhs1 = build_and_insert_cast (gsi, loc,
3314 build_nonstandard_integer_type
3315 (actual_precision, from_unsigned1), rhs1);
3316 if (actual_precision != TYPE_PRECISION (type2)
3317 || from_unsigned2 != TYPE_UNSIGNED (type2))
3318 rhs2 = build_and_insert_cast (gsi, loc,
3319 build_nonstandard_integer_type
3320 (actual_precision, from_unsigned2), rhs2);
3322 /* Handle constants. */
3323 if (TREE_CODE (rhs1) == INTEGER_CST)
3324 rhs1 = fold_convert (type1, rhs1);
3325 if (TREE_CODE (rhs2) == INTEGER_CST)
3326 rhs2 = fold_convert (type2, rhs2);
3328 gimple_assign_set_rhs1 (stmt, rhs1);
3329 gimple_assign_set_rhs2 (stmt, rhs2);
3330 gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3331 update_stmt (stmt);
3332 widen_mul_stats.widen_mults_inserted++;
3333 return true;
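/* Illustrative example for convert_mult_to_widen above (hypothetical
   source, assuming the target has a 32x32->64 widening multiply):

     int a, b;
     long long r = (long long) a * (long long) b;

   The two conversions are stripped and the MULT_EXPR is rewritten as

     r = a w* b;   i.e. WIDEN_MULT_EXPR <a, b>

   so a single widening multiply instruction can be selected instead of
   two extensions plus a full 64-bit multiply.  */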
3336 /* Process a single gimple statement STMT, which is found at the
3337 iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
3338 rhs (given by CODE), and try to convert it into a
3339 WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR. The return value
3340 is true iff we converted the statement. */
3342 static bool
3343 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3344 enum tree_code code)
3346 gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3347 gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3348 tree type, type1, type2, optype;
3349 tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3350 enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3351 optab this_optab;
3352 enum tree_code wmult_code;
3353 enum insn_code handler;
3354 scalar_mode to_mode, from_mode;
3355 machine_mode actual_mode;
3356 location_t loc = gimple_location (stmt);
3357 int actual_precision;
3358 bool from_unsigned1, from_unsigned2;
3360 lhs = gimple_assign_lhs (stmt);
3361 type = TREE_TYPE (lhs);
3362 if (TREE_CODE (type) != INTEGER_TYPE
3363 && TREE_CODE (type) != FIXED_POINT_TYPE)
3364 return false;
3366 if (code == MINUS_EXPR)
3367 wmult_code = WIDEN_MULT_MINUS_EXPR;
3368 else
3369 wmult_code = WIDEN_MULT_PLUS_EXPR;
3371 rhs1 = gimple_assign_rhs1 (stmt);
3372 rhs2 = gimple_assign_rhs2 (stmt);
3374 if (TREE_CODE (rhs1) == SSA_NAME)
3376 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3377 if (is_gimple_assign (rhs1_stmt))
3378 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3381 if (TREE_CODE (rhs2) == SSA_NAME)
3383 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3384 if (is_gimple_assign (rhs2_stmt))
3385 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3388 /* Allow for one conversion statement between the multiply
3389 and addition/subtraction statement. If there is more than
3390 one conversion then we assume they would invalidate this
3391 transformation. If that's not the case then they should have
3392 been folded before now. */
3393 if (CONVERT_EXPR_CODE_P (rhs1_code))
3395 conv1_stmt = rhs1_stmt;
3396 rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3397 if (TREE_CODE (rhs1) == SSA_NAME)
3399 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3400 if (is_gimple_assign (rhs1_stmt))
3401 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3403 else
3404 return false;
3406 if (CONVERT_EXPR_CODE_P (rhs2_code))
3408 conv2_stmt = rhs2_stmt;
3409 rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3410 if (TREE_CODE (rhs2) == SSA_NAME)
3412 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3413 if (is_gimple_assign (rhs2_stmt))
3414 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3416 else
3417 return false;
3420 /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3421 is_widening_mult_p, but we still need the returned rhs values.
3423 It might also appear that it would be sufficient to use the existing
3424 operands of the widening multiply, but that would limit the choice of
3425 multiply-and-accumulate instructions.
3427 If the widened-multiplication result has more than one use, it is
3428 probably wiser not to do the conversion. */
3429 if (code == PLUS_EXPR
3430 && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3432 if (!has_single_use (rhs1)
3433 || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3434 &type2, &mult_rhs2))
3435 return false;
3436 add_rhs = rhs2;
3437 conv_stmt = conv1_stmt;
3439 else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3441 if (!has_single_use (rhs2)
3442 || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3443 &type2, &mult_rhs2))
3444 return false;
3445 add_rhs = rhs1;
3446 conv_stmt = conv2_stmt;
3448 else
3449 return false;
3451 to_mode = SCALAR_TYPE_MODE (type);
3452 from_mode = SCALAR_TYPE_MODE (type1);
3453 from_unsigned1 = TYPE_UNSIGNED (type1);
3454 from_unsigned2 = TYPE_UNSIGNED (type2);
3455 optype = type1;
3457 /* There's no such thing as a mixed sign madd yet, so use a wider mode. */
3458 if (from_unsigned1 != from_unsigned2)
3460 if (!INTEGRAL_TYPE_P (type))
3461 return false;
3462 /* We can use a signed multiply with unsigned types as long as
3463 there is a wider mode to use, or it is the smaller of the two
3464 types that is unsigned. Note that type1 >= type2, always. */
3465 if ((from_unsigned1
3466 && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3467 || (from_unsigned2
3468 && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3470 if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
3471 || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3472 return false;
3475 from_unsigned1 = from_unsigned2 = false;
3476 optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3477 false);
3480 /* If there was a conversion between the multiply and addition
3481 then we need to make sure it fits a multiply-and-accumulate.
3482 This should be a single mode change which does not change the
3483 value. */
3484 if (conv_stmt)
3486 /* We use the original, unmodified data types for this. */
3487 tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3488 tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3489 int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3490 bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3492 if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3494 /* Conversion is a truncate. */
3495 if (TYPE_PRECISION (to_type) < data_size)
3496 return false;
3498 else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3500 /* Conversion is an extend. Check it's the right sort. */
3501 if (TYPE_UNSIGNED (from_type) != is_unsigned
3502 && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3503 return false;
3505 /* else convert is a no-op for our purposes. */
3508 /* Verify that the machine can perform a widening multiply
3509 accumulate in this mode/signedness combination, otherwise
3510 this transformation is likely to pessimize code. */
3511 this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3512 handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3513 from_mode, 0, &actual_mode);
3515 if (handler == CODE_FOR_nothing)
3516 return false;
3518 /* Ensure that the inputs to the handler are in the correct precision
3519 for the opcode. This will be the full mode size. */
3520 actual_precision = GET_MODE_PRECISION (actual_mode);
3521 if (actual_precision != TYPE_PRECISION (type1)
3522 || from_unsigned1 != TYPE_UNSIGNED (type1))
3523 mult_rhs1 = build_and_insert_cast (gsi, loc,
3524 build_nonstandard_integer_type
3525 (actual_precision, from_unsigned1),
3526 mult_rhs1);
3527 if (actual_precision != TYPE_PRECISION (type2)
3528 || from_unsigned2 != TYPE_UNSIGNED (type2))
3529 mult_rhs2 = build_and_insert_cast (gsi, loc,
3530 build_nonstandard_integer_type
3531 (actual_precision, from_unsigned2),
3532 mult_rhs2);
3534 if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3535 add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3537 /* Handle constants. */
3538 if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3539 mult_rhs1 = fold_convert (type1, mult_rhs1);
3540 if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3541 mult_rhs2 = fold_convert (type2, mult_rhs2);
3543 gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3544 add_rhs);
3545 update_stmt (gsi_stmt (*gsi));
3546 widen_mul_stats.maccs_inserted++;
3547 return true;
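/* Illustrative example for convert_plusminus_to_widen above
   (hypothetical source, assuming a widening multiply-accumulate pattern
   exists for the mode):

     int a, b;
     long long acc;
     acc = acc + (long long) a * (long long) b;

   becomes roughly

     acc = WIDEN_MULT_PLUS_EXPR <a, b, acc>;

   and the MINUS_EXPR variant similarly becomes
   WIDEN_MULT_MINUS_EXPR.  */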
3550 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3551 with uses in additions and subtractions to form fused multiply-add
3552 operations. Returns true if successful and MUL_STMT should be removed. */
3554 static bool
3555 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3557 tree mul_result = gimple_get_lhs (mul_stmt);
3558 tree type = TREE_TYPE (mul_result);
3559 gimple *use_stmt, *neguse_stmt;
3560 gassign *fma_stmt;
3561 use_operand_p use_p;
3562 imm_use_iterator imm_iter;
3564 if (FLOAT_TYPE_P (type)
3565 && flag_fp_contract_mode == FP_CONTRACT_OFF)
3566 return false;
3568 /* We don't want to do bitfield reduction ops. */
3569 if (INTEGRAL_TYPE_P (type)
3570 && !type_has_mode_precision_p (type))
3571 return false;
3573 /* If the target doesn't support it, don't generate it. We assume that
3574 if fma isn't available then fms, fnma or fnms are not either. */
3575 if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3576 return false;
3578 /* If the multiplication has zero uses, it is kept around probably because
3579 of -fnon-call-exceptions. Don't optimize it away in that case;
3580 that is DCE's job. */
3581 if (has_zero_uses (mul_result))
3582 return false;
3584 /* Make sure that the multiplication statement becomes dead after
3585 the transformation, i.e. that all uses are transformed to FMAs.
3586 This means we assume that an FMA operation has the same cost
3587 as an addition. */
3588 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3590 enum tree_code use_code;
3591 tree result = mul_result;
3592 bool negate_p = false;
3594 use_stmt = USE_STMT (use_p);
3596 if (is_gimple_debug (use_stmt))
3597 continue;
3599 /* For now restrict this operation to single basic blocks. In theory
3600 we would want to support sinking the multiplication in
3601 m = a*b;
3602 if ()
3603 ma = m + c;
3604 else
3605 d = m;
3606 to form a fma in the then block and sink the multiplication to the
3607 else block. */
3608 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3609 return false;
3611 if (!is_gimple_assign (use_stmt))
3612 return false;
3614 use_code = gimple_assign_rhs_code (use_stmt);
3616 /* A negate on the multiplication leads to FNMA. */
3617 if (use_code == NEGATE_EXPR)
3619 ssa_op_iter iter;
3620 use_operand_p usep;
3622 result = gimple_assign_lhs (use_stmt);
3624 /* Make sure the negate statement becomes dead with this
3625 single transformation. */
3626 if (!single_imm_use (gimple_assign_lhs (use_stmt),
3627 &use_p, &neguse_stmt))
3628 return false;
3630 /* Make sure the multiplication isn't also used on that stmt. */
3631 FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3632 if (USE_FROM_PTR (usep) == mul_result)
3633 return false;
3635 /* Re-validate. */
3636 use_stmt = neguse_stmt;
3637 if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3638 return false;
3639 if (!is_gimple_assign (use_stmt))
3640 return false;
3642 use_code = gimple_assign_rhs_code (use_stmt);
3643 negate_p = true;
3646 switch (use_code)
3648 case MINUS_EXPR:
3649 if (gimple_assign_rhs2 (use_stmt) == result)
3650 negate_p = !negate_p;
3651 break;
3652 case PLUS_EXPR:
3653 break;
3654 default:
3655 /* FMA can only be formed from PLUS and MINUS. */
3656 return false;
3659 /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3660 by a MULT_EXPR that we'll visit later, we might be able to
3661 get a more profitable match with fnma.
3662 OTOH, if we don't, a negate / fma pair has likely lower latency
3663 than a mult / subtract pair. */
3664 if (use_code == MINUS_EXPR && !negate_p
3665 && gimple_assign_rhs1 (use_stmt) == result
3666 && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3667 && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3669 tree rhs2 = gimple_assign_rhs2 (use_stmt);
3671 if (TREE_CODE (rhs2) == SSA_NAME)
3673 gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3674 if (has_single_use (rhs2)
3675 && is_gimple_assign (stmt2)
3676 && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3677 return false;
3681 /* We can't handle a * b + a * b. */
3682 if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3683 return false;
3685 /* While it is possible to validate whether or not the exact form
3686 that we've recognized is available in the backend, the assumption
3687 is that the transformation is never a loss. For instance, suppose
3688 the target only has the plain FMA pattern available. Consider
3689 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3690 is still two operations. Consider -(a*b)-c -> fma(-a,b,-c): we
3691 still have 3 operations, but in the FMA form the two NEGs are
3692 independent and could be run in parallel. */
3695 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3697 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3698 enum tree_code use_code;
3699 tree addop, mulop1 = op1, result = mul_result;
3700 bool negate_p = false;
3702 if (is_gimple_debug (use_stmt))
3703 continue;
3705 use_code = gimple_assign_rhs_code (use_stmt);
3706 if (use_code == NEGATE_EXPR)
3708 result = gimple_assign_lhs (use_stmt);
3709 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3710 gsi_remove (&gsi, true);
3711 release_defs (use_stmt);
3713 use_stmt = neguse_stmt;
3714 gsi = gsi_for_stmt (use_stmt);
3715 use_code = gimple_assign_rhs_code (use_stmt);
3716 negate_p = true;
3719 if (gimple_assign_rhs1 (use_stmt) == result)
3721 addop = gimple_assign_rhs2 (use_stmt);
3722 /* a * b - c -> a * b + (-c) */
3723 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3724 addop = force_gimple_operand_gsi (&gsi,
3725 build1 (NEGATE_EXPR,
3726 type, addop),
3727 true, NULL_TREE, true,
3728 GSI_SAME_STMT);
3730 else
3732 addop = gimple_assign_rhs1 (use_stmt);
3733 /* a - b * c -> (-b) * c + a */
3734 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3735 negate_p = !negate_p;
3738 if (negate_p)
3739 mulop1 = force_gimple_operand_gsi (&gsi,
3740 build1 (NEGATE_EXPR,
3741 type, mulop1),
3742 true, NULL_TREE, true,
3743 GSI_SAME_STMT);
3745 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3746 FMA_EXPR, mulop1, op2, addop);
3747 gsi_replace (&gsi, fma_stmt, true);
3748 widen_mul_stats.fmas_inserted++;
3751 return true;
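/* Illustrative example for convert_mult_to_fma above (hypothetical
   GIMPLE sketch, SSA numbering invented, assuming FP contraction is
   allowed and an fma optab exists): with

     _3 = a_1 * b_2;
     x_4 = _3 + c_5;
     y_6 = _3 - d_7;

   both uses of the multiplication are rewritten, e.g.

     x_4 = FMA_EXPR <a_1, b_2, c_5>;
     _8 = -d_7;
     y_6 = FMA_EXPR <a_1, b_2, _8>;

   and the now-dead multiplication is removed by the caller.  */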
3755 /* Helper function of match_uaddsub_overflow. Return 1
3756 if USE_STMT is an unsigned overflow check ovf != 0 for
3757 STMT, -1 if USE_STMT is an unsigned overflow check ovf == 0,
3758 and 0 otherwise. */
3760 static int
3761 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3763 enum tree_code ccode = ERROR_MARK;
3764 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3765 if (gimple_code (use_stmt) == GIMPLE_COND)
3767 ccode = gimple_cond_code (use_stmt);
3768 crhs1 = gimple_cond_lhs (use_stmt);
3769 crhs2 = gimple_cond_rhs (use_stmt);
3771 else if (is_gimple_assign (use_stmt))
3773 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3775 ccode = gimple_assign_rhs_code (use_stmt);
3776 crhs1 = gimple_assign_rhs1 (use_stmt);
3777 crhs2 = gimple_assign_rhs2 (use_stmt);
3779 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3781 tree cond = gimple_assign_rhs1 (use_stmt);
3782 if (COMPARISON_CLASS_P (cond))
3784 ccode = TREE_CODE (cond);
3785 crhs1 = TREE_OPERAND (cond, 0);
3786 crhs2 = TREE_OPERAND (cond, 1);
3788 else
3789 return 0;
3791 else
3792 return 0;
3794 else
3795 return 0;
3797 if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3798 return 0;
3800 enum tree_code code = gimple_assign_rhs_code (stmt);
3801 tree lhs = gimple_assign_lhs (stmt);
3802 tree rhs1 = gimple_assign_rhs1 (stmt);
3803 tree rhs2 = gimple_assign_rhs2 (stmt);
3805 switch (ccode)
3807 case GT_EXPR:
3808 case LE_EXPR:
3809 /* r = a - b; r > a or r <= a
3810 r = a + b; a > r or a <= r or b > r or b <= r. */
3811 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3812 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3813 && crhs2 == lhs))
3814 return ccode == GT_EXPR ? 1 : -1;
3815 break;
3816 case LT_EXPR:
3817 case GE_EXPR:
3818 /* r = a - b; a < r or a >= r
3819 r = a + b; r < a or r >= a or r < b or r >= b. */
3820 if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3821 || (code == PLUS_EXPR && crhs1 == lhs
3822 && (crhs2 == rhs1 || crhs2 == rhs2)))
3823 return ccode == LT_EXPR ? 1 : -1;
3824 break;
3825 default:
3826 break;
3828 return 0;
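/* As an illustrative example (a sketch, not from any testcase), the
   classic C idiom for detecting unsigned wrap-around

     unsigned r = a - b;
     if (r > a)
       overflow ();

   corresponds to the GT_EXPR case above: code == MINUS_EXPR,
   crhs1 == r and crhs2 == a, so the function returns 1.  The inverted
   test "if (r <= a)" returns -1, and unrelated comparisons return 0.  */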
3831 /* Recognize for unsigned x
3832 x = y - z;
3833 if (x > y)
3834 where there are other uses of x and replace it with
3835 _7 = SUB_OVERFLOW (y, z);
3836 x = REALPART_EXPR <_7>;
3837 _8 = IMAGPART_EXPR <_7>;
3838 if (_8)
3839 and similarly for addition. */
3841 static bool
3842 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3843 enum tree_code code)
3845 tree lhs = gimple_assign_lhs (stmt);
3846 tree type = TREE_TYPE (lhs);
3847 use_operand_p use_p;
3848 imm_use_iterator iter;
3849 bool use_seen = false;
3850 bool ovf_use_seen = false;
3851 gimple *use_stmt;
3853 gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3854 if (!INTEGRAL_TYPE_P (type)
3855 || !TYPE_UNSIGNED (type)
3856 || has_zero_uses (lhs)
3857 || has_single_use (lhs)
3858 || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3859 TYPE_MODE (type)) == CODE_FOR_nothing)
3860 return false;
3862 FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3864 use_stmt = USE_STMT (use_p);
3865 if (is_gimple_debug (use_stmt))
3866 continue;
3868 if (uaddsub_overflow_check_p (stmt, use_stmt))
3869 ovf_use_seen = true;
3870 else
3871 use_seen = true;
3872 if (ovf_use_seen && use_seen)
3873 break;
3876 if (!ovf_use_seen || !use_seen)
3877 return false;
3879 tree ctype = build_complex_type (type);
3880 tree rhs1 = gimple_assign_rhs1 (stmt);
3881 tree rhs2 = gimple_assign_rhs2 (stmt);
3882 gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3883 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3884 2, rhs1, rhs2);
3885 tree ctmp = make_ssa_name (ctype);
3886 gimple_call_set_lhs (g, ctmp);
3887 gsi_insert_before (gsi, g, GSI_SAME_STMT);
3888 gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3889 build1 (REALPART_EXPR, type, ctmp));
3890 gsi_replace (gsi, g2, true);
3891 tree ovf = make_ssa_name (type);
3892 g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3893 build1 (IMAGPART_EXPR, type, ctmp));
3894 gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3896 FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3898 if (is_gimple_debug (use_stmt))
3899 continue;
3901 int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3902 if (ovf_use == 0)
3903 continue;
3904 if (gimple_code (use_stmt) == GIMPLE_COND)
3906 gcond *cond_stmt = as_a <gcond *> (use_stmt);
3907 gimple_cond_set_lhs (cond_stmt, ovf);
3908 gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3909 gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3911 else
3913 gcc_checking_assert (is_gimple_assign (use_stmt));
3914 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3916 gimple_assign_set_rhs1 (use_stmt, ovf);
3917 gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3918 gimple_assign_set_rhs_code (use_stmt,
3919 ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3921 else
3923 gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3924 == COND_EXPR);
3925 tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3926 boolean_type_node, ovf,
3927 build_int_cst (type, 0));
3928 gimple_assign_set_rhs1 (use_stmt, cond);
3931 update_stmt (use_stmt);
3933 return true;
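/* A user-level example that can reach the transform above (a sketch;
   it assumes a target providing the uaddv4/usubv4 patterns, e.g. one
   with flag-setting add/sub instructions):

     unsigned sub (unsigned y, unsigned z, unsigned *res)
     {
       unsigned x = y - z;
       *res = x;               /* ordinary use of x  */
       return x > y;           /* recognized overflow check  */
     }

   Because x has both an overflow-check use and an ordinary use, the
   subtraction is replaced by IFN_SUB_OVERFLOW and the comparison by a
   test of the IMAGPART_EXPR of its result.  */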
3936 /* Return true if the target has support for divmod. */
3938 static bool
3939 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3941 /* If target supports hardware divmod insn, use it for divmod. */
3942 if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3943 return true;
3945 /* Check if libfunc for divmod is available. */
3946 rtx libfunc = optab_libfunc (divmod_optab, mode);
3947 if (libfunc != NULL_RTX)
3949 /* If optab_handler exists for div_optab, perhaps in a wider mode,
3950 we don't want to use the libfunc even if it exists for the given mode. */
3951 machine_mode div_mode;
3952 FOR_EACH_MODE_FROM (div_mode, mode)
3953 if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3954 return false;
3956 return targetm.expand_divmod_libfunc != NULL;
3959 return false;
3962 /* Check if stmt is a candidate for the divmod transform. */
3964 static bool
3965 divmod_candidate_p (gassign *stmt)
3967 tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3968 machine_mode mode = TYPE_MODE (type);
3969 optab divmod_optab, div_optab;
3971 if (TYPE_UNSIGNED (type))
3973 divmod_optab = udivmod_optab;
3974 div_optab = udiv_optab;
3976 else
3978 divmod_optab = sdivmod_optab;
3979 div_optab = sdiv_optab;
3982 tree op1 = gimple_assign_rhs1 (stmt);
3983 tree op2 = gimple_assign_rhs2 (stmt);
3985 /* Disable the transform if either operand is a constant, since
3986 division-by-constant may have specialized expansion. */
3987 if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3988 return false;
3990 /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3991 expand using the [su]divv optabs. */
3992 if (TYPE_OVERFLOW_TRAPS (type))
3993 return false;
3995 if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3996 return false;
3998 return true;
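/* For example (an illustrative sketch), a quotient/remainder pair on
   non-constant operands

     q = n / d;
     r = n % d;

   is a candidate here (provided the target passes the
   target_supports_divmod_p check), whereas "n / 16" together with
   "n % 16" is not, because division by a constant usually has a
   cheaper specialized expansion than a divmod call.  */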
4001 /* This function looks for:
4002 t1 = a TRUNC_DIV_EXPR b;
4003 t2 = a TRUNC_MOD_EXPR b;
4004 and transforms it to the following sequence:
4005 complex_tmp = DIVMOD (a, b);
4006 t1 = REALPART_EXPR(complex_tmp);
4007 t2 = IMAGPART_EXPR(complex_tmp);
4008 For conditions enabling the transform see divmod_candidate_p().
4010 The pass has three parts:
4011 1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
4012 other trunc_div_expr and trunc_mod_expr stmts.
4013 2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
4014 to stmts vector.
4015 3) Insert DIVMOD call just before top_stmt and update entries in
4016 stmts vector to use return value of DIVMOD (REALPART_EXPR for div,
4017 IMAGPART_EXPR for mod). */
4019 static bool
4020 convert_to_divmod (gassign *stmt)
4022 if (stmt_can_throw_internal (stmt)
4023 || !divmod_candidate_p (stmt))
4024 return false;
4026 tree op1 = gimple_assign_rhs1 (stmt);
4027 tree op2 = gimple_assign_rhs2 (stmt);
4029 imm_use_iterator use_iter;
4030 gimple *use_stmt;
4031 auto_vec<gimple *> stmts;
4033 gimple *top_stmt = stmt;
4034 basic_block top_bb = gimple_bb (stmt);
4036 /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
4037 at least stmt and possibly other trunc_div/trunc_mod stmts
4038 having the same operands as stmt. */
4040 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
4042 if (is_gimple_assign (use_stmt)
4043 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4044 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4045 && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
4046 && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
4048 if (stmt_can_throw_internal (use_stmt))
4049 continue;
4051 basic_block bb = gimple_bb (use_stmt);
4053 if (bb == top_bb)
4055 if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
4056 top_stmt = use_stmt;
4058 else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
4060 top_bb = bb;
4061 top_stmt = use_stmt;
4066 tree top_op1 = gimple_assign_rhs1 (top_stmt);
4067 tree top_op2 = gimple_assign_rhs2 (top_stmt);
4069 stmts.safe_push (top_stmt);
4070 bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
4072 /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
4073 to the stmts vector. The 2nd loop will always add stmt to the stmts
4074 vector, since gimple_bb (top_stmt) dominates gimple_bb (stmt), so the
4075 2nd loop ends up adding at least a single trunc_mod_expr stmt. */
4077 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
4079 if (is_gimple_assign (use_stmt)
4080 && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
4081 || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
4082 && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
4083 && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
4085 if (use_stmt == top_stmt
4086 || stmt_can_throw_internal (use_stmt)
4087 || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
4088 continue;
4090 stmts.safe_push (use_stmt);
4091 if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
4092 div_seen = true;
4096 if (!div_seen)
4097 return false;
4099 /* Part 3: Create a call to the internal fn DIVMOD:
4100 divmod_tmp = DIVMOD (op1, op2). */
4102 gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
4103 tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
4104 call_stmt, "divmod_tmp");
4105 gimple_call_set_lhs (call_stmt, res);
4106 /* We rejected throwing statements above. */
4107 gimple_call_set_nothrow (call_stmt, true);
4109 /* Insert the call before top_stmt. */
4110 gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
4111 gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
4113 widen_mul_stats.divmod_calls_inserted++;
4115 /* Update all statements in stmts vector:
4116 lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
4117 lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>. */
4119 for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4121 tree new_rhs;
4123 switch (gimple_assign_rhs_code (use_stmt))
4125 case TRUNC_DIV_EXPR:
4126 new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4127 break;
4129 case TRUNC_MOD_EXPR:
4130 new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4131 break;
4133 default:
4134 gcc_unreachable ();
4137 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4138 gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4139 update_stmt (use_stmt);
4142 return true;
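/* A complete user-level example of what the transform above targets
   (a sketch):

     void qr (unsigned n, unsigned d, unsigned *q, unsigned *r)
     {
       *q = n / d;
       *r = n % d;
     }

   When divmod_candidate_p accepts the statements, a single
   DIVMOD (n, d) internal call is inserted before the dominating
   statement and both assignments are rewritten to read the
   REALPART_EXPR and IMAGPART_EXPR of its complex result, so only one
   division (or one divmod libcall) is issued at expansion time.  */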
4145 /* Find integer multiplications where the operands are extended from
4146 smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4147 where appropriate. */
4149 namespace {
4151 const pass_data pass_data_optimize_widening_mul =
4153 GIMPLE_PASS, /* type */
4154 "widening_mul", /* name */
4155 OPTGROUP_NONE, /* optinfo_flags */
4156 TV_NONE, /* tv_id */
4157 PROP_ssa, /* properties_required */
4158 0, /* properties_provided */
4159 0, /* properties_destroyed */
4160 0, /* todo_flags_start */
4161 TODO_update_ssa, /* todo_flags_finish */
4164 class pass_optimize_widening_mul : public gimple_opt_pass
4166 public:
4167 pass_optimize_widening_mul (gcc::context *ctxt)
4168 : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4171 /* opt_pass methods: */
4172 virtual bool gate (function *)
4174 return flag_expensive_optimizations && optimize;
4177 virtual unsigned int execute (function *);
4179 }; // class pass_optimize_widening_mul
4181 unsigned int
4182 pass_optimize_widening_mul::execute (function *fun)
4184 basic_block bb;
4185 bool cfg_changed = false;
4187 memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4188 calculate_dominance_info (CDI_DOMINATORS);
4189 renumber_gimple_stmt_uids ();
4191 FOR_EACH_BB_FN (bb, fun)
4193 gimple_stmt_iterator gsi;
4195 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4197 gimple *stmt = gsi_stmt (gsi);
4198 enum tree_code code;
4200 if (is_gimple_assign (stmt))
4202 code = gimple_assign_rhs_code (stmt);
4203 switch (code)
4205 case MULT_EXPR:
4206 if (!convert_mult_to_widen (stmt, &gsi)
4207 && !convert_expand_mult_copysign (stmt, &gsi)
4208 && convert_mult_to_fma (stmt,
4209 gimple_assign_rhs1 (stmt),
4210 gimple_assign_rhs2 (stmt)))
4212 gsi_remove (&gsi, true);
4213 release_defs (stmt);
4214 continue;
4216 break;
4218 case PLUS_EXPR:
4219 case MINUS_EXPR:
4220 if (!convert_plusminus_to_widen (&gsi, stmt, code))
4221 match_uaddsub_overflow (&gsi, stmt, code);
4222 break;
4224 case TRUNC_MOD_EXPR:
4225 convert_to_divmod (as_a<gassign *> (stmt));
4226 break;
4228 default:;
4231 else if (is_gimple_call (stmt)
4232 && gimple_call_lhs (stmt))
4234 tree fndecl = gimple_call_fndecl (stmt);
4235 if (fndecl
4236 && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4238 switch (DECL_FUNCTION_CODE (fndecl))
4240 case BUILT_IN_POWF:
4241 case BUILT_IN_POW:
4242 case BUILT_IN_POWL:
4243 if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4244 && real_equal
4245 (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4246 &dconst2)
4247 && convert_mult_to_fma (stmt,
4248 gimple_call_arg (stmt, 0),
4249 gimple_call_arg (stmt, 0)))
4251 unlink_stmt_vdef (stmt);
4252 if (gsi_remove (&gsi, true)
4253 && gimple_purge_dead_eh_edges (bb))
4254 cfg_changed = true;
4255 release_defs (stmt);
4256 continue;
4258 break;
4260 default:;
4264 gsi_next (&gsi);
4268 statistics_counter_event (fun, "widening multiplications inserted",
4269 widen_mul_stats.widen_mults_inserted);
4270 statistics_counter_event (fun, "widening maccs inserted",
4271 widen_mul_stats.maccs_inserted);
4272 statistics_counter_event (fun, "fused multiply-adds inserted",
4273 widen_mul_stats.fmas_inserted);
4274 statistics_counter_event (fun, "divmod calls inserted",
4275 widen_mul_stats.divmod_calls_inserted);
4277 return cfg_changed ? TODO_cleanup_cfg : 0;
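/* Note on the builtin handling in execute above (an illustrative
   sketch):

     double f (double x, double y)
     {
       return __builtin_pow (x, 2.0) + y;
     }

   The pow call with a constant exponent of 2.0 is handed to
   convert_mult_to_fma as if it were x * x, so on a target with an fma
   pattern (and with FP contraction allowed) the whole expression can
   become a single FMA of x, x and y; otherwise the call is left
   untouched.  */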
4280 } // anon namespace
4282 gimple_opt_pass *
4283 make_pass_optimize_widening_mul (gcc::context *ctxt)
4285 return new pass_optimize_widening_mul (ctxt);